django-cfg 1.1.81__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- django_cfg/__init__.py +20 -448
- django_cfg/apps/accounts/README.md +3 -3
- django_cfg/apps/accounts/admin/__init__.py +0 -2
- django_cfg/apps/accounts/admin/activity.py +2 -9
- django_cfg/apps/accounts/admin/filters.py +0 -42
- django_cfg/apps/accounts/admin/inlines.py +8 -8
- django_cfg/apps/accounts/admin/otp.py +5 -5
- django_cfg/apps/accounts/admin/registration_source.py +1 -8
- django_cfg/apps/accounts/admin/user.py +12 -20
- django_cfg/apps/accounts/managers/user_manager.py +2 -129
- django_cfg/apps/accounts/migrations/0006_remove_twilioresponse_otp_secret_and_more.py +46 -0
- django_cfg/apps/accounts/models.py +3 -123
- django_cfg/apps/accounts/serializers/otp.py +40 -44
- django_cfg/apps/accounts/serializers/profile.py +0 -2
- django_cfg/apps/accounts/services/otp_service.py +98 -186
- django_cfg/apps/accounts/signals.py +25 -15
- django_cfg/apps/accounts/utils/auth_email_service.py +84 -0
- django_cfg/apps/accounts/views/otp.py +35 -36
- django_cfg/apps/agents/README.md +129 -0
- django_cfg/apps/agents/__init__.py +68 -0
- django_cfg/apps/agents/admin/__init__.py +17 -0
- django_cfg/apps/agents/admin/execution_admin.py +460 -0
- django_cfg/apps/agents/admin/registry_admin.py +360 -0
- django_cfg/apps/agents/admin/toolsets_admin.py +482 -0
- django_cfg/apps/agents/apps.py +29 -0
- django_cfg/apps/agents/core/__init__.py +20 -0
- django_cfg/apps/agents/core/agent.py +281 -0
- django_cfg/apps/agents/core/dependencies.py +154 -0
- django_cfg/apps/agents/core/exceptions.py +66 -0
- django_cfg/apps/agents/core/models.py +106 -0
- django_cfg/apps/agents/core/orchestrator.py +391 -0
- django_cfg/apps/agents/examples/__init__.py +3 -0
- django_cfg/apps/agents/examples/simple_example.py +161 -0
- django_cfg/apps/agents/integration/__init__.py +14 -0
- django_cfg/apps/agents/integration/middleware.py +80 -0
- django_cfg/apps/agents/integration/registry.py +345 -0
- django_cfg/apps/agents/integration/signals.py +50 -0
- django_cfg/apps/agents/management/__init__.py +3 -0
- django_cfg/apps/agents/management/commands/__init__.py +3 -0
- django_cfg/apps/agents/management/commands/create_agent.py +365 -0
- django_cfg/apps/agents/management/commands/orchestrator_status.py +191 -0
- django_cfg/apps/agents/managers/__init__.py +23 -0
- django_cfg/apps/agents/managers/execution.py +236 -0
- django_cfg/apps/agents/managers/registry.py +254 -0
- django_cfg/apps/agents/managers/toolsets.py +496 -0
- django_cfg/apps/agents/migrations/0001_initial.py +286 -0
- django_cfg/apps/agents/migrations/__init__.py +5 -0
- django_cfg/apps/agents/models/__init__.py +15 -0
- django_cfg/apps/agents/models/execution.py +215 -0
- django_cfg/apps/agents/models/registry.py +220 -0
- django_cfg/apps/agents/models/toolsets.py +305 -0
- django_cfg/apps/agents/patterns/__init__.py +24 -0
- django_cfg/apps/agents/patterns/content_agents.py +234 -0
- django_cfg/apps/agents/toolsets/__init__.py +15 -0
- django_cfg/apps/agents/toolsets/cache_toolset.py +285 -0
- django_cfg/apps/agents/toolsets/django_toolset.py +220 -0
- django_cfg/apps/agents/toolsets/file_toolset.py +324 -0
- django_cfg/apps/agents/toolsets/orm_toolset.py +319 -0
- django_cfg/apps/agents/urls.py +46 -0
- django_cfg/apps/knowbase/README.md +150 -0
- django_cfg/apps/knowbase/__init__.py +27 -0
- django_cfg/apps/knowbase/admin/__init__.py +23 -0
- django_cfg/apps/knowbase/admin/archive_admin.py +857 -0
- django_cfg/apps/knowbase/admin/chat_admin.py +386 -0
- django_cfg/apps/knowbase/admin/document_admin.py +650 -0
- django_cfg/apps/knowbase/admin/external_data_admin.py +685 -0
- django_cfg/apps/knowbase/apps.py +81 -0
- django_cfg/apps/knowbase/config/README.md +176 -0
- django_cfg/apps/knowbase/config/__init__.py +51 -0
- django_cfg/apps/knowbase/config/constance_fields.py +186 -0
- django_cfg/apps/knowbase/config/constance_settings.py +200 -0
- django_cfg/apps/knowbase/config/settings.py +444 -0
- django_cfg/apps/knowbase/examples/__init__.py +3 -0
- django_cfg/apps/knowbase/examples/external_data_usage.py +191 -0
- django_cfg/apps/knowbase/management/__init__.py +0 -0
- django_cfg/apps/knowbase/management/commands/__init__.py +0 -0
- django_cfg/apps/knowbase/management/commands/knowbase_stats.py +158 -0
- django_cfg/apps/knowbase/management/commands/setup_knowbase.py +59 -0
- django_cfg/apps/knowbase/managers/__init__.py +22 -0
- django_cfg/apps/knowbase/managers/archive.py +426 -0
- django_cfg/apps/knowbase/managers/base.py +32 -0
- django_cfg/apps/knowbase/managers/chat.py +141 -0
- django_cfg/apps/knowbase/managers/document.py +203 -0
- django_cfg/apps/knowbase/managers/external_data.py +471 -0
- django_cfg/apps/knowbase/migrations/0001_initial.py +427 -0
- django_cfg/apps/knowbase/migrations/0002_archiveitem_archiveitemchunk_documentarchive_and_more.py +434 -0
- django_cfg/apps/knowbase/migrations/__init__.py +5 -0
- django_cfg/apps/knowbase/mixins/__init__.py +15 -0
- django_cfg/apps/knowbase/mixins/config.py +108 -0
- django_cfg/apps/knowbase/mixins/creator.py +81 -0
- django_cfg/apps/knowbase/mixins/examples/vehicle_model_example.py +199 -0
- django_cfg/apps/knowbase/mixins/external_data_mixin.py +813 -0
- django_cfg/apps/knowbase/mixins/service.py +362 -0
- django_cfg/apps/knowbase/models/__init__.py +41 -0
- django_cfg/apps/knowbase/models/archive.py +599 -0
- django_cfg/apps/knowbase/models/base.py +58 -0
- django_cfg/apps/knowbase/models/chat.py +157 -0
- django_cfg/apps/knowbase/models/document.py +267 -0
- django_cfg/apps/knowbase/models/external_data.py +376 -0
- django_cfg/apps/knowbase/serializers/__init__.py +68 -0
- django_cfg/apps/knowbase/serializers/archive_serializers.py +386 -0
- django_cfg/apps/knowbase/serializers/chat_serializers.py +137 -0
- django_cfg/apps/knowbase/serializers/document_serializers.py +94 -0
- django_cfg/apps/knowbase/serializers/external_data_serializers.py +256 -0
- django_cfg/apps/knowbase/serializers/public_serializers.py +74 -0
- django_cfg/apps/knowbase/services/__init__.py +40 -0
- django_cfg/apps/knowbase/services/archive/__init__.py +42 -0
- django_cfg/apps/knowbase/services/archive/archive_service.py +541 -0
- django_cfg/apps/knowbase/services/archive/chunking_service.py +791 -0
- django_cfg/apps/knowbase/services/archive/exceptions.py +52 -0
- django_cfg/apps/knowbase/services/archive/extraction_service.py +508 -0
- django_cfg/apps/knowbase/services/archive/vectorization_service.py +362 -0
- django_cfg/apps/knowbase/services/base.py +53 -0
- django_cfg/apps/knowbase/services/chat_service.py +239 -0
- django_cfg/apps/knowbase/services/document_service.py +144 -0
- django_cfg/apps/knowbase/services/embedding/__init__.py +43 -0
- django_cfg/apps/knowbase/services/embedding/async_processor.py +244 -0
- django_cfg/apps/knowbase/services/embedding/batch_processor.py +250 -0
- django_cfg/apps/knowbase/services/embedding/batch_result.py +61 -0
- django_cfg/apps/knowbase/services/embedding/models.py +229 -0
- django_cfg/apps/knowbase/services/embedding/processors.py +148 -0
- django_cfg/apps/knowbase/services/embedding/utils.py +176 -0
- django_cfg/apps/knowbase/services/prompt_builder.py +191 -0
- django_cfg/apps/knowbase/services/search_service.py +293 -0
- django_cfg/apps/knowbase/signals/__init__.py +21 -0
- django_cfg/apps/knowbase/signals/archive_signals.py +211 -0
- django_cfg/apps/knowbase/signals/chat_signals.py +37 -0
- django_cfg/apps/knowbase/signals/document_signals.py +143 -0
- django_cfg/apps/knowbase/signals/external_data_signals.py +157 -0
- django_cfg/apps/knowbase/tasks/__init__.py +39 -0
- django_cfg/apps/knowbase/tasks/archive_tasks.py +316 -0
- django_cfg/apps/knowbase/tasks/document_processing.py +341 -0
- django_cfg/apps/knowbase/tasks/external_data_tasks.py +341 -0
- django_cfg/apps/knowbase/tasks/maintenance.py +195 -0
- django_cfg/apps/knowbase/urls.py +43 -0
- django_cfg/apps/knowbase/utils/__init__.py +12 -0
- django_cfg/apps/knowbase/utils/chunk_settings.py +261 -0
- django_cfg/apps/knowbase/utils/text_processing.py +375 -0
- django_cfg/apps/knowbase/utils/validation.py +99 -0
- django_cfg/apps/knowbase/views/__init__.py +28 -0
- django_cfg/apps/knowbase/views/archive_views.py +469 -0
- django_cfg/apps/knowbase/views/base.py +49 -0
- django_cfg/apps/knowbase/views/chat_views.py +181 -0
- django_cfg/apps/knowbase/views/document_views.py +183 -0
- django_cfg/apps/knowbase/views/public_views.py +129 -0
- django_cfg/apps/leads/admin.py +70 -0
- django_cfg/apps/newsletter/admin.py +234 -0
- django_cfg/apps/newsletter/admin_filters.py +124 -0
- django_cfg/apps/support/admin.py +196 -0
- django_cfg/apps/support/admin_filters.py +71 -0
- django_cfg/apps/support/templates/support/chat/ticket_chat.html +1 -1
- django_cfg/apps/urls.py +5 -4
- django_cfg/cli/README.md +1 -1
- django_cfg/cli/commands/create_project.py +2 -2
- django_cfg/cli/commands/info.py +1 -1
- django_cfg/config.py +44 -0
- django_cfg/core/config.py +29 -82
- django_cfg/core/environment.py +1 -1
- django_cfg/core/generation.py +19 -107
- django_cfg/{integration.py → core/integration.py} +18 -16
- django_cfg/core/validation.py +1 -1
- django_cfg/management/__init__.py +1 -1
- django_cfg/management/commands/__init__.py +1 -1
- django_cfg/management/commands/auto_generate.py +482 -0
- django_cfg/management/commands/migrator.py +19 -101
- django_cfg/management/commands/test_email.py +1 -1
- django_cfg/middleware/README.md +0 -158
- django_cfg/middleware/__init__.py +0 -2
- django_cfg/middleware/user_activity.py +3 -3
- django_cfg/models/api.py +145 -0
- django_cfg/models/base.py +287 -0
- django_cfg/models/cache.py +4 -4
- django_cfg/models/constance.py +25 -88
- django_cfg/models/database.py +9 -9
- django_cfg/models/drf.py +3 -36
- django_cfg/models/email.py +163 -0
- django_cfg/models/environment.py +276 -0
- django_cfg/models/limits.py +1 -1
- django_cfg/models/logging.py +366 -0
- django_cfg/models/revolution.py +41 -2
- django_cfg/models/security.py +125 -0
- django_cfg/models/services.py +1 -1
- django_cfg/modules/__init__.py +2 -56
- django_cfg/modules/base.py +78 -52
- django_cfg/modules/django_currency/service.py +2 -2
- django_cfg/modules/django_email.py +2 -2
- django_cfg/modules/django_health.py +267 -0
- django_cfg/modules/django_llm/llm/client.py +79 -17
- django_cfg/modules/django_llm/translator/translator.py +2 -2
- django_cfg/modules/django_logger.py +2 -2
- django_cfg/modules/django_ngrok.py +2 -2
- django_cfg/modules/django_tasks.py +68 -3
- django_cfg/modules/django_telegram.py +3 -3
- django_cfg/modules/django_twilio/sendgrid_service.py +2 -2
- django_cfg/modules/django_twilio/service.py +2 -2
- django_cfg/modules/django_twilio/simple_service.py +2 -2
- django_cfg/modules/django_twilio/templates/guide.md +266 -0
- django_cfg/modules/django_twilio/twilio_service.py +2 -2
- django_cfg/modules/django_unfold/__init__.py +69 -0
- django_cfg/modules/{unfold → django_unfold}/callbacks.py +23 -22
- django_cfg/modules/django_unfold/dashboard.py +278 -0
- django_cfg/modules/django_unfold/icons/README.md +145 -0
- django_cfg/modules/django_unfold/icons/__init__.py +12 -0
- django_cfg/modules/django_unfold/icons/constants.py +2851 -0
- django_cfg/modules/django_unfold/icons/generate_icons.py +486 -0
- django_cfg/modules/django_unfold/models/__init__.py +42 -0
- django_cfg/modules/django_unfold/models/config.py +601 -0
- django_cfg/modules/django_unfold/models/dashboard.py +206 -0
- django_cfg/modules/django_unfold/models/dropdown.py +40 -0
- django_cfg/modules/django_unfold/models/navigation.py +73 -0
- django_cfg/modules/django_unfold/models/tabs.py +25 -0
- django_cfg/modules/{unfold → django_unfold}/system_monitor.py +2 -2
- django_cfg/modules/django_unfold/utils.py +140 -0
- django_cfg/registry/__init__.py +23 -0
- django_cfg/registry/core.py +61 -0
- django_cfg/registry/exceptions.py +11 -0
- django_cfg/registry/modules.py +12 -0
- django_cfg/registry/services.py +26 -0
- django_cfg/registry/third_party.py +52 -0
- django_cfg/routing/__init__.py +19 -0
- django_cfg/routing/callbacks.py +198 -0
- django_cfg/routing/routers.py +48 -0
- django_cfg/templates/admin/layouts/dashboard_with_tabs.html +8 -9
- django_cfg/templatetags/__init__.py +0 -0
- django_cfg/templatetags/django_cfg.py +33 -0
- django_cfg/urls.py +33 -0
- django_cfg/utils/path_resolution.py +1 -1
- django_cfg/utils/smart_defaults.py +7 -61
- django_cfg/utils/toolkit.py +663 -0
- {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/METADATA +83 -86
- django_cfg-1.2.0.dist-info/RECORD +441 -0
- django_cfg/apps/tasks/@docs/README.md +0 -195
- django_cfg/archive/django_sample.zip +0 -0
- django_cfg/models/unfold.py +0 -271
- django_cfg/modules/unfold/__init__.py +0 -29
- django_cfg/modules/unfold/dashboard.py +0 -318
- django_cfg/pyproject.toml +0 -370
- django_cfg/routers.py +0 -83
- django_cfg-1.1.81.dist-info/RECORD +0 -278
- /django_cfg/{exceptions.py → core/exceptions.py} +0 -0
- /django_cfg/modules/{unfold → django_unfold}/models.py +0 -0
- /django_cfg/modules/{unfold → django_unfold}/tailwind.py +0 -0
- /django_cfg/{version_check.py → utils/version_check.py} +0 -0
- {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/WHEEL +0 -0
- {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/entry_points.txt +0 -0
- {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,791 @@
|
|
1
|
+
"""
|
2
|
+
Contextual chunking service.
|
3
|
+
|
4
|
+
Creates context-aware chunks with rich metadata for AI understanding.
|
5
|
+
"""
|
6
|
+
|
7
|
+
import re
|
8
|
+
import ast
|
9
|
+
import logging
|
10
|
+
from typing import List, Dict, Any, Optional, Tuple
|
11
|
+
from django.contrib.auth import get_user_model
|
12
|
+
from pydantic import BaseModel
|
13
|
+
|
14
|
+
from ...models.archive import ArchiveItem, ArchiveItemChunk, ContentType, ChunkType
|
15
|
+
from ...utils.chunk_settings import get_chunking_params_for_type
|
16
|
+
from ..base import BaseService
|
17
|
+
from .exceptions import ChunkingError
|
18
|
+
|
19
|
+
User = get_user_model()
|
20
|
+
|
21
|
+
logger = logging.getLogger(__name__)
|
22
|
+
|
23
|
+
|
24
|
+
class ChunkContextMetadata(BaseModel):
    """
    Schema describing the context metadata sections attached to a chunk.

    Every section is a free-form ``Dict[str, Any]`` and all of them are
    required fields of the model.
    """

    # Parent hierarchy: the archive and the item the chunk belongs to.
    archive_info: Dict[str, Any]
    item_info: Dict[str, Any]

    # Position of the chunk and the structure it was cut from.
    position_info: Dict[str, Any]
    structure_info: Dict[str, Any]

    # Semantic context of the chunk content.
    semantic_info: Dict[str, Any]

    # Relational context (links to other content).
    relationship_info: Dict[str, Any]

    # Processing provenance: how the chunk was produced.
    processing_info: Dict[str, Any]
|
43
|
+
|
44
|
+
|
45
|
+
class ChunkData(BaseModel):
    """
    In-memory description of a chunk before it is persisted.

    Carries the chunk text, its index within the item, the chunk type and
    the context metadata dictionary built for it.
    """

    # Text of the chunk.
    content: str
    # Index of the chunk within its parent item.
    chunk_index: int
    # Chunk type label (the chunkers pass ``ChunkType`` members here).
    chunk_type: str
    # Context dictionary produced by the ``_build_*_chunk_context`` helpers.
    context_metadata: Dict[str, Any]
|
52
|
+
|
53
|
+
|
54
|
+
class ContextualChunkingService(BaseService):
|
55
|
+
"""Service for creating context-aware chunks."""
|
56
|
+
|
57
|
+
def __init__(self, user: User):
    """
    Initialise the service for ``user`` and load the chunking defaults.

    The default chunk size and overlap are read from
    ``get_chunking_params_for_type('archive')`` and stored on the instance.
    """
    super().__init__(user)

    # Dynamic settings for archive content (Constance-backed per the
    # original comment).
    archive_params = get_chunking_params_for_type('archive')
    self.chunk_size = archive_params['chunk_size']
    self.overlap = archive_params['overlap']

    logger.info(f"📦 Archive chunking initialized: chunk_size={self.chunk_size}, overlap={self.overlap}")
|
65
|
+
|
66
|
+
def create_chunks_with_context(
    self,
    item: ArchiveItem,
    chunk_size: Optional[int] = None,
    overlap: Optional[int] = None
) -> List[ArchiveItemChunk]:
    """
    Split an archive item into chunks and persist them with context metadata.

    Args:
        item: Archive item whose ``raw_content`` is chunked.
        chunk_size: Target chunk size. The instance default is used when
            this is omitted or not a positive number.
        overlap: Overlap between consecutive chunks. The instance default
            is used only when this is ``None``, so an explicit ``0``
            disables overlap instead of being silently replaced.

    Returns:
        The created ``ArchiveItemChunk`` records, or an empty list when the
        item has no content or is not processable.

    Raises:
        ChunkingError: If chunk generation or persistence fails; the
            original exception is chained as the cause.
    """

    if not item.raw_content or not item.is_processable:
        return []

    # A non-positive size can never produce chunks, so fall back to the
    # configured default for it as well as for ``None``.
    if chunk_size is not None and chunk_size > 0:
        final_chunk_size = chunk_size
    else:
        final_chunk_size = self.chunk_size

    # ``overlap or self.overlap`` would discard a deliberate ``overlap=0``.
    final_overlap = self.overlap if overlap is None else overlap

    logger.debug(f"📦 Chunking {item.relative_path}: size={final_chunk_size}, overlap={final_overlap}")

    try:
        logger.info(f"Creating chunks for item: {item.relative_path}, content_type: {item.content_type}")

        # Choose chunking strategy based on content type
        if item.content_type == ContentType.CODE:
            logger.debug(f"Using code chunking for {item.relative_path}")
            chunks_data = self._chunk_code_content(item, final_chunk_size, final_overlap)
        elif item.content_type == ContentType.DOCUMENT:
            logger.debug(f"Using document chunking for {item.relative_path}")
            chunks_data = self._chunk_document_content(item, final_chunk_size, final_overlap)
        elif item.content_type == ContentType.DATA:
            logger.debug(f"Using data chunking for {item.relative_path}")
            chunks_data = self._chunk_data_content(item, final_chunk_size, final_overlap)
        else:
            logger.debug(f"Using generic chunking for {item.relative_path}")
            chunks_data = self._chunk_generic_content(item, final_chunk_size, final_overlap)

        logger.info(f"Generated {len(chunks_data)} chunks for {item.relative_path}")

        # Create chunk records one by one through the default manager
        # (the original notes this avoids custom manager issues).
        chunk_objects = []

        for chunk_data in chunks_data:
            chunk = ArchiveItemChunk.objects.create(
                user=self.user,
                archive=item.archive,
                item=item,
                content=chunk_data.content,
                chunk_index=chunk_data.chunk_index,
                chunk_type=chunk_data.chunk_type,
                context_metadata=chunk_data.context_metadata
            )
            chunk_objects.append(chunk)

        return chunk_objects

    except Exception as e:
        # Service boundary: log with traceback and re-raise as the domain
        # error so callers only have to handle ChunkingError.
        logger.error(f"Chunking failed for {item.relative_path}: {str(e)}", exc_info=True)
        raise ChunkingError(
            message=f"Failed to create chunks for item {item.relative_path}",
            code="CHUNKING_FAILED",
            details={
                "item_id": str(item.id),
                "item_path": item.relative_path,
                "error": str(e),
                "content_type": str(item.content_type),
                "content_length": len(item.raw_content) if item.raw_content else 0
            }
        ) from e
|
134
|
+
|
135
|
+
def _chunk_code_content(
    self,
    item: ArchiveItem,
    chunk_size: int,
    overlap: int
) -> List[ChunkData]:
    """
    Dispatch a code item to the chunker matching ``item.language``.

    Python and JavaScript/TypeScript use their dedicated chunkers; every
    other language goes through the generic code chunker, which is the only
    one that receives ``chunk_size`` and ``overlap``.
    """
    language = item.language

    if language == 'python':
        return self._chunk_python_code(item)

    if language in ('javascript', 'typescript'):
        return self._chunk_js_code(item)

    return self._chunk_generic_code(item, chunk_size, overlap)
|
149
|
+
|
150
|
+
def _chunk_python_code(self, item: ArchiveItem) -> List[ChunkData]:
    """
    Chunk a Python source file into imports, definitions and remaining code.

    The source is parsed with ``ast``. If parsing raises ``SyntaxError`` the
    item is chunked by the generic code chunker using the instance defaults.
    """
    source = item.raw_content
    source_lines = source.split('\n')
    definition_nodes = (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)
    collected = []

    try:
        module = ast.parse(source)

        # Imports come first and are always built with chunk index 0.
        header = self._extract_python_imports(module, source_lines, item, 0)
        if header:
            collected.append(header)

        # NOTE(review): ast.walk yields nested nodes too, so methods and
        # inner functions get their own chunk in addition to being part of
        # the enclosing class/function chunk - confirm this is intended.
        for candidate in ast.walk(module):
            if not isinstance(candidate, definition_nodes):
                continue
            collected.append(
                self._create_python_element_chunk(
                    candidate, source_lines, item, len(collected)
                )
            )

        # Whatever module-level code the helper reports as left over.
        leftover = self._extract_python_remaining_code(
            module, source_lines, item, len(collected)
        )
        if leftover:
            collected.append(leftover)

    except SyntaxError:
        # Unparseable source: fall back to line-based chunking.
        return self._chunk_generic_code(item, self.chunk_size, self.overlap)

    return collected
|
185
|
+
|
186
|
+
def _create_python_element_chunk(
    self,
    node: ast.AST,
    lines: List[str],
    item: ArchiveItem,
    chunk_index: int
) -> ChunkData:
    """
    Build the chunk for one Python class or function definition.

    Args:
        node: ``ClassDef``/``FunctionDef``/``AsyncFunctionDef`` node.
        lines: Source of the file split into lines.
        item: Archive item the source belongs to.
        chunk_index: Index to assign to the created chunk.

    Returns:
        ``ChunkData`` of type ``ChunkType.CODE`` whose content spans the
        definition, including any decorators, up to the line reported by
        ``_find_python_block_end``.
    """

    # ``node.lineno`` is the line of the ``def``/``class`` keyword; the
    # decorators live on earlier lines and would otherwise be cut off.
    decorator_lines = [
        decorator.lineno for decorator in getattr(node, 'decorator_list', [])
    ]
    first_line = min([node.lineno, *decorator_lines])

    start_line = first_line - 1  # zero-based slice start
    end_line = self._find_python_block_end(node, lines)

    content = '\n'.join(lines[start_line:end_line])

    # Analyze code structure
    code_info = self._analyze_python_structure(node, content)

    # Build context metadata
    context = self._build_code_chunk_context(
        item, chunk_index, content, start_line, end_line, code_info
    )

    return ChunkData(
        content=content,
        chunk_index=chunk_index,
        chunk_type=ChunkType.CODE,
        context_metadata=context
    )
|
214
|
+
|
215
|
+
def _analyze_python_structure(self, node: ast.AST, content: str) -> Dict[str, Any]:
    """
    Collect structural facts about a Python class or function node.

    Args:
        node: ``ClassDef``/``FunctionDef``/``AsyncFunctionDef`` node.
        content: Source text of the element.

    Returns:
        Dict with ``element_name``, ``element_type``, ``is_async``,
        ``docstring``, ``decorators``, ``complexity_score`` and ``purpose``;
        plus ``arguments`` when the node has ``args`` and ``base_classes``
        for classes. Decorators and bases are reported as dotted names
        (``models.Model``, ``app.route``); bare names are reported exactly
        as before.
    """

    def dotted_name(expr: ast.AST) -> Optional[str]:
        """Return ``a.b.c`` for a Name/Attribute chain, unwrapping calls and subscripts."""
        # ``@decorator(...)`` and ``Generic[T]`` wrap the name of interest.
        while isinstance(expr, (ast.Call, ast.Subscript)):
            expr = expr.func if isinstance(expr, ast.Call) else expr.value

        parts = []
        while isinstance(expr, ast.Attribute):
            parts.append(expr.attr)
            expr = expr.value

        if isinstance(expr, ast.Name):
            parts.append(expr.id)
            return '.'.join(reversed(parts))

        # Anything else (lambda, arbitrary expression) has no usable name.
        return None

    decorator_names = [
        dotted_name(decorator)
        for decorator in getattr(node, 'decorator_list', [])
    ]

    info = {
        'element_name': node.name,
        'element_type': 'class' if isinstance(node, ast.ClassDef) else 'function',
        'is_async': isinstance(node, ast.AsyncFunctionDef),
        'docstring': ast.get_docstring(node),
        'decorators': [name for name in decorator_names if name],
        'complexity_score': self._calculate_code_complexity(content),
        'purpose': self._detect_code_purpose(node.name, content),
    }

    # Extract function/method arguments (class nodes carry no ``args``).
    if hasattr(node, 'args'):
        info['arguments'] = [arg.arg for arg in node.args.args]

    # Extract class bases
    if isinstance(node, ast.ClassDef):
        base_names = [dotted_name(base) for base in node.bases]
        info['base_classes'] = [name for name in base_names if name]

    return info
|
237
|
+
|
238
|
+
def _chunk_document_content(
    self,
    item: ArchiveItem,
    chunk_size: int,
    overlap: int
) -> List[ChunkData]:
    """
    Dispatch a document item to the chunker matching its language.

    Markdown is chunked by the markdown chunker; every other document goes
    through generic text chunking with the given ``chunk_size`` and
    ``overlap``.
    """
    if item.language != 'markdown':
        return self._chunk_generic_content(item, chunk_size, overlap)

    return self._chunk_markdown_content(item)
|
250
|
+
|
251
|
+
def _chunk_markdown_content(self, item: ArchiveItem) -> List[ChunkData]:
    """
    Chunk markdown into one chunk per heading-delimited section.

    A line starting with ``#`` opens a new section unless it sits inside a
    fenced code block (``` or ~~~), where ``#`` is usually a code comment
    rather than a heading. Text before the first heading becomes a section
    with an empty title.

    Args:
        item: Archive item holding the markdown in ``raw_content``.

    Returns:
        Section chunks in document order.
    """

    content = item.raw_content
    lines = content.split('\n')
    chunks = []

    current_section = {'title': '', 'level': 0, 'start_line': 0}
    in_fence = False

    for i, line in enumerate(lines):
        # Track fenced code blocks so their content is never split.
        if line.lstrip().startswith(('```', '~~~')):
            in_fence = not in_fence
            continue

        if in_fence or not line.startswith('#'):
            continue

        # New section found: flush the previous one if it has any lines.
        if current_section['start_line'] < i:
            chunk = self._create_markdown_section_chunk(
                lines[current_section['start_line']:i],
                current_section,
                item,
                len(chunks)
            )
            chunks.append(chunk)

        # Start new section; the level is the number of leading '#'.
        level = len(line) - len(line.lstrip('#'))
        current_section = {
            'title': line.lstrip('# ').strip(),
            'level': level,
            'start_line': i
        }

    # Handle last section
    if current_section['start_line'] < len(lines):
        chunk = self._create_markdown_section_chunk(
            lines[current_section['start_line']:],
            current_section,
            item,
            len(chunks)
        )
        chunks.append(chunk)

    return chunks
|
292
|
+
|
293
|
+
def _create_markdown_section_chunk(
    self,
    section_lines: List[str],
    section_info: Dict[str, Any],
    item: ArchiveItem,
    chunk_index: int
) -> ChunkData:
    """
    Turn the lines of one markdown section into a ``ChunkData``.

    The chunk is typed ``ChunkType.HEADING`` when the section has a title
    and ``ChunkType.TEXT`` otherwise.
    """
    section_text = '\n'.join(section_lines)

    metadata = self._build_document_chunk_context(
        item, chunk_index, section_text, section_info
    )

    # Untitled sections (e.g. text before the first heading) are plain text.
    if section_info['title']:
        kind = ChunkType.HEADING
    else:
        kind = ChunkType.TEXT

    return ChunkData(
        content=section_text,
        chunk_index=chunk_index,
        chunk_type=kind,
        context_metadata=metadata
    )
|
317
|
+
|
318
|
+
def _chunk_data_content(
    self,
    item: ArchiveItem,
    chunk_size: int,
    overlap: int
) -> List[ChunkData]:
    """
    Dispatch a data item to the chunker matching ``item.language``.

    JSON and YAML have dedicated chunkers; any other data format is chunked
    as generic text with the given ``chunk_size`` and ``overlap``.
    """
    language = item.language

    if language == 'json':
        return self._chunk_json_content(item)

    if language in ('yaml', 'yml'):
        return self._chunk_yaml_content(item)

    return self._chunk_generic_content(item, chunk_size, overlap)
|
332
|
+
|
333
|
+
def _chunk_json_content(self, item: ArchiveItem) -> List[ChunkData]:
    """
    Chunk a JSON document into one chunk per top-level key.

    Content that is not valid JSON, or whose top level is not an object
    (for example an array), is chunked as generic text with the instance
    defaults, so it is never silently dropped.

    Args:
        item: Archive item holding the JSON text in ``raw_content``.

    Returns:
        ``ChunkType.METADATA`` chunks for a JSON object, otherwise the
        result of generic text chunking.
    """

    import json

    # Keep only the parsing inside the try so unrelated errors surface.
    try:
        data = json.loads(item.raw_content)
    except json.JSONDecodeError:
        # Fallback to text chunking
        return self._chunk_generic_content(item, self.chunk_size, self.overlap)

    if not isinstance(data, dict):
        # Arrays and scalars have no keys to split on; returning an empty
        # list here would lose the whole file.
        return self._chunk_generic_content(item, self.chunk_size, self.overlap)

    chunks = []

    # Chunk by top-level keys
    for key, value in data.items():
        # ensure_ascii=False keeps non-ASCII text readable in the chunk
        # instead of turning it into \uXXXX escapes.
        chunk_content = json.dumps({key: value}, indent=2, ensure_ascii=False)

        context = self._build_data_chunk_context(
            item, len(chunks), chunk_content, 'json_object', key
        )

        chunks.append(ChunkData(
            content=chunk_content,
            chunk_index=len(chunks),
            chunk_type=ChunkType.METADATA,
            context_metadata=context
        ))

    return chunks
|
363
|
+
|
364
|
+
def _chunk_generic_content(
    self,
    item: ArchiveItem,
    chunk_size: int,
    overlap: int
) -> List[ChunkData]:
    """
    Split text into consecutive chunks that overlap by ``overlap`` characters.

    Each chunk ends at the break point chosen by ``_find_good_break_point``
    when one exists, and the next chunk starts ``overlap`` characters before
    that end. No text between two chunks is skipped, and the loop always
    moves forward.

    Args:
        item: Archive item holding the text in ``raw_content``.
        chunk_size: Maximum chunk length in characters; values below 1 are
            treated as 1.
        overlap: Characters shared by consecutive chunks; clamped to the
            range ``0 .. chunk_size - 1``.

    Returns:
        ``ChunkType.TEXT`` chunks in document order; whitespace-only pieces
        are omitted.
    """

    content = item.raw_content
    content_length = len(content)
    chunks = []

    # Sanitise the window so every iteration consumes at least one character.
    chunk_size = max(1, chunk_size)
    overlap = max(0, min(overlap, chunk_size - 1))

    start = 0
    chunk_index = 0

    while start < content_length:
        end = min(start + chunk_size, content_length)

        # Try to break at a sentence/word boundary unless this is the tail.
        if end < content_length:
            break_point = self._find_good_break_point(content, start, end)
            if break_point > start:
                end = break_point

        chunk_content = content[start:end].strip()

        if chunk_content:
            context = self._build_generic_chunk_context(
                item, chunk_index, chunk_content, start, end
            )

            chunks.append(ChunkData(
                content=chunk_content,
                chunk_index=chunk_index,
                chunk_type=ChunkType.TEXT,
                context_metadata=context
            ))

            chunk_index += 1

        if end >= content_length:
            # Tail emitted; stepping back by the overlap would only repeat it.
            break

        # Step back from the actual end of this chunk. The old
        # max(start + chunk_size - overlap, end) never overlapped and
        # skipped text whenever the break point fell early.
        next_start = end - overlap
        start = next_start if next_start > start else end

    return chunks
|
409
|
+
|
410
|
+
def _find_good_break_point(self, content: str, start: int, end: int) -> int:
|
411
|
+
"""Find good break point for text chunking."""
|
412
|
+
|
413
|
+
# Look for sentence endings
|
414
|
+
for i in range(end - 1, start, -1):
|
415
|
+
if content[i] in '.!?\n':
|
416
|
+
return i + 1
|
417
|
+
|
418
|
+
# Look for word boundaries
|
419
|
+
for i in range(end - 1, start, -1):
|
420
|
+
if content[i].isspace():
|
421
|
+
return i
|
422
|
+
|
423
|
+
return end
|
424
|
+
|
425
|
+
def _build_code_chunk_context(
    self,
    item: ArchiveItem,
    chunk_index: int,
    content: str,
    start_line: int,
    end_line: int,
    code_info: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Assemble the context metadata dictionary for a code chunk.

    ``start_line`` is stored incremented by one and ``end_line`` is stored
    as given. Structure and semantic fields are read from ``code_info``.
    """
    archive = item.archive

    archive_section = {
        'id': str(archive.id),
        'title': archive.title,
        'description': archive.description,
    }

    item_section = {
        'id': str(item.id),
        'relative_path': item.relative_path,
        'item_name': item.item_name,
        'content_type': item.content_type,
        'language': item.language,
    }

    position_section = {
        'chunk_index': chunk_index,
        'start_line': start_line + 1,
        'end_line': end_line,
        'total_lines': len(item.raw_content.split('\n')),
    }

    structure_section = {
        'element_name': code_info.get('element_name'),
        'element_type': code_info.get('element_type'),
        'is_async': code_info.get('is_async', False),
        'has_docstring': bool(code_info.get('docstring')),
    }

    semantic_section = {
        'chunk_type': 'code',
        'content_purpose': code_info.get('purpose', 'implementation'),
        'complexity_score': code_info.get('complexity_score', 0.0),
        'technical_tags': self._generate_code_tags(content, code_info),
    }

    processing_section = {
        'extraction_method': 'ast_parser',
        'chunking_strategy': 'logical_units',
        'quality_score': self._assess_code_quality(content),
    }

    return {
        'archive_info': archive_section,
        'item_info': item_section,
        'position_info': position_section,
        'structure_info': structure_section,
        'semantic_info': semantic_section,
        'processing_info': processing_section,
    }
|
473
|
+
|
474
|
+
def _build_document_chunk_context(
    self,
    item: ArchiveItem,
    chunk_index: int,
    content: str,
    section_info: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Assemble the context metadata dictionary for a document section chunk.

    The semantic chunk type is ``'heading'`` when ``section_info`` has a
    title and ``'text'`` otherwise.
    """
    section_title = section_info.get('title')

    archive_section = {
        'id': str(item.archive.id),
        'title': item.archive.title,
    }

    item_section = {
        'id': str(item.id),
        'relative_path': item.relative_path,
        'content_type': item.content_type,
        'language': item.language,
    }

    structure_section = {
        'section_title': section_title,
        'section_level': section_info.get('level', 0),
    }

    if section_title:
        semantic_kind = 'heading'
    else:
        semantic_kind = 'text'

    semantic_section = {
        'chunk_type': semantic_kind,
        'content_purpose': 'documentation',
        'topic_tags': self._generate_document_tags(content),
    }

    return {
        'archive_info': archive_section,
        'item_info': item_section,
        'position_info': {'chunk_index': chunk_index},
        'structure_info': structure_section,
        'semantic_info': semantic_section,
        'processing_info': {
            'extraction_method': 'markdown_parser',
            'chunking_strategy': 'heading_based',
        },
    }
|
511
|
+
|
512
|
+
def _build_data_chunk_context(
    self,
    item: ArchiveItem,
    chunk_index: int,
    content: str,
    data_type: str,
    key_name: Optional[str] = None
) -> Dict[str, Any]:
    """
    Assemble the context metadata dictionary for a data chunk.

    ``data_type`` and ``key_name`` are stored under ``structure_info``;
    ``content`` is accepted but not stored in the metadata.
    """
    context: Dict[str, Any] = {}

    context['archive_info'] = {
        'id': str(item.archive.id),
        'title': item.archive.title,
    }
    context['item_info'] = {
        'id': str(item.id),
        'relative_path': item.relative_path,
        'content_type': item.content_type,
    }
    context['position_info'] = {
        'chunk_index': chunk_index,
    }
    context['structure_info'] = {
        'data_key': key_name,
        'data_type': data_type,
    }
    context['semantic_info'] = {
        'chunk_type': 'metadata',
        'content_purpose': 'data_definition',
    }
    context['processing_info'] = {
        'extraction_method': 'json_parser',
        'chunking_strategy': 'object_properties',
    }

    return context
|
548
|
+
|
549
|
+
def _build_generic_chunk_context(
|
550
|
+
self,
|
551
|
+
item: ArchiveItem,
|
552
|
+
chunk_index: int,
|
553
|
+
content: str,
|
554
|
+
start_pos: int,
|
555
|
+
end_pos: int
|
556
|
+
) -> Dict[str, Any]:
|
557
|
+
"""Build context metadata for generic text chunk."""
|
558
|
+
|
559
|
+
return {
|
560
|
+
'archive_info': {
|
561
|
+
'id': str(item.archive.id),
|
562
|
+
'title': item.archive.title,
|
563
|
+
},
|
564
|
+
'item_info': {
|
565
|
+
'id': str(item.id),
|
566
|
+
'relative_path': item.relative_path,
|
567
|
+
'content_type': item.content_type,
|
568
|
+
},
|
569
|
+
'position_info': {
|
570
|
+
'chunk_index': chunk_index,
|
571
|
+
'start_char': start_pos,
|
572
|
+
'end_char': end_pos,
|
573
|
+
'relative_position': start_pos / len(item.raw_content),
|
574
|
+
},
|
575
|
+
'semantic_info': {
|
576
|
+
'chunk_type': 'text',
|
577
|
+
'content_purpose': 'content',
|
578
|
+
},
|
579
|
+
'processing_info': {
|
580
|
+
'extraction_method': 'text_splitting',
|
581
|
+
'chunking_strategy': 'fixed_size_overlap',
|
582
|
+
}
|
583
|
+
}
|
584
|
+
|
585
|
+
def _generate_code_tags(self, content: str, code_info: Dict[str, Any]) -> List[str]:
|
586
|
+
"""Generate technical tags for code content."""
|
587
|
+
|
588
|
+
tags = []
|
589
|
+
|
590
|
+
# Element type tags
|
591
|
+
if code_info.get('element_type'):
|
592
|
+
tags.append(f"contains:{code_info['element_type']}")
|
593
|
+
|
594
|
+
# Async tag
|
595
|
+
if code_info.get('is_async'):
|
596
|
+
tags.append('async')
|
597
|
+
|
598
|
+
# Pattern detection
|
599
|
+
if 'import ' in content or 'from ' in content:
|
600
|
+
tags.append('contains:imports')
|
601
|
+
|
602
|
+
if 'class ' in content:
|
603
|
+
tags.append('contains:class_definition')
|
604
|
+
|
605
|
+
if 'def ' in content:
|
606
|
+
tags.append('contains:function_definition')
|
607
|
+
|
608
|
+
if 'test' in code_info.get('element_name', '').lower():
|
609
|
+
tags.append('purpose:testing')
|
610
|
+
|
611
|
+
return tags
|
612
|
+
|
613
|
+
def _generate_document_tags(self, content: str) -> List[str]:
|
614
|
+
"""Generate topic tags for document content."""
|
615
|
+
|
616
|
+
tags = []
|
617
|
+
|
618
|
+
# Detect headings
|
619
|
+
if content.strip().startswith('#'):
|
620
|
+
tags.append('contains:heading')
|
621
|
+
|
622
|
+
# Detect lists
|
623
|
+
if re.search(r'^\s*[-*+]\s', content, re.MULTILINE):
|
624
|
+
tags.append('contains:list')
|
625
|
+
|
626
|
+
# Detect code blocks
|
627
|
+
if '```' in content or ' ' in content:
|
628
|
+
tags.append('contains:code_block')
|
629
|
+
|
630
|
+
return tags
|
631
|
+
|
632
|
+
def _calculate_code_complexity(self, content: str) -> float:
|
633
|
+
"""Calculate code complexity score."""
|
634
|
+
|
635
|
+
# Simple complexity based on lines and control structures
|
636
|
+
lines = content.split('\n')
|
637
|
+
complexity = len(lines) / 100.0 # Base complexity
|
638
|
+
|
639
|
+
# Add complexity for control structures
|
640
|
+
control_keywords = ['if', 'for', 'while', 'try', 'except', 'with']
|
641
|
+
for keyword in control_keywords:
|
642
|
+
complexity += content.count(keyword) * 0.1
|
643
|
+
|
644
|
+
return min(1.0, complexity)
|
645
|
+
|
646
|
+
def _assess_code_quality(self, content: str) -> float:
|
647
|
+
"""Assess code quality score."""
|
648
|
+
|
649
|
+
# Simple quality assessment
|
650
|
+
quality = 0.5 # Base quality
|
651
|
+
|
652
|
+
# Boost for docstrings
|
653
|
+
if '"""' in content or "'''" in content:
|
654
|
+
quality += 0.2
|
655
|
+
|
656
|
+
# Boost for comments
|
657
|
+
comment_lines = len([line for line in content.split('\n') if line.strip().startswith('#')])
|
658
|
+
quality += min(0.2, comment_lines / 10.0)
|
659
|
+
|
660
|
+
# Penalty for very long lines
|
661
|
+
long_lines = len([line for line in content.split('\n') if len(line) > 100])
|
662
|
+
quality -= min(0.2, long_lines / 10.0)
|
663
|
+
|
664
|
+
return max(0.0, min(1.0, quality))
|
665
|
+
|
666
|
+
def _detect_code_purpose(self, element_name: str, content: str) -> str:
|
667
|
+
"""Detect purpose of code element."""
|
668
|
+
|
669
|
+
name_lower = element_name.lower()
|
670
|
+
|
671
|
+
if name_lower.startswith('test_'):
|
672
|
+
return 'test'
|
673
|
+
elif name_lower.startswith('_'):
|
674
|
+
return 'private_method'
|
675
|
+
elif 'config' in name_lower:
|
676
|
+
return 'configuration'
|
677
|
+
elif 'init' in name_lower:
|
678
|
+
return 'initialization'
|
679
|
+
elif 'main' in name_lower:
|
680
|
+
return 'main_function'
|
681
|
+
else:
|
682
|
+
return 'implementation'
|
683
|
+
|
684
|
+
def _find_python_block_end(self, node: ast.AST, lines: List[str]) -> int:
|
685
|
+
"""Find end line of Python code block."""
|
686
|
+
|
687
|
+
# Start from the node's end line
|
688
|
+
start_line = getattr(node, 'end_lineno', node.lineno) or node.lineno
|
689
|
+
|
690
|
+
# Look for the actual end by checking indentation
|
691
|
+
for i in range(start_line, len(lines)):
|
692
|
+
line = lines[i]
|
693
|
+
if line.strip() and not line.startswith(' ') and not line.startswith('\t'):
|
694
|
+
return i
|
695
|
+
|
696
|
+
return len(lines)
|
697
|
+
|
698
|
+
def _extract_python_imports(
|
699
|
+
self,
|
700
|
+
tree: ast.AST,
|
701
|
+
lines: List[str],
|
702
|
+
item: ArchiveItem,
|
703
|
+
chunk_index: int
|
704
|
+
) -> Optional[ChunkData]:
|
705
|
+
"""Extract imports as separate chunk."""
|
706
|
+
|
707
|
+
import_lines = []
|
708
|
+
|
709
|
+
for node in ast.walk(tree):
|
710
|
+
if isinstance(node, (ast.Import, ast.ImportFrom)):
|
711
|
+
import_lines.append(node.lineno - 1)
|
712
|
+
|
713
|
+
if not import_lines:
|
714
|
+
return None
|
715
|
+
|
716
|
+
# Get all import lines
|
717
|
+
import_content = '\n'.join(lines[min(import_lines):max(import_lines) + 1])
|
718
|
+
|
719
|
+
context = self._build_code_chunk_context(
|
720
|
+
item, chunk_index, import_content,
|
721
|
+
min(import_lines), max(import_lines) + 1,
|
722
|
+
{'element_name': 'imports', 'element_type': 'imports', 'purpose': 'imports'}
|
723
|
+
)
|
724
|
+
|
725
|
+
return ChunkData(
|
726
|
+
content=import_content,
|
727
|
+
chunk_index=chunk_index,
|
728
|
+
chunk_type=ChunkType.METADATA,
|
729
|
+
context_metadata=context
|
730
|
+
)
|
731
|
+
|
732
|
+
def _extract_python_remaining_code(
|
733
|
+
self,
|
734
|
+
tree: ast.AST,
|
735
|
+
lines: List[str],
|
736
|
+
item: ArchiveItem,
|
737
|
+
chunk_index: int
|
738
|
+
) -> Optional[ChunkData]:
|
739
|
+
"""Extract remaining module-level code."""
|
740
|
+
|
741
|
+
# This is a simplified implementation
|
742
|
+
# In practice, you'd want to identify module-level statements
|
743
|
+
# that aren't part of classes or functions
|
744
|
+
|
745
|
+
return None # Skip for now
|
746
|
+
|
747
|
+
def _chunk_generic_code(
|
748
|
+
self,
|
749
|
+
item: ArchiveItem,
|
750
|
+
chunk_size: int,
|
751
|
+
overlap: int
|
752
|
+
) -> List[ChunkData]:
|
753
|
+
"""Generic code chunking for unsupported languages."""
|
754
|
+
|
755
|
+
return self._chunk_generic_content(item, chunk_size, overlap)
|
756
|
+
|
757
|
+
def _chunk_js_code(self, item: ArchiveItem) -> List[ChunkData]:
|
758
|
+
"""Chunk JavaScript/TypeScript code."""
|
759
|
+
|
760
|
+
# Simplified implementation - could be enhanced with proper JS parsing
|
761
|
+
return self._chunk_generic_content(item, self.chunk_size, self.overlap)
|
762
|
+
|
763
|
+
def _chunk_yaml_content(self, item: ArchiveItem) -> List[ChunkData]:
|
764
|
+
"""Chunk YAML content."""
|
765
|
+
|
766
|
+
# Simplified implementation - could be enhanced with YAML parsing
|
767
|
+
return self._chunk_generic_content(item, self.chunk_size, self.overlap)
|
768
|
+
|
769
|
+
|
770
|
+
class ChunkContextBuilder:
    """Helper that bundles chunk metadata sections into one context dict."""

    @staticmethod
    def build_context(
        archive_info: Dict[str, Any],
        item_info: Dict[str, Any],
        position_info: Dict[str, Any],
        structure_info: Dict[str, Any],
        semantic_info: Dict[str, Any],
        processing_info: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Combine the six metadata sections into a single mapping.

        Each argument is stored, without copying, under the key that matches
        its parameter name.

        Returns:
            Dict with the keys ``archive_info``, ``item_info``,
            ``position_info``, ``structure_info``, ``semantic_info`` and
            ``processing_info``, in that order.
        """
        section_names = (
            'archive_info',
            'item_info',
            'position_info',
            'structure_info',
            'semantic_info',
            'processing_info',
        )
        section_values = (
            archive_info,
            item_info,
            position_info,
            structure_info,
            semantic_info,
            processing_info,
        )
        return dict(zip(section_names, section_values))
|