django-cfg 1.1.81__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246) hide show
  1. django_cfg/__init__.py +20 -448
  2. django_cfg/apps/accounts/README.md +3 -3
  3. django_cfg/apps/accounts/admin/__init__.py +0 -2
  4. django_cfg/apps/accounts/admin/activity.py +2 -9
  5. django_cfg/apps/accounts/admin/filters.py +0 -42
  6. django_cfg/apps/accounts/admin/inlines.py +8 -8
  7. django_cfg/apps/accounts/admin/otp.py +5 -5
  8. django_cfg/apps/accounts/admin/registration_source.py +1 -8
  9. django_cfg/apps/accounts/admin/user.py +12 -20
  10. django_cfg/apps/accounts/managers/user_manager.py +2 -129
  11. django_cfg/apps/accounts/migrations/0006_remove_twilioresponse_otp_secret_and_more.py +46 -0
  12. django_cfg/apps/accounts/models.py +3 -123
  13. django_cfg/apps/accounts/serializers/otp.py +40 -44
  14. django_cfg/apps/accounts/serializers/profile.py +0 -2
  15. django_cfg/apps/accounts/services/otp_service.py +98 -186
  16. django_cfg/apps/accounts/signals.py +25 -15
  17. django_cfg/apps/accounts/utils/auth_email_service.py +84 -0
  18. django_cfg/apps/accounts/views/otp.py +35 -36
  19. django_cfg/apps/agents/README.md +129 -0
  20. django_cfg/apps/agents/__init__.py +68 -0
  21. django_cfg/apps/agents/admin/__init__.py +17 -0
  22. django_cfg/apps/agents/admin/execution_admin.py +460 -0
  23. django_cfg/apps/agents/admin/registry_admin.py +360 -0
  24. django_cfg/apps/agents/admin/toolsets_admin.py +482 -0
  25. django_cfg/apps/agents/apps.py +29 -0
  26. django_cfg/apps/agents/core/__init__.py +20 -0
  27. django_cfg/apps/agents/core/agent.py +281 -0
  28. django_cfg/apps/agents/core/dependencies.py +154 -0
  29. django_cfg/apps/agents/core/exceptions.py +66 -0
  30. django_cfg/apps/agents/core/models.py +106 -0
  31. django_cfg/apps/agents/core/orchestrator.py +391 -0
  32. django_cfg/apps/agents/examples/__init__.py +3 -0
  33. django_cfg/apps/agents/examples/simple_example.py +161 -0
  34. django_cfg/apps/agents/integration/__init__.py +14 -0
  35. django_cfg/apps/agents/integration/middleware.py +80 -0
  36. django_cfg/apps/agents/integration/registry.py +345 -0
  37. django_cfg/apps/agents/integration/signals.py +50 -0
  38. django_cfg/apps/agents/management/__init__.py +3 -0
  39. django_cfg/apps/agents/management/commands/__init__.py +3 -0
  40. django_cfg/apps/agents/management/commands/create_agent.py +365 -0
  41. django_cfg/apps/agents/management/commands/orchestrator_status.py +191 -0
  42. django_cfg/apps/agents/managers/__init__.py +23 -0
  43. django_cfg/apps/agents/managers/execution.py +236 -0
  44. django_cfg/apps/agents/managers/registry.py +254 -0
  45. django_cfg/apps/agents/managers/toolsets.py +496 -0
  46. django_cfg/apps/agents/migrations/0001_initial.py +286 -0
  47. django_cfg/apps/agents/migrations/__init__.py +5 -0
  48. django_cfg/apps/agents/models/__init__.py +15 -0
  49. django_cfg/apps/agents/models/execution.py +215 -0
  50. django_cfg/apps/agents/models/registry.py +220 -0
  51. django_cfg/apps/agents/models/toolsets.py +305 -0
  52. django_cfg/apps/agents/patterns/__init__.py +24 -0
  53. django_cfg/apps/agents/patterns/content_agents.py +234 -0
  54. django_cfg/apps/agents/toolsets/__init__.py +15 -0
  55. django_cfg/apps/agents/toolsets/cache_toolset.py +285 -0
  56. django_cfg/apps/agents/toolsets/django_toolset.py +220 -0
  57. django_cfg/apps/agents/toolsets/file_toolset.py +324 -0
  58. django_cfg/apps/agents/toolsets/orm_toolset.py +319 -0
  59. django_cfg/apps/agents/urls.py +46 -0
  60. django_cfg/apps/knowbase/README.md +150 -0
  61. django_cfg/apps/knowbase/__init__.py +27 -0
  62. django_cfg/apps/knowbase/admin/__init__.py +23 -0
  63. django_cfg/apps/knowbase/admin/archive_admin.py +857 -0
  64. django_cfg/apps/knowbase/admin/chat_admin.py +386 -0
  65. django_cfg/apps/knowbase/admin/document_admin.py +650 -0
  66. django_cfg/apps/knowbase/admin/external_data_admin.py +685 -0
  67. django_cfg/apps/knowbase/apps.py +81 -0
  68. django_cfg/apps/knowbase/config/README.md +176 -0
  69. django_cfg/apps/knowbase/config/__init__.py +51 -0
  70. django_cfg/apps/knowbase/config/constance_fields.py +186 -0
  71. django_cfg/apps/knowbase/config/constance_settings.py +200 -0
  72. django_cfg/apps/knowbase/config/settings.py +444 -0
  73. django_cfg/apps/knowbase/examples/__init__.py +3 -0
  74. django_cfg/apps/knowbase/examples/external_data_usage.py +191 -0
  75. django_cfg/apps/knowbase/management/__init__.py +0 -0
  76. django_cfg/apps/knowbase/management/commands/__init__.py +0 -0
  77. django_cfg/apps/knowbase/management/commands/knowbase_stats.py +158 -0
  78. django_cfg/apps/knowbase/management/commands/setup_knowbase.py +59 -0
  79. django_cfg/apps/knowbase/managers/__init__.py +22 -0
  80. django_cfg/apps/knowbase/managers/archive.py +426 -0
  81. django_cfg/apps/knowbase/managers/base.py +32 -0
  82. django_cfg/apps/knowbase/managers/chat.py +141 -0
  83. django_cfg/apps/knowbase/managers/document.py +203 -0
  84. django_cfg/apps/knowbase/managers/external_data.py +471 -0
  85. django_cfg/apps/knowbase/migrations/0001_initial.py +427 -0
  86. django_cfg/apps/knowbase/migrations/0002_archiveitem_archiveitemchunk_documentarchive_and_more.py +434 -0
  87. django_cfg/apps/knowbase/migrations/__init__.py +5 -0
  88. django_cfg/apps/knowbase/mixins/__init__.py +15 -0
  89. django_cfg/apps/knowbase/mixins/config.py +108 -0
  90. django_cfg/apps/knowbase/mixins/creator.py +81 -0
  91. django_cfg/apps/knowbase/mixins/examples/vehicle_model_example.py +199 -0
  92. django_cfg/apps/knowbase/mixins/external_data_mixin.py +813 -0
  93. django_cfg/apps/knowbase/mixins/service.py +362 -0
  94. django_cfg/apps/knowbase/models/__init__.py +41 -0
  95. django_cfg/apps/knowbase/models/archive.py +599 -0
  96. django_cfg/apps/knowbase/models/base.py +58 -0
  97. django_cfg/apps/knowbase/models/chat.py +157 -0
  98. django_cfg/apps/knowbase/models/document.py +267 -0
  99. django_cfg/apps/knowbase/models/external_data.py +376 -0
  100. django_cfg/apps/knowbase/serializers/__init__.py +68 -0
  101. django_cfg/apps/knowbase/serializers/archive_serializers.py +386 -0
  102. django_cfg/apps/knowbase/serializers/chat_serializers.py +137 -0
  103. django_cfg/apps/knowbase/serializers/document_serializers.py +94 -0
  104. django_cfg/apps/knowbase/serializers/external_data_serializers.py +256 -0
  105. django_cfg/apps/knowbase/serializers/public_serializers.py +74 -0
  106. django_cfg/apps/knowbase/services/__init__.py +40 -0
  107. django_cfg/apps/knowbase/services/archive/__init__.py +42 -0
  108. django_cfg/apps/knowbase/services/archive/archive_service.py +541 -0
  109. django_cfg/apps/knowbase/services/archive/chunking_service.py +791 -0
  110. django_cfg/apps/knowbase/services/archive/exceptions.py +52 -0
  111. django_cfg/apps/knowbase/services/archive/extraction_service.py +508 -0
  112. django_cfg/apps/knowbase/services/archive/vectorization_service.py +362 -0
  113. django_cfg/apps/knowbase/services/base.py +53 -0
  114. django_cfg/apps/knowbase/services/chat_service.py +239 -0
  115. django_cfg/apps/knowbase/services/document_service.py +144 -0
  116. django_cfg/apps/knowbase/services/embedding/__init__.py +43 -0
  117. django_cfg/apps/knowbase/services/embedding/async_processor.py +244 -0
  118. django_cfg/apps/knowbase/services/embedding/batch_processor.py +250 -0
  119. django_cfg/apps/knowbase/services/embedding/batch_result.py +61 -0
  120. django_cfg/apps/knowbase/services/embedding/models.py +229 -0
  121. django_cfg/apps/knowbase/services/embedding/processors.py +148 -0
  122. django_cfg/apps/knowbase/services/embedding/utils.py +176 -0
  123. django_cfg/apps/knowbase/services/prompt_builder.py +191 -0
  124. django_cfg/apps/knowbase/services/search_service.py +293 -0
  125. django_cfg/apps/knowbase/signals/__init__.py +21 -0
  126. django_cfg/apps/knowbase/signals/archive_signals.py +211 -0
  127. django_cfg/apps/knowbase/signals/chat_signals.py +37 -0
  128. django_cfg/apps/knowbase/signals/document_signals.py +143 -0
  129. django_cfg/apps/knowbase/signals/external_data_signals.py +157 -0
  130. django_cfg/apps/knowbase/tasks/__init__.py +39 -0
  131. django_cfg/apps/knowbase/tasks/archive_tasks.py +316 -0
  132. django_cfg/apps/knowbase/tasks/document_processing.py +341 -0
  133. django_cfg/apps/knowbase/tasks/external_data_tasks.py +341 -0
  134. django_cfg/apps/knowbase/tasks/maintenance.py +195 -0
  135. django_cfg/apps/knowbase/urls.py +43 -0
  136. django_cfg/apps/knowbase/utils/__init__.py +12 -0
  137. django_cfg/apps/knowbase/utils/chunk_settings.py +261 -0
  138. django_cfg/apps/knowbase/utils/text_processing.py +375 -0
  139. django_cfg/apps/knowbase/utils/validation.py +99 -0
  140. django_cfg/apps/knowbase/views/__init__.py +28 -0
  141. django_cfg/apps/knowbase/views/archive_views.py +469 -0
  142. django_cfg/apps/knowbase/views/base.py +49 -0
  143. django_cfg/apps/knowbase/views/chat_views.py +181 -0
  144. django_cfg/apps/knowbase/views/document_views.py +183 -0
  145. django_cfg/apps/knowbase/views/public_views.py +129 -0
  146. django_cfg/apps/leads/admin.py +70 -0
  147. django_cfg/apps/newsletter/admin.py +234 -0
  148. django_cfg/apps/newsletter/admin_filters.py +124 -0
  149. django_cfg/apps/support/admin.py +196 -0
  150. django_cfg/apps/support/admin_filters.py +71 -0
  151. django_cfg/apps/support/templates/support/chat/ticket_chat.html +1 -1
  152. django_cfg/apps/urls.py +5 -4
  153. django_cfg/cli/README.md +1 -1
  154. django_cfg/cli/commands/create_project.py +2 -2
  155. django_cfg/cli/commands/info.py +1 -1
  156. django_cfg/config.py +44 -0
  157. django_cfg/core/config.py +29 -82
  158. django_cfg/core/environment.py +1 -1
  159. django_cfg/core/generation.py +19 -107
  160. django_cfg/{integration.py → core/integration.py} +18 -16
  161. django_cfg/core/validation.py +1 -1
  162. django_cfg/management/__init__.py +1 -1
  163. django_cfg/management/commands/__init__.py +1 -1
  164. django_cfg/management/commands/auto_generate.py +482 -0
  165. django_cfg/management/commands/migrator.py +19 -101
  166. django_cfg/management/commands/test_email.py +1 -1
  167. django_cfg/middleware/README.md +0 -158
  168. django_cfg/middleware/__init__.py +0 -2
  169. django_cfg/middleware/user_activity.py +3 -3
  170. django_cfg/models/api.py +145 -0
  171. django_cfg/models/base.py +287 -0
  172. django_cfg/models/cache.py +4 -4
  173. django_cfg/models/constance.py +25 -88
  174. django_cfg/models/database.py +9 -9
  175. django_cfg/models/drf.py +3 -36
  176. django_cfg/models/email.py +163 -0
  177. django_cfg/models/environment.py +276 -0
  178. django_cfg/models/limits.py +1 -1
  179. django_cfg/models/logging.py +366 -0
  180. django_cfg/models/revolution.py +41 -2
  181. django_cfg/models/security.py +125 -0
  182. django_cfg/models/services.py +1 -1
  183. django_cfg/modules/__init__.py +2 -56
  184. django_cfg/modules/base.py +78 -52
  185. django_cfg/modules/django_currency/service.py +2 -2
  186. django_cfg/modules/django_email.py +2 -2
  187. django_cfg/modules/django_health.py +267 -0
  188. django_cfg/modules/django_llm/llm/client.py +79 -17
  189. django_cfg/modules/django_llm/translator/translator.py +2 -2
  190. django_cfg/modules/django_logger.py +2 -2
  191. django_cfg/modules/django_ngrok.py +2 -2
  192. django_cfg/modules/django_tasks.py +68 -3
  193. django_cfg/modules/django_telegram.py +3 -3
  194. django_cfg/modules/django_twilio/sendgrid_service.py +2 -2
  195. django_cfg/modules/django_twilio/service.py +2 -2
  196. django_cfg/modules/django_twilio/simple_service.py +2 -2
  197. django_cfg/modules/django_twilio/templates/guide.md +266 -0
  198. django_cfg/modules/django_twilio/twilio_service.py +2 -2
  199. django_cfg/modules/django_unfold/__init__.py +69 -0
  200. django_cfg/modules/{unfold → django_unfold}/callbacks.py +23 -22
  201. django_cfg/modules/django_unfold/dashboard.py +278 -0
  202. django_cfg/modules/django_unfold/icons/README.md +145 -0
  203. django_cfg/modules/django_unfold/icons/__init__.py +12 -0
  204. django_cfg/modules/django_unfold/icons/constants.py +2851 -0
  205. django_cfg/modules/django_unfold/icons/generate_icons.py +486 -0
  206. django_cfg/modules/django_unfold/models/__init__.py +42 -0
  207. django_cfg/modules/django_unfold/models/config.py +601 -0
  208. django_cfg/modules/django_unfold/models/dashboard.py +206 -0
  209. django_cfg/modules/django_unfold/models/dropdown.py +40 -0
  210. django_cfg/modules/django_unfold/models/navigation.py +73 -0
  211. django_cfg/modules/django_unfold/models/tabs.py +25 -0
  212. django_cfg/modules/{unfold → django_unfold}/system_monitor.py +2 -2
  213. django_cfg/modules/django_unfold/utils.py +140 -0
  214. django_cfg/registry/__init__.py +23 -0
  215. django_cfg/registry/core.py +61 -0
  216. django_cfg/registry/exceptions.py +11 -0
  217. django_cfg/registry/modules.py +12 -0
  218. django_cfg/registry/services.py +26 -0
  219. django_cfg/registry/third_party.py +52 -0
  220. django_cfg/routing/__init__.py +19 -0
  221. django_cfg/routing/callbacks.py +198 -0
  222. django_cfg/routing/routers.py +48 -0
  223. django_cfg/templates/admin/layouts/dashboard_with_tabs.html +8 -9
  224. django_cfg/templatetags/__init__.py +0 -0
  225. django_cfg/templatetags/django_cfg.py +33 -0
  226. django_cfg/urls.py +33 -0
  227. django_cfg/utils/path_resolution.py +1 -1
  228. django_cfg/utils/smart_defaults.py +7 -61
  229. django_cfg/utils/toolkit.py +663 -0
  230. {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/METADATA +83 -86
  231. django_cfg-1.2.0.dist-info/RECORD +441 -0
  232. django_cfg/apps/tasks/@docs/README.md +0 -195
  233. django_cfg/archive/django_sample.zip +0 -0
  234. django_cfg/models/unfold.py +0 -271
  235. django_cfg/modules/unfold/__init__.py +0 -29
  236. django_cfg/modules/unfold/dashboard.py +0 -318
  237. django_cfg/pyproject.toml +0 -370
  238. django_cfg/routers.py +0 -83
  239. django_cfg-1.1.81.dist-info/RECORD +0 -278
  240. /django_cfg/{exceptions.py → core/exceptions.py} +0 -0
  241. /django_cfg/modules/{unfold → django_unfold}/models.py +0 -0
  242. /django_cfg/modules/{unfold → django_unfold}/tailwind.py +0 -0
  243. /django_cfg/{version_check.py → utils/version_check.py} +0 -0
  244. {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/WHEEL +0 -0
  245. {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/entry_points.txt +0 -0
  246. {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,791 @@
1
+ """
2
+ Contextual chunking service.
3
+
4
+ Creates context-aware chunks with rich metadata for AI understanding.
5
+ """
6
+
7
+ import re
8
+ import ast
9
+ import logging
10
+ from typing import List, Dict, Any, Optional, Tuple
11
+ from django.contrib.auth import get_user_model
12
+ from pydantic import BaseModel
13
+
14
+ from ...models.archive import ArchiveItem, ArchiveItemChunk, ContentType, ChunkType
15
+ from ...utils.chunk_settings import get_chunking_params_for_type
16
+ from ..base import BaseService
17
+ from .exceptions import ChunkingError
18
+
19
+ User = get_user_model()
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class ChunkContextMetadata(BaseModel):
    """Schema naming the context sections that can describe a chunk.

    Every section is declared without a default, so all seven are
    required, and each one is a free-form ``Dict[str, Any]``.
    """

    # Parent hierarchy: the archive and the item the chunk belongs to.
    archive_info: Dict[str, Any]
    item_info: Dict[str, Any]

    # Position of the chunk and the structure it represents.
    position_info: Dict[str, Any]
    structure_info: Dict[str, Any]

    # Semantic context.
    semantic_info: Dict[str, Any]

    # Relational context.
    relationship_info: Dict[str, Any]

    # Processing provenance.
    processing_info: Dict[str, Any]
43
+
44
+
45
class ChunkData(BaseModel):
    """In-memory description of a single chunk produced by a chunker.

    ``create_chunks_with_context`` copies these four fields onto a new
    ``ArchiveItemChunk`` record.
    """

    content: str                       # chunk text
    chunk_index: int                   # position of the chunk within its item
    chunk_type: str                    # one of the ``ChunkType`` values
    context_metadata: Dict[str, Any]   # nested context sections
52
+
53
+
54
+ class ContextualChunkingService(BaseService):
55
+ """Service for creating context-aware chunks."""
56
+
57
def __init__(self, user: User):
    """Bind the service to ``user`` and load the archive chunking defaults.

    ``chunk_size`` and ``overlap`` are taken from
    ``get_chunking_params_for_type('archive')`` and serve as fallbacks
    whenever a caller does not pass explicit values.
    """
    super().__init__(user)

    # Dynamic settings for the 'archive' content family.
    archive_params = get_chunking_params_for_type('archive')
    self.chunk_size = archive_params['chunk_size']
    self.overlap = archive_params['overlap']

    logger.info(f"📦 Archive chunking initialized: chunk_size={self.chunk_size}, overlap={self.overlap}")
65
+
66
def create_chunks_with_context(
    self,
    item: ArchiveItem,
    chunk_size: Optional[int] = None,
    overlap: Optional[int] = None
) -> List[ArchiveItemChunk]:
    """Split ``item`` into chunks and persist each one with context metadata.

    Args:
        item: Archive item whose ``raw_content`` is chunked.
        chunk_size: Chunk size override; falls back to ``self.chunk_size``
            when omitted (or zero, which is not a usable size).
        overlap: Overlap override; falls back to ``self.overlap`` only when
            omitted. An explicit ``0`` is honoured and disables overlap.

    Returns:
        The created ``ArchiveItemChunk`` records, in chunk order. An empty
        list when the item has no content or is not processable.

    Raises:
        ChunkingError: Wraps any exception raised while chunking or while
            creating the chunk records.
    """

    if not item.raw_content or not item.is_processable:
        return []

    # Use instance settings if parameters not provided
    final_chunk_size = chunk_size or self.chunk_size
    # ``overlap or self.overlap`` would silently replace a requested 0 with
    # the configured default, so test for None explicitly.
    final_overlap = self.overlap if overlap is None else overlap

    logger.debug(f"📦 Chunking {item.relative_path}: size={final_chunk_size}, overlap={final_overlap}")

    try:
        # Debug logging
        logger.info(f"Creating chunks for item: {item.relative_path}, content_type: {item.content_type}")

        # Choose chunking strategy based on content type
        if item.content_type == ContentType.CODE:
            logger.debug(f"Using code chunking for {item.relative_path}")
            chunks_data = self._chunk_code_content(item, final_chunk_size, final_overlap)
        elif item.content_type == ContentType.DOCUMENT:
            logger.debug(f"Using document chunking for {item.relative_path}")
            chunks_data = self._chunk_document_content(item, final_chunk_size, final_overlap)
        elif item.content_type == ContentType.DATA:
            logger.debug(f"Using data chunking for {item.relative_path}")
            chunks_data = self._chunk_data_content(item, final_chunk_size, final_overlap)
        else:
            logger.debug(f"Using generic chunking for {item.relative_path}")
            chunks_data = self._chunk_generic_content(item, final_chunk_size, final_overlap)

        logger.info(f"Generated {len(chunks_data)} chunks for {item.relative_path}")

        # Create chunk records one at a time through the default manager.
        chunk_objects = []
        for chunk_data in chunks_data:
            chunk = ArchiveItemChunk.objects.create(
                user=self.user,
                archive=item.archive,
                item=item,
                content=chunk_data.content,
                chunk_index=chunk_data.chunk_index,
                chunk_type=chunk_data.chunk_type,
                context_metadata=chunk_data.context_metadata
            )
            chunk_objects.append(chunk)

        return chunk_objects

    except Exception as e:
        logger.error(f"Chunking failed for {item.relative_path}: {str(e)}", exc_info=True)
        raise ChunkingError(
            message=f"Failed to create chunks for item {item.relative_path}",
            code="CHUNKING_FAILED",
            details={
                "item_id": str(item.id),
                "item_path": item.relative_path,
                "error": str(e),
                "content_type": str(item.content_type),
                "content_length": len(item.raw_content) if item.raw_content else 0
            }
        ) from e
134
+
135
+ def _chunk_code_content(
136
+ self,
137
+ item: ArchiveItem,
138
+ chunk_size: int,
139
+ overlap: int
140
+ ) -> List[ChunkData]:
141
+ """Chunk code files by logical boundaries."""
142
+
143
+ if item.language == 'python':
144
+ return self._chunk_python_code(item)
145
+ elif item.language in ['javascript', 'typescript']:
146
+ return self._chunk_js_code(item)
147
+ else:
148
+ return self._chunk_generic_code(item, chunk_size, overlap)
149
+
150
+ def _chunk_python_code(self, item: ArchiveItem) -> List[ChunkData]:
151
+ """Chunk Python code by classes and functions."""
152
+
153
+ content = item.raw_content
154
+ lines = content.split('\n')
155
+ chunks = []
156
+
157
+ try:
158
+ tree = ast.parse(content)
159
+
160
+ # Extract imports first
161
+ imports_chunk = self._extract_python_imports(tree, lines, item, 0)
162
+ if imports_chunk:
163
+ chunks.append(imports_chunk)
164
+
165
+ # Extract classes and functions
166
+ for node in ast.walk(tree):
167
+ if isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
168
+ chunk = self._create_python_element_chunk(
169
+ node, lines, item, len(chunks)
170
+ )
171
+ chunks.append(chunk)
172
+
173
+ # Handle module-level code
174
+ remaining_chunk = self._extract_python_remaining_code(
175
+ tree, lines, item, len(chunks)
176
+ )
177
+ if remaining_chunk:
178
+ chunks.append(remaining_chunk)
179
+
180
+ except SyntaxError:
181
+ # Fallback to line-based chunking
182
+ return self._chunk_generic_code(item, self.chunk_size, self.overlap)
183
+
184
+ return chunks
185
+
186
def _create_python_element_chunk(
    self,
    node: ast.AST,
    lines: List[str],
    item: ArchiveItem,
    chunk_index: int
) -> ChunkData:
    """Build the ``ChunkData`` for one class or function node.

    The chunk text is the slice of ``lines`` from the node's ``lineno``
    (converted to a 0-based index) up to the end reported by
    ``_find_python_block_end``.
    """
    # NOTE(review): ``node.lineno`` is presumably the ``def``/``class`` line
    # on current Python versions, so decorator lines above it would not be
    # part of the chunk - confirm whether that is intended.
    first_line = node.lineno - 1
    last_line = self._find_python_block_end(node, lines)
    snippet = '\n'.join(lines[first_line:last_line])

    # Structure analysis feeds the context metadata.
    structure = self._analyze_python_structure(node, snippet)
    metadata = self._build_code_chunk_context(
        item, chunk_index, snippet, first_line, last_line, structure
    )

    return ChunkData(
        content=snippet,
        chunk_index=chunk_index,
        chunk_type=ChunkType.CODE,
        context_metadata=metadata
    )
214
+
215
+ def _analyze_python_structure(self, node: ast.AST, content: str) -> Dict[str, Any]:
216
+ """Analyze Python code structure for context."""
217
+
218
+ info = {
219
+ 'element_name': node.name,
220
+ 'element_type': 'class' if isinstance(node, ast.ClassDef) else 'function',
221
+ 'is_async': isinstance(node, ast.AsyncFunctionDef),
222
+ 'docstring': ast.get_docstring(node),
223
+ 'decorators': [d.id for d in getattr(node, 'decorator_list', []) if hasattr(d, 'id')],
224
+ 'complexity_score': self._calculate_code_complexity(content),
225
+ 'purpose': self._detect_code_purpose(node.name, content),
226
+ }
227
+
228
+ # Extract function/method arguments
229
+ if hasattr(node, 'args'):
230
+ info['arguments'] = [arg.arg for arg in node.args.args]
231
+
232
+ # Extract class bases
233
+ if isinstance(node, ast.ClassDef):
234
+ info['base_classes'] = [base.id for base in node.bases if hasattr(base, 'id')]
235
+
236
+ return info
237
+
238
+ def _chunk_document_content(
239
+ self,
240
+ item: ArchiveItem,
241
+ chunk_size: int,
242
+ overlap: int
243
+ ) -> List[ChunkData]:
244
+ """Chunk document files by structure."""
245
+
246
+ if item.language == 'markdown':
247
+ return self._chunk_markdown_content(item)
248
+ else:
249
+ return self._chunk_generic_content(item, chunk_size, overlap)
250
+
251
+ def _chunk_markdown_content(self, item: ArchiveItem) -> List[ChunkData]:
252
+ """Chunk markdown by headings and sections."""
253
+
254
+ content = item.raw_content
255
+ lines = content.split('\n')
256
+ chunks = []
257
+
258
+ current_section = {'title': '', 'level': 0, 'start_line': 0}
259
+
260
+ for i, line in enumerate(lines):
261
+ if line.startswith('#'):
262
+ # New section found
263
+ if current_section['start_line'] < i:
264
+ # Create chunk for previous section
265
+ chunk = self._create_markdown_section_chunk(
266
+ lines[current_section['start_line']:i],
267
+ current_section,
268
+ item,
269
+ len(chunks)
270
+ )
271
+ chunks.append(chunk)
272
+
273
+ # Start new section
274
+ level = len(line) - len(line.lstrip('#'))
275
+ current_section = {
276
+ 'title': line.lstrip('# ').strip(),
277
+ 'level': level,
278
+ 'start_line': i
279
+ }
280
+
281
+ # Handle last section
282
+ if current_section['start_line'] < len(lines):
283
+ chunk = self._create_markdown_section_chunk(
284
+ lines[current_section['start_line']:],
285
+ current_section,
286
+ item,
287
+ len(chunks)
288
+ )
289
+ chunks.append(chunk)
290
+
291
+ return chunks
292
+
293
def _create_markdown_section_chunk(
    self,
    section_lines: List[str],
    section_info: Dict[str, Any],
    item: ArchiveItem,
    chunk_index: int
) -> ChunkData:
    """Turn the lines of one markdown section into a ``ChunkData``.

    Sections with a non-empty ``title`` are typed ``ChunkType.HEADING``;
    untitled sections are typed ``ChunkType.TEXT``.
    """
    body = '\n'.join(section_lines)

    metadata = self._build_document_chunk_context(
        item, chunk_index, body, section_info
    )

    if section_info['title']:
        section_type = ChunkType.HEADING
    else:
        section_type = ChunkType.TEXT

    return ChunkData(
        content=body,
        chunk_index=chunk_index,
        chunk_type=section_type,
        context_metadata=metadata
    )
317
+
318
+ def _chunk_data_content(
319
+ self,
320
+ item: ArchiveItem,
321
+ chunk_size: int,
322
+ overlap: int
323
+ ) -> List[ChunkData]:
324
+ """Chunk data files by logical structure."""
325
+
326
+ if item.language == 'json':
327
+ return self._chunk_json_content(item)
328
+ elif item.language in ['yaml', 'yml']:
329
+ return self._chunk_yaml_content(item)
330
+ else:
331
+ return self._chunk_generic_content(item, chunk_size, overlap)
332
+
333
+ def _chunk_json_content(self, item: ArchiveItem) -> List[ChunkData]:
334
+ """Chunk JSON by object structure."""
335
+
336
+ import json
337
+
338
+ try:
339
+ data = json.loads(item.raw_content)
340
+ chunks = []
341
+
342
+ if isinstance(data, dict):
343
+ # Chunk by top-level keys
344
+ for key, value in data.items():
345
+ chunk_content = json.dumps({key: value}, indent=2)
346
+
347
+ context = self._build_data_chunk_context(
348
+ item, len(chunks), chunk_content, 'json_object', key
349
+ )
350
+
351
+ chunks.append(ChunkData(
352
+ content=chunk_content,
353
+ chunk_index=len(chunks),
354
+ chunk_type=ChunkType.METADATA,
355
+ context_metadata=context
356
+ ))
357
+
358
+ return chunks
359
+
360
+ except json.JSONDecodeError:
361
+ # Fallback to text chunking
362
+ return self._chunk_generic_content(item, self.chunk_size, self.overlap)
363
+
364
+ def _chunk_generic_content(
365
+ self,
366
+ item: ArchiveItem,
367
+ chunk_size: int,
368
+ overlap: int
369
+ ) -> List[ChunkData]:
370
+ """Generic text chunking with overlap."""
371
+
372
+ content = item.raw_content
373
+ chunks = []
374
+
375
+ # Simple text splitting with overlap
376
+ start = 0
377
+ chunk_index = 0
378
+
379
+ while start < len(content):
380
+ end = start + chunk_size
381
+
382
+ # Try to break at word boundary
383
+ if end < len(content):
384
+ # Look for good break points
385
+ break_point = self._find_good_break_point(content, start, end)
386
+ if break_point > start:
387
+ end = break_point
388
+
389
+ chunk_content = content[start:end].strip()
390
+
391
+ if chunk_content:
392
+ context = self._build_generic_chunk_context(
393
+ item, chunk_index, chunk_content, start, end
394
+ )
395
+
396
+ chunks.append(ChunkData(
397
+ content=chunk_content,
398
+ chunk_index=chunk_index,
399
+ chunk_type=ChunkType.TEXT,
400
+ context_metadata=context
401
+ ))
402
+
403
+ chunk_index += 1
404
+
405
+ # Move start position with overlap
406
+ start = max(start + chunk_size - overlap, end)
407
+
408
+ return chunks
409
+
410
+ def _find_good_break_point(self, content: str, start: int, end: int) -> int:
411
+ """Find good break point for text chunking."""
412
+
413
+ # Look for sentence endings
414
+ for i in range(end - 1, start, -1):
415
+ if content[i] in '.!?\n':
416
+ return i + 1
417
+
418
+ # Look for word boundaries
419
+ for i in range(end - 1, start, -1):
420
+ if content[i].isspace():
421
+ return i
422
+
423
+ return end
424
+
425
+ def _build_code_chunk_context(
426
+ self,
427
+ item: ArchiveItem,
428
+ chunk_index: int,
429
+ content: str,
430
+ start_line: int,
431
+ end_line: int,
432
+ code_info: Dict[str, Any]
433
+ ) -> Dict[str, Any]:
434
+ """Build context metadata for code chunk."""
435
+
436
+ return {
437
+ 'archive_info': {
438
+ 'id': str(item.archive.id),
439
+ 'title': item.archive.title,
440
+ 'description': item.archive.description,
441
+ },
442
+ 'item_info': {
443
+ 'id': str(item.id),
444
+ 'relative_path': item.relative_path,
445
+ 'item_name': item.item_name,
446
+ 'content_type': item.content_type,
447
+ 'language': item.language,
448
+ },
449
+ 'position_info': {
450
+ 'chunk_index': chunk_index,
451
+ 'start_line': start_line + 1,
452
+ 'end_line': end_line,
453
+ 'total_lines': len(item.raw_content.split('\n')),
454
+ },
455
+ 'structure_info': {
456
+ 'element_name': code_info.get('element_name'),
457
+ 'element_type': code_info.get('element_type'),
458
+ 'is_async': code_info.get('is_async', False),
459
+ 'has_docstring': bool(code_info.get('docstring')),
460
+ },
461
+ 'semantic_info': {
462
+ 'chunk_type': 'code',
463
+ 'content_purpose': code_info.get('purpose', 'implementation'),
464
+ 'complexity_score': code_info.get('complexity_score', 0.0),
465
+ 'technical_tags': self._generate_code_tags(content, code_info),
466
+ },
467
+ 'processing_info': {
468
+ 'extraction_method': 'ast_parser',
469
+ 'chunking_strategy': 'logical_units',
470
+ 'quality_score': self._assess_code_quality(content),
471
+ }
472
+ }
473
+
474
+ def _build_document_chunk_context(
475
+ self,
476
+ item: ArchiveItem,
477
+ chunk_index: int,
478
+ content: str,
479
+ section_info: Dict[str, Any]
480
+ ) -> Dict[str, Any]:
481
+ """Build context metadata for document chunk."""
482
+
483
+ return {
484
+ 'archive_info': {
485
+ 'id': str(item.archive.id),
486
+ 'title': item.archive.title,
487
+ },
488
+ 'item_info': {
489
+ 'id': str(item.id),
490
+ 'relative_path': item.relative_path,
491
+ 'content_type': item.content_type,
492
+ 'language': item.language,
493
+ },
494
+ 'position_info': {
495
+ 'chunk_index': chunk_index,
496
+ },
497
+ 'structure_info': {
498
+ 'section_title': section_info.get('title'),
499
+ 'section_level': section_info.get('level', 0),
500
+ },
501
+ 'semantic_info': {
502
+ 'chunk_type': 'heading' if section_info.get('title') else 'text',
503
+ 'content_purpose': 'documentation',
504
+ 'topic_tags': self._generate_document_tags(content),
505
+ },
506
+ 'processing_info': {
507
+ 'extraction_method': 'markdown_parser',
508
+ 'chunking_strategy': 'heading_based',
509
+ }
510
+ }
511
+
512
+ def _build_data_chunk_context(
513
+ self,
514
+ item: ArchiveItem,
515
+ chunk_index: int,
516
+ content: str,
517
+ data_type: str,
518
+ key_name: Optional[str] = None
519
+ ) -> Dict[str, Any]:
520
+ """Build context metadata for data chunk."""
521
+
522
+ return {
523
+ 'archive_info': {
524
+ 'id': str(item.archive.id),
525
+ 'title': item.archive.title,
526
+ },
527
+ 'item_info': {
528
+ 'id': str(item.id),
529
+ 'relative_path': item.relative_path,
530
+ 'content_type': item.content_type,
531
+ },
532
+ 'position_info': {
533
+ 'chunk_index': chunk_index,
534
+ },
535
+ 'structure_info': {
536
+ 'data_key': key_name,
537
+ 'data_type': data_type,
538
+ },
539
+ 'semantic_info': {
540
+ 'chunk_type': 'metadata',
541
+ 'content_purpose': 'data_definition',
542
+ },
543
+ 'processing_info': {
544
+ 'extraction_method': 'json_parser',
545
+ 'chunking_strategy': 'object_properties',
546
+ }
547
+ }
548
+
549
+ def _build_generic_chunk_context(
550
+ self,
551
+ item: ArchiveItem,
552
+ chunk_index: int,
553
+ content: str,
554
+ start_pos: int,
555
+ end_pos: int
556
+ ) -> Dict[str, Any]:
557
+ """Build context metadata for generic text chunk."""
558
+
559
+ return {
560
+ 'archive_info': {
561
+ 'id': str(item.archive.id),
562
+ 'title': item.archive.title,
563
+ },
564
+ 'item_info': {
565
+ 'id': str(item.id),
566
+ 'relative_path': item.relative_path,
567
+ 'content_type': item.content_type,
568
+ },
569
+ 'position_info': {
570
+ 'chunk_index': chunk_index,
571
+ 'start_char': start_pos,
572
+ 'end_char': end_pos,
573
+ 'relative_position': start_pos / len(item.raw_content),
574
+ },
575
+ 'semantic_info': {
576
+ 'chunk_type': 'text',
577
+ 'content_purpose': 'content',
578
+ },
579
+ 'processing_info': {
580
+ 'extraction_method': 'text_splitting',
581
+ 'chunking_strategy': 'fixed_size_overlap',
582
+ }
583
+ }
584
+
585
+ def _generate_code_tags(self, content: str, code_info: Dict[str, Any]) -> List[str]:
586
+ """Generate technical tags for code content."""
587
+
588
+ tags = []
589
+
590
+ # Element type tags
591
+ if code_info.get('element_type'):
592
+ tags.append(f"contains:{code_info['element_type']}")
593
+
594
+ # Async tag
595
+ if code_info.get('is_async'):
596
+ tags.append('async')
597
+
598
+ # Pattern detection
599
+ if 'import ' in content or 'from ' in content:
600
+ tags.append('contains:imports')
601
+
602
+ if 'class ' in content:
603
+ tags.append('contains:class_definition')
604
+
605
+ if 'def ' in content:
606
+ tags.append('contains:function_definition')
607
+
608
+ if 'test' in code_info.get('element_name', '').lower():
609
+ tags.append('purpose:testing')
610
+
611
+ return tags
612
+
613
+ def _generate_document_tags(self, content: str) -> List[str]:
614
+ """Generate topic tags for document content."""
615
+
616
+ tags = []
617
+
618
+ # Detect headings
619
+ if content.strip().startswith('#'):
620
+ tags.append('contains:heading')
621
+
622
+ # Detect lists
623
+ if re.search(r'^\s*[-*+]\s', content, re.MULTILINE):
624
+ tags.append('contains:list')
625
+
626
+ # Detect code blocks
627
+ if '```' in content or ' ' in content:
628
+ tags.append('contains:code_block')
629
+
630
+ return tags
631
+
632
+ def _calculate_code_complexity(self, content: str) -> float:
633
+ """Calculate code complexity score."""
634
+
635
+ # Simple complexity based on lines and control structures
636
+ lines = content.split('\n')
637
+ complexity = len(lines) / 100.0 # Base complexity
638
+
639
+ # Add complexity for control structures
640
+ control_keywords = ['if', 'for', 'while', 'try', 'except', 'with']
641
+ for keyword in control_keywords:
642
+ complexity += content.count(keyword) * 0.1
643
+
644
+ return min(1.0, complexity)
645
+
646
+ def _assess_code_quality(self, content: str) -> float:
647
+ """Assess code quality score."""
648
+
649
+ # Simple quality assessment
650
+ quality = 0.5 # Base quality
651
+
652
+ # Boost for docstrings
653
+ if '"""' in content or "'''" in content:
654
+ quality += 0.2
655
+
656
+ # Boost for comments
657
+ comment_lines = len([line for line in content.split('\n') if line.strip().startswith('#')])
658
+ quality += min(0.2, comment_lines / 10.0)
659
+
660
+ # Penalty for very long lines
661
+ long_lines = len([line for line in content.split('\n') if len(line) > 100])
662
+ quality -= min(0.2, long_lines / 10.0)
663
+
664
+ return max(0.0, min(1.0, quality))
665
+
666
+ def _detect_code_purpose(self, element_name: str, content: str) -> str:
667
+ """Detect purpose of code element."""
668
+
669
+ name_lower = element_name.lower()
670
+
671
+ if name_lower.startswith('test_'):
672
+ return 'test'
673
+ elif name_lower.startswith('_'):
674
+ return 'private_method'
675
+ elif 'config' in name_lower:
676
+ return 'configuration'
677
+ elif 'init' in name_lower:
678
+ return 'initialization'
679
+ elif 'main' in name_lower:
680
+ return 'main_function'
681
+ else:
682
+ return 'implementation'
683
+
684
+ def _find_python_block_end(self, node: ast.AST, lines: List[str]) -> int:
685
+ """Find end line of Python code block."""
686
+
687
+ # Start from the node's end line
688
+ start_line = getattr(node, 'end_lineno', node.lineno) or node.lineno
689
+
690
+ # Look for the actual end by checking indentation
691
+ for i in range(start_line, len(lines)):
692
+ line = lines[i]
693
+ if line.strip() and not line.startswith(' ') and not line.startswith('\t'):
694
+ return i
695
+
696
+ return len(lines)
697
+
698
def _extract_python_imports(
    self,
    tree: ast.AST,
    lines: List[str],
    item: ArchiveItem,
    chunk_index: int
) -> Optional[ChunkData]:
    """
    Extract imports as a separate chunk.

    Every ``import`` / ``from ... import`` node found by ``ast.walk`` is
    considered. The chunk is the contiguous slice of ``lines`` from the
    first import's first line through the last import's last line, so any
    other lines inside that span are included as well.

    Args:
        tree: Parsed module to search.
        lines: Source of the module split into lines.
        item: Item the source belongs to; forwarded to the context builder.
        chunk_index: Index assigned to the resulting chunk.

    Returns:
        A ``ChunkData`` of type ``ChunkType.METADATA``, or None when the
        tree contains no import statements.
    """

    first_line: Optional[int] = None
    end_line = 0

    for node in ast.walk(tree):
        if not isinstance(node, (ast.Import, ast.ImportFrom)):
            continue

        start = node.lineno - 1
        # end_lineno is 1-based and inclusive, so it serves directly as a
        # 0-based exclusive slice bound. Using it keeps multi-line
        # (parenthesised) imports whole instead of cutting them after their
        # first line. Falls back to lineno when end_lineno is unavailable.
        stop = getattr(node, 'end_lineno', None) or node.lineno

        first_line = start if first_line is None else min(first_line, start)
        end_line = max(end_line, stop)

    if first_line is None:
        return None

    # Get all import lines
    import_content = '\n'.join(lines[first_line:end_line])

    context = self._build_code_chunk_context(
        item, chunk_index, import_content,
        first_line, end_line,
        {'element_name': 'imports', 'element_type': 'imports', 'purpose': 'imports'}
    )

    return ChunkData(
        content=import_content,
        chunk_index=chunk_index,
        chunk_type=ChunkType.METADATA,
        context_metadata=context
    )
731
+
732
+ def _extract_python_remaining_code(
733
+ self,
734
+ tree: ast.AST,
735
+ lines: List[str],
736
+ item: ArchiveItem,
737
+ chunk_index: int
738
+ ) -> Optional[ChunkData]:
739
+ """Extract remaining module-level code."""
740
+
741
+ # This is a simplified implementation
742
+ # In practice, you'd want to identify module-level statements
743
+ # that aren't part of classes or functions
744
+
745
+ return None # Skip for now
746
+
747
+ def _chunk_generic_code(
748
+ self,
749
+ item: ArchiveItem,
750
+ chunk_size: int,
751
+ overlap: int
752
+ ) -> List[ChunkData]:
753
+ """Generic code chunking for unsupported languages."""
754
+
755
+ return self._chunk_generic_content(item, chunk_size, overlap)
756
+
757
+ def _chunk_js_code(self, item: ArchiveItem) -> List[ChunkData]:
758
+ """Chunk JavaScript/TypeScript code."""
759
+
760
+ # Simplified implementation - could be enhanced with proper JS parsing
761
+ return self._chunk_generic_content(item, self.chunk_size, self.overlap)
762
+
763
+ def _chunk_yaml_content(self, item: ArchiveItem) -> List[ChunkData]:
764
+ """Chunk YAML content."""
765
+
766
+ # Simplified implementation - could be enhanced with YAML parsing
767
+ return self._chunk_generic_content(item, self.chunk_size, self.overlap)
768
+
769
+
770
class ChunkContextBuilder:
    """Groups the helper that assembles chunk context metadata."""

    @staticmethod
    def build_context(
        archive_info: Dict[str, Any],
        item_info: Dict[str, Any],
        position_info: Dict[str, Any],
        structure_info: Dict[str, Any],
        semantic_info: Dict[str, Any],
        processing_info: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Combine the six prepared sections into one context dict.

        Each argument is stored, without copying, under the key that
        matches its parameter name.

        Returns:
            Dict with the keys ``archive_info``, ``item_info``,
            ``position_info``, ``structure_info``, ``semantic_info`` and
            ``processing_info``, in that order.
        """
        return dict(
            archive_info=archive_info,
            item_info=item_info,
            position_info=position_info,
            structure_info=structure_info,
            semantic_info=semantic_info,
            processing_info=processing_info,
        )