django-cfg 1.1.82__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (244) hide show
  1. django_cfg/__init__.py +20 -448
  2. django_cfg/apps/accounts/README.md +3 -3
  3. django_cfg/apps/accounts/admin/__init__.py +0 -2
  4. django_cfg/apps/accounts/admin/activity.py +2 -9
  5. django_cfg/apps/accounts/admin/filters.py +0 -42
  6. django_cfg/apps/accounts/admin/inlines.py +8 -8
  7. django_cfg/apps/accounts/admin/otp.py +5 -5
  8. django_cfg/apps/accounts/admin/registration_source.py +1 -8
  9. django_cfg/apps/accounts/admin/user.py +12 -20
  10. django_cfg/apps/accounts/managers/user_manager.py +2 -129
  11. django_cfg/apps/accounts/migrations/0006_remove_twilioresponse_otp_secret_and_more.py +46 -0
  12. django_cfg/apps/accounts/models.py +3 -123
  13. django_cfg/apps/accounts/serializers/otp.py +40 -44
  14. django_cfg/apps/accounts/serializers/profile.py +0 -2
  15. django_cfg/apps/accounts/services/otp_service.py +98 -186
  16. django_cfg/apps/accounts/signals.py +25 -15
  17. django_cfg/apps/accounts/utils/auth_email_service.py +84 -0
  18. django_cfg/apps/accounts/views/otp.py +35 -36
  19. django_cfg/apps/agents/README.md +129 -0
  20. django_cfg/apps/agents/__init__.py +68 -0
  21. django_cfg/apps/agents/admin/__init__.py +17 -0
  22. django_cfg/apps/agents/admin/execution_admin.py +460 -0
  23. django_cfg/apps/agents/admin/registry_admin.py +360 -0
  24. django_cfg/apps/agents/admin/toolsets_admin.py +482 -0
  25. django_cfg/apps/agents/apps.py +29 -0
  26. django_cfg/apps/agents/core/__init__.py +20 -0
  27. django_cfg/apps/agents/core/agent.py +281 -0
  28. django_cfg/apps/agents/core/dependencies.py +154 -0
  29. django_cfg/apps/agents/core/exceptions.py +66 -0
  30. django_cfg/apps/agents/core/models.py +106 -0
  31. django_cfg/apps/agents/core/orchestrator.py +391 -0
  32. django_cfg/apps/agents/examples/__init__.py +3 -0
  33. django_cfg/apps/agents/examples/simple_example.py +161 -0
  34. django_cfg/apps/agents/integration/__init__.py +14 -0
  35. django_cfg/apps/agents/integration/middleware.py +80 -0
  36. django_cfg/apps/agents/integration/registry.py +345 -0
  37. django_cfg/apps/agents/integration/signals.py +50 -0
  38. django_cfg/apps/agents/management/__init__.py +3 -0
  39. django_cfg/apps/agents/management/commands/__init__.py +3 -0
  40. django_cfg/apps/agents/management/commands/create_agent.py +365 -0
  41. django_cfg/apps/agents/management/commands/orchestrator_status.py +191 -0
  42. django_cfg/apps/agents/managers/__init__.py +23 -0
  43. django_cfg/apps/agents/managers/execution.py +236 -0
  44. django_cfg/apps/agents/managers/registry.py +254 -0
  45. django_cfg/apps/agents/managers/toolsets.py +496 -0
  46. django_cfg/apps/agents/migrations/0001_initial.py +286 -0
  47. django_cfg/apps/agents/migrations/__init__.py +5 -0
  48. django_cfg/apps/agents/models/__init__.py +15 -0
  49. django_cfg/apps/agents/models/execution.py +215 -0
  50. django_cfg/apps/agents/models/registry.py +220 -0
  51. django_cfg/apps/agents/models/toolsets.py +305 -0
  52. django_cfg/apps/agents/patterns/__init__.py +24 -0
  53. django_cfg/apps/agents/patterns/content_agents.py +234 -0
  54. django_cfg/apps/agents/toolsets/__init__.py +15 -0
  55. django_cfg/apps/agents/toolsets/cache_toolset.py +285 -0
  56. django_cfg/apps/agents/toolsets/django_toolset.py +220 -0
  57. django_cfg/apps/agents/toolsets/file_toolset.py +324 -0
  58. django_cfg/apps/agents/toolsets/orm_toolset.py +319 -0
  59. django_cfg/apps/agents/urls.py +46 -0
  60. django_cfg/apps/knowbase/README.md +150 -0
  61. django_cfg/apps/knowbase/__init__.py +27 -0
  62. django_cfg/apps/knowbase/admin/__init__.py +23 -0
  63. django_cfg/apps/knowbase/admin/archive_admin.py +857 -0
  64. django_cfg/apps/knowbase/admin/chat_admin.py +386 -0
  65. django_cfg/apps/knowbase/admin/document_admin.py +650 -0
  66. django_cfg/apps/knowbase/admin/external_data_admin.py +685 -0
  67. django_cfg/apps/knowbase/apps.py +81 -0
  68. django_cfg/apps/knowbase/config/README.md +176 -0
  69. django_cfg/apps/knowbase/config/__init__.py +51 -0
  70. django_cfg/apps/knowbase/config/constance_fields.py +186 -0
  71. django_cfg/apps/knowbase/config/constance_settings.py +200 -0
  72. django_cfg/apps/knowbase/config/settings.py +450 -0
  73. django_cfg/apps/knowbase/examples/__init__.py +3 -0
  74. django_cfg/apps/knowbase/examples/external_data_usage.py +191 -0
  75. django_cfg/apps/knowbase/management/__init__.py +0 -0
  76. django_cfg/apps/knowbase/management/commands/__init__.py +0 -0
  77. django_cfg/apps/knowbase/management/commands/knowbase_stats.py +158 -0
  78. django_cfg/apps/knowbase/management/commands/setup_knowbase.py +59 -0
  79. django_cfg/apps/knowbase/managers/__init__.py +22 -0
  80. django_cfg/apps/knowbase/managers/archive.py +426 -0
  81. django_cfg/apps/knowbase/managers/base.py +32 -0
  82. django_cfg/apps/knowbase/managers/chat.py +141 -0
  83. django_cfg/apps/knowbase/managers/document.py +203 -0
  84. django_cfg/apps/knowbase/managers/external_data.py +471 -0
  85. django_cfg/apps/knowbase/migrations/0001_initial.py +427 -0
  86. django_cfg/apps/knowbase/migrations/0002_archiveitem_archiveitemchunk_documentarchive_and_more.py +434 -0
  87. django_cfg/apps/knowbase/migrations/__init__.py +5 -0
  88. django_cfg/apps/knowbase/mixins/__init__.py +15 -0
  89. django_cfg/apps/knowbase/mixins/config.py +108 -0
  90. django_cfg/apps/knowbase/mixins/creator.py +81 -0
  91. django_cfg/apps/knowbase/mixins/examples/vehicle_model_example.py +199 -0
  92. django_cfg/apps/knowbase/mixins/external_data_mixin.py +813 -0
  93. django_cfg/apps/knowbase/mixins/service.py +362 -0
  94. django_cfg/apps/knowbase/models/__init__.py +41 -0
  95. django_cfg/apps/knowbase/models/archive.py +599 -0
  96. django_cfg/apps/knowbase/models/base.py +58 -0
  97. django_cfg/apps/knowbase/models/chat.py +157 -0
  98. django_cfg/apps/knowbase/models/document.py +267 -0
  99. django_cfg/apps/knowbase/models/external_data.py +376 -0
  100. django_cfg/apps/knowbase/serializers/__init__.py +68 -0
  101. django_cfg/apps/knowbase/serializers/archive_serializers.py +386 -0
  102. django_cfg/apps/knowbase/serializers/chat_serializers.py +137 -0
  103. django_cfg/apps/knowbase/serializers/document_serializers.py +94 -0
  104. django_cfg/apps/knowbase/serializers/external_data_serializers.py +256 -0
  105. django_cfg/apps/knowbase/serializers/public_serializers.py +74 -0
  106. django_cfg/apps/knowbase/services/__init__.py +40 -0
  107. django_cfg/apps/knowbase/services/archive/__init__.py +42 -0
  108. django_cfg/apps/knowbase/services/archive/archive_service.py +541 -0
  109. django_cfg/apps/knowbase/services/archive/chunking_service.py +791 -0
  110. django_cfg/apps/knowbase/services/archive/exceptions.py +52 -0
  111. django_cfg/apps/knowbase/services/archive/extraction_service.py +508 -0
  112. django_cfg/apps/knowbase/services/archive/vectorization_service.py +362 -0
  113. django_cfg/apps/knowbase/services/base.py +53 -0
  114. django_cfg/apps/knowbase/services/chat_service.py +239 -0
  115. django_cfg/apps/knowbase/services/document_service.py +144 -0
  116. django_cfg/apps/knowbase/services/embedding/__init__.py +43 -0
  117. django_cfg/apps/knowbase/services/embedding/async_processor.py +244 -0
  118. django_cfg/apps/knowbase/services/embedding/batch_processor.py +250 -0
  119. django_cfg/apps/knowbase/services/embedding/batch_result.py +61 -0
  120. django_cfg/apps/knowbase/services/embedding/models.py +229 -0
  121. django_cfg/apps/knowbase/services/embedding/processors.py +148 -0
  122. django_cfg/apps/knowbase/services/embedding/utils.py +176 -0
  123. django_cfg/apps/knowbase/services/prompt_builder.py +191 -0
  124. django_cfg/apps/knowbase/services/search_service.py +293 -0
  125. django_cfg/apps/knowbase/signals/__init__.py +21 -0
  126. django_cfg/apps/knowbase/signals/archive_signals.py +211 -0
  127. django_cfg/apps/knowbase/signals/chat_signals.py +37 -0
  128. django_cfg/apps/knowbase/signals/document_signals.py +143 -0
  129. django_cfg/apps/knowbase/signals/external_data_signals.py +157 -0
  130. django_cfg/apps/knowbase/tasks/__init__.py +39 -0
  131. django_cfg/apps/knowbase/tasks/archive_tasks.py +316 -0
  132. django_cfg/apps/knowbase/tasks/document_processing.py +341 -0
  133. django_cfg/apps/knowbase/tasks/external_data_tasks.py +341 -0
  134. django_cfg/apps/knowbase/tasks/maintenance.py +195 -0
  135. django_cfg/apps/knowbase/urls.py +43 -0
  136. django_cfg/apps/knowbase/utils/__init__.py +12 -0
  137. django_cfg/apps/knowbase/utils/chunk_settings.py +261 -0
  138. django_cfg/apps/knowbase/utils/text_processing.py +375 -0
  139. django_cfg/apps/knowbase/utils/validation.py +99 -0
  140. django_cfg/apps/knowbase/views/__init__.py +28 -0
  141. django_cfg/apps/knowbase/views/archive_views.py +469 -0
  142. django_cfg/apps/knowbase/views/base.py +49 -0
  143. django_cfg/apps/knowbase/views/chat_views.py +181 -0
  144. django_cfg/apps/knowbase/views/document_views.py +183 -0
  145. django_cfg/apps/knowbase/views/public_views.py +129 -0
  146. django_cfg/apps/leads/admin.py +70 -0
  147. django_cfg/apps/newsletter/admin.py +234 -0
  148. django_cfg/apps/newsletter/admin_filters.py +124 -0
  149. django_cfg/apps/support/admin.py +196 -0
  150. django_cfg/apps/support/admin_filters.py +71 -0
  151. django_cfg/apps/support/templates/support/chat/ticket_chat.html +1 -1
  152. django_cfg/apps/urls.py +5 -4
  153. django_cfg/cli/README.md +1 -1
  154. django_cfg/cli/commands/create_project.py +2 -2
  155. django_cfg/cli/commands/info.py +1 -1
  156. django_cfg/config.py +44 -0
  157. django_cfg/core/config.py +29 -82
  158. django_cfg/core/environment.py +1 -1
  159. django_cfg/core/generation.py +19 -107
  160. django_cfg/{integration.py → core/integration.py} +18 -16
  161. django_cfg/core/validation.py +1 -1
  162. django_cfg/management/__init__.py +1 -1
  163. django_cfg/management/commands/__init__.py +1 -1
  164. django_cfg/management/commands/auto_generate.py +482 -0
  165. django_cfg/management/commands/migrator.py +19 -101
  166. django_cfg/management/commands/test_email.py +1 -1
  167. django_cfg/middleware/README.md +0 -158
  168. django_cfg/middleware/__init__.py +0 -2
  169. django_cfg/middleware/user_activity.py +3 -3
  170. django_cfg/models/api.py +145 -0
  171. django_cfg/models/base.py +287 -0
  172. django_cfg/models/cache.py +4 -4
  173. django_cfg/models/constance.py +25 -88
  174. django_cfg/models/database.py +9 -9
  175. django_cfg/models/drf.py +3 -36
  176. django_cfg/models/email.py +163 -0
  177. django_cfg/models/environment.py +276 -0
  178. django_cfg/models/limits.py +1 -1
  179. django_cfg/models/logging.py +366 -0
  180. django_cfg/models/revolution.py +41 -2
  181. django_cfg/models/security.py +125 -0
  182. django_cfg/models/services.py +1 -1
  183. django_cfg/modules/__init__.py +2 -56
  184. django_cfg/modules/base.py +78 -52
  185. django_cfg/modules/django_currency/service.py +2 -2
  186. django_cfg/modules/django_email.py +2 -2
  187. django_cfg/modules/django_health.py +267 -0
  188. django_cfg/modules/django_llm/llm/client.py +91 -19
  189. django_cfg/modules/django_llm/translator/translator.py +2 -2
  190. django_cfg/modules/django_logger.py +2 -2
  191. django_cfg/modules/django_ngrok.py +2 -2
  192. django_cfg/modules/django_tasks.py +68 -3
  193. django_cfg/modules/django_telegram.py +3 -3
  194. django_cfg/modules/django_twilio/sendgrid_service.py +2 -2
  195. django_cfg/modules/django_twilio/service.py +2 -2
  196. django_cfg/modules/django_twilio/simple_service.py +2 -2
  197. django_cfg/modules/django_twilio/twilio_service.py +2 -2
  198. django_cfg/modules/django_unfold/__init__.py +69 -0
  199. django_cfg/modules/{unfold → django_unfold}/callbacks.py +23 -22
  200. django_cfg/modules/django_unfold/dashboard.py +278 -0
  201. django_cfg/modules/django_unfold/icons/README.md +145 -0
  202. django_cfg/modules/django_unfold/icons/__init__.py +12 -0
  203. django_cfg/modules/django_unfold/icons/constants.py +2851 -0
  204. django_cfg/modules/django_unfold/icons/generate_icons.py +486 -0
  205. django_cfg/modules/django_unfold/models/__init__.py +42 -0
  206. django_cfg/modules/django_unfold/models/config.py +601 -0
  207. django_cfg/modules/django_unfold/models/dashboard.py +206 -0
  208. django_cfg/modules/django_unfold/models/dropdown.py +40 -0
  209. django_cfg/modules/django_unfold/models/navigation.py +73 -0
  210. django_cfg/modules/django_unfold/models/tabs.py +25 -0
  211. django_cfg/modules/{unfold → django_unfold}/system_monitor.py +2 -2
  212. django_cfg/modules/django_unfold/utils.py +140 -0
  213. django_cfg/registry/__init__.py +23 -0
  214. django_cfg/registry/core.py +61 -0
  215. django_cfg/registry/exceptions.py +11 -0
  216. django_cfg/registry/modules.py +12 -0
  217. django_cfg/registry/services.py +26 -0
  218. django_cfg/registry/third_party.py +52 -0
  219. django_cfg/routing/__init__.py +19 -0
  220. django_cfg/routing/callbacks.py +198 -0
  221. django_cfg/routing/routers.py +48 -0
  222. django_cfg/templates/admin/layouts/dashboard_with_tabs.html +8 -9
  223. django_cfg/templatetags/__init__.py +0 -0
  224. django_cfg/templatetags/django_cfg.py +33 -0
  225. django_cfg/urls.py +33 -0
  226. django_cfg/utils/path_resolution.py +1 -1
  227. django_cfg/utils/smart_defaults.py +7 -61
  228. django_cfg/utils/toolkit.py +663 -0
  229. {django_cfg-1.1.82.dist-info → django_cfg-1.2.1.dist-info}/METADATA +83 -86
  230. django_cfg-1.2.1.dist-info/RECORD +441 -0
  231. django_cfg/archive/django_sample.zip +0 -0
  232. django_cfg/models/unfold.py +0 -271
  233. django_cfg/modules/unfold/__init__.py +0 -29
  234. django_cfg/modules/unfold/dashboard.py +0 -318
  235. django_cfg/pyproject.toml +0 -370
  236. django_cfg/routers.py +0 -83
  237. django_cfg-1.1.82.dist-info/RECORD +0 -278
  238. /django_cfg/{exceptions.py → core/exceptions.py} +0 -0
  239. /django_cfg/modules/{unfold → django_unfold}/models.py +0 -0
  240. /django_cfg/modules/{unfold → django_unfold}/tailwind.py +0 -0
  241. /django_cfg/{version_check.py → utils/version_check.py} +0 -0
  242. {django_cfg-1.1.82.dist-info → django_cfg-1.2.1.dist-info}/WHEEL +0 -0
  243. {django_cfg-1.1.82.dist-info → django_cfg-1.2.1.dist-info}/entry_points.txt +0 -0
  244. {django_cfg-1.1.82.dist-info → django_cfg-1.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,52 @@
1
+ """
2
+ Archive processing exceptions.
3
+
4
+ Custom exception hierarchy for archive processing operations.
5
+ """
6
+
7
+ from typing import Optional, Dict, Any
8
+
9
+
10
class ArchiveProcessingError(Exception):
    """Base exception for archive processing errors.

    Attributes:
        message: Human-readable description of the failure.
        code: Machine-readable error identifier (for example
            ``"EXTRACTION_FAILED"``).
        details: Structured context for the failure. Always a dict; an
            omitted or empty ``details`` argument becomes ``{}``.
    """

    def __init__(
        self,
        message: str,
        code: str,
        details: Optional[Dict[str, Any]] = None
    ):
        self.message = message
        self.code = code
        self.details = details or {}
        super().__init__(message)

    def __reduce__(self):
        """Support pickling/copying despite the extra required ``code`` argument.

        The default exception reduction replays ``self.args`` (only the
        message here), which would call ``cls(message)`` and fail with a
        ``TypeError`` because ``code`` is required. Rebuild from the full
        constructor arguments instead and carry the instance dict as state.
        """
        return (
            self.__class__,
            (self.message, self.code, self.details),
            dict(self.__dict__),
        )


class ArchiveValidationError(ArchiveProcessingError):
    """Archive validation errors."""


class ExtractionError(ArchiveProcessingError):
    """Archive extraction errors."""


class ChunkingError(ArchiveProcessingError):
    """Content chunking errors."""


class VectorizationError(ArchiveProcessingError):
    """Vectorization processing errors."""


class ContentTypeDetectionError(ArchiveProcessingError):
    """Content type detection errors."""


class ProcessingTimeoutError(ArchiveProcessingError):
    """Processing timeout errors."""
@@ -0,0 +1,508 @@
1
+ """
2
+ Archive extraction services.
3
+
4
+ Handles extraction of different archive formats and content processing.
5
+ """
6
+
7
+ import os
8
+ import zipfile
9
+ import tarfile
10
+ import tempfile
11
+ import shutil
12
+ import hashlib
13
+ import mimetypes
14
+ from pathlib import Path
15
+ from typing import List, Dict, Any, Optional, Set
16
+ from pydantic import BaseModel
17
+
18
+ from ...models.archive import ArchiveType, ContentType
19
+ from .exceptions import ExtractionError, ContentTypeDetectionError
20
+
21
+
22
class ExtractedItemData(BaseModel):
    """Data structure for extracted archive item.

    One instance is built per file by
    ``ArchiveExtractionService._process_extracted_file``.
    """

    # Path of the item as listed by the archive (not the on-disk temp path).
    relative_path: str
    # Base name of ``relative_path``.
    item_name: str
    # Size in bytes, taken from ``os.stat`` on the extracted file.
    file_size: int
    # Decoded text; stays ``None`` for non-processable files or when text
    # extraction fails.
    content: Optional[str] = None
    # SHA-256 hex digest (of the text when available, otherwise of the file
    # bytes).
    content_hash: str
    # True when the file is text and its content type is document/code/data.
    is_processable: bool
    # Content type value produced by ``_detect_content_type``.
    content_type: str
    # Language name derived from the file extension; ``None`` when not detected.
    language: Optional[str] = None
    # Extra info: ``mime_type``, ``is_text_file``, ``extraction_method``,
    # ``file_extension``.
    metadata: Dict[str, Any]
34
+
35
+
36
class ArchiveExtractionService:
    """Service for extracting archives and processing content.

    Archives are unpacked into a throw-away temporary directory, every
    extracted file is classified (content type, text/binary, language) and
    returned as ``ExtractedItemData``. Archive contents are treated as
    untrusted: entry names and link targets are validated before extraction
    and every file is confirmed to live inside the extraction directory
    before it is read.
    """

    # File size limits
    MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB per file
    MAX_TOTAL_UNCOMPRESSED = 1024 * 1024 * 1024  # 1GB per archive
    MAX_COMPRESSION_RATIO = 100  # uncompressed/compressed ratio treated as a bomb
    MAX_TEXT_LENGTH = 1024 * 1024  # decoded text must be shorter than this
    MAX_PATH_DEPTH = 10  # more '/' separators than this => entry is skipped
    TEXT_SAMPLE_SIZE = 1024  # bytes sniffed to decide text vs. binary

    # Text file extensions
    TEXT_EXTENSIONS: Set[str] = {
        '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.go', '.rs',
        '.cpp', '.c', '.h', '.hpp', '.php', '.rb', '.cs', '.swift',
        '.kt', '.scala', '.clj', '.hs', '.ml', '.fs', '.elm',
        '.md', '.txt', '.rst', '.adoc',
        '.yml', '.yaml', '.json', '.toml', '.ini', '.cfg', '.conf',
        '.xml', '.html', '.css', '.scss', '.less',
        '.sql', '.sh', '.bash', '.zsh', '.fish',
        '.dockerfile', '.makefile', '.gitignore', '.env',
        '.tf', '.hcl'
    }

    # Extension groups used for content type detection
    CODE_EXTENSIONS = frozenset({
        '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.go', '.rs',
        '.cpp', '.c', '.h', '.hpp', '.php', '.rb', '.cs', '.swift',
        '.kt', '.scala', '.clj', '.hs', '.ml', '.fs', '.elm'
    })
    DOCUMENT_EXTENSIONS = frozenset({
        '.md', '.txt', '.rst', '.adoc', '.pdf', '.docx', '.doc'
    })
    DATA_EXTENSIONS = frozenset({
        '.json', '.csv', '.xml', '.yml', '.yaml', '.toml', '.ini'
    })
    IMAGE_EXTENSIONS = frozenset({
        '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp'
    })
    ARCHIVE_EXTENSIONS = frozenset({
        '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar'
    })

    # Extension-less (or dot-only) names that are known to be text
    SPECIAL_TEXT_NAMES = frozenset({
        'dockerfile', 'makefile', 'readme', 'license', 'changelog',
        '.gitignore', '.dockerignore', '.env', '.settings.example'
    })

    # Skip rules (all compared against the lower-cased path)
    SKIP_DIRECTORIES = frozenset({
        '__pycache__', 'node_modules', 'dist', 'build', 'target',
        '.git', '.svn', '.hg', '.vscode', '.idea', '.eclipse'
    })
    SKIP_EXTENSIONS = (
        '.pyc', '.pyo', '.tmp', '.temp', '.swp', '.bak',
        '.exe', '.dll', '.so', '.dylib', '.jar', '.war', '.ear', '.iso', '.dmg'
    )
    SKIP_FILENAMES = frozenset({'.ds_store', 'thumbs.db'})

    # File extension -> language name
    LANGUAGE_BY_EXTENSION = {
        '.py': 'python',
        '.js': 'javascript',
        '.ts': 'typescript',
        '.jsx': 'react',
        '.tsx': 'react-typescript',
        '.java': 'java',
        '.go': 'golang',
        '.rs': 'rust',
        '.cpp': 'cpp',
        '.hpp': 'cpp',
        '.c': 'c',
        '.h': 'c',
        '.php': 'php',
        '.rb': 'ruby',
        '.cs': 'csharp',
        '.swift': 'swift',
        '.kt': 'kotlin',
        '.scala': 'scala',
        '.clj': 'clojure',
        '.hs': 'haskell',
        '.ml': 'ocaml',
        '.fs': 'fsharp',
        '.elm': 'elm',
    }

    def extract_archive(
        self,
        archive_path: str,
        archive_type: str
    ) -> "List[ExtractedItemData]":
        """Extract archive and return processed item data.

        Args:
            archive_path: Path of the archive file on disk.
            archive_type: One of the ``ArchiveType`` values.

        Returns:
            One ``ExtractedItemData`` per file that was neither skipped nor
            failed processing. Directories are never returned.

        Raises:
            ExtractionError: If the archive type is unsupported, the archive
                fails a safety check, or extraction itself fails.
        """
        extract_dir = tempfile.mkdtemp(prefix='extracted_')

        try:
            # Extract based on type
            file_list = self._extract_by_type(archive_path, archive_type, extract_dir)

            extracted_items = []

            for relative_path in file_list:
                full_path = os.path.join(extract_dir, relative_path)

                # Archive entry names are untrusted: an absolute or '..' name
                # (or a symlink) must never make us read a file that lives
                # outside the extraction directory.
                if not self._is_within_directory(extract_dir, full_path):
                    continue

                # Skip directories
                if os.path.isdir(full_path):
                    continue

                # Skip unwanted files
                if self._should_skip_file(relative_path):
                    continue

                try:
                    item_data = self._process_extracted_file(full_path, relative_path)
                except Exception:
                    # Best effort: one bad file must not abort the archive
                    continue

                if item_data:
                    extracted_items.append(item_data)

            return extracted_items

        finally:
            # Always cleanup extraction directory; a cleanup failure must not
            # mask the real result or exception.
            shutil.rmtree(extract_dir, ignore_errors=True)

    def _extract_by_type(
        self,
        archive_path: str,
        archive_type: str,
        extract_dir: str
    ) -> List[str]:
        """Extract archive based on its type.

        Returns:
            The entry names reported by the archive.

        Raises:
            ExtractionError: ``UNSUPPORTED_ARCHIVE_TYPE`` for unknown types,
                the specific code from a failed safety check, or
                ``EXTRACTION_FAILED`` wrapping any other error.
        """
        try:
            if archive_type == ArchiveType.ZIP:
                return self._extract_zip(archive_path, extract_dir)
            if archive_type in [ArchiveType.TAR, ArchiveType.TAR_GZ, ArchiveType.TAR_BZ2]:
                return self._extract_tar(archive_path, archive_type, extract_dir)

            raise ExtractionError(
                message=f"Unsupported archive type: {archive_type}",
                code="UNSUPPORTED_ARCHIVE_TYPE",
                details={"archive_type": archive_type}
            )
        except ExtractionError:
            # Already carries a precise code; do not re-wrap
            raise
        except Exception as e:
            raise ExtractionError(
                message=f"Failed to extract archive: {str(e)}",
                code="EXTRACTION_FAILED",
                details={"archive_path": archive_path, "error": str(e)}
            ) from e

    def _extract_zip(self, archive_path: str, extract_dir: str) -> List[str]:
        """Extract ZIP archive after bomb and entry-name checks.

        Raises:
            ExtractionError: ``ZIP_BOMB_DETECTED`` /
                ``ARCHIVE_TOO_LARGE_UNCOMPRESSED`` from the bomb check, or
                ``PATH_TRAVERSAL_DETECTED`` for an absolute or '..' entry name.
        """
        with zipfile.ZipFile(archive_path, 'r') as zip_file:
            # Check for zip bomb
            self._check_zip_bomb(zip_file)

            # namelist() returns the raw stored names, which callers join
            # onto the extraction directory, so reject unsafe names up front.
            for name in zip_file.namelist():
                if self._has_unsafe_path(name):
                    raise ExtractionError(
                        message="Path traversal attempt detected",
                        code="PATH_TRAVERSAL_DETECTED",
                        details={"member_name": name}
                    )

            zip_file.extractall(extract_dir)
            return zip_file.namelist()

    def _extract_tar(
        self,
        archive_path: str,
        archive_type: str,
        extract_dir: str
    ) -> List[str]:
        """Extract TAR archive (including compressed variants)."""
        mode_map = {
            ArchiveType.TAR: 'r',
            ArchiveType.TAR_GZ: 'r:gz',
            ArchiveType.TAR_BZ2: 'r:bz2'
        }

        with tarfile.open(archive_path, mode_map[archive_type]) as tar_file:
            # Security check for path traversal
            self._check_tar_security(tar_file)

            # Use the stdlib 'data' extraction filter as a second line of
            # defence when this Python version provides extraction filters.
            if hasattr(tarfile, 'data_filter'):
                tar_file.extractall(extract_dir, filter='data')
            else:
                tar_file.extractall(extract_dir)
            return tar_file.getnames()

    def _check_zip_bomb(self, zip_file: zipfile.ZipFile) -> None:
        """Check for zip bomb attacks.

        Raises:
            ExtractionError: ``ZIP_BOMB_DETECTED`` when the overall
                compression ratio exceeds ``MAX_COMPRESSION_RATIO``, or
                ``ARCHIVE_TOO_LARGE_UNCOMPRESSED`` when the declared total
                size exceeds ``MAX_TOTAL_UNCOMPRESSED``.
        """
        entries = zip_file.infolist()
        total_uncompressed = sum(info.file_size for info in entries)
        total_compressed = sum(info.compress_size for info in entries)

        # Check compression ratio
        if total_compressed > 0:
            ratio = total_uncompressed / total_compressed
            if ratio > self.MAX_COMPRESSION_RATIO:
                raise ExtractionError(
                    message="Suspicious compression ratio detected",
                    code="ZIP_BOMB_DETECTED",
                    details={
                        "compression_ratio": ratio,
                        "uncompressed_size": total_uncompressed
                    }
                )

        # Check total uncompressed size
        if total_uncompressed > self.MAX_TOTAL_UNCOMPRESSED:
            raise ExtractionError(
                message="Archive too large when uncompressed",
                code="ARCHIVE_TOO_LARGE_UNCOMPRESSED",
                details={"uncompressed_size": total_uncompressed}
            )

    def _check_tar_security(self, tar_file: tarfile.TarFile) -> None:
        """Check TAR file for security issues.

        Raises:
            ExtractionError: ``PATH_TRAVERSAL_DETECTED`` for an absolute or
                '..' member name or a link whose target leaves the archive
                root; ``FILE_TOO_LARGE`` for a member larger than ten times
                ``MAX_FILE_SIZE``.
        """
        for member in tar_file.getmembers():
            # Check for path traversal (per path component, so a name such
            # as 'notes..final.txt' is not a false positive)
            if self._has_unsafe_path(member.name):
                raise ExtractionError(
                    message="Path traversal attempt detected",
                    code="PATH_TRAVERSAL_DETECTED",
                    details={"member_name": member.name}
                )

            # A symlink/hardlink pointing outside the archive would let a
            # later read follow it to an arbitrary host file.
            if (member.issym() or member.islnk()) and self._link_escapes_root(member):
                raise ExtractionError(
                    message="Path traversal attempt detected",
                    code="PATH_TRAVERSAL_DETECTED",
                    details={
                        "member_name": member.name,
                        "link_target": member.linkname
                    }
                )

            # Check for suspicious file sizes
            if member.size > self.MAX_FILE_SIZE * 10:  # 100MB limit per file
                raise ExtractionError(
                    message="File too large in archive",
                    code="FILE_TOO_LARGE",
                    details={
                        "file_name": member.name,
                        "file_size": member.size
                    }
                )

    @staticmethod
    def _has_unsafe_path(name: str) -> bool:
        """Return True if an archive entry name is absolute or contains a '..' component."""
        normalized = name.replace('\\', '/')
        if normalized.startswith('/') or os.path.isabs(name):
            return True
        return '..' in normalized.split('/')

    @staticmethod
    def _link_escapes_root(member: tarfile.TarInfo) -> bool:
        """Return True if a tar link member's target resolves outside the archive root."""
        target = member.linkname
        if target.startswith('/') or os.path.isabs(target):
            return True

        # Symlink targets are relative to the link's own directory; hardlink
        # targets are relative to the archive root.
        base = os.path.dirname(member.name) if member.issym() else ''
        resolved = os.path.normpath(os.path.join(base, target))
        return resolved == os.pardir or resolved.startswith(os.pardir + os.sep)

    @staticmethod
    def _is_within_directory(root: str, candidate: str) -> bool:
        """Return True if ``candidate`` resolves (symlinks included) inside ``root``."""
        root_real = os.path.realpath(root)
        candidate_real = os.path.realpath(candidate)
        try:
            return os.path.commonpath([root_real, candidate_real]) == root_real
        except ValueError:
            # Raised e.g. for paths on different drives
            return False

    def _should_skip_file(self, relative_path: str) -> bool:
        """Check if file should be skipped during processing.

        A path is skipped when any component is hidden (starts with a dot),
        any component is a known build/VCS directory, the name has an
        unwanted extension, the file name is a known junk file, or the path
        is nested deeper than ``MAX_PATH_DEPTH`` separators.
        """
        # Normalize path for consistent checking
        lowered = relative_path.lower()
        parts = lowered.split('/')

        # Check for hidden files (starting with dot)
        if any(part.startswith('.') and part not in ('.', '..') for part in parts):
            return True

        # Check for system/build directories
        if any(part in self.SKIP_DIRECTORIES for part in parts):
            return True

        # Check file extensions
        if lowered.endswith(self.SKIP_EXTENSIONS):
            return True

        # Check specific filenames
        if parts[-1] in self.SKIP_FILENAMES:
            return True

        # Skip very deep paths (potential zip bomb)
        return relative_path.count('/') > self.MAX_PATH_DEPTH

    def _process_extracted_file(
        self,
        full_path: str,
        relative_path: str
    ) -> "Optional[ExtractedItemData]":
        """Process individual extracted file.

        Returns:
            The item data, or ``None`` when the file is larger than
            ``MAX_FILE_SIZE`` or any step fails (best effort per file).
        """
        try:
            file_size = os.stat(full_path).st_size

            # Skip very large files
            if file_size > self.MAX_FILE_SIZE:
                return None

            item_name = os.path.basename(relative_path)

            # Detect content type and processability
            content_type = self._detect_content_type(item_name, full_path)
            is_text_file = self._is_text_file(item_name, full_path)
            is_processable = is_text_file and content_type in [
                ContentType.DOCUMENT,
                ContentType.CODE,
                ContentType.DATA
            ]

            # Extract content for processable files
            content = self._extract_text_content(full_path) if is_processable else None

            return ExtractedItemData(
                relative_path=relative_path,
                item_name=item_name,
                file_size=file_size,
                content=content,
                content_hash=self._generate_content_hash(full_path, content),
                is_processable=is_processable,
                content_type=content_type,
                language=self._detect_language(item_name, content_type),
                metadata={
                    'mime_type': mimetypes.guess_type(item_name)[0] or 'application/octet-stream',
                    'is_text_file': is_text_file,
                    'extraction_method': 'direct_read' if is_text_file else 'binary_skip',
                    'file_extension': Path(item_name).suffix.lower(),
                }
            )

        except Exception:
            # Return None for problematic files
            return None

    def _detect_content_type(self, item_name: str, full_path: str) -> str:
        """Detect content type from the file extension.

        ``full_path`` is accepted for interface compatibility; only the
        extension of ``item_name`` is inspected.
        """
        extension = Path(item_name).suffix.lower()

        if extension in self.CODE_EXTENSIONS:
            return ContentType.CODE
        if extension in self.DOCUMENT_EXTENSIONS:
            return ContentType.DOCUMENT
        if extension in self.DATA_EXTENSIONS:
            return ContentType.DATA
        if extension in self.IMAGE_EXTENSIONS:
            return ContentType.IMAGE
        if extension in self.ARCHIVE_EXTENSIONS:
            return ContentType.ARCHIVE
        return ContentType.UNKNOWN

    def _is_text_file(self, item_name: str, full_path: str) -> bool:
        """Check if file is a text file.

        Decided by extension, then by well-known file name, and finally by
        sniffing the first ``TEXT_SAMPLE_SIZE`` bytes: no NUL bytes and valid
        UTF-8.
        """
        file_path = Path(item_name)

        # Check by extension first
        if file_path.suffix.lower() in self.TEXT_EXTENSIONS:
            return True

        # Special filenames
        if file_path.name.lower() in self.SPECIAL_TEXT_NAMES:
            return True

        # Try to detect by content (sample first 1KB)
        try:
            with open(full_path, 'rb') as f:
                sample = f.read(self.TEXT_SAMPLE_SIZE)
        except OSError:
            return False

        # Check for null bytes (binary indicator)
        if b'\x00' in sample:
            return False

        # A full-size sample may end in the middle of a multi-byte UTF-8
        # character; drop that partial character so valid text is not
        # misclassified as binary.
        if len(sample) == self.TEXT_SAMPLE_SIZE:
            sample = self._strip_partial_utf8_tail(sample)

        # Try to decode as UTF-8
        try:
            sample.decode('utf-8')
        except UnicodeDecodeError:
            return False
        return True

    @staticmethod
    def _strip_partial_utf8_tail(sample: bytes) -> bytes:
        """Drop an incomplete trailing UTF-8 sequence from ``sample``, if any."""
        # An incomplete sequence is at most 3 bytes long (lead + 2 continuations)
        for back in range(1, min(3, len(sample)) + 1):
            byte = sample[-back]
            if 0x80 <= byte <= 0xBF:
                continue  # continuation byte: keep looking for its lead byte

            if 0xC2 <= byte <= 0xDF:
                needed = 2
            elif 0xE0 <= byte <= 0xEF:
                needed = 3
            elif 0xF0 <= byte <= 0xF4:
                needed = 4
            else:
                # ASCII or an invalid lead byte: nothing to trim, let the
                # strict decode decide
                return sample

            return sample[:-back] if needed > back else sample

        return sample

    def _extract_text_content(self, full_path: str) -> Optional[str]:
        """Extract text content from file.

        The encoding is chosen from the byte-order mark when present;
        otherwise UTF-8 is tried, then cp1252, then latin1 (which accepts any
        byte sequence). UTF-16/UTF-32 are only used when a BOM announces
        them, because BOM-less 8-bit data frequently decodes as UTF-16
        garbage without raising.

        Returns:
            The decoded text, or ``None`` if the file cannot be read, is
            empty, or decodes to ``MAX_TEXT_LENGTH`` characters or more.
        """
        try:
            with open(full_path, 'rb') as f:
                head = f.read(4)
        except OSError:
            return None

        if head.startswith(b'\xef\xbb\xbf'):
            encodings = ['utf-8-sig']
        elif head.startswith((b'\xff\xfe\x00\x00', b'\x00\x00\xfe\xff')):
            # Must be tested before UTF-16: the UTF-32-LE BOM begins with
            # the UTF-16-LE BOM
            encodings = ['utf-32']
        elif head.startswith((b'\xff\xfe', b'\xfe\xff')):
            encodings = ['utf-16']
        else:
            encodings = ['utf-8', 'cp1252', 'latin1']

        for encoding in encodings:
            try:
                with open(full_path, 'r', encoding=encoding) as f:
                    content = f.read()
            except UnicodeDecodeError:
                continue
            except Exception:
                return None

            # Validate content is reasonable (non-empty, max 1MB text).
            # Retrying other encodings cannot fix an empty or oversized file.
            if 0 < len(content) < self.MAX_TEXT_LENGTH:
                return content
            return None

        return None

    def _generate_content_hash(
        self,
        full_path: str,
        content: Optional[str]
    ) -> str:
        """Generate SHA-256 hash of file content.

        Hashes the decoded text when available; otherwise streams the file
        bytes. If the file cannot be read, falls back to hashing the path so
        a hash is always returned.
        """
        if content:
            return hashlib.sha256(content.encode()).hexdigest()

        # Hash binary file
        digest = hashlib.sha256()
        try:
            with open(full_path, 'rb') as f:
                for chunk in iter(lambda: f.read(4096), b""):
                    digest.update(chunk)
        except Exception:
            # Fallback to path-based hash
            return hashlib.sha256(full_path.encode()).hexdigest()
        return digest.hexdigest()

    def _detect_language(self, item_name: str, content_type: str) -> Optional[str]:
        """Detect programming language from file name or extension.

        ``Dockerfile`` and ``Makefile`` are recognised by name regardless of
        content type (they have no extension, so they are never classified
        as code by extension). Every other file gets a language only when
        its content type is code.
        """
        file_path = Path(item_name)
        name_lower = file_path.name.lower()

        # Special cases: must run before the content-type gate, otherwise
        # they can never match
        if name_lower == 'dockerfile':
            return 'dockerfile'
        if name_lower == 'makefile':
            return 'makefile'

        if content_type != ContentType.CODE:
            return None

        return self.LANGUAGE_BY_EXTENSION.get(file_path.suffix.lower())
487
+
488
+
489
class ContentExtractionService:
    """Placeholder service for format-specific content extraction.

    None of the extractors is implemented yet: every method ignores its
    argument and returns ``None``.
    """

    def extract_pdf_content(self, file_path: str) -> Optional[str]:
        """Return the text of a PDF file (not implemented; always ``None``)."""
        # TODO: Implement PDF text extraction
        # Could use PyPDF2, pdfplumber, or similar
        return None

    def extract_docx_content(self, file_path: str) -> Optional[str]:
        """Return the text of a DOCX file (not implemented; always ``None``)."""
        # TODO: Implement DOCX text extraction
        # Could use python-docx
        return None

    def extract_image_text(self, file_path: str) -> Optional[str]:
        """Return OCR text of an image (not implemented; always ``None``)."""
        # TODO: Implement OCR text extraction
        # Could use pytesseract
        return None