django-cfg 1.1.82__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- django_cfg/__init__.py +20 -448
- django_cfg/apps/accounts/README.md +3 -3
- django_cfg/apps/accounts/admin/__init__.py +0 -2
- django_cfg/apps/accounts/admin/activity.py +2 -9
- django_cfg/apps/accounts/admin/filters.py +0 -42
- django_cfg/apps/accounts/admin/inlines.py +8 -8
- django_cfg/apps/accounts/admin/otp.py +5 -5
- django_cfg/apps/accounts/admin/registration_source.py +1 -8
- django_cfg/apps/accounts/admin/user.py +12 -20
- django_cfg/apps/accounts/managers/user_manager.py +2 -129
- django_cfg/apps/accounts/migrations/0006_remove_twilioresponse_otp_secret_and_more.py +46 -0
- django_cfg/apps/accounts/models.py +3 -123
- django_cfg/apps/accounts/serializers/otp.py +40 -44
- django_cfg/apps/accounts/serializers/profile.py +0 -2
- django_cfg/apps/accounts/services/otp_service.py +98 -186
- django_cfg/apps/accounts/signals.py +25 -15
- django_cfg/apps/accounts/utils/auth_email_service.py +84 -0
- django_cfg/apps/accounts/views/otp.py +35 -36
- django_cfg/apps/agents/README.md +129 -0
- django_cfg/apps/agents/__init__.py +68 -0
- django_cfg/apps/agents/admin/__init__.py +17 -0
- django_cfg/apps/agents/admin/execution_admin.py +460 -0
- django_cfg/apps/agents/admin/registry_admin.py +360 -0
- django_cfg/apps/agents/admin/toolsets_admin.py +482 -0
- django_cfg/apps/agents/apps.py +29 -0
- django_cfg/apps/agents/core/__init__.py +20 -0
- django_cfg/apps/agents/core/agent.py +281 -0
- django_cfg/apps/agents/core/dependencies.py +154 -0
- django_cfg/apps/agents/core/exceptions.py +66 -0
- django_cfg/apps/agents/core/models.py +106 -0
- django_cfg/apps/agents/core/orchestrator.py +391 -0
- django_cfg/apps/agents/examples/__init__.py +3 -0
- django_cfg/apps/agents/examples/simple_example.py +161 -0
- django_cfg/apps/agents/integration/__init__.py +14 -0
- django_cfg/apps/agents/integration/middleware.py +80 -0
- django_cfg/apps/agents/integration/registry.py +345 -0
- django_cfg/apps/agents/integration/signals.py +50 -0
- django_cfg/apps/agents/management/__init__.py +3 -0
- django_cfg/apps/agents/management/commands/__init__.py +3 -0
- django_cfg/apps/agents/management/commands/create_agent.py +365 -0
- django_cfg/apps/agents/management/commands/orchestrator_status.py +191 -0
- django_cfg/apps/agents/managers/__init__.py +23 -0
- django_cfg/apps/agents/managers/execution.py +236 -0
- django_cfg/apps/agents/managers/registry.py +254 -0
- django_cfg/apps/agents/managers/toolsets.py +496 -0
- django_cfg/apps/agents/migrations/0001_initial.py +286 -0
- django_cfg/apps/agents/migrations/__init__.py +5 -0
- django_cfg/apps/agents/models/__init__.py +15 -0
- django_cfg/apps/agents/models/execution.py +215 -0
- django_cfg/apps/agents/models/registry.py +220 -0
- django_cfg/apps/agents/models/toolsets.py +305 -0
- django_cfg/apps/agents/patterns/__init__.py +24 -0
- django_cfg/apps/agents/patterns/content_agents.py +234 -0
- django_cfg/apps/agents/toolsets/__init__.py +15 -0
- django_cfg/apps/agents/toolsets/cache_toolset.py +285 -0
- django_cfg/apps/agents/toolsets/django_toolset.py +220 -0
- django_cfg/apps/agents/toolsets/file_toolset.py +324 -0
- django_cfg/apps/agents/toolsets/orm_toolset.py +319 -0
- django_cfg/apps/agents/urls.py +46 -0
- django_cfg/apps/knowbase/README.md +150 -0
- django_cfg/apps/knowbase/__init__.py +27 -0
- django_cfg/apps/knowbase/admin/__init__.py +23 -0
- django_cfg/apps/knowbase/admin/archive_admin.py +857 -0
- django_cfg/apps/knowbase/admin/chat_admin.py +386 -0
- django_cfg/apps/knowbase/admin/document_admin.py +650 -0
- django_cfg/apps/knowbase/admin/external_data_admin.py +685 -0
- django_cfg/apps/knowbase/apps.py +81 -0
- django_cfg/apps/knowbase/config/README.md +176 -0
- django_cfg/apps/knowbase/config/__init__.py +51 -0
- django_cfg/apps/knowbase/config/constance_fields.py +186 -0
- django_cfg/apps/knowbase/config/constance_settings.py +200 -0
- django_cfg/apps/knowbase/config/settings.py +450 -0
- django_cfg/apps/knowbase/examples/__init__.py +3 -0
- django_cfg/apps/knowbase/examples/external_data_usage.py +191 -0
- django_cfg/apps/knowbase/management/__init__.py +0 -0
- django_cfg/apps/knowbase/management/commands/__init__.py +0 -0
- django_cfg/apps/knowbase/management/commands/knowbase_stats.py +158 -0
- django_cfg/apps/knowbase/management/commands/setup_knowbase.py +59 -0
- django_cfg/apps/knowbase/managers/__init__.py +22 -0
- django_cfg/apps/knowbase/managers/archive.py +426 -0
- django_cfg/apps/knowbase/managers/base.py +32 -0
- django_cfg/apps/knowbase/managers/chat.py +141 -0
- django_cfg/apps/knowbase/managers/document.py +203 -0
- django_cfg/apps/knowbase/managers/external_data.py +471 -0
- django_cfg/apps/knowbase/migrations/0001_initial.py +427 -0
- django_cfg/apps/knowbase/migrations/0002_archiveitem_archiveitemchunk_documentarchive_and_more.py +434 -0
- django_cfg/apps/knowbase/migrations/__init__.py +5 -0
- django_cfg/apps/knowbase/mixins/__init__.py +15 -0
- django_cfg/apps/knowbase/mixins/config.py +108 -0
- django_cfg/apps/knowbase/mixins/creator.py +81 -0
- django_cfg/apps/knowbase/mixins/examples/vehicle_model_example.py +199 -0
- django_cfg/apps/knowbase/mixins/external_data_mixin.py +813 -0
- django_cfg/apps/knowbase/mixins/service.py +362 -0
- django_cfg/apps/knowbase/models/__init__.py +41 -0
- django_cfg/apps/knowbase/models/archive.py +599 -0
- django_cfg/apps/knowbase/models/base.py +58 -0
- django_cfg/apps/knowbase/models/chat.py +157 -0
- django_cfg/apps/knowbase/models/document.py +267 -0
- django_cfg/apps/knowbase/models/external_data.py +376 -0
- django_cfg/apps/knowbase/serializers/__init__.py +68 -0
- django_cfg/apps/knowbase/serializers/archive_serializers.py +386 -0
- django_cfg/apps/knowbase/serializers/chat_serializers.py +137 -0
- django_cfg/apps/knowbase/serializers/document_serializers.py +94 -0
- django_cfg/apps/knowbase/serializers/external_data_serializers.py +256 -0
- django_cfg/apps/knowbase/serializers/public_serializers.py +74 -0
- django_cfg/apps/knowbase/services/__init__.py +40 -0
- django_cfg/apps/knowbase/services/archive/__init__.py +42 -0
- django_cfg/apps/knowbase/services/archive/archive_service.py +541 -0
- django_cfg/apps/knowbase/services/archive/chunking_service.py +791 -0
- django_cfg/apps/knowbase/services/archive/exceptions.py +52 -0
- django_cfg/apps/knowbase/services/archive/extraction_service.py +508 -0
- django_cfg/apps/knowbase/services/archive/vectorization_service.py +362 -0
- django_cfg/apps/knowbase/services/base.py +53 -0
- django_cfg/apps/knowbase/services/chat_service.py +239 -0
- django_cfg/apps/knowbase/services/document_service.py +144 -0
- django_cfg/apps/knowbase/services/embedding/__init__.py +43 -0
- django_cfg/apps/knowbase/services/embedding/async_processor.py +244 -0
- django_cfg/apps/knowbase/services/embedding/batch_processor.py +250 -0
- django_cfg/apps/knowbase/services/embedding/batch_result.py +61 -0
- django_cfg/apps/knowbase/services/embedding/models.py +229 -0
- django_cfg/apps/knowbase/services/embedding/processors.py +148 -0
- django_cfg/apps/knowbase/services/embedding/utils.py +176 -0
- django_cfg/apps/knowbase/services/prompt_builder.py +191 -0
- django_cfg/apps/knowbase/services/search_service.py +293 -0
- django_cfg/apps/knowbase/signals/__init__.py +21 -0
- django_cfg/apps/knowbase/signals/archive_signals.py +211 -0
- django_cfg/apps/knowbase/signals/chat_signals.py +37 -0
- django_cfg/apps/knowbase/signals/document_signals.py +143 -0
- django_cfg/apps/knowbase/signals/external_data_signals.py +157 -0
- django_cfg/apps/knowbase/tasks/__init__.py +39 -0
- django_cfg/apps/knowbase/tasks/archive_tasks.py +316 -0
- django_cfg/apps/knowbase/tasks/document_processing.py +341 -0
- django_cfg/apps/knowbase/tasks/external_data_tasks.py +341 -0
- django_cfg/apps/knowbase/tasks/maintenance.py +195 -0
- django_cfg/apps/knowbase/urls.py +43 -0
- django_cfg/apps/knowbase/utils/__init__.py +12 -0
- django_cfg/apps/knowbase/utils/chunk_settings.py +261 -0
- django_cfg/apps/knowbase/utils/text_processing.py +375 -0
- django_cfg/apps/knowbase/utils/validation.py +99 -0
- django_cfg/apps/knowbase/views/__init__.py +28 -0
- django_cfg/apps/knowbase/views/archive_views.py +469 -0
- django_cfg/apps/knowbase/views/base.py +49 -0
- django_cfg/apps/knowbase/views/chat_views.py +181 -0
- django_cfg/apps/knowbase/views/document_views.py +183 -0
- django_cfg/apps/knowbase/views/public_views.py +129 -0
- django_cfg/apps/leads/admin.py +70 -0
- django_cfg/apps/newsletter/admin.py +234 -0
- django_cfg/apps/newsletter/admin_filters.py +124 -0
- django_cfg/apps/support/admin.py +196 -0
- django_cfg/apps/support/admin_filters.py +71 -0
- django_cfg/apps/support/templates/support/chat/ticket_chat.html +1 -1
- django_cfg/apps/urls.py +5 -4
- django_cfg/cli/README.md +1 -1
- django_cfg/cli/commands/create_project.py +2 -2
- django_cfg/cli/commands/info.py +1 -1
- django_cfg/config.py +44 -0
- django_cfg/core/config.py +29 -82
- django_cfg/core/environment.py +1 -1
- django_cfg/core/generation.py +19 -107
- django_cfg/{integration.py → core/integration.py} +18 -16
- django_cfg/core/validation.py +1 -1
- django_cfg/management/__init__.py +1 -1
- django_cfg/management/commands/__init__.py +1 -1
- django_cfg/management/commands/auto_generate.py +482 -0
- django_cfg/management/commands/migrator.py +19 -101
- django_cfg/management/commands/test_email.py +1 -1
- django_cfg/middleware/README.md +0 -158
- django_cfg/middleware/__init__.py +0 -2
- django_cfg/middleware/user_activity.py +3 -3
- django_cfg/models/api.py +145 -0
- django_cfg/models/base.py +287 -0
- django_cfg/models/cache.py +4 -4
- django_cfg/models/constance.py +25 -88
- django_cfg/models/database.py +9 -9
- django_cfg/models/drf.py +3 -36
- django_cfg/models/email.py +163 -0
- django_cfg/models/environment.py +276 -0
- django_cfg/models/limits.py +1 -1
- django_cfg/models/logging.py +366 -0
- django_cfg/models/revolution.py +41 -2
- django_cfg/models/security.py +125 -0
- django_cfg/models/services.py +1 -1
- django_cfg/modules/__init__.py +2 -56
- django_cfg/modules/base.py +78 -52
- django_cfg/modules/django_currency/service.py +2 -2
- django_cfg/modules/django_email.py +2 -2
- django_cfg/modules/django_health.py +267 -0
- django_cfg/modules/django_llm/llm/client.py +91 -19
- django_cfg/modules/django_llm/translator/translator.py +2 -2
- django_cfg/modules/django_logger.py +2 -2
- django_cfg/modules/django_ngrok.py +2 -2
- django_cfg/modules/django_tasks.py +68 -3
- django_cfg/modules/django_telegram.py +3 -3
- django_cfg/modules/django_twilio/sendgrid_service.py +2 -2
- django_cfg/modules/django_twilio/service.py +2 -2
- django_cfg/modules/django_twilio/simple_service.py +2 -2
- django_cfg/modules/django_twilio/twilio_service.py +2 -2
- django_cfg/modules/django_unfold/__init__.py +69 -0
- django_cfg/modules/{unfold → django_unfold}/callbacks.py +23 -22
- django_cfg/modules/django_unfold/dashboard.py +278 -0
- django_cfg/modules/django_unfold/icons/README.md +145 -0
- django_cfg/modules/django_unfold/icons/__init__.py +12 -0
- django_cfg/modules/django_unfold/icons/constants.py +2851 -0
- django_cfg/modules/django_unfold/icons/generate_icons.py +486 -0
- django_cfg/modules/django_unfold/models/__init__.py +42 -0
- django_cfg/modules/django_unfold/models/config.py +601 -0
- django_cfg/modules/django_unfold/models/dashboard.py +206 -0
- django_cfg/modules/django_unfold/models/dropdown.py +40 -0
- django_cfg/modules/django_unfold/models/navigation.py +73 -0
- django_cfg/modules/django_unfold/models/tabs.py +25 -0
- django_cfg/modules/{unfold → django_unfold}/system_monitor.py +2 -2
- django_cfg/modules/django_unfold/utils.py +140 -0
- django_cfg/registry/__init__.py +23 -0
- django_cfg/registry/core.py +61 -0
- django_cfg/registry/exceptions.py +11 -0
- django_cfg/registry/modules.py +12 -0
- django_cfg/registry/services.py +26 -0
- django_cfg/registry/third_party.py +52 -0
- django_cfg/routing/__init__.py +19 -0
- django_cfg/routing/callbacks.py +198 -0
- django_cfg/routing/routers.py +48 -0
- django_cfg/templates/admin/layouts/dashboard_with_tabs.html +8 -9
- django_cfg/templatetags/__init__.py +0 -0
- django_cfg/templatetags/django_cfg.py +33 -0
- django_cfg/urls.py +33 -0
- django_cfg/utils/path_resolution.py +1 -1
- django_cfg/utils/smart_defaults.py +7 -61
- django_cfg/utils/toolkit.py +663 -0
- {django_cfg-1.1.82.dist-info → django_cfg-1.2.1.dist-info}/METADATA +83 -86
- django_cfg-1.2.1.dist-info/RECORD +441 -0
- django_cfg/archive/django_sample.zip +0 -0
- django_cfg/models/unfold.py +0 -271
- django_cfg/modules/unfold/__init__.py +0 -29
- django_cfg/modules/unfold/dashboard.py +0 -318
- django_cfg/pyproject.toml +0 -370
- django_cfg/routers.py +0 -83
- django_cfg-1.1.82.dist-info/RECORD +0 -278
- /django_cfg/{exceptions.py → core/exceptions.py} +0 -0
- /django_cfg/modules/{unfold → django_unfold}/models.py +0 -0
- /django_cfg/modules/{unfold → django_unfold}/tailwind.py +0 -0
- /django_cfg/{version_check.py → utils/version_check.py} +0 -0
- {django_cfg-1.1.82.dist-info → django_cfg-1.2.1.dist-info}/WHEEL +0 -0
- {django_cfg-1.1.82.dist-info → django_cfg-1.2.1.dist-info}/entry_points.txt +0 -0
- {django_cfg-1.1.82.dist-info → django_cfg-1.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,52 @@
|
|
1
|
+
"""
|
2
|
+
Archive processing exceptions.
|
3
|
+
|
4
|
+
Custom exception hierarchy for archive processing operations.
|
5
|
+
"""
|
6
|
+
|
7
|
+
from typing import Optional, Dict, Any
|
8
|
+
|
9
|
+
|
10
|
+
class ArchiveProcessingError(Exception):
    """Base exception for archive processing errors.

    Attributes:
        message: Human-readable description; also what ``str(error)`` returns.
        code: Short machine-readable error identifier.
        details: Extra context about the failure; ``{}`` when none (or an
            empty mapping) was supplied.
    """

    def __init__(
        self,
        message: str,
        code: str,
        details: Optional[Dict[str, Any]] = None
    ):
        self.message = message
        self.code = code
        self.details = details or {}
        super().__init__(message)

    def __reduce__(self):
        """Rebuild the error from all three constructor arguments.

        ``Exception`` keeps only ``message`` in ``args``, so the default
        copy/pickle protocol would call ``cls(message)`` and fail on the
        required ``code`` parameter.
        """
        return (self.__class__, (self.message, self.code, self.details))
|
23
|
+
|
24
|
+
|
25
|
+
class ArchiveValidationError(ArchiveProcessingError):
    """Signals an archive validation failure."""
|
28
|
+
|
29
|
+
|
30
|
+
class ExtractionError(ArchiveProcessingError):
    """Signals a failure while extracting an archive."""
|
33
|
+
|
34
|
+
|
35
|
+
class ChunkingError(ArchiveProcessingError):
    """Signals a failure while chunking content."""
|
38
|
+
|
39
|
+
|
40
|
+
class VectorizationError(ArchiveProcessingError):
    """Signals a failure during vectorization processing."""
|
43
|
+
|
44
|
+
|
45
|
+
class ContentTypeDetectionError(ArchiveProcessingError):
    """Signals a failure while detecting a content type."""
|
48
|
+
|
49
|
+
|
50
|
+
class ProcessingTimeoutError(ArchiveProcessingError):
    """Signals that processing timed out."""
|
@@ -0,0 +1,508 @@
|
|
1
|
+
"""
|
2
|
+
Archive extraction services.
|
3
|
+
|
4
|
+
Handles extraction of different archive formats and content processing.
|
5
|
+
"""
|
6
|
+
|
7
|
+
import codecs
import hashlib
import logging
import mimetypes
import os
import shutil
import tarfile
import tempfile
import zipfile
from pathlib import Path
from typing import List, Dict, Any, Optional, Set

from pydantic import BaseModel

from ...models.archive import ArchiveType, ContentType
from .exceptions import ExtractionError, ContentTypeDetectionError
|
20
|
+
|
21
|
+
|
22
|
+
class ExtractedItemData(BaseModel):
    """Data structure for extracted archive item.

    One instance describes a single file found in an archive, as produced by
    ``ArchiveExtractionService._process_extracted_file``.
    """

    # Path of the file as listed by the archive.
    relative_path: str
    # Base name of ``relative_path``.
    item_name: str
    # Size of the extracted file in bytes.
    file_size: int
    # Decoded text of the file; None when the file is not processable or
    # its text could not be read.
    content: Optional[str] = None
    # SHA-256 hex digest, computed from ``content`` when present and
    # otherwise from the file on disk.
    content_hash: str
    # True for text files whose content type is document, code or data.
    is_processable: bool
    # Content type detected from the file extension.
    content_type: str
    # Programming language name; only set for code files.
    language: Optional[str] = None
    # Extra facts gathered during extraction: 'mime_type', 'is_text_file',
    # 'extraction_method' and 'file_extension'.
    metadata: Dict[str, Any]
|
34
|
+
|
35
|
+
|
36
|
+
class ArchiveExtractionService:
    """Service for extracting archives and processing content.

    ``extract_archive`` unpacks a ZIP or TAR archive into a temporary
    directory, filters out unwanted entries and returns one
    ``ExtractedItemData`` per remaining file. The temporary directory is
    removed before the call returns.

    Archives are treated as untrusted input: member names are validated
    before extraction, TAR links and special files are never unpacked, and
    every path is re-checked against the extraction directory before it is
    read.
    """

    _logger = logging.getLogger(__name__)

    # File size limits
    MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB per file

    # Text file extensions
    TEXT_EXTENSIONS: Set[str] = {
        '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.go', '.rs',
        '.cpp', '.c', '.h', '.hpp', '.php', '.rb', '.cs', '.swift',
        '.kt', '.scala', '.clj', '.hs', '.ml', '.fs', '.elm',
        '.md', '.txt', '.rst', '.adoc',
        '.yml', '.yaml', '.json', '.toml', '.ini', '.cfg', '.conf',
        '.xml', '.html', '.css', '.scss', '.less',
        '.sql', '.sh', '.bash', '.zsh', '.fish',
        '.dockerfile', '.makefile', '.gitignore', '.env',
        '.tf', '.hcl'
    }

    # Decoded text at or above this many characters is not kept as content.
    _MAX_TEXT_CHARS = 1024 * 1024
    # Number of leading bytes inspected when sniffing text vs. binary.
    _SNIFF_BYTES = 1024
    # ZIP bomb thresholds: overall compression ratio and total unpacked size.
    _MAX_COMPRESSION_RATIO = 100
    _MAX_UNCOMPRESSED_BYTES = 1024 * 1024 * 1024  # 1GB
    # Paths with more than this many '/' separators are skipped.
    _MAX_PATH_DEPTH = 10

    _SKIP_DIRECTORIES: Set[str] = {
        '__pycache__', 'node_modules', 'dist', 'build', 'target',
        '.git', '.svn', '.hg', '.vscode', '.idea', '.eclipse'
    }
    # A tuple so it can be handed straight to ``str.endswith``.
    _SKIP_EXTENSIONS = (
        '.pyc', '.pyo', '.tmp', '.temp', '.swp', '.bak',
        '.exe', '.dll', '.so', '.dylib', '.jar', '.war', '.ear', '.iso', '.dmg'
    )
    _SKIP_FILENAMES: Set[str] = {'.ds_store', 'thumbs.db'}

    _CODE_EXTENSIONS: Set[str] = {
        '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.go', '.rs',
        '.cpp', '.c', '.h', '.hpp', '.php', '.rb', '.cs', '.swift',
        '.kt', '.scala', '.clj', '.hs', '.ml', '.fs', '.elm'
    }
    _DOCUMENT_EXTENSIONS: Set[str] = {
        '.md', '.txt', '.rst', '.adoc', '.pdf', '.docx', '.doc'
    }
    _DATA_EXTENSIONS: Set[str] = {
        '.json', '.csv', '.xml', '.yml', '.yaml', '.toml', '.ini'
    }
    _IMAGE_EXTENSIONS: Set[str] = {
        '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp'
    }
    _ARCHIVE_EXTENSIONS: Set[str] = {
        '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar'
    }

    _SPECIAL_TEXT_NAMES: Set[str] = {
        'dockerfile', 'makefile', 'readme', 'license', 'changelog',
        '.gitignore', '.dockerignore', '.env', '.settings.example'
    }

    _LANGUAGE_BY_EXTENSION: Dict[str, str] = {
        '.py': 'python',
        '.js': 'javascript',
        '.ts': 'typescript',
        '.jsx': 'react',
        '.tsx': 'react-typescript',
        '.java': 'java',
        '.go': 'golang',
        '.rs': 'rust',
        '.cpp': 'cpp',
        '.c': 'c',
        # Headers are classified as code, so give them a language as well.
        '.h': 'c',
        '.hpp': 'cpp',
        '.php': 'php',
        '.rb': 'ruby',
        '.cs': 'csharp',
        '.swift': 'swift',
        '.kt': 'kotlin',
        '.scala': 'scala',
        '.clj': 'clojure',
        '.hs': 'haskell',
        '.ml': 'ocaml',
        '.fs': 'fsharp',
        '.elm': 'elm',
    }

    def extract_archive(
        self,
        archive_path: str,
        archive_type: str
    ) -> List["ExtractedItemData"]:
        """Extract archive and return processed item data.

        Args:
            archive_path: Path of the archive file on disk.
            archive_type: One of the ``ArchiveType`` values handled by
                ``_extract_by_type``.

        Returns:
            One item per extracted file that was neither skipped nor
            rejected during processing.

        Raises:
            ExtractionError: If the archive type is unsupported, the archive
                fails a security check, or extraction itself fails.
        """
        # realpath so the containment check below compares like with like
        # even when the temp directory sits behind a symlink.
        extract_dir = os.path.realpath(tempfile.mkdtemp(prefix='extracted_'))

        try:
            file_list = self._extract_by_type(archive_path, archive_type, extract_dir)

            extracted_items = []
            for relative_path in file_list:
                full_path = os.path.realpath(os.path.join(extract_dir, relative_path))

                # Defence in depth: never read anything that resolves outside
                # the extraction directory (absolute names, '..', symlinks).
                inside = (
                    full_path == extract_dir
                    or full_path.startswith(extract_dir + os.sep)
                )
                if not inside:
                    self._logger.warning(
                        "Skipping archive member outside extraction directory: %s",
                        relative_path,
                    )
                    continue

                # Skip directories
                if os.path.isdir(full_path):
                    continue

                # Skip unwanted files
                if self._should_skip_file(relative_path):
                    continue

                try:
                    item_data = self._process_extracted_file(full_path, relative_path)
                except Exception:
                    # One bad file must not abort the whole archive.
                    self._logger.exception(
                        "Failed to process extracted file %s", relative_path
                    )
                    continue

                if item_data:
                    extracted_items.append(item_data)

            return extracted_items

        finally:
            # Always clean up; a cleanup failure must not mask the result or
            # an exception that is already propagating.
            shutil.rmtree(extract_dir, ignore_errors=True)

    def _extract_by_type(
        self,
        archive_path: str,
        archive_type: str,
        extract_dir: str
    ) -> List[str]:
        """Extract archive based on its type.

        Returns the member names that were extracted. Any failure that is not
        already an ``ExtractionError`` is wrapped in one with code
        ``EXTRACTION_FAILED``.
        """
        try:
            if archive_type == ArchiveType.ZIP:
                return self._extract_zip(archive_path, extract_dir)

            if archive_type in (ArchiveType.TAR, ArchiveType.TAR_GZ, ArchiveType.TAR_BZ2):
                return self._extract_tar(archive_path, archive_type, extract_dir)

            raise ExtractionError(
                message=f"Unsupported archive type: {archive_type}",
                code="UNSUPPORTED_ARCHIVE_TYPE",
                details={"archive_type": archive_type}
            )
        except ExtractionError:
            raise
        except Exception as e:
            raise ExtractionError(
                message=f"Failed to extract archive: {str(e)}",
                code="EXTRACTION_FAILED",
                details={"archive_path": archive_path, "error": str(e)}
            ) from e

    def _extract_zip(self, archive_path: str, extract_dir: str) -> List[str]:
        """Extract ZIP archive after bomb and path-traversal checks."""
        with zipfile.ZipFile(archive_path, 'r') as zip_file:
            # Check for zip bomb
            self._check_zip_bomb(zip_file)

            # ``extractall`` sanitises names on disk, but ``namelist`` returns
            # them raw and the caller joins them onto the extraction
            # directory, so unsafe names have to be rejected up front.
            names = zip_file.namelist()
            for name in names:
                self._check_member_path(name)

            zip_file.extractall(extract_dir)
            return names

    def _extract_tar(
        self,
        archive_path: str,
        archive_type: str,
        extract_dir: str
    ) -> List[str]:
        """Extract TAR archive (including compressed variants).

        Only regular files and directories are unpacked; symlinks, hardlinks
        and device nodes are left out so that reading an extracted path can
        never follow a link to a file outside the archive.
        """
        mode_map = {
            ArchiveType.TAR: 'r',
            ArchiveType.TAR_GZ: 'r:gz',
            ArchiveType.TAR_BZ2: 'r:bz2'
        }

        with tarfile.open(archive_path, mode_map[archive_type]) as tar_file:
            # Security check for path traversal
            self._check_tar_security(tar_file)

            safe_members = [
                member for member in tar_file.getmembers()
                if member.isfile() or member.isdir()
            ]

            # Use the stdlib 'data' extraction filter where this Python has it.
            extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}
            tar_file.extractall(extract_dir, members=safe_members, **extract_kwargs)
            return [member.name for member in safe_members]

    def _check_member_path(self, member_name: str) -> None:
        """Reject archive member names that are absolute or contain a '..' component.

        Raises:
            ExtractionError: With code ``PATH_TRAVERSAL_DETECTED``.
        """
        normalized = member_name.replace('\\', '/')
        # Compare whole components so names such as 'notes..md' stay valid.
        escapes = (
            os.path.isabs(member_name)
            or normalized.startswith('/')
            or '..' in normalized.split('/')
        )
        if escapes:
            raise ExtractionError(
                message="Path traversal attempt detected",
                code="PATH_TRAVERSAL_DETECTED",
                details={"member_name": member_name}
            )

    def _check_zip_bomb(self, zip_file: zipfile.ZipFile) -> None:
        """Check for zip bomb attacks.

        Raises:
            ExtractionError: ``ZIP_BOMB_DETECTED`` when the overall
                compression ratio exceeds 100, or
                ``ARCHIVE_TOO_LARGE_UNCOMPRESSED`` when the members unpack to
                more than 1GB in total.
        """
        total_uncompressed = 0
        total_compressed = 0

        for info in zip_file.infolist():
            total_uncompressed += info.file_size
            total_compressed += info.compress_size

        # Check compression ratio
        if total_compressed > 0:
            ratio = total_uncompressed / total_compressed
            if ratio > self._MAX_COMPRESSION_RATIO:  # Suspicious compression ratio
                raise ExtractionError(
                    message="Suspicious compression ratio detected",
                    code="ZIP_BOMB_DETECTED",
                    details={
                        "compression_ratio": ratio,
                        "uncompressed_size": total_uncompressed
                    }
                )

        # Check total uncompressed size
        if total_uncompressed > self._MAX_UNCOMPRESSED_BYTES:  # 1GB limit
            raise ExtractionError(
                message="Archive too large when uncompressed",
                code="ARCHIVE_TOO_LARGE_UNCOMPRESSED",
                details={"uncompressed_size": total_uncompressed}
            )

    def _check_tar_security(self, tar_file: tarfile.TarFile) -> None:
        """Check TAR file for security issues.

        Raises:
            ExtractionError: ``PATH_TRAVERSAL_DETECTED`` for an unsafe member
                name, or ``FILE_TOO_LARGE`` for a member larger than ten
                times ``MAX_FILE_SIZE``.
        """
        for member in tar_file.getmembers():
            # Check for path traversal
            self._check_member_path(member.name)

            # Check for suspicious file sizes
            if member.size > self.MAX_FILE_SIZE * 10:  # 100MB limit per file
                raise ExtractionError(
                    message="File too large in archive",
                    code="FILE_TOO_LARGE",
                    details={
                        "file_name": member.name,
                        "file_size": member.size
                    }
                )

    def _should_skip_file(self, relative_path: str) -> bool:
        """Check if file should be skipped during processing.

        A path is skipped when any component is hidden (starts with a dot) or
        names a build/VCS directory, when it ends with a binary or temporary
        extension, when its file name is a known junk file, or when it is
        nested more than ten levels deep. Matching is case-insensitive.
        """
        # Normalize path for consistent checking
        relative_path_lower = relative_path.lower()
        path_parts = relative_path_lower.split('/')

        # Hidden files and directories ('.' and '..' themselves are not hidden)
        if any(part.startswith('.') and part not in ('.', '..') for part in path_parts):
            return True

        # System/build directories
        if any(part in self._SKIP_DIRECTORIES for part in path_parts):
            return True

        # File extensions
        if relative_path_lower.endswith(self._SKIP_EXTENSIONS):
            return True

        # Specific filenames
        filename = path_parts[-1] if path_parts else ''
        if filename in self._SKIP_FILENAMES:
            return True

        # Skip very deep paths (potential zip bomb)
        return relative_path.count('/') > self._MAX_PATH_DEPTH

    def _process_extracted_file(
        self,
        full_path: str,
        relative_path: str
    ) -> Optional["ExtractedItemData"]:
        """Process individual extracted file.

        Returns:
            The item description, or ``None`` when the file is larger than
            ``MAX_FILE_SIZE`` or could not be processed (the failure is
            logged).
        """
        try:
            file_size = os.stat(full_path).st_size

            # Skip very large files
            if file_size > self.MAX_FILE_SIZE:
                return None

            item_name = os.path.basename(relative_path)

            # Detect content type and processability
            content_type = self._detect_content_type(item_name, full_path)
            is_text_file = self._is_text_file(item_name, full_path)
            is_processable = is_text_file and content_type in [
                ContentType.DOCUMENT,
                ContentType.CODE,
                ContentType.DATA
            ]

            # Extract content for processable files
            content = self._extract_text_content(full_path) if is_processable else None

            # Generate content hash
            content_hash = self._generate_content_hash(full_path, content)

            # Detect language
            language = self._detect_language(item_name, content_type)

            # Build metadata
            metadata = {
                'mime_type': mimetypes.guess_type(item_name)[0] or 'application/octet-stream',
                'is_text_file': is_text_file,
                'extraction_method': 'direct_read' if is_text_file else 'binary_skip',
                'file_extension': Path(item_name).suffix.lower(),
            }

            return ExtractedItemData(
                relative_path=relative_path,
                item_name=item_name,
                file_size=file_size,
                content=content,
                content_hash=content_hash,
                is_processable=is_processable,
                content_type=content_type,
                language=language,
                metadata=metadata
            )

        except Exception:
            # Best effort: a problematic file is reported and left out rather
            # than failing the whole archive.
            self._logger.exception("Could not process extracted file %s", relative_path)
            return None

    def _detect_content_type(self, item_name: str, full_path: str) -> str:
        """Detect content type from the file extension.

        ``full_path`` is accepted for interface compatibility; the file
        itself is not inspected.
        """
        extension = Path(item_name).suffix.lower()

        if extension in self._CODE_EXTENSIONS:
            return ContentType.CODE
        if extension in self._DOCUMENT_EXTENSIONS:
            return ContentType.DOCUMENT
        if extension in self._DATA_EXTENSIONS:
            return ContentType.DATA
        if extension in self._IMAGE_EXTENSIONS:
            return ContentType.IMAGE
        if extension in self._ARCHIVE_EXTENSIONS:
            return ContentType.ARCHIVE
        return ContentType.UNKNOWN

    def _is_text_file(self, item_name: str, full_path: str) -> bool:
        """Check if file is a text file.

        Known text extensions and well-known file names are accepted
        outright. Otherwise the first 1KB is sampled: a NUL byte or invalid
        UTF-8 means binary. Unreadable files count as binary.
        """
        file_path = Path(item_name)

        # Check by extension first
        if file_path.suffix.lower() in self.TEXT_EXTENSIONS:
            return True

        # Special filenames
        if file_path.name.lower() in self._SPECIAL_TEXT_NAMES:
            return True

        # Try to detect by content (sample first 1KB)
        try:
            with open(full_path, 'rb') as f:
                sample = f.read(self._SNIFF_BYTES)
        except (OSError, ValueError):
            return False

        # Check for null bytes (binary indicator)
        if b'\x00' in sample:
            return False

        # A full-length sample may end in the middle of a multi-byte
        # character; the incremental decoder tolerates that unless the sample
        # is the whole file.
        decoder = codecs.getincrementaldecoder('utf-8')()
        try:
            decoder.decode(sample, final=len(sample) < self._SNIFF_BYTES)
        except UnicodeDecodeError:
            return False
        return True

    def _extract_text_content(self, full_path: str) -> Optional[str]:
        """Extract text content from file.

        UTF-16 is only attempted when the file starts with a UTF-16 byte
        order mark, because BOM-less bytes decode "successfully" into
        garbage. Otherwise UTF-8 is tried first, then cp1252, then latin1
        (which accepts any byte sequence).

        Returns:
            The decoded text, or ``None`` when the file is unreadable, empty,
            or decodes to 1MB of characters or more.
        """
        try:
            with open(full_path, 'rb') as f:
                head = f.read(2)
        except (OSError, ValueError):
            return None

        has_utf16_bom = head in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)
        encodings = ['utf-16' if has_utf16_bom else 'utf-8', 'cp1252', 'latin1']

        for encoding in encodings:
            try:
                with open(full_path, 'r', encoding=encoding) as f:
                    content = f.read()
            except UnicodeDecodeError:
                continue
            except (OSError, ValueError):
                return None

            # The first encoding that decodes is authoritative: oversized or
            # empty text is rejected, never re-decoded with another codec.
            if 0 < len(content) < self._MAX_TEXT_CHARS:
                return content
            return None

        return None

    def _generate_content_hash(
        self,
        full_path: str,
        content: Optional[str]
    ) -> str:
        """Generate SHA-256 hash of file content.

        Hashes the decoded ``content`` when it is non-empty, otherwise the
        raw bytes of the file. If the file cannot be read, the path itself is
        hashed so that a value is always returned.
        """
        if content:
            return hashlib.sha256(content.encode()).hexdigest()

        # Hash binary file
        hash_sha256 = hashlib.sha256()
        try:
            with open(full_path, 'rb') as f:
                for chunk in iter(lambda: f.read(4096), b""):
                    hash_sha256.update(chunk)
        except (OSError, ValueError):
            # Fallback to path-based hash
            return hashlib.sha256(full_path.encode()).hexdigest()
        return hash_sha256.hexdigest()

    def _detect_language(self, item_name: str, content_type: str) -> Optional[str]:
        """Detect programming language from file extension.

        Returns ``None`` for anything that is not ``ContentType.CODE`` and
        for code extensions without a known language.
        """
        if content_type != ContentType.CODE:
            return None

        file_path = Path(item_name)

        # Special cases
        lowered_name = file_path.name.lower()
        if lowered_name == 'dockerfile':
            return 'dockerfile'
        if lowered_name == 'makefile':
            return 'makefile'

        return self._LANGUAGE_BY_EXTENSION.get(file_path.suffix.lower())
|
487
|
+
|
488
|
+
|
489
|
+
class ContentExtractionService:
    """Placeholder extractors for specific file types.

    None of the extractors is implemented yet; each one returns ``None``.
    """

    def extract_pdf_content(self, file_path: str) -> Optional[str]:
        """Return the text of a PDF file (currently always ``None``)."""
        # TODO: Implement PDF text extraction
        # Could use PyPDF2, pdfplumber, or similar
        return None

    def extract_docx_content(self, file_path: str) -> Optional[str]:
        """Return the text of a DOCX file (currently always ``None``)."""
        # TODO: Implement DOCX text extraction
        # Could use python-docx
        return None

    def extract_image_text(self, file_path: str) -> Optional[str]:
        """Return text recognised in an image via OCR (currently always ``None``)."""
        # TODO: Implement OCR text extraction
        # Could use pytesseract
        return None
|