django-cfg 1.1.82__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- django_cfg/__init__.py +20 -448
- django_cfg/apps/accounts/README.md +3 -3
- django_cfg/apps/accounts/admin/__init__.py +0 -2
- django_cfg/apps/accounts/admin/activity.py +2 -9
- django_cfg/apps/accounts/admin/filters.py +0 -42
- django_cfg/apps/accounts/admin/inlines.py +8 -8
- django_cfg/apps/accounts/admin/otp.py +5 -5
- django_cfg/apps/accounts/admin/registration_source.py +1 -8
- django_cfg/apps/accounts/admin/user.py +12 -20
- django_cfg/apps/accounts/managers/user_manager.py +2 -129
- django_cfg/apps/accounts/migrations/0006_remove_twilioresponse_otp_secret_and_more.py +46 -0
- django_cfg/apps/accounts/models.py +3 -123
- django_cfg/apps/accounts/serializers/otp.py +40 -44
- django_cfg/apps/accounts/serializers/profile.py +0 -2
- django_cfg/apps/accounts/services/otp_service.py +98 -186
- django_cfg/apps/accounts/signals.py +25 -15
- django_cfg/apps/accounts/utils/auth_email_service.py +84 -0
- django_cfg/apps/accounts/views/otp.py +35 -36
- django_cfg/apps/agents/README.md +129 -0
- django_cfg/apps/agents/__init__.py +68 -0
- django_cfg/apps/agents/admin/__init__.py +17 -0
- django_cfg/apps/agents/admin/execution_admin.py +460 -0
- django_cfg/apps/agents/admin/registry_admin.py +360 -0
- django_cfg/apps/agents/admin/toolsets_admin.py +482 -0
- django_cfg/apps/agents/apps.py +29 -0
- django_cfg/apps/agents/core/__init__.py +20 -0
- django_cfg/apps/agents/core/agent.py +281 -0
- django_cfg/apps/agents/core/dependencies.py +154 -0
- django_cfg/apps/agents/core/exceptions.py +66 -0
- django_cfg/apps/agents/core/models.py +106 -0
- django_cfg/apps/agents/core/orchestrator.py +391 -0
- django_cfg/apps/agents/examples/__init__.py +3 -0
- django_cfg/apps/agents/examples/simple_example.py +161 -0
- django_cfg/apps/agents/integration/__init__.py +14 -0
- django_cfg/apps/agents/integration/middleware.py +80 -0
- django_cfg/apps/agents/integration/registry.py +345 -0
- django_cfg/apps/agents/integration/signals.py +50 -0
- django_cfg/apps/agents/management/__init__.py +3 -0
- django_cfg/apps/agents/management/commands/__init__.py +3 -0
- django_cfg/apps/agents/management/commands/create_agent.py +365 -0
- django_cfg/apps/agents/management/commands/orchestrator_status.py +191 -0
- django_cfg/apps/agents/managers/__init__.py +23 -0
- django_cfg/apps/agents/managers/execution.py +236 -0
- django_cfg/apps/agents/managers/registry.py +254 -0
- django_cfg/apps/agents/managers/toolsets.py +496 -0
- django_cfg/apps/agents/migrations/0001_initial.py +286 -0
- django_cfg/apps/agents/migrations/__init__.py +5 -0
- django_cfg/apps/agents/models/__init__.py +15 -0
- django_cfg/apps/agents/models/execution.py +215 -0
- django_cfg/apps/agents/models/registry.py +220 -0
- django_cfg/apps/agents/models/toolsets.py +305 -0
- django_cfg/apps/agents/patterns/__init__.py +24 -0
- django_cfg/apps/agents/patterns/content_agents.py +234 -0
- django_cfg/apps/agents/toolsets/__init__.py +15 -0
- django_cfg/apps/agents/toolsets/cache_toolset.py +285 -0
- django_cfg/apps/agents/toolsets/django_toolset.py +220 -0
- django_cfg/apps/agents/toolsets/file_toolset.py +324 -0
- django_cfg/apps/agents/toolsets/orm_toolset.py +319 -0
- django_cfg/apps/agents/urls.py +46 -0
- django_cfg/apps/knowbase/README.md +150 -0
- django_cfg/apps/knowbase/__init__.py +27 -0
- django_cfg/apps/knowbase/admin/__init__.py +23 -0
- django_cfg/apps/knowbase/admin/archive_admin.py +857 -0
- django_cfg/apps/knowbase/admin/chat_admin.py +386 -0
- django_cfg/apps/knowbase/admin/document_admin.py +650 -0
- django_cfg/apps/knowbase/admin/external_data_admin.py +685 -0
- django_cfg/apps/knowbase/apps.py +81 -0
- django_cfg/apps/knowbase/config/README.md +176 -0
- django_cfg/apps/knowbase/config/__init__.py +51 -0
- django_cfg/apps/knowbase/config/constance_fields.py +186 -0
- django_cfg/apps/knowbase/config/constance_settings.py +200 -0
- django_cfg/apps/knowbase/config/settings.py +450 -0
- django_cfg/apps/knowbase/examples/__init__.py +3 -0
- django_cfg/apps/knowbase/examples/external_data_usage.py +191 -0
- django_cfg/apps/knowbase/management/__init__.py +0 -0
- django_cfg/apps/knowbase/management/commands/__init__.py +0 -0
- django_cfg/apps/knowbase/management/commands/knowbase_stats.py +158 -0
- django_cfg/apps/knowbase/management/commands/setup_knowbase.py +59 -0
- django_cfg/apps/knowbase/managers/__init__.py +22 -0
- django_cfg/apps/knowbase/managers/archive.py +426 -0
- django_cfg/apps/knowbase/managers/base.py +32 -0
- django_cfg/apps/knowbase/managers/chat.py +141 -0
- django_cfg/apps/knowbase/managers/document.py +203 -0
- django_cfg/apps/knowbase/managers/external_data.py +471 -0
- django_cfg/apps/knowbase/migrations/0001_initial.py +427 -0
- django_cfg/apps/knowbase/migrations/0002_archiveitem_archiveitemchunk_documentarchive_and_more.py +434 -0
- django_cfg/apps/knowbase/migrations/__init__.py +5 -0
- django_cfg/apps/knowbase/mixins/__init__.py +15 -0
- django_cfg/apps/knowbase/mixins/config.py +108 -0
- django_cfg/apps/knowbase/mixins/creator.py +81 -0
- django_cfg/apps/knowbase/mixins/examples/vehicle_model_example.py +199 -0
- django_cfg/apps/knowbase/mixins/external_data_mixin.py +813 -0
- django_cfg/apps/knowbase/mixins/service.py +362 -0
- django_cfg/apps/knowbase/models/__init__.py +41 -0
- django_cfg/apps/knowbase/models/archive.py +599 -0
- django_cfg/apps/knowbase/models/base.py +58 -0
- django_cfg/apps/knowbase/models/chat.py +157 -0
- django_cfg/apps/knowbase/models/document.py +267 -0
- django_cfg/apps/knowbase/models/external_data.py +376 -0
- django_cfg/apps/knowbase/serializers/__init__.py +68 -0
- django_cfg/apps/knowbase/serializers/archive_serializers.py +386 -0
- django_cfg/apps/knowbase/serializers/chat_serializers.py +137 -0
- django_cfg/apps/knowbase/serializers/document_serializers.py +94 -0
- django_cfg/apps/knowbase/serializers/external_data_serializers.py +256 -0
- django_cfg/apps/knowbase/serializers/public_serializers.py +74 -0
- django_cfg/apps/knowbase/services/__init__.py +40 -0
- django_cfg/apps/knowbase/services/archive/__init__.py +42 -0
- django_cfg/apps/knowbase/services/archive/archive_service.py +541 -0
- django_cfg/apps/knowbase/services/archive/chunking_service.py +791 -0
- django_cfg/apps/knowbase/services/archive/exceptions.py +52 -0
- django_cfg/apps/knowbase/services/archive/extraction_service.py +508 -0
- django_cfg/apps/knowbase/services/archive/vectorization_service.py +362 -0
- django_cfg/apps/knowbase/services/base.py +53 -0
- django_cfg/apps/knowbase/services/chat_service.py +239 -0
- django_cfg/apps/knowbase/services/document_service.py +144 -0
- django_cfg/apps/knowbase/services/embedding/__init__.py +43 -0
- django_cfg/apps/knowbase/services/embedding/async_processor.py +244 -0
- django_cfg/apps/knowbase/services/embedding/batch_processor.py +250 -0
- django_cfg/apps/knowbase/services/embedding/batch_result.py +61 -0
- django_cfg/apps/knowbase/services/embedding/models.py +229 -0
- django_cfg/apps/knowbase/services/embedding/processors.py +148 -0
- django_cfg/apps/knowbase/services/embedding/utils.py +176 -0
- django_cfg/apps/knowbase/services/prompt_builder.py +191 -0
- django_cfg/apps/knowbase/services/search_service.py +293 -0
- django_cfg/apps/knowbase/signals/__init__.py +21 -0
- django_cfg/apps/knowbase/signals/archive_signals.py +211 -0
- django_cfg/apps/knowbase/signals/chat_signals.py +37 -0
- django_cfg/apps/knowbase/signals/document_signals.py +143 -0
- django_cfg/apps/knowbase/signals/external_data_signals.py +157 -0
- django_cfg/apps/knowbase/tasks/__init__.py +39 -0
- django_cfg/apps/knowbase/tasks/archive_tasks.py +316 -0
- django_cfg/apps/knowbase/tasks/document_processing.py +341 -0
- django_cfg/apps/knowbase/tasks/external_data_tasks.py +341 -0
- django_cfg/apps/knowbase/tasks/maintenance.py +195 -0
- django_cfg/apps/knowbase/urls.py +43 -0
- django_cfg/apps/knowbase/utils/__init__.py +12 -0
- django_cfg/apps/knowbase/utils/chunk_settings.py +261 -0
- django_cfg/apps/knowbase/utils/text_processing.py +375 -0
- django_cfg/apps/knowbase/utils/validation.py +99 -0
- django_cfg/apps/knowbase/views/__init__.py +28 -0
- django_cfg/apps/knowbase/views/archive_views.py +469 -0
- django_cfg/apps/knowbase/views/base.py +49 -0
- django_cfg/apps/knowbase/views/chat_views.py +181 -0
- django_cfg/apps/knowbase/views/document_views.py +183 -0
- django_cfg/apps/knowbase/views/public_views.py +129 -0
- django_cfg/apps/leads/admin.py +70 -0
- django_cfg/apps/newsletter/admin.py +234 -0
- django_cfg/apps/newsletter/admin_filters.py +124 -0
- django_cfg/apps/support/admin.py +196 -0
- django_cfg/apps/support/admin_filters.py +71 -0
- django_cfg/apps/support/templates/support/chat/ticket_chat.html +1 -1
- django_cfg/apps/urls.py +5 -4
- django_cfg/cli/README.md +1 -1
- django_cfg/cli/commands/create_project.py +2 -2
- django_cfg/cli/commands/info.py +1 -1
- django_cfg/config.py +44 -0
- django_cfg/core/config.py +29 -82
- django_cfg/core/environment.py +1 -1
- django_cfg/core/generation.py +19 -107
- django_cfg/{integration.py → core/integration.py} +18 -16
- django_cfg/core/validation.py +1 -1
- django_cfg/management/__init__.py +1 -1
- django_cfg/management/commands/__init__.py +1 -1
- django_cfg/management/commands/auto_generate.py +482 -0
- django_cfg/management/commands/migrator.py +19 -101
- django_cfg/management/commands/test_email.py +1 -1
- django_cfg/middleware/README.md +0 -158
- django_cfg/middleware/__init__.py +0 -2
- django_cfg/middleware/user_activity.py +3 -3
- django_cfg/models/api.py +145 -0
- django_cfg/models/base.py +287 -0
- django_cfg/models/cache.py +4 -4
- django_cfg/models/constance.py +25 -88
- django_cfg/models/database.py +9 -9
- django_cfg/models/drf.py +3 -36
- django_cfg/models/email.py +163 -0
- django_cfg/models/environment.py +276 -0
- django_cfg/models/limits.py +1 -1
- django_cfg/models/logging.py +366 -0
- django_cfg/models/revolution.py +41 -2
- django_cfg/models/security.py +125 -0
- django_cfg/models/services.py +1 -1
- django_cfg/modules/__init__.py +2 -56
- django_cfg/modules/base.py +78 -52
- django_cfg/modules/django_currency/service.py +2 -2
- django_cfg/modules/django_email.py +2 -2
- django_cfg/modules/django_health.py +267 -0
- django_cfg/modules/django_llm/llm/client.py +91 -19
- django_cfg/modules/django_llm/translator/translator.py +2 -2
- django_cfg/modules/django_logger.py +2 -2
- django_cfg/modules/django_ngrok.py +2 -2
- django_cfg/modules/django_tasks.py +68 -3
- django_cfg/modules/django_telegram.py +3 -3
- django_cfg/modules/django_twilio/sendgrid_service.py +2 -2
- django_cfg/modules/django_twilio/service.py +2 -2
- django_cfg/modules/django_twilio/simple_service.py +2 -2
- django_cfg/modules/django_twilio/twilio_service.py +2 -2
- django_cfg/modules/django_unfold/__init__.py +69 -0
- django_cfg/modules/{unfold → django_unfold}/callbacks.py +23 -22
- django_cfg/modules/django_unfold/dashboard.py +278 -0
- django_cfg/modules/django_unfold/icons/README.md +145 -0
- django_cfg/modules/django_unfold/icons/__init__.py +12 -0
- django_cfg/modules/django_unfold/icons/constants.py +2851 -0
- django_cfg/modules/django_unfold/icons/generate_icons.py +486 -0
- django_cfg/modules/django_unfold/models/__init__.py +42 -0
- django_cfg/modules/django_unfold/models/config.py +601 -0
- django_cfg/modules/django_unfold/models/dashboard.py +206 -0
- django_cfg/modules/django_unfold/models/dropdown.py +40 -0
- django_cfg/modules/django_unfold/models/navigation.py +73 -0
- django_cfg/modules/django_unfold/models/tabs.py +25 -0
- django_cfg/modules/{unfold → django_unfold}/system_monitor.py +2 -2
- django_cfg/modules/django_unfold/utils.py +140 -0
- django_cfg/registry/__init__.py +23 -0
- django_cfg/registry/core.py +61 -0
- django_cfg/registry/exceptions.py +11 -0
- django_cfg/registry/modules.py +12 -0
- django_cfg/registry/services.py +26 -0
- django_cfg/registry/third_party.py +52 -0
- django_cfg/routing/__init__.py +19 -0
- django_cfg/routing/callbacks.py +198 -0
- django_cfg/routing/routers.py +48 -0
- django_cfg/templates/admin/layouts/dashboard_with_tabs.html +8 -9
- django_cfg/templatetags/__init__.py +0 -0
- django_cfg/templatetags/django_cfg.py +33 -0
- django_cfg/urls.py +33 -0
- django_cfg/utils/path_resolution.py +1 -1
- django_cfg/utils/smart_defaults.py +7 -61
- django_cfg/utils/toolkit.py +663 -0
- {django_cfg-1.1.82.dist-info → django_cfg-1.2.1.dist-info}/METADATA +83 -86
- django_cfg-1.2.1.dist-info/RECORD +441 -0
- django_cfg/archive/django_sample.zip +0 -0
- django_cfg/models/unfold.py +0 -271
- django_cfg/modules/unfold/__init__.py +0 -29
- django_cfg/modules/unfold/dashboard.py +0 -318
- django_cfg/pyproject.toml +0 -370
- django_cfg/routers.py +0 -83
- django_cfg-1.1.82.dist-info/RECORD +0 -278
- /django_cfg/{exceptions.py → core/exceptions.py} +0 -0
- /django_cfg/modules/{unfold → django_unfold}/models.py +0 -0
- /django_cfg/modules/{unfold → django_unfold}/tailwind.py +0 -0
- /django_cfg/{version_check.py → utils/version_check.py} +0 -0
- {django_cfg-1.1.82.dist-info → django_cfg-1.2.1.dist-info}/WHEEL +0 -0
- {django_cfg-1.1.82.dist-info → django_cfg-1.2.1.dist-info}/entry_points.txt +0 -0
- {django_cfg-1.1.82.dist-info → django_cfg-1.2.1.dist-info}/licenses/LICENSE +0 -0
django_cfg/apps/knowbase/tasks/archive_tasks.py
@@ -0,0 +1,316 @@

```python
"""
Archive processing tasks with Dramatiq.
"""

import dramatiq
import logging
import time
from typing import Dict, Any
from django.db import transaction
from django.utils import timezone
from django.contrib.auth import get_user_model

from ..models.archive import DocumentArchive, ArchiveItem, ArchiveItemChunk
from ..models.base import ProcessingStatus
from ..services.archive import (
    DocumentArchiveService,
    ArchiveVectorizationService,
    ArchiveProcessingError
)

logger = logging.getLogger(__name__)
User = get_user_model()


@dramatiq.actor(
    queue_name="knowledge",
    max_retries=3,
    min_backoff=1000,   # 1 second
    max_backoff=30000,  # 30 seconds
    priority=5
)
def process_archive_task(archive_id: str, user_id: str) -> bool:
    """
    Process a document archive asynchronously.

    Args:
        archive_id: ID of the archive to process
        user_id: ID of the user who owns the archive

    Returns:
        True if processing was successful

    Raises:
        ArchiveProcessingError: If processing fails
    """
    logger.info(f"Starting archive processing for archive {archive_id}")

    try:
        # Get archive and user
        archive = DocumentArchive.objects.all_users().get(pk=archive_id)
        user = User.objects.get(pk=user_id)

        # Debug logging
        logger.info(f"Retrieved archive: {archive}, type: {type(archive)}")
        logger.info(f"Archive ID: {archive.id if archive else 'None'}")
        logger.info(f"Archive file: {archive.archive_file if archive else 'None'}")

        if not archive:
            raise ArchiveProcessingError(
                message=f"Archive {archive_id} not found or is None",
                code="ARCHIVE_NOT_FOUND"
            )

        # Verify user owns the archive
        if archive.user_id != user.id:
            raise ArchiveProcessingError(
                message=f"User {user_id} does not own archive {archive_id}",
                code="UNAUTHORIZED_ACCESS"
            )

        # Initialize services
        service = DocumentArchiveService(user=user)

        # Process the archive (remove transaction.atomic to avoid nested transaction conflicts)
        success = service.process_archive(archive)

        if success:
            logger.info(f"Successfully processed archive {archive_id}")
            return True
        else:
            logger.error(f"Failed to process archive {archive_id}")
            return False

    except DocumentArchive.DoesNotExist:
        logger.error(f"Archive {archive_id} not found")
        raise
    except User.DoesNotExist:
        logger.error(f"User {user_id} not found")
        raise
    except Exception as e:
        logger.error(f"Error processing archive {archive_id}: {str(e)}")
        raise ArchiveProcessingError(
            message=f"Archive processing failed: {str(e)}",
            code="PROCESSING_FAILED"
        )


@dramatiq.actor(
    queue_name="knowledge",
    max_retries=2,
    min_backoff=2000,   # 2 seconds
    max_backoff=60000,  # 60 seconds
    priority=4
)
def vectorize_archive_items_task(archive_id: str, user_id: str) -> int:
    """
    Vectorize all items in a document archive.

    Args:
        archive_id: ID of the archive to vectorize
        user_id: ID of the user who owns the archive

    Returns:
        Number of items vectorized

    Raises:
        ArchiveProcessingError: If vectorization fails
    """
    logger.info(f"Starting vectorization for archive {archive_id}")

    try:
        # Get archive and user
        archive = DocumentArchive.objects.all_users().get(pk=archive_id)
        user = User.objects.get(pk=user_id)

        # Verify user owns the archive
        if archive.user_id != user.id:
            raise ArchiveProcessingError(
                message=f"User {user_id} does not own archive {archive_id}",
                code="UNAUTHORIZED_ACCESS"
            )

        # Initialize vectorization service
        service = ArchiveVectorizationService(user=user)

        # Vectorize archive items
        vectorized_count = service.vectorize_archive_items(archive)

        logger.info(f"Successfully vectorized {vectorized_count} items for archive {archive_id}")
        return vectorized_count

    except DocumentArchive.DoesNotExist:
        logger.error(f"Archive {archive_id} not found")
        raise
    except User.DoesNotExist:
        logger.error(f"User {user_id} not found")
        raise
    except Exception as e:
        logger.error(f"Error vectorizing archive {archive_id}: {str(e)}")
        raise ArchiveProcessingError(
            message=f"Archive vectorization failed: {str(e)}",
            code="VECTORIZATION_FAILED"
        )


@dramatiq.actor(
    queue_name="knowledge",
    max_retries=1,
    priority=2
)
def cleanup_failed_archives_task(days_old: int = 7) -> int:
    """
    Clean up failed archives older than specified days.

    Args:
        days_old: Age threshold for cleanup (default: 7 days)

    Returns:
        Number of archives cleaned up
    """
    logger.info(f"Starting cleanup of failed archives older than {days_old} days")

    try:
        cutoff_date = timezone.now() - timezone.timedelta(days=days_old)

        # Find failed archives older than cutoff
        failed_archives = DocumentArchive.objects.filter(
            processing_status=ProcessingStatus.FAILED,
            created_at__lt=cutoff_date
        )

        count = failed_archives.count()

        # Delete the archives (cascade will handle related objects)
        deleted_count, _ = failed_archives.delete()

        logger.info(f"Cleaned up {deleted_count} failed archives")
        return deleted_count

    except Exception as e:
        logger.error(f"Error during archive cleanup: {str(e)}")
        raise


@dramatiq.actor(
    queue_name="knowledge",
    max_retries=1,
    priority=1
)
def generate_archive_statistics_task(user_id: str) -> Dict[str, Any]:
    """
    Generate statistics for user's archives.

    Args:
        user_id: ID of the user

    Returns:
        Dictionary with archive statistics
    """
    logger.info(f"Generating archive statistics for user {user_id}")

    try:
        user = User.objects.get(pk=user_id)

        # Get user's archives
        archives = DocumentArchive.objects.filter(user=user)

        # Calculate statistics
        stats = {
            'total_archives': archives.count(),
            'completed_archives': archives.filter(processing_status=ProcessingStatus.COMPLETED).count(),
            'pending_archives': archives.filter(processing_status=ProcessingStatus.PENDING).count(),
            'processing_archives': archives.filter(processing_status=ProcessingStatus.PROCESSING).count(),
            'failed_archives': archives.filter(processing_status=ProcessingStatus.FAILED).count(),
            'total_items': sum(archive.total_items for archive in archives),
            'total_chunks': sum(archive.total_chunks for archive in archives),
            'total_cost': sum(archive.total_cost_usd for archive in archives),
        }

        logger.info(f"Generated statistics for user {user_id}: {stats}")
        return stats

    except User.DoesNotExist:
        logger.error(f"User {user_id} not found")
        raise
    except Exception as e:
        logger.error(f"Error generating statistics for user {user_id}: {str(e)}")
        raise


@dramatiq.actor(
    queue_name="knowledge",
    max_retries=1,
    priority=1
)
def archive_health_check_task() -> Dict[str, Any]:
    """
    Perform health check on archive system.

    Returns:
        Dictionary with health check results
    """
    logger.info("Starting archive system health check")

    try:
        # Check database connectivity
        total_archives = DocumentArchive.objects.count()

        # Check for orphaned items
        orphaned_items = ArchiveItem.objects.filter(archive__isnull=True).count()

        # Check for orphaned chunks
        orphaned_chunks = ArchiveItemChunk.objects.filter(item__isnull=True).count()

        # Check processing status distribution
        status_counts = {}
        for status in ProcessingStatus:
            count = DocumentArchive.objects.filter(processing_status=status).count()
            status_counts[status.value] = count

        # Check for archives with missing files
        archives_with_files = DocumentArchive.objects.exclude(file_path__isnull=True).exclude(file_path='')
        unhealthy_archives = 0

        for archive in archives_with_files:
            import os
            if not os.path.exists(archive.file_path):
                unhealthy_archives += 1

        health_data = {
            'total_checked': total_archives,
            'healthy_archives': total_archives - unhealthy_archives,
            'unhealthy_archives': unhealthy_archives,
            'orphaned_items': orphaned_items,
            'orphaned_chunks': orphaned_chunks,
            'status_distribution': status_counts,
            'timestamp': timezone.now().isoformat()
        }

        logger.info(f"Health check completed: {health_data}")
        return health_data

    except Exception as e:
        logger.error(f"Error during health check: {str(e)}")
        raise


# Test task for development
@dramatiq.actor(
    queue_name="knowledge",
    max_retries=0,
    priority=1
)
def test_archive_task(message: str = "Hello from archive tasks!") -> str:
    """
    Simple test task for archive system.

    Args:
        message: Test message to process

    Returns:
        Processed message
    """
    logger.info(f"Test archive task executed with message: {message}")
    time.sleep(1)  # Simulate some work
    return f"Processed: {message}"
```
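All of these actors target the `knowledge` queue, so a Dramatiq worker must be running for that queue before any of them execute. A minimal caller-side sketch of how the tasks might be enqueued, assuming the broker is already configured and `archive` and `user` are saved model instances (the helper function name and the 60-second delay are illustrative, not part of the package):

```python
# Hypothetical caller-side sketch; the package may expose its own helpers instead.
from django_cfg.apps.knowbase.tasks.archive_tasks import (
    process_archive_task,
    vectorize_archive_items_task,
)

def enqueue_archive_pipeline(archive, user):
    # Dramatiq actors are enqueued with .send(); arguments must be JSON-serializable,
    # which is why the tasks take string IDs rather than model instances.
    process_archive_task.send(str(archive.id), str(user.id))

    # Optionally give extraction a head start before vectorization;
    # send_with_options(delay=...) is standard Dramatiq API, value in milliseconds.
    vectorize_archive_items_task.send_with_options(
        args=(str(archive.id), str(user.id)),
        delay=60_000,
    )
```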
django_cfg/apps/knowbase/tasks/document_processing.py
@@ -0,0 +1,341 @@

```python
"""
Document processing tasks with Dramatiq.
"""

import dramatiq
import logging
import time
from typing import Dict, List, Any, Tuple, Optional
from django.db import transaction
from django.utils import timezone
from django_cfg.modules.django_llm.llm.client import LLMClient
from django.conf import settings

from ..models import Document, DocumentChunk, ProcessingStatus
from ..utils.text_processing import TextProcessor, SemanticChunker
from ..services.embedding import process_document_chunks_optimized
from ..utils.chunk_settings import get_chunking_params_for_type, get_embedding_model

logger = logging.getLogger(__name__)


@dramatiq.actor(
    queue_name="knowledge",
    max_retries=3,
    min_backoff=1000,   # 1 second
    max_backoff=30000,  # 30 seconds
    priority=5
)
def process_document_async(
    document_id: str,
    chunk_size: Optional[int] = None,
    chunk_overlap: Optional[int] = None,
    embedding_model: Optional[str] = None
) -> Dict[str, Any]:
    """
    Process document asynchronously with full pipeline.

    Args:
        document_id: Document UUID to process
        chunk_size: Maximum chunk size in characters (uses Constance setting if None)
        chunk_overlap: Overlap between chunks (uses Constance setting if None)
        embedding_model: Model to use for embeddings (uses Constance setting if None)

    Returns:
        Processing results with statistics
    """
    start_time = time.time()

    try:
        with transaction.atomic():
            # Load document
            document = Document.objects.select_for_update().get(
                id=document_id
            )

            # Update processing status
            document.processing_status = ProcessingStatus.PROCESSING
            document.processing_started_at = timezone.now()
            document.save(update_fields=['processing_status', 'processing_started_at'])

            logger.info(f"Starting document processing: {document_id}")

            # Get dynamic settings from Constance
            chunking_params = get_chunking_params_for_type('document')
            final_chunk_size = chunk_size or chunking_params['chunk_size']
            final_chunk_overlap = chunk_overlap or chunking_params['overlap']
            final_embedding_model = embedding_model or get_embedding_model()

            logger.info(f"Using dynamic settings: chunk_size={final_chunk_size}, overlap={final_chunk_overlap}, model={final_embedding_model}")

            # Initialize services
            text_processor = TextProcessor()
            chunker = SemanticChunker(
                chunk_size=final_chunk_size,
                overlap=final_chunk_overlap
            )

            # Step 1: Clean and preprocess text
            cleaned_content = text_processor.clean_text(document.content)

            # Step 2: Create semantic chunks
            chunks = chunker.create_chunks(cleaned_content)

            logger.info(f"Created {len(chunks)} chunks for document {document_id}")

            # Step 3: Create chunks without embeddings first
            chunk_objects = []
            for idx, chunk_text in enumerate(chunks):
                chunk = DocumentChunk(
                    document=document,
                    user_id=document.user_id,
                    content=chunk_text,
                    chunk_index=idx,
                    character_count=len(chunk_text),
                    embedding_model=final_embedding_model,
                    embedding=[0.0] * 1536,  # Temporary zero vector, will be replaced
                    metadata={
                        "processed_at": timezone.now().isoformat(),
                        "chunk_size": len(chunk_text),
                        "overlap_size": final_chunk_overlap if idx > 0 else 0
                    }
                )
                chunk_objects.append(chunk)

            # Bulk create chunks for performance
            DocumentChunk.objects.bulk_create(
                chunk_objects,
                batch_size=100
            )

            # Step 4: Generate embeddings using optimized processor
            created_chunks = DocumentChunk.objects.filter(document=document).order_by('chunk_index')
            chunks_list = list(created_chunks)
            logger.info(f"🔍 About to process {len(chunks_list)} chunks for embeddings")

            embedding_result = process_document_chunks_optimized(chunks_list)

            logger.info(f"🔍 Embedding result: {embedding_result.successful_chunks}/{embedding_result.total_chunks}")

            total_tokens = embedding_result.total_tokens
            total_cost = embedding_result.total_cost

            logger.info(
                f"Optimized embedding processing: {embedding_result.successful_chunks}/{embedding_result.total_chunks} chunks, "
                f"{total_tokens} tokens, ${total_cost:.4f} cost, {embedding_result.processing_time:.2f}s"
            )

            # Step 5: Update document status
            processing_time = time.time() - start_time
            document.processing_status = ProcessingStatus.COMPLETED
            document.processing_completed_at = timezone.now()
            document.chunks_count = embedding_result.total_chunks
            document.total_tokens = total_tokens
            document.total_cost_usd = total_cost
            document.save(update_fields=[
                'processing_status', 'processing_completed_at', 'chunks_count',
                'total_tokens', 'total_cost_usd'
            ])

            return {
                "document_id": str(document.id),
                "status": document.processing_status.value,
                "chunks_count": document.chunks_count,
                "total_tokens": document.total_tokens,
                "total_cost_usd": document.total_cost_usd,
                "processing_time": processing_time,
                "errors": embedding_result.errors
            }

    except Document.DoesNotExist:
        logger.error(f"Document {document_id} not found.")
        return {
            "document_id": document_id,
            "status": ProcessingStatus.FAILED.value,
            "error": f"Document {document_id} not found."
        }
    except Exception as exc:
        document = Document.objects.filter(id=document_id).first()
        if document:
            document.processing_status = ProcessingStatus.FAILED
            document.processing_completed_at = timezone.now()
            document.processing_error = str(exc)
            document.save(update_fields=['processing_status', 'processing_completed_at', 'processing_error'])
        logger.error(f"Document processing failed for {document_id}: {exc}", exc_info=True)
        raise


def generate_embeddings_batch(
    chunks: List[str],
    document_id: str,
    embedding_model: str = "text-embedding-ada-002",
    batch_size: int = 50
) -> List[Tuple[str, List[float], int, float]]:
    """
    Generate embeddings for text chunks in batches.

    Args:
        chunks: List of text chunks
        document_id: Parent document ID
        embedding_model: Model to use for embeddings
        batch_size: Number of chunks per batch

    Returns:
        List of (chunk_text, embedding, tokens, cost) tuples
    """
    try:
        from django_cfg.apps.knowbase.config.settings import get_openai_api_key, get_openrouter_api_key, get_cache_settings
        cache_settings = get_cache_settings()
        llm_service = LLMClient(
            apikey_openai=get_openai_api_key(),
            apikey_openrouter=get_openrouter_api_key(),
            cache_dir=cache_settings.cache_dir,
            cache_ttl=cache_settings.cache_ttl,
            max_cache_size=cache_settings.max_cache_size
        )
        results = []

        # Process in batches to avoid rate limits
        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i + batch_size]

            for chunk_text in batch:
                # Generate embedding (sync call for simplicity)
                embedding_response = llm_service.generate_embedding(chunk_text, embedding_model)

                # Extract embedding vector from response
                embedding_vector = embedding_response.embedding if embedding_response else []

                # Use tokens and cost from embedding response if available
                tokens = embedding_response.tokens if embedding_response else 0
                cost = embedding_response.cost if embedding_response else 0.0

                # Fallback to manual calculation if needed
                if tokens == 0:
                    tokens = llm_service.count_tokens(chunk_text, embedding_model)
                if cost == 0.0:
                    cost = llm_service.estimate_cost(embedding_model, tokens, 0)

                results.append((
                    chunk_text,
                    embedding_vector,
                    tokens,
                    cost
                ))

                # Small delay between requests to respect rate limits
                time.sleep(0.1)

            # Longer delay between batches
            if i + batch_size < len(chunks):
                time.sleep(1.0)

        logger.info(f"Generated {len(results)} embeddings for document {document_id}")
        return results

    except Exception as exc:
        logger.error(f"Batch embedding generation failed: {exc}")
        raise


@dramatiq.actor(
    queue_name="knowledge",
    max_retries=2,
    priority=7  # Higher priority for reprocessing
)
def reprocess_document_chunks(
    document_id: str,
    new_chunk_size: int = None,
    new_embedding_model: str = None
) -> Dict[str, Any]:
    """
    Reprocess existing document with new parameters.

    Args:
        document_id: Document to reprocess
        new_chunk_size: New chunk size (optional)
        new_embedding_model: New embedding model (optional)

    Returns:
        Reprocessing results
    """
    try:
        with transaction.atomic():
            document = Document.objects.get(id=document_id)

            # Delete existing chunks
            DocumentChunk.objects.filter(
                document=document
            ).delete()

            # Reset document status
            document.processing_status = ProcessingStatus.PENDING
            document.chunks_count = 0
            document.total_tokens = 0
            document.processing_error = ""
            document.save(update_fields=[
                'processing_status', 'processing_started_at', 'processing_completed_at',
                'processing_error', 'chunks_count', 'total_tokens', 'total_cost_usd'
            ])

            # Trigger reprocessing
            return process_document_async(
                document_id=document_id,
                chunk_size=new_chunk_size or 1000,
                embedding_model=new_embedding_model or "text-embedding-ada-002"
            )

    except Exception as exc:
        logger.error(f"Reprocessing failed for {document_id}: {exc}")
        raise


@dramatiq.actor(
    queue_name="knowledge",
    max_retries=2,
    priority=4
)
def optimize_document_embeddings(document_id: str) -> Dict[str, Any]:
    """
    Post-processing optimization for document embeddings.

    Args:
        document_id: Document to optimize

    Returns:
        Optimization results
    """
    try:
        # Update vector index statistics
        from django.db import connection

        with connection.cursor() as cursor:
            # Always analyze the table
            cursor.execute("ANALYZE django_cfg_knowbase_document_chunks;")

            # Check if index exists before trying to reindex
            cursor.execute("""
                SELECT EXISTS (
                    SELECT FROM pg_indexes
                    WHERE indexname = 'embedding_cosine_idx'
                );
            """)
            index_exists = cursor.fetchone()[0]

            if index_exists:
                cursor.execute("REINDEX INDEX embedding_cosine_idx;")
                logger.debug("Reindexed embedding_cosine_idx")
            else:
                logger.warning("embedding_cosine_idx index does not exist, skipping reindex")

        logger.info(f"Optimized embeddings for document {document_id}")

        return {
            "status": "optimized",
            "document_id": document_id,
            "timestamp": timezone.now().isoformat()
        }

    except Exception as exc:
        logger.error(f"Embedding optimization failed for {document_id}: {exc}")
        raise
```
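The document pipeline resolves `chunk_size`, `chunk_overlap`, and `embedding_model` from Constance-backed settings when they are not passed explicitly, so the common case is to enqueue the task with only the document ID. A minimal caller-side sketch, assuming a configured Dramatiq broker (the helper names and the override values such as `new_chunk_size=800` are illustrative, not part of the package):

```python
# Hypothetical caller-side sketch, not part of the packaged code.
from django_cfg.apps.knowbase.tasks.document_processing import (
    process_document_async,
    reprocess_document_chunks,
)

def enqueue_document_processing(document):
    # Rely on the Constance-backed defaults for chunking and embedding model.
    process_document_async.send(str(document.id))

def enqueue_reprocessing(document):
    # Override chunking parameters, e.g. after changing the embedding model;
    # Dramatiq forwards keyword arguments to the actor when the message runs.
    reprocess_document_chunks.send(
        str(document.id),
        new_chunk_size=800,
        new_embedding_model="text-embedding-ada-002",
    )
```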