django-cfg 1.1.81__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- django_cfg/__init__.py +20 -448
- django_cfg/apps/accounts/README.md +3 -3
- django_cfg/apps/accounts/admin/__init__.py +0 -2
- django_cfg/apps/accounts/admin/activity.py +2 -9
- django_cfg/apps/accounts/admin/filters.py +0 -42
- django_cfg/apps/accounts/admin/inlines.py +8 -8
- django_cfg/apps/accounts/admin/otp.py +5 -5
- django_cfg/apps/accounts/admin/registration_source.py +1 -8
- django_cfg/apps/accounts/admin/user.py +12 -20
- django_cfg/apps/accounts/managers/user_manager.py +2 -129
- django_cfg/apps/accounts/migrations/0006_remove_twilioresponse_otp_secret_and_more.py +46 -0
- django_cfg/apps/accounts/models.py +3 -123
- django_cfg/apps/accounts/serializers/otp.py +40 -44
- django_cfg/apps/accounts/serializers/profile.py +0 -2
- django_cfg/apps/accounts/services/otp_service.py +98 -186
- django_cfg/apps/accounts/signals.py +25 -15
- django_cfg/apps/accounts/utils/auth_email_service.py +84 -0
- django_cfg/apps/accounts/views/otp.py +35 -36
- django_cfg/apps/agents/README.md +129 -0
- django_cfg/apps/agents/__init__.py +68 -0
- django_cfg/apps/agents/admin/__init__.py +17 -0
- django_cfg/apps/agents/admin/execution_admin.py +460 -0
- django_cfg/apps/agents/admin/registry_admin.py +360 -0
- django_cfg/apps/agents/admin/toolsets_admin.py +482 -0
- django_cfg/apps/agents/apps.py +29 -0
- django_cfg/apps/agents/core/__init__.py +20 -0
- django_cfg/apps/agents/core/agent.py +281 -0
- django_cfg/apps/agents/core/dependencies.py +154 -0
- django_cfg/apps/agents/core/exceptions.py +66 -0
- django_cfg/apps/agents/core/models.py +106 -0
- django_cfg/apps/agents/core/orchestrator.py +391 -0
- django_cfg/apps/agents/examples/__init__.py +3 -0
- django_cfg/apps/agents/examples/simple_example.py +161 -0
- django_cfg/apps/agents/integration/__init__.py +14 -0
- django_cfg/apps/agents/integration/middleware.py +80 -0
- django_cfg/apps/agents/integration/registry.py +345 -0
- django_cfg/apps/agents/integration/signals.py +50 -0
- django_cfg/apps/agents/management/__init__.py +3 -0
- django_cfg/apps/agents/management/commands/__init__.py +3 -0
- django_cfg/apps/agents/management/commands/create_agent.py +365 -0
- django_cfg/apps/agents/management/commands/orchestrator_status.py +191 -0
- django_cfg/apps/agents/managers/__init__.py +23 -0
- django_cfg/apps/agents/managers/execution.py +236 -0
- django_cfg/apps/agents/managers/registry.py +254 -0
- django_cfg/apps/agents/managers/toolsets.py +496 -0
- django_cfg/apps/agents/migrations/0001_initial.py +286 -0
- django_cfg/apps/agents/migrations/__init__.py +5 -0
- django_cfg/apps/agents/models/__init__.py +15 -0
- django_cfg/apps/agents/models/execution.py +215 -0
- django_cfg/apps/agents/models/registry.py +220 -0
- django_cfg/apps/agents/models/toolsets.py +305 -0
- django_cfg/apps/agents/patterns/__init__.py +24 -0
- django_cfg/apps/agents/patterns/content_agents.py +234 -0
- django_cfg/apps/agents/toolsets/__init__.py +15 -0
- django_cfg/apps/agents/toolsets/cache_toolset.py +285 -0
- django_cfg/apps/agents/toolsets/django_toolset.py +220 -0
- django_cfg/apps/agents/toolsets/file_toolset.py +324 -0
- django_cfg/apps/agents/toolsets/orm_toolset.py +319 -0
- django_cfg/apps/agents/urls.py +46 -0
- django_cfg/apps/knowbase/README.md +150 -0
- django_cfg/apps/knowbase/__init__.py +27 -0
- django_cfg/apps/knowbase/admin/__init__.py +23 -0
- django_cfg/apps/knowbase/admin/archive_admin.py +857 -0
- django_cfg/apps/knowbase/admin/chat_admin.py +386 -0
- django_cfg/apps/knowbase/admin/document_admin.py +650 -0
- django_cfg/apps/knowbase/admin/external_data_admin.py +685 -0
- django_cfg/apps/knowbase/apps.py +81 -0
- django_cfg/apps/knowbase/config/README.md +176 -0
- django_cfg/apps/knowbase/config/__init__.py +51 -0
- django_cfg/apps/knowbase/config/constance_fields.py +186 -0
- django_cfg/apps/knowbase/config/constance_settings.py +200 -0
- django_cfg/apps/knowbase/config/settings.py +444 -0
- django_cfg/apps/knowbase/examples/__init__.py +3 -0
- django_cfg/apps/knowbase/examples/external_data_usage.py +191 -0
- django_cfg/apps/knowbase/management/__init__.py +0 -0
- django_cfg/apps/knowbase/management/commands/__init__.py +0 -0
- django_cfg/apps/knowbase/management/commands/knowbase_stats.py +158 -0
- django_cfg/apps/knowbase/management/commands/setup_knowbase.py +59 -0
- django_cfg/apps/knowbase/managers/__init__.py +22 -0
- django_cfg/apps/knowbase/managers/archive.py +426 -0
- django_cfg/apps/knowbase/managers/base.py +32 -0
- django_cfg/apps/knowbase/managers/chat.py +141 -0
- django_cfg/apps/knowbase/managers/document.py +203 -0
- django_cfg/apps/knowbase/managers/external_data.py +471 -0
- django_cfg/apps/knowbase/migrations/0001_initial.py +427 -0
- django_cfg/apps/knowbase/migrations/0002_archiveitem_archiveitemchunk_documentarchive_and_more.py +434 -0
- django_cfg/apps/knowbase/migrations/__init__.py +5 -0
- django_cfg/apps/knowbase/mixins/__init__.py +15 -0
- django_cfg/apps/knowbase/mixins/config.py +108 -0
- django_cfg/apps/knowbase/mixins/creator.py +81 -0
- django_cfg/apps/knowbase/mixins/examples/vehicle_model_example.py +199 -0
- django_cfg/apps/knowbase/mixins/external_data_mixin.py +813 -0
- django_cfg/apps/knowbase/mixins/service.py +362 -0
- django_cfg/apps/knowbase/models/__init__.py +41 -0
- django_cfg/apps/knowbase/models/archive.py +599 -0
- django_cfg/apps/knowbase/models/base.py +58 -0
- django_cfg/apps/knowbase/models/chat.py +157 -0
- django_cfg/apps/knowbase/models/document.py +267 -0
- django_cfg/apps/knowbase/models/external_data.py +376 -0
- django_cfg/apps/knowbase/serializers/__init__.py +68 -0
- django_cfg/apps/knowbase/serializers/archive_serializers.py +386 -0
- django_cfg/apps/knowbase/serializers/chat_serializers.py +137 -0
- django_cfg/apps/knowbase/serializers/document_serializers.py +94 -0
- django_cfg/apps/knowbase/serializers/external_data_serializers.py +256 -0
- django_cfg/apps/knowbase/serializers/public_serializers.py +74 -0
- django_cfg/apps/knowbase/services/__init__.py +40 -0
- django_cfg/apps/knowbase/services/archive/__init__.py +42 -0
- django_cfg/apps/knowbase/services/archive/archive_service.py +541 -0
- django_cfg/apps/knowbase/services/archive/chunking_service.py +791 -0
- django_cfg/apps/knowbase/services/archive/exceptions.py +52 -0
- django_cfg/apps/knowbase/services/archive/extraction_service.py +508 -0
- django_cfg/apps/knowbase/services/archive/vectorization_service.py +362 -0
- django_cfg/apps/knowbase/services/base.py +53 -0
- django_cfg/apps/knowbase/services/chat_service.py +239 -0
- django_cfg/apps/knowbase/services/document_service.py +144 -0
- django_cfg/apps/knowbase/services/embedding/__init__.py +43 -0
- django_cfg/apps/knowbase/services/embedding/async_processor.py +244 -0
- django_cfg/apps/knowbase/services/embedding/batch_processor.py +250 -0
- django_cfg/apps/knowbase/services/embedding/batch_result.py +61 -0
- django_cfg/apps/knowbase/services/embedding/models.py +229 -0
- django_cfg/apps/knowbase/services/embedding/processors.py +148 -0
- django_cfg/apps/knowbase/services/embedding/utils.py +176 -0
- django_cfg/apps/knowbase/services/prompt_builder.py +191 -0
- django_cfg/apps/knowbase/services/search_service.py +293 -0
- django_cfg/apps/knowbase/signals/__init__.py +21 -0
- django_cfg/apps/knowbase/signals/archive_signals.py +211 -0
- django_cfg/apps/knowbase/signals/chat_signals.py +37 -0
- django_cfg/apps/knowbase/signals/document_signals.py +143 -0
- django_cfg/apps/knowbase/signals/external_data_signals.py +157 -0
- django_cfg/apps/knowbase/tasks/__init__.py +39 -0
- django_cfg/apps/knowbase/tasks/archive_tasks.py +316 -0
- django_cfg/apps/knowbase/tasks/document_processing.py +341 -0
- django_cfg/apps/knowbase/tasks/external_data_tasks.py +341 -0
- django_cfg/apps/knowbase/tasks/maintenance.py +195 -0
- django_cfg/apps/knowbase/urls.py +43 -0
- django_cfg/apps/knowbase/utils/__init__.py +12 -0
- django_cfg/apps/knowbase/utils/chunk_settings.py +261 -0
- django_cfg/apps/knowbase/utils/text_processing.py +375 -0
- django_cfg/apps/knowbase/utils/validation.py +99 -0
- django_cfg/apps/knowbase/views/__init__.py +28 -0
- django_cfg/apps/knowbase/views/archive_views.py +469 -0
- django_cfg/apps/knowbase/views/base.py +49 -0
- django_cfg/apps/knowbase/views/chat_views.py +181 -0
- django_cfg/apps/knowbase/views/document_views.py +183 -0
- django_cfg/apps/knowbase/views/public_views.py +129 -0
- django_cfg/apps/leads/admin.py +70 -0
- django_cfg/apps/newsletter/admin.py +234 -0
- django_cfg/apps/newsletter/admin_filters.py +124 -0
- django_cfg/apps/support/admin.py +196 -0
- django_cfg/apps/support/admin_filters.py +71 -0
- django_cfg/apps/support/templates/support/chat/ticket_chat.html +1 -1
- django_cfg/apps/urls.py +5 -4
- django_cfg/cli/README.md +1 -1
- django_cfg/cli/commands/create_project.py +2 -2
- django_cfg/cli/commands/info.py +1 -1
- django_cfg/config.py +44 -0
- django_cfg/core/config.py +29 -82
- django_cfg/core/environment.py +1 -1
- django_cfg/core/generation.py +19 -107
- django_cfg/{integration.py → core/integration.py} +18 -16
- django_cfg/core/validation.py +1 -1
- django_cfg/management/__init__.py +1 -1
- django_cfg/management/commands/__init__.py +1 -1
- django_cfg/management/commands/auto_generate.py +482 -0
- django_cfg/management/commands/migrator.py +19 -101
- django_cfg/management/commands/test_email.py +1 -1
- django_cfg/middleware/README.md +0 -158
- django_cfg/middleware/__init__.py +0 -2
- django_cfg/middleware/user_activity.py +3 -3
- django_cfg/models/api.py +145 -0
- django_cfg/models/base.py +287 -0
- django_cfg/models/cache.py +4 -4
- django_cfg/models/constance.py +25 -88
- django_cfg/models/database.py +9 -9
- django_cfg/models/drf.py +3 -36
- django_cfg/models/email.py +163 -0
- django_cfg/models/environment.py +276 -0
- django_cfg/models/limits.py +1 -1
- django_cfg/models/logging.py +366 -0
- django_cfg/models/revolution.py +41 -2
- django_cfg/models/security.py +125 -0
- django_cfg/models/services.py +1 -1
- django_cfg/modules/__init__.py +2 -56
- django_cfg/modules/base.py +78 -52
- django_cfg/modules/django_currency/service.py +2 -2
- django_cfg/modules/django_email.py +2 -2
- django_cfg/modules/django_health.py +267 -0
- django_cfg/modules/django_llm/llm/client.py +79 -17
- django_cfg/modules/django_llm/translator/translator.py +2 -2
- django_cfg/modules/django_logger.py +2 -2
- django_cfg/modules/django_ngrok.py +2 -2
- django_cfg/modules/django_tasks.py +68 -3
- django_cfg/modules/django_telegram.py +3 -3
- django_cfg/modules/django_twilio/sendgrid_service.py +2 -2
- django_cfg/modules/django_twilio/service.py +2 -2
- django_cfg/modules/django_twilio/simple_service.py +2 -2
- django_cfg/modules/django_twilio/templates/guide.md +266 -0
- django_cfg/modules/django_twilio/twilio_service.py +2 -2
- django_cfg/modules/django_unfold/__init__.py +69 -0
- django_cfg/modules/{unfold → django_unfold}/callbacks.py +23 -22
- django_cfg/modules/django_unfold/dashboard.py +278 -0
- django_cfg/modules/django_unfold/icons/README.md +145 -0
- django_cfg/modules/django_unfold/icons/__init__.py +12 -0
- django_cfg/modules/django_unfold/icons/constants.py +2851 -0
- django_cfg/modules/django_unfold/icons/generate_icons.py +486 -0
- django_cfg/modules/django_unfold/models/__init__.py +42 -0
- django_cfg/modules/django_unfold/models/config.py +601 -0
- django_cfg/modules/django_unfold/models/dashboard.py +206 -0
- django_cfg/modules/django_unfold/models/dropdown.py +40 -0
- django_cfg/modules/django_unfold/models/navigation.py +73 -0
- django_cfg/modules/django_unfold/models/tabs.py +25 -0
- django_cfg/modules/{unfold → django_unfold}/system_monitor.py +2 -2
- django_cfg/modules/django_unfold/utils.py +140 -0
- django_cfg/registry/__init__.py +23 -0
- django_cfg/registry/core.py +61 -0
- django_cfg/registry/exceptions.py +11 -0
- django_cfg/registry/modules.py +12 -0
- django_cfg/registry/services.py +26 -0
- django_cfg/registry/third_party.py +52 -0
- django_cfg/routing/__init__.py +19 -0
- django_cfg/routing/callbacks.py +198 -0
- django_cfg/routing/routers.py +48 -0
- django_cfg/templates/admin/layouts/dashboard_with_tabs.html +8 -9
- django_cfg/templatetags/__init__.py +0 -0
- django_cfg/templatetags/django_cfg.py +33 -0
- django_cfg/urls.py +33 -0
- django_cfg/utils/path_resolution.py +1 -1
- django_cfg/utils/smart_defaults.py +7 -61
- django_cfg/utils/toolkit.py +663 -0
- {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/METADATA +83 -86
- django_cfg-1.2.0.dist-info/RECORD +441 -0
- django_cfg/apps/tasks/@docs/README.md +0 -195
- django_cfg/archive/django_sample.zip +0 -0
- django_cfg/models/unfold.py +0 -271
- django_cfg/modules/unfold/__init__.py +0 -29
- django_cfg/modules/unfold/dashboard.py +0 -318
- django_cfg/pyproject.toml +0 -370
- django_cfg/routers.py +0 -83
- django_cfg-1.1.81.dist-info/RECORD +0 -278
- /django_cfg/{exceptions.py → core/exceptions.py} +0 -0
- /django_cfg/modules/{unfold → django_unfold}/models.py +0 -0
- /django_cfg/modules/{unfold → django_unfold}/tailwind.py +0 -0
- /django_cfg/{version_check.py → utils/version_check.py} +0 -0
- {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/WHEEL +0 -0
- {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/entry_points.txt +0 -0
- {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,144 @@
|
|
1
|
+
"""
|
2
|
+
Document management service.
|
3
|
+
"""
|
4
|
+
|
5
|
+
from typing import List, Optional, Dict, Any
|
6
|
+
from django.db import transaction, models
|
7
|
+
from django.utils import timezone
|
8
|
+
from ..models import Document, DocumentChunk, ProcessingStatus
|
9
|
+
from .base import BaseService
|
10
|
+
|
11
|
+
|
12
|
+
class DocumentService(BaseService):
    """Service for creating, querying, deleting and reprocessing documents.

    Every query in this class is filtered by ``self.user``, so callers only
    ever reach documents owned by that user. ``self.user`` and
    ``_generate_content_hash`` are not defined here; presumably they come
    from ``BaseService`` -- confirm against that class.
    """

    def create_document(
        self,
        title: str,
        content: str,
        file_type: str = "text/plain",
        metadata: Optional[Dict[str, Any]] = None
    ) -> Document:
        """Create a document in the PENDING processing state.

        Args:
            title: Title stored on the document.
            content: Full text; also used to derive the content hash and the
                stored ``file_size`` (UTF-8 byte length).
            file_type: Stored as-is on the document.
            metadata: Optional metadata; an empty dict is stored when omitted.

        Returns:
            The newly created ``Document``.

        Raises:
            ValueError: If this user already has a document with the same
                content hash.
        """
        # The content hash is the duplicate-detection key.
        content_hash = self._generate_content_hash(content)

        duplicate = Document.objects.filter(
            user=self.user,
            content_hash=content_hash,
        ).first()
        if duplicate:
            raise ValueError(f"Document with same content already exists: {duplicate.title}")

        # No task is enqueued here: processing is expected to be started by a
        # post_save signal handler (not visible in this file).
        return Document.objects.create(
            user=self.user,
            title=title,
            content=content,
            content_hash=content_hash,
            file_type=file_type,
            file_size=len(content.encode('utf-8')),
            metadata=metadata or {},
            processing_status=ProcessingStatus.PENDING,
        )

    def get_document(self, document_id: str) -> Optional[Document]:
        """Return the user's document with this ID, or ``None``.

        ``None`` is returned when no matching document exists for
        ``self.user`` and also when ``document_id`` cannot be converted to the
        primary-key type (the ORM reports that as ``ValidationError`` or
        ``ValueError`` rather than ``DoesNotExist``).
        """
        # Local import keeps this fix self-contained within the class.
        from django.core.exceptions import ValidationError

        try:
            return Document.objects.get(id=document_id, user=self.user)
        except (Document.DoesNotExist, ValueError, ValidationError):
            return None

    def get_user_documents(self, status: Optional[str] = None):
        """Return a queryset of the user's documents, newest first.

        Args:
            status: When truthy, only documents whose ``processing_status``
                equals this value are included.
        """
        queryset = Document.objects.filter(user=self.user)

        if status:
            queryset = queryset.filter(processing_status=status)

        return queryset.order_by('-created_at')

    def list_documents(
        self,
        status: Optional[str] = None,
        limit: int = 20,
        offset: int = 0
    ) -> List[Document]:
        """Return one page of the user's documents as a list.

        Args:
            status: Optional ``processing_status`` filter.
            limit: Maximum number of documents to return.
            offset: Number of documents to skip from the start of the ordering.
        """
        queryset = self.get_user_documents(status)
        return list(queryset[offset:offset + limit])

    def delete_document(self, document_id: str) -> bool:
        """Delete the user's document and its chunks.

        Returns:
            ``True`` when the document was found and deleted, ``False`` when
            the user has no such document (including malformed IDs).
        """
        document = self.get_document(document_id)
        if document is None:
            return False

        # Chunks and document are removed in one transaction so a failure
        # cannot leave the document without its chunks.
        with transaction.atomic():
            DocumentChunk.objects.filter(document=document).delete()
            document.delete()

        return True

    def get_processing_stats(self) -> Dict[str, Any]:
        """Return aggregate processing statistics for the user's documents.

        Returns:
            Dict with ``total_documents``, ``completed_documents``,
            ``processing_success_rate`` (percentage, ``0`` when the user has
            no documents), ``total_chunks``, ``total_tokens``,
            ``total_cost_usd`` and ``avg_processing_time_seconds`` (always
            ``0.0`` here).
        """
        from django.db.models import Count, Sum

        stats = Document.objects.filter(user=self.user).aggregate(
            total_documents=Count('id'),
            completed_documents=Count('id', filter=models.Q(processing_status=ProcessingStatus.COMPLETED)),
            total_chunks=Sum('chunks_count'),
            total_tokens=Sum('total_tokens'),
            total_cost=Sum('total_cost_usd'),
        )

        return {
            'total_documents': stats['total_documents'] or 0,
            'completed_documents': stats['completed_documents'] or 0,
            'processing_success_rate': (
                (stats['completed_documents'] / stats['total_documents'] * 100)
                if stats['total_documents'] > 0 else 0
            ),
            # Sum() yields None over an empty set, hence the "or 0" fallbacks.
            'total_chunks': stats['total_chunks'] or 0,
            'total_tokens': stats['total_tokens'] or 0,
            'total_cost_usd': float(stats['total_cost'] or 0),
            'avg_processing_time_seconds': 0.0  # Calculated separately if needed
        }

    def reprocess_document(self, document_id: str) -> bool:
        """Reset the document to PENDING and enqueue chunk reprocessing.

        Returns:
            ``True`` when the document was found and the task was sent,
            ``False`` when the user has no such document (including malformed
            IDs).
        """
        document = self.get_document(document_id)
        if document is None:
            return False

        # Clear the previous outcome before the task runs.
        document.processing_status = ProcessingStatus.PENDING
        document.processing_error = ""
        document.save()

        # Imported inside the method, as in the original code (presumably to
        # avoid a circular import between services and tasks -- confirm).
        from ..tasks import reprocess_document_chunks
        reprocess_document_chunks.send(str(document.id))

        return True
|
@@ -0,0 +1,43 @@
|
|
1
|
+
"""
|
2
|
+
Embedding processing services.
|
3
|
+
|
4
|
+
This package provides high-performance embedding generation services
|
5
|
+
for documents and archives with batch processing and async support.
|
6
|
+
"""
|
7
|
+
|
8
|
+
from .models import ChunkData, EmbeddingResult, BatchProcessingResult, ProcessingConfig, ChunkType
|
9
|
+
from .processors import DocumentChunkProcessor, ArchiveChunkProcessor, ExternalDataChunkProcessor
|
10
|
+
from .batch_processor import OptimizedEmbeddingProcessor
|
11
|
+
from .async_processor import AsyncOptimizedEmbeddingProcessor
|
12
|
+
from .batch_result import BatchResultBuilder
|
13
|
+
from .utils import (
|
14
|
+
process_document_chunks_optimized,
|
15
|
+
process_archive_chunks_optimized,
|
16
|
+
process_external_data_chunks_optimized,
|
17
|
+
process_chunks_context_aware,
|
18
|
+
)
|
19
|
+
|
20
|
+
# Names re-exported by the embedding package, grouped by role.
__all__ = [
    # Data models
    "ChunkData", "EmbeddingResult", "BatchProcessingResult", "ProcessingConfig", "ChunkType",

    # Processors
    "DocumentChunkProcessor", "ArchiveChunkProcessor", "ExternalDataChunkProcessor",
    "OptimizedEmbeddingProcessor", "AsyncOptimizedEmbeddingProcessor",

    # Utilities
    "BatchResultBuilder",

    # Convenience functions
    "process_document_chunks_optimized",
    "process_archive_chunks_optimized",
    "process_external_data_chunks_optimized",
    "process_chunks_context_aware",
]
|
@@ -0,0 +1,244 @@
|
|
1
|
+
"""
|
2
|
+
Async-compatible embedding processor for Django 5.2.
|
3
|
+
|
4
|
+
This module provides async/sync compatibility for embedding generation,
|
5
|
+
following Django 5.2 async best practices.
|
6
|
+
"""
|
7
|
+
|
8
|
+
import asyncio
|
9
|
+
import time
|
10
|
+
import logging
|
11
|
+
from typing import List, Dict, Any, Optional, Union
|
12
|
+
|
13
|
+
from asgiref.sync import sync_to_async, async_to_sync
|
14
|
+
from django.db import transaction
|
15
|
+
|
16
|
+
from django_cfg.apps.knowbase.models import DocumentChunk, ArchiveItemChunk
|
17
|
+
from django_cfg.apps.knowbase.utils.chunk_settings import get_embedding_batch_size, get_embedding_model
|
18
|
+
|
19
|
+
from .models import ChunkData, BatchProcessingResult, EmbeddingResult
|
20
|
+
from .batch_processor import OptimizedEmbeddingProcessor
|
21
|
+
|
22
|
+
logger = logging.getLogger(__name__)
|
23
|
+
|
24
|
+
|
25
|
+
def is_async_context() -> bool:
    """Return True when an asyncio event loop is running in the current thread."""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # get_running_loop() raises RuntimeError when no loop is running.
        return False
    else:
        return True
|
32
|
+
|
33
|
+
|
34
|
+
class AsyncOptimizedEmbeddingProcessor(OptimizedEmbeddingProcessor):
    """
    Embedding processor usable from both synchronous and asynchronous callers.

    Adds coroutine variants of the batch pipeline on top of
    ``OptimizedEmbeddingProcessor`` (database writes go through the ORM's
    a-prefixed methods ``aget`` / ``asave``) plus a synchronous entry point
    that picks the right pipeline for the calling context.
    """

    def __init__(self, batch_size: Optional[int] = None, embedding_model: Optional[str] = None):
        """Initialize the processor; both arguments are forwarded to the base class."""
        super().__init__(batch_size, embedding_model)
        logger.info(f"🚀 AsyncOptimizedEmbeddingProcessor initialized: async_context={is_async_context()}")

    async def aprocess_chunks_batch(self, chunks: List[ChunkData]) -> BatchProcessingResult:
        """
        Coroutine variant of batch processing.

        Splits ``chunks`` into slices of ``self.batch_size``, processes each
        slice with ``_aprocess_single_batch`` and accumulates the counters. A
        slice that raises is recorded as fully failed and processing continues
        with the next slice.

        Args:
            chunks: Chunks to embed.

        Returns:
            ``BatchProcessingResult`` holding the aggregated counts, token and
            cost totals, elapsed wall-clock time and collected error messages.
        """
        start_time = time.time()
        total_chunks = len(chunks)
        successful_chunks = 0
        failed_chunks = 0
        total_tokens = 0
        total_cost = 0.0
        errors = []

        logger.info(f"🔮 Starting async batch processing of {total_chunks} chunks")

        # Process in batches
        for i in range(0, total_chunks, self.batch_size):
            batch = chunks[i:i + self.batch_size]
            batch_num = (i // self.batch_size) + 1
            # Ceiling division: number of slices needed to cover all chunks.
            total_batches = (total_chunks + self.batch_size - 1) // self.batch_size

            logger.info(f"🔮 Processing async batch {batch_num}/{total_batches} ({len(batch)} chunks)")

            try:
                batch_result = await self._aprocess_single_batch(batch)

                successful_chunks += batch_result['successful']
                failed_chunks += batch_result['failed']
                total_tokens += batch_result['tokens']
                total_cost += batch_result['cost']
                errors.extend(batch_result['errors'])

                # Short pause between batches; skipped after the last one.
                if i + self.batch_size < total_chunks:
                    await asyncio.sleep(0.5)

            except Exception as e:
                error_msg = f"Async batch {batch_num} failed: {str(e)}"
                logger.error(f"❌ {error_msg}")
                errors.append(error_msg)
                failed_chunks += len(batch)

        processing_time = time.time() - start_time

        result = BatchProcessingResult(
            total_chunks=total_chunks,
            successful_chunks=successful_chunks,
            failed_chunks=failed_chunks,
            total_tokens=total_tokens,
            total_cost=total_cost,
            processing_time=processing_time,
            errors=errors
        )

        logger.info(
            f"🎉 Async batch processing completed: {successful_chunks}/{total_chunks} successful, "
            f"{total_tokens} tokens, ${total_cost:.4f} cost, {processing_time:.2f}s"
        )

        return result

    async def _aprocess_single_batch(self, batch: List[ChunkData]) -> Dict[str, Any]:
        """
        Embed one slice of chunks and persist the results.

        Returns:
            Dict with the keys ``successful``, ``failed``, ``tokens``, ``cost``
            and ``errors``. Every chunk of ``batch`` is counted exactly once as
            successful or failed, including chunks that were skipped before
            the embedding call (unknown type, empty content, preparation
            error) or for which no result came back.
        """
        prepared_contents: List[str] = []
        prepared_chunks: List[ChunkData] = []  # parallel to prepared_contents
        skipped_errors: List[str] = []

        for chunk in batch:
            processor = self.processors.get(chunk.parent_type)
            if not processor:
                logger.warning(f"⚠️ Unknown chunk type: {chunk.parent_type}")
                skipped_errors.append(f"Unknown chunk type: {chunk.parent_type}")
                continue

            try:
                content = processor.prepare_content_for_embedding(chunk)
                if content and content.strip():
                    prepared_contents.append(content)
                    prepared_chunks.append(chunk)
                else:
                    logger.warning(f"⚠️ Empty content for chunk {chunk.id}")
                    skipped_errors.append(f"Empty content for chunk {chunk.id}")
            except Exception as e:
                logger.error(f"❌ Failed to prepare content for chunk {chunk.id}: {e}")
                skipped_errors.append(f"Failed to prepare content for chunk {chunk.id}: {e}")

        if not prepared_contents:
            return {
                'successful': 0,
                'failed': len(batch),
                'tokens': 0,
                'cost': 0.0,
                'errors': ['No valid content to process']
            }

        try:
            # _generate_batch_embeddings is a blocking call (the original
            # comment notes the OpenAI client is sync), so it is run through
            # sync_to_async instead of stalling the event loop.
            embedding_results = list(
                await sync_to_async(self._generate_batch_embeddings)(prepared_contents)
            )

            successful = 0
            # Chunks dropped during preparation count as failures too.
            failed = len(skipped_errors)
            total_tokens = 0
            total_cost = 0.0
            errors = list(skipped_errors)

            # Results are matched to chunks by position; surplus results are ignored.
            for chunk, embedding_result in zip(prepared_chunks, embedding_results):
                if embedding_result.success:
                    try:
                        await self._asave_embedding_result(chunk, embedding_result)
                        successful += 1
                        total_tokens += embedding_result.tokens
                        total_cost += embedding_result.cost
                    except Exception as e:
                        error_msg = f"Failed to save async embedding for chunk {chunk.id}: {e}"
                        logger.error(f"❌ {error_msg}")
                        errors.append(error_msg)
                        failed += 1
                else:
                    errors.append(embedding_result.error or f"Failed to generate embedding for chunk {chunk.id}")
                    failed += 1

            # Prepared chunks for which the API returned no result at all.
            for chunk in prepared_chunks[len(embedding_results):]:
                errors.append(f"Failed to generate embedding for chunk {chunk.id}")
                failed += 1

            return {
                'successful': successful,
                'failed': failed,
                'tokens': total_tokens,
                'cost': total_cost,
                'errors': errors
            }

        except Exception as e:
            error_msg = f"Async batch embedding generation failed: {e}"
            logger.error(f"❌ {error_msg}")
            return {
                'successful': 0,
                'failed': len(batch),
                'tokens': 0,
                'cost': 0.0,
                'errors': [error_msg]
            }

    async def _asave_embedding_result(self, chunk: ChunkData, result: EmbeddingResult) -> None:
        """
        Persist one embedding result through the ORM's async methods.

        Document chunks get their embedding, token count and cost updated.
        Archive chunks get the same update and additionally add the tokens and
        cost to their parent item's running totals.

        Raises:
            ValueError: If ``chunk.parent_type`` is neither ``"document"`` nor
                ``"archive"``.
            Exception: Any database error is logged and re-raised.
        """
        # NOTE(review): the base class registers a processor for external-data
        # chunks as well, but there is no branch for them here, so such chunks
        # end in the ValueError below -- confirm whether that is intended.
        try:
            if chunk.parent_type == "document":
                chunk_obj = await DocumentChunk.objects.aget(id=chunk.id)
                chunk_obj.embedding = result.embedding
                chunk_obj.token_count = result.tokens
                chunk_obj.embedding_cost = result.cost
                await chunk_obj.asave(update_fields=['embedding', 'token_count', 'embedding_cost'])

                logger.debug(f"✅ Async document chunk {chunk.id} embedding saved: {result.tokens} tokens, ${result.cost:.4f}")

            elif chunk.parent_type == "archive":
                # select_related loads the parent item in the same query, so
                # reading chunk_obj.item below needs no further database access.
                chunk_obj = await ArchiveItemChunk.objects.select_related('item').aget(id=chunk.id)
                chunk_obj.embedding = result.embedding
                chunk_obj.token_count = result.tokens
                chunk_obj.embedding_cost = result.cost
                await chunk_obj.asave(update_fields=['embedding', 'token_count', 'embedding_cost'])

                # Update parent item statistics
                item = chunk_obj.item
                item.total_tokens += result.tokens
                item.processing_cost += result.cost
                await item.asave(update_fields=['total_tokens', 'processing_cost'])

                logger.debug(f"✅ Async archive chunk {chunk.id} embedding saved: {result.tokens} tokens, ${result.cost:.4f}")
            else:
                raise ValueError(f"Unknown chunk type: {chunk.parent_type}")

        except Exception as e:
            logger.error(f"❌ Failed to save async embedding for chunk {chunk.id}: {e}")
            raise

    def process_chunks_batch_context_aware(self, chunks: List[ChunkData]) -> BatchProcessingResult:
        """
        Synchronous entry point that works with or without a running event loop.

        Without a running loop the inherited synchronous pipeline is used.
        With a running loop in the current thread the async pipeline is run to
        completion on a private event loop in a helper thread. This call
        blocks the calling thread until processing finishes, so coroutine
        code should prefer ``await aprocess_chunks_batch(...)``.

        Args:
            chunks: Chunks to embed.

        Returns:
            ``BatchProcessingResult`` from whichever pipeline ran.
        """
        if is_async_context():
            logger.info("🔮 Detected async context - using async processing")

            # async_to_sync() raises RuntimeError when called in a thread that
            # already has a running event loop, so the coroutine is driven by
            # asyncio.run() in a separate thread instead.
            from concurrent.futures import ThreadPoolExecutor

            def _run_on_private_loop() -> BatchProcessingResult:
                return asyncio.run(self.aprocess_chunks_batch(chunks))

            with ThreadPoolExecutor(max_workers=1) as executor:
                return executor.submit(_run_on_private_loop).result()

        logger.info("🔮 Detected sync context - using sync processing")
        return super().process_chunks_batch(chunks)
|
@@ -0,0 +1,250 @@
|
|
1
|
+
"""
|
2
|
+
High-performance batch embedding processor.
|
3
|
+
|
4
|
+
This module provides the main batch processing engine for generating
|
5
|
+
embeddings with optimized API calls and database operations.
|
6
|
+
"""
|
7
|
+
|
8
|
+
import time
|
9
|
+
import logging
|
10
|
+
from typing import List, Dict, Any, Optional
|
11
|
+
|
12
|
+
from django_cfg.apps.knowbase.utils.chunk_settings import get_embedding_batch_size, get_embedding_model
|
13
|
+
from django_cfg.apps.knowbase.config.settings import get_openai_api_key, get_cache_settings
|
14
|
+
from django_cfg.modules.django_llm.llm.client import LLMClient
|
15
|
+
|
16
|
+
from .models import ChunkData, EmbeddingResult, BatchProcessingResult, ProcessingConfig, ChunkType
|
17
|
+
from .processors import DocumentChunkProcessor, ArchiveChunkProcessor, ExternalDataChunkProcessor
|
18
|
+
from .batch_result import BatchResultBuilder
|
19
|
+
|
20
|
+
logger = logging.getLogger(__name__)
|
21
|
+
|
22
|
+
|
23
|
+
class OptimizedEmbeddingProcessor:
|
24
|
+
"""High-performance embedding processor with batch operations."""
|
25
|
+
|
26
|
+
def __init__(self, batch_size: Optional[int] = None, embedding_model: Optional[str] = None):
    """
    Initialize the processor.

    Args:
        batch_size: Number of chunks to process in one API call (uses Constance setting if None).
            The effective value is clamped to the range 1..100.
        embedding_model: Embedding model to use (uses Constance setting if None)
    """
    # Fall back to the configured value when no (or a zero) size is passed.
    requested_batch_size = batch_size or get_embedding_batch_size()
    # Upper bound: conservative limit for stability. Lower bound: a zero step
    # makes range() raise in the batch loop, and a negative step makes the
    # loop iterate zero times so that nothing is processed.
    self.batch_size = max(1, min(requested_batch_size, 100))
    self.embedding_model = embedding_model or get_embedding_model()

    # Embeddings always go through OpenAI: per the original note, OpenRouter
    # does not support embedding models. Cache settings come from the
    # knowbase configuration.
    cache_settings = get_cache_settings()
    self.llm_client = LLMClient(
        preferred_provider="openai",  # Force OpenAI for embeddings
        cache_dir=cache_settings.cache_dir,
        cache_ttl=cache_settings.cache_ttl,
        max_cache_size=cache_settings.max_cache_size
    )

    # One content-preparation processor per chunk type.
    self.processors = {
        ChunkType.DOCUMENT: DocumentChunkProcessor(),
        ChunkType.ARCHIVE: ArchiveChunkProcessor(),
        ChunkType.EXTERNAL_DATA: ExternalDataChunkProcessor()
    }

    logger.info(f"🚀 OptimizedEmbeddingProcessor initialized: batch_size={self.batch_size}, model={self.embedding_model}")
|
58
|
+
|
59
|
+
def process_chunks_batch(self, chunks: List[ChunkData]) -> BatchProcessingResult:
    """
    Embed a list of chunks by splitting it into batches of ``self.batch_size``.

    A batch that raises is logged and recorded as an error on the result
    builder; the remaining batches are still processed.

    Args:
        chunks: List of chunks to process

    Returns:
        BatchProcessingResult with processing statistics
    """
    started_at = time.time()
    total_chunks = len(chunks)
    result_builder = BatchResultBuilder(total_chunks)

    logger.info(f"🔮 Starting batch processing of {total_chunks} chunks")

    # Start offset of every batch; the length of this range is the batch count.
    offsets = range(0, total_chunks, self.batch_size)
    total_batches = len(offsets)

    for batch_num, offset in enumerate(offsets, start=1):
        batch = chunks[offset:offset + self.batch_size]

        logger.info(f"🔮 Processing batch {batch_num}/{total_batches} ({len(batch)} chunks)")

        try:
            result_builder.add_batch_results(self._process_single_batch(batch))

            # Small delay between batches to respect rate limits (skipped after the last one)
            if batch_num < total_batches:
                time.sleep(0.5)

        except Exception as e:
            error_msg = f"Batch {batch_num} failed: {e!s}"
            logger.error(f"❌ {error_msg}")
            result_builder.add_batch_error(error_msg, len(batch))

    result = result_builder.build(time.time() - started_at)

    # Log using the result model's summary
    summary = result.model_dump_summary()
    logger.info(f"🎉 Batch processing completed: {summary}")
    logger.info(f"📊 Performance: {summary['chunks_per_second']} chunks/sec, {summary['avg_cost_per_chunk']} per chunk")

    return result
|
105
|
+
|
106
|
+
def _process_single_batch(self, batch: List[ChunkData]) -> List[EmbeddingResult]:
    """
    Process a single batch of chunks and return one EmbeddingResult per chunk.

    Chunks that cannot be embedded (unknown type, empty content, or a content
    preparation error) are reported as failed results instead of being dropped,
    so the returned list always accounts for every chunk in the batch.

    Args:
        batch: Chunks to embed and persist

    Returns:
        Results for the embedded chunks, followed by failed results for the
        chunks that were skipped before embedding.
    """
    prepared_contents: List[str] = []
    prepared = []  # (chunk, processor) pairs, parallel to prepared_contents
    skipped_results: List[EmbeddingResult] = []

    def skip(chunk, reason: str) -> None:
        # Record an explicit failure for a chunk that never reaches the embedding call.
        skipped_results.append(
            EmbeddingResult(chunk_id=chunk.id, success=False, error=reason)
        )

    # Prepare content for all chunks
    for chunk in batch:
        processor = self.processors.get(chunk.parent_type)
        if not processor:
            logger.warning(f"⚠️ Unknown chunk type: {chunk.parent_type}")
            skip(chunk, f"Unknown chunk type: {chunk.parent_type}")
            continue

        try:
            content = processor.prepare_content_for_embedding(chunk)
            if content and content.strip():
                prepared_contents.append(content)
                prepared.append((chunk, processor))
            else:
                logger.warning(f"⚠️ Empty content for chunk {chunk.id}")
                skip(chunk, f"Empty content for chunk {chunk.id}")
        except Exception as e:
            logger.error(f"❌ Failed to prepare content for chunk {chunk.id}: {e}")
            skip(chunk, f"Failed to prepare content for chunk {chunk.id}: {e}")

    if not prepared_contents:
        # Return failed results for all chunks
        return [
            EmbeddingResult(
                chunk_id=chunk.id,
                success=False,
                error="No valid content to process"
            )
            for chunk in batch
        ]

    # Generate embeddings in batch
    try:
        embedding_results = self._generate_batch_embeddings(prepared_contents)

        # Process and save results; results are positionally aligned with `prepared`
        final_results = []

        for (chunk, processor), embedding_result in zip(prepared, embedding_results):
            # Set the chunk_id in the result (the generator leaves it blank)
            embedding_result.chunk_id = chunk.id

            if not embedding_result.success:
                final_results.append(embedding_result)
                continue

            try:
                logger.debug(f"🔄 Attempting to save embedding for chunk {chunk.id} (type: {chunk.parent_type})")
                processor.save_embedding_result(chunk.id, embedding_result)
                logger.info(f"✅ Successfully saved embedding for chunk {chunk.id}")
                final_results.append(embedding_result)
            except Exception as e:
                # A generated embedding that could not be persisted counts as a failure
                error_msg = f"Failed to save embedding for chunk {chunk.id}: {e}"
                logger.error(f"❌ {error_msg}")
                final_results.append(
                    EmbeddingResult(
                        chunk_id=chunk.id,
                        success=False,
                        error=error_msg
                    )
                )

        final_results.extend(skipped_results)
        return final_results

    except Exception as e:
        error_msg = f"Batch embedding generation failed: {e}"
        logger.error(f"❌ {error_msg}")
        # Return failed results for all chunks
        return [
            EmbeddingResult(
                chunk_id=chunk.id,
                success=False,
                error=error_msg
            )
            for chunk in batch
        ]
|
189
|
+
|
190
|
+
def _generate_batch_embeddings(self, contents: List[str]) -> List[EmbeddingResult]:
    """
    Generate embeddings for multiple contents using LLMClient.

    One ``generate_embedding`` call is made per content item. A failure for one
    item is recorded as a failed result and does not affect the other items.

    Args:
        contents: Texts to embed

    Returns:
        One EmbeddingResult per input, in input order, with ``chunk_id`` left
        empty for the caller to fill in.
    """
    results: List[EmbeddingResult] = []

    for idx, content in enumerate(contents):
        try:
            embedding_response = self.llm_client.generate_embedding(
                text=content,
                model=self.embedding_model
            )

            results.append(EmbeddingResult(
                chunk_id="",  # Will be set by caller
                embedding=embedding_response.embedding,
                tokens=embedding_response.tokens,
                cost=embedding_response.cost,
                success=True
            ))

        except Exception as e:
            logger.error(f"❌ Failed to generate embedding for content {idx}: {e}")
            results.append(EmbeddingResult(
                chunk_id="",
                embedding=[],
                tokens=0,
                cost=0.0,
                success=False,
                error=str(e)
            ))

    # Diagnostics only: a failure while formatting log output must not discard
    # embeddings that were already generated, so it is contained here.
    try:
        successful_count = sum(1 for r in results if r.success)
        logger.info(f"🎯 Generated {successful_count}/{len(results)} embeddings successfully")

        # Log details of each result
        for i, result in enumerate(results):
            if result.success:
                logger.debug(f"  ✅ Result {i}: {result.tokens} tokens, ${result.cost:.4f}, embedding_len={len(result.embedding)}")
            else:
                logger.debug(f"  ❌ Result {i}: {result.error}")
    except Exception as e:
        logger.warning(f"⚠️ Failed to log embedding results: {e}")

    return results
|