django-cfg 1.1.81__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- django_cfg/__init__.py +20 -448
- django_cfg/apps/accounts/README.md +3 -3
- django_cfg/apps/accounts/admin/__init__.py +0 -2
- django_cfg/apps/accounts/admin/activity.py +2 -9
- django_cfg/apps/accounts/admin/filters.py +0 -42
- django_cfg/apps/accounts/admin/inlines.py +8 -8
- django_cfg/apps/accounts/admin/otp.py +5 -5
- django_cfg/apps/accounts/admin/registration_source.py +1 -8
- django_cfg/apps/accounts/admin/user.py +12 -20
- django_cfg/apps/accounts/managers/user_manager.py +2 -129
- django_cfg/apps/accounts/migrations/0006_remove_twilioresponse_otp_secret_and_more.py +46 -0
- django_cfg/apps/accounts/models.py +3 -123
- django_cfg/apps/accounts/serializers/otp.py +40 -44
- django_cfg/apps/accounts/serializers/profile.py +0 -2
- django_cfg/apps/accounts/services/otp_service.py +98 -186
- django_cfg/apps/accounts/signals.py +25 -15
- django_cfg/apps/accounts/utils/auth_email_service.py +84 -0
- django_cfg/apps/accounts/views/otp.py +35 -36
- django_cfg/apps/agents/README.md +129 -0
- django_cfg/apps/agents/__init__.py +68 -0
- django_cfg/apps/agents/admin/__init__.py +17 -0
- django_cfg/apps/agents/admin/execution_admin.py +460 -0
- django_cfg/apps/agents/admin/registry_admin.py +360 -0
- django_cfg/apps/agents/admin/toolsets_admin.py +482 -0
- django_cfg/apps/agents/apps.py +29 -0
- django_cfg/apps/agents/core/__init__.py +20 -0
- django_cfg/apps/agents/core/agent.py +281 -0
- django_cfg/apps/agents/core/dependencies.py +154 -0
- django_cfg/apps/agents/core/exceptions.py +66 -0
- django_cfg/apps/agents/core/models.py +106 -0
- django_cfg/apps/agents/core/orchestrator.py +391 -0
- django_cfg/apps/agents/examples/__init__.py +3 -0
- django_cfg/apps/agents/examples/simple_example.py +161 -0
- django_cfg/apps/agents/integration/__init__.py +14 -0
- django_cfg/apps/agents/integration/middleware.py +80 -0
- django_cfg/apps/agents/integration/registry.py +345 -0
- django_cfg/apps/agents/integration/signals.py +50 -0
- django_cfg/apps/agents/management/__init__.py +3 -0
- django_cfg/apps/agents/management/commands/__init__.py +3 -0
- django_cfg/apps/agents/management/commands/create_agent.py +365 -0
- django_cfg/apps/agents/management/commands/orchestrator_status.py +191 -0
- django_cfg/apps/agents/managers/__init__.py +23 -0
- django_cfg/apps/agents/managers/execution.py +236 -0
- django_cfg/apps/agents/managers/registry.py +254 -0
- django_cfg/apps/agents/managers/toolsets.py +496 -0
- django_cfg/apps/agents/migrations/0001_initial.py +286 -0
- django_cfg/apps/agents/migrations/__init__.py +5 -0
- django_cfg/apps/agents/models/__init__.py +15 -0
- django_cfg/apps/agents/models/execution.py +215 -0
- django_cfg/apps/agents/models/registry.py +220 -0
- django_cfg/apps/agents/models/toolsets.py +305 -0
- django_cfg/apps/agents/patterns/__init__.py +24 -0
- django_cfg/apps/agents/patterns/content_agents.py +234 -0
- django_cfg/apps/agents/toolsets/__init__.py +15 -0
- django_cfg/apps/agents/toolsets/cache_toolset.py +285 -0
- django_cfg/apps/agents/toolsets/django_toolset.py +220 -0
- django_cfg/apps/agents/toolsets/file_toolset.py +324 -0
- django_cfg/apps/agents/toolsets/orm_toolset.py +319 -0
- django_cfg/apps/agents/urls.py +46 -0
- django_cfg/apps/knowbase/README.md +150 -0
- django_cfg/apps/knowbase/__init__.py +27 -0
- django_cfg/apps/knowbase/admin/__init__.py +23 -0
- django_cfg/apps/knowbase/admin/archive_admin.py +857 -0
- django_cfg/apps/knowbase/admin/chat_admin.py +386 -0
- django_cfg/apps/knowbase/admin/document_admin.py +650 -0
- django_cfg/apps/knowbase/admin/external_data_admin.py +685 -0
- django_cfg/apps/knowbase/apps.py +81 -0
- django_cfg/apps/knowbase/config/README.md +176 -0
- django_cfg/apps/knowbase/config/__init__.py +51 -0
- django_cfg/apps/knowbase/config/constance_fields.py +186 -0
- django_cfg/apps/knowbase/config/constance_settings.py +200 -0
- django_cfg/apps/knowbase/config/settings.py +444 -0
- django_cfg/apps/knowbase/examples/__init__.py +3 -0
- django_cfg/apps/knowbase/examples/external_data_usage.py +191 -0
- django_cfg/apps/knowbase/management/__init__.py +0 -0
- django_cfg/apps/knowbase/management/commands/__init__.py +0 -0
- django_cfg/apps/knowbase/management/commands/knowbase_stats.py +158 -0
- django_cfg/apps/knowbase/management/commands/setup_knowbase.py +59 -0
- django_cfg/apps/knowbase/managers/__init__.py +22 -0
- django_cfg/apps/knowbase/managers/archive.py +426 -0
- django_cfg/apps/knowbase/managers/base.py +32 -0
- django_cfg/apps/knowbase/managers/chat.py +141 -0
- django_cfg/apps/knowbase/managers/document.py +203 -0
- django_cfg/apps/knowbase/managers/external_data.py +471 -0
- django_cfg/apps/knowbase/migrations/0001_initial.py +427 -0
- django_cfg/apps/knowbase/migrations/0002_archiveitem_archiveitemchunk_documentarchive_and_more.py +434 -0
- django_cfg/apps/knowbase/migrations/__init__.py +5 -0
- django_cfg/apps/knowbase/mixins/__init__.py +15 -0
- django_cfg/apps/knowbase/mixins/config.py +108 -0
- django_cfg/apps/knowbase/mixins/creator.py +81 -0
- django_cfg/apps/knowbase/mixins/examples/vehicle_model_example.py +199 -0
- django_cfg/apps/knowbase/mixins/external_data_mixin.py +813 -0
- django_cfg/apps/knowbase/mixins/service.py +362 -0
- django_cfg/apps/knowbase/models/__init__.py +41 -0
- django_cfg/apps/knowbase/models/archive.py +599 -0
- django_cfg/apps/knowbase/models/base.py +58 -0
- django_cfg/apps/knowbase/models/chat.py +157 -0
- django_cfg/apps/knowbase/models/document.py +267 -0
- django_cfg/apps/knowbase/models/external_data.py +376 -0
- django_cfg/apps/knowbase/serializers/__init__.py +68 -0
- django_cfg/apps/knowbase/serializers/archive_serializers.py +386 -0
- django_cfg/apps/knowbase/serializers/chat_serializers.py +137 -0
- django_cfg/apps/knowbase/serializers/document_serializers.py +94 -0
- django_cfg/apps/knowbase/serializers/external_data_serializers.py +256 -0
- django_cfg/apps/knowbase/serializers/public_serializers.py +74 -0
- django_cfg/apps/knowbase/services/__init__.py +40 -0
- django_cfg/apps/knowbase/services/archive/__init__.py +42 -0
- django_cfg/apps/knowbase/services/archive/archive_service.py +541 -0
- django_cfg/apps/knowbase/services/archive/chunking_service.py +791 -0
- django_cfg/apps/knowbase/services/archive/exceptions.py +52 -0
- django_cfg/apps/knowbase/services/archive/extraction_service.py +508 -0
- django_cfg/apps/knowbase/services/archive/vectorization_service.py +362 -0
- django_cfg/apps/knowbase/services/base.py +53 -0
- django_cfg/apps/knowbase/services/chat_service.py +239 -0
- django_cfg/apps/knowbase/services/document_service.py +144 -0
- django_cfg/apps/knowbase/services/embedding/__init__.py +43 -0
- django_cfg/apps/knowbase/services/embedding/async_processor.py +244 -0
- django_cfg/apps/knowbase/services/embedding/batch_processor.py +250 -0
- django_cfg/apps/knowbase/services/embedding/batch_result.py +61 -0
- django_cfg/apps/knowbase/services/embedding/models.py +229 -0
- django_cfg/apps/knowbase/services/embedding/processors.py +148 -0
- django_cfg/apps/knowbase/services/embedding/utils.py +176 -0
- django_cfg/apps/knowbase/services/prompt_builder.py +191 -0
- django_cfg/apps/knowbase/services/search_service.py +293 -0
- django_cfg/apps/knowbase/signals/__init__.py +21 -0
- django_cfg/apps/knowbase/signals/archive_signals.py +211 -0
- django_cfg/apps/knowbase/signals/chat_signals.py +37 -0
- django_cfg/apps/knowbase/signals/document_signals.py +143 -0
- django_cfg/apps/knowbase/signals/external_data_signals.py +157 -0
- django_cfg/apps/knowbase/tasks/__init__.py +39 -0
- django_cfg/apps/knowbase/tasks/archive_tasks.py +316 -0
- django_cfg/apps/knowbase/tasks/document_processing.py +341 -0
- django_cfg/apps/knowbase/tasks/external_data_tasks.py +341 -0
- django_cfg/apps/knowbase/tasks/maintenance.py +195 -0
- django_cfg/apps/knowbase/urls.py +43 -0
- django_cfg/apps/knowbase/utils/__init__.py +12 -0
- django_cfg/apps/knowbase/utils/chunk_settings.py +261 -0
- django_cfg/apps/knowbase/utils/text_processing.py +375 -0
- django_cfg/apps/knowbase/utils/validation.py +99 -0
- django_cfg/apps/knowbase/views/__init__.py +28 -0
- django_cfg/apps/knowbase/views/archive_views.py +469 -0
- django_cfg/apps/knowbase/views/base.py +49 -0
- django_cfg/apps/knowbase/views/chat_views.py +181 -0
- django_cfg/apps/knowbase/views/document_views.py +183 -0
- django_cfg/apps/knowbase/views/public_views.py +129 -0
- django_cfg/apps/leads/admin.py +70 -0
- django_cfg/apps/newsletter/admin.py +234 -0
- django_cfg/apps/newsletter/admin_filters.py +124 -0
- django_cfg/apps/support/admin.py +196 -0
- django_cfg/apps/support/admin_filters.py +71 -0
- django_cfg/apps/support/templates/support/chat/ticket_chat.html +1 -1
- django_cfg/apps/urls.py +5 -4
- django_cfg/cli/README.md +1 -1
- django_cfg/cli/commands/create_project.py +2 -2
- django_cfg/cli/commands/info.py +1 -1
- django_cfg/config.py +44 -0
- django_cfg/core/config.py +29 -82
- django_cfg/core/environment.py +1 -1
- django_cfg/core/generation.py +19 -107
- django_cfg/{integration.py → core/integration.py} +18 -16
- django_cfg/core/validation.py +1 -1
- django_cfg/management/__init__.py +1 -1
- django_cfg/management/commands/__init__.py +1 -1
- django_cfg/management/commands/auto_generate.py +482 -0
- django_cfg/management/commands/migrator.py +19 -101
- django_cfg/management/commands/test_email.py +1 -1
- django_cfg/middleware/README.md +0 -158
- django_cfg/middleware/__init__.py +0 -2
- django_cfg/middleware/user_activity.py +3 -3
- django_cfg/models/api.py +145 -0
- django_cfg/models/base.py +287 -0
- django_cfg/models/cache.py +4 -4
- django_cfg/models/constance.py +25 -88
- django_cfg/models/database.py +9 -9
- django_cfg/models/drf.py +3 -36
- django_cfg/models/email.py +163 -0
- django_cfg/models/environment.py +276 -0
- django_cfg/models/limits.py +1 -1
- django_cfg/models/logging.py +366 -0
- django_cfg/models/revolution.py +41 -2
- django_cfg/models/security.py +125 -0
- django_cfg/models/services.py +1 -1
- django_cfg/modules/__init__.py +2 -56
- django_cfg/modules/base.py +78 -52
- django_cfg/modules/django_currency/service.py +2 -2
- django_cfg/modules/django_email.py +2 -2
- django_cfg/modules/django_health.py +267 -0
- django_cfg/modules/django_llm/llm/client.py +79 -17
- django_cfg/modules/django_llm/translator/translator.py +2 -2
- django_cfg/modules/django_logger.py +2 -2
- django_cfg/modules/django_ngrok.py +2 -2
- django_cfg/modules/django_tasks.py +68 -3
- django_cfg/modules/django_telegram.py +3 -3
- django_cfg/modules/django_twilio/sendgrid_service.py +2 -2
- django_cfg/modules/django_twilio/service.py +2 -2
- django_cfg/modules/django_twilio/simple_service.py +2 -2
- django_cfg/modules/django_twilio/templates/guide.md +266 -0
- django_cfg/modules/django_twilio/twilio_service.py +2 -2
- django_cfg/modules/django_unfold/__init__.py +69 -0
- django_cfg/modules/{unfold → django_unfold}/callbacks.py +23 -22
- django_cfg/modules/django_unfold/dashboard.py +278 -0
- django_cfg/modules/django_unfold/icons/README.md +145 -0
- django_cfg/modules/django_unfold/icons/__init__.py +12 -0
- django_cfg/modules/django_unfold/icons/constants.py +2851 -0
- django_cfg/modules/django_unfold/icons/generate_icons.py +486 -0
- django_cfg/modules/django_unfold/models/__init__.py +42 -0
- django_cfg/modules/django_unfold/models/config.py +601 -0
- django_cfg/modules/django_unfold/models/dashboard.py +206 -0
- django_cfg/modules/django_unfold/models/dropdown.py +40 -0
- django_cfg/modules/django_unfold/models/navigation.py +73 -0
- django_cfg/modules/django_unfold/models/tabs.py +25 -0
- django_cfg/modules/{unfold → django_unfold}/system_monitor.py +2 -2
- django_cfg/modules/django_unfold/utils.py +140 -0
- django_cfg/registry/__init__.py +23 -0
- django_cfg/registry/core.py +61 -0
- django_cfg/registry/exceptions.py +11 -0
- django_cfg/registry/modules.py +12 -0
- django_cfg/registry/services.py +26 -0
- django_cfg/registry/third_party.py +52 -0
- django_cfg/routing/__init__.py +19 -0
- django_cfg/routing/callbacks.py +198 -0
- django_cfg/routing/routers.py +48 -0
- django_cfg/templates/admin/layouts/dashboard_with_tabs.html +8 -9
- django_cfg/templatetags/__init__.py +0 -0
- django_cfg/templatetags/django_cfg.py +33 -0
- django_cfg/urls.py +33 -0
- django_cfg/utils/path_resolution.py +1 -1
- django_cfg/utils/smart_defaults.py +7 -61
- django_cfg/utils/toolkit.py +663 -0
- {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/METADATA +83 -86
- django_cfg-1.2.0.dist-info/RECORD +441 -0
- django_cfg/apps/tasks/@docs/README.md +0 -195
- django_cfg/archive/django_sample.zip +0 -0
- django_cfg/models/unfold.py +0 -271
- django_cfg/modules/unfold/__init__.py +0 -29
- django_cfg/modules/unfold/dashboard.py +0 -318
- django_cfg/pyproject.toml +0 -370
- django_cfg/routers.py +0 -83
- django_cfg-1.1.81.dist-info/RECORD +0 -278
- /django_cfg/{exceptions.py → core/exceptions.py} +0 -0
- /django_cfg/modules/{unfold → django_unfold}/models.py +0 -0
- /django_cfg/modules/{unfold → django_unfold}/tailwind.py +0 -0
- /django_cfg/{version_check.py → utils/version_check.py} +0 -0
- {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/WHEEL +0 -0
- {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/entry_points.txt +0 -0
- {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,261 @@
|
|
1
|
+
"""
|
2
|
+
Dynamic chunk settings using Pydantic configuration.
|
3
|
+
|
4
|
+
This module provides utilities for accessing and managing chunk processing
|
5
|
+
settings with type safety and validation.
|
6
|
+
"""
|
7
|
+
|
8
|
+
import logging
|
9
|
+
from typing import Dict, Any, Optional
|
10
|
+
from pydantic import BaseModel, Field, validator
|
11
|
+
from ..config.constance_settings import ConstanceSettings
|
12
|
+
|
13
|
+
logger = logging.getLogger(__name__)
|
14
|
+
|
15
|
+
|
16
|
+
class ChunkSettings(BaseModel):
    """Validated bundle of chunking and embedding parameters.

    All four fields are required (no defaults are declared). Numeric bounds
    are enforced through the ``Field`` constraints, and ``chunk_overlap`` is
    additionally checked against ``chunk_size`` by ``validate_overlap``.
    """

    # Declared ahead of chunk_overlap: the overlap validator looks this value
    # up in ``values``.
    chunk_size: int = Field(
        ge=100, le=8000,
        description="Size of each chunk in characters"
    )
    chunk_overlap: int = Field(
        ge=0,
        description="Overlap between chunks in characters"
    )
    embedding_batch_size: int = Field(
        ge=1, le=100,
        description="Number of chunks to process in one embedding batch"
    )
    embedding_model: str = Field(
        min_length=1,
        description="OpenAI embedding model name"
    )

    @validator('chunk_overlap')
    def validate_overlap(cls, v, values):
        """Reject an overlap that is not strictly smaller than the chunk size.

        The comparison is skipped when ``chunk_size`` is absent from
        ``values`` (or falsy), in which case ``v`` is returned unchanged.

        Raises:
            ValueError: if ``v`` is greater than or equal to ``chunk_size``.
        """
        size = values.get('chunk_size')
        if not size or v < size:
            return v
        raise ValueError(f"Chunk overlap ({v}) must be less than chunk size ({size})")

    class Config:
        """Model options: re-run validation when an attribute is assigned."""
        validate_assignment = True
|
52
|
+
|
53
|
+
|
54
|
+
class ChunkSettingsManager:
    """Builds, validates and logs ``ChunkSettings`` for each content type.

    Every method is a classmethod; the class carries no state of its own.
    Chunk size, embedding batch size and embedding model are read from
    ``ConstanceSettings``; the overlap is read from the ``chunking`` section
    of the object returned by ``get_config()``.
    """

    @classmethod
    def get_document_settings(cls) -> ChunkSettings:
        """Assemble the ``ChunkSettings`` used for document processing."""
        from ..config.settings import get_config

        cfg = get_config()
        # Values are gathered in the same order as the keyword arguments.
        params = {
            'chunk_size': ConstanceSettings.get_document_chunk_size(),
            'chunk_overlap': cfg.chunking.document_chunk_overlap,
            'embedding_batch_size': ConstanceSettings.get_embedding_batch_size(),
            'embedding_model': ConstanceSettings.get_embedding_model(),
        }
        return ChunkSettings(**params)

    @classmethod
    def get_archive_settings(cls) -> ChunkSettings:
        """Assemble the ``ChunkSettings`` used for archive processing."""
        from ..config.settings import get_config

        cfg = get_config()
        params = {
            'chunk_size': ConstanceSettings.get_archive_chunk_size(),
            'chunk_overlap': cfg.chunking.archive_chunk_overlap,
            'embedding_batch_size': ConstanceSettings.get_embedding_batch_size(),
            'embedding_model': ConstanceSettings.get_embedding_model(),
        }
        return ChunkSettings(**params)

    @classmethod
    def get_settings_for_type(cls, content_type: str) -> ChunkSettings:
        """
        Select the settings that belong to a content type.

        Args:
            content_type: Either 'document' or 'archive'

        Returns:
            The matching ChunkSettings. Any other value logs a warning and
            yields the document settings.
        """
        if content_type == 'document':
            return cls.get_document_settings()
        if content_type == 'archive':
            return cls.get_archive_settings()

        logger.warning(f"Unknown content type: {content_type}, using document settings")
        return cls.get_document_settings()

    @classmethod
    def get_all_settings(cls) -> Dict[str, ChunkSettings]:
        """Return the document and archive settings keyed by content type."""
        document = cls.get_document_settings()
        archive = cls.get_archive_settings()
        return {'document': document, 'archive': archive}

    @classmethod
    def validate_settings(cls, settings: ChunkSettings) -> bool:
        """
        Check a settings object and log the first problem found.

        Args:
            settings: ChunkSettings to validate

        Returns:
            True if every check passes, False as soon as one fails
        """
        size = settings.chunk_size
        if size <= 0:
            logger.error(f"Invalid chunk_size: {size}")
            return False

        overlap = settings.chunk_overlap
        if overlap < 0:
            logger.error(f"Invalid chunk_overlap: {overlap}")
            return False

        if overlap >= size:
            logger.error(f"Chunk overlap ({overlap}) must be less than chunk size ({size})")
            return False

        batch = settings.embedding_batch_size
        if batch <= 0 or batch > 2048:
            logger.error(f"Invalid embedding_batch_size: {batch} (must be 1-2048)")
            return False

        model = settings.embedding_model
        if not model or not model.strip():
            logger.error("Embedding model cannot be empty")
            return False

        return True

    @classmethod
    def log_current_settings(cls) -> None:
        """Write the current document/archive settings to the log.

        Any exception raised while loading or formatting the settings is
        caught and reported through ``logger.error`` instead of propagating.
        """
        try:
            doc = cls.get_document_settings()
            arch = cls.get_archive_settings()

            logger.info("📊 Current Chunk Settings:")
            logger.info(f" 📄 Documents: size={doc.chunk_size}, overlap={doc.chunk_overlap}")
            logger.info(f" 📦 Archives: size={arch.chunk_size}, overlap={arch.chunk_overlap}")
            # Embedding values are reported from the document settings object.
            logger.info(f" 🔮 Embedding: batch_size={doc.embedding_batch_size}, model={doc.embedding_model}")
        except Exception as exc:
            logger.error(f"Failed to log current settings: {exc}")
|
155
|
+
|
156
|
+
|
157
|
+
# Convenience functions for easy access (using new Pydantic config)
|
158
|
+
def get_document_chunk_size() -> int:
    """Return the document chunk size.

    Thin wrapper: delegates to ``get_document_chunk_size`` in
    ``..config.settings``, which is imported at call time.
    """
    from ..config.settings import get_document_chunk_size as _resolve
    return _resolve()
|
162
|
+
|
163
|
+
|
164
|
+
def get_document_chunk_overlap() -> int:
    """Return the document chunk overlap.

    Thin wrapper: delegates to ``get_document_chunk_overlap`` in
    ``..config.settings``, which is imported at call time.
    """
    from ..config.settings import get_document_chunk_overlap as _resolve
    return _resolve()
|
168
|
+
|
169
|
+
|
170
|
+
def get_archive_chunk_size() -> int:
    """Return the archive chunk size.

    Thin wrapper: delegates to ``get_archive_chunk_size`` in
    ``..config.settings``, which is imported at call time.
    """
    from ..config.settings import get_archive_chunk_size as _resolve
    return _resolve()
|
174
|
+
|
175
|
+
|
176
|
+
def get_archive_chunk_overlap() -> int:
    """Return the archive chunk overlap.

    Thin wrapper: delegates to ``get_archive_chunk_overlap`` in
    ``..config.settings``, which is imported at call time.
    """
    from ..config.settings import get_archive_chunk_overlap as _resolve
    return _resolve()
|
180
|
+
|
181
|
+
|
182
|
+
def get_embedding_batch_size() -> int:
    """Return the embedding batch size.

    Thin wrapper: delegates to ``get_embedding_batch_size`` in
    ``..config.settings``, which is imported at call time.
    """
    from ..config.settings import get_embedding_batch_size as _resolve
    return _resolve()
|
186
|
+
|
187
|
+
|
188
|
+
def get_embedding_model() -> str:
    """Return the embedding model name.

    Thin wrapper: delegates to ``get_embedding_model`` in
    ``..config.settings``, which is imported at call time.
    """
    from ..config.settings import get_embedding_model as _resolve
    return _resolve()
|
192
|
+
|
193
|
+
|
194
|
+
# Additional convenience functions using new Pydantic config
|
195
|
+
def get_search_results_limit() -> int:
    """Return the search results limit.

    Thin wrapper: delegates to ``get_search_results_limit`` in
    ``..config.settings``, which is imported at call time.
    """
    from ..config.settings import get_search_results_limit as _resolve
    return _resolve()
|
199
|
+
|
200
|
+
|
201
|
+
def get_search_similarity_threshold() -> float:
    """Return the search similarity threshold.

    Thin wrapper: delegates to ``get_search_similarity_threshold`` in
    ``..config.settings``, which is imported at call time.
    """
    from ..config.settings import get_search_similarity_threshold as _resolve
    return _resolve()
|
205
|
+
|
206
|
+
|
207
|
+
def get_chat_context_chunks() -> int:
    """Return the number of chunks used for chat context.

    Thin wrapper: delegates to ``get_chat_context_chunks`` in
    ``..config.settings``, which is imported at call time.
    """
    from ..config.settings import get_chat_context_chunks as _resolve
    return _resolve()
|
211
|
+
|
212
|
+
|
213
|
+
def get_chat_max_tokens() -> int:
    """Return the maximum number of tokens for a chat completion.

    Thin wrapper: delegates to ``get_chat_max_tokens`` in
    ``..config.settings``, which is imported at call time.
    """
    from ..config.settings import get_chat_max_tokens as _resolve
    return _resolve()
|
217
|
+
|
218
|
+
|
219
|
+
def get_chat_temperature() -> float:
    """Return the chat completion temperature.

    Thin wrapper: delegates to ``get_chat_temperature`` in
    ``..config.settings``, which is imported at call time.
    """
    from ..config.settings import get_chat_temperature as _resolve
    return _resolve()
|
223
|
+
|
224
|
+
|
225
|
+
def get_max_archive_size_mb() -> int:
    """Return the maximum archive size in MB.

    Thin wrapper: delegates to ``get_max_archive_size_mb`` in
    ``..config.settings``, which is imported at call time.
    """
    from ..config.settings import get_max_archive_size_mb as _resolve
    return _resolve()
|
229
|
+
|
230
|
+
|
231
|
+
def get_max_document_size_mb() -> int:
    """Return the maximum document size in MB.

    Thin wrapper: delegates to ``get_max_document_size_mb`` in
    ``..config.settings``, which is imported at call time.
    """
    from ..config.settings import get_max_document_size_mb as _resolve
    return _resolve()
|
235
|
+
|
236
|
+
|
237
|
+
def get_processing_timeout_minutes() -> int:
    """Return the processing timeout in minutes.

    Thin wrapper: delegates to ``get_processing_timeout_minutes`` in
    ``..config.settings``, which is imported at call time.
    """
    from ..config.settings import get_processing_timeout_minutes as _resolve
    return _resolve()
|
241
|
+
|
242
|
+
|
243
|
+
def get_chunking_params_for_type(content_type: str) -> Dict[str, Any]:
    """
    Return the chunking parameters intended for SemanticChunker.

    Thin wrapper: forwards ``content_type`` unchanged to
    ``get_chunking_params_for_type`` in ``..config.settings``, which is
    imported at call time.

    Args:
        content_type: Either 'document' or 'archive'

    Returns:
        Dictionary with chunk_size and overlap parameters
    """
    from ..config.settings import get_chunking_params_for_type as _resolve
    return _resolve(content_type)
|
255
|
+
|
256
|
+
|
257
|
+
# Import-time side effect: report the current chunk settings once when this
# module is loaded. Any failure is downgraded to a debug message so that it
# cannot break the import.
try:
    ChunkSettingsManager.log_current_settings()
except Exception as exc:
    logger.debug(f"Could not log settings on import: {exc}")
|
@@ -0,0 +1,375 @@
|
|
1
|
+
"""
|
2
|
+
Text processing utilities for document chunking and cleaning.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import re
|
6
|
+
from typing import List, Optional
|
7
|
+
from pydantic import BaseModel, Field, validator
|
8
|
+
from bs4 import BeautifulSoup, NavigableString
|
9
|
+
|
10
|
+
|
11
|
+
class ChunkConfig(BaseModel):
    """Validated parameters that drive text chunking.

    Every field has a default, so ``ChunkConfig()`` is a usable
    configuration. ``overlap`` must stay strictly below ``chunk_size``.
    """

    # Declared ahead of overlap: the overlap validator looks this value up in
    # ``values``.
    chunk_size: int = Field(
        default=1000, ge=100, le=8000,
        description="Size of each text chunk in characters"
    )
    overlap: int = Field(
        default=200, ge=0,
        description="Overlap between consecutive chunks in characters"
    )
    # A factory is used so each instance receives its own list object.
    separators: List[str] = Field(
        default_factory=lambda: ["\n\n", "\n", ". ", " "],
        description="List of separators for text splitting in order of preference"
    )

    @validator('overlap')
    def validate_overlap(cls, v, values):
        """Reject an overlap that is not strictly smaller than the chunk size.

        When ``chunk_size`` is missing from ``values`` the comparison falls
        back to 1000.

        Raises:
            ValueError: if ``v`` is greater than or equal to the chunk size.
        """
        limit = values.get('chunk_size', 1000)
        if v < limit:
            return v
        raise ValueError(f"Overlap ({v}) must be less than chunk_size ({limit})")

    class Config:
        """Model options: re-run validation when an attribute is assigned."""
        validate_assignment = True
|
43
|
+
|
44
|
+
|
45
|
+
class TextProcessor:
    """Text cleaning and preprocessing utilities.

    The class keeps no per-instance state. HTML input is parsed with
    BeautifulSoup (``lxml`` parser); plain text only goes through the
    regex-based normalization in ``clean_text``.
    """

    # Anything shaped like ``<...>`` is treated as an HTML tag. Compiled once
    # and shared by detection and the tag-stripping fallback.
    _HTML_TAG_PATTERN = re.compile(r'<[^>]+>')

    def clean_text(self, text: str) -> str:
        """Clean and normalize text content.

        Steps, in order: convert HTML to text when tags are detected,
        collapse all whitespace runs (newlines included) to single spaces,
        map typographic quotes to ASCII quotes, drop characters that are
        neither word characters, whitespace nor common punctuation, tidy the
        spacing around punctuation, and strip the ends.

        Args:
            text: Raw text or HTML.

        Returns:
            The normalized single-line text.
        """
        # First, check if content contains HTML and clean it
        if self.is_html_content(text):
            text = self.clean_html_content(text)

        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)

        # Normalize typographic quotes BEFORE stripping special characters:
        # the strip step below would otherwise delete them outright.
        text = re.sub('[\u201c\u201d\u201e\u201f]', '"', text)
        text = re.sub('[\u2018\u2019\u201a\u201b]', "'", text)

        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}\"\']+', '', text)

        # Remove extra spaces around punctuation
        text = re.sub(r'\s+([\.,:;!?])', r'\1', text)
        text = re.sub(r'([\.,:;!?])\s+', r'\1 ', text)

        return text.strip()

    def is_html_content(self, text: str) -> bool:
        """Return True when ``text`` contains at least one ``<...>`` tag."""
        return bool(self._HTML_TAG_PATTERN.search(text))

    def clean_html_content(self, html_content: str) -> str:
        """
        Convert HTML content to clean text while preserving structure.

        Args:
            html_content: Raw HTML content

        Returns:
            Clean text with preserved structure. If parsing or extraction
            raises, the input with every ``<...>`` tag removed is returned
            instead.
        """
        try:
            soup = BeautifulSoup(html_content, 'lxml')
            self._remove_unwanted_elements(soup)
            return self._extract_structured_text(soup)
        except Exception:
            # Deliberate best-effort fallback: plain tag removal.
            return self._HTML_TAG_PATTERN.sub('', html_content)

    def _remove_unwanted_elements(self, soup: "BeautifulSoup") -> None:
        """Remove non-content elements, comments and empty tags in place."""
        # Remove script and style elements
        for element in soup(['script', 'style', 'meta', 'link']):
            element.decompose()

        # Remove comments
        from bs4 import Comment
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Remove empty elements (void tags that carry meaning are kept).
        # NOTE(review): descendants of an element decomposed earlier in this
        # loop are still visited afterwards - confirm bs4 tolerates that.
        for element in soup.find_all():
            if not element.get_text(strip=True) and element.name not in ['br', 'hr', 'img']:
                element.decompose()

    def _extract_structured_text(self, soup: "BeautifulSoup") -> str:
        """
        Extract text while preserving document structure.

        Headers are set off by blank lines and list items are put on their
        own bullet line. Horizontal whitespace is collapsed, but the inserted
        newlines are kept.

        Args:
            soup: BeautifulSoup parsed HTML

        Returns:
            Structured text content
        """
        # Start with basic text extraction with proper spacing
        text = soup.get_text(separator=' ', strip=True)

        # Each distinct text is wrapped once; ``str.replace`` already covers
        # every occurrence, so repeating it for duplicates would nest the
        # markers (e.g. double bullets).
        # NOTE(review): matching is by plain substring, so a header/item text
        # that also occurs in body text is wrapped there too.
        seen_headers = set()
        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            header_text = element.get_text(strip=True)
            if header_text and header_text not in seen_headers and header_text in text:
                seen_headers.add(header_text)
                text = text.replace(header_text, f'\n\n{header_text}\n')

        seen_items = set()
        for element in soup.find_all(['li']):
            li_text = element.get_text(strip=True)
            if li_text and li_text not in seen_items and li_text in text:
                seen_items.add(li_text)
                text = text.replace(li_text, f'\n• {li_text}')

        # Collapse horizontal whitespace only; collapsing ``\s+`` here would
        # erase the structural newlines inserted above.
        text = re.sub(r'[^\S\n]+', ' ', text)
        text = re.sub(r' *\n *', '\n', text)    # no stray spaces around newlines
        text = re.sub(r'\n{3,}', '\n\n', text)  # More than 2 newlines to 2

        return text.strip()

    def extract_metadata(self, text: str) -> dict:
        """Extract basic metadata from text.

        Returns:
            Dict with character/word/line counts, ``paragraph_count`` (the
            number of non-blank lines), flags for code, URLs, e-mail
            addresses and HTML, plus the ``html_*`` keys from
            ``_extract_html_metadata`` when the text contains HTML tags.
        """
        lines = text.split('\n')

        metadata = {
            'character_count': len(text),
            'word_count': len(text.split()),
            'line_count': len(lines),
            'paragraph_count': len([line for line in lines if line.strip()]),
            'has_code': bool(re.search(r'```|`[^`]+`', text)),
            'has_urls': bool(re.search(r'https?://\S+', text)),
            # TLD class is [A-Za-z]; a '|' inside the class would be a literal.
            'has_emails': bool(re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)),
            'is_html': self.is_html_content(text)
        }

        # Add HTML-specific metadata if content is HTML
        if metadata['is_html']:
            metadata.update(self._extract_html_metadata(text))

        return metadata

    def _extract_html_metadata(self, html_content: str) -> dict:
        """Count structural HTML elements.

        Returns:
            Dict of ``html_*`` counters and flags, or
            ``{'html_parsing_error': True}`` when parsing raises.
        """
        try:
            soup = BeautifulSoup(html_content, 'lxml')

            return {
                'html_headers_count': len(soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])),
                'html_paragraphs_count': len(soup.find_all('p')),
                'html_lists_count': len(soup.find_all(['ul', 'ol'])),
                'html_list_items_count': len(soup.find_all('li')),
                'html_links_count': len(soup.find_all('a')),
                'html_images_count': len(soup.find_all('img')),
                'html_tables_count': len(soup.find_all('table')),
                'html_has_forms': bool(soup.find('form')),
                'html_has_media': bool(soup.find_all(['img', 'video', 'audio'])),
            }
        except Exception:
            # Was a bare ``except:``, which also trapped KeyboardInterrupt
            # and SystemExit.
            return {
                'html_parsing_error': True
            }
|
207
|
+
|
208
|
+
|
209
|
+
class SemanticChunker:
    """Split text into size-bounded chunks, preferring natural boundaries.

    The text is first broken into small segments on the configured
    separators, the segments are greedily packed into chunks of at most
    ``chunk_size`` characters, and finally the tail of every chunk is
    prepended to the following chunk as overlap (so chunks returned by
    :meth:`create_chunks` may be longer than ``chunk_size``).
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        overlap: int = 200,
        separators: Optional[List[str]] = None
    ):
        """Store the chunking parameters in ``self.config``.

        Args:
            chunk_size: Maximum number of characters packed into a chunk
                before overlap is added.
            overlap: Number of trailing characters of a chunk repeated at
                the start of the next one.
            separators: Split points in order of preference. ``None``
                selects paragraph, line, sentence and word separators.
        """
        # Handle None separators for backward compatibility
        if separators is None:
            separators = ["\n\n", "\n", ". ", " "]

        self.config = ChunkConfig(
            chunk_size=chunk_size,
            overlap=overlap,
            separators=separators
        )

    def create_chunks(self, text: str) -> List[str]:
        """Split ``text`` into chunks.

        Text that already fits into ``chunk_size`` is returned unchanged as
        a single-element list (it is neither stripped nor filtered).

        Args:
            text: The text to split.

        Returns:
            The non-blank chunks, in document order.
        """
        if len(text) <= self.config.chunk_size:
            return [text]

        chunks = []
        current_chunk = ""

        # Split by separators in order of preference
        segments = self._split_by_separators(text, self.config.separators)

        for segment in segments:
            if len(segment) > self.config.chunk_size:
                # The segment alone is too big: cut it into pieces.
                sub_chunks = self._split_large_segment(segment)

                # Flush whatever was accumulated before the big segment.
                if current_chunk:
                    chunks.append(current_chunk.strip())
                    current_chunk = ""

                # The last piece stays open so following segments can still
                # be packed into it.
                chunks.extend(sub_chunks[:-1])
                current_chunk = sub_chunks[-1] if sub_chunks else ""

            elif len(current_chunk) + len(segment) > self.config.chunk_size:
                # Adding the segment would overflow: start a new chunk.
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = segment
            else:
                current_chunk += segment

        # Add final chunk
        if current_chunk:
            chunks.append(current_chunk.strip())

        # NOTE(review): pieces produced by _split_large_segment already
        # overlap each other, so they receive overlap a second time here.
        # Kept as-is to preserve existing output; confirm whether intended.
        if self.config.overlap > 0:
            chunks = self._add_overlap(chunks)

        return [chunk for chunk in chunks if chunk.strip()]

    def _split_by_separators(self, text: str, separators: List[str]) -> List[str]:
        """Split ``text`` on every separator, in order of preference.

        Each separator stays attached to the front of the part that follows
        it, so concatenating the returned segments reproduces ``text``.

        Args:
            text: The text to split.
            separators: Separators to apply one after another.

        Returns:
            The resulting segments, in document order.
        """
        segments = [text]

        for separator in separators:
            if not separator:
                # str.split("") raises ValueError; an empty separator cannot
                # split anything, so ignore it.
                continue

            new_segments = []
            for segment in segments:
                if separator not in segment:
                    new_segments.append(segment)
                    continue

                first, *rest = segment.split(separator)
                new_segments.append(first)
                new_segments.extend(separator + part for part in rest)
            segments = new_segments

        return segments

    def _split_large_segment(self, segment: str) -> List[str]:
        """Cut a segment longer than ``chunk_size`` into overlapping pieces.

        Each piece is at most ``chunk_size`` characters long and, where
        possible, the next piece starts ``overlap`` characters before the
        end of the previous one.

        Args:
            segment: The oversized segment.

        Returns:
            The pieces, in order.
        """
        chunks = []
        start = 0

        while start < len(segment):
            end = start + self.config.chunk_size

            if end >= len(segment):
                chunks.append(segment[start:])
                break

            # Try to find a good breaking point
            break_point = self._find_break_point(segment, start, end)
            chunks.append(segment[start:break_point])

            # Step back by the overlap, but never to (or before) the current
            # start: that would repeat the same window forever when the
            # overlap is as large as the piece just emitted.
            next_start = break_point - self.config.overlap
            if next_start <= start:
                next_start = break_point
            start = next_start

        return chunks

    def _find_break_point(self, text: str, start: int, end: int) -> int:
        """Find a good breaking point near ``end``.

        Only the second half of the ``chunk_size`` window beginning at
        ``start`` is searched, scanning backwards from ``end``. Sentence
        punctuation is preferred over newlines, and newlines over spaces.

        Args:
            text: The text being split.
            start: Index where the current piece begins.
            end: Hard upper limit for the break (exclusive).

        Returns:
            The index just after the chosen boundary character, or ``end``
            if no boundary was found.
        """
        lower = start + self.config.chunk_size // 2

        # Sentence endings, then line breaks, then word boundaries.
        for boundary_chars in ('.!?', '\n', ' '):
            for i in range(end - 1, lower, -1):
                if text[i] in boundary_chars:
                    return i + 1

        # No good break point found, use hard limit
        return end

    def _add_overlap(self, chunks: List[str]) -> List[str]:
        """Prefix every chunk but the first with the tail of its predecessor.

        The last ``overlap`` characters of the previous chunk (or the whole
        chunk if it is shorter) are joined to the current chunk with a
        single space.

        Args:
            chunks: Chunks in document order.

        Returns:
            The chunks with overlap added; the input is returned unchanged
            when there is nothing to overlap.
        """
        overlap = self.config.overlap

        # With overlap == 0, ``chunk[-0:]`` would be the entire chunk, so
        # bail out explicitly instead of slicing.
        if len(chunks) <= 1 or overlap <= 0:
            return chunks

        overlapped_chunks = [chunks[0]]
        for prev_chunk, current_chunk in zip(chunks, chunks[1:]):
            # Slicing clamps, so a short previous chunk is taken whole.
            overlapped_chunks.append(prev_chunk[-overlap:] + " " + current_chunk)

        return overlapped_chunks
|
356
|
+
|
357
|
+
|
358
|
+
# Convenience function for backward compatibility
|
359
|
+
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 100) -> List[str]:
    """
    Chunk ``text`` with a throwaway :class:`SemanticChunker`.

    The chunker is built with its default separators. Note that the
    defaults of this helper (500 / 100) are smaller than the defaults of
    ``SemanticChunker`` itself (1000 / 200).

    Args:
        text: Text to chunk
        chunk_size: Size of each chunk
        overlap: Overlap between chunks

    Returns:
        List of text chunks
    """
    return SemanticChunker(chunk_size=chunk_size, overlap=overlap).create_chunks(text)
|