django-cfg 1.1.81__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246) hide show
  1. django_cfg/__init__.py +20 -448
  2. django_cfg/apps/accounts/README.md +3 -3
  3. django_cfg/apps/accounts/admin/__init__.py +0 -2
  4. django_cfg/apps/accounts/admin/activity.py +2 -9
  5. django_cfg/apps/accounts/admin/filters.py +0 -42
  6. django_cfg/apps/accounts/admin/inlines.py +8 -8
  7. django_cfg/apps/accounts/admin/otp.py +5 -5
  8. django_cfg/apps/accounts/admin/registration_source.py +1 -8
  9. django_cfg/apps/accounts/admin/user.py +12 -20
  10. django_cfg/apps/accounts/managers/user_manager.py +2 -129
  11. django_cfg/apps/accounts/migrations/0006_remove_twilioresponse_otp_secret_and_more.py +46 -0
  12. django_cfg/apps/accounts/models.py +3 -123
  13. django_cfg/apps/accounts/serializers/otp.py +40 -44
  14. django_cfg/apps/accounts/serializers/profile.py +0 -2
  15. django_cfg/apps/accounts/services/otp_service.py +98 -186
  16. django_cfg/apps/accounts/signals.py +25 -15
  17. django_cfg/apps/accounts/utils/auth_email_service.py +84 -0
  18. django_cfg/apps/accounts/views/otp.py +35 -36
  19. django_cfg/apps/agents/README.md +129 -0
  20. django_cfg/apps/agents/__init__.py +68 -0
  21. django_cfg/apps/agents/admin/__init__.py +17 -0
  22. django_cfg/apps/agents/admin/execution_admin.py +460 -0
  23. django_cfg/apps/agents/admin/registry_admin.py +360 -0
  24. django_cfg/apps/agents/admin/toolsets_admin.py +482 -0
  25. django_cfg/apps/agents/apps.py +29 -0
  26. django_cfg/apps/agents/core/__init__.py +20 -0
  27. django_cfg/apps/agents/core/agent.py +281 -0
  28. django_cfg/apps/agents/core/dependencies.py +154 -0
  29. django_cfg/apps/agents/core/exceptions.py +66 -0
  30. django_cfg/apps/agents/core/models.py +106 -0
  31. django_cfg/apps/agents/core/orchestrator.py +391 -0
  32. django_cfg/apps/agents/examples/__init__.py +3 -0
  33. django_cfg/apps/agents/examples/simple_example.py +161 -0
  34. django_cfg/apps/agents/integration/__init__.py +14 -0
  35. django_cfg/apps/agents/integration/middleware.py +80 -0
  36. django_cfg/apps/agents/integration/registry.py +345 -0
  37. django_cfg/apps/agents/integration/signals.py +50 -0
  38. django_cfg/apps/agents/management/__init__.py +3 -0
  39. django_cfg/apps/agents/management/commands/__init__.py +3 -0
  40. django_cfg/apps/agents/management/commands/create_agent.py +365 -0
  41. django_cfg/apps/agents/management/commands/orchestrator_status.py +191 -0
  42. django_cfg/apps/agents/managers/__init__.py +23 -0
  43. django_cfg/apps/agents/managers/execution.py +236 -0
  44. django_cfg/apps/agents/managers/registry.py +254 -0
  45. django_cfg/apps/agents/managers/toolsets.py +496 -0
  46. django_cfg/apps/agents/migrations/0001_initial.py +286 -0
  47. django_cfg/apps/agents/migrations/__init__.py +5 -0
  48. django_cfg/apps/agents/models/__init__.py +15 -0
  49. django_cfg/apps/agents/models/execution.py +215 -0
  50. django_cfg/apps/agents/models/registry.py +220 -0
  51. django_cfg/apps/agents/models/toolsets.py +305 -0
  52. django_cfg/apps/agents/patterns/__init__.py +24 -0
  53. django_cfg/apps/agents/patterns/content_agents.py +234 -0
  54. django_cfg/apps/agents/toolsets/__init__.py +15 -0
  55. django_cfg/apps/agents/toolsets/cache_toolset.py +285 -0
  56. django_cfg/apps/agents/toolsets/django_toolset.py +220 -0
  57. django_cfg/apps/agents/toolsets/file_toolset.py +324 -0
  58. django_cfg/apps/agents/toolsets/orm_toolset.py +319 -0
  59. django_cfg/apps/agents/urls.py +46 -0
  60. django_cfg/apps/knowbase/README.md +150 -0
  61. django_cfg/apps/knowbase/__init__.py +27 -0
  62. django_cfg/apps/knowbase/admin/__init__.py +23 -0
  63. django_cfg/apps/knowbase/admin/archive_admin.py +857 -0
  64. django_cfg/apps/knowbase/admin/chat_admin.py +386 -0
  65. django_cfg/apps/knowbase/admin/document_admin.py +650 -0
  66. django_cfg/apps/knowbase/admin/external_data_admin.py +685 -0
  67. django_cfg/apps/knowbase/apps.py +81 -0
  68. django_cfg/apps/knowbase/config/README.md +176 -0
  69. django_cfg/apps/knowbase/config/__init__.py +51 -0
  70. django_cfg/apps/knowbase/config/constance_fields.py +186 -0
  71. django_cfg/apps/knowbase/config/constance_settings.py +200 -0
  72. django_cfg/apps/knowbase/config/settings.py +444 -0
  73. django_cfg/apps/knowbase/examples/__init__.py +3 -0
  74. django_cfg/apps/knowbase/examples/external_data_usage.py +191 -0
  75. django_cfg/apps/knowbase/management/__init__.py +0 -0
  76. django_cfg/apps/knowbase/management/commands/__init__.py +0 -0
  77. django_cfg/apps/knowbase/management/commands/knowbase_stats.py +158 -0
  78. django_cfg/apps/knowbase/management/commands/setup_knowbase.py +59 -0
  79. django_cfg/apps/knowbase/managers/__init__.py +22 -0
  80. django_cfg/apps/knowbase/managers/archive.py +426 -0
  81. django_cfg/apps/knowbase/managers/base.py +32 -0
  82. django_cfg/apps/knowbase/managers/chat.py +141 -0
  83. django_cfg/apps/knowbase/managers/document.py +203 -0
  84. django_cfg/apps/knowbase/managers/external_data.py +471 -0
  85. django_cfg/apps/knowbase/migrations/0001_initial.py +427 -0
  86. django_cfg/apps/knowbase/migrations/0002_archiveitem_archiveitemchunk_documentarchive_and_more.py +434 -0
  87. django_cfg/apps/knowbase/migrations/__init__.py +5 -0
  88. django_cfg/apps/knowbase/mixins/__init__.py +15 -0
  89. django_cfg/apps/knowbase/mixins/config.py +108 -0
  90. django_cfg/apps/knowbase/mixins/creator.py +81 -0
  91. django_cfg/apps/knowbase/mixins/examples/vehicle_model_example.py +199 -0
  92. django_cfg/apps/knowbase/mixins/external_data_mixin.py +813 -0
  93. django_cfg/apps/knowbase/mixins/service.py +362 -0
  94. django_cfg/apps/knowbase/models/__init__.py +41 -0
  95. django_cfg/apps/knowbase/models/archive.py +599 -0
  96. django_cfg/apps/knowbase/models/base.py +58 -0
  97. django_cfg/apps/knowbase/models/chat.py +157 -0
  98. django_cfg/apps/knowbase/models/document.py +267 -0
  99. django_cfg/apps/knowbase/models/external_data.py +376 -0
  100. django_cfg/apps/knowbase/serializers/__init__.py +68 -0
  101. django_cfg/apps/knowbase/serializers/archive_serializers.py +386 -0
  102. django_cfg/apps/knowbase/serializers/chat_serializers.py +137 -0
  103. django_cfg/apps/knowbase/serializers/document_serializers.py +94 -0
  104. django_cfg/apps/knowbase/serializers/external_data_serializers.py +256 -0
  105. django_cfg/apps/knowbase/serializers/public_serializers.py +74 -0
  106. django_cfg/apps/knowbase/services/__init__.py +40 -0
  107. django_cfg/apps/knowbase/services/archive/__init__.py +42 -0
  108. django_cfg/apps/knowbase/services/archive/archive_service.py +541 -0
  109. django_cfg/apps/knowbase/services/archive/chunking_service.py +791 -0
  110. django_cfg/apps/knowbase/services/archive/exceptions.py +52 -0
  111. django_cfg/apps/knowbase/services/archive/extraction_service.py +508 -0
  112. django_cfg/apps/knowbase/services/archive/vectorization_service.py +362 -0
  113. django_cfg/apps/knowbase/services/base.py +53 -0
  114. django_cfg/apps/knowbase/services/chat_service.py +239 -0
  115. django_cfg/apps/knowbase/services/document_service.py +144 -0
  116. django_cfg/apps/knowbase/services/embedding/__init__.py +43 -0
  117. django_cfg/apps/knowbase/services/embedding/async_processor.py +244 -0
  118. django_cfg/apps/knowbase/services/embedding/batch_processor.py +250 -0
  119. django_cfg/apps/knowbase/services/embedding/batch_result.py +61 -0
  120. django_cfg/apps/knowbase/services/embedding/models.py +229 -0
  121. django_cfg/apps/knowbase/services/embedding/processors.py +148 -0
  122. django_cfg/apps/knowbase/services/embedding/utils.py +176 -0
  123. django_cfg/apps/knowbase/services/prompt_builder.py +191 -0
  124. django_cfg/apps/knowbase/services/search_service.py +293 -0
  125. django_cfg/apps/knowbase/signals/__init__.py +21 -0
  126. django_cfg/apps/knowbase/signals/archive_signals.py +211 -0
  127. django_cfg/apps/knowbase/signals/chat_signals.py +37 -0
  128. django_cfg/apps/knowbase/signals/document_signals.py +143 -0
  129. django_cfg/apps/knowbase/signals/external_data_signals.py +157 -0
  130. django_cfg/apps/knowbase/tasks/__init__.py +39 -0
  131. django_cfg/apps/knowbase/tasks/archive_tasks.py +316 -0
  132. django_cfg/apps/knowbase/tasks/document_processing.py +341 -0
  133. django_cfg/apps/knowbase/tasks/external_data_tasks.py +341 -0
  134. django_cfg/apps/knowbase/tasks/maintenance.py +195 -0
  135. django_cfg/apps/knowbase/urls.py +43 -0
  136. django_cfg/apps/knowbase/utils/__init__.py +12 -0
  137. django_cfg/apps/knowbase/utils/chunk_settings.py +261 -0
  138. django_cfg/apps/knowbase/utils/text_processing.py +375 -0
  139. django_cfg/apps/knowbase/utils/validation.py +99 -0
  140. django_cfg/apps/knowbase/views/__init__.py +28 -0
  141. django_cfg/apps/knowbase/views/archive_views.py +469 -0
  142. django_cfg/apps/knowbase/views/base.py +49 -0
  143. django_cfg/apps/knowbase/views/chat_views.py +181 -0
  144. django_cfg/apps/knowbase/views/document_views.py +183 -0
  145. django_cfg/apps/knowbase/views/public_views.py +129 -0
  146. django_cfg/apps/leads/admin.py +70 -0
  147. django_cfg/apps/newsletter/admin.py +234 -0
  148. django_cfg/apps/newsletter/admin_filters.py +124 -0
  149. django_cfg/apps/support/admin.py +196 -0
  150. django_cfg/apps/support/admin_filters.py +71 -0
  151. django_cfg/apps/support/templates/support/chat/ticket_chat.html +1 -1
  152. django_cfg/apps/urls.py +5 -4
  153. django_cfg/cli/README.md +1 -1
  154. django_cfg/cli/commands/create_project.py +2 -2
  155. django_cfg/cli/commands/info.py +1 -1
  156. django_cfg/config.py +44 -0
  157. django_cfg/core/config.py +29 -82
  158. django_cfg/core/environment.py +1 -1
  159. django_cfg/core/generation.py +19 -107
  160. django_cfg/{integration.py → core/integration.py} +18 -16
  161. django_cfg/core/validation.py +1 -1
  162. django_cfg/management/__init__.py +1 -1
  163. django_cfg/management/commands/__init__.py +1 -1
  164. django_cfg/management/commands/auto_generate.py +482 -0
  165. django_cfg/management/commands/migrator.py +19 -101
  166. django_cfg/management/commands/test_email.py +1 -1
  167. django_cfg/middleware/README.md +0 -158
  168. django_cfg/middleware/__init__.py +0 -2
  169. django_cfg/middleware/user_activity.py +3 -3
  170. django_cfg/models/api.py +145 -0
  171. django_cfg/models/base.py +287 -0
  172. django_cfg/models/cache.py +4 -4
  173. django_cfg/models/constance.py +25 -88
  174. django_cfg/models/database.py +9 -9
  175. django_cfg/models/drf.py +3 -36
  176. django_cfg/models/email.py +163 -0
  177. django_cfg/models/environment.py +276 -0
  178. django_cfg/models/limits.py +1 -1
  179. django_cfg/models/logging.py +366 -0
  180. django_cfg/models/revolution.py +41 -2
  181. django_cfg/models/security.py +125 -0
  182. django_cfg/models/services.py +1 -1
  183. django_cfg/modules/__init__.py +2 -56
  184. django_cfg/modules/base.py +78 -52
  185. django_cfg/modules/django_currency/service.py +2 -2
  186. django_cfg/modules/django_email.py +2 -2
  187. django_cfg/modules/django_health.py +267 -0
  188. django_cfg/modules/django_llm/llm/client.py +79 -17
  189. django_cfg/modules/django_llm/translator/translator.py +2 -2
  190. django_cfg/modules/django_logger.py +2 -2
  191. django_cfg/modules/django_ngrok.py +2 -2
  192. django_cfg/modules/django_tasks.py +68 -3
  193. django_cfg/modules/django_telegram.py +3 -3
  194. django_cfg/modules/django_twilio/sendgrid_service.py +2 -2
  195. django_cfg/modules/django_twilio/service.py +2 -2
  196. django_cfg/modules/django_twilio/simple_service.py +2 -2
  197. django_cfg/modules/django_twilio/templates/guide.md +266 -0
  198. django_cfg/modules/django_twilio/twilio_service.py +2 -2
  199. django_cfg/modules/django_unfold/__init__.py +69 -0
  200. django_cfg/modules/{unfold → django_unfold}/callbacks.py +23 -22
  201. django_cfg/modules/django_unfold/dashboard.py +278 -0
  202. django_cfg/modules/django_unfold/icons/README.md +145 -0
  203. django_cfg/modules/django_unfold/icons/__init__.py +12 -0
  204. django_cfg/modules/django_unfold/icons/constants.py +2851 -0
  205. django_cfg/modules/django_unfold/icons/generate_icons.py +486 -0
  206. django_cfg/modules/django_unfold/models/__init__.py +42 -0
  207. django_cfg/modules/django_unfold/models/config.py +601 -0
  208. django_cfg/modules/django_unfold/models/dashboard.py +206 -0
  209. django_cfg/modules/django_unfold/models/dropdown.py +40 -0
  210. django_cfg/modules/django_unfold/models/navigation.py +73 -0
  211. django_cfg/modules/django_unfold/models/tabs.py +25 -0
  212. django_cfg/modules/{unfold → django_unfold}/system_monitor.py +2 -2
  213. django_cfg/modules/django_unfold/utils.py +140 -0
  214. django_cfg/registry/__init__.py +23 -0
  215. django_cfg/registry/core.py +61 -0
  216. django_cfg/registry/exceptions.py +11 -0
  217. django_cfg/registry/modules.py +12 -0
  218. django_cfg/registry/services.py +26 -0
  219. django_cfg/registry/third_party.py +52 -0
  220. django_cfg/routing/__init__.py +19 -0
  221. django_cfg/routing/callbacks.py +198 -0
  222. django_cfg/routing/routers.py +48 -0
  223. django_cfg/templates/admin/layouts/dashboard_with_tabs.html +8 -9
  224. django_cfg/templatetags/__init__.py +0 -0
  225. django_cfg/templatetags/django_cfg.py +33 -0
  226. django_cfg/urls.py +33 -0
  227. django_cfg/utils/path_resolution.py +1 -1
  228. django_cfg/utils/smart_defaults.py +7 -61
  229. django_cfg/utils/toolkit.py +663 -0
  230. {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/METADATA +83 -86
  231. django_cfg-1.2.0.dist-info/RECORD +441 -0
  232. django_cfg/apps/tasks/@docs/README.md +0 -195
  233. django_cfg/archive/django_sample.zip +0 -0
  234. django_cfg/models/unfold.py +0 -271
  235. django_cfg/modules/unfold/__init__.py +0 -29
  236. django_cfg/modules/unfold/dashboard.py +0 -318
  237. django_cfg/pyproject.toml +0 -370
  238. django_cfg/routers.py +0 -83
  239. django_cfg-1.1.81.dist-info/RECORD +0 -278
  240. /django_cfg/{exceptions.py → core/exceptions.py} +0 -0
  241. /django_cfg/modules/{unfold → django_unfold}/models.py +0 -0
  242. /django_cfg/modules/{unfold → django_unfold}/tailwind.py +0 -0
  243. /django_cfg/{version_check.py → utils/version_check.py} +0 -0
  244. {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/WHEEL +0 -0
  245. {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/entry_points.txt +0 -0
  246. {django_cfg-1.1.81.dist-info → django_cfg-1.2.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,261 @@
1
+ """
2
+ Dynamic chunk settings using Pydantic configuration.
3
+
4
+ This module provides utilities for accessing and managing chunk processing
5
+ settings with type safety and validation.
6
+ """
7
+
8
+ import logging
9
+ from typing import Dict, Any, Optional
10
+ from pydantic import BaseModel, Field, validator
11
+ from ..config.constance_settings import ConstanceSettings
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class ChunkSettings(BaseModel):
    """Validated bundle of chunking and embedding parameters.

    Field bounds are enforced by the ``Field`` constraints below; the
    relationship between ``chunk_overlap`` and ``chunk_size`` is enforced by
    ``validate_overlap``.
    """

    # Allowed range: 100..8000 (inclusive).
    chunk_size: int = Field(
        description="Size of each chunk in characters",
        ge=100,
        le=8000
    )

    # Non-negative; additionally checked against chunk_size by the validator.
    chunk_overlap: int = Field(
        description="Overlap between chunks in characters",
        ge=0
    )

    # Allowed range: 1..100 (inclusive).
    embedding_batch_size: int = Field(
        description="Number of chunks to process in one embedding batch",
        ge=1,
        le=100
    )

    # Must be a non-empty string.
    embedding_model: str = Field(
        description="OpenAI embedding model name",
        min_length=1
    )

    @validator('chunk_overlap')
    def validate_overlap(cls, v, values):
        """Reject an overlap that is not strictly smaller than ``chunk_size``.

        The comparison is skipped when ``values`` holds no (or a falsy)
        ``chunk_size``.

        Raises:
            ValueError: if ``v`` is greater than or equal to ``chunk_size``.
        """
        chunk_size = values.get('chunk_size')
        if not chunk_size or not v >= chunk_size:
            return v
        raise ValueError(f"Chunk overlap ({v}) must be less than chunk size ({chunk_size})")

    class Config:
        """Pydantic configuration."""
        # Re-run validation whenever an attribute is assigned.
        validate_assignment = True
52
+
53
+
54
class ChunkSettingsManager:
    """Factory and helper methods for building ``ChunkSettings`` objects.

    Chunk size, embedding batch size and embedding model are read through
    ``ConstanceSettings``; chunk overlap is read from ``get_config().chunking``.
    """

    @classmethod
    def get_document_settings(cls) -> ChunkSettings:
        """Build the ``ChunkSettings`` used for document processing."""
        # Imported inside the method, so it is resolved on each call.
        from ..config.settings import get_config

        config = get_config()
        fields = {
            'chunk_size': ConstanceSettings.get_document_chunk_size(),
            'chunk_overlap': config.chunking.document_chunk_overlap,
            'embedding_batch_size': ConstanceSettings.get_embedding_batch_size(),
            'embedding_model': ConstanceSettings.get_embedding_model(),
        }
        return ChunkSettings(**fields)

    @classmethod
    def get_archive_settings(cls) -> ChunkSettings:
        """Build the ``ChunkSettings`` used for archive processing."""
        # Imported inside the method, so it is resolved on each call.
        from ..config.settings import get_config

        config = get_config()
        fields = {
            'chunk_size': ConstanceSettings.get_archive_chunk_size(),
            'chunk_overlap': config.chunking.archive_chunk_overlap,
            'embedding_batch_size': ConstanceSettings.get_embedding_batch_size(),
            'embedding_model': ConstanceSettings.get_embedding_model(),
        }
        return ChunkSettings(**fields)

    @classmethod
    def get_settings_for_type(cls, content_type: str) -> ChunkSettings:
        """
        Return the settings matching ``content_type``.

        Args:
            content_type: ``'document'`` or ``'archive'``. Any other value is
                logged as a warning and treated like ``'document'``.

        Returns:
            The ``ChunkSettings`` for the requested content type.
        """
        if content_type == 'document':
            return cls.get_document_settings()
        if content_type == 'archive':
            return cls.get_archive_settings()

        # Unknown type: warn, then fall back to the document settings.
        logger.warning(f"Unknown content type: {content_type}, using document settings")
        return cls.get_document_settings()

    @classmethod
    def get_all_settings(cls) -> Dict[str, ChunkSettings]:
        """Return both settings objects keyed by ``'document'`` and ``'archive'``."""
        document = cls.get_document_settings()
        archive = cls.get_archive_settings()
        return {'document': document, 'archive': archive}

    @classmethod
    def validate_settings(cls, settings: ChunkSettings) -> bool:
        """
        Check a settings object and log the first problem found.

        Checks run in this order: positive chunk size, non-negative overlap,
        overlap smaller than chunk size, batch size within 1-2048, non-blank
        embedding model.

        NOTE(review): the 2048 upper bound used here is looser than the
        ``le=100`` constraint declared on ``ChunkSettings.embedding_batch_size``
        - confirm which limit is intended.

        Args:
            settings: ChunkSettings to validate

        Returns:
            True if every check passes, False otherwise
        """
        def _reject(message: str) -> bool:
            # Log the reason and report the settings as invalid.
            logger.error(message)
            return False

        if settings.chunk_size <= 0:
            return _reject(f"Invalid chunk_size: {settings.chunk_size}")

        if settings.chunk_overlap < 0:
            return _reject(f"Invalid chunk_overlap: {settings.chunk_overlap}")

        if settings.chunk_overlap >= settings.chunk_size:
            return _reject(f"Chunk overlap ({settings.chunk_overlap}) must be less than chunk size ({settings.chunk_size})")

        if settings.embedding_batch_size <= 0 or settings.embedding_batch_size > 2048:
            return _reject(f"Invalid embedding_batch_size: {settings.embedding_batch_size} (must be 1-2048)")

        # Rejects None, the empty string and whitespace-only names.
        if not (settings.embedding_model and settings.embedding_model.strip()):
            return _reject("Embedding model cannot be empty")

        return True

    @classmethod
    def log_current_settings(cls) -> None:
        """Write the current document/archive settings to the log at INFO level.

        Any exception raised while reading or logging the settings is caught
        and reported at ERROR level instead of propagating.
        """
        try:
            doc_settings = cls.get_document_settings()
            archive_settings = cls.get_archive_settings()

            logger.info("📊 Current Chunk Settings:")
            logger.info(f" 📄 Documents: size={doc_settings.chunk_size}, overlap={doc_settings.chunk_overlap}")
            logger.info(f" 📦 Archives: size={archive_settings.chunk_size}, overlap={archive_settings.chunk_overlap}")
            # Embedding values are taken from the document settings object.
            logger.info(f" 🔮 Embedding: batch_size={doc_settings.embedding_batch_size}, model={doc_settings.embedding_model}")

        except Exception as e:
            logger.error(f"Failed to log current settings: {e}")
155
+
156
+
157
+ # Convenience functions for easy access (using new Pydantic config)
158
def get_document_chunk_size() -> int:
    """Return the document chunk size by delegating to ``config.settings``."""
    # Deferred import: the target is looked up on each call.
    from ..config.settings import get_document_chunk_size as _configured_value

    return _configured_value()
162
+
163
+
164
def get_document_chunk_overlap() -> int:
    """Return the document chunk overlap by delegating to ``config.settings``."""
    # Deferred import: the target is looked up on each call.
    from ..config.settings import get_document_chunk_overlap as _configured_value

    return _configured_value()
168
+
169
+
170
def get_archive_chunk_size() -> int:
    """Return the archive chunk size by delegating to ``config.settings``."""
    # Deferred import: the target is looked up on each call.
    from ..config.settings import get_archive_chunk_size as _configured_value

    return _configured_value()
174
+
175
+
176
def get_archive_chunk_overlap() -> int:
    """Return the archive chunk overlap by delegating to ``config.settings``."""
    # Deferred import: the target is looked up on each call.
    from ..config.settings import get_archive_chunk_overlap as _configured_value

    return _configured_value()
180
+
181
+
182
def get_embedding_batch_size() -> int:
    """Return the embedding batch size by delegating to ``config.settings``."""
    # Deferred import: the target is looked up on each call.
    from ..config.settings import get_embedding_batch_size as _configured_value

    return _configured_value()
186
+
187
+
188
def get_embedding_model() -> str:
    """Return the embedding model name by delegating to ``config.settings``."""
    # Deferred import: the target is looked up on each call.
    from ..config.settings import get_embedding_model as _configured_value

    return _configured_value()
192
+
193
+
194
+ # Additional convenience functions using new Pydantic config
195
def get_search_results_limit() -> int:
    """Return the search results limit by delegating to ``config.settings``."""
    # Deferred import: the target is looked up on each call.
    from ..config.settings import get_search_results_limit as _configured_value

    return _configured_value()
199
+
200
+
201
def get_search_similarity_threshold() -> float:
    """Return the search similarity threshold by delegating to ``config.settings``."""
    # Deferred import: the target is looked up on each call.
    from ..config.settings import get_search_similarity_threshold as _configured_value

    return _configured_value()
205
+
206
+
207
def get_chat_context_chunks() -> int:
    """Return the number of chunks used for chat context, via ``config.settings``."""
    # Deferred import: the target is looked up on each call.
    from ..config.settings import get_chat_context_chunks as _configured_value

    return _configured_value()
211
+
212
+
213
def get_chat_max_tokens() -> int:
    """Return the maximum tokens for chat completion, via ``config.settings``."""
    # Deferred import: the target is looked up on each call.
    from ..config.settings import get_chat_max_tokens as _configured_value

    return _configured_value()
217
+
218
+
219
def get_chat_temperature() -> float:
    """Return the chat completion temperature, via ``config.settings``."""
    # Deferred import: the target is looked up on each call.
    from ..config.settings import get_chat_temperature as _configured_value

    return _configured_value()
223
+
224
+
225
def get_max_archive_size_mb() -> int:
    """Return the maximum archive size in MB, via ``config.settings``."""
    # Deferred import: the target is looked up on each call.
    from ..config.settings import get_max_archive_size_mb as _configured_value

    return _configured_value()
229
+
230
+
231
def get_max_document_size_mb() -> int:
    """Return the maximum document size in MB, via ``config.settings``."""
    # Deferred import: the target is looked up on each call.
    from ..config.settings import get_max_document_size_mb as _configured_value

    return _configured_value()
235
+
236
+
237
def get_processing_timeout_minutes() -> int:
    """Return the processing timeout in minutes, via ``config.settings``."""
    # Deferred import: the target is looked up on each call.
    from ..config.settings import get_processing_timeout_minutes as _configured_value

    return _configured_value()
241
+
242
+
243
def get_chunking_params_for_type(content_type: str) -> Dict[str, Any]:
    """
    Return the chunking parameters intended for ``SemanticChunker``.

    This is a thin wrapper: the lookup is performed by the function of the
    same name in ``config.settings``.

    Args:
        content_type: Either 'document' or 'archive'

    Returns:
        Dictionary with chunk_size and overlap parameters
    """
    # Deferred import: the target is looked up on each call.
    from ..config.settings import get_chunking_params_for_type as _lookup

    return _lookup(content_type)
255
+
256
+
257
+ # Initialize settings logging on module import
258
# Import-time side effect: emit the current settings to the log once.
# A failure here is only reported at DEBUG level and never blocks the import.
try:
    ChunkSettingsManager.log_current_settings()
except Exception as exc:
    logger.debug(f"Could not log settings on import: {exc}")
@@ -0,0 +1,375 @@
1
+ """
2
+ Text processing utilities for document chunking and cleaning.
3
+ """
4
+
5
+ import re
6
+ from typing import List, Optional
7
+ from pydantic import BaseModel, Field, validator
8
+ from bs4 import BeautifulSoup, NavigableString
9
+
10
+
11
class ChunkConfig(BaseModel):
    """Validated parameters for splitting text into chunks."""

    # Allowed range: 100..8000 (inclusive); defaults to 1000.
    chunk_size: int = Field(
        description="Size of each text chunk in characters",
        default=1000,
        ge=100,
        le=8000
    )

    # Non-negative; defaults to 200 and must stay below chunk_size.
    overlap: int = Field(
        description="Overlap between consecutive chunks in characters",
        default=200,
        ge=0
    )

    # A fresh list is built per instance through default_factory.
    separators: List[str] = Field(
        description="List of separators for text splitting in order of preference",
        default_factory=lambda: ["\n\n", "\n", ". ", " "]
    )

    @validator('overlap')
    def validate_overlap(cls, v, values):
        """Reject an overlap that is not strictly smaller than ``chunk_size``.

        When ``values`` has no ``chunk_size`` entry, 1000 is used for the
        comparison.

        Raises:
            ValueError: if ``v`` is greater than or equal to ``chunk_size``.
        """
        chunk_size = values.get('chunk_size', 1000)
        if not v >= chunk_size:
            return v
        raise ValueError(f"Overlap ({v}) must be less than chunk_size ({chunk_size})")

    class Config:
        """Pydantic configuration."""
        # Re-run validation whenever an attribute is assigned.
        validate_assignment = True
43
+
44
+
45
class TextProcessor:
    """Text cleaning and preprocessing utilities."""

    # Typographic quotes mapped to their ASCII equivalents.
    _QUOTE_TRANSLATION = str.maketrans({
        '\u201c': '"', '\u201d': '"', '\u201e': '"', '\u201f': '"',
        '\u2018': "'", '\u2019': "'", '\u201a': "'", '\u201b': "'",
    })

    # Anything that looks like a tag: '<', one or more non-'>' characters, '>'.
    _HTML_TAG_RE = re.compile(r'<[^>]+>')

    def clean_text(self, text: str) -> str:
        """Clean and normalize text content.

        HTML input (as detected by ``is_html_content``) is first converted to
        text. The result is returned as a single line: every whitespace run,
        including newlines, becomes one space.

        Args:
            text: Raw text or HTML.

        Returns:
            The cleaned, stripped text.
        """
        # First, check if content contains HTML and clean it
        if self.is_html_content(text):
            text = self.clean_html_content(text)

        # Normalize curly quotes BEFORE filtering special characters: the
        # filter below only keeps ASCII quotes, so typographic quotes would
        # otherwise be deleted (turning e.g. "it’s" into "its").
        text = text.translate(self._QUOTE_TRANSLATION)

        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}\"\']+', '', text)

        # Remove extra spaces around punctuation
        text = re.sub(r'\s+([\.,:;!?])', r'\1', text)
        text = re.sub(r'([\.,:;!?])\s+', r'\1 ', text)

        return text.strip()

    def is_html_content(self, text: str) -> bool:
        """Return True when ``text`` contains at least one ``<...>`` sequence."""
        return bool(self._HTML_TAG_RE.search(text))

    def clean_html_content(self, html_content: str) -> str:
        """
        Convert HTML content to clean text while preserving structure.

        Args:
            html_content: Raw HTML content

        Returns:
            Clean text; headers and list items are separated by newlines.
            If parsing fails for any reason, the input with every ``<...>``
            sequence removed.
        """
        try:
            soup = BeautifulSoup(html_content, 'lxml')
            self._remove_unwanted_elements(soup)
            return self._extract_structured_text(soup)
        except Exception:
            # Deliberate best-effort: fall back to plain tag stripping.
            return self._HTML_TAG_RE.sub('', html_content)

    def _remove_unwanted_elements(self, soup: "BeautifulSoup") -> None:
        """Strip non-content nodes (scripts, styles, comments, empty tags) in place."""
        # Remove script and style elements
        for element in soup(['script', 'style', 'meta', 'link']):
            element.decompose()

        # Remove comments
        from bs4 import Comment
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Remove elements without any text, except void elements worth keeping
        for element in soup.find_all():
            if not element.get_text(strip=True) and element.name not in ['br', 'hr', 'img']:
                element.decompose()

    def _extract_structured_text(self, soup: "BeautifulSoup") -> str:
        """
        Extract text while preserving document structure.

        Headers are surrounded by blank lines and list items are put on their
        own line with a bullet prefix.

        NOTE(review): ``str.replace`` substitutes every occurrence of the
        extracted header/list text, so short or repeated strings can be
        decorated more than once - confirm whether that matters for callers.

        Args:
            soup: BeautifulSoup parsed HTML

        Returns:
            Structured text content
        """
        # Start with basic text extraction with proper spacing
        text = soup.get_text(separator=' ', strip=True)

        # Process specific elements for better structure
        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            header_text = element.get_text(strip=True)
            if header_text and header_text in text:
                text = text.replace(header_text, f'\n\n{header_text}\n')

        for element in soup.find_all(['li']):
            li_text = element.get_text(strip=True)
            if li_text and li_text in text:
                text = text.replace(li_text, f'\n• {li_text}')

        return self._normalize_structure_whitespace(text)

    @staticmethod
    def _normalize_structure_whitespace(text: str) -> str:
        """Collapse spacing while keeping the newlines that carry structure."""
        # Only horizontal whitespace is collapsed; a plain r'\s+' would also
        # swallow the newlines inserted around headers and list items.
        text = re.sub(r'[^\S\n]+', ' ', text)
        # Drop the single spaces left hugging a newline.
        text = re.sub(r' ?\n ?', '\n', text)
        # More than 2 newlines to 2
        text = re.sub(r'\n{3,}', '\n\n', text)
        return text.strip()

    def extract_metadata(self, text: str) -> dict:
        """Extract basic metadata from text.

        Returns:
            A dict with character/word/line counts, the number of non-blank
            lines (``paragraph_count``), flags for code, URLs, e-mail
            addresses and HTML, plus the ``html_*`` keys from
            ``_extract_html_metadata`` when the text is detected as HTML.
        """
        lines = text.split('\n')

        metadata = {
            'character_count': len(text),
            'word_count': len(text.split()),
            'line_count': len(lines),
            'paragraph_count': len([line for line in lines if line.strip()]),
            'has_code': bool(re.search(r'```|`[^`]+`', text)),
            'has_urls': bool(re.search(r'https?://\S+', text)),
            # The TLD class is [A-Za-z]; a '|' inside a character class is a
            # literal pipe, not alternation.
            'has_emails': bool(re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)),
            'is_html': self.is_html_content(text)
        }

        # Add HTML-specific metadata if content is HTML
        if metadata['is_html']:
            metadata.update(self._extract_html_metadata(text))

        return metadata

    def _extract_html_metadata(self, html_content: str) -> dict:
        """Count common HTML elements.

        Returns:
            A dict of ``html_*`` counters and flags, or
            ``{'html_parsing_error': True}`` when parsing fails.
        """
        try:
            soup = BeautifulSoup(html_content, 'lxml')

            return {
                'html_headers_count': len(soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])),
                'html_paragraphs_count': len(soup.find_all('p')),
                'html_lists_count': len(soup.find_all(['ul', 'ol'])),
                'html_list_items_count': len(soup.find_all('li')),
                'html_links_count': len(soup.find_all('a')),
                'html_images_count': len(soup.find_all('img')),
                'html_tables_count': len(soup.find_all('table')),
                'html_has_forms': bool(soup.find('form')),
                'html_has_media': bool(soup.find_all(['img', 'video', 'audio'])),
            }
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit are not swallowed.
            return {
                'html_parsing_error': True
            }
207
+
208
+
209
class SemanticChunker:
    """Intelligent text chunking with semantic awareness.

    Text is split on the configured separators, the pieces are greedily
    merged up to ``chunk_size`` characters, and a tail of the previous chunk
    is then prepended to each following chunk as overlap.

    NOTE(review): because the overlap is prepended after merging, returned
    chunks can be longer than ``chunk_size``. Sub-chunks produced by
    ``_split_large_segment`` already overlap and receive this second overlap
    as well - confirm both are intended.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        overlap: int = 200,
        separators: Optional[List[str]] = None
    ):
        """Validate the parameters through ``ChunkConfig`` and store them.

        Args:
            chunk_size: Target chunk size in characters.
            overlap: Number of characters shared between consecutive chunks.
            separators: Split points in order of preference; ``None`` selects
                paragraph, line, sentence and word separators.
        """
        # Handle None separators for backward compatibility
        if separators is None:
            separators = ["\n\n", "\n", ". ", " "]

        self.config = ChunkConfig(
            chunk_size=chunk_size,
            overlap=overlap,
            separators=separators
        )

    def create_chunks(self, text: str) -> List[str]:
        """Split text into semantic chunks.

        Args:
            text: The text to split.

        Returns:
            The non-blank chunks in document order; an empty list when
            ``text`` is empty or whitespace-only.
        """
        if len(text) <= self.config.chunk_size:
            # Same blank filter as the multi-chunk path below, so empty
            # input never produces a [""] result.
            return [text] if text.strip() else []

        chunks = []
        current_chunk = ""

        # Split by separators in order of preference
        segments = self._split_by_separators(text, self.config.separators)

        for segment in segments:
            # If segment alone is too big, split it further
            if len(segment) > self.config.chunk_size:
                sub_chunks = self._split_large_segment(segment)

                # Flush what has been accumulated so far
                if current_chunk:
                    chunks.append(current_chunk.strip())
                    current_chunk = ""

                # Keep the last sub-chunk open so following segments can join it
                chunks.extend(sub_chunks[:-1])
                current_chunk = sub_chunks[-1] if sub_chunks else ""

            # If adding segment would exceed chunk size
            elif len(current_chunk) + len(segment) > self.config.chunk_size:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = segment
            else:
                current_chunk += segment

        # Add final chunk
        if current_chunk:
            chunks.append(current_chunk.strip())

        # Add overlap between chunks
        if self.config.overlap > 0:
            chunks = self._add_overlap(chunks)

        return [chunk for chunk in chunks if chunk.strip()]

    def _split_by_separators(self, text: str, separators: List[str]) -> List[str]:
        """Split text on every separator in turn.

        Each separator stays attached to the front of the piece that follows
        it, so concatenating the result reproduces ``text``.
        """
        segments = [text]

        for separator in separators:
            new_segments = []
            for segment in segments:
                # split() returns [segment] when the separator is absent.
                head, *tail = segment.split(separator)
                new_segments.append(head)
                new_segments.extend(separator + part for part in tail)
            segments = new_segments

        return segments

    def _split_large_segment(self, segment: str) -> List[str]:
        """Split a segment that is longer than ``chunk_size``.

        Consecutive pieces overlap by ``overlap`` characters whenever that
        still moves the window forward.
        """
        chunks = []
        start = 0

        while start < len(segment):
            end = start + self.config.chunk_size

            if end >= len(segment):
                chunks.append(segment[start:])
                break

            # Try to find a good breaking point
            break_point = self._find_break_point(segment, start, end)
            chunks.append(segment[start:break_point])

            # Step back by the overlap only if that still advances the
            # window. Otherwise (overlap >= size of the piece just emitted)
            # the same window would be produced again and the loop would
            # never terminate, so continue at the break point instead.
            next_start = break_point - self.config.overlap
            start = next_start if next_start > start else break_point

        return chunks

    def _find_break_point(self, text: str, start: int, end: int) -> int:
        """Find a good breaking point near the end position.

        Only the second half of the window is searched, scanning backwards
        from ``end``. Sentence endings are preferred over newlines, and
        newlines over spaces.

        Returns:
            The index just after the chosen character, or ``end`` when the
            window contains no candidate.
        """
        lower_bound = start + self.config.chunk_size // 2

        for break_chars in ('.!?', '\n', ' '):
            for i in range(end - 1, lower_bound, -1):
                if text[i] in break_chars:
                    return i + 1

        # No good break point found, use hard limit
        return end

    def _add_overlap(self, chunks: List[str]) -> List[str]:
        """Prefix every chunk after the first with the tail of its predecessor.

        The tail (at most ``overlap`` characters of the original previous
        chunk) and the chunk are joined with a single space.
        """
        overlap = self.config.overlap

        # With overlap 0, prev[-0:] would be the whole previous chunk.
        if len(chunks) <= 1 or overlap <= 0:
            return chunks

        overlapped_chunks = [chunks[0]]

        for prev_chunk, current_chunk in zip(chunks, chunks[1:]):
            # Slicing returns the whole chunk when it is shorter than overlap
            overlapped_chunks.append(prev_chunk[-overlap:] + " " + current_chunk)

        return overlapped_chunks
356
+
357
+
358
+ # Convenience function for backward compatibility
359
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 100) -> List[str]:
    """
    Chunk ``text`` with a throwaway ``SemanticChunker``.

    The chunker is built with the default separators.

    Args:
        text: Text to chunk
        chunk_size: Size of each chunk
        overlap: Overlap between chunks

    Returns:
        List of text chunks
    """
    return SemanticChunker(chunk_size=chunk_size, overlap=overlap).create_chunks(text)