signalwire-agents 0.1.6__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. signalwire_agents/__init__.py +130 -4
  2. signalwire_agents/agent_server.py +438 -32
  3. signalwire_agents/agents/bedrock.py +296 -0
  4. signalwire_agents/cli/__init__.py +18 -0
  5. signalwire_agents/cli/build_search.py +1367 -0
  6. signalwire_agents/cli/config.py +80 -0
  7. signalwire_agents/cli/core/__init__.py +10 -0
  8. signalwire_agents/cli/core/agent_loader.py +470 -0
  9. signalwire_agents/cli/core/argparse_helpers.py +179 -0
  10. signalwire_agents/cli/core/dynamic_config.py +71 -0
  11. signalwire_agents/cli/core/service_loader.py +303 -0
  12. signalwire_agents/cli/execution/__init__.py +10 -0
  13. signalwire_agents/cli/execution/datamap_exec.py +446 -0
  14. signalwire_agents/cli/execution/webhook_exec.py +134 -0
  15. signalwire_agents/cli/init_project.py +1225 -0
  16. signalwire_agents/cli/output/__init__.py +10 -0
  17. signalwire_agents/cli/output/output_formatter.py +255 -0
  18. signalwire_agents/cli/output/swml_dump.py +186 -0
  19. signalwire_agents/cli/simulation/__init__.py +10 -0
  20. signalwire_agents/cli/simulation/data_generation.py +374 -0
  21. signalwire_agents/cli/simulation/data_overrides.py +200 -0
  22. signalwire_agents/cli/simulation/mock_env.py +282 -0
  23. signalwire_agents/cli/swaig_test_wrapper.py +52 -0
  24. signalwire_agents/cli/test_swaig.py +809 -0
  25. signalwire_agents/cli/types.py +81 -0
  26. signalwire_agents/core/__init__.py +2 -2
  27. signalwire_agents/core/agent/__init__.py +12 -0
  28. signalwire_agents/core/agent/config/__init__.py +12 -0
  29. signalwire_agents/core/agent/deployment/__init__.py +9 -0
  30. signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
  31. signalwire_agents/core/agent/prompt/__init__.py +14 -0
  32. signalwire_agents/core/agent/prompt/manager.py +306 -0
  33. signalwire_agents/core/agent/routing/__init__.py +9 -0
  34. signalwire_agents/core/agent/security/__init__.py +9 -0
  35. signalwire_agents/core/agent/swml/__init__.py +9 -0
  36. signalwire_agents/core/agent/tools/__init__.py +15 -0
  37. signalwire_agents/core/agent/tools/decorator.py +97 -0
  38. signalwire_agents/core/agent/tools/registry.py +210 -0
  39. signalwire_agents/core/agent_base.py +959 -2166
  40. signalwire_agents/core/auth_handler.py +233 -0
  41. signalwire_agents/core/config_loader.py +259 -0
  42. signalwire_agents/core/contexts.py +707 -0
  43. signalwire_agents/core/data_map.py +487 -0
  44. signalwire_agents/core/function_result.py +1150 -1
  45. signalwire_agents/core/logging_config.py +376 -0
  46. signalwire_agents/core/mixins/__init__.py +28 -0
  47. signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
  48. signalwire_agents/core/mixins/auth_mixin.py +287 -0
  49. signalwire_agents/core/mixins/prompt_mixin.py +358 -0
  50. signalwire_agents/core/mixins/serverless_mixin.py +368 -0
  51. signalwire_agents/core/mixins/skill_mixin.py +55 -0
  52. signalwire_agents/core/mixins/state_mixin.py +153 -0
  53. signalwire_agents/core/mixins/tool_mixin.py +230 -0
  54. signalwire_agents/core/mixins/web_mixin.py +1134 -0
  55. signalwire_agents/core/security/session_manager.py +174 -86
  56. signalwire_agents/core/security_config.py +333 -0
  57. signalwire_agents/core/skill_base.py +200 -0
  58. signalwire_agents/core/skill_manager.py +244 -0
  59. signalwire_agents/core/swaig_function.py +33 -9
  60. signalwire_agents/core/swml_builder.py +212 -12
  61. signalwire_agents/core/swml_handler.py +43 -13
  62. signalwire_agents/core/swml_renderer.py +123 -297
  63. signalwire_agents/core/swml_service.py +277 -260
  64. signalwire_agents/prefabs/concierge.py +6 -2
  65. signalwire_agents/prefabs/info_gatherer.py +149 -33
  66. signalwire_agents/prefabs/receptionist.py +14 -22
  67. signalwire_agents/prefabs/survey.py +6 -2
  68. signalwire_agents/schema.json +9218 -5489
  69. signalwire_agents/search/__init__.py +137 -0
  70. signalwire_agents/search/document_processor.py +1223 -0
  71. signalwire_agents/search/index_builder.py +804 -0
  72. signalwire_agents/search/migration.py +418 -0
  73. signalwire_agents/search/models.py +30 -0
  74. signalwire_agents/search/pgvector_backend.py +752 -0
  75. signalwire_agents/search/query_processor.py +502 -0
  76. signalwire_agents/search/search_engine.py +1264 -0
  77. signalwire_agents/search/search_service.py +574 -0
  78. signalwire_agents/skills/README.md +452 -0
  79. signalwire_agents/skills/__init__.py +23 -0
  80. signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
  81. signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
  82. signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
  83. signalwire_agents/skills/datasphere/README.md +210 -0
  84. signalwire_agents/skills/datasphere/__init__.py +12 -0
  85. signalwire_agents/skills/datasphere/skill.py +310 -0
  86. signalwire_agents/skills/datasphere_serverless/README.md +258 -0
  87. signalwire_agents/skills/datasphere_serverless/__init__.py +10 -0
  88. signalwire_agents/skills/datasphere_serverless/skill.py +237 -0
  89. signalwire_agents/skills/datetime/README.md +132 -0
  90. signalwire_agents/skills/datetime/__init__.py +10 -0
  91. signalwire_agents/skills/datetime/skill.py +126 -0
  92. signalwire_agents/skills/joke/README.md +149 -0
  93. signalwire_agents/skills/joke/__init__.py +10 -0
  94. signalwire_agents/skills/joke/skill.py +109 -0
  95. signalwire_agents/skills/math/README.md +161 -0
  96. signalwire_agents/skills/math/__init__.py +10 -0
  97. signalwire_agents/skills/math/skill.py +105 -0
  98. signalwire_agents/skills/mcp_gateway/README.md +230 -0
  99. signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
  100. signalwire_agents/skills/mcp_gateway/skill.py +421 -0
  101. signalwire_agents/skills/native_vector_search/README.md +210 -0
  102. signalwire_agents/skills/native_vector_search/__init__.py +10 -0
  103. signalwire_agents/skills/native_vector_search/skill.py +820 -0
  104. signalwire_agents/skills/play_background_file/README.md +218 -0
  105. signalwire_agents/skills/play_background_file/__init__.py +12 -0
  106. signalwire_agents/skills/play_background_file/skill.py +242 -0
  107. signalwire_agents/skills/registry.py +459 -0
  108. signalwire_agents/skills/spider/README.md +236 -0
  109. signalwire_agents/skills/spider/__init__.py +13 -0
  110. signalwire_agents/skills/spider/skill.py +598 -0
  111. signalwire_agents/skills/swml_transfer/README.md +395 -0
  112. signalwire_agents/skills/swml_transfer/__init__.py +10 -0
  113. signalwire_agents/skills/swml_transfer/skill.py +359 -0
  114. signalwire_agents/skills/weather_api/README.md +178 -0
  115. signalwire_agents/skills/weather_api/__init__.py +12 -0
  116. signalwire_agents/skills/weather_api/skill.py +191 -0
  117. signalwire_agents/skills/web_search/README.md +163 -0
  118. signalwire_agents/skills/web_search/__init__.py +10 -0
  119. signalwire_agents/skills/web_search/skill.py +739 -0
  120. signalwire_agents/skills/wikipedia_search/README.md +228 -0
  121. signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
  122. signalwire_agents/skills/wikipedia_search/skill.py +210 -0
  123. signalwire_agents/utils/__init__.py +14 -0
  124. signalwire_agents/utils/schema_utils.py +111 -44
  125. signalwire_agents/web/__init__.py +17 -0
  126. signalwire_agents/web/web_service.py +559 -0
  127. signalwire_agents-1.0.7.data/data/share/man/man1/sw-agent-init.1 +307 -0
  128. signalwire_agents-1.0.7.data/data/share/man/man1/sw-search.1 +483 -0
  129. signalwire_agents-1.0.7.data/data/share/man/man1/swaig-test.1 +308 -0
  130. signalwire_agents-1.0.7.dist-info/METADATA +992 -0
  131. signalwire_agents-1.0.7.dist-info/RECORD +142 -0
  132. {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/WHEEL +1 -1
  133. signalwire_agents-1.0.7.dist-info/entry_points.txt +4 -0
  134. signalwire_agents/core/state/file_state_manager.py +0 -219
  135. signalwire_agents/core/state/state_manager.py +0 -101
  136. signalwire_agents-0.1.6.data/data/schema.json +0 -5611
  137. signalwire_agents-0.1.6.dist-info/METADATA +0 -199
  138. signalwire_agents-0.1.6.dist-info/RECORD +0 -34
  139. {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/licenses/LICENSE +0 -0
  140. {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/top_level.txt +0 -0
signalwire_agents/search/document_processor.py (new file)
@@ -0,0 +1,1223 @@
+ """
+ Copyright (c) 2025 SignalWire
+
+ This file is part of the SignalWire AI Agents SDK.
+
+ Licensed under the MIT License.
+ See LICENSE file in the project root for full license information.
+ """
+
+ import re
+ import hashlib
+ import json
+ import logging
+ from typing import List, Dict, Any, Optional
+ from pathlib import Path
+
+ # Document processing imports
+ try:
+     import pdfplumber
+ except ImportError:
+     pdfplumber = None
+
+ try:
+     from docx import Document as DocxDocument
+ except ImportError:
+     DocxDocument = None
+
+ try:
+     from bs4 import BeautifulSoup
+ except ImportError:
+     BeautifulSoup = None
+
+ try:
+     import markdown
+ except ImportError:
+     markdown = None
+
+ try:
+     from striprtf.striprtf import rtf_to_text
+ except ImportError:
+     rtf_to_text = None
+
+ try:
+     from openpyxl import load_workbook
+ except ImportError:
+     load_workbook = None
+
+ try:
+     from pptx import Presentation
+ except ImportError:
+     Presentation = None
+
+ try:
+     from nltk.tokenize import sent_tokenize
+     import nltk
+     # Ensure NLTK data is available
+     try:
+         nltk.data.find('tokenizers/punkt')
+     except LookupError:
+         nltk.download('punkt', quiet=True)
+ except ImportError:
+     sent_tokenize = None
+     nltk = None
+
+ try:
+     import magic
+ except ImportError:
+     magic = None
+
+ from .query_processor import preprocess_document_content
+
+ logger = logging.getLogger(__name__)
+
+ class DocumentProcessor:
+     """Enhanced document processor with smart chunking capabilities"""
+
+     def __init__(
+         self,
+         chunking_strategy: str = 'sentence',
+         max_sentences_per_chunk: int = 5,
+         chunk_size: int = 50,
+         chunk_overlap: int = 10,
+         split_newlines: Optional[int] = None,
+         index_nlp_backend: str = 'nltk',
+         verbose: bool = False,
+         semantic_threshold: float = 0.5,
+         topic_threshold: float = 0.3
+     ):
+         """
+         Initialize document processor
+
+         Args:
+             chunking_strategy: Strategy for chunking documents:
+                 - 'sentence': Sentence-based chunking with overlap
+                 - 'sliding': Sliding window with word-based chunks
+                 - 'paragraph': Natural paragraph boundaries
+                 - 'page': Page-based chunking (for PDFs)
+                 - 'semantic': Semantic similarity-based chunking
+                 - 'topic': Topic modeling-based chunking
+                 - 'qa': Question-answer optimized chunking
+                 - 'json': JSON structure-aware chunking
+                 - 'markdown': Markdown structure-aware chunking with code block detection
+             max_sentences_per_chunk: For sentence strategy (default: 5)
+             chunk_size: For sliding strategy - words per chunk (default: 50)
+             chunk_overlap: For sliding strategy - overlap in words (default: 10)
+             split_newlines: For sentence strategy - split on multiple newlines (optional)
+             index_nlp_backend: NLP backend for indexing (default: 'nltk')
+             verbose: Whether to enable verbose logging (default: False)
+             semantic_threshold: Similarity threshold for semantic chunking (default: 0.5)
+             topic_threshold: Similarity threshold for topic chunking (default: 0.3)
+         """
+         self.chunking_strategy = chunking_strategy
+         self.max_sentences_per_chunk = max_sentences_per_chunk
+         self.chunk_size = chunk_size
+         self.chunk_overlap = chunk_overlap
+         self.split_newlines = split_newlines
+         self.semantic_threshold = semantic_threshold
+         self.topic_threshold = topic_threshold
+
+         # Legacy support for old character-based chunking
+         self.chunk_overlap = chunk_overlap
+
+     def create_chunks(self, content: str, filename: str,
+                       file_type: str) -> List[Dict[str, Any]]:
+         """
+         Create chunks from document content using specified chunking strategy
+
+         Args:
+             content: Document content (string) - should be the actual content, not a file path
+             filename: Name of the file (for metadata)
+             file_type: File extension/type
+
+         Returns:
+             List of chunk dictionaries
+         """
+
+         # Apply chunking strategy
+         if self.chunking_strategy == 'sentence':
+             return self._chunk_by_sentences(content, filename, file_type)
+         elif self.chunking_strategy == 'sliding':
+             return self._chunk_by_sliding_window(content, filename, file_type)
+         elif self.chunking_strategy == 'paragraph':
+             return self._chunk_by_paragraphs(content, filename, file_type)
+         elif self.chunking_strategy == 'page':
+             return self._chunk_by_pages(content, filename, file_type)
+         elif self.chunking_strategy == 'semantic':
+             return self._chunk_by_semantic(content, filename, file_type)
+         elif self.chunking_strategy == 'topic':
+             return self._chunk_by_topics(content, filename, file_type)
+         elif self.chunking_strategy == 'qa':
+             return self._chunk_by_qa_optimization(content, filename, file_type)
+         elif self.chunking_strategy == 'json':
+             return self._chunk_from_json(content, filename, file_type)
+         elif self.chunking_strategy == 'markdown':
+             # Use markdown-aware chunking for better structure preservation
+             return self._chunk_markdown_enhanced(content, filename)
+         else:
+             # Fallback to sentence-based chunking
+             return self._chunk_by_sentences(content, filename, file_type)
+
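For orientation, a minimal usage sketch based on the constructor and `create_chunks` signature added above; the text and filename are illustrative, and the snippet is not part of the package:

```python
from signalwire_agents.search.document_processor import DocumentProcessor

# Illustrative content; create_chunks expects the text itself, not a file path.
text = "SignalWire agents can answer calls. They can also search documents. " * 20

processor = DocumentProcessor(chunking_strategy="sentence", max_sentences_per_chunk=5)
chunks = processor.create_chunks(text, filename="notes.txt", file_type="txt")

for chunk in chunks:
    # Each chunk is a dict with 'content', 'section', and 'metadata' keys
    print(chunk["section"], chunk["metadata"]["sentence_count"])
```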
+     def _extract_text_from_file(self, file_path: str) -> Any:
+         """Extract text from various file formats"""
+         if not magic:
+             # Fallback to extension-based detection
+             file_path_obj = Path(file_path)
+             extension = file_path_obj.suffix.lower()
+
+             if extension == '.pdf':
+                 file_type = 'application/pdf'
+             elif extension == '.docx':
+                 file_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+             elif extension in ['.txt', '.md']:
+                 file_type = 'text/plain'
+             elif extension == '.html':
+                 file_type = 'text/html'
+             elif extension == '.rtf':
+                 file_type = 'application/rtf'
+             elif extension == '.xlsx':
+                 file_type = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
+             elif extension == '.pptx':
+                 file_type = 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
+             else:
+                 file_type = 'text/plain'
+         else:
+             mime = magic.Magic(mime=True)
+             file_type = mime.from_file(file_path)
+
+         if 'pdf' in file_type:
+             return self._extract_pdf(file_path)
+         elif 'vnd.openxmlformats-officedocument.wordprocessingml.document' in file_type:
+             return self._extract_docx(file_path)
+         elif 'plain' in file_type or 'text' in file_type:
+             return self._extract_text(file_path)
+         elif 'html' in file_type:
+             return self._extract_html(file_path)
+         elif 'markdown' in file_type or file_path.endswith('.md'):
+             return self._extract_markdown(file_path)
+         elif 'rtf' in file_type:
+             return self._extract_rtf(file_path)
+         elif 'vnd.openxmlformats-officedocument.spreadsheetml.sheet' in file_type:
+             return self._extract_excel(file_path)
+         elif 'vnd.openxmlformats-officedocument.presentationml.presentation' in file_type:
+             return self._extract_powerpoint(file_path)
+         else:
+             return json.dumps({"error": f"Unsupported file type: {file_type}"})
+
+     def _extract_pdf(self, file_path: str):
+         """Extract text from PDF files"""
+         if not pdfplumber:
+             return json.dumps({"error": "pdfplumber not available for PDF processing"})
+
+         try:
+             with pdfplumber.open(file_path) as pdf:
+                 pages = []
+                 for page in pdf.pages:
+                     text = page.extract_text()
+                     if text:
+                         # Remove page number from the beginning
+                         text = re.sub(r'^\d+\.\s*', '', text.strip())
+                         pages.append(text)
+                 return pages
+         except Exception as e:
+             return json.dumps({"error": f"Error processing PDF: {e}"})
+
+     def _extract_docx(self, file_path: str):
+         """Extract text from DOCX files"""
+         if not DocxDocument:
+             return json.dumps({"error": "python-docx not available for DOCX processing"})
+
+         try:
+             doc = DocxDocument(file_path)
+             return [para.text for para in doc.paragraphs if para.text.strip()]
+         except Exception as e:
+             return json.dumps({"error": f"Error processing DOCX: {e}"})
+
+     def _extract_text(self, file_path: str):
+         """Extract text from plain text files"""
+         try:
+             with open(file_path, 'r', encoding='utf-8') as file:
+                 return file.read()
+         except Exception as e:
+             return json.dumps({"error": f"Error processing TXT: {e}"})
+
+     def _extract_html(self, file_path: str):
+         """Extract text from HTML files"""
+         if not BeautifulSoup:
+             return json.dumps({"error": "beautifulsoup4 not available for HTML processing"})
+
+         try:
+             with open(file_path, 'r', encoding='utf-8') as file:
+                 soup = BeautifulSoup(file, 'html.parser')
+                 return soup.get_text(separator='\n')
+         except Exception as e:
+             return json.dumps({"error": f"Error processing HTML: {e}"})
+
+     def _extract_markdown(self, file_path: str):
+         """Extract text from Markdown files"""
+         try:
+             with open(file_path, 'r', encoding='utf-8') as file:
+                 content = file.read()
+                 if markdown and BeautifulSoup:
+                     html = markdown.markdown(content)
+                     soup = BeautifulSoup(html, 'html.parser')
+                     return soup.get_text(separator='\n')
+                 else:
+                     # Fallback to raw markdown
+                     return content
+         except Exception as e:
+             return json.dumps({"error": f"Error processing Markdown: {e}"})
+
+     def _extract_rtf(self, file_path: str):
+         """Extract text from RTF files"""
+         if not rtf_to_text:
+             return json.dumps({"error": "striprtf not available for RTF processing"})
+
+         try:
+             with open(file_path, 'r', encoding='utf-8') as file:
+                 return rtf_to_text(file.read())
+         except Exception as e:
+             return json.dumps({"error": f"Error processing RTF: {e}"})
+
+     def _extract_excel(self, file_path: str):
+         """Extract text from Excel files"""
+         if not load_workbook:
+             return json.dumps({"error": "openpyxl not available for Excel processing"})
+
+         try:
+             wb = load_workbook(file_path)
+             sheets_text = []
+             for sheet in wb.worksheets:
+                 for row in sheet.iter_rows(values_only=True):
+                     row_text = ' '.join([str(cell) for cell in row if cell is not None])
+                     sheets_text.append(row_text)
+             return "\n".join(sheets_text)
+         except Exception as e:
+             return json.dumps({"error": f"Error processing Excel: {e}"})
+
+     def _extract_powerpoint(self, file_path: str):
+         """Extract text from PowerPoint files"""
+         if not Presentation:
+             return json.dumps({"error": "python-pptx not available for PowerPoint processing"})
+
+         try:
+             prs = Presentation(file_path)
+             slides_text = []
+             for slide in prs.slides:
+                 slide_text = []
+                 for shape in slide.shapes:
+                     if hasattr(shape, "text"):
+                         slide_text.append(shape.text)
+                 slides_text.append("\n".join(slide_text))
+             return slides_text
+         except Exception as e:
+             return json.dumps({"error": f"Error processing PowerPoint: {e}"})
+
+     def _chunk_document_aware(self, content: Any, filename: str, file_type: str) -> List[Dict[str, Any]]:
+         """Smart chunking for documents with natural structure"""
+         chunks = []
+
+         if isinstance(content, list):
+             # Handle page-based or paragraph-based content (PDF, DOCX, PPTX)
+             for i, page_content in enumerate(content):
+                 if not page_content or not page_content.strip():
+                     continue
+
+                 # For each page/slide, use sentence-based chunking if it's large
+                 if len(page_content) > self.chunk_size:
+                     page_chunks = self._sentence_based_chunking(
+                         page_content,
+                         max_sentences_per_chunk=self._calculate_sentences_per_chunk(page_content)
+                     )
+                     for j, chunk_content in enumerate(page_chunks):
+                         chunks.append(self._create_chunk(
+                             content=chunk_content,
+                             filename=filename,
+                             section=f"Page {i+1}" if file_type == 'pdf' else f"Slide {i+1}" if file_type == 'pptx' else f"Section {i+1}",
+                             metadata={'page_number': i+1, 'chunk_index': j}
+                         ))
+                 else:
+                     # Small page/slide - keep as single chunk
+                     chunks.append(self._create_chunk(
+                         content=page_content,
+                         filename=filename,
+                         section=f"Page {i+1}" if file_type == 'pdf' else f"Slide {i+1}" if file_type == 'pptx' else f"Section {i+1}",
+                         metadata={'page_number': i+1}
+                     ))
+         else:
+             # Single text content - use paragraph-aware chunking
+             chunks = self._chunk_text_enhanced(content, filename)
+
+         return chunks
+
+     def _chunk_markdown_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:
+         """Enhanced markdown chunking with code block detection and rich metadata
+
+         Features:
+         - Tracks header hierarchy for section paths
+         - Detects code blocks and extracts language
+         - Adds 'code' tags to chunks containing code
+         - Preserves markdown structure for better search
+         """
+         chunks = []
+         lines = content.split('\n')
+
+         current_section = None
+         current_hierarchy = []  # Track header hierarchy
+         current_chunk = []
+         current_size = 0
+         line_start = 1
+         in_code_block = False
+         code_languages = []  # Track languages in current chunk
+         has_code = False
+
+         for line_num, line in enumerate(lines, 1):
+             # Check for code block fences
+             code_fence_match = re.match(r'^```(\w+)?', line)
+             if code_fence_match:
+                 in_code_block = not in_code_block
+                 if in_code_block:
+                     # Starting code block
+                     has_code = True
+                     lang = code_fence_match.group(1)
+                     if lang and lang not in code_languages:
+                         code_languages.append(lang)
+
+             # Check for headers with hierarchy tracking
+             header_match = re.match(r'^(#{1,6})\s+(.+)', line) if not in_code_block else None
+             if header_match:
+                 header_level = len(header_match.group(1))
+                 header_text = header_match.group(2).strip()
+
+                 # Save current chunk if it exists
+                 if current_chunk:
+                     chunk_metadata = self._build_markdown_metadata(
+                         current_hierarchy, code_languages, has_code
+                     )
+                     chunks.append(self._create_chunk(
+                         content='\n'.join(current_chunk),
+                         filename=filename,
+                         section=self._build_section_path(current_hierarchy),
+                         start_line=line_start,
+                         end_line=line_num - 1,
+                         metadata=chunk_metadata
+                     ))
+
+                 # Update hierarchy
+                 current_hierarchy = current_hierarchy[:header_level-1] + [header_text]
+                 current_section = header_text
+                 current_chunk = [line]
+                 current_size = len(line)
+                 line_start = line_num
+                 code_languages = []
+                 has_code = False
+
+             else:
+                 current_chunk.append(line)
+                 current_size += len(line) + 1
+
+                 # Check if chunk is getting too large - use smart splitting
+                 # But don't split inside code blocks
+                 if current_size >= self.chunk_size and not in_code_block:
+                     # Try to split at paragraph boundary first
+                     split_point = self._find_best_split_point(current_chunk)
+
+                     chunk_to_save = current_chunk[:split_point]
+                     chunk_metadata = self._build_markdown_metadata(
+                         current_hierarchy, code_languages, has_code
+                     )
+                     chunks.append(self._create_chunk(
+                         content='\n'.join(chunk_to_save),
+                         filename=filename,
+                         section=self._build_section_path(current_hierarchy),
+                         start_line=line_start,
+                         end_line=line_start + split_point - 1,
+                         metadata=chunk_metadata
+                     ))
+
+                     # Start new chunk with overlap
+                     overlap_lines = self._get_overlap_lines(chunk_to_save)
+                     remaining_lines = current_chunk[split_point:]
+                     current_chunk = overlap_lines + remaining_lines
+                     current_size = sum(len(line) + 1 for line in current_chunk)
+                     line_start = line_start + split_point - len(overlap_lines)
+                     # Reset code tracking for new chunk
+                     code_languages = []
+                     has_code = False
+
+         # Add final chunk
+         if current_chunk:
+             chunk_metadata = self._build_markdown_metadata(
+                 current_hierarchy, code_languages, has_code
+             )
+             chunks.append(self._create_chunk(
+                 content='\n'.join(current_chunk),
+                 filename=filename,
+                 section=self._build_section_path(current_hierarchy),
+                 start_line=line_start,
+                 end_line=len(lines),
+                 metadata=chunk_metadata
+             ))
+
+         return chunks
+
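To make the markdown-aware behavior concrete, a small sketch of what the method above produces (illustrative input; the expected section path and tags follow from the hierarchy and tag logic defined later in `_build_markdown_metadata`):

```python
from signalwire_agents.search.document_processor import DocumentProcessor

# Illustrative markdown with one fenced bash block.
md = "\n".join([
    "# Install",
    "## Requirements",
    "Use pip:",
    "```bash",
    "pip install signalwire-agents",
    "```",
])

# chunk_size is in characters for this strategy; 1000 keeps the example in one piece.
processor = DocumentProcessor(chunking_strategy="markdown", chunk_size=1000)
chunk = processor.create_chunks(md, filename="install.md", file_type="md")[-1]

print(chunk["section"])               # Install > Requirements
print(chunk["metadata"].get("tags"))  # ['code', 'code:bash', 'depth:2']
```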
+     def _chunk_python_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:
+         """Enhanced Python code chunking with better function/class detection"""
+         chunks = []
+         lines = content.split('\n')
+
+         current_function = None
+         current_class = None
+         current_chunk = []
+         current_size = 0
+         line_start = 1
+         indent_level = 0
+
+         for line_num, line in enumerate(lines, 1):
+             # Detect class definitions
+             class_match = re.match(r'^(\s*)(class\s+([^(:\s]+))', line)
+             if class_match:
+                 indent = len(class_match.group(1))
+                 class_name = class_match.group(3)
+
+                 # Save current chunk if switching context
+                 if current_chunk and (indent <= indent_level or current_class != class_name):
+                     chunks.append(self._create_chunk(
+                         content='\n'.join(current_chunk),
+                         filename=filename,
+                         section=self._build_python_section(current_class, current_function),
+                         start_line=line_start,
+                         end_line=line_num - 1
+                     ))
+                     current_chunk = []
+                     line_start = line_num
+
+                 current_class = class_name
+                 current_function = None
+                 indent_level = indent
+
+             # Detect function definitions
+             func_match = re.match(r'^(\s*)(def\s+([^(:\s]+)|async\s+def\s+([^(:\s]+))', line)
+             if func_match:
+                 indent = len(func_match.group(1))
+                 func_name = func_match.group(3) or func_match.group(4)
+
+                 # Save current chunk if switching to new function at same or lower level
+                 if current_chunk and indent <= indent_level:
+                     chunks.append(self._create_chunk(
+                         content='\n'.join(current_chunk),
+                         filename=filename,
+                         section=self._build_python_section(current_class, current_function),
+                         start_line=line_start,
+                         end_line=line_num - 1
+                     ))
+                     current_chunk = []
+                     line_start = line_num
+
+                 if indent >= indent_level:  # Method within class or nested function
+                     current_function = func_name
+                 else:  # Top-level function
+                     current_function = func_name
+                     current_class = None
+
+                 indent_level = indent
+
+             current_chunk.append(line)
+             current_size += len(line) + 1
+
+             # Handle oversized chunks
+             if current_size >= self.chunk_size:
+                 chunks.append(self._create_chunk(
+                     content='\n'.join(current_chunk),
+                     filename=filename,
+                     section=self._build_python_section(current_class, current_function),
+                     start_line=line_start,
+                     end_line=line_num
+                 ))
+
+                 # Start new chunk with minimal overlap for code
+                 overlap_lines = current_chunk[-2:] if len(current_chunk) > 2 else current_chunk
+                 current_chunk = overlap_lines
+                 current_size = sum(len(line) + 1 for line in overlap_lines)
+                 line_start = line_num - len(overlap_lines) + 1
+
+         # Add final chunk
+         if current_chunk:
+             chunks.append(self._create_chunk(
+                 content='\n'.join(current_chunk),
+                 filename=filename,
+                 section=self._build_python_section(current_class, current_function),
+                 start_line=line_start,
+                 end_line=len(lines)
+             ))
+
+         return chunks
+
+     def _chunk_text_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:
+         """Enhanced text chunking using sentence-based approach"""
+         if isinstance(content, list):
+             content = '\n'.join(content)
+
+         # Use sentence-based chunking for better coherence
+         max_sentences = self._calculate_sentences_per_chunk(content)
+         sentences = self._sentence_based_chunking(content, max_sentences)
+
+         chunks = []
+         for i, chunk_content in enumerate(sentences):
+             chunks.append(self._create_chunk(
+                 content=chunk_content,
+                 filename=filename,
+                 section=f"Section {i+1}",
+                 metadata={'chunk_method': 'sentence_based', 'chunk_index': i}
+             ))
+
+         return chunks
+
+     def _sentence_based_chunking(self, text: str, max_sentences_per_chunk: int, split_newlines: int = 2) -> List[str]:
+         """Sentence-based chunking with enhancements"""
+         if not sent_tokenize:
+             # Fallback to simple splitting
+             sentences = text.split('. ')
+             sentences = [s.strip() + '.' for s in sentences if s.strip()]
+         else:
+             sentences = []
+
+             if split_newlines > 0:
+                 # Create regex pattern for specified number of newlines
+                 newline_pattern = r'(\n{%d,})' % split_newlines
+                 parts = re.split(newline_pattern, text)
+
+                 for part in parts:
+                     part = part.strip()
+                     if part and not re.match(newline_pattern, part):
+                         sentences.extend(sent_tokenize(part))
+                     elif re.match(newline_pattern, part):
+                         sentences.append(part)
+             else:
+                 sentences = [sentence.strip() for sentence in sent_tokenize(text) if sentence.strip()]
+
+         # Create chunks of sentences with overlap
+         chunks = []
+         overlap_sentences = max(1, max_sentences_per_chunk // 4)  # 25% overlap
+
+         for i in range(0, len(sentences), max_sentences_per_chunk - overlap_sentences):
+             chunk_sentences = sentences[i:i + max_sentences_per_chunk]
+             if chunk_sentences:
+                 chunks.append(' '.join(chunk_sentences))
+
+         return chunks
+
+     def _calculate_sentences_per_chunk(self, text: str) -> int:
+         """Calculate optimal sentences per chunk based on average sentence length"""
+         if not sent_tokenize:
+             # Fallback calculation
+             sentences = text.split('. ')
+         else:
+             sentences = sent_tokenize(text)
+
+         if not sentences:
+             return 1
+
+         avg_sentence_length = sum(len(s) for s in sentences) / len(sentences)
+         # Target chunk size divided by average sentence length
+         optimal_sentences = max(1, int(self.chunk_size / avg_sentence_length))
+         return min(optimal_sentences, 10)  # Cap at 10 sentences for readability
+
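The overlap arithmetic in `_sentence_based_chunking` above is easy to verify by hand; a quick worked example with the default of 5 sentences per chunk:

```python
max_sentences_per_chunk = 5
overlap_sentences = max(1, max_sentences_per_chunk // 4)  # 25% overlap -> 1 sentence
step = max_sentences_per_chunk - overlap_sentences        # windows advance 4 sentences

# 12 sentences therefore yield windows [0:5], [4:9], [8:12],
# with one shared sentence between consecutive chunks.
print([(i, i + max_sentences_per_chunk) for i in range(0, 12, step)])
```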
+     def _build_section_path(self, hierarchy: List[str]) -> str:
+         """Build hierarchical section path from header hierarchy"""
+         return ' > '.join(hierarchy) if hierarchy else None
+
+     def _build_markdown_metadata(self, hierarchy: List[str], code_languages: List[str], has_code: bool) -> Dict[str, Any]:
+         """Build rich metadata for markdown chunks
+
+         Args:
+             hierarchy: Current header hierarchy (e.g., ['Installation', 'Requirements', 'Python'])
+             code_languages: List of code block languages found in chunk (e.g., ['python', 'bash'])
+             has_code: Whether chunk contains any code blocks
+
+         Returns:
+             Dictionary with markdown-specific metadata including tags
+         """
+         metadata = {
+             'chunk_type': 'markdown',
+         }
+
+         # Add header level metadata
+         if hierarchy:
+             for i, header in enumerate(hierarchy, 1):
+                 metadata[f'h{i}'] = header
+
+         # Add code-related metadata
+         if has_code:
+             metadata['has_code'] = True
+             if code_languages:
+                 metadata['code_languages'] = code_languages
+
+         # Build tags for enhanced searching
+         tags = []
+         if has_code:
+             tags.append('code')
+             # Add language-specific tags
+             for lang in code_languages:
+                 tags.append(f'code:{lang}')
+
+         # Add tags for header levels (searchable by section depth)
+         if len(hierarchy) > 0:
+             tags.append(f'depth:{len(hierarchy)}')
+
+         if tags:
+             metadata['tags'] = tags
+
+         return metadata
+
+     def _build_python_section(self, class_name: Optional[str], function_name: Optional[str]) -> str:
+         """Build section name for Python code"""
+         if class_name and function_name:
+             return f"{class_name}.{function_name}"
+         elif class_name:
+             return class_name
+         elif function_name:
+             return function_name
+         else:
+             return None
+
+     def _find_best_split_point(self, lines: List[str]) -> int:
+         """Find the best point to split a chunk (prefer paragraph boundaries)"""
+         # Look for empty lines (paragraph boundaries) in the last 25% of the chunk
+         start_search = max(1, len(lines) * 3 // 4)
+
+         for i in range(len(lines) - 1, start_search - 1, -1):
+             if not lines[i].strip():  # Empty line
+                 return i
+
+         # If no paragraph boundary found, split at 75% of chunk size
+         return max(1, len(lines) * 3 // 4)
+
+     def _create_chunk(self, content: str, filename: str,
+                       section: Optional[str] = None,
+                       start_line: Optional[int] = None,
+                       end_line: Optional[int] = None,
+                       metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+         """Create chunk dictionary with enhanced metadata"""
+         base_metadata = {
+             'file_type': Path(filename).suffix.lstrip('.'),
+             'chunk_size': len(content),
+             'word_count': len(content.split()),
+         }
+
+         # Handle sentence count with fallback
+         try:
+             if sent_tokenize and content.strip():
+                 base_metadata['sentence_count'] = len(sent_tokenize(content))
+             else:
+                 # Fallback: count sentences by periods
+                 base_metadata['sentence_count'] = len([s for s in content.split('.') if s.strip()])
+         except Exception as e:
+             logger.warning(f"Error counting sentences: {e}")
+             # Simple fallback: count periods
+             base_metadata['sentence_count'] = len([s for s in content.split('.') if s.strip()])
+
+         if metadata:
+             base_metadata.update(metadata)
+
+         return {
+             'content': content.strip(),
+             'filename': filename,
+             'section': section,
+             'start_line': start_line,
+             'end_line': end_line,
+             'metadata': base_metadata
+         }
+
+     def _get_overlap_lines(self, lines: List[str]) -> List[str]:
+         """Get overlap lines for chunk continuity"""
+         if not lines:
+             return []
+
+         # Calculate overlap size in characters
+         overlap_chars = self.chunk_overlap
+         overlap_lines = []
+         char_count = 0
+
+         # Take lines from the end until we reach overlap size
+         for line in reversed(lines):
+             if char_count + len(line) <= overlap_chars:
+                 overlap_lines.insert(0, line)
+                 char_count += len(line) + 1
+             else:
+                 break
+
+         return overlap_lines
+
+     def _chunk_by_sentences(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+         """Chunk content by sentences with specified max sentences per chunk"""
+         if isinstance(content, list):
+             content = '\n'.join(content)
+
+         # Use sentence-based chunking
+         split_newlines = self.split_newlines if self.split_newlines is not None else 2
+         sentences = self._sentence_based_chunking(content, self.max_sentences_per_chunk, split_newlines)
+
+         chunks = []
+         for i, chunk_content in enumerate(sentences):
+             chunks.append(self._create_chunk(
+                 content=chunk_content,
+                 filename=filename,
+                 section=f"Section {i+1}",
+                 metadata={
+                     'chunk_method': 'sentence_based',
+                     'chunk_index': i,
+                     'max_sentences_per_chunk': self.max_sentences_per_chunk,
+                     'split_newlines': split_newlines
+                 }
+             ))
+
+         return chunks
+
+     def _chunk_by_sliding_window(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+         """Chunk content using sliding window approach with word-based chunks"""
+         if isinstance(content, list):
+             content = '\n'.join(content)
+
+         # Split content into words
+         words = content.split()
+
+         if not words:
+             return []
+
+         chunks = []
+         chunk_index = 0
+
+         # Create overlapping chunks
+         for i in range(0, len(words), self.chunk_size - self.chunk_overlap):
+             chunk_words = words[i:i + self.chunk_size]
+             if chunk_words:
+                 chunk_content = ' '.join(chunk_words)
+                 chunks.append(self._create_chunk(
+                     content=chunk_content,
+                     filename=filename,
+                     section=f"Chunk {chunk_index + 1}",
+                     metadata={
+                         'chunk_method': 'sliding_window',
+                         'chunk_index': chunk_index,
+                         'chunk_size_words': self.chunk_size,
+                         'overlap_size_words': self.chunk_overlap,
+                         'start_word': i,
+                         'end_word': i + len(chunk_words)
+                     }
+                 ))
+                 chunk_index += 1
+
+         return chunks
+
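A short sketch of the sliding-window strategy with the package defaults (the generated text is illustrative):

```python
from signalwire_agents.search.document_processor import DocumentProcessor

# With chunk_size=50 words and chunk_overlap=10 words the window advances
# 40 words per step, so 120 words yield word ranges [0:50], [40:90], [80:120].
processor = DocumentProcessor(chunking_strategy="sliding", chunk_size=50, chunk_overlap=10)
text = " ".join(f"word{i}" for i in range(120))
chunks = processor.create_chunks(text, filename="demo.txt", file_type="txt")

print([(c["metadata"]["start_word"], c["metadata"]["end_word"]) for c in chunks])
# [(0, 50), (40, 90), (80, 120)]
```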
+     def _chunk_by_paragraphs(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+         """Chunk content by paragraphs (split on double newlines)"""
+         if isinstance(content, list):
+             content = '\n'.join(content)
+
+         # Split on double newlines to get paragraphs
+         paragraphs = re.split(r'\n\s*\n', content)
+         paragraphs = [p.strip() for p in paragraphs if p.strip()]
+
+         chunks = []
+         for i, paragraph in enumerate(paragraphs):
+             if paragraph:
+                 chunks.append(self._create_chunk(
+                     content=paragraph,
+                     filename=filename,
+                     section=f"Paragraph {i+1}",
+                     metadata={
+                         'chunk_method': 'paragraph_based',
+                         'chunk_index': i,
+                         'paragraph_number': i + 1
+                     }
+                 ))
+
+         return chunks
+
+     def _chunk_by_pages(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+         """Chunk content by pages (for documents that have page boundaries)"""
+         if isinstance(content, list):
+             # If content is already a list (e.g., from PDF extraction), treat each item as a page
+             pages = [str(page).strip() for page in content if str(page).strip()]
+         else:
+             # For text content, try to detect page boundaries
+             # Look for form feed characters or page break indicators
+             if '\f' in content:
+                 pages = content.split('\f')
+             elif '---PAGE---' in content:
+                 pages = content.split('---PAGE---')
+             elif re.search(r'\n\s*Page\s+\d+\s*\n', content):
+                 # Split on "Page N" patterns
+                 pages = re.split(r'\n\s*Page\s+\d+\s*\n', content)
+             else:
+                 # Fallback: split into roughly equal chunks
+                 words = content.split()
+                 words_per_page = max(500, len(words) // 10)  # Aim for ~10 pages
+                 pages = []
+                 for i in range(0, len(words), words_per_page):
+                     page_words = words[i:i + words_per_page]
+                     if page_words:
+                         pages.append(' '.join(page_words))
+
+             pages = [p.strip() for p in pages if p.strip()]
+
+         chunks = []
+         for i, page_content in enumerate(pages):
+             if page_content:
+                 chunks.append(self._create_chunk(
+                     content=page_content,
+                     filename=filename,
+                     section=f"Page {i+1}",
+                     metadata={
+                         'chunk_method': 'page_based',
+                         'chunk_index': i,
+                         'page_number': i + 1
+                     }
+                 ))
+
+         return chunks
+
+     def _chunk_by_semantic(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+         """Chunk based on semantic similarity between sentences"""
+         if isinstance(content, list):
+             content = '\n'.join(content)
+
+         # Get sentences
+         if sent_tokenize:
+             sentences = sent_tokenize(content)
+         else:
+             sentences = content.split('. ')
+             sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+         if len(sentences) <= 1:
+             return [self._create_chunk(content, filename, "Section 1",
+                                        metadata={'chunk_method': 'semantic', 'chunk_index': 0})]
+
+         # Generate embeddings for sentences (using the same model as the index)
+         try:
+             from sentence_transformers import SentenceTransformer
+             from sklearn.metrics.pairwise import cosine_similarity
+             import numpy as np
+
+             model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
+             embeddings = model.encode(sentences, show_progress_bar=False)
+
+             # Calculate similarity between adjacent sentences
+             similarities = []
+             for i in range(len(embeddings) - 1):
+                 sim = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
+                 similarities.append(sim)
+
+             # Find split points where similarity drops below threshold
+             split_points = [0]
+             for i, sim in enumerate(similarities):
+                 if sim < self.semantic_threshold:
+                     split_points.append(i + 1)
+             split_points.append(len(sentences))
+
+             # Create chunks
+             chunks = []
+             for i in range(len(split_points) - 1):
+                 start_idx = split_points[i]
+                 end_idx = split_points[i + 1]
+                 chunk_sentences = sentences[start_idx:end_idx]
+
+                 # Ensure minimum chunk size
+                 if len(chunk_sentences) < 2 and i > 0:
+                     # Merge with previous chunk
+                     chunks[-1]['content'] += ' ' + ' '.join(chunk_sentences)
+                     continue
+
+                 chunk_content = ' '.join(chunk_sentences)
+                 chunks.append(self._create_chunk(
+                     content=chunk_content,
+                     filename=filename,
+                     section=f"Semantic Section {i+1}",
+                     metadata={
+                         'chunk_method': 'semantic',
+                         'chunk_index': i,
+                         'semantic_threshold': self.semantic_threshold,
+                         'sentence_count': len(chunk_sentences)
+                     }
+                 ))
+
+             return chunks if chunks else [self._create_chunk(content, filename, "Section 1",
+                                                              metadata={'chunk_method': 'semantic', 'chunk_index': 0})]
+
+         except ImportError:
+             # Fallback to sentence-based chunking
+             return self._chunk_by_sentences(content, filename, file_type)
+
+     def _chunk_by_topics(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+         """Chunk based on topic changes using keyword analysis"""
+         if isinstance(content, list):
+             content = '\n'.join(content)
+
+         if sent_tokenize:
+             sentences = sent_tokenize(content)
+         else:
+             sentences = content.split('. ')
+             sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+         if len(sentences) <= 3:
+             return [self._create_chunk(content, filename, "Topic 1",
+                                        metadata={'chunk_method': 'topic', 'chunk_index': 0})]
+
+         try:
+             # Simple topic detection using keyword overlap
+             from collections import Counter
+             import re
+
+             # Extract keywords from each sentence
+             sentence_keywords = []
+             for sentence in sentences:
+                 # Simple keyword extraction (could be enhanced with NLP)
+                 words = re.findall(r'\b[a-zA-Z]{3,}\b', sentence.lower())
+                 # Filter common words (basic stopwords)
+                 stopwords = {'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had', 'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his', 'how', 'its', 'may', 'new', 'now', 'old', 'see', 'two', 'who', 'boy', 'did', 'man', 'way', 'she', 'use', 'her', 'many', 'oil', 'sit', 'set', 'run', 'eat', 'far', 'sea', 'eye', 'ask', 'own', 'say', 'too', 'any', 'try', 'us', 'an', 'as', 'at', 'be', 'he', 'if', 'in', 'is', 'it', 'my', 'of', 'on', 'or', 'to', 'up', 'we', 'go', 'no', 'so', 'am', 'by', 'do', 'me'}
+                 keywords = [w for w in words if w not in stopwords and len(w) > 3]
+                 sentence_keywords.append(set(keywords))
+
+             # Find topic boundaries based on keyword overlap
+             chunks = []
+             current_chunk = [sentences[0]]
+             current_keywords = sentence_keywords[0]
+
+             for i in range(1, len(sentences)):
+                 # Calculate keyword overlap with current chunk
+                 overlap = len(current_keywords.intersection(sentence_keywords[i]))
+                 total_keywords = len(current_keywords.union(sentence_keywords[i]))
+
+                 if total_keywords > 0:
+                     similarity = overlap / total_keywords
+                 else:
+                     similarity = 0
+
+                 # If similarity is low, start new chunk
+                 if similarity < self.topic_threshold and len(current_chunk) >= 2:
+                     chunk_content = ' '.join(current_chunk)
+                     chunks.append(self._create_chunk(
+                         content=chunk_content,
+                         filename=filename,
+                         section=f"Topic {len(chunks)+1}",
+                         metadata={
+                             'chunk_method': 'topic',
+                             'chunk_index': len(chunks),
+                             'topic_keywords': list(current_keywords)[:10],  # Top keywords
+                             'sentence_count': len(current_chunk),
+                             'topic_threshold': self.topic_threshold
+                         }
+                     ))
+                     current_chunk = [sentences[i]]
+                     current_keywords = sentence_keywords[i]
+                 else:
+                     current_chunk.append(sentences[i])
+                     current_keywords = current_keywords.union(sentence_keywords[i])
+
+             # Add final chunk
+             if current_chunk:
+                 chunk_content = ' '.join(current_chunk)
+                 chunks.append(self._create_chunk(
+                     content=chunk_content,
+                     filename=filename,
+                     section=f"Topic {len(chunks)+1}",
+                     metadata={
+                         'chunk_method': 'topic',
+                         'chunk_index': len(chunks),
+                         'topic_keywords': list(current_keywords)[:10],
+                         'sentence_count': len(current_chunk),
+                         'topic_threshold': self.topic_threshold
+                     }
+                 ))
+
+             return chunks if chunks else [self._create_chunk(content, filename, "Topic 1",
+                                                              metadata={'chunk_method': 'topic', 'chunk_index': 0})]
+
+         except Exception:
+             # Fallback to sentence-based chunking
+             return self._chunk_by_sentences(content, filename, file_type)
+
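The topic boundary test above is keyword Jaccard similarity (intersection over union of the keyword sets); a quick worked illustration with made-up keyword sets:

```python
current_keywords = {"billing", "invoice", "payment"}
next_keywords = {"invoice", "refund", "payment", "dispute"}

overlap = len(current_keywords & next_keywords)  # 2
total = len(current_keywords | next_keywords)    # 5
similarity = overlap / total                     # 0.4

# With the default topic_threshold of 0.3 the sentence stays in the current
# topic chunk; a similarity below 0.3 would start a new chunk once the
# current one holds at least 2 sentences.
print(similarity)
```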
+     def _chunk_by_qa_optimization(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+         """Create chunks optimized for question-answering"""
+         if isinstance(content, list):
+             content = '\n'.join(content)
+
+         if sent_tokenize:
+             sentences = sent_tokenize(content)
+         else:
+             sentences = content.split('. ')
+             sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+         # Patterns that indicate Q&A structure
+         question_patterns = [
+             r'\?',  # Questions
+             r'^(what|how|why|when|where|who|which|can|does|is|are|will|would|should)',
+             r'(step|steps|process|procedure|method|way to)',
+             r'(example|examples|instance|case)',
+             r'(definition|meaning|refers to|means)',
+         ]
+
+         chunks = []
+         current_chunk = []
+         current_context = []
+
+         for i, sentence in enumerate(sentences):
+             sentence_lower = sentence.lower().strip()
+
+             # Check if this sentence contains Q&A indicators
+             is_qa_relevant = any(re.search(pattern, sentence_lower) for pattern in question_patterns)
+
+             if is_qa_relevant or len(current_chunk) == 0:
+                 current_chunk.append(sentence)
+                 # Add surrounding context (previous and next sentences)
+                 if i > 0 and sentences[i-1] not in current_chunk:
+                     current_context.append(sentences[i-1])
+                 if i < len(sentences) - 1:
+                     current_context.append(sentences[i+1])
+             else:
+                 current_chunk.append(sentence)
+
+             # Create chunk when we have enough content or reach a natural break
+             if (len(current_chunk) >= 3 and
+                 (i == len(sentences) - 1 or  # Last sentence
+                  sentence.endswith('.') and len(current_chunk) >= 5)):  # Natural break
+
+                 # Combine chunk with context
+                 full_content = current_context + current_chunk
+                 chunk_content = ' '.join(full_content)
+
+                 chunks.append(self._create_chunk(
+                     content=chunk_content,
+                     filename=filename,
+                     section=f"QA Section {len(chunks)+1}",
+                     metadata={
+                         'chunk_method': 'qa_optimized',
+                         'chunk_index': len(chunks),
+                         'has_question': any('?' in s for s in current_chunk),
+                         'has_process': any(re.search(r'(step|process|method)', s.lower()) for s in current_chunk),
+                         'sentence_count': len(full_content)
+                     }
+                 ))
+
+                 current_chunk = []
+                 current_context = []
+
+         # Handle remaining content
+         if current_chunk:
+             chunk_content = ' '.join(current_context + current_chunk)
+             chunks.append(self._create_chunk(
+                 content=chunk_content,
+                 filename=filename,
+                 section=f"QA Section {len(chunks)+1}",
+                 metadata={
+                     'chunk_method': 'qa_optimized',
+                     'chunk_index': len(chunks),
+                     'sentence_count': len(current_context + current_chunk)
+                 }
+             ))
+
+         return chunks if chunks else [self._create_chunk(content, filename, "QA Section 1",
+                                                          metadata={'chunk_method': 'qa_optimized', 'chunk_index': 0})]
+
+     def _chunk_from_json(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+         """
+         Create chunks from pre-processed JSON content
+
+         This strategy expects content to be a JSON string with the following structure:
+         {
+             "chunks": [
+                 {
+                     "chunk_id": "unique_id",
+                     "type": "content|toc",
+                     "content": "text content",
+                     "metadata": {
+                         "url": "https://...",
+                         "section_number": 1,
+                         "related_toc": "toc_id",
+                         ...
+                     }
+                 },
+                 ...
+             ]
+         }
+
+         Args:
+             content: JSON string containing pre-chunked content
+             filename: Name of the source file
+             file_type: Should be 'json'
+
+         Returns:
+             List of chunk dictionaries formatted for the search index
+         """
+         try:
+             # Parse JSON content
+             data = json.loads(content)
+
+             if not isinstance(data, dict) or 'chunks' not in data:
+                 logger.error(f"Invalid JSON structure in {filename}: expected 'chunks' key")
+                 # Fallback to treating it as plain text
+                 return self._chunk_by_sentences(content, filename, file_type)
+
+             chunks = []
+             for idx, json_chunk in enumerate(data['chunks']):
+                 if not isinstance(json_chunk, dict) or 'content' not in json_chunk:
+                     logger.warning(f"Skipping invalid chunk {idx} in {filename}")
+                     continue
+
+                 # Extract metadata from JSON chunk
+                 json_metadata = json_chunk.get('metadata', {})
+                 chunk_type = json_chunk.get('type', 'content')
+
+                 # Build chunk metadata (excluding tags which go at top level)
+                 metadata = {
+                     'chunk_method': 'json',
+                     'chunk_index': idx,
+                     'chunk_type': chunk_type,
+                     'original_chunk_id': json_chunk.get('chunk_id', f'chunk_{idx}')
+                 }
+
+                 # Extract tags before merging metadata
+                 tags = json_metadata.get('tags', [])
+
+                 # Merge JSON metadata (this includes all fields including tags)
+                 # We'll keep tags in metadata for backward compatibility but also set at top level
+                 metadata.update(json_metadata)
+
+                 # Determine section name
+                 if chunk_type == 'toc':
+                     section = f"TOC: {json_chunk.get('content', '')[:50]}"
+                 else:
+                     section = json_metadata.get('section', f"Section {json_metadata.get('section_number', idx + 1)}")
+
+                 # Create chunk with proper structure
+                 chunk = self._create_chunk(
+                     content=json_chunk['content'],
+                     filename=filename,
+                     section=section,
+                     metadata=metadata
+                 )
+
+                 # Set tags at the top level for proper tag filtering
+                 if tags:
+                     chunk['tags'] = tags
+                 elif chunk_type == 'toc':
+                     # For TOC entries, add special tags if none provided
+                     chunk['tags'] = ['toc', 'navigation']
+
+                 chunks.append(chunk)
+
+             if not chunks:
+                 logger.warning(f"No valid chunks found in JSON file {filename}")
+                 return self._chunk_by_sentences(str(data), filename, file_type)
+
+             logger.info(f"Created {len(chunks)} chunks from JSON file {filename}")
+             return chunks
+
+         except json.JSONDecodeError as e:
+             logger.error(f"Failed to parse JSON in {filename}: {e}")
+             # Fallback to sentence chunking
+             return self._chunk_by_sentences(content, filename, file_type)
+         except Exception as e:
+             logger.error(f"Unexpected error processing JSON chunks in {filename}: {e}")
+             return self._chunk_by_sentences(content, filename, file_type)
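A minimal sketch of feeding pre-chunked JSON through this strategy; the payload shape follows the docstring above, and the chunk ID, URL, and text are placeholders:

```python
import json
from signalwire_agents.search.document_processor import DocumentProcessor

payload = {
    "chunks": [
        {
            "chunk_id": "intro_1",
            "type": "content",
            "content": "SignalWire agents expose SWAIG functions over HTTP.",
            "metadata": {"url": "https://example.com/docs", "section_number": 1, "tags": ["docs"]},
        }
    ]
}

processor = DocumentProcessor(chunking_strategy="json")
chunks = processor.create_chunks(json.dumps(payload), filename="docs.json", file_type="json")

print(chunks[0]["section"])  # "Section 1" (derived from section_number)
print(chunks[0]["tags"])     # ["docs"] - promoted to the top level for tag filtering
```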