signalwire-agents 0.1.6__py3-none-any.whl → 1.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +130 -4
- signalwire_agents/agent_server.py +438 -32
- signalwire_agents/agents/bedrock.py +296 -0
- signalwire_agents/cli/__init__.py +18 -0
- signalwire_agents/cli/build_search.py +1367 -0
- signalwire_agents/cli/config.py +80 -0
- signalwire_agents/cli/core/__init__.py +10 -0
- signalwire_agents/cli/core/agent_loader.py +470 -0
- signalwire_agents/cli/core/argparse_helpers.py +179 -0
- signalwire_agents/cli/core/dynamic_config.py +71 -0
- signalwire_agents/cli/core/service_loader.py +303 -0
- signalwire_agents/cli/execution/__init__.py +10 -0
- signalwire_agents/cli/execution/datamap_exec.py +446 -0
- signalwire_agents/cli/execution/webhook_exec.py +134 -0
- signalwire_agents/cli/init_project.py +1225 -0
- signalwire_agents/cli/output/__init__.py +10 -0
- signalwire_agents/cli/output/output_formatter.py +255 -0
- signalwire_agents/cli/output/swml_dump.py +186 -0
- signalwire_agents/cli/simulation/__init__.py +10 -0
- signalwire_agents/cli/simulation/data_generation.py +374 -0
- signalwire_agents/cli/simulation/data_overrides.py +200 -0
- signalwire_agents/cli/simulation/mock_env.py +282 -0
- signalwire_agents/cli/swaig_test_wrapper.py +52 -0
- signalwire_agents/cli/test_swaig.py +809 -0
- signalwire_agents/cli/types.py +81 -0
- signalwire_agents/core/__init__.py +2 -2
- signalwire_agents/core/agent/__init__.py +12 -0
- signalwire_agents/core/agent/config/__init__.py +12 -0
- signalwire_agents/core/agent/deployment/__init__.py +9 -0
- signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
- signalwire_agents/core/agent/prompt/__init__.py +14 -0
- signalwire_agents/core/agent/prompt/manager.py +306 -0
- signalwire_agents/core/agent/routing/__init__.py +9 -0
- signalwire_agents/core/agent/security/__init__.py +9 -0
- signalwire_agents/core/agent/swml/__init__.py +9 -0
- signalwire_agents/core/agent/tools/__init__.py +15 -0
- signalwire_agents/core/agent/tools/decorator.py +97 -0
- signalwire_agents/core/agent/tools/registry.py +210 -0
- signalwire_agents/core/agent_base.py +959 -2166
- signalwire_agents/core/auth_handler.py +233 -0
- signalwire_agents/core/config_loader.py +259 -0
- signalwire_agents/core/contexts.py +707 -0
- signalwire_agents/core/data_map.py +487 -0
- signalwire_agents/core/function_result.py +1150 -1
- signalwire_agents/core/logging_config.py +376 -0
- signalwire_agents/core/mixins/__init__.py +28 -0
- signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
- signalwire_agents/core/mixins/auth_mixin.py +287 -0
- signalwire_agents/core/mixins/prompt_mixin.py +358 -0
- signalwire_agents/core/mixins/serverless_mixin.py +368 -0
- signalwire_agents/core/mixins/skill_mixin.py +55 -0
- signalwire_agents/core/mixins/state_mixin.py +153 -0
- signalwire_agents/core/mixins/tool_mixin.py +230 -0
- signalwire_agents/core/mixins/web_mixin.py +1134 -0
- signalwire_agents/core/security/session_manager.py +174 -86
- signalwire_agents/core/security_config.py +333 -0
- signalwire_agents/core/skill_base.py +200 -0
- signalwire_agents/core/skill_manager.py +244 -0
- signalwire_agents/core/swaig_function.py +33 -9
- signalwire_agents/core/swml_builder.py +212 -12
- signalwire_agents/core/swml_handler.py +43 -13
- signalwire_agents/core/swml_renderer.py +123 -297
- signalwire_agents/core/swml_service.py +277 -260
- signalwire_agents/prefabs/concierge.py +6 -2
- signalwire_agents/prefabs/info_gatherer.py +149 -33
- signalwire_agents/prefabs/receptionist.py +14 -22
- signalwire_agents/prefabs/survey.py +6 -2
- signalwire_agents/schema.json +9218 -5489
- signalwire_agents/search/__init__.py +137 -0
- signalwire_agents/search/document_processor.py +1223 -0
- signalwire_agents/search/index_builder.py +804 -0
- signalwire_agents/search/migration.py +418 -0
- signalwire_agents/search/models.py +30 -0
- signalwire_agents/search/pgvector_backend.py +752 -0
- signalwire_agents/search/query_processor.py +502 -0
- signalwire_agents/search/search_engine.py +1264 -0
- signalwire_agents/search/search_service.py +574 -0
- signalwire_agents/skills/README.md +452 -0
- signalwire_agents/skills/__init__.py +23 -0
- signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
- signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
- signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
- signalwire_agents/skills/datasphere/README.md +210 -0
- signalwire_agents/skills/datasphere/__init__.py +12 -0
- signalwire_agents/skills/datasphere/skill.py +310 -0
- signalwire_agents/skills/datasphere_serverless/README.md +258 -0
- signalwire_agents/skills/datasphere_serverless/__init__.py +10 -0
- signalwire_agents/skills/datasphere_serverless/skill.py +237 -0
- signalwire_agents/skills/datetime/README.md +132 -0
- signalwire_agents/skills/datetime/__init__.py +10 -0
- signalwire_agents/skills/datetime/skill.py +126 -0
- signalwire_agents/skills/joke/README.md +149 -0
- signalwire_agents/skills/joke/__init__.py +10 -0
- signalwire_agents/skills/joke/skill.py +109 -0
- signalwire_agents/skills/math/README.md +161 -0
- signalwire_agents/skills/math/__init__.py +10 -0
- signalwire_agents/skills/math/skill.py +105 -0
- signalwire_agents/skills/mcp_gateway/README.md +230 -0
- signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
- signalwire_agents/skills/mcp_gateway/skill.py +421 -0
- signalwire_agents/skills/native_vector_search/README.md +210 -0
- signalwire_agents/skills/native_vector_search/__init__.py +10 -0
- signalwire_agents/skills/native_vector_search/skill.py +820 -0
- signalwire_agents/skills/play_background_file/README.md +218 -0
- signalwire_agents/skills/play_background_file/__init__.py +12 -0
- signalwire_agents/skills/play_background_file/skill.py +242 -0
- signalwire_agents/skills/registry.py +459 -0
- signalwire_agents/skills/spider/README.md +236 -0
- signalwire_agents/skills/spider/__init__.py +13 -0
- signalwire_agents/skills/spider/skill.py +598 -0
- signalwire_agents/skills/swml_transfer/README.md +395 -0
- signalwire_agents/skills/swml_transfer/__init__.py +10 -0
- signalwire_agents/skills/swml_transfer/skill.py +359 -0
- signalwire_agents/skills/weather_api/README.md +178 -0
- signalwire_agents/skills/weather_api/__init__.py +12 -0
- signalwire_agents/skills/weather_api/skill.py +191 -0
- signalwire_agents/skills/web_search/README.md +163 -0
- signalwire_agents/skills/web_search/__init__.py +10 -0
- signalwire_agents/skills/web_search/skill.py +739 -0
- signalwire_agents/skills/wikipedia_search/README.md +228 -0
- signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
- signalwire_agents/skills/wikipedia_search/skill.py +210 -0
- signalwire_agents/utils/__init__.py +14 -0
- signalwire_agents/utils/schema_utils.py +111 -44
- signalwire_agents/web/__init__.py +17 -0
- signalwire_agents/web/web_service.py +559 -0
- signalwire_agents-1.0.7.data/data/share/man/man1/sw-agent-init.1 +307 -0
- signalwire_agents-1.0.7.data/data/share/man/man1/sw-search.1 +483 -0
- signalwire_agents-1.0.7.data/data/share/man/man1/swaig-test.1 +308 -0
- signalwire_agents-1.0.7.dist-info/METADATA +992 -0
- signalwire_agents-1.0.7.dist-info/RECORD +142 -0
- {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/WHEEL +1 -1
- signalwire_agents-1.0.7.dist-info/entry_points.txt +4 -0
- signalwire_agents/core/state/file_state_manager.py +0 -219
- signalwire_agents/core/state/state_manager.py +0 -101
- signalwire_agents-0.1.6.data/data/schema.json +0 -5611
- signalwire_agents-0.1.6.dist-info/METADATA +0 -199
- signalwire_agents-0.1.6.dist-info/RECORD +0 -34
- {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/top_level.txt +0 -0
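The bulk of the new code lands in the search subpackage; the hunk below expands signalwire_agents/search/document_processor.py in full. For orientation, here is a minimal usage sketch based only on the constructor and create_chunks signature visible in that hunk — the call site itself is illustrative and not part of the diff:

    # Illustrative only: class name, module path, and signatures taken from the expanded hunk below
    from signalwire_agents.search.document_processor import DocumentProcessor

    processor = DocumentProcessor(chunking_strategy='sentence', max_sentences_per_chunk=5)
    with open('README.md', 'r', encoding='utf-8') as f:
        content = f.read()
    chunks = processor.create_chunks(content=content, filename='README.md', file_type='md')
    # Each chunk is a dict with 'content', 'filename', 'section', 'start_line', 'end_line', 'metadata'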
signalwire_agents/search/document_processor.py
@@ -0,0 +1,1223 @@
"""
Copyright (c) 2025 SignalWire

This file is part of the SignalWire AI Agents SDK.

Licensed under the MIT License.
See LICENSE file in the project root for full license information.
"""

import re
import hashlib
import json
import logging
from typing import List, Dict, Any, Optional
from pathlib import Path

# Document processing imports
try:
    import pdfplumber
except ImportError:
    pdfplumber = None

try:
    from docx import Document as DocxDocument
except ImportError:
    DocxDocument = None

try:
    from bs4 import BeautifulSoup
except ImportError:
    BeautifulSoup = None

try:
    import markdown
except ImportError:
    markdown = None

try:
    from striprtf.striprtf import rtf_to_text
except ImportError:
    rtf_to_text = None

try:
    from openpyxl import load_workbook
except ImportError:
    load_workbook = None

try:
    from pptx import Presentation
except ImportError:
    Presentation = None

try:
    from nltk.tokenize import sent_tokenize
    import nltk
    # Ensure NLTK data is available
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt', quiet=True)
except ImportError:
    sent_tokenize = None
    nltk = None

try:
    import magic
except ImportError:
    magic = None

from .query_processor import preprocess_document_content

logger = logging.getLogger(__name__)

class DocumentProcessor:
    """Enhanced document processor with smart chunking capabilities"""

    def __init__(
        self,
        chunking_strategy: str = 'sentence',
        max_sentences_per_chunk: int = 5,
        chunk_size: int = 50,
        chunk_overlap: int = 10,
        split_newlines: Optional[int] = None,
        index_nlp_backend: str = 'nltk',
        verbose: bool = False,
        semantic_threshold: float = 0.5,
        topic_threshold: float = 0.3
    ):
        """
        Initialize document processor

        Args:
            chunking_strategy: Strategy for chunking documents:
                - 'sentence': Sentence-based chunking with overlap
                - 'sliding': Sliding window with word-based chunks
                - 'paragraph': Natural paragraph boundaries
                - 'page': Page-based chunking (for PDFs)
                - 'semantic': Semantic similarity-based chunking
                - 'topic': Topic modeling-based chunking
                - 'qa': Question-answer optimized chunking
                - 'json': JSON structure-aware chunking
                - 'markdown': Markdown structure-aware chunking with code block detection
            max_sentences_per_chunk: For sentence strategy (default: 5)
            chunk_size: For sliding strategy - words per chunk (default: 50)
            chunk_overlap: For sliding strategy - overlap in words (default: 10)
            split_newlines: For sentence strategy - split on multiple newlines (optional)
            index_nlp_backend: NLP backend for indexing (default: 'nltk')
            verbose: Whether to enable verbose logging (default: False)
            semantic_threshold: Similarity threshold for semantic chunking (default: 0.5)
            topic_threshold: Similarity threshold for topic chunking (default: 0.3)
        """
        self.chunking_strategy = chunking_strategy
        self.max_sentences_per_chunk = max_sentences_per_chunk
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.split_newlines = split_newlines
        self.semantic_threshold = semantic_threshold
        self.topic_threshold = topic_threshold

        # Legacy support for old character-based chunking
        self.chunk_overlap = chunk_overlap

    def create_chunks(self, content: str, filename: str,
                      file_type: str) -> List[Dict[str, Any]]:
        """
        Create chunks from document content using specified chunking strategy

        Args:
            content: Document content (string) - should be the actual content, not a file path
            filename: Name of the file (for metadata)
            file_type: File extension/type

        Returns:
            List of chunk dictionaries
        """

        # Apply chunking strategy
        if self.chunking_strategy == 'sentence':
            return self._chunk_by_sentences(content, filename, file_type)
        elif self.chunking_strategy == 'sliding':
            return self._chunk_by_sliding_window(content, filename, file_type)
        elif self.chunking_strategy == 'paragraph':
            return self._chunk_by_paragraphs(content, filename, file_type)
        elif self.chunking_strategy == 'page':
            return self._chunk_by_pages(content, filename, file_type)
        elif self.chunking_strategy == 'semantic':
            return self._chunk_by_semantic(content, filename, file_type)
        elif self.chunking_strategy == 'topic':
            return self._chunk_by_topics(content, filename, file_type)
        elif self.chunking_strategy == 'qa':
            return self._chunk_by_qa_optimization(content, filename, file_type)
        elif self.chunking_strategy == 'json':
            return self._chunk_from_json(content, filename, file_type)
        elif self.chunking_strategy == 'markdown':
            # Use markdown-aware chunking for better structure preservation
            return self._chunk_markdown_enhanced(content, filename)
        else:
            # Fallback to sentence-based chunking
            return self._chunk_by_sentences(content, filename, file_type)

    def _extract_text_from_file(self, file_path: str) -> Any:
        """Extract text from various file formats"""
        if not magic:
            # Fallback to extension-based detection
            file_path_obj = Path(file_path)
            extension = file_path_obj.suffix.lower()

            if extension == '.pdf':
                file_type = 'application/pdf'
            elif extension == '.docx':
                file_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
            elif extension in ['.txt', '.md']:
                file_type = 'text/plain'
            elif extension == '.html':
                file_type = 'text/html'
            elif extension == '.rtf':
                file_type = 'application/rtf'
            elif extension == '.xlsx':
                file_type = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
            elif extension == '.pptx':
                file_type = 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
            else:
                file_type = 'text/plain'
        else:
            mime = magic.Magic(mime=True)
            file_type = mime.from_file(file_path)

        if 'pdf' in file_type:
            return self._extract_pdf(file_path)
        elif 'vnd.openxmlformats-officedocument.wordprocessingml.document' in file_type:
            return self._extract_docx(file_path)
        elif 'plain' in file_type or 'text' in file_type:
            return self._extract_text(file_path)
        elif 'html' in file_type:
            return self._extract_html(file_path)
        elif 'markdown' in file_type or file_path.endswith('.md'):
            return self._extract_markdown(file_path)
        elif 'rtf' in file_type:
            return self._extract_rtf(file_path)
        elif 'vnd.openxmlformats-officedocument.spreadsheetml.sheet' in file_type:
            return self._extract_excel(file_path)
        elif 'vnd.openxmlformats-officedocument.presentationml.presentation' in file_type:
            return self._extract_powerpoint(file_path)
        else:
            return json.dumps({"error": f"Unsupported file type: {file_type}"})

    def _extract_pdf(self, file_path: str):
        """Extract text from PDF files"""
        if not pdfplumber:
            return json.dumps({"error": "pdfplumber not available for PDF processing"})

        try:
            with pdfplumber.open(file_path) as pdf:
                pages = []
                for page in pdf.pages:
                    text = page.extract_text()
                    if text:
                        # Remove page number from the beginning
                        text = re.sub(r'^\d+\.\s*', '', text.strip())
                        pages.append(text)
                return pages
        except Exception as e:
            return json.dumps({"error": f"Error processing PDF: {e}"})

    def _extract_docx(self, file_path: str):
        """Extract text from DOCX files"""
        if not DocxDocument:
            return json.dumps({"error": "python-docx not available for DOCX processing"})

        try:
            doc = DocxDocument(file_path)
            return [para.text for para in doc.paragraphs if para.text.strip()]
        except Exception as e:
            return json.dumps({"error": f"Error processing DOCX: {e}"})

    def _extract_text(self, file_path: str):
        """Extract text from plain text files"""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            return json.dumps({"error": f"Error processing TXT: {e}"})

    def _extract_html(self, file_path: str):
        """Extract text from HTML files"""
        if not BeautifulSoup:
            return json.dumps({"error": "beautifulsoup4 not available for HTML processing"})

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                soup = BeautifulSoup(file, 'html.parser')
                return soup.get_text(separator='\n')
        except Exception as e:
            return json.dumps({"error": f"Error processing HTML: {e}"})

    def _extract_markdown(self, file_path: str):
        """Extract text from Markdown files"""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                if markdown and BeautifulSoup:
                    html = markdown.markdown(content)
                    soup = BeautifulSoup(html, 'html.parser')
                    return soup.get_text(separator='\n')
                else:
                    # Fallback to raw markdown
                    return content
        except Exception as e:
            return json.dumps({"error": f"Error processing Markdown: {e}"})

    def _extract_rtf(self, file_path: str):
        """Extract text from RTF files"""
        if not rtf_to_text:
            return json.dumps({"error": "striprtf not available for RTF processing"})

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return rtf_to_text(file.read())
        except Exception as e:
            return json.dumps({"error": f"Error processing RTF: {e}"})

    def _extract_excel(self, file_path: str):
        """Extract text from Excel files"""
        if not load_workbook:
            return json.dumps({"error": "openpyxl not available for Excel processing"})

        try:
            wb = load_workbook(file_path)
            sheets_text = []
            for sheet in wb.worksheets:
                for row in sheet.iter_rows(values_only=True):
                    row_text = ' '.join([str(cell) for cell in row if cell is not None])
                    sheets_text.append(row_text)
            return "\n".join(sheets_text)
        except Exception as e:
            return json.dumps({"error": f"Error processing Excel: {e}"})

    def _extract_powerpoint(self, file_path: str):
        """Extract text from PowerPoint files"""
        if not Presentation:
            return json.dumps({"error": "python-pptx not available for PowerPoint processing"})

        try:
            prs = Presentation(file_path)
            slides_text = []
            for slide in prs.slides:
                slide_text = []
                for shape in slide.shapes:
                    if hasattr(shape, "text"):
                        slide_text.append(shape.text)
                slides_text.append("\n".join(slide_text))
            return slides_text
        except Exception as e:
            return json.dumps({"error": f"Error processing PowerPoint: {e}"})

    def _chunk_document_aware(self, content: Any, filename: str, file_type: str) -> List[Dict[str, Any]]:
        """Smart chunking for documents with natural structure"""
        chunks = []

        if isinstance(content, list):
            # Handle page-based or paragraph-based content (PDF, DOCX, PPTX)
            for i, page_content in enumerate(content):
                if not page_content or not page_content.strip():
                    continue

                # For each page/slide, use sentence-based chunking if it's large
                if len(page_content) > self.chunk_size:
                    page_chunks = self._sentence_based_chunking(
                        page_content,
                        max_sentences_per_chunk=self._calculate_sentences_per_chunk(page_content)
                    )
                    for j, chunk_content in enumerate(page_chunks):
                        chunks.append(self._create_chunk(
                            content=chunk_content,
                            filename=filename,
                            section=f"Page {i+1}" if file_type == 'pdf' else f"Slide {i+1}" if file_type == 'pptx' else f"Section {i+1}",
                            metadata={'page_number': i+1, 'chunk_index': j}
                        ))
                else:
                    # Small page/slide - keep as single chunk
                    chunks.append(self._create_chunk(
                        content=page_content,
                        filename=filename,
                        section=f"Page {i+1}" if file_type == 'pdf' else f"Slide {i+1}" if file_type == 'pptx' else f"Section {i+1}",
                        metadata={'page_number': i+1}
                    ))
        else:
            # Single text content - use paragraph-aware chunking
            chunks = self._chunk_text_enhanced(content, filename)

        return chunks

    def _chunk_markdown_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:
        """Enhanced markdown chunking with code block detection and rich metadata

        Features:
        - Tracks header hierarchy for section paths
        - Detects code blocks and extracts language
        - Adds 'code' tags to chunks containing code
        - Preserves markdown structure for better search
        """
        chunks = []
        lines = content.split('\n')

        current_section = None
        current_hierarchy = []  # Track header hierarchy
        current_chunk = []
        current_size = 0
        line_start = 1
        in_code_block = False
        code_languages = []  # Track languages in current chunk
        has_code = False

        for line_num, line in enumerate(lines, 1):
            # Check for code block fences
            code_fence_match = re.match(r'^```(\w+)?', line)
            if code_fence_match:
                in_code_block = not in_code_block
                if in_code_block:
                    # Starting code block
                    has_code = True
                    lang = code_fence_match.group(1)
                    if lang and lang not in code_languages:
                        code_languages.append(lang)

            # Check for headers with hierarchy tracking
            header_match = re.match(r'^(#{1,6})\s+(.+)', line) if not in_code_block else None
            if header_match:
                header_level = len(header_match.group(1))
                header_text = header_match.group(2).strip()

                # Save current chunk if it exists
                if current_chunk:
                    chunk_metadata = self._build_markdown_metadata(
                        current_hierarchy, code_languages, has_code
                    )
                    chunks.append(self._create_chunk(
                        content='\n'.join(current_chunk),
                        filename=filename,
                        section=self._build_section_path(current_hierarchy),
                        start_line=line_start,
                        end_line=line_num - 1,
                        metadata=chunk_metadata
                    ))

                # Update hierarchy
                current_hierarchy = current_hierarchy[:header_level-1] + [header_text]
                current_section = header_text
                current_chunk = [line]
                current_size = len(line)
                line_start = line_num
                code_languages = []
                has_code = False

            else:
                current_chunk.append(line)
                current_size += len(line) + 1

                # Check if chunk is getting too large - use smart splitting
                # But don't split inside code blocks
                if current_size >= self.chunk_size and not in_code_block:
                    # Try to split at paragraph boundary first
                    split_point = self._find_best_split_point(current_chunk)

                    chunk_to_save = current_chunk[:split_point]
                    chunk_metadata = self._build_markdown_metadata(
                        current_hierarchy, code_languages, has_code
                    )
                    chunks.append(self._create_chunk(
                        content='\n'.join(chunk_to_save),
                        filename=filename,
                        section=self._build_section_path(current_hierarchy),
                        start_line=line_start,
                        end_line=line_start + split_point - 1,
                        metadata=chunk_metadata
                    ))

                    # Start new chunk with overlap
                    overlap_lines = self._get_overlap_lines(chunk_to_save)
                    remaining_lines = current_chunk[split_point:]
                    current_chunk = overlap_lines + remaining_lines
                    current_size = sum(len(line) + 1 for line in current_chunk)
                    line_start = line_start + split_point - len(overlap_lines)
                    # Reset code tracking for new chunk
                    code_languages = []
                    has_code = False

        # Add final chunk
        if current_chunk:
            chunk_metadata = self._build_markdown_metadata(
                current_hierarchy, code_languages, has_code
            )
            chunks.append(self._create_chunk(
                content='\n'.join(current_chunk),
                filename=filename,
                section=self._build_section_path(current_hierarchy),
                start_line=line_start,
                end_line=len(lines),
                metadata=chunk_metadata
            ))

        return chunks

    def _chunk_python_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:
        """Enhanced Python code chunking with better function/class detection"""
        chunks = []
        lines = content.split('\n')

        current_function = None
        current_class = None
        current_chunk = []
        current_size = 0
        line_start = 1
        indent_level = 0

        for line_num, line in enumerate(lines, 1):
            # Detect class definitions
            class_match = re.match(r'^(\s*)(class\s+([^(:\s]+))', line)
            if class_match:
                indent = len(class_match.group(1))
                class_name = class_match.group(3)

                # Save current chunk if switching context
                if current_chunk and (indent <= indent_level or current_class != class_name):
                    chunks.append(self._create_chunk(
                        content='\n'.join(current_chunk),
                        filename=filename,
                        section=self._build_python_section(current_class, current_function),
                        start_line=line_start,
                        end_line=line_num - 1
                    ))
                    current_chunk = []
                    line_start = line_num

                current_class = class_name
                current_function = None
                indent_level = indent

            # Detect function definitions
            func_match = re.match(r'^(\s*)(def\s+([^(:\s]+)|async\s+def\s+([^(:\s]+))', line)
            if func_match:
                indent = len(func_match.group(1))
                func_name = func_match.group(3) or func_match.group(4)

                # Save current chunk if switching to new function at same or lower level
                if current_chunk and indent <= indent_level:
                    chunks.append(self._create_chunk(
                        content='\n'.join(current_chunk),
                        filename=filename,
                        section=self._build_python_section(current_class, current_function),
                        start_line=line_start,
                        end_line=line_num - 1
                    ))
                    current_chunk = []
                    line_start = line_num

                if indent >= indent_level:  # Method within class or nested function
                    current_function = func_name
                else:  # Top-level function
                    current_function = func_name
                    current_class = None

                indent_level = indent

            current_chunk.append(line)
            current_size += len(line) + 1

            # Handle oversized chunks
            if current_size >= self.chunk_size:
                chunks.append(self._create_chunk(
                    content='\n'.join(current_chunk),
                    filename=filename,
                    section=self._build_python_section(current_class, current_function),
                    start_line=line_start,
                    end_line=line_num
                ))

                # Start new chunk with minimal overlap for code
                overlap_lines = current_chunk[-2:] if len(current_chunk) > 2 else current_chunk
                current_chunk = overlap_lines
                current_size = sum(len(line) + 1 for line in overlap_lines)
                line_start = line_num - len(overlap_lines) + 1

        # Add final chunk
        if current_chunk:
            chunks.append(self._create_chunk(
                content='\n'.join(current_chunk),
                filename=filename,
                section=self._build_python_section(current_class, current_function),
                start_line=line_start,
                end_line=len(lines)
            ))

        return chunks

    def _chunk_text_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:
        """Enhanced text chunking using sentence-based approach"""
        if isinstance(content, list):
            content = '\n'.join(content)

        # Use sentence-based chunking for better coherence
        max_sentences = self._calculate_sentences_per_chunk(content)
        sentences = self._sentence_based_chunking(content, max_sentences)

        chunks = []
        for i, chunk_content in enumerate(sentences):
            chunks.append(self._create_chunk(
                content=chunk_content,
                filename=filename,
                section=f"Section {i+1}",
                metadata={'chunk_method': 'sentence_based', 'chunk_index': i}
            ))

        return chunks

    def _sentence_based_chunking(self, text: str, max_sentences_per_chunk: int, split_newlines: int = 2) -> List[str]:
        """Sentence-based chunking with enhancements"""
        if not sent_tokenize:
            # Fallback to simple splitting
            sentences = text.split('. ')
            sentences = [s.strip() + '.' for s in sentences if s.strip()]
        else:
            sentences = []

            if split_newlines > 0:
                # Create regex pattern for specified number of newlines
                newline_pattern = r'(\n{%d,})' % split_newlines
                parts = re.split(newline_pattern, text)

                for part in parts:
                    part = part.strip()
                    if part and not re.match(newline_pattern, part):
                        sentences.extend(sent_tokenize(part))
                    elif re.match(newline_pattern, part):
                        sentences.append(part)
            else:
                sentences = [sentence.strip() for sentence in sent_tokenize(text) if sentence.strip()]

        # Create chunks of sentences with overlap
        chunks = []
        overlap_sentences = max(1, max_sentences_per_chunk // 4)  # 25% overlap

        for i in range(0, len(sentences), max_sentences_per_chunk - overlap_sentences):
            chunk_sentences = sentences[i:i + max_sentences_per_chunk]
            if chunk_sentences:
                chunks.append(' '.join(chunk_sentences))

        return chunks

    def _calculate_sentences_per_chunk(self, text: str) -> int:
        """Calculate optimal sentences per chunk based on average sentence length"""
        if not sent_tokenize:
            # Fallback calculation
            sentences = text.split('. ')
        else:
            sentences = sent_tokenize(text)

        if not sentences:
            return 1

        avg_sentence_length = sum(len(s) for s in sentences) / len(sentences)
        # Target chunk size divided by average sentence length
        optimal_sentences = max(1, int(self.chunk_size / avg_sentence_length))
        return min(optimal_sentences, 10)  # Cap at 10 sentences for readability

    def _build_section_path(self, hierarchy: List[str]) -> str:
        """Build hierarchical section path from header hierarchy"""
        return ' > '.join(hierarchy) if hierarchy else None

    def _build_markdown_metadata(self, hierarchy: List[str], code_languages: List[str], has_code: bool) -> Dict[str, Any]:
        """Build rich metadata for markdown chunks

        Args:
            hierarchy: Current header hierarchy (e.g., ['Installation', 'Requirements', 'Python'])
            code_languages: List of code block languages found in chunk (e.g., ['python', 'bash'])
            has_code: Whether chunk contains any code blocks

        Returns:
            Dictionary with markdown-specific metadata including tags
        """
        metadata = {
            'chunk_type': 'markdown',
        }

        # Add header level metadata
        if hierarchy:
            for i, header in enumerate(hierarchy, 1):
                metadata[f'h{i}'] = header

        # Add code-related metadata
        if has_code:
            metadata['has_code'] = True
            if code_languages:
                metadata['code_languages'] = code_languages

        # Build tags for enhanced searching
        tags = []
        if has_code:
            tags.append('code')
            # Add language-specific tags
            for lang in code_languages:
                tags.append(f'code:{lang}')

        # Add tags for header levels (searchable by section depth)
        if len(hierarchy) > 0:
            tags.append(f'depth:{len(hierarchy)}')

        if tags:
            metadata['tags'] = tags

        return metadata

    def _build_python_section(self, class_name: Optional[str], function_name: Optional[str]) -> str:
        """Build section name for Python code"""
        if class_name and function_name:
            return f"{class_name}.{function_name}"
        elif class_name:
            return class_name
        elif function_name:
            return function_name
        else:
            return None

    def _find_best_split_point(self, lines: List[str]) -> int:
        """Find the best point to split a chunk (prefer paragraph boundaries)"""
        # Look for empty lines (paragraph boundaries) in the last 25% of the chunk
        start_search = max(1, len(lines) * 3 // 4)

        for i in range(len(lines) - 1, start_search - 1, -1):
            if not lines[i].strip():  # Empty line
                return i

        # If no paragraph boundary found, split at 75% of chunk size
        return max(1, len(lines) * 3 // 4)

    def _create_chunk(self, content: str, filename: str,
                      section: Optional[str] = None,
                      start_line: Optional[int] = None,
                      end_line: Optional[int] = None,
                      metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Create chunk dictionary with enhanced metadata"""
        base_metadata = {
            'file_type': Path(filename).suffix.lstrip('.'),
            'chunk_size': len(content),
            'word_count': len(content.split()),
        }

        # Handle sentence count with fallback
        try:
            if sent_tokenize and content.strip():
                base_metadata['sentence_count'] = len(sent_tokenize(content))
            else:
                # Fallback: count sentences by periods
                base_metadata['sentence_count'] = len([s for s in content.split('.') if s.strip()])
        except Exception as e:
            logger.warning(f"Error counting sentences: {e}")
            # Simple fallback: count periods
            base_metadata['sentence_count'] = len([s for s in content.split('.') if s.strip()])

        if metadata:
            base_metadata.update(metadata)

        return {
            'content': content.strip(),
            'filename': filename,
            'section': section,
            'start_line': start_line,
            'end_line': end_line,
            'metadata': base_metadata
        }

    def _get_overlap_lines(self, lines: List[str]) -> List[str]:
        """Get overlap lines for chunk continuity"""
        if not lines:
            return []

        # Calculate overlap size in characters
        overlap_chars = self.chunk_overlap
        overlap_lines = []
        char_count = 0

        # Take lines from the end until we reach overlap size
        for line in reversed(lines):
            if char_count + len(line) <= overlap_chars:
                overlap_lines.insert(0, line)
                char_count += len(line) + 1
            else:
                break

        return overlap_lines

    def _chunk_by_sentences(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
        """Chunk content by sentences with specified max sentences per chunk"""
        if isinstance(content, list):
            content = '\n'.join(content)

        # Use sentence-based chunking
        split_newlines = self.split_newlines if self.split_newlines is not None else 2
        sentences = self._sentence_based_chunking(content, self.max_sentences_per_chunk, split_newlines)

        chunks = []
        for i, chunk_content in enumerate(sentences):
            chunks.append(self._create_chunk(
                content=chunk_content,
                filename=filename,
                section=f"Section {i+1}",
                metadata={
                    'chunk_method': 'sentence_based',
                    'chunk_index': i,
                    'max_sentences_per_chunk': self.max_sentences_per_chunk,
                    'split_newlines': split_newlines
                }
            ))

        return chunks

    def _chunk_by_sliding_window(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
        """Chunk content using sliding window approach with word-based chunks"""
        if isinstance(content, list):
            content = '\n'.join(content)

        # Split content into words
        words = content.split()

        if not words:
            return []

        chunks = []
        chunk_index = 0

        # Create overlapping chunks
        for i in range(0, len(words), self.chunk_size - self.chunk_overlap):
            chunk_words = words[i:i + self.chunk_size]
            if chunk_words:
                chunk_content = ' '.join(chunk_words)
                chunks.append(self._create_chunk(
                    content=chunk_content,
                    filename=filename,
                    section=f"Chunk {chunk_index + 1}",
                    metadata={
                        'chunk_method': 'sliding_window',
                        'chunk_index': chunk_index,
                        'chunk_size_words': self.chunk_size,
                        'overlap_size_words': self.chunk_overlap,
                        'start_word': i,
                        'end_word': i + len(chunk_words)
                    }
                ))
                chunk_index += 1

        return chunks

    def _chunk_by_paragraphs(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
        """Chunk content by paragraphs (split on double newlines)"""
        if isinstance(content, list):
            content = '\n'.join(content)

        # Split on double newlines to get paragraphs
        paragraphs = re.split(r'\n\s*\n', content)
        paragraphs = [p.strip() for p in paragraphs if p.strip()]

        chunks = []
        for i, paragraph in enumerate(paragraphs):
            if paragraph:
                chunks.append(self._create_chunk(
                    content=paragraph,
                    filename=filename,
                    section=f"Paragraph {i+1}",
                    metadata={
                        'chunk_method': 'paragraph_based',
                        'chunk_index': i,
                        'paragraph_number': i + 1
                    }
                ))

        return chunks

    def _chunk_by_pages(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
        """Chunk content by pages (for documents that have page boundaries)"""
        if isinstance(content, list):
            # If content is already a list (e.g., from PDF extraction), treat each item as a page
            pages = [str(page).strip() for page in content if str(page).strip()]
        else:
            # For text content, try to detect page boundaries
            # Look for form feed characters or page break indicators
            if '\f' in content:
                pages = content.split('\f')
            elif '---PAGE---' in content:
                pages = content.split('---PAGE---')
            elif re.search(r'\n\s*Page\s+\d+\s*\n', content):
                # Split on "Page N" patterns
                pages = re.split(r'\n\s*Page\s+\d+\s*\n', content)
            else:
                # Fallback: split into roughly equal chunks
                words = content.split()
                words_per_page = max(500, len(words) // 10)  # Aim for ~10 pages
                pages = []
                for i in range(0, len(words), words_per_page):
                    page_words = words[i:i + words_per_page]
                    if page_words:
                        pages.append(' '.join(page_words))

            pages = [p.strip() for p in pages if p.strip()]

        chunks = []
        for i, page_content in enumerate(pages):
            if page_content:
                chunks.append(self._create_chunk(
                    content=page_content,
                    filename=filename,
                    section=f"Page {i+1}",
                    metadata={
                        'chunk_method': 'page_based',
                        'chunk_index': i,
                        'page_number': i + 1
                    }
                ))

        return chunks

    def _chunk_by_semantic(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
        """Chunk based on semantic similarity between sentences"""
        if isinstance(content, list):
            content = '\n'.join(content)

        # Get sentences
        if sent_tokenize:
            sentences = sent_tokenize(content)
        else:
            sentences = content.split('. ')
            sentences = [s.strip() + '.' for s in sentences if s.strip()]

        if len(sentences) <= 1:
            return [self._create_chunk(content, filename, "Section 1",
                                       metadata={'chunk_method': 'semantic', 'chunk_index': 0})]

        # Generate embeddings for sentences (using the same model as the index)
        try:
            from sentence_transformers import SentenceTransformer
            from sklearn.metrics.pairwise import cosine_similarity
            import numpy as np

            model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
            embeddings = model.encode(sentences, show_progress_bar=False)

            # Calculate similarity between adjacent sentences
            similarities = []
            for i in range(len(embeddings) - 1):
                sim = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
                similarities.append(sim)

            # Find split points where similarity drops below threshold
            split_points = [0]
            for i, sim in enumerate(similarities):
                if sim < self.semantic_threshold:
                    split_points.append(i + 1)
            split_points.append(len(sentences))

            # Create chunks
            chunks = []
            for i in range(len(split_points) - 1):
                start_idx = split_points[i]
                end_idx = split_points[i + 1]
                chunk_sentences = sentences[start_idx:end_idx]

                # Ensure minimum chunk size
                if len(chunk_sentences) < 2 and i > 0:
                    # Merge with previous chunk
                    chunks[-1]['content'] += ' ' + ' '.join(chunk_sentences)
                    continue

                chunk_content = ' '.join(chunk_sentences)
                chunks.append(self._create_chunk(
                    content=chunk_content,
                    filename=filename,
                    section=f"Semantic Section {i+1}",
                    metadata={
                        'chunk_method': 'semantic',
                        'chunk_index': i,
                        'semantic_threshold': self.semantic_threshold,
                        'sentence_count': len(chunk_sentences)
                    }
                ))

            return chunks if chunks else [self._create_chunk(content, filename, "Section 1",
                                                             metadata={'chunk_method': 'semantic', 'chunk_index': 0})]

        except ImportError:
            # Fallback to sentence-based chunking
            return self._chunk_by_sentences(content, filename, file_type)

    def _chunk_by_topics(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
        """Chunk based on topic changes using keyword analysis"""
        if isinstance(content, list):
            content = '\n'.join(content)

        if sent_tokenize:
            sentences = sent_tokenize(content)
        else:
            sentences = content.split('. ')
            sentences = [s.strip() + '.' for s in sentences if s.strip()]

        if len(sentences) <= 3:
            return [self._create_chunk(content, filename, "Topic 1",
                                       metadata={'chunk_method': 'topic', 'chunk_index': 0})]

        try:
            # Simple topic detection using keyword overlap
            from collections import Counter
            import re

            # Extract keywords from each sentence
            sentence_keywords = []
            for sentence in sentences:
                # Simple keyword extraction (could be enhanced with NLP)
                words = re.findall(r'\b[a-zA-Z]{3,}\b', sentence.lower())
                # Filter common words (basic stopwords)
                stopwords = {'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had', 'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his', 'how', 'its', 'may', 'new', 'now', 'old', 'see', 'two', 'who', 'boy', 'did', 'man', 'way', 'she', 'use', 'her', 'many', 'oil', 'sit', 'set', 'run', 'eat', 'far', 'sea', 'eye', 'ask', 'own', 'say', 'too', 'any', 'try', 'us', 'an', 'as', 'at', 'be', 'he', 'if', 'in', 'is', 'it', 'my', 'of', 'on', 'or', 'to', 'up', 'we', 'go', 'no', 'so', 'am', 'by', 'do', 'me'}
                keywords = [w for w in words if w not in stopwords and len(w) > 3]
                sentence_keywords.append(set(keywords))

            # Find topic boundaries based on keyword overlap
            chunks = []
            current_chunk = [sentences[0]]
            current_keywords = sentence_keywords[0]

            for i in range(1, len(sentences)):
                # Calculate keyword overlap with current chunk
                overlap = len(current_keywords.intersection(sentence_keywords[i]))
                total_keywords = len(current_keywords.union(sentence_keywords[i]))

                if total_keywords > 0:
                    similarity = overlap / total_keywords
                else:
                    similarity = 0

                # If similarity is low, start new chunk
                if similarity < self.topic_threshold and len(current_chunk) >= 2:
                    chunk_content = ' '.join(current_chunk)
                    chunks.append(self._create_chunk(
                        content=chunk_content,
                        filename=filename,
                        section=f"Topic {len(chunks)+1}",
                        metadata={
                            'chunk_method': 'topic',
                            'chunk_index': len(chunks),
                            'topic_keywords': list(current_keywords)[:10],  # Top keywords
                            'sentence_count': len(current_chunk),
                            'topic_threshold': self.topic_threshold
                        }
                    ))
                    current_chunk = [sentences[i]]
                    current_keywords = sentence_keywords[i]
                else:
                    current_chunk.append(sentences[i])
                    current_keywords = current_keywords.union(sentence_keywords[i])

            # Add final chunk
            if current_chunk:
                chunk_content = ' '.join(current_chunk)
                chunks.append(self._create_chunk(
                    content=chunk_content,
                    filename=filename,
                    section=f"Topic {len(chunks)+1}",
                    metadata={
                        'chunk_method': 'topic',
                        'chunk_index': len(chunks),
                        'topic_keywords': list(current_keywords)[:10],
                        'sentence_count': len(current_chunk),
                        'topic_threshold': self.topic_threshold
                    }
                ))

            return chunks if chunks else [self._create_chunk(content, filename, "Topic 1",
                                                             metadata={'chunk_method': 'topic', 'chunk_index': 0})]

        except Exception:
            # Fallback to sentence-based chunking
            return self._chunk_by_sentences(content, filename, file_type)

    def _chunk_by_qa_optimization(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
        """Create chunks optimized for question-answering"""
        if isinstance(content, list):
            content = '\n'.join(content)

        if sent_tokenize:
            sentences = sent_tokenize(content)
        else:
            sentences = content.split('. ')
            sentences = [s.strip() + '.' for s in sentences if s.strip()]

        # Patterns that indicate Q&A structure
        question_patterns = [
            r'\?',  # Questions
            r'^(what|how|why|when|where|who|which|can|does|is|are|will|would|should)',
            r'(step|steps|process|procedure|method|way to)',
            r'(example|examples|instance|case)',
            r'(definition|meaning|refers to|means)',
        ]

        chunks = []
        current_chunk = []
        current_context = []

        for i, sentence in enumerate(sentences):
            sentence_lower = sentence.lower().strip()

            # Check if this sentence contains Q&A indicators
            is_qa_relevant = any(re.search(pattern, sentence_lower) for pattern in question_patterns)

            if is_qa_relevant or len(current_chunk) == 0:
                current_chunk.append(sentence)
                # Add surrounding context (previous and next sentences)
                if i > 0 and sentences[i-1] not in current_chunk:
                    current_context.append(sentences[i-1])
                if i < len(sentences) - 1:
                    current_context.append(sentences[i+1])
            else:
                current_chunk.append(sentence)

            # Create chunk when we have enough content or reach a natural break
            if (len(current_chunk) >= 3 and
                (i == len(sentences) - 1 or  # Last sentence
                 sentence.endswith('.') and len(current_chunk) >= 5)):  # Natural break

                # Combine chunk with context
                full_content = current_context + current_chunk
                chunk_content = ' '.join(full_content)

                chunks.append(self._create_chunk(
                    content=chunk_content,
                    filename=filename,
                    section=f"QA Section {len(chunks)+1}",
                    metadata={
                        'chunk_method': 'qa_optimized',
                        'chunk_index': len(chunks),
                        'has_question': any('?' in s for s in current_chunk),
                        'has_process': any(re.search(r'(step|process|method)', s.lower()) for s in current_chunk),
                        'sentence_count': len(full_content)
                    }
                ))

                current_chunk = []
                current_context = []

        # Handle remaining content
        if current_chunk:
            chunk_content = ' '.join(current_context + current_chunk)
            chunks.append(self._create_chunk(
                content=chunk_content,
                filename=filename,
                section=f"QA Section {len(chunks)+1}",
                metadata={
                    'chunk_method': 'qa_optimized',
                    'chunk_index': len(chunks),
                    'sentence_count': len(current_context + current_chunk)
                }
            ))

        return chunks if chunks else [self._create_chunk(content, filename, "QA Section 1",
                                                         metadata={'chunk_method': 'qa_optimized', 'chunk_index': 0})]

    def _chunk_from_json(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
        """
        Create chunks from pre-processed JSON content

        This strategy expects content to be a JSON string with the following structure:
        {
            "chunks": [
                {
                    "chunk_id": "unique_id",
                    "type": "content|toc",
                    "content": "text content",
                    "metadata": {
                        "url": "https://...",
                        "section_number": 1,
                        "related_toc": "toc_id",
                        ...
                    }
                },
                ...
            ]
        }

        Args:
            content: JSON string containing pre-chunked content
            filename: Name of the source file
            file_type: Should be 'json'

        Returns:
            List of chunk dictionaries formatted for the search index
        """
        try:
            # Parse JSON content
            data = json.loads(content)

            if not isinstance(data, dict) or 'chunks' not in data:
                logger.error(f"Invalid JSON structure in {filename}: expected 'chunks' key")
                # Fallback to treating it as plain text
                return self._chunk_by_sentences(content, filename, file_type)

            chunks = []
            for idx, json_chunk in enumerate(data['chunks']):
                if not isinstance(json_chunk, dict) or 'content' not in json_chunk:
                    logger.warning(f"Skipping invalid chunk {idx} in {filename}")
                    continue

                # Extract metadata from JSON chunk
                json_metadata = json_chunk.get('metadata', {})
                chunk_type = json_chunk.get('type', 'content')

                # Build chunk metadata (excluding tags which go at top level)
                metadata = {
                    'chunk_method': 'json',
                    'chunk_index': idx,
                    'chunk_type': chunk_type,
                    'original_chunk_id': json_chunk.get('chunk_id', f'chunk_{idx}')
                }

                # Extract tags before merging metadata
                tags = json_metadata.get('tags', [])

                # Merge JSON metadata (this includes all fields including tags)
                # We'll keep tags in metadata for backward compatibility but also set at top level
                metadata.update(json_metadata)

                # Determine section name
                if chunk_type == 'toc':
                    section = f"TOC: {json_chunk.get('content', '')[:50]}"
                else:
                    section = json_metadata.get('section', f"Section {json_metadata.get('section_number', idx + 1)}")

                # Create chunk with proper structure
                chunk = self._create_chunk(
                    content=json_chunk['content'],
                    filename=filename,
                    section=section,
                    metadata=metadata
                )

                # Set tags at the top level for proper tag filtering
                if tags:
                    chunk['tags'] = tags
                elif chunk_type == 'toc':
                    # For TOC entries, add special tags if none provided
                    chunk['tags'] = ['toc', 'navigation']

                chunks.append(chunk)

            if not chunks:
                logger.warning(f"No valid chunks found in JSON file {filename}")
                return self._chunk_by_sentences(str(data), filename, file_type)

            logger.info(f"Created {len(chunks)} chunks from JSON file {filename}")
            return chunks

        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse JSON in {filename}: {e}")
            # Fallback to sentence chunking
            return self._chunk_by_sentences(content, filename, file_type)
        except Exception as e:
            logger.error(f"Unexpected error processing JSON chunks in {filename}: {e}")
            return self._chunk_by_sentences(content, filename, file_type)
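The 'json' strategy shown in _chunk_from_json above consumes pre-chunked content rather than raw text. A hypothetical payload in the shape its docstring describes, fed through the same create_chunks entry point (illustrative only, not part of the diff; the URL and identifiers are made up):

    # Illustrative only: payload shape follows the _chunk_from_json docstring above
    import json
    from signalwire_agents.search.document_processor import DocumentProcessor

    payload = {
        "chunks": [
            {
                "chunk_id": "intro-1",
                "type": "content",
                "content": "SignalWire AI Agents can be deployed as HTTP services.",
                "metadata": {"url": "https://example.com/docs", "section_number": 1, "tags": ["docs"]}
            }
        ]
    }

    processor = DocumentProcessor(chunking_strategy='json')
    chunks = processor.create_chunks(json.dumps(payload), filename='docs.json', file_type='json')
    # Tags supplied in the payload are promoted to the top level of each chunk for tag filtering

If the payload fails to parse or lacks a 'chunks' key, the method falls back to sentence-based chunking, as the error paths above show.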