signalwire-agents 0.1.13__py3-none-any.whl → 1.0.17.dev4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +99 -15
- signalwire_agents/agent_server.py +248 -60
- signalwire_agents/agents/bedrock.py +296 -0
- signalwire_agents/cli/__init__.py +9 -0
- signalwire_agents/cli/build_search.py +951 -41
- signalwire_agents/cli/config.py +80 -0
- signalwire_agents/cli/core/__init__.py +10 -0
- signalwire_agents/cli/core/agent_loader.py +470 -0
- signalwire_agents/cli/core/argparse_helpers.py +179 -0
- signalwire_agents/cli/core/dynamic_config.py +71 -0
- signalwire_agents/cli/core/service_loader.py +303 -0
- signalwire_agents/cli/dokku.py +2320 -0
- signalwire_agents/cli/execution/__init__.py +10 -0
- signalwire_agents/cli/execution/datamap_exec.py +446 -0
- signalwire_agents/cli/execution/webhook_exec.py +134 -0
- signalwire_agents/cli/init_project.py +2636 -0
- signalwire_agents/cli/output/__init__.py +10 -0
- signalwire_agents/cli/output/output_formatter.py +255 -0
- signalwire_agents/cli/output/swml_dump.py +186 -0
- signalwire_agents/cli/simulation/__init__.py +10 -0
- signalwire_agents/cli/simulation/data_generation.py +374 -0
- signalwire_agents/cli/simulation/data_overrides.py +200 -0
- signalwire_agents/cli/simulation/mock_env.py +282 -0
- signalwire_agents/cli/swaig_test_wrapper.py +52 -0
- signalwire_agents/cli/test_swaig.py +566 -2366
- signalwire_agents/cli/types.py +81 -0
- signalwire_agents/core/__init__.py +2 -2
- signalwire_agents/core/agent/__init__.py +12 -0
- signalwire_agents/core/agent/config/__init__.py +12 -0
- signalwire_agents/core/agent/deployment/__init__.py +9 -0
- signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
- signalwire_agents/core/agent/prompt/__init__.py +14 -0
- signalwire_agents/core/agent/prompt/manager.py +306 -0
- signalwire_agents/core/agent/routing/__init__.py +9 -0
- signalwire_agents/core/agent/security/__init__.py +9 -0
- signalwire_agents/core/agent/swml/__init__.py +9 -0
- signalwire_agents/core/agent/tools/__init__.py +15 -0
- signalwire_agents/core/agent/tools/decorator.py +97 -0
- signalwire_agents/core/agent/tools/registry.py +210 -0
- signalwire_agents/core/agent_base.py +845 -2916
- signalwire_agents/core/auth_handler.py +233 -0
- signalwire_agents/core/config_loader.py +259 -0
- signalwire_agents/core/contexts.py +418 -0
- signalwire_agents/core/data_map.py +3 -15
- signalwire_agents/core/function_result.py +116 -44
- signalwire_agents/core/logging_config.py +162 -18
- signalwire_agents/core/mixins/__init__.py +28 -0
- signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
- signalwire_agents/core/mixins/auth_mixin.py +280 -0
- signalwire_agents/core/mixins/prompt_mixin.py +358 -0
- signalwire_agents/core/mixins/serverless_mixin.py +460 -0
- signalwire_agents/core/mixins/skill_mixin.py +55 -0
- signalwire_agents/core/mixins/state_mixin.py +153 -0
- signalwire_agents/core/mixins/tool_mixin.py +230 -0
- signalwire_agents/core/mixins/web_mixin.py +1142 -0
- signalwire_agents/core/security_config.py +333 -0
- signalwire_agents/core/skill_base.py +84 -1
- signalwire_agents/core/skill_manager.py +62 -20
- signalwire_agents/core/swaig_function.py +18 -5
- signalwire_agents/core/swml_builder.py +207 -11
- signalwire_agents/core/swml_handler.py +27 -21
- signalwire_agents/core/swml_renderer.py +123 -312
- signalwire_agents/core/swml_service.py +171 -203
- signalwire_agents/mcp_gateway/__init__.py +29 -0
- signalwire_agents/mcp_gateway/gateway_service.py +564 -0
- signalwire_agents/mcp_gateway/mcp_manager.py +513 -0
- signalwire_agents/mcp_gateway/session_manager.py +218 -0
- signalwire_agents/prefabs/concierge.py +0 -3
- signalwire_agents/prefabs/faq_bot.py +0 -3
- signalwire_agents/prefabs/info_gatherer.py +0 -3
- signalwire_agents/prefabs/receptionist.py +0 -3
- signalwire_agents/prefabs/survey.py +0 -3
- signalwire_agents/schema.json +9218 -5489
- signalwire_agents/search/__init__.py +7 -1
- signalwire_agents/search/document_processor.py +490 -31
- signalwire_agents/search/index_builder.py +307 -37
- signalwire_agents/search/migration.py +418 -0
- signalwire_agents/search/models.py +30 -0
- signalwire_agents/search/pgvector_backend.py +748 -0
- signalwire_agents/search/query_processor.py +162 -31
- signalwire_agents/search/search_engine.py +916 -35
- signalwire_agents/search/search_service.py +376 -53
- signalwire_agents/skills/README.md +452 -0
- signalwire_agents/skills/__init__.py +14 -2
- signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
- signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
- signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
- signalwire_agents/skills/datasphere/README.md +210 -0
- signalwire_agents/skills/datasphere/skill.py +84 -3
- signalwire_agents/skills/datasphere_serverless/README.md +258 -0
- signalwire_agents/skills/datasphere_serverless/__init__.py +9 -0
- signalwire_agents/skills/datasphere_serverless/skill.py +82 -1
- signalwire_agents/skills/datetime/README.md +132 -0
- signalwire_agents/skills/datetime/__init__.py +9 -0
- signalwire_agents/skills/datetime/skill.py +20 -7
- signalwire_agents/skills/joke/README.md +149 -0
- signalwire_agents/skills/joke/__init__.py +9 -0
- signalwire_agents/skills/joke/skill.py +21 -0
- signalwire_agents/skills/math/README.md +161 -0
- signalwire_agents/skills/math/__init__.py +9 -0
- signalwire_agents/skills/math/skill.py +18 -4
- signalwire_agents/skills/mcp_gateway/README.md +230 -0
- signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
- signalwire_agents/skills/mcp_gateway/skill.py +421 -0
- signalwire_agents/skills/native_vector_search/README.md +210 -0
- signalwire_agents/skills/native_vector_search/__init__.py +9 -0
- signalwire_agents/skills/native_vector_search/skill.py +569 -101
- signalwire_agents/skills/play_background_file/README.md +218 -0
- signalwire_agents/skills/play_background_file/__init__.py +12 -0
- signalwire_agents/skills/play_background_file/skill.py +242 -0
- signalwire_agents/skills/registry.py +395 -40
- signalwire_agents/skills/spider/README.md +236 -0
- signalwire_agents/skills/spider/__init__.py +13 -0
- signalwire_agents/skills/spider/skill.py +598 -0
- signalwire_agents/skills/swml_transfer/README.md +395 -0
- signalwire_agents/skills/swml_transfer/__init__.py +10 -0
- signalwire_agents/skills/swml_transfer/skill.py +359 -0
- signalwire_agents/skills/weather_api/README.md +178 -0
- signalwire_agents/skills/weather_api/__init__.py +12 -0
- signalwire_agents/skills/weather_api/skill.py +191 -0
- signalwire_agents/skills/web_search/README.md +163 -0
- signalwire_agents/skills/web_search/__init__.py +9 -0
- signalwire_agents/skills/web_search/skill.py +586 -112
- signalwire_agents/skills/wikipedia_search/README.md +228 -0
- signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
- signalwire_agents/skills/{wikipedia → wikipedia_search}/skill.py +33 -3
- signalwire_agents/web/__init__.py +17 -0
- signalwire_agents/web/web_service.py +559 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-agent-init.1 +400 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-search.1 +483 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/swaig-test.1 +308 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/METADATA +347 -215
- signalwire_agents-1.0.17.dev4.dist-info/RECORD +147 -0
- signalwire_agents-1.0.17.dev4.dist-info/entry_points.txt +6 -0
- signalwire_agents/core/state/file_state_manager.py +0 -219
- signalwire_agents/core/state/state_manager.py +0 -101
- signalwire_agents/skills/wikipedia/__init__.py +0 -9
- signalwire_agents-0.1.13.data/data/schema.json +0 -5611
- signalwire_agents-0.1.13.dist-info/RECORD +0 -67
- signalwire_agents-0.1.13.dist-info/entry_points.txt +0 -3
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/WHEEL +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/top_level.txt +0 -0
signalwire_agents/search/__init__.py

```diff
@@ -68,6 +68,8 @@ if _SEARCH_AVAILABLE:
         from .index_builder import IndexBuilder
         from .search_engine import SearchEngine
         from .search_service import SearchService
+        from .models import MODEL_ALIASES, DEFAULT_MODEL, resolve_model_alias
+        from .migration import SearchIndexMigrator
 
         __all__ = [
             'preprocess_query',
@@ -75,7 +77,11 @@
             'DocumentProcessor',
             'IndexBuilder',
             'SearchEngine',
-            'SearchService'
+            'SearchService',
+            'MODEL_ALIASES',
+            'DEFAULT_MODEL',
+            'resolve_model_alias',
+            'SearchIndexMigrator'
         ]
     except ImportError as e:
         # Some search components failed to import
```
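The new exports make the embedding-model aliases and the index migrator part of the public `signalwire_agents.search` surface. A minimal usage sketch, assuming `MODEL_ALIASES` maps alias strings to model identifiers and `resolve_model_alias` accepts a single alias string (neither detail is shown in this diff):

```python
# Hypothetical sketch; the shape of MODEL_ALIASES, the exact signature of
# resolve_model_alias, and the alias name "base" are assumptions.
from signalwire_agents.search import MODEL_ALIASES, DEFAULT_MODEL, resolve_model_alias

print("default model:", DEFAULT_MODEL)
for alias, model_id in sorted(MODEL_ALIASES.items()):
    print(f"  {alias} -> {model_id}")

try:
    model_id = resolve_model_alias("base")
except Exception:
    model_id = DEFAULT_MODEL   # fall back if the alias is unknown
```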
signalwire_agents/search/document_processor.py — DocumentProcessor.__init__:

```diff
@@ -74,29 +74,51 @@ logger = logging.getLogger(__name__)
 class DocumentProcessor:
     """Enhanced document processor with smart chunking capabilities"""
 
-    def __init__(
-
-
-
-
+    def __init__(
+        self,
+        chunking_strategy: str = 'sentence',
+        max_sentences_per_chunk: int = 5,
+        chunk_size: int = 50,
+        chunk_overlap: int = 10,
+        split_newlines: Optional[int] = None,
+        index_nlp_backend: str = 'nltk',
+        verbose: bool = False,
+        semantic_threshold: float = 0.5,
+        topic_threshold: float = 0.3
+    ):
         """
-        Initialize document processor
-
+        Initialize document processor
+
         Args:
-            chunking_strategy:
-
+            chunking_strategy: Strategy for chunking documents:
+                - 'sentence': Sentence-based chunking with overlap
+                - 'sliding': Sliding window with word-based chunks
+                - 'paragraph': Natural paragraph boundaries
+                - 'page': Page-based chunking (for PDFs)
+                - 'semantic': Semantic similarity-based chunking
+                - 'topic': Topic modeling-based chunking
+                - 'qa': Question-answer optimized chunking
+                - 'json': JSON structure-aware chunking
+                - 'markdown': Markdown structure-aware chunking with code block detection
+            max_sentences_per_chunk: For sentence strategy (default: 5)
             chunk_size: For sliding strategy - words per chunk (default: 50)
-
+            chunk_overlap: For sliding strategy - overlap in words (default: 10)
             split_newlines: For sentence strategy - split on multiple newlines (optional)
+            index_nlp_backend: NLP backend for indexing (default: 'nltk')
+            verbose: Whether to enable verbose logging (default: False)
+            semantic_threshold: Similarity threshold for semantic chunking (default: 0.5)
+            topic_threshold: Similarity threshold for topic chunking (default: 0.3)
         """
         self.chunking_strategy = chunking_strategy
         self.max_sentences_per_chunk = max_sentences_per_chunk
         self.chunk_size = chunk_size
-        self.
+        self.chunk_overlap = chunk_overlap
         self.split_newlines = split_newlines
+        self.semantic_threshold = semantic_threshold
+        self.topic_threshold = topic_threshold
 
         # Legacy support for old character-based chunking
-        self.chunk_overlap =
+        self.chunk_overlap = chunk_overlap
 
     def create_chunks(self, content: str, filename: str,
                       file_type: str) -> List[Dict[str, Any]]:
```
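The widened constructor exposes one tuning knob per strategy. A short sketch of driving it, assuming `DocumentProcessor` is importable from `signalwire_agents.search` as its `__all__` entry suggests (the exact shape of the returned chunk dicts is not shown here):

```python
from signalwire_agents.search import DocumentProcessor

# Keyword arguments mirror the new __init__ signature in the hunk above.
processor = DocumentProcessor(
    chunking_strategy="markdown",   # sentence, sliding, paragraph, page, semantic,
                                    # topic, qa, json, or markdown
    chunk_size=50,                  # sliding strategy: words per chunk
    chunk_overlap=10,               # sliding strategy: word overlap
    semantic_threshold=0.5,         # semantic strategy: split below this similarity
    topic_threshold=0.3,            # topic strategy: split below this keyword overlap
)

text = "# Install\n\nRun pip install signalwire-agents.\n\n## Verify\n\nImport the package.\n"
chunks = processor.create_chunks(text, filename="README.md", file_type="md")
print(len(chunks))
```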
DocumentProcessor.create_chunks dispatch:

```diff
@@ -121,6 +143,17 @@ class DocumentProcessor:
             return self._chunk_by_paragraphs(content, filename, file_type)
         elif self.chunking_strategy == 'page':
             return self._chunk_by_pages(content, filename, file_type)
+        elif self.chunking_strategy == 'semantic':
+            return self._chunk_by_semantic(content, filename, file_type)
+        elif self.chunking_strategy == 'topic':
+            return self._chunk_by_topics(content, filename, file_type)
+        elif self.chunking_strategy == 'qa':
+            return self._chunk_by_qa_optimization(content, filename, file_type)
+        elif self.chunking_strategy == 'json':
+            return self._chunk_from_json(content, filename, file_type)
+        elif self.chunking_strategy == 'markdown':
+            # Use markdown-aware chunking for better structure preservation
+            return self._chunk_markdown_enhanced(content, filename)
         else:
             # Fallback to sentence-based chunking
             return self._chunk_by_sentences(content, filename, file_type)
```
DocumentProcessor._chunk_markdown_enhanced:

```diff
@@ -318,75 +351,114 @@
         return chunks
 
     def _chunk_markdown_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:
-        """Enhanced markdown chunking with
+        """Enhanced markdown chunking with code block detection and rich metadata
+
+        Features:
+        - Tracks header hierarchy for section paths
+        - Detects code blocks and extracts language
+        - Adds 'code' tags to chunks containing code
+        - Preserves markdown structure for better search
+        """
         chunks = []
         lines = content.split('\n')
-
+
         current_section = None
         current_hierarchy = []  # Track header hierarchy
         current_chunk = []
         current_size = 0
         line_start = 1
-
+        in_code_block = False
+        code_languages = []  # Track languages in current chunk
+        has_code = False
+
         for line_num, line in enumerate(lines, 1):
+            # Check for code block fences
+            code_fence_match = re.match(r'^```(\w+)?', line)
+            if code_fence_match:
+                in_code_block = not in_code_block
+                if in_code_block:
+                    # Starting code block
+                    has_code = True
+                    lang = code_fence_match.group(1)
+                    if lang and lang not in code_languages:
+                        code_languages.append(lang)
+
             # Check for headers with hierarchy tracking
-            header_match = re.match(r'^(#{1,6})\s+(.+)', line)
+            header_match = re.match(r'^(#{1,6})\s+(.+)', line) if not in_code_block else None
             if header_match:
                 header_level = len(header_match.group(1))
                 header_text = header_match.group(2).strip()
-
+
                 # Save current chunk if it exists
                 if current_chunk:
+                    chunk_metadata = self._build_markdown_metadata(
+                        current_hierarchy, code_languages, has_code
+                    )
                     chunks.append(self._create_chunk(
                         content='\n'.join(current_chunk),
                         filename=filename,
                         section=self._build_section_path(current_hierarchy),
                         start_line=line_start,
-                        end_line=line_num - 1
+                        end_line=line_num - 1,
+                        metadata=chunk_metadata
                     ))
-
+
                 # Update hierarchy
                 current_hierarchy = current_hierarchy[:header_level-1] + [header_text]
                 current_section = header_text
                 current_chunk = [line]
                 current_size = len(line)
                 line_start = line_num
-
+                code_languages = []
+                has_code = False
+
             else:
                 current_chunk.append(line)
                 current_size += len(line) + 1
-
+
                 # Check if chunk is getting too large - use smart splitting
-
+                # But don't split inside code blocks
+                if current_size >= self.chunk_size and not in_code_block:
                     # Try to split at paragraph boundary first
                     split_point = self._find_best_split_point(current_chunk)
-
+
                     chunk_to_save = current_chunk[:split_point]
+                    chunk_metadata = self._build_markdown_metadata(
+                        current_hierarchy, code_languages, has_code
+                    )
                     chunks.append(self._create_chunk(
                         content='\n'.join(chunk_to_save),
                         filename=filename,
                         section=self._build_section_path(current_hierarchy),
                         start_line=line_start,
-                        end_line=line_start + split_point - 1
+                        end_line=line_start + split_point - 1,
+                        metadata=chunk_metadata
                     ))
-
+
                     # Start new chunk with overlap
                     overlap_lines = self._get_overlap_lines(chunk_to_save)
                     remaining_lines = current_chunk[split_point:]
                     current_chunk = overlap_lines + remaining_lines
                     current_size = sum(len(line) + 1 for line in current_chunk)
                     line_start = line_start + split_point - len(overlap_lines)
-
+                    # Reset code tracking for new chunk
+                    code_languages = []
+                    has_code = False
+
         # Add final chunk
         if current_chunk:
+            chunk_metadata = self._build_markdown_metadata(
+                current_hierarchy, code_languages, has_code
+            )
             chunks.append(self._create_chunk(
                 content='\n'.join(current_chunk),
                 filename=filename,
                 section=self._build_section_path(current_hierarchy),
                 start_line=line_start,
-                end_line=len(lines)
+                end_line=len(lines),
+                metadata=chunk_metadata
             ))
-
+
         return chunks
 
     def _chunk_python_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:
```
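To make the fence handling concrete, here is a standalone sketch of the same fence-and-language tracking the hunk adds; the regex is copied from the diff, everything else is illustrative:

```python
import re

def scan_markdown(lines):
    """Mimic the code-fence tracking added to _chunk_markdown_enhanced."""
    in_code_block = False
    code_languages = []
    has_code = False
    for line in lines:
        fence = re.match(r'^```(\w+)?', line)  # same pattern as the diff
        if fence:
            in_code_block = not in_code_block
            if in_code_block:                  # an opening fence starts a code block
                has_code = True
                lang = fence.group(1)
                if lang and lang not in code_languages:
                    code_languages.append(lang)
        # header detection is skipped while inside a fence, as in the new code
    return has_code, code_languages

fence_open = "`" * 3 + "python"   # written this way so the sample nests cleanly here
fence_close = "`" * 3
doc = ["# Usage", fence_open, "print('hi')", fence_close, "Done."]
print(scan_markdown(doc))         # (True, ['python'])
```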
DocumentProcessor._build_markdown_metadata:

```diff
@@ -554,6 +626,49 @@ class DocumentProcessor:
     def _build_section_path(self, hierarchy: List[str]) -> str:
         """Build hierarchical section path from header hierarchy"""
         return ' > '.join(hierarchy) if hierarchy else None
+
+    def _build_markdown_metadata(self, hierarchy: List[str], code_languages: List[str], has_code: bool) -> Dict[str, Any]:
+        """Build rich metadata for markdown chunks
+
+        Args:
+            hierarchy: Current header hierarchy (e.g., ['Installation', 'Requirements', 'Python'])
+            code_languages: List of code block languages found in chunk (e.g., ['python', 'bash'])
+            has_code: Whether chunk contains any code blocks
+
+        Returns:
+            Dictionary with markdown-specific metadata including tags
+        """
+        metadata = {
+            'chunk_type': 'markdown',
+        }
+
+        # Add header level metadata
+        if hierarchy:
+            for i, header in enumerate(hierarchy, 1):
+                metadata[f'h{i}'] = header
+
+        # Add code-related metadata
+        if has_code:
+            metadata['has_code'] = True
+            if code_languages:
+                metadata['code_languages'] = code_languages
+
+        # Build tags for enhanced searching
+        tags = []
+        if has_code:
+            tags.append('code')
+            # Add language-specific tags
+            for lang in code_languages:
+                tags.append(f'code:{lang}')
+
+        # Add tags for header levels (searchable by section depth)
+        if len(hierarchy) > 0:
+            tags.append(f'depth:{len(hierarchy)}')
+
+        if tags:
+            metadata['tags'] = tags
+
+        return metadata
 
     def _build_python_section(self, class_name: Optional[str], function_name: Optional[str]) -> str:
         """Build section name for Python code"""
```
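For reference, working through `_build_markdown_metadata` by hand for a two-level section that contains one bash block gives the following; this is a hand-derived expectation from the method body above, not output captured from the package:

```python
# Hand-derived from _build_markdown_metadata; not executed against the package.
hierarchy = ["Installation", "Requirements"]
code_languages = ["bash"]
has_code = True

expected_metadata = {
    "chunk_type": "markdown",
    "h1": "Installation",            # one hN key per hierarchy level
    "h2": "Requirements",
    "has_code": True,
    "code_languages": ["bash"],
    "tags": ["code", "code:bash", "depth:2"],
}
```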
DocumentProcessor sliding-window chunking:

```diff
@@ -674,7 +789,7 @@ class DocumentProcessor:
         chunk_index = 0
 
         # Create overlapping chunks
-        for i in range(0, len(words), self.chunk_size - self.
+        for i in range(0, len(words), self.chunk_size - self.chunk_overlap):
             chunk_words = words[i:i + self.chunk_size]
             if chunk_words:
                 chunk_content = ' '.join(chunk_words)
@@ -686,7 +801,7 @@
                     'chunk_method': 'sliding_window',
                     'chunk_index': chunk_index,
                     'chunk_size_words': self.chunk_size,
-                    'overlap_size_words': self.
+                    'overlap_size_words': self.chunk_overlap,
                     'start_word': i,
                     'end_word': i + len(chunk_words)
                 }
```
DocumentProcessor._chunk_by_semantic:

```diff
@@ -761,4 +876,348 @@
             }
         ))
 
-        return chunks
+        return chunks
+
+    def _chunk_by_semantic(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Chunk based on semantic similarity between sentences"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        # Get sentences
+        if sent_tokenize:
+            sentences = sent_tokenize(content)
+        else:
+            sentences = content.split('. ')
+            sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+        if len(sentences) <= 1:
+            return [self._create_chunk(content, filename, "Section 1",
+                                       metadata={'chunk_method': 'semantic', 'chunk_index': 0})]
+
+        # Generate embeddings for sentences (using the same model as the index)
+        try:
+            from sentence_transformers import SentenceTransformer
+            from sklearn.metrics.pairwise import cosine_similarity
+            import numpy as np
+
+            model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
+            embeddings = model.encode(sentences, show_progress_bar=False)
+
+            # Calculate similarity between adjacent sentences
+            similarities = []
+            for i in range(len(embeddings) - 1):
+                sim = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
+                similarities.append(sim)
+
+            # Find split points where similarity drops below threshold
+            split_points = [0]
+            for i, sim in enumerate(similarities):
+                if sim < self.semantic_threshold:
+                    split_points.append(i + 1)
+            split_points.append(len(sentences))
+
+            # Create chunks
+            chunks = []
+            for i in range(len(split_points) - 1):
+                start_idx = split_points[i]
+                end_idx = split_points[i + 1]
+                chunk_sentences = sentences[start_idx:end_idx]
+
+                # Ensure minimum chunk size
+                if len(chunk_sentences) < 2 and i > 0:
+                    # Merge with previous chunk
+                    chunks[-1]['content'] += ' ' + ' '.join(chunk_sentences)
+                    continue
+
+                chunk_content = ' '.join(chunk_sentences)
+                chunks.append(self._create_chunk(
+                    content=chunk_content,
+                    filename=filename,
+                    section=f"Semantic Section {i+1}",
+                    metadata={
+                        'chunk_method': 'semantic',
+                        'chunk_index': i,
+                        'semantic_threshold': self.semantic_threshold,
+                        'sentence_count': len(chunk_sentences)
+                    }
+                ))
+
+            return chunks if chunks else [self._create_chunk(content, filename, "Section 1",
+                                                             metadata={'chunk_method': 'semantic', 'chunk_index': 0})]
+
+        except ImportError:
+            # Fallback to sentence-based chunking
+            return self._chunk_by_sentences(content, filename, file_type)
+
```
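The split-point logic above only depends on the list of adjacent-sentence similarities, so it can be illustrated without loading an embedding model; the similarity values below are made up:

```python
# Made-up similarity scores between adjacent sentences; in the real method these
# come from sentence-transformers embeddings and cosine_similarity.
similarities = [0.82, 0.31, 0.77, 0.48, 0.90]
semantic_threshold = 0.5
num_sentences = len(similarities) + 1

split_points = [0]
for i, sim in enumerate(similarities):
    if sim < semantic_threshold:      # a similarity drop starts a new chunk
        split_points.append(i + 1)
split_points.append(num_sentences)

print(split_points)                   # [0, 2, 4, 6]
# Chunks cover sentences [0:2], [2:4] and [4:6].
```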
The hunk continues with DocumentProcessor._chunk_by_topics:

```diff
+    def _chunk_by_topics(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Chunk based on topic changes using keyword analysis"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        if sent_tokenize:
+            sentences = sent_tokenize(content)
+        else:
+            sentences = content.split('. ')
+            sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+        if len(sentences) <= 3:
+            return [self._create_chunk(content, filename, "Topic 1",
+                                       metadata={'chunk_method': 'topic', 'chunk_index': 0})]
+
+        try:
+            # Simple topic detection using keyword overlap
+            from collections import Counter
+            import re
+
+            # Extract keywords from each sentence
+            sentence_keywords = []
+            for sentence in sentences:
+                # Simple keyword extraction (could be enhanced with NLP)
+                words = re.findall(r'\b[a-zA-Z]{3,}\b', sentence.lower())
+                # Filter common words (basic stopwords)
+                stopwords = {'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had', 'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his', 'how', 'its', 'may', 'new', 'now', 'old', 'see', 'two', 'who', 'boy', 'did', 'man', 'way', 'she', 'use', 'her', 'many', 'oil', 'sit', 'set', 'run', 'eat', 'far', 'sea', 'eye', 'ask', 'own', 'say', 'too', 'any', 'try', 'us', 'an', 'as', 'at', 'be', 'he', 'if', 'in', 'is', 'it', 'my', 'of', 'on', 'or', 'to', 'up', 'we', 'go', 'no', 'so', 'am', 'by', 'do', 'me'}
+                keywords = [w for w in words if w not in stopwords and len(w) > 3]
+                sentence_keywords.append(set(keywords))
+
+            # Find topic boundaries based on keyword overlap
+            chunks = []
+            current_chunk = [sentences[0]]
+            current_keywords = sentence_keywords[0]
+
+            for i in range(1, len(sentences)):
+                # Calculate keyword overlap with current chunk
+                overlap = len(current_keywords.intersection(sentence_keywords[i]))
+                total_keywords = len(current_keywords.union(sentence_keywords[i]))
+
+                if total_keywords > 0:
+                    similarity = overlap / total_keywords
+                else:
+                    similarity = 0
+
+                # If similarity is low, start new chunk
+                if similarity < self.topic_threshold and len(current_chunk) >= 2:
+                    chunk_content = ' '.join(current_chunk)
+                    chunks.append(self._create_chunk(
+                        content=chunk_content,
+                        filename=filename,
+                        section=f"Topic {len(chunks)+1}",
+                        metadata={
+                            'chunk_method': 'topic',
+                            'chunk_index': len(chunks),
+                            'topic_keywords': list(current_keywords)[:10],  # Top keywords
+                            'sentence_count': len(current_chunk),
+                            'topic_threshold': self.topic_threshold
+                        }
+                    ))
+                    current_chunk = [sentences[i]]
+                    current_keywords = sentence_keywords[i]
+                else:
+                    current_chunk.append(sentences[i])
+                    current_keywords = current_keywords.union(sentence_keywords[i])
+
+            # Add final chunk
+            if current_chunk:
+                chunk_content = ' '.join(current_chunk)
+                chunks.append(self._create_chunk(
+                    content=chunk_content,
+                    filename=filename,
+                    section=f"Topic {len(chunks)+1}",
+                    metadata={
+                        'chunk_method': 'topic',
+                        'chunk_index': len(chunks),
+                        'topic_keywords': list(current_keywords)[:10],
+                        'sentence_count': len(current_chunk),
+                        'topic_threshold': self.topic_threshold
+                    }
+                ))
+
+            return chunks if chunks else [self._create_chunk(content, filename, "Topic 1",
+                                                             metadata={'chunk_method': 'topic', 'chunk_index': 0})]
+
+        except Exception:
+            # Fallback to sentence-based chunking
+            return self._chunk_by_sentences(content, filename, file_type)
+
```
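The topic boundary test is a plain Jaccard overlap between the running keyword set and the next sentence's keywords. A worked example with made-up keyword sets:

```python
# Made-up keyword sets for the current chunk and the next sentence.
current_keywords = {"agent", "voice", "telephone", "swml"}
next_keywords = {"weather", "forecast", "temperature", "agent"}

overlap = len(current_keywords & next_keywords)    # 1 ('agent')
total = len(current_keywords | next_keywords)      # 7
similarity = overlap / total if total else 0       # ~0.14

topic_threshold = 0.3
# The real method also requires the current chunk to hold at least 2 sentences
# before it will split.
starts_new_topic = similarity < topic_threshold
print(round(similarity, 2), starts_new_topic)      # 0.14 True
```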
DocumentProcessor._chunk_by_qa_optimization:

```diff
+    def _chunk_by_qa_optimization(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Create chunks optimized for question-answering"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        if sent_tokenize:
+            sentences = sent_tokenize(content)
+        else:
+            sentences = content.split('. ')
+            sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+        # Patterns that indicate Q&A structure
+        question_patterns = [
+            r'\?',  # Questions
+            r'^(what|how|why|when|where|who|which|can|does|is|are|will|would|should)',
+            r'(step|steps|process|procedure|method|way to)',
+            r'(example|examples|instance|case)',
+            r'(definition|meaning|refers to|means)',
+        ]
+
+        chunks = []
+        current_chunk = []
+        current_context = []
+
+        for i, sentence in enumerate(sentences):
+            sentence_lower = sentence.lower().strip()
+
+            # Check if this sentence contains Q&A indicators
+            is_qa_relevant = any(re.search(pattern, sentence_lower) for pattern in question_patterns)
+
+            if is_qa_relevant or len(current_chunk) == 0:
+                current_chunk.append(sentence)
+                # Add surrounding context (previous and next sentences)
+                if i > 0 and sentences[i-1] not in current_chunk:
+                    current_context.append(sentences[i-1])
+                if i < len(sentences) - 1:
+                    current_context.append(sentences[i+1])
+            else:
+                current_chunk.append(sentence)
+
+            # Create chunk when we have enough content or reach a natural break
+            if (len(current_chunk) >= 3 and
+                (i == len(sentences) - 1 or  # Last sentence
+                 sentence.endswith('.') and len(current_chunk) >= 5)):  # Natural break
+
+                # Combine chunk with context
+                full_content = current_context + current_chunk
+                chunk_content = ' '.join(full_content)
+
+                chunks.append(self._create_chunk(
+                    content=chunk_content,
+                    filename=filename,
+                    section=f"QA Section {len(chunks)+1}",
+                    metadata={
+                        'chunk_method': 'qa_optimized',
+                        'chunk_index': len(chunks),
+                        'has_question': any('?' in s for s in current_chunk),
+                        'has_process': any(re.search(r'(step|process|method)', s.lower()) for s in current_chunk),
+                        'sentence_count': len(full_content)
+                    }
+                ))
+
+                current_chunk = []
+                current_context = []
+
+        # Handle remaining content
+        if current_chunk:
+            chunk_content = ' '.join(current_context + current_chunk)
+            chunks.append(self._create_chunk(
+                content=chunk_content,
+                filename=filename,
+                section=f"QA Section {len(chunks)+1}",
+                metadata={
+                    'chunk_method': 'qa_optimized',
+                    'chunk_index': len(chunks),
+                    'sentence_count': len(current_context + current_chunk)
+                }
+            ))
+
+        return chunks if chunks else [self._create_chunk(content, filename, "QA Section 1",
+                                                         metadata={'chunk_method': 'qa_optimized', 'chunk_index': 0})]
+
```
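The Q&A heuristics are regular expressions applied to lowercased sentences. A quick check of which sentences the patterns from the hunk would flag (the sample sentences are invented):

```python
import re

# Patterns copied from the hunk above.
question_patterns = [
    r'\?',
    r'^(what|how|why|when|where|who|which|can|does|is|are|will|would|should)',
    r'(step|steps|process|procedure|method|way to)',
    r'(example|examples|instance|case)',
    r'(definition|meaning|refers to|means)',
]

sentences = [
    "How do I install the agent SDK?",        # question word and '?'
    "Step 1: create a virtual environment.",  # process indicator
    "The weather was pleasant yesterday.",    # no indicator
]

for s in sentences:
    flagged = any(re.search(p, s.lower().strip()) for p in question_patterns)
    print(flagged, s)
# Prints True, True, False for the three sentences above.
```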
DocumentProcessor._chunk_from_json:

```diff
+    def _chunk_from_json(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """
+        Create chunks from pre-processed JSON content
+
+        This strategy expects content to be a JSON string with the following structure:
+        {
+            "chunks": [
+                {
+                    "chunk_id": "unique_id",
+                    "type": "content|toc",
+                    "content": "text content",
+                    "metadata": {
+                        "url": "https://...",
+                        "section_number": 1,
+                        "related_toc": "toc_id",
+                        ...
+                    }
+                },
+                ...
+            ]
+        }
+
+        Args:
+            content: JSON string containing pre-chunked content
+            filename: Name of the source file
+            file_type: Should be 'json'
+
+        Returns:
+            List of chunk dictionaries formatted for the search index
+        """
+        try:
+            # Parse JSON content
+            data = json.loads(content)
+
+            if not isinstance(data, dict) or 'chunks' not in data:
+                logger.error(f"Invalid JSON structure in {filename}: expected 'chunks' key")
+                # Fallback to treating it as plain text
+                return self._chunk_by_sentences(content, filename, file_type)
+
+            chunks = []
+            for idx, json_chunk in enumerate(data['chunks']):
+                if not isinstance(json_chunk, dict) or 'content' not in json_chunk:
+                    logger.warning(f"Skipping invalid chunk {idx} in {filename}")
+                    continue
+
+                # Extract metadata from JSON chunk
+                json_metadata = json_chunk.get('metadata', {})
+                chunk_type = json_chunk.get('type', 'content')
+
+                # Build chunk metadata (excluding tags which go at top level)
+                metadata = {
+                    'chunk_method': 'json',
+                    'chunk_index': idx,
+                    'chunk_type': chunk_type,
+                    'original_chunk_id': json_chunk.get('chunk_id', f'chunk_{idx}')
+                }
+
+                # Extract tags before merging metadata
+                tags = json_metadata.get('tags', [])
+
+                # Merge JSON metadata (this includes all fields including tags)
+                # We'll keep tags in metadata for backward compatibility but also set at top level
+                metadata.update(json_metadata)
+
+                # Determine section name
+                if chunk_type == 'toc':
+                    section = f"TOC: {json_chunk.get('content', '')[:50]}"
+                else:
+                    section = json_metadata.get('section', f"Section {json_metadata.get('section_number', idx + 1)}")
+
+                # Create chunk with proper structure
+                chunk = self._create_chunk(
+                    content=json_chunk['content'],
+                    filename=filename,
+                    section=section,
+                    metadata=metadata
+                )
+
+                # Set tags at the top level for proper tag filtering
+                if tags:
+                    chunk['tags'] = tags
+                elif chunk_type == 'toc':
+                    # For TOC entries, add special tags if none provided
+                    chunk['tags'] = ['toc', 'navigation']
+
+                chunks.append(chunk)
+
+            if not chunks:
+                logger.warning(f"No valid chunks found in JSON file {filename}")
+                return self._chunk_by_sentences(str(data), filename, file_type)
+
+            logger.info(f"Created {len(chunks)} chunks from JSON file {filename}")
+            return chunks
+
+        except json.JSONDecodeError as e:
+            logger.error(f"Failed to parse JSON in {filename}: {e}")
+            # Fallback to sentence chunking
+            return self._chunk_by_sentences(content, filename, file_type)
+        except Exception as e:
+            logger.error(f"Unexpected error processing JSON chunks in {filename}: {e}")
+            return self._chunk_by_sentences(content, filename, file_type)
```