signalwire-agents 0.1.13__py3-none-any.whl → 1.0.17.dev4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. signalwire_agents/__init__.py +99 -15
  2. signalwire_agents/agent_server.py +248 -60
  3. signalwire_agents/agents/bedrock.py +296 -0
  4. signalwire_agents/cli/__init__.py +9 -0
  5. signalwire_agents/cli/build_search.py +951 -41
  6. signalwire_agents/cli/config.py +80 -0
  7. signalwire_agents/cli/core/__init__.py +10 -0
  8. signalwire_agents/cli/core/agent_loader.py +470 -0
  9. signalwire_agents/cli/core/argparse_helpers.py +179 -0
  10. signalwire_agents/cli/core/dynamic_config.py +71 -0
  11. signalwire_agents/cli/core/service_loader.py +303 -0
  12. signalwire_agents/cli/dokku.py +2320 -0
  13. signalwire_agents/cli/execution/__init__.py +10 -0
  14. signalwire_agents/cli/execution/datamap_exec.py +446 -0
  15. signalwire_agents/cli/execution/webhook_exec.py +134 -0
  16. signalwire_agents/cli/init_project.py +2636 -0
  17. signalwire_agents/cli/output/__init__.py +10 -0
  18. signalwire_agents/cli/output/output_formatter.py +255 -0
  19. signalwire_agents/cli/output/swml_dump.py +186 -0
  20. signalwire_agents/cli/simulation/__init__.py +10 -0
  21. signalwire_agents/cli/simulation/data_generation.py +374 -0
  22. signalwire_agents/cli/simulation/data_overrides.py +200 -0
  23. signalwire_agents/cli/simulation/mock_env.py +282 -0
  24. signalwire_agents/cli/swaig_test_wrapper.py +52 -0
  25. signalwire_agents/cli/test_swaig.py +566 -2366
  26. signalwire_agents/cli/types.py +81 -0
  27. signalwire_agents/core/__init__.py +2 -2
  28. signalwire_agents/core/agent/__init__.py +12 -0
  29. signalwire_agents/core/agent/config/__init__.py +12 -0
  30. signalwire_agents/core/agent/deployment/__init__.py +9 -0
  31. signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
  32. signalwire_agents/core/agent/prompt/__init__.py +14 -0
  33. signalwire_agents/core/agent/prompt/manager.py +306 -0
  34. signalwire_agents/core/agent/routing/__init__.py +9 -0
  35. signalwire_agents/core/agent/security/__init__.py +9 -0
  36. signalwire_agents/core/agent/swml/__init__.py +9 -0
  37. signalwire_agents/core/agent/tools/__init__.py +15 -0
  38. signalwire_agents/core/agent/tools/decorator.py +97 -0
  39. signalwire_agents/core/agent/tools/registry.py +210 -0
  40. signalwire_agents/core/agent_base.py +845 -2916
  41. signalwire_agents/core/auth_handler.py +233 -0
  42. signalwire_agents/core/config_loader.py +259 -0
  43. signalwire_agents/core/contexts.py +418 -0
  44. signalwire_agents/core/data_map.py +3 -15
  45. signalwire_agents/core/function_result.py +116 -44
  46. signalwire_agents/core/logging_config.py +162 -18
  47. signalwire_agents/core/mixins/__init__.py +28 -0
  48. signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
  49. signalwire_agents/core/mixins/auth_mixin.py +280 -0
  50. signalwire_agents/core/mixins/prompt_mixin.py +358 -0
  51. signalwire_agents/core/mixins/serverless_mixin.py +460 -0
  52. signalwire_agents/core/mixins/skill_mixin.py +55 -0
  53. signalwire_agents/core/mixins/state_mixin.py +153 -0
  54. signalwire_agents/core/mixins/tool_mixin.py +230 -0
  55. signalwire_agents/core/mixins/web_mixin.py +1142 -0
  56. signalwire_agents/core/security_config.py +333 -0
  57. signalwire_agents/core/skill_base.py +84 -1
  58. signalwire_agents/core/skill_manager.py +62 -20
  59. signalwire_agents/core/swaig_function.py +18 -5
  60. signalwire_agents/core/swml_builder.py +207 -11
  61. signalwire_agents/core/swml_handler.py +27 -21
  62. signalwire_agents/core/swml_renderer.py +123 -312
  63. signalwire_agents/core/swml_service.py +171 -203
  64. signalwire_agents/mcp_gateway/__init__.py +29 -0
  65. signalwire_agents/mcp_gateway/gateway_service.py +564 -0
  66. signalwire_agents/mcp_gateway/mcp_manager.py +513 -0
  67. signalwire_agents/mcp_gateway/session_manager.py +218 -0
  68. signalwire_agents/prefabs/concierge.py +0 -3
  69. signalwire_agents/prefabs/faq_bot.py +0 -3
  70. signalwire_agents/prefabs/info_gatherer.py +0 -3
  71. signalwire_agents/prefabs/receptionist.py +0 -3
  72. signalwire_agents/prefabs/survey.py +0 -3
  73. signalwire_agents/schema.json +9218 -5489
  74. signalwire_agents/search/__init__.py +7 -1
  75. signalwire_agents/search/document_processor.py +490 -31
  76. signalwire_agents/search/index_builder.py +307 -37
  77. signalwire_agents/search/migration.py +418 -0
  78. signalwire_agents/search/models.py +30 -0
  79. signalwire_agents/search/pgvector_backend.py +748 -0
  80. signalwire_agents/search/query_processor.py +162 -31
  81. signalwire_agents/search/search_engine.py +916 -35
  82. signalwire_agents/search/search_service.py +376 -53
  83. signalwire_agents/skills/README.md +452 -0
  84. signalwire_agents/skills/__init__.py +14 -2
  85. signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
  86. signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
  87. signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
  88. signalwire_agents/skills/datasphere/README.md +210 -0
  89. signalwire_agents/skills/datasphere/skill.py +84 -3
  90. signalwire_agents/skills/datasphere_serverless/README.md +258 -0
  91. signalwire_agents/skills/datasphere_serverless/__init__.py +9 -0
  92. signalwire_agents/skills/datasphere_serverless/skill.py +82 -1
  93. signalwire_agents/skills/datetime/README.md +132 -0
  94. signalwire_agents/skills/datetime/__init__.py +9 -0
  95. signalwire_agents/skills/datetime/skill.py +20 -7
  96. signalwire_agents/skills/joke/README.md +149 -0
  97. signalwire_agents/skills/joke/__init__.py +9 -0
  98. signalwire_agents/skills/joke/skill.py +21 -0
  99. signalwire_agents/skills/math/README.md +161 -0
  100. signalwire_agents/skills/math/__init__.py +9 -0
  101. signalwire_agents/skills/math/skill.py +18 -4
  102. signalwire_agents/skills/mcp_gateway/README.md +230 -0
  103. signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
  104. signalwire_agents/skills/mcp_gateway/skill.py +421 -0
  105. signalwire_agents/skills/native_vector_search/README.md +210 -0
  106. signalwire_agents/skills/native_vector_search/__init__.py +9 -0
  107. signalwire_agents/skills/native_vector_search/skill.py +569 -101
  108. signalwire_agents/skills/play_background_file/README.md +218 -0
  109. signalwire_agents/skills/play_background_file/__init__.py +12 -0
  110. signalwire_agents/skills/play_background_file/skill.py +242 -0
  111. signalwire_agents/skills/registry.py +395 -40
  112. signalwire_agents/skills/spider/README.md +236 -0
  113. signalwire_agents/skills/spider/__init__.py +13 -0
  114. signalwire_agents/skills/spider/skill.py +598 -0
  115. signalwire_agents/skills/swml_transfer/README.md +395 -0
  116. signalwire_agents/skills/swml_transfer/__init__.py +10 -0
  117. signalwire_agents/skills/swml_transfer/skill.py +359 -0
  118. signalwire_agents/skills/weather_api/README.md +178 -0
  119. signalwire_agents/skills/weather_api/__init__.py +12 -0
  120. signalwire_agents/skills/weather_api/skill.py +191 -0
  121. signalwire_agents/skills/web_search/README.md +163 -0
  122. signalwire_agents/skills/web_search/__init__.py +9 -0
  123. signalwire_agents/skills/web_search/skill.py +586 -112
  124. signalwire_agents/skills/wikipedia_search/README.md +228 -0
  125. signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
  126. signalwire_agents/skills/{wikipedia → wikipedia_search}/skill.py +33 -3
  127. signalwire_agents/web/__init__.py +17 -0
  128. signalwire_agents/web/web_service.py +559 -0
  129. signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-agent-init.1 +400 -0
  130. signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-search.1 +483 -0
  131. signalwire_agents-1.0.17.dev4.data/data/share/man/man1/swaig-test.1 +308 -0
  132. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/METADATA +347 -215
  133. signalwire_agents-1.0.17.dev4.dist-info/RECORD +147 -0
  134. signalwire_agents-1.0.17.dev4.dist-info/entry_points.txt +6 -0
  135. signalwire_agents/core/state/file_state_manager.py +0 -219
  136. signalwire_agents/core/state/state_manager.py +0 -101
  137. signalwire_agents/skills/wikipedia/__init__.py +0 -9
  138. signalwire_agents-0.1.13.data/data/schema.json +0 -5611
  139. signalwire_agents-0.1.13.dist-info/RECORD +0 -67
  140. signalwire_agents-0.1.13.dist-info/entry_points.txt +0 -3
  141. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/WHEEL +0 -0
  142. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/licenses/LICENSE +0 -0
  143. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/top_level.txt +0 -0
@@ -68,6 +68,8 @@ if _SEARCH_AVAILABLE:
         from .index_builder import IndexBuilder
         from .search_engine import SearchEngine
         from .search_service import SearchService
+        from .models import MODEL_ALIASES, DEFAULT_MODEL, resolve_model_alias
+        from .migration import SearchIndexMigrator
 
         __all__ = [
             'preprocess_query',
@@ -75,7 +77,11 @@ if _SEARCH_AVAILABLE:
             'DocumentProcessor',
             'IndexBuilder',
             'SearchEngine',
-            'SearchService'
+            'SearchService',
+            'MODEL_ALIASES',
+            'DEFAULT_MODEL',
+            'resolve_model_alias',
+            'SearchIndexMigrator'
         ]
     except ImportError as e:
         # Some search components failed to import
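
Taken together, the two hunks above surface the new model-alias and index-migration APIs at package level. A minimal sketch of the imports this enables, assuming the optional search dependencies are installed; the concrete alias names and the resolve_model_alias signature are not shown in this diff, so the loop below is illustrative only:

    # Illustrative only: the contents of MODEL_ALIASES are not part of this diff.
    from signalwire_agents.search import MODEL_ALIASES, DEFAULT_MODEL, resolve_model_alias

    print(DEFAULT_MODEL)              # the package's default embedding model
    for alias in sorted(MODEL_ALIASES):
        print(alias, '->', resolve_model_alias(alias))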
@@ -74,29 +74,51 @@ logger = logging.getLogger(__name__)
 class DocumentProcessor:
     """Enhanced document processor with smart chunking capabilities"""
 
-    def __init__(self, chunking_strategy: str = 'sentence',
-                 max_sentences_per_chunk: int = 50,
-                 chunk_size: int = 50,
-                 overlap_size: int = 10,
-                 split_newlines: Optional[int] = None):
+    def __init__(
+        self,
+        chunking_strategy: str = 'sentence',
+        max_sentences_per_chunk: int = 5,
+        chunk_size: int = 50,
+        chunk_overlap: int = 10,
+        split_newlines: Optional[int] = None,
+        index_nlp_backend: str = 'nltk',
+        verbose: bool = False,
+        semantic_threshold: float = 0.5,
+        topic_threshold: float = 0.3
+    ):
         """
-        Initialize document processor with chunking strategy
-
+        Initialize document processor
+
         Args:
-            chunking_strategy: 'sentence', 'sliding', 'paragraph', or 'page'
-            max_sentences_per_chunk: For sentence strategy (default: 50)
+            chunking_strategy: Strategy for chunking documents:
+                - 'sentence': Sentence-based chunking with overlap
+                - 'sliding': Sliding window with word-based chunks
+                - 'paragraph': Natural paragraph boundaries
+                - 'page': Page-based chunking (for PDFs)
+                - 'semantic': Semantic similarity-based chunking
+                - 'topic': Topic modeling-based chunking
+                - 'qa': Question-answer optimized chunking
+                - 'json': JSON structure-aware chunking
+                - 'markdown': Markdown structure-aware chunking with code block detection
+            max_sentences_per_chunk: For sentence strategy (default: 5)
             chunk_size: For sliding strategy - words per chunk (default: 50)
-            overlap_size: For sliding strategy - overlap in words (default: 10)
+            chunk_overlap: For sliding strategy - overlap in words (default: 10)
             split_newlines: For sentence strategy - split on multiple newlines (optional)
+            index_nlp_backend: NLP backend for indexing (default: 'nltk')
+            verbose: Whether to enable verbose logging (default: False)
+            semantic_threshold: Similarity threshold for semantic chunking (default: 0.5)
+            topic_threshold: Similarity threshold for topic chunking (default: 0.3)
         """
         self.chunking_strategy = chunking_strategy
         self.max_sentences_per_chunk = max_sentences_per_chunk
         self.chunk_size = chunk_size
-        self.overlap_size = overlap_size
+        self.chunk_overlap = chunk_overlap
         self.split_newlines = split_newlines
+        self.semantic_threshold = semantic_threshold
+        self.topic_threshold = topic_threshold
 
         # Legacy support for old character-based chunking
-        self.chunk_overlap = overlap_size
+        self.chunk_overlap = chunk_overlap
 
     def create_chunks(self, content: str, filename: str,
                       file_type: str) -> List[Dict[str, Any]]:
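
The full new signature appears above, so constructing the processor with the renamed and added parameters looks like the sketch below. Note the rename from overlap_size to chunk_overlap and the max_sentences_per_chunk default dropping from 50 to 5:

    from signalwire_agents.search.document_processor import DocumentProcessor

    proc = DocumentProcessor(
        chunking_strategy='semantic',
        semantic_threshold=0.5,   # split where adjacent-sentence similarity falls below this
        topic_threshold=0.3,      # consulted only by the 'topic' strategy
        chunk_overlap=10,         # renamed from overlap_size
    )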
@@ -121,6 +143,17 @@ class DocumentProcessor:
             return self._chunk_by_paragraphs(content, filename, file_type)
         elif self.chunking_strategy == 'page':
             return self._chunk_by_pages(content, filename, file_type)
+        elif self.chunking_strategy == 'semantic':
+            return self._chunk_by_semantic(content, filename, file_type)
+        elif self.chunking_strategy == 'topic':
+            return self._chunk_by_topics(content, filename, file_type)
+        elif self.chunking_strategy == 'qa':
+            return self._chunk_by_qa_optimization(content, filename, file_type)
+        elif self.chunking_strategy == 'json':
+            return self._chunk_from_json(content, filename, file_type)
+        elif self.chunking_strategy == 'markdown':
+            # Use markdown-aware chunking for better structure preservation
+            return self._chunk_markdown_enhanced(content, filename)
         else:
             # Fallback to sentence-based chunking
             return self._chunk_by_sentences(content, filename, file_type)
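
create_chunks then routes purely on that strategy string, so switching strategies needs no other code changes. A sketch driving the new 'markdown' path (the input file is assumed):

    proc = DocumentProcessor(chunking_strategy='markdown')
    with open('README.md') as f:   # any markdown document
        chunks = proc.create_chunks(f.read(), filename='README.md', file_type='md')
    for c in chunks:
        # per the markdown hunks below, chunks carry section paths and code tags
        print(c.get('section'), c.get('metadata', {}).get('tags'))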
@@ -318,75 +351,114 @@ class DocumentProcessor:
         return chunks
 
     def _chunk_markdown_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:
-        """Enhanced markdown chunking with better header handling"""
+        """Enhanced markdown chunking with code block detection and rich metadata
+
+        Features:
+        - Tracks header hierarchy for section paths
+        - Detects code blocks and extracts language
+        - Adds 'code' tags to chunks containing code
+        - Preserves markdown structure for better search
+        """
         chunks = []
         lines = content.split('\n')
-
+
         current_section = None
         current_hierarchy = []  # Track header hierarchy
         current_chunk = []
         current_size = 0
         line_start = 1
-
+        in_code_block = False
+        code_languages = []  # Track languages in current chunk
+        has_code = False
+
         for line_num, line in enumerate(lines, 1):
+            # Check for code block fences
+            code_fence_match = re.match(r'^```(\w+)?', line)
+            if code_fence_match:
+                in_code_block = not in_code_block
+                if in_code_block:
+                    # Starting code block
+                    has_code = True
+                    lang = code_fence_match.group(1)
+                    if lang and lang not in code_languages:
+                        code_languages.append(lang)
+
             # Check for headers with hierarchy tracking
-            header_match = re.match(r'^(#{1,6})\s+(.+)', line)
+            header_match = re.match(r'^(#{1,6})\s+(.+)', line) if not in_code_block else None
             if header_match:
                 header_level = len(header_match.group(1))
                 header_text = header_match.group(2).strip()
-
+
                 # Save current chunk if it exists
                 if current_chunk:
+                    chunk_metadata = self._build_markdown_metadata(
+                        current_hierarchy, code_languages, has_code
+                    )
                     chunks.append(self._create_chunk(
                         content='\n'.join(current_chunk),
                         filename=filename,
                         section=self._build_section_path(current_hierarchy),
                         start_line=line_start,
-                        end_line=line_num - 1
+                        end_line=line_num - 1,
+                        metadata=chunk_metadata
                     ))
-
+
                 # Update hierarchy
                 current_hierarchy = current_hierarchy[:header_level-1] + [header_text]
                 current_section = header_text
                 current_chunk = [line]
                 current_size = len(line)
                 line_start = line_num
-
+                code_languages = []
+                has_code = False
+
             else:
                 current_chunk.append(line)
                 current_size += len(line) + 1
-
+
                 # Check if chunk is getting too large - use smart splitting
-                if current_size >= self.chunk_size:
+                # But don't split inside code blocks
+                if current_size >= self.chunk_size and not in_code_block:
                     # Try to split at paragraph boundary first
                     split_point = self._find_best_split_point(current_chunk)
-
+
                     chunk_to_save = current_chunk[:split_point]
+                    chunk_metadata = self._build_markdown_metadata(
+                        current_hierarchy, code_languages, has_code
+                    )
                     chunks.append(self._create_chunk(
                         content='\n'.join(chunk_to_save),
                         filename=filename,
                         section=self._build_section_path(current_hierarchy),
                         start_line=line_start,
-                        end_line=line_start + split_point - 1
+                        end_line=line_start + split_point - 1,
+                        metadata=chunk_metadata
                     ))
-
+
                     # Start new chunk with overlap
                     overlap_lines = self._get_overlap_lines(chunk_to_save)
                     remaining_lines = current_chunk[split_point:]
                     current_chunk = overlap_lines + remaining_lines
                     current_size = sum(len(line) + 1 for line in current_chunk)
                     line_start = line_start + split_point - len(overlap_lines)
-
+                    # Reset code tracking for new chunk
+                    code_languages = []
+                    has_code = False
+
         # Add final chunk
         if current_chunk:
+            chunk_metadata = self._build_markdown_metadata(
+                current_hierarchy, code_languages, has_code
+            )
             chunks.append(self._create_chunk(
                 content='\n'.join(current_chunk),
                 filename=filename,
                 section=self._build_section_path(current_hierarchy),
                 start_line=line_start,
-                end_line=len(lines)
+                end_line=len(lines),
+                metadata=chunk_metadata
             ))
-
+
         return chunks
 
     def _chunk_python_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:
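
The in_code_block guard above is what keeps '#' lines inside fenced blocks from being mistaken for markdown headers. A standalone hand-run of just that fence/header logic:

    import re

    lines = ["# Setup", "```python", "# a comment, not a header", "```"]
    in_code_block = False
    headers = []
    for line in lines:
        if re.match(r'^```(\w+)?', line):
            in_code_block = not in_code_block
        m = re.match(r'^(#{1,6})\s+(.+)', line) if not in_code_block else None
        if m:
            headers.append(m.group(2))
    print(headers)   # ['Setup'] - the comment inside the fence is ignored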
@@ -554,6 +626,49 @@ class DocumentProcessor:
     def _build_section_path(self, hierarchy: List[str]) -> str:
         """Build hierarchical section path from header hierarchy"""
         return ' > '.join(hierarchy) if hierarchy else None
+
+    def _build_markdown_metadata(self, hierarchy: List[str], code_languages: List[str], has_code: bool) -> Dict[str, Any]:
+        """Build rich metadata for markdown chunks
+
+        Args:
+            hierarchy: Current header hierarchy (e.g., ['Installation', 'Requirements', 'Python'])
+            code_languages: List of code block languages found in chunk (e.g., ['python', 'bash'])
+            has_code: Whether chunk contains any code blocks
+
+        Returns:
+            Dictionary with markdown-specific metadata including tags
+        """
+        metadata = {
+            'chunk_type': 'markdown',
+        }
+
+        # Add header level metadata
+        if hierarchy:
+            for i, header in enumerate(hierarchy, 1):
+                metadata[f'h{i}'] = header
+
+        # Add code-related metadata
+        if has_code:
+            metadata['has_code'] = True
+            if code_languages:
+                metadata['code_languages'] = code_languages
+
+        # Build tags for enhanced searching
+        tags = []
+        if has_code:
+            tags.append('code')
+            # Add language-specific tags
+            for lang in code_languages:
+                tags.append(f'code:{lang}')
+
+        # Add tags for header levels (searchable by section depth)
+        if len(hierarchy) > 0:
+            tags.append(f'depth:{len(hierarchy)}')
+
+        if tags:
+            metadata['tags'] = tags
+
+        return metadata
 
     def _build_python_section(self, class_name: Optional[str], function_name: Optional[str]) -> str:
         """Build section name for Python code"""
@@ -674,7 +789,7 @@ class DocumentProcessor:
         chunk_index = 0
 
         # Create overlapping chunks
-        for i in range(0, len(words), self.chunk_size - self.overlap_size):
+        for i in range(0, len(words), self.chunk_size - self.chunk_overlap):
             chunk_words = words[i:i + self.chunk_size]
             if chunk_words:
                 chunk_content = ' '.join(chunk_words)
@@ -686,7 +801,7 @@ class DocumentProcessor:
                     'chunk_method': 'sliding_window',
                     'chunk_index': chunk_index,
                     'chunk_size_words': self.chunk_size,
-                    'overlap_size_words': self.overlap_size,
+                    'overlap_size_words': self.chunk_overlap,
                     'start_word': i,
                     'end_word': i + len(chunk_words)
                 }
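
The rename is mechanical, but the stride it controls is worth spelling out: the loop advances by chunk_size - chunk_overlap words, so adjacent chunks share chunk_overlap words. Worked with the defaults:

    chunk_size, chunk_overlap = 50, 10        # the defaults above
    words = [f"w{i}" for i in range(120)]     # stand-in for a tokenized document
    starts = list(range(0, len(words), chunk_size - chunk_overlap))
    print(starts)   # [0, 40, 80]
    # each chunk is words[i:i + chunk_size]; consecutive chunks overlap by 10 words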
@@ -761,4 +876,348 @@ class DocumentProcessor:
             }
         ))
 
-        return chunks
+        return chunks
+
+    def _chunk_by_semantic(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Chunk based on semantic similarity between sentences"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        # Get sentences
+        if sent_tokenize:
+            sentences = sent_tokenize(content)
+        else:
+            sentences = content.split('. ')
+            sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+        if len(sentences) <= 1:
+            return [self._create_chunk(content, filename, "Section 1",
+                                       metadata={'chunk_method': 'semantic', 'chunk_index': 0})]
+
+        # Generate embeddings for sentences (using the same model as the index)
+        try:
+            from sentence_transformers import SentenceTransformer
+            from sklearn.metrics.pairwise import cosine_similarity
+            import numpy as np
+
+            model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
+            embeddings = model.encode(sentences, show_progress_bar=False)
+
+            # Calculate similarity between adjacent sentences
+            similarities = []
+            for i in range(len(embeddings) - 1):
+                sim = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
+                similarities.append(sim)
+
+            # Find split points where similarity drops below threshold
+            split_points = [0]
+            for i, sim in enumerate(similarities):
+                if sim < self.semantic_threshold:
+                    split_points.append(i + 1)
+            split_points.append(len(sentences))
+
+            # Create chunks
+            chunks = []
+            for i in range(len(split_points) - 1):
+                start_idx = split_points[i]
+                end_idx = split_points[i + 1]
+                chunk_sentences = sentences[start_idx:end_idx]
+
+                # Ensure minimum chunk size
+                if len(chunk_sentences) < 2 and i > 0:
+                    # Merge with previous chunk
+                    chunks[-1]['content'] += ' ' + ' '.join(chunk_sentences)
+                    continue
+
+                chunk_content = ' '.join(chunk_sentences)
+                chunks.append(self._create_chunk(
+                    content=chunk_content,
+                    filename=filename,
+                    section=f"Semantic Section {i+1}",
+                    metadata={
+                        'chunk_method': 'semantic',
+                        'chunk_index': i,
+                        'semantic_threshold': self.semantic_threshold,
+                        'sentence_count': len(chunk_sentences)
+                    }
+                ))
+
+            return chunks if chunks else [self._create_chunk(content, filename, "Section 1",
+                                          metadata={'chunk_method': 'semantic', 'chunk_index': 0})]
+
+        except ImportError:
+            # Fallback to sentence-based chunking
+            return self._chunk_by_sentences(content, filename, file_type)
+
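
For intuition, the split-point selection in _chunk_by_semantic reduces to a threshold test on adjacent-sentence similarities. A hand-run with made-up similarity values (real ones come from the embedding model):

    semantic_threshold = 0.5                  # the default above
    similarities = [0.81, 0.42, 0.77, 0.30]   # between sentences (0,1), (1,2), ...
    split_points = [0]
    for i, sim in enumerate(similarities):
        if sim < semantic_threshold:
            split_points.append(i + 1)
    split_points.append(5)                    # total sentence count
    print(split_points)   # [0, 2, 4, 5] -> chunks s0-s1, s2-s3, s4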
+    def _chunk_by_topics(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Chunk based on topic changes using keyword analysis"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        if sent_tokenize:
+            sentences = sent_tokenize(content)
+        else:
+            sentences = content.split('. ')
+            sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+        if len(sentences) <= 3:
+            return [self._create_chunk(content, filename, "Topic 1",
+                                       metadata={'chunk_method': 'topic', 'chunk_index': 0})]
+
+        try:
+            # Simple topic detection using keyword overlap
+            from collections import Counter
+            import re
+
+            # Extract keywords from each sentence
+            sentence_keywords = []
+            for sentence in sentences:
+                # Simple keyword extraction (could be enhanced with NLP)
+                words = re.findall(r'\b[a-zA-Z]{3,}\b', sentence.lower())
+                # Filter common words (basic stopwords)
+                stopwords = {'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had', 'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his', 'how', 'its', 'may', 'new', 'now', 'old', 'see', 'two', 'who', 'boy', 'did', 'man', 'way', 'she', 'use', 'her', 'many', 'oil', 'sit', 'set', 'run', 'eat', 'far', 'sea', 'eye', 'ask', 'own', 'say', 'too', 'any', 'try', 'us', 'an', 'as', 'at', 'be', 'he', 'if', 'in', 'is', 'it', 'my', 'of', 'on', 'or', 'to', 'up', 'we', 'go', 'no', 'so', 'am', 'by', 'do', 'me'}
+                keywords = [w for w in words if w not in stopwords and len(w) > 3]
+                sentence_keywords.append(set(keywords))
+
+            # Find topic boundaries based on keyword overlap
+            chunks = []
+            current_chunk = [sentences[0]]
+            current_keywords = sentence_keywords[0]
+
+            for i in range(1, len(sentences)):
+                # Calculate keyword overlap with current chunk
+                overlap = len(current_keywords.intersection(sentence_keywords[i]))
+                total_keywords = len(current_keywords.union(sentence_keywords[i]))
+
+                if total_keywords > 0:
+                    similarity = overlap / total_keywords
+                else:
+                    similarity = 0
+
+                # If similarity is low, start new chunk
+                if similarity < self.topic_threshold and len(current_chunk) >= 2:
+                    chunk_content = ' '.join(current_chunk)
+                    chunks.append(self._create_chunk(
+                        content=chunk_content,
+                        filename=filename,
+                        section=f"Topic {len(chunks)+1}",
+                        metadata={
+                            'chunk_method': 'topic',
+                            'chunk_index': len(chunks),
+                            'topic_keywords': list(current_keywords)[:10],  # Top keywords
+                            'sentence_count': len(current_chunk),
+                            'topic_threshold': self.topic_threshold
+                        }
+                    ))
+                    current_chunk = [sentences[i]]
+                    current_keywords = sentence_keywords[i]
+                else:
+                    current_chunk.append(sentences[i])
+                    current_keywords = current_keywords.union(sentence_keywords[i])
+
+            # Add final chunk
+            if current_chunk:
+                chunk_content = ' '.join(current_chunk)
+                chunks.append(self._create_chunk(
+                    content=chunk_content,
+                    filename=filename,
+                    section=f"Topic {len(chunks)+1}",
+                    metadata={
+                        'chunk_method': 'topic',
+                        'chunk_index': len(chunks),
+                        'topic_keywords': list(current_keywords)[:10],
+                        'sentence_count': len(current_chunk),
+                        'topic_threshold': self.topic_threshold
+                    }
+                ))
+
+            return chunks if chunks else [self._create_chunk(content, filename, "Topic 1",
+                                          metadata={'chunk_method': 'topic', 'chunk_index': 0})]
+
+        except Exception:
+            # Fallback to sentence-based chunking
+            return self._chunk_by_sentences(content, filename, file_type)
+
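
The boundary test in _chunk_by_topics is Jaccard similarity over keyword sets, worked here by hand:

    current_keywords = {'chunking', 'overlap', 'sentences'}
    next_keywords = {'embeddings', 'model', 'sentences'}
    overlap = len(current_keywords & next_keywords)   # 1 ('sentences')
    total = len(current_keywords | next_keywords)     # 5
    similarity = overlap / total if total else 0      # 0.2
    print(similarity < 0.3)   # True -> start a new topic chunk at the default threshold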
+    def _chunk_by_qa_optimization(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Create chunks optimized for question-answering"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        if sent_tokenize:
+            sentences = sent_tokenize(content)
+        else:
+            sentences = content.split('. ')
+            sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+        # Patterns that indicate Q&A structure
+        question_patterns = [
+            r'\?',  # Questions
+            r'^(what|how|why|when|where|who|which|can|does|is|are|will|would|should)',
+            r'(step|steps|process|procedure|method|way to)',
+            r'(example|examples|instance|case)',
+            r'(definition|meaning|refers to|means)',
+        ]
+
+        chunks = []
+        current_chunk = []
+        current_context = []
+
+        for i, sentence in enumerate(sentences):
+            sentence_lower = sentence.lower().strip()
+
+            # Check if this sentence contains Q&A indicators
+            is_qa_relevant = any(re.search(pattern, sentence_lower) for pattern in question_patterns)
+
+            if is_qa_relevant or len(current_chunk) == 0:
+                current_chunk.append(sentence)
+                # Add surrounding context (previous and next sentences)
+                if i > 0 and sentences[i-1] not in current_chunk:
+                    current_context.append(sentences[i-1])
+                if i < len(sentences) - 1:
+                    current_context.append(sentences[i+1])
+            else:
+                current_chunk.append(sentence)
+
+            # Create chunk when we have enough content or reach a natural break
+            if (len(current_chunk) >= 3 and
+                (i == len(sentences) - 1 or  # Last sentence
+                 sentence.endswith('.') and len(current_chunk) >= 5)):  # Natural break
+
+                # Combine chunk with context
+                full_content = current_context + current_chunk
+                chunk_content = ' '.join(full_content)
+
+                chunks.append(self._create_chunk(
+                    content=chunk_content,
+                    filename=filename,
+                    section=f"QA Section {len(chunks)+1}",
+                    metadata={
+                        'chunk_method': 'qa_optimized',
+                        'chunk_index': len(chunks),
+                        'has_question': any('?' in s for s in current_chunk),
+                        'has_process': any(re.search(r'(step|process|method)', s.lower()) for s in current_chunk),
+                        'sentence_count': len(full_content)
+                    }
+                ))
+
+                current_chunk = []
+                current_context = []
+
+        # Handle remaining content
+        if current_chunk:
+            chunk_content = ' '.join(current_context + current_chunk)
+            chunks.append(self._create_chunk(
+                content=chunk_content,
+                filename=filename,
+                section=f"QA Section {len(chunks)+1}",
+                metadata={
+                    'chunk_method': 'qa_optimized',
+                    'chunk_index': len(chunks),
+                    'sentence_count': len(current_context + current_chunk)
+                }
+            ))
+
+        return chunks if chunks else [self._create_chunk(content, filename, "QA Section 1",
+                                      metadata={'chunk_method': 'qa_optimized', 'chunk_index': 0})]
+
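
The question_patterns list drives _chunk_by_qa_optimization; a quick standalone check of which sentences it flags:

    import re

    question_patterns = [
        r'\?',
        r'^(what|how|why|when|where|who|which|can|does|is|are|will|would|should)',
        r'(step|steps|process|procedure|method|way to)',
        r'(example|examples|instance|case)',
        r'(definition|meaning|refers to|means)',
    ]

    for s in ["How do I install it?", "Step 2: run the builder.", "It rained yesterday."]:
        flagged = any(re.search(p, s.lower()) for p in question_patterns)
        print(s, '->', flagged)   # True, True, False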
+    def _chunk_from_json(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """
+        Create chunks from pre-processed JSON content
+
+        This strategy expects content to be a JSON string with the following structure:
+        {
+            "chunks": [
+                {
+                    "chunk_id": "unique_id",
+                    "type": "content|toc",
+                    "content": "text content",
+                    "metadata": {
+                        "url": "https://...",
+                        "section_number": 1,
+                        "related_toc": "toc_id",
+                        ...
+                    }
+                },
+                ...
+            ]
+        }
+
+        Args:
+            content: JSON string containing pre-chunked content
+            filename: Name of the source file
+            file_type: Should be 'json'
+
+        Returns:
+            List of chunk dictionaries formatted for the search index
+        """
+        try:
+            # Parse JSON content
+            data = json.loads(content)
+
+            if not isinstance(data, dict) or 'chunks' not in data:
+                logger.error(f"Invalid JSON structure in {filename}: expected 'chunks' key")
+                # Fallback to treating it as plain text
+                return self._chunk_by_sentences(content, filename, file_type)
+
+            chunks = []
+            for idx, json_chunk in enumerate(data['chunks']):
+                if not isinstance(json_chunk, dict) or 'content' not in json_chunk:
+                    logger.warning(f"Skipping invalid chunk {idx} in {filename}")
+                    continue
+
+                # Extract metadata from JSON chunk
+                json_metadata = json_chunk.get('metadata', {})
+                chunk_type = json_chunk.get('type', 'content')
+
+                # Build chunk metadata (excluding tags which go at top level)
+                metadata = {
+                    'chunk_method': 'json',
+                    'chunk_index': idx,
+                    'chunk_type': chunk_type,
+                    'original_chunk_id': json_chunk.get('chunk_id', f'chunk_{idx}')
+                }
+
+                # Extract tags before merging metadata
+                tags = json_metadata.get('tags', [])
+
+                # Merge JSON metadata (this includes all fields including tags)
+                # We'll keep tags in metadata for backward compatibility but also set at top level
+                metadata.update(json_metadata)
+
+                # Determine section name
+                if chunk_type == 'toc':
+                    section = f"TOC: {json_chunk.get('content', '')[:50]}"
+                else:
+                    section = json_metadata.get('section', f"Section {json_metadata.get('section_number', idx + 1)}")
+
+                # Create chunk with proper structure
+                chunk = self._create_chunk(
+                    content=json_chunk['content'],
+                    filename=filename,
+                    section=section,
+                    metadata=metadata
+                )
+
+                # Set tags at the top level for proper tag filtering
+                if tags:
+                    chunk['tags'] = tags
+                elif chunk_type == 'toc':
+                    # For TOC entries, add special tags if none provided
+                    chunk['tags'] = ['toc', 'navigation']
+
+                chunks.append(chunk)
+
+            if not chunks:
+                logger.warning(f"No valid chunks found in JSON file {filename}")
+                return self._chunk_by_sentences(str(data), filename, file_type)
+
+            logger.info(f"Created {len(chunks)} chunks from JSON file {filename}")
+            return chunks
+
+        except json.JSONDecodeError as e:
+            logger.error(f"Failed to parse JSON in {filename}: {e}")
+            # Fallback to sentence chunking
+            return self._chunk_by_sentences(content, filename, file_type)
+        except Exception as e:
+            logger.error(f"Unexpected error processing JSON chunks in {filename}: {e}")
+            return self._chunk_by_sentences(content, filename, file_type)
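
Finally, the 'json' strategy consumes pre-chunked content rather than producing chunks itself. A minimal end-to-end sketch matching the structure documented in the docstring above (the URL and ids are illustrative):

    import json
    from signalwire_agents.search.document_processor import DocumentProcessor

    payload = {
        "chunks": [
            {
                "chunk_id": "intro_1",
                "type": "content",
                "content": "SignalWire agents can search a local vector index.",
                "metadata": {
                    "url": "https://example.com/docs",
                    "section_number": 1,
                    "tags": ["docs", "search"],
                },
            }
        ]
    }

    proc = DocumentProcessor(chunking_strategy='json')
    chunks = proc.create_chunks(json.dumps(payload), filename='docs.json', file_type='json')
    print(chunks[0].get('tags'))   # ['docs', 'search'] - hoisted to the top level above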