signalwire-agents 0.1.51__tar.gz → 0.1.53__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144) hide show
  1. {signalwire_agents-0.1.51/signalwire_agents.egg-info → signalwire_agents-0.1.53}/PKG-INFO +11 -11
  2. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/pyproject.toml +11 -11
  3. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/__init__.py +1 -1
  4. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/build_search.py +22 -5
  5. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/schema.json +6 -2
  6. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/document_processor.py +112 -18
  7. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/search_engine.py +144 -104
  8. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53/signalwire_agents.egg-info}/PKG-INFO +11 -11
  9. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents.egg-info/requires.txt +10 -10
  10. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/LICENSE +0 -0
  11. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/README.md +0 -0
  12. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/setup.cfg +0 -0
  13. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/setup.py +0 -0
  14. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/agent_server.py +0 -0
  15. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/agents/bedrock.py +0 -0
  16. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/__init__.py +0 -0
  17. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/config.py +0 -0
  18. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/core/__init__.py +0 -0
  19. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/core/agent_loader.py +0 -0
  20. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/core/argparse_helpers.py +0 -0
  21. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/core/dynamic_config.py +0 -0
  22. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/core/service_loader.py +0 -0
  23. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/execution/__init__.py +0 -0
  24. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/execution/datamap_exec.py +0 -0
  25. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/execution/webhook_exec.py +0 -0
  26. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/output/__init__.py +0 -0
  27. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/output/output_formatter.py +0 -0
  28. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/output/swml_dump.py +0 -0
  29. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/simulation/__init__.py +0 -0
  30. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/simulation/data_generation.py +0 -0
  31. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/simulation/data_overrides.py +0 -0
  32. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/simulation/mock_env.py +0 -0
  33. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/swaig_test_wrapper.py +0 -0
  34. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/test_swaig.py +0 -0
  35. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/types.py +0 -0
  36. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/__init__.py +0 -0
  37. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/__init__.py +0 -0
  38. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/config/__init__.py +0 -0
  39. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/deployment/__init__.py +0 -0
  40. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/deployment/handlers/__init__.py +0 -0
  41. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/prompt/__init__.py +0 -0
  42. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/prompt/manager.py +0 -0
  43. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/routing/__init__.py +0 -0
  44. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/security/__init__.py +0 -0
  45. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/swml/__init__.py +0 -0
  46. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/tools/__init__.py +0 -0
  47. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/tools/decorator.py +0 -0
  48. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/tools/registry.py +0 -0
  49. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent_base.py +0 -0
  50. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/auth_handler.py +0 -0
  51. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/config_loader.py +0 -0
  52. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/contexts.py +0 -0
  53. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/data_map.py +0 -0
  54. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/function_result.py +0 -0
  55. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/logging_config.py +0 -0
  56. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/mixins/__init__.py +0 -0
  57. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/mixins/ai_config_mixin.py +0 -0
  58. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/mixins/auth_mixin.py +0 -0
  59. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/mixins/prompt_mixin.py +0 -0
  60. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/mixins/serverless_mixin.py +0 -0
  61. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/mixins/skill_mixin.py +0 -0
  62. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/mixins/state_mixin.py +0 -0
  63. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/mixins/tool_mixin.py +0 -0
  64. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/mixins/web_mixin.py +0 -0
  65. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/pom_builder.py +0 -0
  66. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/security/__init__.py +0 -0
  67. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/security/session_manager.py +0 -0
  68. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/security_config.py +0 -0
  69. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/skill_base.py +0 -0
  70. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/skill_manager.py +0 -0
  71. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/swaig_function.py +0 -0
  72. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/swml_builder.py +0 -0
  73. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/swml_handler.py +0 -0
  74. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/swml_renderer.py +0 -0
  75. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/swml_service.py +0 -0
  76. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/prefabs/__init__.py +0 -0
  77. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/prefabs/concierge.py +0 -0
  78. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/prefabs/faq_bot.py +0 -0
  79. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/prefabs/info_gatherer.py +0 -0
  80. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/prefabs/receptionist.py +0 -0
  81. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/prefabs/survey.py +0 -0
  82. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/__init__.py +0 -0
  83. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/index_builder.py +0 -0
  84. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/migration.py +0 -0
  85. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/models.py +0 -0
  86. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/pgvector_backend.py +0 -0
  87. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/query_processor.py +0 -0
  88. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/search_service.py +0 -0
  89. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/README.md +0 -0
  90. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/__init__.py +0 -0
  91. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/api_ninjas_trivia/README.md +0 -0
  92. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/api_ninjas_trivia/__init__.py +0 -0
  93. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/api_ninjas_trivia/skill.py +0 -0
  94. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/datasphere/README.md +0 -0
  95. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/datasphere/__init__.py +0 -0
  96. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/datasphere/skill.py +0 -0
  97. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/datasphere_serverless/README.md +0 -0
  98. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/datasphere_serverless/__init__.py +0 -0
  99. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/datasphere_serverless/skill.py +0 -0
  100. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/datetime/README.md +0 -0
  101. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/datetime/__init__.py +0 -0
  102. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/datetime/skill.py +0 -0
  103. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/joke/README.md +0 -0
  104. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/joke/__init__.py +0 -0
  105. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/joke/skill.py +0 -0
  106. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/math/README.md +0 -0
  107. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/math/__init__.py +0 -0
  108. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/math/skill.py +0 -0
  109. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/mcp_gateway/README.md +0 -0
  110. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/mcp_gateway/__init__.py +0 -0
  111. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/mcp_gateway/skill.py +0 -0
  112. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/native_vector_search/README.md +0 -0
  113. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/native_vector_search/__init__.py +0 -0
  114. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/native_vector_search/skill.py +0 -0
  115. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/play_background_file/README.md +0 -0
  116. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/play_background_file/__init__.py +0 -0
  117. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/play_background_file/skill.py +0 -0
  118. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/registry.py +0 -0
  119. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/spider/README.md +0 -0
  120. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/spider/__init__.py +0 -0
  121. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/spider/skill.py +0 -0
  122. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/swml_transfer/README.md +0 -0
  123. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/swml_transfer/__init__.py +0 -0
  124. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/swml_transfer/skill.py +0 -0
  125. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/weather_api/README.md +0 -0
  126. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/weather_api/__init__.py +0 -0
  127. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/weather_api/skill.py +0 -0
  128. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/web_search/README.md +0 -0
  129. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/web_search/__init__.py +0 -0
  130. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/web_search/skill.py +0 -0
  131. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/wikipedia_search/README.md +0 -0
  132. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/wikipedia_search/__init__.py +0 -0
  133. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/wikipedia_search/skill.py +0 -0
  134. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/utils/__init__.py +0 -0
  135. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/utils/pom_utils.py +0 -0
  136. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/utils/schema_utils.py +0 -0
  137. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/utils/token_generators.py +0 -0
  138. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/utils/validators.py +0 -0
  139. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/web/__init__.py +0 -0
  140. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/web/web_service.py +0 -0
  141. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents.egg-info/SOURCES.txt +0 -0
  142. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents.egg-info/dependency_links.txt +0 -0
  143. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents.egg-info/entry_points.txt +0 -0
  144. {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: signalwire_agents
3
- Version: 0.1.51
3
+ Version: 0.1.53
4
4
  Summary: SignalWire AI Agents SDK
5
5
  Author-email: SignalWire Team <info@signalwire.com>
6
6
  License: MIT
@@ -18,16 +18,16 @@ Classifier: Programming Language :: Python :: 3.11
18
18
  Requires-Python: >=3.7
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE
21
- Requires-Dist: fastapi==0.115.12
22
- Requires-Dist: pydantic==2.11.4
23
- Requires-Dist: PyYAML==6.0.2
24
- Requires-Dist: Requests==2.32.3
25
- Requires-Dist: setuptools==66.1.1
26
- Requires-Dist: signalwire_pom==2.7.1
27
- Requires-Dist: structlog==25.3.0
28
- Requires-Dist: uvicorn==0.34.2
29
- Requires-Dist: beautifulsoup4==4.12.3
30
- Requires-Dist: pytz==2023.3
21
+ Requires-Dist: fastapi>=0.115.12
22
+ Requires-Dist: pydantic>=2.11.4
23
+ Requires-Dist: PyYAML>=6.0.2
24
+ Requires-Dist: Requests>=2.32.3
25
+ Requires-Dist: setuptools>=66.1.1
26
+ Requires-Dist: signalwire_pom>=2.7.1
27
+ Requires-Dist: structlog>=25.3.0
28
+ Requires-Dist: uvicorn>=0.34.2
29
+ Requires-Dist: beautifulsoup4>=4.12.3
30
+ Requires-Dist: pytz>=2023.3
31
31
  Requires-Dist: lxml>=4.9.0
32
32
  Provides-Extra: search-queryonly
33
33
  Requires-Dist: numpy>=1.24.0; extra == "search-queryonly"
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "signalwire_agents"
7
- version = "0.1.51"
7
+ version = "0.1.53"
8
8
  description = "SignalWire AI Agents SDK"
9
9
  authors = [
10
10
  {name = "SignalWire Team", email = "info@signalwire.com"}
@@ -25,16 +25,16 @@ classifiers = [
25
25
  "Programming Language :: Python :: 3.11",
26
26
  ]
27
27
  dependencies = [
28
- "fastapi==0.115.12",
29
- "pydantic==2.11.4",
30
- "PyYAML==6.0.2",
31
- "Requests==2.32.3",
32
- "setuptools==66.1.1",
33
- "signalwire_pom==2.7.1",
34
- "structlog==25.3.0",
35
- "uvicorn==0.34.2",
36
- "beautifulsoup4==4.12.3",
37
- "pytz==2023.3",
28
+ "fastapi>=0.115.12",
29
+ "pydantic>=2.11.4",
30
+ "PyYAML>=6.0.2",
31
+ "Requests>=2.32.3",
32
+ "setuptools>=66.1.1",
33
+ "signalwire_pom>=2.7.1",
34
+ "structlog>=25.3.0",
35
+ "uvicorn>=0.34.2",
36
+ "beautifulsoup4>=4.12.3",
37
+ "pytz>=2023.3",
38
38
  "lxml>=4.9.0",
39
39
  ]
40
40
 
@@ -18,7 +18,7 @@ A package for building AI agents using SignalWire's AI and SWML capabilities.
18
18
  from .core.logging_config import configure_logging
19
19
  configure_logging()
20
20
 
21
- __version__ = "0.1.51"
21
+ __version__ = "0.1.53"
22
22
 
23
23
  # Import core classes for easier access
24
24
  from .core.agent_base import AgentBase
@@ -69,6 +69,16 @@ Examples:
69
69
  sw-search ./docs \\
70
70
  --chunking-strategy qa
71
71
 
72
+ # Markdown-aware chunking (preserves headers, detects code blocks, adds tags)
73
+ sw-search ./docs \\
74
+ --chunking-strategy markdown \\
75
+ --file-types md
76
+ # This strategy:
77
+ # - Chunks at header boundaries (h1, h2, h3...)
78
+ # - Detects code blocks and extracts language (python, bash, etc)
79
+ # - Adds "code" tags to chunks with code for better search
80
+ # - Preserves section hierarchy in metadata
81
+
72
82
  # Model selection examples (performance vs quality tradeoff)
73
83
  sw-search ./docs --model mini # Fastest (~5x faster), 384 dims, good for most use cases
74
84
  sw-search ./docs --model base # Balanced speed/quality, 768 dims (previous default)
@@ -128,16 +138,23 @@ Examples:
128
138
  --collection-name docs_collection
129
139
  sw-search migrate --info ./docs.swsearch
130
140
 
131
- # PostgreSQL pgvector backend
141
+ # PostgreSQL pgvector backend (direct build to PostgreSQL)
132
142
  sw-search ./docs \\
133
143
  --backend pgvector \\
134
- --connection-string "postgresql://user:pass@localhost/knowledge" \\
144
+ --connection-string "postgresql://user:pass@localhost:5432/knowledge" \\
135
145
  --output docs_collection
136
146
 
147
+ # pgvector with markdown strategy (best for documentation with code examples)
148
+ sw-search ./docs \\
149
+ --backend pgvector \\
150
+ --connection-string "postgresql://user:pass@localhost:5432/knowledge" \\
151
+ --output docs_collection \\
152
+ --chunking-strategy markdown
153
+
137
154
  # Overwrite existing pgvector collection
138
155
  sw-search ./docs \\
139
156
  --backend pgvector \\
140
- --connection-string "postgresql://user:pass@localhost/knowledge" \\
157
+ --connection-string "postgresql://user:pass@localhost:5432/knowledge" \\
141
158
  --output docs_collection \\
142
159
  --overwrite
143
160
 
@@ -191,9 +208,9 @@ Examples:
191
208
 
192
209
  parser.add_argument(
193
210
  '--chunking-strategy',
194
- choices=['sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa', 'json'],
211
+ choices=['sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa', 'json', 'markdown'],
195
212
  default='sentence',
196
- help='Chunking strategy to use (default: sentence)'
213
+ help='Chunking strategy to use (default: sentence). Use "markdown" for documentation with code blocks.'
197
214
  )
198
215
 
199
216
  parser.add_argument(
@@ -1937,9 +1937,13 @@
1937
1937
  {
1938
1938
  "type": "string",
1939
1939
  "const": "qwen3-235b-A22b-instruct"
1940
+ },
1941
+ {
1942
+ "type": "string",
1943
+ "const": "llama-3.1-8b-instruct-turbo@together.ai"
1940
1944
  }
1941
1945
  ],
1942
- "description": "The model to use for the AI. Allowed values are `gpt-4o-mini`, `gpt-4.1-mini`, `gpt-4.1-nano`, `nova-micro`, `nova-lite`, and `qwen3-235b-A22b-instruct`."
1946
+ "description": "The model to use for the AI. Allowed values are `gpt-4o-mini`, `gpt-4.1-mini`, `gpt-4.1-nano`, `nova-micro`, `nova-lite`, `qwen3-235b-A22b-instruct`, `qwen3-4b-instruct-2507@brian`, and `llama-3.1-8b-instruct-turbo@together.ai`."
1943
1947
  },
1944
1948
  "ai_volume": {
1945
1949
  "anyOf": [
@@ -7663,4 +7667,4 @@
7663
7667
  }
7664
7668
  },
7665
7669
  "unevaluatedProperties": false
7666
- }
7670
+ }
@@ -88,9 +88,18 @@ class DocumentProcessor:
88
88
  ):
89
89
  """
90
90
  Initialize document processor
91
-
91
+
92
92
  Args:
93
- chunking_strategy: Strategy for chunking documents ('sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa')
93
+ chunking_strategy: Strategy for chunking documents:
94
+ - 'sentence': Sentence-based chunking with overlap
95
+ - 'sliding': Sliding window with word-based chunks
96
+ - 'paragraph': Natural paragraph boundaries
97
+ - 'page': Page-based chunking (for PDFs)
98
+ - 'semantic': Semantic similarity-based chunking
99
+ - 'topic': Topic modeling-based chunking
100
+ - 'qa': Question-answer optimized chunking
101
+ - 'json': JSON structure-aware chunking
102
+ - 'markdown': Markdown structure-aware chunking with code block detection
94
103
  max_sentences_per_chunk: For sentence strategy (default: 5)
95
104
  chunk_size: For sliding strategy - words per chunk (default: 50)
96
105
  chunk_overlap: For sliding strategy - overlap in words (default: 10)
@@ -142,6 +151,9 @@ class DocumentProcessor:
142
151
  return self._chunk_by_qa_optimization(content, filename, file_type)
143
152
  elif self.chunking_strategy == 'json':
144
153
  return self._chunk_from_json(content, filename, file_type)
154
+ elif self.chunking_strategy == 'markdown':
155
+ # Use markdown-aware chunking for better structure preservation
156
+ return self._chunk_markdown_enhanced(content, filename)
145
157
  else:
146
158
  # Fallback to sentence-based chunking
147
159
  return self._chunk_by_sentences(content, filename, file_type)
@@ -339,75 +351,114 @@ class DocumentProcessor:
339
351
  return chunks
340
352
 
341
353
  def _chunk_markdown_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:
342
- """Enhanced markdown chunking with better header handling"""
354
+ """Enhanced markdown chunking with code block detection and rich metadata
355
+
356
+ Features:
357
+ - Tracks header hierarchy for section paths
358
+ - Detects code blocks and extracts language
359
+ - Adds 'code' tags to chunks containing code
360
+ - Preserves markdown structure for better search
361
+ """
343
362
  chunks = []
344
363
  lines = content.split('\n')
345
-
364
+
346
365
  current_section = None
347
366
  current_hierarchy = [] # Track header hierarchy
348
367
  current_chunk = []
349
368
  current_size = 0
350
369
  line_start = 1
351
-
370
+ in_code_block = False
371
+ code_languages = [] # Track languages in current chunk
372
+ has_code = False
373
+
352
374
  for line_num, line in enumerate(lines, 1):
375
+ # Check for code block fences
376
+ code_fence_match = re.match(r'^```(\w+)?', line)
377
+ if code_fence_match:
378
+ in_code_block = not in_code_block
379
+ if in_code_block:
380
+ # Starting code block
381
+ has_code = True
382
+ lang = code_fence_match.group(1)
383
+ if lang and lang not in code_languages:
384
+ code_languages.append(lang)
385
+
353
386
  # Check for headers with hierarchy tracking
354
- header_match = re.match(r'^(#{1,6})\s+(.+)', line)
387
+ header_match = re.match(r'^(#{1,6})\s+(.+)', line) if not in_code_block else None
355
388
  if header_match:
356
389
  header_level = len(header_match.group(1))
357
390
  header_text = header_match.group(2).strip()
358
-
391
+
359
392
  # Save current chunk if it exists
360
393
  if current_chunk:
394
+ chunk_metadata = self._build_markdown_metadata(
395
+ current_hierarchy, code_languages, has_code
396
+ )
361
397
  chunks.append(self._create_chunk(
362
398
  content='\n'.join(current_chunk),
363
399
  filename=filename,
364
400
  section=self._build_section_path(current_hierarchy),
365
401
  start_line=line_start,
366
- end_line=line_num - 1
402
+ end_line=line_num - 1,
403
+ metadata=chunk_metadata
367
404
  ))
368
-
405
+
369
406
  # Update hierarchy
370
407
  current_hierarchy = current_hierarchy[:header_level-1] + [header_text]
371
408
  current_section = header_text
372
409
  current_chunk = [line]
373
410
  current_size = len(line)
374
411
  line_start = line_num
375
-
412
+ code_languages = []
413
+ has_code = False
414
+
376
415
  else:
377
416
  current_chunk.append(line)
378
417
  current_size += len(line) + 1
379
-
418
+
380
419
  # Check if chunk is getting too large - use smart splitting
381
- if current_size >= self.chunk_size:
420
+ # But don't split inside code blocks
421
+ if current_size >= self.chunk_size and not in_code_block:
382
422
  # Try to split at paragraph boundary first
383
423
  split_point = self._find_best_split_point(current_chunk)
384
-
424
+
385
425
  chunk_to_save = current_chunk[:split_point]
426
+ chunk_metadata = self._build_markdown_metadata(
427
+ current_hierarchy, code_languages, has_code
428
+ )
386
429
  chunks.append(self._create_chunk(
387
430
  content='\n'.join(chunk_to_save),
388
431
  filename=filename,
389
432
  section=self._build_section_path(current_hierarchy),
390
433
  start_line=line_start,
391
- end_line=line_start + split_point - 1
434
+ end_line=line_start + split_point - 1,
435
+ metadata=chunk_metadata
392
436
  ))
393
-
437
+
394
438
  # Start new chunk with overlap
395
439
  overlap_lines = self._get_overlap_lines(chunk_to_save)
396
440
  remaining_lines = current_chunk[split_point:]
397
441
  current_chunk = overlap_lines + remaining_lines
398
442
  current_size = sum(len(line) + 1 for line in current_chunk)
399
443
  line_start = line_start + split_point - len(overlap_lines)
400
-
444
+ # Reset code tracking for new chunk
445
+ code_languages = []
446
+ has_code = False
447
+
401
448
  # Add final chunk
402
449
  if current_chunk:
450
+ chunk_metadata = self._build_markdown_metadata(
451
+ current_hierarchy, code_languages, has_code
452
+ )
403
453
  chunks.append(self._create_chunk(
404
454
  content='\n'.join(current_chunk),
405
455
  filename=filename,
406
456
  section=self._build_section_path(current_hierarchy),
407
457
  start_line=line_start,
408
- end_line=len(lines)
458
+ end_line=len(lines),
459
+ metadata=chunk_metadata
409
460
  ))
410
-
461
+
411
462
  return chunks
412
463
 
413
464
  def _chunk_python_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:
@@ -575,6 +626,49 @@ class DocumentProcessor:
575
626
  def _build_section_path(self, hierarchy: List[str]) -> str:
576
627
  """Build hierarchical section path from header hierarchy"""
577
628
  return ' > '.join(hierarchy) if hierarchy else None
629
+
630
+ def _build_markdown_metadata(self, hierarchy: List[str], code_languages: List[str], has_code: bool) -> Dict[str, Any]:
631
+ """Build rich metadata for markdown chunks
632
+
633
+ Args:
634
+ hierarchy: Current header hierarchy (e.g., ['Installation', 'Requirements', 'Python'])
635
+ code_languages: List of code block languages found in chunk (e.g., ['python', 'bash'])
636
+ has_code: Whether chunk contains any code blocks
637
+
638
+ Returns:
639
+ Dictionary with markdown-specific metadata including tags
640
+ """
641
+ metadata = {
642
+ 'chunk_type': 'markdown',
643
+ }
644
+
645
+ # Add header level metadata
646
+ if hierarchy:
647
+ for i, header in enumerate(hierarchy, 1):
648
+ metadata[f'h{i}'] = header
649
+
650
+ # Add code-related metadata
651
+ if has_code:
652
+ metadata['has_code'] = True
653
+ if code_languages:
654
+ metadata['code_languages'] = code_languages
655
+
656
+ # Build tags for enhanced searching
657
+ tags = []
658
+ if has_code:
659
+ tags.append('code')
660
+ # Add language-specific tags
661
+ for lang in code_languages:
662
+ tags.append(f'code:{lang}')
663
+
664
+ # Add tags for header levels (searchable by section depth)
665
+ if len(hierarchy) > 0:
666
+ tags.append(f'depth:{len(hierarchy)}')
667
+
668
+ if tags:
669
+ metadata['tags'] = tags
670
+
671
+ return metadata
578
672
 
579
673
  def _build_python_section(self, class_name: Optional[str], function_name: Optional[str]) -> str:
580
674
  """Build section name for Python code"""
@@ -114,51 +114,48 @@ class SearchEngine:
114
114
  logger.error(f"Error converting query vector: {e}")
115
115
  return self._keyword_search_only(enhanced_text, count, tags, original_query)
116
116
 
117
- # Stage 1: Collect candidates using fast methods
117
+ # HYBRID APPROACH: always run vector AND metadata/keyword searches
118
+ # Stage 1: Run every search type, one after another, with the same candidate budget
119
+ search_multiplier = 3
120
+
121
+ # Vector search (semantic similarity - primary ranking signal)
122
+ vector_results = self._vector_search(query_array, count * search_multiplier)
123
+
124
+ # Metadata/keyword searches (confirmation signals and backfill)
125
+ filename_results = self._filename_search(original_query or enhanced_text, count * search_multiplier)
126
+ metadata_results = self._metadata_search(original_query or enhanced_text, count * search_multiplier)
127
+ keyword_results = self._keyword_search(enhanced_text, count * search_multiplier, original_query)
128
+
129
+ logger.debug(f"Parallel search: vector={len(vector_results)}, filename={len(filename_results)}, "
130
+ f"metadata={len(metadata_results)}, keyword={len(keyword_results)}")
131
+
132
+ # Stage 2: Merge all results into candidate pool
118
133
  candidates = {}
119
-
120
- # Fast searches - collect all potential matches
121
- filename_results = self._filename_search(original_query or enhanced_text, count * 3)
122
- metadata_results = self._metadata_search(original_query or enhanced_text, count * 2)
123
- keyword_results = self._keyword_search(enhanced_text, count * 2, original_query)
124
-
125
- logger.debug(f"Search for '{original_query}': filename={len(filename_results)}, metadata={len(metadata_results)}, keyword={len(keyword_results)}")
126
-
127
- # Merge candidates from different sources
128
- for result_set, source_weight in [(filename_results, 2.0),
129
- (metadata_results, 1.5),
130
- (keyword_results, 1.0)]:
134
+
135
+ # Add vector results first (primary signal)
136
+ for result in vector_results:
137
+ chunk_id = result['id']
138
+ candidates[chunk_id] = result
139
+ candidates[chunk_id]['vector_score'] = result['score']
140
+ candidates[chunk_id]['vector_distance'] = 1 - result['score']
141
+ candidates[chunk_id]['sources'] = {'vector': True}
142
+ candidates[chunk_id]['source_scores'] = {'vector': result['score']}
143
+
144
+ # Add metadata/keyword results (secondary signals that boost or backfill)
145
+ for result_set, source_type, source_weight in [(filename_results, 'filename', 2.0),
146
+ (metadata_results, 'metadata', 1.5),
147
+ (keyword_results, 'keyword', 1.0)]:
131
148
  for result in result_set:
132
149
  chunk_id = result['id']
133
150
  if chunk_id not in candidates:
151
+ # New candidate from metadata/keyword (no vector match)
134
152
  candidates[chunk_id] = result
135
- candidates[chunk_id]['sources'] = {}
136
- candidates[chunk_id]['source_scores'] = {}
137
-
138
- # Track which searches found this chunk
139
- candidates[chunk_id]['sources'][result['search_type']] = True
140
- candidates[chunk_id]['source_scores'][result['search_type']] = result['score'] * source_weight
141
-
142
- # Stage 2: Check if we have enough candidates
143
- if len(candidates) < count * 2:
144
- # Not enough candidates from fast searches - add full vector search
145
- logger.debug(f"Only {len(candidates)} candidates from fast search, adding full vector search")
146
- vector_results = self._vector_search(query_array, count * 3)
147
-
148
- for result in vector_results:
149
- chunk_id = result['id']
150
- if chunk_id not in candidates:
151
- candidates[chunk_id] = result
152
- candidates[chunk_id]['sources'] = {'vector': True}
153
- candidates[chunk_id]['source_scores'] = {}
154
-
155
- # Add vector score
156
- candidates[chunk_id]['vector_score'] = result['score']
157
- candidates[chunk_id]['vector_distance'] = 1 - result['score']
158
- else:
159
- # We have enough candidates - just re-rank them with vectors
160
- logger.debug(f"Re-ranking {len(candidates)} candidates with vector similarity")
161
- self._add_vector_scores_to_candidates(candidates, query_array, distance_threshold)
153
+ candidates[chunk_id]['sources'] = {source_type: True}
154
+ candidates[chunk_id]['source_scores'] = {source_type: result['score'] * source_weight}
155
+ else:
156
+ # Already a candidate (from vector search or an earlier result set) - record this source as an additional signal
157
+ candidates[chunk_id]['sources'][source_type] = True
158
+ candidates[chunk_id]['source_scores'][source_type] = result['score'] * source_weight
162
159
 
163
160
  # Stage 3: Score and rank all candidates
164
161
  final_results = []
@@ -190,12 +187,12 @@ class SearchEngine:
190
187
 
191
188
  # Apply diversity penalties to prevent single-file dominance
192
189
  final_results = self._apply_diversity_penalties(final_results, count)
193
-
190
+
194
191
  # Ensure 'score' field exists for CLI compatibility
195
192
  for r in final_results:
196
193
  if 'score' not in r:
197
194
  r['score'] = r.get('final_score', 0.0)
198
-
195
+
199
196
  return final_results[:count]
200
197
 
201
198
  def _keyword_search_only(self, enhanced_text: str, count: int,
@@ -1038,70 +1035,55 @@ class SearchEngine:
1038
1035
  logger.error(f"Error in vector re-ranking: {e}")
1039
1036
 
1040
1037
  def _calculate_combined_score(self, candidate: Dict, distance_threshold: float) -> float:
1041
- """Calculate final score combining all signals with comprehensive match bonus"""
1042
- # Base scores from different sources
1043
- source_scores = candidate.get('source_scores', {})
1044
-
1045
- # Check for comprehensive matching (multiple signals)
1038
+ """Calculate final score with hybrid vector + metadata weighting
1039
+
1040
+ Hybrid approach:
1041
+ - Vector score is the primary ranking signal (semantic similarity)
1042
+ - Metadata/keyword matches provide confirmation boost
1043
+ - Multiple signal types indicate high relevance (confirmation bonus)
1044
+ - Special boost for chunks tagged 'code' that were also found by the metadata or keyword search
1045
+ """
1046
1046
  sources = candidate.get('sources', {})
1047
- num_sources = len(sources)
1048
-
1049
- # Get match coverage information
1050
- match_coverage = candidate.get('match_coverage', 0)
1051
- fields_matched = candidate.get('fields_matched', 0)
1052
-
1053
- # Calculate base score with exponential boost for multiple sources
1054
- if num_sources > 1:
1055
- # Multiple signal matches are exponentially better
1056
- multi_signal_boost = 1.0 + (0.3 * (num_sources - 1))
1057
- base_score = sum(source_scores.values()) * multi_signal_boost
1058
- else:
1059
- base_score = sum(source_scores.values())
1060
-
1061
- # Apply comprehensive match bonus
1062
- if match_coverage > 0.5: # More than 50% of query terms matched
1063
- coverage_bonus = 1.0 + (match_coverage - 0.5) * 0.5
1064
- base_score *= coverage_bonus
1065
-
1066
- # Apply field diversity bonus (matching in multiple metadata fields)
1067
- if fields_matched > 2:
1068
- field_bonus = 1.0 + (fields_matched - 2) * 0.1
1069
- base_score *= field_bonus
1070
-
1071
- # Apply vector similarity multiplier if available
1047
+ source_scores = candidate.get('source_scores', {})
1048
+
1049
+ # Vector score is PRIMARY
1072
1050
  if 'vector_score' in candidate:
1073
1051
  vector_score = candidate['vector_score']
1074
- vector_distance = candidate.get('vector_distance', 1 - vector_score)
1075
-
1076
- # Distance-aware scoring
1077
- if distance_threshold > 0:
1078
- if vector_distance <= distance_threshold:
1079
- # Within threshold - full vector score
1080
- vector_multiplier = vector_score
1081
- elif vector_distance <= distance_threshold * 1.5:
1082
- # Near threshold - gradual decay
1083
- overflow = (vector_distance - distance_threshold) / (distance_threshold * 0.5)
1084
- vector_multiplier = vector_score * (1 - overflow * 0.3)
1085
- else:
1086
- # Beyond threshold - minimal contribution
1087
- vector_multiplier = vector_score * 0.3
1088
- else:
1089
- vector_multiplier = vector_score
1090
-
1091
- # For chunks found by vector-only search, use vector score directly
1092
- if 'vector' in sources and len(sources) == 1:
1093
- base_score = vector_score
1094
- else:
1095
- # For chunks found by multiple methods, apply vector as quality check
1096
- base_score *= vector_multiplier
1097
-
1098
- # Special handling for strong metadata matches
1099
- if 'metadata' in sources:
1100
- metadata_matches = candidate.get('metadata_matches', {})
1101
- # Strong category or product match should boost significantly
1102
- if metadata_matches.get('category', 0) > 0.8 or metadata_matches.get('product', 0) > 0.8:
1103
- base_score *= 1.2
1104
-
1052
+ base_score = vector_score
1053
+
1054
+ # Metadata/keyword matches provide confirmation boost
1055
+ if len(sources) > 1:
1056
+ # Has both vector AND metadata/keyword matches - strong confirmation signal
1057
+ keyword_signals = sum(source_scores.get(k, 0) for k in ['keyword', 'filename', 'metadata'])
1058
+ if keyword_signals > 0:
1059
+ # Scale the summed signal by 0.15 and cap the boost at 30%
1060
+ keyword_boost = min(0.3, keyword_signals * 0.15)
1061
+ base_score = vector_score * (1.0 + keyword_boost)
1062
+
1063
+ # Additional boost if multiple signal types confirm (2+ sources)
1064
+ num_metadata_sources = sum(1 for s in ['keyword', 'filename', 'metadata'] if s in sources)
1065
+ if num_metadata_sources >= 2:
1066
+ # Multiple confirmation signals - very high confidence
1067
+ base_score *= 1.1
1068
+
1069
+ # Check for code-related tags to boost code examples
1070
+ tags = candidate.get('metadata', {}).get('tags', [])
1071
+ if 'code' in tags:
1072
+ # This chunk contains code - boost if query is code-related
1073
+ # (metadata search would have found it if query mentioned code/example/python/etc)
1074
+ if 'metadata' in sources or 'keyword' in sources:
1075
+ # Query matched code-related metadata - apply code boost
1076
+ base_score *= 1.2
1077
+ else:
1078
+ # No vector score - this is a keyword-only result (backfill)
1079
+ # Use keyword scores but penalize for lack of semantic match
1080
+ base_score = sum(source_scores.values()) * 0.6 # 40% penalty for no vector
1081
+
1082
+ # Still boost code chunks if metadata matched
1083
+ tags = candidate.get('metadata', {}).get('tags', [])
1084
+ if 'code' in tags and 'metadata' in sources:
1085
+ base_score *= 1.15
1086
+
1105
1087
  return base_score
1106
1088
 
1107
1089
  def _apply_diversity_penalties(self, results: List[Dict], target_count: int) -> List[Dict]:
@@ -1166,7 +1148,65 @@ class SearchEngine:
1166
1148
  penalized_results[:target_count] = selected
1167
1149
 
1168
1150
  return penalized_results
1169
-
1151
+
1152
+ def _apply_match_type_diversity(self, results: List[Dict], target_count: int) -> List[Dict]:
1153
+ """Ensure diversity of match types in final results
1154
+
1155
+ Ensures we have a mix of:
1156
+ - Vector-only matches (semantic similarity, good for code examples)
1157
+ - Keyword-only matches (exact term matches)
1158
+ - Hybrid matches (both vector + keyword/metadata)
1159
+ """
1160
+ if not results or len(results) <= target_count:
1161
+ return results
1162
+
1163
+ # Categorize results by match type
1164
+ vector_only = []
1165
+ keyword_only = []
1166
+ hybrid = []
1167
+
1168
+ for result in results:
1169
+ sources = result.get('sources', {})
1170
+ has_vector = 'vector' in sources
1171
+ has_keyword = any(k in sources for k in ['keyword', 'filename', 'metadata'])
1172
+
1173
+ if has_vector and not has_keyword:
1174
+ vector_only.append(result)
1175
+ elif has_keyword and not has_vector:
1176
+ keyword_only.append(result)
1177
+ else:
1178
+ hybrid.append(result)
1179
+
1180
+ # Build diverse result set
1181
+ # Target distribution: 40% hybrid, 40% vector-only, 20% keyword-only (each bucket gets at least one slot)
1182
+ # This ensures we include semantic matches (code examples) even if keywords don't match
1183
+ diversified = []
1184
+
1185
+ # Take top hybrid matches first (best overall)
1186
+ hybrid_target = max(1, int(target_count * 0.4))
1187
+ diversified.extend(hybrid[:hybrid_target])
1188
+
1189
+ # Ensure we have vector-only matches (critical for code examples)
1190
+ vector_target = max(1, int(target_count * 0.4))
1191
+ diversified.extend(vector_only[:vector_target])
1192
+
1193
+ # Add keyword-only matches
1194
+ keyword_target = max(1, int(target_count * 0.2))
1195
+ diversified.extend(keyword_only[:keyword_target])
1196
+
1197
+ # Fill remaining slots with best remaining results regardless of type
1198
+ remaining_slots = target_count - len(diversified)
1199
+ if remaining_slots > 0:
1200
+ # Get all unused results
1201
+ used_ids = set(r['id'] for r in diversified)
1202
+ unused = [r for r in results if r['id'] not in used_ids]
1203
+ diversified.extend(unused[:remaining_slots])
1204
+
1205
+ # Sort by final score to maintain quality ordering
1206
+ diversified.sort(key=lambda x: x['final_score'], reverse=True)
1207
+
1208
+ return diversified
1209
+
1170
1210
  def get_stats(self) -> Dict[str, Any]:
1171
1211
  """Get statistics about the search index"""
1172
1212
  # Use pgvector backend if available