signalwire-agents 0.1.51__tar.gz → 0.1.53__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {signalwire_agents-0.1.51/signalwire_agents.egg-info → signalwire_agents-0.1.53}/PKG-INFO +11 -11
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/pyproject.toml +11 -11
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/__init__.py +1 -1
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/build_search.py +22 -5
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/schema.json +6 -2
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/document_processor.py +112 -18
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/search_engine.py +144 -104
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53/signalwire_agents.egg-info}/PKG-INFO +11 -11
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents.egg-info/requires.txt +10 -10
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/LICENSE +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/README.md +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/setup.cfg +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/setup.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/agent_server.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/agents/bedrock.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/config.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/core/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/core/agent_loader.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/core/argparse_helpers.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/core/dynamic_config.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/core/service_loader.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/execution/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/execution/datamap_exec.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/execution/webhook_exec.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/output/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/output/output_formatter.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/output/swml_dump.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/simulation/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/simulation/data_generation.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/simulation/data_overrides.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/simulation/mock_env.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/swaig_test_wrapper.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/test_swaig.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/types.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/config/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/deployment/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/deployment/handlers/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/prompt/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/prompt/manager.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/routing/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/security/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/swml/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/tools/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/tools/decorator.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent/tools/registry.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/agent_base.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/auth_handler.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/config_loader.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/contexts.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/data_map.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/function_result.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/logging_config.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/mixins/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/mixins/ai_config_mixin.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/mixins/auth_mixin.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/mixins/prompt_mixin.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/mixins/serverless_mixin.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/mixins/skill_mixin.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/mixins/state_mixin.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/mixins/tool_mixin.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/mixins/web_mixin.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/pom_builder.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/security/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/security/session_manager.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/security_config.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/skill_base.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/skill_manager.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/swaig_function.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/swml_builder.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/swml_handler.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/swml_renderer.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/core/swml_service.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/prefabs/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/prefabs/concierge.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/prefabs/faq_bot.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/prefabs/info_gatherer.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/prefabs/receptionist.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/prefabs/survey.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/index_builder.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/migration.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/models.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/pgvector_backend.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/query_processor.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/search_service.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/README.md +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/api_ninjas_trivia/README.md +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/api_ninjas_trivia/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/api_ninjas_trivia/skill.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/datasphere/README.md +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/datasphere/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/datasphere/skill.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/datasphere_serverless/README.md +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/datasphere_serverless/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/datasphere_serverless/skill.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/datetime/README.md +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/datetime/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/datetime/skill.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/joke/README.md +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/joke/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/joke/skill.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/math/README.md +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/math/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/math/skill.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/mcp_gateway/README.md +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/mcp_gateway/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/mcp_gateway/skill.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/native_vector_search/README.md +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/native_vector_search/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/native_vector_search/skill.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/play_background_file/README.md +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/play_background_file/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/play_background_file/skill.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/registry.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/spider/README.md +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/spider/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/spider/skill.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/swml_transfer/README.md +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/swml_transfer/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/swml_transfer/skill.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/weather_api/README.md +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/weather_api/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/weather_api/skill.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/web_search/README.md +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/web_search/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/web_search/skill.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/wikipedia_search/README.md +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/wikipedia_search/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/skills/wikipedia_search/skill.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/utils/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/utils/pom_utils.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/utils/schema_utils.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/utils/token_generators.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/utils/validators.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/web/__init__.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/web/web_service.py +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents.egg-info/SOURCES.txt +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents.egg-info/dependency_links.txt +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents.egg-info/entry_points.txt +0 -0
- {signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents.egg-info/top_level.txt +0 -0
{signalwire_agents-0.1.51/signalwire_agents.egg-info → signalwire_agents-0.1.53}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: signalwire_agents
-Version: 0.1.51
+Version: 0.1.53
 Summary: SignalWire AI Agents SDK
 Author-email: SignalWire Team <info@signalwire.com>
 License: MIT
@@ -18,16 +18,16 @@ Classifier: Programming Language :: Python :: 3.11
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: fastapi
-Requires-Dist: pydantic
-Requires-Dist: PyYAML
-Requires-Dist: Requests
-Requires-Dist: setuptools
-Requires-Dist: signalwire_pom
-Requires-Dist: structlog
-Requires-Dist: uvicorn
-Requires-Dist: beautifulsoup4
-Requires-Dist: pytz
+Requires-Dist: fastapi>=0.115.12
+Requires-Dist: pydantic>=2.11.4
+Requires-Dist: PyYAML>=6.0.2
+Requires-Dist: Requests>=2.32.3
+Requires-Dist: setuptools>=66.1.1
+Requires-Dist: signalwire_pom>=2.7.1
+Requires-Dist: structlog>=25.3.0
+Requires-Dist: uvicorn>=0.34.2
+Requires-Dist: beautifulsoup4>=4.12.3
+Requires-Dist: pytz>=2023.3
 Requires-Dist: lxml>=4.9.0
 Provides-Extra: search-queryonly
 Requires-Dist: numpy>=1.24.0; extra == "search-queryonly"

{signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/pyproject.toml
RENAMED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "signalwire_agents"
-version = "0.1.51"
+version = "0.1.53"
 description = "SignalWire AI Agents SDK"
 authors = [
     {name = "SignalWire Team", email = "info@signalwire.com"}
@@ -25,16 +25,16 @@ classifiers = [
     "Programming Language :: Python :: 3.11",
 ]
 dependencies = [
-    "fastapi",
-    "pydantic",
-    "PyYAML",
-    "Requests",
-    "setuptools",
-    "signalwire_pom",
-    "structlog",
-    "uvicorn",
-    "beautifulsoup4",
-    "pytz",
+    "fastapi>=0.115.12",
+    "pydantic>=2.11.4",
+    "PyYAML>=6.0.2",
+    "Requests>=2.32.3",
+    "setuptools>=66.1.1",
+    "signalwire_pom>=2.7.1",
+    "structlog>=25.3.0",
+    "uvicorn>=0.34.2",
+    "beautifulsoup4>=4.12.3",
+    "pytz>=2023.3",
     "lxml>=4.9.0",
 ]
 
{signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/__init__.py
RENAMED
@@ -18,7 +18,7 @@ A package for building AI agents using SignalWire's AI and SWML capabilities.
 from .core.logging_config import configure_logging
 configure_logging()
 
-__version__ = "0.1.51"
+__version__ = "0.1.53"
 
 # Import core classes for easier access
 from .core.agent_base import AgentBase

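The only change in __init__.py is the version constant, so an installed upgrade can be confirmed at runtime with the attribute shown above:

# Quick runtime check after upgrading the package.
import signalwire_agents

print(signalwire_agents.__version__)  # expected: 0.1.53
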
{signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/cli/build_search.py
RENAMED
@@ -69,6 +69,16 @@ Examples:
     sw-search ./docs \\
         --chunking-strategy qa
 
+    # Markdown-aware chunking (preserves headers, detects code blocks, adds tags)
+    sw-search ./docs \\
+        --chunking-strategy markdown \\
+        --file-types md
+    # This strategy:
+    # - Chunks at header boundaries (h1, h2, h3...)
+    # - Detects code blocks and extracts language (python, bash, etc)
+    # - Adds "code" tags to chunks with code for better search
+    # - Preserves section hierarchy in metadata
+
     # Model selection examples (performance vs quality tradeoff)
     sw-search ./docs --model mini # Fastest (~5x faster), 384 dims, good for most use cases
     sw-search ./docs --model base # Balanced speed/quality, 768 dims (previous default)
@@ -128,16 +138,23 @@ Examples:
         --collection-name docs_collection
     sw-search migrate --info ./docs.swsearch
 
-    # PostgreSQL pgvector backend
+    # PostgreSQL pgvector backend (direct build to PostgreSQL)
     sw-search ./docs \\
         --backend pgvector \\
-        --connection-string "postgresql://user:pass@localhost/knowledge" \\
+        --connection-string "postgresql://user:pass@localhost:5432/knowledge" \\
         --output docs_collection
 
+    # pgvector with markdown strategy (best for documentation with code examples)
+    sw-search ./docs \\
+        --backend pgvector \\
+        --connection-string "postgresql://user:pass@localhost:5432/knowledge" \\
+        --output docs_collection \\
+        --chunking-strategy markdown
+
     # Overwrite existing pgvector collection
     sw-search ./docs \\
         --backend pgvector \\
-        --connection-string "postgresql://user:pass@localhost/knowledge" \\
+        --connection-string "postgresql://user:pass@localhost:5432/knowledge" \\
        --output docs_collection \\
         --overwrite
 
@@ -191,9 +208,9 @@ Examples:
 
     parser.add_argument(
         '--chunking-strategy',
-        choices=['sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa', 'json'],
+        choices=['sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa', 'json', 'markdown'],
         default='sentence',
-        help='Chunking strategy to use (default: sentence)'
+        help='Chunking strategy to use (default: sentence). Use "markdown" for documentation with code blocks.'
     )
 
     parser.add_argument(

{signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/schema.json
RENAMED
@@ -1937,9 +1937,13 @@
         {
           "type": "string",
           "const": "qwen3-235b-A22b-instruct"
+        },
+        {
+          "type": "string",
+          "const": "llama-3.1-8b-instruct-turbo@together.ai"
         }
       ],
-      "description": "The model to use for the AI. Allowed values are `gpt-4o-mini`, `gpt-4.1-mini`, `gpt-4.1-nano`, `nova-micro`, `nova-lite`, and `qwen3-235b-A22b-instruct`."
+      "description": "The model to use for the AI. Allowed values are `gpt-4o-mini`, `gpt-4.1-mini`, `gpt-4.1-nano`, `nova-micro`, `nova-lite`, and `qwen3-235b-A22b-instruct` and `qwen3-4b-instruct-2507@brian`."
     },
     "ai_volume": {
       "anyOf": [
@@ -7663,4 +7667,4 @@
     }
   },
   "unevaluatedProperties": false
-}
+}
{signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/document_processor.py
RENAMED
@@ -88,9 +88,18 @@ class DocumentProcessor:
     ):
         """
         Initialize document processor
-
+
         Args:
-            chunking_strategy: Strategy for chunking documents
+            chunking_strategy: Strategy for chunking documents:
+                - 'sentence': Sentence-based chunking with overlap
+                - 'sliding': Sliding window with word-based chunks
+                - 'paragraph': Natural paragraph boundaries
+                - 'page': Page-based chunking (for PDFs)
+                - 'semantic': Semantic similarity-based chunking
+                - 'topic': Topic modeling-based chunking
+                - 'qa': Question-answer optimized chunking
+                - 'json': JSON structure-aware chunking
+                - 'markdown': Markdown structure-aware chunking with code block detection
             max_sentences_per_chunk: For sentence strategy (default: 5)
             chunk_size: For sliding strategy - words per chunk (default: 50)
             chunk_overlap: For sliding strategy - overlap in words (default: 10)
@@ -142,6 +151,9 @@ class DocumentProcessor:
             return self._chunk_by_qa_optimization(content, filename, file_type)
         elif self.chunking_strategy == 'json':
             return self._chunk_from_json(content, filename, file_type)
+        elif self.chunking_strategy == 'markdown':
+            # Use markdown-aware chunking for better structure preservation
+            return self._chunk_markdown_enhanced(content, filename)
         else:
             # Fallback to sentence-based chunking
             return self._chunk_by_sentences(content, filename, file_type)

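To see what the new branch produces, the processor can be exercised directly. This is a minimal sketch, not an official usage pattern: DocumentProcessor, the chunking_strategy argument and _chunk_markdown_enhanced are taken from this diff, while the sample text and the chunk_size value are invented and the private helper is called only for illustration.

# Hypothetical sketch of the 'markdown' strategy (sample text and chunk_size are made up).
from signalwire_agents.search.document_processor import DocumentProcessor

doc = "# Install\n\nRun the installer.\n\n```bash\npip install signalwire-agents\n```\n"

processor = DocumentProcessor(chunking_strategy='markdown', chunk_size=200)
chunks = processor._chunk_markdown_enhanced(doc, 'install.md')
print(len(chunks), 'chunks')
print(chunks[0])  # inspect the section path, code metadata and tags
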
@@ -339,75 +351,114 @@ class DocumentProcessor:
         return chunks
 
     def _chunk_markdown_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:
-        """Enhanced markdown chunking with
+        """Enhanced markdown chunking with code block detection and rich metadata
+
+        Features:
+        - Tracks header hierarchy for section paths
+        - Detects code blocks and extracts language
+        - Adds 'code' tags to chunks containing code
+        - Preserves markdown structure for better search
+        """
         chunks = []
         lines = content.split('\n')
-
+
         current_section = None
         current_hierarchy = []  # Track header hierarchy
         current_chunk = []
         current_size = 0
         line_start = 1
-
+        in_code_block = False
+        code_languages = []  # Track languages in current chunk
+        has_code = False
+
         for line_num, line in enumerate(lines, 1):
+            # Check for code block fences
+            code_fence_match = re.match(r'^```(\w+)?', line)
+            if code_fence_match:
+                in_code_block = not in_code_block
+                if in_code_block:
+                    # Starting code block
+                    has_code = True
+                    lang = code_fence_match.group(1)
+                    if lang and lang not in code_languages:
+                        code_languages.append(lang)
+
             # Check for headers with hierarchy tracking
-            header_match = re.match(r'^(#{1,6})\s+(.+)', line)
+            header_match = re.match(r'^(#{1,6})\s+(.+)', line) if not in_code_block else None
             if header_match:
                 header_level = len(header_match.group(1))
                 header_text = header_match.group(2).strip()
-
+
                 # Save current chunk if it exists
                 if current_chunk:
+                    chunk_metadata = self._build_markdown_metadata(
+                        current_hierarchy, code_languages, has_code
+                    )
                     chunks.append(self._create_chunk(
                         content='\n'.join(current_chunk),
                         filename=filename,
                         section=self._build_section_path(current_hierarchy),
                         start_line=line_start,
-                        end_line=line_num - 1
+                        end_line=line_num - 1,
+                        metadata=chunk_metadata
                     ))
-
+
                 # Update hierarchy
                 current_hierarchy = current_hierarchy[:header_level-1] + [header_text]
                 current_section = header_text
                 current_chunk = [line]
                 current_size = len(line)
                 line_start = line_num
-
+                code_languages = []
+                has_code = False
+
             else:
                 current_chunk.append(line)
                 current_size += len(line) + 1
-
+
                 # Check if chunk is getting too large - use smart splitting
-                if current_size >= self.chunk_size:
+                # But don't split inside code blocks
+                if current_size >= self.chunk_size and not in_code_block:
                     # Try to split at paragraph boundary first
                     split_point = self._find_best_split_point(current_chunk)
-
+
                     chunk_to_save = current_chunk[:split_point]
+                    chunk_metadata = self._build_markdown_metadata(
+                        current_hierarchy, code_languages, has_code
+                    )
                     chunks.append(self._create_chunk(
                         content='\n'.join(chunk_to_save),
                         filename=filename,
                         section=self._build_section_path(current_hierarchy),
                         start_line=line_start,
-                        end_line=line_start + split_point - 1
+                        end_line=line_start + split_point - 1,
+                        metadata=chunk_metadata
                     ))
-
+
                     # Start new chunk with overlap
                     overlap_lines = self._get_overlap_lines(chunk_to_save)
                     remaining_lines = current_chunk[split_point:]
                     current_chunk = overlap_lines + remaining_lines
                     current_size = sum(len(line) + 1 for line in current_chunk)
                     line_start = line_start + split_point - len(overlap_lines)
-
+                    # Reset code tracking for new chunk
+                    code_languages = []
+                    has_code = False
+
         # Add final chunk
         if current_chunk:
+            chunk_metadata = self._build_markdown_metadata(
+                current_hierarchy, code_languages, has_code
+            )
             chunks.append(self._create_chunk(
                 content='\n'.join(current_chunk),
                 filename=filename,
                 section=self._build_section_path(current_hierarchy),
                 start_line=line_start,
-                end_line=len(lines)
+                end_line=len(lines),
+                metadata=chunk_metadata
            ))
-
+
         return chunks
 
     def _chunk_python_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:

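The subtle step above is the hierarchy update on each header line. A standalone trace of current_hierarchy[:header_level-1] + [header_text] combined with the ' > ' join used by _build_section_path, in plain Python with no SDK imports:

# Trace of the hierarchy update rule used by _chunk_markdown_enhanced.
hierarchy = []
for level, text in [(1, 'Guide'), (2, 'Install'), (3, 'Linux'), (2, 'Usage')]:
    hierarchy = hierarchy[:level - 1] + [text]
    print(' > '.join(hierarchy))

# Output:
# Guide
# Guide > Install
# Guide > Install > Linux
# Guide > Usage
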
@@ -575,6 +626,49 @@ class DocumentProcessor:
     def _build_section_path(self, hierarchy: List[str]) -> str:
         """Build hierarchical section path from header hierarchy"""
         return ' > '.join(hierarchy) if hierarchy else None
+
+    def _build_markdown_metadata(self, hierarchy: List[str], code_languages: List[str], has_code: bool) -> Dict[str, Any]:
+        """Build rich metadata for markdown chunks
+
+        Args:
+            hierarchy: Current header hierarchy (e.g., ['Installation', 'Requirements', 'Python'])
+            code_languages: List of code block languages found in chunk (e.g., ['python', 'bash'])
+            has_code: Whether chunk contains any code blocks
+
+        Returns:
+            Dictionary with markdown-specific metadata including tags
+        """
+        metadata = {
+            'chunk_type': 'markdown',
+        }
+
+        # Add header level metadata
+        if hierarchy:
+            for i, header in enumerate(hierarchy, 1):
+                metadata[f'h{i}'] = header
+
+        # Add code-related metadata
+        if has_code:
+            metadata['has_code'] = True
+            if code_languages:
+                metadata['code_languages'] = code_languages
+
+        # Build tags for enhanced searching
+        tags = []
+        if has_code:
+            tags.append('code')
+            # Add language-specific tags
+            for lang in code_languages:
+                tags.append(f'code:{lang}')
+
+        # Add tags for header levels (searchable by section depth)
+        if len(hierarchy) > 0:
+            tags.append(f'depth:{len(hierarchy)}')
+
+        if tags:
+            metadata['tags'] = tags
+
+        return metadata
 
     def _build_python_section(self, class_name: Optional[str], function_name: Optional[str]) -> str:
         """Build section name for Python code"""

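As a worked example, re-deriving _build_markdown_metadata by hand for a chunk under 'Installation > Python' that contains one python code block; this mirrors the logic above rather than calling the SDK:

# Expected metadata for hierarchy=['Installation', 'Python'],
# code_languages=['python'], has_code=True, following the code above.
metadata = {'chunk_type': 'markdown'}
for i, header in enumerate(['Installation', 'Python'], 1):
    metadata[f'h{i}'] = header
metadata['has_code'] = True
metadata['code_languages'] = ['python']
metadata['tags'] = ['code', 'code:python', 'depth:2']

print(metadata)
# {'chunk_type': 'markdown', 'h1': 'Installation', 'h2': 'Python',
#  'has_code': True, 'code_languages': ['python'],
#  'tags': ['code', 'code:python', 'depth:2']}
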
{signalwire_agents-0.1.51 → signalwire_agents-0.1.53}/signalwire_agents/search/search_engine.py
RENAMED
@@ -114,51 +114,48 @@ class SearchEngine:
             logger.error(f"Error converting query vector: {e}")
             return self._keyword_search_only(enhanced_text, count, tags, original_query)
 
+        # HYBRID APPROACH: Search vector AND metadata in parallel
+        # Stage 1: Run both search types simultaneously
+        search_multiplier = 3
+
+        # Vector search (semantic similarity - primary ranking signal)
+        vector_results = self._vector_search(query_array, count * search_multiplier)
+
+        # Metadata/keyword searches (confirmation signals and backfill)
+        filename_results = self._filename_search(original_query or enhanced_text, count * search_multiplier)
+        metadata_results = self._metadata_search(original_query or enhanced_text, count * search_multiplier)
+        keyword_results = self._keyword_search(enhanced_text, count * search_multiplier, original_query)
+
+        logger.debug(f"Parallel search: vector={len(vector_results)}, filename={len(filename_results)}, "
+                     f"metadata={len(metadata_results)}, keyword={len(keyword_results)}")
+
+        # Stage 2: Merge all results into candidate pool
         candidates = {}
+
+        # Add vector results first (primary signal)
+        for result in vector_results:
+            chunk_id = result['id']
+            candidates[chunk_id] = result
+            candidates[chunk_id]['vector_score'] = result['score']
+            candidates[chunk_id]['vector_distance'] = 1 - result['score']
+            candidates[chunk_id]['sources'] = {'vector': True}
+            candidates[chunk_id]['source_scores'] = {'vector': result['score']}
+
+        # Add metadata/keyword results (secondary signals that boost or backfill)
+        for result_set, source_type, source_weight in [(filename_results, 'filename', 2.0),
+                                                        (metadata_results, 'metadata', 1.5),
+                                                        (keyword_results, 'keyword', 1.0)]:
             for result in result_set:
                 chunk_id = result['id']
                 if chunk_id not in candidates:
+                    # New candidate from metadata/keyword (no vector match)
                     candidates[chunk_id] = result
-                    candidates[chunk_id]['sources'] = {}
-                    candidates[chunk_id]['source_scores'] = {}
-
-        # Stage 2: Check if we have enough candidates
-        if len(candidates) < count * 2:
-            # Not enough candidates from fast searches - add full vector search
-            logger.debug(f"Only {len(candidates)} candidates from fast search, adding full vector search")
-            vector_results = self._vector_search(query_array, count * 3)
-
-            for result in vector_results:
-                chunk_id = result['id']
-                if chunk_id not in candidates:
-                    candidates[chunk_id] = result
-                    candidates[chunk_id]['sources'] = {'vector': True}
-                    candidates[chunk_id]['source_scores'] = {}
-
-                # Add vector score
-                candidates[chunk_id]['vector_score'] = result['score']
-                candidates[chunk_id]['vector_distance'] = 1 - result['score']
-        else:
-            # We have enough candidates - just re-rank them with vectors
-            logger.debug(f"Re-ranking {len(candidates)} candidates with vector similarity")
-            self._add_vector_scores_to_candidates(candidates, query_array, distance_threshold)
+                    candidates[chunk_id]['sources'] = {source_type: True}
+                    candidates[chunk_id]['source_scores'] = {source_type: result['score'] * source_weight}
+                else:
+                    # Exists in vector results - add metadata/keyword as confirmation signal
+                    candidates[chunk_id]['sources'][source_type] = True
+                    candidates[chunk_id]['source_scores'][source_type] = result['score'] * source_weight
 
         # Stage 3: Score and rank all candidates
         final_results = []

@@ -190,12 +187,12 @@ class SearchEngine:
 
         # Apply diversity penalties to prevent single-file dominance
         final_results = self._apply_diversity_penalties(final_results, count)
-
+
         # Ensure 'score' field exists for CLI compatibility
         for r in final_results:
             if 'score' not in r:
                 r['score'] = r.get('final_score', 0.0)
-
+
         return final_results[:count]
 
     def _keyword_search_only(self, enhanced_text: str, count: int,

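Concretely, a chunk returned by both the vector search and the filename search ends up in the candidate pool shaped roughly like this; the id and scores are invented for the example, and the real entry also keeps the chunk's own content and metadata fields:

# Example candidate after Stage 2 merging: vector score 0.82,
# filename match score 0.4 weighted by 2.0.
candidate = {
    'id': 'chunk-42',
    'vector_score': 0.82,
    'vector_distance': 0.18,
    'sources': {'vector': True, 'filename': True},
    'source_scores': {'vector': 0.82, 'filename': 0.8},
}
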
@@ -1038,70 +1035,55 @@ class SearchEngine:
             logger.error(f"Error in vector re-ranking: {e}")
 
     def _calculate_combined_score(self, candidate: Dict, distance_threshold: float) -> float:
-        """Calculate final score
+        """Calculate final score with hybrid vector + metadata weighting
+
+        Hybrid approach:
+        - Vector score is the primary ranking signal (semantic similarity)
+        - Metadata/keyword matches provide confirmation boost
+        - Multiple signal types indicate high relevance (confirmation bonus)
+        - Special boost for 'code' tag matches when query contains code-related terms
+        """
         sources = candidate.get('sources', {})
-
-        match_coverage = candidate.get('match_coverage', 0)
-        fields_matched = candidate.get('fields_matched', 0)
-
-        # Calculate base score with exponential boost for multiple sources
-        if num_sources > 1:
-            # Multiple signal matches are exponentially better
-            multi_signal_boost = 1.0 + (0.3 * (num_sources - 1))
-            base_score = sum(source_scores.values()) * multi_signal_boost
-        else:
-            base_score = sum(source_scores.values())
-
-        # Apply comprehensive match bonus
-        if match_coverage > 0.5:  # More than 50% of query terms matched
-            coverage_bonus = 1.0 + (match_coverage - 0.5) * 0.5
-            base_score *= coverage_bonus
-
-        # Apply field diversity bonus (matching in multiple metadata fields)
-        if fields_matched > 2:
-            field_bonus = 1.0 + (fields_matched - 2) * 0.1
-            base_score *= field_bonus
-
-        # Apply vector similarity multiplier if available
+        source_scores = candidate.get('source_scores', {})
+
+        # Vector score is PRIMARY
         if 'vector_score' in candidate:
             vector_score = candidate['vector_score']
+            base_score = vector_score
+
+            # Metadata/keyword matches provide confirmation boost
+            if len(sources) > 1:
+                # Has both vector AND metadata/keyword matches - strong confirmation signal
+                keyword_signals = sum(source_scores.get(k, 0) for k in ['keyword', 'filename', 'metadata'])
+                if keyword_signals > 0:
+                    # Normalize and apply boost (up to 30% for strong confirmation)
+                    keyword_boost = min(0.3, keyword_signals * 0.15)
+                    base_score = vector_score * (1.0 + keyword_boost)
+
+                # Additional boost if multiple signal types confirm (2+ sources)
+                num_metadata_sources = sum(1 for s in ['keyword', 'filename', 'metadata'] if s in sources)
+                if num_metadata_sources >= 2:
+                    # Multiple confirmation signals - very high confidence
+                    base_score *= 1.1
+
+            # Check for code-related tags to boost code examples
+            tags = candidate.get('metadata', {}).get('tags', [])
+            if 'code' in tags:
+                # This chunk contains code - boost if query is code-related
+                # (metadata search would have found it if query mentioned code/example/python/etc)
+                if 'metadata' in sources or 'keyword' in sources:
+                    # Query matched code-related metadata - apply code boost
+                    base_score *= 1.2
+        else:
+            # No vector score - this is a keyword-only result (backfill)
+            # Use keyword scores but penalize for lack of semantic match
+            base_score = sum(source_scores.values()) * 0.6  # 40% penalty for no vector
+
+            # Still boost code chunks if metadata matched
+            tags = candidate.get('metadata', {}).get('tags', [])
+            if 'code' in tags and 'metadata' in sources:
+                base_score *= 1.15
 
         return base_score
 
     def _apply_diversity_penalties(self, results: List[Dict], target_count: int) -> List[Dict]:

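Tracing the new scoring path for the example candidate sketched earlier (vector score 0.82, weighted filename signal 0.8, no 'code' tag) shows how the confirmation boost works:

# Arithmetic trace of _calculate_combined_score for the example candidate.
vector_score = 0.82
source_scores = {'vector': 0.82, 'filename': 0.8}

keyword_signals = sum(source_scores.get(k, 0) for k in ['keyword', 'filename', 'metadata'])  # 0.8
keyword_boost = min(0.3, keyword_signals * 0.15)   # 0.12
base_score = vector_score * (1.0 + keyword_boost)  # ~0.918

# A keyword-only candidate carrying the same 0.8 signal would instead score
# sum(source_scores.values()) * 0.6 = 0.48, so hybrid matches rank well above backfill.
print(round(base_score, 3))  # 0.918
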
@@ -1166,7 +1148,65 @@ class SearchEngine:
             penalized_results[:target_count] = selected
 
         return penalized_results
-
+
+    def _apply_match_type_diversity(self, results: List[Dict], target_count: int) -> List[Dict]:
+        """Ensure diversity of match types in final results
+
+        Ensures we have a mix of:
+        - Vector-only matches (semantic similarity, good for code examples)
+        - Keyword-only matches (exact term matches)
+        - Hybrid matches (both vector + keyword/metadata)
+        """
+        if not results or len(results) <= target_count:
+            return results
+
+        # Categorize results by match type
+        vector_only = []
+        keyword_only = []
+        hybrid = []
+
+        for result in results:
+            sources = result.get('sources', {})
+            has_vector = 'vector' in sources
+            has_keyword = any(k in sources for k in ['keyword', 'filename', 'metadata'])
+
+            if has_vector and not has_keyword:
+                vector_only.append(result)
+            elif has_keyword and not has_vector:
+                keyword_only.append(result)
+            else:
+                hybrid.append(result)
+
+        # Build diverse result set
+        # Target distribution: 40% hybrid, 40% vector-only, 20% keyword-only
+        # This ensures we include semantic matches (code examples) even if keywords don't match
+        diversified = []
+
+        # Take top hybrid matches first (best overall)
+        hybrid_target = max(1, int(target_count * 0.4))
+        diversified.extend(hybrid[:hybrid_target])
+
+        # Ensure we have vector-only matches (critical for code examples)
+        vector_target = max(1, int(target_count * 0.4))
+        diversified.extend(vector_only[:vector_target])
+
+        # Add keyword-only matches
+        keyword_target = max(1, int(target_count * 0.2))
+        diversified.extend(keyword_only[:keyword_target])
+
+        # Fill remaining slots with best remaining results regardless of type
+        remaining_slots = target_count - len(diversified)
+        if remaining_slots > 0:
+            # Get all unused results
+            used_ids = set(r['id'] for r in diversified)
+            unused = [r for r in results if r['id'] not in used_ids]
+            diversified.extend(unused[:remaining_slots])
+
+        # Sort by final score to maintain quality ordering
+        diversified.sort(key=lambda x: x['final_score'], reverse=True)
+
+        return diversified
+
     def get_stats(self) -> Dict[str, Any]:
         """Get statistics about the search index"""
         # Use pgvector backend if available