minder-cli 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. minder/__init__.py +12 -0
  2. minder/api/routers/prompts.py +177 -0
  3. minder/application/__init__.py +1 -0
  4. minder/application/admin/__init__.py +11 -0
  5. minder/application/admin/dto.py +453 -0
  6. minder/application/admin/jobs.py +327 -0
  7. minder/application/admin/use_cases.py +1895 -0
  8. minder/auth/__init__.py +12 -0
  9. minder/auth/context.py +26 -0
  10. minder/auth/middleware.py +70 -0
  11. minder/auth/principal.py +59 -0
  12. minder/auth/rate_limiter.py +89 -0
  13. minder/auth/rbac.py +60 -0
  14. minder/auth/service.py +541 -0
  15. minder/bootstrap/__init__.py +9 -0
  16. minder/bootstrap/providers.py +109 -0
  17. minder/bootstrap/transport.py +807 -0
  18. minder/cache/__init__.py +10 -0
  19. minder/cache/providers.py +140 -0
  20. minder/chunking/__init__.py +4 -0
  21. minder/chunking/code_splitter.py +184 -0
  22. minder/chunking/splitter.py +136 -0
  23. minder/cli.py +1542 -0
  24. minder/config.py +179 -0
  25. minder/continuity.py +363 -0
  26. minder/dev.py +160 -0
  27. minder/embedding/__init__.py +9 -0
  28. minder/embedding/base.py +7 -0
  29. minder/embedding/local.py +65 -0
  30. minder/embedding/openai.py +7 -0
  31. minder/graph/__init__.py +11 -0
  32. minder/graph/edges.py +13 -0
  33. minder/graph/executor.py +127 -0
  34. minder/graph/graph.py +263 -0
  35. minder/graph/nodes/__init__.py +27 -0
  36. minder/graph/nodes/evaluator.py +21 -0
  37. minder/graph/nodes/guard.py +64 -0
  38. minder/graph/nodes/llm.py +59 -0
  39. minder/graph/nodes/planning.py +30 -0
  40. minder/graph/nodes/reasoning.py +87 -0
  41. minder/graph/nodes/reranker.py +141 -0
  42. minder/graph/nodes/retriever.py +86 -0
  43. minder/graph/nodes/verification.py +230 -0
  44. minder/graph/nodes/workflow_planner.py +250 -0
  45. minder/graph/runtime.py +15 -0
  46. minder/graph/state.py +26 -0
  47. minder/llm/__init__.py +5 -0
  48. minder/llm/base.py +14 -0
  49. minder/llm/local.py +381 -0
  50. minder/llm/openai.py +89 -0
  51. minder/models/__init__.py +109 -0
  52. minder/models/base.py +10 -0
  53. minder/models/client.py +137 -0
  54. minder/models/document.py +34 -0
  55. minder/models/error.py +32 -0
  56. minder/models/graph.py +114 -0
  57. minder/models/history.py +32 -0
  58. minder/models/job.py +62 -0
  59. minder/models/prompt.py +41 -0
  60. minder/models/repository.py +62 -0
  61. minder/models/rule.py +68 -0
  62. minder/models/session.py +51 -0
  63. minder/models/skill.py +52 -0
  64. minder/models/user.py +41 -0
  65. minder/models/workflow.py +35 -0
  66. minder/observability/__init__.py +57 -0
  67. minder/observability/audit.py +243 -0
  68. minder/observability/logging.py +253 -0
  69. minder/observability/metrics.py +448 -0
  70. minder/observability/tracing.py +215 -0
  71. minder/presentation/__init__.py +1 -0
  72. minder/presentation/http/__init__.py +1 -0
  73. minder/presentation/http/admin/__init__.py +3 -0
  74. minder/presentation/http/admin/api.py +1309 -0
  75. minder/presentation/http/admin/context.py +94 -0
  76. minder/presentation/http/admin/dashboard.py +111 -0
  77. minder/presentation/http/admin/jobs.py +208 -0
  78. minder/presentation/http/admin/memories.py +185 -0
  79. minder/presentation/http/admin/prompts.py +219 -0
  80. minder/presentation/http/admin/routes.py +127 -0
  81. minder/presentation/http/admin/runtime.py +650 -0
  82. minder/presentation/http/admin/search.py +368 -0
  83. minder/presentation/http/admin/skills.py +230 -0
  84. minder/prompts/__init__.py +646 -0
  85. minder/prompts/formatter.py +142 -0
  86. minder/resources/__init__.py +318 -0
  87. minder/retrieval/__init__.py +5 -0
  88. minder/retrieval/hybrid.py +178 -0
  89. minder/retrieval/mmr.py +116 -0
  90. minder/retrieval/multi_hop.py +115 -0
  91. minder/runtime.py +15 -0
  92. minder/server.py +145 -0
  93. minder/store/__init__.py +64 -0
  94. minder/store/document.py +115 -0
  95. minder/store/error.py +82 -0
  96. minder/store/feedback.py +114 -0
  97. minder/store/graph.py +588 -0
  98. minder/store/history.py +57 -0
  99. minder/store/interfaces.py +512 -0
  100. minder/store/milvus/__init__.py +11 -0
  101. minder/store/milvus/client.py +26 -0
  102. minder/store/milvus/collections.py +15 -0
  103. minder/store/milvus/vector_store.py +232 -0
  104. minder/store/mongodb/__init__.py +11 -0
  105. minder/store/mongodb/client.py +49 -0
  106. minder/store/mongodb/indexes.py +90 -0
  107. minder/store/mongodb/operational_store.py +993 -0
  108. minder/store/relational.py +1087 -0
  109. minder/store/repo_state.py +58 -0
  110. minder/store/rule.py +93 -0
  111. minder/store/vector.py +79 -0
  112. minder/tools/__init__.py +47 -0
  113. minder/tools/auth.py +94 -0
  114. minder/tools/graph.py +839 -0
  115. minder/tools/ingest.py +353 -0
  116. minder/tools/memory.py +381 -0
  117. minder/tools/query.py +307 -0
  118. minder/tools/registry.py +269 -0
  119. minder/tools/repo_scanner.py +1266 -0
  120. minder/tools/search.py +15 -0
  121. minder/tools/session.py +316 -0
  122. minder/tools/skills.py +899 -0
  123. minder/tools/workflow.py +215 -0
  124. minder/transport/__init__.py +4 -0
  125. minder/transport/base.py +286 -0
  126. minder/transport/sse.py +252 -0
  127. minder/transport/stdio.py +29 -0
  128. minder_cli-0.2.0.dist-info/METADATA +318 -0
  129. minder_cli-0.2.0.dist-info/RECORD +132 -0
  130. minder_cli-0.2.0.dist-info/WHEEL +4 -0
  131. minder_cli-0.2.0.dist-info/entry_points.txt +2 -0
  132. minder_cli-0.2.0.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,142 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ from dataclasses import dataclass
6
+
7
+ from minder.config import MinderConfig
8
+
9
+
10
+ @dataclass(slots=True)
11
+ class PromptDraft:
12
+ name: str
13
+ title: str
14
+ description: str
15
+ content_template: str
16
+ arguments: list[str]
17
+
18
+
19
+ def _normalize_arguments(arguments: list[str]) -> list[str]:
20
+ normalized: list[str] = []
21
+ seen: set[str] = set()
22
+ for argument in arguments:
23
+ value = str(argument).strip()
24
+ if not value or value in seen:
25
+ continue
26
+ seen.add(value)
27
+ normalized.append(value)
28
+ return normalized
29
+
30
+
31
+ def _humanize_name(name: str) -> str:
32
+ parts = [part for part in re.split(r"[_\-\s]+", name.strip()) if part]
33
+ return " ".join(part.capitalize() for part in parts) or "Prompt"
34
+
35
+
36
def _heuristic_polish(draft: PromptDraft) -> PromptDraft:
    """Build a structured prompt from *draft* without calling any model.

    Blank title/description fields are filled with defaults derived from the
    draft name, arguments are normalised, and the template is wrapped into
    ``Role`` / ``Inputs`` / ``Task`` / ``Output Requirements`` sections.

    Returns:
        A new :class:`PromptDraft`; the input draft is not modified.
    """
    arguments = _normalize_arguments(draft.arguments)

    title = draft.title.strip()
    if not title:
        title = _humanize_name(draft.name)

    description = draft.description.strip()
    if not description:
        description = (
            f"Guide the model to act as {title.lower()} with grounded, actionable output."
        )

    # One bullet per placeholder, rendered with literal braces.
    if arguments:
        inputs_body = "\n".join("- {" + argument + "}" for argument in arguments)
    else:
        inputs_body = "- No named placeholders required."

    task_body = draft.content_template.strip()
    if task_body:
        task_section = "## Task\n" + task_body
    else:
        task_section = (
            "## Task\nRespond with a concrete, well-structured answer tailored to the provided inputs."
        )

    sections = [
        f"## Role\nYou are {title}. {description}",
        f"## Inputs\n{inputs_body}",
        task_section,
        "## Output Requirements\n- Be specific and practical.\n- Preserve important technical constraints.\n- Avoid filler and generic advice.",
    ]

    return PromptDraft(
        name=draft.name.strip(),
        title=title,
        description=description,
        content_template="\n\n".join(sections),
        arguments=arguments,
    )
69
+
70
+
71
+ def _extract_json_object(raw: str) -> dict[str, object] | None:
72
+ if not raw.strip():
73
+ return None
74
+ candidates = re.findall(r"\{.*\}", raw, flags=re.DOTALL)
75
+ for candidate in candidates:
76
+ try:
77
+ value = json.loads(candidate)
78
+ except json.JSONDecodeError:
79
+ continue
80
+ if isinstance(value, dict):
81
+ return value
82
+ return None
83
+
84
+
85
def polish_prompt_draft(
    draft: PromptDraft, config: MinderConfig
) -> tuple[PromptDraft, dict[str, str]]:
    """Polish *draft* heuristically, then refine it with the local LLM.

    The heuristic result is always computed first and serves both as the
    input sent to the model and as the fallback when the model's reply does
    not contain a usable JSON object.

    Args:
        draft: The prompt draft to polish.
        config: Application configuration; ``config.llm`` supplies the model
            path, context length, temperature and model name.

    Returns:
        A ``(draft, info)`` tuple. ``info`` has the keys ``provider``
        (``"local_llm"`` when the model reply was used, ``"heuristic"``
        otherwise), ``model`` and ``runtime``.
    """
    # Imported lazily so that importing this module does not pull in the
    # local LLM module.
    from minder.llm.local import LocalModelLLM

    polished = _heuristic_polish(draft)
    llm = LocalModelLLM(
        config.llm.model_path,
        runtime="auto",
        context_length=config.llm.context_length,
    )
    runtime = llm.runtime

    instruction = """You are polishing an MCP prompt template for an engineering assistant.
Return only valid JSON with keys: title, description, content_template.
Keep placeholders exactly as provided, for example {repo_name} or {error}.
Do not invent new placeholders.
Make the prompt direct, structured, and useful for a coding workflow.
"""
    request_payload = {
        "name": polished.name,
        "title": polished.title,
        "description": polished.description,
        "arguments": polished.arguments,
        "content_template": polished.content_template,
    }
    llm_response = llm.complete_text(
        f"{instruction}\n\nDraft:\n{json.dumps(request_payload, ensure_ascii=True, indent=2)}",
        max_tokens=900,
        # Clamp to a low-variance range regardless of the configured value.
        temperature=min(max(config.llm.temperature, 0.05), 0.3),
        fallback="",
    )
    parsed = _extract_json_object(llm_response)
    if not parsed:
        return polished, {
            "provider": "heuristic",
            "model": config.llm.model_name,
            "runtime": runtime,
        }

    def _text_field(key: str, default: str) -> str:
        # Only accept non-blank strings from the model. A JSON null or a
        # number would otherwise be stringified into the prompt ("None").
        value = parsed.get(key)
        if isinstance(value, str):
            cleaned = value.strip()
            if cleaned:
                return cleaned
        return default

    merged = PromptDraft(
        name=polished.name,
        title=_text_field("title", polished.title),
        description=_text_field("description", polished.description),
        content_template=_text_field("content_template", polished.content_template),
        # The model is never allowed to change the declared arguments.
        arguments=polished.arguments,
    )
    return merged, {
        "provider": "local_llm",
        # Report the configured model, matching the heuristic branch above,
        # instead of a hard-coded model identifier.
        "model": config.llm.model_name,
        "runtime": runtime,
    }
@@ -0,0 +1,318 @@
1
+ """MCP resource registration for Minder."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from collections import Counter
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ from mcp.server.fastmcp import FastMCP
11
+
12
+ from minder.store.interfaces import IGraphRepository, IOperationalStore
13
+
14
+
15
class ResourceRegistry:
    """Registers all Minder MCP resources onto a :class:`FastMCP` app."""

    @staticmethod
    def register(
        app: FastMCP,
        store: IOperationalStore,
        graph_store: IGraphRepository | None = None,
    ) -> None:
        """Register core Minder resources, and graph resources when available.

        Args:
            app: The FastMCP application to register resources with.
            store: An initialised operational store used to fetch live data.
            graph_store: Optional graph repository. When ``None`` only the
                core resources (skills, repos, stats) are registered; when
                given, the per-repository ``minder://repos/{repo_name}/...``
                resources are registered as well.
        """

        # ------------------------------------------------------------------
        # minder://skills
        # ------------------------------------------------------------------

        @app.resource(
            "minder://skills",
            name="minder_skills",
            title="Minder Skills",
            description=(
                "List all stored skills with their id, title, language, and tags."
            ),
            mime_type="application/json",
        )
        async def skills_resource() -> str:
            skills = await store.list_skills()
            return json.dumps(
                [
                    {
                        "id": str(s.id),
                        "title": s.title,
                        "language": getattr(s, "language", ""),
                        "tags": list(s.tags) if s.tags else [],
                    }
                    for s in skills
                ],
                indent=2,
            )

        # ------------------------------------------------------------------
        # minder://repos
        # ------------------------------------------------------------------

        @app.resource(
            "minder://repos",
            name="minder_repos",
            title="Minder Repositories",
            description=(
                "List all repositories with their name, URL, and current workflow state."
            ),
            mime_type="application/json",
        )
        async def repos_resource() -> str:
            repos = await store.list_repositories()
            result: list[dict[str, Any]] = []
            for repo in repos:
                state = await store.get_workflow_state_by_repo(repo.id)
                # ``workflow_state`` stays null for repos without a workflow.
                workflow_info: dict[str, Any] | None = None
                if state is not None:
                    workflow_info = {
                        "current_step": state.current_step,
                        "completed_steps": list(state.completed_steps),
                        "blocked_by": list(state.blocked_by),
                    }
                result.append(
                    {
                        "id": str(repo.id),
                        "name": repo.repo_name,
                        "url": getattr(repo, "repo_url", ""),
                        "workflow_state": workflow_info,
                    }
                )
            return json.dumps(result, indent=2)

        # ------------------------------------------------------------------
        # minder://stats
        # ------------------------------------------------------------------

        @app.resource(
            "minder://stats",
            name="minder_stats",
            title="Minder Statistics",
            description=(
                "Aggregated counts: total skills, repos, workflows, and recorded errors."
            ),
            mime_type="application/json",
        )
        async def stats_resource() -> str:
            skills = await store.list_skills()
            repos = await store.list_repositories()
            workflows = await store.list_workflows()
            errors = await store.list_errors()
            return json.dumps(
                {
                    "skill_count": len(skills),
                    "repo_count": len(repos),
                    "workflow_count": len(workflows),
                    "error_count": len(errors),
                },
                indent=2,
            )

        if graph_store is None:
            return

        # ------------------------------------------------------------------
        # Graph-backed resources (minder://repos/{repo_name}/...)
        # ------------------------------------------------------------------

        # Bind the non-optional repository once for the closures below.
        graph: IGraphRepository = graph_store

        def _node_type(node: Any) -> str:
            """Return the node's type as a string ('' when absent)."""
            return str(getattr(node, "node_type", ""))

        def _line_number(item: dict[str, Any]) -> int:
            """Return the item's ``line`` metadata as an int for sorting.

            Missing or malformed values count as 0, so one bad node cannot
            make the whole resource fail.
            """
            try:
                return int(item["metadata"].get("line", 0) or 0)
            except (TypeError, ValueError):
                return 0

        async def _inventory(repo_name: str, node_types: set[str], sort_key: Any) -> str:
            """Serialize the repo's nodes of the given types as a sorted JSON inventory."""
            repo_nodes = await _repo_graph_nodes(graph, repo_name)
            items = [
                _serialize_graph_node(node)
                for node in repo_nodes
                if _node_type(node) in node_types
            ]
            items.sort(key=sort_key)
            return json.dumps(
                {
                    "repo_name": repo_name,
                    "count": len(items),
                    "items": items,
                },
                indent=2,
            )

        @app.resource(
            "minder://repos/{repo_name}/structure",
            name="minder_repo_structure",
            title="Minder Repository Structure",
            description="Graph-backed structural summary for a repository, grouped by node type.",
            mime_type="application/json",
        )
        async def repo_structure_resource(repo_name: str) -> str:
            repo_nodes = await _repo_graph_nodes(graph, repo_name)
            counts = Counter(_node_type(node) for node in repo_nodes)
            grouped: dict[str, list[dict[str, Any]]] = {}
            ordered = sorted(
                repo_nodes,
                key=lambda node: (_node_type(node), str(getattr(node, "name", ""))),
            )
            for node in ordered:
                item = _serialize_graph_node(node)
                grouped.setdefault(item["node_type"], []).append(item)
            return json.dumps(
                {
                    "repo_name": repo_name,
                    "counts": dict(counts),
                    "nodes": grouped,
                },
                indent=2,
            )

        @app.resource(
            "minder://repos/{repo_name}/todos",
            name="minder_repo_todos",
            title="Minder Repository TODOs",
            description="Graph-backed TODO items extracted for a repository.",
            mime_type="application/json",
        )
        async def repo_todos_resource(repo_name: str) -> str:
            return await _inventory(
                repo_name,
                {"todo"},
                lambda item: (
                    str(item["metadata"].get("path", "")),
                    _line_number(item),
                    item["name"],
                ),
            )

        @app.resource(
            "minder://repos/{repo_name}/routes",
            name="minder_repo_routes",
            title="Minder Repository Routes",
            description="Graph-backed route inventory for a repository.",
            mime_type="application/json",
        )
        async def repo_routes_resource(repo_name: str) -> str:
            return await _inventory(
                repo_name,
                {"route"},
                lambda item: (
                    str(item["metadata"].get("method", "")),
                    str(item["metadata"].get("route_path", "")),
                    item["name"],
                ),
            )

        @app.resource(
            "minder://repos/{repo_name}/dependencies",
            name="minder_repo_dependencies",
            title="Minder Repository Dependencies",
            description="Graph-backed internal and external dependency summary for a repository.",
            mime_type="application/json",
        )
        async def repo_dependencies_resource(repo_name: str) -> str:
            repo_nodes = await _repo_graph_nodes(graph, repo_name)
            repo_node_ids = {str(getattr(node, "id")) for node in repo_nodes}
            services = [node for node in repo_nodes if _node_type(node) == "service"]
            internal_dependencies: list[dict[str, Any]] = []
            for service in services:
                neighbors = await graph.get_neighbors(
                    getattr(service, "id"), direction="out", relation="depends_on"
                )
                # Only neighbours that belong to the same repository count as
                # internal dependencies.
                targets = [
                    {
                        "id": str(getattr(neighbor, "id")),
                        "name": str(getattr(neighbor, "name", "")),
                        "node_type": str(getattr(neighbor, "node_type", "")),
                    }
                    for neighbor in neighbors
                    if str(getattr(neighbor, "id")) in repo_node_ids
                ]
                if targets:
                    internal_dependencies.append(
                        {
                            "service": str(getattr(service, "name", "")),
                            "depends_on": sorted(targets, key=lambda item: item["name"]),
                        }
                    )

            external_apis = [
                _serialize_graph_node(node)
                for node in repo_nodes
                if _node_type(node) == "external_service_api"
            ]
            external_apis.sort(key=lambda item: item["name"])
            return json.dumps(
                {
                    "repo_name": repo_name,
                    "internal_dependencies": internal_dependencies,
                    "external_services": external_apis,
                },
                indent=2,
            )

        @app.resource(
            "minder://repos/{repo_name}/symbols",
            name="minder_repo_symbols",
            title="Minder Repository Symbols",
            description="Graph-backed symbol inventory for functions, classes, controllers, and interfaces within a repository.",
            mime_type="application/json",
        )
        async def repo_symbols_resource(repo_name: str) -> str:
            return await _inventory(
                repo_name,
                {"function", "class", "controller", "interface", "abstract_class", "module"},
                lambda item: (
                    item["node_type"],
                    str(item["metadata"].get("path", "")),
                    item["name"],
                ),
            )
290
+
291
+
292
+ def _serialize_graph_node(node: Any) -> dict[str, Any]:
293
+ metadata = getattr(node, "node_metadata", {}) or {}
294
+ return {
295
+ "id": str(getattr(node, "id")),
296
+ "node_type": str(getattr(node, "node_type", "")),
297
+ "name": str(getattr(node, "name", "")),
298
+ "metadata": metadata if isinstance(metadata, dict) else {},
299
+ }
300
+
301
+
302
+ async def _repo_graph_nodes(graph_store: IGraphRepository, repo_name: str) -> list[Any]:
303
+ nodes = await graph_store.list_nodes()
304
+ selected: list[Any] = []
305
+ for node in nodes:
306
+ metadata = getattr(node, "node_metadata", {}) or {}
307
+ project = str(metadata.get("project", "") or "")
308
+ path_value = str(metadata.get("path", "") or "")
309
+ if project == repo_name:
310
+ selected.append(node)
311
+ continue
312
+ if path_value:
313
+ try:
314
+ if Path(path_value).name == repo_name:
315
+ selected.append(node)
316
+ except (TypeError, ValueError):
317
+ continue
318
+ return selected
@@ -0,0 +1,5 @@
1
+ from minder.retrieval.hybrid import HybridRetriever
2
+ from minder.retrieval.mmr import mmr_rerank
3
+ from minder.retrieval.multi_hop import MultiHopRetriever, RetrieveFn
4
+
5
+ __all__ = ["HybridRetriever", "MultiHopRetriever", "RetrieveFn", "mmr_rerank"]
@@ -0,0 +1,178 @@
1
+ """
2
+ BM25 + Vector hybrid retrieval.
3
+
4
+ Combines normalised BM25 keyword scores with normalised vector similarity
5
+ scores using a configurable alpha blend:
6
+
7
+ combined = alpha * vector_score + (1 - alpha) * bm25_score
8
+
9
+ alpha = 1.0 → pure vector search
10
+ alpha = 0.0 → pure BM25
11
+ alpha = 0.5 → equal blend (default)
12
+
13
+ BM25 is implemented in pure Python (no external index server required).
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import math
19
+ from collections import Counter
20
+ from typing import Any
21
+
22
+
23
+ # ---------------------------------------------------------------------------
24
+ # BM25 helpers
25
+ # ---------------------------------------------------------------------------
26
+
27
+ _BM25_K1 = 1.5
28
+ _BM25_B = 0.75
29
+
30
+
31
+ def _tokenize(text: str) -> list[str]:
32
+ return [tok for tok in text.lower().split() if len(tok) > 1]
33
+
34
+
35
def _bm25_score(
    query_terms: list[str],
    doc_tokens: list[str],
    doc_freq: dict[str, int],
    num_docs: int,
    avg_dl: float,
) -> float:
    """Compute the BM25 score of one document for the given query terms.

    Args:
        query_terms: Tokenised query; a repeated term contributes each time.
        doc_tokens: Tokens of the document being scored.
        doc_freq: Number of documents containing each term.
        num_docs: Total number of documents in the corpus.
        avg_dl: Average document length in tokens (values below 1 are
            treated as 1).

    Returns:
        The summed ``idf * tf_norm`` contributions; ``0.0`` when no query
        term occurs in the document.
    """
    term_counts: Counter[str] = Counter(doc_tokens)
    # Length-dependent part of the denominator; identical for every term.
    length_term = _BM25_K1 * (
        1.0 - _BM25_B + _BM25_B * len(doc_tokens) / max(avg_dl, 1.0)
    )
    total = 0.0
    for term in query_terms:
        occurrences = term_counts.get(term, 0)
        if occurrences != 0:
            containing = doc_freq.get(term, 0)
            idf = math.log((num_docs - containing + 0.5) / (containing + 0.5) + 1.0)
            tf_norm = (occurrences * (_BM25_K1 + 1.0)) / (occurrences + length_term)
            total += idf * tf_norm
    return total
56
+
57
+
58
+ def _min_max_normalize(scores: list[float]) -> list[float]:
59
+ if not scores:
60
+ return scores
61
+ lo, hi = min(scores), max(scores)
62
+ if math.isclose(hi, lo):
63
+ return [1.0] * len(scores)
64
+ span = hi - lo
65
+ return [(s - lo) / span for s in scores]
66
+
67
+
68
+ # ---------------------------------------------------------------------------
69
+ # HybridRetriever
70
+ # ---------------------------------------------------------------------------
71
+
72
+
73
class HybridRetriever:
    """
    Merge vector-search results with BM25 scores computed over a corpus.

    Args:
        alpha: blend coefficient in [0, 1].
            1.0 = pure vector, 0.0 = pure BM25.

    Raises:
        ValueError: if *alpha* lies outside [0, 1].
    """

    def __init__(self, alpha: float = 0.5) -> None:
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f"alpha must be in [0.0, 1.0], got {alpha}")
        self._alpha = alpha

    @property
    def alpha(self) -> float:
        """The blend coefficient given at construction time."""
        return self._alpha

    def merge(
        self,
        query: str,
        vector_results: list[dict[str, Any]],
        corpus: list[dict[str, Any]],
        *,
        limit: int = 5,
        content_key: str = "content",
        id_key: str = "path",
    ) -> list[dict[str, Any]]:
        """
        Merge *vector_results* with BM25 scores computed over *corpus*.

        Args:
            query: original user query (used for BM25 term matching).
            vector_results: documents returned by vector search, each with a
                ``"score"`` field (float, already normalised or raw cosine).
                A missing or ``None`` score counts as 0.
            corpus: the full candidate set to build the BM25 index over.
                Should include all docs in *vector_results* plus any extra
                candidates. May equal *vector_results* when no corpus is
                separately available; an empty corpus falls back to
                *vector_results*.
            limit: maximum number of merged results to return; values below
                zero are treated as zero.
            content_key: key in each doc dict that holds the text content.
            id_key: key used to de-duplicate documents across the two lists.
                Documents lacking this key are tracked by object identity,
                so they keep their own scores but cannot be matched against
                copies in the other list.

        Returns:
            Merged list of doc dicts sorted by descending combined score,
            enriched with ``"score"``, ``"vector_score"``, and
            ``"bm25_score"`` fields. The input dicts are not modified.
        """
        all_docs = corpus if corpus else vector_results
        if not all_docs:
            return []

        def doc_key(doc: dict[str, Any]) -> str:
            # One key function for every lookup below. Mixing different
            # fallbacks made documents without ``id_key`` miss their scores.
            if id_key in doc:
                return str(doc[id_key])
            return f"__anon__{id(doc)}"

        # ---- BM25 index ----
        tokenized = [_tokenize(str(doc.get(content_key, ""))) for doc in all_docs]
        avg_dl = sum(len(tokens) for tokens in tokenized) / max(len(tokenized), 1)
        doc_freq: Counter[str] = Counter()
        for tokens in tokenized:
            # Document frequency counts each term once per document.
            doc_freq.update(set(tokens))

        query_terms = _tokenize(query)
        raw_bm25 = [
            _bm25_score(query_terms, tokens, doc_freq, len(all_docs), avg_dl)
            for tokens in tokenized
        ]
        bm25_map: dict[str, float] = {
            doc_key(doc): score
            for doc, score in zip(all_docs, _min_max_normalize(raw_bm25))
        }

        # ---- Vector score map ----
        raw_vec = [float(doc.get("score") or 0.0) for doc in vector_results]
        vec_map: dict[str, float] = {
            doc_key(doc): score
            for doc, score in zip(vector_results, _min_max_normalize(raw_vec))
        }

        # ---- Union merge ----
        # Vector hits come first; corpus-only documents are appended.
        candidates = list(vector_results) + [
            doc for doc in all_docs if doc_key(doc) not in vec_map
        ]

        seen: set[str] = set()
        merged: list[dict[str, Any]] = []
        for doc in candidates:
            key = doc_key(doc)
            if key in seen:
                continue
            seen.add(key)
            v = vec_map.get(key, 0.0)
            b = bm25_map.get(key, 0.0)
            combined = round(self._alpha * v + (1.0 - self._alpha) * b, 6)
            merged.append(
                {
                    **doc,
                    "score": combined,
                    "vector_score": round(v, 6),
                    "bm25_score": round(b, 6),
                }
            )

        merged.sort(key=lambda d: float(d["score"]), reverse=True)
        return merged[: max(limit, 0)]