devmem-agents 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- devmem/__init__.py +5 -0
- devmem/api.py +257 -0
- devmem/config.py +34 -0
- devmem/embeddings.py +119 -0
- devmem/ingest.py +184 -0
- devmem/live_backend.py +344 -0
- devmem/main.py +11 -0
- devmem/models.py +157 -0
- devmem/retrieval_eval.py +145 -0
- devmem/service.py +280 -0
- devmem/storage/__init__.py +4 -0
- devmem/storage/milvus_store.py +321 -0
- devmem/storage/neptune_store.py +194 -0
- devmem/storage/record_store.py +974 -0
- devmem_agents-0.1.0.dist-info/METADATA +100 -0
- devmem_agents-0.1.0.dist-info/RECORD +19 -0
- devmem_agents-0.1.0.dist-info/WHEEL +5 -0
- devmem_agents-0.1.0.dist-info/licenses/LICENSE +21 -0
- devmem_agents-0.1.0.dist-info/top_level.txt +1 -0
devmem/service.py
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
"""Core service logic for devmem.
|
|
2
|
+
|
|
3
|
+
Wires the API layer to a `RecordStore` backend (in-memory or SQL) and
|
|
4
|
+
embedding provider. Session task descriptions are embedded at session start
|
|
5
|
+
to power `/v1/tasks/similar`.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from devmem.config import settings
|
|
13
|
+
from devmem.embeddings import embed_one, is_fallback_active
|
|
14
|
+
from devmem.models import SessionStartRequest, SessionStartResponse
|
|
15
|
+
from devmem.storage.record_store import RecordStore, build_record_store
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class NamespaceError(ValueError):
    """Signal that a request named a namespace the service does not accept.

    Derives from ``ValueError`` so callers that already handle bad request
    values also catch it.
    """
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class DevMemService:
    """Contract-first service layer with strict namespace guard.

    Every public operation except ``health`` first validates the supplied
    namespace against ``settings.namespace`` and then delegates to the
    configured ``RecordStore``.
    """

    def __init__(self, store: RecordStore | None = None) -> None:
        """Bind the service to ``store``, or build one from settings.

        Args:
            store: Backend to use. When ``None`` a store is built from
                ``settings.record_store_dsn``.
        """
        # Explicit ``is None`` rather than truthiness: a caller-supplied store
        # that happens to evaluate as falsy (for example an empty
        # container-like in-memory store) must not be silently replaced.
        if store is None:
            store = build_record_store(settings.record_store_dsn)
        self.store: RecordStore = store

    def _require_namespace(self, namespace: str) -> None:
        """Raise ``NamespaceError`` unless ``namespace`` equals ``settings.namespace``."""
        if namespace != settings.namespace:
            raise NamespaceError(
                f"invalid namespace '{namespace}'. expected '{settings.namespace}'"
            )

    def health(self) -> dict[str, Any]:
        """Return service identity plus configuration-derived status flags.

        The backend entries only report whether the corresponding setting is
        populated; no connection is attempted here.
        """
        return {
            "service": settings.service_name,
            "version": settings.service_version,
            "namespace": settings.namespace,
            "checks": {
                "api": "healthy",
                "milvus": "not_configured" if not settings.milvus_uri else "configured",
                "neptune": "not_configured" if not settings.neptune_endpoint else "configured",
                "aurora": "not_configured" if not settings.aurora_dsn else "configured",
                "record_store": "sql" if settings.record_store_dsn else "memory",
                "embeddings": "fallback_sha256" if is_fallback_active() else settings.embedding_model,
            },
        }

    # ---- sessions --------------------------------------------------------

    def start_session(self, req: SessionStartRequest) -> SessionStartResponse:
        """Create and persist a new session for ``req``.

        The task description, when non-empty, is embedded up front and stored
        with the session so later similarity lookups can use it.

        Raises:
            NamespaceError: If ``req.namespace`` is not the configured namespace.
        """
        self._require_namespace(req.namespace)
        session = SessionStartResponse()
        task_embedding = embed_one(req.task) if req.task else None
        self.store.create_session(
            session_id=session.session_id,
            namespace=req.namespace,
            project=req.project,
            repo=req.repo,
            branch=req.branch,
            agent=req.agent,
            task=req.task,
            started_at=session.started_at,
            task_embedding=task_embedding,
        )
        return session

    # ---- retrieval -------------------------------------------------------

    def pull_context(
        self,
        *,
        namespace: str,
        session_id: str,
        project: str,
        repo: str,
        task: str,
        top_k: int,
    ) -> dict[str, Any]:
        """Return up to ``top_k`` artifacts and decisions for a project/repo.

        ``session_id`` and ``task`` are echoed back in the result; they are
        not used to filter or rank the records.

        Raises:
            NamespaceError: If ``namespace`` is not the configured namespace.
        """
        self._require_namespace(namespace)
        artifacts = self.store.query_artifacts(
            namespace=namespace, project=project, repo=repo, limit=top_k
        )
        decisions = self.store.query_decisions(
            namespace=namespace, project=project, repo=repo, limit=top_k
        )
        return {
            "session_id": session_id,
            "task": task,
            "artifacts": artifacts,
            "decisions": decisions,
        }

    def hybrid_search(
        self,
        *,
        namespace: str,
        q: str,
        project: str,
        repo: str,
        top_k: int,
    ) -> list[dict[str, Any]]:
        """Semantic search over artifact+decision content, ranked by cosine similarity.

        Returns:
            At most ``top_k`` records, best match first, each carrying a
            ``score`` and a ``kind`` of ``"artifact"`` or ``"decision"``.
            Empty when the query is blank, ``top_k`` is not positive, or
            nothing scores above zero.

        Raises:
            NamespaceError: If ``namespace`` is not the configured namespace.
        """
        self._require_namespace(namespace)
        # A non-positive top_k can never yield results; without this guard a
        # negative value would slice "all but the last k" from the ranking.
        if top_k <= 0 or not (q or "").strip():
            return []
        query_vec = embed_one(q)

        # Pool all artifacts and decisions for this project/repo, rank by
        # cosine similarity between query embedding and a per-record embedding
        # computed on the fly from title + content.
        artifacts = self.store.query_artifacts(
            namespace=namespace, project=project, repo=repo, limit=500
        )
        decisions = self.store.query_decisions(
            namespace=namespace, project=project, repo=repo, limit=500
        )
        candidates: list[tuple[str, dict[str, Any]]] = [
            (f"{a.get('title', '')}\n{a.get('content', '')}", {"kind": "artifact", **a})
            for a in artifacts
        ]
        candidates.extend(
            (
                f"{d.get('title', '')}\n{d.get('decision', '')}\n{d.get('rationale', '')}",
                {"kind": "decision", **d},
            )
            for d in decisions
        )
        if not candidates:
            return []

        from devmem.embeddings import embed_many, cosine_similarity

        vectors = embed_many([text for text, _ in candidates])
        scored: list[tuple[float, dict[str, Any]]] = []
        for (_, record), vec in zip(candidates, vectors):
            score = cosine_similarity(query_vec, vec)
            # Records with zero or negative similarity are dropped entirely.
            if score > 0:
                scored.append((float(score), record))
        scored.sort(key=lambda x: x[0], reverse=True)
        return [{"score": score, **rec} for score, rec in scored[:top_k]]

    def similar_tasks(
        self,
        *,
        namespace: str,
        q: str,
        project: str | None,
        repo: str | None,
        top_k: int,
    ) -> list[dict[str, Any]]:
        """Embed ``q`` and ask the store for sessions with similar tasks.

        Returns an empty list for a blank query without touching the store.

        Raises:
            NamespaceError: If ``namespace`` is not the configured namespace.
        """
        self._require_namespace(namespace)
        if not (q or "").strip():
            return []
        query_vec = embed_one(q)
        return self.store.similar_tasks(
            namespace=namespace,
            task_embedding=query_vec,
            project=project,
            repo=repo,
            top_k=top_k,
        )

    # ---- writes ----------------------------------------------------------

    def upsert_artifact(self, namespace: str, payload: dict[str, Any]) -> dict[str, Any]:
        """Validate the namespace and hand ``payload`` to ``store.insert_artifact``."""
        self._require_namespace(namespace)
        return self.store.insert_artifact(namespace=namespace, payload=payload)

    def upsert_fact(self, namespace: str, payload: dict[str, Any]) -> dict[str, Any]:
        """Validate the namespace and hand ``payload`` to ``store.insert_fact``."""
        self._require_namespace(namespace)
        return self.store.insert_fact(namespace=namespace, payload=payload)

    def upsert_decision(self, namespace: str, payload: dict[str, Any]) -> dict[str, Any]:
        """Validate the namespace and hand ``payload`` to ``store.insert_decision``."""
        self._require_namespace(namespace)
        return self.store.insert_decision(namespace=namespace, payload=payload)

    def create_handoff(self, namespace: str, payload: dict[str, Any]) -> dict[str, Any]:
        """Validate the namespace and hand ``payload`` to ``store.insert_handoff``."""
        self._require_namespace(namespace)
        return self.store.insert_handoff(namespace=namespace, payload=payload)

    def update_task(self, namespace: str, payload: dict[str, Any]) -> dict[str, Any]:
        """Validate the namespace and hand ``payload`` to ``store.insert_task_update``."""
        self._require_namespace(namespace)
        return self.store.insert_task_update(namespace=namespace, payload=payload)

    def record_feedback(self, namespace: str, payload: dict[str, Any]) -> dict[str, Any]:
        """Validate the namespace and hand ``payload`` to ``store.insert_feedback``."""
        self._require_namespace(namespace)
        return self.store.insert_feedback(namespace=namespace, payload=payload)

    def commit_session(
        self,
        *,
        namespace: str,
        session_id: str,
        artifacts: list[dict[str, Any]],
        decisions: list[dict[str, Any]],
        handoff: dict[str, Any] | None,
        task_update: dict[str, Any] | None,
        client_commit_id: str | None,
    ) -> dict[str, Any]:
        """Validate the namespace and forward the commit to ``store.commit_session``.

        All arguments are passed through unchanged.

        Raises:
            NamespaceError: If ``namespace`` is not the configured namespace.
        """
        self._require_namespace(namespace)
        return self.store.commit_session(
            namespace=namespace,
            session_id=session_id,
            artifacts=artifacts,
            decisions=decisions,
            handoff=handoff,
            task_update=task_update,
            client_commit_id=client_commit_id,
        )

    # ---- listings --------------------------------------------------------

    def list_artifacts(
        self,
        *,
        namespace: str,
        project: str | None,
        repo: str | None,
        session_id: str | None,
        artifact_type: str | None,
        since: str | None,
        limit: int,
        offset: int,
    ) -> list[dict[str, Any]]:
        """List artifacts via ``store.query_artifacts`` with the given filters.

        Raises:
            NamespaceError: If ``namespace`` is not the configured namespace.
        """
        self._require_namespace(namespace)
        return self.store.query_artifacts(
            namespace=namespace,
            project=project,
            repo=repo,
            session_id=session_id,
            artifact_type=artifact_type,
            since=since,
            limit=limit,
            offset=offset,
        )

    def list_decisions(
        self,
        *,
        namespace: str,
        project: str | None,
        repo: str | None,
        session_id: str | None,
        since: str | None,
        limit: int,
        offset: int,
    ) -> list[dict[str, Any]]:
        """List decisions via ``store.query_decisions`` with the given filters.

        Raises:
            NamespaceError: If ``namespace`` is not the configured namespace.
        """
        self._require_namespace(namespace)
        return self.store.query_decisions(
            namespace=namespace,
            project=project,
            repo=repo,
            session_id=session_id,
            since=since,
            limit=limit,
            offset=offset,
        )

    def list_handoffs(
        self,
        *,
        namespace: str,
        project: str | None,
        repo: str | None,
        session_id: str | None,
        since: str | None,
        limit: int,
        offset: int,
    ) -> list[dict[str, Any]]:
        """List handoffs via ``store.query_handoffs`` with the given filters.

        Raises:
            NamespaceError: If ``namespace`` is not the configured namespace.
        """
        self._require_namespace(namespace)
        return self.store.query_handoffs(
            namespace=namespace,
            project=project,
            repo=repo,
            session_id=session_id,
            since=since,
            limit=limit,
            offset=offset,
        )
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
# Module-level service instance, constructed at import time. No store is
# passed, so the constructor builds one from ``settings.record_store_dsn``.
service = DevMemService()
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
"""Milvus adapter for devmem repository ingestion."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import logging
|
|
7
|
+
import re
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
from typing import Any, Iterable
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
from pymilvus import (
|
|
13
|
+
Collection,
|
|
14
|
+
CollectionSchema,
|
|
15
|
+
DataType,
|
|
16
|
+
FieldSchema,
|
|
17
|
+
connections,
|
|
18
|
+
utility,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
from devmem.embeddings import embed_many, embed_one, embedding_dim
|
|
22
|
+
from devmem.live_backend import LiveBackendConfig, parse_host_port_from_url
|
|
23
|
+
|
|
24
|
+
# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)
|
|
25
|
+
# Query tokeniser for lexical search: runs of two or more ASCII letters,
# digits or underscores (single-character tokens are ignored).
_TOKEN_RE = re.compile(r"[a-zA-Z0-9_]{2,}")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class MilvusStore:
    """Write repository text chunks to Milvus with strict namespace metadata.

    One instance is scoped to a single namespace/project/repo triple: every
    row it writes carries those three values, and every delete, query and
    search it issues is filtered on them.
    """

    # Upper bound applied to stored chunk text. Kept below the 65535
    # ``max_length`` declared for the ``content`` field in the schema.
    _MAX_CONTENT_BYTES = 65500

    def __init__(
        self,
        cfg: LiveBackendConfig,
        *,
        namespace: str,
        project_id: str,
        repo_id: str,
        dim: int | None = None,
    ) -> None:
        """Record connection settings and derive alias/collection names.

        No network activity happens here; call ``connect`` before using the
        store.

        Args:
            cfg: Milvus connection settings.
            namespace: Namespace stamped on every row.
            project_id: Project identifier stamped on every row.
            repo_id: Repository identifier stamped on every row.
            dim: Vector dimension; defaults to ``embedding_dim()``.
        """
        self.cfg = cfg
        self.namespace = namespace
        self.project_id = project_id
        self.repo_id = repo_id
        self.dim = dim if dim is not None else embedding_dim()
        self.alias = f"devmem_{namespace}_{project_id}"
        # NOTE(review): only "-" is rewritten here; confirm identifiers cannot
        # contain other characters Milvus rejects in collection names.
        self.collection_name = f"{namespace}_{project_id}_{repo_id}_chunks".replace("-", "_")[:200]
        self.collection: Collection | None = None

    def connect(self) -> None:
        """Connect to Milvus and ensure target collection exists.

        Raises:
            ValueError: If neither a URI nor a host is configured.
        """
        args: dict[str, object] = {"timeout": self.cfg.milvus_timeout}
        if self.cfg.milvus_uri:
            args["uri"] = self.cfg.milvus_uri
        elif self.cfg.milvus_host:
            args["host"] = self.cfg.milvus_host
            args["port"] = self.cfg.milvus_port
            args["secure"] = self.cfg.milvus_secure
        else:
            raise ValueError("Milvus host/URI is not configured")

        # A token takes precedence over user/password credentials.
        if self.cfg.milvus_token:
            args["token"] = self.cfg.milvus_token
        elif self.cfg.milvus_user:
            args["user"] = self.cfg.milvus_user
            args["password"] = self.cfg.milvus_password or ""

        connections.connect(alias=self.alias, **args)
        self.collection = self._ensure_collection()
        try:
            self.collection.load()
        except Exception:
            # Best effort: a failed load is logged and the connection is kept.
            logger.debug(
                "Milvus collection load skipped collection=%s",
                self.collection_name,
                exc_info=True,
            )
        logger.info(
            "Milvus connected alias=%s collection=%s",
            self.alias,
            self.collection_name,
        )

    def close(self) -> None:
        """Disconnect from Milvus."""
        try:
            connections.disconnect(self.alias)
        except Exception:
            logger.debug("Milvus disconnect failed alias=%s", self.alias, exc_info=True)

    def _ensure_collection(self) -> Collection:
        """Return the target collection, creating it (with a vector index) if absent."""
        if utility.has_collection(self.collection_name, using=self.alias):
            return Collection(self.collection_name, using=self.alias)

        fields = [
            FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=64, auto_id=False),
            FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=self.dim),
            FieldSchema(name="namespace", dtype=DataType.VARCHAR, max_length=128),
            FieldSchema(name="project_id", dtype=DataType.VARCHAR, max_length=256),
            FieldSchema(name="repo_id", dtype=DataType.VARCHAR, max_length=256),
            FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=1024),
            FieldSchema(name="chunk_index", dtype=DataType.INT64),
            FieldSchema(name="sha", dtype=DataType.VARCHAR, max_length=128),
            FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=65535),
            FieldSchema(name="created_at", dtype=DataType.VARCHAR, max_length=64),
        ]
        schema = CollectionSchema(fields=fields, description="devmem repository chunks", enable_dynamic_field=False)
        coll = Collection(self.collection_name, schema=schema, using=self.alias)
        coll.create_index(
            field_name="vector",
            index_params={"index_type": "IVF_FLAT", "metric_type": "L2", "params": {"nlist": 2048}},
        )
        return coll

    def _embed(self, text: str) -> list[float]:
        """Embed a single text with the shared embedding provider."""
        return embed_one(text)

    def _embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Embed several texts with the shared embedding provider."""
        return embed_many(texts)

    @staticmethod
    def _doc_id(namespace: str, project_id: str, repo_id: str, path: str, chunk_index: int, sha: str) -> str:
        """Return the SHA-256 hex digest used as a row's primary key.

        The digest covers all six arguments, concatenated in order.
        """
        h = hashlib.sha256()
        for part in (namespace, project_id, repo_id, path, str(chunk_index), sha):
            h.update(part.encode("utf-8"))
        return h.hexdigest()

    @staticmethod
    def _quote(value: str) -> str:
        """Escape ``value`` for use inside a double-quoted filter literal.

        Backslashes are escaped before quotes so that a trailing backslash or
        an embedded ``\\"`` cannot terminate the literal early.
        """
        return value.replace("\\", "\\\\").replace('"', '\\"')

    def _scope_expr(self) -> str:
        """Build the filter expression selecting this namespace/project/repo."""
        ns = self._quote(self.namespace)
        project = self._quote(self.project_id)
        repo = self._quote(self.repo_id)
        return (
            f'namespace == "{ns}" and project_id == "{project}" '
            f'and repo_id == "{repo}"'
        )

    @staticmethod
    def _truncate_utf8(text: str, max_bytes: int) -> str:
        """Clip ``text`` so its UTF-8 encoding is at most ``max_bytes`` long.

        Text that already fits is returned unchanged, so pure-ASCII input
        behaves exactly like a character slice. When clipping lands inside a
        multi-byte character that partial character is dropped.
        """
        # A string never encodes to fewer bytes than it has characters, so a
        # character slice is a cheap first cut before encoding.
        clipped = text[:max_bytes]
        encoded = clipped.encode("utf-8", errors="surrogatepass")
        if len(encoded) <= max_bytes:
            return clipped
        return encoded[:max_bytes].decode("utf-8", errors="ignore")

    def clear_project_data(self) -> None:
        """Delete existing rows for this namespace/project/repo.

        Deletion is best effort: failures are logged at debug level and not
        raised.

        Raises:
            RuntimeError: If ``connect`` has not set up the collection.
        """
        if self.collection is None:
            raise RuntimeError("Milvus collection is not initialized")

        try:
            self.collection.delete(expr=self._scope_expr())
        except Exception:
            logger.debug(
                "Milvus project cleanup skipped namespace=%s project=%s repo=%s",
                self.namespace,
                self.project_id,
                self.repo_id,
                exc_info=True,
            )

    def replace_file_chunks(self, *, path: str, sha: str, chunks: Iterable[str]) -> int:
        """Insert/update chunks for a single file path in the project collection.

        Args:
            path: File path stored with each row.
            sha: Content hash stored with each row and mixed into the row id.
            chunks: Chunk texts in order; each is clipped to fit the
                ``content`` field.

        Returns:
            Number of rows written (0 when ``chunks`` is empty).

        Raises:
            RuntimeError: If ``connect`` has not set up the collection.
        """
        if self.collection is None:
            raise RuntimeError("Milvus collection is not initialized")

        created_at = datetime.now(timezone.utc).isoformat()
        # Clip by encoded size, not character count: multi-byte text could
        # otherwise exceed the field limit and fail the whole write.
        chunk_texts = [self._truncate_utf8(chunk, self._MAX_CONTENT_BYTES) for chunk in chunks]
        vectors = self._embed_batch(chunk_texts) if chunk_texts else []
        # NOTE(review): row ids include ``sha``, so rows written for an earlier
        # sha of the same path are not overwritten here - confirm callers
        # remove them first (e.g. via clear_project_data).
        rows = [
            {
                "id": self._doc_id(self.namespace, self.project_id, self.repo_id, path, idx, sha),
                "vector": vec,
                "namespace": self.namespace,
                "project_id": self.project_id,
                "repo_id": self.repo_id,
                "path": path,
                "chunk_index": idx,
                "sha": sha,
                "content": chunk_text,
                "created_at": created_at,
            }
            for idx, (chunk_text, vec) in enumerate(zip(chunk_texts, vectors))
        ]

        if not rows:
            return 0

        # Fall back to insert when the collection object has no upsert method.
        if hasattr(self.collection, "upsert"):
            self.collection.upsert(rows)
        else:
            self.collection.insert(rows)
        return len(rows)

    def flush(self) -> None:
        """Flush the collection if one is connected; otherwise do nothing."""
        if self.collection is not None:
            self.collection.flush()

    def fetch_project_chunks(self, *, max_rows: int = 20000, batch_size: int = 1000) -> list[dict[str, Any]]:
        """Read chunk rows for this namespace/project/repo from Milvus.

        Rows are paged with ``limit``/``offset`` until ``max_rows`` are
        collected or a short/empty page is returned.

        Raises:
            RuntimeError: If ``connect`` has not set up the collection.
        """
        if self.collection is None:
            raise RuntimeError("Milvus collection is not initialized")

        expr = self._scope_expr()
        output_fields = ["path", "chunk_index", "content", "sha"]
        rows: list[dict[str, Any]] = []
        offset = 0

        # NOTE(review): Milvus documents a ceiling on offset + limit for
        # queries - confirm the default max_rows is reachable on the target
        # deployment.
        while len(rows) < max_rows:
            limit = min(batch_size, max_rows - len(rows))
            try:
                batch = self.collection.query(
                    expr=expr,
                    output_fields=output_fields,
                    limit=limit,
                    offset=offset,
                )
            except TypeError:
                # Older Milvus clients may not support offset.
                batch = self.collection.query(
                    expr=expr,
                    output_fields=output_fields,
                    limit=max_rows,
                )
                rows.extend(batch)
                break

            if not batch:
                break
            rows.extend(batch)
            offset += len(batch)
            if len(batch) < limit:
                break

        return rows[:max_rows]

    def vector_search(self, *, query_text: str, top_k: int) -> list[dict[str, Any]]:
        """Semantic ANN search over repo chunks using the configured embedding model.

        If the Milvus search call raises, the failure is logged and the result
        of ``lexical_search`` is returned instead.

        Returns:
            Dicts with ``score``, ``path``, ``chunk_index``, ``sha`` and
            ``content``; empty for a blank query.

        Raises:
            RuntimeError: If ``connect`` has not set up the collection.
        """
        if self.collection is None:
            raise RuntimeError("Milvus collection is not initialized")
        if not (query_text or "").strip():
            return []

        query_vec = self._embed(query_text)
        try:
            hits = self.collection.search(
                data=[query_vec],
                anns_field="vector",
                param={"metric_type": "L2", "params": {"nprobe": 16}},
                limit=top_k,
                expr=self._scope_expr(),
                output_fields=["path", "chunk_index", "content", "sha"],
            )
        except Exception:
            logger.warning("Milvus vector search failed; falling back to lexical search", exc_info=True)
            return self.lexical_search(query_text=query_text, top_k=top_k)

        results: list[dict[str, Any]] = []
        for hit_list in hits:
            for hit in hit_list:
                entity = getattr(hit, "entity", None)
                # Hits without an entity yield None for every output field.
                getter = entity.get if entity is not None else (lambda key, default=None: default)
                results.append(
                    {
                        # Lower L2 distance = closer match; convert to similarity score.
                        "score": 1.0 / (1.0 + float(getattr(hit, "distance", 0.0))),
                        "path": getter("path"),
                        "chunk_index": getter("chunk_index"),
                        "sha": getter("sha"),
                        "content": getter("content"),
                    }
                )
        return results

    def lexical_search(self, *, query_text: str, top_k: int) -> list[dict[str, Any]]:
        """Search ingested chunks with token-overlap scoring.

        A chunk earns 10 points when it contains the whole lower-cased query
        as a substring, plus up to 3 points per distinct query token based on
        how often the token occurs. Chunks scoring zero are omitted.
        """
        query_lc = (query_text or "").strip().lower()
        if not query_lc:
            return []

        # dict.fromkeys de-duplicates tokens while keeping first-seen order.
        tokens = list(dict.fromkeys(_TOKEN_RE.findall(query_lc)))
        rows = self.fetch_project_chunks()
        scored: list[tuple[float, dict[str, Any]]] = []

        for row in rows:
            content_lc = str(row.get("content") or "").lower()
            score = 10.0 if query_lc in content_lc else 0.0
            for token in tokens:
                hits = content_lc.count(token)
                if hits:
                    score += min(3.0, float(hits))
            if score > 0:
                scored.append((score, row))

        scored.sort(key=lambda item: item[0], reverse=True)
        return [
            {
                "score": score,
                "path": row.get("path"),
                "chunk_index": row.get("chunk_index"),
                "sha": row.get("sha"),
                "content": row.get("content"),
            }
            for score, row in scored[:top_k]
        ]

    def endpoint_summary(self) -> str:
        """Return ``host:port`` for the configured Milvus endpoint."""
        if self.cfg.milvus_uri:
            host, port = parse_host_port_from_url(self.cfg.milvus_uri, default_port=self.cfg.milvus_port)
            return f"{host}:{port}"
        return f"{self.cfg.milvus_host}:{self.cfg.milvus_port}"
|