devmem-agents 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- devmem/__init__.py +5 -0
- devmem/api.py +257 -0
- devmem/config.py +34 -0
- devmem/embeddings.py +119 -0
- devmem/ingest.py +184 -0
- devmem/live_backend.py +344 -0
- devmem/main.py +11 -0
- devmem/models.py +157 -0
- devmem/retrieval_eval.py +145 -0
- devmem/service.py +280 -0
- devmem/storage/__init__.py +4 -0
- devmem/storage/milvus_store.py +321 -0
- devmem/storage/neptune_store.py +194 -0
- devmem/storage/record_store.py +974 -0
- devmem_agents-0.1.0.dist-info/METADATA +100 -0
- devmem_agents-0.1.0.dist-info/RECORD +19 -0
- devmem_agents-0.1.0.dist-info/WHEEL +5 -0
- devmem_agents-0.1.0.dist-info/licenses/LICENSE +21 -0
- devmem_agents-0.1.0.dist-info/top_level.txt +1 -0
devmem/__init__.py
ADDED
devmem/api.py
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
"""HTTP API for devmem."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from fastapi import APIRouter, HTTPException, Query, Response
|
|
6
|
+
|
|
7
|
+
from devmem.models import (
|
|
8
|
+
ApiResponse,
|
|
9
|
+
ArtifactUpsertRequest,
|
|
10
|
+
ContextPullRequest,
|
|
11
|
+
DecisionUpsertRequest,
|
|
12
|
+
FactUpsertRequest,
|
|
13
|
+
FeedbackRecordRequest,
|
|
14
|
+
HandoffCreateRequest,
|
|
15
|
+
HybridSearchRequest,
|
|
16
|
+
SessionCommitRequest,
|
|
17
|
+
SessionStartRequest,
|
|
18
|
+
TaskSimilarRequest,
|
|
19
|
+
TaskUpdateRequest,
|
|
20
|
+
)
|
|
21
|
+
from devmem.service import NamespaceError, service
|
|
22
|
+
|
|
23
|
+
router = APIRouter(prefix="/v1")
|
|
24
|
+
|
|
25
|
+
# Endpoints superseded by POST /v1/sessions/commit. They still work but clients
|
|
26
|
+
# should migrate. Responses include a `Deprecation: true` header.
|
|
27
|
+
_DEPRECATED_SUPERSEDED_BY = "/v1/sessions/commit"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _handle_namespace(exc: NamespaceError) -> None:
    """Convert a service-level ``NamespaceError`` into an HTTP 400 error.

    This helper always raises and never returns normally. The message of
    *exc* becomes the response ``detail`` and *exc* is chained as the cause.
    """
    http_error = HTTPException(status_code=400, detail=str(exc))
    raise http_error from exc
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _mark_deprecated(response: Response) -> None:
    """Stamp *response* with the headers that advertise a deprecated endpoint.

    Sets ``Deprecation: true`` and a ``Link`` header whose
    ``successor-version`` relation points at the replacement endpoint.
    """
    successor_link = f'<{_DEPRECATED_SUPERSEDED_BY}>; rel="successor-version"'
    headers = response.headers
    headers["Deprecation"] = "true"
    headers["Link"] = successor_link
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@router.get("/health", response_model=ApiResponse)
def health() -> ApiResponse:
    # Wrap whatever the service reports in the standard response envelope.
    status = service.health()
    return ApiResponse(data=status)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@router.post("/sessions/start", response_model=ApiResponse)
def sessions_start(req: SessionStartRequest) -> ApiResponse:
    # The request model is handed to the service unchanged; a namespace
    # rejection surfaces as HTTP 400 via _handle_namespace (which raises).
    try:
        started = service.start_session(req)
    except NamespaceError as namespace_error:
        _handle_namespace(namespace_error)
    serialized = started.model_dump()
    return ApiResponse(data=serialized)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@router.post("/sessions/commit", response_model=ApiResponse)
def sessions_commit(req: SessionCommitRequest) -> ApiResponse:
    """Atomically finalize a session.

    Replaces the 3-call ritual (`/artifacts/upsert` + `/decisions/upsert` +
    `/tasks/update`) with one transactional write. Optional `client_commit_id`
    gives idempotent retries — submitting the same id twice returns the
    original result without re-inserting any rows.
    """
    try:
        # Nested request models are flattened to plain dicts for the service;
        # the optional handoff / task_update parts are forwarded as None when
        # they are absent from the request.
        commit_kwargs = {
            "namespace": req.namespace,
            "session_id": req.session_id,
            "artifacts": [artifact.model_dump() for artifact in req.artifacts],
            "decisions": [decision.model_dump() for decision in req.decisions],
            "handoff": req.handoff.model_dump() if req.handoff else None,
            "task_update": req.task_update.model_dump() if req.task_update else None,
            "client_commit_id": req.client_commit_id,
        }
        outcome = service.commit_session(**commit_kwargs)
    except NamespaceError as namespace_error:
        _handle_namespace(namespace_error)
    return ApiResponse(data=outcome)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@router.post("/context/pull", response_model=ApiResponse)
def context_pull(req: ContextPullRequest) -> ApiResponse:
    # Forward the request fields one-to-one as keyword arguments.
    pull_kwargs = {
        "namespace": req.namespace,
        "session_id": req.session_id,
        "project": req.project,
        "repo": req.repo,
        "task": req.task,
        "top_k": req.top_k,
    }
    try:
        context = service.pull_context(**pull_kwargs)
    except NamespaceError as namespace_error:
        _handle_namespace(namespace_error)
    return ApiResponse(data=context)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
@router.post("/search/hybrid", response_model=ApiResponse)
def search_hybrid(req: HybridSearchRequest) -> ApiResponse:
    # Forward the query and its filters; hits come back under "results".
    search_kwargs = {
        "namespace": req.namespace,
        "q": req.q,
        "project": req.project,
        "repo": req.repo,
        "top_k": req.top_k,
    }
    try:
        hits = service.hybrid_search(**search_kwargs)
    except NamespaceError as namespace_error:
        _handle_namespace(namespace_error)
    return ApiResponse(data={"results": hits})
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@router.post("/tasks/similar", response_model=ApiResponse)
def tasks_similar(req: TaskSimilarRequest) -> ApiResponse:
    # Same request shape as the hybrid search, routed to the task lookup.
    lookup_kwargs = {
        "namespace": req.namespace,
        "q": req.q,
        "project": req.project,
        "repo": req.repo,
        "top_k": req.top_k,
    }
    try:
        matches = service.similar_tasks(**lookup_kwargs)
    except NamespaceError as namespace_error:
        _handle_namespace(namespace_error)
    return ApiResponse(data={"results": matches})
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
@router.post("/artifacts/upsert", response_model=ApiResponse)
def artifacts_upsert(req: ArtifactUpsertRequest, response: Response) -> ApiResponse:
    """Deprecated. Prefer POST /v1/sessions/commit for atomic multi-write."""
    # Headers are stamped first, before the service call is attempted.
    _mark_deprecated(response)
    try:
        payload = req.model_dump()
        stored = service.upsert_artifact(req.namespace, payload)
    except NamespaceError as namespace_error:
        _handle_namespace(namespace_error)
    return ApiResponse(data=stored)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@router.get("/artifacts", response_model=ApiResponse)
def artifacts_list(
    namespace: str = Query(...),
    project: str | None = Query(default=None),
    repo: str | None = Query(default=None),
    session_id: str | None = Query(default=None),
    artifact_type: str | None = Query(default=None),
    since: str | None = Query(default=None, description="ISO-8601 timestamp; rows created_at >= since"),
    limit: int = Query(default=50, ge=1, le=500),
    offset: int = Query(default=0, ge=0),
) -> ApiResponse:
    # Every query parameter is forwarded unchanged; the paging values are
    # echoed back next to the rows.
    filters = {
        "namespace": namespace,
        "project": project,
        "repo": repo,
        "session_id": session_id,
        "artifact_type": artifact_type,
        "since": since,
        "limit": limit,
        "offset": offset,
    }
    try:
        found = service.list_artifacts(**filters)
    except NamespaceError as namespace_error:
        _handle_namespace(namespace_error)
    page = {"results": found, "limit": limit, "offset": offset}
    return ApiResponse(data=page)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
@router.post("/facts/upsert", response_model=ApiResponse)
def facts_upsert(req: FactUpsertRequest) -> ApiResponse:
    # The full request body (as a dict) is passed along with its namespace.
    try:
        payload = req.model_dump()
        stored = service.upsert_fact(req.namespace, payload)
    except NamespaceError as namespace_error:
        _handle_namespace(namespace_error)
    return ApiResponse(data=stored)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
@router.post("/decisions/upsert", response_model=ApiResponse)
def decisions_upsert(req: DecisionUpsertRequest, response: Response) -> ApiResponse:
    """Deprecated. Prefer POST /v1/sessions/commit for atomic multi-write."""
    # Headers are stamped first, before the service call is attempted.
    _mark_deprecated(response)
    try:
        payload = req.model_dump()
        stored = service.upsert_decision(req.namespace, payload)
    except NamespaceError as namespace_error:
        _handle_namespace(namespace_error)
    return ApiResponse(data=stored)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
@router.get("/decisions", response_model=ApiResponse)
def decisions_list(
    namespace: str = Query(...),
    project: str | None = Query(default=None),
    repo: str | None = Query(default=None),
    session_id: str | None = Query(default=None),
    since: str | None = Query(default=None),
    limit: int = Query(default=50, ge=1, le=500),
    offset: int = Query(default=0, ge=0),
) -> ApiResponse:
    # Every query parameter is forwarded unchanged; the paging values are
    # echoed back next to the rows.
    filters = {
        "namespace": namespace,
        "project": project,
        "repo": repo,
        "session_id": session_id,
        "since": since,
        "limit": limit,
        "offset": offset,
    }
    try:
        found = service.list_decisions(**filters)
    except NamespaceError as namespace_error:
        _handle_namespace(namespace_error)
    page = {"results": found, "limit": limit, "offset": offset}
    return ApiResponse(data=page)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
@router.post("/handoffs/create", response_model=ApiResponse)
def handoffs_create(req: HandoffCreateRequest) -> ApiResponse:
    # The full request body (as a dict) is passed along with its namespace.
    try:
        payload = req.model_dump()
        created = service.create_handoff(req.namespace, payload)
    except NamespaceError as namespace_error:
        _handle_namespace(namespace_error)
    return ApiResponse(data=created)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
@router.get("/handoffs", response_model=ApiResponse)
def handoffs_list(
    namespace: str = Query(...),
    project: str | None = Query(default=None),
    repo: str | None = Query(default=None),
    session_id: str | None = Query(default=None),
    since: str | None = Query(default=None),
    limit: int = Query(default=50, ge=1, le=500),
    offset: int = Query(default=0, ge=0),
) -> ApiResponse:
    # Every query parameter is forwarded unchanged; the paging values are
    # echoed back next to the rows.
    filters = {
        "namespace": namespace,
        "project": project,
        "repo": repo,
        "session_id": session_id,
        "since": since,
        "limit": limit,
        "offset": offset,
    }
    try:
        found = service.list_handoffs(**filters)
    except NamespaceError as namespace_error:
        _handle_namespace(namespace_error)
    page = {"results": found, "limit": limit, "offset": offset}
    return ApiResponse(data=page)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
@router.post("/tasks/update", response_model=ApiResponse)
def tasks_update(req: TaskUpdateRequest, response: Response) -> ApiResponse:
    """Deprecated. Prefer POST /v1/sessions/commit (pass task_update in body)."""
    # Headers are stamped first, before the service call is attempted.
    _mark_deprecated(response)
    try:
        payload = req.model_dump()
        updated = service.update_task(req.namespace, payload)
    except NamespaceError as namespace_error:
        _handle_namespace(namespace_error)
    return ApiResponse(data=updated)
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
@router.post("/feedback/record", response_model=ApiResponse)
def feedback_record(req: FeedbackRecordRequest) -> ApiResponse:
    # The full request body (as a dict) is passed along with its namespace.
    try:
        payload = req.model_dump()
        recorded = service.record_feedback(req.namespace, payload)
    except NamespaceError as namespace_error:
        _handle_namespace(namespace_error)
    return ApiResponse(data=recorded)
|
devmem/config.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Configuration for devmem."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pydantic import Field
|
|
6
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Settings(BaseSettings):
    """Runtime settings loaded from environment."""

    # Variables are looked up with the DEVMEM_ prefix, a local `.env` file is
    # also consulted, and unrecognised keys are ignored rather than rejected.
    model_config = SettingsConfigDict(env_prefix="DEVMEM_", env_file=".env", extra="ignore")

    # Default namespace plus the service identity strings.
    namespace: str = Field(default="devlib_v1")
    service_name: str = Field(default="devmem-gateway")
    service_version: str = Field(default="0.1.0")

    # Backing services (placeholders for real integration).
    # All three are optional and default to None (unset).
    milvus_uri: str | None = Field(default=None)
    neptune_endpoint: str | None = Field(default=None)
    aurora_dsn: str | None = Field(default=None)

    # Embedding model (sentence-transformers). Falls back to SHA-256
    # pseudo-embeddings if the model cannot be loaded. `embedding_dim` must
    # match the model's output dimensionality and the Milvus collection schema.
    embedding_model: str = Field(default="sentence-transformers/all-MiniLM-L6-v2")
    embedding_dim: int = Field(default=384)

    # Record store DSN (SQLAlchemy-compatible, e.g. postgresql+psycopg://user:pw@host/db).
    # When unset, devmem uses an in-memory store (fine for tests, not for production).
    record_store_dsn: str | None = Field(default=None)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
settings = Settings()
|
devmem/embeddings.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Embedding provider for devmem.
|
|
2
|
+
|
|
3
|
+
Uses `sentence-transformers/all-MiniLM-L6-v2` (384-dim) by default. The model
|
|
4
|
+
is lazy-loaded on first use and cached for the process lifetime. If the
|
|
5
|
+
`sentence-transformers` package is not installed (or model download fails),
|
|
6
|
+
the provider falls back to a deterministic SHA-256 pseudo-embedding and logs
|
|
7
|
+
a loud warning so operators know vector search quality is degraded.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import hashlib
|
|
13
|
+
import logging
|
|
14
|
+
import threading
|
|
15
|
+
from typing import Iterable
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
|
|
19
|
+
from devmem.config import settings
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
_MODEL_LOCK = threading.Lock()
|
|
24
|
+
_MODEL: object | None = None
|
|
25
|
+
_USING_FALLBACK: bool = False
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def embedding_dim() -> int:
    """Return the configured embedding width, coerced to ``int``.

    The value is read from ``settings.embedding_dim`` and must match the
    Milvus collection schema.
    """
    configured_width = settings.embedding_dim
    return int(configured_width)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def is_fallback_active() -> bool:
    """Report whether pseudo-embeddings are in use because the real model failed to load."""
    fallback_in_use = _USING_FALLBACK
    return fallback_in_use
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _load_model() -> object | None:
    """Attempt to build the configured sentence-transformers model.

    Returns:
        The loaded model, or ``None`` when either the package import or the
        model construction fails. Both failures are logged, not raised.
    """
    # Imported lazily so the package remains an optional dependency.
    try:
        from sentence_transformers import SentenceTransformer  # type: ignore
    except Exception as import_error:
        logger.warning(
            "sentence-transformers not available (%s); devmem will use SHA-256 "
            "pseudo-embeddings. Install `sentence-transformers` to enable semantic search.",
            import_error,
        )
        return None

    try:
        loaded = SentenceTransformer(settings.embedding_model)
        logger.info("Loaded embedding model=%s dim=%s", settings.embedding_model, embedding_dim())
    except Exception as load_error:
        logger.error(
            "Failed to load embedding model %s (%s); falling back to SHA-256 pseudo-embeddings.",
            settings.embedding_model,
            load_error,
            exc_info=True,
        )
        return None
    return loaded
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _get_model() -> object | None:
    """Return the cached model, attempting the load at most once per process.

    Returns:
        The model instance, or ``None`` once a load attempt has failed and
        the fallback flag has been set.
    """
    global _MODEL, _USING_FALLBACK

    already_resolved = _MODEL is not None or _USING_FALLBACK
    if not already_resolved:
        with _MODEL_LOCK:
            # Check again while holding the lock: another thread may have
            # completed the load between the first test and acquisition.
            if _MODEL is None and not _USING_FALLBACK:
                _MODEL = _load_model()
                _USING_FALLBACK = _MODEL is None
    return _MODEL
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _pseudo_embed(text: str) -> list[float]:
    """Build a deterministic, non-semantic vector from the SHA-256 of *text*.

    The digest bytes are tiled to the configured dimension, mapped from
    ``0..255`` to ``-0.5..0.5`` and scaled to unit length when non-zero.
    """
    width = embedding_dim()
    encoded = (text or "").encode("utf-8", errors="ignore")
    digest = hashlib.sha256(encoded).digest()

    # Repeat the digest often enough to cover `width` bytes, then truncate.
    tiled = digest * ((width // len(digest)) + 1)
    byte_values = np.frombuffer(tiled[:width], dtype=np.uint8)

    centered = (byte_values.astype(np.float32) / 255.0) - 0.5
    length = float(np.linalg.norm(centered))
    if length > 0:
        centered /= length
    return centered.tolist()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def embed_one(text: str) -> list[float]:
    """Embed a single string, using the hash fallback when no model is loaded.

    A falsy *text* is encoded as the empty string.
    """
    encoder = _get_model()
    if encoder is not None:
        first_row = encoder.encode([text or ""], normalize_embeddings=True)[0]
        return [float(component) for component in first_row]
    return _pseudo_embed(text)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def embed_many(texts: Iterable[str]) -> list[list[float]]:
    """Embed several strings at once, one vector per input in input order.

    Falsy entries are replaced by the empty string. An empty input returns
    ``[]`` without touching the model.
    """
    batch = [entry or "" for entry in texts]
    if not batch:
        return []

    encoder = _get_model()
    if encoder is None:
        return list(map(_pseudo_embed, batch))

    encoded = encoder.encode(batch, normalize_embeddings=True, batch_size=32)
    return [[float(value) for value in row] for row in encoded]
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Cosine similarity between two vectors; assumes they may not be unit-normed.

    Args:
        a: First vector.
        b: Second vector; must have the same length as *a*.

    Returns:
        A value in ``[-1.0, 1.0]``, or ``0.0`` when either vector is empty
        or has zero magnitude.

    Raises:
        ValueError: Propagated from ``np.dot`` when the lengths differ.
    """
    if not a or not b:
        return 0.0
    va = np.asarray(a, dtype=np.float32)
    vb = np.asarray(b, dtype=np.float32)
    denom = float(np.linalg.norm(va)) * float(np.linalg.norm(vb))
    if denom == 0.0:
        return 0.0
    # float32 rounding can push the ratio a hair outside [-1, 1]; clamp so
    # callers always get a mathematically valid cosine. np.clip leaves NaN
    # untouched, so invalid inputs are not masked as a perfect match.
    return float(np.clip(np.dot(va, vb) / denom, -1.0, 1.0))
|
devmem/ingest.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""Repository scanning and memory ingestion pipeline for devmem."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
import subprocess
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Iterable
|
|
12
|
+
|
|
13
|
+
from devmem.live_backend import LiveBackendConfig
|
|
14
|
+
from devmem.storage.milvus_store import MilvusStore
|
|
15
|
+
from devmem.storage.neptune_store import NeptuneStore
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
_ALLOWED_EXTENSIONS = {
|
|
20
|
+
".py", ".md", ".toml", ".json", ".yaml", ".yml", ".js", ".ts", ".tsx", ".jsx",
|
|
21
|
+
".css", ".html", ".sql", ".sh", ".txt", ".ini", ".cfg", ".conf", ".rst", ".csv", ".svg",
|
|
22
|
+
}
|
|
23
|
+
_ALLOWED_FILENAMES = {
|
|
24
|
+
"Makefile",
|
|
25
|
+
"Dockerfile",
|
|
26
|
+
"Jenkinsfile",
|
|
27
|
+
"Procfile",
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
_SKIP_PREFIXES = {
|
|
31
|
+
".git/",
|
|
32
|
+
"node_modules/",
|
|
33
|
+
".venv/",
|
|
34
|
+
"venv/",
|
|
35
|
+
"data/",
|
|
36
|
+
"logs/",
|
|
37
|
+
"__pycache__/",
|
|
38
|
+
".pytest_cache/",
|
|
39
|
+
"react-app/node_modules/",
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class IngestResult:
    """Counters accumulated during one ``ingest_repository`` run."""

    files_seen: int = 0  # number of files returned by the repository scan
    files_indexed: int = 0  # files whose chunks were handed to the vector store
    files_skipped: int = 0  # unreadable, binary, filtered-out, or empty files
    chunks_written: int = 0  # sum of the counts returned by replace_file_chunks
    kg_file_nodes: int = 0  # number of upsert_file calls made on the graph store
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _is_binary(blob: bytes) -> bool:
|
|
53
|
+
return b"\x00" in blob
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _sha256(blob: bytes) -> str:
|
|
57
|
+
return hashlib.sha256(blob).hexdigest()
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _chunk_text(text: str, *, chunk_size: int = 1800, overlap: int = 200) -> list[str]:
|
|
61
|
+
if not text.strip():
|
|
62
|
+
return []
|
|
63
|
+
if chunk_size <= overlap:
|
|
64
|
+
raise ValueError("chunk_size must be greater than overlap")
|
|
65
|
+
|
|
66
|
+
chunks: list[str] = []
|
|
67
|
+
start = 0
|
|
68
|
+
n = len(text)
|
|
69
|
+
while start < n:
|
|
70
|
+
end = min(start + chunk_size, n)
|
|
71
|
+
chunks.append(text[start:end])
|
|
72
|
+
if end >= n:
|
|
73
|
+
break
|
|
74
|
+
start = max(0, end - overlap)
|
|
75
|
+
return chunks
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _tracked_files(repo_path: Path) -> list[Path]:
    """List candidate files under *repo_path*.

    ``git ls-files -z`` is tried first and its entries are kept only when
    they exist as regular files. If that fails for any reason, a warning is
    logged and every file found by walking the directory tree is returned.
    """
    try:
        completed = subprocess.run(
            ["git", "-C", str(repo_path), "ls-files", "-z"],
            check=True,
            capture_output=True,
        )
        listing = completed.stdout.decode("utf-8", errors="ignore")
        # `-z` separates entries with NUL; drop the empty trailing piece.
        candidates = (repo_path / entry for entry in listing.split("\x00") if entry)
        return [candidate for candidate in candidates if candidate.is_file()]
    except Exception:
        logger.warning("git ls-files failed; falling back to filesystem walk", exc_info=True)

    return [
        Path(directory) / filename
        for directory, _subdirs, filenames in os.walk(repo_path)
        for filename in filenames
    ]
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _should_index(rel_path: str, suffix: str, size_bytes: int) -> bool:
    """Decide whether a file qualifies for indexing.

    A file qualifies when it is at most 512 KiB, has an allowed extension or
    an allowed bare filename, and its path does not start with a skipped
    prefix.
    """
    max_bytes = 512 * 1024
    if size_bytes > max_bytes:
        return False

    extension_ok = suffix.lower() in _ALLOWED_EXTENSIONS
    filename_ok = Path(rel_path).name in _ALLOWED_FILENAMES
    if not (extension_ok or filename_ok):
        return False

    # Compare prefixes on a forward-slash form of the path.
    posix_path = rel_path.replace("\\", "/")
    return not posix_path.startswith(tuple(_SKIP_PREFIXES))
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def ingest_repository(
    *,
    repo_path: Path,
    project_id: str,
    project_name: str,
    repo_id: str,
    namespace: str,
    cfg: LiveBackendConfig,
) -> IngestResult:
    """Scan repository and write memory records to Milvus and Neptune.

    The project node is upserted, the project's existing vector data is
    cleared, and then every eligible file is chunked and written to both
    stores. Both store connections are closed on exit, including on failure.

    Args:
        repo_path: Root directory of the repository to scan.
        project_id: Project identifier passed to both stores.
        project_name: Name recorded on the project node.
        repo_id: Repository identifier passed to both stores.
        namespace: Namespace both stores are opened with.
        cfg: Backend configuration used to construct both stores.

    Returns:
        Counters for files seen, indexed and skipped, chunks written, and
        file nodes upserted.
    """
    result = IngestResult()

    milvus = MilvusStore(cfg, namespace=namespace, project_id=project_id, repo_id=repo_id)
    neptune = NeptuneStore(cfg, namespace=namespace)

    try:
        milvus.connect()
        neptune.connect()
        neptune.health_check()

        neptune.upsert_project(
            project_id=project_id,
            name=project_name,
            repo_path=str(repo_path),
            repo_id=repo_id,
        )
        milvus.clear_project_data()

        tracked = _tracked_files(repo_path)
        result.files_seen = len(tracked)

        for file_path in tracked:
            rel_path = file_path.relative_to(repo_path).as_posix()
            suffix = file_path.suffix.lower()
            try:
                # Filter on metadata first so oversized or irrelevant files
                # are never loaded into memory just to be discarded.
                if not _should_index(rel_path, suffix, file_path.stat().st_size):
                    result.files_skipped += 1
                    continue
                blob = file_path.read_bytes()
            except Exception:
                logger.debug("Skipping unreadable file path=%s", rel_path, exc_info=True)
                result.files_skipped += 1
                continue

            if _is_binary(blob):
                result.files_skipped += 1
                continue

            # Re-check with the real length in case the file changed between
            # the stat call and the read.
            if not _should_index(rel_path, suffix, len(blob)):
                result.files_skipped += 1
                continue

            text = blob.decode("utf-8", errors="ignore")
            sha = _sha256(blob)
            chunks = _chunk_text(text)
            if not chunks:
                result.files_skipped += 1
                continue

            neptune.upsert_file(
                project_id=project_id,
                repo_id=repo_id,
                path=rel_path,
                ext=suffix,
                sha=sha,
                size_bytes=len(blob),
            )
            result.kg_file_nodes += 1

            inserted = milvus.replace_file_chunks(path=rel_path, sha=sha, chunks=chunks)
            result.chunks_written += inserted
            result.files_indexed += 1

        milvus.flush()
        return result
    finally:
        # Nested so the Milvus connection is released even when closing the
        # Neptune connection raises.
        try:
            neptune.close()
        finally:
            milvus.close()
|