code-memory 1.0.3.tar.gz → 1.0.5.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {code_memory-1.0.3 → code_memory-1.0.5}/PKG-INFO +2 -1
- {code_memory-1.0.3 → code_memory-1.0.5}/db.py +143 -23
- {code_memory-1.0.3 → code_memory-1.0.5}/doc_parser.py +97 -65
- {code_memory-1.0.3 → code_memory-1.0.5}/git_search.py +4 -5
- {code_memory-1.0.3 → code_memory-1.0.5}/logging_config.py +24 -1
- {code_memory-1.0.3 → code_memory-1.0.5}/parser.py +67 -42
- code_memory-1.0.5/prompts/milestone_6.xml +756 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/pyproject.toml +2 -1
- {code_memory-1.0.3 → code_memory-1.0.5}/queries.py +99 -2
- {code_memory-1.0.3 → code_memory-1.0.5}/server.py +55 -18
- {code_memory-1.0.3 → code_memory-1.0.5}/tests/test_errors.py +2 -4
- {code_memory-1.0.3 → code_memory-1.0.5}/tests/test_logging.py +1 -3
- {code_memory-1.0.3 → code_memory-1.0.5}/tests/test_tools.py +0 -3
- {code_memory-1.0.3 → code_memory-1.0.5}/uv.lock +71 -1
- {code_memory-1.0.3 → code_memory-1.0.5}/validation.py +0 -1
- {code_memory-1.0.3 → code_memory-1.0.5}/.github/workflows/ci.yml +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/.github/workflows/publish.yml +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/.gitignore +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/.python-version +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/CHANGELOG.md +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/CONTRIBUTING.md +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/LICENSE +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/Makefile +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/README.md +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/errors.py +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/prompts/milestone_1.xml +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/prompts/milestone_2.xml +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/prompts/milestone_3.xml +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/prompts/milestone_4.xml +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/prompts/milestone_5.xml +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/tests/__init__.py +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/tests/conftest.py +1 -1
- {code_memory-1.0.3 → code_memory-1.0.5}/tests/test_validation.py +1 -1
{code_memory-1.0.3 → code_memory-1.0.5}/PKG-INFO +2 -1

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: code-memory
-Version: 1.0.3
+Version: 1.0.5
 Summary: A deterministic, high-precision code intelligence MCP server
 Project-URL: Homepage, https://github.com/kapillamba4/code-memory
 Project-URL: Documentation, https://github.com/kapillamba4/code-memory#readme
@@ -32,6 +32,7 @@ Requires-Dist: tree-sitter-ruby>=0.23.1
 Requires-Dist: tree-sitter-rust>=0.24.0
 Requires-Dist: tree-sitter-typescript>=0.23.2
 Requires-Dist: tree-sitter>=0.25.2
+Requires-Dist: xxhash>=3.6.0
 Provides-Extra: dev
 Requires-Dist: mypy>=1.13.0; extra == 'dev'
 Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
```
{code_memory-1.0.3 → code_memory-1.0.5}/db.py +143 -23

```diff
@@ -11,16 +11,19 @@ All writes use upsert semantics so re-indexing is idempotent.
 
 from __future__ import annotations
 
-import hashlib
+import logging
 import sqlite3
-from
+from contextlib import contextmanager
 from typing import TYPE_CHECKING
 
 import sqlite_vec
+import xxhash
 
 if TYPE_CHECKING:
     pass
 
+logger = logging.getLogger(__name__)
+
 # ---------------------------------------------------------------------------
 # Embedding model (lazy-loaded singleton)
 # ---------------------------------------------------------------------------
```
```diff
@@ -42,10 +45,80 @@ def get_embedding_model():
 def embed_text(text: str) -> list[float]:
     """Generate a 384-dim dense vector embedding for *text*."""
     model = get_embedding_model()
-    vec = model.encode(text, normalize_embeddings=True)
+    vec = model.encode(text, normalize_embeddings=True, show_progress_bar=False)
     return vec.tolist()
 
 
+def embed_texts_batch(texts: list[str], batch_size: int = 32) -> list[list[float]]:
+    """Generate embeddings for multiple texts at once.
+
+    This is significantly faster than calling embed_text() in a loop
+    because sentence-transformers is optimized for batch processing.
+
+    Args:
+        texts: List of text strings to embed.
+        batch_size: Number of texts to process per batch (default 32).
+
+    Returns:
+        List of embedding vectors (same order as input texts).
+    """
+    if not texts:
+        return []
+
+    model = get_embedding_model()
+
+    # Batch encode with normalization (same as single-text version)
+    vectors = model.encode(
+        texts,
+        batch_size=batch_size,
+        normalize_embeddings=True,
+        show_progress_bar=False,
+        convert_to_numpy=True,
+    )
+
+    return [v.tolist() for v in vectors]
+
+
+def warmup_embedding_model() -> None:
+    """Pre-load and warm up the embedding model.
+
+    Call this at server startup to avoid cold-start latency on first search.
+    The warmup encodes a dummy string to initialize internal tensors.
+    """
+    model = get_embedding_model()
+    # Warmup encode to initialize lazy-loaded components
+    model.encode("warmup", normalize_embeddings=True, show_progress_bar=False)
+    logger.info("Embedding model warmed up")
+
+
+# ---------------------------------------------------------------------------
+# Transaction support
+# ---------------------------------------------------------------------------
+
+
+@contextmanager
+def transaction(db: sqlite3.Connection):
+    """Context manager for explicit transaction control.
+
+    Disables autocommit, yields control, then commits on success.
+    On exception, rolls back automatically.
+
+    Example:
+        with transaction(db):
+            for item in items:
+                upsert_symbol(db, ..., auto_commit=False)
+            # Single commit here
+    """
+    # Disable autocommit by starting a transaction
+    db.execute("BEGIN")
+    try:
+        yield db
+        db.commit()
+    except Exception:
+        db.rollback()
+        raise
+
+
 # ---------------------------------------------------------------------------
 # Database initialisation
 # ---------------------------------------------------------------------------
```
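The new batch API pays off whenever many symbols or chunks are embedded in one indexing pass. A rough usage sketch (names from the diff above; the flat import assumes db.py is importable from the working directory, and real timings depend on model and hardware):

```python
import time

from db import embed_text, embed_texts_batch, warmup_embedding_model

warmup_embedding_model()  # pay the model's cold-start cost once, at startup

# 256 short code-like strings to embed (synthetic inputs for the comparison)
texts = [f"def handler_{i}(request): ..." for i in range(256)]

t0 = time.perf_counter()
looped = [embed_text(t) for t in texts]            # one encode() call per text
t1 = time.perf_counter()
batched = embed_texts_batch(texts, batch_size=64)  # a few large encode() passes
t2 = time.perf_counter()

assert len(batched) == len(texts) and len(batched[0]) == 384
print(f"loop: {t1 - t0:.2f}s   batch: {t2 - t1:.2f}s")
```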
```diff
@@ -210,17 +283,34 @@ def get_db(db_path: str = "code_memory.db") -> sqlite3.Connection:
 
 
 def file_hash(filepath: str) -> str:
-    """Compute
-
+    """Compute fast non-cryptographic hash of a file's contents.
+
+    Uses xxHash (xxh64) which is ~10x faster than SHA-256 while still
+    providing excellent collision resistance for change detection.
+
+    Args:
+        filepath: Path to the file to hash.
+
+    Returns:
+        Hexadecimal string representation of the 64-bit hash.
+    """
+    h = xxhash.xxh64()
     with open(filepath, "rb") as f:
-
+        # Read in 64KB chunks for memory efficiency
+        for chunk in iter(lambda: f.read(65536), b""):
             h.update(chunk)
     return h.hexdigest()
 
 
-def upsert_file(db: sqlite3.Connection, path: str, last_modified: float, fhash: str) -> int:
+def upsert_file(
+    db: sqlite3.Connection,
+    path: str,
+    last_modified: float,
+    fhash: str,
+    auto_commit: bool = True,
+) -> int:
     """Insert or update a file record. Returns the file_id."""
-
+    db.execute(
         """
         INSERT INTO files (path, last_modified, file_hash)
         VALUES (?, ?, ?)
```
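The file_hash() rewrite above swaps hashlib's SHA-256 for xxh64 from the python-xxhash package. xxHash is a non-cryptographic hash, which is fine for change detection but not for security. A standalone sketch of the same chunked-read pattern (64 KB chunk size as in the diff; the sample digest in the comment is illustrative only):

```python
import xxhash

def file_hash(filepath: str) -> str:
    # Same shape as db.file_hash(): stream the file in 64 KB chunks so
    # large files never need to fit in memory.
    h = xxhash.xxh64()
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            h.update(chunk)
    # 16 hex chars (64-bit digest) vs 64 hex chars for SHA-256
    return h.hexdigest()

# print(file_hash("README.md"))  # e.g. 'd7f3b2a1c4e5f607' (illustrative)
```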
```diff
@@ -230,13 +320,14 @@ def upsert_file(db: sqlite3.Connection, path: str, last_modified: float, fhash:
         """,
         (path, last_modified, fhash),
     )
-    db.commit()
+    if auto_commit:
+        db.commit()
     # Fetch the id (needed because last_insert_rowid isn't reliable on update)
     row = db.execute("SELECT id FROM files WHERE path = ?", (path,)).fetchone()
     return row[0]
 
 
-def delete_file_data(db: sqlite3.Connection, file_id: int) -> None:
+def delete_file_data(db: sqlite3.Connection, file_id: int, auto_commit: bool = True) -> None:
     """Remove all symbols, embeddings, and references for a file.
 
     This is called before re-indexing to guarantee idempotency.
@@ -251,7 +342,8 @@ def delete_file_data(db: sqlite3.Connection, file_id: int) -> None:
 
     db.execute("DELETE FROM symbols WHERE file_id = ?", (file_id,))
     db.execute("DELETE FROM references_ WHERE file_id = ?", (file_id,))
-    db.commit()
+    if auto_commit:
+        db.commit()
@@ -263,6 +355,7 @@ def upsert_symbol(
     line_end: int,
     parent_symbol_id: int | None,
     source_text: str,
+    auto_commit: bool = True,
 ) -> int:
     """Insert or update a symbol record. Returns the symbol_id."""
     db.execute(
@@ -277,7 +370,8 @@ def upsert_symbol(
         """,
         (name, kind, file_id, line_start, line_end, parent_symbol_id, source_text),
     )
-    db.commit()
+    if auto_commit:
+        db.commit()
     row = db.execute(
         "SELECT id FROM symbols WHERE file_id = ? AND name = ? AND kind = ? AND line_start = ?",
         (file_id, name, kind, line_start),
@@ -286,7 +380,11 @@ def upsert_symbol(
 
 
 def upsert_reference(
-    db: sqlite3.Connection, symbol_name: str, file_id: int, line_number: int
+    db: sqlite3.Connection,
+    symbol_name: str,
+    file_id: int,
+    line_number: int,
+    auto_commit: bool = True,
 ) -> None:
     """Insert or update a cross-reference record."""
     db.execute(
@@ -297,10 +395,16 @@ def upsert_reference(
         """,
         (symbol_name, file_id, line_number),
     )
-    db.commit()
+    if auto_commit:
+        db.commit()
 
 
-def upsert_embedding(db: sqlite3.Connection, symbol_id: int, embedding: list[float]) -> None:
+def upsert_embedding(
+    db: sqlite3.Connection,
+    symbol_id: int,
+    embedding: list[float],
+    auto_commit: bool = True,
+) -> None:
     """Insert or replace a symbol's dense vector embedding."""
     import struct
 
@@ -311,7 +415,8 @@ def upsert_embedding(db: sqlite3.Connection, symbol_id: int, embedding: list[flo
         "INSERT INTO symbol_embeddings (symbol_id, embedding) VALUES (?, ?)",
         (symbol_id, blob),
     )
-    db.commit()
+    if auto_commit:
+        db.commit()
 
 
 # ---------------------------------------------------------------------------
@@ -320,7 +425,12 @@ def upsert_embedding(db: sqlite3.Connection, symbol_id: int, embedding: list[flo
 
 
 def upsert_doc_file(
-    db: sqlite3.Connection, path: str, last_modified: float, fhash: str, doc_type: str
+    db: sqlite3.Connection,
+    path: str,
+    last_modified: float,
+    fhash: str,
+    doc_type: str,
+    auto_commit: bool = True,
 ) -> int:
     """Insert or update a documentation file record. Returns doc_file_id."""
     db.execute(
@@ -334,12 +444,13 @@ def upsert_doc_file(
         """,
         (path, last_modified, fhash, doc_type),
     )
-    db.commit()
+    if auto_commit:
+        db.commit()
     row = db.execute("SELECT id FROM doc_files WHERE path = ?", (path,)).fetchone()
     return row[0]
 
 
-def delete_doc_file_data(db: sqlite3.Connection, doc_file_id: int) -> None:
+def delete_doc_file_data(db: sqlite3.Connection, doc_file_id: int, auto_commit: bool = True) -> None:
     """Remove all chunks and embeddings for a documentation file.
 
     This is called before re-indexing to guarantee idempotency.
@@ -356,7 +467,8 @@ def delete_doc_file_data(db: sqlite3.Connection, doc_file_id: int) -> None:
         db.execute(f"DELETE FROM doc_embeddings WHERE chunk_id IN ({placeholders})", chunk_ids)
 
     db.execute("DELETE FROM doc_chunks WHERE doc_file_id = ?", (doc_file_id,))
-    db.commit()
+    if auto_commit:
+        db.commit()
@@ -367,6 +479,7 @@ def upsert_doc_chunk(
     content: str,
     line_start: int,
     line_end: int,
+    auto_commit: bool = True,
 ) -> int:
     """Insert or update a documentation chunk. Returns chunk_id."""
     db.execute(
@@ -382,7 +495,8 @@ def upsert_doc_chunk(
         """,
         (doc_file_id, chunk_index, section_title, content, line_start, line_end),
     )
-    db.commit()
+    if auto_commit:
+        db.commit()
     row = db.execute(
         "SELECT id FROM doc_chunks WHERE doc_file_id = ? AND chunk_index = ?",
         (doc_file_id, chunk_index),
@@ -390,7 +504,12 @@ def upsert_doc_chunk(
     return row[0]
 
 
-def upsert_doc_embedding(db: sqlite3.Connection, chunk_id: int, embedding: list[float]) -> None:
+def upsert_doc_embedding(
+    db: sqlite3.Connection,
+    chunk_id: int,
+    embedding: list[float],
+    auto_commit: bool = True,
+) -> None:
     """Insert or replace a documentation chunk's dense vector embedding."""
     import struct
 
@@ -400,4 +519,5 @@ def upsert_doc_embedding(db: sqlite3.Connection, chunk_id: int, embedding: list[
         "INSERT INTO doc_embeddings (chunk_id, embedding) VALUES (?, ?)",
         (chunk_id, blob),
     )
-    db.commit()
+    if auto_commit:
+        db.commit()
```
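Taken together, the write helpers above keep their old commit-per-call behavior by default, while bulk indexers pass auto_commit=False and let transaction() issue a single commit. A self-contained sketch of that contract (the notes table and upsert_note helper are hypothetical stand-ins for the real upsert_* functions; the connection uses isolation_level=None so the explicit BEGIN inside transaction() is valid):

```python
import sqlite3
from contextlib import contextmanager

@contextmanager
def transaction(db: sqlite3.Connection):
    # Mirrors the transaction() added to db.py above.
    db.execute("BEGIN")
    try:
        yield db
        db.commit()
    except Exception:
        db.rollback()
        raise

def upsert_note(db, text, auto_commit=True):
    # Stand-in for the upsert_* helpers; "notes" is a hypothetical table.
    db.execute("INSERT INTO notes (text) VALUES (?)", (text,))
    if auto_commit:
        db.commit()

db = sqlite3.connect(":memory:", isolation_level=None)  # autocommit mode
db.execute("CREATE TABLE notes (text TEXT)")

upsert_note(db, "one-off write")       # default path: commit per call

with transaction(db):                  # bulk path: one commit for N writes
    for text in ("a", "b", "c"):
        upsert_note(db, text, auto_commit=False)

try:
    with transaction(db):
        upsert_note(db, "doomed", auto_commit=False)
        raise RuntimeError("simulated indexing failure")
except RuntimeError:
    pass                               # 'doomed' was rolled back

print(db.execute("SELECT COUNT(*) FROM notes").fetchone()[0])  # -> 4
```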
{code_memory-1.0.3 → code_memory-1.0.5}/doc_parser.py +97 -65

```diff
@@ -7,10 +7,8 @@ and indexes them for hybrid retrieval (BM25 + vector search).
 
 from __future__ import annotations
 
-import hashlib
 import os
 import re
-from pathlib import Path
 
 from markdown_it import MarkdownIt
 
@@ -239,7 +237,7 @@ def index_doc_file(
     overlap: int = DEFAULT_OVERLAP,
     min_chunk_size: int = DEFAULT_MIN_CHUNK_SIZE,
 ) -> dict:
-    """Index a documentation file.
+    """Index a documentation file with batch embeddings and transaction.
 
     Args:
         filepath: Path to the documentation file.
@@ -259,7 +257,7 @@ def index_doc_file(
     # Check if file has changed
     stat = os.stat(abs_path)
     last_modified = stat.st_mtime
-    fhash = db_mod.file_hash(abs_path)
+    fhash = db_mod.file_hash(abs_path)  # Now uses xxHash
 
     existing = db.execute(
         "SELECT id, file_hash FROM doc_files WHERE path = ?", (abs_path,)
@@ -285,8 +283,9 @@
     # Parse and chunk
     sections = parse_markdown_sections(abs_path)
 
-
-
+    # === BATCH PROCESSING ===
+    chunks_to_store: list[dict] = []
+    embed_inputs: list[str] = []
 
     for section in sections:
         content = section["content"]
@@ -300,22 +299,34 @@
         if len(sub_content) < min_chunk_size:
             continue
 
-
-
-
-
-            section["
-
-
-
-        )
-
-        # Generate and store embedding
-        embedding = db_mod.embed_text(f"{section['section_title'] or ''}: {sub_content}")
-        db_mod.upsert_doc_embedding(db, chunk_id, embedding)
+        chunks_to_store.append({
+            "section_title": section["section_title"],
+            "content": sub_content,
+            "line_start": section["line_start"],
+            "line_end": section["line_end"],
+        })
+        embed_input = f"{section['section_title'] or ''}: {sub_content}"
+        embed_inputs.append(embed_input)
 
-
-
+    # Batch embed all chunks
+    chunks_indexed = 0
+    if embed_inputs:
+        embeddings = db_mod.embed_texts_batch(embed_inputs, batch_size=64)
+
+        with db_mod.transaction(db):
+            for i, chunk in enumerate(chunks_to_store):
+                chunk_id = db_mod.upsert_doc_chunk(
+                    db,
+                    doc_file_id,
+                    i,  # chunk_index
+                    chunk["section_title"],
+                    chunk["content"],
+                    chunk["line_start"],
+                    chunk["line_end"],
+                    auto_commit=False,
+                )
+                db_mod.upsert_doc_embedding(db, chunk_id, embeddings[i], auto_commit=False)
+                chunks_indexed += 1
 
     return {
         "file": filepath,
```
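The reworked index_doc_file() illustrates the two-phase shape the 1.0.5 indexers share: collect chunks with no model or DB calls, embed them in one batch, then write rows and vectors inside one transaction. A distilled sketch of the pattern (embed_batch and store are hypothetical stand-ins for db_mod.embed_texts_batch and the upsert helpers; the length threshold is an arbitrary stand-in for min_chunk_size):

```python
from typing import Callable

def index_two_phase(
    sections: list[dict],
    embed_batch: Callable[[list[str]], list[list[float]]],
    store: Callable[[dict, list[float]], None],
) -> int:
    """Collect, then batch-embed, then write, as in index_doc_file()."""
    # Phase 1: pure collection -- no model calls, no DB writes.
    chunks = [s for s in sections if len(s["content"]) >= 50]
    inputs = [f"{s.get('section_title') or ''}: {s['content']}" for s in chunks]
    if not inputs:
        return 0

    # Phase 2: one batched model call amortizes per-call overhead.
    vectors = embed_batch(inputs)

    # Phase 3: in the real code every store() runs with auto_commit=False
    # inside a single transaction(db) block, so there is exactly one commit.
    for chunk, vec in zip(chunks, vectors):
        store(chunk, vec)
    return len(chunks)
```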
```diff
@@ -356,8 +367,7 @@ def index_doc_directory(dirpath: str, db) -> list[dict]:
 def extract_docstrings_from_code(db) -> list[dict]:
     """Extract docstrings from already-indexed code symbols.
 
-
-    docstrings from the source_text field.
+    Uses batch embedding generation for better performance.
 
     Args:
         db: Database connection.
@@ -377,6 +387,10 @@ def extract_docstrings_from_code(db) -> list[dict]:
         """
     ).fetchall()
 
+    # === BATCH PROCESSING ===
+    docstrings_to_store: list[dict] = []
+    embed_inputs: list[str] = []
+
     for row in rows:
         symbol_id, name, kind, file_path, line_start, line_end, source_text = row
 
@@ -398,50 +412,68 @@ def extract_docstrings_from_code(db) -> list[dict]:
         if existing:
             continue
 
-        # Create a doc_file entry for the code file if needed
-        doc_file = db.execute(
-            "SELECT id FROM doc_files WHERE path = ?", (file_path,)
-        ).fetchone()
-
-        if not doc_file:
-            # Get file stats
-            stat = os.stat(file_path) if os.path.exists(file_path) else None
-            doc_file_id = db_mod.upsert_doc_file(
-                db,
-                file_path,
-                stat.st_mtime if stat else 0,
-                db_mod.file_hash(file_path) if stat else "",
-                "docstring",
-            )
-        else:
-            doc_file_id = doc_file[0]
-
-        # Get next chunk index
-        max_idx = db.execute(
-            "SELECT COALESCE(MAX(chunk_index), -1) FROM doc_chunks WHERE doc_file_id = ?",
-            (doc_file_id,),
-        ).fetchone()[0]
-
-        chunk_id = db_mod.upsert_doc_chunk(
-            db,
-            doc_file_id,
-            max_idx + 1,
-            name,  # Use symbol name as section title
-            docstring,
-            line_start,
-            line_end,
-        )
-
-        # Generate and store embedding
-        embedding = db_mod.embed_text(f"{kind} {name}: {docstring}")
-        db_mod.upsert_doc_embedding(db, chunk_id, embedding)
-
-        results.append({
-            "symbol": name,
+        docstrings_to_store.append({
+            "name": name,
             "kind": kind,
-            "file": file_path,
-            "docstring_length": len(docstring),
+            "file_path": file_path,
+            "line_start": line_start,
+            "line_end": line_end,
+            "docstring": docstring,
         })
+        embed_inputs.append(f"{kind} {name}: {docstring}")
+
+    # Batch embed all docstrings
+    if embed_inputs:
+        embeddings = db_mod.embed_texts_batch(embed_inputs, batch_size=64)
+
+        with db_mod.transaction(db):
+            for i, doc_info in enumerate(docstrings_to_store):
+                file_path = doc_info["file_path"]
+
+                # Create a doc_file entry for the code file if needed
+                doc_file = db.execute(
+                    "SELECT id FROM doc_files WHERE path = ?", (file_path,)
+                ).fetchone()
+
+                if not doc_file:
+                    # Get file stats
+                    stat = os.stat(file_path) if os.path.exists(file_path) else None
+                    doc_file_id = db_mod.upsert_doc_file(
+                        db,
+                        file_path,
+                        stat.st_mtime if stat else 0,
+                        db_mod.file_hash(file_path) if stat else "",
+                        "docstring",
+                        auto_commit=False,
+                    )
+                else:
+                    doc_file_id = doc_file[0]
+
+                # Get next chunk index
+                max_idx = db.execute(
+                    "SELECT COALESCE(MAX(chunk_index), -1) FROM doc_chunks WHERE doc_file_id = ?",
+                    (doc_file_id,),
+                ).fetchone()[0]
+
+                chunk_id = db_mod.upsert_doc_chunk(
+                    db,
+                    doc_file_id,
+                    max_idx + 1,
+                    doc_info["name"],  # Use symbol name as section title
+                    doc_info["docstring"],
+                    doc_info["line_start"],
+                    doc_info["line_end"],
+                    auto_commit=False,
+                )
+
+                db_mod.upsert_doc_embedding(db, chunk_id, embeddings[i], auto_commit=False)
+
+                results.append({
+                    "symbol": doc_info["name"],
+                    "kind": doc_info["kind"],
+                    "file": file_path,
+                    "docstring_length": len(doc_info["docstring"]),
+                })
 
     return results
```
{code_memory-1.0.3 → code_memory-1.0.5}/git_search.py +4 -5

```diff
@@ -15,14 +15,13 @@ Design rules
 
 from __future__ import annotations
 
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 from pathlib import Path
 from typing import Any
 
 import git
 from git.exc import InvalidGitRepositoryError, NoSuchPathError
 
-
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
@@ -34,7 +33,7 @@ def _commit_to_dict(commit: git.Commit, *, include_files_changed_count: bool = F
         include_files_changed_count: If True, compute the number of files
             changed (triggers a diff — slow for bulk iteration).
     """
-    dt = datetime.fromtimestamp(commit.committed_date, tz=timezone.utc)
+    dt = datetime.fromtimestamp(commit.committed_date, tz=UTC)
     result: dict[str, Any] = {
         "hash": commit.hexsha[:7],
         "full_hash": commit.hexsha,
@@ -143,7 +142,7 @@ def get_commit_detail(
         return {"error": f"Could not resolve commit '{commit_hash}': {exc}"}
 
     try:
-        dt = datetime.fromtimestamp(commit.committed_date, tz=timezone.utc)
+        dt = datetime.fromtimestamp(commit.committed_date, tz=UTC)
 
         parent_hashes = [p.hexsha[:7] for p in commit.parents]
 
@@ -271,7 +270,7 @@ def get_blame(
         "full_hash": commit.hexsha,
         "author": str(commit.author),
         "date": datetime.fromtimestamp(
-            commit.committed_date, tz=timezone.utc
+            commit.committed_date, tz=UTC
         ).isoformat(),
         "line_content": line_text,
         "commit_message": commit.message.strip().split("\n")[0],
```
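Background on the datetime change: datetime.UTC, added in Python 3.11, is the same object as datetime.timezone.utc, so on 3.11+ the rewrite is purely cosmetic. A minimal sketch (the timestamp is arbitrary):

```python
from datetime import UTC, datetime, timezone

ts = 1700000000  # arbitrary Unix timestamp, e.g. a commit.committed_date

assert UTC is timezone.utc          # UTC is just the shorter 3.11+ spelling
dt = datetime.fromtimestamp(ts, tz=UTC)
print(dt.isoformat())               # 2023-11-14T22:13:20+00:00, tz-aware
```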
{code_memory-1.0.3 → code_memory-1.0.5}/logging_config.py +24 -1

```diff
@@ -10,6 +10,8 @@ from __future__ import annotations
 import logging
 import os
 import sys
+import time
+from contextlib import contextmanager
 from datetime import datetime
 from typing import TextIO
 
@@ -24,6 +26,27 @@ DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
 _initialized = False
 
 
+@contextmanager
+def log_timing(operation_name: str, logger: logging.Logger):
+    """Context manager to log operation timing.
+
+    Args:
+        operation_name: Name of the operation being timed.
+        logger: Logger instance to use for logging.
+
+    Example:
+        with log_timing("Indexing myfile.py", logger):
+            # ... indexing code ...
+    """
+    start = time.perf_counter()
+    logger.debug(f"{operation_name} started")
+    try:
+        yield
+    finally:
+        elapsed = time.perf_counter() - start
+        logger.info(f"{operation_name} completed in {elapsed:.2f}s")
+
+
 def setup_logging(level: str = LOG_LEVEL, stream: TextIO = sys.stderr) -> logging.Logger:
     """Configure structured logging for code-memory.
 
@@ -96,7 +119,7 @@ class ToolLogger:
         self.result_count: int | None = None
         self.error: str | None = None
 
-    def __enter__(self) -> "ToolLogger":
+    def __enter__(self) -> ToolLogger:
         self.start_time = datetime.now()
         # Sanitize params for logging (don't log sensitive data)
         safe_params = {k: v for k, v in self.params.items() if v is not None}
```