gdmcode 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gdmcode-0.1.0.dist-info/METADATA +240 -0
- gdmcode-0.1.0.dist-info/RECORD +131 -0
- gdmcode-0.1.0.dist-info/WHEEL +4 -0
- gdmcode-0.1.0.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/_internal/__init__.py +0 -0
- src/_internal/constants.py +244 -0
- src/_internal/domain_skills.py +339 -0
- src/agent/__init__.py +0 -0
- src/agent/commit_classifier.py +91 -0
- src/agent/context_budget.py +391 -0
- src/agent/daemon.py +681 -0
- src/agent/dag_validator.py +153 -0
- src/agent/debug_loop.py +473 -0
- src/agent/impact_analyzer.py +149 -0
- src/agent/impact_graph.py +117 -0
- src/agent/loop.py +1410 -0
- src/agent/orchestrator.py +141 -0
- src/agent/regression_guard.py +251 -0
- src/agent/review_gate.py +648 -0
- src/agent/risk_scorer.py +169 -0
- src/agent/self_healing.py +145 -0
- src/agent/smart_test_selector.py +89 -0
- src/agent/system_prompt.py +226 -0
- src/agent/task_tracker.py +320 -0
- src/agent/test_validator.py +210 -0
- src/agent/tool_orchestrator.py +402 -0
- src/agent/transcript.py +230 -0
- src/agent/verification_loop.py +133 -0
- src/agent/work_director.py +136 -0
- src/agent/worktree_manager.py +53 -0
- src/artifacts/__init__.py +16 -0
- src/artifacts/artifact_store.py +456 -0
- src/artifacts/verification_graph.py +75 -0
- src/auth.py +411 -0
- src/cli.py +1290 -0
- src/commands.py +1398 -0
- src/config.py +762 -0
- src/cost_tracker.py +348 -0
- src/db/__init__.py +4 -0
- src/db/migrations.py +337 -0
- src/enterprise/__init__.py +3 -0
- src/enterprise/audit_log.py +182 -0
- src/enterprise/identity.py +90 -0
- src/enterprise/rbac.py +100 -0
- src/enterprise/team_config.py +125 -0
- src/enterprise/usage_analytics.py +261 -0
- src/exceptions.py +207 -0
- src/git_workflow.py +651 -0
- src/integrations/__init__.py +6 -0
- src/integrations/github_actions.py +106 -0
- src/integrations/mcp_server.py +333 -0
- src/integrations/sentry_integration.py +100 -0
- src/integrations/sentry_server.py +82 -0
- src/integrations/webhook_security.py +19 -0
- src/main.py +27 -0
- src/memory/__init__.py +0 -0
- src/memory/code_index.py +376 -0
- src/memory/compressor.py +378 -0
- src/memory/context_memory.py +135 -0
- src/memory/continuous_memory.py +234 -0
- src/memory/conventions.py +495 -0
- src/memory/db.py +1119 -0
- src/memory/document_index.py +205 -0
- src/memory/file_cache.py +128 -0
- src/memory/project_scanner.py +178 -0
- src/memory/session_store.py +201 -0
- src/models/__init__.py +0 -0
- src/models/client.py +715 -0
- src/models/definitions.py +459 -0
- src/models/router.py +418 -0
- src/models/schemas.py +389 -0
- src/permissions.py +294 -0
- src/remote/__init__.py +5 -0
- src/remote/command_filter.py +33 -0
- src/remote/models.py +31 -0
- src/remote/permission_handler.py +79 -0
- src/remote/phone_ui.py +48 -0
- src/remote/protocol.py +59 -0
- src/remote/qr.py +65 -0
- src/remote/server.py +586 -0
- src/remote/token_manager.py +61 -0
- src/remote/tunnel.py +212 -0
- src/repl.py +475 -0
- src/runtime/__init__.py +1 -0
- src/runtime/branch_farm.py +372 -0
- src/runtime/replay.py +351 -0
- src/sandbox/__init__.py +2 -0
- src/sandbox/hermetic.py +214 -0
- src/sandbox/policy.py +44 -0
- src/sdk/__init__.py +3 -0
- src/sdk/plugin_base.py +39 -0
- src/sdk/plugin_host.py +100 -0
- src/sdk/plugin_loader.py +101 -0
- src/security.py +409 -0
- src/server/__init__.py +7 -0
- src/server/bridge.py +427 -0
- src/server/bridge_cli.py +103 -0
- src/server/bridge_client.py +170 -0
- src/server/protocol_version.py +103 -0
- src/session/__init__.py +10 -0
- src/session/event_fanout.py +46 -0
- src/session/input_broker.py +38 -0
- src/session/permission_bridge.py +100 -0
- src/tools/__init__.py +160 -0
- src/tools/_atomic.py +72 -0
- src/tools/agent_tools.py +423 -0
- src/tools/ask_user_tool.py +83 -0
- src/tools/bash_tool.py +384 -0
- src/tools/browser_tool.py +352 -0
- src/tools/browser_tools.py +179 -0
- src/tools/dep_tools.py +210 -0
- src/tools/document_reader.py +167 -0
- src/tools/document_tool.py +240 -0
- src/tools/document_writer.py +171 -0
- src/tools/impact_tools.py +240 -0
- src/tools/playwright_tool.py +172 -0
- src/tools/quality_tools.py +366 -0
- src/tools/read_tools.py +318 -0
- src/tools/result_cache.py +157 -0
- src/tools/search_tools.py +310 -0
- src/tools/shell_tools.py +311 -0
- src/tools/write_tools.py +337 -0
- src/voice/__init__.py +25 -0
- src/voice/audio_capture.py +92 -0
- src/voice/audio_playback.py +68 -0
- src/voice/errors.py +14 -0
- src/voice/models.py +35 -0
- src/voice/providers.py +143 -0
- src/voice/vad.py +55 -0
- src/voice/voice_loop.py +156 -0
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""DocumentIndex — SQLite FTS5 full-text index for local documents.
|
|
2
|
+
|
|
3
|
+
The index DB lives at ~/.config/gdm/document_index.db by default.
|
|
4
|
+
Pass db_path= in tests to use a temp database.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
import logging, sqlite3, threading, time
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
log = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
__all__ = ["DocumentIndex", "IndexedChunk", "SearchResult"]
|
|
15
|
+
|
|
16
|
+
# Default on-disk location of the index database. DocumentIndex falls back
# to this path when it is constructed without an explicit db_path.
INDEX_DB = Path.home().joinpath(".config", "gdm", "document_index.db")

# Word budget used by DocumentIndex._chunk when grouping paragraphs.
CHUNK_MAX_WORDS = 500
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class IndexedChunk:
    """One searchable slice of a document, as built by DocumentIndex._chunk."""

    # DocumentIndex._chunk always passes 0 here.
    doc_id: int
    # Zero-based position of the chunk within its document.
    chunk_index: int
    # Chunk body; this is the column that gets full-text indexed.
    text: str
    # Human-readable origin, e.g. "Sheet: Revenue" or "Page 3".
    source_label: str
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class SearchResult:
    """A single hit returned by DocumentIndex.search."""

    # Path string the matching chunk was indexed under.
    file_path: str
    # Label of the matching chunk (sheet name or "Page N").
    source_label: str
    # FTS5 snippet() excerpt; matched terms are wrapped in "[" and "]".
    snippet: str
    # Absolute value of the FTS5 rank column.
    score: float
    # Position of the matching chunk within its document.
    chunk_index: int
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class DocumentIndex:
    """SQLite FTS5 full-text index over local documents.

    A single SQLite connection is opened lazily and shared between threads
    (``check_same_thread=False``); every use of it is serialised through
    ``self._lock``.

    The instance can be used as a context manager; leaving the ``with``
    block closes the connection.
    """

    def __init__(self, db_path: Optional[Path] = None):
        """Open (creating if needed) the index database.

        Args:
            db_path: Location of the SQLite file. ``None`` (or any falsy
                value) selects ``INDEX_DB``. A plain string is accepted and
                converted to a ``Path``.
        """
        self._path = Path(db_path) if db_path else INDEX_DB
        self._lock = threading.Lock()
        self._conn: Optional[sqlite3.Connection] = None
        self._init_db()

    def __enter__(self) -> "DocumentIndex":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def _get_conn(self) -> sqlite3.Connection:
        """Return the shared connection, opening it on first use.

        Callers are expected to hold ``self._lock``.
        """
        if self._conn is None:
            self._path.parent.mkdir(parents=True, exist_ok=True)
            self._conn = sqlite3.connect(str(self._path), check_same_thread=False)
            self._conn.row_factory = sqlite3.Row
        return self._conn

    def _init_db(self) -> None:
        """Create the metadata table and the FTS5 chunk table if missing."""
        with self._lock:
            conn = self._get_conn()
            conn.executescript("""
                CREATE TABLE IF NOT EXISTS indexed_documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    file_path TEXT UNIQUE NOT NULL,
                    format TEXT NOT NULL,
                    file_mtime REAL NOT NULL,
                    chunk_count INTEGER NOT NULL DEFAULT 0,
                    indexed_at REAL NOT NULL
                );
                CREATE VIRTUAL TABLE IF NOT EXISTS document_chunks USING fts5(
                    file_path UNINDEXED,
                    source_label UNINDEXED,
                    chunk_index UNINDEXED,
                    text,
                    tokenize='porter unicode61'
                );
            """)
            conn.commit()

    def index_document(self, path: Path | str, force: bool = False) -> int:
        """Index the document at ``path``.

        Args:
            path: File to index. It is stored under ``str(Path(path))``.
            force: Re-index even when the stored mtime matches the file.

        Returns:
            The number of chunks written, ``0`` when the file's mtime is
            unchanged (and ``force`` is false), or ``-1`` when the file
            cannot be stat'ed or the reader reports failure.

        Raises:
            sqlite3.Error: If writing the index fails. The write is rolled
                back first, so the previously indexed content is kept.
        """
        from src.tools.document_reader import DocumentReader

        path = Path(path)
        path_str = str(path)
        try:
            mtime = path.stat().st_mtime
        except OSError as exc:
            log.warning("Cannot stat %s: %s", path, exc)
            return -1

        if not force and self._is_up_to_date(path_str, mtime):
            return 0  # up to date

        # Reading and chunking happen outside the lock so a slow document
        # does not block concurrent searches.
        content = DocumentReader().read(path)
        if not content.success:
            log.warning("Cannot index %s: %s", path, content.error)
            return -1

        chunks = self._chunk(content)
        self._store_chunks(path_str, content.format, mtime, chunks)
        return len(chunks)

    def _is_up_to_date(self, path_str: str, mtime: float) -> bool:
        """True when ``path_str`` is indexed with an mtime within 1 ms of ``mtime``."""
        with self._lock:
            row = self._get_conn().execute(
                "SELECT file_mtime FROM indexed_documents WHERE file_path=?",
                (path_str,),
            ).fetchone()
        return row is not None and abs(row["file_mtime"] - mtime) < 0.001

    def _store_chunks(self, path_str: str, doc_format: str, mtime: float, chunks) -> None:
        """Atomically replace everything stored for ``path_str`` with ``chunks``."""
        rows = [(path_str, c.source_label, c.chunk_index, c.text) for c in chunks]
        with self._lock:
            conn = self._get_conn()
            # The connection context manager commits on success and rolls
            # back on any exception. Without the rollback, a failed insert
            # would leave the DELETEs pending on the shared connection and
            # the next unrelated commit would silently drop the old entry.
            with conn:
                conn.execute(
                    "DELETE FROM document_chunks WHERE file_path=?", (path_str,)
                )
                conn.execute(
                    "DELETE FROM indexed_documents WHERE file_path=?", (path_str,)
                )
                conn.executemany(
                    "INSERT INTO document_chunks (file_path, source_label, chunk_index, text) VALUES (?,?,?,?)",
                    rows,
                )
                conn.execute(
                    "INSERT INTO indexed_documents (file_path, format, file_mtime, chunk_count, indexed_at) VALUES (?,?,?,?,?)",
                    (path_str, doc_format, mtime, len(rows), time.time()),
                )

    def search(self, query: str, limit: int = 10) -> list[SearchResult]:
        """Full-text search across all indexed documents.

        Args:
            query: Expression handed to FTS5 ``MATCH``.
            limit: Value bound to the SQL ``LIMIT`` clause.

        Returns:
            Hits ordered by FTS5 rank. An empty list is returned for a blank
            query and when SQLite raises ``OperationalError`` (which is
            logged as a warning).
        """
        if not query or not query.strip():
            # A blank MATCH expression is an FTS5 syntax error; skip the
            # round trip (and the warning) and return the same empty result.
            return []
        try:
            with self._lock:
                rows = self._get_conn().execute(
                    """SELECT file_path, source_label, chunk_index,
                              snippet(document_chunks, 3, '[', ']', '...', 20) as snip,
                              rank
                       FROM document_chunks
                       WHERE document_chunks MATCH ?
                       ORDER BY rank
                       LIMIT ?""",
                    (query, limit),
                ).fetchall()
        except sqlite3.OperationalError as exc:
            log.warning("FTS5 search error: %s", exc)
            return []
        return [
            SearchResult(
                file_path=row["file_path"],
                source_label=row["source_label"],
                snippet=row["snip"],
                score=abs(row["rank"]),
                chunk_index=row["chunk_index"],
            )
            for row in rows
        ]

    def list_indexed(self) -> list[dict]:
        """Return one dict per indexed document, most recently indexed first.

        Each dict has the keys ``file_path``, ``format``, ``chunk_count``
        and ``indexed_at``.
        """
        with self._lock:
            rows = self._get_conn().execute(
                "SELECT file_path, format, chunk_count, indexed_at FROM indexed_documents ORDER BY indexed_at DESC"
            ).fetchall()
        return [dict(r) for r in rows]

    def remove(self, path: Path | str) -> None:
        """Delete the chunks and the metadata row stored for ``path``."""
        path_str = str(Path(path))
        with self._lock:
            conn = self._get_conn()
            # Both deletes commit together, or neither does.
            with conn:
                conn.execute("DELETE FROM document_chunks WHERE file_path=?", (path_str,))
                conn.execute("DELETE FROM indexed_documents WHERE file_path=?", (path_str,))

    def _chunk(self, content) -> list[IndexedChunk]:
        """Split reader output into index chunks.

        Spreadsheets (``content.sheets`` truthy) yield one chunk per sheet
        whose text is not blank, truncated to 8000 characters.

        Everything else is split on blank lines; paragraphs are grouped
        until adding the next one would exceed ``CHUNK_MAX_WORDS``. A single
        paragraph larger than the budget becomes its own oversized chunk.
        The "Page N" label is a running chunk counter, not a page number
        taken from the document.
        """
        chunks: list[IndexedChunk] = []
        # Spreadsheets: one chunk per sheet
        if content.sheets:
            for sheet in content.sheets:
                text = sheet.to_text()
                if text.strip():
                    chunks.append(IndexedChunk(
                        doc_id=0, chunk_index=len(chunks),
                        text=text[:8000],
                        source_label=f"Sheet: {sheet.name}",
                    ))
            return chunks

        # Text/PDF/DOCX: split on paragraphs, group into ~500 word chunks.
        # "or ''" keeps a reader result whose text is None from crashing.
        paragraphs = [p.strip() for p in (content.text or "").split("\n\n") if p.strip()]
        current_words = 0
        current_parts: list[str] = []
        page_hint = 1
        for para in paragraphs:
            words = len(para.split())
            if current_words + words > CHUNK_MAX_WORDS and current_parts:
                chunks.append(IndexedChunk(
                    doc_id=0, chunk_index=len(chunks),
                    text="\n\n".join(current_parts),
                    source_label=f"Page {page_hint}",
                ))
                page_hint += 1
                current_parts = []
                current_words = 0
            current_parts.append(para)
            current_words += words
        if current_parts:
            chunks.append(IndexedChunk(
                doc_id=0, chunk_index=len(chunks),
                text="\n\n".join(current_parts),
                source_label=f"Page {page_hint}",
            ))
        return chunks

    def close(self) -> None:
        """Close the connection if open; a later call reopens it lazily."""
        # Taken under the lock so the connection is not closed while another
        # thread is in the middle of a statement.
        with self._lock:
            if self._conn:
                self._conn.close()
                self._conn = None
|
src/memory/file_cache.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""mtime-based file freshness cache — anti-hallucination core.
|
|
2
|
+
|
|
3
|
+
Tracks when each file was last read and its mtime at that time.
|
|
4
|
+
Used to detect stale file content in context and force re-reads.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import TYPE_CHECKING
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from src.memory.db import GdmDatabase
|
|
15
|
+
|
|
16
|
+
__all__ = ["CacheEntry", "FileCache"]
|
|
17
|
+
|
|
18
|
+
log = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
# Upper bound on the number of paths bound into a single
# "DELETE ... WHERE path IN (...)" statement by FileCache._delete_batch.
_PURGE_BATCH_SIZE: int = 500
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class CacheEntry:
    """Snapshot of one row of the file cache, as returned by FileCache.get_entry."""

    # Resolved absolute path the row is stored under.
    path: Path
    # Modification time recorded for the file.
    mtime: float
    # Optional caller-supplied summary stored alongside the mtime.
    summary: str | None
    # ISO timestamp
    last_read_at: str
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class FileCache:
    """Per-project record of file mtimes, used to spot stale reads.

    Each ``mark_read`` stores the file's current mtime; ``is_fresh`` later
    compares that stored value with the mtime on disk. Paths are resolved to
    absolute form before they are used as keys.

    Usage::

        cache = FileCache(db, project_id)
        cache.mark_read(Path("src/auth.py"))
        is_fresh = cache.is_fresh(Path("src/auth.py"))  # False if file changed
    """

    def __init__(self, db: GdmDatabase, project_id: str) -> None:
        self._project_id = project_id
        self._db = db

    @staticmethod
    def _normalize(path: Path) -> Path:
        """Resolve ``path`` so every lookup uses the same absolute key."""
        return path.resolve()

    def mark_read(self, path: Path, summary: str | None = None) -> None:
        """Store the file's current mtime (and optional summary).

        If the file cannot be stat'ed a warning is logged and nothing is
        written.
        """
        resolved = self._normalize(path)
        try:
            stat_result = resolved.stat()
        except OSError:
            log.warning("mark_read: cannot stat %s", resolved)
            return
        self._db.upsert_file_cache(
            self._project_id, str(resolved), stat_result.st_mtime, summary
        )

    def is_fresh(self, path: Path) -> bool:
        """Report whether the file is unchanged since it was last marked read.

        Returns:
            ``True`` when there is no cache row for the path (never read is
            treated as fresh) or when the stored mtime equals the mtime on
            disk; ``False`` when the mtime differs or the file cannot be
            stat'ed.
        """
        resolved = self._normalize(path)
        cached = self._db.get_file_cache(self._project_id, str(resolved))
        if cached is None:
            # never read — assumed fresh per spec
            return True
        try:
            on_disk = resolved.stat().st_mtime
        except OSError:
            return False
        return float(cached["mtime"]) == on_disk

    def get_stale_paths(self, paths: list[Path]) -> list[Path]:
        """Return, in input order, the paths for which ``is_fresh`` is false."""
        stale: list[Path] = []
        for candidate in paths:
            if self.is_fresh(candidate):
                continue
            stale.append(candidate)
        return stale

    def invalidate(self, path: Path) -> None:
        """Delete the cache row for ``path`` (e.g. after writing the file)."""
        key = str(self._normalize(path))
        self._db.execute(
            "DELETE FROM file_cache WHERE project_id = ? AND path = ?",
            (self._project_id, key),
        )

    def get_entry(self, path: Path) -> CacheEntry | None:
        """Return the cache row for ``path`` as a CacheEntry, or ``None`` if untracked."""
        resolved = self._normalize(path)
        cached = self._db.get_file_cache(self._project_id, str(resolved))
        if cached is None:
            return None
        return CacheEntry(
            path=resolved,
            mtime=float(cached["mtime"]),
            summary=cached["summary"],
            last_read_at=cached["last_read_at"],
        )

    def purge_deleted(self) -> int:
        """Drop rows whose file no longer exists; return how many were dropped."""
        rows = self._db.execute_all(
            "SELECT path FROM file_cache WHERE project_id = ?",
            (self._project_id,),
        )
        missing: list[str] = []
        for row in rows:
            if not Path(row["path"]).exists():
                missing.append(row["path"])
        if missing:
            return self._delete_batch(missing)
        return 0

    def _delete_batch(self, paths: list[str]) -> int:
        """Delete the given path strings in slices of ``_PURGE_BATCH_SIZE``.

        Returns the number of paths submitted for deletion.
        """
        total = 0
        for start in range(0, len(paths), _PURGE_BATCH_SIZE):
            group = paths[start : start + _PURGE_BATCH_SIZE]
            # One "?" placeholder per path in this slice.
            marks = ",".join(["?"] * len(group))
            self._db.execute(
                f"DELETE FROM file_cache WHERE project_id = ? AND path IN ({marks})",
                (self._project_id, *group),
            )
            total += len(group)
        return total
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""Initial project record builder — runs once on first invocation.
|
|
2
|
+
|
|
3
|
+
Updates on /memory refresh. Detects tech stack from marker files.
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import hashlib
|
|
8
|
+
import json
|
|
9
|
+
import logging
|
|
10
|
+
import sqlite3
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from datetime import datetime, timezone
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import TYPE_CHECKING
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from src.memory.db import GdmDatabase
|
|
18
|
+
|
|
19
|
+
__all__ = ["ProjectRecord", "ProjectScanner", "estimate_token_count"]
|
|
20
|
+
|
|
21
|
+
log = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
# Age threshold, in whole days, compared against a record's last_seen by
# ProjectScanner._refresh_record before the tech stack is re-detected.
_REFRESH_DAYS: int = 7

# Marker name -> technology. ProjectScanner._detect_tech_stack checks for
# each name directly under the project root.
_TECH_MARKERS: dict[str, str] = {
    # Python
    "pyproject.toml": "python",
    "requirements.txt": "python",
    "setup.py": "python",
    # JavaScript / TypeScript
    "package.json": "nodejs",
    "tsconfig.json": "typescript",
    # Compiled languages
    "Cargo.toml": "rust",
    "go.mod": "golang",
    "pom.xml": "java",
    "build.gradle": "java",
    # Containers and infrastructure
    "Dockerfile": "docker",
    "docker-compose.yml": "docker",
    ".terraform": "terraform",
    # Other ecosystems
    "pubspec.yaml": "dart",
    "mix.exs": "elixir",
    "Gemfile": "ruby",
    "composer.json": "php",
}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class ProjectRecord:
    """In-memory view of one project row, as returned by ProjectScanner."""

    # Identifier stored in the projects table.
    project_id: str
    # Resolved project root directory.
    root_path: Path
    # Display name; ProjectScanner.scan uses the root directory's name.
    name: str
    # e.g. ["python", "typescript", "docker"]
    tech_stack: list[str]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class ProjectScanner:
    """Creates and maintains the ``projects`` row for a project directory.

    The tech stack is inferred only from the presence of marker files under
    the project root; no file contents are read.

    Usage::

        scanner = ProjectScanner(db)
        record = scanner.ensure_project(root_path)  # creates or updates
    """

    def __init__(self, db: GdmDatabase) -> None:
        self._db = db

    def ensure_project(self, root: Path) -> ProjectRecord:
        """Return the record for ``root``, scanning it first if it is unknown.

        A known project has its ``last_seen`` bumped, and its tech stack
        re-detected when the stored ``last_seen`` is ``_REFRESH_DAYS`` or
        more days old.
        """
        resolved = root.resolve()
        existing = self._db.execute_one(
            "SELECT project_id, root_path, name, tech_stack, last_seen "
            "FROM projects WHERE root_path = ?",
            (str(resolved),),
        )
        if existing is not None:
            return self._refresh_record(existing, resolved)
        return self.scan(root)

    def scan(self, root: Path) -> ProjectRecord:
        """Detect the tech stack and upsert the project row.

        On a ``root_path`` conflict the existing row keeps its project_id and
        only name, tech_stack and last_seen are updated, so the id is read
        back from the table rather than assumed.
        """
        resolved = root.resolve()
        root_key = str(resolved)
        candidate_id = self._generate_project_id(resolved)
        display_name = resolved.name
        stack = self._detect_tech_stack(resolved)
        self._db.execute(
            "INSERT INTO projects (project_id, root_path, name, tech_stack) "
            "VALUES (?, ?, ?, ?) "
            "ON CONFLICT(root_path) DO UPDATE SET "
            "name = excluded.name, "
            "tech_stack = excluded.tech_stack, "
            "last_seen = datetime('now')",
            (candidate_id, root_key, display_name, json.dumps(stack)),
        )
        stored = self._db.execute_one(
            "SELECT project_id FROM projects WHERE root_path = ?",
            (root_key,),
        )
        final_id = candidate_id if not stored else stored["project_id"]
        log.info("Scanned project '%s' id=%s stack=%s", display_name, final_id, stack)
        return ProjectRecord(
            project_id=final_id,
            root_path=resolved,
            name=display_name,
            tech_stack=stack,
        )

    def get_project_id(self, root: Path) -> str | None:
        """Look up the stored project_id for ``root``; ``None`` if absent."""
        found = self._db.execute_one(
            "SELECT project_id FROM projects WHERE root_path = ?",
            (str(root.resolve()),),
        )
        if found is None:
            return None
        return found["project_id"]

    def _refresh_record(self, row: sqlite3.Row, root: Path) -> ProjectRecord:
        """Bump ``last_seen`` and re-detect the tech stack if the row is old.

        NOTE(review): ``last_seen`` is rewritten on every call, so the age
        measured here is the time since the previous call for this project,
        not since the last tech-stack scan — confirm that is the intended
        staleness clock.
        """
        seen_at = datetime.fromisoformat(row["last_seen"])
        # naive UTC, matches SQLite
        utc_now = datetime.now(timezone.utc).replace(tzinfo=None)
        age_days = (utc_now - seen_at).days
        stack = json.loads(row["tech_stack"])
        project_id = row["project_id"]
        if age_days < _REFRESH_DAYS:
            self._db.execute(
                "UPDATE projects SET last_seen = datetime('now') WHERE project_id = ?",
                (project_id,),
            )
        else:
            stack = self._detect_tech_stack(root)
            self._db.execute(
                "UPDATE projects SET tech_stack = ?, last_seen = datetime('now') "
                "WHERE project_id = ?",
                (json.dumps(stack), project_id),
            )
        return ProjectRecord(
            project_id=project_id,
            root_path=root,
            name=row["name"],
            tech_stack=stack,
        )

    def _detect_tech_stack(self, root: Path) -> list[str]:
        """Return the sorted, de-duplicated technologies whose marker exists under ``root``."""
        return sorted(
            {tech for marker, tech in _TECH_MARKERS.items() if (root / marker).exists()}
        )

    def _generate_project_id(self, root: Path) -> str:
        """Return the first 16 hex digits of the SHA-256 of ``str(root)``."""
        digest = hashlib.sha256(str(root).encode())
        return digest.hexdigest()[:16]
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def estimate_token_count(root: Path, tech_stack: list[str] | None = None) -> int:
    """Estimate the total token count of a project directory.

    Thin wrapper: builds a default ``ContextBudget``, wraps it in a
    :class:`~src.agent.context_budget.WholeCodebaseMode` and returns whatever
    that object's ``estimate_token_count`` produces. File selection and the
    token approximation itself live in that class.

    Args:
        root: project root directory.
        tech_stack: list of detected technologies (e.g. ``["python", "nodejs"]``).
            ``None`` and ``[]`` are both forwarded as an empty list.

    Returns:
        The estimate computed by ``WholeCodebaseMode.estimate_token_count``.
    """
    # Imported here rather than at module level, as in the original.
    from src.agent.context_budget import ContextBudget, WholeCodebaseMode

    estimator = WholeCodebaseMode(ContextBudget())
    technologies = tech_stack if tech_stack else []
    return estimator.estimate_token_count(root, technologies)
|