diary-docs 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diary/__init__.py +1 -0
- diary/__main__.py +3 -0
- diary/aimb/__init__.py +48 -0
- diary/aimb/hasher.py +157 -0
- diary/aimb/merge.py +252 -0
- diary/aimb/parser.py +202 -0
- diary/cli.py +999 -0
- diary/git_utils.py +202 -0
- diary/indexer/__init__.py +44 -0
- diary/indexer/database.py +340 -0
- diary/indexer/extractors.py +468 -0
- diary/indexer/gitignore.py +62 -0
- diary/indexer/indexer.py +511 -0
- diary/indexer/reporter.py +137 -0
- diary/indexer/scanner.py +65 -0
- diary/sync/__init__.py +33 -0
- diary/sync/detector.py +405 -0
- diary/sync/engine.py +404 -0
- diary/sync/protocol.py +176 -0
- diary/templates.py +102 -0
- diary_docs-0.1.0.dist-info/METADATA +228 -0
- diary_docs-0.1.0.dist-info/RECORD +26 -0
- diary_docs-0.1.0.dist-info/WHEEL +5 -0
- diary_docs-0.1.0.dist-info/entry_points.txt +2 -0
- diary_docs-0.1.0.dist-info/licenses/LICENSE +21 -0
- diary_docs-0.1.0.dist-info/top_level.txt +1 -0
diary/indexer/indexer.py
ADDED
|
@@ -0,0 +1,511 @@
|
|
|
1
|
+
"""Orchestrator — glues scanner, extractors, database, gitignore, and reporter.
|
|
2
|
+
|
|
3
|
+
Usage::
|
|
4
|
+
|
|
5
|
+
from diary.indexer.indexer import run_index, compute_coverage
|
|
6
|
+
db = run_index(Path("/path/to/workspace"))
|
|
7
|
+
coverage = compute_coverage(db)
|
|
8
|
+
db.close()
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import hashlib
|
|
14
|
+
import logging
|
|
15
|
+
import re
|
|
16
|
+
import unicodedata
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Optional
|
|
19
|
+
|
|
20
|
+
from .database import IndexDatabase
|
|
21
|
+
from .extractors import extract_symbols
|
|
22
|
+
from .gitignore import ensure_gitignore
|
|
23
|
+
from .scanner import scan_files, SUPPORTED_EXTENSIONS
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
# File suffixes handled as markdown documents (these feed the documents table).
MARKDOWN_EXTENSIONS = frozenset((".md", ".mdx"))

# Human-readable language name for every recognised file suffix.  Declared as
# (language, suffixes) pairs and flattened, so suffixes that share a language
# are listed side by side.
_EXTENSION_LANGUAGE: dict[str, str] = {
    suffix: language
    for language, suffixes in (
        ("Python", (".py",)),
        ("TypeScript", (".ts", ".tsx")),
        ("JavaScript", (".js", ".jsx")),
        ("PHP", (".php",)),
        ("Java", (".java",)),
        ("Go", (".go",)),
        ("C#", (".cs",)),
        ("YAML", (".yaml", ".yml")),
        ("JSON", (".json",)),
        ("Markdown", (".md", ".mdx")),
    )
    for suffix in suffixes
}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# ── Helpers ──────────────────────────────────────────────────────────────
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def sha256_hash(content: str | bytes) -> str:
    """Compute the SHA-256 digest of *content* as a hexadecimal string.

    Parameters
    ----------
    content : str | bytes
        Data to hash.  Text is encoded as UTF-8 first; bytes are hashed as-is.

    Returns
    -------
    str
        Hex-encoded SHA-256 digest (64 characters).
    """
    raw = content.encode("utf-8") if isinstance(content, str) else content
    digest = hashlib.sha256()
    digest.update(raw)
    return digest.hexdigest()
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _extract_frontmatter(content: str) -> dict:
    """Parse YAML-style frontmatter from markdown content.

    The frontmatter is the text between a leading ``---`` line and the next
    ``---`` line.  Only flat ``key: value`` pairs are understood (no nested
    YAML, lists or multi-line scalars).  Blank lines and ``#`` comment lines
    inside the frontmatter are ignored.

    A leading UTF-8 byte-order mark and Windows (CRLF) line endings are
    tolerated, and a value wrapped in one pair of matching single or double
    quotes is returned without the quotes, so ``title: "My Doc"`` yields
    ``My Doc``.

    Parameters
    ----------
    content : str
        Raw markdown file content.

    Returns
    -------
    dict
        Parsed frontmatter fields as ``str -> str`` (may be empty).
    """
    result: dict = {}

    # Normalise the two things that would otherwise make a perfectly valid
    # frontmatter block invisible to the anchored pattern below.
    text = content.lstrip("\ufeff").replace("\r\n", "\n")

    # Content between the first `---\n` and the next `\n---` line.
    m = re.match(r"^---\n(.*?)\n---(?:\n|$)", text, re.DOTALL)
    if not m:
        return result

    for line in m.group(1).split("\n"):
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        kv = re.match(r"(\w[\w-]*)\s*:\s*(.*)", line)
        if not kv:
            continue
        value = kv.group(2).strip()
        # Drop one pair of matching surrounding quotes (YAML quoted scalar).
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        result[kv.group(1)] = value

    return result
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _line_count(content: str) -> int:
    """Return the number of lines in *content*.

    A trailing newline terminates the last line rather than starting a new
    one, so ``"a\\nb\\n"`` and ``"a\\nb"`` both count as two lines, and an
    empty string counts as zero lines.

    Parameters
    ----------
    content : str
        Text whose lines are separated by ``\\n``.

    Returns
    -------
    int
        Number of lines.
    """
    if not content:
        return 0
    # One line per newline, plus a final unterminated line if there is one.
    return content.count("\n") + (0 if content.endswith("\n") else 1)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# ── Coverage ─────────────────────────────────────────────────────────────
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def compute_coverage(db: IndexDatabase) -> dict:
    """Collect coverage statistics from *db*.

    Parameters
    ----------
    db : IndexDatabase
        An open database handle; the queries read the ``files``, ``symbols``,
        ``documents`` and ``relations`` tables through ``db.conn``.

    Returns
    -------
    dict
        Keys:
        - ``total_files``
        - ``total_symbols``
        - ``total_documents``
        - ``documented_files`` (distinct non-NULL ``file_id`` in relations)
        - ``documented_symbols`` (distinct ``symbol_id`` in relations)
        - ``documented_documents`` (distinct ``doc_id`` in relations)
        - ``files_per_language`` (list of ``{language, count}`` dicts,
          most frequent language first)
    """
    connection = db.conn

    def _count(query: str) -> int:
        # Every query below yields a single aggregate value.
        first = connection.execute(query, ()).fetchone()
        if first:
            return first[0]
        return 0

    scalar_queries = {
        "total_files": "SELECT COUNT(*) FROM files",
        "total_symbols": "SELECT COUNT(*) FROM symbols",
        "total_documents": "SELECT COUNT(*) FROM documents",
        "documented_files": (
            "SELECT COUNT(DISTINCT file_id) FROM relations WHERE file_id IS NOT NULL"
        ),
        "documented_symbols": "SELECT COUNT(DISTINCT symbol_id) FROM relations",
        "documented_documents": "SELECT COUNT(DISTINCT doc_id) FROM relations",
    }
    coverage: dict = {key: _count(sql) for key, sql in scalar_queries.items()}

    per_language = connection.execute(
        "SELECT language, COUNT(*) FROM files GROUP BY language ORDER BY COUNT(*) DESC"
    ).fetchall()
    coverage["files_per_language"] = [
        {"language": language, "count": count} for language, count in per_language
    ]
    return coverage
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
# ── Relation building ────────────────────────────────────────────────────
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _build_relations(db: IndexDatabase) -> None:
    """Create document-to-symbol relations from heading/name matches.

    Each row of ``documents`` carries a JSON-encoded list of headings.  For
    every non-blank string heading, the ``symbols`` table is searched for
    names equal to the stripped heading (compared via SQL ``LOWER``), and one
    relation is inserted per hit with confidence 0.8 and reason
    ``"heading match"``.  Documents whose headings column is empty, is not
    valid JSON, or does not decode to a list are skipped.

    Parameters
    ----------
    db : IndexDatabase
        Open database handle with populated ``documents`` and ``symbols`` tables.
    """
    # Function-scope import: this module does not import json at the top level.
    import json as _json

    conn = db.conn
    lookup_sql = "SELECT id, file_id FROM symbols WHERE LOWER(name) = LOWER(?)"

    # NOTE(review): run_index also stores markdown heading symbols in
    # `symbols`, and this lookup has no type filter, so a heading appears to
    # match its own symbol row — confirm whether that is intended.
    for doc_id, raw_headings in conn.execute("SELECT id, headings FROM documents").fetchall():
        if not raw_headings:
            continue

        try:
            parsed = _json.loads(raw_headings)
        except (_json.JSONDecodeError, TypeError):
            continue
        if not isinstance(parsed, list):
            continue

        wanted_names = (
            entry.strip()
            for entry in parsed
            if isinstance(entry, str) and entry.strip()
        )
        for wanted in wanted_names:
            matches = conn.execute(lookup_sql, (wanted,)).fetchall()
            for sym_id, file_id in matches:
                db.insert_relation(
                    doc_id=doc_id,
                    symbol_id=sym_id,
                    file_id=file_id,
                    confidence=0.8,
                    reason="heading match",
                )
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _build_dependencies(file_path: Path, content: str) -> list[dict]:
    """Extract import/require statements from *content* using naive regex.

    Recognised forms, selected by the (lower-cased) suffix of *file_path*:

    * ``.py`` — ``import X`` and ``from X import Y`` starting at column 0
      (only the first module of ``import a, b`` is reported).
    * ``.ts`` / ``.tsx`` / ``.js`` / ``.jsx`` — ``import X from '...'``,
      ``import { a, b } from '...'``, ``import * as X from '...'``,
      ``import type ... from '...'``, bare ``import '...'`` and
      ``require('...')``.
    * ``.java`` — ``import a.b.C;`` (including wildcard imports).
    * ``.go`` — single ``import "pkg"`` (optionally aliased) and grouped
      ``import ( ... )`` blocks.
    * ``.php`` — ``use Ns\\Class``, ``require``/``require_once`` and
      ``include``/``include_once`` with a quoted path.

    Matching is purely textual: statements inside comments or strings are
    reported too.  Any other suffix yields an empty list.

    Parameters
    ----------
    file_path : Path
        Path to the source file (used for extension detection only).
    content : str
        Raw file content.

    Returns
    -------
    list[dict]
        One dict per match, in order of appearance, with keys ``target_path``
        (the imported module name or path) and ``dep_type`` (always
        ``"import"`` for now).
    """
    ext = file_path.suffix.lower()
    targets: list[str] = []

    if ext == ".py":
        # Python: import X, from X import Y
        for m in re.finditer(
            r"^(?:from\s+([\w.]+)\s+import|import\s+([\w.]+))", content, re.MULTILINE
        ):
            targets.append(m.group(1) or m.group(2))
    elif ext in (".ts", ".tsx", ".js", ".jsx"):
        # JS/TS: the optional clause before `from` may be a default binding,
        # a `{ ... }` list (possibly multi-line) or `* as ns`, so accept
        # anything up to `from` that contains neither a quote nor a `;`.
        for m in re.finditer(
            r'(?:import\s+(?:[^"\';]+?\s+from\s+)?["\']([^"\']+)["\']|require\(["\']([^"\']+)["\']\))',
            content,
        ):
            targets.append(m.group(1) or m.group(2))
    elif ext == ".java":
        for m in re.finditer(r"^import\s+([\w.*]+);", content, re.MULTILINE):
            targets.append(m.group(1))
    elif ext == ".go":
        # Go: either a parenthesised block of import specs or a single spec,
        # each optionally preceded by an alias (`m "math"`, `_ "pkg"`).
        for m in re.finditer(
            r'^import(?:\s*\(([^)]*)\)|\s+(?:[\w.]+\s+)?["\']([^"\']+)["\'])',
            content,
            re.MULTILINE,
        ):
            if m.group(1) is not None:
                targets.extend(re.findall(r'"([^"\n]+)"', m.group(1)))
            else:
                targets.append(m.group(2))
    elif ext == ".php":
        for m in re.finditer(
            r"(?:use\s+([\w\\\\]+)|require(?:_once)?\s+[\"']([^\"']+)[\"']|include(?:_once)?\s+[\"']([^\"']+)[\"'])",
            content,
        ):
            targets.append(m.group(1) or m.group(2) or m.group(3))

    return [{"target_path": target, "dep_type": "import"} for target in targets]
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
# ── Branch name sanitization (ADR-4) ─────────────────────────────────────
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def _sanitize_branch_name(name: str) -> str:
    """Turn a branch name into a fragment usable inside a filename.

    Steps, in order:

    * NFC-normalise
    * lowercase
    * replace every ``/`` with ``_``
    * keep at most the first 100 code points

    Parameters
    ----------
    name : str
        Raw branch name (e.g. ``feature/my-thing``).

    Returns
    -------
    str
        Sanitised name (e.g. ``feature_my-thing``).
    """
    composed = unicodedata.normalize("NFC", name)
    flattened = composed.lower().translate({ord("/"): "_"})
    return flattened[:100]
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
# ── Main orchestrator ────────────────────────────────────────────────────
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def run_index(
    workspace_path: Path,
    branch_name: str | None = None,
    db_path: Optional[Path] = None,
) -> IndexDatabase:
    """Scan *workspace_path*, extract symbols, build knowledge database.

    This is the main entry point for the knowledge indexer.  It:

    1. Ensures the output directory exists (``docs/.index/`` by default).
    2. Updates ``.gitignore`` via ``ensure_gitignore``.
    3. Creates the database tables and clears any previous content.
    4. Scans all supported source files.
    5. Extracts symbols and import dependencies from each source file.
    6. Processes markdown files into the files, symbols and documents tables.
    7. Links documents to symbols via heading matching.
    8. Records ``schema_version`` and ``indexed_at`` metadata and commits.

    Files that cannot be read, decoded as UTF-8 or stat'ed are logged and
    skipped; they do not abort the run.

    Parameters
    ----------
    workspace_path : Path
        Root of the workspace to index.
    branch_name : str or None, optional
        If provided, the database is stored as
        ``docs/.index/knowledge-{sanitized_name}.db``, enabling per-branch
        isolation.  When ``None`` (default), ``knowledge.db`` is used,
        preserving backward compatibility.
    db_path : Path or None, optional
        Explicit path to the output SQLite database.  If provided, takes
        precedence over *branch_name*.  If ``None`` (default), the path is
        derived from *workspace_path* and *branch_name*.

    Returns
    -------
    IndexDatabase
        The populated database instance (still open — caller should
        ``db.close()`` when done).

    Raises
    ------
    Exception
        Any error raised while indexing is propagated unchanged; the
        database handle is closed first so it does not leak.
    """
    # Function-scope import: this module does not import json at the top level.
    import json as _json

    workspace_path = workspace_path.resolve()

    # 1. Default database path
    if db_path is None:
        index_dir = workspace_path / "docs" / ".index"
        db_filename = (
            f"knowledge-{_sanitize_branch_name(branch_name)}.db"
            if branch_name
            else "knowledge.db"
        )
        db_path = index_dir / db_filename
    else:
        index_dir = db_path.parent

    # 2. Ensure output directory exists
    index_dir.mkdir(parents=True, exist_ok=True)

    # 3. Update .gitignore
    ensure_gitignore(workspace_path)

    # 4. Initialise database
    db = IndexDatabase(db_path)
    try:
        db.create_tables()
        db.clear_all()

        # 5. Scan files and separate markdown from other sources
        md_files: list[Path] = []
        source_files: list[Path] = []
        for fp in scan_files(workspace_path):
            if fp.suffix.lower() in MARKDOWN_EXTENSIONS:
                md_files.append(fp)
            else:
                source_files.append(fp)

        # 6. Process source files (populate files + symbols + dependencies)
        file_count = 0
        for file_path in source_files:
            try:
                content = file_path.read_text("utf-8")
                # stat() lives inside the try so a file that disappears
                # mid-run is skipped instead of aborting the whole index.
                stat = file_path.stat()
            except (OSError, UnicodeDecodeError):
                logger.warning("Cannot read %s — skipping", file_path)
                continue

            ext = file_path.suffix.lower()
            file_id = db.insert_file(
                path=str(file_path),
                rel_path=str(file_path.relative_to(workspace_path).as_posix()),
                language=_EXTENSION_LANGUAGE.get(ext, ext.lstrip(".").upper()),
                sha256=sha256_hash(content),
                size=stat.st_size,
                modified=stat.st_mtime,
                lines=_line_count(content),
            )
            file_count += 1

            for sym in extract_symbols(file_path, content):
                db.insert_symbol(
                    file_id=file_id,
                    name=sym["name"],
                    fqn=sym.get("name"),  # simple FQN = name for v1
                    sym_type=sym["type"],
                    parent=sym.get("parent"),
                    namespace=sym.get("namespace", ""),
                    start_line=sym["line"],
                    end_line=sym.get("end_line", sym["line"]),
                    visibility="public",
                    signature=sym.get("signature", ""),
                )

            for dep in _build_dependencies(file_path, content):
                db.insert_dependency(
                    source_id=file_id,
                    target_path=dep["target_path"],
                    dep_type=dep["dep_type"],
                )

        # 7. Process markdown files (populate files + symbols + documents)
        md_count = 0
        for file_path in md_files:
            try:
                content = file_path.read_text("utf-8")
                stat = file_path.stat()
            except (OSError, UnicodeDecodeError):
                logger.warning("Cannot read %s — skipping", file_path)
                continue

            sha256 = sha256_hash(content)
            rel_path = str(file_path.relative_to(workspace_path).as_posix())

            # Frontmatter supplies title/summary; the file stem is the
            # fallback title.
            fm = _extract_frontmatter(content)
            title = fm.get("title", file_path.stem)
            summary = fm.get("summary", "")

            # Headings come from the markdown symbol extractor.
            md_symbols = extract_symbols(file_path, content)
            headings = [s["name"] for s in md_symbols if s["type"] == "heading"]

            # Markdown files are also recorded as regular files.
            file_id = db.insert_file(
                path=str(file_path),
                rel_path=rel_path,
                language="Markdown",
                sha256=sha256,
                size=stat.st_size,
                modified=stat.st_mtime,
                lines=_line_count(content),
            )

            for sym in md_symbols:
                db.insert_symbol(
                    file_id=file_id,
                    name=sym["name"],
                    fqn=sym["name"],
                    sym_type=sym["type"],
                    parent=sym.get("parent"),
                    namespace=sym.get("namespace", ""),
                    start_line=sym["line"],
                    end_line=sym.get("end_line", sym["line"]),
                )

            db.insert_document(
                path=rel_path,
                title=title,
                headings=_json.dumps(headings),
                summary=summary,
                sha256=sha256,
            )
            md_count += 1

        # 8. Build relations
        _build_relations(db)

        # 9. Store schema version and index time in metadata
        db.conn.execute(
            "INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)",
            ("schema_version", "1"),
        )
        db.conn.execute(
            "INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)",
            ("indexed_at", str(db.conn.execute("SELECT strftime('%s','now')").fetchone()[0])),
        )

        # 10. Final commit
        db.conn.commit()
    except BaseException:
        # The caller never receives the handle on failure, so release it
        # here before re-raising.
        db.close()
        raise

    # Report files actually indexed, not merely discovered.
    logger.info(
        "Indexed %d source files and %d markdown files",
        file_count,
        md_count,
    )

    return db
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""Rich coverage report for the knowledge index.
|
|
2
|
+
|
|
3
|
+
Prints a table to stdout summarizing Files, Symbols, Documents,
|
|
4
|
+
and a per-language breakdown. Read-only — never modifies the database.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import TYPE_CHECKING
|
|
11
|
+
|
|
12
|
+
from rich.console import Console
|
|
13
|
+
from rich.table import Table
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from diary.indexer.database import IndexDatabase
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def generate_report(db: "IndexDatabase", workspace_path: Path) -> str:
    """Query *db* for coverage statistics and print a Rich table to stdout.

    The table has three summary rows (Files, Symbols, Documents) followed by
    one row per language.  Every per-language row counts *files*: the total
    number of files in that language and how many of them are referenced by
    at least one relation, so the per-language rows add up to the Files row.

    Parameters
    ----------
    db : IndexDatabase
        Open database handle (must have tables created).
    workspace_path : Path
        Root of the workspace.  Currently unused by this function; kept for
        interface compatibility.

    Returns
    -------
    str
        The rendered table text (exported from Console).
    """
    conn = db.conn

    # ------------------------------------------------------------------
    # Queries
    # ------------------------------------------------------------------

    def _scalar(sql: str, params: tuple = ()) -> int:
        row = conn.execute(sql, params).fetchone()
        return row[0] if row else 0

    # Summary metrics
    total_files = _scalar("SELECT COUNT(*) FROM files")
    documented_files = _scalar(
        "SELECT COUNT(DISTINCT file_id) FROM relations WHERE file_id IS NOT NULL"
    )

    total_symbols = _scalar("SELECT COUNT(*) FROM symbols")
    documented_symbols = _scalar(
        "SELECT COUNT(DISTINCT symbol_id) FROM relations"
    )

    total_documents = _scalar("SELECT COUNT(*) FROM documents")
    documented_documents = _scalar(
        "SELECT COUNT(DISTINCT doc_id) FROM relations"
    )

    # Per-language file counts
    rows_lang = conn.execute(
        "SELECT language, COUNT(*) FROM files GROUP BY language ORDER BY COUNT(*) DESC"
    ).fetchall()

    # Per-language documented *file* counts, fetched in one grouped query.
    # Counting files (not symbols) keeps the unit identical to the totals
    # above, so "Undocumented" cannot go negative and coverage stays <= 100%.
    documented_by_language = {
        language: count
        for language, count in conn.execute(
            "SELECT f.language, COUNT(DISTINCT f.id) "
            "FROM files f JOIN relations r ON r.file_id = f.id "
            "GROUP BY f.language"
        ).fetchall()
    }

    # ------------------------------------------------------------------
    # Table construction
    # ------------------------------------------------------------------

    table = Table(
        title="Knowledge Index Coverage Report",
        title_justify="left",
    )
    table.add_column("Category", style="cyan", no_wrap=True)
    table.add_column("Total", justify="right")
    table.add_column("Documented", justify="right")
    table.add_column("Undocumented", justify="right")
    table.add_column("Coverage %", justify="right", style="green")

    # -- Summary rows ---------------------------------------------------

    def _pct(documented: int, total: int) -> str:
        # An empty category has no meaningful percentage.
        if total == 0:
            return "—"
        return f"{documented / total * 100:.1f}%"

    def _undocumented(documented: int, total: int) -> int:
        return total - documented

    table.add_row(
        "[bold]Files[/]",
        str(total_files),
        str(documented_files),
        str(_undocumented(documented_files, total_files)),
        _pct(documented_files, total_files),
    )
    table.add_row(
        "[bold]Symbols[/]",
        str(total_symbols),
        str(documented_symbols),
        str(_undocumented(documented_symbols, total_symbols)),
        _pct(documented_symbols, total_symbols),
    )
    table.add_row(
        "[bold]Documents[/]",
        str(total_documents),
        str(documented_documents),
        str(_undocumented(documented_documents, total_documents)),
        _pct(documented_documents, total_documents),
    )

    # -- Per-Language rows ----------------------------------------------
    if rows_lang:
        table.add_section()
        for language, cnt in rows_lang:
            docd = documented_by_language.get(language, 0)
            table.add_row(
                f"[italic]{language}[/]",
                str(cnt),
                str(docd),
                str(_undocumented(docd, cnt)),
                _pct(docd, cnt),
            )

    # ------------------------------------------------------------------
    # Render & print
    # ------------------------------------------------------------------

    # record=True lets the printed output be exported back as plain text.
    console = Console(record=True)
    console.print(table)
    return console.export_text()
|
diary/indexer/scanner.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""File scanner — walks a directory tree and returns supported source files."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# File suffixes (lower-case, with leading dot) the scanner returns.
# NOTE(review): the indexer module also maps ".tsx", ".jsx" and ".mdx", which
# are not listed here — confirm whether leaving them out is intentional.
SUPPORTED_EXTENSIONS = frozenset({
    ".php", ".ts", ".js", ".java", ".py",
    ".go", ".cs", ".yaml", ".yml", ".json", ".md",
})

# Directory names whose contents are never returned.
EXCLUDED_DIRS = frozenset({
    "node_modules", ".git", "__pycache__", ".venv", "venv",
    ".pytest_cache", "dist", "build", ".next", "vendor",
})


def scan_files(root_path: Path, max_file_size: int = 524288) -> list[Path]:
    """Walk *root_path* and return a sorted list of supported source files.

    A file is returned when all of the following hold:

    * no path component *below* ``root_path`` is in ``EXCLUDED_DIRS``
      (components of ``root_path`` itself are ignored, so a workspace that
      lives under e.g. ``~/build/`` is still scanned);
    * its name does not contain ``.min.`` (minified assets);
    * its lower-cased suffix is in ``SUPPORTED_EXTENSIONS``;
    * it is a regular file no larger than *max_file_size* bytes.

    Parameters
    ----------
    root_path : Path
        Root directory to scan.
    max_file_size : int, optional
        Maximum file size in bytes (default 524288 = 512 KiB).

    Returns
    -------
    list[Path]
        Sorted list of matching paths, each prefixed by *root_path* as given.
    """
    result: list[Path] = []

    for path in root_path.rglob("*"):
        # Name-based filters come first: they need no filesystem access.
        if any(part in EXCLUDED_DIRS for part in path.relative_to(root_path).parts):
            continue

        # Exclusion: skip minified files
        if ".min." in path.name:
            continue

        # Inclusion: only accepted extensions
        if path.suffix.lower() not in SUPPORTED_EXTENSIONS:
            continue

        # Skip directories (and anything else that is not a regular file)
        if not path.is_file():
            continue

        # Size check
        try:
            if path.stat().st_size > max_file_size:
                continue
        except OSError:
            # Skip files we can't stat (permissions, broken symlinks, etc.)
            continue

        result.append(path)

    result.sort()
    return result
|