diary-docs 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,511 @@
1
+ """Orchestrator — glues scanner, extractors, database, gitignore, and reporter.
2
+
3
+ Usage::
4
+
5
+ from diary.indexer.indexer import run_index, compute_coverage
6
+ db = run_index(Path("/path/to/workspace"))
7
+ coverage = compute_coverage(db)
8
+ db.close()
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import hashlib
14
+ import logging
15
+ import re
16
+ import unicodedata
17
+ from pathlib import Path
18
+ from typing import Optional
19
+
20
+ from .database import IndexDatabase
21
+ from .extractors import extract_symbols
22
+ from .gitignore import ensure_gitignore
23
+ from .scanner import scan_files, SUPPORTED_EXTENSIONS
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+ # Extensions that are considered markdown (processed for documents table)
28
+ MARKDOWN_EXTENSIONS = frozenset({".md", ".mdx"})
29
+
30
+ # Mapping from file extension to human-readable language name
31
+ _EXTENSION_LANGUAGE: dict[str, str] = {
32
+ ".py": "Python",
33
+ ".ts": "TypeScript",
34
+ ".tsx": "TypeScript",
35
+ ".js": "JavaScript",
36
+ ".jsx": "JavaScript",
37
+ ".php": "PHP",
38
+ ".java": "Java",
39
+ ".go": "Go",
40
+ ".cs": "C#",
41
+ ".yaml": "YAML",
42
+ ".yml": "YAML",
43
+ ".json": "JSON",
44
+ ".md": "Markdown",
45
+ ".mdx": "Markdown",
46
+ }
47
+
48
+
49
+ # ── Helpers ──────────────────────────────────────────────────────────────
50
+
51
+
52
def sha256_hash(content: str | bytes) -> str:
    """Compute the SHA-256 digest of *content* as a hex string.

    Parameters
    ----------
    content : str | bytes
        Data to hash.  Text is encoded as UTF-8 first; bytes are hashed as-is.

    Returns
    -------
    str
        The 64-character hexadecimal digest.
    """
    payload = content.encode("utf-8") if isinstance(content, str) else content
    hasher = hashlib.sha256()
    hasher.update(payload)
    return hasher.hexdigest()
68
+
69
+
70
+ def _extract_frontmatter(content: str) -> dict:
71
+ """Parse YAML-style frontmatter from markdown content.
72
+
73
+ Expects content between the first ``---\\n`` and the second ``\\n---\\n``.
74
+ Returns a dict with ``title``, ``summary`` and any other key-value pairs
75
+ found (simple ``key: value`` format only — no nested YAML).
76
+
77
+ Parameters
78
+ ----------
79
+ content : str
80
+ Raw markdown file content.
81
+
82
+ Returns
83
+ -------
84
+ dict
85
+ Parsed frontmatter fields (may be empty).
86
+ """
87
+ result: dict = {}
88
+ # Match content between first `---\n` and second `\n---\n`
89
+ m = re.match(r"^---\n(.*?)\n---(?:\n|$)", content, re.DOTALL)
90
+ if not m:
91
+ return result
92
+
93
+ for line in m.group(1).split("\n"):
94
+ line = line.strip()
95
+ if not line or line.startswith("#"):
96
+ continue
97
+ kv = re.match(r"(\w[\w_-]*)\s*:\s*(.*)", line)
98
+ if kv:
99
+ key = kv.group(1).strip()
100
+ value = kv.group(2).strip()
101
+ result[key] = value
102
+
103
+ return result
104
+
105
+
106
+ def _line_count(content: str) -> int:
107
+ """Return number of lines in *content*."""
108
+ return len(content.split("\n"))
109
+
110
+
111
+ # ── Coverage ─────────────────────────────────────────────────────────────
112
+
113
+
114
def compute_coverage(db: IndexDatabase) -> dict:
    """Collect coverage statistics from the index database.

    Parameters
    ----------
    db : IndexDatabase
        Open database handle; its ``conn`` attribute is queried directly.

    Returns
    -------
    dict
        Mapping with these keys, in this order:

        - ``total_files``, ``total_symbols``, ``total_documents`` — row
          counts of the respective tables.
        - ``documented_files`` — distinct non-NULL ``file_id`` values in
          ``relations``.
        - ``documented_symbols`` — distinct ``symbol_id`` values in
          ``relations``.
        - ``documented_documents`` — distinct ``doc_id`` values in
          ``relations``.
        - ``files_per_language`` — list of ``{language, count}`` dicts,
          largest count first.
    """
    connection = db.conn

    def fetch_count(query: str, args: tuple = ()) -> int:
        first_row = connection.execute(query, args).fetchone()
        if not first_row:
            return 0
        return first_row[0]

    # (result key, SQL) pairs, evaluated top to bottom.
    scalar_queries = (
        ("total_files", "SELECT COUNT(*) FROM files"),
        ("total_symbols", "SELECT COUNT(*) FROM symbols"),
        ("total_documents", "SELECT COUNT(*) FROM documents"),
        (
            "documented_files",
            "SELECT COUNT(DISTINCT file_id) FROM relations WHERE file_id IS NOT NULL",
        ),
        ("documented_symbols", "SELECT COUNT(DISTINCT symbol_id) FROM relations"),
        ("documented_documents", "SELECT COUNT(DISTINCT doc_id) FROM relations"),
    )

    coverage: dict = {}
    for key, query in scalar_queries:
        coverage[key] = fetch_count(query)

    language_rows = connection.execute(
        "SELECT language, COUNT(*) FROM files GROUP BY language ORDER BY COUNT(*) DESC"
    ).fetchall()
    coverage["files_per_language"] = [
        {"language": row_language, "count": row_count}
        for row_language, row_count in language_rows
    ]
    return coverage
168
+
169
+
170
+ # ── Relation building ────────────────────────────────────────────────────
171
+
172
+
173
+ def _build_relations(db: IndexDatabase) -> None:
174
+ """Link markdown document headings to matching symbol names.
175
+
176
+ For every document heading stored in the ``documents`` table, search the
177
+ ``symbols`` table for a symbol whose name matches (case-insensitive) and
178
+ insert a relation row with confidence 0.8.
179
+
180
+ Parameters
181
+ ----------
182
+ db : IndexDatabase
183
+ Open database handle with populated ``documents`` and ``symbols`` tables.
184
+ """
185
+ conn = db.conn
186
+ docs = conn.execute("SELECT id, headings FROM documents").fetchall()
187
+
188
+ for doc_id, headings_json in docs:
189
+ if not headings_json:
190
+ continue
191
+
192
+ import json as _json
193
+
194
+ try:
195
+ headings = _json.loads(headings_json)
196
+ except (_json.JSONDecodeError, TypeError):
197
+ continue
198
+
199
+ if not isinstance(headings, list):
200
+ continue
201
+
202
+ for heading in headings:
203
+ if not isinstance(heading, str) or not heading.strip():
204
+ continue
205
+ # Search symbols for case-insensitive match
206
+ rows = conn.execute(
207
+ "SELECT id, file_id FROM symbols WHERE LOWER(name) = LOWER(?)",
208
+ (heading.strip(),),
209
+ ).fetchall()
210
+ for sym_id, file_id in rows:
211
+ db.insert_relation(
212
+ doc_id=doc_id,
213
+ symbol_id=sym_id,
214
+ file_id=file_id,
215
+ confidence=0.8,
216
+ reason="heading match",
217
+ )
218
+
219
+
220
+ def _build_dependencies(file_path: Path, content: str) -> list[dict]:
221
+ """Extract import/require statements from *content* using naive regex.
222
+
223
+ Supports Python (``import X``, ``from X import Y``), JavaScript/TypeScript
224
+ (``import X from``, ``require(...)``) and basic static file references.
225
+
226
+ Parameters
227
+ ----------
228
+ file_path : Path
229
+ Path to the source file (used for extension detection).
230
+ content : str
231
+ Raw file content.
232
+
233
+ Returns
234
+ -------
235
+ list[dict]
236
+ Each dict has keys ``target_path`` (the imported module name) and
237
+ ``dep_type`` (always ``"import"`` for now).
238
+ """
239
+ ext = file_path.suffix.lower()
240
+ deps: list[dict] = []
241
+
242
+ if ext == ".py":
243
+ # Python: import X, from X import Y
244
+ for m in re.finditer(
245
+ r"^(?:from\s+([\w.]+)\s+import|import\s+([\w.]+))", content, re.MULTILINE
246
+ ):
247
+ target = m.group(1) or m.group(2)
248
+ deps.append({"target_path": target, "dep_type": "import"})
249
+ elif ext in (".ts", ".tsx", ".js", ".jsx"):
250
+ # JS/TS: import X from '...', require('...'), import '...'
251
+ for m in re.finditer(
252
+ r'(?:import\s+(?:\w+\s+from\s+)?["\']([^"\']+)["\']|require\(["\']([^"\']+)["\']\))',
253
+ content,
254
+ ):
255
+ target = m.group(1) or m.group(2)
256
+ deps.append({"target_path": target, "dep_type": "import"})
257
+ elif ext == ".java":
258
+ for m in re.finditer(r"^import\s+([\w.*]+);", content, re.MULTILINE):
259
+ deps.append({"target_path": m.group(1), "dep_type": "import"})
260
+ elif ext == ".go":
261
+ for m in re.finditer(r'^import\s+["\']([^"\']+)["\']', content, re.MULTILINE):
262
+ deps.append({"target_path": m.group(1), "dep_type": "import"})
263
+ elif ext == ".php":
264
+ for m in re.finditer(
265
+ r"(?:use\s+([\w\\\\]+)|require(?:_once)?\s+[\"']([^\"']+)[\"']|include(?:_once)?\s+[\"']([^\"']+)[\"'])",
266
+ content,
267
+ ):
268
+ target = m.group(1) or m.group(2) or m.group(3)
269
+ deps.append({"target_path": target, "dep_type": "import"})
270
+
271
+ return deps
272
+
273
+
274
+ # ── Branch name sanitization (ADR-4) ─────────────────────────────────────
275
+
276
+
277
+ def _sanitize_branch_name(name: str) -> str:
278
+ """Sanitize *name* to a safe filesystem fragment.
279
+
280
+ * lowercased
281
+ * ``/`` → ``_``
282
+ * NFC-normalised
283
+ * Truncated to 100 code points
284
+
285
+ Parameters
286
+ ----------
287
+ name : str
288
+ Raw branch name (e.g. ``feature/my-thing``).
289
+
290
+ Returns
291
+ -------
292
+ str
293
+ Safe name usable in a filename (e.g. ``feature_my-thing``).
294
+ """
295
+ name = unicodedata.normalize("NFC", name)
296
+ name = name.lower().replace("/", "_")
297
+ return name[:100]
298
+
299
+
300
+ # ── Main orchestrator ────────────────────────────────────────────────────
301
+
302
+
303
def run_index(
    workspace_path: Path,
    branch_name: str | None = None,
    db_path: Optional[Path] = None,
) -> IndexDatabase:
    """Scan *workspace_path*, extract symbols, build knowledge database.

    This is the main entry point for the knowledge indexer.  It:

    1. Ensures the output directory (``docs/.index/`` by default) exists.
    2. Updates ``.gitignore`` via ``ensure_gitignore``.
    3. Creates the database tables and clears any previous content.
    4. Scans all supported source files.
    5. Extracts symbols and import dependency edges from each source file.
    6. Processes markdown files into the files, symbols and documents tables.
    7. Links documents to symbols via heading matching.
    8. Records ``schema_version`` and ``indexed_at`` metadata and commits.

    Files that cannot be read, decoded as UTF-8 or stat'ed are skipped with
    a warning.

    Parameters
    ----------
    workspace_path : Path
        Root of the workspace to index.
    branch_name : str or None, optional
        If provided, the database is stored as
        ``docs/.index/knowledge-{sanitized_name}.db``, enabling per-branch
        isolation.  When ``None`` (default) or empty, ``knowledge.db`` is
        used, preserving backward compatibility.
    db_path : Path or None, optional
        Explicit path to the output SQLite database.  If provided, takes
        precedence over *branch_name*.  If ``None`` (default), the path is
        derived from *workspace_path* and *branch_name*.

    Returns
    -------
    IndexDatabase
        The populated database instance (still open — caller should
        ``db.close()`` when done).

    Raises
    ------
    Exception
        Any error raised while populating the database is re-raised after
        the database handle has been closed, so a failed run does not leak
        an open connection.
    """
    import json  # function-scope: the module does not import json at top level

    workspace_path = workspace_path.resolve()

    # 1. Default database path
    if db_path is None:
        index_dir = workspace_path / "docs" / ".index"
        db_filename = (
            f"knowledge-{_sanitize_branch_name(branch_name)}.db"
            if branch_name
            else "knowledge.db"
        )
        db_path = index_dir / db_filename
    else:
        index_dir = db_path.parent

    # 2. Ensure output directory exists
    index_dir.mkdir(parents=True, exist_ok=True)

    # 3. Update .gitignore
    ensure_gitignore(workspace_path)

    # 4. Initialise database
    db = IndexDatabase(db_path)
    try:
        db.create_tables()
        db.clear_all()

        # 5. Scan files and separate markdown from other sources
        md_files: list[Path] = []
        source_files: list[Path] = []
        for fp in scan_files(workspace_path):
            if fp.suffix.lower() in MARKDOWN_EXTENSIONS:
                md_files.append(fp)
            else:
                source_files.append(fp)

        # 6. Process source files (populate files + symbols + dependencies)
        file_count = 0
        for file_path in source_files:
            try:
                content = file_path.read_text("utf-8")
                # stat inside the guard: the file may vanish after the scan.
                stat = file_path.stat()
            except (OSError, UnicodeDecodeError):
                logger.warning("Cannot read %s — skipping", file_path)
                continue

            ext = file_path.suffix.lower()
            file_id = db.insert_file(
                path=str(file_path),
                rel_path=str(file_path.relative_to(workspace_path).as_posix()),
                # Unmapped extensions fall back to the upper-cased suffix.
                language=_EXTENSION_LANGUAGE.get(ext, ext.lstrip(".").upper()),
                sha256=sha256_hash(content),
                size=stat.st_size,
                modified=stat.st_mtime,
                lines=_line_count(content),
            )
            file_count += 1

            # Extract symbols
            for sym in extract_symbols(file_path, content):
                db.insert_symbol(
                    file_id=file_id,
                    name=sym["name"],
                    fqn=sym.get("name"),  # simple FQN = name for v1
                    sym_type=sym["type"],
                    parent=sym.get("parent"),
                    namespace=sym.get("namespace", ""),
                    start_line=sym["line"],
                    end_line=sym.get("end_line", sym["line"]),
                    visibility="public",
                    signature=sym.get("signature", ""),
                )

            # Extract dependencies
            for dep in _build_dependencies(file_path, content):
                db.insert_dependency(
                    source_id=file_id,
                    target_path=dep["target_path"],
                    dep_type=dep["dep_type"],
                )

        # 7. Process markdown files (populate documents table)
        md_count = 0
        for file_path in md_files:
            try:
                content = file_path.read_text("utf-8")
                stat = file_path.stat()
            except (OSError, UnicodeDecodeError):
                logger.warning("Cannot read %s — skipping", file_path)
                continue

            sha256 = sha256_hash(content)
            rel_path = str(file_path.relative_to(workspace_path).as_posix())

            # Frontmatter; an absent *or empty* title falls back to the stem.
            fm = _extract_frontmatter(content)
            title = fm.get("title") or file_path.stem
            summary = fm.get("summary", "")

            # Headings come from the markdown symbol extractor.
            md_symbols = extract_symbols(file_path, content)
            headings = [s["name"] for s in md_symbols if s["type"] == "heading"]
            headings_json = json.dumps(headings)

            # Markdown files are also recorded in the files table.
            file_id = db.insert_file(
                path=str(file_path),
                rel_path=rel_path,
                language="Markdown",
                sha256=sha256,
                size=stat.st_size,
                modified=stat.st_mtime,
                lines=_line_count(content),
            )
            md_count += 1

            # Insert symbols if any
            for sym in md_symbols:
                db.insert_symbol(
                    file_id=file_id,
                    name=sym["name"],
                    fqn=sym["name"],
                    sym_type=sym["type"],
                    parent=sym.get("parent"),
                    namespace=sym.get("namespace", ""),
                    start_line=sym["line"],
                    end_line=sym.get("end_line", sym["line"]),
                )

            # Insert into documents table
            db.insert_document(
                path=rel_path,
                title=title,
                headings=headings_json,
                summary=summary,
                sha256=sha256,
            )

        # 8. Build relations
        _build_relations(db)

        # 9. Store schema version and index time (Unix seconds) in metadata
        db.conn.execute(
            "INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)",
            ("schema_version", "1"),
        )
        indexed_at = db.conn.execute("SELECT strftime('%s','now')").fetchone()[0]
        db.conn.execute(
            "INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)",
            ("indexed_at", str(indexed_at)),
        )

        # 10. Final commit
        db.conn.commit()
    except Exception:
        # The caller never receives the handle on failure, so close it here.
        db.close()
        raise

    logger.info(
        "Indexed %d source files and %d markdown files",
        file_count,
        md_count,
    )

    return db
@@ -0,0 +1,137 @@
1
+ """Rich coverage report for the knowledge index.
2
+
3
+ Prints a table to stdout summarizing Files, Symbols, Documents,
4
+ and a per-language breakdown. Read-only — never modifies the database.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING
11
+
12
+ from rich.console import Console
13
+ from rich.table import Table
14
+
15
+ if TYPE_CHECKING:
16
+ from diary.indexer.database import IndexDatabase
17
+
18
+
19
def generate_report(db: "IndexDatabase", workspace_path: Path) -> str:
    """Query *db* for coverage statistics and print a Rich table to stdout.

    The table has three summary rows (Files, Symbols, Documents) followed by
    one row per language.  Per-language rows are *file* based: "Total" is the
    number of files in that language and "Documented" is the number of those
    files referenced by at least one relation, so the per-language rows add
    up to the Files summary row.

    Parameters
    ----------
    db : IndexDatabase
        Open database handle (must have tables created).  Only read queries
        are issued.
    workspace_path : Path
        Root of the workspace.  Currently unused by this function.

    Returns
    -------
    str
        The rendered table text (exported from Console).
    """
    conn = db.conn

    # ------------------------------------------------------------------
    # Queries
    # ------------------------------------------------------------------

    def _scalar(sql: str, params: tuple = ()) -> int:
        row = conn.execute(sql, params).fetchone()
        return row[0] if row else 0

    # Summary metrics
    total_files = _scalar("SELECT COUNT(*) FROM files")
    documented_files = _scalar(
        "SELECT COUNT(DISTINCT file_id) FROM relations WHERE file_id IS NOT NULL"
    )

    total_symbols = _scalar("SELECT COUNT(*) FROM symbols")
    documented_symbols = _scalar(
        "SELECT COUNT(DISTINCT symbol_id) FROM relations"
    )

    total_documents = _scalar("SELECT COUNT(*) FROM documents")
    documented_documents = _scalar(
        "SELECT COUNT(DISTINCT doc_id) FROM relations"
    )

    # Per-language file counts
    rows_lang = conn.execute(
        "SELECT language, COUNT(*) FROM files GROUP BY language ORDER BY COUNT(*) DESC"
    ).fetchall()

    # Per-language documented *file* counts, fetched in one grouped query.
    # Counting files (not symbols) keeps this comparable with the per-language
    # file totals above; languages without any relation are simply absent.
    documented_by_language = {
        language: count
        for language, count in conn.execute(
            "SELECT f.language, COUNT(DISTINCT f.id) "
            "FROM files f JOIN relations r ON r.file_id = f.id "
            "GROUP BY f.language"
        ).fetchall()
    }

    # ------------------------------------------------------------------
    # Table construction
    # ------------------------------------------------------------------

    table = Table(
        title="Knowledge Index Coverage Report",
        title_justify="left",
    )
    table.add_column("Category", style="cyan", no_wrap=True)
    table.add_column("Total", justify="right")
    table.add_column("Documented", justify="right")
    table.add_column("Undocumented", justify="right")
    table.add_column("Coverage %", justify="right", style="green")

    # -- Summary rows ---------------------------------------------------

    def _pct(documented: int, total: int) -> str:
        # An empty category has no meaningful percentage.
        if total == 0:
            return "—"
        return f"{documented / total * 100:.1f}%"

    def _undocumented(documented: int, total: int) -> int:
        return total - documented

    table.add_row(
        "[bold]Files[/]",
        str(total_files),
        str(documented_files),
        str(_undocumented(documented_files, total_files)),
        _pct(documented_files, total_files),
    )
    table.add_row(
        "[bold]Symbols[/]",
        str(total_symbols),
        str(documented_symbols),
        str(_undocumented(documented_symbols, total_symbols)),
        _pct(documented_symbols, total_symbols),
    )
    table.add_row(
        "[bold]Documents[/]",
        str(total_documents),
        str(documented_documents),
        str(_undocumented(documented_documents, total_documents)),
        _pct(documented_documents, total_documents),
    )

    # -- Per-Language rows ----------------------------------------------
    if rows_lang:
        table.add_section()
        for language, cnt in rows_lang:
            docd = documented_by_language.get(language, 0)
            table.add_row(
                f"[italic]{language}[/]",
                str(cnt),
                str(docd),
                str(_undocumented(docd, cnt)),
                _pct(docd, cnt),
            )

    # ------------------------------------------------------------------
    # Render & print
    # ------------------------------------------------------------------

    # record=True lets the printed output be exported as plain text afterwards.
    console = Console(record=True)
    console.print(table)
    return console.export_text()
@@ -0,0 +1,65 @@
1
+ """File scanner — walks a directory tree and returns supported source files."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+
8
# File suffixes (lower-case, with leading dot) that the scanner accepts.
# NOTE(review): the indexer's extension-to-language map in this package also
# lists .tsx, .jsx and .mdx, which are not accepted here — confirm whether
# they should be added.
SUPPORTED_EXTENSIONS = frozenset({
    ".php", ".ts", ".js", ".java", ".py",
    ".go", ".cs", ".yaml", ".yml", ".json", ".md",
})

# Directory names that are never descended into, at any depth below the root.
EXCLUDED_DIRS = frozenset({
    "node_modules", ".git", "__pycache__", ".venv", "venv",
    ".pytest_cache", "dist", "build", ".next", "vendor",
})


def scan_files(root_path: Path, max_file_size: int = 524288) -> list[Path]:
    """Walk *root_path* and return a sorted list of supported source files.

    A file is returned when all of the following hold:

    * no directory *between* ``root_path`` and the file is named in
      ``EXCLUDED_DIRS`` (the location of ``root_path`` itself is irrelevant,
      so a workspace that lives under e.g. ``/srv/build/`` is still scanned);
    * its name does not contain ``.min.`` (minified assets);
    * its lower-cased suffix is in ``SUPPORTED_EXTENSIONS``;
    * it is a regular file (or a symlink to one) no larger than
      *max_file_size* bytes.

    Excluded directories are pruned during the walk rather than filtered
    afterwards, so their contents are never enumerated.  Symlinked
    directories are not followed.

    Parameters
    ----------
    root_path : Path
        Root directory to scan.
    max_file_size : int, optional
        Maximum file size in bytes (default 524288 = 512 KiB).

    Returns
    -------
    list[Path]
        Sorted list of matching paths, each prefixed with *root_path* as
        given (absolute when *root_path* is absolute).  Empty when
        *root_path* does not exist.
    """
    import os  # function-scope: this module only imports pathlib at top level

    result: list[Path] = []

    for dirpath, dirnames, filenames in os.walk(root_path):
        # In-place assignment is what tells os.walk not to descend.
        dirnames[:] = [d for d in dirnames if d not in EXCLUDED_DIRS]

        for name in filenames:
            # Exclusion: skip minified files
            if ".min." in name:
                continue

            path = Path(dirpath) / name

            # Inclusion: only accepted extensions
            if path.suffix.lower() not in SUPPORTED_EXTENSIONS:
                continue

            try:
                # Filters out broken symlinks and other non-regular entries.
                if not path.is_file():
                    continue
                if path.stat().st_size > max_file_size:
                    continue
            except OSError:
                # Skip files we can't stat (permissions, races, etc.)
                continue

            result.append(path)

    result.sort()
    return result