kiwiskil 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kiwiskil-0.1.0/PKG-INFO +8 -0
- kiwiskil-0.1.0/README.md +124 -0
- kiwiskil-0.1.0/indexer/__init__.py +0 -0
- kiwiskil-0.1.0/indexer/ast_parser.py +131 -0
- kiwiskil-0.1.0/indexer/cli.py +259 -0
- kiwiskil-0.1.0/indexer/config.py +49 -0
- kiwiskil-0.1.0/indexer/git.py +38 -0
- kiwiskil-0.1.0/indexer/grouper.py +55 -0
- kiwiskil-0.1.0/indexer/hooks.py +83 -0
- kiwiskil-0.1.0/indexer/llm.py +89 -0
- kiwiskil-0.1.0/indexer/manifest.py +64 -0
- kiwiskil-0.1.0/indexer/wiki.py +79 -0
- kiwiskil-0.1.0/kiwiskil.egg-info/PKG-INFO +8 -0
- kiwiskil-0.1.0/kiwiskil.egg-info/SOURCES.txt +25 -0
- kiwiskil-0.1.0/kiwiskil.egg-info/dependency_links.txt +1 -0
- kiwiskil-0.1.0/kiwiskil.egg-info/entry_points.txt +2 -0
- kiwiskil-0.1.0/kiwiskil.egg-info/requires.txt +4 -0
- kiwiskil-0.1.0/kiwiskil.egg-info/top_level.txt +3 -0
- kiwiskil-0.1.0/pyproject.toml +20 -0
- kiwiskil-0.1.0/setup.cfg +4 -0
- kiwiskil-0.1.0/tests/__init__.py +0 -0
- kiwiskil-0.1.0/tests/fixtures/sample_py/auth.py +16 -0
- kiwiskil-0.1.0/tests/test_ast_parser.py +53 -0
- kiwiskil-0.1.0/tests/test_config.py +33 -0
- kiwiskil-0.1.0/tests/test_grouper.py +55 -0
- kiwiskil-0.1.0/tests/test_manifest.py +81 -0
- kiwiskil-0.1.0/tests/test_wiki.py +69 -0
kiwiskil-0.1.0/PKG-INFO
ADDED
kiwiskil-0.1.0/README.md
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# kiwiskil
|
|
2
|
+
|
|
3
|
+
> Chat with your codebase using any LLM.
|
|
4
|
+
|
|
5
|
+
kiwiskil generates a checked-in structural wiki and skill files from any codebase. It enables LLM agents to navigate code without reading source files — using a knowledge graph built from your repo and checked into git.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## How it works
|
|
10
|
+
|
|
11
|
+
1. **AST parsing** extracts symbols, imports, and call graphs from your source files (deterministic, free)
|
|
12
|
+
2. **LiteLLM** adds one-line descriptions per symbol using any provider you configure
|
|
13
|
+
3. **A density-based grouper** organises files into wiki pages by logical density, not directory structure
|
|
14
|
+
4. **A pre-commit hook** keeps the wiki in sync — every commit includes updated wiki pages atomically
|
|
15
|
+
5. **A skill file** is generated at `.indexer/skills/codebase.md` so any LLM agent can navigate your codebase via structured tools
|
|
16
|
+
|
|
17
|
+
The wiki is plain markdown checked into your repo. No cloud service, no search index, no lock-in.
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## Install
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install kiwiskil
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Quick start
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
# In any git repo
|
|
33
|
+
kiwiskil init # creates .indexer.toml, installs pre-commit hook, appends to CLAUDE.md
|
|
34
|
+
kiwiskil run # generates wiki/ and .indexer/skills/codebase.md
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
On every subsequent commit, the pre-commit hook runs `kiwiskil run --staged` automatically — only changed files are re-indexed.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## CLI
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
kiwiskil init # set up config, hook, and CLAUDE.md
|
|
45
|
+
kiwiskil run # smart: incremental if indexed before, full if not
|
|
46
|
+
kiwiskil run --force # force full re-index
|
|
47
|
+
kiwiskil run --staged # incremental on staged files only (used by hook)
|
|
48
|
+
kiwiskil status # show last indexed commit, stale files, stats
|
|
49
|
+
kiwiskil hook install # manually install pre-commit hook
|
|
50
|
+
kiwiskil hook remove # remove pre-commit hook
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Output
|
|
56
|
+
|
|
57
|
+
### `wiki/INDEX.md`
|
|
58
|
+
Top-level map of the entire codebase — which wiki page covers which files, and the entry points for each.
|
|
59
|
+
|
|
60
|
+
### `wiki/<group>.md`
|
|
61
|
+
One page per logical folder cluster. Each page contains:
|
|
62
|
+
- **Modules** — files covered
|
|
63
|
+
- **Key Symbols** — functions, classes, methods with one-line descriptions
|
|
64
|
+
- **Relationships** — what this group calls, what calls it, what it imports
|
|
65
|
+
- **Entry Points** — symbols with no callers (architectural roots)
|
|
66
|
+
|
|
67
|
+
### `.indexer/skills/codebase.md`
|
|
68
|
+
A skill file compatible with Claude Code, Cursor, Copilot, and other LLM agents. Drop it into your agent's skill directory to unlock:
|
|
69
|
+
|
|
70
|
+
- `find_module(query)` — search wiki pages by keyword
|
|
71
|
+
- `get_symbol(id)` — look up any symbol by component ID (`file::Class.method`)
|
|
72
|
+
- `trace_callers(symbol_id)` — find what calls a given symbol
|
|
73
|
+
- `what_changed(since_commit)` — list changed files with their wiki pages
|
|
74
|
+
- `entry_points()` — list all architectural entry points
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## Configuration
|
|
79
|
+
|
|
80
|
+
`.indexer.toml` is created by `kiwiskil init` and checked into the repo:
|
|
81
|
+
|
|
82
|
+
```toml
|
|
83
|
+
[llm]
|
|
84
|
+
provider = "anthropic/claude-sonnet-4-6" # any LiteLLM-compatible model string
|
|
85
|
+
api_key_env = "ANTHROPIC_API_KEY" # env var name, not the key itself
|
|
86
|
+
|
|
87
|
+
[indexer]
|
|
88
|
+
wiki_dir = "wiki"
|
|
89
|
+
ignore = ["node_modules", ".venv", "dist", "build", "__pycache__", "*.test.*"]
|
|
90
|
+
max_tokens_per_batch = 8000
|
|
91
|
+
|
|
92
|
+
[hooks]
|
|
93
|
+
pre_commit = true
|
|
94
|
+
synthesize_commit_message = true
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Any LiteLLM-compatible provider works: OpenAI, Anthropic, Gemini, Ollama, local models.
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## Commit message synthesis
|
|
102
|
+
|
|
103
|
+
When running as a pre-commit hook, kiwiskil synthesises a commit message from the code changes and prints it to stdout. You can use it, edit it, or ignore it — your choice.
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
## Design principles
|
|
108
|
+
|
|
109
|
+
- **Structural facts only** — wiki pages contain symbols, relationships, and entry points. No prose summaries, no architectural opinions. The client LLM draws its own conclusions.
|
|
110
|
+
- **Checked in, not served** — the wiki is plain markdown in your repo. It travels with your code, is tracked by git, and is readable by humans and agents alike.
|
|
111
|
+
- **Incremental by default** — git diff + content hash manifest means only changed files are re-processed on each commit.
|
|
112
|
+
- **Provider-agnostic** — LiteLLM means you can use any model, local or cloud, without changing the tool.
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## Supported languages
|
|
117
|
+
|
|
118
|
+
Python (stdlib `ast`). JS, TS, Go, Rust, Java, Ruby support planned via tree-sitter.
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## License
|
|
123
|
+
|
|
124
|
+
MIT
|
|
File without changes
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import ast, json, hashlib
|
|
3
|
+
from dataclasses import dataclass, field, asdict
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
@dataclass
class ASTNode:
    """One class, method, or function extracted from a parsed source file."""

    id: str  # "rel/path.py::Class.method"
    type: str  # "function" | "method" | "class"
    file: str  # repo-relative path
    line_start: int  # taken from the AST node's ``lineno``
    line_end: int  # ``end_lineno``, falling back to ``lineno`` when unset
    docstring: Optional[str]  # result of ast.get_docstring(); None when absent
    # Every import found in the file, not only those this symbol uses.
    imports: list[str] = field(default_factory=list)
    # Bare callee names (``foo`` for ``foo()``, ``bar`` for ``x.bar()``).
    calls: list[str] = field(default_factory=list)
    # called_by is intentionally empty at parse time;
    # populated in a later cross-reference pass by cli.py
    called_by: list[str] = field(default_factory=list)
|
|
20
|
+
|
|
21
|
+
def _rel(path: Path, repo_root: Path) -> str:
|
|
22
|
+
try:
|
|
23
|
+
return str(path.relative_to(repo_root))
|
|
24
|
+
except ValueError:
|
|
25
|
+
return str(path)
|
|
26
|
+
|
|
27
|
+
def _extract_imports(tree: ast.Module) -> list[str]:
|
|
28
|
+
imports = []
|
|
29
|
+
for node in ast.walk(tree):
|
|
30
|
+
if isinstance(node, ast.Import):
|
|
31
|
+
for alias in node.names:
|
|
32
|
+
imports.append(alias.name)
|
|
33
|
+
elif isinstance(node, ast.ImportFrom):
|
|
34
|
+
mod = node.module or ""
|
|
35
|
+
for alias in node.names:
|
|
36
|
+
imports.append(f"{mod}.{alias.name}" if mod else alias.name)
|
|
37
|
+
return imports
|
|
38
|
+
|
|
39
|
+
def _extract_calls(func_node: ast.FunctionDef | ast.AsyncFunctionDef) -> list[str]:
|
|
40
|
+
calls = []
|
|
41
|
+
for node in ast.walk(func_node):
|
|
42
|
+
if isinstance(node, ast.Call):
|
|
43
|
+
if isinstance(node.func, ast.Name):
|
|
44
|
+
calls.append(node.func.id)
|
|
45
|
+
elif isinstance(node.func, ast.Attribute):
|
|
46
|
+
calls.append(node.func.attr)
|
|
47
|
+
return list(set(calls))
|
|
48
|
+
|
|
49
|
+
def _get_class_method_ids(tree: ast.Module) -> set[int]:
|
|
50
|
+
"""Return AST node ids of functions that are direct children of a class body."""
|
|
51
|
+
ids = set()
|
|
52
|
+
for node in ast.walk(tree):
|
|
53
|
+
if isinstance(node, ast.ClassDef):
|
|
54
|
+
for item in node.body:
|
|
55
|
+
if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
|
56
|
+
ids.add(id(item))
|
|
57
|
+
return ids
|
|
58
|
+
|
|
59
|
+
def parse_file(path: Path, repo_root: Path) -> list[ASTNode]:
    """Parse a Python file and return one ASTNode per class, method and function.

    Returns [] when the file cannot be read or parsed (OSError, SyntaxError,
    or ValueError), so one bad file never aborts an indexing run.

    Component IDs are ``<repo-relative path>::<name>`` for classes and
    functions and ``<repo-relative path>::<Class>.<method>`` for functions
    that sit directly in a class body. Because the tree is traversed with
    ``ast.walk``, definitions nested inside other functions are reported too,
    keyed by their own name only.
    """
    try:
        source = path.read_text(encoding="utf-8", errors="replace")
        tree = ast.parse(source)
    except (SyntaxError, ValueError, OSError):
        # ValueError: on some Python versions ast.parse rejects source that
        # contains NUL bytes with ValueError rather than SyntaxError.
        return []

    rel_path = _rel(path, repo_root)
    file_imports = _extract_imports(tree)
    method_ids = _get_class_method_ids(tree)
    func_types = (ast.FunctionDef, ast.AsyncFunctionDef)

    def make(defn, qualname: str, kind: str, calls: list[str]) -> ASTNode:
        # Each node gets its own copy of the file-level import list.
        return ASTNode(
            id=f"{rel_path}::{qualname}",
            type=kind,
            file=rel_path,
            line_start=defn.lineno,
            line_end=defn.end_lineno or defn.lineno,
            docstring=ast.get_docstring(defn),
            imports=list(file_imports),
            calls=calls,
        )

    nodes: list[ASTNode] = []
    for node in ast.walk(tree):
        if isinstance(node, ast.ClassDef):
            nodes.append(make(node, node.name, "class", []))
            for item in node.body:
                if isinstance(item, func_types):
                    nodes.append(
                        make(item, f"{node.name}.{item.name}", "method", _extract_calls(item))
                    )
        elif isinstance(node, func_types) and id(node) not in method_ids:
            # Direct class members were already emitted as methods above.
            nodes.append(make(node, node.name, "function", _extract_calls(node)))

    return nodes
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def compute_hash_short(path: Path) -> str:
    """Return the first 16 hex characters of the file's SHA-256 digest.

    Intended as a compact key for cache filenames, not as a full content hash.
    """
    digest = hashlib.sha256()
    digest.update(path.read_bytes())
    return digest.hexdigest()[:16]
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def load_cached_nodes(repo_root: Path, file_hash: str) -> Optional[list[ASTNode]]:
    """Load cached nodes for *file_hash* from ``.indexer/cache/<hash>.json``.

    Returns None when there is no usable cache entry: the file is missing or
    unreadable, is not valid UTF-8 or JSON, or does not match the ASTNode
    fields. The caller is expected to re-parse in that case.
    """
    p = repo_root / ".indexer" / "cache" / f"{file_hash}.json"
    try:
        # Read directly instead of checking exists() first: a missing file
        # surfaces as OSError and is handled like any other cache miss.
        data = json.loads(p.read_text(encoding="utf-8"))
        return [ASTNode(**n) for n in data]
    except (OSError, ValueError, TypeError, KeyError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return None  # missing or corrupted cache — caller will re-parse
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def save_cached_nodes(repo_root: Path, file_hash: str, nodes: list[ASTNode]) -> None:
    """Write *nodes* as indented JSON to ``.indexer/cache/<file_hash>.json``.

    The cache directory is created on demand; an existing entry for the same
    hash is overwritten.
    """
    cache_dir = repo_root / ".indexer" / "cache"
    cache_dir.mkdir(parents=True, exist_ok=True)
    payload = json.dumps([asdict(n) for n in nodes], indent=2)
    (cache_dir / f"{file_hash}.json").write_text(payload)
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
# indexer/cli.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import subprocess
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import click
|
|
8
|
+
|
|
9
|
+
from indexer.config import Config, load_config, save_config
|
|
10
|
+
from indexer.manifest import load_manifest, save_manifest, compute_hash, FileEntry
|
|
11
|
+
from indexer.git import (
|
|
12
|
+
staged_files, all_tracked_files, current_commit,
|
|
13
|
+
changed_files_since, is_git_repo
|
|
14
|
+
)
|
|
15
|
+
from indexer.ast_parser import parse_file, load_cached_nodes, save_cached_nodes, compute_hash_short
|
|
16
|
+
from indexer.llm import describe_nodes, synthesize_commit_message
|
|
17
|
+
from indexer.grouper import density_group
|
|
18
|
+
from indexer.wiki import build_page, build_index, write_page, write_index, PageContext, IndexEntry, TEMPLATES_DIR
|
|
19
|
+
from indexer.hooks import install_hook, remove_hook
|
|
20
|
+
|
|
21
|
+
CLAUDEMD_SNIPPET = """
|
|
22
|
+
## Codebase Navigation
|
|
23
|
+
|
|
24
|
+
This repo is indexed. Before reading source files:
|
|
25
|
+
- Load `wiki/INDEX.md` for the full structure map
|
|
26
|
+
- Use `.indexer/skills/codebase.md` as a skill for structured lookup tools
|
|
27
|
+
- Wiki pages are in `wiki/` — grouped by logical density, not mirroring directory structure exactly
|
|
28
|
+
- Component IDs follow `file::Class.method` format throughout
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@click.group()
def main():
    # Root click group; subcommands attach to it via @main.command() and
    # @main.group(). Intentionally has no body and no docstring.
    pass
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@main.command()
def init():
    """Create .indexer.toml, install pre-commit hook, append to CLAUDE.md."""
    repo = Path.cwd()

    # Round-trip the config: existing values are kept, missing ones get defaults.
    config = load_config(repo)
    save_config(repo, config)
    click.echo(f"Created {repo / '.indexer.toml'}")

    if is_git_repo(repo) and config.pre_commit:
        install_hook(repo)
        click.echo("Installed pre-commit hook.")

    notes = repo / "CLAUDE.md"
    if not notes.exists():
        notes.write_text(CLAUDEMD_SNIPPET.lstrip())
        click.echo("Created CLAUDE.md.")
        return

    current = notes.read_text()
    if "Codebase Navigation" in current:
        # Section already present; leave the file untouched.
        return
    notes.write_text(current + "\n" + CLAUDEMD_SNIPPET)
    click.echo("Appended to CLAUDE.md.")
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@main.command()
@click.option("--staged", is_flag=True, help="Incremental: only staged files (used by hook)")
@click.option("--force", is_flag=True, help="Force full re-index regardless of manifest")
def run(staged: bool, force: bool):
    """Index the codebase and generate wiki pages."""
    # Pipeline: pick candidate files -> parse (with cache) -> cross-reference
    # calls -> LLM descriptions -> group -> write wiki/INDEX/skill -> update
    # manifest -> (hook mode) suggest commit message and stage outputs.
    root = Path.cwd()
    cfg = load_config(root)
    manifest = load_manifest(root)

    # Determine which files to process
    if staged:
        candidates = staged_files(root)
    elif force or manifest.last_indexed_commit is None:
        candidates = [f for f in all_tracked_files(root) if _is_indexable(f, cfg)]
    else:
        git_changed = changed_files_since(root, manifest.last_indexed_commit) if is_git_repo(root) else []
        all_files = [f for f in all_tracked_files(root) if _is_indexable(f, cfg)]
        stale = manifest.stale_files(root, all_files)
        # Sorted so node order (and therefore generated output) is stable
        # from run to run.
        candidates = sorted(set(git_changed + stale))

    # Drop non-indexable paths and paths that git still tracks but that are
    # gone from the working tree; hashing those would raise below.
    candidates = [
        f for f in candidates
        if _is_indexable(f, cfg) and (root / f).is_file()
    ]

    if not candidates:
        click.echo("Nothing to index.")
        return

    click.echo(f"Indexing {len(candidates)} file(s)...")

    # Parse AST (use cache for unchanged files, fresh parse for changed)
    all_nodes = []
    for rel_path in candidates:
        abs_path = root / rel_path
        file_hash = compute_hash_short(abs_path)
        cached = load_cached_nodes(root, file_hash)
        # The cache key is the content hash only, but cached nodes embed the
        # file path in `file` and `id`. A byte-identical file at another path
        # would otherwise inherit the wrong path, so treat that as a miss.
        if cached is not None and all(Path(n.file) == Path(rel_path) for n in cached):
            all_nodes.extend(cached)
        else:
            nodes = parse_file(abs_path, root)
            save_cached_nodes(root, file_hash, nodes)
            all_nodes.extend(nodes)

    if not all_nodes:
        click.echo("No symbols found.")
        return

    # Cross-reference pass: populate called_by from calls graph
    # Note: calls stores bare function/method names (e.g. "sign_payload"), not full component IDs.
    # This is best-effort: correctly links calls within the same codebase batch.
    call_index: dict[str, list[str]] = {}
    for node in all_nodes:
        for callee_name in node.calls:
            call_index.setdefault(callee_name, []).append(node.id)
    for node in all_nodes:
        # A method's qualified name is "Class.method" while calls only holds
        # "method", so match on the last dotted segment of the name.
        bare_name = node.id.split("::")[-1].rsplit(".", 1)[-1]
        # Copy so nodes with the same bare name do not share one list object.
        node.called_by = list(call_index.get(bare_name, []))

    # LLM: describe nodes in batches by token budget
    # (size is a character-count estimate, not a real token count)
    descriptions: dict[str, str] = {}
    batch, batch_size = [], 0
    for node in all_nodes:
        node_size = len(node.docstring or "") + len(" ".join(node.calls)) + 50
        if batch_size + node_size > cfg.max_tokens_per_batch and batch:
            descriptions.update(describe_nodes(batch, cfg))
            batch, batch_size = [], 0
        batch.append(node)
        batch_size += node_size
    if batch:
        descriptions.update(describe_nodes(batch, cfg))

    # Group files → wiki page labels
    groups = density_group(candidates)
    group_nodes: dict[str, list] = {}
    for node in all_nodes:
        group = groups.get(node.file, node.file)
        group_nodes.setdefault(group, []).append(node)

    # Write wiki pages
    # NOTE(review): pages and INDEX.md are built only from this run's
    # candidates. Confirm that write_page/write_index merge with existing
    # content on incremental runs; otherwise unchanged files drop out.
    wiki_dir = root / cfg.wiki_dir
    index_entries = []
    for group_label, nodes in group_nodes.items():
        ctx = PageContext(
            group_label=group_label,
            files=sorted({n.file for n in nodes}),
            nodes=nodes,
            descriptions=descriptions,
        )
        content = build_page(ctx)
        page_path = write_page(wiki_dir, group_label, content)
        # Entry points: symbols nothing in this batch calls.
        entry_points = [n.id.split("::")[-1] for n in nodes if not n.called_by]
        index_entries.append(IndexEntry(
            path=str(page_path.relative_to(root)),
            covers=", ".join(sorted({n.file for n in nodes})),
            entry_points=entry_points,
        ))

    # Write INDEX.md
    commit = current_commit(root) or "unknown"
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    index_content = build_index(index_entries, commit, today)
    write_index(wiki_dir, index_content)

    # Write skill file
    from jinja2 import Environment, FileSystemLoader
    skill_dir = root / ".indexer" / "skills"
    skill_dir.mkdir(parents=True, exist_ok=True)
    env = Environment(loader=FileSystemLoader(str(TEMPLATES_DIR)), trim_blocks=True, lstrip_blocks=True)
    skill_content = env.get_template("skill.md.j2").render(wiki_dir=cfg.wiki_dir)
    (skill_dir / "codebase.md").write_text(skill_content)

    # Update manifest
    now = datetime.now(timezone.utc).isoformat()
    for rel_path in candidates:
        abs_path = root / rel_path
        if abs_path.exists():
            file_hash = compute_hash(abs_path)
            group = groups.get(rel_path, rel_path)
            manifest.files[rel_path] = FileEntry(
                hash=file_hash,
                wiki_page=f"{cfg.wiki_dir}/{group}.md",
                component_ids=[n.id for n in all_nodes if n.file == rel_path],
            )
    manifest.last_indexed_commit = commit
    manifest.indexed_at = now

    # Prune manifest entries for files no longer tracked by git
    if is_git_repo(root):
        tracked = set(all_tracked_files(root))
        stale_keys = [k for k in manifest.files if k not in tracked]
        for k in stale_keys:
            del manifest.files[k]

    save_manifest(root, manifest)

    # Synthesize commit message
    if cfg.synthesize_commit_message and staged:
        msg = synthesize_commit_message(candidates, descriptions, cfg)
        if msg:
            click.echo(f"\nSuggested commit message:\n {msg}")

    # Auto-stage wiki + manifest when running as pre-commit hook
    if staged and is_git_repo(root):
        subprocess.run(["git", "add", cfg.wiki_dir, ".indexer/manifest.json"], cwd=root)

    click.echo(f"Done. Wiki written to {wiki_dir}/")
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
@main.command()
def status():
    """Show last indexed commit, stale files, manifest stats."""
    repo = Path.cwd()
    config = load_config(repo)
    manifest = load_manifest(repo)

    click.echo(f"Last indexed commit: {manifest.last_indexed_commit or 'never'}")
    click.echo(f"Indexed at: {manifest.indexed_at or 'n/a'}")
    click.echo(f"Tracked files: {len(manifest.files)}")

    # Staleness is computed against git's tracked files, so skip it outside a repo.
    if not is_git_repo(repo):
        return

    indexable = [name for name in all_tracked_files(repo) if _is_indexable(name, config)]
    stale = manifest.stale_files(repo, indexable)
    click.echo(f"Stale files: {len(stale)}")
    if stale:
        # Only the first ten are listed to keep the output short.
        for name in stale[:10]:
            click.echo(f" {name}")
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
@main.group()
def hook():
    """Manage the pre-commit hook."""
    # Container group only: the work happens in the "install" and "remove"
    # subcommands registered on it.
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
@hook.command("install")
def hook_install():
    """Install the pre-commit hook in the current repo."""
    # The current working directory is passed as the repo root.
    install_hook(Path.cwd())
    click.echo("Pre-commit hook installed.")
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
@hook.command("remove")
def hook_remove():
    """Remove the pre-commit hook from the current repo."""
    # The current working directory is passed as the repo root.
    remove_hook(Path.cwd())
    click.echo("Pre-commit hook removed.")
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def _is_indexable(path: str, cfg: Config) -> bool:
|
|
250
|
+
from fnmatch import fnmatch
|
|
251
|
+
p = Path(path)
|
|
252
|
+
if p.suffix not in {".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".go", ".rb", ".rs"}:
|
|
253
|
+
return False
|
|
254
|
+
for pattern in cfg.ignore:
|
|
255
|
+
if any(fnmatch(part, pattern) for part in p.parts):
|
|
256
|
+
return False
|
|
257
|
+
if fnmatch(path, pattern):
|
|
258
|
+
return False
|
|
259
|
+
return True
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# indexer/config.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import tomllib
|
|
4
|
+
import tomli_w
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
FILENAME = ".indexer.toml"
|
|
9
|
+
|
|
10
|
+
@dataclass
class Config:
    """Settings stored in ``.indexer.toml``, with defaults for every field."""

    # [llm] section
    provider: str = "anthropic/claude-sonnet-4-6"  # LiteLLM-compatible model string
    api_key_env: str = "ANTHROPIC_API_KEY"  # env var name, not the key itself
    # [indexer] section
    wiki_dir: str = "wiki"
    # fnmatch-style patterns tested against each path and its components
    ignore: list[str] = field(default_factory=lambda: [
        "node_modules", ".venv", "dist", "build", "__pycache__", "*.test.*"
    ])
    max_tokens_per_batch: int = 8000
    # [hooks] section
    pre_commit: bool = True
    synthesize_commit_message: bool = True
|
|
21
|
+
|
|
22
|
+
def load_config(repo_root: Path) -> Config:
    """Read ``.indexer.toml`` from *repo_root* into a Config.

    Returns an all-defaults Config when the file does not exist. Any key
    missing from the ``[llm]``, ``[indexer]`` or ``[hooks]`` tables falls back
    to the corresponding Config default.
    """
    cfg_path = repo_root / FILENAME
    if not cfg_path.exists():
        return Config()

    base = Config()
    with open(cfg_path, "rb") as fh:
        raw = tomllib.load(fh)

    def pick(section: str, key: str):
        # Value from the given TOML table, or the dataclass default.
        return raw.get(section, {}).get(key, getattr(base, key))

    return Config(
        provider=pick("llm", "provider"),
        api_key_env=pick("llm", "api_key_env"),
        wiki_dir=pick("indexer", "wiki_dir"),
        ignore=list(pick("indexer", "ignore")),
        max_tokens_per_batch=pick("indexer", "max_tokens_per_batch"),
        pre_commit=pick("hooks", "pre_commit"),
        synthesize_commit_message=pick("hooks", "synthesize_commit_message"),
    )
|
|
41
|
+
|
|
42
|
+
def save_config(repo_root: Path, cfg: Config) -> None:
    """Write *cfg* to ``.indexer.toml`` in *repo_root*, replacing any existing file.

    Fields are laid out in three tables: ``[llm]``, ``[indexer]`` and ``[hooks]``.
    """
    llm_table = {"provider": cfg.provider, "api_key_env": cfg.api_key_env}
    indexer_table = {
        "wiki_dir": cfg.wiki_dir,
        "ignore": cfg.ignore,
        "max_tokens_per_batch": cfg.max_tokens_per_batch,
    }
    hooks_table = {
        "pre_commit": cfg.pre_commit,
        "synthesize_commit_message": cfg.synthesize_commit_message,
    }
    document = {"llm": llm_table, "indexer": indexer_table, "hooks": hooks_table}
    with open(repo_root / FILENAME, "wb") as fh:
        tomli_w.dump(document, fh)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# indexer/git.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import subprocess
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
def _run(cmd: list[str], cwd: Path) -> str:
|
|
8
|
+
try:
|
|
9
|
+
result = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True)
|
|
10
|
+
return result.stdout.strip()
|
|
11
|
+
except (FileNotFoundError, OSError):
|
|
12
|
+
return ""
|
|
13
|
+
|
|
14
|
+
def current_commit(repo_root: Path) -> Optional[str]:
    """Return the commit hash HEAD points to, or None if there is none.

    ``--verify`` is required because ``_run`` ignores the exit status: without
    it, ``git rev-parse HEAD`` in a repository that has no commits yet echoes
    the literal text "HEAD" on stdout, which would be mistaken for a commit id.
    """
    out = _run(["git", "rev-parse", "--verify", "HEAD"], cwd=repo_root)
    return out or None
|
|
17
|
+
|
|
18
|
+
def staged_files(repo_root: Path) -> list[str]:
    """Return repo-relative paths staged as added, copied or modified.

    ``-z`` makes git emit NUL-separated, unquoted paths. Without it, names
    containing non-ASCII or special characters are printed C-quoted (for
    example ``"caf\\303\\251.py"``) and would not match real files.
    """
    out = _run(
        ["git", "diff", "--cached", "--name-only", "--diff-filter=ACM", "-z"],
        cwd=repo_root,
    )
    return [name for name in out.split("\0") if name]
|
|
21
|
+
|
|
22
|
+
def changed_files_since(repo_root: Path, since_commit: str) -> list[str]:
    """Return paths added, copied or modified between *since_commit* and HEAD.

    ``-z`` makes git emit NUL-separated, unquoted paths. Without it, names
    containing non-ASCII or special characters are printed C-quoted and would
    not match real files. An unknown *since_commit* yields an empty list,
    because git then prints nothing on stdout.
    """
    out = _run(
        ["git", "diff", "--name-only", "--diff-filter=ACM", "-z", since_commit, "HEAD"],
        cwd=repo_root,
    )
    return [name for name in out.split("\0") if name]
|
|
25
|
+
|
|
26
|
+
def all_tracked_files(repo_root: Path) -> list[str]:
    """Return every path in the git index, relative to *repo_root*.

    ``-z`` makes git emit NUL-separated, unquoted paths. Without it, names
    containing non-ASCII or special characters are printed C-quoted and would
    not match real files.
    """
    out = _run(["git", "ls-files", "-z"], cwd=repo_root)
    return [name for name in out.split("\0") if name]
|
|
29
|
+
|
|
30
|
+
def is_git_repo(repo_root: Path) -> bool:
    """Report whether *repo_root* is inside a git repository.

    True only when ``git rev-parse --git-dir`` exits with status 0. A missing
    git executable or an unusable working directory counts as "not a repo".
    """
    probe = ["git", "rev-parse", "--git-dir"]
    try:
        proc = subprocess.run(probe, cwd=repo_root, capture_output=True)
    except OSError:
        # Covers FileNotFoundError (a subclass of OSError).
        return False
    return proc.returncode == 0
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def density_group(files: list[str], merge_threshold: int = 6) -> dict[str, str]:
    """
    Map each file path to a wiki page label (a folder path string).

    Paths are expected to be "/"-separated (as git reports them) and are
    interpreted that way on every platform.

    Algorithm:
    1. For every folder prefix of every file, count the files under it.
    2. For each file, walk its folder prefixes from deepest to shallowest and
       pick the first (deepest) one whose count >= merge_threshold.
    3. If no prefix qualifies:
       - a file with two or more folder levels is assigned the
         second-shallowest prefix (e.g. "a/b" for "a/b/c/d.py");
       - a file directly under a top-level folder, or at the repo root, is
         assigned ".".

    NOTE(review): step 3 mirrors the existing behaviour exactly. Earlier
    documentation described the fallback as "the top-level folder" / "parent
    of deepest", which the code has never done — confirm the intended rule.

    Returns: dict mapping file_path -> wiki_page_label (a folder string)
    """
    # Local import keeps this function self-contained. PurePosixPath is used
    # instead of Path so "a/b/c.py" splits on "/" on Windows too.
    from pathlib import PurePosixPath

    if not files:
        return {}

    def prefixes(f: str) -> list[str]:
        """Folder prefixes of file *f*, from deepest to shallowest."""
        folder = str(PurePosixPath(f).parent)
        if folder == ".":
            return ["."]
        parts = folder.split("/")
        return ["/".join(parts[:i]) for i in range(len(parts), 0, -1)]

    # Count how many files fall under each prefix (duplicates count each time).
    prefix_count: dict[str, int] = defaultdict(int)
    for f in files:
        for prefix in prefixes(f):
            prefix_count[prefix] += 1

    def resolve_group(f: str) -> str:
        chain = prefixes(f)

        # Deepest prefix that is dense enough wins.
        for prefix in chain:
            if prefix_count[prefix] >= merge_threshold:
                return prefix

        # Nothing dense enough: fall back as described in step 3.
        if len(chain) > 1:
            return chain[-2]  # second-shallowest prefix
        return "."

    return {f: resolve_group(f) for f in files}
|