librarian-code 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- librarian/__init__.py +3 -0
- librarian/__main__.py +3 -0
- librarian/actions/__init__.py +0 -0
- librarian/actions/file_ops.py +47 -0
- librarian/actions/safety.py +29 -0
- librarian/actions/shell_ops.py +49 -0
- librarian/adapter/__init__.py +0 -0
- librarian/adapter/base.py +11 -0
- librarian/adapter/groq_adapter.py +40 -0
- librarian/adapter/openrouter_adapter.py +58 -0
- librarian/cli.py +26 -0
- librarian/commands/__init__.py +0 -0
- librarian/commands/ask.py +46 -0
- librarian/commands/do.py +232 -0
- librarian/commands/init.py +96 -0
- librarian/commands/status.py +71 -0
- librarian/commands/undo.py +85 -0
- librarian/commands/why.py +47 -0
- librarian/exceptions.py +22 -0
- librarian/memory/__init__.py +0 -0
- librarian/memory/capsule.py +94 -0
- librarian/memory/chunker.py +183 -0
- librarian/memory/decision_log.py +36 -0
- librarian/memory/indexer.py +96 -0
- librarian/memory/retriever.py +62 -0
- librarian/orchestrator/__init__.py +0 -0
- librarian/orchestrator/core.py +47 -0
- librarian/orchestrator/router.py +17 -0
- librarian/skills/__init__.py +0 -0
- librarian/skills/bundled/__init__.py +0 -0
- librarian/skills/bundled/api-design/conventions.md +93 -0
- librarian/skills/bundled/python/conventions.md +59 -0
- librarian/skills/bundled/react/conventions.md +83 -0
- librarian/skills/bundled/web-dev/conventions.md +54 -0
- librarian/skills/loader.py +109 -0
- librarian/utils/__init__.py +0 -0
- librarian/utils/config.py +15 -0
- librarian/utils/logger.py +32 -0
- librarian/utils/token_tracker.py +16 -0
- librarian/utils/ui.py +97 -0
- librarian_code-0.1.0.dist-info/METADATA +180 -0
- librarian_code-0.1.0.dist-info/RECORD +45 -0
- librarian_code-0.1.0.dist-info/WHEEL +4 -0
- librarian_code-0.1.0.dist-info/entry_points.txt +2 -0
- librarian_code-0.1.0.dist-info/licenses/LICENSE.md +21 -0
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from librarian.utils.ui import print_header, print_panel, print_muted, INDIGO, SUCCESS
|
|
6
|
+
from librarian.utils.token_tracker import tracker
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def run():
    """Print a status summary for the project in the current directory.

    The summary is built from three files under ``.librarian``:
    ``index_meta.json`` (file/chunk counts), ``capsules.json`` (memory
    capsules and their confidence) and ``decisions.jsonl`` (one JSON entry
    per line).  When ``.librarian`` does not exist a fixed "not indexed"
    summary is printed instead.
    """
    project_name = os.path.basename(os.getcwd())
    librarian_dir = Path(".librarian")
    meta_file = librarian_dir / "index_meta.json"
    decisions_file = librarian_dir / "decisions.jsonl"
    capsules_file = librarian_dir / "capsules.json"

    if not librarian_dir.exists():
        print_header("librarian status")
        print_muted(f" project {project_name}")
        print_muted(" indexed not yet (run librarian init)")
        print_muted(" memory 0 capsules")
        print_muted(" last action —")
        print_muted(" tokens 0")
        print_muted(" providers groq (primary) · openrouter (fallback)")
        return

    # Read with an explicit UTF-8 encoding instead of the platform default,
    # so non-ASCII text in these JSON files decodes the same on every OS.
    indexed = "not yet"
    if meta_file.exists():
        meta = json.loads(meta_file.read_text(encoding="utf-8"))
        indexed = f"{meta.get('file_count', '?')} files · {meta.get('chunk_count', '?')} chunks"

    capsule_count = 0
    avg_confidence = 0
    if capsules_file.exists():
        capsules = json.loads(capsules_file.read_text(encoding="utf-8"))
        capsule_count = len(capsules)
        if capsules:
            avg_confidence = sum(c.get("confidence", 0) for c in capsules) / len(capsules)

    decision_count = 0
    last_action = "—"
    if decisions_file.exists():
        # Blank lines are not log entries; do not count or parse them.
        lines = [
            line
            for line in decisions_file.read_text(encoding="utf-8").splitlines()
            if line.strip()
        ]
        decision_count = len(lines)
        if lines:
            last = json.loads(lines[-1])
            task = last.get("task", "—")
            # Fall back to the bare task when the timestamp is unusable.
            last_action = task
            try:
                logged_at = datetime.fromisoformat(last.get("timestamp", ""))
                age = datetime.now(timezone.utc) - logged_at
            except (TypeError, ValueError):
                # Missing/malformed timestamp, or a naive timestamp that
                # cannot be subtracted from an aware "now".
                pass
            else:
                hours = int(age.total_seconds() // 3600)
                if hours < 1:
                    last_action = f"{task} (just now)"
                elif hours < 24:
                    last_action = f"{task} ({hours}h ago)"
                else:
                    last_action = f"{task} ({hours // 24}d ago)"

    print_header("librarian status")
    content = (
        f" project {project_name}\n"
        f" indexed {indexed}\n"
        f" memory {capsule_count} capsules · avg confidence {avg_confidence:.2f}\n"
        f" last action {last_action}\n"
        f" tokens {tracker.total()} this session\n"
        f" providers groq (primary) · openrouter (fallback)\n"
        f" log entries {decision_count}"
    )
    print_panel(content, title="project")
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from librarian.utils.ui import (
|
|
4
|
+
print_header, print_warning, print_success, print_muted, confirm_action,
|
|
5
|
+
)
|
|
6
|
+
from librarian.memory.decision_log import get_last, mark_undone
|
|
7
|
+
from librarian.memory import capsule
|
|
8
|
+
from librarian.actions.file_ops import read_file, write_file
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _undo_edit_file(action: dict):
|
|
12
|
+
path = action.get("file")
|
|
13
|
+
new_code = action.get("new_code", "")
|
|
14
|
+
old_code = action.get("old_code", "")
|
|
15
|
+
if not path or not old_code:
|
|
16
|
+
return False
|
|
17
|
+
try:
|
|
18
|
+
content = read_file(path)
|
|
19
|
+
if new_code in content:
|
|
20
|
+
content = content.replace(new_code, old_code, 1)
|
|
21
|
+
write_file(path, content)
|
|
22
|
+
return True
|
|
23
|
+
except Exception:
|
|
24
|
+
pass
|
|
25
|
+
return False
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _undo_create_file(action: dict):
|
|
29
|
+
path = action.get("file")
|
|
30
|
+
if not path:
|
|
31
|
+
return False
|
|
32
|
+
p = Path(path)
|
|
33
|
+
if p.exists():
|
|
34
|
+
if confirm_action(f"delete {path}?"):
|
|
35
|
+
p.unlink()
|
|
36
|
+
return True
|
|
37
|
+
return False
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def run():
    """Interactively undo the most recent decision that is not yet undone.

    Looks at the last 10 log entries, picks the newest one whose
    ``outcome`` is not ``"undone"``, asks for confirmation, reverts its
    ``edit_file`` / ``create_file`` actions, marks the entry undone and
    lowers the confidence of the first capsule whose ``decision`` equals
    the task.  Shell commands are reported but never reverted.
    """
    if not Path(".librarian").exists():
        print_header("librarian undo")
        print_warning("project not initialised — run 'librarian init' first")
        return

    print_header("librarian undo")

    entries = get_last(10)
    pending = [e for e in entries if e.get("outcome") != "undone"]
    if not pending:
        print_muted(" nothing to undo")
        return

    last = pending[-1]
    task = last.get("task", "—")
    print_muted(f" last action: {task}")

    if not confirm_action(f"undo '{task}'?"):
        print_muted(" cancelled")
        return

    reverted = 0
    # Undo is last-in-first-out: a later edit may have been applied on top
    # of an earlier one, so the newest action must be reverted first or the
    # earlier action's text can no longer be found in the file.
    for action in reversed(last.get("actions_taken") or []):
        action_type = action.get("type")
        if action_type == "edit_file":
            if _undo_edit_file(action):
                reverted += 1
                print_success(f"reverted: {action.get('file', '?')}")
        elif action_type == "create_file":
            if _undo_create_file(action):
                reverted += 1
                print_success(f"deleted: {action.get('file', '?')}")
        elif action_type == "shell_command":
            print_warning(f"cannot auto-undo shell command: {action.get('command', '?')}")

    mark_undone(last["id"])

    capsules = capsule.get_all()
    for c in capsules:
        if c.get("decision") == task:
            capsule.undo(c["id"])
            # ``c`` was loaded before capsule.undo ran, so it still holds
            # the previous confidence; the new value is shown as old * 0.6.
            print_muted(f" capsule confidence updated: {c['confidence']:.2f} → {c['confidence'] * 0.6:.2f}")
            break

    print_success(f"undo complete — {reverted} actions reverted")
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from datetime import datetime, timezone
|
|
3
|
+
from librarian.utils.ui import print_header, print_warning, print_muted, console, INDIGO
|
|
4
|
+
from librarian.memory.decision_log import get_last
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _time_ago(timestamp: str) -> str:
|
|
8
|
+
try:
|
|
9
|
+
ts = datetime.fromisoformat(timestamp)
|
|
10
|
+
now = datetime.now(timezone.utc)
|
|
11
|
+
diff = now - ts
|
|
12
|
+
seconds = int(diff.total_seconds())
|
|
13
|
+
if seconds < 60:
|
|
14
|
+
return "just now"
|
|
15
|
+
if seconds < 3600:
|
|
16
|
+
return f"{seconds // 60} min ago"
|
|
17
|
+
if seconds < 86400:
|
|
18
|
+
return f"{seconds // 3600} hours ago"
|
|
19
|
+
return f"{seconds // 86400} days ago"
|
|
20
|
+
except Exception:
|
|
21
|
+
return "—"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def run():
    """Print the last 10 logged decisions, newest first.

    For each entry the task, a "time ago" phrase, the reasoning (truncated
    to 80 characters), the provider, the token count and the outcome are
    printed.  Missing fields fall back to ``"—"`` (``"approved"`` for the
    outcome, ``0`` for tokens).
    """
    if not Path(".librarian").exists():
        print_header("librarian why")
        print_warning("project not initialised — run 'librarian init' first")
        return

    print_header("decision history")

    entries = get_last(10)
    if not entries:
        print_muted(" no decisions logged yet")
        return

    for i, entry in enumerate(reversed(entries), 1):
        time_str = _time_ago(entry.get("timestamp", ""))
        task = entry.get("task", "—")
        provider = entry.get("llm_provider", "—")
        outcome = entry.get("outcome", "approved")
        tokens = entry.get("tokens_used", 0)

        # A logged ``"reasoning": null`` (or any non-string) would otherwise
        # crash on slicing/len below.
        reasoning = entry.get("reasoning", "—")
        reasoning = "—" if reasoning is None else str(reasoning)
        summary = reasoning[:80] + ("..." if len(reasoning) > 80 else "")

        # NOTE(review): if ``console`` interprets square brackets as markup,
        # bracketed text inside time_str/task/reasoning may be consumed —
        # confirm and escape in librarian.utils.ui if so.
        console.print(f"\n [bold {INDIGO}]{i}[/bold {INDIGO}] [{time_str}] {task}")
        console.print(f" reasoning: {summary}")
        console.print(f" provider: {provider} tokens: {tokens} outcome: {outcome}")
|
librarian/exceptions.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
class LibrarianError(Exception):
    """Base class of the exception types declared in this module."""
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class RateLimitError(LibrarianError):
    """Error type for rate-limit failures (raise sites live outside this module)."""
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ProviderUnavailableError(LibrarianError):
    """Error type for an unavailable provider (raise sites live outside this module)."""
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ProjectNotInitialisedError(LibrarianError):
    """Error type for a project that has not been initialised (raise sites live outside this module)."""
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class SafetyBoundaryError(LibrarianError):
    """Error type for safety-boundary violations (raise sites live outside this module)."""
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ChunkNotFoundError(LibrarianError):
    """Error type for a chunk that could not be found (raise sites live outside this module)."""
|
|
File without changes
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import uuid
|
|
3
|
+
import hashlib
|
|
4
|
+
import os
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
CAPSULE_FILE = ".librarian/capsules.json"
|
|
9
|
+
ARCHIVE_FILE = ".librarian/archive/archived_capsules.json"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _load() -> list[dict]:
    """Return the capsules stored in ``CAPSULE_FILE``, or ``[]`` if the file is absent."""
    store = Path(CAPSULE_FILE)
    return json.loads(store.read_text(encoding="utf-8")) if store.exists() else []
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _save(capsules: list[dict]):
    """Overwrite ``CAPSULE_FILE`` with *capsules* as indented UTF-8 JSON."""
    payload = json.dumps(capsules, indent=2)
    Path(CAPSULE_FILE).write_text(payload, encoding="utf-8")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def create(task: str, reasoning: str, files_changed: list[str] = None):
    """Append a new capsule for *task* to the capsule store and return it.

    Args:
        task: Stored as the capsule's ``decision``.
        reasoning: Stored as the capsule's ``reason``.
        files_changed: Optional list of paths; only the first one is
            recorded (``""`` when the list is missing or empty).

    Returns:
        The capsule dict that was persisted.  New capsules start with
        confidence ``0.5`` and outcome ``"approved"``.
    """
    stored = _load()
    first_file = files_changed[0] if files_changed else ""
    # Short project identifier derived from the working directory path.
    project_id = hashlib.sha256(os.getcwd().encode()).hexdigest()[:16]
    record = {
        "id": str(uuid.uuid4()),
        "project_id": project_id,
        "file": first_file,
        "decision": task,
        "reason": reasoning,
        "confidence": 0.5,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "embed_text": f"{task} because {reasoning}",
        "outcome": "approved",
    }
    stored.append(record)
    _save(stored)
    return record
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def approve(capsule_id: str):
    """Mark the capsule with *capsule_id* approved and raise its confidence.

    Confidence is multiplied by 1.15 and capped at 1.0.  The store is
    rewritten even when no capsule matches.
    """
    capsules = _load()
    hit = next((c for c in capsules if c["id"] == capsule_id), None)
    if hit is not None:
        hit["outcome"] = "approved"
        hit["confidence"] = min(hit["confidence"] * 1.15, 1.0)
    _save(capsules)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def undo(capsule_id: str):
    """Mark the capsule with *capsule_id* undone and cut its confidence to 60%.

    The store is rewritten even when no capsule matches, and capsules that
    fall below the archive threshold are then moved to the archive.
    """
    capsules = _load()
    hit = next((c for c in capsules if c["id"] == capsule_id), None)
    if hit is not None:
        hit["outcome"] = "undone"
        hit["confidence"] *= 0.6
    _save(capsules)
    _archive_low_confidence()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def decay():
    """Reduce the confidence of every capsule whose outcome is ``"ignored"``.

    Each such capsule's confidence is multiplied by ``0.98 ** days`` where
    ``days`` is the whole-day age of its ``timestamp``; the store is saved
    and low-confidence capsules are archived afterwards.
    """
    capsules = _load()
    now = datetime.now(timezone.utc)
    ignored = (c for c in capsules if c["outcome"] == "ignored")
    for entry in ignored:
        age_days = (now - datetime.fromisoformat(entry["timestamp"])).days
        # NOTE(review): the factor is based on the capsule's total age and is
        # applied to the already-stored confidence, so calling decay()
        # repeatedly compounds the reduction — confirm this is intended.
        entry["confidence"] *= 0.98 ** age_days
    _save(capsules)
    _archive_low_confidence()
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _archive_low_confidence():
    """Move capsules with confidence below 0.4 from the store to the archive.

    Nothing is written when no capsule is below the threshold.  Otherwise
    the live store is rewritten without them and they are appended to
    ``ARCHIVE_FILE`` (its directory is created on demand).
    """
    capsules = _load()
    retired = [c for c in capsules if c["confidence"] < 0.4]
    if not retired:
        return

    _save([c for c in capsules if not (c["confidence"] < 0.4)])

    Path(".librarian/archive").mkdir(parents=True, exist_ok=True)
    archive_path = Path(ARCHIVE_FILE)
    history = (
        json.loads(archive_path.read_text(encoding="utf-8"))
        if archive_path.exists()
        else []
    )
    history.extend(retired)
    archive_path.write_text(json.dumps(history, indent=2), encoding="utf-8")
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def get_all() -> list[dict]:
    """Return every capsule currently in the live store (public wrapper around ``_load``)."""
    capsules = _load()
    return capsules
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
import ast
|
|
2
|
+
import os
|
|
3
|
+
import re
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def chunk_file(path: str) -> list[dict]:
    """Read *path* and split it into chunks with a strategy chosen by extension.

    The file is decoded as UTF-8, falling back to Latin-1 when that fails.
    ``.py`` goes to the Python chunker, ``.js/.ts/.jsx/.tsx`` to the JS
    chunker, ``.md/.txt`` to the text chunker and everything else to the
    generic chunker.  The extension comparison is case-insensitive.

    Returns:
        A list of ``{"content": ..., "metadata": {...}}`` dicts.
    """
    suffix = os.path.splitext(path)[1].lower()

    try:
        with open(path, encoding="utf-8") as handle:
            text = handle.read()
    except UnicodeDecodeError:
        # Latin-1 can decode any byte sequence, so this read cannot fail
        # on encoding grounds.
        with open(path, encoding="latin-1") as handle:
            text = handle.read()

    if suffix == ".py":
        chunker = _chunk_python
    elif suffix in (".js", ".ts", ".jsx", ".tsx"):
        chunker = _chunk_js
    elif suffix in (".md", ".txt"):
        chunker = _chunk_text
    else:
        chunker = _chunk_generic
    return chunker(path, text)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _chunk_python(path: str, content: str) -> list[dict]:
|
|
27
|
+
chunks = []
|
|
28
|
+
try:
|
|
29
|
+
tree = ast.parse(content)
|
|
30
|
+
except SyntaxError:
|
|
31
|
+
return _chunk_generic(path, content)
|
|
32
|
+
|
|
33
|
+
lines = content.splitlines()
|
|
34
|
+
mtime = datetime.fromtimestamp(os.path.getmtime(path), tz=timezone.utc).isoformat()
|
|
35
|
+
|
|
36
|
+
for node in ast.iter_child_nodes(tree):
|
|
37
|
+
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
|
38
|
+
start = node.lineno - 1
|
|
39
|
+
end = node.end_lineno or start + 1
|
|
40
|
+
chunk_text = "\n".join(lines[start:end])
|
|
41
|
+
chunks.append({
|
|
42
|
+
"content": chunk_text,
|
|
43
|
+
"metadata": {
|
|
44
|
+
"file_path": path,
|
|
45
|
+
"chunk_type": "function",
|
|
46
|
+
"name": node.name,
|
|
47
|
+
"language": "python",
|
|
48
|
+
"start_line": start + 1,
|
|
49
|
+
"end_line": end,
|
|
50
|
+
"last_modified": mtime,
|
|
51
|
+
},
|
|
52
|
+
})
|
|
53
|
+
elif isinstance(node, ast.ClassDef):
|
|
54
|
+
start = node.lineno - 1
|
|
55
|
+
end = node.end_lineno or start + 1
|
|
56
|
+
chunk_text = "\n".join(lines[start:end])
|
|
57
|
+
chunks.append({
|
|
58
|
+
"content": chunk_text,
|
|
59
|
+
"metadata": {
|
|
60
|
+
"file_path": path,
|
|
61
|
+
"chunk_type": "class",
|
|
62
|
+
"name": node.name,
|
|
63
|
+
"language": "python",
|
|
64
|
+
"start_line": start + 1,
|
|
65
|
+
"end_line": end,
|
|
66
|
+
"last_modified": mtime,
|
|
67
|
+
},
|
|
68
|
+
})
|
|
69
|
+
|
|
70
|
+
if not chunks:
|
|
71
|
+
chunks.append({
|
|
72
|
+
"content": content,
|
|
73
|
+
"metadata": {
|
|
74
|
+
"file_path": path,
|
|
75
|
+
"chunk_type": "module",
|
|
76
|
+
"name": os.path.basename(path),
|
|
77
|
+
"language": "python",
|
|
78
|
+
"start_line": 1,
|
|
79
|
+
"end_line": len(lines),
|
|
80
|
+
"last_modified": mtime,
|
|
81
|
+
},
|
|
82
|
+
})
|
|
83
|
+
return chunks
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _chunk_js(path: str, content: str) -> list[dict]:
|
|
87
|
+
pattern = r"^(?:export\s+)?(?:async\s+)?(?:function\s+\w+|const\s+\w+\s*=\s*(?:async\s+)?(?:\([^)]*\)\s*=>|function)|class\s+\w+)"
|
|
88
|
+
lines = content.splitlines()
|
|
89
|
+
mtime = datetime.fromtimestamp(os.path.getmtime(path), tz=timezone.utc).isoformat()
|
|
90
|
+
lang = "javascript" if os.path.splitext(path)[1] in (".js", ".jsx") else "typescript"
|
|
91
|
+
|
|
92
|
+
chunks = []
|
|
93
|
+
for i, line in enumerate(lines):
|
|
94
|
+
if re.match(pattern, line.strip()):
|
|
95
|
+
end = min(i + 50, len(lines))
|
|
96
|
+
chunk_text = "\n".join(lines[i:end])
|
|
97
|
+
name_match = re.search(r"(?:function|class|const)\s+(\w+)", line)
|
|
98
|
+
chunks.append({
|
|
99
|
+
"content": chunk_text,
|
|
100
|
+
"metadata": {
|
|
101
|
+
"file_path": path,
|
|
102
|
+
"chunk_type": "function",
|
|
103
|
+
"name": name_match.group(1) if name_match else "unknown",
|
|
104
|
+
"language": lang,
|
|
105
|
+
"start_line": i + 1,
|
|
106
|
+
"end_line": end,
|
|
107
|
+
"last_modified": mtime,
|
|
108
|
+
},
|
|
109
|
+
})
|
|
110
|
+
|
|
111
|
+
if not chunks:
|
|
112
|
+
chunks.append({
|
|
113
|
+
"content": content,
|
|
114
|
+
"metadata": {
|
|
115
|
+
"file_path": path,
|
|
116
|
+
"chunk_type": "module",
|
|
117
|
+
"name": os.path.basename(path),
|
|
118
|
+
"language": lang,
|
|
119
|
+
"start_line": 1,
|
|
120
|
+
"end_line": len(lines),
|
|
121
|
+
"last_modified": mtime,
|
|
122
|
+
},
|
|
123
|
+
})
|
|
124
|
+
return chunks
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _chunk_text(path: str, content: str) -> list[dict]:
|
|
128
|
+
mtime = datetime.fromtimestamp(os.path.getmtime(path), tz=timezone.utc).isoformat()
|
|
129
|
+
chunks = []
|
|
130
|
+
sections = re.split(r"\n(?=#{1,3}\s)", content)
|
|
131
|
+
for section in sections:
|
|
132
|
+
section = section.strip()
|
|
133
|
+
if not section:
|
|
134
|
+
continue
|
|
135
|
+
heading_match = re.match(r"^#{1,3}\s+(.*)", section)
|
|
136
|
+
chunks.append({
|
|
137
|
+
"content": section,
|
|
138
|
+
"metadata": {
|
|
139
|
+
"file_path": path,
|
|
140
|
+
"chunk_type": "text",
|
|
141
|
+
"name": heading_match.group(1) if heading_match else os.path.basename(path),
|
|
142
|
+
"language": "markdown",
|
|
143
|
+
"start_line": 1,
|
|
144
|
+
"end_line": content[:content.find(section)].count("\n") + section.count("\n") + 1,
|
|
145
|
+
"last_modified": mtime,
|
|
146
|
+
},
|
|
147
|
+
})
|
|
148
|
+
if not chunks:
|
|
149
|
+
chunks.append({
|
|
150
|
+
"content": content,
|
|
151
|
+
"metadata": {
|
|
152
|
+
"file_path": path,
|
|
153
|
+
"chunk_type": "text",
|
|
154
|
+
"name": os.path.basename(path),
|
|
155
|
+
"language": "text",
|
|
156
|
+
"start_line": 1,
|
|
157
|
+
"end_line": content.count("\n") + 1,
|
|
158
|
+
"last_modified": mtime,
|
|
159
|
+
},
|
|
160
|
+
})
|
|
161
|
+
return chunks
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _chunk_generic(path: str, content: str) -> list[dict]:
|
|
165
|
+
mtime = datetime.fromtimestamp(os.path.getmtime(path), tz=timezone.utc).isoformat()
|
|
166
|
+
chunk_size = 500
|
|
167
|
+
overlap = 50
|
|
168
|
+
chunks = []
|
|
169
|
+
for i in range(0, len(content), chunk_size - overlap):
|
|
170
|
+
chunk_text = content[i:i + chunk_size]
|
|
171
|
+
chunks.append({
|
|
172
|
+
"content": chunk_text,
|
|
173
|
+
"metadata": {
|
|
174
|
+
"file_path": path,
|
|
175
|
+
"chunk_type": "module",
|
|
176
|
+
"name": os.path.basename(path),
|
|
177
|
+
"language": os.path.splitext(path)[1].lstrip("."),
|
|
178
|
+
"start_line": content[:i].count("\n") + 1,
|
|
179
|
+
"end_line": content[:i + len(chunk_text)].count("\n") + 1,
|
|
180
|
+
"last_modified": mtime,
|
|
181
|
+
},
|
|
182
|
+
})
|
|
183
|
+
return chunks
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import uuid
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
LOG_FILE = ".librarian/decisions.jsonl"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def append(entry: dict):
    """Stamp *entry* with a fresh id and UTC timestamp and append it to the log.

    The dict is modified in place (``id`` and ``timestamp`` are set or
    overwritten) and then written to ``LOG_FILE`` as one JSON line.
    """
    entry.update(
        id=str(uuid.uuid4()),
        timestamp=datetime.now(timezone.utc).isoformat(),
    )
    with open(LOG_FILE, "a", encoding="utf-8") as log:
        log.write(f"{json.dumps(entry)}\n")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_last(n: int = 5) -> list[dict]:
    """Return the newest *n* entries of the decision log, oldest first.

    Args:
        n: Maximum number of entries to return.

    Returns:
        Up to *n* parsed entries; ``[]`` when the log file is missing or
        *n* is zero or negative.  Blank lines in the log are ignored.
    """
    # ``entries[-0:]`` would return the whole log and a negative n would
    # drop entries from the front, so handle non-positive n explicitly.
    if n <= 0:
        return []
    path = Path(LOG_FILE)
    if not path.exists():
        return []
    lines = path.read_text(encoding="utf-8").splitlines()
    entries = [json.loads(line) for line in lines if line.strip()]
    return entries[-n:]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def mark_undone(entry_id: str):
    """Set ``outcome`` to ``"undone"`` on the log entry with id *entry_id*.

    The whole log file is rewritten, one JSON object per line.  Nothing
    happens when the log file does not exist; the file is rewritten even
    when no entry matches.
    """
    path = Path(LOG_FILE)
    if not path.exists():
        return
    updated = []
    for line in path.read_text(encoding="utf-8").splitlines():
        # Skip blank lines, as get_last does, instead of failing to parse them.
        if not line.strip():
            continue
        entry = json.loads(line)
        if entry.get("id") == entry_id:
            entry["outcome"] = "undone"
        updated.append(json.dumps(entry))
    # An empty log stays empty rather than becoming a single blank line.
    path.write_text("".join(f"{row}\n" for row in updated), encoding="utf-8")
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from librarian.actions.file_ops import list_files
|
|
6
|
+
from librarian.memory.chunker import chunk_file
|
|
7
|
+
from librarian.utils.config import CHROMA_PERSIST_DIR, EMBED_MODEL
|
|
8
|
+
from librarian.utils.ui import spinner, print_success, print_muted
|
|
9
|
+
|
|
10
|
+
CHUNK_EXTENSIONS = [
|
|
11
|
+
".py", ".js", ".ts", ".jsx", ".tsx",
|
|
12
|
+
".md", ".txt", ".yaml", ".yml", ".toml",
|
|
13
|
+
".json", ".html", ".css", ".sh", ".sql",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
META_FILE = ".librarian/index_meta.json"
|
|
17
|
+
|
|
18
|
+
_model = None
|
|
19
|
+
_client = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _get_model():
    """Return the module-wide embedding model, creating it on first use.

    ``sentence_transformers`` is imported only when the model is first
    needed, and the instance is cached in the module-level ``_model``.
    """
    global _model
    if _model is not None:
        return _model
    from sentence_transformers import SentenceTransformer

    _model = SentenceTransformer(EMBED_MODEL)
    return _model
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _get_client():
    """Return the module-wide Chroma client, creating it on first use.

    ``chromadb`` is imported only when the client is first needed, and the
    instance (persisting under ``CHROMA_PERSIST_DIR``) is cached in the
    module-level ``_client``.
    """
    global _client
    if _client is not None:
        return _client
    import chromadb

    _client = chromadb.PersistentClient(path=CHROMA_PERSIST_DIR)
    return _client
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _sanitize_collection_name(name: str) -> str:
|
|
39
|
+
name = re.sub(r"[^a-zA-Z0-9_\-\.\s]", "", name)
|
|
40
|
+
name = re.sub(r"\s+", "_", name).strip("_")
|
|
41
|
+
if len(name) < 3:
|
|
42
|
+
name = f"project_{name}" if name else "project"
|
|
43
|
+
return name[:512]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def index_project():
    """Chunk, embed and store the project's files, then write index metadata.

    Files under the current directory with an extension in
    ``CHUNK_EXTENSIONS`` are chunked; chunks whose id is not yet in the
    project's collection are embedded and added in batches of 100.  A
    summary is written to ``META_FILE`` and returned.

    Returns:
        Dict with ``project``, ``file_count``, ``chunk_count`` and
        ``indexed_at`` (UTC, ISO-8601).
    """
    # Imported here because the module's import block does not provide
    # datetime; replaces the former __import__("datetime") expressions.
    from datetime import datetime, timezone

    model = _get_model()
    client = _get_client()
    project_name = _sanitize_collection_name(os.path.basename(os.getcwd()))
    collection = client.get_or_create_collection(name=project_name)

    files = list_files(".", CHUNK_EXTENSIONS)
    all_chunks = []
    for f in files:
        all_chunks.extend(chunk_file(f))

    existing = collection.get()
    existing_ids = set(existing["ids"]) if existing["ids"] else set()

    to_add = []
    seen_ids = set()
    for idx, chunk in enumerate(all_chunks):
        # NOTE(review): ``idx`` is the chunk's position across the whole
        # project, so ids shift whenever an earlier file gains or loses a
        # chunk — re-indexing may then add duplicates; consider a stable id.
        chunk_id = f"{chunk['metadata']['file_path']}:{chunk['metadata']['start_line']}:{idx}"
        if chunk_id in existing_ids or chunk_id in seen_ids:
            continue
        seen_ids.add(chunk_id)
        to_add.append((chunk_id, chunk))

    with spinner("indexing files...") as prog:
        task = prog.add_task("indexing", total=len(to_add))
        batch_size = 100
        for i in range(0, len(to_add), batch_size):
            batch = to_add[i:i + batch_size]
            ids = [c[0] for c in batch]
            documents = [c[1]["content"] for c in batch]
            metadatas = [c[1]["metadata"] for c in batch]
            embeddings = model.encode(documents).tolist()
            collection.add(
                ids=ids,
                documents=documents,
                metadatas=metadatas,
                embeddings=embeddings,
            )
            prog.update(task, advance=len(batch))

    # Query the collection size once and reuse it for metadata and output.
    chunk_count = collection.count()
    meta = {
        "project": project_name,
        "file_count": len(files),
        "chunk_count": chunk_count,
        "indexed_at": datetime.now(timezone.utc).isoformat(),
    }
    Path(".librarian").mkdir(exist_ok=True)
    Path(META_FILE).write_text(json.dumps(meta, indent=2), encoding="utf-8")

    print_success(f"{len(files)} files, {chunk_count} chunks indexed")
    return meta
|