optulus 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contextforge/__init__.py +4 -0
- contextforge/cli.py +185 -0
- contextforge/graph/__init__.py +0 -0
- contextforge/graph/schema.py +50 -0
- contextforge/graph/store.py +224 -0
- contextforge/indexer/__init__.py +0 -0
- contextforge/indexer/embedder.py +64 -0
- contextforge/indexer/parser.py +166 -0
- contextforge/indexer/run.py +93 -0
- contextforge/indexer/tree_sitter_extract.py +273 -0
- contextforge/indexer/walker.py +61 -0
- contextforge/indexer/watch.py +49 -0
- contextforge/indexer/writer.py +18 -0
- contextforge/mcp/__init__.py +0 -0
- contextforge/mcp/__main__.py +31 -0
- contextforge/mcp/indexing.py +65 -0
- contextforge/mcp/server.py +127 -0
- contextforge/query/__init__.py +0 -0
- contextforge/query/context.py +65 -0
- contextforge/query/expand.py +17 -0
- contextforge/query/rank.py +71 -0
- contextforge/query/search.py +76 -0
- contextforge/utils/__init__.py +0 -0
- contextforge/utils/config.py +69 -0
- contextforge/utils/repo.py +41 -0
- contextforge/utils/tokens.py +11 -0
- optulus-0.1.0.dist-info/METADATA +138 -0
- optulus-0.1.0.dist-info/RECORD +32 -0
- optulus-0.1.0.dist-info/WHEEL +5 -0
- optulus-0.1.0.dist-info/entry_points.txt +2 -0
- optulus-0.1.0.dist-info/licenses/LICENSE +201 -0
- optulus-0.1.0.dist-info/top_level.txt +1 -0
contextforge/__init__.py
ADDED
contextforge/cli.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""Typer CLI entrypoints."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import time
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
import typer
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
from rich.table import Table
|
|
12
|
+
|
|
13
|
+
from contextforge.graph.store import GraphStore
|
|
14
|
+
from contextforge.indexer.run import run_full_index
|
|
15
|
+
from contextforge.query.context import build_context
|
|
16
|
+
from contextforge.utils.config import load_repo_config
|
|
17
|
+
|
|
18
|
+
app = typer.Typer(help="ContextForge CLI")
|
|
19
|
+
console = Console(stderr=True)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _err(msg: str) -> None:
    """Emit *msg* as a Rich-formatted error line and abort the CLI with exit code 1."""
    text = f"[red]Error:[/red] {msg}"
    console.print(text)
    raise typer.Exit(1)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _repo_root(path: str) -> Path:
|
|
28
|
+
return Path(path).resolve()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _require_index(repo_root: Path) -> Path:
    """Return the path to ``<repo_root>/.cf/index.db``, aborting if it is missing.

    Prints an extra "not initialized" hint when the ``.cf`` directory itself
    does not exist, then exits via ``_err`` (which raises ``typer.Exit``)
    in either missing case, so the function only returns on success.
    """
    cf_dir = repo_root / ".cf"
    db_path = cf_dir / "index.db"
    # The same remediation hint applies to both failure modes; it was
    # previously duplicated as two identical string literals.
    hint = f"No index at {db_path}. Run: cf index <repo> (creates <repo>/.cf/) then cd into that repo for cf context."
    if not cf_dir.exists():
        console.print("[yellow]ContextForge is not initialized in this repo yet.[/yellow]")
        _err(hint)
    if not db_path.exists():
        _err(hint)
    return db_path
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@app.command()
def index(
    path: str,
    watch: bool = typer.Option(False, "--watch"),
    langs: str | None = typer.Option(None, "--langs"),
    reset: bool = typer.Option(False, "--reset"),
) -> None:
    """Build (or rebuild, with --reset) the index for the repo at *path*.

    With --watch, stays in the foreground and re-indexes on file changes.
    """
    repo_root = _repo_root(path)
    if not repo_root.exists():
        _err(f"Path does not exist: {repo_root}")
    config = load_repo_config(repo_root)
    lang_override: set[str] | None = None
    if langs:
        # --langs is a comma-separated list; blank entries are dropped.
        lang_override = {x.strip() for x in langs.split(",") if x.strip()}
    # selected_langs resolves the override against the repo config but is only
    # consumed by the watcher below; run_full_index receives the raw override
    # (None meaning "use config defaults") — presumably it performs the same
    # resolution internally. TODO confirm against run_full_index.
    selected_langs = lang_override if lang_override is not None else set(config.index_languages)
    run_full_index(repo_root, reset=reset, langs=lang_override, console=console)

    if watch:
        # Imported lazily so the watchdog dependency is only needed for --watch.
        from contextforge.indexer.watch import watch_and_reindex

        watch_and_reindex(repo_root, selected_langs, set(config.index_exclude), config.batch_size)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@app.command()
def context(
    task: str,
    max_tokens: int = typer.Option(8000, "--max-tokens"),
    top: int = typer.Option(20, "--top"),
    json_out: bool = typer.Option(False, "--json"),
) -> None:
    """Retrieve a ranked context pack for *task* from the current repo's index.

    Requires an existing index (``cf index <repo>``) in the working directory.
    Prints a Rich table plus the raw context block, or a single JSON document
    with --json.
    """
    repo_root = Path.cwd()
    db_path = _require_index(repo_root)
    store = GraphStore(db_path)
    result = build_context(task=task, store=store, repo_root=repo_root, top_n=top, max_tokens=max_tokens)
    if json_out:
        # Machine-readable output goes to stdout via typer.echo; all human
        # output elsewhere in this file goes to the stderr-bound console.
        typer.echo(
            json.dumps(
                {
                    "session_id": result.session_id,
                    "task": result.task,
                    "token_count": result.token_count,
                    "baseline_tokens": result.baseline_tokens,
                    "savings": result.baseline_tokens - result.token_count,
                    "nodes": [
                        {"id": s.node.id, "kind": s.node.kind, "name": s.node.name, "path": s.node.path, "score": s.score}
                        for s in result.nodes
                    ],
                    # context_block is itself JSON text; decode so the final
                    # document nests it instead of double-encoding.
                    "context_block": json.loads(result.context_block),
                }
            )
        )
        return
    table = Table(title=f'Context for: "{task}"')
    table.add_column("#")
    table.add_column("Kind")
    table.add_column("Name")
    table.add_column("Path")
    table.add_column("Score")
    table.add_column("Tokens")
    for i, sn in enumerate(result.nodes, start=1):
        # "Tokens" column is a rough whitespace word count of the snippet,
        # not a real tokenizer count.
        table.add_row(str(i), sn.node.kind, sn.node.name, sn.node.path, f"{sn.score:.2f}", str(len((sn.node.snippet or "").split())))
    console.print(table)
    console.print(f"Token estimate: {result.token_count}")
    console.print(f"Baseline (est): {result.baseline_tokens}")
    console.print(f"Savings: {result.baseline_tokens - result.token_count}")
    console.print("\n--- CONTEXT BLOCK ---")
    console.print(result.context_block)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@app.command()
def stats(since: str | None = typer.Option(None, "--since")) -> None:
    """Print usage statistics for recorded context sessions.

    --since currently understands only the "<N>d" (days) suffix; any other
    format is silently ignored and all sessions are reported.
    """
    repo_root = Path.cwd()
    db_path = _require_index(repo_root)
    store = GraphStore(db_path)
    sessions = store.list_sessions()
    if since:
        now = time.time()
        if since.endswith("d"):
            # "7d" -> last 7 days; bare "d" degrades to 0 days.
            days = int(since[:-1] or "0")
            cutoff = now - days * 86400
            sessions = [s for s in sessions if (s.created_at or 0) >= cutoff]
    # NOTE(review): 12000 is an assumed per-task baseline token cost and 15 is
    # an assumed dollar price per million tokens — both are estimates baked in
    # here, not measured values; confirm/centralize if accuracy matters.
    baseline = 12000
    total_tokens = sum(s.token_count or 0 for s in sessions)
    total_baseline = len(sessions) * baseline
    savings = total_baseline - total_tokens
    dollars = (savings / 1_000_000) * 15
    table = Table(title="ContextForge Stats")
    table.add_column("Metric")
    table.add_column("Value")
    table.add_row("Total sessions", str(len(sessions)))
    table.add_row("Avg tokens returned", f"{(total_tokens / len(sessions)):.1f}" if sessions else "0")
    table.add_row("Cumulative token savings", str(savings))
    table.add_row("Estimated dollar savings", f"${dollars:.2f}")
    console.print(table)
    # Frequency count of node ids across all sessions' returned sets.
    counts: dict[str, int] = {}
    for sess in sessions:
        for node_id in sess.nodes_returned or []:
            counts[node_id] = counts.get(node_id, 0) + 1
    top = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:10]
    top_table = Table(title="Top Retrieved Nodes")
    top_table.add_column("Node")
    top_table.add_column("Path")
    top_table.add_column("Count")
    for node_id, count in top:
        node = store.get_node(node_id)
        if node is None:
            # Node may have been re-indexed away since the session was recorded.
            continue
        top_table.add_row(node.name, node.path, str(count))
    console.print(top_table)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
@app.command()
def serve(
    transport: str = typer.Option(
        "stdio",
        "--transport",
        help="stdio for Claude Code / Cursor MCP; sse or streamable-http for HTTP clients",
    ),
    host: str = typer.Option("127.0.0.1", "--host", help="Bind address for sse/streamable-http"),
    port: int = typer.Option(8765, "--port", help="Port for sse/streamable-http (ignored for stdio)"),
    auto_index: bool | None = typer.Option(
        None,
        "--auto-index/--no-auto-index",
        help="Create or repair .cf/index.db on server start; default follows CONTEXTFORGE_AUTO_INDEX",
    ),
) -> None:
    """Run the ContextForge MCP server over the chosen transport.

    Validates the transport name up front and delegates to
    contextforge.mcp.server.run_server.
    """
    # Imported lazily so MCP server dependencies are only required for `serve`.
    from contextforge.mcp.server import run_server

    if transport not in ("stdio", "sse", "streamable-http"):
        _err(f"Unknown transport {transport!r}; use stdio, sse, or streamable-http")
    run_server(
        transport=transport,  # type: ignore[arg-type]
        host=host,
        port=port,
        auto_index=auto_index,
    )
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# Script entry point: delegate to the Typer application when run directly.
if __name__ == "__main__":
    app()
|
|
File without changes
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""SQLite schema and migrations for ContextForge."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sqlite3
|
|
6
|
+
|
|
7
|
+
# Full DDL applied to every connection. Every statement is idempotent
# (IF NOT EXISTS), so running migrate() against an existing database is safe.
SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS nodes (
    id TEXT PRIMARY KEY,
    kind TEXT NOT NULL,
    name TEXT NOT NULL,
    path TEXT NOT NULL,
    language TEXT,
    start_line INTEGER,
    end_line INTEGER,
    snippet TEXT,
    file_hash TEXT,
    embedding BLOB,
    created_at REAL
);

CREATE TABLE IF NOT EXISTS edges (
    src_id TEXT NOT NULL,
    dst_id TEXT NOT NULL,
    kind TEXT NOT NULL,
    weight REAL DEFAULT 1.0,
    PRIMARY KEY (src_id, dst_id, kind)
);

CREATE TABLE IF NOT EXISTS sessions (
    id TEXT PRIMARY KEY,
    task_text TEXT NOT NULL,
    nodes_returned TEXT,
    nodes_used TEXT,
    outcome_signal INTEGER,
    token_count INTEGER,
    created_at REAL
);

CREATE INDEX IF NOT EXISTS idx_nodes_path ON nodes(path);
CREATE INDEX IF NOT EXISTS idx_nodes_kind ON nodes(kind);
CREATE INDEX IF NOT EXISTS idx_edges_src ON edges(src_id);
CREATE INDEX IF NOT EXISTS idx_edges_dst ON edges(dst_id);
"""


def migrate(conn: sqlite3.Connection) -> None:
    """Create required tables and indexes.

    Idempotent: all statements in SCHEMA_SQL use IF NOT EXISTS, so this is
    safe to call on both fresh and already-migrated databases.
    """
    conn.executescript(SCHEMA_SQL)
    conn.commit()
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
"""Graph store abstraction over SQLite."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import sqlite3
|
|
7
|
+
import time
|
|
8
|
+
import uuid
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
from contextforge.graph.schema import migrate
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _pack_embedding(vector: np.ndarray | None) -> bytes | None:
|
|
18
|
+
if vector is None:
|
|
19
|
+
return None
|
|
20
|
+
return np.asarray(vector, dtype=np.float32).tobytes()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _unpack_embedding(blob: bytes | None) -> np.ndarray | None:
|
|
24
|
+
if blob is None:
|
|
25
|
+
return None
|
|
26
|
+
return np.frombuffer(blob, dtype=np.float32)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(slots=True)
class Node:
    """One row of the ``nodes`` table: a file, symbol, or module in the code graph."""

    id: str  # primary key of the nodes table
    kind: str  # node category, e.g. "file" (see file_hash() query) or "symbol"
    name: str  # display name (file name or symbol name)
    path: str  # path of the containing file
    language: str | None = None  # source language, when known
    start_line: int | None = None  # span within the file, when known
    end_line: int | None = None
    snippet: str | None = None  # short source excerpt
    file_hash: str | None = None  # content hash of the owning file
    embedding: np.ndarray | None = None  # float32 vector; stored as BLOB via _pack_embedding
    created_at: float | None = None  # unix timestamp (time.time())
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass(slots=True)
class Edge:
    """A directed, typed relation between two nodes."""

    src_id: str  # Node.id of the source endpoint
    dst_id: str  # Node.id of the target endpoint
    kind: str  # relation type; part of the (src_id, dst_id, kind) primary key
    weight: float = 1.0  # relative strength; 1.0 matches the schema default
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass(slots=True)
class Session:
    """A recorded context-retrieval session and its (optional) outcome."""

    id: str  # uuid4 string assigned in GraphStore.create_session
    task_text: str  # the task that drove retrieval
    nodes_returned: list[str] | None = None  # Node.ids returned to the caller
    nodes_used: list[str] | None = None  # Node.ids reported as used (set later)
    outcome_signal: int | None = None  # 1 = success, 0 = failure (see update_session_outcome)
    token_count: int | None = None  # token estimate of the returned context
    created_at: float | None = None  # unix timestamp (time.time())
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class GraphStore:
    """Read/write helper for nodes, edges, and sessions.

    Owns a single SQLite connection. The schema is created on construction
    via migrate(); all write methods commit immediately.
    """

    def __init__(self, db_path: Path) -> None:
        """Open (creating parent directories and schema as needed) the DB at *db_path*."""
        self.db_path = db_path
        # Create the .cf/ directory on first use so callers need no setup step.
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self.conn = sqlite3.connect(self.db_path)
        self.conn.row_factory = sqlite3.Row
        migrate(self.conn)

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self.conn.close()

    @staticmethod
    def _node_from_row(row: sqlite3.Row) -> Node:
        """Build a Node from a nodes-table row (shared by get_node and list_nodes)."""
        return Node(
            id=row["id"],
            kind=row["kind"],
            name=row["name"],
            path=row["path"],
            language=row["language"],
            start_line=row["start_line"],
            end_line=row["end_line"],
            snippet=row["snippet"],
            file_hash=row["file_hash"],
            embedding=_unpack_embedding(row["embedding"]),
            created_at=row["created_at"],
        )

    @staticmethod
    def _session_from_row(row: sqlite3.Row) -> Session:
        """Build a Session from a sessions-table row, decoding the JSON list columns."""
        return Session(
            id=row["id"],
            task_text=row["task_text"],
            nodes_returned=json.loads(row["nodes_returned"]) if row["nodes_returned"] else None,
            nodes_used=json.loads(row["nodes_used"]) if row["nodes_used"] else None,
            outcome_signal=row["outcome_signal"],
            token_count=row["token_count"],
            created_at=row["created_at"],
        )

    def upsert_nodes(self, nodes: list[Node]) -> None:
        """Insert or update *nodes* by id.

        On conflict every column is refreshed except created_at, which keeps
        its original value (the update list deliberately omits it).
        """
        self.conn.executemany(
            """
            INSERT INTO nodes (id, kind, name, path, language, start_line, end_line, snippet, file_hash, embedding, created_at)
            VALUES (:id, :kind, :name, :path, :language, :start_line, :end_line, :snippet, :file_hash, :embedding, :created_at)
            ON CONFLICT(id) DO UPDATE SET
                kind = excluded.kind,
                name = excluded.name,
                path = excluded.path,
                language = excluded.language,
                start_line = excluded.start_line,
                end_line = excluded.end_line,
                snippet = excluded.snippet,
                file_hash = excluded.file_hash,
                embedding = excluded.embedding
            """,
            [
                {
                    "id": n.id,
                    "kind": n.kind,
                    "name": n.name,
                    "path": n.path,
                    "language": n.language,
                    "start_line": n.start_line,
                    "end_line": n.end_line,
                    "snippet": n.snippet,
                    "file_hash": _pack_embedding(n.embedding) if False else n.file_hash,  # noqa: SIM108 -- see below
                    "embedding": _pack_embedding(n.embedding),
                    "created_at": n.created_at if n.created_at is not None else time.time(),
                }
                for n in nodes
            ],
        )
        self.conn.commit()

    def upsert_edges(self, edges: list[Edge]) -> None:
        """Insert or update *edges*; on conflict only the weight is refreshed."""
        self.conn.executemany(
            """
            INSERT INTO edges (src_id, dst_id, kind, weight)
            VALUES (?, ?, ?, ?)
            ON CONFLICT(src_id, dst_id, kind) DO UPDATE SET
                weight = excluded.weight
            """,
            [(e.src_id, e.dst_id, e.kind, e.weight) for e in edges],
        )
        self.conn.commit()

    def get_node(self, node_id: str) -> Node | None:
        """Return the node with *node_id*, or None if absent."""
        row = self.conn.execute("SELECT * FROM nodes WHERE id = ?", (node_id,)).fetchone()
        if row is None:
            return None
        return self._node_from_row(row)

    def list_nodes(self) -> list[Node]:
        """Return every node in the store."""
        rows = self.conn.execute("SELECT * FROM nodes").fetchall()
        return [self._node_from_row(row) for row in rows]

    def get_edges_for_node(self, node_id: str) -> list[Edge]:
        """Return all edges touching *node_id* as either endpoint."""
        rows = self.conn.execute(
            "SELECT * FROM edges WHERE src_id = ? OR dst_id = ?",
            (node_id, node_id),
        ).fetchall()
        return [Edge(src_id=r["src_id"], dst_id=r["dst_id"], kind=r["kind"], weight=r["weight"]) for r in rows]

    def file_hash(self, path: str) -> str | None:
        """Return the stored content hash for the file node at *path*, or None."""
        row = self.conn.execute(
            "SELECT file_hash FROM nodes WHERE kind='file' AND path = ? LIMIT 1",
            (path,),
        ).fetchone()
        return None if row is None else row["file_hash"]

    def create_session(self, task_text: str, nodes_returned: list[str], token_count: int) -> Session:
        """Record a new retrieval session and return it (outcome fields start as NULL)."""
        session = Session(
            id=str(uuid.uuid4()),
            task_text=task_text,
            nodes_returned=nodes_returned,
            token_count=token_count,
            created_at=time.time(),
        )
        self.conn.execute(
            """
            INSERT INTO sessions (id, task_text, nodes_returned, nodes_used, outcome_signal, token_count, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            (
                session.id,
                session.task_text,
                json.dumps(nodes_returned),
                None,
                None,
                token_count,
                session.created_at,
            ),
        )
        self.conn.commit()
        return session

    def update_session_outcome(self, session_id: str, used_node_ids: list[str], success: bool) -> None:
        """Attach the used-node list and a 1/0 success flag to an existing session."""
        self.conn.execute(
            """
            UPDATE sessions
            SET nodes_used = ?, outcome_signal = ?
            WHERE id = ?
            """,
            (json.dumps(used_node_ids), 1 if success else 0, session_id),
        )
        self.conn.commit()

    def list_sessions(self) -> list[Session]:
        """Return all sessions, newest first."""
        rows = self.conn.execute("SELECT * FROM sessions ORDER BY created_at DESC").fetchall()
        return [self._session_from_row(r) for r in rows]
|
|
File without changes
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Node embedding utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from rich.progress import BarColumn, Progress, TaskProgressColumn, TextColumn, TimeRemainingColumn
|
|
11
|
+
|
|
12
|
+
from contextforge.graph.store import Node
|
|
13
|
+
|
|
14
|
+
MODEL = "BAAI/bge-small-en-v1.5"
|
|
15
|
+
|
|
16
|
+
console = Console(stderr=True)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _fallback_embedding(text: str, size: int = 384) -> np.ndarray:
|
|
20
|
+
digest = hashlib.sha256(text.encode("utf-8")).digest()
|
|
21
|
+
seed = int.from_bytes(digest[:8], "little")
|
|
22
|
+
rng = np.random.default_rng(seed)
|
|
23
|
+
vec = rng.random(size, dtype=np.float32)
|
|
24
|
+
norm = np.linalg.norm(vec)
|
|
25
|
+
if norm == 0:
|
|
26
|
+
return vec
|
|
27
|
+
return vec / norm
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def embed_nodes(nodes: list[Node], batch_size: int = 64, cache_dir: Path | None = None) -> list[Node]:
    """Attach a float32 embedding to every node in *nodes*, in place, and return the list.

    Attempts to load the sentence-transformers model first; if that fails for
    any reason (missing dependency, no network, ...), warns once and falls
    back to deterministic hash-seeded vectors so indexing still completes.
    """
    if not nodes:
        return nodes

    model = None
    try:
        from sentence_transformers import SentenceTransformer

        cache = cache_dir or (Path.home() / ".cache" / "contextforge")
        cache.mkdir(parents=True, exist_ok=True)
        model = SentenceTransformer(MODEL, cache_folder=str(cache))
    except Exception:
        console.print("[yellow]Warning:[/yellow] embedding model unavailable, using fallback vectors")

    # One text per node: "<kind>: <name>. <snippet>".
    texts = [f"{n.kind}: {n.name}. {n.snippet or ''}" for n in nodes]
    columns = (
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        TimeRemainingColumn(),
    )
    with Progress(*columns, console=console, transient=True) as progress:
        task = progress.add_task("Embedding nodes", total=len(nodes))
        for start in range(0, len(nodes), batch_size):
            chunk = nodes[start : start + batch_size]
            chunk_texts = texts[start : start + batch_size]
            if model is None:
                vectors = np.stack([_fallback_embedding(t) for t in chunk_texts], axis=0)
            else:
                vectors = model.encode(chunk_texts, convert_to_numpy=True, normalize_embeddings=True)
            for node, vec in zip(chunk, vectors, strict=True):
                node.embedding = np.asarray(vec, dtype=np.float32)
            progress.advance(task, len(chunk))
    return nodes
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""Parse source files into graph nodes and edges."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import ast
|
|
6
|
+
import hashlib
|
|
7
|
+
import re
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from contextforge.graph.store import Edge, Node
|
|
12
|
+
from contextforge.indexer.tree_sitter_extract import try_extract_symbols
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _node_id(kind: str, path: str, name: str) -> str:
|
|
16
|
+
return hashlib.sha1(f"{kind}:{path}:{name}".encode("utf-8")).hexdigest()
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _file_hash(content: str) -> str:
|
|
20
|
+
return hashlib.sha1(content.encode("utf-8")).hexdigest()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass(slots=True)
class ParseResult:
    """Everything extracted from a single source file."""

    file_node: Node  # node representing the file itself
    symbol_nodes: list[Node]  # symbols (functions/classes/etc.) found in the file
    edges: list[Edge]  # relations discovered (e.g. contains/imports/calls)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _parse_python_with_ast(content: str, rel_path: str) -> tuple[list[Node], list[Edge]]:
    """Parse Python *content* with the stdlib ast module.

    Returns (symbol_nodes, edges) where edges contains:
    - file -> symbol "contains" edges for each function/async function/class,
    - file -> module "imports" edges for import / from-import statements,
    - caller-symbol -> callee-symbol "calls" edges for calls, by name, to
      symbols defined in this same file.

    Raises whatever ast.parse raises (SyntaxError) on unparsable source.

    BUG FIX: the original emitted "calls" edges with src == dst (both set to
    the callee), producing useless self-loops; calls are now attributed to
    the enclosing definition that makes them.
    """
    tree = ast.parse(content)
    symbols: list[Node] = []
    edges: list[Edge] = []
    file_id = _node_id("file", rel_path, rel_path)
    # Hoisted: the original recomputed the hash once per symbol.
    fh = _file_hash(content)
    known_symbol_names: set[str] = set()

    # Pass 1: collect symbol definitions and import edges.
    for item in ast.walk(tree):
        if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            name = item.name
            symbol_id = _node_id("symbol", rel_path, name)
            known_symbol_names.add(name)
            snippet = ast.get_source_segment(content, item) or ""
            symbols.append(
                Node(
                    id=symbol_id,
                    kind="symbol",
                    name=name,
                    path=rel_path,
                    language="python",
                    start_line=getattr(item, "lineno", None),
                    end_line=getattr(item, "end_lineno", None),
                    snippet=snippet[:300],
                    file_hash=fh,
                )
            )
            edges.append(Edge(src_id=file_id, dst_id=symbol_id, kind="contains"))
        if isinstance(item, ast.Import):
            for alias in item.names:
                mod_id = _node_id("module", rel_path, alias.name)
                edges.append(Edge(src_id=file_id, dst_id=mod_id, kind="imports"))
        if isinstance(item, ast.ImportFrom):
            module = item.module or ""
            if module:
                mod_id = _node_id("module", rel_path, module)
                edges.append(Edge(src_id=file_id, dst_id=mod_id, kind="imports"))

    # Pass 2: "calls" edges from each definition to the known symbols it
    # calls by plain name or attribute name. Matching is name-based only, so
    # same-named methods on other objects can produce false positives.
    for definition in ast.walk(tree):
        if not isinstance(definition, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            continue
        caller_id = _node_id("symbol", rel_path, definition.name)
        for call in ast.walk(definition):
            if not isinstance(call, ast.Call):
                continue
            callee: str | None = None
            if isinstance(call.func, ast.Name):
                callee = call.func.id
            elif isinstance(call.func, ast.Attribute):
                callee = call.func.attr
            if callee and callee in known_symbol_names:
                edges.append(
                    Edge(
                        src_id=caller_id,
                        dst_id=_node_id("symbol", rel_path, callee),
                        kind="calls",
                    )
                )

    return symbols, edges
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _parse_ts_js_regex(content: str, rel_path: str, language: str, file_id: str, file_hash: str) -> tuple[list[Node], list[Edge]]:
    """Regex fallback for TS/JS: collect function, class, and const-arrow names."""
    declaration_patterns = (
        r"function\s+([A-Za-z_][A-Za-z0-9_]*)",
        r"class\s+([A-Za-z_][A-Za-z0-9_]*)",
        r"const\s+([A-Za-z_][A-Za-z0-9_]*)\s*=\s*\(",
    )
    symbol_nodes: list[Node] = []
    edges: list[Edge] = []
    for pattern in declaration_patterns:
        for match in re.finditer(pattern, content):
            symbol = match.group(1)
            sid = _node_id("symbol", rel_path, symbol)
            symbol_nodes.append(
                Node(
                    id=sid,
                    kind="symbol",
                    name=symbol,
                    path=rel_path,
                    language=language,
                    snippet=symbol,
                    file_hash=file_hash,
                )
            )
            edges.append(Edge(src_id=file_id, dst_id=sid, kind="contains"))
    return symbol_nodes, edges
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _parse_go_regex(content: str, rel_path: str, language: str, file_id: str, file_hash: str) -> tuple[list[Node], list[Edge]]:
    """Regex fallback for Go: one symbol node per `func <name>` match."""
    symbol_nodes: list[Node] = []
    edges: list[Edge] = []
    for match in re.finditer(r"func\s+([A-Za-z_][A-Za-z0-9_]*)", content):
        func_name = match.group(1)
        sid = _node_id("symbol", rel_path, func_name)
        node = Node(id=sid, kind="symbol", name=func_name, path=rel_path, language=language, snippet=func_name, file_hash=file_hash)
        symbol_nodes.append(node)
        edges.append(Edge(src_id=file_id, dst_id=sid, kind="contains"))
    return symbol_nodes, edges
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _parse_rust_regex(content: str, rel_path: str, language: str, file_id: str, file_hash: str) -> tuple[list[Node], list[Edge]]:
    """Regex fallback for Rust: one symbol node per fn/struct/enum declaration."""
    symbol_nodes: list[Node] = []
    edges: list[Edge] = []
    for match in re.finditer(r"(fn|struct|enum)\s+([A-Za-z_][A-Za-z0-9_]*)", content):
        ident = match.group(2)
        sid = _node_id("symbol", rel_path, ident)
        node = Node(id=sid, kind="symbol", name=ident, path=rel_path, language=language, snippet=ident, file_hash=file_hash)
        symbol_nodes.append(node)
        edges.append(Edge(src_id=file_id, dst_id=sid, kind="contains"))
    return symbol_nodes, edges
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def parse_file(path: Path, rel_path: str, language: str) -> ParseResult:
    """Parse one source file into a file node, symbol nodes, and edges.

    Python uses the stdlib ast parser; the other supported languages try
    tree-sitter first (try_extract_symbols) and fall back to per-language
    regex scanners when it returns None. Unknown languages yield only the
    file node.

    NOTE(review): _parse_python_with_ast can raise SyntaxError for invalid
    Python; callers are presumably expected to handle that — confirm.
    """
    # errors="ignore" so undecodable bytes never abort indexing.
    content = path.read_text(encoding="utf-8", errors="ignore")
    file_id = _node_id("file", rel_path, rel_path)
    fh = _file_hash(content)
    file_node = Node(
        id=file_id,
        kind="file",
        name=path.name,
        path=rel_path,
        language=language,
        start_line=1,
        # Line count is at least 1 even for an empty file.
        end_line=max(content.count("\n") + 1, 1),
        snippet=content[:300],
        file_hash=fh,
    )
    symbol_nodes: list[Node] = []
    edges: list[Edge] = []

    if language == "python":
        symbol_nodes, edges = _parse_python_with_ast(content, rel_path)
    elif language in {"typescript", "javascript", "go", "rust"}:
        # Prefer tree-sitter extraction; None signals it was unavailable/failed.
        extracted = try_extract_symbols(path, rel_path, content, language, file_id, fh)
        if extracted is not None:
            symbol_nodes, edges = extracted
        elif language in {"typescript", "javascript"}:
            symbol_nodes, edges = _parse_ts_js_regex(content, rel_path, language, file_id, fh)
        elif language == "go":
            symbol_nodes, edges = _parse_go_regex(content, rel_path, language, file_id, fh)
        elif language == "rust":
            symbol_nodes, edges = _parse_rust_regex(content, rel_path, language, file_id, fh)

    return ParseResult(file_node=file_node, symbol_nodes=symbol_nodes, edges=edges)
|