contextguardrail 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,72 @@
1
+ Metadata-Version: 2.4
2
+ Name: contextguardrail
3
+ Version: 0.1.0
4
+ Summary: Local-first token firewall for AI coding agents
5
+ Requires-Python: >=3.10
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: typer>=0.12
8
+ Requires-Dist: rich>=13.0
9
+ Requires-Dist: networkx>=3.0
10
+ Requires-Dist: tiktoken>=0.7
11
+
12
+ # ContextGuardrail
13
+
14
+ ContextGuardrail is a local-first MVP for reducing AI coding-agent context. It scans a repo, builds a lightweight code graph, selects relevant files for a prompt, prevents replaying already-sent files, caches repeated asks, and reports estimated token/cost savings.
15
+
16
+ ## Install locally
17
+
18
+ ```bash
19
+ cd /path/to/contextguardrail
20
+ python -m venv .venv
21
+ source .venv/bin/activate
22
+ pip install -e .
23
+ ```
24
+
25
+ ## Use
26
+
27
+ ```bash
28
+ contextguardrail init
29
+ contextguardrail index /path/to/repo
30
+ contextguardrail ask "Where is authentication handled?"
31
+ contextguardrail stats
32
+ contextguardrail export
33
+ contextguardrail clean
34
+ ```
35
+
36
+ All state is stored in the indexed repo under `.contextguardrail/`.
37
+
38
+ ## MVP Features
39
+
40
+ - Repo scanner with incremental hashing
41
+ - Python AST parser for imports, classes, functions, and summaries
42
+ - Lightweight dependency graph
43
+ - Context selector using prompt keywords and graph metadata
44
+ - Token counting with `tiktoken` when available, word-count fallback otherwise
45
+ - Semantic cache for repeated prompt and selected-file sets
46
+ - Replay prevention so already-sent files are skipped unless changed
47
+ - Context diffing via file hashes
48
+ - Cost observability through `contextguardrail stats`
49
+
50
+ This version intentionally skips dashboards, multi-user support, Neo4j, and agent orchestration.
51
+
52
+
53
+ ## Project Layout
54
+
55
+ ```
56
+ contextguardrail/
57
+ ├── pyproject.toml # Package metadata, dependencies, CLI entrypoints
58
+ ├── README.md # Project documentation and usage guide
59
+ ├── contextguardrail/
60
+ │ ├── scanner.py # Scan repo and detect files, hashes, changes
61
+ │ ├── config.py # Global settings and configuration loading
62
+ │ ├── budget.py # Token estimation and budget enforcement
63
+ │ ├── exporter.py # Export graph, summaries, and reports
64
+ │ ├── graph.py # Build dependency graph from source code
65
+ │ ├── selector.py # Select most relevant context for a prompt
66
+ │ ├── cache.py # Semantic cache and replay prevention
67
+ │ ├── cli.py # Main CLI commands exposed to users
68
+ │ ├── stats.py # Usage metrics and cost-saving reports
69
+ │ └── storage.py # SQLite helpers and persistence layer
70
+ └── tests/
71
+ └── test_budget.py # Unit tests for token budgeting logic
72
+ ```
@@ -0,0 +1,61 @@
1
+ # ContextGuardrail
2
+
3
+ ContextGuardrail is a local-first MVP for reducing AI coding-agent context. It scans a repo, builds a lightweight code graph, selects relevant files for a prompt, prevents replaying already-sent files, caches repeated asks, and reports estimated token/cost savings.
4
+
5
+ ## Install locally
6
+
7
+ ```bash
8
+ cd /path/to/contextguardrail
9
+ python -m venv .venv
10
+ source .venv/bin/activate
11
+ pip install -e .
12
+ ```
13
+
14
+ ## Use
15
+
16
+ ```bash
17
+ contextguardrail init
18
+ contextguardrail index /path/to/repo
19
+ contextguardrail ask "Where is authentication handled?"
20
+ contextguardrail stats
21
+ contextguardrail export
22
+ contextguardrail clean
23
+ ```
24
+
25
+ All state is stored in the indexed repo under `.contextguardrail/`.
26
+
27
+ ## MVP Features
28
+
29
+ - Repo scanner with incremental hashing
30
+ - Python AST parser for imports, classes, functions, and summaries
31
+ - Lightweight dependency graph
32
+ - Context selector using prompt keywords and graph metadata
33
+ - Token counting with `tiktoken` when available, word-count fallback otherwise
34
+ - Semantic cache for repeated prompt and selected-file sets
35
+ - Replay prevention so already-sent files are skipped unless changed
36
+ - Context diffing via file hashes
37
+ - Cost observability through `contextguardrail stats`
38
+
39
+ This version intentionally skips dashboards, multi-user support, Neo4j, and agent orchestration.
40
+
41
+
42
+ ## Project Layout
43
+
44
+ ```
45
+ contextguardrail/
46
+ ├── pyproject.toml # Package metadata, dependencies, CLI entrypoints
47
+ ├── README.md # Project documentation and usage guide
48
+ ├── contextguardrail/
49
+ │ ├── scanner.py # Scan repo and detect files, hashes, changes
50
+ │ ├── config.py # Global settings and configuration loading
51
+ │ ├── budget.py # Token estimation and budget enforcement
52
+ │ ├── exporter.py # Export graph, summaries, and reports
53
+ │ ├── graph.py # Build dependency graph from source code
54
+ │ ├── selector.py # Select most relevant context for a prompt
55
+ │ ├── cache.py # Semantic cache and replay prevention
56
+ │ ├── cli.py # Main CLI commands exposed to users
57
+ │ ├── stats.py # Usage metrics and cost-saving reports
58
+ │ └── storage.py # SQLite helpers and persistence layer
59
+ └── tests/
60
+ └── test_budget.py # Unit tests for token budgeting logic
61
+ ```
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,19 @@
1
+ from __future__ import annotations
2
+
3
+
4
def estimate_tokens(text: str, model: str = "gpt-4o-mini") -> int:
    """Estimate how many tokens *text* occupies for *model*.

    ``tiktoken`` is used when it can be imported and can encode the text for
    the given model; otherwise the count falls back to a heuristic of
    whitespace-separated words multiplied by 1.3.

    Args:
        text: The text to measure.
        model: Model name handed to ``tiktoken.encoding_for_model``.

    Returns:
        The estimated token count (``0`` for empty text).
    """
    try:
        import tiktoken

        encoding = tiktoken.encoding_for_model(model)
        # Indexed files may contain literal special-token text such as
        # "<|endoftext|>". With tiktoken's default settings that raises
        # ValueError, which would silently push the whole file onto the cruder
        # word-count path below. Passing an empty ``disallowed_special``
        # encodes such text as ordinary characters instead.
        return len(encoding.encode(text, disallowed_special=()))
    except Exception:
        # Deliberate best-effort fallback: tiktoken missing, model unknown, or
        # the encoding could not be loaded.
        return int(len(text.split()) * 1.3)
12
+
13
+
14
def cost_usd(
    input_tokens: int,
    output_tokens: int = 0,
    *,
    input_cost_per_million: float = 0.15,
    output_cost_per_million: float = 0.60,
) -> float:
    """Convert token counts into an estimated USD cost.

    Args:
        input_tokens: Number of input (prompt) tokens.
        output_tokens: Number of output (completion) tokens.
        input_cost_per_million: USD price per one million input tokens.
        output_cost_per_million: USD price per one million output tokens.

    Returns:
        The summed input and output cost in USD.

    The defaults keep the previously hard-coded rates, so existing callers are
    unaffected. NOTE(review): 0.15 / 0.60 presumably match the default model's
    list prices -- confirm before relying on them for other models.
    """
    input_cost = input_tokens / 1_000_000 * input_cost_per_million
    output_cost = output_tokens / 1_000_000 * output_cost_per_million
    return input_cost + output_cost
@@ -0,0 +1,57 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ from pathlib import Path
6
+
7
+ from contextguardrail.storage import connect
8
+
9
+
10
def cache_key(prompt: str, selected_hash: str, model: str) -> str:
    """Return the SHA-256 hex digest identifying a prompt/selection/model triple.

    The three values are joined with newlines before hashing.
    """
    hasher = hashlib.sha256()
    hasher.update(f"{prompt}\n{selected_hash}\n{model}".encode())
    return hasher.hexdigest()
12
+
13
+
14
def selected_files_hash(files: list[dict]) -> str:
    """Fingerprint a file selection by its ordered ``path:hash`` pairs.

    Each item must provide ``"path"`` and ``"hash"`` keys; the pairs are
    joined with ``|`` and hashed with SHA-256.
    """
    pairs = [f"{entry['path']}:{entry['hash']}" for entry in files]
    joined = "|".join(pairs)
    return hashlib.sha256(joined.encode()).hexdigest()
17
+
18
+
19
def get_cache(repo: str | Path, key: str) -> str | None:
    """Look up the cached response stored under *key* in ``cache.db``.

    Returns the stored response text, or ``None`` when no row matches.
    """
    with connect(repo, "cache.db") as db:
        cursor = db.execute("SELECT response FROM cache WHERE key = ?", (key,))
        hit = cursor.fetchone()
    if hit is None:
        return None
    return hit["response"]
23
+
24
+
25
def set_cache(repo: str | Path, key: str, response: str) -> None:
    """Store *response* under *key* in ``cache.db``, replacing any existing row."""
    statement = "INSERT OR REPLACE INTO cache(key, response) VALUES (?, ?)"
    with connect(repo, "cache.db") as db:
        db.execute(statement, (key, response))
31
+
32
+
33
def prompt_hash(prompt: str) -> str:
    """Return the SHA-256 hex digest of the whitespace-trimmed, lower-cased prompt."""
    normalized = prompt.strip().lower()
    return hashlib.sha256(normalized.encode()).hexdigest()
35
+
36
+
37
def already_sent(repo: str | Path, prompt: str) -> dict[str, str]:
    """Return the path -> hash mapping remembered for *prompt*.

    The replay row is keyed by ``prompt_hash(prompt)``; an empty dict is
    returned when nothing has been recorded for the prompt.
    """
    query = "SELECT file_hashes FROM replay WHERE prompt_hash = ?"
    with connect(repo, "cache.db") as db:
        record = db.execute(query, (prompt_hash(prompt),)).fetchone()
    if record is None:
        return {}
    return json.loads(record["file_hashes"])
41
+
42
+
43
def remember_sent(repo: str | Path, prompt: str, files: list[dict]) -> None:
    """Record which files (and their hashes) were sent for *prompt*.

    Any existing replay row for the same prompt hash is replaced. Each item in
    *files* must provide ``"path"`` and ``"hash"`` keys.
    """
    hashes_by_path = {}
    for entry in files:
        hashes_by_path[entry["path"]] = entry["hash"]

    statement = "INSERT OR REPLACE INTO replay(prompt_hash, files, file_hashes) VALUES (?, ?, ?)"
    with connect(repo, "cache.db") as db:
        parameters = (
            prompt_hash(prompt),
            json.dumps(list(hashes_by_path)),
            json.dumps(hashes_by_path),
        )
        db.execute(statement, parameters)
50
+
51
+
52
def clean_cache(repo: str | Path) -> int:
    """Delete every cached response and replay record for *repo*.

    Returns the number of rows that were in the ``cache`` table before
    deletion (replay rows are removed but not counted).
    """
    with connect(repo, "cache.db") as db:
        removed = db.execute("SELECT COUNT(*) AS count FROM cache").fetchone()["count"]
        for statement in ("DELETE FROM cache", "DELETE FROM replay"):
            db.execute(statement)
    return int(removed)
@@ -0,0 +1,154 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ import typer
6
+ from rich.console import Console
7
+ from rich.table import Table
8
+
9
+ from contextguardrail.budget import cost_usd
10
+ from contextguardrail.cache import (
11
+ already_sent,
12
+ cache_key,
13
+ clean_cache,
14
+ get_cache,
15
+ remember_sent,
16
+ selected_files_hash,
17
+ set_cache,
18
+ )
19
+ from contextguardrail.config import DEFAULT_BUDGET, DEFAULT_MODEL, ensure_state, repo_root
20
+ from contextguardrail.exporter import export_repo
21
+ from contextguardrail.graph import graph_counts
22
+ from contextguardrail.scanner import index_repo
23
+ from contextguardrail.selector import select_context
24
+ from contextguardrail.stats import record_request, show_stats
25
+ from contextguardrail.storage import init_storage
26
+
27
# Shared rich console that every command prints through.
console = Console()

# Typer application object on which the commands below are registered.
app = typer.Typer(help="Local-first token firewall for AI coding agents.")
29
+
30
+
31
@app.command()
def init(path: str = typer.Argument(".")):
    """Create .contextguardrail storage for a repo."""
    # NOTE: the docstring above is the CLI help text; keep it unchanged.
    root = repo_root(path)
    init_storage(root)
    state_path = ensure_state(root)
    console.print(f"[green]Initialized[/green] {state_path}")
37
+
38
+
39
@app.command()
def index(
    path: str = typer.Argument("."),
    incremental: bool = typer.Option(False, "--incremental", "-i"),
):
    """Scan a repo and build the local code graph."""
    # NOTE: the docstring above is the CLI help text; keep it unchanged.
    root = repo_root(path)
    result = index_repo(root, incremental=incremental)
    functions, classes = graph_counts(root)

    report = [f"Files scanned: [bold]{result['files_scanned']}[/bold]"]
    if incremental:
        # The skipped count is only reported for incremental runs.
        report.append(f"Files skipped: [bold]{result['files_skipped']}[/bold]")
    report.append(f"Functions: [bold]{functions}[/bold]")
    report.append(f"Classes: [bold]{classes}[/bold]")
    report.append("[green]Graph built.[/green]")

    for line in report:
        console.print(line)
54
+
55
+
56
@app.command()
def ask(
    prompt: str,
    path: str = typer.Option(".", "--path", "-p"),
    model: str = typer.Option(DEFAULT_MODEL, "--model", "-m"),
    budget: int = typer.Option(DEFAULT_BUDGET, "--budget", "-b"),
    include_replay: bool = typer.Option(False, "--include-replay"),
):
    """Select optimized context for a coding prompt."""
    # NOTE: the docstring above is the CLI help text; keep it unchanged.
    root = repo_root(path)

    # First pass ignores replay memory: it yields the selection used for the
    # cache key, so the key depends only on prompt, file hashes and model.
    base_selected, raw_tokens = select_context(root, prompt, budget=budget)
    base_optimized = estimated_optimized_tokens(base_selected)
    key = cache_key(prompt, selected_files_hash(base_selected), model)
    cached = get_cache(root, key)
    if cached is not None:
        # Responses are stored with a "miss" marker; flip it when replaying.
        console.print(cached.replace("Cache: miss", "Cache: hit"))
        record_request(root, raw_tokens, 0, cache_hit=True)
        return

    # Second pass drops files whose hash matches what was already sent for
    # this prompt, unless the caller asked to include them again.
    replay = {} if include_replay else already_sent(root, prompt)
    selected, raw_tokens = select_context(root, prompt, budget=budget, exclude_unchanged=replay)
    # Same formula as the first pass, via the shared helper.
    optimized_tokens = estimated_optimized_tokens(selected)

    lines = ["Files selected:", ""]
    if selected:
        lines.extend(item["path"] for item in selected)
    else:
        lines.append("No changed files since this prompt was last sent.")
    lines.extend(
        [
            "",
            f"Raw Tokens: {raw_tokens:,}",
            f"Optimized Tokens: {optimized_tokens:,}",
            f"Savings: {savings_percent(raw_tokens, optimized_tokens):.1f}%",
            f"Estimated Cost Saved: ${cost_usd(max(0, raw_tokens - optimized_tokens)):.4f}",
        ]
    )
    lines.append("Cache: miss")

    # Only cache output that reflects the full selection; a replay-reduced
    # listing stored under the full-selection key would be misleading.
    if base_optimized == optimized_tokens or not replay:
        set_cache(root, key, "\n".join(lines))

    # Merge into the replay memory instead of overwriting it. remember_sent()
    # replaces the stored row, so passing only the newly selected files would
    # forget unchanged files sent earlier and cause them to be re-sent on the
    # next ask. With --include-replay, `replay` is empty and memory is reset
    # to exactly what was sent now.
    remembered = dict(replay)
    remembered.update({item["path"]: item["hash"] for item in selected})
    remember_sent(
        root,
        prompt,
        [{"path": sent_path, "hash": sent_hash} for sent_path, sent_hash in remembered.items()],
    )

    record_request(root, raw_tokens, optimized_tokens, cache_hit=False)
    console.print("\n".join(lines))
99
+
100
+
101
@app.command()
def analyze(
    prompt: str,
    path: str = typer.Option(".", "--path", "-p"),
    budget: int = typer.Option(DEFAULT_BUDGET, "--budget", "-b"),
):
    """Killer-demo alias for ask with cost framing."""
    # NOTE: the docstring above is the CLI help text; keep it unchanged.
    #
    # ask() is invoked here as a plain Python function, so Typer never resolves
    # its Option(...) defaults. Any parameter left out would arrive as the
    # OptionInfo placeholder object: a truthy `include_replay` (disabling
    # replay prevention) and a non-string `model` inside the cache key. Pass
    # the real default values explicitly.
    ask(
        prompt=prompt,
        path=path,
        model=DEFAULT_MODEL,
        budget=budget,
        include_replay=False,
    )
109
+
110
+
111
@app.command()
def stats(path: str = typer.Argument(".")):
    """Show token and cache savings."""
    # NOTE: the docstring above is the CLI help text; keep it unchanged.
    data = show_stats(repo_root(path))

    table = Table(title="ContextGuardrail Stats")
    table.add_column("Metric")
    table.add_column("Value", justify="right")

    # (row label, stats key) pairs rendered as thousands-separated integers.
    integer_rows = (
        ("Requests", "requests"),
        ("Input tokens saved", "input_tokens_saved"),
        ("Output tokens saved", "output_tokens_saved"),
        ("Cache hits", "cache_hits"),
        ("Raw tokens observed", "raw_tokens"),
        ("Optimized tokens sent", "optimized_tokens"),
    )
    for label, field in integer_rows:
        table.add_row(label, f"{data[field]:,}")
    table.add_row("Estimated cost saved", f"${data['estimated_cost_saved']:.4f}")

    console.print(table)
126
+
127
+
128
@app.command("clean")
def clean(path: str = typer.Argument(".")):
    """Clean semantic cache and replay memory."""
    # NOTE: the docstring above is the CLI help text; keep it unchanged.
    root = repo_root(path)
    removed = clean_cache(root)
    console.print(f"[green]Cleaned[/green] {removed} cached responses")
133
+
134
+
135
@app.command("export")
def export_command(path: str = typer.Argument(".")):
    """Export repo-brain.json, code-dna.json, and ai-gossip.md."""
    # NOTE: the docstring above is the CLI help text; keep it unchanged.
    root = repo_root(path)
    # Print the path of each file the exporter wrote, one per line.
    for written in export_repo(root):
        console.print(written)
141
+
142
+
143
def savings_percent(raw_tokens: int, optimized_tokens: int) -> float:
    """Return the percentage of *raw_tokens* saved by sending *optimized_tokens*.

    The result is never negative, and is ``0.0`` when *raw_tokens* is not
    positive (which also avoids dividing by zero).
    """
    if raw_tokens > 0:
        reduction = (raw_tokens - optimized_tokens) / raw_tokens * 100
        return max(0.0, reduction)
    return 0.0
147
+
148
+
149
def estimated_optimized_tokens(files: list[dict]) -> int:
    """Sum the estimated summary cost of each selected file.

    Each file is charged a quarter of its ``"tokens"`` value (truncated to an
    int and capped at the file's own size), with a floor of 30 tokens.
    """
    total = 0
    for entry in files:
        size = entry["tokens"]
        total += max(30, min(size, int(size * 0.25)))
    return total
151
+
152
+
153
if __name__ == "__main__":
    # Executing this module directly runs the Typer application.
    app()
@@ -0,0 +1,70 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+
6
# Name of the per-repo directory that holds all tool state.
STATE_DIR = ".contextguardrail"
# Model name used when the caller does not pick one.
DEFAULT_MODEL = "gpt-4o-mini"
# Default token budget for a context selection.
DEFAULT_BUDGET = 8_000

# Path components that exclude a file from scanning.
SKIP_DIRS = {
    # version-control metadata
    ".git", ".hg", ".svn",
    # virtual environments
    ".venv", "venv", "env",
    # tool caches
    "__pycache__", ".pytest_cache", ".mypy_cache", ".ruff_cache",
    # dependencies and build output
    "node_modules", "dist", "build",
    # this tool's own state directory
    ".contextguardrail",
}

# File suffixes treated as scannable; the scanner lower-cases the suffix
# before checking membership.
CODE_EXTENSIONS = {
    # Python
    ".py",
    # JavaScript / TypeScript
    ".js", ".jsx", ".ts", ".tsx",
    # compiled languages
    ".go", ".java", ".rs", ".c", ".cc", ".cpp", ".h", ".hpp",
    ".cs", ".swift", ".kt", ".kts", ".scala",
    # scripting and query languages
    ".rb", ".php", ".sh", ".sql",
    # configuration, data and documentation
    ".yaml", ".yml", ".json", ".toml", ".md",
}
56
+
57
+
58
def repo_root(path: str | Path = ".") -> Path:
    """Return *path* as an absolute path with ``~`` expanded and symlinks resolved."""
    expanded = Path(path).expanduser()
    return expanded.resolve()
60
+
61
+
62
def state_dir(path: str | Path = ".") -> Path:
    """Return the state directory path inside the resolved repo root.

    The directory is not created here.
    """
    root = repo_root(path)
    return root / STATE_DIR
64
+
65
+
66
def ensure_state(path: str | Path = ".") -> Path:
    """Create the state directory layout if needed and return the state directory.

    Both the ``cache`` and ``summaries`` subdirectories are created (with
    parents); directories that already exist are left alone.
    """
    root = state_dir(path)
    for child in ("cache", "summaries"):
        (root / child).mkdir(parents=True, exist_ok=True)
    return root
@@ -0,0 +1,35 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ from contextguardrail.config import ensure_state
7
+ from contextguardrail.graph import load_graph
8
+ from contextguardrail.storage import connect, load_stats
9
+
10
+
11
def export_repo(repo: str | Path = ".") -> list[Path]:
    """Write the three export artefacts into the repo's state directory.

    * ``repo-brain.json`` -- all file rows, symbol rows, graph edges and stats.
    * ``code-dna.json`` -- counts of files, symbols and edges.
    * ``ai-gossip.md`` -- up to 20 graph nodes ordered by descending degree.

    Returns:
        The three written paths, in the order listed above.
    """
    state = ensure_state(repo)
    with connect(repo, "hashes.db") as files_db, connect(repo, "graph.db") as graph_db:
        file_rows = files_db.execute("SELECT * FROM files ORDER BY path").fetchall()
        files = list(map(dict, file_rows))
        symbol_rows = graph_db.execute("SELECT * FROM symbols ORDER BY path").fetchall()
        symbols = list(map(dict, symbol_rows))
    graph = load_graph(repo)

    edges = []
    for source, target, attrs in graph.edges(data=True):
        edges.append({"source": source, "target": target, **attrs})
    brain = {
        "files": files,
        "symbols": symbols,
        "edges": edges,
        "stats": load_stats(repo),
    }

    repo_brain = state / "repo-brain.json"
    code_dna = state / "code-dna.json"
    ai_gossip = state / "ai-gossip.md"

    repo_brain.write_text(json.dumps(brain, indent=2) + "\n", encoding="utf-8")

    dna = {"files": len(files), "symbols": len(symbols), "edges": graph.number_of_edges()}
    code_dna.write_text(json.dumps(dna, indent=2) + "\n", encoding="utf-8")

    # The sort is stable, so nodes with equal degree keep the graph's order.
    ranked = sorted(graph.degree, key=lambda pair: pair[1], reverse=True)
    gossip = ["# AI Gossip", "", "Most connected files:"]
    gossip.extend(f"- {node}: {degree} links" for node, degree in ranked[:20])
    ai_gossip.write_text("\n".join(gossip) + "\n", encoding="utf-8")

    return [repo_brain, code_dna, ai_gossip]
@@ -0,0 +1,108 @@
1
+ from __future__ import annotations
2
+
3
+ import ast
4
+ import re
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import networkx as nx
9
+
10
+ from contextguardrail.storage import connect
11
+
12
+
13
# Identifier-like words of at least two characters.
WORD_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]+")


def parse_python(text: str) -> dict[str, list[str]]:
    """Extract import, class and function names from Python source.

    The whole tree is walked, so nested functions and methods are included.
    ``from ... import`` statements contribute only their module name, and only
    when one is present.

    Returns:
        A dict with sorted, de-duplicated ``"imports"``, ``"classes"`` and
        ``"functions"`` lists.

    Raises:
        SyntaxError: If *text* is not valid Python. Other errors raised by
            ``ast.parse`` propagate as well.
    """
    tree = ast.parse(text)
    imports: set[str] = set()
    classes: set[str] = set()
    functions: set[str] = set()
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            imports.update(alias.name for alias in node.names)
        elif isinstance(node, ast.ImportFrom):
            if node.module:
                imports.add(node.module)
        elif isinstance(node, ast.ClassDef):
            classes.add(node.name)
        elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            functions.add(node.name)
    return {
        "imports": sorted(imports),
        "classes": sorted(classes),
        "functions": sorted(functions),
    }


def parse_file(path: Path, rel_path: str, text: str) -> dict[str, Any]:
    """Build the symbol record for one file.

    Python files are parsed with :func:`parse_python`; all other files, and
    Python files that cannot be parsed, get empty symbol lists. Keywords are
    collected for every file from the relative path, the first 8,000
    characters of text and the symbol names.

    Args:
        path: Path of the file; only its suffix is inspected.
        rel_path: Repo-relative path, used as a keyword source.
        text: Decoded file contents.

    Returns:
        A dict with ``"imports"``, ``"classes"``, ``"functions"`` and
        ``"keywords"`` (sorted, lower-cased, longer than two characters).
    """
    data: dict[str, Any] = {"imports": [], "classes": [], "functions": []}
    # Compare the suffix case-insensitively, as the scanner does when deciding
    # which files to index.
    if path.suffix.lower() == ".py":
        try:
            data = parse_python(text)
        except (SyntaxError, ValueError, RecursionError):
            # Best-effort: an unparseable file is still indexed by keywords.
            # ValueError covers NUL bytes in the source on Python < 3.12
            # (3.12+ raises SyntaxError); RecursionError covers pathologically
            # nested code. One bad file must not abort the whole index run.
            pass
    words = WORD_RE.findall(rel_path + " " + text[:8_000])
    symbols = data["imports"] + data["classes"] + data["functions"]
    data["keywords"] = sorted({word.lower() for word in words + symbols if len(word) > 2})
    return data
49
+
50
+
51
def summarize_file(rel_path: str, text: str, parsed: dict[str, Any]) -> str:
    """Render a short plain-text summary of a file.

    The summary lists the path, then up to 30 functions, 30 classes and 20
    imports (each only when non-empty), then the first non-blank line of
    *text* with surrounding ``#``/space characters stripped and truncated to
    240 characters.
    """
    parts = [f"File: {rel_path}"]
    sections = (
        ("Functions", "functions", 30),
        ("Classes", "classes", 30),
        ("Imports", "imports", 20),
    )
    for label, key, limit in sections:
        names = parsed[key]
        if names:
            parts.append(f"{label}: " + ", ".join(names[:limit]))

    first_line = ""
    for raw in text.splitlines():
        if raw.strip():
            # Only the first non-blank line is considered, even if stripping
            # the comment marker leaves it empty.
            first_line = raw.strip("# ").strip()
            break
    if first_line:
        parts.append("First line: " + first_line[:240])

    return "\n".join(parts)
63
+
64
+
65
def upsert_symbols(repo: str | Path, rel_path: str, parsed: dict[str, Any]) -> None:
    """Insert or update the ``symbols`` row for *rel_path* in ``graph.db``.

    Each list in *parsed* (imports, classes, functions, keywords) is stored as
    a newline-joined string.
    """
    statement = """
        INSERT INTO symbols(path, imports, classes, functions, keywords)
        VALUES (?, ?, ?, ?, ?)
        ON CONFLICT(path) DO UPDATE SET
            imports=excluded.imports,
            classes=excluded.classes,
            functions=excluded.functions,
            keywords=excluded.keywords
        """
    columns = ("imports", "classes", "functions", "keywords")
    with connect(repo, "graph.db") as db:
        joined = tuple("\n".join(parsed[column]) for column in columns)
        db.execute(statement, (rel_path, *joined))
85
+
86
+
87
def load_graph(repo: str | Path) -> nx.DiGraph:
    """Build a directed import graph from the ``symbols`` table.

    Every indexed path becomes a node carrying its ``classes`` and
    ``functions`` lists. An ``imports`` edge is added from a file to another
    indexed file when an imported module name equals that file's path with
    the suffix removed and ``/`` replaced by ``.``.
    """
    with connect(repo, "graph.db") as db:
        rows = db.execute("SELECT path, imports, classes, functions FROM symbols").fetchall()

    paths = {row["path"] for row in rows}
    module_to_path = {}
    for known_path in paths:
        dotted = Path(known_path).with_suffix("").as_posix().replace("/", ".")
        module_to_path[dotted] = known_path

    graph = nx.DiGraph()
    for row in rows:
        source = row["path"]
        graph.add_node(
            source,
            classes=row["classes"].splitlines(),
            functions=row["functions"].splitlines(),
        )
        for module_name in row["imports"].splitlines():
            target = module_to_path.get(module_name)
            if not target:
                # Imports that do not map to an indexed file are ignored.
                continue
            graph.add_edge(source, target, type="imports")
    return graph
101
+
102
+
103
def graph_counts(repo: str | Path) -> tuple[int, int]:
    """Count the function and class names stored across all symbol rows.

    Returns:
        ``(functions, classes)`` -- note that functions come first.
    """
    with connect(repo, "graph.db") as db:
        rows = db.execute("SELECT classes, functions FROM symbols").fetchall()

    class_total = 0
    function_total = 0
    for row in rows:
        if row["classes"]:
            class_total += len(row["classes"].splitlines())
        if row["functions"]:
            function_total += len(row["functions"].splitlines())
    return function_total, class_total
@@ -0,0 +1,80 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ from pathlib import Path
5
+
6
+ from contextguardrail.budget import estimate_tokens
7
+ from contextguardrail.config import CODE_EXTENSIONS, SKIP_DIRS, ensure_state, repo_root
8
+ from contextguardrail.graph import parse_file, summarize_file, upsert_symbols
9
+ from contextguardrail.storage import connect, init_storage
10
+
11
+
12
def file_hash(path: Path) -> str:
    """Return the SHA-256 hex digest of the file's raw bytes."""
    hasher = hashlib.sha256()
    hasher.update(path.read_bytes())
    return hasher.hexdigest()
14
+
15
+
16
def should_scan(path: Path, rel_path: Path) -> bool:
    """Decide whether *path* should be indexed.

    A path is rejected when any component of *rel_path* is in ``SKIP_DIRS``;
    otherwise it must be a regular file whose lower-cased suffix is in
    ``CODE_EXTENSIONS``.
    """
    for part in rel_path.parts:
        if part in SKIP_DIRS:
            return False
    if not path.is_file():
        return False
    return path.suffix.lower() in CODE_EXTENSIONS
20
+
21
+
22
def iter_files(root: Path):
    """Yield every path under *root* that ``should_scan`` accepts."""
    yield from (
        candidate
        for candidate in root.rglob("*")
        if should_scan(candidate, candidate.relative_to(root))
    )
26
+
27
+
28
def read_text(path: Path) -> str:
    """Read *path* as UTF-8 text, silently dropping undecodable bytes."""
    with path.open(mode="r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()
30
+
31
+
32
def index_repo(path: str | Path = ".", incremental: bool = False) -> dict[str, int]:
    """Scan a repo and refresh its file, summary and symbol records.

    For every scannable file the content hash is computed. In incremental
    mode a file whose stored hash matches is skipped. Otherwise the file is
    read, token-counted, parsed and summarised; the summary is written under
    ``summaries/`` in the state directory, and the ``files`` and ``symbols``
    rows are upserted.

    Args:
        path: Repository root (resolved via ``repo_root``).
        incremental: Skip files whose stored hash is unchanged.

    Returns:
        Counts under ``files_scanned``, ``files_changed``, ``files_skipped``
        and ``raw_tokens``. ``files_changed`` always equals ``files_scanned``.
    """
    root = repo_root(path)
    state = ensure_state(root)
    init_storage(root)

    lookup_sql = "SELECT hash FROM files WHERE path = ?"
    upsert_sql = """
        INSERT INTO files(path, hash, size, mtime, tokens, summary)
        VALUES (?, ?, ?, ?, ?, ?)
        ON CONFLICT(path) DO UPDATE SET
            hash=excluded.hash,
            size=excluded.size,
            mtime=excluded.mtime,
            tokens=excluded.tokens,
            summary=excluded.summary,
            updated_at=CURRENT_TIMESTAMP
        """

    totals = {
        "files_scanned": 0,
        "files_changed": 0,
        "files_skipped": 0,
        "raw_tokens": 0,
    }
    for file_path in iter_files(root):
        rel_path = file_path.relative_to(root).as_posix()
        digest = file_hash(file_path)
        stat = file_path.stat()
        with connect(root, "hashes.db") as db:
            existing = db.execute(lookup_sql, (rel_path,)).fetchone()

        unchanged = existing is not None and existing["hash"] == digest
        if incremental and unchanged:
            totals["files_skipped"] += 1
            continue

        text = read_text(file_path)
        tokens = estimate_tokens(text)
        parsed = parse_file(file_path, rel_path, text)
        summary = summarize_file(rel_path, text, parsed)

        # Summary files are named by the hash of the relative path.
        summary_name = f"{hashlib.sha256(rel_path.encode()).hexdigest()}.md"
        (state / "summaries" / summary_name).write_text(summary + "\n", encoding="utf-8")

        row = (rel_path, digest, stat.st_size, stat.st_mtime, tokens, summary)
        with connect(root, "hashes.db") as db:
            db.execute(upsert_sql, row)
        upsert_symbols(root, rel_path, parsed)

        totals["files_scanned"] += 1
        totals["files_changed"] += 1
        totals["raw_tokens"] += tokens

    return totals
@@ -0,0 +1,82 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from pathlib import Path
5
+
6
+ from contextguardrail.config import DEFAULT_BUDGET
7
+ from contextguardrail.storage import connect
8
+
9
+
10
# Identifier-like words of at least two characters.
WORD_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]+")


def prompt_terms(prompt: str) -> set[str]:
    """Return the lower-cased identifier-like words of *prompt* longer than two characters."""
    terms = set()
    for match in WORD_RE.finditer(prompt):
        word = match.group()
        if len(word) > 2:
            terms.add(word.lower())
    return terms
15
+
16
+
17
def score_row(row, terms: set[str]) -> int:
    """Score how relevant a file record is to the prompt terms.

    Per term: +5 when it occurs in the path, +3 when it occurs in the class
    or function names, and +1 when it occurs anywhere in the combined path,
    summary, keywords, classes and functions text. A flat +1 is added when
    the path contains one of a fixed set of hint substrings. All matching is
    case-insensitive substring matching against lower-cased text.
    """
    haystack = "\n".join(
        [row["path"], row["summary"], row["keywords"], row["classes"], row["functions"]]
    ).lower()
    path_lc = row["path"].lower()
    classes_lc = row["classes"].lower()
    functions_lc = row["functions"].lower()

    score = 0
    for term in terms:
        if term in path_lc:
            score += 5
        if term in classes_lc or term in functions_lc:
            score += 3
        if term in haystack:
            score += 1

    path_hints = ("auth", "user", "api", "cache", "config", "setting")
    if any(hint in path_lc for hint in path_hints):
        score += 1
    return score
27
+
28
+
29
def select_context(
    repo: str | Path,
    prompt: str,
    budget: int = DEFAULT_BUDGET,
    exclude_unchanged: dict[str, str] | None = None,
) -> tuple[list[dict], int]:
    """Pick the indexed files most relevant to *prompt* within a token budget.

    Args:
        repo: Repository whose index databases are read.
        prompt: Prompt text; its terms drive the relevance scoring.
        budget: Maximum summed summary-token cost of the selection. The
            top-ranked candidate is always taken, even if it exceeds this.
        exclude_unchanged: Optional path -> hash mapping; files whose current
            hash matches are left out of the selection.

    Returns:
        ``(selected, raw_tokens)``: the chosen file records (path, hash,
        tokens, summary, classes, functions, keywords) and the total token
        count of all indexed files, including excluded ones.
    """
    terms = prompt_terms(prompt)
    already_known = exclude_unchanged or {}

    with connect(repo, "hashes.db") as files_db, connect(repo, "graph.db") as graph_db:
        file_rows = files_db.execute(
            "SELECT path, hash, tokens, summary FROM files ORDER BY path"
        ).fetchall()
        symbol_rows = graph_db.execute(
            "SELECT path, classes, functions, keywords FROM symbols"
        ).fetchall()
    symbols = {record["path"]: record for record in symbol_rows}

    raw_tokens = 0
    scored = []
    for row in file_rows:
        size = int(row["tokens"])
        raw_tokens += size
        if already_known.get(row["path"]) == row["hash"]:
            continue
        symbol = symbols.get(row["path"])
        has_symbol = symbol is not None
        entry = {
            "path": row["path"],
            "hash": row["hash"],
            "tokens": size,
            "summary": row["summary"],
            "classes": symbol["classes"] if has_symbol else "",
            "functions": symbol["functions"] if has_symbol else "",
            "keywords": symbol["keywords"] if has_symbol else "",
        }
        relevance = score_row(entry, terms)
        if relevance > 0:
            scored.append((relevance, entry))

    # Highest score first; ties go to the smaller file, then to path order.
    scored.sort(key=lambda pair: (-pair[0], pair[1]["tokens"], pair[1]["path"]))

    chosen = []
    spent = 0
    for _, entry in scored:
        cost = max(30, min(entry["tokens"], int(entry["tokens"] * 0.25)))
        if chosen and spent + cost > budget:
            # Too big for the remaining budget; a smaller candidate may fit.
            continue
        chosen.append(entry)
        spent += cost
        if spent >= budget:
            break

    # NOTE: the first candidate is always taken above, so `chosen` can only be
    # empty when `scored` is empty, and this fallback then adds nothing.
    if not chosen:
        chosen = [entry for _, entry in scored[:5]]
    return chosen, raw_tokens
@@ -0,0 +1,31 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from contextguardrail.budget import cost_usd
6
+ from contextguardrail.storage import load_stats, save_stats
7
+
8
+
9
def record_request(
    repo: str | Path,
    raw_tokens: int,
    optimized_tokens: int,
    cache_hit: bool = False,
) -> dict:
    """Add one request to the persisted usage statistics.

    Tokens saved are ``raw_tokens - optimized_tokens`` floored at zero; the
    running cost estimate is rounded to four decimal places.

    Returns:
        The updated statistics dict, after it has been saved.
    """
    stats = load_stats(repo)
    saved = max(0, raw_tokens - optimized_tokens)

    increments = {
        "requests": 1,
        "raw_tokens": raw_tokens,
        "optimized_tokens": optimized_tokens,
        "input_tokens_saved": saved,
    }
    for field, amount in increments.items():
        stats[field] += amount

    previous_cost = stats.get("estimated_cost_saved", 0.0)
    stats["estimated_cost_saved"] = round(previous_cost + cost_usd(saved), 4)

    if cache_hit:
        stats["cache_hits"] += 1

    save_stats(repo, stats)
    return stats
28
+
29
+
30
def show_stats(repo: str | Path = ".") -> dict:
    """Return the persisted usage statistics for *repo*."""
    current = load_stats(repo)
    return current
@@ -0,0 +1,94 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import sqlite3
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from contextguardrail.config import ensure_state
9
+
10
+
11
def db_path(repo: str | Path, name: str) -> Path:
    """Return the path of database file *name* in the repo's state directory.

    The state directory is created if it does not exist yet.
    """
    state_root = ensure_state(repo)
    return state_root / name
13
+
14
+
15
class _ClosingConnection(sqlite3.Connection):
    """SQLite connection whose ``with`` block also closes it on exit."""

    def __exit__(self, exc_type, exc_value, traceback):
        try:
            # Stock behaviour first: commit on success, roll back on error.
            return super().__exit__(exc_type, exc_value, traceback)
        finally:
            # The stock context manager only ends the transaction and leaves
            # the connection open; close it so handles are not leaked.
            self.close()


def connect(repo: str | Path, name: str) -> sqlite3.Connection:
    """Open the SQLite database *name* in the repo's state directory.

    Rows are returned as ``sqlite3.Row`` so columns can be read by name.

    The returned connection is meant to be used as ``with connect(...) as db``.
    Leaving the block commits (or rolls back on an exception) and then closes
    the connection, so it must not be used after the block. Rows already
    fetched remain usable.
    """
    connection = sqlite3.connect(db_path(repo, name), factory=_ClosingConnection)
    connection.row_factory = sqlite3.Row
    return connection
19
+
20
+
21
def init_storage(repo: str | Path = ".") -> None:
    """Create the state directory, the database tables and the stats file.

    Tables are created only if missing: ``files`` in ``hashes.db``,
    ``symbols`` in ``graph.db``, and ``cache`` plus ``replay`` in
    ``cache.db``. The stats file is then rewritten with the current stats,
    which are the zeroed defaults when the file did not exist yet.
    """
    ensure_state(repo)

    # (database file, DDL statements to run in it), in creation order.
    schema = (
        (
            "hashes.db",
            (
                """
                CREATE TABLE IF NOT EXISTS files(
                    path TEXT PRIMARY KEY,
                    hash TEXT NOT NULL,
                    size INTEGER NOT NULL,
                    mtime REAL NOT NULL,
                    tokens INTEGER NOT NULL,
                    summary TEXT NOT NULL,
                    updated_at TEXT DEFAULT CURRENT_TIMESTAMP
                )
                """,
            ),
        ),
        (
            "graph.db",
            (
                """
                CREATE TABLE IF NOT EXISTS symbols(
                    path TEXT PRIMARY KEY,
                    imports TEXT NOT NULL,
                    classes TEXT NOT NULL,
                    functions TEXT NOT NULL,
                    keywords TEXT NOT NULL
                )
                """,
            ),
        ),
        (
            "cache.db",
            (
                """
                CREATE TABLE IF NOT EXISTS cache(
                    key TEXT PRIMARY KEY,
                    response TEXT NOT NULL,
                    created_at TEXT DEFAULT CURRENT_TIMESTAMP
                )
                """,
                """
                CREATE TABLE IF NOT EXISTS replay(
                    prompt_hash TEXT PRIMARY KEY,
                    files TEXT NOT NULL,
                    file_hashes TEXT NOT NULL,
                    created_at TEXT DEFAULT CURRENT_TIMESTAMP
                )
                """,
            ),
        ),
    )
    for db_name, statements in schema:
        with connect(repo, db_name) as db:
            for ddl in statements:
                db.execute(ddl)

    target = stats_file(repo)
    target.write_text(json.dumps(load_stats(repo), indent=2) + "\n", encoding="utf-8")
72
+
73
+
74
def stats_file(repo: str | Path) -> Path:
    """Return the path of ``costs.json`` in the repo's state directory.

    The state directory is created if needed; the file itself is not.
    """
    state_root = ensure_state(repo)
    return state_root / "costs.json"
76
+
77
+
78
def load_stats(repo: str | Path) -> dict[str, Any]:
    """Load the persisted usage statistics for *repo*.

    Returns the parsed contents of the stats file, or a fresh dict with every
    counter at zero when the file does not exist.
    """
    path = stats_file(repo)
    if path.exists():
        return json.loads(path.read_text(encoding="utf-8"))
    return {
        "requests": 0,
        "input_tokens_saved": 0,
        "output_tokens_saved": 0,
        "cache_hits": 0,
        "raw_tokens": 0,
        "optimized_tokens": 0,
        "estimated_cost_saved": 0.0,
    }
91
+
92
+
93
def save_stats(repo: str | Path, stats: dict[str, Any]) -> None:
    """Write *stats* to the repo's stats file as indented JSON with a trailing newline."""
    target = stats_file(repo)
    target.write_text(json.dumps(stats, indent=2) + "\n", encoding="utf-8")
@@ -0,0 +1,72 @@
1
+ Metadata-Version: 2.4
2
+ Name: contextguardrail
3
+ Version: 0.1.0
4
+ Summary: Local-first token firewall for AI coding agents
5
+ Requires-Python: >=3.10
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: typer>=0.12
8
+ Requires-Dist: rich>=13.0
9
+ Requires-Dist: networkx>=3.0
10
+ Requires-Dist: tiktoken>=0.7
11
+
12
+ # ContextGuardrail
13
+
14
+ ContextGuardrail is a local-first MVP for reducing AI coding-agent context. It scans a repo, builds a lightweight code graph, selects relevant files for a prompt, prevents replaying already-sent files, caches repeated asks, and reports estimated token/cost savings.
15
+
16
+ ## Install locally
17
+
18
+ ```bash
19
+ cd /path/to/contextguardrail
20
+ python -m venv .venv
21
+ source .venv/bin/activate
22
+ pip install -e .
23
+ ```
24
+
25
+ ## Use
26
+
27
+ ```bash
28
+ contextguardrail init
29
+ contextguardrail index /path/to/repo
30
+ contextguardrail ask "Where is authentication handled?"
31
+ contextguardrail stats
32
+ contextguardrail export
33
+ contextguardrail clean
34
+ ```
35
+
36
+ All state is stored in the indexed repo under `.contextguardrail/`.
37
+
38
+ ## MVP Features
39
+
40
+ - Repo scanner with incremental hashing
41
+ - Python AST parser for imports, classes, functions, and summaries
42
+ - Lightweight dependency graph
43
+ - Context selector using prompt keywords and graph metadata
44
+ - Token counting with `tiktoken` when available, word-count fallback otherwise
45
+ - Semantic cache for repeated prompt and selected-file sets
46
+ - Replay prevention so already-sent files are skipped unless changed
47
+ - Context diffing via file hashes
48
+ - Cost observability through `contextguardrail stats`
49
+
50
+ This version intentionally skips dashboards, multi-user support, Neo4j, and agent orchestration.
51
+
52
+
53
+ ## Project Layout
54
+
55
+ ```
56
+ contextguardrail/
57
+ ├── pyproject.toml # Package metadata, dependencies, CLI entrypoints
58
+ ├── README.md # Project documentation and usage guide
59
+ ├── contextguardrail/
60
+ │ ├── scanner.py # Scan repo and detect files, hashes, changes
61
+ │ ├── config.py # Global settings and configuration loading
62
+ │ ├── budget.py # Token estimation and budget enforcement
63
+ │ ├── exporter.py # Export graph, summaries, and reports
64
+ │ ├── graph.py # Build dependency graph from source code
65
+ │ ├── selector.py # Select most relevant context for a prompt
66
+ │ ├── cache.py # Semantic cache and replay prevention
67
+ │ ├── cli.py # Main CLI commands exposed to users
68
+ │ ├── stats.py # Usage metrics and cost-saving reports
69
+ │ └── storage.py # SQLite helpers and persistence layer
70
+ └── tests/
71
+ └── test_budget.py # Unit tests for token budgeting logic
72
+ ```
@@ -0,0 +1,20 @@
1
+ README.md
2
+ pyproject.toml
3
+ contextguardrail/__init__.py
4
+ contextguardrail/budget.py
5
+ contextguardrail/cache.py
6
+ contextguardrail/cli.py
7
+ contextguardrail/config.py
8
+ contextguardrail/exporter.py
9
+ contextguardrail/graph.py
10
+ contextguardrail/scanner.py
11
+ contextguardrail/selector.py
12
+ contextguardrail/stats.py
13
+ contextguardrail/storage.py
14
+ contextguardrail.egg-info/PKG-INFO
15
+ contextguardrail.egg-info/SOURCES.txt
16
+ contextguardrail.egg-info/dependency_links.txt
17
+ contextguardrail.egg-info/entry_points.txt
18
+ contextguardrail.egg-info/requires.txt
19
+ contextguardrail.egg-info/top_level.txt
20
+ tests/test_budget.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ contextguardrail = contextguardrail.cli:app
@@ -0,0 +1,4 @@
1
+ typer>=0.12
2
+ rich>=13.0
3
+ networkx>=3.0
4
+ tiktoken>=0.7
@@ -0,0 +1 @@
1
+ contextguardrail
@@ -0,0 +1,22 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "contextguardrail"
7
+ version = "0.1.0"
8
+ description = "Local-first token firewall for AI coding agents"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ dependencies = [
12
+ "typer>=0.12",
13
+ "rich>=13.0",
14
+ "networkx>=3.0",
15
+ "tiktoken>=0.7"
16
+ ]
17
+
18
+ [project.scripts]
19
+ contextguardrail = "contextguardrail.cli:app"
20
+
21
+ [tool.setuptools.packages.find]
22
+ include = ["contextguardrail*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,5 @@
1
+ from contextguardrail.budget import estimate_tokens
2
+
3
+
4
def test_estimate_tokens_returns_positive_count():
    """A non-empty string must produce a positive token estimate."""
    token_count = estimate_tokens("hello world")
    assert token_count > 0