fittok 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fittok/__init__.py ADDED
@@ -0,0 +1,27 @@
1
+ """Fittok — relevant-code retrieval within a token budget.
2
+
3
+ Usable three ways, each standalone:
4
+ - As a library: ``from fittok import optimize``
5
+ - As a CLI: ``fittok query <path> "<question>"``
6
+ - As an MCP server: ``python -m fittok`` (for AI clients)
7
+ """
8
+
9
__version__ = "0.3.0"

# Names exported by ``from fittok import *``.
__all__ = [
    "optimize",
    "index",
    "__version__",
]


def optimize(codebase_path: str, query: str, token_budget: int = 0) -> dict:
    """Fetch the source code most relevant to *query*, capped at *token_budget*.

    Library entry point that needs no MCP client: it forwards its arguments to
    ``fittok.server.optimize_context_tool``. ``token_budget=0`` selects adaptive
    sizing. The result is a dict with ``optimized_context``, ``graph_stats``,
    ``slurp_stats`` and ``savings``.
    """
    # Function-local import: ``import fittok`` alone does not import the server.
    from .server import optimize_context_tool as _optimize_context

    return _optimize_context(codebase_path, query, token_budget)


def index(codebase_path: str) -> dict:
    """Build and cache the graph and embeddings for the codebase at *codebase_path*."""
    # Function-local import, for the same reason as in ``optimize``.
    from .indexer import index_codebase as _index_codebase

    return _index_codebase(codebase_path)
fittok/__main__.py ADDED
@@ -0,0 +1,5 @@
1
"""Entry point for ``python -m fittok``: runs ``main`` from ``fittok.server``."""

from . import server

server.main()
fittok/cache.py ADDED
@@ -0,0 +1,223 @@
1
+ """3-level caching layer for fittok.
2
+
3
+ Levels:
4
+ 1. Graph cache — keyed by (root_path, file_mtimes_hash)
5
+ 2. Query cache — keyed by (graph_path, query_hash, token_budget)
6
+ 3. Compression cache — keyed by (context_hash, question_hash, target_tokens)
7
+
8
+ Uses diskcache for persistent storage. Cache dir: ~/.cache/fittok/ (override with FITTOK_CACHE_DIR). A fourth, content-keyed embedding cache lives in the same store.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import hashlib
14
+ import logging
15
+ import os
16
+ from pathlib import Path
17
+ from typing import Any, Optional
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
# Directory backing the on-disk cache; FITTOK_CACHE_DIR overrides the default.
CACHE_DIR = os.environ.get(
    "FITTOK_CACHE_DIR",
    os.path.expanduser("~/.cache/fittok"),
)

_DEFAULT_CACHE_MAX_MB = 500


def _env_megabytes(name: str, default_mb: int) -> int:
    """Read a whole-megabyte size from environment variable *name*, in bytes.

    Returns ``default_mb`` (converted to bytes) when the variable is unset or
    is not an integer. A malformed value is reported with a warning instead of
    raising ``ValueError`` while the module is being imported.
    """
    raw = os.environ.get(name)
    if raw is None:
        megabytes = default_mb
    else:
        try:
            megabytes = int(raw)
        except ValueError:
            logging.getLogger(__name__).warning(
                "Ignoring invalid %s=%r; using %d MB", name, raw, default_mb
            )
            megabytes = default_mb
    return megabytes * 1024 * 1024


# Size limit (bytes) handed to diskcache; set FITTOK_CACHE_MAX_MB in megabytes.
MAX_CACHE_SIZE = _env_megabytes("FITTOK_CACHE_MAX_MB", _DEFAULT_CACHE_MAX_MB)

# Stats tracking: per-process hit/miss counters for each cache level.
_stats: dict[str, int] = {
    "graph_hits": 0,
    "graph_misses": 0,
    "query_hits": 0,
    "query_misses": 0,
    "compression_hits": 0,
    "compression_misses": 0,
}
31
+
32
_cache = None
# Set once initialisation has failed, so the failure is logged a single time
# and later calls return immediately instead of retrying.
_cache_unavailable = False


def _get_cache():
    """Return the shared ``diskcache.Cache``, creating it on first use.

    Returns ``None`` when caching cannot be used: either ``diskcache`` is not
    installed, or the cache directory cannot be created/opened (``OSError``).
    Both outcomes are remembered for the life of the process, so the warning is
    emitted once rather than on every cache access.
    """
    global _cache, _cache_unavailable
    if _cache is not None:
        return _cache
    if _cache_unavailable:
        return None
    try:
        import diskcache
    except ImportError:
        logger.warning("diskcache not installed — caching disabled. Install with: pip install diskcache")
        _cache_unavailable = True
        return None
    try:
        os.makedirs(CACHE_DIR, exist_ok=True)
        _cache = diskcache.Cache(CACHE_DIR, size_limit=MAX_CACHE_SIZE)
    except OSError:
        # Caching is an optional optimisation: an unwritable directory should
        # disable it, not take down every caller that merely looks something up.
        logger.warning("Cannot open cache directory %s; caching disabled", CACHE_DIR, exc_info=True)
        _cache_unavailable = True
        return None
    return _cache
48
+
49
+
50
+ # ── Hashing helpers ───────────────────────────────────────────────────────────
51
+
52
def _hash_str(text: str) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of *text*."""
    hasher = hashlib.sha256()
    hasher.update(text.encode())
    hex_digest = hasher.hexdigest()
    return hex_digest[:16]
54
+
55
+
56
def _hash_mtimes(root_path: str) -> str:
    """Fingerprint the code files under *root_path* by relative path and mtime.

    Only files whose suffix is a key of ``graphify._EXT_TO_LANG`` contribute.
    Directories named in ``skip_dirs`` are pruned from the walk. If *root_path*
    is not a directory, the hash of the path string itself is returned.

    Both directory names and file names are visited in sorted order, so the
    result does not depend on the order in which the OS lists directory entries.
    """
    from .graphify import _EXT_TO_LANG

    root = Path(root_path).resolve()
    if not root.is_dir():
        return _hash_str(root_path)
    skip_dirs = {
        "node_modules", ".git", "__pycache__", ".venv", "venv",
        "dist", "build", ".tox", ".mypy_cache", ".pytest_cache",
        ".next", ".nuxt", "target", "vendor", ".gradle",
    }
    mtimes: list[str] = []
    for dirpath, dirnames, filenames in os.walk(root):
        # In-place assignment both prunes skipped directories and fixes the
        # order in which os.walk descends into the remaining ones.
        dirnames[:] = sorted(d for d in dirnames if d not in skip_dirs)
        base = Path(dirpath)
        for filename in sorted(filenames):
            p = base / filename
            if p.suffix not in _EXT_TO_LANG:
                continue
            try:
                mtimes.append(f"{p.relative_to(root)}:{p.stat().st_mtime}")
            except OSError:
                # The file could not be stat'ed (e.g. removed mid-walk or a
                # dangling symlink); leave it out of the fingerprint.
                continue
    return _hash_str("|".join(mtimes))
78
+
79
+
80
+ # ── Graph cache ───────────────────────────────────────────────────────────────
81
+
82
def get_cached_graph(root_path: str) -> Optional[Any]:
    """Return the cached ``KnowledgeGraph`` for *root_path*, or ``None``.

    The key combines *root_path* with the current mtime fingerprint of its code
    files, so any change to those files results in a miss. Returns ``None`` when
    caching is unavailable, when there is no entry, or when the stored payload
    no longer validates against ``KnowledgeGraph`` (the stale entry is deleted
    and counted as a miss).
    """
    cache = _get_cache()
    if cache is None:
        return None
    key = f"graph:{root_path}:{_hash_mtimes(root_path)}"
    payload = cache.get(key)
    if payload is None:
        _stats["graph_misses"] += 1
        return None

    from .models import KnowledgeGraph

    try:
        graph = KnowledgeGraph.model_validate(payload)
    except ValueError:
        # pydantic's ValidationError derives from ValueError. A payload written
        # by a different schema version must behave like a miss so the caller
        # rebuilds the graph, rather than failing the whole lookup.
        logger.warning("Discarding invalid cached graph for %s", root_path, exc_info=True)
        cache.delete(key)
        _stats["graph_misses"] += 1
        return None
    _stats["graph_hits"] += 1
    logger.debug("Graph cache HIT: %s", root_path)
    return graph
96
+
97
+
98
def set_cached_graph(root_path: str, graph: Any) -> None:
    """Cache *graph* (as ``graph.model_dump()``) for *root_path* for 24 hours.

    The key embeds the current mtime fingerprint of the code files under
    *root_path*. Does nothing when caching is unavailable.
    """
    store = _get_cache()
    if store is not None:
        one_day_seconds = 3600 * 24
        store.set(
            f"graph:{root_path}:{_hash_mtimes(root_path)}",
            graph.model_dump(),
            expire=one_day_seconds,
        )
105
+
106
+
107
+ # ── Query cache ──────────────────────────────────────────────────────────────
108
+
109
def get_cached_query(graph_path: str, query: str, token_budget: int) -> Optional[dict]:
    """Return the cached result for this (graph_path, query, token_budget), or ``None``.

    Updates the ``query_hits`` / ``query_misses`` counters. Returns ``None``
    without touching the counters when caching is unavailable.
    """
    store = _get_cache()
    if store is None:
        return None
    cached = store.get(f"query:{graph_path}:{_hash_str(query)}:{token_budget}")
    if cached is None:
        _stats["query_misses"] += 1
        return None
    _stats["query_hits"] += 1
    # Only the first 40 characters of the query are logged.
    logger.debug("Query cache HIT: %s", query[:40])
    return cached  # type: ignore[return-value]
122
+
123
+
124
def set_cached_query(graph_path: str, query: str, token_budget: int, result: dict) -> None:
    """Cache *result* for this (graph_path, query, token_budget) for one hour.

    Does nothing when caching is unavailable.
    """
    store = _get_cache()
    if store is not None:
        one_hour_seconds = 3600
        store.set(
            f"query:{graph_path}:{_hash_str(query)}:{token_budget}",
            result,
            expire=one_hour_seconds,
        )
131
+
132
+
133
+ # ── Compression cache ────────────────────────────────────────────────────────
134
+
135
def get_cached_compression(context: str, question: str, target_tokens: int) -> Optional[dict]:
    """Return the cached compression result for these inputs, or ``None``.

    The key is built from the hashes of *context* and *question* plus
    *target_tokens*. Updates the ``compression_hits`` / ``compression_misses``
    counters; returns ``None`` without counting when caching is unavailable.
    """
    store = _get_cache()
    if store is None:
        return None
    cached = store.get(
        f"compress:{_hash_str(context)}:{_hash_str(question)}:{target_tokens}"
    )
    if cached is None:
        _stats["compression_misses"] += 1
        return None
    _stats["compression_hits"] += 1
    logger.debug("Compression cache HIT")
    return cached  # type: ignore[return-value]
148
+
149
+
150
def set_cached_compression(context: str, question: str, target_tokens: int, result: dict) -> None:
    """Cache a compression *result* for these inputs for one hour.

    Does nothing when caching is unavailable.
    """
    store = _get_cache()
    if store is not None:
        one_hour_seconds = 3600
        store.set(
            f"compress:{_hash_str(context)}:{_hash_str(question)}:{target_tokens}",
            result,
            expire=one_hour_seconds,
        )
157
+
158
+
159
+ # ── Embedding cache (persistent, content-keyed) ──────────────────────────────
160
+ # Content-keyed means this is incremental by nature: an unchanged function keeps
161
+ # its embedding across restarts and edits; only new/changed code is re-embedded.
162
+
163
def get_cached_embedding(content_hash: str):
    """Look up the embedding vector stored under *content_hash*.

    Returns ``None`` when there is no entry or when caching is unavailable.
    """
    store = _get_cache()
    return None if store is None else store.get(f"emb:{content_hash}")
169
+
170
+
171
def set_cached_embedding(content_hash: str, vector) -> None:
    """Store *vector* under *content_hash* with a 30-day expiry.

    Does nothing when caching is unavailable.
    """
    store = _get_cache()
    if store is not None:
        thirty_days_seconds = 3600 * 24 * 30
        store.set(f"emb:{content_hash}", vector, expire=thirty_days_seconds)
177
+
178
+
179
+ # ── Cache management ─────────────────────────────────────────────────────────
180
+
181
def clear_cache(scope: str = "all") -> dict:
    """Delete cache entries selected by *scope* and report how many were removed.

    ``"all"`` deletes every entry. ``"graph"``, ``"query"`` and
    ``"compression"`` delete only keys with the matching prefix. Any other
    value deletes nothing. Returns ``{"cleared": n, "scope": scope}``, or an
    ``{"error": ...}`` dict when caching is unavailable.
    """
    store = _get_cache()
    if store is None:
        return {"error": "Cache not available (diskcache not installed)"}

    # Scope name -> key prefix used by the matching get/set helpers.
    scope_prefixes = (
        ("graph", "graph:"),
        ("query", "query:"),
        ("compression", "compress:"),
    )
    wanted_prefix = next((prefix for name, prefix in scope_prefixes if name == scope), None)
    clear_everything = scope == "all"

    cleared = 0
    # Snapshot the keys first so deletion does not disturb the iteration.
    for key in list(store.iterkeys()):
        matches = clear_everything or (
            wanted_prefix is not None and str(key).startswith(wanted_prefix)
        )
        if matches:
            store.delete(key)
            cleared += 1

    return {"cleared": cleared, "scope": scope}
203
+
204
+
205
def cache_stats() -> dict:
    """Return hit/miss counters plus the entry count and size of the cache.

    The result has the keys ``stats`` (a copy of the per-process counters),
    ``entries``, ``size_bytes``, ``cache_dir`` and ``available``. ``entries``
    and ``size_bytes`` stay 0 when caching is unavailable or when the cache
    cannot be inspected.
    """
    cache = _get_cache()
    size = 0
    count = 0
    if cache is not None:
        try:
            count = len(cache)  # type: ignore[arg-type]
            size = cache.volume()
        except Exception:
            # Reporting is best-effort and must not fail the caller, but leave
            # a trace so a broken cache is diagnosable.
            logger.debug("Could not read cache entry count/size", exc_info=True)

    return {
        "stats": dict(_stats),
        "entries": count,
        "size_bytes": size,
        "cache_dir": CACHE_DIR,
        "available": cache is not None,
    }
fittok/cli.py ADDED
@@ -0,0 +1,81 @@
1
+ """Command-line interface — lets the package do its job WITHOUT an MCP client.
2
+
3
+ Subcommands:
4
+ fittok serve Run the MCP server (stdio) — for AI clients.
5
+ fittok index <path> Pre-build graph + embeddings for a repo.
6
+ fittok query <path> "<q>" Print the most relevant code for a query.
7
+
8
+ `query` is the standalone equivalent of the MCP `optimize_context` tool: it
9
+ returns the same readable, budget-bounded context, straight to your terminal.
10
+ With no subcommand, defaults to `serve` (so existing MCP registrations that
11
+ launch the bare `fittok` command keep working).
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import argparse
17
+ import json
18
+ import sys
19
+
20
+
21
def _build_parser() -> argparse.ArgumentParser:
    """Create the ``fittok`` parser with its ``serve``, ``index`` and ``query`` subcommands."""
    parser = argparse.ArgumentParser(
        prog="fittok",
        description="Retrieve the most relevant source code for a query, within a token budget.",
    )
    sub = parser.add_subparsers(dest="cmd")

    sub.add_parser("serve", help="Run the MCP server over stdio (for AI clients).")

    p_index = sub.add_parser("index", help="Pre-build the knowledge graph + embeddings for a repo.")
    p_index.add_argument("path", help="Path to the codebase to index.")

    p_query = sub.add_parser("query", help="Print the most relevant code for a query (no MCP needed).")
    p_query.add_argument("path", help="Path to the codebase.")
    p_query.add_argument("query", help="Natural-language question about the codebase.")
    p_query.add_argument("--budget", type=int, default=0,
                         help="Token budget (0 = adaptive, the default).")
    p_query.add_argument("--json", action="store_true",
                         help="Emit the full result dict as JSON.")
    return parser


def _run_index(path: str) -> None:
    """Index the codebase at *path* and print a one-line summary to stdout."""
    from .indexer import index_codebase

    r = index_codebase(path)
    # NOTE(review): index_codebase's failure shape is not visible from here.
    # Mirror the `query` command: report an "error" key and exit 1 instead of
    # failing with a KeyError on the summary fields below.
    if "error" in r:
        print(f"Error: {r['error']}", file=sys.stderr)
        sys.exit(1)
    print(f"Indexed {r['nodes']} nodes / {r['edges']} edges, {r['embedded']} embeddings "
          f"(parse {r['parse_s']}s, embed {r['embed_s']}s). Cached.")


def _run_query(path: str, query: str, budget: int, as_json: bool) -> None:
    """Run one query against *path* and print the result; exits 1 on an error result."""
    from .server import optimize_context_tool

    res = optimize_context_tool(path, query, budget)
    if "error" in res:
        print(f"Error: {res['error']}", file=sys.stderr)
        sys.exit(1)
    if as_json:
        print(json.dumps(res, indent=2))
        return
    # Human mode: stats to stderr, the actual context to stdout (pipe-friendly).
    s, sv = res["slurp_stats"], res["savings"]
    print(
        f"# {s['selected_nodes']} nodes · {s['tokens_sent']} tokens · "
        f"confidence {s['confidence_label']} ({s['confidence']}) · {sv['summary']}",
        file=sys.stderr,
    )
    print(res["optimized_context"])


def main(argv: list[str] | None = None) -> None:
    """Parse *argv* (``sys.argv[1:]`` when ``None``) and run the chosen subcommand.

    With no subcommand, or with ``serve``, the MCP server's ``main`` is run.
    ``index`` and ``query`` run the standalone commands with INFO-and-below
    logging disabled.
    """
    args = _build_parser().parse_args(argv)

    # Default (or `serve`) → run the MCP server.
    if args.cmd in (None, "serve"):
        from .server import main as serve_main
        serve_main()
        return

    # For human-facing commands, keep stderr clean (INFO logs are server-mode noise).
    import logging
    logging.disable(logging.INFO)

    if args.cmd == "index":
        _run_index(args.path)
    elif args.cmd == "query":
        _run_query(args.path, args.query, args.budget, args.json)


if __name__ == "__main__":
    main()
fittok/diff.py ADDED
@@ -0,0 +1,110 @@
1
+ """Graph diffing — compare two knowledge graphs and report structural differences."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ from typing import Any
7
+
8
+ from .models import KnowledgeGraph
9
+
10
+
11
def _content_hash(content: str) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of *content*."""
    return hashlib.sha256(content.encode()).hexdigest()[:16]


def _node_entry(node: Any) -> dict:
    """Summarise a node as the id/name/type/file dict used in the diff output."""
    return {"id": node.id, "name": node.name, "type": node.type.value, "file": node.file}


def _edge_entry(edge: Any) -> dict:
    """Summarise an edge as the source/target/type dict used in the diff output."""
    return {"source": edge.source, "target": edge.target, "type": edge.type.value}


def _edge_key(edge: Any) -> str:
    """Identity of an edge: its source, target and type joined with ``|``."""
    return f"{edge.source}|{edge.target}|{edge.type.value}"


def _file_count(entries: list[dict]) -> int:
    """Number of distinct ``file`` values among *entries*."""
    return len({entry["file"] for entry in entries})


def diff_graphs(graph_a: KnowledgeGraph, graph_b: KnowledgeGraph) -> dict:
    """Compare two knowledge graphs and return structural differences.

    Nodes are matched by ``id``; a node present in both graphs is "modified"
    when its content differs. Missing content (``None``) is treated as empty
    text, so such nodes compare cleanly instead of raising. Edges are matched
    by (source, target, type).

    Args:
        graph_a: First (earlier) graph.
        graph_b: Second (later) graph.

    Returns:
        Dict with ``nodes_added``, ``nodes_removed``, ``nodes_modified``,
        ``edges_added``, ``edges_removed`` (each sorted by id / edge key),
        ``stats`` with node and edge counts per graph, and a human-readable
        ``summary``.
    """
    # Index nodes by ID
    nodes_a = {n.id: n for n in graph_a.nodes}
    nodes_b = {n.id: n for n in graph_b.nodes}
    ids_a = set(nodes_a)
    ids_b = set(nodes_b)

    nodes_added = [_node_entry(nodes_b[nid]) for nid in sorted(ids_b - ids_a)]
    nodes_removed = [_node_entry(nodes_a[nid]) for nid in sorted(ids_a - ids_b)]

    # Modified: same ID but different content hash
    nodes_modified: list[dict] = []
    for nid in sorted(ids_a & ids_b):
        a, b = nodes_a[nid], nodes_b[nid]
        if _content_hash(a.content or "") != _content_hash(b.content or ""):
            nodes_modified.append({
                "id": nid,
                "name": b.name,
                "type": b.type.value,
                "file": b.file,
                "line_start": b.line_start,
                "line_end": b.line_end,
            })

    # Index edges by (source, target, type); duplicates collapse to one entry.
    edges_a = {_edge_key(e): e for e in graph_a.edges}
    edges_b = {_edge_key(e): e for e in graph_b.edges}
    edge_keys_a = set(edges_a)
    edge_keys_b = set(edges_b)

    edges_added = [_edge_entry(edges_b[k]) for k in sorted(edge_keys_b - edge_keys_a)]
    edges_removed = [_edge_entry(edges_a[k]) for k in sorted(edge_keys_a - edge_keys_b)]

    # Build summary
    parts: list[str] = []
    if nodes_added:
        parts.append(f"{len(nodes_added)} nodes added across {_file_count(nodes_added)} file(s)")
    if nodes_removed:
        parts.append(f"{len(nodes_removed)} nodes removed across {_file_count(nodes_removed)} file(s)")
    if nodes_modified:
        parts.append(f"{len(nodes_modified)} nodes modified across {_file_count(nodes_modified)} file(s)")
    if edges_added:
        parts.append(f"{len(edges_added)} edges added")
    if edges_removed:
        parts.append(f"{len(edges_removed)} edges removed")

    summary = "; ".join(parts) if parts else "No changes detected"

    return {
        "nodes_added": nodes_added,
        "nodes_removed": nodes_removed,
        "nodes_modified": nodes_modified,
        "edges_added": edges_added,
        "edges_removed": edges_removed,
        "stats": {
            "graph_a": {"nodes": len(graph_a.nodes), "edges": len(graph_a.edges)},
            "graph_b": {"nodes": len(graph_b.nodes), "edges": len(graph_b.edges)},
        },
        "summary": summary,
    }
fittok/embeddings.py ADDED
@@ -0,0 +1,115 @@
1
+ """Semantic embeddings for relevance scoring.
2
+
3
+ Optional and fail-safe: if sentence-transformers (or the model) is unavailable,
4
+ every function degrades to returning ``None`` and the caller falls back to the
5
+ lexical TF-IDF path. This is what lets natural-language queries
6
+ (e.g. "real-time conversation with the AI") match code that uses different
7
+ words (``WebSocket``, ``streamResponse``, ``transcript``) — something pure
8
+ keyword matching cannot do.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import logging
14
+ import os
15
+ from typing import Optional
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
# Model identifier passed to SentenceTransformer; FITTOK_EMBED_MODEL overrides it.
_DEFAULT_MODEL = os.environ.get("FITTOK_EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

# Lazily-loaded model instance; stays None until _get_model() succeeds.
_model = None
# Set to True after a failed import/load so later calls do not retry.
_unavailable = False
# content-hash -> embedding vector (per process)
_EMB_CACHE: dict[str, object] = {}
26
+
27
+
28
def _get_model():
    """Return the shared embedding model, loading it on first use.

    Returns ``None`` when ``sentence_transformers`` cannot be imported or the
    model fails to load. Either failure is remembered in ``_unavailable`` so it
    is not retried on later calls.
    """
    global _model, _unavailable
    # Already loaded -> the model; already known to be unavailable -> None
    # (``_model`` is still None in that case).
    if _model is not None or _unavailable:
        return _model

    try:
        from sentence_transformers import SentenceTransformer
    except ImportError:
        _unavailable = True
        logger.info("sentence-transformers not installed; semantic scoring disabled")
        return None

    try:
        _model = SentenceTransformer(_DEFAULT_MODEL)
        logger.info("Loaded embedding model %s", _DEFAULT_MODEL)
    except Exception:
        _unavailable = True
        logger.warning("Failed to load embedding model %s; semantic scoring disabled",
                       _DEFAULT_MODEL, exc_info=True)
        return None
    return _model
50
+
51
+
52
def is_available() -> bool:
    """Report whether an embedding model could be obtained (loading it if needed)."""
    model = _get_model()
    return model is not None
55
+
56
+
57
def semantic_scores(nodes: list, query: str) -> Optional[dict[str, float]]:
    """Score every node against *query* by cosine similarity of their embeddings.

    Each node is embedded as its name followed by its content (missing content
    counts as empty text). Scores are raw cosine similarities, not rescaled.

    Returns:
        ``{node.id: score}`` for all nodes; ``{}`` when *nodes* is empty; or
        ``None`` when no model is available or encoding fails, in which case
        the caller should fall back to lexical (TF-IDF) scoring.
    """
    if not nodes:
        return {}
    encoder = _get_model()
    if encoder is None:
        return None

    # Name + content, so both sources of signal contribute to the embedding.
    texts = []
    for node in nodes:
        texts.append(f"{node.name}\n{node.content or ''}")

    try:
        node_matrix = _embed_cached(encoder, texts)  # reused across queries
        query_embedding = encoder.encode(
            [query], normalize_embeddings=True, show_progress_bar=False
        )[0]
    except Exception:
        logger.warning("Embedding encode failed; falling back to lexical scoring", exc_info=True)
        return None

    # Both sides are normalized, so the dot product is the cosine similarity.
    similarities = node_matrix @ query_embedding
    return {node.id: float(score) for node, score in zip(nodes, similarities)}
80
+
81
+
82
def _embed_cached(model, texts: list[str]):
    """Encode *texts* into one array of normalized vectors, reusing cached ones.

    Lookup order per text: the in-process ``_EMB_CACHE``, then the persistent
    disk cache, then ``model.encode`` for whatever is still missing. Newly
    computed vectors are written to both caches.

    Cache keys cover the configured model name as well as the text. Vectors
    produced by different models are not comparable, so changing
    ``FITTOK_EMBED_MODEL`` must not reuse entries persisted under the old model.
    NOTE(review): this assumes *model* is the one loaded from ``_DEFAULT_MODEL``,
    as it is when called from ``semantic_scores``.

    Returns:
        ``numpy`` array with one row per entry of *texts*, in the same order.
    """
    import hashlib
    import numpy as np
    from . import cache as _cache

    model_tag = f"{_DEFAULT_MODEL}\0".encode("utf-8", "ignore")
    keys = [hashlib.sha256(model_tag + t.encode("utf-8", "ignore")).hexdigest() for t in texts]
    out: list = [None] * len(texts)
    # key -> every position in `texts` that needs it, so a text repeated within
    # the batch is encoded once.
    pending: dict[str, list[int]] = {}
    for i, k in enumerate(keys):
        # L1: in-process cache
        vec = _EMB_CACHE.get(k)
        if vec is None:
            # L2: persistent disk cache (survives restarts; incremental by content)
            vec = _cache.get_cached_embedding(k)
            if vec is not None:
                _EMB_CACHE[k] = vec
        if vec is None:
            pending.setdefault(k, []).append(i)
        else:
            out[i] = vec
    if pending:
        pending_keys = list(pending)
        enc = model.encode(
            [texts[pending[k][0]] for k in pending_keys],
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        for k, vec in zip(pending_keys, enc):
            _EMB_CACHE[k] = vec
            _cache.set_cached_embedding(k, vec)
            for i in pending[k]:
                out[i] = vec
    return np.array(out)