fittok 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fittok/__init__.py +27 -0
- fittok/__main__.py +5 -0
- fittok/cache.py +223 -0
- fittok/cli.py +81 -0
- fittok/diff.py +110 -0
- fittok/embeddings.py +115 -0
- fittok/graphify.py +710 -0
- fittok/indexer.py +85 -0
- fittok/llmlingua_wrapper.py +175 -0
- fittok/models.py +113 -0
- fittok/pii_scrubber.py +150 -0
- fittok/server.py +715 -0
- fittok/slurp.py +497 -0
- fittok/tokens.py +22 -0
- fittok/ui.py +201 -0
- fittok/watcher.py +145 -0
- fittok-0.3.0.dist-info/METADATA +153 -0
- fittok-0.3.0.dist-info/RECORD +21 -0
- fittok-0.3.0.dist-info/WHEEL +4 -0
- fittok-0.3.0.dist-info/entry_points.txt +3 -0
- fittok-0.3.0.dist-info/licenses/LICENSE +21 -0
fittok/__init__.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Fittok — relevant-code retrieval within a token budget.
|
|
2
|
+
|
|
3
|
+
Usable three ways, each standalone:
|
|
4
|
+
- As a library: ``from fittok import optimize``
|
|
5
|
+
- As a CLI: ``fittok query <path> "<question>"``
|
|
6
|
+
- As an MCP server: ``python -m fittok`` (for AI clients)
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
# Package version string.
__version__ = "0.3.0"

# Public names exported by ``from fittok import *``.
__all__ = ["optimize", "index", "__version__"]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def optimize(codebase_path: str, query: str, token_budget: int = 0) -> dict:
    """Fetch the source code most relevant to *query*, capped at *token_budget*.

    Library entry point: forwards its arguments unchanged to
    ``fittok.server.optimize_context_tool``. No MCP client is required.

    Args:
        codebase_path: Path to the codebase to search.
        query: Question about the codebase.
        token_budget: Upper bound on tokens; ``0`` selects adaptive sizing.

    Returns:
        The dict produced by ``optimize_context_tool``, with
        ``optimized_context``, ``graph_stats``, ``slurp_stats`` and ``savings``.
    """
    # Imported at call time rather than at package import.
    from .server import optimize_context_tool as _run_optimize

    result = _run_optimize(codebase_path, query, token_budget)
    return result
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def index(codebase_path: str) -> dict:
    """Build and cache the graph and embeddings for *codebase_path* ahead of time.

    Forwards to ``fittok.indexer.index_codebase`` and returns its result dict.
    """
    # Imported at call time rather than at package import.
    from .indexer import index_codebase as _run_index

    summary = _run_index(codebase_path)
    return summary
|
fittok/__main__.py
ADDED
fittok/cache.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
"""3-level caching layer for fittok.
|
|
2
|
+
|
|
3
|
+
Levels:
|
|
4
|
+
1. Graph cache — keyed by (root_path, file_mtimes_hash)
|
|
5
|
+
2. Query cache — keyed by (graph_path, query_hash, token_budget)
|
|
6
|
+
3. Compression cache — keyed by (context_hash, question_hash, target_tokens)
|
|
7
|
+
|
|
8
|
+
Uses diskcache for persistent storage. Cache dir: ~/.cache/fittok/
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import hashlib
|
|
14
|
+
import logging
|
|
15
|
+
import os
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any, Optional
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)

# Directory handed to diskcache; FITTOK_CACHE_DIR overrides ~/.cache/fittok.
CACHE_DIR = os.environ.get("FITTOK_CACHE_DIR", os.path.expanduser("~/.cache/fittok"))

# Size limit in bytes; FITTOK_CACHE_MAX_MB is read as whole megabytes (default 500).
MAX_CACHE_SIZE = int(os.environ.get("FITTOK_CACHE_MAX_MB", "500")) * 1024 * 1024

# Stats tracking: per-process hit/miss counters, one pair per cache level.
_stats: dict[str, int] = {
    "graph_hits": 0,
    "graph_misses": 0,
    "query_hits": 0,
    "query_misses": 0,
    "compression_hits": 0,
    "compression_misses": 0,
}

# diskcache handle; stays None until _get_cache() creates it.
_cache = None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _get_cache():
    """Return the shared diskcache handle, creating it on first use.

    Returns:
        The module-wide ``diskcache.Cache`` rooted at ``CACHE_DIR`` (with
        ``MAX_CACHE_SIZE`` as its size limit), or ``None`` when importing
        ``diskcache`` fails.
    """
    global _cache
    if _cache is None:
        try:
            import diskcache

            os.makedirs(CACHE_DIR, exist_ok=True)
            _cache = diskcache.Cache(CACHE_DIR, size_limit=MAX_CACHE_SIZE)
        except ImportError:
            # Caching is optional: callers treat None as "no cache".
            logger.warning("diskcache not installed — caching disabled. Install with: pip install diskcache")
            return None
    return _cache
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ── Hashing helpers ───────────────────────────────────────────────────────────
|
|
51
|
+
|
|
52
|
+
def _hash_str(text: str) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of *text*."""
    digest = hashlib.sha256(text.encode())
    return digest.hexdigest()[:16]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _hash_mtimes(root_path: str) -> str:
    """Fingerprint the code files under *root_path* by relative path and mtime.

    Only files whose suffix is a key of ``graphify._EXT_TO_LANG`` contribute.
    Directories named in ``skip_dirs`` are not descended into.

    Args:
        root_path: Directory to scan. If it does not resolve to a directory,
            the hash of the path string itself is returned instead.

    Returns:
        A 16-character hex digest (see ``_hash_str``) over the
        ``"<relative path>:<mtime>"`` entries joined with ``"|"``.
    """
    from .graphify import _EXT_TO_LANG

    root = Path(root_path).resolve()
    if not root.is_dir():
        return _hash_str(root_path)

    skip_dirs = {
        "node_modules", ".git", "__pycache__", ".venv", "venv",
        "dist", "build", ".tox", ".mypy_cache", ".pytest_cache",
        ".next", ".nuxt", "target", "vendor", ".gradle",
    }
    mtimes: list[str] = []
    for dirpath, dirnames, filenames in os.walk(root):
        # Prune in place so os.walk skips these directories, and sort so the
        # traversal order (and therefore the hash) does not depend on the
        # arbitrary order in which the OS lists directory entries.
        dirnames[:] = sorted(d for d in dirnames if d not in skip_dirs)
        for filename in sorted(filenames):
            path = Path(dirpath) / filename
            if path.suffix not in _EXT_TO_LANG:
                continue
            try:
                mtimes.append(f"{path.relative_to(root)}:{path.stat().st_mtime}")
            except OSError:
                # Best effort: a file that cannot be stat'ed is left out.
                continue
    return _hash_str("|".join(mtimes))
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# ── Graph cache ───────────────────────────────────────────────────────────────
|
|
81
|
+
|
|
82
|
+
def get_cached_graph(root_path: str) -> Optional[Any]:
    """Fetch the cached knowledge graph for *root_path*, if one is stored.

    The key combines the root path with the current mtime fingerprint of its
    code files, so a changed fingerprint yields a different key.

    Returns:
        A ``KnowledgeGraph`` validated from the stored data, or ``None`` on a
        miss or when caching is unavailable. Hit/miss counters are updated.
    """
    store = _get_cache()
    if store is None:
        return None

    payload = store.get(f"graph:{root_path}:{_hash_mtimes(root_path)}")
    if payload is None:
        _stats["graph_misses"] += 1
        return None

    _stats["graph_hits"] += 1
    logger.debug("Graph cache HIT: %s", root_path)
    from .models import KnowledgeGraph

    return KnowledgeGraph.model_validate(payload)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def set_cached_graph(root_path: str, graph: Any) -> None:
    """Persist *graph* (via ``model_dump()``) for *root_path* with a 24-hour TTL.

    Does nothing when caching is unavailable.
    """
    store = _get_cache()
    if store is not None:
        entry_key = f"graph:{root_path}:{_hash_mtimes(root_path)}"
        one_day = 3600 * 24
        store.set(entry_key, graph.model_dump(), expire=one_day)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# ── Query cache ──────────────────────────────────────────────────────────────
|
|
108
|
+
|
|
109
|
+
def get_cached_query(graph_path: str, query: str, token_budget: int) -> Optional[dict]:
    """Fetch a stored query result for (graph_path, query, token_budget).

    Returns:
        The cached value, or ``None`` on a miss or when caching is
        unavailable. Hit/miss counters are updated.
    """
    store = _get_cache()
    if store is None:
        return None

    hit = store.get(f"query:{graph_path}:{_hash_str(query)}:{token_budget}")
    if hit is None:
        _stats["query_misses"] += 1
        return None

    _stats["query_hits"] += 1
    # Only the first 40 characters of the query are logged.
    logger.debug("Query cache HIT: %s", query[:40])
    return hit  # type: ignore[return-value]
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def set_cached_query(graph_path: str, query: str, token_budget: int, result: dict) -> None:
    """Persist *result* for (graph_path, query, token_budget) with a 1-hour TTL.

    Does nothing when caching is unavailable.
    """
    store = _get_cache()
    if store is not None:
        entry_key = f"query:{graph_path}:{_hash_str(query)}:{token_budget}"
        store.set(entry_key, result, expire=3600)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# ── Compression cache ────────────────────────────────────────────────────────
|
|
134
|
+
|
|
135
|
+
def get_cached_compression(context: str, question: str, target_tokens: int) -> Optional[dict]:
    """Fetch a stored compression result for (context, question, target_tokens).

    Context and question enter the key as short hashes.

    Returns:
        The cached value, or ``None`` on a miss or when caching is
        unavailable. Hit/miss counters are updated.
    """
    store = _get_cache()
    if store is None:
        return None

    hit = store.get(f"compress:{_hash_str(context)}:{_hash_str(question)}:{target_tokens}")
    if hit is None:
        _stats["compression_misses"] += 1
        return None

    _stats["compression_hits"] += 1
    logger.debug("Compression cache HIT")
    return hit  # type: ignore[return-value]
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def set_cached_compression(context: str, question: str, target_tokens: int, result: dict) -> None:
    """Persist a compression *result* with a 1-hour TTL.

    Does nothing when caching is unavailable.
    """
    store = _get_cache()
    if store is not None:
        entry_key = f"compress:{_hash_str(context)}:{_hash_str(question)}:{target_tokens}"
        store.set(entry_key, result, expire=3600)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
# ── Embedding cache (persistent, content-keyed) ──────────────────────────────
|
|
160
|
+
# Content-keyed means this is incremental by nature: an unchanged function keeps
|
|
161
|
+
# its embedding across restarts and edits; only new/changed code is re-embedded.
|
|
162
|
+
|
|
163
|
+
def get_cached_embedding(content_hash: str):
    """Look up the persisted embedding stored under *content_hash*.

    Returns the stored vector, or ``None`` if there is no entry or caching is
    unavailable.
    """
    store = _get_cache()
    return None if store is None else store.get(f"emb:{content_hash}")
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def set_cached_embedding(content_hash: str, vector) -> None:
    """Store *vector* under *content_hash* with a 30-day TTL.

    Does nothing when caching is unavailable.
    """
    store = _get_cache()
    if store is not None:
        thirty_days = 3600 * 24 * 30
        store.set(f"emb:{content_hash}", vector, expire=thirty_days)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
# ── Cache management ─────────────────────────────────────────────────────────
|
|
180
|
+
|
|
181
|
+
def clear_cache(scope: str = "all") -> dict:
    """Delete cache entries selected by *scope*.

    Args:
        scope: ``"all"`` removes every entry; ``"graph"``, ``"query"`` and
            ``"compression"`` remove only entries whose key carries the
            matching prefix. Any other value removes nothing.

    Returns:
        ``{"cleared": <count>, "scope": scope}``, or an ``{"error": ...}``
        dict when caching is unavailable.
    """
    store = _get_cache()
    if store is None:
        return {"error": "Cache not available (diskcache not installed)"}

    # Key prefix used by each selective scope.
    prefix_by_scope = {"graph": "graph:", "query": "query:", "compression": "compress:"}
    wanted_prefix = prefix_by_scope.get(scope)

    removed = 0
    # Snapshot the keys first so deletion does not disturb the iteration.
    for key in list(store.iterkeys()):
        matches = scope == "all" or (
            wanted_prefix is not None and str(key).startswith(wanted_prefix)
        )
        if matches:
            store.delete(key)
            removed += 1

    return {"cleared": removed, "scope": scope}
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def cache_stats() -> dict:
    """Return cache hit/miss stats and size.

    Returns:
        A dict with ``stats`` (a copy of the hit/miss counters), ``entries``
        (number of cached items), ``size_bytes``, ``cache_dir`` and
        ``available``. ``entries`` / ``size_bytes`` stay at ``0`` when caching
        is unavailable or the value could not be read.
    """
    cache = _get_cache()
    size = 0
    count = 0
    if cache is not None:
        try:
            count = len(cache)  # type: ignore[arg-type]
            size = cache.volume()
        except Exception:
            # Stats are best-effort, so keep the defaults, but leave a trace
            # instead of hiding the failure completely.
            logger.debug("Could not read cache entry count/size", exc_info=True)

    return {
        "stats": dict(_stats),
        "entries": count,
        "size_bytes": size,
        "cache_dir": CACHE_DIR,
        "available": cache is not None,
    }
|
fittok/cli.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Command-line interface — lets the package do its job WITHOUT an MCP client.
|
|
2
|
+
|
|
3
|
+
Subcommands:
|
|
4
|
+
fittok serve Run the MCP server (stdio) — for AI clients.
|
|
5
|
+
fittok index <path> Pre-build graph + embeddings for a repo.
|
|
6
|
+
fittok query <path> "<q>" Print the most relevant code for a query.
|
|
7
|
+
|
|
8
|
+
`query` is the standalone equivalent of the MCP `optimize_context` tool: it
|
|
9
|
+
returns the same readable, budget-bounded context, straight to your terminal.
|
|
10
|
+
With no subcommand, defaults to `serve` (so existing MCP registrations that
|
|
11
|
+
launch the bare `fittok` command keep working).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import json
|
|
18
|
+
import sys
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Assemble the ``fittok`` argument parser with its three subcommands."""
    parser = argparse.ArgumentParser(
        description="Retrieve the most relevant source code for a query, within a token budget.",
        prog="fittok",
    )
    commands = parser.add_subparsers(dest="cmd")

    commands.add_parser("serve", help="Run the MCP server over stdio (for AI clients).")

    index_cmd = commands.add_parser(
        "index", help="Pre-build the knowledge graph + embeddings for a repo."
    )
    index_cmd.add_argument("path", help="Path to the codebase to index.")

    query_cmd = commands.add_parser(
        "query", help="Print the most relevant code for a query (no MCP needed)."
    )
    query_cmd.add_argument("path", help="Path to the codebase.")
    query_cmd.add_argument("query", help="Natural-language question about the codebase.")
    query_cmd.add_argument(
        "--budget", type=int, default=0, help="Token budget (0 = adaptive, the default)."
    )
    query_cmd.add_argument(
        "--json", action="store_true", help="Emit the full result dict as JSON."
    )
    return parser


def main(argv: list[str] | None = None) -> None:
    """Entry point for the ``fittok`` command.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Behaviour by subcommand:
        * none / ``serve``: run the MCP server.
        * ``index``: index the given path and print a one-line summary.
        * ``query``: print the optimized context to stdout (stats line on
          stderr), or the whole result as JSON with ``--json``. Exits with
          status 1 when the result contains an ``"error"`` key.
    """
    args = _build_parser().parse_args(argv)
    command = args.cmd

    # No subcommand behaves like `serve`.
    if command is None or command == "serve":
        from .server import main as serve_main

        serve_main()
        return

    # For human-facing commands, silence INFO-and-below logging.
    import logging

    logging.disable(logging.INFO)

    if command == "index":
        from .indexer import index_codebase

        report = index_codebase(args.path)
        print(f"Indexed {report['nodes']} nodes / {report['edges']} edges, {report['embedded']} embeddings "
              f"(parse {report['parse_s']}s, embed {report['embed_s']}s). Cached.")
    elif command == "query":
        from .server import optimize_context_tool

        outcome = optimize_context_tool(args.path, args.query, args.budget)
        if "error" in outcome:
            print(f"Error: {outcome['error']}", file=sys.stderr)
            sys.exit(1)

        if args.json:
            print(json.dumps(outcome, indent=2))
        else:
            # Human mode: stats to stderr, the actual context to stdout (pipe-friendly).
            slurp = outcome["slurp_stats"]
            savings = outcome["savings"]
            print(
                f"# {slurp['selected_nodes']} nodes · {slurp['tokens_sent']} tokens · "
                f"confidence {slurp['confidence_label']} ({slurp['confidence']}) · {savings['summary']}",
                file=sys.stderr,
            )
            print(outcome["optimized_context"])


if __name__ == "__main__":
    main()
|
fittok/diff.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Graph diffing — compare two knowledge graphs and report structural differences."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from .models import KnowledgeGraph
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _content_hash(content: str) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of *content*."""
    digest = hashlib.sha256(content.encode())
    return digest.hexdigest()[:16]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def diff_graphs(graph_a: KnowledgeGraph, graph_b: KnowledgeGraph) -> dict:
    """Compare two knowledge graphs and return structural differences.

    Nodes are matched by ``id``; edges by ``(source, target, type)``. A node
    present in both graphs counts as modified when its ``content`` differs.
    Content is compared directly rather than through a truncated hash, so a
    ``None`` content cannot raise and a hash-prefix collision cannot hide a
    change.

    Args:
        graph_a: First (earlier) graph.
        graph_b: Second (later) graph.

    Returns:
        Dict with ``nodes_added``, ``nodes_removed``, ``nodes_modified``,
        ``edges_added``, ``edges_removed`` (each a list of small dicts, sorted
        by id / edge key), ``stats`` (node and edge counts per graph) and a
        human-readable ``summary``.
    """
    # Index nodes by ID.
    nodes_a = {n.id: n for n in graph_a.nodes}
    nodes_b = {n.id: n for n in graph_b.nodes}

    def _node_brief(node: Any) -> dict:
        return {"id": node.id, "name": node.name, "type": node.type.value, "file": node.file}

    nodes_added = [_node_brief(nodes_b[nid]) for nid in sorted(nodes_b.keys() - nodes_a.keys())]
    nodes_removed = [_node_brief(nodes_a[nid]) for nid in sorted(nodes_a.keys() - nodes_b.keys())]

    # Modified: same ID but different content. Details come from the later graph.
    nodes_modified: list[dict] = []
    for nid in sorted(nodes_a.keys() & nodes_b.keys()):
        before, after = nodes_a[nid], nodes_b[nid]
        if before.content != after.content:
            nodes_modified.append({
                "id": nid,
                "name": after.name,
                "type": after.type.value,
                "file": after.file,
                "line_start": after.line_start,
                "line_end": after.line_end,
            })

    # Index edges by (source, target, type).
    def _edge_key(e: Any) -> str:
        return f"{e.source}|{e.target}|{e.type.value}"

    def _edge_brief(e: Any) -> dict:
        return {"source": e.source, "target": e.target, "type": e.type.value}

    edges_a = {_edge_key(e): e for e in graph_a.edges}
    edges_b = {_edge_key(e): e for e in graph_b.edges}

    edges_added = [_edge_brief(edges_b[k]) for k in sorted(edges_b.keys() - edges_a.keys())]
    edges_removed = [_edge_brief(edges_a[k]) for k in sorted(edges_a.keys() - edges_b.keys())]

    # Build summary: one clause per non-empty category.
    parts: list[str] = []
    if nodes_added:
        files_added = {n["file"] for n in nodes_added}
        parts.append(f"{len(nodes_added)} nodes added across {len(files_added)} file(s)")
    if nodes_removed:
        files_removed = {n["file"] for n in nodes_removed}
        parts.append(f"{len(nodes_removed)} nodes removed across {len(files_removed)} file(s)")
    if nodes_modified:
        files_modified = {n["file"] for n in nodes_modified}
        parts.append(f"{len(nodes_modified)} nodes modified across {len(files_modified)} file(s)")
    if edges_added:
        parts.append(f"{len(edges_added)} edges added")
    if edges_removed:
        parts.append(f"{len(edges_removed)} edges removed")

    summary = "; ".join(parts) if parts else "No changes detected"

    return {
        "nodes_added": nodes_added,
        "nodes_removed": nodes_removed,
        "nodes_modified": nodes_modified,
        "edges_added": edges_added,
        "edges_removed": edges_removed,
        "stats": {
            "graph_a": {"nodes": len(graph_a.nodes), "edges": len(graph_a.edges)},
            "graph_b": {"nodes": len(graph_b.nodes), "edges": len(graph_b.edges)},
        },
        "summary": summary,
    }
|
fittok/embeddings.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""Semantic embeddings for relevance scoring.
|
|
2
|
+
|
|
3
|
+
Optional and fail-safe: if sentence-transformers (or the model) is unavailable,
|
|
4
|
+
every function degrades to returning ``None`` and the caller falls back to the
|
|
5
|
+
lexical TF-IDF path. This is what lets natural-language queries
|
|
6
|
+
(e.g. "real-time conversation with the AI") match code that uses different
|
|
7
|
+
words (``WebSocket``, ``streamResponse``, ``transcript``) — something pure
|
|
8
|
+
keyword matching cannot do.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
import os
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)

# Model name passed to SentenceTransformer; FITTOK_EMBED_MODEL overrides it.
_DEFAULT_MODEL = os.environ.get("FITTOK_EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

# Loaded model instance; None until _get_model() succeeds.
_model = None

# Set to True once loading has failed, so it is not attempted again.
_unavailable = False

# content-hash -> embedding vector (per process)
_EMB_CACHE: dict[str, object] = {}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _get_model():
    """Return the shared embedding model, loading it on first use.

    Returns:
        The cached ``SentenceTransformer`` built from ``_DEFAULT_MODEL``, or
        ``None`` when ``sentence_transformers`` cannot be imported or the model
        fails to load. A failure is remembered, so later calls return ``None``
        without retrying.
    """
    global _model, _unavailable

    if _model is not None or _unavailable:
        # Already loaded, or a previous attempt failed (then _model is None).
        return _model

    try:
        from sentence_transformers import SentenceTransformer as _Loader
    except ImportError:
        logger.info("sentence-transformers not installed; semantic scoring disabled")
        _unavailable = True
        return None

    try:
        _model = _Loader(_DEFAULT_MODEL)
        logger.info("Loaded embedding model %s", _DEFAULT_MODEL)
    except Exception:
        logger.warning(
            "Failed to load embedding model %s; semantic scoring disabled",
            _DEFAULT_MODEL,
            exc_info=True,
        )
        _unavailable = True
        return None

    return _model
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def is_available() -> bool:
    """Report whether an embedding model could be obtained from ``_get_model``."""
    model = _get_model()
    return model is not None
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def semantic_scores(nodes: list, query: str) -> Optional[dict[str, float]]:
    """Score every node against *query* by raw cosine similarity.

    Each node is represented by its name followed by its content. Both node
    and query embeddings are normalized, so their dot product is the cosine.

    Args:
        nodes: Objects exposing ``id``, ``name`` and ``content``.
        query: Text to compare the nodes against.

    Returns:
        ``{node.id: similarity}``; ``{}`` for an empty *nodes*; ``None`` when no
        model is available or encoding fails (the caller should then fall back
        to lexical scoring).
    """
    if not nodes:
        return {}

    encoder = _get_model()
    if encoder is None:
        return None

    texts = []
    for node in nodes:
        texts.append(f"{node.name}\n{node.content or ''}")

    try:
        node_matrix = _embed_cached(encoder, texts)  # reused across queries
        encoded_query = encoder.encode(
            [query], normalize_embeddings=True, show_progress_bar=False
        )
        query_vec = encoded_query[0]
    except Exception:
        logger.warning("Embedding encode failed; falling back to lexical scoring", exc_info=True)
        return None

    similarities = node_matrix @ query_vec
    scores = {}
    for row, node in enumerate(nodes):
        scores[node.id] = float(similarities[row])
    return scores
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _embed_cached(model, texts: list[str]):
    """Encode *texts*, reusing previously computed embeddings where possible.

    Lookup order per text, keyed by the SHA-256 of the text:
      1. the in-process ``_EMB_CACHE`` dict,
      2. the persistent cache (``cache.get_cached_embedding``),
      3. ``model.encode`` — called once for all texts still missing, with the
         results written back to both caches.

    Returns:
        ``np.array`` of the embeddings, in the same order as *texts*.
    """
    import hashlib
    import numpy as np
    from . import cache as disk_cache

    digests = [hashlib.sha256(text.encode("utf-8", "ignore")).hexdigest() for text in texts]
    vectors: list = [None] * len(texts)
    pending: list = []  # (position, text) pairs that still need encoding

    for pos, digest in enumerate(digests):
        found = _EMB_CACHE.get(digest)
        if found is None:
            found = disk_cache.get_cached_embedding(digest)
            if found is not None:
                # Promote the disk hit into the in-process cache.
                _EMB_CACHE[digest] = found
        if found is not None:
            vectors[pos] = found
            continue
        pending.append((pos, texts[pos]))

    if pending:
        encoded = model.encode(
            [text for _, text in pending],
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        for row, (pos, _text) in enumerate(pending):
            fresh = encoded[row]
            _EMB_CACHE[digests[pos]] = fresh
            disk_cache.set_cached_embedding(digests[pos], fresh)
            vectors[pos] = fresh

    return np.array(vectors)
|