feed-the-machine 1.6.0 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +170 -170
- package/bin/brain.py +1340 -0
- package/bin/convert_claude_skills_to_codex.py +490 -0
- package/bin/generate-manifest.mjs +463 -463
- package/bin/harden_codex_skills.py +141 -0
- package/bin/install.mjs +491 -491
- package/bin/migrate-eng-buddy-data.py +875 -0
- package/bin/playbook_engine/__init__.py +1 -0
- package/bin/playbook_engine/conftest.py +8 -0
- package/bin/playbook_engine/extractor.py +33 -0
- package/bin/playbook_engine/manager.py +102 -0
- package/bin/playbook_engine/models.py +84 -0
- package/bin/playbook_engine/registry.py +35 -0
- package/bin/playbook_engine/test_extractor.py +72 -0
- package/bin/playbook_engine/test_integration.py +129 -0
- package/bin/playbook_engine/test_manager.py +85 -0
- package/bin/playbook_engine/test_models.py +166 -0
- package/bin/playbook_engine/test_registry.py +67 -0
- package/bin/playbook_engine/test_tracer.py +86 -0
- package/bin/playbook_engine/tracer.py +93 -0
- package/bin/tasks_db.py +456 -0
- package/docs/HOOKS.md +243 -243
- package/docs/INBOX.md +233 -233
- package/ftm/SKILL.md +125 -122
- package/ftm-audit/SKILL.md +623 -623
- package/ftm-audit/references/protocols/PROJECT-PATTERNS.md +91 -91
- package/ftm-audit/references/protocols/RUNTIME-WIRING.md +66 -66
- package/ftm-audit/references/protocols/WIRING-CONTRACTS.md +135 -135
- package/ftm-audit/references/strategies/AUTO-FIX-STRATEGIES.md +69 -69
- package/ftm-audit/references/templates/REPORT-FORMAT.md +96 -96
- package/ftm-audit/scripts/run-knip.sh +23 -23
- package/ftm-audit.yml +2 -2
- package/ftm-brainstorm/SKILL.md +1003 -498
- package/ftm-brainstorm/evals/evals.json +180 -100
- package/ftm-brainstorm/evals/promptfoo.yaml +109 -109
- package/ftm-brainstorm/references/agent-prompts.md +552 -224
- package/ftm-brainstorm/references/plan-template.md +209 -121
- package/ftm-brainstorm.yml +2 -2
- package/ftm-browse/SKILL.md +454 -454
- package/ftm-browse/daemon/browser-manager.ts +206 -206
- package/ftm-browse/daemon/bun.lock +30 -30
- package/ftm-browse/daemon/cli.ts +347 -347
- package/ftm-browse/daemon/commands.ts +410 -410
- package/ftm-browse/daemon/main.ts +357 -357
- package/ftm-browse/daemon/package.json +17 -17
- package/ftm-browse/daemon/server.ts +189 -189
- package/ftm-browse/daemon/snapshot.ts +519 -519
- package/ftm-browse/daemon/tsconfig.json +22 -22
- package/ftm-browse.yml +4 -4
- package/ftm-capture/SKILL.md +370 -370
- package/ftm-capture.yml +4 -4
- package/ftm-codex-gate/SKILL.md +361 -361
- package/ftm-codex-gate.yml +2 -2
- package/ftm-config/SKILL.md +422 -345
- package/ftm-config.default.yml +125 -82
- package/ftm-config.yml +44 -2
- package/ftm-council/SKILL.md +416 -416
- package/ftm-council/references/prompts/CLAUDE-INVESTIGATION.md +60 -60
- package/ftm-council/references/prompts/CODEX-INVESTIGATION.md +58 -58
- package/ftm-council/references/prompts/GEMINI-INVESTIGATION.md +58 -58
- package/ftm-council/references/prompts/REBUTTAL-TEMPLATE.md +57 -57
- package/ftm-council/references/protocols/PREREQUISITES.md +47 -47
- package/ftm-council/references/protocols/STEP-0-FRAMING.md +46 -46
- package/ftm-council.yml +2 -2
- package/ftm-dashboard/SKILL.md +163 -163
- package/ftm-dashboard.yml +4 -4
- package/ftm-debug/SKILL.md +1037 -1037
- package/ftm-debug/references/phases/PHASE-0-INTAKE.md +58 -58
- package/ftm-debug/references/phases/PHASE-1-TRIAGE.md +46 -46
- package/ftm-debug/references/phases/PHASE-2-WAR-ROOM-AGENTS.md +279 -279
- package/ftm-debug/references/phases/PHASE-3-TO-6-EXECUTION.md +436 -436
- package/ftm-debug/references/protocols/BLACKBOARD.md +86 -86
- package/ftm-debug/references/protocols/EDGE-CASES.md +103 -103
- package/ftm-debug.yml +2 -2
- package/ftm-diagram/SKILL.md +277 -277
- package/ftm-diagram.yml +2 -2
- package/ftm-executor/SKILL.md +777 -777
- package/ftm-executor/references/STYLE-TEMPLATE.md +73 -73
- package/ftm-executor/references/phases/PHASE-0-VERIFICATION.md +62 -62
- package/ftm-executor/references/phases/PHASE-2-AGENT-ASSEMBLY.md +34 -34
- package/ftm-executor/references/phases/PHASE-3-WORKTREES.md +38 -38
- package/ftm-executor/references/phases/PHASE-4-5-AUDIT.md +72 -72
- package/ftm-executor/references/phases/PHASE-4-DISPATCH.md +66 -66
- package/ftm-executor/references/phases/PHASE-5-5-CODEX-GATE.md +73 -73
- package/ftm-executor/references/protocols/DOCUMENTATION-BOOTSTRAP.md +36 -36
- package/ftm-executor/references/protocols/MODEL-PROFILE.md +59 -59
- package/ftm-executor/references/protocols/PROGRESS-TRACKING.md +66 -66
- package/ftm-executor/runtime/ftm-runtime.mjs +252 -252
- package/ftm-executor/runtime/package.json +8 -8
- package/ftm-executor.yml +2 -2
- package/ftm-git/SKILL.md +441 -441
- package/ftm-git/evals/evals.json +26 -26
- package/ftm-git/evals/promptfoo.yaml +75 -75
- package/ftm-git/hooks/post-commit-experience.sh +92 -92
- package/ftm-git/references/patterns/SECRET-PATTERNS.md +104 -104
- package/ftm-git/references/protocols/REMEDIATION.md +139 -139
- package/ftm-git/scripts/pre-commit-secrets.sh +110 -110
- package/ftm-git.yml +2 -2
- package/ftm-inbox/backend/__pycache__/main.cpython-314.pyc +0 -0
- package/ftm-inbox/backend/adapters/_retry.py +64 -64
- package/ftm-inbox/backend/adapters/base.py +230 -230
- package/ftm-inbox/backend/adapters/freshservice.py +104 -104
- package/ftm-inbox/backend/adapters/gmail.py +125 -125
- package/ftm-inbox/backend/adapters/jira.py +136 -136
- package/ftm-inbox/backend/adapters/registry.py +192 -192
- package/ftm-inbox/backend/adapters/slack.py +110 -110
- package/ftm-inbox/backend/db/connection.py +54 -54
- package/ftm-inbox/backend/db/schema.py +78 -78
- package/ftm-inbox/backend/executor/__init__.py +7 -7
- package/ftm-inbox/backend/executor/engine.py +149 -149
- package/ftm-inbox/backend/executor/step_runner.py +98 -98
- package/ftm-inbox/backend/main.py +103 -103
- package/ftm-inbox/backend/models/__init__.py +1 -1
- package/ftm-inbox/backend/models/unified_task.py +36 -36
- package/ftm-inbox/backend/planner/__init__.py +6 -6
- package/ftm-inbox/backend/planner/__pycache__/__init__.cpython-314.pyc +0 -0
- package/ftm-inbox/backend/planner/__pycache__/generator.cpython-314.pyc +0 -0
- package/ftm-inbox/backend/planner/__pycache__/schema.cpython-314.pyc +0 -0
- package/ftm-inbox/backend/planner/generator.py +127 -127
- package/ftm-inbox/backend/planner/schema.py +34 -34
- package/ftm-inbox/backend/requirements.txt +5 -5
- package/ftm-inbox/backend/routes/__pycache__/plan.cpython-314.pyc +0 -0
- package/ftm-inbox/backend/routes/execute.py +186 -186
- package/ftm-inbox/backend/routes/health.py +52 -52
- package/ftm-inbox/backend/routes/inbox.py +68 -68
- package/ftm-inbox/backend/routes/plan.py +271 -271
- package/ftm-inbox/bin/launchagent.mjs +91 -91
- package/ftm-inbox/bin/setup.mjs +188 -188
- package/ftm-inbox/bin/start.sh +10 -10
- package/ftm-inbox/bin/status.sh +17 -17
- package/ftm-inbox/bin/stop.sh +8 -8
- package/ftm-inbox/config.example.yml +55 -55
- package/ftm-inbox/package-lock.json +2898 -2898
- package/ftm-inbox/package.json +26 -26
- package/ftm-inbox/postcss.config.js +6 -6
- package/ftm-inbox/src/app.css +199 -199
- package/ftm-inbox/src/app.html +18 -18
- package/ftm-inbox/src/lib/api.ts +166 -166
- package/ftm-inbox/src/lib/components/ExecutionLog.svelte +81 -81
- package/ftm-inbox/src/lib/components/InboxFeed.svelte +143 -143
- package/ftm-inbox/src/lib/components/PlanStep.svelte +271 -271
- package/ftm-inbox/src/lib/components/PlanView.svelte +206 -206
- package/ftm-inbox/src/lib/components/StreamPanel.svelte +99 -99
- package/ftm-inbox/src/lib/components/TaskCard.svelte +190 -190
- package/ftm-inbox/src/lib/components/ui/EmptyState.svelte +63 -63
- package/ftm-inbox/src/lib/components/ui/KawaiiCard.svelte +86 -86
- package/ftm-inbox/src/lib/components/ui/PillButton.svelte +106 -106
- package/ftm-inbox/src/lib/components/ui/StatusBadge.svelte +67 -67
- package/ftm-inbox/src/lib/components/ui/StreamDrawer.svelte +149 -149
- package/ftm-inbox/src/lib/components/ui/ThemeToggle.svelte +80 -80
- package/ftm-inbox/src/lib/theme.ts +47 -47
- package/ftm-inbox/src/routes/+layout.svelte +76 -76
- package/ftm-inbox/src/routes/+page.svelte +401 -401
- package/ftm-inbox/svelte.config.js +12 -12
- package/ftm-inbox/tailwind.config.ts +63 -63
- package/ftm-inbox/tsconfig.json +13 -13
- package/ftm-inbox/vite.config.ts +6 -6
- package/ftm-intent/SKILL.md +241 -241
- package/ftm-intent.yml +2 -2
- package/ftm-manifest.json +3794 -3794
- package/ftm-map/SKILL.md +291 -291
- package/ftm-map/scripts/db.py +712 -712
- package/ftm-map/scripts/index.py +415 -415
- package/ftm-map/scripts/parser.py +224 -224
- package/ftm-map/scripts/queries/go-tags.scm +20 -20
- package/ftm-map/scripts/queries/javascript-tags.scm +35 -35
- package/ftm-map/scripts/queries/python-tags.scm +31 -31
- package/ftm-map/scripts/queries/ruby-tags.scm +19 -19
- package/ftm-map/scripts/queries/rust-tags.scm +37 -37
- package/ftm-map/scripts/queries/typescript-tags.scm +41 -41
- package/ftm-map/scripts/query.py +301 -301
- package/ftm-map/scripts/ranker.py +377 -377
- package/ftm-map/scripts/requirements.txt +5 -5
- package/ftm-map/scripts/setup-hooks.sh +27 -27
- package/ftm-map/scripts/setup.sh +56 -56
- package/ftm-map/scripts/test_db.py +364 -364
- package/ftm-map/scripts/test_parser.py +174 -174
- package/ftm-map/scripts/test_query.py +183 -183
- package/ftm-map/scripts/test_ranker.py +199 -199
- package/ftm-map/scripts/views.py +591 -591
- package/ftm-map.yml +2 -2
- package/ftm-mind/SKILL.md +201 -1943
- package/ftm-mind/evals/promptfoo.yaml +142 -142
- package/ftm-mind/references/blackboard-protocol.md +110 -0
- package/ftm-mind/references/blackboard-schema.md +328 -328
- package/ftm-mind/references/complexity-guide.md +110 -110
- package/ftm-mind/references/complexity-sizing.md +138 -0
- package/ftm-mind/references/decide-act-protocol.md +172 -0
- package/ftm-mind/references/direct-execution.md +51 -0
- package/ftm-mind/references/environment-discovery.md +77 -0
- package/ftm-mind/references/event-registry.md +319 -319
- package/ftm-mind/references/mcp-inventory.md +300 -296
- package/ftm-mind/references/ops-routing.md +47 -0
- package/ftm-mind/references/orient-protocol.md +234 -0
- package/ftm-mind/references/personality.md +40 -0
- package/ftm-mind/references/protocols/COMPLEXITY-SIZING.md +72 -72
- package/ftm-mind/references/protocols/MCP-HEURISTICS.md +32 -32
- package/ftm-mind/references/protocols/PLAN-APPROVAL.md +80 -80
- package/ftm-mind/references/reflexion-protocol.md +249 -249
- package/ftm-mind/references/routing/SCENARIOS.md +22 -22
- package/ftm-mind/references/routing-scenarios.md +35 -35
- package/ftm-mind.yml +2 -2
- package/ftm-ops.yml +4 -0
- package/ftm-pause/SKILL.md +395 -395
- package/ftm-pause/references/protocols/SKILL-RESTORE-PROTOCOLS.md +186 -186
- package/ftm-pause/references/protocols/VALIDATION.md +80 -80
- package/ftm-pause.yml +2 -2
- package/ftm-researcher/SKILL.md +275 -275
- package/ftm-researcher/evals/agent-diversity.yaml +17 -17
- package/ftm-researcher/evals/synthesis-quality.yaml +12 -12
- package/ftm-researcher/evals/trigger-accuracy.yaml +39 -39
- package/ftm-researcher/references/adaptive-search.md +116 -116
- package/ftm-researcher/references/agent-prompts.md +193 -193
- package/ftm-researcher/references/council-integration.md +193 -193
- package/ftm-researcher/references/output-format.md +203 -203
- package/ftm-researcher/references/synthesis-pipeline.md +165 -165
- package/ftm-researcher/scripts/score_credibility.py +234 -234
- package/ftm-researcher/scripts/validate_research.py +92 -92
- package/ftm-researcher.yml +2 -2
- package/ftm-resume/SKILL.md +518 -518
- package/ftm-resume/references/protocols/VALIDATION.md +172 -172
- package/ftm-resume.yml +2 -2
- package/ftm-retro/SKILL.md +380 -380
- package/ftm-retro/references/protocols/SCORING-RUBRICS.md +89 -89
- package/ftm-retro/references/templates/REPORT-FORMAT.md +109 -109
- package/ftm-retro.yml +2 -2
- package/ftm-routine/SKILL.md +170 -170
- package/ftm-routine.yml +4 -4
- package/ftm-state/blackboard/capabilities.json +5 -5
- package/ftm-state/blackboard/capabilities.schema.json +27 -27
- package/ftm-state/blackboard/context.json +37 -23
- package/ftm-state/blackboard/experiences/doom-statusline-fix.json +26 -0
- package/ftm-state/blackboard/experiences/hackathon-pages-site.json +26 -0
- package/ftm-state/blackboard/experiences/hindsight-sso-kickoff.json +42 -0
- package/ftm-state/blackboard/experiences/index.json +58 -9
- package/ftm-state/blackboard/experiences/learning-ragnarok-api-access.json +23 -0
- package/ftm-state/blackboard/experiences/nordlayer-members-auto-assign.json +26 -0
- package/ftm-state/blackboard/experiences/saml2aws-stale-session-fix.json +41 -0
- package/ftm-state/blackboard/patterns.json +6 -6
- package/ftm-state/schemas/context.schema.json +130 -130
- package/ftm-state/schemas/experience-index.schema.json +77 -77
- package/ftm-state/schemas/experience.schema.json +78 -78
- package/ftm-state/schemas/patterns.schema.json +44 -44
- package/ftm-upgrade/SKILL.md +194 -194
- package/ftm-upgrade/scripts/check-version.sh +76 -76
- package/ftm-upgrade/scripts/upgrade.sh +143 -143
- package/ftm-upgrade.yml +2 -2
- package/ftm-verify.yml +2 -2
- package/ftm.yml +2 -2
- package/hooks/ftm-auto-log.sh +137 -0
- package/hooks/ftm-blackboard-enforcer.sh +93 -93
- package/hooks/ftm-discovery-reminder.sh +90 -90
- package/hooks/ftm-drafts-gate.sh +61 -61
- package/hooks/ftm-event-logger.mjs +107 -107
- package/hooks/ftm-install-hooks.sh +240 -0
- package/hooks/ftm-learning-capture.sh +117 -0
- package/hooks/ftm-map-autodetect.sh +79 -79
- package/hooks/ftm-pending-sync-check.sh +22 -22
- package/hooks/ftm-plan-gate.sh +92 -92
- package/hooks/ftm-post-commit-trigger.sh +57 -57
- package/hooks/ftm-post-compaction.sh +138 -0
- package/hooks/ftm-pre-compaction.sh +147 -0
- package/hooks/ftm-session-end.sh +52 -0
- package/hooks/ftm-session-snapshot.sh +213 -0
- package/hooks/settings-template.json +81 -81
- package/install.sh +363 -363
- package/package.json +84 -84
- package/uninstall.sh +25 -25
|
@@ -1,377 +1,377 @@
|
|
|
1
|
-
"""
|
|
2
|
-
ranker.py -- PageRank-based context selection engine for ftm-map.
|
|
3
|
-
|
|
4
|
-
Implements Aider-style personalized PageRank over the file-level dependency graph
|
|
5
|
-
with task-aware personalization and token-budget binary search.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
import math
|
|
9
|
-
import os
|
|
10
|
-
import sys
|
|
11
|
-
|
|
12
|
-
sys.path.insert(0, os.path.dirname(__file__))
|
|
13
|
-
|
|
14
|
-
import numpy as np
|
|
15
|
-
import scipy.sparse as sp
|
|
16
|
-
|
|
17
|
-
# Try fast-pagerank first, fall back to scipy power iteration
|
|
18
|
-
try:
|
|
19
|
-
from fast_pagerank import pagerank_power
|
|
20
|
-
HAS_FAST_PAGERANK = True
|
|
21
|
-
except ImportError:
|
|
22
|
-
HAS_FAST_PAGERANK = False
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def build_adjacency_matrix(conn):
    """Assemble the symmetric file-to-file adjacency matrix from ``file_edges``.

    Every row of ``files`` (ordered by id) is assigned one matrix index.
    Edges whose source or target id is not present in ``files`` are ignored.

    Returns a 3-tuple ``(matrix, file_id_to_idx, idx_to_file_id)``:
    - matrix: scipy CSR matrix equal to ``A + A.T`` for the directed edge
      matrix ``A``; ``None`` when the ``files`` table is empty
    - file_id_to_idx: dict mapping file id -> matrix index
    - idx_to_file_id: dict mapping matrix index -> file id
    """
    file_rows = conn.execute("SELECT id FROM files ORDER BY id").fetchall()
    if len(file_rows) == 0:
        return None, {}, {}

    # Matrix positions follow the id ordering of the query above.
    file_id_to_idx = {}
    idx_to_file_id = {}
    for position, record in enumerate(file_rows):
        file_id_to_idx[record['id']] = position
        idx_to_file_id[position] = record['id']
    size = len(file_rows)

    edge_rows = conn.execute(
        "SELECT source_file_id, target_file_id, weight FROM file_edges"
    ).fetchall()
    if len(edge_rows) == 0:
        return sp.csr_matrix((size, size)), file_id_to_idx, idx_to_file_id

    # Translate ids to indices and keep only edges with two known endpoints.
    sources, targets, weights = [], [], []
    for record in edge_rows:
        source = file_id_to_idx.get(record['source_file_id'])
        target = file_id_to_idx.get(record['target_file_id'])
        if source is None or target is None:
            continue
        sources.append(source)
        targets.append(target)
        weights.append(record['weight'])

    # Directed matrix first; adding its transpose makes the graph undirected.
    directed = sp.csr_matrix((weights, (sources, targets)), shape=(size, size))
    return directed + directed.T, file_id_to_idx, idx_to_file_id
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
def build_personalization(
    conn, seed_files=None, seed_keywords=None, seed_symbols=None, file_id_to_idx=None
):
    """Build the personalization (teleport) vector for PageRank.

    Every file starts with weight 1 and is multiplied by a boost for each
    channel that selects it:
    - seed_files: each listed path that exists in ``files`` gets 100x
    - seed_keywords: each file with at least one FTS5 match for a keyword
      gets 30x, applied once per keyword
    - seed_symbols: for each name, files defining it get 80x and files
      referencing it get 40x

    Boosts multiply, so a file selected by several seeds accumulates the
    product of their factors.

    Args:
        conn: database connection whose rows support access by column name.
        seed_files: iterable of file paths, or None.
        seed_keywords: iterable of FTS5 query strings, or None.
        seed_symbols: iterable of symbol names, or None.
        file_id_to_idx: dict mapping file id -> vector index.

    Returns:
        numpy array normalized to sum to 1.0, or None when
        ``file_id_to_idx`` is missing or empty.
    """
    # Local import: only needed to name the database error type below.
    import sqlite3

    # Covers both the None default and an empty mapping.
    if not file_id_to_idx:
        return None

    pers = np.ones(len(file_id_to_idx))  # Base: uniform weight of 1

    # Channel 1: Seed files (100x)
    for fpath in seed_files or ():
        file_row = conn.execute(
            "SELECT id FROM files WHERE path=?", (fpath,)
        ).fetchone()
        if file_row and file_row['id'] in file_id_to_idx:
            pers[file_id_to_idx[file_row['id']]] *= 100

    # Channel 2: Seed keywords via FTS5 (30x)
    for kw in seed_keywords or ():
        try:
            fts_results = conn.execute(
                "SELECT s.file_id FROM symbols_fts fts "
                "JOIN symbols s ON s.id = fts.rowid "
                "WHERE symbols_fts MATCH ? LIMIT 50",
                (kw,),
            ).fetchall()
        except sqlite3.Error:
            continue  # FTS query syntax errors are non-fatal

        # The query yields one row per matching symbol. Collapse to distinct
        # files so a file gets 30x once per keyword rather than 30**k for k
        # matching symbols, which could overflow and normalize to NaN.
        for file_id in {row['file_id'] for row in fts_results}:
            if file_id in file_id_to_idx:
                pers[file_id_to_idx[file_id]] *= 30

    # Channel 3: Seed symbols (80x defining, 40x referencing)
    for sym_name in seed_symbols or ():
        def_files = conn.execute(
            "SELECT DISTINCT file_id FROM symbols WHERE name=?", (sym_name,)
        ).fetchall()
        for row in def_files:
            if row['file_id'] in file_id_to_idx:
                pers[file_id_to_idx[row['file_id']]] *= 80

        ref_files = conn.execute(
            "SELECT DISTINCT file_id FROM refs WHERE symbol_name=?", (sym_name,)
        ).fetchall()
        for row in ref_files:
            if row['file_id'] in file_id_to_idx:
                pers[file_id_to_idx[row['file_id']]] *= 40

    # Normalize to sum to 1
    total = pers.sum()
    if total > 0:
        pers /= total

    return pers
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
def run_pagerank(adj_matrix, personalization=None, damping=0.85, max_iter=100, tol=1e-6):
    """Run PageRank on the adjacency matrix.

    Uses fast-pagerank when it is installed and a personalization vector is
    given; otherwise (or if that call raises) falls back to a scipy power
    iteration.

    Args:
        adj_matrix: square scipy sparse adjacency matrix.
        personalization: optional non-negative teleport weights, one per
            node. The fallback rescales them to sum to 1; an all-zero vector
            is treated as uniform.
        damping: probability of following an edge rather than teleporting.
        max_iter: iteration cap, honoured by both implementations.
        tol: convergence threshold (the fallback compares the L1 change
            between successive iterations against it).

    Returns:
        numpy array of scores indexed by matrix position; empty for a 0x0
        matrix.
    """
    n = adj_matrix.shape[0]
    if n == 0:
        return np.array([])

    if HAS_FAST_PAGERANK and personalization is not None:
        try:
            # max_iter is forwarded so the cap applies on this path as well.
            return pagerank_power(
                adj_matrix,
                p=damping,
                max_iter=max_iter,
                personalize=personalization,
                tol=tol,
            )
        except Exception:
            # Best-effort use of the optional dependency: on any failure,
            # fall through to the scipy implementation below.
            pass

    # Scipy power iteration fallback.
    # Column sums are computed once and reused for the dangling mask.
    col_sums = np.asarray(adj_matrix.sum(axis=0)).flatten()
    dangling_mask = col_sums == 0
    has_dangling = bool(dangling_mask.any())

    # Transition matrix: M[i,j] = A[i,j] / col_sum[j]. Dangling columns are
    # divided by 1 to avoid division by zero; they stay all-zero.
    safe_sums = np.where(dangling_mask, 1.0, col_sums)
    M = adj_matrix @ sp.diags(1.0 / safe_sums)

    # Teleport vector, rescaled so the scores form a probability
    # distribution even when the caller passes unnormalized weights.
    uniform = np.ones(n) / n
    if personalization is not None:
        v = np.asarray(personalization, dtype=float).copy()
        weight_total = v.sum()
        v = v / weight_total if weight_total > 0 else uniform
    else:
        v = uniform

    scores = v.copy()
    for _ in range(max_iter):
        prev = scores

        # Score sitting on dangling nodes is redistributed through v.
        dangling_sum = prev[dangling_mask].sum() if has_dangling else 0
        scores = damping * (M @ prev) + damping * dangling_sum * v + (1 - damping) * v

        # Check convergence via L1 norm
        if np.abs(scores - prev).sum() < tol:
            break

    return scores
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
def rank_files(conn, seed_files=None, seed_keywords=None, seed_symbols=None):
    """Rank all files by PageRank score, optionally biased by seeds.

    Builds the adjacency matrix and personalization vector from ``conn``,
    runs PageRank, and maps each score back to its file path.

    Args:
        conn: database connection whose rows support access by column name.
        seed_files: optional iterable of file paths to boost.
        seed_keywords: optional iterable of FTS5 query strings to boost.
        seed_symbols: optional iterable of symbol names to boost.

    Returns:
        List of ``(file_path, score)`` tuples sorted by score, highest
        first; empty when there are no files.
    """
    adj, fid_to_idx, idx_to_fid = build_adjacency_matrix(conn)
    if adj is None or adj.shape[0] == 0:
        return []

    pers = build_personalization(
        conn, seed_files, seed_keywords, seed_symbols, fid_to_idx
    )
    scores = run_pagerank(adj, pers)

    # Fetch every path in one query instead of issuing one SELECT per file.
    path_by_id = {
        row['id']: row['path']
        for row in conn.execute("SELECT id, path FROM files").fetchall()
    }

    # Map scores back to file paths, skipping ids that no longer have a row.
    results = []
    for idx, score in enumerate(scores):
        file_id = idx_to_fid[idx]
        if file_id in path_by_id:
            results.append((path_by_id[file_id], float(score)))

    results.sort(key=lambda item: item[1], reverse=True)
    return results
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
def fit_to_budget(ranked_files, conn, token_budget, *, tokens_per_symbol=25, tolerance=0.15):
    """Select the longest prefix of ranked files that fits the token budget.

    Files are taken in rank order. Each file costs ``tokens_per_symbol``
    tokens per row it has in ``symbols``. Selection stops at the first file
    that would push the running total above
    ``token_budget * (1 + tolerance)``. Because the running total never
    decreases, this single pass selects the same prefix as a binary search
    followed by a greedy extension within the tolerance.

    If the first file alone exceeds the tolerated budget, nothing is
    selected. Paths with no row in ``files`` cost nothing and are left out
    of the result.

    Args:
        ranked_files: list of ``(path, score)`` tuples, best first.
        conn: database connection whose rows support access by column name.
        token_budget: target token count; values <= 0 select nothing.
        tokens_per_symbol: estimated tokens per symbol entry (default 25).
        tolerance: fraction by which the budget may be exceeded
            (default 0.15).

    Returns:
        ``(result_list, total_tokens)`` where result_list contains dicts:
        ``{path, score, symbols: [name, ...], tokens}``.
    """
    if not ranked_files or token_budget <= 0:
        return [], 0

    tolerance_budget = token_budget * (1 + tolerance)

    result = []
    total_tokens = 0
    for fpath, score in ranked_files:
        file_row = conn.execute(
            "SELECT id FROM files WHERE path=?", (fpath,)
        ).fetchone()
        if not file_row:
            continue  # Unknown path: contributes no tokens and no entry

        syms = conn.execute(
            "SELECT name FROM symbols WHERE file_id=? ORDER BY line_start",
            (file_row['id'],),
        ).fetchall()
        sym_names = [s['name'] for s in syms]
        entry_tokens = len(sym_names) * tokens_per_symbol

        # Every later prefix would be at least this large, so stop here.
        if total_tokens + entry_tokens > tolerance_budget:
            break

        total_tokens += entry_tokens
        result.append({
            "path": fpath,
            "score": round(score, 6),
            "symbols": sym_names,
            "tokens": entry_tokens,
        })

    return result, total_tokens
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
# ---------------------------------------------------------------------------
|
|
298
|
-
# Smoke test
|
|
299
|
-
# ---------------------------------------------------------------------------
|
|
300
|
-
|
|
301
|
-
if __name__ == "__main__":
    # Smoke test: builds a three-file graph in a temporary database and runs
    # each ranking entry point once.
    import tempfile
    from db import (
        get_connection,
        add_file,
        add_symbol,
        add_reference,
        rebuild_file_edges,
        rebuild_symbol_edges,
    )

    print("Running ranker.py smoke tests ...")

    with tempfile.TemporaryDirectory() as tmp:
        conn = get_connection(tmp)
        try:
            # Create a small graph: 3 files with cross-references
            f1 = add_file(conn, "src/auth.py", "python", 1.0, line_count=50)
            f2 = add_file(conn, "src/api.py", "python", 1.0, line_count=100)
            f3 = add_file(conn, "src/utils.py", "python", 1.0, line_count=30)

            # Symbols
            add_symbol(
                conn, f1, "authenticate", "definition", 1, 20,
                signature="def authenticate(req)",
            )
            add_symbol(conn, f1, "verify_token", "definition", 25, 40)
            add_symbol(
                conn, f2, "handle_request", "definition", 1, 50,
                signature="def handle_request(req)",
            )
            add_symbol(conn, f3, "format_date", "definition", 1, 10)
            add_symbol(conn, f3, "parse_config", "definition", 15, 25)

            # References: api.py references auth.py functions, and utils.py
            add_reference(conn, f2, "authenticate", 10)
            add_reference(conn, f2, "verify_token", 15)
            add_reference(conn, f2, "format_date", 20)
            add_reference(conn, f2, "parse_config", 25)
            # auth.py also references utils
            add_reference(conn, f1, "parse_config", 30)

            # Materialize edges
            rebuild_file_edges(conn)
            conn.commit()

            # Test 1: Uniform PageRank
            results = rank_files(conn)
            print(f" Uniform PageRank: {len(results)} files ranked")
            for path, score in results:
                print(f" {path}: {score:.6f}")
            assert len(results) == 3

            # Test 2: Personalized -- seed auth.py
            results_pers = rank_files(conn, seed_files=["src/auth.py"])
            print(f" Personalized (seed auth.py): {len(results_pers)} files")
            for path, score in results_pers:
                print(f" {path}: {score:.6f}")
            # auth.py should be ranked higher with personalization
            auth_score = next(s for p, s in results_pers if p == "src/auth.py")
            auth_uniform = next(s for p, s in results if p == "src/auth.py")
            print(f" Auth personalized boost: {auth_score:.6f} vs {auth_uniform:.6f}")
            # Enforce the expectation stated above rather than only printing it.
            assert auth_score > auth_uniform

            # Test 3: Budget fitting
            budget_result, total_tokens = fit_to_budget(results, conn, 200)
            print(f" Budget fit (200 tokens): {len(budget_result)} files, {total_tokens} tokens")
            assert total_tokens <= 200 * 1.15  # 15% tolerance

            # Test 4: Keyword personalization
            results_kw = rank_files(conn, seed_keywords=["authenticate"])
            print(f" Keyword personalized: {len(results_kw)} files")

            # Test 5: Symbol personalization
            results_sym = rank_files(conn, seed_symbols=["authenticate"])
            print(f" Symbol personalized: {len(results_sym)} files")
        finally:
            # Close before the temporary directory is removed so the
            # database file is no longer held open during cleanup.
            # NOTE(review): assumes get_connection returns an object with
            # close(), as a sqlite3 connection does -- confirm against db.py.
            conn.close()

    print("\nAll ranker smoke tests passed.")
|
|
1
|
+
"""
|
|
2
|
+
ranker.py -- PageRank-based context selection engine for ftm-map.
|
|
3
|
+
|
|
4
|
+
Implements Aider-style personalized PageRank over the file-level dependency graph
|
|
5
|
+
with task-aware personalization and token-budget binary search.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import math
|
|
9
|
+
import os
|
|
10
|
+
import sys
|
|
11
|
+
|
|
12
|
+
sys.path.insert(0, os.path.dirname(__file__))
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
import scipy.sparse as sp
|
|
16
|
+
|
|
17
|
+
# Try fast-pagerank first, fall back to scipy power iteration
|
|
18
|
+
try:
|
|
19
|
+
from fast_pagerank import pagerank_power
|
|
20
|
+
HAS_FAST_PAGERANK = True
|
|
21
|
+
except ImportError:
|
|
22
|
+
HAS_FAST_PAGERANK = False
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def build_adjacency_matrix(conn):
    """Assemble the symmetric file-to-file adjacency matrix from ``file_edges``.

    Every row of ``files`` (ordered by id) is assigned one matrix index.
    Edges whose source or target id is not present in ``files`` are ignored.

    Returns a 3-tuple ``(matrix, file_id_to_idx, idx_to_file_id)``:
    - matrix: scipy CSR matrix equal to ``A + A.T`` for the directed edge
      matrix ``A``; ``None`` when the ``files`` table is empty
    - file_id_to_idx: dict mapping file id -> matrix index
    - idx_to_file_id: dict mapping matrix index -> file id
    """
    file_rows = conn.execute("SELECT id FROM files ORDER BY id").fetchall()
    if len(file_rows) == 0:
        return None, {}, {}

    # Matrix positions follow the id ordering of the query above.
    file_id_to_idx = {}
    idx_to_file_id = {}
    for position, record in enumerate(file_rows):
        file_id_to_idx[record['id']] = position
        idx_to_file_id[position] = record['id']
    size = len(file_rows)

    edge_rows = conn.execute(
        "SELECT source_file_id, target_file_id, weight FROM file_edges"
    ).fetchall()
    if len(edge_rows) == 0:
        return sp.csr_matrix((size, size)), file_id_to_idx, idx_to_file_id

    # Translate ids to indices and keep only edges with two known endpoints.
    sources, targets, weights = [], [], []
    for record in edge_rows:
        source = file_id_to_idx.get(record['source_file_id'])
        target = file_id_to_idx.get(record['target_file_id'])
        if source is None or target is None:
            continue
        sources.append(source)
        targets.append(target)
        weights.append(record['weight'])

    # Directed matrix first; adding its transpose makes the graph undirected.
    directed = sp.csr_matrix((weights, (sources, targets)), shape=(size, size))
    return directed + directed.T, file_id_to_idx, idx_to_file_id
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def build_personalization(
    conn, seed_files=None, seed_keywords=None, seed_symbols=None, file_id_to_idx=None
):
    """Build the personalization (teleport) vector for PageRank.

    Every file starts with a base weight of 1, which is multiplied by a
    boost for each seed that selects it. Boosts compound across seeds and
    channels:

    - seed_files: a file whose path equals the seed gets 100x
    - seed_keywords: a file owning a symbol matched by the FTS5 query gets
      30x, applied once per keyword no matter how many of its symbols match
    - seed_symbols: files defining the symbol get 80x, files referencing
      it get 40x

    Args:
        conn: database connection exposing execute(); rows must support
            access by column name.
        seed_files: optional iterable of file paths.
        seed_keywords: optional iterable of FTS5 query strings.
        seed_symbols: optional iterable of exact symbol names.
        file_id_to_idx: dict mapping file_id -> matrix position.

    Returns:
        Numpy array of length len(file_id_to_idx) normalized to sum to 1.0,
        or None when file_id_to_idx is empty or not provided.
    """
    # Covers both the None default and an empty mapping
    if not file_id_to_idx:
        return None

    pers = np.ones(len(file_id_to_idx))  # Base: uniform weight of 1

    # Channel 1: Seed files (100x)
    if seed_files:
        for fpath in seed_files:
            file_row = conn.execute(
                "SELECT id FROM files WHERE path=?", (fpath,)
            ).fetchone()
            if file_row and file_row['id'] in file_id_to_idx:
                pers[file_id_to_idx[file_row['id']]] *= 100

    # Channel 2: Seed keywords via FTS5 (30x)
    if seed_keywords:
        for kw in seed_keywords:
            try:
                fts_results = conn.execute(
                    "SELECT s.file_id FROM symbols_fts fts "
                    "JOIN symbols s ON s.id = fts.rowid "
                    "WHERE symbols_fts MATCH ? LIMIT 50",
                    (kw,),
                ).fetchall()
            except Exception:
                continue  # FTS query syntax errors are non-fatal
            # The query yields one row per matching symbol. De-duplicate so a
            # file with k matching symbols is boosted 30x, not 30**k (which
            # can overflow to inf and turn the normalized vector into NaN).
            for file_id in {row['file_id'] for row in fts_results}:
                if file_id in file_id_to_idx:
                    pers[file_id_to_idx[file_id]] *= 30

    # Channel 3: Seed symbols (80x defining, 40x referencing)
    if seed_symbols:
        for sym_name in seed_symbols:
            # Defining files get 80x
            def_files = conn.execute(
                "SELECT DISTINCT file_id FROM symbols WHERE name=?", (sym_name,)
            ).fetchall()
            for row in def_files:
                if row['file_id'] in file_id_to_idx:
                    pers[file_id_to_idx[row['file_id']]] *= 80

            # Referencing files get 40x
            ref_files = conn.execute(
                "SELECT DISTINCT file_id FROM refs WHERE symbol_name=?", (sym_name,)
            ).fetchall()
            for row in ref_files:
                if row['file_id'] in file_id_to_idx:
                    pers[file_id_to_idx[row['file_id']]] *= 40

    # Normalize to sum to 1
    total = pers.sum()
    if total > 0:
        pers /= total

    return pers
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def run_pagerank(adj_matrix, personalization=None, damping=0.85, max_iter=100, tol=1e-6):
    """Run (optionally personalized) PageRank on the adjacency matrix.

    Uses fast-pagerank when it is installed and a personalization vector is
    given; otherwise, or if that call raises, falls back to a scipy/numpy
    power iteration.

    Args:
        adj_matrix: square scipy sparse matrix of edge weights. The fallback
            normalizes columns, i.e. it treats entry [i, j] as weight flowing
            from j to i.
        personalization: optional teleport vector of length n. The fallback
            rescales it to sum to 1 when its sum is positive. None means a
            uniform teleport vector.
        damping: probability of following an edge rather than teleporting.
        max_iter: maximum number of power iterations.
        tol: convergence threshold on the L1 change between iterations.

    Returns:
        Numpy array of scores indexed by matrix position (empty when n == 0).
    """
    n = adj_matrix.shape[0]
    if n == 0:
        return np.array([])

    if HAS_FAST_PAGERANK and personalization is not None:
        try:
            # Forward max_iter too so both code paths honor the same limit
            return pagerank_power(
                adj_matrix,
                p=damping,
                max_iter=max_iter,
                personalize=personalization,
                tol=tol,
            )
        except Exception:
            pass  # Fall through to scipy implementation

    # Scipy power iteration fallback
    # Column sums are computed once and reused for both the dangling-node
    # mask and the column normalization.
    col_sums = np.asarray(adj_matrix.sum(axis=0)).ravel()
    dangling_mask = col_sums == 0
    # Dangling columns divide by 1 instead of 0; they stay all-zero in M
    safe_sums = np.where(dangling_mask, 1, col_sums)

    # Transition matrix: M[i,j] = A[i,j] / col_sum[j]
    M = adj_matrix @ sp.diags(1.0 / safe_sums)

    # Teleport vector
    if personalization is not None:
        v = np.asarray(personalization, dtype=float).ravel()
        total = v.sum()
        if total > 0:
            # Keep scores a probability distribution even if the caller
            # passed raw (unnormalized) weights
            v = v / total
    else:
        v = np.ones(n) / n

    scores = v.copy()
    has_dangling = dangling_mask.any()

    for _ in range(max_iter):
        prev = scores  # scores is rebound below, never mutated in place

        # PageRank iteration with dangling-node redistribution: the mass
        # sitting on dangling nodes is re-injected along the teleport vector
        dangling_sum = scores[dangling_mask].sum() if has_dangling else 0
        scores = damping * (M @ scores) + damping * dangling_sum * v + (1 - damping) * v

        # Check convergence via L1 norm
        if np.abs(scores - prev).sum() < tol:
            break

    return scores
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def rank_files(conn, seed_files=None, seed_keywords=None, seed_symbols=None):
    """Rank all files by structural importance with personalization.

    Builds the adjacency matrix, derives the personalization vector from
    the given seeds, runs PageRank, and maps the scores back to file paths.

    Args:
        conn: database connection exposing execute(); rows must support
            access by column name.
        seed_files: optional iterable of file paths to boost.
        seed_keywords: optional iterable of FTS5 query strings to boost.
        seed_symbols: optional iterable of symbol names to boost.

    Returns:
        List of (file_path, score) tuples sorted by score, highest first.
        Empty when there are no files.
    """
    adj, fid_to_idx, idx_to_fid = build_adjacency_matrix(conn)
    if adj is None or adj.shape[0] == 0:
        return []

    pers = build_personalization(
        conn, seed_files, seed_keywords, seed_symbols, fid_to_idx
    )
    scores = run_pagerank(adj, pers)

    # Fetch every path in a single query instead of one lookup per file
    path_by_id = {
        row['id']: row['path']
        for row in conn.execute("SELECT id, path FROM files").fetchall()
    }

    # Map scores back to file paths; ids that no longer exist are skipped
    results = [
        (path_by_id[idx_to_fid[idx]], float(score))
        for idx, score in enumerate(scores)
        if idx_to_fid[idx] in path_by_id
    ]

    results.sort(key=lambda item: item[1], reverse=True)
    return results
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def fit_to_budget(ranked_files, conn, token_budget):
    """Select the top-ranked files (with their symbols) that fit a token budget.

    Files are taken in ranking order for as long as the running token total
    stays within token_budget plus a 15% tolerance (Aider's approach).
    Token estimation: ~25 tokens per symbol entry.

    The first ranked file is always selected, even if it alone exceeds the
    tolerance. Paths that are not present in the files table cost nothing
    and are left out of the result.

    Because the running total never decreases, a single pass yields the
    same selection as searching for the longest prefix that fits, while
    issuing queries only for the files actually examined.

    Args:
        ranked_files: list of (file_path, score) tuples, best first.
        conn: database connection exposing execute(); rows must support
            access by column name.
        token_budget: target number of tokens; values <= 0 select nothing.

    Returns:
        (result_list, total_tokens) where result_list contains dicts:
        [{path, score, symbols: [name, ...], tokens}]
    """
    if not ranked_files or token_budget <= 0:
        return [], 0

    tokens_per_symbol = 25  # ~25 tokens per tag entry (Aider's estimate)
    tolerance_budget = token_budget * 1.15

    result = []
    total_tokens = 0
    for position, (fpath, score) in enumerate(ranked_files):
        file_row = conn.execute(
            "SELECT id FROM files WHERE path=?", (fpath,)
        ).fetchone()
        if not file_row:
            continue
        syms = conn.execute(
            "SELECT name FROM symbols WHERE file_id=? ORDER BY line_start",
            (file_row['id'],),
        ).fetchall()
        sym_names = [s['name'] for s in syms]
        entry_tokens = len(sym_names) * tokens_per_symbol

        # Position 0 is exempt so that at least one file is always returned
        if position > 0 and total_tokens + entry_tokens > tolerance_budget:
            break

        total_tokens += entry_tokens
        result.append({
            "path": fpath,
            "score": round(score, 6),
            "symbols": sym_names,
            "tokens": entry_tokens,
        })

    return result, total_tokens
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
# ---------------------------------------------------------------------------
|
|
298
|
+
# Smoke test
|
|
299
|
+
# ---------------------------------------------------------------------------
|
|
300
|
+
|
|
301
|
+
if __name__ == "__main__":
    # Smoke test: builds a tiny three-file graph in a throwaway database and
    # exercises ranking, personalization and budget fitting end to end.
    import tempfile
    from db import (
        get_connection,
        add_file,
        add_symbol,
        add_reference,
        rebuild_file_edges,
        rebuild_symbol_edges,
    )

    print("Running ranker.py smoke tests ...")

    with tempfile.TemporaryDirectory() as tmp:
        conn = get_connection(tmp)
        try:
            # Create a small graph: 3 files with cross-references
            f1 = add_file(conn, "src/auth.py", "python", 1.0, line_count=50)
            f2 = add_file(conn, "src/api.py", "python", 1.0, line_count=100)
            f3 = add_file(conn, "src/utils.py", "python", 1.0, line_count=30)

            # Symbols
            add_symbol(
                conn, f1, "authenticate", "definition", 1, 20,
                signature="def authenticate(req)",
            )
            add_symbol(conn, f1, "verify_token", "definition", 25, 40)
            add_symbol(
                conn, f2, "handle_request", "definition", 1, 50,
                signature="def handle_request(req)",
            )
            add_symbol(conn, f3, "format_date", "definition", 1, 10)
            add_symbol(conn, f3, "parse_config", "definition", 15, 25)

            # References: api.py references auth.py functions, and utils.py
            add_reference(conn, f2, "authenticate", 10)
            add_reference(conn, f2, "verify_token", 15)
            add_reference(conn, f2, "format_date", 20)
            add_reference(conn, f2, "parse_config", 25)
            # auth.py also references utils
            add_reference(conn, f1, "parse_config", 30)

            # Materialize edges
            rebuild_file_edges(conn)
            conn.commit()

            # Test 1: Uniform PageRank
            results = rank_files(conn)
            print(f"  Uniform PageRank: {len(results)} files ranked")
            for path, score in results:
                print(f"    {path}: {score:.6f}")
            assert len(results) == 3

            # Test 2: Personalized -- seed auth.py
            results_pers = rank_files(conn, seed_files=["src/auth.py"])
            print(f"  Personalized (seed auth.py): {len(results_pers)} files")
            for path, score in results_pers:
                print(f"    {path}: {score:.6f}")
            # auth.py should be ranked higher with personalization
            auth_score = next(s for p, s in results_pers if p == "src/auth.py")
            auth_uniform = next(s for p, s in results if p == "src/auth.py")
            print(f"  Auth personalized boost: {auth_score:.6f} vs {auth_uniform:.6f}")
            # Actually enforce the expectation instead of only printing it
            assert auth_score > auth_uniform

            # Test 3: Budget fitting
            budget_result, total_tokens = fit_to_budget(results, conn, 200)
            print(f"  Budget fit (200 tokens): {len(budget_result)} files, {total_tokens} tokens")
            assert total_tokens <= 200 * 1.15  # 15% tolerance

            # Test 4: Keyword personalization
            results_kw = rank_files(conn, seed_keywords=["authenticate"])
            print(f"  Keyword personalized: {len(results_kw)} files")

            # Test 5: Symbol personalization
            results_sym = rank_files(conn, seed_symbols=["authenticate"])
            print(f"  Symbol personalized: {len(results_sym)} files")
        finally:
            # Release the database before the temporary directory is removed;
            # an open handle can make the cleanup fail on some platforms.
            conn.close()

    print("\nAll ranker smoke tests passed.")
|