@unerr-ai/unerr 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -0
- package/dist/cli.js +37236 -35793
- package/package.json +1 -1
- package/dist/behaviors/agent-llm-bridge.js +0 -166
- package/dist/behaviors/architecture-guard.js +0 -256
- package/dist/behaviors/auto-doc.js +0 -247
- package/dist/behaviors/cascade-guard.js +0 -289
- package/dist/behaviors/change-narrative.js +0 -270
- package/dist/behaviors/convention-drift.js +0 -290
- package/dist/behaviors/framework.js +0 -235
- package/dist/behaviors/guard-formatter.js +0 -44
- package/dist/behaviors/incomplete-work.js +0 -270
- package/dist/behaviors/loop-breaker.js +0 -300
- package/dist/behaviors/session-continuity.js +0 -208
- package/dist/commands/branches.js +0 -97
- package/dist/commands/check-commit.js +0 -225
- package/dist/commands/compress-output.js +0 -64
- package/dist/commands/config-verify.js +0 -243
- package/dist/commands/daemon.js +0 -905
- package/dist/commands/dashboard.js +0 -52
- package/dist/commands/debug.js +0 -200
- package/dist/commands/enrich.js +0 -184
- package/dist/commands/exec.js +0 -233
- package/dist/commands/gain.js +0 -156
- package/dist/commands/hook.js +0 -88
- package/dist/commands/index.js +0 -88
- package/dist/commands/init.js +0 -74
- package/dist/commands/install.js +0 -505
- package/dist/commands/learn.js +0 -116
- package/dist/commands/manifest.js +0 -193
- package/dist/commands/rewind.js +0 -103
- package/dist/commands/serve.js +0 -19
- package/dist/commands/setup-wizard.js +0 -414
- package/dist/commands/skills.js +0 -64
- package/dist/commands/stats.js +0 -20
- package/dist/commands/status.js +0 -654
- package/dist/commands/timeline.js +0 -139
- package/dist/commands/uninstall.js +0 -230
- package/dist/components/App.js +0 -109
- package/dist/components/Banner.js +0 -12
- package/dist/components/ConfirmPrompt.js +0 -25
- package/dist/components/DriftSummary.js +0 -23
- package/dist/components/GradeBadge.js +0 -15
- package/dist/components/HealthCard.js +0 -18
- package/dist/components/InkSpinner.js +0 -22
- package/dist/components/InputBox.js +0 -17
- package/dist/components/KeyValue.js +0 -13
- package/dist/components/MessageList.js +0 -14
- package/dist/components/ProgressBar.js +0 -26
- package/dist/components/Section.js +0 -16
- package/dist/components/SessionSummaryCard.js +0 -73
- package/dist/components/StartupDisplay.js +0 -24
- package/dist/components/StatusDashboard.js +0 -57
- package/dist/components/StatusLine.js +0 -8
- package/dist/components/StepLine.js +0 -22
- package/dist/components/Theme.js +0 -20
- package/dist/components/ToolProgress.js +0 -8
- package/dist/components/ViolationList.js +0 -21
- package/dist/components/render.js +0 -13
- package/dist/config/agent-registry.js +0 -237
- package/dist/config/claude-settings-hooks.js +0 -304
- package/dist/config/hook-installer.js +0 -65
- package/dist/config/instruction-writer.js +0 -388
- package/dist/config/mcp-config-writer.js +0 -266
- package/dist/config/settings.js +0 -174
- package/dist/config/tool-detector.js +0 -42
- package/dist/config/value-surfacing.js +0 -119
- package/dist/core/context-assembly.js +0 -108
- package/dist/core/conversation.js +0 -33
- package/dist/core/local-chat-provider.js +0 -475
- package/dist/core/provider-factory.js +0 -55
- package/dist/core/providers.js +0 -90
- package/dist/core/query-engine.js +0 -174
- package/dist/daemon/api.js +0 -312
- package/dist/daemon/autostart.js +0 -119
- package/dist/daemon/bootstrap.js +0 -39
- package/dist/daemon/client.js +0 -164
- package/dist/daemon/detect-ci.js +0 -81
- package/dist/daemon/platform-linux.js +0 -146
- package/dist/daemon/platform-macos.js +0 -134
- package/dist/daemon/platform-windows.js +0 -116
- package/dist/daemon/process-manager.js +0 -299
- package/dist/daemon/protocol.js +0 -23
- package/dist/daemon/registry.js +0 -270
- package/dist/daemon/settings-schema.js +0 -72
- package/dist/daemon/system-health.js +0 -134
- package/dist/daemon/version-checker.js +0 -262
- package/dist/daemon/warm-start.js +0 -223
- package/dist/entrypoints/cli.js +0 -1043
- package/dist/entrypoints/daemon.js +0 -380
- package/dist/entrypoints/repl.js +0 -147
- package/dist/hooks/adapters/claude-code.js +0 -90
- package/dist/hooks/adapters/cline.js +0 -100
- package/dist/hooks/adapters/cursor.js +0 -98
- package/dist/hooks/hook-dedup.js +0 -79
- package/dist/hooks/hook-runner.js +0 -113
- package/dist/hooks/navigation-hooks.js +0 -175
- package/dist/hooks/prompt-hooks.js +0 -63
- package/dist/hooks/shell-hooks.js +0 -47
- package/dist/ignore.js +0 -111
- package/dist/intelligence/approach-suggester.js +0 -61
- package/dist/intelligence/ast-extractor.js +0 -2615
- package/dist/intelligence/ast-worker.js +0 -34
- package/dist/intelligence/background-indexer.js +0 -121
- package/dist/intelligence/blast-radius.js +0 -200
- package/dist/intelligence/community-detection.js +0 -691
- package/dist/intelligence/community-detector.js +0 -184
- package/dist/intelligence/computation-scheduler.js +0 -75
- package/dist/intelligence/confidence-propagation.js +0 -47
- package/dist/intelligence/convention-detector.js +0 -242
- package/dist/intelligence/convention-learner.js +0 -205
- package/dist/intelligence/convention-matcher.js +0 -205
- package/dist/intelligence/cozo-schema.js +0 -376
- package/dist/intelligence/decision-point-detector.js +0 -90
- package/dist/intelligence/deep-dive-tools.js +0 -586
- package/dist/intelligence/durability-scorer.js +0 -84
- package/dist/intelligence/exploration-cost.js +0 -204
- package/dist/intelligence/exploration-pattern-tracker.js +0 -61
- package/dist/intelligence/fact-generator.js +0 -322
- package/dist/intelligence/facts-schema.js +0 -90
- package/dist/intelligence/file-intelligence.js +0 -59
- package/dist/intelligence/graph-holder.js +0 -220
- package/dist/intelligence/graph-temporal-joiner.js +0 -238
- package/dist/intelligence/health-grade.js +0 -423
- package/dist/intelligence/health-grader.js +0 -200
- package/dist/intelligence/health-map-data.js +0 -259
- package/dist/intelligence/import-symbols.js +0 -136
- package/dist/intelligence/incremental-indexer.js +0 -658
- package/dist/intelligence/indexer/centrality.js +0 -62
- package/dist/intelligence/indexer/cfg-context.js +0 -95
- package/dist/intelligence/indexer/confidence.js +0 -34
- package/dist/intelligence/indexer/cross-file-resolver.js +0 -104
- package/dist/intelligence/indexer/edge-repair.js +0 -89
- package/dist/intelligence/indexer/entity-key.js +0 -17
- package/dist/intelligence/indexer/export-map.js +0 -132
- package/dist/intelligence/indexer/git-cochange.js +0 -128
- package/dist/intelligence/indexer/graph-patch.js +0 -147
- package/dist/intelligence/indexer/incremental.js +0 -78
- package/dist/intelligence/indexer/ingest.js +0 -160
- package/dist/intelligence/indexer/language-detect.js +0 -226
- package/dist/intelligence/indexer/metadata.js +0 -63
- package/dist/intelligence/indexer/mutation-tracker.js +0 -79
- package/dist/intelligence/indexer/orchestrator.js +0 -155
- package/dist/intelligence/indexer/plugin-interface.js +0 -31
- package/dist/intelligence/indexer/plugins/csharp.js +0 -440
- package/dist/intelligence/indexer/plugins/go.js +0 -335
- package/dist/intelligence/indexer/plugins/java.js +0 -370
- package/dist/intelligence/indexer/plugins/python.js +0 -358
- package/dist/intelligence/indexer/plugins/regex-fallback.js +0 -82
- package/dist/intelligence/indexer/plugins/ruby.js +0 -290
- package/dist/intelligence/indexer/plugins/rust.js +0 -484
- package/dist/intelligence/indexer/plugins/tier2-generic.js +0 -310
- package/dist/intelligence/indexer/plugins/typescript.js +0 -456
- package/dist/intelligence/indexer/resource-monitor.js +0 -93
- package/dist/intelligence/indexer/scip/decoder.js +0 -253
- package/dist/intelligence/indexer/scip/detector.js +0 -232
- package/dist/intelligence/indexer/scip/downloader.js +0 -427
- package/dist/intelligence/indexer/scip/fallback.js +0 -34
- package/dist/intelligence/indexer/scip/merger.js +0 -109
- package/dist/intelligence/indexer/scip/orchestrator.js +0 -433
- package/dist/intelligence/indexer/scip/runner.js +0 -98
- package/dist/intelligence/indexer/snapshot.js +0 -66
- package/dist/intelligence/indexer/test-detector.js +0 -196
- package/dist/intelligence/indexer/watch-integration.js +0 -61
- package/dist/intelligence/indexer/worker.js +0 -85
- package/dist/intelligence/local-convention-detector.js +0 -437
- package/dist/intelligence/local-embeddings.js +0 -190
- package/dist/intelligence/local-graph.js +0 -1946
- package/dist/intelligence/local-indexer.js +0 -1575
- package/dist/intelligence/local-llm.js +0 -163
- package/dist/intelligence/local-rule-generator.js +0 -154
- package/dist/intelligence/local-snapshot.js +0 -213
- package/dist/intelligence/negative-knowledge.js +0 -103
- package/dist/intelligence/persistent-db.js +0 -85
- package/dist/intelligence/query-router.js +0 -2556
- package/dist/intelligence/risk-classifier.js +0 -116
- package/dist/intelligence/rule-evaluator.js +0 -380
- package/dist/intelligence/rule-generator.js +0 -49
- package/dist/intelligence/search-index.js +0 -173
- package/dist/intelligence/semantic/docstring-extractor.js +0 -67
- package/dist/intelligence/semantic/embedding-store.js +0 -52
- package/dist/intelligence/semantic/enrichment-orchestrator.js +0 -48
- package/dist/intelligence/semantic/git-message-miner.js +0 -114
- package/dist/intelligence/semantic/identifier-tokenizer.js +0 -51
- package/dist/intelligence/semantic/node2vec-embeddings.js +0 -71
- package/dist/intelligence/semantic/node2vec-walks.js +0 -103
- package/dist/intelligence/semantic/path-domain-inference.js +0 -112
- package/dist/intelligence/semantic/similarity-engine.js +0 -60
- package/dist/intelligence/semantic/tfidf-vectors.js +0 -88
- package/dist/intelligence/session-brief-builder.js +0 -159
- package/dist/intelligence/session-context.js +0 -221
- package/dist/intelligence/session-health-monitor.js +0 -211
- package/dist/intelligence/session-narrative.js +0 -197
- package/dist/intelligence/session-pattern-analyzer.js +0 -218
- package/dist/intelligence/signal-scorer.js +0 -390
- package/dist/intelligence/signal-show-store.js +0 -182
- package/dist/intelligence/smart-truncate.js +0 -158
- package/dist/intelligence/subgraph-cache.js +0 -88
- package/dist/intelligence/temporal-facts.js +0 -494
- package/dist/intelligence/token-estimator.js +0 -100
- package/dist/intelligence/tool-injector.js +0 -87
- package/dist/intelligence/tree-sitter-loader.js +0 -71
- package/dist/intelligence/worker-pool.js +0 -116
- package/dist/proxy/arg-validator.js +0 -79
- package/dist/proxy/auto-bootstrap.js +0 -167
- package/dist/proxy/bridge.js +0 -147
- package/dist/proxy/budget-enforcer.js +0 -70
- package/dist/proxy/compression-quality-monitor.js +0 -160
- package/dist/proxy/compression-stats.js +0 -51
- package/dist/proxy/context-rot-detector.js +0 -137
- package/dist/proxy/drift-detector.js +0 -139
- package/dist/proxy/efficiency-tracker.js +0 -79
- package/dist/proxy/fact-ranking.js +0 -154
- package/dist/proxy/format-encoder.js +0 -266
- package/dist/proxy/http-transport.js +0 -90
- package/dist/proxy/lifecycle-actor.js +0 -55
- package/dist/proxy/lifecycle-machine.js +0 -187
- package/dist/proxy/log-tailer.js +0 -265
- package/dist/proxy/model-pricing.js +0 -98
- package/dist/proxy/network-firewall.js +0 -141
- package/dist/proxy/nudge-state.js +0 -93
- package/dist/proxy/output-compressor.js +0 -185
- package/dist/proxy/pid-lock.js +0 -291
- package/dist/proxy/proxy-context.js +0 -11
- package/dist/proxy/proxy.js +0 -2633
- package/dist/proxy/response-enrichment.js +0 -32
- package/dist/proxy/response-envelope.js +0 -313
- package/dist/proxy/session-dedup.js +0 -82
- package/dist/proxy/session-legend.js +0 -30
- package/dist/proxy/session-persistence.js +0 -210
- package/dist/proxy/session-resume.js +0 -94
- package/dist/proxy/session-stats.js +0 -513
- package/dist/proxy/shell-classifier.js +0 -1346
- package/dist/proxy/shell-compression-log.js +0 -93
- package/dist/proxy/shell-compressor.js +0 -390
- package/dist/proxy/shell-graph-boost.js +0 -202
- package/dist/proxy/shell-monitor-map.js +0 -18
- package/dist/proxy/shell-stats.js +0 -54
- package/dist/proxy/shell-strategies/cloud.js +0 -215
- package/dist/proxy/shell-strategies/diff.js +0 -159
- package/dist/proxy/shell-strategies/error-diagnostic.js +0 -796
- package/dist/proxy/shell-strategies/filter-dsl.js +0 -358
- package/dist/proxy/shell-strategies/git-status.js +0 -177
- package/dist/proxy/shell-strategies/key-value.js +0 -193
- package/dist/proxy/shell-strategies/log-text.js +0 -154
- package/dist/proxy/shell-strategies/omni.js +0 -188
- package/dist/proxy/shell-strategies/progress.js +0 -55
- package/dist/proxy/shell-strategies/redact.js +0 -76
- package/dist/proxy/shell-strategies/structured.js +0 -241
- package/dist/proxy/shell-strategies/tabular.js +0 -243
- package/dist/proxy/shell-strategies/test-results-types.js +0 -13
- package/dist/proxy/shell-strategies/test-results.js +0 -784
- package/dist/proxy/shell-strategies/tree-paths.js +0 -144
- package/dist/proxy/shell-strategies/yaml.js +0 -182
- package/dist/proxy/shell-tee.js +0 -111
- package/dist/proxy/signal-dedup.js +0 -171
- package/dist/proxy/startup-renderer.js +0 -158
- package/dist/proxy/task-token-display.js +0 -38
- package/dist/proxy/token-counter.js +0 -61
- package/dist/proxy/tool-clusters.js +0 -273
- package/dist/proxy/tool-definitions.js +0 -525
- package/dist/proxy/transport-mux.js +0 -229
- package/dist/proxy/wire-cap.js +0 -268
- package/dist/rules/developer.mozilla.org.json +0 -9
- package/dist/rules/github.com.json +0 -21
- package/dist/schemas/api/skills.js +0 -19
- package/dist/schemas/common/errors.js +0 -7
- package/dist/schemas/common/headers.js +0 -5
- package/dist/schemas/entities/edge.js +0 -25
- package/dist/schemas/entities/entity.js +0 -22
- package/dist/schemas/entities/rule.js +0 -18
- package/dist/schemas/index.js +0 -14
- package/dist/server/event-bus.js +0 -59
- package/dist/server/http.js +0 -156
- package/dist/server/middleware.js +0 -70
- package/dist/server/routes/drift.js +0 -97
- package/dist/server/routes/intelligence.js +0 -1217
- package/dist/server/routes/reasoning-quality.js +0 -444
- package/dist/server/routes/session.js +0 -86
- package/dist/server/routes/stream.js +0 -120
- package/dist/server/routes/system.js +0 -73
- package/dist/server/routes/temporal.js +0 -170
- package/dist/server/routes/timeline.js +0 -232
- package/dist/server/routes/token-flow.js +0 -403
- package/dist/skills/effectiveness-tracker.js +0 -93
- package/dist/skills/local-pack.js +0 -380
- package/dist/skills/resolver.js +0 -495
- package/dist/state-detector.js +0 -83
- package/dist/timeline/intent-detector.js +0 -263
- package/dist/timeline/loop-miner.js +0 -140
- package/dist/timeline/open-threads.js +0 -49
- package/dist/timeline/signal-reinforcer.js +0 -62
- package/dist/timeline/timeline-bootstrap.js +0 -151
- package/dist/timeline/timeline-store.js +0 -618
- package/dist/tools/coding/bash.js +0 -49
- package/dist/tools/coding/file-edit.js +0 -72
- package/dist/tools/coding/file-outline.js +0 -227
- package/dist/tools/coding/file-read-protocol.js +0 -425
- package/dist/tools/coding/file-read.js +0 -35
- package/dist/tools/coding/file-write.js +0 -43
- package/dist/tools/coding/glob-tool.js +0 -109
- package/dist/tools/coding/grep.js +0 -162
- package/dist/tools/coding/index.js +0 -27
- package/dist/tools/intelligence/index.js +0 -269
- package/dist/tools/intelligence/record-fact.js +0 -48
- package/dist/tools/intelligence/timeline-markers.js +0 -130
- package/dist/tools/registry.js +0 -47
- package/dist/tools/types.js +0 -8
- package/dist/tracking/auto-snapshot-triggers.js +0 -246
- package/dist/tracking/branch-context.js +0 -115
- package/dist/tracking/branch-snapshot.js +0 -217
- package/dist/tracking/causal-bridge.js +0 -317
- package/dist/tracking/circuit-breaker.js +0 -147
- package/dist/tracking/commit-watcher.js +0 -114
- package/dist/tracking/context-ledger.js +0 -119
- package/dist/tracking/correction-detector.js +0 -324
- package/dist/tracking/drift-tracker.js +0 -874
- package/dist/tracking/durability-tracker.js +0 -94
- package/dist/tracking/entity-rewind.js +0 -200
- package/dist/tracking/file-hash-state.js +0 -114
- package/dist/tracking/git-attribution.js +0 -132
- package/dist/tracking/git-trailers.js +0 -171
- package/dist/tracking/intelligence-counter.js +0 -46
- package/dist/tracking/intent-correlator.js +0 -202
- package/dist/tracking/intent-encoder.js +0 -52
- package/dist/tracking/intent-token-tracker.js +0 -159
- package/dist/tracking/ledger-archiver.js +0 -94
- package/dist/tracking/ledger-chains.js +0 -245
- package/dist/tracking/metrics-store.js +0 -361
- package/dist/tracking/native-watcher.js +0 -131
- package/dist/tracking/offline-rewind.js +0 -295
- package/dist/tracking/pending-violations.js +0 -74
- package/dist/tracking/persistence-effectiveness.js +0 -167
- package/dist/tracking/prompt-durability.js +0 -202
- package/dist/tracking/quality-signals.js +0 -213
- package/dist/tracking/redactor.js +0 -73
- package/dist/tracking/rewind-engine.js +0 -161
- package/dist/tracking/session-history.js +0 -128
- package/dist/tracking/session-receipt.js +0 -88
- package/dist/tracking/session-summary-writer.js +0 -157
- package/dist/tracking/shadow-ledger.js +0 -321
- package/dist/tracking/stash-manager.js +0 -258
- package/dist/tracking/timeline-fork.js +0 -213
- package/dist/tracking/timeline.js +0 -69
- package/dist/tracking/token-flow.js +0 -276
- package/dist/tracking/turn-segmenter.js +0 -122
- package/dist/tracking/weekly-accumulator.js +0 -179
- package/dist/tracking/working-snapshots.js +0 -188
- package/dist/tracking/workspace-manifest.js +0 -176
- package/dist/transport/http.js +0 -102
- package/dist/utils/counterfactual.js +0 -65
- package/dist/utils/deep-link.js +0 -34
- package/dist/utils/detect.js +0 -193
- package/dist/utils/exec.js +0 -73
- package/dist/utils/file-logger.js +0 -87
- package/dist/utils/format-error.js +0 -29
- package/dist/utils/git.js +0 -181
- package/dist/utils/log.js +0 -57
- package/dist/utils/logger.js +0 -35
- package/dist/utils/mcp-content-json.js +0 -8
- package/dist/utils/session-logger.js +0 -154
- package/dist/utils/startup-log.js +0 -512
- package/dist/utils/ui.js +0 -56
|
@@ -1,173 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Local search index using CozoDB relations.
|
|
3
|
-
*
|
|
4
|
-
* Tokenizes entity names (camelCase, snake_case, PascalCase) and stores
|
|
5
|
-
* as search_tokens relation for fast local text search.
|
|
6
|
-
*
|
|
7
|
-
* Sprint 6.6: IDF-weighted scoring — rare tokens rank higher than common ones.
|
|
8
|
-
* Pre-computes IDF weights during buildSearchIndex() for O(1) lookup at query time.
|
|
9
|
-
*/
|
|
10
|
-
/**
 * Break an identifier into unique, lower-cased search tokens.
 *
 * Word boundaries are taken from camelCase / PascalCase transitions and from
 * any run of characters outside [a-zA-Z0-9] (so snake_case and kebab-case
 * split as well). Tokens are returned in first-seen order.
 *
 * @param {string} name - Identifier or free-text query to split.
 * @returns {string[]} De-duplicated lower-case tokens.
 */
export function tokenize(name) {
    // "fooBar" -> "foo Bar": break where a lower-case letter meets an upper-case one.
    const camelSplit = name.replace(/([a-z])([A-Z])/g, "$1 $2");
    // "XMLHttp" -> "XML Http": break an acronym from the capitalised word after it.
    const acronymSplit = camelSplit.replace(/([A-Z]+)([A-Z][a-z])/g, "$1 $2");
    const seen = new Set();
    for (const piece of acronymSplit.split(/[^a-zA-Z0-9]+/)) {
        if (piece.length > 0) {
            seen.add(piece.toLowerCase());
        }
    }
    return Array.from(seen);
}
|
|
24
|
-
/**
 * Build the search index from the entities already stored in the database.
 *
 * Reads every (key, name) pair from `entities`, tokenizes the name, writes one
 * `search_tokens` row per (token, entity) pair, then writes the document
 * frequency and the pre-computed IDF weight of every token to
 * `token_doc_frequency`.
 *
 * The build is best-effort: a failed write never rejects the returned promise,
 * but failures are counted and reported on stderr instead of being dropped.
 *
 * @param {{ run: Function }} db - Database handle; `run(script, params)` must
 *   resolve to an object with a `rows` array.
 * @returns {Promise<void>}
 */
export async function buildSearchIndex(db) {
    let result;
    try {
        result = await db.run("?[key, name] := *entities{key, name}");
    }
    catch (err) {
        process.stderr.write(`[unerr:search-index] Failed to read entities: ${err instanceof Error ? err.message : JSON.stringify(err)}\n`);
        return; // Cannot build search index without entities
    }
    if (!result?.rows)
        return;
    const totalEntities = result.rows.length;
    process.stderr.write(`[unerr:search-index] Building index for ${totalEntities} entities\n`);
    // Document frequency: how many entities contain each token.
    const tokenDocCount = new Map();
    let tokenWriteErrors = 0;
    let skippedEntities = 0;
    for (const [key, name] of result.rows) {
        // A non-string name cannot be tokenized; skip that entity rather than
        // letting a TypeError abort the whole build.
        if (typeof name !== "string") {
            skippedEntities++;
            continue;
        }
        for (const token of tokenize(name)) {
            tokenDocCount.set(token, (tokenDocCount.get(token) ?? 0) + 1);
            try {
                await db.run("?[token, entity_key] <- [[$token, $key]] :put search_tokens { token, entity_key }", { token, key });
            }
            catch {
                // NOTE(review): `:put` is presumably an upsert, so an error here is
                // a real write failure rather than a duplicate — count it.
                tokenWriteErrors++;
            }
        }
    }
    // Store document frequencies + pre-computed IDF weights.
    let idfErrors = 0;
    for (const [token, docCount] of tokenDocCount) {
        const idf = totalEntities > 0 ? Math.log(totalEntities / Math.max(docCount, 1)) : 0;
        try {
            await db.run("?[token, doc_count, idf] <- [[$token, $dc, $idf]] :put token_doc_frequency { token => doc_count, idf }", { token, dc: docCount, idf });
        }
        catch {
            idfErrors++;
        }
    }
    if (tokenWriteErrors > 0 || skippedEntities > 0) {
        process.stderr.write(`[unerr:search-index] Warning: ${tokenWriteErrors} token write errors, ${skippedEntities} entities skipped (non-string name)\n`);
    }
    process.stderr.write(`[unerr:search-index] Done: ${tokenDocCount.size} tokens indexed, ${idfErrors} IDF errors\n`);
}
|
|
71
|
-
/**
 * Incrementally update the search index for a set of entities.
 *
 * Steps:
 *   1. Remove the `search_tokens` rows of every changed or deleted key.
 *   2. Look up the current names of the changed keys and re-insert their tokens.
 *   3. Recompute `token_doc_frequency` (doc count + IDF) for every token that
 *      still has at least one `search_tokens` row.
 *
 * Step 3 also runs when only deletions were passed, and when the name lookup
 * in step 2 fails, so removals do not leave IDF weights stale.
 * All database errors are swallowed on purpose: the update is best-effort and
 * search keeps working with whatever the index currently holds.
 *
 * @param {{ run: Function }} db - Database handle exposing `run(script, params)`.
 * @param {Set<string>} changedKeys - Keys of entities that were added or modified.
 * @param {Set<string>} deletedKeys - Keys of entities that were removed.
 * @returns {Promise<void>}
 */
export async function updateSearchIndexIncremental(db, changedKeys, deletedKeys) {
    if (changedKeys.size === 0 && deletedKeys.size === 0)
        return;
    const allKeys = new Set([...changedKeys, ...deletedKeys]);
    // Remove old tokens for all affected keys.
    for (const key of allKeys) {
        try {
            await db.run("?[token, entity_key] := *search_tokens{token, entity_key}, entity_key = $key :rm search_tokens { token, entity_key }", { key });
        }
        catch {
            /* best-effort: a failed removal must not abort the update */
        }
    }
    // Re-tokenize changed entities (deleted ones stay removed).
    if (changedKeys.size > 0) {
        // JSON.stringify escapes backslashes and control characters as well as
        // quotes, so an unusual key cannot break out of its string literal.
        const keyRows = [...changedKeys]
            .map((k) => `[${JSON.stringify(k)}]`)
            .join(", ");
        let rows = [];
        try {
            const result = await db.run(`
keys[k] <- [${keyRows}]
?[key, name] := keys[k], *entities{key: k, name}, key = k
`);
            rows = result?.rows ?? [];
        }
        catch {
            // Name lookup failed: nothing to re-insert, but still refresh IDF below.
            rows = [];
        }
        for (const [key, name] of rows) {
            for (const token of tokenize(name)) {
                try {
                    await db.run("?[token, entity_key] <- [[$token, $key]] :put search_tokens { token, entity_key }", { token, key });
                }
                catch {
                    /* best-effort */
                }
            }
        }
    }
    // Recompute IDF for all tokens (just counts + math).
    try {
        const totalResult = await db.run("?[count(key)] := *entities{key}");
        const totalEntities = totalResult.rows?.[0]?.[0] ?? 1;
        const tokenResult = await db.run("?[token, count(entity_key)] := *search_tokens{token, entity_key}");
        for (const [token, docCount] of tokenResult.rows ?? []) {
            const idf = Math.log(totalEntities / Math.max(docCount, 1));
            try {
                await db.run("?[token, doc_count, idf] <- [[$token, $dc, $idf]] :put token_doc_frequency { token => doc_count, idf }", { token, dc: docCount, idf });
            }
            catch {
                /* best-effort */
            }
        }
    }
    catch {
        /* IDF update failed — search still works, just with stale weights */
    }
}
|
|
142
|
-
/**
 * Search local entities by query string.
 *
 * The query is tokenized, every entity that matches ANY query token is
 * collected, and each entity is scored by the sum of the IDF weights of the
 * tokens it matched (so rare tokens contribute more). Results are ordered by
 * descending score.
 *
 * @param {{ run: Function }} db - Database handle exposing `run(script)`.
 * @param {string} query - Free-text query or identifier.
 * @param {number} [limit=20] - Maximum number of results. Fractions are
 *   floored; anything that is not a positive finite number yields `[]`.
 * @returns {Promise<Array<{key: *, name: *, kind: *, file_path: *, score: *}>>}
 *   Matches, best first; `[]` when nothing matches or the query fails.
 */
export async function searchLocal(db, query, limit = 20) {
    const queryTokens = tokenize(query);
    if (queryTokens.length === 0)
        return [];
    // `limit` is spliced into the query text, so reduce it to a plain positive
    // integer first; arbitrary strings must never reach the script.
    const maxRows = Math.floor(Number(limit));
    if (!Number.isFinite(maxRows) || maxRows <= 0)
        return [];
    // JSON.stringify yields a correctly escaped double-quoted literal.
    const tokenRows = queryTokens.map((t) => `[${JSON.stringify(t)}]`).join(", ");
    let result;
    try {
        result = await db.run(`
tokens[t] <- [${tokenRows}]
matched[ek, sum(w)] := tokens[t], *search_tokens[t, ek], *token_doc_frequency[t, _, w]
?[ek, score, name, kind, fp] := matched[ek, score],
*entities{key: ek, kind, name, file_path: fp}
:order -score
:limit ${maxRows}
`);
    }
    catch {
        // Best-effort: a failed query is reported as "no results".
        return [];
    }
    if (!result?.rows)
        return [];
    return result.rows.map((row) => {
        const [key, score, name, kind, file_path] = row;
        return { key, name, kind, file_path, score };
    });
}
|
|
@@ -1,67 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Docstring/Comment Extractor — extracts documentation annotations from entities.
|
|
3
|
-
*
|
|
4
|
-
* Reads JSDoc, Python docstrings, Go doc comments, etc. from the source
|
|
5
|
-
* and associates them with the closest entity declaration.
|
|
6
|
-
*/
|
|
7
|
-
/**
 * Extract the doc comment that sits directly above an entity.
 *
 * Walks upward from the line before `entityStartLine`, collecting contiguous
 * comment-looking lines (`*`, `//`, `#`). The walk stops at the line that
 * opens a block comment (anything starting with `/*`, including a bare
 * `/**`), at a lone triple-quote line, or at the first non-comment line.
 * Stopping at the opener keeps an adjacent earlier comment block (for example
 * a file header immediately followed by a function's JSDoc) out of the result.
 *
 * Comment markers are stripped and the text is capped at 500 characters.
 *
 * @param {string} source - Full source text of the file.
 * @param {number} entityStartLine - 1-based line number where the entity starts.
 * @returns {string|null} Cleaned comment text, or null when there is none.
 */
export function extractDocComment(source, entityStartLine) {
    const lines = source.split("\n");
    const lineIdx = entityStartLine - 1;
    // Line 1 has nothing above it; out-of-range lines have no entity at all.
    if (lineIdx <= 0 || lineIdx >= lines.length)
        return null;
    const commentLines = [];
    for (let i = lineIdx - 1; i >= 0; i--) {
        const line = lines[i]?.trim();
        if (line === undefined)
            break;
        if (line.startsWith("/*")) {
            // Opening line of a block comment: it belongs to the comment and ends it.
            commentLines.unshift(line);
            break;
        }
        if (line.startsWith("*") || line.startsWith("//") || line.startsWith("#")) {
            commentLines.unshift(line);
            continue;
        }
        if (line === '"""' || line === "'''") {
            commentLines.unshift(line);
        }
        break;
    }
    if (commentLines.length === 0)
        return null;
    const text = commentLines
        .join("\n")
        .replace(/^\/\*\*?\s*|\s*\*\/$/g, "") // block-comment opener / closer
        .replace(/^\s*\*\s?/gm, "") // leading " * " gutter
        .replace(/^\/\/\s?/gm, "") // line-comment marker
        .replace(/^#\s?/gm, "") // hash-comment marker
        .replace(/"""|'''/g, "") // triple-quote delimiters
        .trim();
    return text.length > 0 ? text.slice(0, 500) : null;
}
|
|
54
|
-
/**
 * Collect the distinct `@tag` names that appear in a docstring
 * (e.g. `param`, `returns`, `deprecated`), in order of first appearance.
 *
 * @param {string} docstring - Comment text to scan.
 * @returns {string[]} Unique tag names without the leading `@`.
 */
export function extractDocTags(docstring) {
    const finder = /@(\w+)/g;
    const unique = new Set();
    // The /g flag makes exec() resume after the previous match on each call.
    for (let hit = finder.exec(docstring); hit !== null; hit = finder.exec(docstring)) {
        unique.add(hit[1]);
    }
    return Array.from(unique);
}
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* CozoDB Embedding Store — stores Float32Array embeddings inline.
|
|
3
|
-
*
|
|
4
|
-
* Each entity gets a 128-dim embedding (512 bytes) stored as a
|
|
5
|
-
* base64-encoded string in CozoDB. Provides cosine similarity
|
|
6
|
-
* queries in <5ms.
|
|
7
|
-
*/
|
|
8
|
-
/**
 * Serialize an embedding to base64 text.
 *
 * Only the bytes the typed array actually covers are encoded, so a view into
 * a larger ArrayBuffer is handled correctly.
 *
 * @param {Float32Array} embedding - Vector to serialize.
 * @returns {string} Base64 encoding of the vector's raw bytes.
 */
export function encodeEmbedding(embedding) {
    const { buffer, byteOffset, byteLength } = embedding;
    const rawBytes = Buffer.from(buffer, byteOffset, byteLength);
    return rawBytes.toString("base64");
}
|
|
15
|
-
/**
 * Decode a base64 string back into a Float32Array.
 *
 * The decoded bytes are copied into a freshly allocated array. Building a
 * Float32Array view directly on the decoded Buffer would require its
 * `byteOffset` to be a multiple of 4 (the constructor throws a RangeError
 * otherwise) and would alias whatever memory backs the Buffer; the copy
 * avoids both.
 *
 * Trailing bytes that do not form a complete 4-byte float are ignored.
 *
 * @param {string} encoded - Base64 text of the vector's raw bytes.
 * @returns {Float32Array} Decoded vector backed by its own buffer.
 */
export function decodeEmbedding(encoded) {
    const bytes = Buffer.from(encoded, "base64");
    const floatCount = Math.floor(bytes.byteLength / Float32Array.BYTES_PER_ELEMENT);
    const decoded = new Float32Array(floatCount);
    // Byte-wise copy: no alignment requirement on the source.
    new Uint8Array(decoded.buffer).set(bytes.subarray(0, floatCount * Float32Array.BYTES_PER_ELEMENT));
    return decoded;
}
|
|
22
|
-
/**
 * Cosine similarity of two equally sized vectors.
 *
 * @param {ArrayLike<number>} a - First vector.
 * @param {ArrayLike<number>} b - Second vector.
 * @returns {number} dot(a, b) / (|a| * |b|); 0 when the lengths differ or
 *   when the product of the norms is not positive (e.g. a zero vector).
 */
export function cosineSimilarity(a, b) {
    const dims = a.length;
    if (dims !== b.length) {
        return 0;
    }
    let dot = 0;
    let squaredA = 0;
    let squaredB = 0;
    let idx = 0;
    while (idx < dims) {
        const x = a[idx];
        const y = b[idx];
        dot += x * y;
        squaredA += x * x;
        squaredB += y * y;
        idx += 1;
    }
    const denom = Math.sqrt(squaredA) * Math.sqrt(squaredB);
    if (denom > 0) {
        return dot / denom;
    }
    return 0;
}
|
|
39
|
-
/**
 * Rank stored embeddings by cosine similarity to a query vector.
 *
 * @param {ArrayLike<number>} query - Query embedding.
 * @param {Iterable<[*, ArrayLike<number>]>} allEmbeddings - (key, embedding)
 *   pairs, e.g. a Map.
 * @param {number} [topK=10] - Maximum number of results to return.
 * @param {*} [excludeKey] - Key to leave out (typically the query's own entity).
 * @returns {Array<{entityKey: *, similarity: number}>} Best matches, most
 *   similar first.
 */
export function findSimilar(query, allEmbeddings, topK = 10, excludeKey) {
    const ranked = Array.from(allEmbeddings)
        .filter(([key]) => key !== excludeKey)
        .map(([key, embedding]) => ({
            entityKey: key,
            similarity: cosineSimilarity(query, embedding),
        }));
    ranked.sort((left, right) => right.similarity - left.similarity);
    return ranked.slice(0, topK);
}
|
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Semantic Enrichment Orchestrator — runs after Phase 1 indexing.
|
|
3
|
-
*
|
|
4
|
-
* Pipeline: tokenize → TF-IDF → Node2Vec → git mine → combine → store.
|
|
5
|
-
* Runs in background, non-blocking to MCP proxy startup.
|
|
6
|
-
*/
|
|
7
|
-
import { createModuleLogger } from "../../utils/logger.js";
|
|
8
|
-
import { tokenizeIdentifier } from "./identifier-tokenizer.js";
|
|
9
|
-
import { trainEmbeddings } from "./node2vec-embeddings.js";
|
|
10
|
-
import { buildAdjacencyFromEdges, generateWalks } from "./node2vec-walks.js";
|
|
11
|
-
import { getPrimaryDomain } from "./path-domain-inference.js";
|
|
12
|
-
import { buildCorpus, computeTfIdfVector } from "./tfidf-vectors.js";
|
|
13
|
-
const log = createModuleLogger("semantic-enrichment");
|
|
14
|
-
/**
 * Run the semantic enrichment pipeline over a set of entities and edges.
 *
 * Builds a corpus from the entity names, generates walks over the adjacency
 * built from `edges` (seed 42, 5 walks per node, length 20), trains structural
 * embeddings from those walks, and then assembles one record per entity with
 * its lexical vector, structural vector, primary domain and keyword tokens.
 * Entities without a trained structural embedding get a zero-filled
 * 64-element Float32Array.
 *
 * @param {{ entities: Array<{key: *, name: string, file_path: string}>, edges: * }} input
 * @returns {Promise<{ embeddings: Map<*, object>, entityCount: number, durationMs: number }>}
 *   For empty input: an empty map, entityCount 0 and durationMs 0.
 */
export async function runEnrichment(input) {
    const startedAt = performance.now();
    const { entities, edges } = input;
    if (entities.length === 0) {
        return { embeddings: new Map(), entityCount: 0, durationMs: 0 };
    }
    log.info(`Starting semantic enrichment for ${entities.length} entities`);
    // Lexical side: corpus statistics over every entity name.
    const corpus = buildCorpus(entities.map((entity) => entity.name));
    // Structural side: walks over the edge graph, then embeddings trained on them.
    const walkOptions = { seed: 42, walksPerNode: 5, walkLength: 20 };
    const walks = generateWalks(buildAdjacencyFromEdges(edges), walkOptions);
    const structuralEmbeddings = trainEmbeddings(walks);
    const embeddings = new Map(entities.map((entity) => [
        entity.key,
        {
            lexical: computeTfIdfVector(entity.name, corpus),
            structural: structuralEmbeddings.get(entity.key) ?? new Float32Array(64),
            domain: getPrimaryDomain(entity.file_path),
            keywords: tokenizeIdentifier(entity.name),
        },
    ]));
    const durationMs = performance.now() - startedAt;
    log.info(`Semantic enrichment complete: ${entities.length} entities in ${Math.round(durationMs)}ms`);
    return { embeddings, entityCount: entities.length, durationMs };
}
|
|
@@ -1,114 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Git Commit Message Miner — maps commit messages to entities via blame.
|
|
3
|
-
*
|
|
4
|
-
* Extracts semantic context from git history:
|
|
5
|
-
* - Which commits touched which entities (via file + line intersection)
|
|
6
|
-
* - Commit message keywords → entity semantic labels
|
|
7
|
-
*/
|
|
8
|
-
import { gitQuery } from "../../utils/exec.js";
|
|
9
|
-
/**
 * Get recent commits that touched a specific file.
 *
 * Runs `git log` with the format `%H|%s|%ci` and parses each output line.
 * The hash is the text before the first `|` and the date is the text after
 * the last `|`; everything in between is the subject. Parsing from both ends
 * keeps a subject that itself contains `|` intact instead of truncating it
 * and leaking the rest of the subject into the date field.
 *
 * @param {string} filePath - Path passed to `git log` after `--`.
 * @param {string} cwd - Working directory forwarded to `gitQuery`.
 * @param {number} [maxCount=20] - Value used for `--max-count`.
 * @returns {Promise<Array<{hash: string, message: string, date: string}>>}
 *   One entry per output line with a non-empty hash; an empty array when
 *   `gitQuery` yields no output.
 */
export async function getFileCommits(filePath, cwd, maxCount = 20) {
    const output = await gitQuery(["log", `--max-count=${maxCount}`, "--format=%H|%s|%ci", "--", filePath], cwd);
    if (!output)
        return [];
    const commits = [];
    for (const line of output.split("\n")) {
        if (!line)
            continue;
        const firstSep = line.indexOf("|");
        if (firstSep === -1) {
            // No separator at all: treat the whole line as the hash.
            commits.push({ hash: line, message: "", date: "" });
            continue;
        }
        const hash = line.slice(0, firstSep);
        if (hash.length === 0)
            continue;
        const lastSep = line.lastIndexOf("|");
        // With a single separator there is no date field; the remainder is the subject.
        const hasDate = lastSep > firstSep;
        commits.push({
            hash,
            message: hasDate ? line.slice(firstSep + 1, lastSep) : line.slice(firstSep + 1),
            date: hasDate ? line.slice(lastSep + 1) : "",
        });
    }
    return commits;
}
|
|
25
|
-
/**
 * Derive up to 20 keywords from a list of commit messages.
 *
 * Each message is lower-cased, every character outside `a-z`, `0-9` and
 * whitespace is replaced by a space, and the result is split on whitespace.
 * Words of two characters or fewer and words in the stop list are dropped.
 * The remaining words are counted across all messages and returned by
 * descending count.
 *
 * @param {Iterable<string>} messages - Commit messages to scan.
 * @returns {string[]} At most 20 words, most frequent first.
 */
export function extractKeywords(messages) {
    const stopWords = new Set([
        "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
        "have", "has", "had", "do", "does", "did", "will", "would", "could", "should",
        "may", "might", "shall", "can", "need", "dare", "ought", "and", "or", "but",
        "not", "no", "nor", "for", "to", "in", "on", "at", "by", "from",
        "with", "of", "it", "its", "this", "that",
    ]);
    const counts = new Map();
    for (const message of messages) {
        const cleaned = message.toLowerCase().replace(/[^a-z0-9\s]/g, " ");
        for (const word of cleaned.split(/\s+/)) {
            if (word.length <= 2 || stopWords.has(word)) {
                continue;
            }
            counts.set(word, (counts.get(word) ?? 0) + 1);
        }
    }
    const ranked = Array.from(counts);
    ranked.sort(([, countA], [, countB]) => countB - countA);
    return ranked.slice(0, 20).map(([word]) => word);
}
|
|
93
|
-
/**
 * Collect git-derived context for each entity.
 *
 * Commit history is looked up per file through `getFileCommits` and kept in
 * a per-call cache, so entities that share a `file_path` trigger one lookup.
 * Lookups are awaited one at a time, in entity order.
 *
 * @param {Iterable<{key: *, file_path: string}>} entities - Entities to annotate.
 * @param {string} cwd - Working directory forwarded to `getFileCommits`.
 * @returns {Promise<Map>} Map from entity key to
 *   `{ entityKey, commitMessages, keywords }`, where `commitMessages` holds at
 *   most the first five messages and `keywords` comes from `extractKeywords`
 *   over all of the file's messages.
 */
export async function mineEntityContext(entities, cwd) {
    const commitsByFile = new Map();
    const result = new Map();
    for (const entity of entities) {
        const filePath = entity.file_path;
        let history = commitsByFile.get(filePath);
        if (!history) {
            history = await getFileCommits(filePath, cwd);
            commitsByFile.set(filePath, history);
        }
        const allMessages = history.map((commit) => commit.message);
        const keywords = extractKeywords(allMessages);
        result.set(entity.key, {
            entityKey: entity.key,
            commitMessages: allMessages.slice(0, 5),
            keywords,
        });
    }
    return result;
}
|
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Identifier Tokenizer — splits code identifiers into semantic tokens.
|
|
3
|
-
*
|
|
4
|
-
* Handles: camelCase, PascalCase, snake_case, kebab-case, SCREAMING_CASE,
|
|
5
|
-
* acronyms (e.g., HTMLParser → ["html", "parser"]), numbers.
|
|
6
|
-
*
|
|
7
|
-
* Performance: <1ms per entity (pure string splitting).
|
|
8
|
-
*/
|
|
9
|
-
/**
 * Split a code identifier into lower-case tokens.
 *
 * Separators (`-`, `_`, `.`, `/`, `\`) become spaces, a space is inserted at
 * each lower-case-or-digit → upper-case boundary, and a run of capitals is
 * separated from a following capitalised word (e.g. "HTMLParser" →
 * ["html", "parser"]). Tokens consisting of a single digit are dropped.
 *
 * @param {string} name - Identifier to split.
 * @returns {string[]} Lower-cased tokens in source order; empty for a falsy
 *   or zero-length input.
 */
export function tokenizeIdentifier(name) {
    const isBlank = !name || name.length === 0;
    if (isBlank) {
        return [];
    }
    const separated = name.replace(/[-_./\\]/g, " ");
    const camelSplit = separated.replace(/([a-z\d])([A-Z])/g, "$1 $2");
    const acronymSplit = camelSplit.replace(/([A-Z]+)([A-Z][a-z])/g, "$1 $2");
    return acronymSplit
        .split(/\s+/)
        .filter(Boolean)
        .map((piece) => piece.toLowerCase())
        .filter((token) => {
            const isLoneDigit = token.length <= 1 && /\d/.test(token);
            return !isLoneDigit && token.length > 0;
        });
}
|
|
27
|
-
/**
 * Tokenize a file path into semantic tokens.
 *
 * The path is split on `/` and `\`; every non-empty segment is passed through
 * `tokenizeIdentifier` and the tokens are concatenated in path order.
 *
 * The file extension is removed from the final segment only. A leading dot is
 * not treated as the start of an extension, so dot-names such as `.github`
 * or `.eslintrc` keep their tokens, and dotted directory names such as
 * `api.v2` are tokenized in full.
 *
 * @param {string} filePath - Path using `/` or `\` separators.
 * @returns {string[]} Tokens from all segments, in order.
 */
export function tokenizeFilePath(filePath) {
    const segments = filePath.split(/[/\\]/).filter(Boolean);
    const lastIndex = segments.length - 1;
    const tokens = [];
    segments.forEach((segment, index) => {
        let stem = segment;
        if (index === lastIndex) {
            const dot = segment.lastIndexOf(".");
            // Require text on both sides of the dot: ".env" and "name." are left untouched.
            if (dot > 0 && dot < segment.length - 1) {
                stem = segment.slice(0, dot);
            }
        }
        tokens.push(...tokenizeIdentifier(stem));
    });
    return tokens;
}
|
|
39
|
-
/**
 * Count how often each token occurs across a collection of identifiers.
 *
 * Every identifier is split with `tokenizeIdentifier`; each resulting token
 * adds one to its count.
 *
 * @param {Iterable<string>} identifiers - Identifiers to tokenize.
 * @returns {Map<string, number>} Token → occurrence count, in first-seen order.
 */
export function buildTokenFrequency(identifiers) {
    return Array.from(identifiers)
        .flatMap((identifier) => tokenizeIdentifier(identifier))
        .reduce(
            (counts, token) => counts.set(token, (counts.get(token) ?? 0) + 1),
            new Map(),
        );
}
|
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Node2Vec Embedding Trainer — Skip-gram on walk sequences.
|
|
3
|
-
*
|
|
4
|
-
* Converts random walks into 64-dimensional structural embeddings.
|
|
5
|
-
* Uses a simplified Skip-gram model: for each (center, context) pair
|
|
6
|
-
* in the walk window, accumulate co-occurrence → reduce to fixed dims.
|
|
7
|
-
*
|
|
8
|
-
* Performance: <2s for 1K entities (10 walks × 40 steps).
|
|
9
|
-
*/
|
|
10
|
-
// Length of each structural embedding vector allocated by trainEmbeddings.
const STRUCTURAL_DIM = 64;
|
|
11
|
-
/**
 * Turn random walks into fixed-size structural embeddings.
 *
 * Pass 1 counts, for every node, how often each other walk entry appears
 * within `windowSize` positions of it. Pass 2 folds each node's counts into a
 * Float32Array of STRUCTURAL_DIM slots: a neighbour's slot is its first-seen
 * index multiplied by 2654435761, modulo STRUCTURAL_DIM. Each vector is then
 * passed to `normalize`.
 *
 * @param {Array<Array<*>>} walks - Walks, each a sequence of entity keys.
 * @param {number} [windowSize=5] - How many positions on each side count as context.
 * @returns {Map<*, Float32Array>} Entity key → embedding, for every key that
 *   appears in at least one walk.
 */
export function trainEmbeddings(walks, windowSize = 5) {
    // Pass 1: windowed co-occurrence counts (center → context → count).
    const counts = new Map();
    for (const walk of walks) {
        const lastIndex = walk.length - 1;
        for (let pos = 0; pos <= lastIndex; pos++) {
            const center = walk[pos];
            let contextCounts = counts.get(center);
            if (contextCounts === undefined) {
                contextCounts = new Map();
                counts.set(center, contextCounts);
            }
            const from = Math.max(0, pos - windowSize);
            const to = Math.min(lastIndex, pos + windowSize);
            for (let other = from; other <= to; other++) {
                if (other !== pos) {
                    const context = walk[other];
                    contextCounts.set(context, (contextCounts.get(context) ?? 0) + 1);
                }
            }
        }
    }

    // Stable numbering of nodes in first-seen order; it drives the slot hash below.
    const indexByNode = new Map();
    for (const node of counts.keys()) {
        indexByNode.set(node, indexByNode.size);
    }

    // Pass 2: hash each neighbour into a slot and accumulate its weight.
    const embeddings = new Map();
    for (const [node, neighbors] of counts) {
        const vec = new Float32Array(STRUCTURAL_DIM);
        for (const [neighbor, weight] of neighbors) {
            const neighborIndex = indexByNode.get(neighbor) ?? 0;
            const slot = Math.abs((neighborIndex * 2654435761) % STRUCTURAL_DIM);
            vec[slot] += weight;
        }
        normalize(vec);
        embeddings.set(node, vec);
    }
    return embeddings;
}
|
|
50
|
-
/**
 * Concatenate a lexical and a structural vector into one 128-slot Float32Array.
 *
 * The lexical values are written starting at index 0 and the structural
 * values starting at index 64. Structural values are written second, so they
 * overwrite any lexical values that extend past index 63.
 *
 * @param {ArrayLike<number>} lexical - Lexical (TF-IDF) vector.
 * @param {ArrayLike<number>} structural - Structural (Node2Vec) vector.
 * @returns {Float32Array} New 128-element vector.
 */
export function combineEmbeddings(lexical, structural) {
    const structuralOffset = 64;
    const merged = new Float32Array(128);
    merged.set(lexical);
    merged.set(structural, structuralOffset);
    return merged;
}
|
|
59
|
-
/**
 * Scale a vector in place to unit Euclidean length.
 *
 * A vector whose length is not greater than zero (all zeros, or a NaN norm)
 * is left untouched.
 *
 * @param {Float32Array|number[]} vector - Vector to rescale; mutated.
 * @returns {void}
 */
function normalize(vector) {
    let sumOfSquares = 0;
    for (const component of vector) {
        sumOfSquares += component * component;
    }
    const magnitude = Math.sqrt(sumOfSquares);
    if (!(magnitude > 0)) {
        return;
    }
    for (let index = 0; index < vector.length; index++) {
        vector[index] /= magnitude;
    }
}
|
|
71
|
-
export { STRUCTURAL_DIM };
|
|
@@ -1,103 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Node2Vec Random Walk Generator — seeded walks on the entity graph.
|
|
3
|
-
*
|
|
4
|
-
* Produces random walk sequences that capture graph structure.
|
|
5
|
-
* Uses biased random walks (Node2Vec with p/q parameters) to balance
|
|
6
|
-
* between BFS-like (local) and DFS-like (exploration) behavior.
|
|
7
|
-
*
|
|
8
|
-
* Seeded for deterministic, reproducible walks.
|
|
9
|
-
*/
|
|
10
|
-
/**
 * Default walk settings; `generateWalks` spreads these under the caller's
 * overrides. Frozen so the shared defaults cannot be changed by accident.
 */
const DEFAULT_CONFIG = Object.freeze({
    // Maximum number of nodes in a single walk, start node included.
    walkLength: 40,
    // Number of walks started from every node.
    walksPerNode: 10,
    // Return parameter: stepping back to the previous node has weight 1 / p.
    p: 1.0,
    // In-out parameter: a neighbour not adjacent to the previous node has weight 1 / q.
    q: 1.0,
    // Seed handed to the PRNG factory.
    seed: 42,
});
|
|
17
|
-
/**
 * Seeded PRNG (xorshift32). Deterministic given the same seed.
 *
 * The middle step uses the unsigned shift `>>>`. With the signed `>>` the
 * sign bit is smeared across the top bits, the state update stops being
 * invertible, and a state of -1 after the first xor collapses to 0 and stays
 * there (every later draw returns 0).
 *
 * The 32-bit state is divided by 2^32, so results lie in [0, 1) and
 * `Math.floor(rng() * n)` is always a valid index below `n`.
 *
 * @param {number} seed - Coerced to a 32-bit integer; a seed of 0 is replaced
 *   by 1 because xorshift cannot leave the all-zero state.
 * @returns {() => number} Generator returning the next value in [0, 1).
 */
function createRng(seed) {
    let state = seed | 0;
    if (state === 0)
        state = 1;
    return () => {
        state ^= state << 13;
        state ^= state >>> 17;
        state ^= state << 5;
        return (state >>> 0) / 0x100000000;
    };
}
|
|
31
|
-
/**
 * Generate random walks from a graph.
 *
 * For each of `walksPerNode` rounds, one walk is started from every node in
 * `adjacency` (in key order). A walk holds at most `walkLength` nodes and
 * stops early at a node with no neighbours. The first step picks a neighbour
 * uniformly; later steps pick by weight:
 *   - the previous node:                      1 / p
 *   - a neighbour adjacent to the previous:   1
 *   - any other neighbour:                    1 / q
 *
 * The "adjacent to the previous node" check uses a Set that is built once per
 * node and cached for the whole call, instead of rescanning the previous
 * node's neighbour array for every candidate. The walks produced are the same.
 *
 * @param {Map<*, Array<*>>} adjacency - Node → array of neighbour nodes.
 * @param {{walkLength?: number, walksPerNode?: number, p?: number, q?: number, seed?: number}} [config]
 *   Overrides merged over DEFAULT_CONFIG.
 * @returns {Array<Array<*>>} Array of walks, where each walk is a sequence of entity keys.
 */
export function generateWalks(adjacency, config = {}) {
    const cfg = { ...DEFAULT_CONFIG, ...config };
    const rng = createRng(cfg.seed);
    const nodes = [...adjacency.keys()];
    const returnWeight = 1 / cfg.p;
    const exploreWeight = 1 / cfg.q;

    // Lazily built membership sets, keyed by node.
    const neighborSets = new Map();
    const neighborSetOf = (node) => {
        let members = neighborSets.get(node);
        if (members === undefined) {
            members = new Set(adjacency.get(node) ?? []);
            neighborSets.set(node, members);
        }
        return members;
    };

    const walks = [];
    for (let round = 0; round < cfg.walksPerNode; round++) {
        for (const startNode of nodes) {
            const walk = [startNode];
            let current = startNode;
            let prev = null;
            for (let step = 1; step < cfg.walkLength; step++) {
                const neighbors = adjacency.get(current) ?? [];
                if (neighbors.length === 0)
                    break;
                let chosen;
                if (prev === null) {
                    // First step: no history yet, so choose uniformly.
                    chosen = neighbors[Math.floor(rng() * neighbors.length)];
                }
                else {
                    const nearPrev = neighborSetOf(prev);
                    const weights = neighbors.map((neighbor) => {
                        if (neighbor === prev)
                            return returnWeight;
                        return nearPrev.has(neighbor) ? 1 : exploreWeight;
                    });
                    const totalWeight = weights.reduce((a, b) => a + b, 0);
                    // Roulette-wheel selection; falls back to the first neighbour
                    // if the running remainder never reaches zero.
                    let r = rng() * totalWeight;
                    chosen = neighbors[0];
                    for (let i = 0; i < weights.length; i++) {
                        r -= weights[i];
                        if (r <= 0) {
                            chosen = neighbors[i];
                            break;
                        }
                    }
                }
                walk.push(chosen);
                prev = current;
                current = chosen;
            }
            walks.push(walk);
        }
    }
    return walks;
}
|
|
89
|
-
/**
 * Build an undirected adjacency list from edge records.
 *
 * Every edge adds `to_key` to the list of `from_key` and `from_key` to the
 * list of `to_key`. Repeated edges add repeated entries, and a self-loop adds
 * the node to its own list twice.
 *
 * @param {Iterable<{from_key: *, to_key: *}>} edges - Edge records.
 * @returns {Map<*, Array<*>>} Node key → neighbour keys, with nodes in first-seen order.
 */
export function buildAdjacencyFromEdges(edges) {
    const adjacency = new Map();
    // Fetch a node's neighbour list, creating it on first use.
    const neighborsOf = (key) => {
        let list = adjacency.get(key);
        if (list === undefined) {
            list = [];
            adjacency.set(key, list);
        }
        return list;
    };
    for (const edge of edges) {
        const source = edge.from_key;
        const target = edge.to_key;
        const sourceList = neighborsOf(source);
        const targetList = neighborsOf(target);
        sourceList.push(target);
        targetList.push(source);
    }
    return adjacency;
}
|