cap-pro 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/README.md +26 -0
- package/.claude-plugin/marketplace.json +24 -0
- package/.claude-plugin/plugin.json +24 -0
- package/LICENSE +21 -0
- package/README.ja-JP.md +834 -0
- package/README.ko-KR.md +823 -0
- package/README.md +806 -0
- package/README.pt-BR.md +452 -0
- package/README.zh-CN.md +800 -0
- package/agents/cap-architect.md +269 -0
- package/agents/cap-brainstormer.md +207 -0
- package/agents/cap-curator.md +276 -0
- package/agents/cap-debugger.md +365 -0
- package/agents/cap-designer.md +246 -0
- package/agents/cap-historian.md +464 -0
- package/agents/cap-migrator.md +291 -0
- package/agents/cap-prototyper.md +197 -0
- package/agents/cap-validator.md +308 -0
- package/bin/install.js +5433 -0
- package/cap/bin/cap-tools.cjs +853 -0
- package/cap/bin/lib/arc-scanner.cjs +344 -0
- package/cap/bin/lib/cap-affinity-engine.cjs +862 -0
- package/cap/bin/lib/cap-anchor.cjs +228 -0
- package/cap/bin/lib/cap-annotation-writer.cjs +340 -0
- package/cap/bin/lib/cap-checkpoint.cjs +434 -0
- package/cap/bin/lib/cap-cluster-detect.cjs +945 -0
- package/cap/bin/lib/cap-cluster-display.cjs +52 -0
- package/cap/bin/lib/cap-cluster-format.cjs +245 -0
- package/cap/bin/lib/cap-cluster-helpers.cjs +295 -0
- package/cap/bin/lib/cap-cluster-io.cjs +212 -0
- package/cap/bin/lib/cap-completeness.cjs +540 -0
- package/cap/bin/lib/cap-deps.cjs +583 -0
- package/cap/bin/lib/cap-design-families.cjs +332 -0
- package/cap/bin/lib/cap-design.cjs +966 -0
- package/cap/bin/lib/cap-divergence-detector.cjs +400 -0
- package/cap/bin/lib/cap-doctor.cjs +752 -0
- package/cap/bin/lib/cap-feature-map-internals.cjs +19 -0
- package/cap/bin/lib/cap-feature-map-migrate.cjs +335 -0
- package/cap/bin/lib/cap-feature-map-monorepo.cjs +885 -0
- package/cap/bin/lib/cap-feature-map-shard.cjs +315 -0
- package/cap/bin/lib/cap-feature-map.cjs +1943 -0
- package/cap/bin/lib/cap-fitness-score.cjs +1075 -0
- package/cap/bin/lib/cap-impact-analysis.cjs +652 -0
- package/cap/bin/lib/cap-learn-review.cjs +1072 -0
- package/cap/bin/lib/cap-learning-signals.cjs +627 -0
- package/cap/bin/lib/cap-loader.cjs +227 -0
- package/cap/bin/lib/cap-logger.cjs +57 -0
- package/cap/bin/lib/cap-memory-bridge.cjs +764 -0
- package/cap/bin/lib/cap-memory-confidence.cjs +452 -0
- package/cap/bin/lib/cap-memory-dir.cjs +987 -0
- package/cap/bin/lib/cap-memory-engine.cjs +698 -0
- package/cap/bin/lib/cap-memory-extends.cjs +398 -0
- package/cap/bin/lib/cap-memory-graph.cjs +790 -0
- package/cap/bin/lib/cap-memory-migrate.cjs +2015 -0
- package/cap/bin/lib/cap-memory-pin.cjs +183 -0
- package/cap/bin/lib/cap-memory-platform.cjs +490 -0
- package/cap/bin/lib/cap-memory-prune.cjs +707 -0
- package/cap/bin/lib/cap-memory-schema.cjs +812 -0
- package/cap/bin/lib/cap-migrate-tags.cjs +309 -0
- package/cap/bin/lib/cap-migrate.cjs +540 -0
- package/cap/bin/lib/cap-pattern-apply.cjs +1203 -0
- package/cap/bin/lib/cap-pattern-pipeline.cjs +1034 -0
- package/cap/bin/lib/cap-plugin-manifest.cjs +80 -0
- package/cap/bin/lib/cap-realtime-affinity.cjs +399 -0
- package/cap/bin/lib/cap-reconcile.cjs +570 -0
- package/cap/bin/lib/cap-research-gate.cjs +218 -0
- package/cap/bin/lib/cap-scope-filter.cjs +402 -0
- package/cap/bin/lib/cap-semantic-pipeline.cjs +1038 -0
- package/cap/bin/lib/cap-session-extract.cjs +987 -0
- package/cap/bin/lib/cap-session.cjs +445 -0
- package/cap/bin/lib/cap-snapshot-linkage.cjs +963 -0
- package/cap/bin/lib/cap-stack-docs.cjs +646 -0
- package/cap/bin/lib/cap-tag-observer.cjs +371 -0
- package/cap/bin/lib/cap-tag-scanner.cjs +1766 -0
- package/cap/bin/lib/cap-telemetry.cjs +466 -0
- package/cap/bin/lib/cap-test-audit.cjs +1438 -0
- package/cap/bin/lib/cap-thread-migrator.cjs +307 -0
- package/cap/bin/lib/cap-thread-synthesis.cjs +545 -0
- package/cap/bin/lib/cap-thread-tracker.cjs +519 -0
- package/cap/bin/lib/cap-trace.cjs +399 -0
- package/cap/bin/lib/cap-trust-mode.cjs +336 -0
- package/cap/bin/lib/cap-ui-design-editor.cjs +642 -0
- package/cap/bin/lib/cap-ui-mind-map.cjs +712 -0
- package/cap/bin/lib/cap-ui-thread-nav.cjs +693 -0
- package/cap/bin/lib/cap-ui.cjs +1245 -0
- package/cap/bin/lib/cap-upgrade.cjs +1028 -0
- package/cap/bin/lib/cli/arg-helpers.cjs +49 -0
- package/cap/bin/lib/cli/frontmatter-router.cjs +31 -0
- package/cap/bin/lib/cli/init-router.cjs +68 -0
- package/cap/bin/lib/cli/phase-router.cjs +102 -0
- package/cap/bin/lib/cli/state-router.cjs +61 -0
- package/cap/bin/lib/cli/template-router.cjs +37 -0
- package/cap/bin/lib/cli/uat-router.cjs +29 -0
- package/cap/bin/lib/cli/validation-router.cjs +26 -0
- package/cap/bin/lib/cli/verification-router.cjs +31 -0
- package/cap/bin/lib/cli/workstream-router.cjs +39 -0
- package/cap/bin/lib/commands.cjs +961 -0
- package/cap/bin/lib/config.cjs +467 -0
- package/cap/bin/lib/convention-reader.cjs +258 -0
- package/cap/bin/lib/core.cjs +1241 -0
- package/cap/bin/lib/feature-aggregator.cjs +423 -0
- package/cap/bin/lib/frontmatter.cjs +337 -0
- package/cap/bin/lib/init.cjs +1443 -0
- package/cap/bin/lib/manifest-generator.cjs +383 -0
- package/cap/bin/lib/milestone.cjs +253 -0
- package/cap/bin/lib/model-profiles.cjs +69 -0
- package/cap/bin/lib/monorepo-context.cjs +226 -0
- package/cap/bin/lib/monorepo-migrator.cjs +509 -0
- package/cap/bin/lib/phase.cjs +889 -0
- package/cap/bin/lib/profile-output.cjs +989 -0
- package/cap/bin/lib/profile-pipeline.cjs +540 -0
- package/cap/bin/lib/roadmap.cjs +330 -0
- package/cap/bin/lib/security.cjs +394 -0
- package/cap/bin/lib/session-manager.cjs +292 -0
- package/cap/bin/lib/skeleton-generator.cjs +179 -0
- package/cap/bin/lib/state.cjs +1032 -0
- package/cap/bin/lib/template.cjs +231 -0
- package/cap/bin/lib/test-detector.cjs +62 -0
- package/cap/bin/lib/uat.cjs +283 -0
- package/cap/bin/lib/verify.cjs +889 -0
- package/cap/bin/lib/workspace-detector.cjs +371 -0
- package/cap/bin/lib/workstream.cjs +492 -0
- package/cap/commands/gsd/workstreams.md +63 -0
- package/cap/references/arc-standard.md +315 -0
- package/cap/references/cap-agent-architecture.md +101 -0
- package/cap/references/cap-gitignore-template +9 -0
- package/cap/references/cap-zero-deps.md +158 -0
- package/cap/references/checkpoints.md +778 -0
- package/cap/references/continuation-format.md +249 -0
- package/cap/references/contract-test-templates.md +312 -0
- package/cap/references/feature-map-template.md +25 -0
- package/cap/references/git-integration.md +295 -0
- package/cap/references/git-planning-commit.md +38 -0
- package/cap/references/model-profiles.md +174 -0
- package/cap/references/phase-numbering.md +126 -0
- package/cap/references/planning-config.md +202 -0
- package/cap/references/property-test-templates.md +316 -0
- package/cap/references/security-test-templates.md +347 -0
- package/cap/references/session-template.json +8 -0
- package/cap/references/tdd.md +263 -0
- package/cap/references/user-profiling.md +681 -0
- package/cap/references/verification-patterns.md +612 -0
- package/cap/templates/UAT.md +265 -0
- package/cap/templates/claude-md.md +175 -0
- package/cap/templates/codebase/architecture.md +255 -0
- package/cap/templates/codebase/concerns.md +310 -0
- package/cap/templates/codebase/conventions.md +307 -0
- package/cap/templates/codebase/integrations.md +280 -0
- package/cap/templates/codebase/stack.md +186 -0
- package/cap/templates/codebase/structure.md +285 -0
- package/cap/templates/codebase/testing.md +480 -0
- package/cap/templates/config.json +44 -0
- package/cap/templates/context.md +352 -0
- package/cap/templates/continue-here.md +78 -0
- package/cap/templates/copilot-instructions.md +7 -0
- package/cap/templates/debug-subagent-prompt.md +91 -0
- package/cap/templates/discussion-log.md +63 -0
- package/cap/templates/milestone-archive.md +123 -0
- package/cap/templates/milestone.md +115 -0
- package/cap/templates/phase-prompt.md +610 -0
- package/cap/templates/planner-subagent-prompt.md +117 -0
- package/cap/templates/project.md +186 -0
- package/cap/templates/requirements.md +231 -0
- package/cap/templates/research-project/ARCHITECTURE.md +204 -0
- package/cap/templates/research-project/FEATURES.md +147 -0
- package/cap/templates/research-project/PITFALLS.md +200 -0
- package/cap/templates/research-project/STACK.md +120 -0
- package/cap/templates/research-project/SUMMARY.md +170 -0
- package/cap/templates/research.md +552 -0
- package/cap/templates/roadmap.md +202 -0
- package/cap/templates/state.md +176 -0
- package/cap/templates/summary.md +364 -0
- package/cap/templates/user-preferences.md +498 -0
- package/cap/templates/verification-report.md +322 -0
- package/cap/workflows/add-phase.md +112 -0
- package/cap/workflows/add-tests.md +351 -0
- package/cap/workflows/add-todo.md +158 -0
- package/cap/workflows/audit-milestone.md +340 -0
- package/cap/workflows/audit-uat.md +109 -0
- package/cap/workflows/autonomous.md +891 -0
- package/cap/workflows/check-todos.md +177 -0
- package/cap/workflows/cleanup.md +152 -0
- package/cap/workflows/complete-milestone.md +767 -0
- package/cap/workflows/diagnose-issues.md +231 -0
- package/cap/workflows/discovery-phase.md +289 -0
- package/cap/workflows/discuss-phase-assumptions.md +653 -0
- package/cap/workflows/discuss-phase.md +1049 -0
- package/cap/workflows/do.md +104 -0
- package/cap/workflows/execute-phase.md +846 -0
- package/cap/workflows/execute-plan.md +514 -0
- package/cap/workflows/fast.md +105 -0
- package/cap/workflows/forensics.md +265 -0
- package/cap/workflows/health.md +181 -0
- package/cap/workflows/help.md +660 -0
- package/cap/workflows/insert-phase.md +130 -0
- package/cap/workflows/list-phase-assumptions.md +178 -0
- package/cap/workflows/list-workspaces.md +56 -0
- package/cap/workflows/manager.md +362 -0
- package/cap/workflows/map-codebase.md +377 -0
- package/cap/workflows/milestone-summary.md +223 -0
- package/cap/workflows/new-milestone.md +486 -0
- package/cap/workflows/new-project.md +1250 -0
- package/cap/workflows/new-workspace.md +237 -0
- package/cap/workflows/next.md +97 -0
- package/cap/workflows/node-repair.md +92 -0
- package/cap/workflows/note.md +156 -0
- package/cap/workflows/pause-work.md +176 -0
- package/cap/workflows/plan-milestone-gaps.md +273 -0
- package/cap/workflows/plan-phase.md +857 -0
- package/cap/workflows/plant-seed.md +169 -0
- package/cap/workflows/pr-branch.md +129 -0
- package/cap/workflows/profile-user.md +449 -0
- package/cap/workflows/progress.md +507 -0
- package/cap/workflows/quick.md +757 -0
- package/cap/workflows/remove-phase.md +155 -0
- package/cap/workflows/remove-workspace.md +90 -0
- package/cap/workflows/research-phase.md +82 -0
- package/cap/workflows/resume-project.md +326 -0
- package/cap/workflows/review.md +228 -0
- package/cap/workflows/session-report.md +146 -0
- package/cap/workflows/settings.md +283 -0
- package/cap/workflows/ship.md +228 -0
- package/cap/workflows/stats.md +60 -0
- package/cap/workflows/transition.md +671 -0
- package/cap/workflows/ui-phase.md +298 -0
- package/cap/workflows/ui-review.md +161 -0
- package/cap/workflows/update.md +323 -0
- package/cap/workflows/validate-phase.md +170 -0
- package/cap/workflows/verify-phase.md +254 -0
- package/cap/workflows/verify-work.md +637 -0
- package/commands/cap/annotate.md +165 -0
- package/commands/cap/brainstorm.md +393 -0
- package/commands/cap/checkpoint.md +106 -0
- package/commands/cap/completeness.md +94 -0
- package/commands/cap/continue.md +72 -0
- package/commands/cap/debug.md +588 -0
- package/commands/cap/deps.md +169 -0
- package/commands/cap/design.md +479 -0
- package/commands/cap/init.md +354 -0
- package/commands/cap/iterate.md +249 -0
- package/commands/cap/learn.md +459 -0
- package/commands/cap/memory.md +275 -0
- package/commands/cap/migrate-feature-map.md +91 -0
- package/commands/cap/migrate-memory.md +108 -0
- package/commands/cap/migrate-tags.md +91 -0
- package/commands/cap/migrate.md +131 -0
- package/commands/cap/prototype.md +510 -0
- package/commands/cap/reconcile.md +121 -0
- package/commands/cap/review.md +360 -0
- package/commands/cap/save.md +72 -0
- package/commands/cap/scan.md +404 -0
- package/commands/cap/start.md +356 -0
- package/commands/cap/status.md +118 -0
- package/commands/cap/test-audit.md +262 -0
- package/commands/cap/test.md +394 -0
- package/commands/cap/trace.md +133 -0
- package/commands/cap/ui.md +167 -0
- package/hooks/dist/cap-check-update.js +115 -0
- package/hooks/dist/cap-context-monitor.js +185 -0
- package/hooks/dist/cap-learn-review-hook.js +114 -0
- package/hooks/dist/cap-learning-hook.js +192 -0
- package/hooks/dist/cap-memory.js +299 -0
- package/hooks/dist/cap-prompt-guard.js +97 -0
- package/hooks/dist/cap-statusline.js +157 -0
- package/hooks/dist/cap-tag-observer.js +115 -0
- package/hooks/dist/cap-version-check.js +112 -0
- package/hooks/dist/cap-workflow-guard.js +175 -0
- package/hooks/hooks.json +55 -0
- package/package.json +58 -0
- package/scripts/base64-scan.sh +262 -0
- package/scripts/build-hooks.js +93 -0
- package/scripts/cap-removal-checklist.md +202 -0
- package/scripts/prompt-injection-scan.sh +199 -0
- package/scripts/run-tests.cjs +181 -0
- package/scripts/secret-scan.sh +227 -0
|
@@ -0,0 +1,1038 @@
|
|
|
1
|
+
// @cap-feature(feature:F-037) Semantic Analysis Pipeline — 3-stage pipeline for computing thread similarity via TF-IDF, concept taxonomy, and graph propagation
|
|
2
|
+
// @cap-decision Pure logic module with zero I/O and zero dependencies. All functions accept data as input and return numeric scores.
|
|
3
|
+
// @cap-decision Three-stage architecture: Stage 1 (text signals) provides lexical similarity, Stage 2 (concept signals) provides semantic similarity via taxonomy, Stage 3 (graph propagation) discovers transitive connections.
|
|
4
|
+
// @cap-decision Weights within Stage 1 are TF-IDF=0.5, N-gram=0.2, Jaccard=0.1; Stage 2 concept vector=0.2. These sum to 1.0 and represent the full pipeline blend.
|
|
5
|
+
|
|
6
|
+
'use strict';
|
|
7
|
+
|
|
8
|
+
// --- Types ---
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* @typedef {Object} Thread
|
|
12
|
+
* @property {string} id - Thread ID (thr-XXXX)
|
|
13
|
+
* @property {string} problemStatement - Problem being explored
|
|
14
|
+
* @property {string} solutionShape - Solution direction
|
|
15
|
+
* @property {string[]} boundaryDecisions - Key decisions
|
|
16
|
+
* @property {string[]} featureIds - Associated feature IDs
|
|
17
|
+
* @property {string[]} keywords - Problem-space keywords
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* @typedef {Object} MemoryGraph
|
|
22
|
+
* @property {Object<string, GraphNode>} nodes
|
|
23
|
+
* @property {GraphEdge[]} edges
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
/**
|
|
27
|
+
* @typedef {Object} GraphNode
|
|
28
|
+
* @property {string} type
|
|
29
|
+
* @property {string} id
|
|
30
|
+
* @property {string} label
|
|
31
|
+
* @property {boolean} active
|
|
32
|
+
* @property {Object} metadata
|
|
33
|
+
*/
|
|
34
|
+
|
|
35
|
+
/**
|
|
36
|
+
* @typedef {Object} GraphEdge
|
|
37
|
+
* @property {string} source
|
|
38
|
+
* @property {string} target
|
|
39
|
+
* @property {string} type
|
|
40
|
+
* @property {boolean} active
|
|
41
|
+
* @property {Object} metadata
|
|
42
|
+
*/
|
|
43
|
+
|
|
44
|
+
/**
|
|
45
|
+
* @typedef {Object} Corpus
|
|
46
|
+
* @property {Map<string, number>} docFrequency - term -> number of docs containing it
|
|
47
|
+
* @property {number} docCount - total documents in corpus
|
|
48
|
+
*/
|
|
49
|
+
|
|
50
|
+
/**
|
|
51
|
+
* @typedef {Object<string, number>} SparseVector
|
|
52
|
+
* Map of term -> TF-IDF weight
|
|
53
|
+
*/
|
|
54
|
+
|
|
55
|
+
/**
|
|
56
|
+
* @typedef {Object} CooccurrenceEntry
|
|
57
|
+
* @property {number} count - Times this concept pair co-occurred
|
|
58
|
+
* @property {string[]} threads - Thread IDs where co-occurrence was observed
|
|
59
|
+
*/
|
|
60
|
+
|
|
61
|
+
/**
|
|
62
|
+
* @typedef {Object<string, CooccurrenceEntry>} CooccurrenceMatrix
|
|
63
|
+
* Key format: "conceptA|conceptB" (alphabetically ordered)
|
|
64
|
+
*/
|
|
65
|
+
|
|
66
|
+
/**
|
|
67
|
+
* @typedef {Object} Stage1Result
|
|
68
|
+
* @property {number} tfidf - TF-IDF cosine similarity (weight 0.5)
|
|
69
|
+
* @property {number} ngram - Trigram overlap (weight 0.2)
|
|
70
|
+
* @property {number} jaccard - Keyword Jaccard (weight 0.1)
|
|
71
|
+
* @property {number} combined - Weighted combination
|
|
72
|
+
*/
|
|
73
|
+
|
|
74
|
+
/**
|
|
75
|
+
* @typedef {Object} Stage2Result
|
|
76
|
+
* @property {number} conceptSim - Concept vector cosine similarity
|
|
77
|
+
* @property {number} combined - Weighted combination (weight 0.2)
|
|
78
|
+
*/
|
|
79
|
+
|
|
80
|
+
/**
|
|
81
|
+
* @typedef {Object} PipelineResult
|
|
82
|
+
* @property {Stage1Result} stage1 - Text signal scores
|
|
83
|
+
* @property {Stage2Result} stage2 - Concept signal scores
|
|
84
|
+
* @property {Object<string, number>} stage3 - Propagated scores keyed by thread-pair ID
|
|
85
|
+
* @property {number} finalScore - Full pipeline score (0.0-1.0)
|
|
86
|
+
*/
|
|
87
|
+
|
|
88
|
+
/**
|
|
89
|
+
* @typedef {Object} PipelineContext
|
|
90
|
+
* @property {Thread[]} allThreads - All threads for corpus building
|
|
91
|
+
* @property {MemoryGraph} [graph] - Memory graph for Stage 3
|
|
92
|
+
* @property {Object<string, string[]>} [taxonomy] - Optional taxonomy override
|
|
93
|
+
* @property {Object} [propagationOptions] - { iterations: number, damping: number }
|
|
94
|
+
*/
|
|
95
|
+
|
|
96
|
+
// --- Stop Words ---
|
|
97
|
+
|
|
98
|
+
/**
 * English stop words excluded during tokenization so that similarity scores
 * are driven by content-bearing terms rather than function words.
 * Shared by the TF-IDF and keyword stages via tokenize().
 * @type {Set<string>}
 */
const STOP_WORDS = new Set([
  'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
  'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
  'should', 'may', 'might', 'shall', 'can', 'need', 'must', 'ought',
  'and', 'but', 'or', 'nor', 'not', 'so', 'yet', 'both', 'either',
  'neither', 'each', 'every', 'all', 'any', 'few', 'more', 'most',
  'other', 'some', 'such', 'no', 'only', 'own', 'same', 'than',
  'too', 'very', 'just', 'because', 'as', 'until', 'while', 'of',
  'at', 'by', 'for', 'with', 'about', 'against', 'between', 'through',
  'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up',
  'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
  'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
  'how', 'what', 'which', 'who', 'whom', 'this', 'that', 'these',
  'those', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'you',
  'your', 'yours', 'he', 'him', 'his', 'she', 'her', 'hers', 'it',
  'its', 'they', 'them', 'their', 'theirs', 'also', 'into', 'if',
]);
|
|
116
|
+
|
|
117
|
+
// --- Stage 1 Pipeline Weights ---
|
|
118
|
+
// @cap-decision Stage 1 weights: TF-IDF dominates at 0.5 because term frequency is the strongest lexical signal. N-gram at 0.2 handles typos/morphology. Jaccard at 0.1 is a simple fallback. Remaining 0.2 goes to Stage 2 concept similarity.
|
|
119
|
+
|
|
120
|
+
// Weight of TF-IDF cosine similarity — the dominant lexical signal.
const STAGE1_WEIGHT_TFIDF = 0.5;
// Weight of character-trigram overlap — resilience to typos/morphology.
const STAGE1_WEIGHT_NGRAM = 0.2;
// Weight of keyword Jaccard similarity — simple fallback signal.
const STAGE1_WEIGHT_JACCARD = 0.1;
// Weight of Stage 2 concept-vector similarity; all four weights sum to 1.0
// so the blended pipeline score stays in the 0.0-1.0 range.
const STAGE2_WEIGHT_CONCEPT = 0.2;
|
|
124
|
+
|
|
125
|
+
// ============================================================================
|
|
126
|
+
// Stage 1: Text Signals
|
|
127
|
+
// ============================================================================
|
|
128
|
+
|
|
129
|
+
// --- Tokenization ---
|
|
130
|
+
|
|
131
|
+
/**
 * Split raw text into lowercase content terms.
 * Strips characters outside [a-z0-9 -], drops stop words and any token
 * shorter than three characters. Duplicates are kept so callers can count
 * term frequency.
 * @param {string} text - Raw text input
 * @returns {string[]} Array of tokens (may contain duplicates for TF counting)
 */
// @cap-todo(ac:F-037/AC-1) Tokenizer shared by TF-IDF and Jaccard stages
function tokenize(text) {
  if (typeof text !== 'string' || text === '') return [];

  const cleaned = text.toLowerCase().replace(/[^a-z0-9\s-]/g, ' ');
  const tokens = [];
  for (const word of cleaned.split(/\s+/)) {
    // Tokens under 3 chars carry little signal; stop words carry none.
    if (word.length >= 3 && !STOP_WORDS.has(word)) {
      tokens.push(word);
    }
  }
  return tokens;
}
|
|
145
|
+
|
|
146
|
+
/**
 * Build the full searchable text for a thread.
 * Joins problemStatement, solutionShape, and the boundaryDecisions entries
 * (space-separated) into a single string; missing fields are skipped.
 * @param {Thread} thread
 * @returns {string}
 */
function getThreadText(thread) {
  const { problemStatement, solutionShape, boundaryDecisions } = thread;
  const segments = [];

  if (problemStatement) segments.push(problemStatement);
  if (solutionShape) segments.push(solutionShape);
  if (Array.isArray(boundaryDecisions)) {
    segments.push(boundaryDecisions.join(' '));
  }

  return segments.join(' ');
}
|
|
161
|
+
|
|
162
|
+
// --- TF-IDF (AC-1) ---
|
|
163
|
+
|
|
164
|
+
/**
 * Build a corpus over all threads for IDF computation.
 * Each thread counts as one document; a term's document frequency is the
 * number of threads whose tokenized text contains it at least once.
 * @param {Thread[]} threads - All threads in the system
 * @returns {Corpus} Corpus with document frequency map and document count
 */
// @cap-todo(ac:F-037/AC-1) Build corpus from all thread texts for IDF calculation
function buildCorpus(threads) {
  /** @type {Map<string, number>} */
  const docFrequency = new Map();
  let docCount = 0;

  for (const thread of threads) {
    docCount += 1;
    // A Set deduplicates tokens so each document contributes at most 1 to DF.
    const distinctTerms = new Set(tokenize(getThreadText(thread)));
    for (const term of distinctTerms) {
      docFrequency.set(term, (docFrequency.get(term) || 0) + 1);
    }
  }

  return { docFrequency, docCount };
}
|
|
188
|
+
|
|
189
|
+
/**
 * Vectorize a text as sparse TF-IDF weights against a pre-built corpus.
 * TF(term) = occurrences / total tokens in this text.
 * IDF(term) = log(docCount / (1 + documents containing term)).
 * Terms whose weight is <= 0 (e.g. terms appearing in nearly every document)
 * are omitted from the vector.
 * @param {string} text - Text to vectorize
 * @param {Corpus} corpus - Pre-built corpus
 * @returns {Map<string, number>} Sparse TF-IDF vector
 */
// @cap-todo(ac:F-037/AC-1) TF-IDF vector computation: TF * IDF with +1 smoothing on IDF denominator
function computeTfIdfVector(text, corpus) {
  /** @type {Map<string, number>} */
  const vector = new Map();

  const terms = tokenize(text);
  if (terms.length === 0) return vector;

  /** @type {Map<string, number>} */
  const frequencies = new Map();
  for (const term of terms) {
    frequencies.set(term, (frequencies.get(term) || 0) + 1);
  }

  const docTotal = terms.length;
  for (const [term, occurrences] of frequencies) {
    const tf = occurrences / docTotal;
    // +1 smoothing keeps the denominator positive for unseen terms (df = 0).
    const df = corpus.docFrequency.get(term) || 0;
    const weight = tf * Math.log(corpus.docCount / (1 + df));
    if (weight > 0) {
      vector.set(term, weight);
    }
  }

  return vector;
}
|
|
228
|
+
|
|
229
|
+
/**
 * Cosine similarity between two sparse vectors:
 * cosine = dot(A, B) / (|A| * |B|), clamped to [0, 1].
 * Returns 0 when either vector is empty or has zero magnitude.
 * @param {Map<string, number>} vecA - First sparse vector
 * @param {Map<string, number>} vecB - Second sparse vector
 * @returns {number} Cosine similarity (0.0-1.0)
 */
function cosineSimilarity(vecA, vecB) {
  if (vecA.size === 0 || vecB.size === 0) return 0;

  // Walk the smaller map so the dot product costs O(min(|A|, |B|)).
  const [probe, reference] = vecA.size <= vecB.size ? [vecA, vecB] : [vecB, vecA];

  let dot = 0;
  for (const [term, weight] of probe) {
    const other = reference.get(term);
    if (other !== undefined) {
      dot += weight * other;
    }
  }

  let sumSqA = 0;
  for (const value of vecA.values()) sumSqA += value * value;
  let sumSqB = 0;
  for (const value of vecB.values()) sumSqB += value * value;

  const normA = Math.sqrt(sumSqA);
  const normB = Math.sqrt(sumSqB);
  if (normA === 0 || normB === 0) return 0;

  return clamp01(dot / (normA * normB));
}
|
|
267
|
+
|
|
268
|
+
/**
 * TF-IDF cosine similarity between two threads' searchable texts.
 * @param {Thread} threadA
 * @param {Thread} threadB
 * @param {Corpus} corpus - Pre-built corpus from all threads
 * @returns {number} Cosine similarity (0.0-1.0)
 */
// @cap-todo(ac:F-037/AC-1) TF-IDF cosine similarity with weight 0.5 in the pipeline blend
function tfidfSimilarity(threadA, threadB, corpus) {
  return cosineSimilarity(
    computeTfIdfVector(getThreadText(threadA), corpus),
    computeTfIdfVector(getThreadText(threadB), corpus)
  );
}
|
|
283
|
+
|
|
284
|
+
// --- Character N-Gram Overlap (AC-2) ---
|
|
285
|
+
|
|
286
|
+
/**
 * Extract the set of character trigrams from a text.
 * Lowercases and strips every non-alphanumeric character first, e.g.
 * "session" -> Set(["ses", "ess", "ssi", "sio", "ion"]).
 * Inputs shorter than three normalized characters yield an empty set.
 * @param {string} text - Input text
 * @returns {Set<string>} Set of character trigrams
 */
// @cap-todo(ac:F-037/AC-2) Trigram extraction for typo-resilient matching
function extractTrigrams(text) {
  const trigrams = new Set();
  if (typeof text !== 'string' || text === '') return trigrams;

  const flat = text.toLowerCase().replace(/[^a-z0-9]/g, '');
  for (let start = 0; start + 3 <= flat.length; start++) {
    trigrams.add(flat.slice(start, start + 3));
  }

  return trigrams;
}
|
|
305
|
+
|
|
306
|
+
/**
 * Trigram-based Jaccard similarity between two texts.
 * Catches morphological variants and typos: "authenticate" <-> "authentication".
 * Returns 0 when either text yields no trigrams.
 * @param {string} textA
 * @param {string} textB
 * @returns {number} Similarity score (0.0-1.0)
 */
// @cap-todo(ac:F-037/AC-2) Character N-Gram overlap with weight 0.2 for typo-resilient matching
function trigramSimilarity(textA, textB) {
  const gramsA = extractTrigrams(textA);
  const gramsB = extractTrigrams(textB);

  // Single guard: covers the both-empty case too (previous code had a
  // redundant `&&` check subsumed by this one).
  if (gramsA.size === 0 || gramsB.size === 0) return 0;

  let intersectionSize = 0;
  for (const gram of gramsA) {
    if (gramsB.has(gram)) intersectionSize++;
  }

  // |A ∪ B| = |A| + |B| - |A ∩ B|; avoids allocating a merged Set, and is
  // always >= 1 here since both sets are non-empty (the old unionSize === 0
  // branch was unreachable).
  const unionSize = gramsA.size + gramsB.size - intersectionSize;
  return clamp01(intersectionSize / unionSize);
}
|
|
331
|
+
|
|
332
|
+
// --- Jaccard Keywords (AC-3) ---
|
|
333
|
+
|
|
334
|
+
/**
 * Jaccard similarity over two threads' keyword sets (case-insensitive).
 * Uses thread.keywords arrays directly; null/undefined arrays are treated
 * as empty, and two empty sets score 0.
 * @param {string[]} keywordsA - Keywords from thread A
 * @param {string[]} keywordsB - Keywords from thread B
 * @returns {number} Jaccard similarity (0.0-1.0)
 */
// @cap-todo(ac:F-037/AC-3) Jaccard keyword similarity with weight 0.1 as simple fallback signal
function jaccardKeywordSimilarity(keywordsA, keywordsB) {
  const setA = new Set((keywordsA || []).map(k => k.toLowerCase()));
  const setB = new Set((keywordsB || []).map(k => k.toLowerCase()));

  if (setA.size === 0 && setB.size === 0) return 0;

  let intersectionSize = 0;
  for (const kw of setA) {
    if (setB.has(kw)) intersectionSize++;
  }

  // |A ∪ B| = |A| + |B| - |A ∩ B|; avoids allocating a merged Set, and is
  // always >= 1 here since at least one set is non-empty (the old
  // unionSize === 0 branch was unreachable).
  const unionSize = setA.size + setB.size - intersectionSize;
  return clamp01(intersectionSize / unionSize);
}
|
|
358
|
+
|
|
359
|
+
// --- Stage 1 Combined ---
|
|
360
|
+
|
|
361
|
+
/**
 * Compute all Stage 1 text signals for a thread pair and their weighted blend.
 * Blend = tfidf * 0.5 + ngram * 0.2 + jaccard * 0.1 (Stage 2 supplies the
 * remaining 0.2 of the full pipeline weight).
 * @param {Thread} threadA
 * @param {Thread} threadB
 * @param {Corpus} corpus - Pre-built corpus
 * @returns {Stage1Result}
 */
function computeStage1(threadA, threadB, corpus) {
  const textA = getThreadText(threadA);
  const textB = getThreadText(threadB);

  const tfidf = tfidfSimilarity(threadA, threadB, corpus);
  const ngram = trigramSimilarity(textA, textB);
  const jaccard = jaccardKeywordSimilarity(threadA.keywords, threadB.keywords);

  const combined =
    tfidf * STAGE1_WEIGHT_TFIDF +
    ngram * STAGE1_WEIGHT_NGRAM +
    jaccard * STAGE1_WEIGHT_JACCARD;

  return { tfidf, ngram, jaccard, combined };
}
|
|
383
|
+
|
|
384
|
+
// ============================================================================
|
|
385
|
+
// Stage 2: Concept Signals
|
|
386
|
+
// ============================================================================
|
|
387
|
+
|
|
388
|
+
// --- Seed Taxonomy (AC-4) ---
// @cap-todo(ac:F-037/AC-4) Embedded seed taxonomy of 25 universal software development concepts, no external config
// @cap-decision Taxonomy concepts chosen for breadth across typical software projects. Keywords are lowercase stems/fragments that trigger concept association.

/**
 * Seed taxonomy mapping concept name -> lowercase keyword stems/fragments.
 * Matching is substring-based elsewhere in this module, so a stem such as
 * 'log' will also hit words like 'login' or 'catalog'.
 * NOTE: some keywords deliberately appear under multiple concepts (e.g.
 * 'index' under both 'database' and 'search', 'schema'/'constraint' under
 * both 'database' and 'data-validation'), so one token can contribute to
 * several concept scores.
 * @type {Object<string, string[]>}
 */
const SEED_TAXONOMY = {
  'authentication': ['auth', 'login', 'logout', 'session', 'token', 'jwt', 'oauth', 'sso', 'password', 'credential'],
  'authorization': ['permission', 'role', 'access', 'policy', 'rbac', 'rls', 'acl', 'grant'],
  'database': ['sql', 'query', 'table', 'column', 'migration', 'schema', 'index', 'foreign', 'constraint'],
  'api': ['endpoint', 'route', 'request', 'response', 'rest', 'graphql', 'middleware', 'handler'],
  'testing': ['test', 'assert', 'mock', 'stub', 'coverage', 'vitest', 'jest', 'spec'],
  'caching': ['cache', 'redis', 'ttl', 'invalidate', 'stale', 'refresh', 'memoize'],
  'deployment': ['deploy', 'pipeline', 'docker', 'container', 'kubernetes', 'staging', 'production'],
  'ui-frontend': ['component', 'render', 'react', 'vue', 'svelte', 'tailwind', 'css', 'layout', 'responsive'],
  'state-management': ['state', 'store', 'reducer', 'context', 'redux', 'zustand', 'signal'],
  'file-io': ['file', 'read', 'write', 'stream', 'buffer', 'upload', 'download', 'storage'],
  'error-handling': ['error', 'exception', 'catch', 'throw', 'retry', 'fallback', 'timeout'],
  'configuration': ['config', 'env', 'environment', 'setting', 'option', 'flag', 'feature-flag'],
  'logging': ['log', 'debug', 'trace', 'monitor', 'observability', 'metric', 'alert'],
  'security': ['encrypt', 'hash', 'csrf', 'xss', 'injection', 'sanitize', 'vulnerability', 'secure'],
  'performance': ['optimize', 'latency', 'throughput', 'benchmark', 'profile', 'memory', 'cpu'],
  'data-validation': ['validate', 'schema', 'zod', 'type', 'check', 'constraint', 'format'],
  'messaging': ['queue', 'event', 'publish', 'subscribe', 'webhook', 'notification', 'email'],
  'search': ['search', 'index', 'filter', 'sort', 'paginate', 'fulltext'],
  'version-control': ['git', 'branch', 'commit', 'merge', 'rebase', 'diff', 'conflict'],
  'documentation': ['docs', 'readme', 'comment', 'jsdoc', 'markdown', 'changelog'],
  'build-tooling': ['build', 'bundle', 'compile', 'transpile', 'webpack', 'esbuild', 'vite'],
  'networking': ['http', 'socket', 'websocket', 'fetch', 'cors', 'proxy', 'ssl', 'tls'],
  'serialization': ['json', 'parse', 'stringify', 'serialize', 'deserialize', 'encode', 'decode'],
  'concurrency': ['async', 'await', 'promise', 'parallel', 'worker', 'thread', 'mutex', 'lock'],
  'migration': ['migrate', 'upgrade', 'backward', 'compatible', 'version', 'legacy', 'deprecate'],
};

/** Concept names in stable (alphabetically sorted) order for vector indexing. */
const CONCEPT_NAMES = Object.keys(SEED_TAXONOMY).sort();
|
|
422
|
+
|
|
423
|
+
// --- Co-occurrence Matrix (AC-5) ---
|
|
424
|
+
|
|
425
|
+
/**
 * Build a co-occurrence matrix from observed thread data.
 * A cell is incremented each time a pair of taxonomy concepts shows up
 * together inside one thread's text.
 * @param {Thread[]} threads - All threads to analyze
 * @param {Object<string, string[]>} [taxonomy] - Taxonomy to use (defaults to SEED_TAXONOMY)
 * @returns {CooccurrenceMatrix} Matrix keyed by "conceptA|conceptB"
 */
// @cap-todo(ac:F-037/AC-5) Co-occurrence matrix auto-learns from observed thread data
function buildCooccurrenceMatrix(threads, taxonomy) {
  const tax = taxonomy || SEED_TAXONOMY;
  /** @type {CooccurrenceMatrix} */
  const matrix = {};

  for (const thread of threads) {
    const text = getThreadText(thread);
    const tokens = new Set(tokenize(text));

    // Which taxonomy concepts does this thread mention? A concept counts as
    // present when any of its keywords matches a token or a text substring.
    const present = Object.entries(tax)
      .filter(([, keywords]) =>
        keywords.some((kw) => tokens.has(kw) || textContainsKeyword(text, kw)))
      .map(([concept]) => concept);

    // Tally every unordered pair of co-present concepts for this thread.
    for (let i = 0; i < present.length; i++) {
      for (let j = i + 1; j < present.length; j++) {
        const key = makeCooccurrenceKey(present[i], present[j]);
        const cell = matrix[key] || (matrix[key] = { count: 0, threads: [] });
        cell.count++;
        cell.threads.push(thread.id);
      }
    }
  }

  return matrix;
}
|
|
466
|
+
|
|
467
|
+
/**
 * Check if text contains a keyword (case-insensitive substring match).
 * Used for taxonomy keywords that might be substrings of larger words.
 * Returns false (instead of throwing) for non-string inputs so callers
 * can pass through unvalidated thread text safely.
 * @param {string} text
 * @param {string} keyword
 * @returns {boolean}
 */
function textContainsKeyword(text, keyword) {
  // Guard: original indexOf call would throw on null/undefined text.
  if (typeof text !== 'string' || typeof keyword !== 'string') return false;
  // `includes` is the idiomatic equivalent of `indexOf(...) !== -1`.
  return text.toLowerCase().includes(keyword.toLowerCase());
}
|
|
477
|
+
|
|
478
|
+
/**
 * Create a stable co-occurrence key from two concept names.
 * The two names are ordered alphabetically so "a|b" === "b|a".
 * @param {string} conceptA
 * @param {string} conceptB
 * @returns {string}
 */
function makeCooccurrenceKey(conceptA, conceptB) {
  const [first, second] = conceptA < conceptB
    ? [conceptA, conceptB]
    : [conceptB, conceptA];
  return `${first}|${second}`;
}
|
|
490
|
+
|
|
491
|
+
/**
 * Get confirmed concept pairs that have co-occurred at or above a threshold.
 * @param {CooccurrenceMatrix} matrix
 * @param {number} [threshold=5] - Minimum co-occurrence count (0 is honored)
 * @returns {Array<{key: string, count: number, concepts: [string, string]}>}
 *   Sorted by count, descending.
 */
// @cap-todo(ac:F-037/AC-5) Confirmed pairs override seed weights at >= 5 co-occurrences
function getConfirmedPairs(matrix, threshold) {
  // `typeof` check (not `||`) so an explicit threshold of 0 is respected.
  const minCount = typeof threshold === 'number' ? threshold : 5;

  return Object.entries(matrix)
    .filter(([, entry]) => entry.count >= minCount)
    .map(([key, entry]) => {
      const parts = key.split('|');
      return { key, count: entry.count, concepts: [parts[0], parts[1]] };
    })
    .sort((a, b) => b.count - a.count);
}
|
|
511
|
+
|
|
512
|
+
// --- Concept Vector Projection (AC-6) ---
|
|
513
|
+
|
|
514
|
+
/**
 * Project thread text into concept space using the taxonomy.
 * A concept's score is the fraction of its keywords found (as substrings)
 * in the lowercased text; concepts with zero matches are omitted entirely.
 * @param {string} text - Thread text
 * @param {Object<string, string[]>} [taxonomy] - Taxonomy to use
 * @returns {Map<string, number>} Concept vector (concept name -> score)
 */
// @cap-todo(ac:F-037/AC-6) Concept vector similarity via concept space projection + cosine distance
function projectToConcepts(text, taxonomy) {
  const tax = taxonomy || SEED_TAXONOMY;
  /** @type {Map<string, number>} */
  const vector = new Map();

  // Empty or non-string input projects to the empty vector.
  if (typeof text !== 'string' || text.length === 0) return vector;

  const lowerText = text.toLowerCase();

  for (const [concept, keywords] of Object.entries(tax)) {
    const matched = keywords.filter((kw) => lowerText.includes(kw));
    // Normalize by keyword list length so concepts with long keyword
    // lists are not favored over concepts with short ones.
    const score = keywords.length > 0 ? matched.length / keywords.length : 0;
    if (score > 0) {
      vector.set(concept, score);
    }
  }

  return vector;
}
|
|
548
|
+
|
|
549
|
+
/**
 * Apply co-occurrence boost to a concept vector.
 * For each confirmed pair where exactly one concept is present, the absent
 * concept receives a small score proportional to the pair's co-occurrence
 * strength and the present concept's score.
 * @param {Map<string, number>} vector - Original concept vector
 * @param {CooccurrenceMatrix} matrix - Co-occurrence data
 * @param {number} [threshold=5] - Minimum co-occurrences to trigger boost
 * @returns {Map<string, number>} Boosted copy (or the original Map when no
 *   pair clears the threshold)
 */
// @cap-decision Co-occurrence boost adds 0.1 * (count/maxCount) to the weaker concept in a confirmed pair. This is a gentle nudge, not an override, to preserve the seed taxonomy signal.
function applyCooccurrenceBoost(vector, matrix, threshold) {
  const confirmed = getConfirmedPairs(matrix, threshold);
  if (confirmed.length === 0) return vector;

  const boosted = new Map(vector);
  // confirmed is sorted descending, so the head carries the max count.
  const maxCount = confirmed[0].count;

  for (const { concepts, count } of confirmed) {
    const [left, right] = concepts;
    const leftScore = boosted.get(left) || 0;
    const rightScore = boosted.get(right) || 0;

    // Skip pairs with no footprint in this vector at all.
    if (!(leftScore > 0 || rightScore > 0)) continue;

    const boostFactor = 0.1 * (count / maxCount);

    if (leftScore > 0 && rightScore === 0) {
      boosted.set(right, boostFactor * leftScore);
    } else if (rightScore > 0 && leftScore === 0) {
      boosted.set(left, boostFactor * rightScore);
    }
    // Both present: they already co-occur naturally, nothing to nudge.
  }

  return boosted;
}
|
|
587
|
+
|
|
588
|
+
/**
 * Compute concept-space similarity between two threads.
 * Each thread's text is projected into concept space, optionally nudged by
 * the co-occurrence boost, and the two vectors are compared with cosine
 * similarity.
 * @param {Thread} threadA
 * @param {Thread} threadB
 * @param {Object<string, string[]>} [taxonomy]
 * @param {CooccurrenceMatrix} [cooccurrenceMatrix]
 * @returns {number} Concept similarity (0.0-1.0)
 */
function conceptVectorSimilarity(threadA, threadB, taxonomy, cooccurrenceMatrix) {
  const tax = taxonomy || SEED_TAXONOMY;

  const [vecA, vecB] = [threadA, threadB]
    .map((thread) => projectToConcepts(getThreadText(thread), tax))
    .map((vec) =>
      // Boost only when co-occurrence data was supplied.
      cooccurrenceMatrix ? applyCooccurrenceBoost(vec, cooccurrenceMatrix) : vec);

  return cosineSimilarity(vecA, vecB);
}
|
|
614
|
+
|
|
615
|
+
// --- Stage 2 Combined ---
|
|
616
|
+
|
|
617
|
+
/**
 * Compute all Stage 2 concept signals for a thread pair.
 * Builds the co-occurrence matrix from the full thread set, then scores the
 * pair in concept space.
 * @param {Thread} threadA
 * @param {Thread} threadB
 * @param {Thread[]} allThreads - All threads for co-occurrence matrix
 * @param {Object<string, string[]>} [taxonomy]
 * @returns {Stage2Result}
 */
function computeStage2(threadA, threadB, allThreads, taxonomy) {
  const tax = taxonomy || SEED_TAXONOMY;
  const matrix = buildCooccurrenceMatrix(allThreads, tax);
  const conceptSim = conceptVectorSimilarity(threadA, threadB, tax, matrix);
  const combined = conceptSim * STAGE2_WEIGHT_CONCEPT;

  return { conceptSim, combined };
}
|
|
635
|
+
|
|
636
|
+
// ============================================================================
|
|
637
|
+
// Stage 3: Graph Propagation
|
|
638
|
+
// ============================================================================
|
|
639
|
+
|
|
640
|
+
// @cap-todo(ac:F-037/AC-7) Iterative relaxation propagates affinity scores through memory graph edges
|
|
641
|
+
|
|
642
|
+
/**
 * Find the graph node ID for a thread by its thread ID.
 * Scans every node for a 'thread'-typed node whose metadata carries the
 * matching threadId.
 * @param {MemoryGraph} graph
 * @param {string} threadId
 * @returns {string|null} The node ID, or null when no node matches
 */
function findThreadNodeId(graph, threadId) {
  const entry = Object.entries(graph.nodes || {}).find(([, node]) =>
    node.type === 'thread' && node.metadata && node.metadata.threadId === threadId);
  return entry ? entry[0] : null;
}
|
|
656
|
+
|
|
657
|
+
/**
 * Get all active neighbor node IDs for a given node.
 * Walks every active edge touching the node; edge weight comes from
 * metadata.compositeScore when that is a number, otherwise 1.0. When
 * multiple edges connect the same pair, only the strongest weight is kept.
 * @param {MemoryGraph} graph
 * @param {string} nodeId
 * @returns {Map<string, number>} neighborId -> edge weight
 */
function getWeightedNeighbors(graph, nodeId) {
  /** @type {Map<string, number>} */
  const neighbors = new Map();

  for (const edge of graph.edges || []) {
    if (!edge.active) continue;

    // Resolve the far end of the edge relative to nodeId (if any).
    const otherEnd = edge.source === nodeId
      ? edge.target
      : edge.target === nodeId
        ? edge.source
        : null;
    if (!otherEnd) continue;

    const hasScore = edge.metadata && typeof edge.metadata.compositeScore === 'number';
    const weight = hasScore ? edge.metadata.compositeScore : 1.0;

    // Parallel edges: retain only the strongest connection.
    const best = neighbors.get(otherEnd) || 0;
    if (weight > best) {
      neighbors.set(otherEnd, weight);
    }
  }

  return neighbors;
}
|
|
685
|
+
|
|
686
|
+
/**
 * Propagate affinity scores through the memory graph using iterative relaxation.
 *
 * Algorithm:
 * 1. Initialize scores from direct pairwise similarities (initialScores)
 * 2. For each iteration:
 *    a. For each thread node, collect neighbor scores weighted by edge strength
 *    b. New score = damping * neighborContribution + (1 - damping) * initialScore
 * 3. Return final propagated scores
 *
 * Note the blend anchors on the ORIGINAL initialScore every iteration (not the
 * previous iteration's value), so initial scores always retain (1 - damping)
 * influence; only the neighbor term evolves across iterations.
 *
 * This strengthens connections between threads that share many intermediaries
 * and weakens false connections that lack graph support.
 *
 * @param {MemoryGraph} graph - The memory graph with nodes and weighted edges
 * @param {Object<string, number>} initialScores - Keyed by "threadIdA|threadIdB", values 0.0-1.0
 * @param {Object} [options]
 * @param {number} [options.iterations=5] - Number of relaxation iterations (3-5 recommended)
 * @param {number} [options.damping=0.7] - Damping factor (0.0-1.0). Higher = more propagation influence.
 * @returns {Object<string, number>} Propagated scores keyed the same as initialScores
 */
// @cap-todo(ac:F-037/AC-7) Graph propagation: 3-5 iterations, damping 0.7
function propagateScores(graph, initialScores, options) {
  // `typeof` checks (not `||`) so explicit 0 values for either option are honored.
  const iterations = (options && typeof options.iterations === 'number') ? options.iterations : 5;
  const damping = (options && typeof options.damping === 'number') ? options.damping : 0.7;

  // Without a usable graph or scores, return a defensive copy of the input unchanged.
  if (!graph || !graph.nodes || !initialScores) {
    return { ...(initialScores || {}) };
  }

  // Build a lookup of thread ID -> graph node ID
  /** @type {Map<string, string>} threadId -> nodeId */
  const threadToNode = new Map();
  for (const [nodeId, node] of Object.entries(graph.nodes)) {
    if (node.type === 'thread' && node.metadata && node.metadata.threadId) {
      threadToNode.set(node.metadata.threadId, nodeId);
    }
  }

  // Build adjacency with weights for all thread nodes (computed once, reused
  // across all iterations — the graph topology is static during relaxation).
  /** @type {Map<string, Map<string, number>>} nodeId -> Map(neighborNodeId -> weight) */
  const adjacency = new Map();
  for (const nodeId of threadToNode.values()) {
    adjacency.set(nodeId, getWeightedNeighbors(graph, nodeId));
  }

  // Build a nodeId -> threadId reverse lookup
  /** @type {Map<string, string>} */
  const nodeToThread = new Map();
  for (const [tid, nid] of threadToNode) {
    nodeToThread.set(nid, tid);
  }

  // Current scores — start from initial
  let currentScores = { ...initialScores };

  // Iterative relaxation. Each pass reads `currentScores` (previous iteration's
  // values) and writes a fresh `nextScores` map — a Jacobi-style update where
  // all pairs see the same snapshot within one iteration.
  for (let iter = 0; iter < iterations; iter++) {
    const nextScores = {};

    for (const [pairKey, initialScore] of Object.entries(initialScores)) {
      const [tidA, tidB] = pairKey.split('|');
      const nodeA = threadToNode.get(tidA);
      const nodeB = threadToNode.get(tidB);

      // Pairs whose threads are not in the graph pass through unchanged.
      if (!nodeA || !nodeB) {
        nextScores[pairKey] = initialScore;
        continue;
      }

      // Compute neighbor contribution: average of scores between
      // nodeA's neighbors and nodeB, and nodeB's neighbors and nodeA
      const neighborsA = adjacency.get(nodeA) || new Map();
      const neighborsB = adjacency.get(nodeB) || new Map();

      let neighborSum = 0;
      let neighborCount = 0;

      // Contribution from A's neighbors toward B
      for (const [neighborNodeId, edgeWeight] of neighborsA) {
        const neighborThreadId = nodeToThread.get(neighborNodeId);
        if (!neighborThreadId) continue;
        // Look up score between this neighbor and threadB; pairs with no
        // known score contribute nothing (they are not counted as zeros).
        const key1 = makePairKey(neighborThreadId, tidB);
        const score = currentScores[key1];
        if (score !== undefined) {
          neighborSum += score * edgeWeight;
          neighborCount++;
        }
      }

      // Contribution from B's neighbors toward A
      for (const [neighborNodeId, edgeWeight] of neighborsB) {
        const neighborThreadId = nodeToThread.get(neighborNodeId);
        if (!neighborThreadId) continue;
        const key1 = makePairKey(neighborThreadId, tidA);
        const score = currentScores[key1];
        if (score !== undefined) {
          neighborSum += score * edgeWeight;
          neighborCount++;
        }
      }

      const neighborContribution = neighborCount > 0 ? neighborSum / neighborCount : 0;

      // Relaxation formula: blend of neighbor signal and original score.
      // NOTE: `initialScore` (not the previous iteration's value) anchors the
      // blend, so the original signal is never fully washed out.
      nextScores[pairKey] = clamp01(
        damping * neighborContribution + (1 - damping) * initialScore
      );
    }

    currentScores = nextScores;
  }

  return currentScores;
}
|
|
801
|
+
|
|
802
|
+
/**
 * Create a stable pair key from two thread IDs.
 * IDs are alphabetically ordered so the key is direction-independent.
 * @param {string} tidA
 * @param {string} tidB
 * @returns {string}
 */
function makePairKey(tidA, tidB) {
  const [low, high] = tidA < tidB ? [tidA, tidB] : [tidB, tidA];
  return `${low}|${high}`;
}
|
|
811
|
+
|
|
812
|
+
// ============================================================================
|
|
813
|
+
// Full Pipeline
|
|
814
|
+
// ============================================================================
|
|
815
|
+
|
|
816
|
+
/**
 * Run the complete 3-stage semantic analysis pipeline for one thread pair.
 *
 * Stage 1: Text signals (TF-IDF 0.5 + N-gram 0.2 + Jaccard 0.1)
 * Stage 2: Concept signals (concept vector similarity 0.2)
 * Stage 3: Graph propagation (optional, refines scores via transitive connections)
 *
 * @param {Thread} threadA - First thread
 * @param {Thread} threadB - Second thread
 * @param {PipelineContext} context - All threads, optional graph, taxonomy overrides
 * @returns {PipelineResult}
 */
// @cap-todo(ac:F-037/AC-8) Pure logic pipeline — no I/O, all data passed as arguments
function runPipeline(threadA, threadB, context) {
  const ctx = context || {};
  const allThreads = ctx.allThreads || [threadA, threadB];
  const taxonomy = ctx.taxonomy || SEED_TAXONOMY;
  const graph = ctx.graph || null;
  const propagationOptions = ctx.propagationOptions || { iterations: 5, damping: 0.7 };

  // Stage 1: lexical signals over a corpus built from all threads.
  const stage1 = computeStage1(threadA, threadB, buildCorpus(allThreads));

  // Stage 2: taxonomy-based concept signals.
  const stage2 = computeStage2(threadA, threadB, allThreads, taxonomy);

  // Direct (pre-propagation) score: weighted sum of stages 1 and 2.
  const directScore = clamp01(stage1.combined + stage2.combined);

  let stage3 = {};
  let finalScore = directScore;

  const graphUsable = graph && graph.nodes && Object.keys(graph.nodes).length > 0;
  if (graphUsable) {
    const pairKey = makePairKey(threadA.id, threadB.id);
    const initialScores = { [pairKey]: directScore };

    // Seed propagation with existing affinity-edge scores between thread
    // nodes so relaxation can leverage the full graph, not just this pair.
    for (const edge of graph.edges || []) {
      if (!edge.active || edge.type !== 'affinity') continue;
      if (!edge.metadata || typeof edge.metadata.compositeScore !== 'number') continue;

      const sourceNode = graph.nodes[edge.source];
      const targetNode = graph.nodes[edge.target];
      if (!sourceNode || !targetNode) continue;
      if (sourceNode.type !== 'thread' || targetNode.type !== 'thread') continue;

      const sTid = sourceNode.metadata && sourceNode.metadata.threadId;
      const tTid = targetNode.metadata && targetNode.metadata.threadId;
      if (!sTid || !tTid) continue;

      const edgeKey = makePairKey(sTid, tTid);
      // Never overwrite the pair under analysis, and first edge wins per key.
      if (edgeKey !== pairKey && initialScores[edgeKey] === undefined) {
        initialScores[edgeKey] = edge.metadata.compositeScore;
      }
    }

    stage3 = propagateScores(graph, initialScores, propagationOptions);
    finalScore = clamp01(stage3[pairKey] !== undefined ? stage3[pairKey] : directScore);
  }

  return { stage1, stage2, stage3, finalScore };
}
|
|
885
|
+
|
|
886
|
+
/**
 * Run the pipeline for all unique thread pairs.
 * Batch equivalent of runPipeline: the corpus and co-occurrence matrix are
 * built ONCE up front and shared across every pair (stage logic is inlined
 * here rather than delegating to computeStage1/computeStage2, which would
 * rebuild that shared state per pair).
 * Returns a Map keyed by "threadIdA|threadIdB" -> PipelineResult.
 * @param {Thread[]} threads - All threads
 * @param {PipelineContext} context
 * @returns {Map<string, PipelineResult>}
 */
function runPipelineBatch(threads, context) {
  const results = new Map();
  const allThreads = (context && context.allThreads) || threads;
  // Shared per-batch state: corpus and co-occurrence matrix computed once.
  const corpus = buildCorpus(allThreads);
  const taxonomy = (context && context.taxonomy) || SEED_TAXONOMY;
  const matrix = buildCooccurrenceMatrix(allThreads, taxonomy);
  const graph = (context && context.graph) || null;
  const propagationOptions = (context && context.propagationOptions) || { iterations: 5, damping: 0.7 };

  // Compute direct scores for all pairs
  /** @type {Object<string, number>} */
  const directScores = {};

  // Every unordered pair (i < j) of the input threads.
  for (let i = 0; i < threads.length; i++) {
    for (let j = i + 1; j < threads.length; j++) {
      const a = threads[i];
      const b = threads[j];
      const pairKey = makePairKey(a.id, b.id);

      // Stage 1: text signals (same weights as computeStage1).
      const tfidf = tfidfSimilarity(a, b, corpus);
      const textA = getThreadText(a);
      const textB = getThreadText(b);
      const ngram = trigramSimilarity(textA, textB);
      const jaccard = jaccardKeywordSimilarity(a.keywords, b.keywords);
      const stage1Combined = (tfidf * STAGE1_WEIGHT_TFIDF)
        + (ngram * STAGE1_WEIGHT_NGRAM)
        + (jaccard * STAGE1_WEIGHT_JACCARD);

      const stage1 = { tfidf, ngram, jaccard, combined: stage1Combined };

      // Stage 2: concept signals, reusing the batch-level matrix.
      const conceptSim = conceptVectorSimilarity(a, b, taxonomy, matrix);
      const stage2 = { conceptSim, combined: conceptSim * STAGE2_WEIGHT_CONCEPT };

      const directScore = clamp01(stage1.combined + stage2.combined);
      directScores[pairKey] = directScore;

      // finalScore starts at the direct score; stage 3 may overwrite it below.
      results.set(pairKey, {
        stage1,
        stage2,
        stage3: {},
        finalScore: directScore,
      });
    }
  }

  // Stage 3: batch graph propagation
  if (graph && graph.nodes && Object.keys(graph.nodes).length > 0) {
    // Seed propagation with existing affinity edges between thread nodes.
    // Freshly computed direct scores take precedence: an edge score is only
    // used for pairs not already present in directScores.
    for (const edge of (graph.edges || [])) {
      if (!edge.active || edge.type !== 'affinity') continue;
      if (!edge.metadata || typeof edge.metadata.compositeScore !== 'number') continue;

      const sourceNode = graph.nodes[edge.source];
      const targetNode = graph.nodes[edge.target];
      if (!sourceNode || !targetNode) continue;
      if (sourceNode.type !== 'thread' || targetNode.type !== 'thread') continue;

      const sTid = sourceNode.metadata && sourceNode.metadata.threadId;
      const tTid = targetNode.metadata && targetNode.metadata.threadId;
      if (!sTid || !tTid) continue;

      const existingKey = makePairKey(sTid, tTid);
      if (directScores[existingKey] === undefined) {
        directScores[existingKey] = edge.metadata.compositeScore;
      }
    }

    const propagated = propagateScores(graph, directScores, propagationOptions);

    // Update results with propagated scores.
    // NOTE: every result's stage3 field references the SAME propagated
    // object (not a per-pair copy) — callers should treat it as read-only.
    for (const [pairKey, result] of results) {
      result.stage3 = propagated;
      if (propagated[pairKey] !== undefined) {
        result.finalScore = clamp01(propagated[pairKey]);
      }
    }
  }

  return results;
}
|
|
975
|
+
|
|
976
|
+
// ============================================================================
|
|
977
|
+
// Utility
|
|
978
|
+
// ============================================================================
|
|
979
|
+
|
|
980
|
+
/**
 * Clamp a number into the closed interval [0.0, 1.0].
 * NaN propagates through unchanged, matching Math.min/Math.max semantics.
 * @param {number} n
 * @returns {number}
 */
function clamp01(n) {
  return Math.min(1, Math.max(0, n));
}
|
|
988
|
+
|
|
989
|
+
// ============================================================================
|
|
990
|
+
// Module Exports
|
|
991
|
+
// ============================================================================
|
|
992
|
+
|
|
993
|
+
// @cap-decision Exporting internal helpers with _ prefix for testing, matching cap-affinity-engine.cjs convention.

// Public surface: the full pipeline entry points, per-stage computations,
// taxonomy constants and stage weights. Underscore-prefixed entries re-export
// private helpers for unit tests only and are not part of the stable API.
module.exports = {
  // --- Full Pipeline ---
  runPipeline,
  runPipelineBatch,

  // --- Stage 1: Text Signals ---
  computeStage1,
  tfidfSimilarity,
  trigramSimilarity,
  jaccardKeywordSimilarity,

  // --- Stage 2: Concept Signals ---
  computeStage2,
  conceptVectorSimilarity,
  buildCooccurrenceMatrix,
  getConfirmedPairs,
  projectToConcepts,

  // --- Stage 3: Graph Propagation ---
  propagateScores,

  // --- Constants ---
  SEED_TAXONOMY,
  CONCEPT_NAMES,
  STAGE1_WEIGHT_TFIDF,
  STAGE1_WEIGHT_NGRAM,
  STAGE1_WEIGHT_JACCARD,
  STAGE2_WEIGHT_CONCEPT,

  // --- Internals (for testing) ---
  _tokenize: tokenize,
  _getThreadText: getThreadText,
  _buildCorpus: buildCorpus,
  _computeTfIdfVector: computeTfIdfVector,
  _cosineSimilarity: cosineSimilarity,
  _extractTrigrams: extractTrigrams,
  _makeCooccurrenceKey: makeCooccurrenceKey,
  _applyCooccurrenceBoost: applyCooccurrenceBoost,
  _findThreadNodeId: findThreadNodeId,
  _getWeightedNeighbors: getWeightedNeighbors,
  _makePairKey: makePairKey,
  _clamp01: clamp01,
  _textContainsKeyword: textContainsKeyword,
};
|