amalfa 0.0.0-reserved → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. package/.biomeignore +19 -0
  2. package/:memory: +0 -0
  3. package/:memory:-shm +0 -0
  4. package/:memory:-wal +0 -0
  5. package/CHANGELOG.md.old +43 -0
  6. package/LICENSE +21 -0
  7. package/README.md +359 -13
  8. package/README.old.md +112 -0
  9. package/ROADMAP.md +316 -0
  10. package/TEST_PLAN.md +561 -0
  11. package/agents.config.json +11 -0
  12. package/amalfa.config.example.ts +102 -0
  13. package/biome.json +49 -0
  14. package/bun.lock +371 -0
  15. package/docs/AGENT_PROTOCOLS.md +28 -0
  16. package/docs/ARCHITECTURAL_OVERVIEW.md +123 -0
  17. package/docs/BENTO_BOXING_DEPRECATION.md +281 -0
  18. package/docs/Bun-SQLite.html +464 -0
  19. package/docs/COMMIT_GUIDELINES.md +367 -0
  20. package/docs/DEVELOPER_ONBOARDING.md +36 -0
  21. package/docs/Graph and Vector Database Best Practices.md +214 -0
  22. package/docs/PERFORMANCE_BASELINES.md +88 -0
  23. package/docs/REPOSITORY_CLEANUP_SUMMARY.md +261 -0
  24. package/docs/edge-generation-methods.md +57 -0
  25. package/docs/elevator-pitch.md +118 -0
  26. package/docs/graph-and-vector-database-playbook.html +480 -0
  27. package/docs/hardened-sqlite.md +85 -0
  28. package/docs/headless-knowledge-management.md +79 -0
  29. package/docs/john-kaye-flux-prompt.md +46 -0
  30. package/docs/keyboard-shortcuts.md +80 -0
  31. package/docs/opinion-proceed-pattern.md +29 -0
  32. package/docs/polyvis-nodes-edges-schema.md +77 -0
  33. package/docs/protocols/lab-protocol.md +30 -0
  34. package/docs/reaction-iquest-loop-coder.md +46 -0
  35. package/docs/services.md +60 -0
  36. package/docs/sqlite-wal-readonly-trap.md +228 -0
  37. package/docs/strategy/css-architecture.md +40 -0
  38. package/docs/test-document-cycle.md +83 -0
  39. package/docs/test_lifecycle_E2E.md +4 -0
  40. package/docs/the-bicameral-graph.md +83 -0
  41. package/docs/user-guide.md +70 -0
  42. package/docs/vision-helper.md +53 -0
  43. package/drizzle/0000_minor_iron_fist.sql +19 -0
  44. package/drizzle/meta/0000_snapshot.json +139 -0
  45. package/drizzle/meta/_journal.json +13 -0
  46. package/example_usage.ts +39 -0
  47. package/experiment.sh +35 -0
  48. package/hello +2 -0
  49. package/index.html +52 -0
  50. package/knowledge/excalibur.md +12 -0
  51. package/package.json +60 -15
  52. package/plans/experience-graph-integration.md +60 -0
  53. package/prompts/gemini-king-mode-prompt.md +46 -0
  54. package/public/docs/MCP_TOOLS.md +372 -0
  55. package/schemas/README.md +20 -0
  56. package/schemas/cda.schema.json +84 -0
  57. package/schemas/conceptual-lexicon.schema.json +75 -0
  58. package/scratchpads/dummy-debrief-boxed.md +39 -0
  59. package/scratchpads/dummy-debrief.md +27 -0
  60. package/scratchpads/scratchpad-design.md +50 -0
  61. package/scratchpads/scratchpad-scrolling.md +20 -0
  62. package/scratchpads/scratchpad-toc-disappearance.md +23 -0
  63. package/scratchpads/scratchpad-toc.md +28 -0
  64. package/scratchpads/test_gardener.md +7 -0
  65. package/src/EnlightenedTriad.ts +146 -0
  66. package/src/JIT_Triad.ts +137 -0
  67. package/src/cli.ts +364 -0
  68. package/src/config/constants.ts +7 -0
  69. package/src/config/defaults.ts +99 -0
  70. package/src/core/BentoNormalizer.ts +113 -0
  71. package/src/core/EdgeWeaver.ts +145 -0
  72. package/src/core/FractureLogic.ts +22 -0
  73. package/src/core/Harvester.ts +73 -0
  74. package/src/core/LLMClient.ts +93 -0
  75. package/src/core/LouvainGate.ts +67 -0
  76. package/src/core/MarkdownMasker.ts +49 -0
  77. package/src/core/README.md +11 -0
  78. package/src/core/SemanticMatcher.ts +89 -0
  79. package/src/core/SemanticWeaver.ts +96 -0
  80. package/src/core/TagEngine.ts +56 -0
  81. package/src/core/TimelineWeaver.ts +61 -0
  82. package/src/core/VectorEngine.ts +232 -0
  83. package/src/daemon/index.ts +225 -0
  84. package/src/data/experience/test_doc_1.md +2 -0
  85. package/src/data/experience/test_doc_2.md +2 -0
  86. package/src/db/schema.ts +46 -0
  87. package/src/demo-triad.ts +45 -0
  88. package/src/gardeners/AutoTagger.ts +116 -0
  89. package/src/gardeners/BaseGardener.ts +55 -0
  90. package/src/llm/EnlightenedProvider.ts +95 -0
  91. package/src/mcp/README.md +6 -0
  92. package/src/mcp/index.ts +341 -0
  93. package/src/pipeline/AmalfaIngestor.ts +272 -0
  94. package/src/pipeline/HarvesterPipeline.ts +101 -0
  95. package/src/pipeline/Ingestor.ts +555 -0
  96. package/src/pipeline/PreFlightAnalyzer.ts +434 -0
  97. package/src/pipeline/README.md +7 -0
  98. package/src/pipeline/SemanticHarvester.ts +222 -0
  99. package/src/resonance/DatabaseFactory.ts +100 -0
  100. package/src/resonance/README.md +148 -0
  101. package/src/resonance/cli/README.md +7 -0
  102. package/src/resonance/cli/ingest.ts +41 -0
  103. package/src/resonance/cli/migrate.ts +54 -0
  104. package/src/resonance/config.ts +40 -0
  105. package/src/resonance/daemon.ts +236 -0
  106. package/src/resonance/db.ts +424 -0
  107. package/src/resonance/pipeline/README.md +7 -0
  108. package/src/resonance/pipeline/extract.ts +89 -0
  109. package/src/resonance/pipeline/transform_docs.ts +60 -0
  110. package/src/resonance/schema.ts +156 -0
  111. package/src/resonance/services/embedder.ts +131 -0
  112. package/src/resonance/services/simpleTokenizer.ts +119 -0
  113. package/src/resonance/services/stats.ts +327 -0
  114. package/src/resonance/services/tokenizer.ts +159 -0
  115. package/src/resonance/transform/cda.ts +393 -0
  116. package/src/resonance/types/enriched-cda.ts +112 -0
  117. package/src/services/README.md +56 -0
  118. package/src/services/llama.ts +59 -0
  119. package/src/services/llamauv.ts +56 -0
  120. package/src/services/olmo3.ts +58 -0
  121. package/src/services/phi.ts +52 -0
  122. package/src/types/artifact.ts +12 -0
  123. package/src/utils/EnvironmentVerifier.ts +67 -0
  124. package/src/utils/Logger.ts +21 -0
  125. package/src/utils/ServiceLifecycle.ts +207 -0
  126. package/src/utils/ZombieDefense.ts +244 -0
  127. package/src/utils/validator.ts +264 -0
  128. package/substack/substack-playbook-1.md +95 -0
  129. package/substack/substack-playbook-2.md +78 -0
  130. package/tasks/ui-investigation.md +26 -0
  131. package/test-db +0 -0
  132. package/test-db-shm +0 -0
  133. package/test-db-wal +0 -0
  134. package/tests/canary/verify_pinch_check.ts +44 -0
  135. package/tests/fixtures/ingest_test.md +12 -0
  136. package/tests/fixtures/ingest_test_boxed.md +13 -0
  137. package/tests/fixtures/safety_test.md +45 -0
  138. package/tests/fixtures/safety_test_boxed.md +49 -0
  139. package/tests/fixtures/tagged_output.md +49 -0
  140. package/tests/fixtures/tagged_test.md +49 -0
  141. package/tests/mcp-server-settings.json +8 -0
  142. package/tsconfig.json +46 -0
  143. package/verify-embedder.ts +54 -0
@@ -0,0 +1,327 @@
1
+ /**
2
+ * Ingestion Pipeline Observability
3
+ *
4
+ * Tracks metrics at each pipeline stage and verifies against baseline
5
+ * to detect silent failures (e.g., PERSONA graph losing 3,000 edges).
6
+ */
7
+
8
/**
 * Edge counts keyed by edge type, plus a running `total` across all types.
 *
 * NOTE(review): the index signature makes per-type counts and the aggregate
 * share one namespace — an edge type literally named "total" would collide
 * with the running total maintained by IngestionStats.recordEdge().
 */
export interface EdgeCounts {
  [type: string]: number;
  total: number;
}
12
+
13
/**
 * Point-in-time ingestion metrics for a single domain (persona or experience).
 */
export interface DomainStats {
  /** Number of graph nodes recorded. */
  nodes: number;
  /** Edge counts broken down by type, with a `total` aggregate. */
  edges: EdgeCounts;
  /** Number of embedding vectors recorded. */
  vectors: number;
  /** Semantic-token extraction count; only tracked for the experience domain. */
  semantic_tokens?: number;
}
19
+
20
/** Metrics for both ingestion domains tracked by the pipeline. */
export interface PipelineMetrics {
  persona: DomainStats;
  experience: DomainStats;
}
24
+
25
/**
 * Expected ("golden") ingestion counts loaded from a JSON baseline file,
 * used by IngestionStats to detect silent regressions between pipeline runs.
 */
export interface Baseline {
  version: string;
  last_updated: string;
  description: string;
  persona: {
    /** Node counts by kind; only `total` is compared during verification. */
    nodes: { concepts: number; directives: number; total: number };
    edges: EdgeCounts;
    vectors: number;
    notes: string;
  };
  experience: {
    /** Node counts by kind; only `total` is compared during verification. */
    nodes: {
      debriefs: number;
      playbooks: number;
      documents: number;
      total: number;
    };
    edges: EdgeCounts;
    vectors: number;
    semantic_tokens: number;
    notes: string;
  };
  /**
   * Allowed fractional drift per metric before a mismatch is reported
   * (compared against |actual - expected| / expected, so 0.05 = 5%).
   */
  tolerance: {
    nodes: number;
    edges: number;
    vectors: number;
    description: string;
  };
}
54
+
55
/** A single metric whose actual value drifted beyond tolerance from the baseline. */
export interface Mismatch {
  /** "persona" or "experience". */
  domain: string;
  /** Metric name, e.g. "nodes", "edges.total", "vectors". */
  metric: string;
  expected: number;
  actual: number;
  /** Signed difference: actual - expected. */
  delta: number;
  /** Fractional drift relative to the expected value. */
  variance: number;
}
63
+
64
+ export class IngestionStats {
65
+ private metrics: PipelineMetrics;
66
+ private baseline: Baseline | null = null;
67
+
68
+ constructor() {
69
+ this.metrics = {
70
+ persona: {
71
+ nodes: 0,
72
+ edges: { total: 0 },
73
+ vectors: 0,
74
+ },
75
+ experience: {
76
+ nodes: 0,
77
+ edges: { total: 0 },
78
+ vectors: 0,
79
+ semantic_tokens: 0,
80
+ },
81
+ };
82
+ }
83
+
84
+ /**
85
+ * Load baseline from file
86
+ */
87
+ async loadBaseline(path: string): Promise<void> {
88
+ try {
89
+ const file = Bun.file(path);
90
+ this.baseline = await file.json();
91
+ console.log(`📊 Baseline loaded: v${this.baseline?.version}`);
92
+ } catch (_error) {
93
+ console.warn(
94
+ `⚠️ No baseline found at ${path}. Skipping baseline verification.`,
95
+ );
96
+ }
97
+ }
98
+
99
+ /**
100
+ * Record node creation
101
+ */
102
+ recordNode(domain: "persona" | "experience"): void {
103
+ this.metrics[domain].nodes++;
104
+ }
105
+
106
+ /**
107
+ * Record edge creation
108
+ */
109
+ recordEdge(domain: "persona" | "experience", type: string): void {
110
+ if (!this.metrics[domain].edges[type]) {
111
+ this.metrics[domain].edges[type] = 0;
112
+ }
113
+ this.metrics[domain].edges[type]++;
114
+ this.metrics[domain].edges.total++;
115
+ }
116
+
117
+ /**
118
+ * Record vector creation
119
+ */
120
+ recordVector(domain: "persona" | "experience"): void {
121
+ this.metrics[domain].vectors++;
122
+ }
123
+
124
+ /**
125
+ * Record semantic token extraction
126
+ */
127
+ recordSemanticTokens(domain: "experience"): void {
128
+ if (this.metrics[domain].semantic_tokens !== undefined) {
129
+ this.metrics[domain].semantic_tokens++;
130
+ }
131
+ }
132
+
133
+ /**
134
+ * Get current metrics
135
+ */
136
+ getMetrics(): PipelineMetrics {
137
+ return this.metrics;
138
+ }
139
+
140
+ /**
141
+ * Verify against baseline
142
+ */
143
+ verifyAgainstBaseline(): Mismatch[] {
144
+ if (!this.baseline) {
145
+ console.warn("⚠️ No baseline loaded. Skipping verification.");
146
+ return [];
147
+ }
148
+
149
+ const mismatches: Mismatch[] = [];
150
+ const tolerance = this.baseline.tolerance;
151
+
152
+ // Verify PERSONA domain
153
+ this.checkMetric(
154
+ mismatches,
155
+ "persona",
156
+ "nodes",
157
+ this.baseline.persona.nodes.total,
158
+ this.metrics.persona.nodes,
159
+ tolerance.nodes,
160
+ );
161
+
162
+ this.checkMetric(
163
+ mismatches,
164
+ "persona",
165
+ "edges.total",
166
+ this.baseline.persona.edges.total,
167
+ this.metrics.persona.edges.total,
168
+ tolerance.edges,
169
+ );
170
+
171
+ this.checkMetric(
172
+ mismatches,
173
+ "persona",
174
+ "vectors",
175
+ this.baseline.persona.vectors,
176
+ this.metrics.persona.vectors,
177
+ tolerance.vectors,
178
+ );
179
+
180
+ // Verify EXPERIENCE domain
181
+ this.checkMetric(
182
+ mismatches,
183
+ "experience",
184
+ "nodes",
185
+ this.baseline.experience.nodes.total,
186
+ this.metrics.experience.nodes,
187
+ tolerance.nodes,
188
+ );
189
+
190
+ this.checkMetric(
191
+ mismatches,
192
+ "experience",
193
+ "edges.total",
194
+ this.baseline.experience.edges.total,
195
+ this.metrics.experience.edges.total,
196
+ tolerance.edges,
197
+ );
198
+
199
+ this.checkMetric(
200
+ mismatches,
201
+ "experience",
202
+ "vectors",
203
+ this.baseline.experience.vectors,
204
+ this.metrics.experience.vectors,
205
+ tolerance.vectors,
206
+ );
207
+
208
+ return mismatches;
209
+ }
210
+
211
+ /**
212
+ * Check a single metric against baseline
213
+ */
214
+ private checkMetric(
215
+ mismatches: Mismatch[],
216
+ domain: string,
217
+ metric: string,
218
+ expected: number,
219
+ actual: number,
220
+ tolerance: number,
221
+ ): void {
222
+ const delta = actual - expected;
223
+ const variance = expected > 0 ? Math.abs(delta) / expected : 0;
224
+
225
+ if (variance > tolerance) {
226
+ mismatches.push({
227
+ domain,
228
+ metric,
229
+ expected,
230
+ actual,
231
+ delta,
232
+ variance,
233
+ });
234
+ }
235
+ }
236
+
237
+ /**
238
+ * Print summary report
239
+ */
240
+ printSummary(): void {
241
+ console.log("\n📊 Ingestion Summary");
242
+ console.log("═".repeat(60));
243
+
244
+ console.log("\n🧠 PERSONA Domain:");
245
+ console.log(` Nodes: ${this.metrics.persona.nodes}`);
246
+ console.log(` Edges: ${this.metrics.persona.edges.total}`);
247
+ console.log(` Vectors: ${this.metrics.persona.vectors}`);
248
+
249
+ console.log("\n📚 EXPERIENCE Domain:");
250
+ console.log(` Nodes: ${this.metrics.experience.nodes}`);
251
+ console.log(` Edges: ${this.metrics.experience.edges.total}`);
252
+ console.log(` Vectors: ${this.metrics.experience.vectors}`);
253
+ console.log(` Tokens: ${this.metrics.experience.semantic_tokens || 0}`);
254
+
255
+ console.log(`\n${"═".repeat(60)}`);
256
+ }
257
+
258
+ /**
259
+ * Print verification results
260
+ */
261
+ printVerification(mismatches: Mismatch[]): void {
262
+ if (mismatches.length === 0) {
263
+ console.log("\n✅ Baseline Verification: PASSED");
264
+ return;
265
+ }
266
+
267
+ console.log("\n❌ Baseline Verification: FAILED");
268
+ console.log("═".repeat(60));
269
+ console.table(
270
+ mismatches.map((m) => ({
271
+ Domain: m.domain,
272
+ Metric: m.metric,
273
+ Expected: m.expected,
274
+ Actual: m.actual,
275
+ Delta: m.delta,
276
+ "Variance %": `${(m.variance * 100).toFixed(1)}%`,
277
+ })),
278
+ );
279
+ console.log("═".repeat(60));
280
+ }
281
+
282
+ /**
283
+ * Generate markdown report
284
+ */
285
+ async generateReport(outputPath: string): Promise<void> {
286
+ const now = new Date().toISOString().split("T")[0];
287
+ const mismatches = this.verifyAgainstBaseline();
288
+
289
+ let report = `# Ingestion Report: ${now}\n\n`;
290
+
291
+ // PERSONA Domain
292
+ report += `## PERSONA Domain\n\n`;
293
+ report += `| Metric | Expected | Actual | Status |\n`;
294
+ report += `|--------|----------|--------|--------|\n`;
295
+
296
+ if (this.baseline) {
297
+ report += `| Nodes | ${this.baseline.persona.nodes.total} | ${this.metrics.persona.nodes} | ${this.metrics.persona.nodes === this.baseline.persona.nodes.total ? "✅" : "❌"} |\n`;
298
+ report += `| Edges | ${this.baseline.persona.edges.total} | ${this.metrics.persona.edges.total} | ${this.metrics.persona.edges.total === this.baseline.persona.edges.total ? "✅" : "❌"} |\n`;
299
+ report += `| Vectors| ${this.baseline.persona.vectors} | ${this.metrics.persona.vectors} | ${this.metrics.persona.vectors === this.baseline.persona.vectors ? "✅" : "❌"} |\n`;
300
+ }
301
+
302
+ // EXPERIENCE Domain
303
+ report += `\n## EXPERIENCE Domain\n\n`;
304
+ report += `| Metric | Expected | Actual | Status |\n`;
305
+ report += `|--------|----------|--------|--------|\n`;
306
+
307
+ if (this.baseline) {
308
+ report += `| Nodes | ${this.baseline.experience.nodes.total} | ${this.metrics.experience.nodes} | ${this.metrics.experience.nodes === this.baseline.experience.nodes.total ? "✅" : "❌"} |\n`;
309
+ report += `| Edges | ${this.baseline.experience.edges.total} | ${this.metrics.experience.edges.total} | ${this.metrics.experience.edges.total === this.baseline.experience.edges.total ? "✅" : "❌"} |\n`;
310
+ report += `| Vectors| ${this.baseline.experience.vectors} | ${this.metrics.experience.vectors} | ${this.metrics.experience.vectors === this.baseline.experience.vectors ? "✅" : "❌"} |\n`;
311
+ }
312
+
313
+ // Mismatches
314
+ if (mismatches.length > 0) {
315
+ report += `\n## ⚠️ Baseline Violations\n\n`;
316
+ report += `| Domain | Metric | Expected | Actual | Delta | Variance |\n`;
317
+ report += `|--------|--------|----------|--------|-------|----------|\n`;
318
+
319
+ for (const m of mismatches) {
320
+ report += `| ${m.domain} | ${m.metric} | ${m.expected} | ${m.actual} | ${m.delta > 0 ? "+" : ""}${m.delta} | ${(m.variance * 100).toFixed(1)}% |\n`;
321
+ }
322
+ }
323
+
324
+ await Bun.write(outputPath, report);
325
+ console.log(`\n📄 Report saved to: ${outputPath}`);
326
+ }
327
+ }
@@ -0,0 +1,159 @@
1
+ import nlp from "compromise";
2
+
3
/**
 * Named entities and domain terms extracted from a block of text.
 * people/places/organizations/topics come from compromise NER;
 * protocols/concepts (and extra organizations) come from the custom
 * vocabulary scanner in TokenizerService.extract().
 */
export interface SemanticTags {
  people: string[];
  places: string[];
  organizations: string[];
  topics: string[];
  // Not populated by extract() in this file.
  dates?: string[];
  // Initialized to [] by extract() and never filled here.
  money: string[];
  protocols?: string[];
  concepts?: string[];
}
13
+
14
+ export class TokenizerService {
15
+ private static instance: TokenizerService;
16
+ // Map of normalized_term -> tag inside class state
17
+ private vocabulary: Map<string, string> = new Map();
18
+ // Cache keys sorted by length (desc) for greedy matching
19
+ private searchKeys: string[] = [];
20
+
21
+ // Compromise instance (optional, keeping for 'people', 'places' currently)
22
+ // Could eventually remove if we go 100% custom.
23
+
24
+ private constructor() {}
25
+
26
+ public static getInstance(): TokenizerService {
27
+ if (!TokenizerService.instance) {
28
+ TokenizerService.instance = new TokenizerService();
29
+ }
30
+ return TokenizerService.instance;
31
+ }
32
+
33
+ /**
34
+ * Extracts semantic entities.
35
+ * 1. Uses Compromise for generic Named Entity Recognition (NER)
36
+ * 2. Uses Custom "Zero Magic" Brute Force Scanner for Domain Vocabulary
37
+ */
38
+ public extract(text: string): SemanticTags {
39
+ const doc = nlp(text);
40
+
41
+ // 1. Generic NLP (Keep for now as fallback/enrichment)
42
+ const result: SemanticTags = {
43
+ people: doc.people().out("array"),
44
+ places: doc.places().out("array"),
45
+ organizations: doc.organizations().out("array"),
46
+ topics: doc.topics().out("array"),
47
+ money: [],
48
+ protocols: [],
49
+ concepts: [],
50
+ };
51
+
52
+ // 2. Zero Magic Domain Scan (Brute Force)
53
+ // Optimization: Check text.includes() only if vocabulary is small?
54
+ // But for regex construction or Aho-Corasick, naive loop is fine for now on small text blocks (Bento boxes).
55
+
56
+ const lowerText = text.toLowerCase();
57
+
58
+ for (const term of this.searchKeys) {
59
+ // Simple subset check.
60
+ // Limitation: Matches "pro" in "process". Needs word boundary check.
61
+ // RegExp construction is costly inside loop?
62
+ // Better: Pre-build a massive Regex?
63
+ // Or simpler: \bSTR\b with indexof?
64
+
65
+ // Fast "includes" check first
66
+ if (lowerText.includes(term)) {
67
+ // Confirm Word Boundary to avoid partial matches
68
+ // Regex is expensive, but safer for accuracy.
69
+ // We construct regex only on 'hit' to save cycles?
70
+ const escaped = term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
71
+ const boundaryRegex = new RegExp(`\\b${escaped}\\b`, "i");
72
+
73
+ if (boundaryRegex.test(text)) {
74
+ const tag = this.vocabulary.get(term);
75
+ // Retrieve Canonical Form (Original Case) if needed?
76
+ // For now, we normalize to the lowercase SEARCH key, but we might prefer the original.
77
+ // Given the user wants "OH-058" format, let's try to map back if possible.
78
+ // But we only stored 'tag' in the map.
79
+ // Let's store Normalized -> Original ID in a separate map if required?
80
+ // User Request: "no need to manage case" -> likely means "return standard ID".
81
+
82
+ // Since we are returning the matched string (which is usually the ID in lowercase in our loop),
83
+ // If we want UPPERCASE OH-058, we need to know it.
84
+ // Simplest fix: Just return the term as found (lowercase) and EdgeWeaver will handle lookup.
85
+ // EdgeWeaver expects slugified keys anyway.
86
+
87
+ // Actually, let's return the TERM as it appears in the text?
88
+ // boundaryRegex.match(text) would give us the real casing used in the doc (e.g. "OH-058").
89
+
90
+ const match = boundaryRegex.exec(text);
91
+ const realTerm = match ? match[0] : term;
92
+
93
+ if (tag === "Protocol") {
94
+ if (!result.protocols) result.protocols = [];
95
+ if (!result.protocols.includes(realTerm))
96
+ result.protocols.push(realTerm);
97
+ } else if (tag === "Concept") {
98
+ if (!result.concepts) result.concepts = [];
99
+ if (!result.concepts.includes(realTerm))
100
+ result.concepts.push(realTerm);
101
+ } else if (tag === "Organization") {
102
+ if (!result.organizations.includes(realTerm))
103
+ result.organizations.push(realTerm);
104
+ } else {
105
+ // Default to Concept if tag is unknown or not explicitly handled
106
+ if (!result.concepts) result.concepts = [];
107
+ if (!result.concepts.includes(realTerm))
108
+ result.concepts.push(realTerm);
109
+ }
110
+ }
111
+ }
112
+ }
113
+
114
+ return result;
115
+ }
116
+
117
+ public loadLexicon(
118
+ lexicon: { id: string; title: string; type?: string; category?: string }[],
119
+ ) {
120
+ // Reset
121
+ this.vocabulary.clear();
122
+
123
+ for (const item of lexicon) {
124
+ let tag = "Concept";
125
+ if (item.type === "operational-heuristic") tag = "Protocol";
126
+ if (item.category === "Tool") tag = "Organization";
127
+
128
+ // Add Title
129
+ if (item.title) {
130
+ this.vocabulary.set(item.title.toLowerCase(), tag);
131
+ }
132
+ // Add ID
133
+ if (item.id) {
134
+ this.vocabulary.set(item.id.toLowerCase(), tag);
135
+ // Handle Hyphen Variants
136
+ if (item.id.includes("-")) {
137
+ this.vocabulary.set(item.id.toLowerCase().replace(/-/g, " "), tag);
138
+ }
139
+ }
140
+ }
141
+
142
+ // Sort keys by length desc to ensure "Web Standards" matches before "Web"
143
+ this.searchKeys = Array.from(this.vocabulary.keys()).sort(
144
+ (a, b) => b.length - a.length,
145
+ );
146
+
147
+ console.log(
148
+ `🧠 ZeroMagic Tokenizer learned ${this.vocabulary.size} terms from lexicon.`,
149
+ );
150
+ }
151
+
152
+ // Deprecated / No-Op
153
+ public extend(
154
+ _customWords: Record<string, string>,
155
+ _customPatterns: Record<string, string>,
156
+ ) {
157
+ // No-op for brute force scanner
158
+ }
159
+ }