amalfa 0.0.0-reserved → 1.0.1
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/.biomeignore +19 -0
- package/:memory: +0 -0
- package/:memory:-shm +0 -0
- package/:memory:-wal +0 -0
- package/CHANGELOG.md.old +43 -0
- package/LICENSE +21 -0
- package/README.md +359 -13
- package/README.old.md +112 -0
- package/ROADMAP.md +316 -0
- package/TEST_PLAN.md +561 -0
- package/agents.config.json +11 -0
- package/amalfa.config.example.ts +102 -0
- package/biome.json +49 -0
- package/bun.lock +371 -0
- package/docs/AGENT_PROTOCOLS.md +28 -0
- package/docs/ARCHITECTURAL_OVERVIEW.md +123 -0
- package/docs/BENTO_BOXING_DEPRECATION.md +281 -0
- package/docs/Bun-SQLite.html +464 -0
- package/docs/COMMIT_GUIDELINES.md +367 -0
- package/docs/DEVELOPER_ONBOARDING.md +36 -0
- package/docs/Graph and Vector Database Best Practices.md +214 -0
- package/docs/PERFORMANCE_BASELINES.md +88 -0
- package/docs/REPOSITORY_CLEANUP_SUMMARY.md +261 -0
- package/docs/edge-generation-methods.md +57 -0
- package/docs/elevator-pitch.md +118 -0
- package/docs/graph-and-vector-database-playbook.html +480 -0
- package/docs/hardened-sqlite.md +85 -0
- package/docs/headless-knowledge-management.md +79 -0
- package/docs/john-kaye-flux-prompt.md +46 -0
- package/docs/keyboard-shortcuts.md +80 -0
- package/docs/opinion-proceed-pattern.md +29 -0
- package/docs/polyvis-nodes-edges-schema.md +77 -0
- package/docs/protocols/lab-protocol.md +30 -0
- package/docs/reaction-iquest-loop-coder.md +46 -0
- package/docs/services.md +60 -0
- package/docs/sqlite-wal-readonly-trap.md +228 -0
- package/docs/strategy/css-architecture.md +40 -0
- package/docs/test-document-cycle.md +83 -0
- package/docs/test_lifecycle_E2E.md +4 -0
- package/docs/the-bicameral-graph.md +83 -0
- package/docs/user-guide.md +70 -0
- package/docs/vision-helper.md +53 -0
- package/drizzle/0000_minor_iron_fist.sql +19 -0
- package/drizzle/meta/0000_snapshot.json +139 -0
- package/drizzle/meta/_journal.json +13 -0
- package/example_usage.ts +39 -0
- package/experiment.sh +35 -0
- package/hello +2 -0
- package/index.html +52 -0
- package/knowledge/excalibur.md +12 -0
- package/package.json +60 -15
- package/plans/experience-graph-integration.md +60 -0
- package/prompts/gemini-king-mode-prompt.md +46 -0
- package/public/docs/MCP_TOOLS.md +372 -0
- package/schemas/README.md +20 -0
- package/schemas/cda.schema.json +84 -0
- package/schemas/conceptual-lexicon.schema.json +75 -0
- package/scratchpads/dummy-debrief-boxed.md +39 -0
- package/scratchpads/dummy-debrief.md +27 -0
- package/scratchpads/scratchpad-design.md +50 -0
- package/scratchpads/scratchpad-scrolling.md +20 -0
- package/scratchpads/scratchpad-toc-disappearance.md +23 -0
- package/scratchpads/scratchpad-toc.md +28 -0
- package/scratchpads/test_gardener.md +7 -0
- package/src/EnlightenedTriad.ts +146 -0
- package/src/JIT_Triad.ts +137 -0
- package/src/cli.ts +364 -0
- package/src/config/constants.ts +7 -0
- package/src/config/defaults.ts +99 -0
- package/src/core/BentoNormalizer.ts +113 -0
- package/src/core/EdgeWeaver.ts +145 -0
- package/src/core/FractureLogic.ts +22 -0
- package/src/core/Harvester.ts +73 -0
- package/src/core/LLMClient.ts +93 -0
- package/src/core/LouvainGate.ts +67 -0
- package/src/core/MarkdownMasker.ts +49 -0
- package/src/core/README.md +11 -0
- package/src/core/SemanticMatcher.ts +89 -0
- package/src/core/SemanticWeaver.ts +96 -0
- package/src/core/TagEngine.ts +56 -0
- package/src/core/TimelineWeaver.ts +61 -0
- package/src/core/VectorEngine.ts +232 -0
- package/src/daemon/index.ts +225 -0
- package/src/data/experience/test_doc_1.md +2 -0
- package/src/data/experience/test_doc_2.md +2 -0
- package/src/db/schema.ts +46 -0
- package/src/demo-triad.ts +45 -0
- package/src/gardeners/AutoTagger.ts +116 -0
- package/src/gardeners/BaseGardener.ts +55 -0
- package/src/llm/EnlightenedProvider.ts +95 -0
- package/src/mcp/README.md +6 -0
- package/src/mcp/index.ts +341 -0
- package/src/pipeline/AmalfaIngestor.ts +272 -0
- package/src/pipeline/HarvesterPipeline.ts +101 -0
- package/src/pipeline/Ingestor.ts +555 -0
- package/src/pipeline/PreFlightAnalyzer.ts +434 -0
- package/src/pipeline/README.md +7 -0
- package/src/pipeline/SemanticHarvester.ts +222 -0
- package/src/resonance/DatabaseFactory.ts +100 -0
- package/src/resonance/README.md +148 -0
- package/src/resonance/cli/README.md +7 -0
- package/src/resonance/cli/ingest.ts +41 -0
- package/src/resonance/cli/migrate.ts +54 -0
- package/src/resonance/config.ts +40 -0
- package/src/resonance/daemon.ts +236 -0
- package/src/resonance/db.ts +424 -0
- package/src/resonance/pipeline/README.md +7 -0
- package/src/resonance/pipeline/extract.ts +89 -0
- package/src/resonance/pipeline/transform_docs.ts +60 -0
- package/src/resonance/schema.ts +156 -0
- package/src/resonance/services/embedder.ts +131 -0
- package/src/resonance/services/simpleTokenizer.ts +119 -0
- package/src/resonance/services/stats.ts +327 -0
- package/src/resonance/services/tokenizer.ts +159 -0
- package/src/resonance/transform/cda.ts +393 -0
- package/src/resonance/types/enriched-cda.ts +112 -0
- package/src/services/README.md +56 -0
- package/src/services/llama.ts +59 -0
- package/src/services/llamauv.ts +56 -0
- package/src/services/olmo3.ts +58 -0
- package/src/services/phi.ts +52 -0
- package/src/types/artifact.ts +12 -0
- package/src/utils/EnvironmentVerifier.ts +67 -0
- package/src/utils/Logger.ts +21 -0
- package/src/utils/ServiceLifecycle.ts +207 -0
- package/src/utils/ZombieDefense.ts +244 -0
- package/src/utils/validator.ts +264 -0
- package/substack/substack-playbook-1.md +95 -0
- package/substack/substack-playbook-2.md +78 -0
- package/tasks/ui-investigation.md +26 -0
- package/test-db +0 -0
- package/test-db-shm +0 -0
- package/test-db-wal +0 -0
- package/tests/canary/verify_pinch_check.ts +44 -0
- package/tests/fixtures/ingest_test.md +12 -0
- package/tests/fixtures/ingest_test_boxed.md +13 -0
- package/tests/fixtures/safety_test.md +45 -0
- package/tests/fixtures/safety_test_boxed.md +49 -0
- package/tests/fixtures/tagged_output.md +49 -0
- package/tests/fixtures/tagged_test.md +49 -0
- package/tests/mcp-server-settings.json +8 -0
- package/tsconfig.json +46 -0
- package/verify-embedder.ts +54 -0

package/src/resonance/services/stats.ts (new file, @@ -0,0 +1,327 @@):

```ts
/**
 * Ingestion Pipeline Observability
 *
 * Tracks metrics at each pipeline stage and verifies against baseline
 * to detect silent failures (e.g., PERSONA graph losing 3,000 edges).
 */

export interface EdgeCounts {
  [type: string]: number;
  total: number;
}

export interface DomainStats {
  nodes: number;
  edges: EdgeCounts;
  vectors: number;
  semantic_tokens?: number;
}

export interface PipelineMetrics {
  persona: DomainStats;
  experience: DomainStats;
}

export interface Baseline {
  version: string;
  last_updated: string;
  description: string;
  persona: {
    nodes: { concepts: number; directives: number; total: number };
    edges: EdgeCounts;
    vectors: number;
    notes: string;
  };
  experience: {
    nodes: {
      debriefs: number;
      playbooks: number;
      documents: number;
      total: number;
    };
    edges: EdgeCounts;
    vectors: number;
    semantic_tokens: number;
    notes: string;
  };
  tolerance: {
    nodes: number;
    edges: number;
    vectors: number;
    description: string;
  };
}

export interface Mismatch {
  domain: string;
  metric: string;
  expected: number;
  actual: number;
  delta: number;
  variance: number;
}

export class IngestionStats {
  private metrics: PipelineMetrics;
  private baseline: Baseline | null = null;

  constructor() {
    this.metrics = {
      persona: {
        nodes: 0,
        edges: { total: 0 },
        vectors: 0,
      },
      experience: {
        nodes: 0,
        edges: { total: 0 },
        vectors: 0,
        semantic_tokens: 0,
      },
    };
  }

  /**
   * Load baseline from file
   */
  async loadBaseline(path: string): Promise<void> {
    try {
      const file = Bun.file(path);
      this.baseline = await file.json();
      console.log(`📊 Baseline loaded: v${this.baseline?.version}`);
    } catch (_error) {
      console.warn(
        `⚠️ No baseline found at ${path}. Skipping baseline verification.`,
      );
    }
  }

  /**
   * Record node creation
   */
  recordNode(domain: "persona" | "experience"): void {
    this.metrics[domain].nodes++;
  }

  /**
   * Record edge creation
   */
  recordEdge(domain: "persona" | "experience", type: string): void {
    if (!this.metrics[domain].edges[type]) {
      this.metrics[domain].edges[type] = 0;
    }
    this.metrics[domain].edges[type]++;
    this.metrics[domain].edges.total++;
  }

  /**
   * Record vector creation
   */
  recordVector(domain: "persona" | "experience"): void {
    this.metrics[domain].vectors++;
  }

  /**
   * Record semantic token extraction
   */
  recordSemanticTokens(domain: "experience"): void {
    if (this.metrics[domain].semantic_tokens !== undefined) {
      this.metrics[domain].semantic_tokens++;
    }
  }

  /**
   * Get current metrics
   */
  getMetrics(): PipelineMetrics {
    return this.metrics;
  }

  /**
   * Verify against baseline
   */
  verifyAgainstBaseline(): Mismatch[] {
    if (!this.baseline) {
      console.warn("⚠️ No baseline loaded. Skipping verification.");
      return [];
    }

    const mismatches: Mismatch[] = [];
    const tolerance = this.baseline.tolerance;

    // Verify PERSONA domain
    this.checkMetric(
      mismatches,
      "persona",
      "nodes",
      this.baseline.persona.nodes.total,
      this.metrics.persona.nodes,
      tolerance.nodes,
    );

    this.checkMetric(
      mismatches,
      "persona",
      "edges.total",
      this.baseline.persona.edges.total,
      this.metrics.persona.edges.total,
      tolerance.edges,
    );

    this.checkMetric(
      mismatches,
      "persona",
      "vectors",
      this.baseline.persona.vectors,
      this.metrics.persona.vectors,
      tolerance.vectors,
    );

    // Verify EXPERIENCE domain
    this.checkMetric(
      mismatches,
      "experience",
      "nodes",
      this.baseline.experience.nodes.total,
      this.metrics.experience.nodes,
      tolerance.nodes,
    );

    this.checkMetric(
      mismatches,
      "experience",
      "edges.total",
      this.baseline.experience.edges.total,
      this.metrics.experience.edges.total,
      tolerance.edges,
    );

    this.checkMetric(
      mismatches,
      "experience",
      "vectors",
      this.baseline.experience.vectors,
      this.metrics.experience.vectors,
      tolerance.vectors,
    );

    return mismatches;
  }

  /**
   * Check a single metric against baseline
   */
  private checkMetric(
    mismatches: Mismatch[],
    domain: string,
    metric: string,
    expected: number,
    actual: number,
    tolerance: number,
  ): void {
    const delta = actual - expected;
    const variance = expected > 0 ? Math.abs(delta) / expected : 0;

    if (variance > tolerance) {
      mismatches.push({
        domain,
        metric,
        expected,
        actual,
        delta,
        variance,
      });
    }
  }

  /**
   * Print summary report
   */
  printSummary(): void {
    console.log("\n📊 Ingestion Summary");
    console.log("═".repeat(60));

    console.log("\n🧠 PERSONA Domain:");
    console.log(`  Nodes: ${this.metrics.persona.nodes}`);
    console.log(`  Edges: ${this.metrics.persona.edges.total}`);
    console.log(`  Vectors: ${this.metrics.persona.vectors}`);

    console.log("\n📚 EXPERIENCE Domain:");
    console.log(`  Nodes: ${this.metrics.experience.nodes}`);
    console.log(`  Edges: ${this.metrics.experience.edges.total}`);
    console.log(`  Vectors: ${this.metrics.experience.vectors}`);
    console.log(`  Tokens: ${this.metrics.experience.semantic_tokens || 0}`);

    console.log(`\n${"═".repeat(60)}`);
  }

  /**
   * Print verification results
   */
  printVerification(mismatches: Mismatch[]): void {
    if (mismatches.length === 0) {
      console.log("\n✅ Baseline Verification: PASSED");
      return;
    }

    console.log("\n❌ Baseline Verification: FAILED");
    console.log("═".repeat(60));
    console.table(
      mismatches.map((m) => ({
        Domain: m.domain,
        Metric: m.metric,
        Expected: m.expected,
        Actual: m.actual,
        Delta: m.delta,
        "Variance %": `${(m.variance * 100).toFixed(1)}%`,
      })),
    );
    console.log("═".repeat(60));
  }

  /**
   * Generate markdown report
   */
  async generateReport(outputPath: string): Promise<void> {
    const now = new Date().toISOString().split("T")[0];
    const mismatches = this.verifyAgainstBaseline();

    let report = `# Ingestion Report: ${now}\n\n`;

    // PERSONA Domain
    report += `## PERSONA Domain\n\n`;
    report += `| Metric | Expected | Actual | Status |\n`;
    report += `|--------|----------|--------|--------|\n`;

    if (this.baseline) {
      report += `| Nodes | ${this.baseline.persona.nodes.total} | ${this.metrics.persona.nodes} | ${this.metrics.persona.nodes === this.baseline.persona.nodes.total ? "✅" : "❌"} |\n`;
      report += `| Edges | ${this.baseline.persona.edges.total} | ${this.metrics.persona.edges.total} | ${this.metrics.persona.edges.total === this.baseline.persona.edges.total ? "✅" : "❌"} |\n`;
      report += `| Vectors | ${this.baseline.persona.vectors} | ${this.metrics.persona.vectors} | ${this.metrics.persona.vectors === this.baseline.persona.vectors ? "✅" : "❌"} |\n`;
    }

    // EXPERIENCE Domain
    report += `\n## EXPERIENCE Domain\n\n`;
    report += `| Metric | Expected | Actual | Status |\n`;
    report += `|--------|----------|--------|--------|\n`;

    if (this.baseline) {
      report += `| Nodes | ${this.baseline.experience.nodes.total} | ${this.metrics.experience.nodes} | ${this.metrics.experience.nodes === this.baseline.experience.nodes.total ? "✅" : "❌"} |\n`;
      report += `| Edges | ${this.baseline.experience.edges.total} | ${this.metrics.experience.edges.total} | ${this.metrics.experience.edges.total === this.baseline.experience.edges.total ? "✅" : "❌"} |\n`;
      report += `| Vectors | ${this.baseline.experience.vectors} | ${this.metrics.experience.vectors} | ${this.metrics.experience.vectors === this.baseline.experience.vectors ? "✅" : "❌"} |\n`;
    }

    // Mismatches
    if (mismatches.length > 0) {
      report += `\n## ⚠️ Baseline Violations\n\n`;
      report += `| Domain | Metric | Expected | Actual | Delta | Variance |\n`;
      report += `|--------|--------|----------|--------|-------|----------|\n`;

      for (const m of mismatches) {
        report += `| ${m.domain} | ${m.metric} | ${m.expected} | ${m.actual} | ${m.delta > 0 ? "+" : ""}${m.delta} | ${(m.variance * 100).toFixed(1)}% |\n`;
      }
    }

    await Bun.write(outputPath, report);
    console.log(`\n📄 Report saved to: ${outputPath}`);
  }
}
```
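For orientation, here is a minimal usage sketch of the `IngestionStats` class above, run under Bun (the class relies on `Bun.file`/`Bun.write`). The baseline values, the `./baseline.json` and `./ingestion-report.md` paths, and the `"resonates_with"` edge type are hypothetical illustrations, not values taken from the package:

```ts
import { IngestionStats } from "./src/resonance/services/stats";

// Write a hypothetical baseline file, shaped per the Baseline interface above.
await Bun.write(
  "./baseline.json", // hypothetical path
  JSON.stringify({
    version: "1.0.0",
    last_updated: "2024-01-01",
    description: "Hypothetical baseline for illustration",
    persona: {
      nodes: { concepts: 10, directives: 5, total: 15 },
      edges: { total: 40 },
      vectors: 15,
      notes: "",
    },
    experience: {
      nodes: { debriefs: 2, playbooks: 1, documents: 3, total: 6 },
      edges: { total: 12 },
      vectors: 6,
      semantic_tokens: 100,
      notes: "",
    },
    tolerance: {
      nodes: 0.05,
      edges: 0.1,
      vectors: 0.05,
      description: "allowed fractional variance per metric",
    },
  }),
);

const stats = new IngestionStats();
await stats.loadBaseline("./baseline.json");

// An ingestor would call the recorders as it writes to the database;
// "resonates_with" is an invented edge type for the example.
stats.recordNode("persona");
stats.recordEdge("persona", "resonates_with");
stats.recordVector("persona");
stats.recordSemanticTokens("experience");

stats.printSummary();
const mismatches = stats.verifyAgainstBaseline();
stats.printVerification(mismatches); // here: FAILED, counts are far below baseline
await stats.generateReport("./ingestion-report.md"); // hypothetical output path
```

Note that the tolerances are fractional: `checkMetric` flags a metric only when `|actual - expected| / expected` exceeds the configured tolerance for that metric class.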
package/src/resonance/services/tokenizer.ts (new file, @@ -0,0 +1,159 @@):

```ts
import nlp from "compromise";

export interface SemanticTags {
  people: string[];
  places: string[];
  organizations: string[];
  topics: string[];
  dates?: string[];
  money: string[];
  protocols?: string[];
  concepts?: string[];
}

export class TokenizerService {
  private static instance: TokenizerService;
  // Map of normalized (lowercased) term -> tag
  private vocabulary: Map<string, string> = new Map();
  // Vocabulary keys sorted by length (desc) for greedy longest-first matching
  private searchKeys: string[] = [];

  // Compromise is kept for generic NER ('people', 'places') for now;
  // it could be dropped if the custom scanner ever covers everything.

  private constructor() {}

  public static getInstance(): TokenizerService {
    if (!TokenizerService.instance) {
      TokenizerService.instance = new TokenizerService();
    }
    return TokenizerService.instance;
  }

  /**
   * Extracts semantic entities.
   * 1. Uses Compromise for generic Named Entity Recognition (NER).
   * 2. Uses a custom "Zero Magic" brute-force scanner for domain vocabulary.
   */
  public extract(text: string): SemanticTags {
    const doc = nlp(text);

    // 1. Generic NLP (kept for now as fallback/enrichment)
    const result: SemanticTags = {
      people: doc.people().out("array"),
      places: doc.places().out("array"),
      organizations: doc.organizations().out("array"),
      topics: doc.topics().out("array"),
      money: [],
      protocols: [],
      concepts: [],
    };

    // 2. Zero Magic domain scan (brute force). A naive loop is fine for the
    // small text blocks (Bento boxes) ingested today; a pre-built combined
    // regex or an Aho-Corasick automaton is the upgrade path if the
    // vocabulary grows.
    const lowerText = text.toLowerCase();

    for (const term of this.searchKeys) {
      // A bare includes() would match "pro" inside "process", so the cheap
      // substring check is confirmed with a word-boundary regex. The regex
      // is only constructed on a hit, to save cycles.
      if (lowerText.includes(term)) {
        const escaped = term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
        const boundaryRegex = new RegExp(`\\b${escaped}\\b`, "i");

        if (boundaryRegex.test(text)) {
          const tag = this.vocabulary.get(term);

          // Return the term with the casing actually used in the document
          // (e.g. "OH-058" rather than the lowercased search key); exec()
          // recovers it. Downstream, EdgeWeaver slugifies keys for lookup
          // anyway.
          const match = boundaryRegex.exec(text);
          const realTerm = match ? match[0] : term;

          if (tag === "Protocol") {
            if (!result.protocols) result.protocols = [];
            if (!result.protocols.includes(realTerm))
              result.protocols.push(realTerm);
          } else if (tag === "Concept") {
            if (!result.concepts) result.concepts = [];
            if (!result.concepts.includes(realTerm))
              result.concepts.push(realTerm);
          } else if (tag === "Organization") {
            if (!result.organizations.includes(realTerm))
              result.organizations.push(realTerm);
          } else {
            // Default to Concept if the tag is unknown or not explicitly handled
            if (!result.concepts) result.concepts = [];
            if (!result.concepts.includes(realTerm))
              result.concepts.push(realTerm);
          }
        }
      }
    }

    return result;
  }

  public loadLexicon(
    lexicon: { id: string; title: string; type?: string; category?: string }[],
  ) {
    // Reset
    this.vocabulary.clear();

    for (const item of lexicon) {
      let tag = "Concept";
      if (item.type === "operational-heuristic") tag = "Protocol";
      if (item.category === "Tool") tag = "Organization";

      // Add title
      if (item.title) {
        this.vocabulary.set(item.title.toLowerCase(), tag);
      }
      // Add ID, plus a space-separated variant for hyphenated IDs
      if (item.id) {
        this.vocabulary.set(item.id.toLowerCase(), tag);
        if (item.id.includes("-")) {
          this.vocabulary.set(item.id.toLowerCase().replace(/-/g, " "), tag);
        }
      }
    }

    // Sort keys by length desc to ensure "Web Standards" matches before "Web"
    this.searchKeys = Array.from(this.vocabulary.keys()).sort(
      (a, b) => b.length - a.length,
    );

    console.log(
      `🧠 ZeroMagic Tokenizer learned ${this.vocabulary.size} terms from lexicon.`,
    );
  }

  // Deprecated / no-op
  public extend(
    _customWords: Record<string, string>,
    _customPatterns: Record<string, string>,
  ) {
    // No-op for the brute-force scanner
  }
}
```
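And a corresponding sketch for `TokenizerService`. The lexicon entries and the sample sentence are invented for illustration ("OH-058" mirrors the ID format the file's own comments mention); only `type`/`category` drive the tag assignment:

```ts
import { TokenizerService } from "./src/resonance/services/tokenizer";

const tokenizer = TokenizerService.getInstance();

// Invented lexicon entries: "operational-heuristic" maps to Protocol,
// category "Tool" maps to Organization, everything else to Concept.
tokenizer.loadLexicon([
  { id: "OH-058", title: "Opinion Proceed Pattern", type: "operational-heuristic" },
  { id: "bun-sqlite", title: "Bun SQLite", category: "Tool" },
  { id: "bicameral-graph", title: "Bicameral Graph" },
]);

const tags = tokenizer.extract(
  "Per OH-058, the bicameral graph work should lean on Bun SQLite.",
);

// The scanner reports terms with the casing found in the text:
// tags.protocols     -> ["OH-058"]
// tags.organizations includes "Bun SQLite"
// tags.concepts      includes "bicameral graph" (via the hyphen variant)
```

The longest-first sort of `searchKeys` only controls match order; it is the word-boundary confirmation that stops a short key from firing inside a longer word.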