amalfa 1.0.0 → 1.0.2
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/README.md +226 -247
- package/amalfa.config.example.ts +8 -6
- package/docs/AGENT-METADATA-PATTERNS.md +1021 -0
- package/docs/CONFIG_E2E_VALIDATION.md +147 -0
- package/docs/CONFIG_UNIFICATION.md +187 -0
- package/docs/CONFIG_VALIDATION.md +103 -0
- package/docs/LEGACY_DEPRECATION.md +174 -0
- package/docs/MCP_SETUP.md +317 -0
- package/docs/QUICK_START_MCP.md +168 -0
- package/docs/SESSION-2026-01-06-METADATA-PATTERNS.md +346 -0
- package/docs/SETUP.md +464 -0
- package/docs/SETUP_COMPLETE.md +464 -0
- package/docs/VISION-AGENT-LEARNING.md +1242 -0
- package/docs/_current-config-status.md +93 -0
- package/package.json +6 -3
- package/polyvis.settings.json.bak +38 -0
- package/src/cli.ts +159 -31
- package/src/config/defaults.ts +73 -15
- package/src/core/VectorEngine.ts +18 -9
- package/src/daemon/index.ts +12 -8
- package/src/mcp/index.ts +62 -7
- package/src/pipeline/AmalfaIngestor.ts +22 -12
- package/src/pipeline/PreFlightAnalyzer.ts +434 -0
- package/src/resonance/DatabaseFactory.ts +3 -4
- package/src/resonance/db.ts +8 -6
- package/src/resonance/schema.ts +19 -1
- package/src/resonance/services/vector-daemon.ts +151 -0
- package/src/utils/DaemonManager.ts +147 -0
- package/src/utils/ZombieDefense.ts +5 -1
- package/:memory: +0 -0
- package/:memory:-shm +0 -0
- package/:memory:-wal +0 -0
- package/README.old.md +0 -112
- package/agents.config.json +0 -11
- package/drizzle/0000_minor_iron_fist.sql +0 -19
- package/drizzle/meta/0000_snapshot.json +0 -139
- package/drizzle/meta/_journal.json +0 -13
- package/example_usage.ts +0 -39
- package/experiment.sh +0 -35
- package/hello +0 -2
- package/index.html +0 -52
- package/knowledge/excalibur.md +0 -12
- package/plans/experience-graph-integration.md +0 -60
- package/prompts/gemini-king-mode-prompt.md +0 -46
- package/public/docs/MCP_TOOLS.md +0 -372
- package/schemas/README.md +0 -20
- package/schemas/cda.schema.json +0 -84
- package/schemas/conceptual-lexicon.schema.json +0 -75
- package/scratchpads/dummy-debrief-boxed.md +0 -39
- package/scratchpads/dummy-debrief.md +0 -27
- package/scratchpads/scratchpad-design.md +0 -50
- package/scratchpads/scratchpad-scrolling.md +0 -20
- package/scratchpads/scratchpad-toc-disappearance.md +0 -23
- package/scratchpads/scratchpad-toc.md +0 -28
- package/scratchpads/test_gardener.md +0 -7
- package/src/core/LLMClient.ts +0 -93
- package/src/core/TagEngine.ts +0 -56
- package/src/db/schema.ts +0 -46
- package/src/gardeners/AutoTagger.ts +0 -116
- package/src/pipeline/HarvesterPipeline.ts +0 -101
- package/src/pipeline/Ingestor.ts +0 -555
- package/src/resonance/cli/ingest.ts +0 -41
- package/src/resonance/cli/migrate.ts +0 -54
- package/src/resonance/config.ts +0 -40
- package/src/resonance/daemon.ts +0 -236
- package/src/resonance/pipeline/extract.ts +0 -89
- package/src/resonance/pipeline/transform_docs.ts +0 -60
- package/src/resonance/services/tokenizer.ts +0 -159
- package/src/resonance/transform/cda.ts +0 -393
- package/src/utils/EnvironmentVerifier.ts +0 -67
- package/substack/substack-playbook-1.md +0 -95
- package/substack/substack-playbook-2.md +0 -78
- package/tasks/ui-investigation.md +0 -26
- package/test-db +0 -0
- package/test-db-shm +0 -0
- package/test-db-wal +0 -0
- package/tests/canary/verify_pinch_check.ts +0 -44
- package/tests/fixtures/ingest_test.md +0 -12
- package/tests/fixtures/ingest_test_boxed.md +0 -13
- package/tests/fixtures/safety_test.md +0 -45
- package/tests/fixtures/safety_test_boxed.md +0 -49
- package/tests/fixtures/tagged_output.md +0 -49
- package/tests/fixtures/tagged_test.md +0 -49
- package/tests/mcp-server-settings.json +0 -8
- package/verify-embedder.ts +0 -54
package/src/resonance/daemon.ts
DELETED
@@ -1,236 +0,0 @@
-import { watch } from "node:fs";
-import { join } from "node:path";
-import settings from "@/polyvis.settings.json";
-import { Ingestor } from "../pipeline/Ingestor";
-import { EnvironmentVerifier } from "../utils/EnvironmentVerifier";
-import { getLogger } from "../utils/Logger";
-import { ServiceLifecycle } from "../utils/ServiceLifecycle";
-import { Embedder } from "./services/embedder";
-
-const args = process.argv.slice(2);
-const command = args[0] || "serve";
-const log = getLogger("Daemon");
-
-// --- Helper: Notifications ---
-
-async function notify(title: string, message: string) {
-  // Native macOS notifications via AppleScript
-  // Zero dependencies
-  try {
-    const script = `display notification "${message}" with title "${title}"`;
-    await Bun.spawn(["osascript", "-e", script]);
-  } catch (e) {
-    log.error({ err: e }, "Failed to send notification");
-  }
-}
-
-// --- Service Lifecycle ---
-
-const lifecycle = new ServiceLifecycle({
-  name: "Daemon",
-  pidFile: ".daemon.pid",
-  logFile: ".daemon.log",
-  entryPoint: "src/resonance/daemon.ts",
-});
-
-// --- Server Logic (The actual Daemon) ---
-
-async function main() {
-  // 0. Verify Environment
-  await EnvironmentVerifier.verifyOrExit();
-
-  // 1. Initialize Ingestion (Daemon Mode: Watch Enabled)
-  const PORT = parseInt(process.env.VECTOR_PORT || "3010", 10);
-
-  log.info({ port: PORT }, "🔌 Vector Daemon starting...");
-  log.info("Initializing Embedder...");
-
-  // 1. Initialize Embedder (Compute Node)
-  try {
-    const embedder = Embedder.getInstance();
-    await embedder.embed("warmup", true);
-    log.info("✅ Embedder Ready.");
-  } catch (e) {
-    log.fatal({ err: e }, "❌ Failed to initialize embedder");
-    process.exit(1);
-  }
-
-  // 2. Start HTTP Server
-  Bun.serve({
-    port: PORT,
-    async fetch(req) {
-      const url = new URL(req.url);
-
-      if (req.method === "GET" && url.pathname === "/health") {
-        return new Response(JSON.stringify({ status: "ok" }), {
-          headers: { "Content-Type": "application/json" },
-        });
-      }
-
-      if (req.method === "POST" && url.pathname === "/embed") {
-        try {
-          const body = (await req.json()) as { text: string };
-          if (!body.text || typeof body.text !== "string") {
-            return new Response("Bad Request: 'text' field required", {
-              status: 400,
-            });
-          }
-
-          const vector = await Embedder.getInstance().embed(body.text, true);
-
-          return new Response(JSON.stringify({ vector: Array.from(vector) }), {
-            headers: { "Content-Type": "application/json" },
-          });
-        } catch (e) {
-          log.error({ err: e }, "Embedder API Error");
-          return new Response("Internal Server Error", { status: 500 });
-        }
-      }
-
-      return new Response("Not Found", { status: 404 });
-    },
-  });
-
-  log.info(`🚀 Vector Daemon listening on http://localhost:${PORT}`);
-
-  // 3. Start The Watcher (Active Custodian)
-  startWatcher();
-
-  // Handle cleanup
-  process.on("SIGTERM", () => {
-    log.info("🛑 Received SIGTERM, shutting down...");
-    process.exit(0);
-  });
-}
-
-// --- Watcher Logic ---
-
-let debounceTimer: ReturnType<typeof setTimeout> | null = null;
-const DEBOUNCE_MS = 2000;
-const pendingFiles = new Set<string>();
-
-// Retry queue: Track failed ingestions with attempt counts
-const retryQueue = new Map<
-  string,
-  { attempts: number; lastError: string; lastAttempt: number }
->();
-const MAX_RETRIES = 3;
-const RETRY_BACKOFF_MS = 5000; // Wait 5 seconds before retry
-
-function startWatcher() {
-  // Dynamically load watch targets from settings
-  const rawSources = settings.paths.sources.experience;
-  const dirsToWatch = rawSources.map((s) => s.path);
-
-  log.info({ triggers: dirsToWatch }, "👀 Watching directories");
-
-  dirsToWatch.forEach((dir) => {
-    const path = join(process.cwd(), dir);
-    try {
-      watch(path, { recursive: true }, (event, filename) => {
-        // Ignore dotfiles and ensure markdown
-        if (filename && !filename.startsWith(".") && filename.endsWith(".md")) {
-          log.debug(
-            { file: `${dir}/${filename}`, event },
-            "📝 Change detected",
-          );
-
-          // Add full path to pending set
-          const fullPath = join(process.cwd(), dir, filename);
-          pendingFiles.add(fullPath);
-
-          triggerIngestion();
-        }
-      });
-    } catch (e) {
-      log.warn({ dir, err: e }, "⚠️ Could not watch directory");
-    }
-  });
-}
-
-function triggerIngestion() {
-  if (debounceTimer) {
-    clearTimeout(debounceTimer);
-  }
-
-  debounceTimer = setTimeout(async () => {
-    const batchSize = pendingFiles.size;
-    if (batchSize === 0) return;
-
-    log.info({ batchSize }, "🔄 Debounce settle. Starting Batch Ingestion...");
-
-    // Drain the set
-    const batch = Array.from(pendingFiles);
-    pendingFiles.clear();
-
-    try {
-      // Re-instantiate DB/Ingestor for fresh context
-      const ingestor = new Ingestor();
-
-      // OPTIMIZATION: Pass only the changed files
-      await ingestor.run({ files: batch });
-
-      log.info("✅ Batch Ingestion Complete.");
-      // Clear retry counts for successful files
-      for (const file of batch) {
-        retryQueue.delete(file);
-      }
-      await notify("PolyVis Resonance", `Graph Updated (${batchSize} files).`);
-    } catch (e) {
-      const errorMsg = e instanceof Error ? e.message : String(e);
-      log.error({ err: e }, "❌ Ingestion Failed");
-
-      // Re-queue failed files with retry logic
-      const now = Date.now();
-      for (const file of batch) {
-        const retryInfo = retryQueue.get(file) || {
-          attempts: 0,
-          lastError: "",
-          lastAttempt: 0,
-        };
-
-        if (retryInfo.attempts < MAX_RETRIES) {
-          // Re-queue with exponential backoff
-          const nextAttempt = retryInfo.attempts + 1;
-          retryQueue.set(file, {
-            attempts: nextAttempt,
-            lastError: errorMsg,
-            lastAttempt: now,
-          });
-
-          // Re-add to pending files after backoff delay
-          setTimeout(() => {
-            pendingFiles.add(file);
-            triggerIngestion();
-          }, RETRY_BACKOFF_MS * nextAttempt);
-
-          log.warn(
-            {
-              file,
-              attempt: nextAttempt,
-              max: MAX_RETRIES,
-              delayMs: RETRY_BACKOFF_MS * nextAttempt,
-            },
-            "🔄 Scheduling Retry",
-          );
-        } else {
-          // Abandon after max retries
-          log.error(
-            { file, lastError: retryInfo.lastError },
-            "⛔ ABANDONED: File failed max retries",
-          );
-          retryQueue.delete(file); // Remove from tracking
-        }
-      }
-
-      await notify(
-        "PolyVis Resonance",
-        `Ingestion Failed (${batch.length} files will retry)`,
-      );
-    }
-  }, DEBOUNCE_MS);
-}
-
-// --- Dispatch ---
-
-await lifecycle.run(command, main);
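For reference, the deleted daemon exposed exactly two HTTP routes: GET /health for liveness and POST /embed, which took a JSON body with a required `text` string and returned a JSON `{ vector }` payload. A minimal client sketch against that old API, assuming a daemon running locally on the default VECTOR_PORT of 3010 (per the file list, 1.0.2 adds src/resonance/services/vector-daemon.ts, which appears to replace this file):

```ts
// Client sketch for the removed v1.0.0 daemon API (GET /health, POST /embed).
// Assumes a daemon on localhost at the default VECTOR_PORT of 3010.
const BASE = `http://localhost:${process.env.VECTOR_PORT ?? "3010"}`;

// Liveness probe: the daemon answered { status: "ok" }.
const health = await fetch(`${BASE}/health`);
console.log("health:", await health.json());

// Embedding request: a JSON body with a required 'text' string field.
const res = await fetch(`${BASE}/embed`, {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({ text: "hello world" }),
});
if (!res.ok) {
  // The daemon returned 400 for a missing 'text' field, 500 on embedder errors.
  throw new Error(`embed failed: ${res.status}`);
}
const { vector } = (await res.json()) as { vector: number[] };
console.log(`received a ${vector.length}-dimensional vector`);
```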
package/src/resonance/pipeline/extract.ts
DELETED
@@ -1,89 +0,0 @@
-import { existsSync, mkdirSync } from "node:fs";
-import { join } from "node:path";
-
-console.log("Starting term extraction...");
-
-// --- Path Resolution ---
-import settings from "@/polyvis.settings.json";
-
-// Reading from Resonance DB
-const dbPath = join(process.cwd(), settings.paths.database.resonance);
-const publicDir = join(process.cwd(), "public");
-const outputPath = join(publicDir, "terms.json");
-
-// --- Pre-flight Checks ---
-if (!existsSync(dbPath)) {
-  console.error(`❌ Error: Database not found at ${dbPath}`);
-  console.error("Please run 'bun run scripts/build_db.ts' first.");
-  process.exit(1);
-}
-
-// Ensure the 'public' directory exists before trying to write to it.
-if (!existsSync(publicDir)) {
-  console.log(`Creating 'public' directory at ${publicDir}...`);
-  mkdirSync(publicDir, { recursive: true });
-}
-
-import { DatabaseFactory } from "@src/resonance/DatabaseFactory";
-
-// --- Database Query ---
-// Open the database in read-only mode, using Factory for safe concurrency config
-const db = DatabaseFactory.connect(dbPath, { readonly: true });
-
-// This query identifies the most "valuable" terms by counting how many
-// connections (edges) they have. Terms that are more connected are likely
-// more central to the knowledge graph and will yield richer results.
-const query = `
-  SELECT
-    n.title
-  FROM
-    nodes n
-  JOIN (
-    -- First, for each node ID, count its number of DISTINCT neighbors
-    SELECT
-      id,
-      COUNT(DISTINCT neighbor) as neighbor_count
-    FROM (
-      -- Create a unified list of all connections (id -> neighbor)
-      SELECT source as id, target as neighbor FROM edges
-      UNION
-      SELECT target as id, source as neighbor FROM edges
-    )
-    GROUP BY id
-  ) AS counts ON n.id = counts.id
-  WHERE
-    -- Only include nodes that have 2 or more distinct neighbors
-    counts.neighbor_count >= 2
-  ORDER BY
-    counts.neighbor_count DESC, n.title ASC;
-`;
-
-try {
-  // --- Data Extraction & Transformation ---
-  console.log("Querying database for high-value terms...");
-  const results = db.query(query).all() as { title: string }[];
-
-  // We just want an array of the term labels.
-  const terms = results.map((row) => row.title);
-
-  if (terms.length === 0) {
-    console.warn(
-      "⚠️ Warning: Query returned no terms. The resulting file will be an empty array.",
-    );
-  }
-
-  // --- File Output ---
-  // Write the curated list of terms to a static JSON file.
-  // This file can be easily fetched by the frontend.
-  await Bun.write(outputPath, JSON.stringify(terms, null, 2));
-
-  console.log(
-    `✅ Successfully extracted and wrote ${terms.length} terms to ${outputPath}`,
-  );
-} catch (error) {
-  console.error(`❌ An error occurred: ${error}`);
-} finally {
-  // --- Finalization ---
-  db.close();
-  console.log("Term extraction complete.");
-}
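The heart of that script is the degree-counting SQL: edges are stored as directed (source, target) rows, so the UNION of both orientations yields an undirected adjacency list, and because it is UNION rather than UNION ALL, duplicate pairs collapse and COUNT(DISTINCT neighbor) is a true neighbor count. A self-contained sketch of the same query against toy data (bun:sqlite in-memory; the table and column names mirror the deleted script, everything else is illustrative):

```ts
// Demonstrates the removed degree-count query on a toy in-memory graph.
import { Database } from "bun:sqlite";

const db = new Database(":memory:");
db.run("CREATE TABLE nodes (id TEXT PRIMARY KEY, title TEXT)");
db.run("CREATE TABLE edges (source TEXT, target TEXT)");
db.run("INSERT INTO nodes VALUES ('a','Alpha'),('b','Beta'),('c','Gamma')");
// 'a' has 2 distinct neighbors (b and c); 'b' and 'c' have 1 each.
db.run("INSERT INTO edges VALUES ('a','b'),('a','c')");

const rows = db
  .query(
    `SELECT n.title
     FROM nodes n
     JOIN (
       SELECT id, COUNT(DISTINCT neighbor) AS neighbor_count
       FROM (
         SELECT source AS id, target AS neighbor FROM edges
         UNION
         SELECT target AS id, source AS neighbor FROM edges
       )
       GROUP BY id
     ) AS counts ON n.id = counts.id
     WHERE counts.neighbor_count >= 2
     ORDER BY counts.neighbor_count DESC, n.title ASC`,
  )
  .all() as { title: string }[];

console.log(rows.map((r) => r.title)); // => [ "Alpha" ]
```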
package/src/resonance/pipeline/transform_docs.ts
DELETED
@@ -1,60 +0,0 @@
-import { basename, join } from "node:path";
-import type { IngestionArtifact } from "@src/types/artifact";
-import { Glob } from "bun";
-import settings from "@/polyvis.settings.json";
-
-const artifacts: IngestionArtifact[] = [];
-const root = process.cwd();
-
-// --- Helpers ---
-function extractTitle(content: string, filename: string): string {
-  const match = content.match(/^#\s+(.+)$/m);
-  return match?.[1] ? match[1].trim() : filename;
-}
-
-// --- Transformation Loop ---
-let orderCounter = 0;
-
-for (const source of settings.paths.sources.experience) {
-  const sourceDirRelative = source.path;
-  const sourceDir = join(root, sourceDirRelative);
-  console.log(`Scanning ${sourceDir}...`);
-
-  const glob = new Glob("*.md");
-  // Sort logic is implicit in file system usually, but better to be explicit
-  // Array.from(glob.scanSync) gives unsorted?
-  const files = Array.from(glob.scanSync(sourceDir)).sort();
-
-  for (const file of files) {
-    const fullPath = join(sourceDir, file);
-    const content = await Bun.file(fullPath).text();
-    const id = basename(file, ".md");
-    const type: "playbook" | "debrief" = sourceDirRelative.includes("playbooks")
-      ? "playbook"
-      : "debrief";
-
-    artifacts.push({
-      id,
-      type,
-      order_index: orderCounter++,
-      payload: {
-        title: extractTitle(content, id),
-        content: content,
-        domain: "knowledge",
-        layer: "experience",
-        metadata: { path: fullPath },
-      },
-    });
-  }
-}
-
-// --- Output ---
-const outDir = join(root, ".resonance", "artifacts");
-if (!require("node:fs").existsSync(outDir)) {
-  require("node:fs").mkdirSync(outDir, { recursive: true });
-}
-
-const outFile = join(outDir, "docs.json");
-await Bun.write(outFile, JSON.stringify(artifacts, null, 2));
-
-console.log(`✅ Transformed ${artifacts.length} docs to ${outFile}`);
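Each loop iteration pushes one IngestionArtifact, so the emitted docs.json was an array of records shaped like the following. Only the field layout comes from the artifacts.push() call above; the concrete values here are hypothetical:

```ts
// Hypothetical entry in .resonance/artifacts/docs.json (layout per the
// deleted script; values invented for illustration).
const exampleArtifact = {
  id: "my-debrief", // basename of the .md file, extension stripped
  type: "debrief", // "playbook" only if the source path contains "playbooks"
  order_index: 0, // global counter across all sources, in sorted file order
  payload: {
    title: "My Debrief", // first "# " heading, falling back to the id
    content: "# My Debrief\n...", // full markdown body
    domain: "knowledge",
    layer: "experience",
    metadata: { path: "/abs/path/to/debriefs/my-debrief.md" },
  },
};
```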
package/src/resonance/services/tokenizer.ts
DELETED
@@ -1,159 +0,0 @@
-import nlp from "compromise";
-
-export interface SemanticTags {
-  people: string[];
-  places: string[];
-  organizations: string[];
-  topics: string[];
-  dates?: string[];
-  money: string[];
-  protocols?: string[];
-  concepts?: string[];
-}
-
-export class TokenizerService {
-  private static instance: TokenizerService;
-  // Map of normalized_term -> tag inside class state
-  private vocabulary: Map<string, string> = new Map();
-  // Cache keys sorted by length (desc) for greedy matching
-  private searchKeys: string[] = [];
-
-  // Compromise instance (optional, keeping for 'people', 'places' currently)
-  // Could eventually remove if we go 100% custom.
-
-  private constructor() {}
-
-  public static getInstance(): TokenizerService {
-    if (!TokenizerService.instance) {
-      TokenizerService.instance = new TokenizerService();
-    }
-    return TokenizerService.instance;
-  }
-
-  /**
-   * Extracts semantic entities.
-   * 1. Uses Compromise for generic Named Entity Recognition (NER)
-   * 2. Uses Custom "Zero Magic" Brute Force Scanner for Domain Vocabulary
-   */
-  public extract(text: string): SemanticTags {
-    const doc = nlp(text);
-
-    // 1. Generic NLP (Keep for now as fallback/enrichment)
-    const result: SemanticTags = {
-      people: doc.people().out("array"),
-      places: doc.places().out("array"),
-      organizations: doc.organizations().out("array"),
-      topics: doc.topics().out("array"),
-      money: [],
-      protocols: [],
-      concepts: [],
-    };
-
-    // 2. Zero Magic Domain Scan (Brute Force)
-    // Optimization: Check text.includes() only if vocabulary is small?
-    // But for regex construction or Aho-Corasick, naive loop is fine for now on small text blocks (Bento boxes).
-
-    const lowerText = text.toLowerCase();
-
-    for (const term of this.searchKeys) {
-      // Simple subset check.
-      // Limitation: Matches "pro" in "process". Needs word boundary check.
-      // RegExp construction is costly inside loop?
-      // Better: Pre-build a massive Regex?
-      // Or simpler: \bSTR\b with indexof?
-
-      // Fast "includes" check first
-      if (lowerText.includes(term)) {
-        // Confirm Word Boundary to avoid partial matches
-        // Regex is expensive, but safer for accuracy.
-        // We construct regex only on 'hit' to save cycles?
-        const escaped = term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
-        const boundaryRegex = new RegExp(`\\b${escaped}\\b`, "i");
-
-        if (boundaryRegex.test(text)) {
-          const tag = this.vocabulary.get(term);
-          // Retrieve Canonical Form (Original Case) if needed?
-          // For now, we normalize to the lowercase SEARCH key, but we might prefer the original.
-          // Given the user wants "OH-058" format, let's try to map back if possible.
-          // But we only stored 'tag' in the map.
-          // Let's store Normalized -> Original ID in a separate map if required?
-          // User Request: "no need to manage case" -> likely means "return standard ID".
-
-          // Since we are returning the matched string (which is usually the ID in lowercase in our loop),
-          // If we want UPPERCASE OH-058, we need to know it.
-          // Simplest fix: Just return the term as found (lowercase) and EdgeWeaver will handle lookup.
-          // EdgeWeaver expects slugified keys anyway.
-
-          // Actually, let's return the TERM as it appears in the text?
-          // boundaryRegex.match(text) would give us the real casing used in the doc (e.g. "OH-058").
-
-          const match = boundaryRegex.exec(text);
-          const realTerm = match ? match[0] : term;
-
-          if (tag === "Protocol") {
-            if (!result.protocols) result.protocols = [];
-            if (!result.protocols.includes(realTerm))
-              result.protocols.push(realTerm);
-          } else if (tag === "Concept") {
-            if (!result.concepts) result.concepts = [];
-            if (!result.concepts.includes(realTerm))
-              result.concepts.push(realTerm);
-          } else if (tag === "Organization") {
-            if (!result.organizations.includes(realTerm))
-              result.organizations.push(realTerm);
-          } else {
-            // Default to Concept if tag is unknown or not explicitly handled
-            if (!result.concepts) result.concepts = [];
-            if (!result.concepts.includes(realTerm))
-              result.concepts.push(realTerm);
-          }
-        }
-      }
-    }
-
-    return result;
-  }
-
-  public loadLexicon(
-    lexicon: { id: string; title: string; type?: string; category?: string }[],
-  ) {
-    // Reset
-    this.vocabulary.clear();
-
-    for (const item of lexicon) {
-      let tag = "Concept";
-      if (item.type === "operational-heuristic") tag = "Protocol";
-      if (item.category === "Tool") tag = "Organization";
-
-      // Add Title
-      if (item.title) {
-        this.vocabulary.set(item.title.toLowerCase(), tag);
-      }
-      // Add ID
-      if (item.id) {
-        this.vocabulary.set(item.id.toLowerCase(), tag);
-        // Handle Hyphen Variants
-        if (item.id.includes("-")) {
-          this.vocabulary.set(item.id.toLowerCase().replace(/-/g, " "), tag);
-        }
-      }
-    }
-
-    // Sort keys by length desc to ensure "Web Standards" matches before "Web"
-    this.searchKeys = Array.from(this.vocabulary.keys()).sort(
-      (a, b) => b.length - a.length,
-    );
-
-    console.log(
-      `🧠 ZeroMagic Tokenizer learned ${this.vocabulary.size} terms from lexicon.`,
-    );
-  }
-
-  // Deprecated / No-Op
-  public extend(
-    _customWords: Record<string, string>,
-    _customPatterns: Record<string, string>,
-  ) {
-    // No-op for brute force scanner
-  }
-}
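To make the two-phase extraction concrete (Compromise NER first, then the brute-force lexicon scan with word-boundary confirmation), here is a usage sketch of the removed service. The API calls match the class above; the lexicon entries and input text are hypothetical:

```ts
// Usage sketch for the removed TokenizerService (v1.0.0 API as shown above).
// The lexicon entries below are invented for illustration.
const tokenizer = TokenizerService.getInstance();

tokenizer.loadLexicon([
  { id: "OH-058", title: "Pinch Check", type: "operational-heuristic" },
  { id: "bun", title: "Bun", category: "Tool" },
]);
// The vocabulary now maps lowercase titles, ids, and hyphen variants
// ("oh 058") to tags: Protocol for operational heuristics, Organization for tools.

const tags = tokenizer.extract("Ran the Pinch Check (OH-058) under Bun.");
// tags.protocols -> ["Pinch Check", "OH-058"], with the original casing
// recovered by boundaryRegex.exec() against the source text.
// tags.organizations includes "Bun" via the Tool -> Organization mapping.
```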