@meshxdata/fops 0.1.31 → 0.1.34
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +372 -0
- package/package.json +1 -1
- package/src/commands/lifecycle.js +16 -0
- package/src/electron/icon.png +0 -0
- package/src/electron/main.js +24 -0
- package/src/plugins/bundled/fops-plugin-embeddings/index.js +9 -0
- package/src/plugins/bundled/fops-plugin-embeddings/lib/indexer.js +1 -1
- package/src/plugins/bundled/fops-plugin-file/demo/landscape.yaml +67 -0
- package/src/plugins/bundled/fops-plugin-file/demo/orders_bad.csv +6 -0
- package/src/plugins/bundled/fops-plugin-file/demo/orders_good.csv +7 -0
- package/src/plugins/bundled/fops-plugin-file/demo/orders_reference.csv +6 -0
- package/src/plugins/bundled/fops-plugin-file/demo/orders_renamed.aligned.csv +6 -0
- package/src/plugins/bundled/fops-plugin-file/demo/orders_renamed.csv +6 -0
- package/src/plugins/bundled/fops-plugin-file/demo/rules.json +8 -0
- package/src/plugins/bundled/fops-plugin-file/demo/run.sh +110 -0
- package/src/plugins/bundled/fops-plugin-file/index.js +140 -24
- package/src/plugins/bundled/fops-plugin-file/lib/embed-index.js +7 -0
- package/src/plugins/bundled/fops-plugin-file/lib/match.js +11 -4
- package/src/plugins/bundled/fops-plugin-foundation/index.js +1715 -2
- package/src/plugins/bundled/fops-plugin-foundation/lib/align.js +183 -0
- package/src/plugins/bundled/fops-plugin-foundation/lib/apply.js +83 -41
- package/src/plugins/bundled/fops-plugin-foundation/lib/client.js +40 -4
- package/src/plugins/bundled/fops-plugin-foundation/lib/stack-apply.js +4 -1
- package/src/plugins/bundled/fops-plugin-foundation/lib/tools-write.js +46 -0
- package/src/plugins/bundled/fops-plugin-foundation-graphql/index.js +39 -1
- package/src/plugins/bundled/fops-plugin-foundation-graphql/lib/graphql/resolvers/data-object.js +9 -6
- package/src/plugins/bundled/fops-plugin-foundation-graphql/lib/graphql/resolvers/data-product.js +9 -6
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# fops embed file — demo script
|
|
3
|
+
#
|
|
4
|
+
# Showcases the full pipeline:
|
|
5
|
+
# Steps 1-5: pure schema / type comparison (no index needed)
|
|
6
|
+
# Step 6: MiniLM + RRF smart matching (requires Foundation + `fops embed file index`)
|
|
7
|
+
# Step 7: Fix CSV — rename columns to match reference (fops foundation align)
|
|
8
|
+
# In interactive mode, step 6 prompts to fix CSV or update the Data Product automatically.
|
|
9
|
+
#
|
|
10
|
+
# Files:
|
|
11
|
+
# orders_reference.csv — expected schema (7 cols: order_id, customer_id, amount, currency, order_date, status, region)
|
|
12
|
+
# orders_good.csv — clean file, matches reference exactly → PASS
|
|
13
|
+
# orders_bad.csv — renamed column (total_amount), missing status → FAIL
|
|
14
|
+
# orders_renamed.csv — same data, all column names abbreviated (id, cust_id, price...) → no Jaccard match → MiniLM matches
|
|
15
|
+
|
|
16
|
+
set -e
|
|
17
|
+
DEMO_DIR="$(cd "$(dirname "$0")" && pwd)"
|
|
18
|
+
|
|
19
|
+
# Print a banner for one demo step: a blank line, a horizontal rule,
# the step title (first argument, prefixed by one space), and a closing rule.
section() {
  local rule="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  printf '\n%s\n %s\n%s\n' "$rule" "$1" "$rule"
}
|
|
25
|
+
|
|
26
|
+
# ── 1. Preview: inspect schema + inferred types ───────────────────────────────
|
|
27
|
+
section "1 · Preview — inspect schema + inferred types"
|
|
28
|
+
fops embed file preview "$DEMO_DIR/orders_good.csv"
|
|
29
|
+
|
|
30
|
+
# ── 2. Validate: clean file against reference → PASS ─────────────────────────
|
|
31
|
+
section "2 · Validate — clean file (expect PASS)"
|
|
32
|
+
fops embed file validate "$DEMO_DIR/orders_good.csv" \
|
|
33
|
+
--reference "$DEMO_DIR/orders_reference.csv"
|
|
34
|
+
|
|
35
|
+
# ── 3. Validate: bad file → FAIL (missing column, renamed column) ─────────────
|
|
36
|
+
section "3 · Validate — bad file (expect FAIL)"
|
|
37
|
+
fops embed file validate "$DEMO_DIR/orders_bad.csv" \
|
|
38
|
+
--reference "$DEMO_DIR/orders_reference.csv" || true
|
|
39
|
+
|
|
40
|
+
# ── 4. Preview with reference diff ────────────────────────────────────────────
|
|
41
|
+
section "4 · Preview with diff — visualise column drift vs reference"
|
|
42
|
+
fops embed file preview "$DEMO_DIR/orders_bad.csv" \
|
|
43
|
+
--reference "$DEMO_DIR/orders_reference.csv"
|
|
44
|
+
|
|
45
|
+
# ── 5. Smart match (Jaccard only, no index) ───────────────────────────────────
|
|
46
|
+
# orders_renamed.csv has fully abbreviated column names — zero Jaccard overlap.
|
|
47
|
+
# --no-semantic forces Jaccard-only mode (skips MiniLM, no ONNX load).
|
|
48
|
+
section "5 · Smart match — Jaccard only (no index): expect no strong match"
|
|
49
|
+
fops embed file "$DEMO_DIR/orders_renamed.csv" --local --no-semantic || true
|
|
50
|
+
|
|
51
|
+
# ── 6. Smart match (MiniLM + RRF, requires index) ────────────────────────────
|
|
52
|
+
# When the SQLite schema index exists (built by `fops embed file index`), the
|
|
53
|
+
# matching pipeline upgrades to:
|
|
54
|
+
# - Signal 1: column-level MiniLM cosine + greedy bipartite matching
|
|
55
|
+
# - Signal 2: Jaccard overlap on exact column names
|
|
56
|
+
# - Signal 3: type compatibility score on matched column pairs
|
|
57
|
+
# - Signal 4: MiniLM cosine on table/entity names
|
|
58
|
+
# - Signal 5: fraction of candidate columns with sim > 0.75
|
|
59
|
+
# All fused via Reciprocal Rank Fusion (RRF k=60).
|
|
60
|
+
#
|
|
61
|
+
# With MiniLM, "id"→"order_id", "price"→"amount", "ccy"→"currency" etc. all
|
|
62
|
+
# match semantically even with zero Jaccard overlap.
|
|
63
|
+
section "6 · Smart match — MiniLM + RRF (requires Foundation + index)"
|
|
64
|
+
|
|
65
|
+
# Path of the SQLite schema index; whether this file exists selects the branch below.
SCHEMA_DB="$HOME/.fops/file/schema-index.db"
if [ ! -f "$SCHEMA_DB" ]; then
  # No index on disk: print the setup instructions instead of running the match.
  # Unquoted here-document so $DEMO_DIR is expanded exactly as in the echo form.
  cat <<EOF
 No schema index found. To enable MiniLM + RRF matching, run:

 # 1. Start Foundation
 fops up

 # 2. Register your data objects (or use the demo landscape)
 fops apply $DEMO_DIR/landscape.yaml

 # 3. Build the SQLite schema index (embeds column names with MiniLM)
 fops embed file index

 # 4. Re-run this step — orders_renamed.csv will now match via
 # semantic similarity (id→order_id, price→amount, ccy→currency)
 fops embed file $DEMO_DIR/orders_renamed.csv

 Matching signals when index is present:
 [1] MiniLM column cosine — greedy bipartite match per column pair
 [2] Jaccard overlap — exact column name set overlap
 [3] Type compatibility — integer/decimal/date match on mapped cols
 [4] Name semantic — MiniLM cosine on table/entity names
 [5] Mapping confidence — fraction of cols with sim > 0.75
 ↳ all fused via Reciprocal Rank Fusion (RRF k=60)
EOF
else
  # Index present: run the smart match without --local / --no-semantic.
  echo " Index found — running full semantic match..."
  fops embed file "$DEMO_DIR/orders_renamed.csv"
fi
|
|
93
|
+
|
|
94
|
+
# ── 7. Fix CSV — rename abbreviated columns to reference names ────────────────
|
|
95
|
+
# After a smart match, fops embed file (interactive) prompts:
|
|
96
|
+
# ▸ Fix CSV — rename columns to match reference (→ orders_renamed.aligned.csv)
|
|
97
|
+
# Fix Data Product — update schema to match this file
|
|
98
|
+
# Nothing — done
|
|
99
|
+
#
|
|
100
|
+
# This step demonstrates the "Fix CSV" path explicitly via fops foundation align,
|
|
101
|
+
# which uses the same MiniLM + Levenshtein column alignment under the hood.
|
|
102
|
+
section "7 · Fix CSV — rename columns to match reference"
|
|
103
|
+
# Uses MiniLM semantic matching to map abbreviated column names to reference schema.
|
|
104
|
+
# The interactive Fix CSV prompt inside `fops embed file` (step 6) runs this automatically.
|
|
105
|
+
# || true: ONNX native handles trigger a SIGKILL on exit — this is expected and harmless.
|
|
106
|
+
fops foundation align "$DEMO_DIR/orders_renamed.csv" \
|
|
107
|
+
"order_id,customer_id,amount,currency,order_date,status,region" \
|
|
108
|
+
--output "$DEMO_DIR/orders_renamed.aligned.csv" || true
|
|
109
|
+
|
|
110
|
+
echo
|
|
@@ -35,6 +35,7 @@ export function register(api) {
|
|
|
35
35
|
.option("--json", "Output matches as JSON (for scripting)")
|
|
36
36
|
.option("--no-auto", "Always show picker, never auto-select")
|
|
37
37
|
.option("--local", "Match against local directory files instead of Foundation landscape")
|
|
38
|
+
.option("--no-semantic", "Jaccard-only matching — skip MiniLM embeddings (useful for testing without the model)")
|
|
38
39
|
.option("--sheet <name>", "Sheet name or index for XLSX files")
|
|
39
40
|
.option("--data-source <id>", "Data source identifier for attempt logging")
|
|
40
41
|
.option("--file-family <name>", "Logical file family name for logging")
|
|
@@ -71,7 +72,7 @@ export function register(api) {
|
|
|
71
72
|
}
|
|
72
73
|
|
|
73
74
|
let embeddingClient = null;
|
|
74
|
-
if (typeof api.getService === "function") {
|
|
75
|
+
if (!opts.noSemantic && typeof api.getService === "function") {
|
|
75
76
|
try { embeddingClient = api.getService("embeddings"); } catch { /* not loaded */ }
|
|
76
77
|
}
|
|
77
78
|
|
|
@@ -338,17 +339,24 @@ export function register(api) {
|
|
|
338
339
|
columnEmbeddings: candResult.columnEmbeddings,
|
|
339
340
|
nameEmbedding: candResult.nameEmbedding,
|
|
340
341
|
};
|
|
341
|
-
const
|
|
342
|
-
.filter((r) => r.score >= threshold)
|
|
343
|
-
|
|
342
|
+
const allRanked = rankMatches(candidateData, references, rankOpts)
|
|
343
|
+
.filter((r) => r.score >= threshold);
|
|
344
|
+
|
|
345
|
+
// In Jaccard-only mode, require at least 2 overlapping column names to avoid false positives.
|
|
346
|
+
// When embeddings are active (v2 or v3), semantic matching works without exact name overlap.
|
|
347
|
+
const ranked = (hasEmbeddings || hasColumnEmbeddings)
|
|
348
|
+
? allRanked
|
|
349
|
+
: allRanked.filter((r) => r.overlapCount >= 2 || r.overlapCount >= r.overlapTotal);
|
|
344
350
|
|
|
345
351
|
spinner.stop();
|
|
346
352
|
|
|
347
353
|
if (ranked.length === 0) {
|
|
348
|
-
console.log(chalk.yellow(" No strong matches found
|
|
354
|
+
console.log(chalk.yellow(" No strong matches found."));
|
|
349
355
|
console.log(chalk.dim(" Use `fops embed file validate <file> --reference <ref>` for explicit validation."));
|
|
350
|
-
if (!hasEmbeddings && foundationAvailable) {
|
|
351
|
-
console.log(chalk.dim(" Tip: run `fops embed index` to enable semantic matching."));
|
|
356
|
+
if (!hasEmbeddings && !hasColumnEmbeddings && foundationAvailable) {
|
|
357
|
+
console.log(chalk.dim(" Tip: run `fops embed index` then `fops embed file index` to enable semantic matching."));
|
|
358
|
+
} else if (hasEmbeddings && !hasColumnEmbeddings && foundationAvailable) {
|
|
359
|
+
console.log(chalk.dim(" Tip: run `fops embed file index` (model cached) for column-level semantic matching."));
|
|
352
360
|
}
|
|
353
361
|
return;
|
|
354
362
|
}
|
|
@@ -469,24 +477,52 @@ export function register(api) {
|
|
|
469
477
|
rows: null,
|
|
470
478
|
};
|
|
471
479
|
|
|
480
|
+
// When semantic column mappings are available, translate candidate column
|
|
481
|
+
// names to their matched reference names so schema/type checks are meaningful.
|
|
482
|
+
let candHeadersForValidation = candResult.headers;
|
|
483
|
+
let candTypesForValidation = candResult.types;
|
|
484
|
+
const colMap = selectedRef.columnMappings;
|
|
485
|
+
if (colMap?.length > 0) {
|
|
486
|
+
const translatedHeaders = [];
|
|
487
|
+
const translatedTypes = {};
|
|
488
|
+
for (const m of colMap) {
|
|
489
|
+
translatedHeaders.push(m.referenceCol);
|
|
490
|
+
if (candResult.types?.[m.candidateCol] !== undefined) {
|
|
491
|
+
translatedTypes[m.referenceCol] = candResult.types[m.candidateCol];
|
|
492
|
+
}
|
|
493
|
+
}
|
|
494
|
+
candHeadersForValidation = translatedHeaders;
|
|
495
|
+
candTypesForValidation = translatedTypes;
|
|
496
|
+
}
|
|
497
|
+
|
|
472
498
|
const schemaResult = compareSchemas(
|
|
473
499
|
{ headers: refResult.headers },
|
|
474
|
-
{ headers:
|
|
500
|
+
{ headers: candHeadersForValidation },
|
|
475
501
|
{ candidateCentric: true },
|
|
476
502
|
);
|
|
477
503
|
|
|
478
504
|
let typeResult = null;
|
|
479
|
-
const commonCols =
|
|
480
|
-
if (commonCols.length > 0 && refResult.types &&
|
|
481
|
-
typeResult = compareTypes(refResult.types,
|
|
505
|
+
const commonCols = candHeadersForValidation.filter((h) => refResult.headers.includes(h));
|
|
506
|
+
if (commonCols.length > 0 && refResult.types && candTypesForValidation) {
|
|
507
|
+
typeResult = compareTypes(refResult.types, candTypesForValidation, commonCols);
|
|
482
508
|
}
|
|
483
509
|
|
|
484
510
|
// Auto-derive value rules from schema metadata (sample up to 10K rows)
|
|
485
511
|
const autoRules = deriveRules(refResult.types, commonCols);
|
|
486
512
|
let valuesResult = null;
|
|
487
513
|
if (autoRules && candResult.rows?.length > 0) {
|
|
514
|
+
// Map row values to reference column names for value checks
|
|
488
515
|
const sampleRows = candResult.rows.length > 10000 ? candResult.rows.slice(0, 10000) : candResult.rows;
|
|
489
|
-
|
|
516
|
+
if (colMap?.length > 0) {
|
|
517
|
+
// Re-key rows from candidate names to reference names
|
|
518
|
+
const keyMap = Object.fromEntries(colMap.map((m) => [m.candidateCol, m.referenceCol]));
|
|
519
|
+
const remappedRows = sampleRows.map((row) =>
|
|
520
|
+
Object.fromEntries(Object.entries(row).map(([k, v]) => [keyMap[k] ?? k, v])),
|
|
521
|
+
);
|
|
522
|
+
valuesResult = checkValues(remappedRows, candHeadersForValidation, autoRules);
|
|
523
|
+
} else {
|
|
524
|
+
valuesResult = checkValues(sampleRows, candResult.headers, autoRules);
|
|
525
|
+
}
|
|
490
526
|
}
|
|
491
527
|
|
|
492
528
|
const report = buildReport(schemaResult, typeResult, valuesResult, null);
|
|
@@ -503,6 +539,82 @@ export function register(api) {
|
|
|
503
539
|
if (report.verdict === "FAIL") {
|
|
504
540
|
process.exitCode = 1;
|
|
505
541
|
}
|
|
542
|
+
|
|
543
|
+
// ── 11. Post-match action prompt ────────────────────────────────
|
|
544
|
+
if (isTTY && !opts.json) {
|
|
545
|
+
const ext = pathMod.extname(candidate).toLowerCase();
|
|
546
|
+
const canFixCsv = ext === ".csv" && colMap?.length > 0;
|
|
547
|
+
const canFixDp = selectedRef.entityType === "data_product" && !!foundationClient;
|
|
548
|
+
|
|
549
|
+
if (canFixCsv || canFixDp) {
|
|
550
|
+
const baseName = pathMod.basename(candidate, ext);
|
|
551
|
+
const alignedPath = pathMod.join(pathMod.dirname(candidate), `${baseName}.aligned${ext}`);
|
|
552
|
+
|
|
553
|
+
const actionOptions = [];
|
|
554
|
+
if (canFixCsv) {
|
|
555
|
+
actionOptions.push({
|
|
556
|
+
label: `Fix CSV — rename columns to match reference (→ ${pathMod.basename(alignedPath)})`,
|
|
557
|
+
value: "fix-csv",
|
|
558
|
+
});
|
|
559
|
+
}
|
|
560
|
+
if (canFixDp) {
|
|
561
|
+
actionOptions.push({
|
|
562
|
+
label: `Fix Data Product — update "${selectedRef.name}" schema to match this file`,
|
|
563
|
+
value: "fix-dp",
|
|
564
|
+
});
|
|
565
|
+
}
|
|
566
|
+
actionOptions.push({ label: "Nothing — done", value: null });
|
|
567
|
+
|
|
568
|
+
let selectActionFn;
|
|
569
|
+
const confirmPath2 = api.getCliPath?.("src", "ui", "confirm.js");
|
|
570
|
+
if (confirmPath2) {
|
|
571
|
+
const { pathToFileURL: p2u } = await import("node:url");
|
|
572
|
+
({ selectOption: selectActionFn } = await import(p2u(confirmPath2).href));
|
|
573
|
+
} else {
|
|
574
|
+
({ selectOption: selectActionFn } = await import("../../../ui/confirm.js"));
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
console.log("");
|
|
578
|
+
const action = await selectActionFn("Next step:", actionOptions);
|
|
579
|
+
|
|
580
|
+
if (action === "fix-csv") {
|
|
581
|
+
const Papa = (await import("papaparse")).default;
|
|
582
|
+
const renameMap = Object.fromEntries(colMap.map((m) => [m.candidateCol, m.referenceCol]));
|
|
583
|
+
const newHeaders = candResult.headers.map((h) => renameMap[h] ?? h);
|
|
584
|
+
const newRows = (candResult.rows || []).map((row) =>
|
|
585
|
+
Object.fromEntries(Object.entries(row).map(([k, v]) => [renameMap[k] ?? k, v])),
|
|
586
|
+
);
|
|
587
|
+
const csvText = Papa.unparse({ fields: newHeaders, data: newRows });
|
|
588
|
+
(await import("node:fs")).writeFileSync(alignedPath, csvText, "utf8");
|
|
589
|
+
console.log(chalk.green(`\n ✓ Written: ${alignedPath}`));
|
|
590
|
+
const renamed = colMap.filter((m) => m.candidateCol !== m.referenceCol);
|
|
591
|
+
if (renamed.length > 0) {
|
|
592
|
+
console.log(chalk.dim(`\n Renamed ${renamed.length} column(s):`));
|
|
593
|
+
for (const m of renamed) {
|
|
594
|
+
console.log(chalk.dim(` ${m.candidateCol.padEnd(22)} → ${m.referenceCol}`));
|
|
595
|
+
}
|
|
596
|
+
}
|
|
597
|
+
} else if (action === "fix-dp") {
|
|
598
|
+
const typeToApiCol = (t) => ({ integer: "INTEGER", decimal: "DECIMAL", date: "DATE", timestamp: "TIMESTAMP", boolean: "BOOLEAN" })[t] || "VARCHAR";
|
|
599
|
+
const fields = candResult.headers.map((h) => {
|
|
600
|
+
const ti = candResult.types?.[h];
|
|
601
|
+
const col = { name: h, data_type: { column_type: typeToApiCol(ti?.type || "string") } };
|
|
602
|
+
if (ti?.type === "decimal" && ti.precision != null) {
|
|
603
|
+
col.data_type.meta = { precision: String(ti.precision), scale: String(ti.scale ?? 0) };
|
|
604
|
+
}
|
|
605
|
+
return col;
|
|
606
|
+
});
|
|
607
|
+
const schemaBody = { details: { data_product_type: "user", fields } };
|
|
608
|
+
try {
|
|
609
|
+
await foundationClient.put(`/data/data_product/schema?identifier=${selectedRef.id}`, schemaBody);
|
|
610
|
+
console.log(chalk.green(`\n ✓ Schema updated for "${selectedRef.name}"`));
|
|
611
|
+
console.log(chalk.dim(` ${fields.length} columns from ${pathMod.basename(candidate)}`));
|
|
612
|
+
} catch (err) {
|
|
613
|
+
console.error(chalk.red(`\n ✗ Schema update failed: ${err.message}`));
|
|
614
|
+
}
|
|
615
|
+
}
|
|
616
|
+
}
|
|
617
|
+
}
|
|
506
618
|
} catch (err) {
|
|
507
619
|
spinner.stop();
|
|
508
620
|
console.error(chalk.red(`\n ✗ ${err.message}\n`));
|
|
@@ -777,21 +889,23 @@ export function register(api) {
|
|
|
777
889
|
let refTypes = null; // { colName: { type } }
|
|
778
890
|
let refLabel = null;
|
|
779
891
|
|
|
780
|
-
|
|
892
|
+
// opts.reference may be consumed by the parent fileCmd (Commander option inheritance)
|
|
893
|
+
const referenceArg = opts.reference || fileCmd.opts().reference;
|
|
894
|
+
if (referenceArg) {
|
|
781
895
|
const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
|
|
782
|
-
if (UUID_RE.test(
|
|
896
|
+
if (UUID_RE.test(referenceArg)) {
|
|
783
897
|
spinner.text = "Fetching reference schema...";
|
|
784
|
-
const refResult = await resolveSchemaFromApi(
|
|
898
|
+
const refResult = await resolveSchemaFromApi(referenceArg, foundationClient, mapApiTypeToInferred);
|
|
785
899
|
refHeaders = new Set(refResult.headers);
|
|
786
900
|
refTypes = refResult.types;
|
|
787
|
-
refLabel = `data_object:${
|
|
901
|
+
refLabel = `data_object:${referenceArg.slice(0, 8)}`;
|
|
788
902
|
} else {
|
|
789
903
|
spinner.text = "Parsing reference...";
|
|
790
|
-
const refInput = await resolveInput(
|
|
904
|
+
const refInput = await resolveInput(referenceArg, storage, fs, pathMod);
|
|
791
905
|
const refResult = await parseFile(refInput.buffer, { format: refInput.format, maxRows: 1 });
|
|
792
906
|
refHeaders = new Set(refResult.headers);
|
|
793
907
|
refTypes = refResult.types;
|
|
794
|
-
refLabel = pathMod.basename(
|
|
908
|
+
refLabel = pathMod.basename(referenceArg);
|
|
795
909
|
}
|
|
796
910
|
}
|
|
797
911
|
|
|
@@ -1003,16 +1117,18 @@ export function register(api) {
|
|
|
1003
1117
|
let refResult = null;
|
|
1004
1118
|
let refLabel = "(none)";
|
|
1005
1119
|
|
|
1006
|
-
|
|
1007
|
-
|
|
1120
|
+
// opts.reference may be consumed by the parent fileCmd (Commander option inheritance)
|
|
1121
|
+
const referenceArg = opts.reference || fileCmd.opts().reference;
|
|
1122
|
+
if (referenceArg) {
|
|
1123
|
+
if (isDataObjectUuid(referenceArg)) {
|
|
1008
1124
|
spinner.text = "Fetching reference schema from Foundation API...";
|
|
1009
|
-
refResult = await resolveSchemaFromApi(
|
|
1010
|
-
refLabel = `data_object:${
|
|
1125
|
+
refResult = await resolveSchemaFromApi(referenceArg, foundationClient, mapApiTypeToInferred);
|
|
1126
|
+
refLabel = `data_object:${referenceArg.slice(0, 8)}`;
|
|
1011
1127
|
} else {
|
|
1012
1128
|
spinner.text = "Parsing reference file...";
|
|
1013
|
-
const refInput = await resolveInput(
|
|
1129
|
+
const refInput = await resolveInput(referenceArg, storage, fs, path);
|
|
1014
1130
|
refResult = await parseFile(refInput.buffer, { format: refInput.format, sheet: opts.sheet });
|
|
1015
|
-
refLabel = path.basename(
|
|
1131
|
+
refLabel = path.basename(referenceArg);
|
|
1016
1132
|
}
|
|
1017
1133
|
}
|
|
1018
1134
|
|
|
@@ -387,7 +387,14 @@ export async function runEmbedIndex(api, opts) {
|
|
|
387
387
|
store.setMeta("embedding_dim", "384");
|
|
388
388
|
} catch {
|
|
389
389
|
// Embeddings failed — index still usable without them
|
|
390
|
+
spinner.stop();
|
|
391
|
+
console.log(WARN(" ⚠ Column embeddings skipped — MiniLM model not ready."));
|
|
392
|
+
console.log(DIM(" Run `fops embed index` first to download the model, then re-run `fops embed file index`."));
|
|
393
|
+
console.log(DIM(" Without embeddings, matching falls back to Jaccard (exact column name overlap only)."));
|
|
390
394
|
}
|
|
395
|
+
} else {
|
|
396
|
+
console.log(WARN(" ⚠ Column embeddings skipped — embeddings plugin not available."));
|
|
397
|
+
console.log(DIM(" Run `fops embed index` first to download the MiniLM model, then re-run `fops embed file index`."));
|
|
391
398
|
}
|
|
392
399
|
|
|
393
400
|
store.setMeta("version", "3");
|
|
@@ -155,13 +155,20 @@ export async function embedSchemaEntries(entries, embeddingClient, opts = {}) {
|
|
|
155
155
|
|
|
156
156
|
/**
 * Normalize a column name for embedding: split on separators and camelCase
 * boundaries, then lowercase every token.
 *
 * For compound names (e.g. order_status), the last token is repeated to boost
 * its weight in the embedding so "order_status" stays close to "status" and
 * doesn't drift toward "order_id" due to a shared prefix.
 *
 * @param {*} name - Raw column name. Falsy values are treated as "";
 *   truthy non-string values (e.g. a numeric header) are stringified.
 * @returns {string} Lowercase tokens joined by spaces with the last token
 *   repeated when there are two or more tokens; the single token when there
 *   is exactly one; the stringified input unchanged when it has no tokens.
 */
function normalizeColumnName(name) {
  // Coerce up front: a truthy non-string header (e.g. the number 2024) has no .replace().
  const text = name ? String(name) : "";
  const parts = text
    .replace(/([a-z])([A-Z])/g, "$1 $2") // camelCase split
    .split(/[_\-./\s]+/)
    .filter(Boolean)
    .map((p) => p.toLowerCase());
  // Nothing but separators (or empty input): hand the text back untouched.
  if (parts.length === 0) return text;
  if (parts.length === 1) return parts[0];
  // Repeat the last token to emphasise the semantic role over the entity prefix
  return `${parts.join(" ")} ${parts[parts.length - 1]}`;
}
|
|
166
173
|
|
|
167
174
|
/**
|