@meshxdata/fops 0.1.32 → 0.1.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23) hide show
  1. package/CHANGELOG.md +184 -0
  2. package/package.json +1 -2
  3. package/src/commands/lifecycle.js +16 -0
  4. package/src/plugins/bundled/fops-plugin-embeddings/index.js +3 -1
  5. package/src/plugins/bundled/fops-plugin-embeddings/lib/indexer.js +1 -1
  6. package/src/plugins/bundled/fops-plugin-file/demo/landscape.yaml +67 -0
  7. package/src/plugins/bundled/fops-plugin-file/demo/orders_bad.csv +6 -0
  8. package/src/plugins/bundled/fops-plugin-file/demo/orders_good.csv +7 -0
  9. package/src/plugins/bundled/fops-plugin-file/demo/orders_reference.csv +6 -0
  10. package/src/plugins/bundled/fops-plugin-file/demo/orders_renamed.aligned.csv +6 -0
  11. package/src/plugins/bundled/fops-plugin-file/demo/orders_renamed.csv +6 -0
  12. package/src/plugins/bundled/fops-plugin-file/demo/rules.json +8 -0
  13. package/src/plugins/bundled/fops-plugin-file/demo/run.sh +110 -0
  14. package/src/plugins/bundled/fops-plugin-file/index.js +140 -24
  15. package/src/plugins/bundled/fops-plugin-file/lib/embed-index.js +7 -0
  16. package/src/plugins/bundled/fops-plugin-file/lib/match.js +11 -4
  17. package/src/plugins/bundled/fops-plugin-foundation/index.js +1574 -101
  18. package/src/plugins/bundled/fops-plugin-foundation/lib/align.js +42 -4
  19. package/src/plugins/bundled/fops-plugin-foundation/lib/apply.js +83 -41
  20. package/src/plugins/bundled/fops-plugin-foundation/lib/stack-apply.js +4 -1
  21. package/src/plugins/bundled/fops-plugin-foundation-graphql/index.js +39 -1
  22. package/src/plugins/bundled/fops-plugin-foundation-graphql/lib/graphql/resolvers/data-object.js +9 -6
  23. package/src/plugins/bundled/fops-plugin-foundation-graphql/lib/graphql/resolvers/data-product.js +9 -6
@@ -35,6 +35,7 @@ export function register(api) {
35
35
  .option("--json", "Output matches as JSON (for scripting)")
36
36
  .option("--no-auto", "Always show picker, never auto-select")
37
37
  .option("--local", "Match against local directory files instead of Foundation landscape")
38
+ .option("--no-semantic", "Jaccard-only matching — skip MiniLM embeddings (useful for testing without the model)")
38
39
  .option("--sheet <name>", "Sheet name or index for XLSX files")
39
40
  .option("--data-source <id>", "Data source identifier for attempt logging")
40
41
  .option("--file-family <name>", "Logical file family name for logging")
@@ -71,7 +72,7 @@ export function register(api) {
71
72
  }
72
73
 
73
74
  let embeddingClient = null;
74
- if (typeof api.getService === "function") {
75
+ if (!opts.noSemantic && typeof api.getService === "function") {
75
76
  try { embeddingClient = api.getService("embeddings"); } catch { /* not loaded */ }
76
77
  }
77
78
 
@@ -338,17 +339,24 @@ export function register(api) {
338
339
  columnEmbeddings: candResult.columnEmbeddings,
339
340
  nameEmbedding: candResult.nameEmbedding,
340
341
  };
341
- const ranked = rankMatches(candidateData, references, rankOpts)
342
- .filter((r) => r.score >= threshold)
343
- .filter((r) => r.overlapCount >= 2 || r.overlapCount >= r.overlapTotal);
342
+ const allRanked = rankMatches(candidateData, references, rankOpts)
343
+ .filter((r) => r.score >= threshold);
344
+
345
+ // In Jaccard-only mode, require at least 2 overlapping column names to avoid false positives.
346
+ // When embeddings are active (v2 or v3), semantic matching works without exact name overlap.
347
+ const ranked = (hasEmbeddings || hasColumnEmbeddings)
348
+ ? allRanked
349
+ : allRanked.filter((r) => r.overlapCount >= 2 || r.overlapCount >= r.overlapTotal);
344
350
 
345
351
  spinner.stop();
346
352
 
347
353
  if (ranked.length === 0) {
348
- console.log(chalk.yellow(" No strong matches found (need at least 2 overlapping columns)."));
354
+ console.log(chalk.yellow(" No strong matches found."));
349
355
  console.log(chalk.dim(" Use `fops embed file validate <file> --reference <ref>` for explicit validation."));
350
- if (!hasEmbeddings && foundationAvailable) {
351
- console.log(chalk.dim(" Tip: run `fops embed index` to enable semantic matching."));
356
+ if (!hasEmbeddings && !hasColumnEmbeddings && foundationAvailable) {
357
+ console.log(chalk.dim(" Tip: run `fops embed index` then `fops embed file index` to enable semantic matching."));
358
+ } else if (hasEmbeddings && !hasColumnEmbeddings && foundationAvailable) {
359
+ console.log(chalk.dim(" Tip: run `fops embed file index` (model cached) for column-level semantic matching."));
352
360
  }
353
361
  return;
354
362
  }
@@ -469,24 +477,52 @@ export function register(api) {
469
477
  rows: null,
470
478
  };
471
479
 
480
+ // When semantic column mappings are available, translate candidate column
481
+ // names to their matched reference names so schema/type checks are meaningful.
482
+ let candHeadersForValidation = candResult.headers;
483
+ let candTypesForValidation = candResult.types;
484
+ const colMap = selectedRef.columnMappings;
485
+ if (colMap?.length > 0) {
486
+ const translatedHeaders = [];
487
+ const translatedTypes = {};
488
+ for (const m of colMap) {
489
+ translatedHeaders.push(m.referenceCol);
490
+ if (candResult.types?.[m.candidateCol] !== undefined) {
491
+ translatedTypes[m.referenceCol] = candResult.types[m.candidateCol];
492
+ }
493
+ }
494
+ candHeadersForValidation = translatedHeaders;
495
+ candTypesForValidation = translatedTypes;
496
+ }
497
+
472
498
  const schemaResult = compareSchemas(
473
499
  { headers: refResult.headers },
474
- { headers: candResult.headers },
500
+ { headers: candHeadersForValidation },
475
501
  { candidateCentric: true },
476
502
  );
477
503
 
478
504
  let typeResult = null;
479
- const commonCols = candResult.headers.filter((h) => refResult.headers.includes(h));
480
- if (commonCols.length > 0 && refResult.types && candResult.types) {
481
- typeResult = compareTypes(refResult.types, candResult.types, commonCols);
505
+ const commonCols = candHeadersForValidation.filter((h) => refResult.headers.includes(h));
506
+ if (commonCols.length > 0 && refResult.types && candTypesForValidation) {
507
+ typeResult = compareTypes(refResult.types, candTypesForValidation, commonCols);
482
508
  }
483
509
 
484
510
  // Auto-derive value rules from schema metadata (sample up to 10K rows)
485
511
  const autoRules = deriveRules(refResult.types, commonCols);
486
512
  let valuesResult = null;
487
513
  if (autoRules && candResult.rows?.length > 0) {
514
+ // Map row values to reference column names for value checks
488
515
  const sampleRows = candResult.rows.length > 10000 ? candResult.rows.slice(0, 10000) : candResult.rows;
489
- valuesResult = checkValues(sampleRows, candResult.headers, autoRules);
516
+ if (colMap?.length > 0) {
517
+ // Re-key rows from candidate names to reference names
518
+ const keyMap = Object.fromEntries(colMap.map((m) => [m.candidateCol, m.referenceCol]));
519
+ const remappedRows = sampleRows.map((row) =>
520
+ Object.fromEntries(Object.entries(row).map(([k, v]) => [keyMap[k] ?? k, v])),
521
+ );
522
+ valuesResult = checkValues(remappedRows, candHeadersForValidation, autoRules);
523
+ } else {
524
+ valuesResult = checkValues(sampleRows, candResult.headers, autoRules);
525
+ }
490
526
  }
491
527
 
492
528
  const report = buildReport(schemaResult, typeResult, valuesResult, null);
@@ -503,6 +539,82 @@ export function register(api) {
503
539
  if (report.verdict === "FAIL") {
504
540
  process.exitCode = 1;
505
541
  }
542
+
543
+ // ── 11. Post-match action prompt ────────────────────────────────
544
+ if (isTTY && !opts.json) {
545
+ const ext = pathMod.extname(candidate).toLowerCase();
546
+ const canFixCsv = ext === ".csv" && colMap?.length > 0;
547
+ const canFixDp = selectedRef.entityType === "data_product" && !!foundationClient;
548
+
549
+ if (canFixCsv || canFixDp) {
550
+ const baseName = pathMod.basename(candidate, ext);
551
+ const alignedPath = pathMod.join(pathMod.dirname(candidate), `${baseName}.aligned${ext}`);
552
+
553
+ const actionOptions = [];
554
+ if (canFixCsv) {
555
+ actionOptions.push({
556
+ label: `Fix CSV — rename columns to match reference (→ ${pathMod.basename(alignedPath)})`,
557
+ value: "fix-csv",
558
+ });
559
+ }
560
+ if (canFixDp) {
561
+ actionOptions.push({
562
+ label: `Fix Data Product — update "${selectedRef.name}" schema to match this file`,
563
+ value: "fix-dp",
564
+ });
565
+ }
566
+ actionOptions.push({ label: "Nothing — done", value: null });
567
+
568
+ let selectActionFn;
569
+ const confirmPath2 = api.getCliPath?.("src", "ui", "confirm.js");
570
+ if (confirmPath2) {
571
+ const { pathToFileURL: p2u } = await import("node:url");
572
+ ({ selectOption: selectActionFn } = await import(p2u(confirmPath2).href));
573
+ } else {
574
+ ({ selectOption: selectActionFn } = await import("../../../ui/confirm.js"));
575
+ }
576
+
577
+ console.log("");
578
+ const action = await selectActionFn("Next step:", actionOptions);
579
+
580
+ if (action === "fix-csv") {
581
+ const Papa = (await import("papaparse")).default;
582
+ const renameMap = Object.fromEntries(colMap.map((m) => [m.candidateCol, m.referenceCol]));
583
+ const newHeaders = candResult.headers.map((h) => renameMap[h] ?? h);
584
+ const newRows = (candResult.rows || []).map((row) =>
585
+ Object.fromEntries(Object.entries(row).map(([k, v]) => [renameMap[k] ?? k, v])),
586
+ );
587
+ const csvText = Papa.unparse({ fields: newHeaders, data: newRows });
588
+ (await import("node:fs")).writeFileSync(alignedPath, csvText, "utf8");
589
+ console.log(chalk.green(`\n ✓ Written: ${alignedPath}`));
590
+ const renamed = colMap.filter((m) => m.candidateCol !== m.referenceCol);
591
+ if (renamed.length > 0) {
592
+ console.log(chalk.dim(`\n Renamed ${renamed.length} column(s):`));
593
+ for (const m of renamed) {
594
+ console.log(chalk.dim(` ${m.candidateCol.padEnd(22)} → ${m.referenceCol}`));
595
+ }
596
+ }
597
+ } else if (action === "fix-dp") {
598
+ const typeToApiCol = (t) => ({ integer: "INTEGER", decimal: "DECIMAL", date: "DATE", timestamp: "TIMESTAMP", boolean: "BOOLEAN" })[t] || "VARCHAR";
599
+ const fields = candResult.headers.map((h) => {
600
+ const ti = candResult.types?.[h];
601
+ const col = { name: h, data_type: { column_type: typeToApiCol(ti?.type || "string") } };
602
+ if (ti?.type === "decimal" && ti.precision != null) {
603
+ col.data_type.meta = { precision: String(ti.precision), scale: String(ti.scale ?? 0) };
604
+ }
605
+ return col;
606
+ });
607
+ const schemaBody = { details: { data_product_type: "user", fields } };
608
+ try {
609
+ await foundationClient.put(`/data/data_product/schema?identifier=${selectedRef.id}`, schemaBody);
610
+ console.log(chalk.green(`\n ✓ Schema updated for "${selectedRef.name}"`));
611
+ console.log(chalk.dim(` ${fields.length} columns from ${pathMod.basename(candidate)}`));
612
+ } catch (err) {
613
+ console.error(chalk.red(`\n ✗ Schema update failed: ${err.message}`));
614
+ }
615
+ }
616
+ }
617
+ }
506
618
  } catch (err) {
507
619
  spinner.stop();
508
620
  console.error(chalk.red(`\n ✗ ${err.message}\n`));
@@ -777,21 +889,23 @@ export function register(api) {
777
889
  let refTypes = null; // { colName: { type } }
778
890
  let refLabel = null;
779
891
 
780
- if (opts.reference) {
892
+ // opts.reference may be consumed by the parent fileCmd (Commander option inheritance)
893
+ const referenceArg = opts.reference || fileCmd.opts().reference;
894
+ if (referenceArg) {
781
895
  const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
782
- if (UUID_RE.test(opts.reference)) {
896
+ if (UUID_RE.test(referenceArg)) {
783
897
  spinner.text = "Fetching reference schema...";
784
- const refResult = await resolveSchemaFromApi(opts.reference, foundationClient, mapApiTypeToInferred);
898
+ const refResult = await resolveSchemaFromApi(referenceArg, foundationClient, mapApiTypeToInferred);
785
899
  refHeaders = new Set(refResult.headers);
786
900
  refTypes = refResult.types;
787
- refLabel = `data_object:${opts.reference.slice(0, 8)}`;
901
+ refLabel = `data_object:${referenceArg.slice(0, 8)}`;
788
902
  } else {
789
903
  spinner.text = "Parsing reference...";
790
- const refInput = await resolveInput(opts.reference, storage, fs, pathMod);
904
+ const refInput = await resolveInput(referenceArg, storage, fs, pathMod);
791
905
  const refResult = await parseFile(refInput.buffer, { format: refInput.format, maxRows: 1 });
792
906
  refHeaders = new Set(refResult.headers);
793
907
  refTypes = refResult.types;
794
- refLabel = pathMod.basename(opts.reference);
908
+ refLabel = pathMod.basename(referenceArg);
795
909
  }
796
910
  }
797
911
 
@@ -1003,16 +1117,18 @@ export function register(api) {
1003
1117
  let refResult = null;
1004
1118
  let refLabel = "(none)";
1005
1119
 
1006
- if (opts.reference) {
1007
- if (isDataObjectUuid(opts.reference)) {
1120
+ // opts.reference may be consumed by the parent fileCmd (Commander option inheritance)
1121
+ const referenceArg = opts.reference || fileCmd.opts().reference;
1122
+ if (referenceArg) {
1123
+ if (isDataObjectUuid(referenceArg)) {
1008
1124
  spinner.text = "Fetching reference schema from Foundation API...";
1009
- refResult = await resolveSchemaFromApi(opts.reference, foundationClient, mapApiTypeToInferred);
1010
- refLabel = `data_object:${opts.reference.slice(0, 8)}`;
1125
+ refResult = await resolveSchemaFromApi(referenceArg, foundationClient, mapApiTypeToInferred);
1126
+ refLabel = `data_object:${referenceArg.slice(0, 8)}`;
1011
1127
  } else {
1012
1128
  spinner.text = "Parsing reference file...";
1013
- const refInput = await resolveInput(opts.reference, storage, fs, path);
1129
+ const refInput = await resolveInput(referenceArg, storage, fs, path);
1014
1130
  refResult = await parseFile(refInput.buffer, { format: refInput.format, sheet: opts.sheet });
1015
- refLabel = path.basename(opts.reference);
1131
+ refLabel = path.basename(referenceArg);
1016
1132
  }
1017
1133
  }
1018
1134
 
@@ -387,7 +387,14 @@ export async function runEmbedIndex(api, opts) {
387
387
  store.setMeta("embedding_dim", "384");
388
388
  } catch {
389
389
  // Embeddings failed — index still usable without them
390
+ spinner.stop();
391
+ console.log(WARN(" ⚠ Column embeddings skipped — MiniLM model not ready."));
392
+ console.log(DIM(" Run `fops embed index` first to download the model, then re-run `fops embed file index`."));
393
+ console.log(DIM(" Without embeddings, matching falls back to Jaccard (exact column name overlap only)."));
390
394
  }
395
+ } else {
396
+ console.log(WARN(" ⚠ Column embeddings skipped — embeddings plugin not available."));
397
+ console.log(DIM(" Run `fops embed index` first to download the MiniLM model, then re-run `fops embed file index`."));
391
398
  }
392
399
 
393
400
  store.setMeta("version", "3");
@@ -155,13 +155,20 @@ export async function embedSchemaEntries(entries, embeddingClient, opts = {}) {
155
155
 
156
156
  /**
157
157
  * Normalize a column name for embedding: split on separators, lowercase.
158
+ * For compound names (e.g. order_status), the suffix is repeated to boost its
159
+ * weight in the embedding so "order_status" stays close to "status" and doesn't
160
+ * drift toward "order_id" due to a shared prefix.
158
161
  */
159
162
  function normalizeColumnName(name) {
160
- return (name || "")
161
- .replace(/[_\-./]/g, " ")
163
+ const parts = (name || "")
162
164
  .replace(/([a-z])([A-Z])/g, "$1 $2") // camelCase split
163
- .toLowerCase()
164
- .trim() || name;
165
+ .split(/[_\-./\s]+/)
166
+ .filter(Boolean)
167
+ .map((p) => p.toLowerCase());
168
+ if (parts.length === 0) return name || "";
169
+ if (parts.length === 1) return parts[0];
170
+ // Repeat the last token to emphasise the semantic role over the entity prefix
171
+ return parts.join(" ") + " " + parts[parts.length - 1];
165
172
  }
166
173
 
167
174
  /**