goldenmatch 0.1.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/README.md +54 -2
  2. package/dist/cli.cjs +371 -14
  3. package/dist/cli.cjs.map +1 -1
  4. package/dist/cli.js +371 -14
  5. package/dist/cli.js.map +1 -1
  6. package/dist/core/index.cjs +948 -213
  7. package/dist/core/index.cjs.map +1 -1
  8. package/dist/core/index.d.cts +10 -59
  9. package/dist/core/index.d.ts +10 -59
  10. package/dist/core/index.js +944 -214
  11. package/dist/core/index.js.map +1 -1
  12. package/dist/index.cjs +948 -213
  13. package/dist/index.cjs.map +1 -1
  14. package/dist/index.d.cts +2 -2
  15. package/dist/index.d.ts +2 -2
  16. package/dist/index.js +944 -214
  17. package/dist/index.js.map +1 -1
  18. package/dist/node/backends/score-worker.cjs.map +1 -1
  19. package/dist/node/backends/score-worker.d.cts +1 -1
  20. package/dist/node/backends/score-worker.d.ts +1 -1
  21. package/dist/node/backends/score-worker.js.map +1 -1
  22. package/dist/node/index.cjs +967 -213
  23. package/dist/node/index.cjs.map +1 -1
  24. package/dist/node/index.d.cts +3 -3
  25. package/dist/node/index.d.ts +3 -3
  26. package/dist/node/index.js +964 -214
  27. package/dist/node/index.js.map +1 -1
  28. package/dist/types-C-JSr4mQ.d.cts +557 -0
  29. package/dist/types-C-JSr4mQ.d.ts +557 -0
  30. package/examples/README.md +2 -0
  31. package/examples/strictModeParity.ts +78 -0
  32. package/examples/verificationInspection.ts +99 -0
  33. package/package.json +1 -1
  34. package/src/core/autoconfig.ts +133 -10
  35. package/src/core/autoconfigVerify.ts +907 -0
  36. package/src/core/domain.ts +9 -0
  37. package/src/core/index.ts +25 -0
  38. package/src/core/pipeline.ts +74 -2
  39. package/src/core/profiler.ts +122 -12
  40. package/src/core/types.ts +18 -0
  41. package/tests/parity/autoconfig-verify-fixtures.json +1147 -0
  42. package/tests/parity/autoconfigVerify-parity.test.ts +114 -0
  43. package/tests/smoke.test.ts +7 -0
  44. package/tests/unit/autoconfig-classifier.test.ts +59 -0
  45. package/tests/unit/autoconfig-multi-name.test.ts +33 -0
  46. package/tests/unit/autoconfig-preflight-integration.test.ts +37 -0
  47. package/tests/unit/autoconfig-weight-cap.test.ts +18 -0
  48. package/tests/unit/autoconfig-year.test.ts +49 -0
  49. package/tests/unit/autoconfig.test.ts +29 -8
  50. package/tests/unit/autoconfigVerify-invariants.test.ts +202 -0
  51. package/tests/unit/autoconfigVerify-postflight.test.ts +238 -0
  52. package/tests/unit/autoconfigVerify-preflight.test.ts +410 -0
  53. package/tests/unit/autoconfigVerify-types.test.ts +117 -0
  54. package/tests/unit/pprl-protocol.test.ts +1 -1
  55. package/tests/unit/profiler-confidence.test.ts +28 -0
  56. package/dist/types-DhUdX5Rc.d.cts +0 -304
  57. package/dist/types-DhUdX5Rc.d.ts +0 -304
package/README.md CHANGED
@@ -9,7 +9,7 @@ npm install goldenmatch
9
9
  [![npm](https://img.shields.io/npm/v/goldenmatch?color=d4a017)](https://www.npmjs.com/package/goldenmatch)
10
10
  [![Node](https://img.shields.io/node/v/goldenmatch?color=339933)](https://nodejs.org/)
11
11
  [![License: MIT](https://img.shields.io/badge/license-MIT-green)](https://github.com/benzsevern/goldenmatch/blob/main/LICENSE)
12
- [![Tests](https://img.shields.io/badge/tests-478%20passing-brightgreen)](https://github.com/benzsevern/goldenmatch/tree/main/packages/goldenmatch-js/tests)
12
+ [![Tests](https://img.shields.io/badge/tests-590%20passing-brightgreen)](https://github.com/benzsevern/goldenmatch/tree/main/packages/goldenmatch-js/tests)
13
13
 
14
14
  ---
15
15
 
@@ -18,7 +18,7 @@ npm install goldenmatch
18
18
  - **Edge-safe core** — the matching engine runs in browsers, Workers, Vercel Edge Runtime, Deno
19
19
  - **Pure TypeScript** — no native dependencies required; peer deps unlock performance (hnswlib, ONNX, piscina)
20
20
  - **Feature parity with Python goldenmatch** — same scorers, same clustering, same YAML configs
21
- - **478 tests, strict TypeScript** — `noUncheckedIndexedAccess`, `exactOptionalPropertyTypes`
21
+ - **590 tests, strict TypeScript** — `noUncheckedIndexedAccess`, `exactOptionalPropertyTypes`
22
22
 
23
23
  ## Quick Start
24
24
 
@@ -45,6 +45,58 @@ for (const record of result.goldenRecords) {
45
45
  }
46
46
  ```
47
47
 
48
+ ## Auto-Config Verification (v0.3)
49
+
50
+ Auto-generated configs are now checked both before the pipeline runs and after
51
+ scoring finishes, so you get actionable diagnostics instead of silent failures
52
+ on edge-case data.
53
+
54
+ ### Preflight — six static checks
55
+
56
+ When you call `autoConfigureRows(rows)`, the returned config ships with a
57
+ `_preflightReport` summarising six config-time checks:
58
+
59
+ 1. **missing_column** — matchkey/blocking references a column not in the data
60
+ 2. **cardinality_high** — a column is near-unique (poor blocking signal)
61
+ 3. **cardinality_low** — a column has too few distinct values to discriminate
62
+ 4. **block_size** — a blocking key would produce oversized blocks
63
+ 5. **remote_asset** — a scorer requires a model download (gated offline)
64
+ 6. **weight_confidence** — a weighted matchkey's weights look unbalanced
65
+
66
+ Many findings trigger **auto-repairs** (field dropped, scorer swapped,
67
+ weight clamped). When unrepairable errors remain (`hasErrors === true`), a
68
+ `ConfigValidationError` is thrown with the full report attached.
69
+
70
+ ```ts
71
+ import { autoConfigureRows, ConfigValidationError } from "goldenmatch";
72
+
73
+ const cfg = autoConfigureRows(rows);
74
+ for (const f of cfg._preflightReport!.findings) {
75
+ console.log(`[${f.severity}] ${f.check}/${f.subject}: ${f.message}`);
76
+ }
77
+ ```
78
+
79
+ Defaults are **offline-safe**: remote-asset scorers (cross-encoder, remote
80
+ embeddings) are dropped unless you opt in with `allowRemoteAssets: true`.
81
+
82
+ ### Postflight — four runtime signals
83
+
84
+ Inside `dedupe()` / `match()`, after scoring but before clustering, the
85
+ pipeline computes four signals attached as `result.postflightReport`:
86
+
87
+ 1. **scoreHistogram** — 100-bin pair-score distribution
88
+ 2. **blockSizePercentiles** + **preliminaryClusterSizes** — p50/p95/p99/max of block sizes and of preliminary cluster sizes
89
+ 3. **thresholdOverlapPct** — fraction of pairs scoring within ±0.02 of the current threshold
90
+ 4. **oversizedClusters** — connected components larger than 100 records, each with its lowest-scoring (bottleneck) pair
91
+
92
+ If the score distribution is clearly bimodal, postflight proposes a
93
+ threshold adjustment. In **strict mode** (`autoConfigureRows(rows, { strict: true })`
94
+ or manual `_strictAutoconfig: true`) the signals are still emitted but the
95
+ threshold is never touched — use this for reproducible CI pipelines.
96
+
97
+ See `examples/verificationInspection.ts` and `examples/strictModeParity.ts`
98
+ for runnable demos.
99
+
48
100
  ## Three entrypoints
49
101
 
50
102
  ```typescript
package/dist/cli.cjs CHANGED
@@ -2492,6 +2492,261 @@ var init_golden = __esm({
2492
2492
  ];
2493
2493
  }
2494
2494
  });
2495
+ var init_domain = __esm({
2496
+ "src/core/domain.ts"() {
2497
+ new Set(
2498
+ [
2499
+ "apple",
2500
+ "samsung",
2501
+ "sony",
2502
+ "lg",
2503
+ "dell",
2504
+ "hp",
2505
+ "lenovo",
2506
+ "asus",
2507
+ "acer",
2508
+ "microsoft",
2509
+ "google",
2510
+ "amazon",
2511
+ "bose",
2512
+ "canon",
2513
+ "nikon",
2514
+ "panasonic",
2515
+ "philips",
2516
+ "toshiba"
2517
+ ].map((s) => s.toLowerCase())
2518
+ );
2519
+ }
2520
+ });
2521
+
2522
+ // src/core/autoconfigVerify.ts
2523
+ function signalScoreHistogram(pairScores) {
2524
+ const bins = [];
2525
+ for (let i = 0; i <= 100; i++) bins.push(i / 100);
2526
+ const counts = new Array(100).fill(0);
2527
+ for (const p of pairScores) {
2528
+ const idx = Math.min(99, Math.max(0, Math.floor(p.score * 100)));
2529
+ counts[idx] += 1;
2530
+ }
2531
+ const smoothed = counts.map((_, i) => {
2532
+ let sum = 0;
2533
+ let n = 0;
2534
+ for (let j = -2; j <= 2; j++) {
2535
+ const k = i + j;
2536
+ if (k >= 0 && k < counts.length) {
2537
+ sum += counts[k];
2538
+ n += 1;
2539
+ }
2540
+ }
2541
+ return sum / Math.max(1, n);
2542
+ });
2543
+ const max = Math.max(...smoothed, 0);
2544
+ const mean = smoothed.reduce((a, b) => a + b, 0) / Math.max(1, smoothed.length);
2545
+ const minHeight = Math.max(max * 0.3, mean * 2);
2546
+ const peaks = [];
2547
+ for (let i = 1; i < smoothed.length - 1; i++) {
2548
+ if (smoothed[i] >= minHeight && smoothed[i] > smoothed[i - 1] && smoothed[i] >= smoothed[i + 1]) {
2549
+ peaks.push(i);
2550
+ }
2551
+ }
2552
+ if (peaks.length < 2) {
2553
+ return {
2554
+ histogram: { bins, counts },
2555
+ valleyLocation: null,
2556
+ isBimodal: false
2557
+ };
2558
+ }
2559
+ const first = peaks[0];
2560
+ const last = peaks[peaks.length - 1];
2561
+ if (last - first <= 10) {
2562
+ return {
2563
+ histogram: { bins, counts },
2564
+ valleyLocation: null,
2565
+ isBimodal: false
2566
+ };
2567
+ }
2568
+ let valleyIdx = first;
2569
+ let valleyVal = smoothed[first];
2570
+ for (let i = first + 1; i < last; i++) {
2571
+ if (smoothed[i] < valleyVal) {
2572
+ valleyVal = smoothed[i];
2573
+ valleyIdx = i;
2574
+ }
2575
+ }
2576
+ const depthRatio = valleyVal / Math.min(smoothed[first], smoothed[last]);
2577
+ if (depthRatio >= 0.5) {
2578
+ return {
2579
+ histogram: { bins, counts },
2580
+ valleyLocation: null,
2581
+ isBimodal: false
2582
+ };
2583
+ }
2584
+ return {
2585
+ histogram: { bins, counts },
2586
+ valleyLocation: valleyIdx / 100,
2587
+ isBimodal: true
2588
+ };
2589
+ }
2590
+ function getFirstWeightedThreshold(config) {
2591
+ for (const mk of config.matchkeys ?? []) {
2592
+ if (mk.type === "weighted") return mk.threshold;
2593
+ }
2594
+ return null;
2595
+ }
2596
+ function signalBlockingRecall() {
2597
+ return "deferred";
2598
+ }
2599
+ function signalBlockSizePercentiles(rows, config) {
2600
+ const b = config.blocking;
2601
+ if (b === void 0 || b.keys.length === 0 || rows.length === 0) {
2602
+ return { p50: 0, p95: 0, p99: 0, max: 0 };
2603
+ }
2604
+ const sample = rows.slice(0, Math.min(rows.length, 1e4));
2605
+ const sizes = [];
2606
+ for (const key of b.keys) {
2607
+ if (key.fields.length === 0) continue;
2608
+ const counts = /* @__PURE__ */ new Map();
2609
+ for (const row of sample) {
2610
+ const parts = key.fields.map((f) => {
2611
+ const v = row[f];
2612
+ return v === null || v === void 0 ? "\0" : String(v);
2613
+ });
2614
+ const k = parts.join("");
2615
+ counts.set(k, (counts.get(k) ?? 0) + 1);
2616
+ }
2617
+ for (const n of counts.values()) sizes.push(n);
2618
+ }
2619
+ sizes.sort((a, b2) => a - b2);
2620
+ const pct = (q) => sizes.length === 0 ? 0 : sizes[Math.min(sizes.length - 1, Math.floor(sizes.length * q))] ?? 0;
2621
+ return {
2622
+ p50: pct(0.5),
2623
+ p95: pct(0.95),
2624
+ p99: pct(0.99),
2625
+ max: sizes.length === 0 ? 0 : sizes[sizes.length - 1]
2626
+ };
2627
+ }
2628
+ function signalThresholdOverlap(pairScores, threshold) {
2629
+ if (pairScores.length === 0) return 0;
2630
+ const lo = threshold - 0.02;
2631
+ const hi = threshold + 0.02;
2632
+ let inBand = 0;
2633
+ for (const p of pairScores) {
2634
+ if (p.score >= lo && p.score <= hi) inBand += 1;
2635
+ }
2636
+ return inBand / pairScores.length;
2637
+ }
2638
+ function signalClusterSizes(pairScores, threshold) {
2639
+ const above = pairScores.filter((p) => p.score >= threshold);
2640
+ const parent = /* @__PURE__ */ new Map();
2641
+ const find = (x) => {
2642
+ let root = x;
2643
+ while ((parent.get(root) ?? root) !== root) root = parent.get(root);
2644
+ let y = x;
2645
+ while ((parent.get(y) ?? y) !== root) {
2646
+ const next = parent.get(y) ?? y;
2647
+ parent.set(y, root);
2648
+ y = next;
2649
+ }
2650
+ return root;
2651
+ };
2652
+ const union = (a, b) => {
2653
+ const ra = find(a);
2654
+ const rb = find(b);
2655
+ if (ra !== rb) parent.set(ra, rb);
2656
+ };
2657
+ for (const p of above) {
2658
+ if (!parent.has(p.idA)) parent.set(p.idA, p.idA);
2659
+ if (!parent.has(p.idB)) parent.set(p.idB, p.idB);
2660
+ union(p.idA, p.idB);
2661
+ }
2662
+ const sizeByRoot = /* @__PURE__ */ new Map();
2663
+ const membersByRoot = /* @__PURE__ */ new Map();
2664
+ for (const id of parent.keys()) {
2665
+ const r = find(id);
2666
+ sizeByRoot.set(r, (sizeByRoot.get(r) ?? 0) + 1);
2667
+ if (!membersByRoot.has(r)) membersByRoot.set(r, /* @__PURE__ */ new Set());
2668
+ membersByRoot.get(r).add(id);
2669
+ }
2670
+ const sizes = Array.from(sizeByRoot.values()).sort((a, b) => a - b);
2671
+ const pct = (q) => sizes.length === 0 ? 0 : sizes[Math.min(sizes.length - 1, Math.floor(sizes.length * q))] ?? 0;
2672
+ const percentiles = {
2673
+ p50: pct(0.5),
2674
+ p95: pct(0.95),
2675
+ p99: pct(0.99),
2676
+ max: sizes.length === 0 ? 0 : sizes[sizes.length - 1],
2677
+ count: sizes.length
2678
+ };
2679
+ const oversized = [];
2680
+ let clusterId = 0;
2681
+ for (const [root, size] of sizeByRoot) {
2682
+ if (size <= 100) continue;
2683
+ const members = membersByRoot.get(root);
2684
+ let bottleneckPair = [-1, -1];
2685
+ let minScore = Infinity;
2686
+ for (const p of above) {
2687
+ if (members.has(p.idA) && members.has(p.idB) && p.score < minScore) {
2688
+ minScore = p.score;
2689
+ bottleneckPair = [
2690
+ Math.min(p.idA, p.idB),
2691
+ Math.max(p.idA, p.idB)
2692
+ ];
2693
+ }
2694
+ }
2695
+ oversized.push({ clusterId: clusterId++, size, bottleneckPair });
2696
+ }
2697
+ return { percentiles, oversized };
2698
+ }
2699
+ function postflight(rows, config, options) {
2700
+ const currentThreshold = options.currentThreshold ?? getFirstWeightedThreshold(config) ?? 0.7;
2701
+ const hist = signalScoreHistogram(options.pairScores);
2702
+ const adjustments = [];
2703
+ const advisories = [];
2704
+ if (hist.isBimodal && hist.valleyLocation !== null) {
2705
+ if (!config._strictAutoconfig && Math.abs(hist.valleyLocation - currentThreshold) > 0.05) {
2706
+ adjustments.push({
2707
+ field: "threshold",
2708
+ fromValue: currentThreshold,
2709
+ toValue: hist.valleyLocation,
2710
+ reason: "histogram valley location differs from current threshold",
2711
+ signal: "scoreHistogram"
2712
+ });
2713
+ }
2714
+ } else {
2715
+ advisories.push(
2716
+ "score distribution is unimodal; threshold cannot be auto-set"
2717
+ );
2718
+ }
2719
+ const clusterResult = signalClusterSizes(
2720
+ options.pairScores,
2721
+ currentThreshold
2722
+ );
2723
+ const overlapPct = signalThresholdOverlap(
2724
+ options.pairScores,
2725
+ currentThreshold
2726
+ );
2727
+ if (overlapPct > 0.2 && config.llmScorer?.enabled !== true) {
2728
+ advisories.push(
2729
+ `${(overlapPct * 100).toFixed(1)}% of pairs within threshold \xB10.02 \u2014 consider enabling LLM auto mode for calibration`
2730
+ );
2731
+ }
2732
+ const blockSizePercentiles = signalBlockSizePercentiles(rows, config);
2733
+ const signals = {
2734
+ scoreHistogram: hist.histogram,
2735
+ blockingRecall: signalBlockingRecall(),
2736
+ blockSizePercentiles,
2737
+ thresholdOverlapPct: overlapPct,
2738
+ totalPairsScored: options.pairScores.length,
2739
+ currentThreshold,
2740
+ preliminaryClusterSizes: clusterResult.percentiles,
2741
+ oversizedClusters: clusterResult.oversized
2742
+ };
2743
+ return { signals, adjustments, advisories };
2744
+ }
2745
+ var init_autoconfigVerify = __esm({
2746
+ "src/core/autoconfigVerify.ts"() {
2747
+ init_domain();
2748
+ }
2749
+ });
2495
2750
 
2496
2751
  // src/core/pipeline.ts
2497
2752
  function buildSourceLookup(rows) {
@@ -2521,6 +2776,38 @@ function assignClusterIds(rows, clusters) {
2521
2776
  return cid !== void 0 ? { ...row, __cluster_id__: cid } : row;
2522
2777
  });
2523
2778
  }
2779
+ function isPreflightReport(v) {
2780
+ return typeof v === "object" && v !== null && "findings" in v && Array.isArray(v.findings);
2781
+ }
2782
+ function applyPostflight(rows, config, pairScores) {
2783
+ const pre = config._preflightReport;
2784
+ if (!isPreflightReport(pre)) {
2785
+ return { pairScores, report: void 0 };
2786
+ }
2787
+ const report = postflight(rows, config, {
2788
+ pairScores: pairScores.map((p) => ({
2789
+ idA: p.idA,
2790
+ idB: p.idB,
2791
+ score: p.score
2792
+ }))
2793
+ });
2794
+ let filtered = pairScores;
2795
+ if (config._strictAutoconfig !== true) {
2796
+ for (const adj of report.adjustments) {
2797
+ if (adj.field === "threshold") {
2798
+ const newThreshold = adj.toValue;
2799
+ const prev = filtered.length;
2800
+ filtered = filtered.filter((p) => p.score >= newThreshold);
2801
+ if (prev > 0 && filtered.length === 0) {
2802
+ report.advisories.push(
2803
+ `threshold adjustment to ${newThreshold.toFixed(3)} dropped all ${prev} pairs`
2804
+ );
2805
+ }
2806
+ }
2807
+ }
2808
+ }
2809
+ return { pairScores: filtered, report };
2810
+ }
2524
2811
  function runDedupePipeline(rows, config, options) {
2525
2812
  if (rows.length === 0) {
2526
2813
  return _emptyDedupeResult(config);
@@ -2570,8 +2857,13 @@ function runDedupePipeline(rows, config, options) {
2570
2857
  }
2571
2858
  }
2572
2859
  }
2860
+ const { pairScores: finalPairs, report: postflightReport } = applyPostflight(
2861
+ processed,
2862
+ config,
2863
+ allPairs
2864
+ );
2573
2865
  const allIds = collectRowIds(processed);
2574
- const pairTuples = allPairs.map((p) => [
2866
+ const pairTuples = finalPairs.map((p) => [
2575
2867
  p.idA,
2576
2868
  p.idB,
2577
2869
  p.score
@@ -2640,8 +2932,9 @@ function runDedupePipeline(rows, config, options) {
2640
2932
  dupes,
2641
2933
  unique,
2642
2934
  stats,
2643
- scoredPairs: allPairs,
2644
- config
2935
+ scoredPairs: finalPairs,
2936
+ config,
2937
+ ...postflightReport !== void 0 ? { postflightReport } : {}
2645
2938
  };
2646
2939
  }
2647
2940
  function runMatchPipeline(targetRows, referenceRows, config) {
@@ -2695,7 +2988,8 @@ function runMatchPipeline(targetRows, referenceRows, config) {
2695
2988
  matchedCount: matched.length,
2696
2989
  unmatchedCount: unmatched.length,
2697
2990
  matchRate: targetRows.length > 0 ? matched.length / targetRows.length : 0
2698
- }
2991
+ },
2992
+ ...result.postflightReport !== void 0 ? { postflightReport: result.postflightReport } : {}
2699
2993
  };
2700
2994
  }
2701
2995
  function _emptyDedupeResult(config) {
@@ -2724,6 +3018,7 @@ var init_pipeline = __esm({
2724
3018
  init_scorer();
2725
3019
  init_cluster();
2726
3020
  init_golden();
3021
+ init_autoconfigVerify();
2727
3022
  }
2728
3023
  });
2729
3024
 
@@ -3567,6 +3862,12 @@ var init_explain = __esm({
3567
3862
  });
3568
3863
 
3569
3864
  // src/core/profiler.ts
3865
+ function isYearValue(v) {
3866
+ const normalized = v.replace(/\.0+$/, "");
3867
+ const n = Number(normalized);
3868
+ if (Number.isNaN(n) || !Number.isFinite(n) || !Number.isInteger(n)) return false;
3869
+ return n >= 1900 && n <= 2100;
3870
+ }
3570
3871
  function toStringOrNull(value) {
3571
3872
  if (value === null || value === void 0) return null;
3572
3873
  if (typeof value === "string") {
@@ -3575,10 +3876,22 @@ function toStringOrNull(value) {
3575
3876
  }
3576
3877
  return String(value);
3577
3878
  }
3578
- function guessType(values, columnName) {
3579
- if (values.length === 0) return "text";
3580
- const n = values.length;
3879
+ function guessTypeByName(columnName) {
3581
3880
  const lname = columnName.toLowerCase();
3881
+ if (/email|e_mail|e-mail/i.test(lname)) return "email";
3882
+ if (/phone|tel(?!e)|mobile|cell/i.test(lname)) return "phone";
3883
+ if (/zip|postal|postcode/i.test(lname)) return "zip";
3884
+ if (YEAR_NAME_RE.test(lname)) return "year";
3885
+ if (/date|created|modified|updated|_at$|birth|dob/i.test(lname)) return "date";
3886
+ if (/^(city|state|county|country|region|province)/i.test(lname)) return "geo";
3887
+ if (/city_desc|state_cd|country_code|state_code/i.test(lname)) return "geo";
3888
+ if (/^id$|_id$|uuid|guid/i.test(lname)) return "id";
3889
+ if (/name|first|last|full_name|surname/i.test(lname)) return "name";
3890
+ return null;
3891
+ }
3892
+ function guessTypeByData(values) {
3893
+ if (values.length === 0) return null;
3894
+ const n = values.length;
3582
3895
  const emailCount = values.reduce(
3583
3896
  (acc, v) => acc + (EMAIL_VALUE_RE.test(v) ? 1 : 0),
3584
3897
  0
@@ -3597,14 +3910,16 @@ function guessType(values, columnName) {
3597
3910
  0
3598
3911
  );
3599
3912
  if (zipCount / n > 0.6) return "zip";
3913
+ let yearCount = 0;
3914
+ for (const v of values) {
3915
+ if (isYearValue(v)) yearCount++;
3916
+ }
3917
+ if (yearCount / n >= 0.95) return "year";
3600
3918
  let dateCount = 0;
3601
3919
  for (const v of values) {
3602
3920
  if (DATE_VALUE_RES.some((re) => re.test(v))) dateCount++;
3603
3921
  }
3604
3922
  if (dateCount / n > 0.6) return "date";
3605
- if (/^(city|state|county|country|region|province)/i.test(lname)) return "geo";
3606
- if (/city_desc|state_cd|country_code|state_code/i.test(lname)) return "geo";
3607
- if (/^id$|_id$|uuid|guid/i.test(lname)) return "id";
3608
3923
  const nameCount = values.reduce(
3609
3924
  (acc, v) => acc + (NAME_VALUE_RE.test(v) ? 1 : 0),
3610
3925
  0
@@ -3615,7 +3930,44 @@ function guessType(values, columnName) {
3615
3930
  if (/^-?\d+(\.\d+)?$/.test(v)) numericCount++;
3616
3931
  }
3617
3932
  if (numericCount / n > 0.8) return "numeric";
3618
- return "text";
3933
+ let totalLen = 0;
3934
+ let delimRows = 0;
3935
+ let totalDelims = 0;
3936
+ for (const v of values) {
3937
+ totalLen += v.length;
3938
+ const commas = (v.match(/,/g) ?? []).length;
3939
+ const semis = (v.match(/;/g) ?? []).length;
3940
+ const count = commas + semis;
3941
+ if (count > 0) {
3942
+ delimRows++;
3943
+ totalDelims += count;
3944
+ }
3945
+ }
3946
+ const avgLen = totalLen / n;
3947
+ const delimFraction = delimRows / n;
3948
+ const avgDelimsPerDelimRow = delimRows > 0 ? totalDelims / delimRows : 0;
3949
+ if (avgLen > 30 && delimFraction >= 0.7 && avgDelimsPerDelimRow >= 2) {
3950
+ return "multi_name";
3951
+ }
3952
+ return null;
3953
+ }
3954
+ function guessTypeAndConfidence(values, columnName) {
3955
+ if (values.length === 0) return { type: "text", confidence: 0.3 };
3956
+ const nameType = guessTypeByName(columnName);
3957
+ const dataType = guessTypeByData(values);
3958
+ if (nameType !== null && dataType !== null) {
3959
+ if (nameType === dataType) {
3960
+ return { type: nameType, confidence: 0.9 };
3961
+ }
3962
+ const nameAuthoritative = nameType === "date" || nameType === "year" || nameType === "geo" || nameType === "id" || nameType === "email" || nameType === "zip" || nameType === "name";
3963
+ return {
3964
+ type: nameAuthoritative ? nameType : dataType,
3965
+ confidence: 0.7
3966
+ };
3967
+ }
3968
+ if (nameType !== null) return { type: nameType, confidence: 0.7 };
3969
+ if (dataType !== null) return { type: dataType, confidence: 0.7 };
3970
+ return { type: "text", confidence: 0.3 };
3619
3971
  }
3620
3972
  function profileColumn(name, rawValues) {
3621
3973
  const totalCount = rawValues.length;
@@ -3643,7 +3995,10 @@ function profileColumn(name, rawValues) {
3643
3995
  if (sampleValues.length >= 5) break;
3644
3996
  }
3645
3997
  const sampleForType = nonNull.length > 500 ? nonNull.slice(0, 500) : nonNull;
3646
- const inferredType = guessType(sampleForType, name);
3998
+ const { type: inferredType, confidence } = guessTypeAndConfidence(
3999
+ sampleForType,
4000
+ name
4001
+ );
3647
4002
  return {
3648
4003
  name,
3649
4004
  nullRate,
@@ -3654,7 +4009,8 @@ function profileColumn(name, rawValues) {
3654
4009
  inferredType,
3655
4010
  avgLength,
3656
4011
  maxLength: maxLen,
3657
- sampleValues
4012
+ sampleValues,
4013
+ confidence
3658
4014
  };
3659
4015
  }
3660
4016
  function profileRows(rows) {
@@ -3682,7 +4038,7 @@ function profileRows(rows) {
3682
4038
  byName
3683
4039
  };
3684
4040
  }
3685
- var EMAIL_VALUE_RE, PHONE_STRIP_RE, DATE_VALUE_RES, ZIP_VALUE_RE, NAME_VALUE_RE;
4041
+ var EMAIL_VALUE_RE, PHONE_STRIP_RE, DATE_VALUE_RES, ZIP_VALUE_RE, NAME_VALUE_RE, YEAR_NAME_RE;
3686
4042
  var init_profiler = __esm({
3687
4043
  "src/core/profiler.ts"() {
3688
4044
  EMAIL_VALUE_RE = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;
@@ -3694,6 +4050,7 @@ var init_profiler = __esm({
3694
4050
  ];
3695
4051
  ZIP_VALUE_RE = /^\d{5}(-?\d{4})?$/;
3696
4052
  NAME_VALUE_RE = /^[A-Za-z][A-Za-z \-']{0,28}[A-Za-z]$|^[A-Za-z]{2,3}$/;
4053
+ YEAR_NAME_RE = /(^|_)(year|yr)(_|$)/i;
3697
4054
  }
3698
4055
  });
3699
4056