PyPI - goldencheck-native - Versions diffs - 0.1.0__tar.gz - Mend

goldencheck-native 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

goldencheck_native-0.1.0/PKG-INFO +36 -0
goldencheck_native-0.1.0/README.md +20 -0
goldencheck_native-0.1.0/goldencheck-core/Cargo.toml +25 -0
goldencheck_native-0.1.0/goldencheck-core/src/benford.rs +110 -0
goldencheck_native-0.1.0/goldencheck-core/src/fuzzy.rs +227 -0
goldencheck_native-0.1.0/goldencheck-core/src/keys.rs +399 -0
goldencheck_native-0.1.0/goldencheck-core/src/lib.rs +24 -0
goldencheck_native-0.1.0/goldencheck-native/Cargo.lock +959 -0
goldencheck_native-0.1.0/goldencheck-native/Cargo.toml +39 -0
goldencheck_native-0.1.0/goldencheck-native/README.md +20 -0
goldencheck_native-0.1.0/goldencheck-native/src/fuzzy.rs +21 -0
goldencheck_native-0.1.0/goldencheck-native/src/keys.rs +245 -0
goldencheck_native-0.1.0/goldencheck-native/src/lib.rs +30 -0
goldencheck_native-0.1.0/goldencheck-native/src/profile.rs +37 -0
goldencheck_native-0.1.0/pyproject.toml +39 -0
goldencheck_native-0.1.0/python/goldencheck_native/__init__.py +22 -0

goldencheck_native-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,36 @@
+Metadata-Version: 2.4
+Name: goldencheck-native
+Version: 0.1.0
+Classifier: Programming Language :: Rust
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Scientific/Engineering
+Summary: Optional native (Rust/PyO3) acceleration kernels for goldencheck
+Author-email: Ben Severn <ben@bensevern.dev>
+License: MIT
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
+Project-URL: Homepage, https://github.com/benseverndev-oss/goldenmatch
+# goldencheck-native
+Optional native (Rust/PyO3) acceleration kernels for
+[GoldenCheck](https://github.com/benseverndev-oss/goldenmatch/tree/main/packages/python/goldencheck).
+`goldencheck` is a pure-Python wheel; this package ships the compiled abi3
+`_native` extension that accelerates GoldenCheck's CPU-bound deep-profiling work
+(Benford conformance, composite-key and functional-dependency mining). It is
+pulled in via:
+```bash
+pip install goldencheck[native]
+```
+You never import this directly -- `goldencheck.core._native_loader` discovers it
+and falls back to the pure-Python paths when it isn't installed. Behaviour is
+identical either way; the native path only changes wall-clock time.
+The kernels live in the pyo3-free `goldencheck-core` crate; this crate is the
+thin Arrow-reading PyO3 shim over it.

goldencheck_native-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,20 @@
+# goldencheck-native
+Optional native (Rust/PyO3) acceleration kernels for
+[GoldenCheck](https://github.com/benseverndev-oss/goldenmatch/tree/main/packages/python/goldencheck).
+`goldencheck` is a pure-Python wheel; this package ships the compiled abi3
+`_native` extension that accelerates GoldenCheck's CPU-bound deep-profiling work
+(Benford conformance, composite-key and functional-dependency mining). It is
+pulled in via:
+```bash
+pip install goldencheck[native]
+```
+You never import this directly -- `goldencheck.core._native_loader` discovers it
+and falls back to the pure-Python paths when it isn't installed. Behaviour is
+identical either way; the native path only changes wall-clock time.
+The kernels live in the pyo3-free `goldencheck-core` crate; this crate is the
+thin Arrow-reading PyO3 shim over it.

goldencheck_native-0.1.0/goldencheck-core/Cargo.toml ADDED Viewed

@@ -0,0 +1,25 @@
+# goldencheck-core -- pyo3-free kernel library for GoldenCheck's CPU-bound
+# deep-profiling work. The `score-core` analogue: pure compute over slices, no
+# Python, no Arrow. The pyo3 + Arrow C Data Interface shims live in the sibling
+# `goldencheck-native` crate, which path-depends on this one. Keeping the kernel
+# logic pyo3-free means it can later back a DuckDB/DataFusion SQL surface (as
+# `score-core`/`graph-core` do for goldenmatch) without dragging in CPython.
+[package]
+name = "goldencheck-core"
+version = "0.1.0"
+edition = "2021"
+license = "MIT"
+authors = ["Ben Severn <ben@bensevern.dev>"]
+description = "Pyo3-free deep-profiling kernels for GoldenCheck (Benford, composite-key & functional-dependency mining)"
+[lib]
+name = "goldencheck_core"
+[dependencies]
+# Fast, deterministic hashing for the combinatorial key/FD kernels. rustc-hash
+# is already in the dependency graph (pulled by the goldenmatch native stack),
+# so this adds no new crates.io fetch.
+rustc-hash = "2"
+[dev-dependencies]
+# Property/parity checks for the Benford leading-digit extraction.

goldencheck_native-0.1.0/goldencheck-core/src/benford.rs ADDED Viewed

@@ -0,0 +1,110 @@
+//! Benford leading-digit histogram.
+//!
+//! Behaviour-exact replacement for `_extract_leading_digits` +
+//! `Counter(...)` in `goldencheck/baseline/statistical.py` (and the identical
+//! loop in `goldencheck/drift/detector.py`). The Python reference, per value:
+//!
+//! ```python
+//! if v <= 0 or not math.isfinite(v):
+//!     continue
+//! exp = math.floor(math.log10(v))
+//! normalised = v / (10 ** exp)
+//! d = int(normalised)            # truncates toward zero; v > 0
+//! if 1 <= d <= 9:
+//!     digits.append(d)
+//! ```
+//!
+//! We return the per-digit counts for 1..=9 directly (the `Counter` the Python
+//! caller builds), so the caller's chi-squared step is unchanged.
+//!
+//! Parity subtlety: Python divides by `10 ** exp`, where `exp` is an `int`, so
+//! the divisor is the *correctly-rounded* f64 nearest to 10^exp (Python forms an
+//! exact bignum for exp >= 0, then rounds to f64 on the division). Rust's
+//! `10f64.powi(exp)` instead accumulates rounding and disagrees at large
+//! exponents (e.g. 1e300 -> the quotient drifts off 1.0, dropping a digit-1
+//! count). We therefore divide by a precomputed table of correctly-rounded
+//! powers of ten -- `"1e{exp}".parse::<f64>()` yields the exact same f64 as
+//! Python's `10 ** exp` for every reachable exponent (verified -323..=308). The
+//! `log10().floor()` step already agrees with `math.log10` (shared libm), so
+//! this makes the histogram byte-identical; the goldencheck parity test asserts
+//! it on random + adversarial (powers-of-ten, sub-normal-magnitude) data.
+use std::sync::OnceLock;
+// Decimal exponents covered by the table (1e-323 ..= 1e308); slot = exp - POW10_MIN_EXP.
+const POW10_MIN_EXP: i32 = -323; // smallest tabulated exponent
+const POW10_MAX_EXP: i32 = 308; // largest tabulated exponent
+const POW10_LEN: usize = (POW10_MAX_EXP - POW10_MIN_EXP + 1) as usize; // inclusive span: 632 slots
+/// Lazily-built lookup table of f64 powers of ten, one slot per exponent in
+/// `POW10_MIN_EXP..=POW10_MAX_EXP` (slot = `exp - POW10_MIN_EXP`).
+/// Every entry is produced by parsing the decimal literal `1e{exp}`, i.e. the
+/// correctly-rounded conversion (the one Python applies to `10 ** exp`).
+fn pow10_table() -> &'static [f64; POW10_LEN] {
+    static TABLE: OnceLock<[f64; POW10_LEN]> = OnceLock::new();
+    TABLE.get_or_init(|| {
+        std::array::from_fn(|slot| {
+            let exp = slot as i32 + POW10_MIN_EXP;
+            let literal = format!("1e{exp}");
+            literal
+                .parse::<f64>()
+                .expect("decimal power of ten parses")
+        })
+    })
+}
+/// Returns 10^exp as an f64: the tabulated, correctly-rounded value when `exp`
+/// lies inside the table, matching Python's `10 ** exp`.
+fn pow10(exp: i32) -> f64 {
+    match exp - POW10_MIN_EXP {
+        slot if (0..POW10_LEN as i32).contains(&slot) => pow10_table()[slot as usize],
+        // Exponent outside the table (extreme sub-normals): compute it
+        // directly instead of panicking on an out-of-range index. Such
+        // magnitudes never appear in a real Benford column.
+        _ => 10f64.powi(exp),
+    }
+}
+/// Histogram of leading significant digits for the Benford conformance check.
+///
+/// Slot `i` of the result counts the finite, strictly-positive inputs whose
+/// leading significant digit equals `i + 1`. Zero, negative, NaN and infinite
+/// inputs contribute nothing, mirroring the Python reference.
+pub fn benford_leading_digits(values: &[f64]) -> [u64; 9] {
+    let mut histogram = [0u64; 9];
+    let eligible = values
+        .iter()
+        .copied()
+        .filter(|x| x.is_finite() && *x > 0.0);
+    for x in eligible {
+        let magnitude = x.log10().floor() as i32;
+        // The cast truncates toward zero; the quotient is positive here.
+        let leading = (x / pow10(magnitude)) as i32;
+        match leading {
+            1..=9 => histogram[(leading - 1) as usize] += 1,
+            // Anything else (e.g. 0 or 10 from a rounded log10) is dropped.
+            _ => {}
+        }
+    }
+    histogram
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[test]
+    fn basic_digits() {
+        // 1.5 and 19.0 lead with 1, 200.0 with 2, 9.9 with 9; 0.0, -5.0, NaN and +inf are skipped.
+        let v = [1.5, 9.9, 19.0, 200.0, 0.0, -5.0, f64::NAN, f64::INFINITY];
+        let c = benford_leading_digits(&v);
+        assert_eq!(c[0], 2); // digit 1: 1.5, 19.0
+        assert_eq!(c[1], 1); // digit 2: 200.0
+        assert_eq!(c[8], 1); // digit 9: 9.9
+        assert_eq!(c.iter().sum::<u64>(), 4); // only the four eligible values are counted
+    }
+    #[test]
+    fn empty_is_all_zero() {
+        assert_eq!(benford_leading_digits(&[]), [0u64; 9]); // no input -> every bucket stays 0
+    }
+}

goldencheck_native-0.1.0/goldencheck-core/src/fuzzy.rs ADDED Viewed

@@ -0,0 +1,227 @@
+//! Fuzzy near-duplicate VALUE clustering.
+//!
+//! Given the distinct values of a (categorical/string) column, find clusters of
+//! values that are edit-distance-close -- inconsistent encodings of the same
+//! thing: `"California"` / `"Californa"` / `"CALIFORNIA "`, or `"Jon"` / `"John"`.
+//! This complements `relations/approx_duplicate.py` (which catches values that
+//! are *equal* after normalization); here the values differ even after
+//! normalization but are typo-close.
+//!
+//! Whole-ROW fuzzy matching is deliberately out of scope -- that's entity
+//! resolution (GoldenMatch's job). This stays at the value level: a bounded,
+//! per-column data-quality check.
+//!
+//! Algorithm (blocking + scoring + union-find):
+//!   - **Blocking** generates candidate pairs cheaply so we never do the full
+//!     O(n^2) comparison: two values are candidates if they share a character
+//!     trigram OR the same first-two-character prefix (the prefix block catches
+//!     short strings like `jon`/`john`, which share no trigram). Over-common
+//!     blocks (size > `MAX_BLOCK`) are skipped to bound the work.
+//!   - **Scoring** each candidate pair with a Levenshtein similarity ratio
+//!     `1 - dist/max(len_a, len_b)` on the normalized (lowercased, whitespace-
+//!     collapsed) form. Pairs >= `min_similarity` are linked.
+//!   - **Union-find** groups linked values into clusters; clusters of size >= 2
+//!     are returned (as index lists into the input `values`).
+//!
+//! Pairwise edit distance is the part that is painfully slow in Python and fast
+//! here -- this kernel genuinely beats a pure-Python fallback.
+use rustc_hash::{FxHashMap, FxHashSet};
+const MAX_BLOCK: usize = 300; // blocking buckets with more entries than this are skipped
+/// Minimum normalized length (in chars) for a value to take part in blocking.
+/// Shorter values are left out of both the trigram and the prefix buckets, so
+/// they are never compared or clustered (avoids pairing 1-2 char codes).
+const MIN_LEN_FOR_FUZZY: usize = 3;
+/// Canonical form used for matching: lowercased, with every run of whitespace
+/// squeezed to a single ASCII space and no leading or trailing whitespace.
+/// (Punctuation is left untouched -- removing it belongs to the
+/// exact-after-normalization duplicate profiler; this kernel measures edit
+/// distance.)
+fn normalize(s: &str) -> String {
+    let lowered = s.to_lowercase();
+    let mut folded = String::with_capacity(lowered.len());
+    // `split_whitespace` yields only the non-whitespace runs, so joining them
+    // with one space trims both ends and collapses the interior in one pass.
+    for word in lowered.split_whitespace() {
+        if !folded.is_empty() {
+            folded.push(' ');
+        }
+        folded.push_str(word);
+    }
+    folded
+}
+/// Every run of three consecutive chars in `chars`, in order of appearance.
+/// Inputs shorter than three chars produce an empty vector.
+fn char_trigrams(chars: &[char]) -> Vec<[char; 3]> {
+    chars
+        .windows(3)
+        .map(|window| [window[0], window[1], window[2]])
+        .collect()
+}
+/// Levenshtein edit distance (unit-cost insert / delete / substitute) between
+/// two char slices, computed with a rolling pair of DP rows.
+fn levenshtein(a: &[char], b: &[char]) -> usize {
+    // Against an empty string the distance is just the other length.
+    match (a.len(), b.len()) {
+        (0, other) | (other, 0) => return other,
+        _ => {}
+    }
+    let width = b.len() + 1;
+    // `above` is the finished row for the previous char of `a`; `row` is the
+    // one being filled in.
+    let mut above: Vec<usize> = (0..width).collect();
+    let mut row = vec![0usize; width];
+    for (r, left) in a.iter().enumerate() {
+        row[0] = r + 1;
+        for (c, top) in b.iter().enumerate() {
+            let substitute = above[c] + usize::from(left != top);
+            let delete = above[c + 1] + 1;
+            let insert = row[c] + 1;
+            row[c + 1] = delete.min(insert).min(substitute);
+        }
+        std::mem::swap(&mut above, &mut row);
+    }
+    above[width - 1]
+}
+/// Levenshtein similarity ratio `1 - dist / max(len_a, len_b)` in `[0, 1]`;
+/// two empty inputs count as identical (1.0).
+fn similarity(a: &[char], b: &[char]) -> f64 {
+    match a.len().max(b.len()) {
+        0 => 1.0,
+        longest => 1.0 - (levenshtein(a, b) as f64) / (longest as f64),
+    }
+}
+/// Disjoint-set forest over the indices `0..n`; `parent[i] == i` marks a root.
+struct UnionFind {
+    parent: Vec<usize>,
+}
+impl UnionFind {
+    /// Start with `n` singleton sets, each index being its own root.
+    fn new(n: usize) -> Self {
+        let parent: Vec<usize> = (0..n).collect();
+        Self { parent }
+    }
+    /// Root of the set containing `x`. While walking up, each visited node is
+    /// re-pointed at its grandparent, shortening later lookups.
+    fn find(&mut self, mut x: usize) -> usize {
+        loop {
+            let up = self.parent[x];
+            if up == x {
+                return x;
+            }
+            let grand = self.parent[up];
+            self.parent[x] = grand;
+            x = grand;
+        }
+    }
+    /// Merge the sets containing `a` and `b`; the root of `a`'s set is
+    /// attached beneath the root of `b`'s set.
+    fn union(&mut self, a: usize, b: usize) {
+        let root_a = self.find(a);
+        let root_b = self.find(b);
+        if root_a != root_b {
+            self.parent[root_a] = root_b;
+        }
+    }
+}
+/// Cluster the distinct `values` into groups of edit-distance-close strings.
+///
+/// Each value is normalized, candidate pairs are generated by blocking (shared
+/// character trigram, or shared two-character prefix), every candidate pair is
+/// scored with `similarity`, and pairs scoring at least `min_similarity` are
+/// merged with a union-find.
+///
+/// Values whose normalized form is shorter than `MIN_LEN_FOR_FUZZY` chars take
+/// no part in blocking, and buckets holding fewer than 2 or more than
+/// `MAX_BLOCK` values are skipped to bound the work.
+///
+/// Returns the clusters of size >= 2, each a sorted list of indices into
+/// `values`, with the clusters themselves in sorted order. The result is empty
+/// when `values` has fewer than two entries or when no pair was linked.
+pub fn near_duplicate_clusters(values: &[String], min_similarity: f64) -> Vec<Vec<usize>> {
+    let n = values.len();
+    if n < 2 {
+        return Vec::new();
+    }
+    // Normalize once; keep char vectors for distance + trigrams.
+    let norm: Vec<Vec<char>> = values
+        .iter()
+        .map(|v| normalize(v).chars().collect())
+        .collect();
+    // Blocking buckets: trigram -> indices, and 2-char-prefix -> indices.
+    let mut trigram_buckets: FxHashMap<[char; 3], Vec<usize>> = FxHashMap::default();
+    let mut prefix_buckets: FxHashMap<[char; 2], Vec<usize>> = FxHashMap::default();
+    for (i, chars) in norm.iter().enumerate() {
+        if chars.len() < MIN_LEN_FOR_FUZZY {
+            continue;
+        }
+        for tg in char_trigrams(chars) {
+            let bucket = trigram_buckets.entry(tg).or_default();
+            // A value can contain the same trigram several times ("aaaa").
+            // Record it once per bucket: repeats would inflate the bucket
+            // size checked against MAX_BLOCK (skipping blocks that hold only a
+            // few distinct values) and would produce self-pairs. Indices
+            // arrive in ascending order, so a repeat is always the last entry.
+            if bucket.last() != Some(&i) {
+                bucket.push(i);
+            }
+        }
+        prefix_buckets
+            .entry([chars[0], chars[1]])
+            .or_default()
+            .push(i);
+    }
+    // Candidate pairs (i < j), de-duplicated across both blocking strategies.
+    let mut candidates: FxHashSet<(usize, usize)> = FxHashSet::default();
+    for bucket in trigram_buckets.values().chain(prefix_buckets.values()) {
+        if bucket.len() < 2 || bucket.len() > MAX_BLOCK {
+            continue; // singleton or over-common block (bounds the work)
+        }
+        // Buckets hold strictly ascending indices, so every pair taken in
+        // positional order already satisfies i < j.
+        for (pos, &i) in bucket.iter().enumerate() {
+            for &j in &bucket[pos + 1..] {
+                candidates.insert((i, j));
+            }
+        }
+    }
+    let mut uf = UnionFind::new(n);
+    let mut linked = false;
+    for (i, j) in candidates {
+        if similarity(&norm[i], &norm[j]) >= min_similarity {
+            uf.union(i, j);
+            linked = true;
+        }
+    }
+    if !linked {
+        return Vec::new(); // nothing merged: skip the grouping pass
+    }
+    // Gather clusters of size >= 2.
+    let mut groups: FxHashMap<usize, Vec<usize>> = FxHashMap::default();
+    for i in 0..n {
+        let r = uf.find(i);
+        groups.entry(r).or_default().push(i);
+    }
+    let mut clusters: Vec<Vec<usize>> = groups.into_values().filter(|g| g.len() >= 2).collect();
+    // Sort members and then clusters so the output does not depend on hash
+    // iteration order.
+    for c in &mut clusters {
+        c.sort_unstable();
+    }
+    clusters.sort_unstable();
+    clusters
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    fn v(items: &[&str]) -> Vec<String> { // test helper: &[&str] -> owned Vec<String>
+        items.iter().map(|s| s.to_string()).collect()
+    }
+    #[test]
+    fn clusters_typos_and_case() {
+        let values = v(&["California", "Californa", "CALIFORNIA", "Texas", "New York"]);
+        let clusters = near_duplicate_clusters(&values, 0.8);
+        // The three California variants cluster; Texas / New York stand alone.
+        assert_eq!(clusters.len(), 1);
+        assert_eq!(clusters[0], vec![0, 1, 2]); // indices into `values`, sorted
+    }
+    #[test]
+    fn short_string_typo_via_prefix_block() {
+        let values = v(&["Jon", "John", "Jane"]); // jon/john share no trigram, only the "jo" prefix
+        let clusters = near_duplicate_clusters(&values, 0.7);
+        assert_eq!(clusters, vec![vec![0, 1]]); // jon/john; jane separate
+    }
+    #[test]
+    fn nothing_when_all_distinct() {
+        let values = v(&["apple", "banana", "cherry"]);
+        assert!(near_duplicate_clusters(&values, 0.8).is_empty()); // no pair reaches 0.8
+    }
+    #[test]
+    fn empty_and_singleton() {
+        assert!(near_duplicate_clusters(&[], 0.8).is_empty()); // fewer than two values
+        assert!(near_duplicate_clusters(&v(&["x"]), 0.8).is_empty()); // a single value cannot pair
+    }
+}