goldencheck-native 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,36 @@
1
+ Metadata-Version: 2.4
2
+ Name: goldencheck-native
3
+ Version: 0.1.0
4
+ Classifier: Programming Language :: Rust
5
+ Classifier: Programming Language :: Python :: 3 :: Only
6
+ Classifier: License :: OSI Approved :: MIT License
7
+ Classifier: Intended Audience :: Developers
8
+ Classifier: Topic :: Scientific/Engineering
9
+ Summary: Optional native (Rust/PyO3) acceleration kernels for goldencheck
10
+ Author-email: Ben Severn <ben@bensevern.dev>
11
+ License: MIT
12
+ Requires-Python: >=3.11
13
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
14
+ Project-URL: Homepage, https://github.com/benseverndev-oss/goldenmatch
15
+
16
+ # goldencheck-native
17
+
18
+ Optional native (Rust/PyO3) acceleration kernels for
19
+ [GoldenCheck](https://github.com/benseverndev-oss/goldenmatch/tree/main/packages/python/goldencheck).
20
+
21
+ `goldencheck` is a pure-Python wheel; this package ships the compiled abi3
22
+ `_native` extension that accelerates GoldenCheck's CPU-bound deep-profiling work
23
+ (Benford conformance, composite-key and functional-dependency mining). It is
24
+ pulled in via:
25
+
26
+ ```bash
27
+ pip install goldencheck[native]
28
+ ```
29
+
30
+ You never import this directly -- `goldencheck.core._native_loader` discovers it
31
+ and falls back to the pure-Python paths when it isn't installed. Behaviour is
32
+ identical either way; the native path only changes wall-clock time.
33
+
34
+ The kernels live in the pyo3-free `goldencheck-core` crate; this crate is the
35
+ thin Arrow-reading PyO3 shim over it.
36
+
@@ -0,0 +1,20 @@
1
+ # goldencheck-native
2
+
3
+ Optional native (Rust/PyO3) acceleration kernels for
4
+ [GoldenCheck](https://github.com/benseverndev-oss/goldenmatch/tree/main/packages/python/goldencheck).
5
+
6
+ `goldencheck` is a pure-Python wheel; this package ships the compiled abi3
7
+ `_native` extension that accelerates GoldenCheck's CPU-bound deep-profiling work
8
+ (Benford conformance, composite-key and functional-dependency mining). It is
9
+ pulled in via:
10
+
11
+ ```bash
12
+ pip install goldencheck[native]
13
+ ```
14
+
15
+ You never import this directly -- `goldencheck.core._native_loader` discovers it
16
+ and falls back to the pure-Python paths when it isn't installed. Behaviour is
17
+ identical either way; the native path only changes wall-clock time.
18
+
19
+ The kernels live in the pyo3-free `goldencheck-core` crate; this crate is the
20
+ thin Arrow-reading PyO3 shim over it.
@@ -0,0 +1,25 @@
1
+ # goldencheck-core -- pyo3-free kernel library for GoldenCheck's CPU-bound
2
+ # deep-profiling work. The `score-core` analogue: pure compute over slices, no
3
+ # Python, no Arrow. The pyo3 + Arrow C Data Interface shims live in the sibling
4
+ # `goldencheck-native` crate, which path-depends on this one. Keeping the kernel
5
+ # logic pyo3-free means it can later back a DuckDB/DataFusion SQL surface (as
6
+ # `score-core`/`graph-core` do for goldenmatch) without dragging in CPython.
7
+ [package]
8
+ name = "goldencheck-core"
9
+ version = "0.1.0"
10
+ edition = "2021"
11
+ license = "MIT"
12
+ authors = ["Ben Severn <ben@bensevern.dev>"]
13
+ description = "Pyo3-free deep-profiling kernels for GoldenCheck (Benford, composite-key & functional-dependency mining)"
14
+
15
+ [lib]
16
+ name = "goldencheck_core"
17
+
18
+ [dependencies]
19
+ # Fast, deterministic hashing for the combinatorial key/FD kernels. rustc-hash
20
+ # is already in the dependency graph (pulled by the goldenmatch native stack),
21
+ # so this adds no new crates.io fetch.
22
+ rustc-hash = "2"
23
+
24
+ [dev-dependencies]
25
+ # Property/parity checks for the Benford leading-digit extraction.
@@ -0,0 +1,110 @@
1
+ //! Benford leading-digit histogram.
2
+ //!
3
+ //! Behaviour-exact replacement for `_extract_leading_digits` +
4
+ //! `Counter(...)` in `goldencheck/baseline/statistical.py` (and the identical
5
+ //! loop in `goldencheck/drift/detector.py`). The Python reference, per value:
6
+ //!
7
+ //! ```python
8
+ //! if v <= 0 or not math.isfinite(v):
9
+ //! continue
10
+ //! exp = math.floor(math.log10(v))
11
+ //! normalised = v / (10 ** exp)
12
+ //! d = int(normalised) # truncates toward zero; v > 0
13
+ //! if 1 <= d <= 9:
14
+ //! digits.append(d)
15
+ //! ```
16
+ //!
17
+ //! We return the per-digit counts for 1..=9 directly (the `Counter` the Python
18
+ //! caller builds), so the caller's chi-squared step is unchanged.
19
+ //!
20
+ //! Parity subtlety: Python divides by `10 ** exp`, where `exp` is an `int`, so
21
+ //! the divisor is the *correctly-rounded* f64 nearest to 10^exp (Python forms an
22
+ //! exact bignum for exp >= 0, then rounds to f64 on the division). Rust's
23
+ //! `10f64.powi(exp)` instead accumulates rounding and disagrees at large
24
+ //! exponents (e.g. 1e300 -> the quotient drifts off 1.0, dropping a digit-1
25
+ //! count). We therefore divide by a precomputed table of correctly-rounded
26
+ //! powers of ten -- `"1e{exp}".parse::<f64>()` yields the exact same f64 as
27
+ //! Python's `10 ** exp` for every reachable exponent (verified -323..=308). The
28
+ //! `log10().floor()` step already agrees with `math.log10` (shared libm), so
29
+ //! this makes the histogram byte-identical; the goldencheck parity test asserts
30
+ //! it on random + adversarial (powers-of-ten, sub-normal-magnitude) data.
31
+
32
+ use std::sync::OnceLock;
33
+
34
+ // f64 decimal exponents span roughly 1e-323 .. 1e308; index = exp - POW10_MIN_EXP.
35
+ const POW10_MIN_EXP: i32 = -323;
36
+ const POW10_MAX_EXP: i32 = 308;
37
+ const POW10_LEN: usize = (POW10_MAX_EXP - POW10_MIN_EXP + 1) as usize;
38
+
39
+ /// Correctly-rounded f64 powers of ten, indexed by `exp - POW10_MIN_EXP`.
40
+ /// Built once by parsing the decimal literal `1e{exp}` (the same correctly-
41
+ /// rounded conversion Python applies to `10 ** exp`).
42
+ fn pow10_table() -> &'static [f64; POW10_LEN] {
43
+ static TABLE: OnceLock<[f64; POW10_LEN]> = OnceLock::new();
44
+ TABLE.get_or_init(|| {
45
+ let mut t = [0.0f64; POW10_LEN];
46
+ for (i, slot) in t.iter_mut().enumerate() {
47
+ let exp = i as i32 + POW10_MIN_EXP;
48
+ *slot = format!("1e{exp}")
49
+ .parse::<f64>()
50
+ .expect("decimal power of ten parses");
51
+ }
52
+ t
53
+ })
54
+ }
55
+
56
+ /// The correctly-rounded f64 value of 10^exp, matching Python's `10 ** exp`.
57
+ fn pow10(exp: i32) -> f64 {
58
+ let idx = exp - POW10_MIN_EXP;
59
+ if (0..POW10_LEN as i32).contains(&idx) {
60
+ pow10_table()[idx as usize]
61
+ } else {
62
+ // Outside the representable normal range (extreme sub-normals); fall
63
+ // back rather than panic. Such magnitudes never appear in a real
64
+ // Benford column.
65
+ 10f64.powi(exp)
66
+ }
67
+ }
68
+
69
+ /// Leading-digit (1..=9) counts for the Benford conformance check.
70
+ ///
71
+ /// `out[i]` is the number of finite, strictly-positive values whose leading
72
+ /// significant digit is `i + 1`. Non-positive and non-finite values are
73
+ /// skipped, exactly as the Python reference does.
74
+ pub fn benford_leading_digits(values: &[f64]) -> [u64; 9] {
75
+ let mut counts = [0u64; 9];
76
+ for &v in values {
77
+ if v <= 0.0 || !v.is_finite() {
78
+ continue;
79
+ }
80
+ let exp = v.log10().floor() as i32;
81
+ let normalised = v / pow10(exp);
82
+ // `as i32` truncates toward zero; `normalised` is > 0 here.
83
+ let d = normalised as i32;
84
+ if (1..=9).contains(&d) {
85
+ counts[(d - 1) as usize] += 1;
86
+ }
87
+ }
88
+ counts
89
+ }
90
+
91
#[cfg(test)]
mod tests {
    use super::benford_leading_digits;

    /// Mixed input: 1.5 and 19.0 lead with 1, 200.0 with 2, 9.9 with 9; zero,
    /// the negative value, NaN and infinity must not be counted at all.
    #[test]
    fn basic_digits() {
        let counts =
            benford_leading_digits(&[1.5, 9.9, 19.0, 200.0, 0.0, -5.0, f64::NAN, f64::INFINITY]);
        assert_eq!(counts[0], 2);
        assert_eq!(counts[1], 1);
        assert_eq!(counts[8], 1);
        let total: u64 = counts.iter().sum();
        assert_eq!(total, 4);
    }

    /// No input values means every digit count stays at zero.
    #[test]
    fn empty_is_all_zero() {
        let nothing: [f64; 0] = [];
        assert_eq!(benford_leading_digits(&nothing), [0u64; 9]);
    }
}
@@ -0,0 +1,227 @@
1
+ //! Fuzzy near-duplicate VALUE clustering.
2
+ //!
3
+ //! Given the distinct values of a (categorical/string) column, find clusters of
4
+ //! values that are edit-distance-close -- inconsistent encodings of the same
5
+ //! thing: `"California"` / `"Californa"` / `"CALIFORNIA "`, or `"Jon"` / `"John"`.
6
+ //! This complements `relations/approx_duplicate.py` (which catches values that
7
+ //! are *equal* after normalization); here the values differ even after
8
+ //! normalization but are typo-close.
9
+ //!
10
+ //! Whole-ROW fuzzy matching is deliberately out of scope -- that's entity
11
+ //! resolution (GoldenMatch's job). This stays at the value level: a bounded,
12
+ //! per-column data-quality check.
13
+ //!
14
+ //! Algorithm (blocking + scoring + union-find):
15
+ //! - **Blocking** generates candidate pairs cheaply so we never do the full
16
+ //! O(n^2) comparison: two values are candidates if they share a character
17
+ //! trigram OR the same first-two-character prefix (the prefix block catches
18
+ //! short strings like `jon`/`john`, which share no trigram). Over-common
19
+ //! blocks (size > `MAX_BLOCK`) are skipped to bound the work.
20
+ //! - **Scoring** each candidate pair with a Levenshtein similarity ratio
21
+ //! `1 - dist/max(len_a, len_b)` on the normalized (lowercased, whitespace-
22
+ //! collapsed) form. Pairs >= `min_similarity` are linked.
23
+ //! - **Union-find** groups linked values into clusters; clusters of size >= 2
24
+ //! are returned (as index lists into the input `values`).
25
+ //!
26
+ //! Pairwise edit distance is the part that is painfully slow in Python and fast
27
+ //! here -- this kernel genuinely beats a pure-Python fallback.
28
+
29
+ use rustc_hash::{FxHashMap, FxHashSet};
30
+
31
+ const MAX_BLOCK: usize = 300;
32
+ /// Minimum normalized length (in chars) for fuzzy matching. Shorter values are
33
+ /// skipped by both the trigram and the prefix block, so they are never paired;
34
+ /// this avoids linking near-everything (e.g. 1-2 char codes).
35
+ const MIN_LEN_FOR_FUZZY: usize = 3;
36
+
37
/// Canonical form used for matching: lowercased, with every run of whitespace
/// reduced to a single space and no leading or trailing whitespace.
///
/// Punctuation is left untouched; only case and whitespace are normalized
/// before edit distance is measured.
fn normalize(s: &str) -> String {
    s.to_lowercase()
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
}
60
+
61
/// Every run of three consecutive chars in `chars`, in order of appearance
/// (repeats included). Inputs shorter than three chars yield an empty vector.
fn char_trigrams(chars: &[char]) -> Vec<[char; 3]> {
    chars
        .windows(3)
        .map(|window| [window[0], window[1], window[2]])
        .collect()
}
69
+
70
/// Levenshtein edit distance between two char slices: the minimum number of
/// single-char insertions, deletions and substitutions turning `a` into `b`.
///
/// Dynamic programme over one rolling row of `b.len() + 1` cells.
fn levenshtein(a: &[char], b: &[char]) -> usize {
    match (a.len(), b.len()) {
        (0, only_b) => return only_b,
        (only_a, 0) => return only_a,
        _ => {}
    }
    // row[j] = distance between the consumed prefix of `a` and `b[..j]`.
    let mut row: Vec<usize> = (0..=b.len()).collect();
    for (consumed, &left) in a.iter().enumerate() {
        // `diagonal` carries the previous row's cell at column j.
        let mut diagonal = row[0];
        row[0] = consumed + 1;
        for (j, &right) in b.iter().enumerate() {
            let above = row[j + 1];
            let substitute = diagonal + usize::from(left != right);
            row[j + 1] = (above + 1).min(row[j] + 1).min(substitute);
            diagonal = above;
        }
    }
    row[b.len()]
}
90
+
91
+ fn similarity(a: &[char], b: &[char]) -> f64 {
92
+ let maxlen = a.len().max(b.len());
93
+ if maxlen == 0 {
94
+ return 1.0;
95
+ }
96
+ 1.0 - (levenshtein(a, b) as f64) / (maxlen as f64)
97
+ }
98
+
99
/// Disjoint-set forest over the indices `0..n`, stored as a parent array.
struct UnionFind {
    /// `parent[i] == i` marks `i` as the representative of its set.
    parent: Vec<usize>,
}

impl UnionFind {
    /// Start with `n` singleton sets, each index its own representative.
    fn new(n: usize) -> Self {
        let parent: Vec<usize> = (0..n).collect();
        Self { parent }
    }

    /// Representative of the set containing `x`. While walking up, each
    /// visited node is re-pointed at its grandparent (path halving).
    fn find(&mut self, x: usize) -> usize {
        let mut node = x;
        loop {
            let up = self.parent[node];
            if up == node {
                return node;
            }
            let grand = self.parent[up];
            self.parent[node] = grand;
            node = grand;
        }
    }

    /// Merge the sets of `a` and `b`; the representative of `a`'s set is
    /// attached beneath the representative of `b`'s set.
    fn union(&mut self, a: usize, b: usize) {
        let root_a = self.find(a);
        let root_b = self.find(b);
        if root_a != root_b {
            self.parent[root_a] = root_b;
        }
    }
}
122
+
123
+ /// Cluster the distinct `values` into groups of edit-distance-close strings.
124
+ /// Returns clusters (each a sorted list of indices into `values`) of size >= 2.
125
+ pub fn near_duplicate_clusters(values: &[String], min_similarity: f64) -> Vec<Vec<usize>> {
126
+ let n = values.len();
127
+ if n < 2 {
128
+ return Vec::new();
129
+ }
130
+ // Normalize once; keep char vectors for distance + trigrams.
131
+ let norm: Vec<Vec<char>> = values
132
+ .iter()
133
+ .map(|v| normalize(v).chars().collect())
134
+ .collect();
135
+
136
+ // Blocking buckets: trigram -> indices, and 2-char-prefix -> indices.
137
+ let mut trigram_buckets: FxHashMap<[char; 3], Vec<usize>> = FxHashMap::default();
138
+ let mut prefix_buckets: FxHashMap<[char; 2], Vec<usize>> = FxHashMap::default();
139
+ for (i, chars) in norm.iter().enumerate() {
140
+ if chars.len() < MIN_LEN_FOR_FUZZY {
141
+ continue;
142
+ }
143
+ for tg in char_trigrams(chars) {
144
+ trigram_buckets.entry(tg).or_default().push(i);
145
+ }
146
+ prefix_buckets
147
+ .entry([chars[0], chars[1]])
148
+ .or_default()
149
+ .push(i);
150
+ }
151
+
152
+ // Candidate pairs (i < j), de-duplicated across both blocking strategies.
153
+ let mut candidates: FxHashSet<(usize, usize)> = FxHashSet::default();
154
+ for bucket in trigram_buckets.values().chain(prefix_buckets.values()) {
155
+ if bucket.len() < 2 || bucket.len() > MAX_BLOCK {
156
+ continue; // singleton or over-common block (bounds the work)
157
+ }
158
+ for a in 0..bucket.len() {
159
+ for b in (a + 1)..bucket.len() {
160
+ let (i, j) = (bucket[a], bucket[b]);
161
+ candidates.insert(if i < j { (i, j) } else { (j, i) });
162
+ }
163
+ }
164
+ }
165
+
166
+ let mut uf = UnionFind::new(n);
167
+ let mut linked = false;
168
+ for (i, j) in candidates {
169
+ if similarity(&norm[i], &norm[j]) >= min_similarity {
170
+ uf.union(i, j);
171
+ linked = true;
172
+ }
173
+ }
174
+ if !linked {
175
+ return Vec::new();
176
+ }
177
+
178
+ // Gather clusters of size >= 2.
179
+ let mut groups: FxHashMap<usize, Vec<usize>> = FxHashMap::default();
180
+ for i in 0..n {
181
+ let r = uf.find(i);
182
+ groups.entry(r).or_default().push(i);
183
+ }
184
+ let mut clusters: Vec<Vec<usize>> = groups.into_values().filter(|g| g.len() >= 2).collect();
185
+ for c in &mut clusters {
186
+ c.sort_unstable();
187
+ }
188
+ clusters.sort_unstable();
189
+ clusters
190
+ }
191
+
192
#[cfg(test)]
mod tests {
    use super::near_duplicate_clusters;

    /// Build an owned `Vec<String>` from string literals.
    fn owned(items: &[&str]) -> Vec<String> {
        items.iter().copied().map(String::from).collect()
    }

    /// A typo and a case variant join the canonical spelling; the unrelated
    /// values stay out of any cluster.
    #[test]
    fn clusters_typos_and_case() {
        let column = owned(&["California", "Californa", "CALIFORNIA", "Texas", "New York"]);
        let found = near_duplicate_clusters(&column, 0.8);
        assert_eq!(found.len(), 1);
        assert_eq!(found[0], vec![0, 1, 2]);
    }

    /// "jon" and "john" share no trigram, so this pair can only become a
    /// candidate through the two-character prefix block.
    #[test]
    fn short_string_typo_via_prefix_block() {
        let column = owned(&["Jon", "John", "Jane"]);
        let found = near_duplicate_clusters(&column, 0.7);
        assert_eq!(found, vec![vec![0, 1]]); // jon/john; jane separate
    }

    /// Clearly different values produce no cluster at all.
    #[test]
    fn nothing_when_all_distinct() {
        let column = owned(&["apple", "banana", "cherry"]);
        let found = near_duplicate_clusters(&column, 0.8);
        assert!(found.is_empty());
    }

    /// Fewer than two values can never form a cluster.
    #[test]
    fn empty_and_singleton() {
        assert!(near_duplicate_clusters(&[], 0.8).is_empty());
        let single = owned(&["x"]);
        assert!(near_duplicate_clusters(&single, 0.8).is_empty());
    }
}