dumpling-cli 0.4.1__tar.gz → 0.4.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/CHANGELOG.md +15 -0
  2. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/Cargo.lock +1 -1
  3. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/Cargo.toml +1 -1
  4. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/PKG-INFO +1 -1
  5. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/pyproject.toml +1 -1
  6. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/src/filter.rs +234 -5
  7. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/src/sql.rs +192 -2
  8. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/.dumplingconf.example +0 -0
  9. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/.github/workflows/ci.yml +0 -0
  10. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/.github/workflows/docs-pr.yml +0 -0
  11. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/.github/workflows/docs.yml +0 -0
  12. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/.github/workflows/platform-compat-latest.yml +0 -0
  13. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/.github/workflows/platform-compat-matrix.yml +0 -0
  14. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/.github/workflows/policy-lint.yml +0 -0
  15. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/.github/workflows/publish.yml +0 -0
  16. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/.github/workflows/release.yml +0 -0
  17. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/.github/workflows/tests.yml +0 -0
  18. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/.gitignore +0 -0
  19. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/AGENTS.md +0 -0
  20. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/CONTRIBUTING.md +0 -0
  21. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/MAINTENANCE.md +0 -0
  22. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/README.md +0 -0
  23. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/assets/logo.svg +0 -0
  24. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/book.toml +0 -0
  25. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/datetime_out.sql +0 -0
  26. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/datetime_sample.sql +0 -0
  27. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/docs/src/SUMMARY.md +0 -0
  28. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/docs/src/ci-guardrails.md +0 -0
  29. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/docs/src/configuration.md +0 -0
  30. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/docs/src/getting-started.md +0 -0
  31. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/docs/src/index.md +0 -0
  32. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/docs/src/releasing.md +0 -0
  33. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/rust-toolchain.toml +0 -0
  34. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/scripts/setup-dev.sh +0 -0
  35. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/src/faker_dispatch.rs +0 -0
  36. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/src/lint.rs +0 -0
  37. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/src/main.rs +0 -0
  38. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/src/report.rs +0 -0
  39. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/src/scan.rs +0 -0
  40. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/src/settings.rs +0 -0
  41. {dumpling_cli-0.4.1 → dumpling_cli-0.4.3}/src/transform.rs +0 -0
@@ -7,6 +7,19 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.4.3] - 2026-05-03
11
+
12
+ ### Fixed
13
+
14
+ - **COPY row integrity after anonymization**: Control characters in anonymized COPY text fields are escaped so tab/newline/etc. cannot break column alignment or row boundaries ([#53](https://github.com/ababic/dumpling/pull/53)).
15
+
16
+ ## [0.4.2] - 2026-05-03
17
+
18
+ ### Fixed
19
+
20
+ - **JSON path rules on non-JSON cells**: Path-based `[rules]` anonymization is skipped when the cell is not strict JSON, leaving the original value unchanged (consistent with row-filter JSON path behavior).
21
+ - **JSON scalar types in path-based anonymization**: Replacements at JSON paths preserve number and boolean leaf types where possible (numeric and boolean coercion from generated text).
22
+
10
23
  ## [0.4.1] - 2026-05-03
11
24
 
12
25
  ### Fixed
@@ -61,6 +74,8 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht
61
74
  - Configurable output scan severities and per-category thresholds via `[output_scan]`.
62
75
  - JSON report section for output scan findings including category, count, threshold, severity, and sample locations.
63
76
 
77
+ [0.4.3]: https://github.com/ababic/dumpling/compare/v0.4.2...v0.4.3
78
+ [0.4.2]: https://github.com/ababic/dumpling/compare/v0.4.1...v0.4.2
64
79
  [0.4.1]: https://github.com/ababic/dumpling/compare/v0.4.0...v0.4.1
65
80
  [0.4.0]: https://github.com/ababic/dumpling/compare/v0.3.0...v0.4.0
66
81
  [0.3.0]: https://github.com/ababic/dumpling/compare/v0.2.0...v0.3.0
@@ -262,7 +262,7 @@ dependencies = [
262
262
 
263
263
  [[package]]
264
264
  name = "dumpling"
265
- version = "0.4.1"
265
+ version = "0.4.3"
266
266
  dependencies = [
267
267
  "anyhow",
268
268
  "chrono",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "dumpling"
3
- version = "0.4.1"
3
+ version = "0.4.3"
4
4
  edition = "2021"
5
5
  readme = "README.md"
6
6
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dumpling-cli
3
- Version: 0.4.1
3
+ Version: 0.4.3
4
4
  Classifier: Development Status :: 4 - Beta
5
5
  Classifier: Environment :: Console
6
6
  Classifier: Intended Audience :: Developers
@@ -4,7 +4,7 @@ build-backend = "maturin"
4
4
 
5
5
  [project]
6
6
  name = "dumpling-cli"
7
- version = "0.4.1"
7
+ version = "0.4.3"
8
8
  description = "Static anonymizer for plain SQL dumps (PostgreSQL, SQLite, SQL Server)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -224,25 +224,118 @@ fn replacement_to_json_value(repl: &Replacement) -> serde_json::Value {
224
224
  .unwrap_or_else(|_| serde_json::Value::String(repl.value.clone()))
225
225
  }
226
226
 
227
+ /// When rewriting JSON at a path, map `Replacement` back into [`serde_json::Value`] while keeping
228
+ /// the leaf's JSON type when the strategy still returns text (e.g. `Replacement::quoted` for
229
+ /// `string`, `hash`, etc.): numeric and boolean leaves stay JSON numbers/bools if the replacement
230
+ /// text parses as such.
231
+ fn coerce_json_path_replacement(
232
+ original: &serde_json::Value,
233
+ repl: &Replacement,
234
+ ) -> serde_json::Value {
235
+ if repl.is_null {
236
+ return serde_json::Value::Null;
237
+ }
238
+ match original {
239
+ serde_json::Value::Bool(_) => {
240
+ if let Some(b) = parse_loose_json_bool(&repl.value) {
241
+ return serde_json::Value::Bool(b);
242
+ }
243
+ if !repl.force_quoted {
244
+ if let Ok(v) = serde_json::from_str::<serde_json::Value>(&repl.value) {
245
+ match v {
246
+ serde_json::Value::Bool(b) => return serde_json::Value::Bool(b),
247
+ serde_json::Value::Number(n) => {
248
+ if n.as_u64() == Some(0) || n.as_i64() == Some(0) {
249
+ return serde_json::Value::Bool(false);
250
+ }
251
+ if n.as_u64() == Some(1) || n.as_i64() == Some(1) {
252
+ return serde_json::Value::Bool(true);
253
+ }
254
+ }
255
+ _ => {}
256
+ }
257
+ }
258
+ }
259
+ serde_json::Value::String(repl.value.clone())
260
+ }
261
+ serde_json::Value::Number(_) => {
262
+ if let Some(n) = parse_loose_json_number(&repl.value) {
263
+ return serde_json::Value::Number(n);
264
+ }
265
+ if !repl.force_quoted {
266
+ if let Ok(serde_json::Value::Number(n)) =
267
+ serde_json::from_str::<serde_json::Value>(&repl.value)
268
+ {
269
+ return serde_json::Value::Number(n);
270
+ }
271
+ }
272
+ serde_json::Value::String(repl.value.clone())
273
+ }
274
+ serde_json::Value::String(_) => {
275
+ if repl.force_quoted {
276
+ serde_json::Value::String(repl.value.clone())
277
+ } else {
278
+ serde_json::from_str(&repl.value)
279
+ .unwrap_or_else(|_| serde_json::Value::String(repl.value.clone()))
280
+ }
281
+ }
282
+ serde_json::Value::Null => replacement_to_json_value(repl),
283
+ serde_json::Value::Array(_) | serde_json::Value::Object(_) => {
284
+ replacement_to_json_value(repl)
285
+ }
286
+ }
287
+ }
288
+
289
+ fn parse_loose_json_bool(s: &str) -> Option<bool> {
290
+ match s.trim().to_ascii_lowercase().as_str() {
291
+ "true" => Some(true),
292
+ "false" => Some(false),
293
+ _ => None,
294
+ }
295
+ }
296
+
297
+ fn parse_loose_json_number(s: &str) -> Option<serde_json::Number> {
298
+ let t = s.trim();
299
+ if t.is_empty() {
300
+ return None;
301
+ }
302
+ if let Ok(i) = t.parse::<i64>() {
303
+ return Some(i.into());
304
+ }
305
+ if let Ok(u) = t.parse::<u64>() {
306
+ return Some(u.into());
307
+ }
308
+ let f = t.parse::<f64>().ok()?;
309
+ serde_json::Number::from_f64(f)
310
+ }
311
+
227
312
  fn apply_leaf_replacement(target: &mut serde_json::Value, repl: &Replacement) {
228
- *target = replacement_to_json_value(repl);
313
+ let original = target.clone();
314
+ *target = coerce_json_path_replacement(&original, repl);
229
315
  }
230
316
 
231
317
  /// Mutate JSON document strings at configured paths using the same path semantics as predicates.
318
+ ///
319
+ /// Returns [`None`] when `raw_json` is not valid strict JSON (same tolerance as row-filter JSON
320
+ /// path extraction): path rules are skipped for that cell and callers should passthrough the
321
+ /// original value unchanged.
232
322
  pub fn rewrite_json_paths_with_rules(
233
323
  registry: &AnonymizerRegistry,
234
324
  column_max_len: Option<usize>,
235
325
  json_rules: &[(Vec<String>, AnonymizerSpec)],
236
326
  raw_json: &str,
237
- ) -> anyhow::Result<String> {
238
- let mut root = serde_json::from_str::<serde_json::Value>(raw_json)?;
327
+ ) -> anyhow::Result<Option<String>> {
328
+ let mut root = match serde_json::from_str::<serde_json::Value>(raw_json) {
329
+ Ok(v) => v,
330
+ Err(_) => return Ok(None),
331
+ };
239
332
  for (path, spec) in json_rules {
240
333
  let mut apply = |original_cell: Option<String>| {
241
334
  apply_anonymizer(registry, spec, original_cell.as_deref(), column_max_len)
242
335
  };
243
336
  mutate_json_at_path(&mut root, path, &mut apply)?;
244
337
  }
245
- Ok(root.to_string())
338
+ Ok(Some(root.to_string()))
246
339
  }
247
340
 
248
341
  fn mutate_json_at_path<F>(
@@ -457,7 +550,8 @@ fn get_cached_regex(pat: &str, case_insensitive: bool) -> regex::Regex {
457
550
  #[cfg(test)]
458
551
  mod tests {
459
552
  use super::*;
460
- use crate::settings::{ResolvedConfig, RowFilterSet};
553
+ use crate::settings::{AnonymizerSpec, ResolvedConfig, RowFilterSet};
554
+ use crate::transform::AnonymizerRegistry;
461
555
  use std::collections::HashMap;
462
556
 
463
557
  #[test]
@@ -630,4 +724,139 @@ mod tests {
630
724
  &[Some(r#"{"items":[{"kind":"secondary"}]}"#.to_string())]
631
725
  ));
632
726
  }
727
+
728
+ #[test]
729
+ fn rewrite_json_paths_skips_non_json_cells_like_row_filters() {
730
+ let mut rules: HashMap<String, HashMap<String, AnonymizerSpec>> = HashMap::new();
731
+ let spec = AnonymizerSpec {
732
+ strategy: "string".to_string(),
733
+ salt: None,
734
+ min: None,
735
+ max: None,
736
+ length: Some(4),
737
+ min_days: None,
738
+ max_days: None,
739
+ min_seconds: None,
740
+ max_seconds: None,
741
+ domain: None,
742
+ unique_within_domain: None,
743
+ as_string: Some(true),
744
+ locale: None,
745
+ faker: None,
746
+ format: None,
747
+ };
748
+ rules.insert("public.t".to_string(), HashMap::new());
749
+ let cfg = ResolvedConfig {
750
+ salt: None,
751
+ rules,
752
+ row_filters: HashMap::new(),
753
+ column_cases: HashMap::new(),
754
+ sensitive_columns: HashMap::new(),
755
+ output_scan: crate::settings::OutputScanConfig::default(),
756
+ source_path: None,
757
+ };
758
+ let registry = AnonymizerRegistry::from_config(&cfg);
759
+ let json_rules: Vec<(Vec<String>, AnonymizerSpec)> = vec![(
760
+ vec!["profile".to_string(), "secret".to_string()],
761
+ spec.clone(),
762
+ )];
763
+ assert!(
764
+ rewrite_json_paths_with_rules(&registry, None, &json_rules, "{not json")
765
+ .unwrap()
766
+ .is_none()
767
+ );
768
+ let out = rewrite_json_paths_with_rules(
769
+ &registry,
770
+ None,
771
+ &json_rules,
772
+ r#"{"profile":{"secret":"x"}}"#,
773
+ )
774
+ .unwrap()
775
+ .expect("valid JSON should rewrite");
776
+ let v: serde_json::Value = serde_json::from_str(&out).unwrap();
777
+ assert_ne!(v["profile"]["secret"], "x");
778
+ }
779
+
780
+ #[test]
781
+ fn rewrite_json_paths_preserves_number_and_bool_leaf_types_for_quoted_replacements() {
782
+ let mut rules: HashMap<String, HashMap<String, AnonymizerSpec>> = HashMap::new();
783
+ rules.insert("public.t".to_string(), HashMap::new());
784
+ let cfg = ResolvedConfig {
785
+ salt: None,
786
+ rules,
787
+ row_filters: HashMap::new(),
788
+ column_cases: HashMap::new(),
789
+ sensitive_columns: HashMap::new(),
790
+ output_scan: crate::settings::OutputScanConfig::default(),
791
+ source_path: None,
792
+ };
793
+ let registry = AnonymizerRegistry::from_config(&cfg);
794
+
795
+ let int_spec = AnonymizerSpec {
796
+ strategy: "int_range".to_string(),
797
+ salt: None,
798
+ min: Some(0),
799
+ max: Some(9),
800
+ length: None,
801
+ min_days: None,
802
+ max_days: None,
803
+ min_seconds: None,
804
+ max_seconds: None,
805
+ domain: Some("coerce_int_leaf".to_string()),
806
+ unique_within_domain: None,
807
+ as_string: None,
808
+ locale: None,
809
+ faker: None,
810
+ format: None,
811
+ };
812
+ let out = rewrite_json_paths_with_rules(
813
+ &registry,
814
+ None,
815
+ &[(vec!["n".to_string()], int_spec)],
816
+ r#"{"n":1,"b":true,"s":"x"}"#,
817
+ )
818
+ .unwrap()
819
+ .unwrap();
820
+ let v: serde_json::Value = serde_json::from_str(&out).unwrap();
821
+ assert!(
822
+ v["n"].is_number(),
823
+ "int_range replacement should stay JSON number, got {:?}",
824
+ v["n"]
825
+ );
826
+ assert_eq!(v["b"], true);
827
+ assert_eq!(v["s"], "x");
828
+
829
+ let string_spec = AnonymizerSpec {
830
+ strategy: "int_range".to_string(),
831
+ salt: None,
832
+ min: Some(0),
833
+ max: Some(0),
834
+ length: None,
835
+ min_days: None,
836
+ max_days: None,
837
+ min_seconds: None,
838
+ max_seconds: None,
839
+ domain: Some("coerce_bool_leaf".to_string()),
840
+ unique_within_domain: None,
841
+ as_string: None,
842
+ locale: None,
843
+ faker: None,
844
+ format: None,
845
+ };
846
+ let out2 = rewrite_json_paths_with_rules(
847
+ &registry,
848
+ None,
849
+ &[(vec!["b".to_string()], string_spec)],
850
+ r#"{"b":false}"#,
851
+ )
852
+ .unwrap()
853
+ .unwrap();
854
+ let v2: serde_json::Value = serde_json::from_str(&out2).unwrap();
855
+ assert!(
856
+ v2["b"].is_boolean(),
857
+ "unquoted 0 from int_range should coerce to bool at bool leaf, got {:?}",
858
+ v2["b"]
859
+ );
860
+ assert_eq!(v2["b"], false);
861
+ }
633
862
  }
@@ -299,7 +299,8 @@ impl SqlStreamProcessor {
299
299
  if repl.is_null {
300
300
  new_fields.push(r"\N".to_string());
301
301
  } else {
302
- new_fields.push(repl.value);
302
+ new_fields
303
+ .push(escape_postgres_copy_text_field(&repl.value));
303
304
  }
304
305
  }
305
306
  Err(e) => return Err(e),
@@ -562,7 +563,11 @@ impl SqlStreamProcessor {
562
563
  None => return Ok(None),
563
564
  };
564
565
  let specs: Vec<AnonymizerSpec> = json_owned.iter().map(|(_, s)| s.clone()).collect();
565
- let out = rewrite_json_paths_with_rules(&self.anonymizers, col_len, &json_owned, raw)?;
566
+ let Some(out) =
567
+ rewrite_json_paths_with_rules(&self.anonymizers, col_len, &json_owned, raw)?
568
+ else {
569
+ return Ok(None);
570
+ };
566
571
  let repl = Replacement::quoted(out);
567
572
  Ok(Some((repl, specs)))
568
573
  }
@@ -1177,6 +1182,32 @@ impl Cell {
1177
1182
  }
1178
1183
  }
1179
1184
 
1185
+ /// Escapes a field value for PostgreSQL `COPY ... FROM stdin` **text** format so the output
1186
+ /// line still has one physical TAB-separated field per logical column. Without this, a
1187
+ /// replacement containing a literal TAB or newline would split the row on restore and surface
1188
+ /// as PostgreSQL errors like `missing data for column "..."`.
1189
+ fn escape_postgres_copy_text_field(s: &str) -> String {
1190
+ let mut out = String::with_capacity(s.len());
1191
+ for c in s.chars() {
1192
+ match c {
1193
+ '\\' => out.push_str("\\\\"),
1194
+ '\t' => out.push_str("\\t"),
1195
+ '\n' => out.push_str("\\n"),
1196
+ '\r' => out.push_str("\\r"),
1197
+ '\u{0008}' => out.push_str("\\b"),
1198
+ '\u{000c}' => out.push_str("\\f"),
1199
+ '\u{000b}' => out.push_str("\\v"),
1200
+ '\0' => out.push_str("\\0"),
1201
+ c if (c as u32) < 0x20 => {
1202
+ use std::fmt::Write;
1203
+ let _ = write!(out, "\\x{:02x}", c as u32);
1204
+ }
1205
+ c => out.push(c),
1206
+ }
1207
+ }
1208
+ out
1209
+ }
1210
+
1180
1211
  fn render_cell(repl: &Replacement, original: &Cell) -> String {
1181
1212
  let trailing = original.trailing_expr.as_deref().unwrap_or("");
1182
1213
  if repl.is_null {
@@ -2115,6 +2146,15 @@ INSERT INTO public.users (id, email) VALUES
2115
2146
  );
2116
2147
  }
2117
2148
 
2149
+ #[test]
2150
+ fn escape_postgres_copy_text_field_escapes_control_chars() {
2151
+ assert_eq!(
2152
+ escape_postgres_copy_text_field("a\tb\nc\\"),
2153
+ "a\\tb\\nc\\\\"
2154
+ );
2155
+ assert_eq!(escape_postgres_copy_text_field("\0\u{01}"), "\\0\\x01");
2156
+ }
2157
+
2118
2158
  #[test]
2119
2159
  fn domain_mapping_null_and_non_null_cross_table_consistency() {
2120
2160
  // When the same domain spans two tables, NULL stays NULL in both, and
@@ -2398,6 +2438,156 @@ COPY public.events (id, payload) FROM stdin;
2398
2438
  );
2399
2439
  }
2400
2440
 
2441
+ #[test]
2442
+ fn pipeline_json_path_rules_passthrough_non_json_cells() {
2443
+ let mut rules: HashMap<String, HashMap<String, AnonymizerSpec>> = HashMap::new();
2444
+ let mut cols: HashMap<String, AnonymizerSpec> = HashMap::new();
2445
+ cols.insert(
2446
+ "payload.profile.secret".to_string(),
2447
+ AnonymizerSpec {
2448
+ strategy: "string".to_string(),
2449
+ salt: None,
2450
+ min: None,
2451
+ max: None,
2452
+ length: Some(8),
2453
+ min_days: None,
2454
+ max_days: None,
2455
+ min_seconds: None,
2456
+ max_seconds: None,
2457
+ domain: Some("secrets".to_string()),
2458
+ unique_within_domain: None,
2459
+ as_string: Some(true),
2460
+ locale: None,
2461
+ faker: None,
2462
+ format: None,
2463
+ },
2464
+ );
2465
+ rules.insert("public.events".to_string(), cols);
2466
+ let cfg = ResolvedConfig {
2467
+ salt: None,
2468
+ rules,
2469
+ row_filters: HashMap::new(),
2470
+ column_cases: HashMap::new(),
2471
+ sensitive_columns: HashMap::new(),
2472
+ output_scan: crate::settings::OutputScanConfig::default(),
2473
+ source_path: None,
2474
+ };
2475
+ let reg = AnonymizerRegistry::from_config(&cfg);
2476
+ let mut proc =
2477
+ SqlStreamProcessor::new(reg, cfg, Vec::new(), Vec::new(), None, DumpFormat::Postgres);
2478
+ let input = r#"
2479
+ CREATE TABLE public.events (id int, payload jsonb);
2480
+ INSERT INTO public.events (id, payload) VALUES
2481
+ (1, '{not strict json}'),
2482
+ (2, '{"profile":{"tier":"gold","secret":"alpha"}}');
2483
+
2484
+ COPY public.events (id, payload) FROM stdin;
2485
+ 3 {not strict json}
2486
+ 4 {"profile":{"tier":"gold","secret":"alpha"}}
2487
+ \.
2488
+ "#;
2489
+ let mut reader = std::io::BufReader::new(input.as_bytes());
2490
+ let mut out = Vec::new();
2491
+ proc.process(&mut reader, &mut out).unwrap();
2492
+ let s = String::from_utf8(out).unwrap();
2493
+ assert!(
2494
+ s.contains("(1, '{not strict json}')"),
2495
+ "non-JSON INSERT cell should passthrough unchanged, got:\n{s}"
2496
+ );
2497
+ assert!(
2498
+ !s.contains("alpha"),
2499
+ "valid JSON INSERT row should still anonymize nested paths, got:\n{s}"
2500
+ );
2501
+ assert!(
2502
+ s.contains("\n3\t{not strict json}\n"),
2503
+ "non-JSON COPY cell should passthrough unchanged, got:\n{s}"
2504
+ );
2505
+ assert!(
2506
+ !s.contains("\n4\t{\"profile\":{\"tier\":\"gold\",\"secret\":\"alpha\"}}\n"),
2507
+ "valid JSON COPY row should anonymize nested secret, got:\n{s}"
2508
+ );
2509
+ }
2510
+
2511
+ #[test]
2512
+ fn pipeline_json_path_int_range_preserves_json_number_type() {
2513
+ let mut rules: HashMap<String, HashMap<String, AnonymizerSpec>> = HashMap::new();
2514
+ let mut cols: HashMap<String, AnonymizerSpec> = HashMap::new();
2515
+ cols.insert(
2516
+ "payload.score".to_string(),
2517
+ AnonymizerSpec {
2518
+ strategy: "int_range".to_string(),
2519
+ salt: None,
2520
+ min: Some(0),
2521
+ max: Some(100),
2522
+ length: None,
2523
+ min_days: None,
2524
+ max_days: None,
2525
+ min_seconds: None,
2526
+ max_seconds: None,
2527
+ domain: Some("pipeline_json_num".to_string()),
2528
+ unique_within_domain: None,
2529
+ as_string: None,
2530
+ locale: None,
2531
+ faker: None,
2532
+ format: None,
2533
+ },
2534
+ );
2535
+ rules.insert("public.events".to_string(), cols);
2536
+ let cfg = ResolvedConfig {
2537
+ salt: None,
2538
+ rules,
2539
+ row_filters: HashMap::new(),
2540
+ column_cases: HashMap::new(),
2541
+ sensitive_columns: HashMap::new(),
2542
+ output_scan: crate::settings::OutputScanConfig::default(),
2543
+ source_path: None,
2544
+ };
2545
+ let reg = AnonymizerRegistry::from_config(&cfg);
2546
+ let mut proc =
2547
+ SqlStreamProcessor::new(reg, cfg, Vec::new(), Vec::new(), None, DumpFormat::Postgres);
2548
+ let input = r#"
2549
+ CREATE TABLE public.events (id int, payload jsonb);
2550
+ INSERT INTO public.events (id, payload) VALUES
2551
+ (1, '{"score":42,"label":"x"}');
2552
+
2553
+ COPY public.events (id, payload) FROM stdin;
2554
+ 2 {"score":42,"label":"x"}
2555
+ \.
2556
+ "#;
2557
+ let mut reader = std::io::BufReader::new(input.as_bytes());
2558
+ let mut out = Vec::new();
2559
+ proc.process(&mut reader, &mut out).unwrap();
2560
+ let s = String::from_utf8(out).unwrap();
2561
+ let insert_pos = s.find("INSERT INTO public.events").unwrap();
2562
+ let insert_tail = &s[insert_pos..];
2563
+ let insert_end = insert_tail.find(";\n").unwrap() + insert_pos;
2564
+ let ins_stmt = &s[insert_pos..=insert_end];
2565
+ let vals_idx = ins_stmt.to_uppercase().find("VALUES").unwrap();
2566
+ let ins_block = strip_trailing_semicolon(ins_stmt[vals_idx + "VALUES".len()..].trim());
2567
+ let ins_rows = parse_values_rows(ins_block).unwrap();
2568
+ let copy_line = s
2569
+ .lines()
2570
+ .find(|l| l.starts_with("2\t{"))
2571
+ .expect("expected COPY data row");
2572
+ let copy_json = copy_line.split_once('\t').unwrap().1;
2573
+ let v_ins =
2574
+ serde_json::from_str::<serde_json::Value>(ins_rows[0][1].original.as_ref().unwrap())
2575
+ .unwrap();
2576
+ let v_copy = serde_json::from_str::<serde_json::Value>(copy_json).unwrap();
2577
+ assert!(
2578
+ v_ins["score"].is_number(),
2579
+ "INSERT payload.score should remain JSON number, got {:?}",
2580
+ v_ins["score"]
2581
+ );
2582
+ assert!(
2583
+ v_copy["score"].is_number(),
2584
+ "COPY payload.score should remain JSON number, got {:?}",
2585
+ v_copy["score"]
2586
+ );
2587
+ assert_eq!(v_ins["score"], v_copy["score"]);
2588
+ assert_eq!(v_ins["label"], "x");
2589
+ }
2590
+
2401
2591
  #[test]
2402
2592
  fn parse_values_rows_tracks_trailing_cast_for_quoted_literals() {
2403
2593
  let rows =
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes