datacortex 0.4.3__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {datacortex-0.4.3 → datacortex-0.5.0}/Cargo.lock +15 -4
  2. {datacortex-0.4.3 → datacortex-0.5.0}/Cargo.toml +6 -2
  3. {datacortex-0.4.3 → datacortex-0.5.0}/PKG-INFO +1 -1
  4. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/Cargo.toml +2 -0
  5. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/README.md +63 -4
  6. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/codec.rs +57 -9
  7. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/format/mod.rs +21 -19
  8. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/format/ndjson.rs +721 -91
  9. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/format/schema.rs +10 -0
  10. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/format/typed_encoding.rs +248 -7
  11. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/lib.rs +1 -1
  12. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/engine.rs +15 -3
  13. {datacortex-0.4.3 → datacortex-0.5.0}/README.md +0 -0
  14. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/examples/dump_transform.rs +0 -0
  15. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/examples/test_pipeline.rs +0 -0
  16. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/examples/test_vdict.rs +0 -0
  17. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/examples/trace_spacex.rs +0 -0
  18. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/examples/validation_ab_test.rs +0 -0
  19. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/dcx.rs +0 -0
  20. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/entropy/arithmetic.rs +0 -0
  21. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/entropy/mod.rs +0 -0
  22. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/format/json.rs +0 -0
  23. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/format/json_array.rs +0 -0
  24. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/format/transform.rs +0 -0
  25. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/format/value_dict.rs +0 -0
  26. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/mixer/apm.rs +0 -0
  27. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/mixer/dual_mixer.rs +0 -0
  28. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/mixer/hierarchical_mixer.rs +0 -0
  29. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/mixer/isse.rs +0 -0
  30. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/mixer/logistic.rs +0 -0
  31. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/mixer/meta_mixer.rs +0 -0
  32. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/mixer/mod.rs +0 -0
  33. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/mixer/multi_set_mixer.rs +0 -0
  34. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/cm_model.rs +0 -0
  35. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/dmc_model.rs +0 -0
  36. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/gru_model.rs +0 -0
  37. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/indirect_model.rs +0 -0
  38. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/json_model.rs +0 -0
  39. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/match_model.rs +0 -0
  40. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/mod.rs +0 -0
  41. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/neural_model.rs +0 -0
  42. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/order0.rs +0 -0
  43. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/ppm_model.rs +0 -0
  44. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/run_model.rs +0 -0
  45. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/sparse_model.rs +0 -0
  46. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/word_model.rs +0 -0
  47. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/state/context_map.rs +0 -0
  48. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/state/mod.rs +0 -0
  49. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/state/state_map.rs +0 -0
  50. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/state/state_table.rs +0 -0
  51. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/tests/roundtrip.rs +0 -0
  52. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-python/Cargo.toml +0 -0
  53. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-python/README.md +0 -0
  54. {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-python/src/lib.rs +0 -0
  55. {datacortex-0.4.3 → datacortex-0.5.0}/pyproject.toml +0 -0
  56. {datacortex-0.4.3 → datacortex-0.5.0}/python/datacortex/__init__.py +0 -0
  57. {datacortex-0.4.3 → datacortex-0.5.0}/python/datacortex/py.typed +0 -0
{datacortex-0.4.3 → datacortex-0.5.0}/Cargo.lock

@@ -258,7 +258,7 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
 
 [[package]]
 name = "datacortex-cli"
-version = "0.4.3"
+version = "0.5.0"
 dependencies = [
  "clap",
  "datacortex-core",
@@ -267,10 +267,12 @@ dependencies = [
 
 [[package]]
 name = "datacortex-core"
-version = "0.4.3"
+version = "0.5.0"
 dependencies = [
  "brotli",
  "crc32fast",
+ "fsst-rs",
+ "memchr",
  "rayon",
  "serde",
  "serde_json",
@@ -279,7 +281,7 @@ dependencies = [
 
 [[package]]
 name = "datacortex-neural"
-version = "0.4.3"
+version = "0.5.0"
 dependencies = [
  "encoding_rs",
  "llama-cpp-2",
@@ -287,7 +289,7 @@ dependencies = [
 
 [[package]]
 name = "datacortex-python"
-version = "0.4.3"
+version = "0.5.0"
 dependencies = [
  "datacortex-core",
  "pyo3",
@@ -343,6 +345,15 @@ dependencies = [
  "glob",
 ]
 
+[[package]]
+name = "fsst-rs"
+version = "0.5.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cdf65e16e100438be0030d113042e07a62bed67203998640ca6fae0404eed71e"
+dependencies = [
+ "rustc-hash",
+]
+
 [[package]]
 name = "getrandom"
 version = "0.3.4"
{datacortex-0.4.3 → datacortex-0.5.0}/Cargo.toml

@@ -1,9 +1,12 @@
 [workspace]
 resolver = "2"
 members = ["crates/datacortex-core", "crates/datacortex-python"]
+exclude = [
+    "vector-plugin",
+]
 
 [workspace.package]
-version = "0.4.3"
+version = "0.5.0"
 edition = "2024"
 license = "MIT"
 repository = "https://github.com/RushikeshMore/datacortex"
@@ -12,9 +15,10 @@ authors = ["Rushikesh More"]
 rust-version = "1.85"
 
 [workspace.dependencies]
-datacortex-core = { path = "crates/datacortex-core", version = "0.4.3" }
+datacortex-core = { path = "crates/datacortex-core", version = "0.5.0" }
 datacortex-neural = { path = "crates/datacortex-neural" }
 crc32fast = "1"
+memchr = "2"
 clap = { version = "4", features = ["derive"] }
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
{datacortex-0.4.3 → datacortex-0.5.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datacortex
-Version: 0.4.3
+Version: 0.5.0
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
{datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/Cargo.toml

@@ -14,11 +14,13 @@ readme = "README.md"
 
 [dependencies]
 crc32fast = { workspace = true }
+memchr = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
 zstd = { workspace = true }
 brotli = { workspace = true }
 rayon = { workspace = true }
+fsst-rs = "0.5.5"
 
 [features]
 default = []
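A note on the two new dependencies above. `memchr` provides SIMD-accelerated byte search, a natural fit for NDJSON preprocessing, where row splitting is essentially a newline scan; `fsst-rs` (FSST string compression) is presumably applied to string columns, though its call sites are not shown in this diff. The sketch below is illustrative only, not DataCortex's actual code, and shows the kind of scan `memchr` enables:

```rust
// Illustrative sketch, not DataCortex code: split an NDJSON buffer into rows
// using memchr's vectorized newline search instead of a byte-at-a-time loop.
fn split_ndjson_rows(data: &[u8]) -> Vec<&[u8]> {
    let mut rows = Vec::new();
    let mut start = 0;
    // memchr_iter yields the position of every b'\n' in the buffer.
    for nl in memchr::memchr_iter(b'\n', data) {
        rows.push(&data[start..nl]);
        start = nl + 1;
    }
    if start < data.len() {
        rows.push(&data[start..]); // final row without a trailing newline
    }
    rows
}
```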
{datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/README.md

@@ -2,7 +2,9 @@
 
 The best standalone JSON/NDJSON compressor. Beats zstd-19 and brotli-11 on every file tested.
 
-DataCortex auto-infers your JSON schema, applies columnar reorg + type-specific encoding, then picks the optimal entropy coder (zstd or brotli). No schema files, no database, no configuration — just `datacortex compress data.json`.
+[Site](https://datacortex-dcx.vercel.app) | [crates.io](https://crates.io/crates/datacortex-cli) | [PyPI](https://pypi.org/project/datacortex/) | [Docs](https://github.com/rushikeshmore/DataCortex)
+
+DataCortex auto-infers your JSON schema, applies columnar reorg + type-specific encoding, then picks the optimal entropy coder (zstd or brotli). No schema files, no database, no configuration. Just `datacortex compress data.json`.
 
 ## Benchmarks
 
@@ -23,16 +25,36 @@ On larger structured logs:
 | k8s structured logs (100K rows) | 9.9 MB | **~40x** | 18.9x | **+113%** |
 | nginx access logs (100K rows) | 9.5 MB | **~28x** | 17.3x | **+62%** |
 
-> Higher is better. DataCortex wins on every file. Lossless byte-exact decompression guaranteed.
+> Higher is better. DataCortex wins on every file. Lossless, byte-exact decompression guaranteed.
+
+## Performance
+
+Throughput on an Apple M-series chip (Fast mode, single run, release build):
+
+| File | Size | Ratio | Encode | Decode |
+|------|------|-------|--------|--------|
+| NDJSON (10K rows) | 3.3 MB | 27.6x | 4.1 MB/s | 176 MB/s |
+| GH Archive (diverse) | 10.0 MB | 7.8x | 3.2 MB/s | 574 MB/s |
+| Twitter API | 617 KB | 19.7x | 2.3 MB/s | 384 MB/s |
+| Event tickets | 1.7 MB | 221.6x | 8.6 MB/s | 1124 MB/s |
+
+**Decode is near-instant** (176-1124 MB/s). Encode trades speed for 2x better compression vs zstd. For throughput-critical pipelines, DataCortex is best suited as a batch compressor for log storage, not a real-time codec.
+
+Run `datacortex bench corpus/ -m fast --compare` to measure on your hardware.
 
 ## Installation
 
+**Rust:**
 ```bash
 cargo install datacortex-cli
 ```
 
-Or from source:
+**Python:**
+```bash
+pip install datacortex
+```
 
+**From source:**
 ```bash
 git clone https://github.com/rushikeshmore/DataCortex
 cd DataCortex
@@ -52,6 +74,17 @@ datacortex compress logs.ndjson -m fast # explicit fast mode
 # Decompress
 datacortex decompress data.dcx output.ndjson
 
+# Streaming (pipe-friendly)
+cat logs.ndjson | datacortex compress - -o compressed.dcx
+datacortex decompress compressed.dcx -o -   # stdout
+
+# Chunked compression (for large NDJSON)
+datacortex compress logs.ndjson -o out.dcx --chunk-rows 10000
+
+# Custom dictionary (for known schemas)
+datacortex train-dict corpus/*.ndjson --output my.dict
+datacortex compress logs.ndjson --dict my.dict
+
 # Benchmark against zstd
 datacortex bench corpus/ -m fast --compare
 
@@ -74,10 +107,36 @@ datacortex info data.dcx
 
 **Balanced/Max modes** use a bit-level context mixing engine with 13 specialized models. Better for general text but slower.
 
+## Python
+
+```python
+import datacortex
+
+compressed = datacortex.compress(json_bytes, mode="fast")
+original = datacortex.decompress(compressed)
+
+# File-based
+datacortex.compress_file("logs.ndjson", "logs.dcx", mode="fast")
+datacortex.decompress_file("logs.dcx", "logs.json")
+
+# Format detection
+fmt = datacortex.detect_format(data)  # "ndjson", "json", "generic"
+```
+
+## How it works
+
+1. **Format detection** - auto-identifies JSON, NDJSON, or generic data
+2. **Schema inference** - discovers column types (integer, boolean, timestamp, enum, string, etc.)
+3. **Columnar reorg** - transposes row-oriented NDJSON into column-oriented layout
+4. **Type-specific encoding** - delta+varint for integers, bitmaps for booleans, epoch deltas for timestamps
+5. **Auto-fallback** - tries 6+ compression paths (zstd, brotli, with/without preprocessing) and picks the smallest
+
+No schema files. No configuration. Fully automatic.
+
 ## Development
 
 ```bash
-cargo test                                 # 374 tests
+cargo test                                 # 390 tests
 cargo clippy --all-targets -- -D warnings  # lint (0 warnings)
 cargo fmt --check                          # formatting
 cargo build --release                      # optimized build
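Step 4 of the new "How it works" list names delta+varint encoding for integer columns. As a rough illustration of that idea under the usual conventions (ZigZag mapping plus LEB128 varints), not the actual DataCortex encoder:

```rust
// Minimal delta + varint sketch (ZigZag + LEB128), for illustration only.
fn encode_delta_varint(values: &[i64]) -> Vec<u8> {
    let mut out = Vec::new();
    let mut prev = 0i64;
    for &v in values {
        let delta = v.wrapping_sub(prev);
        prev = v;
        // ZigZag: map the signed delta to unsigned so small negatives stay small.
        let mut z = ((delta as u64) << 1) ^ ((delta >> 63) as u64);
        // LEB128: 7 payload bits per byte, high bit marks continuation.
        loop {
            let byte = (z & 0x7f) as u8;
            z >>= 7;
            if z == 0 {
                out.push(byte);
                break;
            }
            out.push(byte | 0x80);
        }
    }
    out
}
```

On a sorted column such as epoch timestamps the deltas are small positives, so most values collapse to a single byte before the entropy coder even runs.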
{datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/codec.rs

@@ -25,10 +25,22 @@ fn adaptive_fast_level(data_size: usize, level_override: Option<i32>) -> i32 {
     if let Some(level) = level_override {
         return level; // User explicitly set level, respect it
     }
+    // Empirically, zstd levels 9-15 produce nearly identical ratios on
+    // structured JSON (btlazy2 strategy plateau). The meaningful jump
+    // happens at level 16+ (btultra strategy). Level 13 wastes encode
+    // time without ratio gain over level 9.
+    //
+    // DataCortex benchmarks against zstd-19. Our preprocessing adds
+    // ~3-5% on top, but we need internal zstd at level 17+ to beat
+    // raw zstd-19 on diverse data like GH Archive.
+    //
+    // Encode time impact: preprocessing (columnar reorg, schema inference)
+    // dominates. With rayon parallelism the zstd level cost is marginal.
+    // Decode is completely unaffected by compression level.
    match data_size {
-        0..=1_048_576 => 19,          // <1MB: zstd-19 is <50ms, use best ratio
-        1_048_577..=10_485_760 => 13, // 1MB-10MB: good balance
-        _ => 9,                       // >10MB: use level 9 for speed
+        0..=16_777_216 => 19,          // ≤16MB: best ratio, <3s encode on 10MB
+        16_777_217..=67_108_864 => 16, // 16-64MB: btultra breakpoint, good ratio
+        _ => 9,                        // >64MB: skip 10-15 plateau, use fast
    }
 }
 
@@ -1814,20 +1826,30 @@ mod tests {
 
     #[test]
     fn test_adaptive_level_small_data() {
-        // <1MB should use level 19 (zstd-19 is <50ms on small data).
+        // ≤16MB should use level 19: best ratio, preprocessing dominates encode time.
         assert_eq!(adaptive_fast_level(100_000, None), 19);
         assert_eq!(adaptive_fast_level(500_000, None), 19);
         assert_eq!(adaptive_fast_level(1_048_576, None), 19);
         assert_eq!(adaptive_fast_level(0, None), 19);
     }
 
+    #[test]
+    fn test_adaptive_level_medium_data() {
+        // 1-16MB still gets level 19 — zstd levels 9-15 are a plateau
+        // (identical ratio on structured JSON), so we skip to 19.
+        assert_eq!(adaptive_fast_level(1_048_577, None), 19);
+        assert_eq!(adaptive_fast_level(5_000_000, None), 19);
+        assert_eq!(adaptive_fast_level(10_485_760, None), 19);
+        assert_eq!(adaptive_fast_level(16_777_216, None), 19);
+    }
+
     #[test]
     fn test_adaptive_level_large_data() {
-        // 1MB-10MB should use level 13, >10MB should use level 9.
-        assert_eq!(adaptive_fast_level(1_048_577, None), 13);
-        assert_eq!(adaptive_fast_level(5_000_000, None), 13);
-        assert_eq!(adaptive_fast_level(10_485_760, None), 13);
-        assert_eq!(adaptive_fast_level(10_485_761, None), 9);
+        // 16-64MB uses level 16 (btultra breakpoint), >64MB uses level 9.
+        assert_eq!(adaptive_fast_level(16_777_217, None), 16);
+        assert_eq!(adaptive_fast_level(33_554_432, None), 16);
+        assert_eq!(adaptive_fast_level(67_108_864, None), 16);
+        assert_eq!(adaptive_fast_level(67_108_865, None), 9);
         assert_eq!(adaptive_fast_level(100_000_000, None), 9);
     }
 
@@ -2439,4 +2461,30 @@ mod tests {
             "null-heavy 30-row balanced mode roundtrip failed"
         );
     }
+
+    #[test]
+    fn gharchive_selective_roundtrip() {
+        // Verify GH Archive roundtrip with selective columnar transform.
+        let path = concat!(
+            env!("CARGO_MANIFEST_DIR"),
+            "/../../corpus/json-bench/gharchive-10mb.ndjson"
+        );
+        let data = match std::fs::read(path) {
+            Ok(d) => d,
+            Err(_) => return, // Skip if corpus not available
+        };
+        let mut compressed = Vec::new();
+        compress(
+            &data,
+            Mode::Fast,
+            Some(crate::dcx::FormatHint::Ndjson),
+            &mut compressed,
+        )
+        .unwrap();
+        let decompressed = decompress(&mut std::io::Cursor::new(&compressed)).unwrap();
+        assert_eq!(
+            decompressed, data,
+            "GH Archive selective columnar roundtrip failed"
+        );
+    }
 }
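For orientation on how `adaptive_fast_level` from the first codec.rs hunk would typically be consumed: a hedged sketch feeding the chosen level into the `zstd` crate's one-shot API. Only `adaptive_fast_level` comes from this diff; the wrapper itself is an assumption.

```rust
// Assumed sketch: route the adaptively chosen level into zstd.
// Only adaptive_fast_level is from the diff; this wrapper is hypothetical.
fn zstd_fast_path(data: &[u8], level_override: Option<i32>) -> std::io::Result<Vec<u8>> {
    let level = adaptive_fast_level(data.len(), level_override);
    // zstd::encode_all accepts any std::io::Read source plus a level;
    // &[u8] implements Read, so the buffer can be passed directly.
    zstd::encode_all(data, level)
}
```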
{datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/format/mod.rs

@@ -11,6 +11,8 @@ pub mod transform;
 pub mod typed_encoding;
 pub mod value_dict;
 
+use std::borrow::Cow;
+
 use crate::dcx::{FormatHint, Mode};
 use transform::{
     TRANSFORM_JSON_ARRAY_COLUMNAR, TRANSFORM_JSON_KEY_INTERN, TRANSFORM_NDJSON_COLUMNAR,
@@ -54,7 +56,7 @@ pub fn detect_from_extension(path: &str) -> Option<FormatHint> {
 /// (keys are already removed from the data stream by the columnar transform).
 pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, TransformChain) {
     let mut chain = TransformChain::new();
-    let mut current = data.to_vec();
+    let mut current: Cow<'_, [u8]> = Cow::Borrowed(data);
 
     // Track whether a uniform columnar transform was applied (for value dict chaining).
     // Uniform columnar = data is \x00/\x01-separated, downstream transforms are compatible.
@@ -70,7 +72,7 @@ pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, Tran
         if let Some(result) = ndjson::preprocess(&current) {
             let is_uniform_columnar = !result.metadata.is_empty() && result.metadata[0] == 1;
             chain.push(TRANSFORM_NDJSON_COLUMNAR, result.metadata);
-            current = result.data;
+            current = Cow::Owned(result.data);
             ndjson_transform_applied = true;
             columnar_applied = is_uniform_columnar;
         }
@@ -85,7 +87,7 @@ pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, Tran
         if let Some(result) = json_array::preprocess(&current) {
             let is_uniform = !result.metadata.is_empty() && result.metadata[0] == 1;
             chain.push(TRANSFORM_JSON_ARRAY_COLUMNAR, result.metadata);
-            current = result.data;
+            current = Cow::Owned(result.data);
             json_array_applied = true;
             columnar_applied = is_uniform;
         }
@@ -115,13 +117,13 @@ pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, Tran
             num_rows,
             total_flat_cols as usize,
         );
-        if unflattened == current {
+        if unflattened == current.as_ref() {
             let mut nested_meta = Vec::new();
             nested_meta.extend_from_slice(&(num_rows as u32).to_le_bytes());
             nested_meta.extend_from_slice(&total_flat_cols.to_le_bytes());
             nested_meta.extend_from_slice(&ndjson::serialize_nested_info(&nested_groups));
             chain.push(TRANSFORM_NESTED_FLATTEN, nested_meta);
-            current = flat_data;
+            current = Cow::Owned(flat_data);
         }
         // else: roundtrip not exact — skip nested flatten (data stays columnar
         //       without sub-column decomposition, still benefits from typed encoding
@@ -136,7 +138,7 @@ pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, Tran
     if columnar_applied && mode == Mode::Fast {
         if let Some(result) = typed_encoding::preprocess(&current) {
             chain.push(TRANSFORM_TYPED_ENCODING, result.metadata);
-            current = result.data;
+            current = Cow::Owned(result.data);
         }
     }
 
@@ -150,12 +152,12 @@ pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, Tran
     if columnar_applied {
         if let Some(result) = value_dict::preprocess(&current) {
             chain.push(TRANSFORM_VALUE_DICT, result.metadata);
-            current = result.data;
+            current = Cow::Owned(result.data);
         }
     }
 
     if columnar_applied || ndjson_transform_applied || json_array_applied {
-        return (current, chain);
+        return (current.into_owned(), chain);
     }
 
     // JSON key interning: Balanced/Max only (hurts Fast mode due to zstd redundancy).
@@ -164,33 +166,33 @@ pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, Tran
         && let Some(result) = json::preprocess(&current)
     {
         chain.push(TRANSFORM_JSON_KEY_INTERN, result.metadata);
-        current = result.data;
+        current = Cow::Owned(result.data);
     }
 
-    (current, chain)
+    (current.into_owned(), chain)
 }
 
 /// Reverse preprocessing transforms (applied in reverse order).
 pub fn reverse_preprocess(data: &[u8], chain: &TransformChain) -> Vec<u8> {
-    let mut current = data.to_vec();
+    let mut current: Cow<'_, [u8]> = Cow::Borrowed(data);
 
     // Apply in reverse order.
     for record in chain.records.iter().rev() {
         match record.id {
             TRANSFORM_JSON_KEY_INTERN => {
-                current = json::reverse(&current, &record.metadata);
+                current = Cow::Owned(json::reverse(&current, &record.metadata));
             }
             TRANSFORM_NDJSON_COLUMNAR => {
-                current = ndjson::reverse(&current, &record.metadata);
+                current = Cow::Owned(ndjson::reverse(&current, &record.metadata));
             }
             TRANSFORM_JSON_ARRAY_COLUMNAR => {
-                current = json_array::reverse(&current, &record.metadata);
+                current = Cow::Owned(json_array::reverse(&current, &record.metadata));
             }
             TRANSFORM_VALUE_DICT => {
-                current = value_dict::reverse(&current, &record.metadata);
+                current = Cow::Owned(value_dict::reverse(&current, &record.metadata));
             }
             TRANSFORM_TYPED_ENCODING => {
-                current = typed_encoding::reverse(&current, &record.metadata);
+                current = Cow::Owned(typed_encoding::reverse(&current, &record.metadata));
             }
             TRANSFORM_NESTED_FLATTEN => {
                 // Metadata: num_rows (u32 LE) + total_flat_cols (u16 LE) + nested_info.
@@ -202,12 +204,12 @@ pub fn reverse_preprocess(data: &[u8], chain: &TransformChain) -> Vec<u8> {
                 if let Some((nested_groups, _)) =
                     ndjson::deserialize_nested_info(&record.metadata[6..])
                 {
-                    current = ndjson::unflatten_nested_columns(
+                    current = Cow::Owned(ndjson::unflatten_nested_columns(
                         &current,
                         &nested_groups,
                         num_rows,
                         total_flat_cols,
-                    );
+                    ));
                 }
             }
@@ -215,7 +217,7 @@ pub fn reverse_preprocess(data: &[u8], chain: &TransformChain) -> Vec<u8> {
         }
     }
 
-    current
+    current.into_owned()
 }
 
 // --- Detection helpers (unchanged from Phase 0) ---
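The `Cow::Owned(...)` edits running through this file all implement one idea: keep borrowing the input until a transform actually produces new bytes, so data that passes through untouched is never copied (the old code paid a `data.to_vec()` up front in every case). A standalone sketch of the pattern, with a hypothetical transform standing in for the real ones:

```rust
use std::borrow::Cow;

// Hypothetical single-transform pipeline illustrating the Cow pattern,
// not the real TransformChain.
fn pipeline(data: &[u8], apply_transform: bool) -> Vec<u8> {
    // Start borrowed: zero-copy so far.
    let mut current: Cow<'_, [u8]> = Cow::Borrowed(data);
    if apply_transform {
        // The first allocation happens only when a transform actually fires.
        let transformed: Vec<u8> = current.iter().rev().copied().collect();
        current = Cow::Owned(transformed);
    }
    // Owned data moves out without a copy; still-borrowed data is copied
    // exactly once here, matching the old to_vec() cost only in the
    // untransformed case.
    current.into_owned()
}
```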