datacortex 0.4.3__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datacortex-0.4.3 → datacortex-0.5.0}/Cargo.lock +15 -4
- {datacortex-0.4.3 → datacortex-0.5.0}/Cargo.toml +6 -2
- {datacortex-0.4.3 → datacortex-0.5.0}/PKG-INFO +1 -1
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/Cargo.toml +2 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/README.md +63 -4
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/codec.rs +57 -9
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/format/mod.rs +21 -19
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/format/ndjson.rs +721 -91
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/format/schema.rs +10 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/format/typed_encoding.rs +248 -7
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/lib.rs +1 -1
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/engine.rs +15 -3
- {datacortex-0.4.3 → datacortex-0.5.0}/README.md +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/examples/dump_transform.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/examples/test_pipeline.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/examples/test_vdict.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/examples/trace_spacex.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/examples/validation_ab_test.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/dcx.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/entropy/arithmetic.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/entropy/mod.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/format/json.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/format/json_array.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/format/transform.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/format/value_dict.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/mixer/apm.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/mixer/dual_mixer.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/mixer/hierarchical_mixer.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/mixer/isse.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/mixer/logistic.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/mixer/meta_mixer.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/mixer/mod.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/mixer/multi_set_mixer.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/cm_model.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/dmc_model.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/gru_model.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/indirect_model.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/json_model.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/match_model.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/mod.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/neural_model.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/order0.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/ppm_model.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/run_model.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/sparse_model.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/model/word_model.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/state/context_map.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/state/mod.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/state/state_map.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/state/state_table.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/tests/roundtrip.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-python/Cargo.toml +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-python/README.md +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-python/src/lib.rs +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/pyproject.toml +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/python/datacortex/__init__.py +0 -0
- {datacortex-0.4.3 → datacortex-0.5.0}/python/datacortex/py.typed +0 -0
|
@@ -258,7 +258,7 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
|
|
258
258
|
|
|
259
259
|
[[package]]
|
|
260
260
|
name = "datacortex-cli"
|
|
261
|
-
version = "0.4.3"
|
|
261
|
+
version = "0.5.0"
|
|
262
262
|
dependencies = [
|
|
263
263
|
"clap",
|
|
264
264
|
"datacortex-core",
|
|
@@ -267,10 +267,12 @@ dependencies = [
|
|
|
267
267
|
|
|
268
268
|
[[package]]
|
|
269
269
|
name = "datacortex-core"
|
|
270
|
-
version = "0.4.3"
|
|
270
|
+
version = "0.5.0"
|
|
271
271
|
dependencies = [
|
|
272
272
|
"brotli",
|
|
273
273
|
"crc32fast",
|
|
274
|
+
"fsst-rs",
|
|
275
|
+
"memchr",
|
|
274
276
|
"rayon",
|
|
275
277
|
"serde",
|
|
276
278
|
"serde_json",
|
|
@@ -279,7 +281,7 @@ dependencies = [
|
|
|
279
281
|
|
|
280
282
|
[[package]]
|
|
281
283
|
name = "datacortex-neural"
|
|
282
|
-
version = "0.4.3"
|
|
284
|
+
version = "0.5.0"
|
|
283
285
|
dependencies = [
|
|
284
286
|
"encoding_rs",
|
|
285
287
|
"llama-cpp-2",
|
|
@@ -287,7 +289,7 @@ dependencies = [
|
|
|
287
289
|
|
|
288
290
|
[[package]]
|
|
289
291
|
name = "datacortex-python"
|
|
290
|
-
version = "0.4.3"
|
|
292
|
+
version = "0.5.0"
|
|
291
293
|
dependencies = [
|
|
292
294
|
"datacortex-core",
|
|
293
295
|
"pyo3",
|
|
@@ -343,6 +345,15 @@ dependencies = [
|
|
|
343
345
|
"glob",
|
|
344
346
|
]
|
|
345
347
|
|
|
348
|
+
[[package]]
|
|
349
|
+
name = "fsst-rs"
|
|
350
|
+
version = "0.5.9"
|
|
351
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
352
|
+
checksum = "cdf65e16e100438be0030d113042e07a62bed67203998640ca6fae0404eed71e"
|
|
353
|
+
dependencies = [
|
|
354
|
+
"rustc-hash",
|
|
355
|
+
]
|
|
356
|
+
|
|
346
357
|
[[package]]
|
|
347
358
|
name = "getrandom"
|
|
348
359
|
version = "0.3.4"
|
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
[workspace]
|
|
2
2
|
resolver = "2"
|
|
3
3
|
members = ["crates/datacortex-core", "crates/datacortex-python"]
|
|
4
|
+
exclude = [
|
|
5
|
+
"vector-plugin",
|
|
6
|
+
]
|
|
4
7
|
|
|
5
8
|
[workspace.package]
|
|
6
|
-
version = "0.4.3"
|
|
9
|
+
version = "0.5.0"
|
|
7
10
|
edition = "2024"
|
|
8
11
|
license = "MIT"
|
|
9
12
|
repository = "https://github.com/RushikeshMore/datacortex"
|
|
@@ -12,9 +15,10 @@ authors = ["Rushikesh More"]
|
|
|
12
15
|
rust-version = "1.85"
|
|
13
16
|
|
|
14
17
|
[workspace.dependencies]
|
|
15
|
-
datacortex-core = { path = "crates/datacortex-core", version = "0.4.3" }
|
|
18
|
+
datacortex-core = { path = "crates/datacortex-core", version = "0.5.0" }
|
|
16
19
|
datacortex-neural = { path = "crates/datacortex-neural" }
|
|
17
20
|
crc32fast = "1"
|
|
21
|
+
memchr = "2"
|
|
18
22
|
clap = { version = "4", features = ["derive"] }
|
|
19
23
|
serde = { version = "1", features = ["derive"] }
|
|
20
24
|
serde_json = "1"
|
|
@@ -14,11 +14,13 @@ readme = "README.md"
|
|
|
14
14
|
|
|
15
15
|
[dependencies]
|
|
16
16
|
crc32fast = { workspace = true }
|
|
17
|
+
memchr = { workspace = true }
|
|
17
18
|
serde = { workspace = true }
|
|
18
19
|
serde_json = { workspace = true }
|
|
19
20
|
zstd = { workspace = true }
|
|
20
21
|
brotli = { workspace = true }
|
|
21
22
|
rayon = { workspace = true }
|
|
23
|
+
fsst-rs = "0.5.5"
|
|
22
24
|
|
|
23
25
|
[features]
|
|
24
26
|
default = []
|
|
@@ -2,7 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
The best standalone JSON/NDJSON compressor. Beats zstd-19 and brotli-11 on every file tested.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
[Site](https://datacortex-dcx.vercel.app) | [crates.io](https://crates.io/crates/datacortex-cli) | [PyPI](https://pypi.org/project/datacortex/) | [Docs](https://github.com/rushikeshmore/DataCortex)
|
|
6
|
+
|
|
7
|
+
DataCortex auto-infers your JSON schema, applies columnar reorg + type-specific encoding, then picks the optimal entropy coder (zstd or brotli). No schema files, no database, no configuration. Just `datacortex compress data.json`.
|
|
6
8
|
|
|
7
9
|
## Benchmarks
|
|
8
10
|
|
|
@@ -23,16 +25,36 @@ On larger structured logs:
|
|
|
23
25
|
| k8s structured logs (100K rows) | 9.9 MB | **~40x** | 18.9x | **+113%** |
|
|
24
26
|
| nginx access logs (100K rows) | 9.5 MB | **~28x** | 17.3x | **+62%** |
|
|
25
27
|
|
|
26
|
-
> Higher is better. DataCortex wins on every file. Lossless
|
|
28
|
+
> Higher is better. DataCortex wins on every file. Lossless, byte-exact decompression guaranteed.
|
|
29
|
+
|
|
30
|
+
## Performance
|
|
31
|
+
|
|
32
|
+
Throughput on an Apple M-series chip (Fast mode, single run, release build):
|
|
33
|
+
|
|
34
|
+
| File | Size | Ratio | Encode | Decode |
|
|
35
|
+
|------|------|-------|--------|--------|
|
|
36
|
+
| NDJSON (10K rows) | 3.3 MB | 27.6x | 4.1 MB/s | 176 MB/s |
|
|
37
|
+
| GH Archive (diverse) | 10.0 MB | 7.8x | 3.2 MB/s | 574 MB/s |
|
|
38
|
+
| Twitter API | 617 KB | 19.7x | 2.3 MB/s | 384 MB/s |
|
|
39
|
+
| Event tickets | 1.7 MB | 221.6x | 8.6 MB/s | 1124 MB/s |
|
|
40
|
+
|
|
41
|
+
**Decode is near-instant** (176-1124 MB/s). Encode trades speed for 2x better compression vs zstd. For throughput-critical pipelines, DataCortex is best suited as a batch compressor for log storage, not a real-time codec.
|
|
42
|
+
|
|
43
|
+
Run `datacortex bench corpus/ -m fast --compare` to measure on your hardware.
|
|
27
44
|
|
|
28
45
|
## Installation
|
|
29
46
|
|
|
47
|
+
**Rust:**
|
|
30
48
|
```bash
|
|
31
49
|
cargo install datacortex-cli
|
|
32
50
|
```
|
|
33
51
|
|
|
34
|
-
|
|
52
|
+
**Python:**
|
|
53
|
+
```bash
|
|
54
|
+
pip install datacortex
|
|
55
|
+
```
|
|
35
56
|
|
|
57
|
+
**From source:**
|
|
36
58
|
```bash
|
|
37
59
|
git clone https://github.com/rushikeshmore/DataCortex
|
|
38
60
|
cd DataCortex
|
|
@@ -52,6 +74,17 @@ datacortex compress logs.ndjson -m fast # explicit fast mode
|
|
|
52
74
|
# Decompress
|
|
53
75
|
datacortex decompress data.dcx output.ndjson
|
|
54
76
|
|
|
77
|
+
# Streaming (pipe-friendly)
|
|
78
|
+
cat logs.ndjson | datacortex compress - -o compressed.dcx
|
|
79
|
+
datacortex decompress compressed.dcx -o - # stdout
|
|
80
|
+
|
|
81
|
+
# Chunked compression (for large NDJSON)
|
|
82
|
+
datacortex compress logs.ndjson -o out.dcx --chunk-rows 10000
|
|
83
|
+
|
|
84
|
+
# Custom dictionary (for known schemas)
|
|
85
|
+
datacortex train-dict corpus/*.ndjson --output my.dict
|
|
86
|
+
datacortex compress logs.ndjson --dict my.dict
|
|
87
|
+
|
|
55
88
|
# Benchmark against zstd
|
|
56
89
|
datacortex bench corpus/ -m fast --compare
|
|
57
90
|
|
|
@@ -74,10 +107,36 @@ datacortex info data.dcx
|
|
|
74
107
|
|
|
75
108
|
**Balanced/Max modes** use a bit-level context mixing engine with 13 specialized models. Better for general text but slower.
|
|
76
109
|
|
|
110
|
+
## Python
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
import datacortex
|
|
114
|
+
|
|
115
|
+
compressed = datacortex.compress(json_bytes, mode="fast")
|
|
116
|
+
original = datacortex.decompress(compressed)
|
|
117
|
+
|
|
118
|
+
# File-based
|
|
119
|
+
datacortex.compress_file("logs.ndjson", "logs.dcx", mode="fast")
|
|
120
|
+
datacortex.decompress_file("logs.dcx", "logs.json")
|
|
121
|
+
|
|
122
|
+
# Format detection
|
|
123
|
+
fmt = datacortex.detect_format(data) # "ndjson", "json", "generic"
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## How it works
|
|
127
|
+
|
|
128
|
+
1. **Format detection** - auto-identifies JSON, NDJSON, or generic data
|
|
129
|
+
2. **Schema inference** - discovers column types (integer, boolean, timestamp, enum, string, etc.)
|
|
130
|
+
3. **Columnar reorg** - transposes row-oriented NDJSON into column-oriented layout
|
|
131
|
+
4. **Type-specific encoding** - delta+varint for integers, bitmaps for booleans, epoch deltas for timestamps
|
|
132
|
+
5. **Auto-fallback** - tries 6+ compression paths (zstd, brotli, with/without preprocessing) and picks the smallest
|
|
133
|
+
|
|
134
|
+
No schema files. No configuration. Fully automatic.
|
|
135
|
+
|
|
77
136
|
## Development
|
|
78
137
|
|
|
79
138
|
```bash
|
|
80
|
-
cargo test #
|
|
139
|
+
cargo test # 390 tests
|
|
81
140
|
cargo clippy --all-targets -- -D warnings # lint (0 warnings)
|
|
82
141
|
cargo fmt --check # formatting
|
|
83
142
|
cargo build --release # optimized build
|
|
@@ -25,10 +25,22 @@ fn adaptive_fast_level(data_size: usize, level_override: Option<i32>) -> i32 {
|
|
|
25
25
|
if let Some(level) = level_override {
|
|
26
26
|
return level; // User explicitly set level, respect it
|
|
27
27
|
}
|
|
28
|
+
// Empirically, zstd levels 9-15 produce nearly identical ratios on
|
|
29
|
+
// structured JSON (btlazy2 strategy plateau). The meaningful jump
|
|
30
|
+
// happens at level 16+ (btultra strategy). Level 13 wastes encode
|
|
31
|
+
// time without ratio gain over level 9.
|
|
32
|
+
//
|
|
33
|
+
// DataCortex benchmarks against zstd-19. Our preprocessing adds
|
|
34
|
+
// ~3-5% on top, but we need internal zstd at level 17+ to beat
|
|
35
|
+
// raw zstd-19 on diverse data like GH Archive.
|
|
36
|
+
//
|
|
37
|
+
// Encode time impact: preprocessing (columnar reorg, schema inference)
|
|
38
|
+
// dominates. With rayon parallelism the zstd level cost is marginal.
|
|
39
|
+
// Decode is completely unaffected by compression level.
|
|
28
40
|
match data_size {
|
|
29
|
-
0..=
|
|
30
|
-
|
|
31
|
-
_ => 9,
|
|
41
|
+
0..=16_777_216 => 19, // ≤16MB: best ratio, <3s encode on 10MB
|
|
42
|
+
16_777_217..=67_108_864 => 16, // 16-64MB: btultra breakpoint, good ratio
|
|
43
|
+
_ => 9, // >64MB: skip 10-15 plateau, use fast
|
|
32
44
|
}
|
|
33
45
|
}
|
|
34
46
|
|
|
@@ -1814,20 +1826,30 @@ mod tests {
|
|
|
1814
1826
|
|
|
1815
1827
|
#[test]
|
|
1816
1828
|
fn test_adaptive_level_small_data() {
|
|
1817
|
-
//
|
|
1829
|
+
// ≤16MB should use level 19 — best ratio, preprocessing dominates encode time.
|
|
1818
1830
|
assert_eq!(adaptive_fast_level(100_000, None), 19);
|
|
1819
1831
|
assert_eq!(adaptive_fast_level(500_000, None), 19);
|
|
1820
1832
|
assert_eq!(adaptive_fast_level(1_048_576, None), 19);
|
|
1821
1833
|
assert_eq!(adaptive_fast_level(0, None), 19);
|
|
1822
1834
|
}
|
|
1823
1835
|
|
|
1836
|
+
#[test]
|
|
1837
|
+
fn test_adaptive_level_medium_data() {
|
|
1838
|
+
// 1-16MB still gets level 19 — zstd levels 9-15 are a plateau
|
|
1839
|
+
// (identical ratio on structured JSON), so we skip to 19.
|
|
1840
|
+
assert_eq!(adaptive_fast_level(1_048_577, None), 19);
|
|
1841
|
+
assert_eq!(adaptive_fast_level(5_000_000, None), 19);
|
|
1842
|
+
assert_eq!(adaptive_fast_level(10_485_760, None), 19);
|
|
1843
|
+
assert_eq!(adaptive_fast_level(16_777_216, None), 19);
|
|
1844
|
+
}
|
|
1845
|
+
|
|
1824
1846
|
#[test]
|
|
1825
1847
|
fn test_adaptive_level_large_data() {
|
|
1826
|
-
//
|
|
1827
|
-
assert_eq!(adaptive_fast_level(
|
|
1828
|
-
assert_eq!(adaptive_fast_level(
|
|
1829
|
-
assert_eq!(adaptive_fast_level(
|
|
1830
|
-
assert_eq!(adaptive_fast_level(
|
|
1848
|
+
// 16-64MB uses level 16 (btultra breakpoint), >64MB uses level 9.
|
|
1849
|
+
assert_eq!(adaptive_fast_level(16_777_217, None), 16);
|
|
1850
|
+
assert_eq!(adaptive_fast_level(33_554_432, None), 16);
|
|
1851
|
+
assert_eq!(adaptive_fast_level(67_108_864, None), 16);
|
|
1852
|
+
assert_eq!(adaptive_fast_level(67_108_865, None), 9);
|
|
1831
1853
|
assert_eq!(adaptive_fast_level(100_000_000, None), 9);
|
|
1832
1854
|
}
|
|
1833
1855
|
|
|
@@ -2439,4 +2461,30 @@ mod tests {
|
|
|
2439
2461
|
"null-heavy 30-row balanced mode roundtrip failed"
|
|
2440
2462
|
);
|
|
2441
2463
|
}
|
|
2464
|
+
|
|
2465
|
+
#[test]
|
|
2466
|
+
fn gharchive_selective_roundtrip() {
|
|
2467
|
+
// Verify GH Archive roundtrip with selective columnar transform.
|
|
2468
|
+
let path = concat!(
|
|
2469
|
+
env!("CARGO_MANIFEST_DIR"),
|
|
2470
|
+
"/../../corpus/json-bench/gharchive-10mb.ndjson"
|
|
2471
|
+
);
|
|
2472
|
+
let data = match std::fs::read(path) {
|
|
2473
|
+
Ok(d) => d,
|
|
2474
|
+
Err(_) => return, // Skip if corpus not available
|
|
2475
|
+
};
|
|
2476
|
+
let mut compressed = Vec::new();
|
|
2477
|
+
compress(
|
|
2478
|
+
&data,
|
|
2479
|
+
Mode::Fast,
|
|
2480
|
+
Some(crate::dcx::FormatHint::Ndjson),
|
|
2481
|
+
&mut compressed,
|
|
2482
|
+
)
|
|
2483
|
+
.unwrap();
|
|
2484
|
+
let decompressed = decompress(&mut std::io::Cursor::new(&compressed)).unwrap();
|
|
2485
|
+
assert_eq!(
|
|
2486
|
+
decompressed, data,
|
|
2487
|
+
"GH Archive selective columnar roundtrip failed"
|
|
2488
|
+
);
|
|
2489
|
+
}
|
|
2442
2490
|
}
|
|
@@ -11,6 +11,8 @@ pub mod transform;
|
|
|
11
11
|
pub mod typed_encoding;
|
|
12
12
|
pub mod value_dict;
|
|
13
13
|
|
|
14
|
+
use std::borrow::Cow;
|
|
15
|
+
|
|
14
16
|
use crate::dcx::{FormatHint, Mode};
|
|
15
17
|
use transform::{
|
|
16
18
|
TRANSFORM_JSON_ARRAY_COLUMNAR, TRANSFORM_JSON_KEY_INTERN, TRANSFORM_NDJSON_COLUMNAR,
|
|
@@ -54,7 +56,7 @@ pub fn detect_from_extension(path: &str) -> Option<FormatHint> {
|
|
|
54
56
|
/// (keys are already removed from the data stream by the columnar transform).
|
|
55
57
|
pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, TransformChain) {
|
|
56
58
|
let mut chain = TransformChain::new();
|
|
57
|
-
let mut current = data
|
|
59
|
+
let mut current: Cow<'_, [u8]> = Cow::Borrowed(data);
|
|
58
60
|
|
|
59
61
|
// Track whether a uniform columnar transform was applied (for value dict chaining).
|
|
60
62
|
// Uniform columnar = data is \x00/\x01-separated, downstream transforms are compatible.
|
|
@@ -70,7 +72,7 @@ pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, Tran
|
|
|
70
72
|
if let Some(result) = ndjson::preprocess(&current) {
|
|
71
73
|
let is_uniform_columnar = !result.metadata.is_empty() && result.metadata[0] == 1;
|
|
72
74
|
chain.push(TRANSFORM_NDJSON_COLUMNAR, result.metadata);
|
|
73
|
-
current = result.data;
|
|
75
|
+
current = Cow::Owned(result.data);
|
|
74
76
|
ndjson_transform_applied = true;
|
|
75
77
|
columnar_applied = is_uniform_columnar;
|
|
76
78
|
}
|
|
@@ -85,7 +87,7 @@ pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, Tran
|
|
|
85
87
|
if let Some(result) = json_array::preprocess(&current) {
|
|
86
88
|
let is_uniform = !result.metadata.is_empty() && result.metadata[0] == 1;
|
|
87
89
|
chain.push(TRANSFORM_JSON_ARRAY_COLUMNAR, result.metadata);
|
|
88
|
-
current = result.data;
|
|
90
|
+
current = Cow::Owned(result.data);
|
|
89
91
|
json_array_applied = true;
|
|
90
92
|
columnar_applied = is_uniform;
|
|
91
93
|
}
|
|
@@ -115,13 +117,13 @@ pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, Tran
|
|
|
115
117
|
num_rows,
|
|
116
118
|
total_flat_cols as usize,
|
|
117
119
|
);
|
|
118
|
-
if unflattened == current {
|
|
120
|
+
if unflattened == current.as_ref() {
|
|
119
121
|
let mut nested_meta = Vec::new();
|
|
120
122
|
nested_meta.extend_from_slice(&(num_rows as u32).to_le_bytes());
|
|
121
123
|
nested_meta.extend_from_slice(&total_flat_cols.to_le_bytes());
|
|
122
124
|
nested_meta.extend_from_slice(&ndjson::serialize_nested_info(&nested_groups));
|
|
123
125
|
chain.push(TRANSFORM_NESTED_FLATTEN, nested_meta);
|
|
124
|
-
current = flat_data;
|
|
126
|
+
current = Cow::Owned(flat_data);
|
|
125
127
|
}
|
|
126
128
|
// else: roundtrip not exact — skip nested flatten (data stays columnar
|
|
127
129
|
// without sub-column decomposition, still benefits from typed encoding
|
|
@@ -136,7 +138,7 @@ pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, Tran
|
|
|
136
138
|
if columnar_applied && mode == Mode::Fast {
|
|
137
139
|
if let Some(result) = typed_encoding::preprocess(&current) {
|
|
138
140
|
chain.push(TRANSFORM_TYPED_ENCODING, result.metadata);
|
|
139
|
-
current = result.data;
|
|
141
|
+
current = Cow::Owned(result.data);
|
|
140
142
|
}
|
|
141
143
|
}
|
|
142
144
|
|
|
@@ -150,12 +152,12 @@ pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, Tran
|
|
|
150
152
|
if columnar_applied {
|
|
151
153
|
if let Some(result) = value_dict::preprocess(&current) {
|
|
152
154
|
chain.push(TRANSFORM_VALUE_DICT, result.metadata);
|
|
153
|
-
current = result.data;
|
|
155
|
+
current = Cow::Owned(result.data);
|
|
154
156
|
}
|
|
155
157
|
}
|
|
156
158
|
|
|
157
159
|
if columnar_applied || ndjson_transform_applied || json_array_applied {
|
|
158
|
-
return (current, chain);
|
|
160
|
+
return (current.into_owned(), chain);
|
|
159
161
|
}
|
|
160
162
|
|
|
161
163
|
// JSON key interning: Balanced/Max only (hurts Fast mode due to zstd redundancy).
|
|
@@ -164,33 +166,33 @@ pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, Tran
|
|
|
164
166
|
&& let Some(result) = json::preprocess(&current)
|
|
165
167
|
{
|
|
166
168
|
chain.push(TRANSFORM_JSON_KEY_INTERN, result.metadata);
|
|
167
|
-
current = result.data;
|
|
169
|
+
current = Cow::Owned(result.data);
|
|
168
170
|
}
|
|
169
171
|
|
|
170
|
-
(current, chain)
|
|
172
|
+
(current.into_owned(), chain)
|
|
171
173
|
}
|
|
172
174
|
|
|
173
175
|
/// Reverse preprocessing transforms (applied in reverse order).
|
|
174
176
|
pub fn reverse_preprocess(data: &[u8], chain: &TransformChain) -> Vec<u8> {
|
|
175
|
-
let mut current = data
|
|
177
|
+
let mut current: Cow<'_, [u8]> = Cow::Borrowed(data);
|
|
176
178
|
|
|
177
179
|
// Apply in reverse order.
|
|
178
180
|
for record in chain.records.iter().rev() {
|
|
179
181
|
match record.id {
|
|
180
182
|
TRANSFORM_JSON_KEY_INTERN => {
|
|
181
|
-
current = json::reverse(&current, &record.metadata);
|
|
183
|
+
current = Cow::Owned(json::reverse(&current, &record.metadata));
|
|
182
184
|
}
|
|
183
185
|
TRANSFORM_NDJSON_COLUMNAR => {
|
|
184
|
-
current = ndjson::reverse(&current, &record.metadata);
|
|
186
|
+
current = Cow::Owned(ndjson::reverse(&current, &record.metadata));
|
|
185
187
|
}
|
|
186
188
|
TRANSFORM_JSON_ARRAY_COLUMNAR => {
|
|
187
|
-
current = json_array::reverse(&current, &record.metadata);
|
|
189
|
+
current = Cow::Owned(json_array::reverse(&current, &record.metadata));
|
|
188
190
|
}
|
|
189
191
|
TRANSFORM_VALUE_DICT => {
|
|
190
|
-
current = value_dict::reverse(&current, &record.metadata);
|
|
192
|
+
current = Cow::Owned(value_dict::reverse(&current, &record.metadata));
|
|
191
193
|
}
|
|
192
194
|
TRANSFORM_TYPED_ENCODING => {
|
|
193
|
-
current = typed_encoding::reverse(&current, &record.metadata);
|
|
195
|
+
current = Cow::Owned(typed_encoding::reverse(&current, &record.metadata));
|
|
194
196
|
}
|
|
195
197
|
TRANSFORM_NESTED_FLATTEN => {
|
|
196
198
|
// Metadata: num_rows (u32 LE) + total_flat_cols (u16 LE) + nested_info.
|
|
@@ -202,12 +204,12 @@ pub fn reverse_preprocess(data: &[u8], chain: &TransformChain) -> Vec<u8> {
|
|
|
202
204
|
if let Some((nested_groups, _)) =
|
|
203
205
|
ndjson::deserialize_nested_info(&record.metadata[6..])
|
|
204
206
|
{
|
|
205
|
-
current = ndjson::unflatten_nested_columns(
|
|
207
|
+
current = Cow::Owned(ndjson::unflatten_nested_columns(
|
|
206
208
|
&current,
|
|
207
209
|
&nested_groups,
|
|
208
210
|
num_rows,
|
|
209
211
|
total_flat_cols,
|
|
210
|
-
);
|
|
212
|
+
));
|
|
211
213
|
}
|
|
212
214
|
}
|
|
213
215
|
}
|
|
@@ -215,7 +217,7 @@ pub fn reverse_preprocess(data: &[u8], chain: &TransformChain) -> Vec<u8> {
|
|
|
215
217
|
}
|
|
216
218
|
}
|
|
217
219
|
|
|
218
|
-
current
|
|
220
|
+
current.into_owned()
|
|
219
221
|
}
|
|
220
222
|
|
|
221
223
|
// --- Detection helpers (unchanged from Phase 0) ---
|