PyPI - datacortex - Versions diffs - 0.4.3__tar.gz → 0.5.0__tar.gz - Mend

datacortex 0.4.3 (tar.gz) → 0.5.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

{datacortex-0.4.3 → datacortex-0.5.0}/Cargo.lock RENAMED Viewed

@@ -258,7 +258,7 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
 [[package]]
 name = "datacortex-cli"
-version = "0.4.3"
+version = "0.5.0"
 dependencies = [
  "clap",
  "datacortex-core",
@@ -267,10 +267,12 @@ dependencies = [
 [[package]]
 name = "datacortex-core"
-version = "0.4.3"
+version = "0.5.0"
 dependencies = [
  "brotli",
  "crc32fast",
+ "fsst-rs",
+ "memchr",
  "rayon",
  "serde",
  "serde_json",
@@ -279,7 +281,7 @@ dependencies = [
 [[package]]
 name = "datacortex-neural"
-version = "0.4.3"
+version = "0.5.0"
 dependencies = [
  "encoding_rs",
  "llama-cpp-2",
@@ -287,7 +289,7 @@ dependencies = [
 [[package]]
 name = "datacortex-python"
-version = "0.4.3"
+version = "0.5.0"
 dependencies = [
  "datacortex-core",
  "pyo3",
@@ -343,6 +345,15 @@ dependencies = [
  "glob",
 ]
+[[package]]
+name = "fsst-rs"
+version = "0.5.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cdf65e16e100438be0030d113042e07a62bed67203998640ca6fae0404eed71e"
+dependencies = [
+ "rustc-hash",
+]
 [[package]]
 name = "getrandom"
 version = "0.3.4"

{datacortex-0.4.3 → datacortex-0.5.0}/Cargo.toml RENAMED Viewed

@@ -1,9 +1,12 @@
 [workspace]
 resolver = "2"
 members = ["crates/datacortex-core", "crates/datacortex-python"]
+exclude = [
+    "vector-plugin",
+]
 [workspace.package]
-version = "0.4.3"
+version = "0.5.0"
 edition = "2024"
 license = "MIT"
 repository = "https://github.com/RushikeshMore/datacortex"
@@ -12,9 +15,10 @@ authors = ["Rushikesh More"]
 rust-version = "1.85"
 [workspace.dependencies]
-datacortex-core = { path = "crates/datacortex-core", version = "0.4.3" }
+datacortex-core = { path = "crates/datacortex-core", version = "0.5.0" }
 datacortex-neural = { path = "crates/datacortex-neural" }
 crc32fast = "1"
+memchr = "2"
 clap = { version = "4", features = ["derive"] }
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"

{datacortex-0.4.3 → datacortex-0.5.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datacortex
-Version: 0.4.3
+Version: 0.5.0
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License

{datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/Cargo.toml RENAMED Viewed

@@ -14,11 +14,13 @@ readme = "README.md"
 [dependencies]
 crc32fast = { workspace = true }
+memchr = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
 zstd = { workspace = true }
 brotli = { workspace = true }
 rayon = { workspace = true }
+fsst-rs = "0.5.5"
 [features]
 default = []

{datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/README.md RENAMED Viewed

@@ -2,7 +2,9 @@
 The best standalone JSON/NDJSON compressor. Beats zstd-19 and brotli-11 on every file tested.
-DataCortex auto-infers your JSON schema, applies columnar reorg + type-specific encoding, then picks the optimal entropy coder (zstd or brotli). No schema files, no database, no configuration — just `datacortex compress data.json`.
+[Site](https://datacortex-dcx.vercel.app) | [crates.io](https://crates.io/crates/datacortex-cli) | [PyPI](https://pypi.org/project/datacortex/) | [Docs](https://github.com/rushikeshmore/DataCortex)
+DataCortex auto-infers your JSON schema, applies columnar reorg + type-specific encoding, then picks the optimal entropy coder (zstd or brotli). No schema files, no database, no configuration. Just `datacortex compress data.json`.
 ## Benchmarks
@@ -23,16 +25,36 @@ On larger structured logs:
 | k8s structured logs (100K rows) | 9.9 MB | **~40x** | 18.9x | **+113%** |
 | nginx access logs (100K rows) | 9.5 MB | **~28x** | 17.3x | **+62%** |
-> Higher is better. DataCortex wins on every file. Lossless — byte-exact decompression guaranteed.
+> Higher is better. DataCortex wins on every file. Lossless, byte-exact decompression guaranteed.
+## Performance
+Throughput on an Apple M-series chip (Fast mode, single run, release build):
+| File | Size | Ratio | Encode | Decode |
+|------|------|-------|--------|--------|
+| NDJSON (10K rows) | 3.3 MB | 27.6x | 4.1 MB/s | 176 MB/s |
+| GH Archive (diverse) | 10.0 MB | 7.8x | 3.2 MB/s | 574 MB/s |
+| Twitter API | 617 KB | 19.7x | 2.3 MB/s | 384 MB/s |
+| Event tickets | 1.7 MB | 221.6x | 8.6 MB/s | 1124 MB/s |
+**Decode is near-instant** (176-1124 MB/s). Encode trades speed for 2x better compression vs zstd. For throughput-critical pipelines, DataCortex is best suited as a batch compressor for log storage, not a real-time codec.
+Run `datacortex bench corpus/ -m fast --compare` to measure on your hardware.
 ## Installation
+**Rust:**
 ```bash
 cargo install datacortex-cli
 ```
-Or from source:
+**Python:**
+```bash
+pip install datacortex
+```
+**From source:**
 ```bash
 git clone https://github.com/rushikeshmore/DataCortex
 cd DataCortex
@@ -52,6 +74,17 @@ datacortex compress logs.ndjson -m fast          # explicit fast mode
 # Decompress
 datacortex decompress data.dcx output.ndjson
+# Streaming (pipe-friendly)
+cat logs.ndjson | datacortex compress - -o compressed.dcx
+datacortex decompress compressed.dcx -o -        # stdout
+# Chunked compression (for large NDJSON)
+datacortex compress logs.ndjson -o out.dcx --chunk-rows 10000
+# Custom dictionary (for known schemas)
+datacortex train-dict corpus/*.ndjson --output my.dict
+datacortex compress logs.ndjson --dict my.dict
 # Benchmark against zstd
 datacortex bench corpus/ -m fast --compare
@@ -74,10 +107,36 @@ datacortex info data.dcx
 **Balanced/Max modes** use a bit-level context mixing engine with 13 specialized models. Better for general text but slower.
+## Python
+```python
+import datacortex
+compressed = datacortex.compress(json_bytes, mode="fast")
+original = datacortex.decompress(compressed)
+# File-based
+datacortex.compress_file("logs.ndjson", "logs.dcx", mode="fast")
+datacortex.decompress_file("logs.dcx", "logs.json")
+# Format detection
+fmt = datacortex.detect_format(data)  # "ndjson", "json", "generic"
+```
+## How it works
+1. **Format detection** - auto-identifies JSON, NDJSON, or generic data
+2. **Schema inference** - discovers column types (integer, boolean, timestamp, enum, string, etc.)
+3. **Columnar reorg** - transposes row-oriented NDJSON into column-oriented layout
+4. **Type-specific encoding** - delta+varint for integers, bitmaps for booleans, epoch deltas for timestamps
+5. **Auto-fallback** - tries 6+ compression paths (zstd, brotli, with/without preprocessing) and picks the smallest
+No schema files. No configuration. Fully automatic.
 ## Development
 ```bash
-cargo test                                      # 374 tests
+cargo test                                      # 390 tests
 cargo clippy --all-targets -- -D warnings       # lint (0 warnings)
 cargo fmt --check                               # formatting
 cargo build --release                           # optimized build

{datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/codec.rs RENAMED Viewed

@@ -25,10 +25,22 @@ fn adaptive_fast_level(data_size: usize, level_override: Option<i32>) -> i32 {
     if let Some(level) = level_override {
         return level; // User explicitly set level, respect it
     }
+    // Empirically, zstd levels 9-15 produce nearly identical ratios on
+    // structured JSON (btlazy2 strategy plateau). The meaningful jump
+    // happens at level 16+ (btultra strategy). Level 13 wastes encode
+    // time without ratio gain over level 9.
+    //
+    // DataCortex benchmarks against zstd-19. Our preprocessing adds
+    // ~3-5% on top, but we need internal zstd at level 17+ to beat
+    // raw zstd-19 on diverse data like GH Archive.
+    //
+    // Encode time impact: preprocessing (columnar reorg, schema inference)
+    // dominates. With rayon parallelism the zstd level cost is marginal.
+    // Decode is completely unaffected by compression level.
     match data_size {
-        0..=1_048_576 => 19,          // <1MB: zstd-19 is <50ms, use best ratio
-        1_048_577..=10_485_760 => 13, // 1MB-10MB: good balance
-        _ => 9,                       // >10MB: use level 9 for speed
+        0..=16_777_216 => 19,          // ≤16MB: best ratio, <3s encode on 10MB
+        16_777_217..=67_108_864 => 16, // 16-64MB: btultra breakpoint, good ratio
+        _ => 9,                        // >64MB: skip 10-15 plateau, use fast
     }
 }
@@ -1814,20 +1826,30 @@ mod tests {
     #[test]
     fn test_adaptive_level_small_data() {
-        // <1MB should use level 19 (zstd-19 is <50ms on small data).
+        // ≤16MB should use level 19 — best ratio, preprocessing dominates encode time.
         assert_eq!(adaptive_fast_level(100_000, None), 19);
         assert_eq!(adaptive_fast_level(500_000, None), 19);
         assert_eq!(adaptive_fast_level(1_048_576, None), 19);
         assert_eq!(adaptive_fast_level(0, None), 19);
     }
+    #[test]
+    fn test_adaptive_level_medium_data() {
+        // 1-16MB still gets level 19 — zstd levels 9-15 are a plateau
+        // (identical ratio on structured JSON), so we skip to 19.
+        assert_eq!(adaptive_fast_level(1_048_577, None), 19);
+        assert_eq!(adaptive_fast_level(5_000_000, None), 19);
+        assert_eq!(adaptive_fast_level(10_485_760, None), 19);
+        assert_eq!(adaptive_fast_level(16_777_216, None), 19);
+    }
     #[test]
     fn test_adaptive_level_large_data() {
-        // 1MB-10MB should use level 13, >10MB should use level 9.
-        assert_eq!(adaptive_fast_level(1_048_577, None), 13);
-        assert_eq!(adaptive_fast_level(5_000_000, None), 13);
-        assert_eq!(adaptive_fast_level(10_485_760, None), 13);
-        assert_eq!(adaptive_fast_level(10_485_761, None), 9);
+        // 16-64MB uses level 16 (btultra breakpoint), >64MB uses level 9.
+        assert_eq!(adaptive_fast_level(16_777_217, None), 16);
+        assert_eq!(adaptive_fast_level(33_554_432, None), 16);
+        assert_eq!(adaptive_fast_level(67_108_864, None), 16);
+        assert_eq!(adaptive_fast_level(67_108_865, None), 9);
         assert_eq!(adaptive_fast_level(100_000_000, None), 9);
     }
@@ -2439,4 +2461,30 @@ mod tests {
             "null-heavy 30-row balanced mode roundtrip failed"
         );
     }
+    #[test]
+    fn gharchive_selective_roundtrip() {
+        // Verify GH Archive roundtrip with selective columnar transform.
+        let path = concat!(
+            env!("CARGO_MANIFEST_DIR"),
+            "/../../corpus/json-bench/gharchive-10mb.ndjson"
+        );
+        let data = match std::fs::read(path) {
+            Ok(d) => d,
+            Err(_) => return, // Skip if corpus not available
+        };
+        let mut compressed = Vec::new();
+        compress(
+            &data,
+            Mode::Fast,
+            Some(crate::dcx::FormatHint::Ndjson),
+            &mut compressed,
+        )
+        .unwrap();
+        let decompressed = decompress(&mut std::io::Cursor::new(&compressed)).unwrap();
+        assert_eq!(
+            decompressed, data,
+            "GH Archive selective columnar roundtrip failed"
+        );
+    }
 }

{datacortex-0.4.3 → datacortex-0.5.0}/crates/datacortex-core/src/format/mod.rs RENAMED Viewed

@@ -11,6 +11,8 @@ pub mod transform;
 pub mod typed_encoding;
 pub mod value_dict;
+use std::borrow::Cow;
 use crate::dcx::{FormatHint, Mode};
 use transform::{
     TRANSFORM_JSON_ARRAY_COLUMNAR, TRANSFORM_JSON_KEY_INTERN, TRANSFORM_NDJSON_COLUMNAR,
@@ -54,7 +56,7 @@ pub fn detect_from_extension(path: &str) -> Option<FormatHint> {
 /// (keys are already removed from the data stream by the columnar transform).
 pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, TransformChain) {
     let mut chain = TransformChain::new();
-    let mut current = data.to_vec();
+    let mut current: Cow<'_, [u8]> = Cow::Borrowed(data);
     // Track whether a uniform columnar transform was applied (for value dict chaining).
     // Uniform columnar = data is \x00/\x01-separated, downstream transforms are compatible.
@@ -70,7 +72,7 @@ pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, Tran
         if let Some(result) = ndjson::preprocess(&current) {
             let is_uniform_columnar = !result.metadata.is_empty() && result.metadata[0] == 1;
             chain.push(TRANSFORM_NDJSON_COLUMNAR, result.metadata);
-            current = result.data;
+            current = Cow::Owned(result.data);
             ndjson_transform_applied = true;
             columnar_applied = is_uniform_columnar;
         }
@@ -85,7 +87,7 @@ pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, Tran
         if let Some(result) = json_array::preprocess(&current) {
             let is_uniform = !result.metadata.is_empty() && result.metadata[0] == 1;
             chain.push(TRANSFORM_JSON_ARRAY_COLUMNAR, result.metadata);
-            current = result.data;
+            current = Cow::Owned(result.data);
             json_array_applied = true;
             columnar_applied = is_uniform;
         }
@@ -115,13 +117,13 @@ pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, Tran
                     num_rows,
                     total_flat_cols as usize,
                 );
-                if unflattened == current {
+                if unflattened == current.as_ref() {
                     let mut nested_meta = Vec::new();
                     nested_meta.extend_from_slice(&(num_rows as u32).to_le_bytes());
                     nested_meta.extend_from_slice(&total_flat_cols.to_le_bytes());
                     nested_meta.extend_from_slice(&ndjson::serialize_nested_info(&nested_groups));
                     chain.push(TRANSFORM_NESTED_FLATTEN, nested_meta);
-                    current = flat_data;
+                    current = Cow::Owned(flat_data);
                 }
                 // else: roundtrip not exact — skip nested flatten (data stays columnar
                 // without sub-column decomposition, still benefits from typed encoding
@@ -136,7 +138,7 @@ pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, Tran
     if columnar_applied && mode == Mode::Fast {
         if let Some(result) = typed_encoding::preprocess(&current) {
             chain.push(TRANSFORM_TYPED_ENCODING, result.metadata);
-            current = result.data;
+            current = Cow::Owned(result.data);
         }
     }
@@ -150,12 +152,12 @@ pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, Tran
     if columnar_applied {
         if let Some(result) = value_dict::preprocess(&current) {
             chain.push(TRANSFORM_VALUE_DICT, result.metadata);
-            current = result.data;
+            current = Cow::Owned(result.data);
         }
     }
     if columnar_applied || ndjson_transform_applied || json_array_applied {
-        return (current, chain);
+        return (current.into_owned(), chain);
     }
     // JSON key interning: Balanced/Max only (hurts Fast mode due to zstd redundancy).
@@ -164,33 +166,33 @@ pub fn preprocess(data: &[u8], format: FormatHint, mode: Mode) -> (Vec<u8>, Tran
         && let Some(result) = json::preprocess(&current)
     {
         chain.push(TRANSFORM_JSON_KEY_INTERN, result.metadata);
-        current = result.data;
+        current = Cow::Owned(result.data);
     }
-    (current, chain)
+    (current.into_owned(), chain)
 }
 /// Reverse preprocessing transforms (applied in reverse order).
 pub fn reverse_preprocess(data: &[u8], chain: &TransformChain) -> Vec<u8> {
-    let mut current = data.to_vec();
+    let mut current: Cow<'_, [u8]> = Cow::Borrowed(data);
     // Apply in reverse order.
     for record in chain.records.iter().rev() {
         match record.id {
             TRANSFORM_JSON_KEY_INTERN => {
-                current = json::reverse(&current, &record.metadata);
+                current = Cow::Owned(json::reverse(&current, &record.metadata));
             }
             TRANSFORM_NDJSON_COLUMNAR => {
-                current = ndjson::reverse(&current, &record.metadata);
+                current = Cow::Owned(ndjson::reverse(&current, &record.metadata));
             }
             TRANSFORM_JSON_ARRAY_COLUMNAR => {
-                current = json_array::reverse(&current, &record.metadata);
+                current = Cow::Owned(json_array::reverse(&current, &record.metadata));
             }
             TRANSFORM_VALUE_DICT => {
-                current = value_dict::reverse(&current, &record.metadata);
+                current = Cow::Owned(value_dict::reverse(&current, &record.metadata));
             }
             TRANSFORM_TYPED_ENCODING => {
-                current = typed_encoding::reverse(&current, &record.metadata);
+                current = Cow::Owned(typed_encoding::reverse(&current, &record.metadata));
             }
             TRANSFORM_NESTED_FLATTEN => {
                 // Metadata: num_rows (u32 LE) + total_flat_cols (u16 LE) + nested_info.
@@ -202,12 +204,12 @@ pub fn reverse_preprocess(data: &[u8], chain: &TransformChain) -> Vec<u8> {
                     if let Some((nested_groups, _)) =
                         ndjson::deserialize_nested_info(&record.metadata[6..])
                     {
-                        current = ndjson::unflatten_nested_columns(
+                        current = Cow::Owned(ndjson::unflatten_nested_columns(
                             &current,
                             &nested_groups,
                             num_rows,
                             total_flat_cols,
-                        );
+                        ));
                     }
                 }
             }
@@ -215,7 +217,7 @@ pub fn reverse_preprocess(data: &[u8], chain: &TransformChain) -> Vec<u8> {
         }
     }
-    current
+    current.into_owned()
 }
 // --- Detection helpers (unchanged from Phase 0) ---

datacortex 0.4.3__tar.gz → 0.5.0__tar.gz

datacortex 0.4.3 (tar.gz) → 0.5.0 (tar.gz)