pycrowley 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycrowley-0.1.0/.forgejo/workflows/ci.yml +157 -0
- pycrowley-0.1.0/.gitignore +4 -0
- pycrowley-0.1.0/Cargo.toml +33 -0
- pycrowley-0.1.0/LICENSE-APACHE +14 -0
- pycrowley-0.1.0/LICENSE-MIT +21 -0
- pycrowley-0.1.0/PKG-INFO +314 -0
- pycrowley-0.1.0/README.md +121 -0
- pycrowley-0.1.0/pyproject.toml +43 -0
- pycrowley-0.1.0/python/crowley/.forgejo/workflows/ci.yml +133 -0
- pycrowley-0.1.0/python/crowley/.gitignore +72 -0
- pycrowley-0.1.0/python/crowley/Cargo.lock +470 -0
- pycrowley-0.1.0/python/crowley/Cargo.toml +15 -0
- pycrowley-0.1.0/python/crowley/LICENSE-APACHE +17 -0
- pycrowley-0.1.0/python/crowley/LICENSE-MIT +21 -0
- pycrowley-0.1.0/python/crowley/MIGRATION_ijson2crowley.md +186 -0
- pycrowley-0.1.0/python/crowley/README.md +284 -0
- pycrowley-0.1.0/python/crowley/__init__.py +170 -0
- pycrowley-0.1.0/python/crowley/__init__.pyi +204 -0
- pycrowley-0.1.0/python/crowley/benchmarks/bench.py +491 -0
- pycrowley-0.1.0/python/crowley/benchmarks/bench_cached.py +190 -0
- pycrowley-0.1.0/python/crowley/benchmarks/bench_competitors.py +227 -0
- pycrowley-0.1.0/python/crowley/benchmarks/bench_expressive.py +352 -0
- pycrowley-0.1.0/python/crowley/benchmarks/bench_large.py +197 -0
- pycrowley-0.1.0/python/crowley/benchmarks/bench_multifile.py +215 -0
- pycrowley-0.1.0/python/crowley/benchmarks/bench_multifile_daily.py +139 -0
- pycrowley-0.1.0/python/crowley/benchmarks/new_bench_large.py +50 -0
- pycrowley-0.1.0/python/crowley/py.typed +0 -0
- pycrowley-0.1.0/python/crowley/src/lib.rs +712 -0
- pycrowley-0.1.0/python/crowley/tests/data.json +8 -0
- pycrowley-0.1.0/python/crowley/tests/test_query.py +492 -0
- pycrowley-0.1.0/src/error.rs +24 -0
- pycrowley-0.1.0/src/grep/mod.rs +7 -0
- pycrowley-0.1.0/src/grep/query/ast.rs +605 -0
- pycrowley-0.1.0/src/grep/query/common.rs +98 -0
- pycrowley-0.1.0/src/grep/query/dfa.rs +1646 -0
- pycrowley-0.1.0/src/grep/query/grammar/query.pest +100 -0
- pycrowley-0.1.0/src/grep/query/mod.rs +5 -0
- pycrowley-0.1.0/src/grep/query/nfa.rs +801 -0
- pycrowley-0.1.0/src/grep/query/parser.rs +767 -0
- pycrowley-0.1.0/src/lib.rs +53 -0
- pycrowley-0.1.0/src/multi.rs +340 -0
- pycrowley-0.1.0/src/parse/mod.rs +23 -0
- pycrowley-0.1.0/src/parse/read.rs +1193 -0
- pycrowley-0.1.0/src/query.rs +447 -0
- pycrowley-0.1.0/src/stream/engine.rs +1357 -0
- pycrowley-0.1.0/src/stream/mod.rs +1 -0
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
name: Build, test, and publish
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
workflow_dispatch:
|
|
5
|
+
push:
|
|
6
|
+
branches: [main]
|
|
7
|
+
tags: ["v*.*.*"]
|
|
8
|
+
pull_request:
|
|
9
|
+
branches: [main]
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
# ==================================================================
|
|
13
|
+
# Test: run Rust tests and Python tests
|
|
14
|
+
# ==================================================================
|
|
15
|
+
test:
|
|
16
|
+
name: Test
|
|
17
|
+
runs-on: codeberg-medium
|
|
18
|
+
steps:
|
|
19
|
+
- name: Checkout
|
|
20
|
+
uses: https://code.forgejo.org/actions/checkout@v4
|
|
21
|
+
|
|
22
|
+
- name: Install Rust
|
|
23
|
+
run: |
|
|
24
|
+
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
|
|
25
|
+
. "$HOME/.cargo/env"
|
|
26
|
+
|
|
27
|
+
- name: Set up Python
|
|
28
|
+
uses: https://code.forgejo.org/actions/setup-python@v5
|
|
29
|
+
with:
|
|
30
|
+
python-version: "3.12"
|
|
31
|
+
|
|
32
|
+
- name: Rust tests
|
|
33
|
+
run: |
|
|
34
|
+
. "$HOME/.cargo/env"
|
|
35
|
+
cargo test --lib --test public_api
|
|
36
|
+
|
|
37
|
+
- name: Rust benchmarks compile check
|
|
38
|
+
run: |
|
|
39
|
+
. "$HOME/.cargo/env"
|
|
40
|
+
cargo build --bench throughput
|
|
41
|
+
|
|
42
|
+
- name: Install Python deps and build wheel
|
|
43
|
+
run: |
|
|
44
|
+
. "$HOME/.cargo/env"
|
|
45
|
+
python3 -m venv .venv
|
|
46
|
+
.venv/bin/pip install maturin pytest
|
|
47
|
+
.venv/bin/maturin develop --manifest-path python/crowley/Cargo.toml
|
|
48
|
+
|
|
49
|
+
- name: Python tests
|
|
50
|
+
run: .venv/bin/pytest python/crowley/tests/test_query.py -v
|
|
51
|
+
|
|
52
|
+
# ==================================================================
|
|
53
|
+
# Build + publish: manylinux wheels (only on tag push)
|
|
54
|
+
#
|
|
55
|
+
# Each matrix entry builds one wheel and uploads it directly to PyPI.
|
|
56
|
+
# We do NOT use upload-artifact/download-artifact because:
|
|
57
|
+
# - The manylinux container has no Node.js, so JS-based actions fail
|
|
58
|
+
# - Forgejo's artifact v4 support requires a patched fork and is fragile
|
|
59
|
+
# Instead each job uploads its wheel independently via twine.
|
|
60
|
+
# --skip-existing ensures no conflicts between parallel uploads.
|
|
61
|
+
#
|
|
62
|
+
# Python versions in the manylinux_2_28 image live under /opt/python/
|
|
63
|
+
# with PEP 425 tag names like cp312-cp312. We resolve the path below.
|
|
64
|
+
# ==================================================================
|
|
65
|
+
build_wheels:
|
|
66
|
+
name: Build + publish wheel (Python ${{ matrix.python-version }})
|
|
67
|
+
runs-on: codeberg-medium
|
|
68
|
+
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
|
|
69
|
+
needs: test
|
|
70
|
+
container:
|
|
71
|
+
image: quay.io/pypa/manylinux_2_28_x86_64
|
|
72
|
+
strategy:
|
|
73
|
+
fail-fast: false
|
|
74
|
+
matrix:
|
|
75
|
+
# These must match CPython versions available in the manylinux image.
|
|
76
|
+
# As of early 2026: 3.10 through 3.14 are available.
|
|
77
|
+
# 3.15 is alpha-only and NOT in the image — add it when it ships.
|
|
78
|
+
python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
|
|
79
|
+
steps:
|
|
80
|
+
# No actions/checkout — it needs Node.js which manylinux lacks.
|
|
81
|
+
- name: Checkout
|
|
82
|
+
run: |
|
|
83
|
+
git clone --depth 1 --branch ${{ github.ref_name }} \
|
|
84
|
+
${{ github.server_url }}/${{ github.repository }} .
|
|
85
|
+
|
|
86
|
+
- name: Install Rust
|
|
87
|
+
run: |
|
|
88
|
+
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
|
|
89
|
+
. "$HOME/.cargo/env"
|
|
90
|
+
|
|
91
|
+
- name: Resolve Python path
|
|
92
|
+
run: |
|
|
93
|
+
# manylinux stores CPythons under /opt/python/cpXYZ-cpXYZ/bin/python
|
|
94
|
+
PYVER="${{ matrix.python-version }}"
|
|
95
|
+
PYTAG="cp${PYVER//./}"
|
|
96
|
+
PYBIN="/opt/python/${PYTAG}-${PYTAG}/bin"
|
|
97
|
+
if [ ! -d "$PYBIN" ]; then
|
|
98
|
+
echo "ERROR: Python $PYVER not found at $PYBIN"
|
|
99
|
+
echo "Available versions:"
|
|
100
|
+
ls /opt/python/
|
|
101
|
+
exit 1
|
|
102
|
+
fi
|
|
103
|
+
echo "PYBIN=$PYBIN" >> "$GITHUB_ENV"
|
|
104
|
+
|
|
105
|
+
- name: Build wheel
|
|
106
|
+
run: |
|
|
107
|
+
. "$HOME/.cargo/env"
|
|
108
|
+
${PYBIN}/python -m pip install maturin
|
|
109
|
+
${PYBIN}/python -m maturin build --release --strip --out dist \
|
|
110
|
+
-i ${PYBIN}/python
|
|
111
|
+
working-directory: python/crowley
|
|
112
|
+
|
|
113
|
+
# Upload directly — no artifact step, no Node.js dependency.
|
|
114
|
+
- name: Upload to PyPI
|
|
115
|
+
run: |
|
|
116
|
+
${PYBIN}/python -m pip install twine
|
|
117
|
+
${PYBIN}/python -m twine upload dist/* --skip-existing --verbose
|
|
118
|
+
working-directory: python/crowley
|
|
119
|
+
env:
|
|
120
|
+
TWINE_USERNAME: __token__
|
|
121
|
+
TWINE_PASSWORD: ${{ secrets.PYPI }}
|
|
122
|
+
|
|
123
|
+
# ==================================================================
|
|
124
|
+
# Build + publish: source distribution (only on tag push)
|
|
125
|
+
#
|
|
126
|
+
# This runs on the default runner image (not manylinux), so
|
|
127
|
+
# checkout, setup-python, and other JS-based actions work fine.
|
|
128
|
+
# ==================================================================
|
|
129
|
+
build_sdist:
|
|
130
|
+
name: Build + publish sdist
|
|
131
|
+
runs-on: codeberg-small
|
|
132
|
+
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
|
|
133
|
+
needs: test
|
|
134
|
+
steps:
|
|
135
|
+
- name: Checkout
|
|
136
|
+
uses: https://code.forgejo.org/actions/checkout@v4
|
|
137
|
+
|
|
138
|
+
- name: Install Rust
|
|
139
|
+
run: |
|
|
140
|
+
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
|
|
141
|
+
. "$HOME/.cargo/env"
|
|
142
|
+
|
|
143
|
+
- name: Set up Python
|
|
144
|
+
uses: https://code.forgejo.org/actions/setup-python@v5
|
|
145
|
+
with:
|
|
146
|
+
python-version: "3.12"
|
|
147
|
+
|
|
148
|
+
- name: Build and upload sdist
|
|
149
|
+
run: |
|
|
150
|
+
. "$HOME/.cargo/env"
|
|
151
|
+
pip install maturin twine
|
|
152
|
+
maturin sdist --out dist
|
|
153
|
+
twine upload dist/* --skip-existing --verbose
|
|
154
|
+
working-directory: python/crowley
|
|
155
|
+
env:
|
|
156
|
+
TWINE_USERNAME: __token__
|
|
157
|
+
TWINE_PASSWORD: ${{ secrets.PYPI }}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
[package]
|
|
2
|
+
name = "crowley_rs"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
edition = "2024"
|
|
5
|
+
description = "A high-performance streaming JSON query engine with DFA-based path matching"
|
|
6
|
+
license = "MIT OR Apache-2.0"
|
|
7
|
+
repository = "https://codeberg.org/nrposner/crowley"
|
|
8
|
+
homepage = "https://codeberg.org/nrposner/crowley"
|
|
9
|
+
keywords = ["json", "query", "streaming", "dfa", "search"]
|
|
10
|
+
categories = ["parser-implementations", "text-processing"]
|
|
11
|
+
readme = "README.md"
|
|
12
|
+
exclude = ["python/", "benches/", "tests/"]
|
|
13
|
+
|
|
14
|
+
[dependencies]
|
|
15
|
+
foldhash = "0.2.0"
|
|
16
|
+
glob = "0.3.3"
|
|
17
|
+
memchr = "2.8.0"
|
|
18
|
+
pest = "2.8.6"
|
|
19
|
+
pest_derive = "2.8.6"
|
|
20
|
+
rayon = "1.11.0"
|
|
21
|
+
regex = "1.12.3"
|
|
22
|
+
serde = { version = "1.0.228", features = ["derive", "rc"] }
|
|
23
|
+
serde_json = "1.0.149"
|
|
24
|
+
serde_json_borrow = "0.9.0"
|
|
25
|
+
thiserror = "2.0.18"
|
|
26
|
+
|
|
27
|
+
[dev-dependencies]
|
|
28
|
+
criterion = { version = "0.8.2", features = ["html_reports"] }
|
|
29
|
+
tempfile = "3.27.0"
|
|
30
|
+
|
|
31
|
+
[profile.bench]
|
|
32
|
+
codegen-units = 1
|
|
33
|
+
lto = "thin"
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
Copyright 2026 Nicolas Posner
|
|
2
|
+
|
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
you may not use this file except in compliance with the License.
|
|
5
|
+
You may obtain a copy of the License at
|
|
6
|
+
|
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
|
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
See the License for the specific language governing permissions and
|
|
13
|
+
limitations under the License.
|
|
14
|
+
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Nicolas Posner
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
pycrowley-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pycrowley
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Classifier: Development Status :: 4 - Beta
|
|
5
|
+
Classifier: Intended Audience :: Developers
|
|
6
|
+
Classifier: Intended Audience :: Science/Research
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
9
|
+
Classifier: Programming Language :: Rust
|
|
10
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
17
|
+
Classifier: Topic :: Text Processing
|
|
18
|
+
Classifier: Typing :: Typed
|
|
19
|
+
License-File: LICENSE-APACHE
|
|
20
|
+
License-File: LICENSE-MIT
|
|
21
|
+
Summary: A high-performance streaming JSON query engine for out-of-memory files
|
|
22
|
+
Keywords: json,query,streaming,search,big-data
|
|
23
|
+
License-Expression: MIT OR Apache-2.0
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
26
|
+
Project-URL: Documentation, https://codeberg.org/nrposner/crowley
|
|
27
|
+
Project-URL: Issues, https://codeberg.org/nrposner/crowley/issues
|
|
28
|
+
Project-URL: Repository, https://codeberg.org/nrposner/crowley
|
|
29
|
+
|
|
30
|
+
# crowley
|
|
31
|
+
|
|
32
|
+
A high-performance JSON querying engine designed for fast starts, low flat memory usage, and out-of-memory streaming.
|
|
33
|
+
|
|
34
|
+
It is primarily designed to substitute for `ijson`. If you're coming to `crowley` from `ijson`, see the IJSON Migration Guide.
|
|
35
|
+
|
|
36
|
+
Written in Rust, with a SAX-style JSON event parser adapted from the [`json-event-parser`](https://crates.io/crates/json-event-parser) crate and a regular expression query language adapted from the [`jsongrep`](https://crates.io/crates/jsongrep) crate.
|
|
37
|
+
|
|
38
|
+
## Use cases
|
|
39
|
+
|
|
40
|
+
`crowley` is optimized for the following scenarios:
|
|
41
|
+
|
|
42
|
+
- **Queries over files too large to fit comfortably in memory.** `crowley` streams through JSON files with bounded memory regardless of file size. A 37 GB file uses ~30 MB of RAM.
|
|
43
|
+
- **Queries on transient data.** `crowley` quickly queries data which do not merit transformation into a more easily-queried structure such as a database or dataframe, because of time constraints or because the data is sensitive and cannot be loaded into an external application.
|
|
44
|
+
- **Queries over heterogeneous, deeply-nested, and schemaless data** which tools such as `pandas`, `polars`, or `duckdb` cannot ingest and transform. `crowley`'s regular-language queries don't require schema inference.
|
|
45
|
+
- **Queries over many files in parallel.** `crowley` natively supports searching over many files with the same query, using either a list of file paths or a pattern match. These files will be searched in parallel more quickly and with less memory overhead than `ijson` with a ProcessPool.
|
|
46
|
+
|
|
47
|
+
## Usage
|
|
48
|
+
|
|
49
|
+
### Single-file search
|
|
50
|
+
```python
|
|
51
|
+
from crowley import Query
|
|
52
|
+
|
|
53
|
+
names = Query("data.json", "users[*].name")
|
|
54
|
+
ages = Query("data.json", "users[*].age")
|
|
55
|
+
|
|
56
|
+
names.count() # 4
|
|
57
|
+
names.exists() # True
|
|
58
|
+
names.values() # ['Alice', 'Bob', "Charlie", "Diana"]
|
|
59
|
+
ages.values() # [30, 25, 35, 28]
|
|
60
|
+
names.agg("sum") # nan
|
|
61
|
+
ages.agg("sum") # 118.0
|
|
62
|
+
names.types() # ['string']
|
|
63
|
+
ages.types() # ['number']
|
|
64
|
+
names.mode() # {'values': ['Alice', 'Bob', 'Charlie', 'Diana'], 'frequency': 1}
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Multi-file search
|
|
68
|
+
```python
|
|
69
|
+
from crowley import Query
|
|
70
|
+
|
|
71
|
+
repo_names = Query("tests/github_daily_jsonl/2015*", "[*].repo.name")
|
|
72
|
+
|
|
73
|
+
repo_names.count() # [7702, 7427, 7234, 7387, 8273, 8971, 10307, 11351, 11749, 11961, 12229, 12314, 6743, 12442, 13111, 12473, 11601, 5971, 5869, 5887, 8322, 7105, 6139, 6371]
|
|
74
|
+
repo_names.total_count() # 218939
|
|
75
|
+
repo_names.total_unique() # 65703
|
|
76
|
+
repo_names.mode()[0] # {'values': ['KenanSulayman/heartbeat'], 'frequency': 79}
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Query language
|
|
80
|
+
|
|
81
|
+
The [query language](https://github.com/micahkepe/jsongrep?tab=readme-ov-file#query-syntax) uses a regular-expression-inspired syntax for navigating JSON structure:
|
|
82
|
+
|
|
83
|
+
| Query | Meaning |
|
|
84
|
+
|-------|---------|
|
|
85
|
+
| `name` | Field `name` in the root object |
|
|
86
|
+
| `address.street` | Field `street` inside `address` |
|
|
87
|
+
| `users[*].name` | `name` field of every element in `users` array |
|
|
88
|
+
| `*` | Any field in the root object |
|
|
89
|
+
| `[*]` | Any element in the root array |
|
|
90
|
+
| `users[0]` | First element of `users` |
|
|
91
|
+
| `users[1:3]` | Elements at indices 1 and 2 |
|
|
92
|
+
| `(name \| age)` | Either `name` or `age` |
|
|
93
|
+
| `(* \| [*])*` | Any value at any depth (recursive descent) |
|
|
94
|
+
| `a?` | Returns the value of `a` if it exists
|
|
95
|
+
|
|
96
|
+
## Performance
|
|
97
|
+
|
|
98
|
+
Benchmarks measured on a Mac M3 Max with 32GB of RAM:
|
|
99
|
+
|
|
100
|
+
```
|
|
101
|
+
File: Flat GitHub log data, 34GB
|
|
102
|
+
Query: [*].repo.name
|
|
103
|
+
|
|
104
|
+
Count matches:
|
|
105
|
+
crowley: 71.6s
|
|
106
|
+
ijson: 128.8s
|
|
107
|
+
Difference: 1.8x
|
|
108
|
+
|
|
109
|
+
Return matches:
|
|
110
|
+
crowley: 116.0s
|
|
111
|
+
ijson: 126.1s
|
|
112
|
+
Difference: 1.09x
|
|
113
|
+
|
|
114
|
+
Return unique values:
|
|
115
|
+
crowley: 125.7s
|
|
116
|
+
ijson: 129.5s
|
|
117
|
+
Difference: 1.03x
|
|
118
|
+
|
|
119
|
+
Return unique count:
|
|
120
|
+
crowley: 122.1s
|
|
121
|
+
ijson: 129.5s
|
|
122
|
+
Difference: 1.06x
|
|
123
|
+
|
|
124
|
+
File: Nested GeoJSON, 30MB
|
|
125
|
+
Query: features[*].properties.name
|
|
126
|
+
|
|
127
|
+
Count matches:
|
|
128
|
+
crowley: 138.44ms
|
|
129
|
+
ijson: 421.85ms
|
|
130
|
+
Difference: 3.0x
|
|
131
|
+
|
|
132
|
+
Existence check (true):
|
|
133
|
+
crowley: 16µs
|
|
134
|
+
ijson: 793µs
|
|
135
|
+
Difference: 49x
|
|
136
|
+
|
|
137
|
+
Query: features[*].properties.scalerank
|
|
138
|
+
|
|
139
|
+
Sum matches:
|
|
140
|
+
crowley: 184.88ms
|
|
141
|
+
ijson: 425.89ms
|
|
142
|
+
Difference: 2.3x
|
|
143
|
+
|
|
144
|
+
Query: features[*].properties.nonexistent
|
|
145
|
+
|
|
146
|
+
Existence check (false):
|
|
147
|
+
crowley: 138.9ms
|
|
148
|
+
ijson: 409.7ms
|
|
149
|
+
Difference: 2.9x
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
On queries where the objective is to return values `crowley` outperforms `ijson` by 3-10%. In cases where a measure such as count or aggregate sum is returned, `crowley` can often outperform `ijson` by 2-3x by avoiding materializing values unnecessarily.
|
|
153
|
+
|
|
154
|
+
But the real benefit comes from `crowley`'s more expressive query language, which can efficiently express what would otherwise require Python loops around ijson.
|
|
155
|
+
|
|
156
|
+
It can extract multiple fields through disjunctions (at one or multiple levels) in a single pass without having to materialize the parent object:
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
# get the number of matching objects
|
|
160
|
+
# 133.6ms
|
|
161
|
+
crowley.Query(file_str, "features[*].properties.(name | admin)").count()
|
|
162
|
+
|
|
163
|
+
# get the number of unique matches
|
|
164
|
+
# 144.2ms
|
|
165
|
+
crowley.Query(file_str, "features[*].properties.(name | admin)").unique_values()
|
|
166
|
+
|
|
167
|
+
# get the number of matching objects
|
|
168
|
+
# 851.6ms
|
|
169
|
+
def ijson_two_passes():
|
|
170
|
+
with open(file_str, "rb") as f:
|
|
171
|
+
count1 = sum(1 for _ in ijson.items(f, "features.item.properties.name"))
|
|
172
|
+
with open(file_str, "rb") as f:
|
|
173
|
+
count2 = sum(1 for _ in ijson.items(f, "features.item.properties.admin"))
|
|
174
|
+
return count1 + count2
|
|
175
|
+
ijson_two_passes()
|
|
176
|
+
|
|
177
|
+
# get the number of unique matches
|
|
178
|
+
# 430ms
|
|
179
|
+
def ijson_two_fields():
|
|
180
|
+
names = set()
|
|
181
|
+
with open(file_str, "rb") as f:
|
|
182
|
+
for obj in ijson.items(f, "features.item.properties"):
|
|
183
|
+
if "name" in obj:
|
|
184
|
+
names.add(obj["name"])
|
|
185
|
+
if "admin" in obj:
|
|
186
|
+
names.add(obj["admin"])
|
|
187
|
+
return names
|
|
188
|
+
ijson_two_fields()
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
It can extract all property values without internal iteration:
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
# get the number of all matching property values by query
|
|
195
|
+
# 133.9ms
|
|
196
|
+
crowley.Query(file_str, "features[*].properties.*").count()
|
|
197
|
+
|
|
198
|
+
# get the number of all matching properties by internal iteration
|
|
199
|
+
# 427.9ms
|
|
200
|
+
def ijson_all_props():
|
|
201
|
+
count = 0
|
|
202
|
+
with open(file_str, "rb") as f:
|
|
203
|
+
for obj in ijson.items(f, "features.item.properties"):
|
|
204
|
+
count += len(obj)
|
|
205
|
+
return count
|
|
206
|
+
ijson_all_props()
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
It can select ranges of array elements without manual index checking:
|
|
210
|
+
|
|
211
|
+
- Note: this is one of the few places `crowley` can be slower under some conditions: if the array range is not at the root level, `ijson` + Python break logic can stop more quickly, while `crowley` must continue parsing the outer structure. For root-level array ranges, `crowley` remains faster. Attempting to use the same approach with `crowley` as with `ijson`, manually checking values and breaking out, makes crowley even slower, however.
|
|
212
|
+
|
|
213
|
+
```
|
|
214
|
+
Root-level array (github_array.json):
|
|
215
|
+
crowley [0:3]: 22µs (crowley terminates early more quickly)
|
|
216
|
+
ijson [0:3]+break: 234µs
|
|
217
|
+
Difference: 10.6x
|
|
218
|
+
|
|
219
|
+
crowley [97:102]: 464µs (crowley terminates early more quickly)
|
|
220
|
+
ijson [97:102]+break: 923µs
|
|
221
|
+
Difference: 1.98x
|
|
222
|
+
|
|
223
|
+
crowley [*] (full): 49.4ms
|
|
224
|
+
crowley [*]+break: 60.9ms
|
|
225
|
+
|
|
226
|
+
Nested array (ne_10m.json):
|
|
227
|
+
crowley [0:3]: 131.4ms
|
|
228
|
+
ijson [0:3]+break: 847µs (ijson is able to short-circuit faster!)
|
|
229
|
+
Difference: 0.006x
|
|
230
|
+
|
|
231
|
+
crowley [97:102]: 133.8ms
|
|
232
|
+
ijson [97:102]+break: 11.5ms (ijson is able to short-circuit faster!)
|
|
233
|
+
Difference: 0.086x
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
```python
|
|
237
|
+
# start of array
|
|
238
|
+
crowley.Query(file_str, "features[0:3].properties.name", no_seek=True).values()
|
|
239
|
+
|
|
240
|
+
# middle of array
|
|
241
|
+
crowley.Query(file_str, "features[97:102].properties.name", no_seek=True).values()
|
|
242
|
+
|
|
243
|
+
def ijson_range_start():
|
|
244
|
+
result = []
|
|
245
|
+
with open(file_str, "rb") as f:
|
|
246
|
+
for i, name in enumerate(ijson.items(f, "features.item.properties.name")):
|
|
247
|
+
if i < 3:
|
|
248
|
+
result.append(name)
|
|
249
|
+
else:
|
|
250
|
+
break
|
|
251
|
+
return result
|
|
252
|
+
ijson_range_start()
|
|
253
|
+
|
|
254
|
+
def ijson_range_mid():
|
|
255
|
+
result = []
|
|
256
|
+
with open(file_str, "rb") as f:
|
|
257
|
+
for i, name in enumerate(ijson.items(f, "features.item.properties.name")):
|
|
258
|
+
if 97 <= i < 102:
|
|
259
|
+
result.append(name)
|
|
260
|
+
if i >= 101:
|
|
261
|
+
break
|
|
262
|
+
return result
|
|
263
|
+
ijson_range_mid()
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
And can even descend recursively in a way that `ijson` simply cannot do: this would require a non-streaming solution like `json` that loads the whole file into memory.
|
|
267
|
+
|
|
268
|
+
```python
|
|
269
|
+
# get unique values of 'type' at any depth
|
|
270
|
+
# 221.8ms : ['FeatureCollection', 'name', 'Feature', 'Polygon']
|
|
271
|
+
crowley.Query(file_str, "(* | [*])*.type", no_seek=True).unique_values()
|
|
272
|
+
|
|
273
|
+
# get count of all matching objects at all depths
|
|
274
|
+
# 156.7ms : 17090
|
|
275
|
+
crowley.Query(file_str, "(* | [*])*.type", no_seek=True).count()
|
|
276
|
+
|
|
277
|
+
# walk the entire json tree manually looking for matching keys
|
|
278
|
+
# 509.8ms
|
|
279
|
+
import json
|
|
280
|
+
def json_recursive_search(key):
|
|
281
|
+
with open(file_str) as f:
|
|
282
|
+
data = json.load(f)
|
|
283
|
+
|
|
284
|
+
results = []
|
|
285
|
+
def walk(obj):
|
|
286
|
+
if isinstance(obj, dict):
|
|
287
|
+
for k, v in obj.items():
|
|
288
|
+
if k == key:
|
|
289
|
+
results.append(v)
|
|
290
|
+
walk(v)
|
|
291
|
+
elif isinstance(obj, list):
|
|
292
|
+
for item in obj:
|
|
293
|
+
walk(item)
|
|
294
|
+
walk(data)
|
|
295
|
+
return results
|
|
296
|
+
|
|
297
|
+
values = json_recursive_search("type")
|
|
298
|
+
unique = set(str(x) for x in values)
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
### Cold vs Hot Start
|
|
302
|
+
|
|
303
|
+
On cold starts (first query, no prior loading), `crowley` is **2-3x faster than pandas**, **3-7x faster than DuckDB**, and handles files that make Polars fail entirely due to schema inference errors.
|
|
304
|
+
|
|
305
|
+
On subsequent calls, methods such as `count()` or `exists()` return their pre-computed answer in **O(1)** with zero file I/O. Other methods like `types()` and `agg()` will determine whether reading only matched byte positions will be faster than a full sequential scan.
|
|
306
|
+
|
|
307
|
+
However, on very large files with a large volume of matches, the cached byte offsets for matches can considerably exceed the memory usage from streaming itself, and these offsets remain in the Query object until it is dropped. The query's cache can be manually cleared with `.clear_cache()`, and cache accumulation can be deactivated at query creation with the `no_seek=True` kwarg. This can be configured globally with `crowley.configure(no_seek=True)`.
|
|
308
|
+
|
|
309
|
+
## Acknowledgments
|
|
310
|
+
|
|
311
|
+
Built on the DFA-based query engine from [jsongrep](https://github.com/micahkepe/jsongrep) by Micah Kepe, and the SAX parser from [json-event-parser](https://github.com/oxigraph/json-event-parser) by the Oxigraph project.
|
|
312
|
+
|
|
313
|
+
This project benefits not only from the work of other developers, but also from their choice to make their source code public and freely re-usable under the MIT and Apache2.0 licenses.
|
|
314
|
+
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# crowley_rs
|
|
2
|
+
|
|
3
|
+
A high-performance JSON querying engine designed for fast starts, low flat memory usage, and out-of-memory streaming. It uses a custom SAX-style JSON event parser adapted from the [`json-event-parser`](https://crates.io/crates/json-event-parser) crate and a regular expression query language adapted from the [`jsongrep`](https://crates.io/crates/jsongrep) crate.
|
|
4
|
+
|
|
5
|
+
## Use cases
|
|
6
|
+
|
|
7
|
+
`crowley_rs` is optimized for the following scenarios:
|
|
8
|
+
|
|
9
|
+
- **Queries over files too large to fit comfortably in memory.** `crowley_rs` streams through JSON files with bounded memory regardless of file size. A 37 GB file uses ~30 MB of RAM.
|
|
10
|
+
- **Queries on transient data.** `crowley_rs` quickly queries data which do not merit transformation into a more easily-queried structure such as a database or dataframe, because of time constraints or because the data is sensitive and cannot be loaded into an external application.
|
|
11
|
+
- **Queries over heterogeneous, deeply-nested, and schemaless data** which tools such as `pandas`, `polars`, or `duckdb` cannot ingest and transform. `crowley_rs`'s path-based queries don't require schema inference.
|
|
12
|
+
- **Queries over many files in parallel.** `crowley` natively supports searching over many files with the same query, using either a list of file paths or a pattern match.
|
|
13
|
+
|
|
14
|
+
## Performance
|
|
15
|
+
|
|
16
|
+
Its closest streaming cousin is [`ijson`](https://pypi.org/project/ijson/), which it reliably beats in runtime performance by at least 2x on full scans, with a wider margin on deeply nested data. It is also less memory-hungry and starts up in <50 us as opposed to ~500 us.
|
|
17
|
+
|
|
18
|
+
If your `ijson` queries are a drag on other parts of your data pipeline and can be translated into the jsongrep regular query language, consider switching to `crowley_rs`.
|
|
19
|
+
|
|
20
|
+
On cold starts (first query, no prior loading), `crowley_rs` is **2-3x faster than pandas**, **3-7x faster than DuckDB**, and handles files that make Polars fail entirely due to schema inference errors.
|
|
21
|
+
|
|
22
|
+
Accumulating methods cache byte offsets after the first scan. Subsequent calls to `count()` and `exists()` return in **O(1)** with zero file I/O. Seek-based methods like `types()` and `agg()` read only the matched byte positions rather than re-scanning the entire file.
|
|
23
|
+
|
|
24
|
+
## Usage
|
|
25
|
+
|
|
26
|
+
```rust
|
|
27
|
+
use crowley_rs::{
|
|
28
|
+
Query, CrowleyError, Value,
|
|
29
|
+
AggMode, ModeResult, StreamMatch, PathType, JsonType,
|
|
30
|
+
format_path,
|
|
31
|
+
};
|
|
32
|
+
|
|
33
|
+
let mut q = Query::from_file("data.json", "users[*].name")?;
|
|
34
|
+
|
|
35
|
+
// Streaming modes — return iterators
|
|
36
|
+
let values: Vec<Value> = q.values()?.collect::<Result<_, _>>()?;
|
|
37
|
+
let paths: Vec<Vec<PathType>> = q.paths()?.collect::<Result<_, _>>()?;
|
|
38
|
+
let offsets: Vec<usize> = q.offsets()?.collect::<Result<_, _>>()?;
|
|
39
|
+
for m in q.contents()? {
|
|
40
|
+
let m: StreamMatch = m?;
|
|
41
|
+
println!("{} = {}", format_path(&m.path), m.value);
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
// Accumulating modes — return single results
|
|
45
|
+
let count: usize = q.count()?;
|
|
46
|
+
let exists: bool = q.exists()?;
|
|
47
|
+
let types: std::collections::HashSet<JsonType, _> = q.types()?;
|
|
48
|
+
let unique: Vec<Value> = q.unique_values()?;
|
|
49
|
+
|
|
50
|
+
// Numeric aggregations
|
|
51
|
+
let sum: f64 = q.agg(AggMode::Sum)?;
|
|
52
|
+
let min: f64 = q.agg(AggMode::Min)?;
|
|
53
|
+
let max: f64 = q.agg(AggMode::Max)?;
|
|
54
|
+
let mean: f64 = q.agg(AggMode::Mean)?;
|
|
55
|
+
|
|
56
|
+
// Most frequent value(s) — handles ties
|
|
57
|
+
let mode: ModeResult = q.mode()?;
|
|
58
|
+
// mode.values: Vec<Value>, mode.frequency: usize
|
|
59
|
+
|
|
60
|
+
// Disable seek-based caching (e.g. for dense matches or slow-seek storage)
|
|
61
|
+
let mut q = Query::from_file("data.json", "*")?.no_seek();
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
`crowley` also supports searching over multiple files in parallel using the MultiQuery struct.
|
|
65
|
+
|
|
66
|
+
## Query language
|
|
67
|
+
|
|
68
|
+
The [query language](https://github.com/micahkepe/jsongrep?tab=readme-ov-file#query-syntax) uses a regular-expression-inspired syntax for navigating JSON structure:
|
|
69
|
+
|
|
70
|
+
| Query | Meaning |
|
|
71
|
+
|-------|---------|
|
|
72
|
+
| `name` | Field `name` in the root object |
|
|
73
|
+
| `address.street` | Field `street` inside `address` |
|
|
74
|
+
| `users[*].name` | `name` field of every element in `users` array |
|
|
75
|
+
| `*` | Any field in the root object |
|
|
76
|
+
| `[*]` | Any element in the root array |
|
|
77
|
+
| `users[0]` | First element of `users` |
|
|
78
|
+
| `users[1:3]` | Elements at indices 1 and 2 |
|
|
79
|
+
| `(name \| age)` | Either `name` or `age` |
|
|
80
|
+
| `(* \| [*])*` | Any value at any depth (recursive descent) |
|
|
81
|
+
| `a?` | Returns the value of `a` if it exists
|
|
82
|
+
|
|
83
|
+
## Python bindings
|
|
84
|
+
|
|
85
|
+
Python bindings are available as the `crowley` package (separate PyPI distribution):
|
|
86
|
+
|
|
87
|
+
### Single-file search
|
|
88
|
+
```python
|
|
89
|
+
from crowley import Query
|
|
90
|
+
|
|
91
|
+
names = Query("data.json", "users[*].name")
|
|
92
|
+
ages = Query("data.json", "users[*].age")
|
|
93
|
+
|
|
94
|
+
names.count() # 4
|
|
95
|
+
names.exists() # True
|
|
96
|
+
names.values() # ['Alice', 'Bob', "Charlie", "Diana"]
|
|
97
|
+
ages.values() # [30, 25, 35, 28]
|
|
98
|
+
names.agg("sum") # nan
|
|
99
|
+
ages.agg("sum") # 118.0
|
|
100
|
+
names.types() # ['string']
|
|
101
|
+
ages.types() # ['number']
|
|
102
|
+
names.mode() # {'values': ['Alice', 'Bob', 'Charlie', 'Diana'], 'frequency': 1}
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### Multi-file search
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from crowley import Query
|
|
109
|
+
|
|
110
|
+
repo_names = Query("tests/github_daily_jsonl/2015*", "[*].repo.name")
|
|
111
|
+
|
|
112
|
+
repo_names.count() # [7702, 7427, 7234, 7387, 8273, 8971, 10307, 11351, 11749, 11961, 12229, 12314, 6743, 12442, 13111, 12473, 11601, 5971, 5869, 5887, 8322, 7105, 6139, 6371]
|
|
113
|
+
repo_names.total_count() # 218939
|
|
114
|
+
repo_names.total_unique() # 65703
|
|
115
|
+
repo_names.mode()[0] # {'values': ['KenanSulayman/heartbeat'], 'frequency': 79}
|
|
116
|
+
```
|
|
117
|
+
## Acknowledgments
|
|
118
|
+
|
|
119
|
+
Built on the DFA-based query engine from [jsongrep](https://github.com/micahkepe/jsongrep) by Micah Kepe, and the SAX parser from [json-event-parser](https://github.com/oxigraph/json-event-parser) by the Oxigraph project.
|
|
120
|
+
|
|
121
|
+
This project benefits not only from the work of other developers, but also from their choice to make their source code public and freely re-usable under the MIT and Apache 2.0 licenses.
|