pycrowley 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. pycrowley-0.1.0/.forgejo/workflows/ci.yml +157 -0
  2. pycrowley-0.1.0/.gitignore +4 -0
  3. pycrowley-0.1.0/Cargo.toml +33 -0
  4. pycrowley-0.1.0/LICENSE-APACHE +14 -0
  5. pycrowley-0.1.0/LICENSE-MIT +21 -0
  6. pycrowley-0.1.0/PKG-INFO +314 -0
  7. pycrowley-0.1.0/README.md +121 -0
  8. pycrowley-0.1.0/pyproject.toml +43 -0
  9. pycrowley-0.1.0/python/crowley/.forgejo/workflows/ci.yml +133 -0
  10. pycrowley-0.1.0/python/crowley/.gitignore +72 -0
  11. pycrowley-0.1.0/python/crowley/Cargo.lock +470 -0
  12. pycrowley-0.1.0/python/crowley/Cargo.toml +15 -0
  13. pycrowley-0.1.0/python/crowley/LICENSE-APACHE +17 -0
  14. pycrowley-0.1.0/python/crowley/LICENSE-MIT +21 -0
  15. pycrowley-0.1.0/python/crowley/MIGRATION_ijson2crowley.md +186 -0
  16. pycrowley-0.1.0/python/crowley/README.md +284 -0
  17. pycrowley-0.1.0/python/crowley/__init__.py +170 -0
  18. pycrowley-0.1.0/python/crowley/__init__.pyi +204 -0
  19. pycrowley-0.1.0/python/crowley/benchmarks/bench.py +491 -0
  20. pycrowley-0.1.0/python/crowley/benchmarks/bench_cached.py +190 -0
  21. pycrowley-0.1.0/python/crowley/benchmarks/bench_competitors.py +227 -0
  22. pycrowley-0.1.0/python/crowley/benchmarks/bench_expressive.py +352 -0
  23. pycrowley-0.1.0/python/crowley/benchmarks/bench_large.py +197 -0
  24. pycrowley-0.1.0/python/crowley/benchmarks/bench_multifile.py +215 -0
  25. pycrowley-0.1.0/python/crowley/benchmarks/bench_multifile_daily.py +139 -0
  26. pycrowley-0.1.0/python/crowley/benchmarks/new_bench_large.py +50 -0
  27. pycrowley-0.1.0/python/crowley/py.typed +0 -0
  28. pycrowley-0.1.0/python/crowley/src/lib.rs +712 -0
  29. pycrowley-0.1.0/python/crowley/tests/data.json +8 -0
  30. pycrowley-0.1.0/python/crowley/tests/test_query.py +492 -0
  31. pycrowley-0.1.0/src/error.rs +24 -0
  32. pycrowley-0.1.0/src/grep/mod.rs +7 -0
  33. pycrowley-0.1.0/src/grep/query/ast.rs +605 -0
  34. pycrowley-0.1.0/src/grep/query/common.rs +98 -0
  35. pycrowley-0.1.0/src/grep/query/dfa.rs +1646 -0
  36. pycrowley-0.1.0/src/grep/query/grammar/query.pest +100 -0
  37. pycrowley-0.1.0/src/grep/query/mod.rs +5 -0
  38. pycrowley-0.1.0/src/grep/query/nfa.rs +801 -0
  39. pycrowley-0.1.0/src/grep/query/parser.rs +767 -0
  40. pycrowley-0.1.0/src/lib.rs +53 -0
  41. pycrowley-0.1.0/src/multi.rs +340 -0
  42. pycrowley-0.1.0/src/parse/mod.rs +23 -0
  43. pycrowley-0.1.0/src/parse/read.rs +1193 -0
  44. pycrowley-0.1.0/src/query.rs +447 -0
  45. pycrowley-0.1.0/src/stream/engine.rs +1357 -0
  46. pycrowley-0.1.0/src/stream/mod.rs +1 -0
@@ -0,0 +1,157 @@
1
+ name: Build, test, and publish
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ push:
6
+ branches: [main]
7
+ tags: ["v*.*.*"]
8
+ pull_request:
9
+ branches: [main]
10
+
11
+ jobs:
12
+ # ==================================================================
13
+ # Test: run Rust tests and Python tests
14
+ # ==================================================================
15
+ test:
16
+ name: Test
17
+ runs-on: codeberg-medium
18
+ steps:
19
+ - name: Checkout
20
+ uses: https://code.forgejo.org/actions/checkout@v4
21
+
22
+ - name: Install Rust
23
+ run: |
24
+ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
25
+ . "$HOME/.cargo/env"
26
+
27
+ - name: Set up Python
28
+ uses: https://code.forgejo.org/actions/setup-python@v5
29
+ with:
30
+ python-version: "3.12"
31
+
32
+ - name: Rust tests
33
+ run: |
34
+ . "$HOME/.cargo/env"
35
+ cargo test --lib --test public_api
36
+
37
+ - name: Rust benchmarks compile check
38
+ run: |
39
+ . "$HOME/.cargo/env"
40
+ cargo build --bench throughput
41
+
42
+ - name: Install Python deps and build wheel
43
+ run: |
44
+ . "$HOME/.cargo/env"
45
+ python3 -m venv .venv
46
+ .venv/bin/pip install maturin pytest
47
+ .venv/bin/maturin develop --manifest-path python/crowley/Cargo.toml
48
+
49
+ - name: Python tests
50
+ run: .venv/bin/pytest python/crowley/tests/test_query.py -v
51
+
52
+ # ==================================================================
53
+ # Build + publish: manylinux wheels (only on tag push)
54
+ #
55
+ # Each matrix entry builds one wheel and uploads it directly to PyPI.
56
+ # We do NOT use upload-artifact/download-artifact because:
57
+ # - The manylinux container has no Node.js, so JS-based actions fail
58
+ # - Forgejo's artifact v4 support requires a patched fork and is fragile
59
+ # Instead each job uploads its wheel independently via twine.
60
+ # --skip-existing ensures no conflicts between parallel uploads.
61
+ #
62
+ # Python versions in the manylinux_2_28 image live under /opt/python/
63
+ # with PEP 425 tag names like cp312-cp312. We resolve the path below.
64
+ # ==================================================================
65
+ build_wheels:
66
+ name: Build + publish wheel (Python ${{ matrix.python-version }})
67
+ runs-on: codeberg-medium
68
+ if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
69
+ needs: test
70
+ container:
71
+ image: quay.io/pypa/manylinux_2_28_x86_64
72
+ strategy:
73
+ fail-fast: false
74
+ matrix:
75
+ # These must match CPython versions available in the manylinux image.
76
+ # As of early 2026: 3.10 through 3.14 are available.
77
+ # 3.15 is alpha-only and NOT in the image — add it when it ships.
78
+ python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
79
+ steps:
80
+ # No actions/checkout — it needs Node.js which manylinux lacks.
81
+ - name: Checkout
82
+ run: |
83
+ git clone --depth 1 --branch ${{ github.ref_name }} \
84
+ ${{ github.server_url }}/${{ github.repository }} .
85
+
86
+ - name: Install Rust
87
+ run: |
88
+ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
89
+ . "$HOME/.cargo/env"
90
+
91
+ - name: Resolve Python path
92
+ run: |
93
+ # manylinux stores CPythons under /opt/python/cpXYZ-cpXYZ/bin/python
94
+ PYVER="${{ matrix.python-version }}"
95
+ PYTAG="cp${PYVER//./}"
96
+ PYBIN="/opt/python/${PYTAG}-${PYTAG}/bin"
97
+ if [ ! -d "$PYBIN" ]; then
98
+ echo "ERROR: Python $PYVER not found at $PYBIN"
99
+ echo "Available versions:"
100
+ ls /opt/python/
101
+ exit 1
102
+ fi
103
+ echo "PYBIN=$PYBIN" >> "$GITHUB_ENV"
104
+
105
+ - name: Build wheel
106
+ run: |
107
+ . "$HOME/.cargo/env"
108
+ ${PYBIN}/python -m pip install maturin
109
+ ${PYBIN}/python -m maturin build --release --strip --out dist \
110
+ -i ${PYBIN}/python
111
+ working-directory: python/crowley
112
+
113
+ # Upload directly — no artifact step, no Node.js dependency.
114
+ - name: Upload to PyPI
115
+ run: |
116
+ ${PYBIN}/python -m pip install twine
117
+ ${PYBIN}/python -m twine upload dist/* --skip-existing --verbose
118
+ working-directory: python/crowley
119
+ env:
120
+ TWINE_USERNAME: __token__
121
+ TWINE_PASSWORD: ${{ secrets.PYPI }}
122
+
123
+ # ==================================================================
124
+ # Build + publish: source distribution (only on tag push)
125
+ #
126
+ # This runs on the default runner image (not manylinux), so
127
+ # checkout, setup-python, and other JS-based actions work fine.
128
+ # ==================================================================
129
+ build_sdist:
130
+ name: Build + publish sdist
131
+ runs-on: codeberg-small
132
+ if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
133
+ needs: test
134
+ steps:
135
+ - name: Checkout
136
+ uses: https://code.forgejo.org/actions/checkout@v4
137
+
138
+ - name: Install Rust
139
+ run: |
140
+ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
141
+ . "$HOME/.cargo/env"
142
+
143
+ - name: Set up Python
144
+ uses: https://code.forgejo.org/actions/setup-python@v5
145
+ with:
146
+ python-version: "3.12"
147
+
148
+ - name: Build and upload sdist
149
+ run: |
150
+ . "$HOME/.cargo/env"
151
+ pip install maturin twine
152
+ maturin sdist --out dist
153
+ twine upload dist/* --skip-existing --verbose
154
+ working-directory: python/crowley
155
+ env:
156
+ TWINE_USERNAME: __token__
157
+ TWINE_PASSWORD: ${{ secrets.PYPI }}
@@ -0,0 +1,4 @@
1
+ /target
2
+ /Cargo.lock
3
+ /python/crowley/tests/nobel.json
4
+ /python/crowley/Cargo.lock
@@ -0,0 +1,33 @@
1
+ [package]
2
+ name = "crowley_rs"
3
+ version = "0.1.0"
4
+ edition = "2024"
5
+ description = "A high-performance streaming JSON query engine with DFA-based path matching"
6
+ license = "MIT OR Apache-2.0"
7
+ repository = "https://codeberg.org/nrposner/crowley"
8
+ homepage = "https://codeberg.org/nrposner/crowley"
9
+ keywords = ["json", "query", "streaming", "dfa", "search"]
10
+ categories = ["parser-implementations", "text-processing"]
11
+ readme = "README.md"
12
+ exclude = ["python/", "benches/", "tests/"]
13
+
14
+ [dependencies]
15
+ foldhash = "0.2.0"
16
+ glob = "0.3.3"
17
+ memchr = "2.8.0"
18
+ pest = "2.8.6"
19
+ pest_derive = "2.8.6"
20
+ rayon = "1.11.0"
21
+ regex = "1.12.3"
22
+ serde = { version = "1.0.228", features = ["derive", "rc"] }
23
+ serde_json = "1.0.149"
24
+ serde_json_borrow = "0.9.0"
25
+ thiserror = "2.0.18"
26
+
27
+ [dev-dependencies]
28
+ criterion = { version = "0.8.2", features = ["html_reports"] }
29
+ tempfile = "3.27.0"
30
+
31
+ [profile.bench]
32
+ codegen-units = 1
33
+ lto = "thin"
@@ -0,0 +1,14 @@
1
+ Copyright 2026 Nicolas Posner
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Nicolas Posner
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,314 @@
1
+ Metadata-Version: 2.4
2
+ Name: pycrowley
3
+ Version: 0.1.0
4
+ Classifier: Development Status :: 4 - Beta
5
+ Classifier: Intended Audience :: Developers
6
+ Classifier: Intended Audience :: Science/Research
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: License :: OSI Approved :: Apache Software License
9
+ Classifier: Programming Language :: Rust
10
+ Classifier: Programming Language :: Python :: Implementation :: CPython
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Topic :: Software Development :: Libraries
17
+ Classifier: Topic :: Text Processing
18
+ Classifier: Typing :: Typed
19
+ License-File: LICENSE-APACHE
20
+ License-File: LICENSE-MIT
21
+ Summary: A high-performance streaming JSON query engine for out-of-memory files
22
+ Keywords: json,query,streaming,search,big-data
23
+ License-Expression: MIT OR Apache-2.0
24
+ Requires-Python: >=3.10
25
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
26
+ Project-URL: Documentation, https://codeberg.org/nrposner/crowley
27
+ Project-URL: Issues, https://codeberg.org/nrposner/crowley/issues
28
+ Project-URL: Repository, https://codeberg.org/nrposner/crowley
29
+
30
+ # crowley
31
+
32
+ A high-performance JSON querying engine designed for fast starts, low flat memory usage, and out-of-memory streaming.
33
+
34
+ It is primarily designed to substitute for `ijson`. If you're coming to `crowley` from `ijson`, see the IJSON Migration Guide.
35
+
36
+ Written in Rust, with a SAX-style JSON event parser adapted from the [`json-event-parser`](https://crates.io/crates/json-event-parser) crate and a regular expression query language adapted from the [`jsongrep`](https://crates.io/crates/jsongrep) crate.
37
+
38
+ ## Use cases
39
+
40
+ `crowley` is optimized for the following scenarios:
41
+
42
+ - **Queries over files too large to fit comfortably in memory.** `crowley` streams through JSON files with bounded memory regardless of file size. A 37 GB file uses ~30 MB of RAM.
43
+ - **Queries on transient data.** `crowley` quickly queries data which do not merit transformation into a more easily-queried structure such as a database or dataframe, because of time constraints or because the data is sensitive and cannot be loaded into an external application.
44
+ - **Queries over heterogeneous, deeply-nested, and schemaless data** which tools such as `pandas`, `polars`, or `duckdb` cannot ingest and transform. `crowley`'s regular-language queries don't require schema inference.
45
+ - **Queries over many files in parallel.** `crowley` natively supports searching over many files with the same query, using either a list of file paths or a pattern match. These files will be searched in parallel more quickly and with less memory overhead than `ijson` with a ProcessPool.
46
+
47
+ ## Usage
48
+
49
+ ### Single-file search
50
+ ```python
51
+ from crowley import Query
52
+
53
+ names = Query("data.json", "users[*].name")
54
+ ages = Query("data.json", "users[*].age")
55
+
56
+ names.count() # 4
57
+ names.exists() # True
58
+ names.values() # ['Alice', 'Bob', "Charlie", "Diana"]
59
+ ages.values() # [30, 25, 35, 28]
60
+ names.agg("sum") # nan
61
+ ages.agg("sum") # 118.0
62
+ names.types() # ['string']
63
+ ages.types() # ['number']
64
+ names.mode() # {'values': ['Alice', 'Bob', 'Charlie', 'Diana'], 'frequency': 1}
65
+ ```
66
+
67
+ ### Multi-file search
68
+ ```python
69
+ from crowley import Query
70
+
71
+ repo_names = Query("tests/github_daily_jsonl/2015*", "[*].repo.name")
72
+
73
+ repo_names.count() # [7702, 7427, 7234, 7387, 8273, 8971, 10307, 11351, 11749, 11961, 12229, 12314, 6743, 12442, 13111, 12473, 11601, 5971, 5869, 5887, 8322, 7105, 6139, 6371]
74
+ repo_names.total_count() # 218939
75
+ repo_names.total_unique() # 65703
76
+ repo_names.mode()[0] # {'values': ['KenanSulayman/heartbeat'], 'frequency': 79}
77
+ ```
78
+
79
+ ## Query language
80
+
81
+ The [query language](https://github.com/micahkepe/jsongrep?tab=readme-ov-file#query-syntax) uses a regular-expression-inspired syntax for navigating JSON structure:
82
+
83
+ | Query | Meaning |
84
+ |-------|---------|
85
+ | `name` | Field `name` in the root object |
86
+ | `address.street` | Field `street` inside `address` |
87
+ | `users[*].name` | `name` field of every element in `users` array |
88
+ | `*` | Any field in the root object |
89
+ | `[*]` | Any element in the root array |
90
+ | `users[0]` | First element of `users` |
91
+ | `users[1:3]` | Elements at indices 1 and 2 |
92
+ | `(name \| age)` | Either `name` or `age` |
93
+ | `(* \| [*])*` | Any value at any depth (recursive descent) |
94
+ | `a?` | Returns the value of `a` if it exists |
95
+
96
+ ## Performance
97
+
98
+ Benchmarks measured on a Mac M3 Max with 32GB of RAM:
99
+
100
+ ```
101
+ File: Flat GitHub log data, 34GB
102
+ Query: [*].repo.name
103
+
104
+ Count matches:
105
+ crowley: 71.6s
106
+ ijson: 128.8s
107
+ Difference: 1.8x
108
+
109
+ Return matches:
110
+ crowley: 116.0s
111
+ ijson: 126.1s
112
+ Difference: 1.09x
113
+
114
+ Return unique values:
115
+ crowley: 125.7s
116
+ ijson: 129.5s
117
+ Difference: 1.03x
118
+
119
+ Return unique count:
120
+ crowley: 122.1s
121
+ ijson: 129.5s
122
+ Difference: 1.06x
123
+
124
+ File: Nested GeoJSON, 30MB
125
+ Query: features[*].properties.name
126
+
127
+ Count matches:
128
+ crowley: 138.44ms
129
+ ijson: 421.85ms
130
+ Difference: 3.0x
131
+
132
+ Existence check (true):
133
+ crowley: 16µs
134
+ ijson: 793µs
135
+ Difference: 49x
136
+
137
+ Query: features[*].properties.scalerank
138
+
139
+ Sum matches:
140
+ crowley: 184.88ms
141
+ ijson: 425.89ms
142
+ Difference: 2.3x
143
+
144
+ Query: features[*].properties.nonexistent
145
+
146
+ Existence check (false):
147
+ crowley: 138.9ms
148
+ ijson: 409.7ms
149
+ Difference: 2.9x
150
+ ```
151
+
152
+ On queries where the objective is to return values `crowley` outperforms `ijson` by 3-10%. In cases where a measure such as count or aggregate sum is returned, `crowley` can often outperform `ijson` by 2-3x by avoiding materializing values unnecessarily.
153
+
154
+ But the real benefit comes from `crowley`'s more expressive query language, which can efficiently express what would otherwise require Python loops around ijson.
155
+
156
+ It can extract multiple fields through disjunctions (at one or multiple levels) in a single pass without having to materialize the parent object:
157
+
158
+ ```python
159
+ # get the number of matching objects
160
+ # 133.6ms
161
+ crowley.Query(file_str, "features[*].properties.(name | admin)").count()
162
+
163
+ # get the number of unique matches
164
+ # 144.2ms
165
+ crowley.Query(file_str, "features[*].properties.(name | admin)").unique_values()
166
+
167
+ # get the number of matching objects
168
+ # 851.6ms
169
+ def ijson_two_passes():
170
+ with open(file_str, "rb") as f:
171
+ count1 = sum(1 for _ in ijson.items(f, "features.item.properties.name"))
172
+ with open(file_str, "rb") as f:
173
+ count2 = sum(1 for _ in ijson.items(f, "features.item.properties.admin"))
174
+ return count1 + count2
175
+ ijson_two_passes()
176
+
177
+ # get the number of unique matches
178
+ # 430ms
179
+ def ijson_two_fields():
180
+ names = set()
181
+ with open(file_str, "rb") as f:
182
+ for obj in ijson.items(f, "features.item.properties"):
183
+ if "name" in obj:
184
+ names.add(obj["name"])
185
+ if "admin" in obj:
186
+ names.add(obj["admin"])
187
+ return names
188
+ ijson_two_fields()
189
+ ```
190
+
191
+ It can extract all property values without internal iteration:
192
+
193
+ ```python
194
+ # get the number of all matching property values by query
195
+ # 133.9ms
196
+ crowley.Query(file_str, "features[*].properties.*").count()
197
+
198
+ # get the number of all matching properties by internal iteration
199
+ # 427.9ms
200
+ def ijson_all_props():
201
+ count = 0
202
+ with open(file_str, "rb") as f:
203
+ for obj in ijson.items(f, "features.item.properties"):
204
+ count += len(obj)
205
+ return count
206
+ ijson_all_props()
207
+ ```
208
+
209
+ It can select ranges of array elements without manual index checking:
210
+
211
+ - Note: this is one of the few places `crowley` can be slower under some conditions: if the array range is not at the root level, `ijson` + Python break logic can stop more quickly, while `crowley` must continue parsing the outer structure. For root-level array ranges, `crowley` remains faster. Attempting to use the same approach with `crowley` as with `ijson`, manually checking values and breaking out, makes crowley even slower, however.
212
+
213
+ ```
214
+ Root-level array (github_array.json):
215
+ crowley [0:3]: 22µs (crowley terminates early more quickly)
216
+ ijson [0:3]+break: 234µs
217
+ Difference: 10.6x
218
+
219
+ crowley [97:102]: 464µs (crowley terminates early more quickly)
220
+ ijson [97:102]+break: 923 µs
221
+ Difference: 1.98x
222
+
223
+ crowley [*] (full): 49.4ms
224
+ crowley [*]+break: 60.9ms
225
+
226
+ Nested array (ne_10m.json):
227
+ crowley [0:3]: 131.4ms
228
+ ijson [0:3]+break: 847µs (ijson is able to short-circuit faster!)
229
+ Difference: 0.006x
230
+
231
+ crowley [97:102]: 133.8ms
232
+ ijson [97:102]+break: 11.5ms (ijson is able to short-circuit faster!)
233
+ Difference: 0.086x
234
+ ```
235
+
236
+ ```python
237
+ # start of array
238
+ crowley.Query(file_str, "features[0:3].properties.name", no_seek=True).values()
239
+
240
+ # middle of array
241
+ crowley.Query(file_str, "features[97:102].properties.name", no_seek=True).values()
242
+
243
+ def ijson_range_start():
244
+ result = []
245
+ with open(file_str, "rb") as f:
246
+ for i, name in enumerate(ijson.items(f, "features.item.properties.name")):
247
+ if i < 3:
248
+ result.append(name)
249
+ else:
250
+ break
251
+ return result
252
+ ijson_range_start()
253
+
254
+ def ijson_range_mid():
255
+ result = []
256
+ with open(file_str, "rb") as f:
257
+ for i, name in enumerate(ijson.items(f, "features.item.properties.name")):
258
+ if 97 <= i < 102:
259
+ result.append(name)
260
+ if i >= 101:
261
+ break
262
+ return result
263
+ ijson_range_mid()
264
+ ```
265
+
266
+ And can even descend recursively in a way that `ijson` simply cannot do: this would require a non-streaming solution like `json` that loads the whole file into memory.
267
+
268
+ ```python
269
+ # get unique values of 'type' at any depth
270
+ # 221.8ms : ['FeatureCollection', 'name', 'Feature', 'Polygon']
271
+ crowley.Query(file_str, "(* | [*])*.type", no_seek=True).unique_values()
272
+
273
+ # get count of all matching objects at all depths
274
+ # 156.7ms : 17090
275
+ crowley.Query(file_str, "(* | [*])*.type", no_seek=True).count()
276
+
277
+ # walk the entire json tree manually looking for matching keys
278
+ # 509.8ms
279
+ import json
280
+ def json_recursive_search(key):
281
+ with open(file_str) as f:
282
+ data = json.load(f)
283
+
284
+ results = []
285
+ def walk(obj):
286
+ if isinstance(obj, dict):
287
+ for k, v in obj.items():
288
+ if k == key:
289
+ results.append(v)
290
+ walk(v)
291
+ elif isinstance(obj, list):
292
+ for item in obj:
293
+ walk(item)
294
+ walk(data)
295
+ return results
296
+
297
+ values = json_recursive_search("type")
298
+ unique = set(str(x) for x in values)
299
+ ```
300
+
301
+ ### Cold vs Hot Start
302
+
303
+ On cold starts (first query, no prior loading), `crowley` is **2-3x faster than pandas**, **3-7x faster than DuckDB**, and handles files that make Polars fail entirely due to schema inference errors.
304
+
305
+ On subsequent calls, methods such as `count()` or `exists()` return their pre-computed answer in **O(1)** with zero file I/O. Other methods like `types()` and `agg()` will determine whether reading only matched byte positions will be faster than a full sequential scan.
306
+
307
+ However, on very large files with a large volume of matches, the cached byte offsets for matches can considerably exceed the memory usage from streaming itself, and these offsets remain in the Query object until it is dropped. The query's cache can be manually cleared with `.clear_cache()`, and cache accumulation can be deactivated at query creation with the `no_seek=True` kwarg. This can be configured globally with `crowley.configure(no_seek=True)`.
308
+
309
+ ## Acknowledgments
310
+
311
+ Built on the DFA-based query engine from [jsongrep](https://github.com/micahkepe/jsongrep) by Micah Kepe, and the SAX parser from [json-event-parser](https://github.com/oxigraph/json-event-parser) by the Oxigraph project.
312
+
313
+ This project benefits not only from the work of other developers, but also from their choice to make their source code public and freely re-usable under the MIT and Apache 2.0 licenses.
314
+
@@ -0,0 +1,121 @@
1
+ # crowley_rs
2
+
3
+ A high-performance JSON querying engine designed for fast starts, low flat memory usage, and out-of-memory streaming. It uses a custom SAX-style JSON event parser adapted from the [`json-event-parser`](https://crates.io/crates/json-event-parser) crate and a regular expression query language adapted from the [`jsongrep`](https://crates.io/crates/jsongrep) crate.
4
+
5
+ ## Use cases
6
+
7
+ `crowley_rs` is optimized for the following scenarios:
8
+
9
+ - **Queries over files too large to fit comfortably in memory.** `crowley_rs` streams through JSON files with bounded memory regardless of file size. A 37 GB file uses ~30 MB of RAM.
10
+ - **Queries on transient data.** `crowley_rs` quickly queries data which do not merit transformation into a more easily-queried structure such as a database or dataframe, because of time constraints or because the data is sensitive and cannot be loaded into an external application.
11
+ - **Queries over heterogeneous, deeply-nested, and schemaless data** which tools such as `pandas`, `polars`, or `duckdb` cannot ingest and transform. `crowley_rs`'s path-based queries don't require schema inference.
12
+ - **Queries over many files in parallel.** `crowley` natively supports searching over many files with the same query, using either a list of file paths or a pattern match.
13
+
14
+ ## Performance
15
+
16
+ Its closest streaming cousin is [`ijson`](https://pypi.org/project/ijson/), which it reliably beats in runtime performance by at least 2x on full scans, with a wider margin on deeply nested data. It is also less memory-hungry and starts up in <50 us as opposed to ~500 us.
17
+
18
+ If your `ijson` queries are a drag on other parts of your data pipeline and can be translated into the jsongrep regular query language, consider switching to `crowley_rs`.
19
+
20
+ On cold starts (first query, no prior loading), `crowley_rs` is **2-3x faster than pandas**, **3-7x faster than DuckDB**, and handles files that make Polars fail entirely due to schema inference errors.
21
+
22
+ Accumulating methods cache byte offsets after the first scan. Subsequent calls to `count()` and `exists()` return in **O(1)** with zero file I/O. Seek-based methods like `types()` and `agg()` read only the matched byte positions rather than re-scanning the entire file.
23
+
24
+ ## Usage
25
+
26
+ ```rust
27
+ use crowley_rs::{
28
+ Query, CrowleyError, Value,
29
+ AggMode, ModeResult, StreamMatch, PathType, JsonType,
30
+ format_path,
31
+ };
32
+
33
+ let mut q = Query::from_file("data.json", "users[*].name")?;
34
+
35
+ // Streaming modes — return iterators
36
+ let values: Vec<Value> = q.values()?.collect::<Result<_, _>>()?;
37
+ let paths: Vec<Vec<PathType>> = q.paths()?.collect::<Result<_, _>>()?;
38
+ let offsets: Vec<usize> = q.offsets()?.collect::<Result<_, _>>()?;
39
+ for m in q.contents()? {
40
+ let m: StreamMatch = m?;
41
+ println!("{} = {}", format_path(&m.path), m.value);
42
+ }
43
+
44
+ // Accumulating modes — return single results
45
+ let count: usize = q.count()?;
46
+ let exists: bool = q.exists()?;
47
+ let types: std::collections::HashSet<JsonType, _> = q.types()?;
48
+ let unique: Vec<Value> = q.unique_values()?;
49
+
50
+ // Numeric aggregations
51
+ let sum: f64 = q.agg(AggMode::Sum)?;
52
+ let min: f64 = q.agg(AggMode::Min)?;
53
+ let max: f64 = q.agg(AggMode::Max)?;
54
+ let mean: f64 = q.agg(AggMode::Mean)?;
55
+
56
+ // Most frequent value(s) — handles ties
57
+ let mode: ModeResult = q.mode()?;
58
+ // mode.values: Vec<Value>, mode.frequency: usize
59
+
60
+ // Disable seek-based caching (e.g. for dense matches or slow-seek storage)
61
+ let mut q = Query::from_file("data.json", "*")?.no_seek();
62
+ ```
63
+
64
+ `crowley` also supports searching over multiple files in parallel using the MultiQuery struct.
65
+
66
+ ## Query language
67
+
68
+ The [query language](https://github.com/micahkepe/jsongrep?tab=readme-ov-file#query-syntax) uses a regular-expression-inspired syntax for navigating JSON structure:
69
+
70
+ | Query | Meaning |
71
+ |-------|---------|
72
+ | `name` | Field `name` in the root object |
73
+ | `address.street` | Field `street` inside `address` |
74
+ | `users[*].name` | `name` field of every element in `users` array |
75
+ | `*` | Any field in the root object |
76
+ | `[*]` | Any element in the root array |
77
+ | `users[0]` | First element of `users` |
78
+ | `users[1:3]` | Elements at indices 1 and 2 |
79
+ | `(name \| age)` | Either `name` or `age` |
80
+ | `(* \| [*])*` | Any value at any depth (recursive descent) |
81
+ | `a?` | Returns the value of `a` if it exists |
82
+
83
+ ## Python bindings
84
+
85
+ Python bindings are available as the `crowley` package (separate PyPI distribution):
86
+
87
+ ### Single-file search
88
+ ```python
89
+ from crowley import Query
90
+
91
+ names = Query("data.json", "users[*].name")
92
+ ages = Query("data.json", "users[*].age")
93
+
94
+ names.count() # 4
95
+ names.exists() # True
96
+ names.values() # ['Alice', 'Bob', "Charlie", "Diana"]
97
+ ages.values() # [30, 25, 35, 28]
98
+ names.agg("sum") # nan
99
+ ages.agg("sum") # 118.0
100
+ names.types() # ['string']
101
+ ages.types() # ['number']
102
+ names.mode() # {'values': ['Alice', 'Bob', 'Charlie', 'Diana'], 'frequency': 1}
103
+ ```
104
+
105
+ ### Multi-file search
106
+
107
+ ```python
108
+ from crowley import Query
109
+
110
+ repo_names = Query("tests/github_daily_jsonl/2015*", "[*].repo.name")
111
+
112
+ repo_names.count() # [7702, 7427, 7234, 7387, 8273, 8971, 10307, 11351, 11749, 11961, 12229, 12314, 6743, 12442, 13111, 12473, 11601, 5971, 5869, 5887, 8322, 7105, 6139, 6371]
113
+ repo_names.total_count() # 218939
114
+ repo_names.total_unique() # 65703
115
+ repo_names.mode()[0] # {'values': ['KenanSulayman/heartbeat'], 'frequency': 79}
116
+ ```
117
+ ## Acknowledgments
118
+
119
+ Built on the DFA-based query engine from [jsongrep](https://github.com/micahkepe/jsongrep) by Micah Kepe, and the SAX parser from [json-event-parser](https://github.com/oxigraph/json-event-parser) by the Oxigraph project.
120
+
121
+ This project benefits not only from the work of other developers, but also from their choice to make their source code public and freely re-usable under the MIT and Apache2.0 licenses.