weji-goose 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- weji_goose-0.3.0/LICENSE +21 -0
- weji_goose-0.3.0/PKG-INFO +289 -0
- weji_goose-0.3.0/README.md +231 -0
- weji_goose-0.3.0/goose_pg/__init__.py +8 -0
- weji_goose-0.3.0/goose_pg/cli.py +341 -0
- weji_goose-0.3.0/goose_pg/exporter.py +365 -0
- weji_goose-0.3.0/goose_pg/profiler.py +668 -0
- weji_goose-0.3.0/goose_pg/type_map.py +247 -0
- weji_goose-0.3.0/pyproject.toml +57 -0
- weji_goose-0.3.0/setup.cfg +4 -0
- weji_goose-0.3.0/tests/test_advisor.py +340 -0
- weji_goose-0.3.0/tests/test_bloom.py +181 -0
- weji_goose-0.3.0/tests/test_cli.py +127 -0
- weji_goose-0.3.0/tests/test_column.py +197 -0
- weji_goose-0.3.0/tests/test_mmap.py +208 -0
- weji_goose-0.3.0/tests/test_optimizers.py +690 -0
- weji_goose-0.3.0/tests/test_query.py +865 -0
- weji_goose-0.3.0/tests/test_roaring.py +527 -0
- weji_goose-0.3.0/tests/test_table.py +595 -0
- weji_goose-0.3.0/tests/test_zstd_dict.py +187 -0
- weji_goose-0.3.0/weji_goose/__init__.py +41 -0
- weji_goose-0.3.0/weji_goose/advisor.py +951 -0
- weji_goose-0.3.0/weji_goose/block_bloom.py +163 -0
- weji_goose-0.3.0/weji_goose/block_index.py +214 -0
- weji_goose-0.3.0/weji_goose/bloom.py +191 -0
- weji_goose-0.3.0/weji_goose/column.py +905 -0
- weji_goose-0.3.0/weji_goose/correlation.py +210 -0
- weji_goose-0.3.0/weji_goose/exceptions.py +76 -0
- weji_goose-0.3.0/weji_goose/feedback.py +208 -0
- weji_goose-0.3.0/weji_goose/query.py +1226 -0
- weji_goose-0.3.0/weji_goose/reader.py +763 -0
- weji_goose-0.3.0/weji_goose/roaring.py +672 -0
- weji_goose-0.3.0/weji_goose/schema.py +219 -0
- weji_goose-0.3.0/weji_goose/sketch.py +225 -0
- weji_goose-0.3.0/weji_goose/table.py +546 -0
- weji_goose-0.3.0/weji_goose/writer.py +688 -0
- weji_goose-0.3.0/weji_goose.egg-info/PKG-INFO +289 -0
- weji_goose-0.3.0/weji_goose.egg-info/SOURCES.txt +40 -0
- weji_goose-0.3.0/weji_goose.egg-info/dependency_links.txt +1 -0
- weji_goose-0.3.0/weji_goose.egg-info/entry_points.txt +2 -0
- weji_goose-0.3.0/weji_goose.egg-info/requires.txt +12 -0
- weji_goose-0.3.0/weji_goose.egg-info/top_level.txt +2 -0
weji_goose-0.3.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 WEJI Northern Technologies Inc.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: weji-goose
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Goose — a columnar storage engine with 10 specialized encodings, roaring-bitmap predicate pushdown, bloom-filter partition skipping, zone-map pruning, probabilistic sketches, and an SQL query advisor. Ships with a PostgreSQL profiler/exporter CLI.
|
|
5
|
+
Author-email: Gary Lucas <garylucas@bluedrop.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 WEJI Northern Technologies Inc.
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
Project-URL: Homepage, https://github.com/wejinortherntechnologiesinc/Goose
|
|
28
|
+
Project-URL: Repository, https://github.com/wejinortherntechnologiesinc/Goose
|
|
29
|
+
Project-URL: Issues, https://github.com/wejinortherntechnologiesinc/Goose/issues
|
|
30
|
+
Keywords: columnar,compression,database,analytics,predicate-pushdown,postgres,roaring-bitmap,bloom-filter
|
|
31
|
+
Classifier: Development Status :: 4 - Beta
|
|
32
|
+
Classifier: Intended Audience :: Developers
|
|
33
|
+
Classifier: Intended Audience :: Science/Research
|
|
34
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
35
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
36
|
+
Classifier: Operating System :: MacOS
|
|
37
|
+
Classifier: Programming Language :: Python :: 3
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
42
|
+
Classifier: Topic :: Database
|
|
43
|
+
Classifier: Topic :: Database :: Front-Ends
|
|
44
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
45
|
+
Requires-Python: >=3.10
|
|
46
|
+
Description-Content-Type: text/markdown
|
|
47
|
+
License-File: LICENSE
|
|
48
|
+
Requires-Dist: numpy>=1.24
|
|
49
|
+
Requires-Dist: zstandard>=0.20
|
|
50
|
+
Requires-Dist: sqlglot>=25.0
|
|
51
|
+
Provides-Extra: dev
|
|
52
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
53
|
+
Provides-Extra: sql
|
|
54
|
+
Requires-Dist: sqlglot>=25.0; extra == "sql"
|
|
55
|
+
Provides-Extra: pg
|
|
56
|
+
Requires-Dist: psycopg2-binary>=2.9; extra == "pg"
|
|
57
|
+
Dynamic: license-file
|
|
58
|
+
|
|
59
|
+
# Goose
|
|
60
|
+
|
|
61
|
+
> A columnar storage engine for analytical workloads — 10 specialized column encodings, roaring-bitmap predicate pushdown, bloom-filter partition skipping, zone-map pruning, probabilistic sketches, and an SQL query advisor. Ships with a PostgreSQL profiler/exporter CLI.
|
|
62
|
+
|
|
63
|
+
[CI](https://github.com/wejinortherntechnologiesinc/Goose/actions/workflows/ci.yml)
|
|
64
|
+
[License: MIT](LICENSE)
|
|
65
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
66
|
+
|
|
67
|
+
Goose is a **universal booster**: a column-encoding and predicate-pushdown layer you can use three ways —
|
|
68
|
+
|
|
69
|
+
1. **As a Python library** — point it at data, let it pick the best encoding per column, write a compressed columnar store, and query it with predicate pushdown.
|
|
70
|
+
2. **As a PostgreSQL cold tier** — profile a Postgres table, export it to Goose, and query the compressed copy (CDC keeps them in sync).
|
|
71
|
+
3. **As an embedded encoding library** — link the C (`libgoose.a`) or Rust (`goose-encoding`) reference implementation into another database to get encoding intelligence and predicate pushdown without changing your storage engine.
|
|
72
|
+
|
|
73
|
+
The binary format, encoding selection algorithm, and predicate evaluation logic are specified language-agnostically in [`reference/SPEC.md`](reference/SPEC.md), so any database can implement a compatible reader/writer.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Why
|
|
78
|
+
|
|
79
|
+
PostgreSQL is great at OLTP and terrible at analytical scans over years of history. Goose takes the analytical workload off Postgres: a `verb_id` column with 6 distinct values compresses **58×**; a monotonic timestamp column compresses **10.8×** with delta+ZSTD (`delta_zstd`); a sparse boolean flag becomes a roaring bitmap you can filter *without decoding the column*. Predicate pushdown then skips whole partitions using bloom filters and zone maps before touching row data.
|
|
80
|
+
|
|
81
|
+
## Features
|
|
82
|
+
|
|
83
|
+
- **10 column encodings** — `raw`, `bitpacked`, `dictionary`, `offset_blob`, `delta_zstd`, `zstd`, `for` (frame-of-reference), `for_zstd`, `roaring` (roaring-bitmap booleans), `zstd_dict` (shared trained dictionary).
|
|
84
|
+
- **Predicate pushdown** — `eq`, `neq`, `gt`, `gte`, `lt`, `lte`, `in`, compound `and`/`or`. Each predicate produces a roaring bitmap of matching rows; predicates are intersected, never materializing the full column.
|
|
85
|
+
- **Partition skipping** — per-partition and per-block bloom filters + zone maps let the reader rule out entire partitions before decode.
|
|
86
|
+
- **Adaptive selectivity feedback** — observed predicate selectivities reorder filters so the most selective (cheapest) predicates run first.
|
|
87
|
+
- **Cross-column correlation statistics** — propagate partition pruning across correlated columns.
|
|
88
|
+
- **Probabilistic sketches** — approximate answers (count-distinct, etc.) for fast exploration.
|
|
89
|
+
- **Atomic writes + CRC32 checksums** — every file is written to a `.tmp` then renamed; partition manifests carry checksums, verified on read.
|
|
90
|
+
- **SQL query advisor** — `weji_goose.advisor.optimize()` takes a SQL `WHERE` clause, reorders predicates by selectivity, and returns optimized SQL + a structured predicate tree for `GooseTable.query()`.
|
|
91
|
+
- **PostgreSQL integration** — `goose-pg` CLI to profile, export, benchmark, and query.
|
|
92
|
+
|
|
93
|
+
## Install
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
pip install weji-goose # core engine (numpy, zstandard, sqlglot)
|
|
97
|
+
pip install "weji-goose[pg]" # + psycopg2-binary for the goose-pg CLI
|
|
98
|
+
pip install "weji-goose[dev]" # + pytest for running the test suite
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
> **Note:** the PyPI distribution is `weji-goose`; the import name is `import weji_goose`.
|
|
102
|
+
|
|
103
|
+
Requires Python ≥ 3.10.
|
|
104
|
+
|
|
105
|
+
## Quick start — Python library
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
import numpy as np
|
|
109
|
+
from weji_goose.schema import TableSchema
|
|
110
|
+
from weji_goose.column import ColumnSpec, ColumnType, Encoding
|
|
111
|
+
from weji_goose.table import GooseTable
|
|
112
|
+
from weji_goose.query import Predicate, CompoundPredicate
|
|
113
|
+
|
|
114
|
+
schema = TableSchema("demo", columns=[
|
|
115
|
+
ColumnSpec("id", ColumnType.INT64, Encoding.DELTA_ZSTD), # monotonic → delta
|
|
116
|
+
ColumnSpec("flag", ColumnType.BOOLEAN, Encoding.ROARING), # sparse bool → bitmap
|
|
117
|
+
ColumnSpec("city", ColumnType.TEXT, Encoding.DICTIONARY), # low-card text → dict
|
|
118
|
+
])
|
|
119
|
+
|
|
120
|
+
# Create and write
|
|
121
|
+
table = GooseTable.create(schema, "./demo_table")
|
|
122
|
+
table.insert({
|
|
123
|
+
"id": np.arange(1_000_000, dtype=np.int64),
|
|
124
|
+
"flag": np.array([i % 5 == 0 for i in range(1_000_000)], dtype=bool),
|
|
125
|
+
"city": np.array(["stjohns", "corner", "bay"] * 333_333 + ["stjohns"], dtype=object),
|
|
126
|
+
})
|
|
127
|
+
|
|
128
|
+
# Predicate pushdown: filter without decoding the columns
|
|
129
|
+
table = GooseTable.open("./demo_table")
|
|
130
|
+
result = table.query(
|
|
131
|
+
["city", "id"],
|
|
132
|
+
where=CompoundPredicate("and", [
|
|
133
|
+
Predicate("flag", "eq", True),
|
|
134
|
+
Predicate("id", "gte", 999_000),
|
|
135
|
+
]),
|
|
136
|
+
)
|
|
137
|
+
print(result["city"][:5], result["id"][:5])
|
|
138
|
+
print("on-disk size:", table.total_size_bytes(), "bytes")
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### Supported types
|
|
142
|
+
|
|
143
|
+
`int64`, `int32`, `int16`, `float64`, `float32`, `boolean`, `uuid`, `text`, `interval`.
|
|
144
|
+
|
|
145
|
+
## Quick start — PostgreSQL cold tier (`goose-pg`)
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
# 1. Profile a table → schema + per-column compression estimates
|
|
149
|
+
goose-pg profile \
|
|
150
|
+
--db-url "postgresql://user:pass@host/db" \
|
|
151
|
+
--table sensor_readings \
|
|
152
|
+
--output ./sensor_readings_schema.json -v
|
|
153
|
+
|
|
154
|
+
# 2. Export the table to Goose format (auto-partitioned, resumable)
|
|
155
|
+
goose-pg export \
|
|
156
|
+
--db-url "postgresql://user:pass@host/db" \
|
|
157
|
+
--table sensor_readings \
|
|
158
|
+
--output-dir ./goose_data \
|
|
159
|
+
--partition-column recorded_at --partition-interval month --resume -v
|
|
160
|
+
|
|
161
|
+
# 3. Benchmark: profile + export + measured compression ratio
|
|
162
|
+
goose-pg benchmark \
|
|
163
|
+
--db-url "postgresql://user:pass@host/db" \
|
|
164
|
+
--table sensor_readings \
|
|
165
|
+
--output-dir ./bench
|
|
166
|
+
|
|
167
|
+
# 4. Query the compressed Goose data with predicate pushdown
|
|
168
|
+
goose-pg query \
|
|
169
|
+
--goose-dir ./goose_data/sensor_readings \
|
|
170
|
+
--columns recorded_at,pm25 \
|
|
171
|
+
--where "pm25 >= 35 AND region = 'bay_st_george'" \
|
|
172
|
+
--format csv
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
`goose-pg query --where` parses the SQL `WHERE` clause through the Goose advisor (using the opened table's schema) and pushes it down. Output formats: `table` (default), `csv`, `json`.
|
|
176
|
+
|
|
177
|
+
A `docker-compose.yml` is included for integration testing against a throwaway Postgres. The `goose` service is profile-gated under `cli`, so pass `--profile cli` to invoke it:
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
docker compose up -d postgres
|
|
181
|
+
docker compose --profile cli run --rm goose --help
|
|
182
|
+
docker compose --profile cli run --rm goose query --goose-dir ./goose_data/mytable --columns id,name
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## The query advisor
|
|
186
|
+
|
|
187
|
+
`weji_goose.advisor.optimize(sql, schema)` reorders `WHERE` predicates by selectivity — roaring-bitmap lookups first, then bloom-filtered ID lookups, then range predicates — and returns optimized SQL plus a predicate tree:
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
from weji_goose.advisor import optimize, SCHEMAS
|
|
191
|
+
|
|
192
|
+
oq = optimize(
|
|
193
|
+
"SELECT id, verb_id FROM xapi_events "
|
|
194
|
+
"WHERE verb_id IN ('completed', 'passed') AND actor_id = 42",
|
|
195
|
+
schema=SCHEMAS["xapi_events"],
|
|
196
|
+
)
|
|
197
|
+
print(oq.optimized_sql)
|
|
198
|
+
# SELECT id, verb_id
|
|
199
|
+
# FROM xapi_events
|
|
200
|
+
# WHERE actor_id = 42 AND verb_id IN ('completed', 'passed')
|
|
201
|
+
|
|
202
|
+
print(oq.predicate_json)
|
|
203
|
+
# {'op': 'and', 'predicates': [
|
|
204
|
+
# {'column': 'actor_id', 'op': 'eq', 'value': 42},
|
|
205
|
+
# {'column': 'verb_id', 'op': 'in', 'value': ['completed', 'passed']}]}
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
`OptimizedQuery` exposes: `original_sql`, `optimized_sql`, `predicate_json`, `table_name`, `selected_columns`, `warnings`.
|
|
209
|
+
|
|
210
|
+
## Encodings
|
|
211
|
+
|
|
212
|
+
| Encoding | Best for |
|
|
213
|
+
|--------------|------------------------------------------------------|
|
|
214
|
+
| `raw` | High-cardinality, incompressible numeric data |
|
|
215
|
+
| `bitpacked` | Dense booleans (8 values/byte) |
|
|
216
|
+
| `dictionary` | Low-cardinality text/categorical (≈ < 50k distinct) |
|
|
217
|
+
| `offset_blob`| Variable-length blobs with a dictionary offset index |
|
|
218
|
+
| `delta_zstd` | Monotonic / near-sequential int64 (timestamps, IDs) |
|
|
219
|
+
| `zstd` | Generic high-entropy compressible data |
|
|
220
|
+
| `for` | Frame-of-reference: clustered int ranges |
|
|
221
|
+
| `for_zstd` | FOR + ZSTD for clustered ints that still compress |
|
|
222
|
+
| `roaring` | Sparse booleans — queryable without column decode |
|
|
223
|
+
| `zstd_dict` | Repeated text patterns via a shared trained dictionary |
|
|
224
|
+
|
|
225
|
+
## Benchmarks
|
|
226
|
+
|
|
227
|
+
On a 10,000-row synthetic sample (`weji_goose/benchmarks/benchmark_results.json`, reproducible via `python -m weji_goose.benchmarks.demo`):
|
|
228
|
+
|
|
229
|
+
| Metric | Value |
|
|
230
|
+
|------------------------------|-----------------|
|
|
231
|
+
| Overall compression vs PG | **3.83×** |
|
|
232
|
+
| Write throughput | ~10,000 rows/s |
|
|
233
|
+
| Full-scan throughput | ~3.96M rows/s |
|
|
234
|
+
| Best per-column compression | `verb_id` → **58.5×** (dictionary, 6 distinct) |
|
|
235
|
+
|
|
236
|
+
| Column | Encoding | Compression |
|
|
237
|
+
|----------------------|-------------|-------------|
|
|
238
|
+
| `verb_id` | dictionary | 58.5× |
|
|
239
|
+
| `context_org` | dictionary | 14.8× |
|
|
240
|
+
| `stored` | delta_zstd | 10.8× |
|
|
241
|
+
| `actor_id` | for | 8.0× |
|
|
242
|
+
| `object_id` | dictionary | 7.4× |
|
|
243
|
+
|
|
244
|
+
## Project layout
|
|
245
|
+
|
|
246
|
+
```
|
|
247
|
+
weji_goose/ core engine (import as `import weji_goose`)
|
|
248
|
+
schema.py TableSchema, ColumnType, Encoding enums
|
|
249
|
+
table.py GooseTable: create / open / insert / query / scan
|
|
250
|
+
query.py predicate pushdown engine, Predicate / CompoundPredicate
|
|
251
|
+
advisor.py SQL WHERE → optimized predicates
|
|
252
|
+
reader.py writer.py checksummed, atomic, mmap-backed I/O
|
|
253
|
+
bloom.py roaring.py partition skipping + row bitmaps
|
|
254
|
+
sketch.py probabilistic sketches
|
|
255
|
+
correlation.py cross-column pruning stats
|
|
256
|
+
goose_pg/ PostgreSQL integration
|
|
257
|
+
cli.py goose-pg CLI: profile / export / benchmark / query
|
|
258
|
+
profiler.py schema introspection + data profiling
|
|
259
|
+
exporter.py bulk export to Goose partitions
|
|
260
|
+
type_map.py PostgreSQL → Goose type/encoding mapping
|
|
261
|
+
reference/
|
|
262
|
+
SPEC.md language-agnostic binary format + algorithm spec
|
|
263
|
+
c/ C reference (libgoose.a / libgoose.so + tests)
|
|
264
|
+
rust/ Rust reference crate (goose-encoding)
|
|
265
|
+
tests/ 272 tests
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
## Reference implementations & spec
|
|
269
|
+
|
|
270
|
+
Goose's on-disk format and algorithms are defined in [`reference/SPEC.md`](reference/SPEC.md) so any database can implement a compatible reader/writer. Reference implementations live under `reference/`:
|
|
271
|
+
|
|
272
|
+
- **C** — `reference/c/` builds `libgoose.a` / `libgoose.so` (`make`, `make test`).
|
|
273
|
+
- **Rust** — `reference/rust/` is the `goose-encoding` crate (`cargo build`).
|
|
274
|
+
|
|
275
|
+
These are maintained alongside the Python engine as the canonical cross-language contract.
|
|
276
|
+
|
|
277
|
+
## Development
|
|
278
|
+
|
|
279
|
+
```bash
|
|
280
|
+
python -m venv .venv && source .venv/bin/activate
|
|
281
|
+
pip install -e ".[dev,pg]"
|
|
282
|
+
pytest -q # 272 tests
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
The Python reference encoders/decoders are in `weji_goose/`; the C and Rust references are in `reference/`. Benchmarks: `python -m weji_goose.benchmarks.demo`.
|
|
286
|
+
|
|
287
|
+
## License
|
|
288
|
+
|
|
289
|
+
[MIT](LICENSE) © 2026 WEJI Northern Technologies Inc.
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
# Goose
|
|
2
|
+
|
|
3
|
+
> A columnar storage engine for analytical workloads — 10 specialized column encodings, roaring-bitmap predicate pushdown, bloom-filter partition skipping, zone-map pruning, probabilistic sketches, and an SQL query advisor. Ships with a PostgreSQL profiler/exporter CLI.
|
|
4
|
+
|
|
5
|
+
[CI](https://github.com/wejinortherntechnologiesinc/Goose/actions/workflows/ci.yml)
|
|
6
|
+
[License: MIT](LICENSE)
|
|
7
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
8
|
+
|
|
9
|
+
Goose is a **universal booster**: a column-encoding and predicate-pushdown layer you can use three ways —
|
|
10
|
+
|
|
11
|
+
1. **As a Python library** — point it at data, let it pick the best encoding per column, write a compressed columnar store, and query it with predicate pushdown.
|
|
12
|
+
2. **As a PostgreSQL cold tier** — profile a Postgres table, export it to Goose, and query the compressed copy (CDC keeps them in sync).
|
|
13
|
+
3. **As an embedded encoding library** — link the C (`libgoose.a`) or Rust (`goose-encoding`) reference implementation into another database to get encoding intelligence and predicate pushdown without changing your storage engine.
|
|
14
|
+
|
|
15
|
+
The binary format, encoding selection algorithm, and predicate evaluation logic are specified language-agnostically in [`reference/SPEC.md`](reference/SPEC.md), so any database can implement a compatible reader/writer.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Why
|
|
20
|
+
|
|
21
|
+
PostgreSQL is great at OLTP and terrible at analytical scans over years of history. Goose takes the analytical workload off Postgres: a `verb_id` column with 6 distinct values compresses **58×**; a monotonic timestamp column compresses **10.8×** with delta+ZSTD (`delta_zstd`); a sparse boolean flag becomes a roaring bitmap you can filter *without decoding the column*. Predicate pushdown then skips whole partitions using bloom filters and zone maps before touching row data.
|
|
22
|
+
|
|
23
|
+
## Features
|
|
24
|
+
|
|
25
|
+
- **10 column encodings** — `raw`, `bitpacked`, `dictionary`, `offset_blob`, `delta_zstd`, `zstd`, `for` (frame-of-reference), `for_zstd`, `roaring` (roaring-bitmap booleans), `zstd_dict` (shared trained dictionary).
|
|
26
|
+
- **Predicate pushdown** — `eq`, `neq`, `gt`, `gte`, `lt`, `lte`, `in`, compound `and`/`or`. Each predicate produces a roaring bitmap of matching rows; predicates are intersected, never materializing the full column.
|
|
27
|
+
- **Partition skipping** — per-partition and per-block bloom filters + zone maps let the reader rule out entire partitions before decode.
|
|
28
|
+
- **Adaptive selectivity feedback** — observed predicate selectivities reorder filters so the most selective (cheapest) predicates run first.
|
|
29
|
+
- **Cross-column correlation statistics** — propagate partition pruning across correlated columns.
|
|
30
|
+
- **Probabilistic sketches** — approximate answers (count-distinct, etc.) for fast exploration.
|
|
31
|
+
- **Atomic writes + CRC32 checksums** — every file is written to a `.tmp` then renamed; partition manifests carry checksums, verified on read.
|
|
32
|
+
- **SQL query advisor** — `weji_goose.advisor.optimize()` takes a SQL `WHERE` clause, reorders predicates by selectivity, and returns optimized SQL + a structured predicate tree for `GooseTable.query()`.
|
|
33
|
+
- **PostgreSQL integration** — `goose-pg` CLI to profile, export, benchmark, and query.
|
|
34
|
+
|
|
35
|
+
## Install
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install weji-goose # core engine (numpy, zstandard, sqlglot)
|
|
39
|
+
pip install "weji-goose[pg]" # + psycopg2-binary for the goose-pg CLI
|
|
40
|
+
pip install "weji-goose[dev]" # + pytest for running the test suite
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
> **Note:** the PyPI distribution is `weji-goose`; the import name is `import weji_goose`.
|
|
44
|
+
|
|
45
|
+
Requires Python ≥ 3.10.
|
|
46
|
+
|
|
47
|
+
## Quick start — Python library
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
import numpy as np
|
|
51
|
+
from weji_goose.schema import TableSchema
|
|
52
|
+
from weji_goose.column import ColumnSpec, ColumnType, Encoding
|
|
53
|
+
from weji_goose.table import GooseTable
|
|
54
|
+
from weji_goose.query import Predicate, CompoundPredicate
|
|
55
|
+
|
|
56
|
+
schema = TableSchema("demo", columns=[
|
|
57
|
+
ColumnSpec("id", ColumnType.INT64, Encoding.DELTA_ZSTD), # monotonic → delta
|
|
58
|
+
ColumnSpec("flag", ColumnType.BOOLEAN, Encoding.ROARING), # sparse bool → bitmap
|
|
59
|
+
ColumnSpec("city", ColumnType.TEXT, Encoding.DICTIONARY), # low-card text → dict
|
|
60
|
+
])
|
|
61
|
+
|
|
62
|
+
# Create and write
|
|
63
|
+
table = GooseTable.create(schema, "./demo_table")
|
|
64
|
+
table.insert({
|
|
65
|
+
"id": np.arange(1_000_000, dtype=np.int64),
|
|
66
|
+
"flag": np.array([i % 5 == 0 for i in range(1_000_000)], dtype=bool),
|
|
67
|
+
"city": np.array(["stjohns", "corner", "bay"] * 333_333 + ["stjohns"], dtype=object),
|
|
68
|
+
})
|
|
69
|
+
|
|
70
|
+
# Predicate pushdown: filter without decoding the columns
|
|
71
|
+
table = GooseTable.open("./demo_table")
|
|
72
|
+
result = table.query(
|
|
73
|
+
["city", "id"],
|
|
74
|
+
where=CompoundPredicate("and", [
|
|
75
|
+
Predicate("flag", "eq", True),
|
|
76
|
+
Predicate("id", "gte", 999_000),
|
|
77
|
+
]),
|
|
78
|
+
)
|
|
79
|
+
print(result["city"][:5], result["id"][:5])
|
|
80
|
+
print("on-disk size:", table.total_size_bytes(), "bytes")
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Supported types
|
|
84
|
+
|
|
85
|
+
`int64`, `int32`, `int16`, `float64`, `float32`, `boolean`, `uuid`, `text`, `interval`.
|
|
86
|
+
|
|
87
|
+
## Quick start — PostgreSQL cold tier (`goose-pg`)
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
# 1. Profile a table → schema + per-column compression estimates
|
|
91
|
+
goose-pg profile \
|
|
92
|
+
--db-url "postgresql://user:pass@host/db" \
|
|
93
|
+
--table sensor_readings \
|
|
94
|
+
--output ./sensor_readings_schema.json -v
|
|
95
|
+
|
|
96
|
+
# 2. Export the table to Goose format (auto-partitioned, resumable)
|
|
97
|
+
goose-pg export \
|
|
98
|
+
--db-url "postgresql://user:pass@host/db" \
|
|
99
|
+
--table sensor_readings \
|
|
100
|
+
--output-dir ./goose_data \
|
|
101
|
+
--partition-column recorded_at --partition-interval month --resume -v
|
|
102
|
+
|
|
103
|
+
# 3. Benchmark: profile + export + measured compression ratio
|
|
104
|
+
goose-pg benchmark \
|
|
105
|
+
--db-url "postgresql://user:pass@host/db" \
|
|
106
|
+
--table sensor_readings \
|
|
107
|
+
--output-dir ./bench
|
|
108
|
+
|
|
109
|
+
# 4. Query the compressed Goose data with predicate pushdown
|
|
110
|
+
goose-pg query \
|
|
111
|
+
--goose-dir ./goose_data/sensor_readings \
|
|
112
|
+
--columns recorded_at,pm25 \
|
|
113
|
+
--where "pm25 >= 35 AND region = 'bay_st_george'" \
|
|
114
|
+
--format csv
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
`goose-pg query --where` parses the SQL `WHERE` clause through the Goose advisor (using the opened table's schema) and pushes it down. Output formats: `table` (default), `csv`, `json`.
|
|
118
|
+
|
|
119
|
+
A `docker-compose.yml` is included for integration testing against a throwaway Postgres. The `goose` service is profile-gated under `cli`, so pass `--profile cli` to invoke it:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
docker compose up -d postgres
|
|
123
|
+
docker compose --profile cli run --rm goose --help
|
|
124
|
+
docker compose --profile cli run --rm goose query --goose-dir ./goose_data/mytable --columns id,name
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## The query advisor
|
|
128
|
+
|
|
129
|
+
`weji_goose.advisor.optimize(sql, schema)` reorders `WHERE` predicates by selectivity — roaring-bitmap lookups first, then bloom-filtered ID lookups, then range predicates — and returns optimized SQL plus a predicate tree:
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
from weji_goose.advisor import optimize, SCHEMAS
|
|
133
|
+
|
|
134
|
+
oq = optimize(
|
|
135
|
+
"SELECT id, verb_id FROM xapi_events "
|
|
136
|
+
"WHERE verb_id IN ('completed', 'passed') AND actor_id = 42",
|
|
137
|
+
schema=SCHEMAS["xapi_events"],
|
|
138
|
+
)
|
|
139
|
+
print(oq.optimized_sql)
|
|
140
|
+
# SELECT id, verb_id
|
|
141
|
+
# FROM xapi_events
|
|
142
|
+
# WHERE actor_id = 42 AND verb_id IN ('completed', 'passed')
|
|
143
|
+
|
|
144
|
+
print(oq.predicate_json)
|
|
145
|
+
# {'op': 'and', 'predicates': [
|
|
146
|
+
# {'column': 'actor_id', 'op': 'eq', 'value': 42},
|
|
147
|
+
# {'column': 'verb_id', 'op': 'in', 'value': ['completed', 'passed']}]}
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
`OptimizedQuery` exposes: `original_sql`, `optimized_sql`, `predicate_json`, `table_name`, `selected_columns`, `warnings`.
|
|
151
|
+
|
|
152
|
+
## Encodings
|
|
153
|
+
|
|
154
|
+
| Encoding | Best for |
|
|
155
|
+
|--------------|------------------------------------------------------|
|
|
156
|
+
| `raw` | High-cardinality, incompressible numeric data |
|
|
157
|
+
| `bitpacked` | Dense booleans (8 values/byte) |
|
|
158
|
+
| `dictionary` | Low-cardinality text/categorical (≈ < 50k distinct) |
|
|
159
|
+
| `offset_blob`| Variable-length blobs with a dictionary offset index |
|
|
160
|
+
| `delta_zstd` | Monotonic / near-sequential int64 (timestamps, IDs) |
|
|
161
|
+
| `zstd` | Generic high-entropy compressible data |
|
|
162
|
+
| `for` | Frame-of-reference: clustered int ranges |
|
|
163
|
+
| `for_zstd` | FOR + ZSTD for clustered ints that still compress |
|
|
164
|
+
| `roaring` | Sparse booleans — queryable without column decode |
|
|
165
|
+
| `zstd_dict` | Repeated text patterns via a shared trained dictionary |
|
|
166
|
+
|
|
167
|
+
## Benchmarks
|
|
168
|
+
|
|
169
|
+
On a 10,000-row synthetic sample (`weji_goose/benchmarks/benchmark_results.json`, reproducible via `python -m weji_goose.benchmarks.demo`):
|
|
170
|
+
|
|
171
|
+
| Metric | Value |
|
|
172
|
+
|------------------------------|-----------------|
|
|
173
|
+
| Overall compression vs PG | **3.83×** |
|
|
174
|
+
| Write throughput | ~10,000 rows/s |
|
|
175
|
+
| Full-scan throughput | ~3.96M rows/s |
|
|
176
|
+
| Best per-column compression | `verb_id` → **58.5×** (dictionary, 6 distinct) |
|
|
177
|
+
|
|
178
|
+
| Column | Encoding | Compression |
|
|
179
|
+
|----------------------|-------------|-------------|
|
|
180
|
+
| `verb_id` | dictionary | 58.5× |
|
|
181
|
+
| `context_org` | dictionary | 14.8× |
|
|
182
|
+
| `stored` | delta_zstd | 10.8× |
|
|
183
|
+
| `actor_id` | for | 8.0× |
|
|
184
|
+
| `object_id` | dictionary | 7.4× |
|
|
185
|
+
|
|
186
|
+
## Project layout
|
|
187
|
+
|
|
188
|
+
```
|
|
189
|
+
weji_goose/ core engine (import as `import weji_goose`)
|
|
190
|
+
schema.py TableSchema, ColumnType, Encoding enums
|
|
191
|
+
table.py GooseTable: create / open / insert / query / scan
|
|
192
|
+
query.py predicate pushdown engine, Predicate / CompoundPredicate
|
|
193
|
+
advisor.py SQL WHERE → optimized predicates
|
|
194
|
+
reader.py writer.py checksummed, atomic, mmap-backed I/O
|
|
195
|
+
bloom.py roaring.py partition skipping + row bitmaps
|
|
196
|
+
sketch.py probabilistic sketches
|
|
197
|
+
correlation.py cross-column pruning stats
|
|
198
|
+
goose_pg/ PostgreSQL integration
|
|
199
|
+
cli.py goose-pg CLI: profile / export / benchmark / query
|
|
200
|
+
profiler.py schema introspection + data profiling
|
|
201
|
+
exporter.py bulk export to Goose partitions
|
|
202
|
+
type_map.py PostgreSQL → Goose type/encoding mapping
|
|
203
|
+
reference/
|
|
204
|
+
SPEC.md language-agnostic binary format + algorithm spec
|
|
205
|
+
c/ C reference (libgoose.a / libgoose.so + tests)
|
|
206
|
+
rust/ Rust reference crate (goose-encoding)
|
|
207
|
+
tests/ 272 tests
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
## Reference implementations & spec
|
|
211
|
+
|
|
212
|
+
Goose's on-disk format and algorithms are defined in [`reference/SPEC.md`](reference/SPEC.md) so any database can implement a compatible reader/writer. Reference implementations live under `reference/`:
|
|
213
|
+
|
|
214
|
+
- **C** — `reference/c/` builds `libgoose.a` / `libgoose.so` (`make`, `make test`).
|
|
215
|
+
- **Rust** — `reference/rust/` is the `goose-encoding` crate (`cargo build`).
|
|
216
|
+
|
|
217
|
+
These are maintained alongside the Python engine as the canonical cross-language contract.
|
|
218
|
+
|
|
219
|
+
## Development
|
|
220
|
+
|
|
221
|
+
```bash
|
|
222
|
+
python -m venv .venv && source .venv/bin/activate
|
|
223
|
+
pip install -e ".[dev,pg]"
|
|
224
|
+
pytest -q # 272 tests
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
The Python reference encoders/decoders are in `weji_goose/`; the C and Rust references are in `reference/`. Benchmarks: `python -m weji_goose.benchmarks.demo`.
|
|
228
|
+
|
|
229
|
+
## License
|
|
230
|
+
|
|
231
|
+
[MIT](LICENSE) © 2026 WEJI Northern Technologies Inc.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Goose PostgreSQL integration: profile, export, and query compressed data."""
|
|
2
|
+
|
|
3
|
+
from importlib.metadata import version as _version, PackageNotFoundError as _PackageNotFoundError
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
__version__ = _version("weji-goose")
|
|
7
|
+
except _PackageNotFoundError: # not installed (e.g. running from a source checkout)
|
|
8
|
+
__version__ = "0.3.0"
|