weji-goose 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. weji_goose-0.3.0/LICENSE +21 -0
  2. weji_goose-0.3.0/PKG-INFO +289 -0
  3. weji_goose-0.3.0/README.md +231 -0
  4. weji_goose-0.3.0/goose_pg/__init__.py +8 -0
  5. weji_goose-0.3.0/goose_pg/cli.py +341 -0
  6. weji_goose-0.3.0/goose_pg/exporter.py +365 -0
  7. weji_goose-0.3.0/goose_pg/profiler.py +668 -0
  8. weji_goose-0.3.0/goose_pg/type_map.py +247 -0
  9. weji_goose-0.3.0/pyproject.toml +57 -0
  10. weji_goose-0.3.0/setup.cfg +4 -0
  11. weji_goose-0.3.0/tests/test_advisor.py +340 -0
  12. weji_goose-0.3.0/tests/test_bloom.py +181 -0
  13. weji_goose-0.3.0/tests/test_cli.py +127 -0
  14. weji_goose-0.3.0/tests/test_column.py +197 -0
  15. weji_goose-0.3.0/tests/test_mmap.py +208 -0
  16. weji_goose-0.3.0/tests/test_optimizers.py +690 -0
  17. weji_goose-0.3.0/tests/test_query.py +865 -0
  18. weji_goose-0.3.0/tests/test_roaring.py +527 -0
  19. weji_goose-0.3.0/tests/test_table.py +595 -0
  20. weji_goose-0.3.0/tests/test_zstd_dict.py +187 -0
  21. weji_goose-0.3.0/weji_goose/__init__.py +41 -0
  22. weji_goose-0.3.0/weji_goose/advisor.py +951 -0
  23. weji_goose-0.3.0/weji_goose/block_bloom.py +163 -0
  24. weji_goose-0.3.0/weji_goose/block_index.py +214 -0
  25. weji_goose-0.3.0/weji_goose/bloom.py +191 -0
  26. weji_goose-0.3.0/weji_goose/column.py +905 -0
  27. weji_goose-0.3.0/weji_goose/correlation.py +210 -0
  28. weji_goose-0.3.0/weji_goose/exceptions.py +76 -0
  29. weji_goose-0.3.0/weji_goose/feedback.py +208 -0
  30. weji_goose-0.3.0/weji_goose/query.py +1226 -0
  31. weji_goose-0.3.0/weji_goose/reader.py +763 -0
  32. weji_goose-0.3.0/weji_goose/roaring.py +672 -0
  33. weji_goose-0.3.0/weji_goose/schema.py +219 -0
  34. weji_goose-0.3.0/weji_goose/sketch.py +225 -0
  35. weji_goose-0.3.0/weji_goose/table.py +546 -0
  36. weji_goose-0.3.0/weji_goose/writer.py +688 -0
  37. weji_goose-0.3.0/weji_goose.egg-info/PKG-INFO +289 -0
  38. weji_goose-0.3.0/weji_goose.egg-info/SOURCES.txt +40 -0
  39. weji_goose-0.3.0/weji_goose.egg-info/dependency_links.txt +1 -0
  40. weji_goose-0.3.0/weji_goose.egg-info/entry_points.txt +2 -0
  41. weji_goose-0.3.0/weji_goose.egg-info/requires.txt +12 -0
  42. weji_goose-0.3.0/weji_goose.egg-info/top_level.txt +2 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 WEJI Northern Technologies Inc.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,289 @@
1
+ Metadata-Version: 2.4
2
+ Name: weji-goose
3
+ Version: 0.3.0
4
+ Summary: Goose — a columnar storage engine with 10 specialized encodings, roaring-bitmap predicate pushdown, bloom-filter partition skipping, zone-map pruning, probabilistic sketches, and an SQL query advisor. Ships with a PostgreSQL profiler/exporter CLI.
5
+ Author-email: Gary Lucas <garylucas@bluedrop.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 WEJI Northern Technologies Inc.
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+ Project-URL: Homepage, https://github.com/wejinortherntechnologiesinc/Goose
28
+ Project-URL: Repository, https://github.com/wejinortherntechnologiesinc/Goose
29
+ Project-URL: Issues, https://github.com/wejinortherntechnologiesinc/Goose/issues
30
+ Keywords: columnar,compression,database,analytics,predicate-pushdown,postgres,roaring-bitmap,bloom-filter
31
+ Classifier: Development Status :: 4 - Beta
32
+ Classifier: Intended Audience :: Developers
33
+ Classifier: Intended Audience :: Science/Research
34
+ Classifier: License :: OSI Approved :: MIT License
35
+ Classifier: Operating System :: POSIX :: Linux
36
+ Classifier: Operating System :: MacOS
37
+ Classifier: Programming Language :: Python :: 3
38
+ Classifier: Programming Language :: Python :: 3.10
39
+ Classifier: Programming Language :: Python :: 3.11
40
+ Classifier: Programming Language :: Python :: 3.12
41
+ Classifier: Programming Language :: Python :: 3.13
42
+ Classifier: Topic :: Database
43
+ Classifier: Topic :: Database :: Front-Ends
44
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
45
+ Requires-Python: >=3.10
46
+ Description-Content-Type: text/markdown
47
+ License-File: LICENSE
48
+ Requires-Dist: numpy>=1.24
49
+ Requires-Dist: zstandard>=0.20
50
+ Requires-Dist: sqlglot>=25.0
51
+ Provides-Extra: dev
52
+ Requires-Dist: pytest>=7.0; extra == "dev"
53
+ Provides-Extra: sql
54
+ Requires-Dist: sqlglot>=25.0; extra == "sql"
55
+ Provides-Extra: pg
56
+ Requires-Dist: psycopg2-binary>=2.9; extra == "pg"
57
+ Dynamic: license-file
58
+
59
+ # Goose
60
+
61
+ > A columnar storage engine for analytical workloads — 10 specialized column encodings, roaring-bitmap predicate pushdown, bloom-filter partition skipping, zone-map pruning, probabilistic sketches, and an SQL query advisor. Ships with a PostgreSQL profiler/exporter CLI.
62
+
63
+ [![CI](https://github.com/wejinortherntechnologiesinc/Goose/actions/workflows/ci.yml/badge.svg)](https://github.com/wejinortherntechnologiesinc/Goose/actions/workflows/ci.yml)
64
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
65
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
66
+
67
+ Goose is a **universal booster**: a column-encoding and predicate-pushdown layer you can use three ways —
68
+
69
+ 1. **As a Python library** — point it at data, let it pick the best encoding per column, write a compressed columnar store, and query it with predicate pushdown.
70
+ 2. **As a PostgreSQL cold tier** — profile a Postgres table, export it to Goose, and query the compressed copy (CDC keeps them in sync).
71
+ 3. **As an embedded encoding library** — link the C (`libgoose.a`) or Rust (`goose-encoding`) reference implementation into another database to get encoding intelligence and predicate pushdown without changing your storage engine.
72
+
73
+ The binary format, encoding selection algorithm, and predicate evaluation logic are specified language-agnostically in [`reference/SPEC.md`](reference/SPEC.md), so any database can implement a compatible reader/writer.
74
+
75
+ ---
76
+
77
+ ## Why
78
+
79
+ PostgreSQL is great at OLTP and terrible at analytical scans over years of history. Goose takes the analytical workload off Postgres: a `verb_id` column with 6 distinct values compresses **58×**; a monotonic timestamp column compresses **10.8×** with `delta_zstd`; a sparse boolean flag becomes a roaring bitmap you can filter *without decoding the column*. Predicate pushdown then skips whole partitions using bloom filters and zone maps before touching row data.
80
+
81
+ ## Features
82
+
83
+ - **10 column encodings** — `raw`, `bitpacked`, `dictionary`, `offset_blob`, `delta_zstd`, `zstd`, `for` (frame-of-reference), `for_zstd`, `roaring` (roaring-bitmap booleans), `zstd_dict` (shared trained dictionary).
84
+ - **Predicate pushdown** — `eq`, `neq`, `gt`, `gte`, `lt`, `lte`, `in`, compound `and`/`or`. Each predicate produces a roaring bitmap of matching rows; predicates are intersected, never materializing the full column.
85
+ - **Partition skipping** — per-partition and per-block bloom filters + zone maps let the reader rule out entire partitions before decode.
86
+ - **Adaptive selectivity feedback** — observed predicate selectivities reorder filters so the most selective (cheapest) predicates run first.
87
+ - **Cross-column correlation statistics** — propagate partition pruning across correlated columns.
88
+ - **Probabilistic sketches** — approximate answers (count-distinct, etc.) for fast exploration.
89
+ - **Atomic writes + CRC32 checksums** — every file is written to a `.tmp` then renamed; partition manifests carry checksums, verified on read.
90
+ - **SQL query advisor** — `weji_goose.advisor.optimize()` takes a SQL `WHERE` clause, reorders predicates by selectivity, and returns optimized SQL + a structured predicate tree for `GooseTable.query()`.
91
+ - **PostgreSQL integration** — `goose-pg` CLI to profile, export, benchmark, and query.
92
+
93
+ ## Install
94
+
95
+ ```bash
96
+ pip install weji-goose # core engine (numpy, zstandard, sqlglot)
97
+ pip install "weji-goose[pg]" # + psycopg2-binary for the goose-pg CLI
98
+ pip install "weji-goose[dev]" # + pytest for running the test suite
99
+ ```
100
+
101
+ > **Note:** the PyPI distribution is `weji-goose`; the import name is `import weji_goose`.
102
+
103
+ Requires Python ≥ 3.10.
104
+
105
+ ## Quick start — Python library
106
+
107
+ ```python
108
+ import numpy as np
109
+ from weji_goose.schema import TableSchema
110
+ from weji_goose.column import ColumnSpec, ColumnType, Encoding
111
+ from weji_goose.table import GooseTable
112
+ from weji_goose.query import Predicate, CompoundPredicate
113
+
114
+ schema = TableSchema("demo", columns=[
115
+ ColumnSpec("id", ColumnType.INT64, Encoding.DELTA_ZSTD), # monotonic → delta
116
+ ColumnSpec("flag", ColumnType.BOOLEAN, Encoding.ROARING), # sparse bool → bitmap
117
+ ColumnSpec("city", ColumnType.TEXT, Encoding.DICTIONARY), # low-card text → dict
118
+ ])
119
+
120
+ # Create and write
121
+ table = GooseTable.create(schema, "./demo_table")
122
+ table.insert({
123
+ "id": np.arange(1_000_000, dtype=np.int64),
124
+ "flag": np.array([i % 5 == 0 for i in range(1_000_000)], dtype=bool),
125
+ "city": np.array(["stjohns", "corner", "bay"] * 333_333 + ["stjohns"], dtype=object),
126
+ })
127
+
128
+ # Predicate pushdown: filter without decoding the columns
129
+ table = GooseTable.open("./demo_table")
130
+ result = table.query(
131
+ ["city", "id"],
132
+ where=CompoundPredicate("and", [
133
+ Predicate("flag", "eq", True),
134
+ Predicate("id", "gte", 999_000),
135
+ ]),
136
+ )
137
+ print(result["city"][:5], result["id"][:5])
138
+ print("on-disk size:", table.total_size_bytes(), "bytes")
139
+ ```
140
+
141
+ ### Supported types
142
+
143
+ `int64`, `int32`, `int16`, `float64`, `float32`, `boolean`, `uuid`, `text`, `interval`.
144
+
145
+ ## Quick start — PostgreSQL cold tier (`goose-pg`)
146
+
147
+ ```bash
148
+ # 1. Profile a table → schema + per-column compression estimates
149
+ goose-pg profile \
150
+ --db-url "postgresql://user:pass@host/db" \
151
+ --table sensor_readings \
152
+ --output ./sensor_readings_schema.json -v
153
+
154
+ # 2. Export the table to Goose format (auto-partitioned, resumable)
155
+ goose-pg export \
156
+ --db-url "postgresql://user:pass@host/db" \
157
+ --table sensor_readings \
158
+ --output-dir ./goose_data \
159
+ --partition-column recorded_at --partition-interval month --resume -v
160
+
161
+ # 3. Benchmark: profile + export + measured compression ratio
162
+ goose-pg benchmark \
163
+ --db-url "postgresql://user:pass@host/db" \
164
+ --table sensor_readings \
165
+ --output-dir ./bench
166
+
167
+ # 4. Query the compressed Goose data with predicate pushdown
168
+ goose-pg query \
169
+ --goose-dir ./goose_data/sensor_readings \
170
+ --columns recorded_at,pm25 \
171
+ --where "pm25 >= 35 AND region = 'bay_st_george'" \
172
+ --format csv
173
+ ```
174
+
175
+ `goose-pg query --where` parses the SQL `WHERE` clause through the Goose advisor (using the opened table's schema) and pushes it down. Output formats: `table` (default), `csv`, `json`.
176
+
177
+ A `docker-compose.yml` is included for integration testing against a throwaway Postgres. The `goose` service is profile-gated under `cli`, so pass `--profile cli` to invoke it:
178
+
179
+ ```bash
180
+ docker compose up -d postgres
181
+ docker compose --profile cli run --rm goose --help
182
+ docker compose --profile cli run --rm goose query --goose-dir ./goose_data/mytable --columns id,name
183
+ ```
184
+
185
+ ## The query advisor
186
+
187
+ `weji_goose.advisor.optimize(sql, schema)` reorders `WHERE` predicates by selectivity — roaring-bitmap lookups first, then bloom-filtered ID lookups, then range predicates — and returns optimized SQL plus a predicate tree:
188
+
189
+ ```python
190
+ from weji_goose.advisor import optimize, SCHEMAS
191
+
192
+ oq = optimize(
193
+ "SELECT id, verb_id FROM xapi_events "
194
+ "WHERE verb_id IN ('completed', 'passed') AND actor_id = 42",
195
+ schema=SCHEMAS["xapi_events"],
196
+ )
197
+ print(oq.optimized_sql)
198
+ # SELECT id, verb_id
199
+ # FROM xapi_events
200
+ # WHERE actor_id = 42 AND verb_id IN ('completed', 'passed')
201
+
202
+ print(oq.predicate_json)
203
+ # {'op': 'and', 'predicates': [
204
+ # {'column': 'actor_id', 'op': 'eq', 'value': 42},
205
+ # {'column': 'verb_id', 'op': 'in', 'value': ['completed', 'passed']}]}
206
+ ```
207
+
208
+ `OptimizedQuery` exposes: `original_sql`, `optimized_sql`, `predicate_json`, `table_name`, `selected_columns`, `warnings`.
209
+
210
+ ## Encodings
211
+
212
+ | Encoding | Best for |
213
+ |--------------|------------------------------------------------------|
214
+ | `raw` | High-cardinality, incompressible numeric data |
215
+ | `bitpacked` | Dense booleans (8 values/byte) |
216
+ | `dictionary` | Low-cardinality text/categorical (≈ < 50k distinct) |
217
+ | `offset_blob`| Variable-length blobs with a dictionary offset index |
218
+ | `delta_zstd` | Monotonic / near-sequential int64 (timestamps, IDs) |
219
+ | `zstd` | Generic high-entropy compressible data |
220
+ | `for` | Frame-of-reference: clustered int ranges |
221
+ | `for_zstd` | FOR + ZSTD for clustered ints that still compress |
222
+ | `roaring` | Sparse booleans — queryable without column decode |
223
+ | `zstd_dict` | Repeated text patterns via a shared trained dictionary |
224
+
225
+ ## Benchmarks
226
+
227
+ On a 10,000-row synthetic sample (`weji_goose/benchmarks/benchmark_results.json`, reproducible via `python -m weji_goose.benchmarks.demo`):
228
+
229
+ | Metric | Value |
230
+ |------------------------------|-----------------|
231
+ | Overall compression vs PG | **3.83×** |
232
+ | Write throughput | ~10,000 rows/s |
233
+ | Full-scan throughput | ~3.96M rows/s |
234
+ | Best per-column compression | `verb_id` → **58.5×** (dictionary, 6 distinct) |
235
+
236
+ | Column | Encoding | Compression |
237
+ |----------------------|-------------|-------------|
238
+ | `verb_id` | dictionary | 58.5× |
239
+ | `context_org` | dictionary | 14.8× |
240
+ | `stored` | delta_zstd | 10.8× |
241
+ | `actor_id` | for | 8.0× |
242
+ | `object_id` | dictionary | 7.4× |
243
+
244
+ ## Project layout
245
+
246
+ ```
247
+ weji_goose/ core engine (import as `import weji_goose`)
248
+ schema.py TableSchema, ColumnType, Encoding enums
249
+ table.py GooseTable: create / open / insert / query / scan
250
+ query.py predicate pushdown engine, Predicate / CompoundPredicate
251
+ advisor.py SQL WHERE → optimized predicates
252
+ reader.py writer.py checksummed, atomic, mmap-backed I/O
253
+ bloom.py roaring.py partition skipping + row bitmaps
254
+ sketch.py probabilistic sketches
255
+ correlation.py cross-column pruning stats
256
+ goose_pg/ PostgreSQL integration
257
+ cli.py goose-pg CLI: profile / export / benchmark / query
258
+ profiler.py schema introspection + data profiling
259
+ exporter.py bulk export to Goose partitions
260
+ type_map.py PostgreSQL → Goose type/encoding mapping
261
+ reference/
262
+ SPEC.md language-agnostic binary format + algorithm spec
263
+ c/ C reference (libgoose.a / libgoose.so + tests)
264
+ rust/ Rust reference crate (goose-encoding)
265
+ tests/ 272 tests
266
+ ```
267
+
268
+ ## Reference implementations & spec
269
+
270
+ Goose's on-disk format and algorithms are defined in [`reference/SPEC.md`](reference/SPEC.md) so any database can implement a compatible reader/writer. Reference implementations live under `reference/`:
271
+
272
+ - **C** — `reference/c/` builds `libgoose.a` / `libgoose.so` (`make`, `make test`).
273
+ - **Rust** — `reference/rust/` is the `goose-encoding` crate (`cargo build`).
274
+
275
+ These are maintained alongside the Python engine as the canonical cross-language contract.
276
+
277
+ ## Development
278
+
279
+ ```bash
280
+ python -m venv .venv && source .venv/bin/activate
281
+ pip install -e ".[dev,pg]"
282
+ pytest -q # 272 tests
283
+ ```
284
+
285
+ The Python reference encoders/decoders are in `weji_goose/`; the C and Rust references are in `reference/`. Benchmarks: `python -m weji_goose.benchmarks.demo`.
286
+
287
+ ## License
288
+
289
+ [MIT](LICENSE) © 2026 WEJI Northern Technologies Inc.
@@ -0,0 +1,231 @@
1
+ # Goose
2
+
3
+ > A columnar storage engine for analytical workloads — 10 specialized column encodings, roaring-bitmap predicate pushdown, bloom-filter partition skipping, zone-map pruning, probabilistic sketches, and an SQL query advisor. Ships with a PostgreSQL profiler/exporter CLI.
4
+
5
+ [![CI](https://github.com/wejinortherntechnologiesinc/Goose/actions/workflows/ci.yml/badge.svg)](https://github.com/wejinortherntechnologiesinc/Goose/actions/workflows/ci.yml)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
7
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
8
+
9
+ Goose is a **universal booster**: a column-encoding and predicate-pushdown layer you can use three ways —
10
+
11
+ 1. **As a Python library** — point it at data, let it pick the best encoding per column, write a compressed columnar store, and query it with predicate pushdown.
12
+ 2. **As a PostgreSQL cold tier** — profile a Postgres table, export it to Goose, and query the compressed copy (CDC keeps them in sync).
13
+ 3. **As an embedded encoding library** — link the C (`libgoose.a`) or Rust (`goose-encoding`) reference implementation into another database to get encoding intelligence and predicate pushdown without changing your storage engine.
14
+
15
+ The binary format, encoding selection algorithm, and predicate evaluation logic are specified language-agnostically in [`reference/SPEC.md`](reference/SPEC.md), so any database can implement a compatible reader/writer.
16
+
17
+ ---
18
+
19
+ ## Why
20
+
21
+ PostgreSQL is great at OLTP and terrible at analytical scans over years of history. Goose takes the analytical workload off Postgres: a `verb_id` column with 6 distinct values compresses **58×**; a monotonic timestamp column compresses **10.8×** with `delta_zstd`; a sparse boolean flag becomes a roaring bitmap you can filter *without decoding the column*. Predicate pushdown then skips whole partitions using bloom filters and zone maps before touching row data.
22
+
23
+ ## Features
24
+
25
+ - **10 column encodings** — `raw`, `bitpacked`, `dictionary`, `offset_blob`, `delta_zstd`, `zstd`, `for` (frame-of-reference), `for_zstd`, `roaring` (roaring-bitmap booleans), `zstd_dict` (shared trained dictionary).
26
+ - **Predicate pushdown** — `eq`, `neq`, `gt`, `gte`, `lt`, `lte`, `in`, compound `and`/`or`. Each predicate produces a roaring bitmap of matching rows; predicates are intersected, never materializing the full column.
27
+ - **Partition skipping** — per-partition and per-block bloom filters + zone maps let the reader rule out entire partitions before decode.
28
+ - **Adaptive selectivity feedback** — observed predicate selectivities reorder filters so the most selective (cheapest) predicates run first.
29
+ - **Cross-column correlation statistics** — propagate partition pruning across correlated columns.
30
+ - **Probabilistic sketches** — approximate answers (count-distinct, etc.) for fast exploration.
31
+ - **Atomic writes + CRC32 checksums** — every file is written to a `.tmp` then renamed; partition manifests carry checksums, verified on read.
32
+ - **SQL query advisor** — `weji_goose.advisor.optimize()` takes a SQL `WHERE` clause, reorders predicates by selectivity, and returns optimized SQL + a structured predicate tree for `GooseTable.query()`.
33
+ - **PostgreSQL integration** — `goose-pg` CLI to profile, export, benchmark, and query.
34
+
35
+ ## Install
36
+
37
+ ```bash
38
+ pip install weji-goose # core engine (numpy, zstandard, sqlglot)
39
+ pip install "weji-goose[pg]" # + psycopg2-binary for the goose-pg CLI
40
+ pip install "weji-goose[dev]" # + pytest for running the test suite
41
+ ```
42
+
43
+ > **Note:** the PyPI distribution is `weji-goose`; the import name is `import weji_goose`.
44
+
45
+ Requires Python ≥ 3.10.
46
+
47
+ ## Quick start — Python library
48
+
49
+ ```python
50
+ import numpy as np
51
+ from weji_goose.schema import TableSchema
52
+ from weji_goose.column import ColumnSpec, ColumnType, Encoding
53
+ from weji_goose.table import GooseTable
54
+ from weji_goose.query import Predicate, CompoundPredicate
55
+
56
+ schema = TableSchema("demo", columns=[
57
+ ColumnSpec("id", ColumnType.INT64, Encoding.DELTA_ZSTD), # monotonic → delta
58
+ ColumnSpec("flag", ColumnType.BOOLEAN, Encoding.ROARING), # sparse bool → bitmap
59
+ ColumnSpec("city", ColumnType.TEXT, Encoding.DICTIONARY), # low-card text → dict
60
+ ])
61
+
62
+ # Create and write
63
+ table = GooseTable.create(schema, "./demo_table")
64
+ table.insert({
65
+ "id": np.arange(1_000_000, dtype=np.int64),
66
+ "flag": np.array([i % 5 == 0 for i in range(1_000_000)], dtype=bool),
67
+ "city": np.array(["stjohns", "corner", "bay"] * 333_333 + ["stjohns"], dtype=object),
68
+ })
69
+
70
+ # Predicate pushdown: filter without decoding the columns
71
+ table = GooseTable.open("./demo_table")
72
+ result = table.query(
73
+ ["city", "id"],
74
+ where=CompoundPredicate("and", [
75
+ Predicate("flag", "eq", True),
76
+ Predicate("id", "gte", 999_000),
77
+ ]),
78
+ )
79
+ print(result["city"][:5], result["id"][:5])
80
+ print("on-disk size:", table.total_size_bytes(), "bytes")
81
+ ```
82
+
83
+ ### Supported types
84
+
85
+ `int64`, `int32`, `int16`, `float64`, `float32`, `boolean`, `uuid`, `text`, `interval`.
86
+
87
+ ## Quick start — PostgreSQL cold tier (`goose-pg`)
88
+
89
+ ```bash
90
+ # 1. Profile a table → schema + per-column compression estimates
91
+ goose-pg profile \
92
+ --db-url "postgresql://user:pass@host/db" \
93
+ --table sensor_readings \
94
+ --output ./sensor_readings_schema.json -v
95
+
96
+ # 2. Export the table to Goose format (auto-partitioned, resumable)
97
+ goose-pg export \
98
+ --db-url "postgresql://user:pass@host/db" \
99
+ --table sensor_readings \
100
+ --output-dir ./goose_data \
101
+ --partition-column recorded_at --partition-interval month --resume -v
102
+
103
+ # 3. Benchmark: profile + export + measured compression ratio
104
+ goose-pg benchmark \
105
+ --db-url "postgresql://user:pass@host/db" \
106
+ --table sensor_readings \
107
+ --output-dir ./bench
108
+
109
+ # 4. Query the compressed Goose data with predicate pushdown
110
+ goose-pg query \
111
+ --goose-dir ./goose_data/sensor_readings \
112
+ --columns recorded_at,pm25 \
113
+ --where "pm25 >= 35 AND region = 'bay_st_george'" \
114
+ --format csv
115
+ ```
116
+
117
+ `goose-pg query --where` parses the SQL `WHERE` clause through the Goose advisor (using the opened table's schema) and pushes it down. Output formats: `table` (default), `csv`, `json`.
118
+
119
+ A `docker-compose.yml` is included for integration testing against a throwaway Postgres. The `goose` service is profile-gated under `cli`, so pass `--profile cli` to invoke it:
120
+
121
+ ```bash
122
+ docker compose up -d postgres
123
+ docker compose --profile cli run --rm goose --help
124
+ docker compose --profile cli run --rm goose query --goose-dir ./goose_data/mytable --columns id,name
125
+ ```
126
+
127
+ ## The query advisor
128
+
129
+ `weji_goose.advisor.optimize(sql, schema)` reorders `WHERE` predicates by selectivity — roaring-bitmap lookups first, then bloom-filtered ID lookups, then range predicates — and returns optimized SQL plus a predicate tree:
130
+
131
+ ```python
132
+ from weji_goose.advisor import optimize, SCHEMAS
133
+
134
+ oq = optimize(
135
+ "SELECT id, verb_id FROM xapi_events "
136
+ "WHERE verb_id IN ('completed', 'passed') AND actor_id = 42",
137
+ schema=SCHEMAS["xapi_events"],
138
+ )
139
+ print(oq.optimized_sql)
140
+ # SELECT id, verb_id
141
+ # FROM xapi_events
142
+ # WHERE actor_id = 42 AND verb_id IN ('completed', 'passed')
143
+
144
+ print(oq.predicate_json)
145
+ # {'op': 'and', 'predicates': [
146
+ # {'column': 'actor_id', 'op': 'eq', 'value': 42},
147
+ # {'column': 'verb_id', 'op': 'in', 'value': ['completed', 'passed']}]}
148
+ ```
149
+
150
+ `OptimizedQuery` exposes: `original_sql`, `optimized_sql`, `predicate_json`, `table_name`, `selected_columns`, `warnings`.
151
+
152
+ ## Encodings
153
+
154
+ | Encoding | Best for |
155
+ |--------------|------------------------------------------------------|
156
+ | `raw` | High-cardinality, incompressible numeric data |
157
+ | `bitpacked` | Dense booleans (8 values/byte) |
158
+ | `dictionary` | Low-cardinality text/categorical (≈ < 50k distinct) |
159
+ | `offset_blob`| Variable-length blobs with a dictionary offset index |
160
+ | `delta_zstd` | Monotonic / near-sequential int64 (timestamps, IDs) |
161
+ | `zstd` | Generic high-entropy compressible data |
162
+ | `for` | Frame-of-reference: clustered int ranges |
163
+ | `for_zstd` | FOR + ZSTD for clustered ints that still compress |
164
+ | `roaring` | Sparse booleans — queryable without column decode |
165
+ | `zstd_dict` | Repeated text patterns via a shared trained dictionary |
166
+
167
+ ## Benchmarks
168
+
169
+ On a 10,000-row synthetic sample (`weji_goose/benchmarks/benchmark_results.json`, reproducible via `python -m weji_goose.benchmarks.demo`):
170
+
171
+ | Metric | Value |
172
+ |------------------------------|-----------------|
173
+ | Overall compression vs PG | **3.83×** |
174
+ | Write throughput | ~10,000 rows/s |
175
+ | Full-scan throughput | ~3.96M rows/s |
176
+ | Best per-column compression | `verb_id` → **58.5×** (dictionary, 6 distinct) |
177
+
178
+ | Column | Encoding | Compression |
179
+ |----------------------|-------------|-------------|
180
+ | `verb_id` | dictionary | 58.5× |
181
+ | `context_org` | dictionary | 14.8× |
182
+ | `stored` | delta_zstd | 10.8× |
183
+ | `actor_id` | for | 8.0× |
184
+ | `object_id` | dictionary | 7.4× |
185
+
186
+ ## Project layout
187
+
188
+ ```
189
+ weji_goose/ core engine (import as `import weji_goose`)
190
+ schema.py TableSchema, ColumnType, Encoding enums
191
+ table.py GooseTable: create / open / insert / query / scan
192
+ query.py predicate pushdown engine, Predicate / CompoundPredicate
193
+ advisor.py SQL WHERE → optimized predicates
194
+ reader.py writer.py checksummed, atomic, mmap-backed I/O
195
+ bloom.py roaring.py partition skipping + row bitmaps
196
+ sketch.py probabilistic sketches
197
+ correlation.py cross-column pruning stats
198
+ goose_pg/ PostgreSQL integration
199
+ cli.py goose-pg CLI: profile / export / benchmark / query
200
+ profiler.py schema introspection + data profiling
201
+ exporter.py bulk export to Goose partitions
202
+ type_map.py PostgreSQL → Goose type/encoding mapping
203
+ reference/
204
+ SPEC.md language-agnostic binary format + algorithm spec
205
+ c/ C reference (libgoose.a / libgoose.so + tests)
206
+ rust/ Rust reference crate (goose-encoding)
207
+ tests/ 272 tests
208
+ ```
209
+
210
+ ## Reference implementations & spec
211
+
212
+ Goose's on-disk format and algorithms are defined in [`reference/SPEC.md`](reference/SPEC.md) so any database can implement a compatible reader/writer. Reference implementations live under `reference/`:
213
+
214
+ - **C** — `reference/c/` builds `libgoose.a` / `libgoose.so` (`make`, `make test`).
215
+ - **Rust** — `reference/rust/` is the `goose-encoding` crate (`cargo build`).
216
+
217
+ These are maintained alongside the Python engine as the canonical cross-language contract.
218
+
219
+ ## Development
220
+
221
+ ```bash
222
+ python -m venv .venv && source .venv/bin/activate
223
+ pip install -e ".[dev,pg]"
224
+ pytest -q # 272 tests
225
+ ```
226
+
227
+ The Python reference encoders/decoders are in `weji_goose/`; the C and Rust references are in `reference/`. Benchmarks: `python -m weji_goose.benchmarks.demo`.
228
+
229
+ ## License
230
+
231
+ [MIT](LICENSE) © 2026 WEJI Northern Technologies Inc.
@@ -0,0 +1,8 @@
1
+ """Goose PostgreSQL integration: profile, export, and query compressed data."""
2
+
3
+ from importlib.metadata import version as _version, PackageNotFoundError as _PackageNotFoundError
4
+
5
+ try:
6
+ __version__ = _version("weji-goose")
7
+ except _PackageNotFoundError: # not installed (e.g. running from a source checkout)
8
+ __version__ = "0.3.0"