kgzip 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kgzip-0.1.0/LICENSE +21 -0
- kgzip-0.1.0/PKG-INFO +481 -0
- kgzip-0.1.0/README.md +447 -0
- kgzip-0.1.0/kgzip/__init__.py +7 -0
- kgzip-0.1.0/kgzip/_version.py +3 -0
- kgzip-0.1.0/kgzip/benchmark/__init__.py +5 -0
- kgzip-0.1.0/kgzip/benchmark/comparison.py +155 -0
- kgzip-0.1.0/kgzip/benchmark/harness.py +22 -0
- kgzip-0.1.0/kgzip/benchmark/medical_kg.py +71 -0
- kgzip-0.1.0/kgzip/benchmark/quality.py +226 -0
- kgzip-0.1.0/kgzip/compat.py +77 -0
- kgzip-0.1.0/kgzip/decision/__init__.py +101 -0
- kgzip-0.1.0/kgzip/decision/community.py +186 -0
- kgzip-0.1.0/kgzip/decision/gcs_scorer.py +84 -0
- kgzip-0.1.0/kgzip/decision/mode_selector.py +19 -0
- kgzip-0.1.0/kgzip/decision/profiler.py +71 -0
- kgzip-0.1.0/kgzip/decision/spectral.py +52 -0
- kgzip-0.1.0/kgzip/encoder/__init__.py +3 -0
- kgzip-0.1.0/kgzip/encoder/capsule_encoder.py +148 -0
- kgzip-0.1.0/kgzip/encoder/file_format.py +6 -0
- kgzip-0.1.0/kgzip/encoder/manifest.py +61 -0
- kgzip-0.1.0/kgzip/encoder/store_writer.py +263 -0
- kgzip-0.1.0/kgzip/exceptions.py +68 -0
- kgzip-0.1.0/kgzip/ingestion/__init__.py +3 -0
- kgzip-0.1.0/kgzip/ingestion/adapters/__init__.py +0 -0
- kgzip-0.1.0/kgzip/ingestion/adapters/csv.py +106 -0
- kgzip-0.1.0/kgzip/ingestion/adapters/jsonld.py +10 -0
- kgzip-0.1.0/kgzip/ingestion/adapters/neo4j.py +157 -0
- kgzip-0.1.0/kgzip/ingestion/adapters/networkx.py +92 -0
- kgzip-0.1.0/kgzip/ingestion/adapters/rdf.py +125 -0
- kgzip-0.1.0/kgzip/ingestion/normalizer.py +51 -0
- kgzip-0.1.0/kgzip/models.py +206 -0
- kgzip-0.1.0/kgzip/query/__init__.py +12 -0
- kgzip-0.1.0/kgzip/query/decoder.py +216 -0
- kgzip-0.1.0/kgzip/query/interface.py +180 -0
- kgzip-0.1.0/kgzip/query/merger.py +84 -0
- kgzip-0.1.0/kgzip/query/resolver.py +88 -0
- kgzip-0.1.0/kgzip/serialize.py +74 -0
- kgzip-0.1.0/kgzip/store.py +251 -0
- kgzip-0.1.0/kgzip.egg-info/PKG-INFO +481 -0
- kgzip-0.1.0/kgzip.egg-info/SOURCES.txt +54 -0
- kgzip-0.1.0/kgzip.egg-info/dependency_links.txt +1 -0
- kgzip-0.1.0/kgzip.egg-info/requires.txt +16 -0
- kgzip-0.1.0/kgzip.egg-info/top_level.txt +1 -0
- kgzip-0.1.0/pyproject.toml +48 -0
- kgzip-0.1.0/setup.cfg +4 -0
- kgzip-0.1.0/tests/test_benchmark.py +138 -0
- kgzip-0.1.0/tests/test_cross_layer_invariants.py +800 -0
- kgzip-0.1.0/tests/test_decision.py +418 -0
- kgzip-0.1.0/tests/test_encoder.py +484 -0
- kgzip-0.1.0/tests/test_ingestion.py +184 -0
- kgzip-0.1.0/tests/test_models.py +366 -0
- kgzip-0.1.0/tests/test_neo4j_adapter.py +226 -0
- kgzip-0.1.0/tests/test_query.py +604 -0
- kgzip-0.1.0/tests/test_query_trim.py +203 -0
- kgzip-0.1.0/tests/test_store.py +396 -0
kgzip-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ayush Mukherjee
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
kgzip-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,481 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: kgzip
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Knowledge graph compression engine: parallel-decodable subgraph capsules for fast, storage-efficient KG queries.
|
|
5
|
+
Author: Ayush Mukherjee
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: knowledge-graph,graph,compression,rdf,networkx,neo4j
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering
|
|
17
|
+
Requires-Python: >=3.8
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Requires-Dist: rdflib>=6.0
|
|
21
|
+
Requires-Dist: networkx>=2.6
|
|
22
|
+
Requires-Dist: pandas>=1.3
|
|
23
|
+
Requires-Dist: python-louvain>=0.16
|
|
24
|
+
Requires-Dist: numpy>=1.21
|
|
25
|
+
Requires-Dist: scipy>=1.7
|
|
26
|
+
Requires-Dist: msgpack>=1.0
|
|
27
|
+
Requires-Dist: zstandard>=0.18
|
|
28
|
+
Requires-Dist: nest_asyncio>=1.5
|
|
29
|
+
Requires-Dist: filelock>=3.4
|
|
30
|
+
Provides-Extra: neo4j
|
|
31
|
+
Requires-Dist: neo4j>=5.0; extra == "neo4j"
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
34
|
+
|
|
35
|
+
# KGZip
|
|
36
|
+
|
|
37
|
+
**A compression engine for knowledge graphs.** KGZip takes a knowledge graph and
|
|
38
|
+
splits it into small, independently-loadable pieces called *capsules*, so that when
|
|
39
|
+
you ask a question about one part of the graph, you only read that part — not the
|
|
40
|
+
whole thing. The result is a store that is **smaller on disk** and lets you **query
|
|
41
|
+
large graphs without loading them entirely into memory**.
|
|
42
|
+
|
|
43
|
+
This README assumes **no prior knowledge of knowledge graphs**. If you already know
|
|
44
|
+
the basics, jump to [Quickstart](#quickstart) or the [API reference](#api-reference).
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## New here? Start with the concepts
|
|
49
|
+
|
|
50
|
+
### What is a knowledge graph?
|
|
51
|
+
|
|
52
|
+
A **knowledge graph (KG)** is just data stored as **things** and the **relationships
|
|
53
|
+
between them**.
|
|
54
|
+
|
|
55
|
+
- A **node** is a thing: a drug, a disease, a person, a movie.
|
|
56
|
+
- An **edge** is a relationship connecting two nodes: *Aspirin* **treats** *Headache*.
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
(Aspirin) --treats--> (Headache) --associated_with--> (GeneX)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Here `Aspirin`, `Headache`, and `GeneX` are nodes; `treats` and `associated_with`
|
|
63
|
+
are edges (also called *relations*). Each node can also carry **attributes**
|
|
64
|
+
(properties), e.g. Aspirin's `{ "formula": "C9H8O4" }`.
|
|
65
|
+
|
|
66
|
+
That's the whole idea. Real KGs just have many more nodes and edges (thousands to
|
|
67
|
+
billions), often describing a domain like medicine, finance, or social networks.
|
|
68
|
+
|
|
69
|
+
### What problem does KGZip solve?
|
|
70
|
+
|
|
71
|
+
When a graph is large, two things get painful:
|
|
72
|
+
|
|
73
|
+
1. **Storage** — keeping the whole graph around costs space.
|
|
74
|
+
2. **Querying** — to answer "what is near node X?", naive tools load or scan the
|
|
75
|
+
entire graph, even though you only care about a tiny neighbourhood.
|
|
76
|
+
|
|
77
|
+
KGZip pre-organises the graph into **capsules** (clusters of closely-related nodes)
|
|
78
|
+
and writes a small **manifest** (an index). A query then loads only the capsules it
|
|
79
|
+
needs. Think of it like a book with chapters and a table of contents: to read about
|
|
80
|
+
one topic you open one chapter, not the entire book.
|
|
81
|
+
|
|
82
|
+
### The golden rule: KGZip is a *read replica*
|
|
83
|
+
|
|
84
|
+
Your original graph (in a file, or in a database like Neo4j) is always the **source
|
|
85
|
+
of truth** — the "master". KGZip builds a **compressed copy** from it for fast
|
|
86
|
+
reads. **KGZip never modifies your original data.** If the KGZip store is ever lost
|
|
87
|
+
or corrupted, you can always rebuild it from the master.
|
|
88
|
+
|
|
89
|
+
### Is it lossless?
|
|
90
|
+
|
|
91
|
+
Yes. KGZip v1 is **lossless**: if you compress a graph and then ask for all of its
|
|
92
|
+
nodes back, you get *every node and every edge* exactly as they were. Capsules store
|
|
93
|
+
boundary-crossing edges (and a small "halo" of neighbouring nodes) precisely so that
|
|
94
|
+
nothing is lost when the pieces are reassembled.
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## Install
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
pip install kgzip
|
|
102
|
+
|
|
103
|
+
# optional: to read directly from a Neo4j database
|
|
104
|
+
pip install "kgzip[neo4j]"
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
From source (for development):
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
git clone <repo-url> && cd KGZip
|
|
111
|
+
pip install -e ".[dev]"
|
|
112
|
+
pytest # run the test suite
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Requires **Python ≥ 3.8**. Works in plain scripts and in Jupyter notebooks.
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## Quickstart
|
|
120
|
+
|
|
121
|
+
A few lines to compress a graph and query it:
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
import networkx as nx
|
|
125
|
+
from kgzip import KGZipStore
|
|
126
|
+
|
|
127
|
+
store = KGZipStore("./my_store") # 1. where the compressed store lives
|
|
128
|
+
store.compress(nx.karate_club_graph()) # 2. build capsules from a graph
|
|
129
|
+
result = store.query(["0", "1"], depth=2) # 3. ask: what's around nodes 0 and 1?
|
|
130
|
+
print(result.subgraph.meta.node_count) # 4. how many nodes came back
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
What just happened, line by line:
|
|
134
|
+
|
|
135
|
+
1. `KGZipStore("./my_store")` — open (or prepare to create) a store in that folder.
|
|
136
|
+
Nothing is read or written yet.
|
|
137
|
+
2. `compress(...)` — read the graph, cluster it into capsules, and write the capsule
|
|
138
|
+
files plus a manifest into `./my_store`.
|
|
139
|
+
3. `query(["0","1"], depth=2)` — find the capsules containing nodes `"0"` and `"1"`,
|
|
140
|
+
expand outward `depth` hops, decode just those capsules, and merge them.
|
|
141
|
+
4. The answer is a `QueryResult`; its `.subgraph` is a normal graph you can inspect.
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## Loading data from different sources
|
|
146
|
+
|
|
147
|
+
`compress()` accepts several common graph formats. You don't convert anything
|
|
148
|
+
yourself — KGZip detects the type and reads it.
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
store.compress(nx.karate_club_graph()) # a NetworkX graph object (in memory)
|
|
152
|
+
store.compress("graph.ttl") # RDF / Turtle file (.ttl, .n3, .nt)
|
|
153
|
+
store.compress("graph.jsonld") # JSON-LD file
|
|
154
|
+
store.compress("edges.csv") # CSV edge list (see format below)
|
|
155
|
+
store.compress("bolt://localhost:7687") # a live Neo4j database (see below)
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### CSV edge-list format
|
|
159
|
+
|
|
160
|
+
The simplest way to bring your own data. One row per edge:
|
|
161
|
+
|
|
162
|
+
```csv
|
|
163
|
+
src,dst,relation,weight
|
|
164
|
+
Aspirin,Headache,treats,1.0
|
|
165
|
+
Headache,GeneX,associated_with,1.0
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
- `src`, `dst` — **required**: the two node IDs the edge connects.
|
|
169
|
+
- `relation` — optional: the edge type (defaults to `related_to`).
|
|
170
|
+
- `weight` — optional: a number (defaults to `1.0`).
|
|
171
|
+
- `src_type`, `dst_type` — optional: node categories (default `unknown`).
|
|
172
|
+
- Any other columns are kept as edge attributes.
|
|
173
|
+
|
|
174
|
+
### Reading directly from Neo4j
|
|
175
|
+
|
|
176
|
+
[Neo4j](https://neo4j.com/) is a popular graph **database**. KGZip can read a full
|
|
177
|
+
snapshot of it over **Bolt** (Neo4j's network connection protocol — the `bolt://`
|
|
178
|
+
address is just "where the database is listening"). You supply the connection URL
|
|
179
|
+
and, if the database requires one, your login:
|
|
180
|
+
|
|
181
|
+
```python
|
|
182
|
+
from kgzip import KGZipStore
|
|
183
|
+
|
|
184
|
+
# If your Neo4j has no authentication:
|
|
185
|
+
store = KGZipStore("./my_store")
|
|
186
|
+
store.compress("bolt://localhost:7687")
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
Most Neo4j databases require a username and password. Pass them via the store's
|
|
190
|
+
`IngestionConfig`:
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
from kgzip import KGZipStore
|
|
194
|
+
from kgzip.models import KGZipConfig, IngestionConfig
|
|
195
|
+
|
|
196
|
+
config = KGZipConfig(
|
|
197
|
+
ingestion=IngestionConfig(
|
|
198
|
+
neo4j_auth=("neo4j", "your-password"), # (username, password)
|
|
199
|
+
neo4j_database=None, # database name; None = server default
|
|
200
|
+
neo4j_node_label=None, # only nodes with this label; None = all
|
|
201
|
+
),
|
|
202
|
+
)
|
|
203
|
+
store = KGZipStore("./my_store", config)
|
|
204
|
+
store.compress("bolt://localhost:7687") # one-time snapshot read + compress
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
KGZip reads every node (`id(n)` becomes the node ID, the first label becomes the
|
|
208
|
+
node type, properties become attributes) and every relationship. It **only reads** —
|
|
209
|
+
your Neo4j data is never changed.
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
## How a query works (the mental model)
|
|
214
|
+
|
|
215
|
+
```
|
|
216
|
+
compress() once: query() many times:
|
|
217
|
+
|
|
218
|
+
master graph query(["X"], depth=2)
|
|
219
|
+
│ │
|
|
220
|
+
▼ ▼
|
|
221
|
+
┌──────────┐ find capsule holding "X"
|
|
222
|
+
│ capsules │ ◄──── reads only ──── + its neighbour capsules
|
|
223
|
+
│ + manifest │
|
|
224
|
+
└──────────┘ ▼
|
|
225
|
+
(on local disk) decode those capsules, merge
|
|
226
|
+
│
|
|
227
|
+
▼
|
|
228
|
+
QueryResult.subgraph
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
- **`depth`** controls how far out from your seed nodes to reach. `depth=1` is "the
|
|
232
|
+
seed nodes and their immediate surroundings"; higher depth pulls in more.
|
|
233
|
+
- KGZip retrieves at **capsule granularity** — it returns whole clusters, so the
|
|
234
|
+
result is a *superset* of the exact neighbourhood (great recall; some extra nodes).
|
|
235
|
+
Asking for *all* nodes always returns the complete original graph (lossless).
|
|
236
|
+
|
|
237
|
+
---
|
|
238
|
+
|
|
239
|
+
## API reference
|
|
240
|
+
|
|
241
|
+
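`KGZipStore` is the **only class you need** for everyday use. The config classes, serialization helpers, and exceptions shown below are optional; everything else is internal.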
|
|
242
|
+
|
|
243
|
+
### Creating a store
|
|
244
|
+
|
|
245
|
+
```python
|
|
246
|
+
KGZipStore(path, config=None)
|
|
247
|
+
```
|
|
248
|
+
- `path` — folder for the compressed store (created on first `compress()`).
|
|
249
|
+
- `config` — optional `KGZipConfig` to tune clustering/compression (see below).
|
|
250
|
+
- The manifest is loaded **lazily** (on your first `query()`), so creating a store
|
|
251
|
+
is instant and does no I/O.
|
|
252
|
+
- Works as a **context manager**: `with KGZipStore(path) as store: ...`.
|
|
253
|
+
|
|
254
|
+
### `compress(graph, *, config=None) → CapsuleStoreRef`
|
|
255
|
+
Builds the compressed store from a graph. Accepts any supported source (NetworkX
|
|
256
|
+
object, file path, or `bolt://` URL). Steps it runs for you: ingest → cluster →
|
|
257
|
+
encode → write capsules → write manifest (written last, as the safe commit point).
|
|
258
|
+
|
|
259
|
+
- Returns a **`CapsuleStoreRef`** describing the new store: `manifest_path`,
|
|
260
|
+
`capsule_count`, `total_bytes`, `gcs_summary`, `store_version`, `created_at`.
|
|
261
|
+
- **Idempotent** by default (`overwrite=False`): re-compressing the same graph skips
|
|
262
|
+
capsules whose content hasn't changed.
|
|
263
|
+
- Thread/process-safe: `compress()` takes a file lock, so two concurrent runs can't clobber each other.
|
|
264
|
+
|
|
265
|
+
```python
|
|
266
|
+
ref = store.compress("edges.csv")
|
|
267
|
+
print(ref.capsule_count, ref.total_bytes)
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
### `query(node_ids, depth=1, **kwargs) → QueryResult`
|
|
271
|
+
Fetch the subgraph around one or more seed nodes.
|
|
272
|
+
|
|
273
|
+
- `node_ids` — list of node IDs to start from (must be non-empty).
|
|
274
|
+
- `depth` — how many hops to expand. `depth=1` is the seeds and their immediate
|
|
275
|
+
surroundings; higher values pull in more. **`depth=None` = unbounded** (follow the graph
|
|
276
|
+
until nothing new is reachable — the whole connected subgraph).
|
|
277
|
+
- Optional keyword arguments:
|
|
278
|
+
- `trim: bool = False` — **token control.** `False` returns the full capsule
|
|
279
|
+
contents (a *superset* of the neighbourhood — more context). `True` prunes the
|
|
280
|
+
result down to the **exact `depth`-hop neighbourhood** of your seeds. Trimming is
|
|
281
|
+
*lossless relative to the query* (it never drops anything within `depth` hops) and
|
|
282
|
+
can cut output ~100× on large graphs. See [Saving tokens](#saving-tokens).
|
|
283
|
+
- `max_capsules: int = 50` — safety cap on how many capsules one query may load.
|
|
284
|
+
**Set it higher, or to `None`, to fetch large/complete subgraphs.** If the cap limits a
|
|
285
|
+
result, `QueryResult.truncated` is set to `True` (never a silent partial answer).
|
|
286
|
+
- `relation_filter: list[str]` — keep only edges of these relation types.
|
|
287
|
+
- `consistency: "eventual" | "strict"` — `"strict"` re-fetches stale parts from
|
|
288
|
+
the master via `master_kg_fn` instead of serving possibly-stale capsule data.
|
|
289
|
+
- `timeout_ms: int` — max time to wait for parallel decoding (default 5000).
|
|
290
|
+
- `master_kg_fn: Callable` — required when `consistency="strict"`; you write a
|
|
291
|
+
function `node_ids -> fresh subgraph` that fetches from your master.
|
|
292
|
+
|
|
293
|
+
Returns a **`QueryResult`**:
|
|
294
|
+
| Field | Meaning |
|
|
295
|
+
|---|---|
|
|
296
|
+
| `subgraph` | the merged result graph (a `NormalizedGraph`) |
|
|
297
|
+
| `capsules_loaded` | how many capsules were read |
|
|
298
|
+
| `latency_ms` | how long the query took |
|
|
299
|
+
| `stale_capsules` | IDs of capsules flagged stale |
|
|
300
|
+
| `fallback_used` | `True` if the master was consulted (strict mode) |
|
|
301
|
+
| `query_node_ids_not_found` | seed IDs that weren't in the store |
|
|
302
|
+
| `truncated` | `True` if `max_capsules` limited the result (it's incomplete) |
|
|
303
|
+
|
|
304
|
+
```python
|
|
305
|
+
# Token-lean: exact 2-hop neighbourhood, only "treats" edges
|
|
306
|
+
res = store.query(["Aspirin"], depth=2, trim=True, relation_filter=["treats"])
|
|
307
|
+
|
|
308
|
+
# Agent escape hatch: not satisfied? fetch everything reachable, no caps
|
|
309
|
+
res = store.query(["Aspirin"], depth=None, max_capsules=None)
|
|
310
|
+
if res.truncated:
|
|
311
|
+
print("result was capped — raise max_capsules")
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
#### Iterative deepening (for AI agents)
|
|
315
|
+
The defaults are safe (you never silently get *less* than the true neighbourhood; a result capped by `max_capsules` is flagged `truncated`). An agent
|
|
316
|
+
can start cheap and widen only when needed:
|
|
317
|
+
|
|
318
|
+
```python
|
|
319
|
+
res = store.query(seeds, depth=1, trim=True) # cheap, few tokens
|
|
320
|
+
if not_enough(res):
|
|
321
|
+
res = store.query(seeds, depth=3, trim=True) # go deeper
|
|
322
|
+
if still_not_enough(res):
|
|
323
|
+
res = store.query(seeds, depth=None, max_capsules=None) # the whole reachable graph
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
### `sync(master_graph=None) → SyncReport`
|
|
327
|
+
Keep the store consistent with a changed master.
|
|
328
|
+
- `sync()` with no argument → marks all capsules **stale** (they'll be treated as
|
|
329
|
+
out-of-date until rebuilt).
|
|
330
|
+
- `sync(updated_graph)` → re-compresses the store from the updated graph.
|
|
331
|
+
- Returns a **`SyncReport`**: `stale_count`, `re_encoded_count`, `skipped_count`,
|
|
332
|
+
`sync_duration_ms`.
|
|
333
|
+
|
|
334
|
+
### `status() → StoreStatus`
|
|
335
|
+
A safe health check that never raises.
|
|
336
|
+
- Returns **`StoreStatus`**: `exists`, `capsule_count`, `stale_count`, `total_bytes`,
|
|
337
|
+
`store_version`, `last_encoded_at`.
|
|
338
|
+
- `exists=False` means nothing has been compressed yet.
|
|
339
|
+
|
|
340
|
+
```python
|
|
341
|
+
if not store.status().exists:
|
|
342
|
+
store.compress(my_graph)
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
### Configuration
|
|
346
|
+
|
|
347
|
+
Tune how KGZip clusters and compresses. Defaults are sensible — change these only if
|
|
348
|
+
you need to.
|
|
349
|
+
|
|
350
|
+
```python
|
|
351
|
+
from kgzip.models import KGZipConfig, DecisionConfig, StorageConfig
|
|
352
|
+
|
|
353
|
+
config = KGZipConfig(
|
|
354
|
+
decision=DecisionConfig(
|
|
355
|
+
max_capsule_nodes=500, # biggest a capsule may get (bigger ones are split)
|
|
356
|
+
min_capsule_nodes=5, # smallest; tiny clusters merge into a neighbour
|
|
357
|
+
spectral_k=8, # size of each capsule's structural "fingerprint"
|
|
358
|
+
random_seed=42, # makes clustering reproducible
|
|
359
|
+
),
|
|
360
|
+
storage=StorageConfig(
|
|
361
|
+
base_path="./my_store",
|
|
362
|
+
compression="zstd", # "zstd" (best) | "gzip" | "none"
|
|
363
|
+
compression_level=3, # 1–19 for zstd (higher = smaller, slower)
|
|
364
|
+
overwrite=False, # True = always re-encode, even if unchanged
|
|
365
|
+
),
|
|
366
|
+
)
|
|
367
|
+
store = KGZipStore("./my_store", config)
|
|
368
|
+
```
|
|
369
|
+
|
|
370
|
+
### Errors
|
|
371
|
+
|
|
372
|
+
Every error KGZip raises is a subclass of **`kgzip.KGZipError`** and carries a
|
|
373
|
+
`message` plus a `context` dict for debugging. Common ones:
|
|
374
|
+
|
|
375
|
+
| Exception | When |
|
|
376
|
+
|---|---|
|
|
377
|
+
| `EmptyGraphError` | the input graph has no nodes |
|
|
378
|
+
| `SchemaError` | a CSV is missing required `src`/`dst` columns |
|
|
379
|
+
| `SoftDependencyError` | an optional library (e.g. `neo4j`) isn't installed |
|
|
380
|
+
| `ConnectionError` | a Neo4j database couldn't be reached |
|
|
381
|
+
| `StoreNotFoundError` | you queried before compressing |
|
|
382
|
+
| `CorruptionError` / `VersionError` | a capsule file is damaged or wrong version |
|
|
383
|
+
| `QueryError` | bad query input (e.g. empty `node_ids`) |
|
|
384
|
+
|
|
385
|
+
```python
|
|
386
|
+
from kgzip import KGZipError
|
|
387
|
+
try:
|
|
388
|
+
store.query([], depth=1)
|
|
389
|
+
except KGZipError as e:
|
|
390
|
+
print(e.message, e.context)
|
|
391
|
+
```
|
|
392
|
+
|
|
393
|
+
---
|
|
394
|
+
|
|
395
|
+
## Saving tokens
|
|
396
|
+
|
|
397
|
+
If you feed query results to an LLM/agent, the number of tokens matters. Two levers,
|
|
398
|
+
both **lossless** (they remove waste, not information you asked for):
|
|
399
|
+
|
|
400
|
+
**1. `trim=True` — return only the exact neighbourhood.** Without it, a query returns
|
|
401
|
+
the seed's whole community capsule (lots of extra context). With it, you get exactly
|
|
402
|
+
the `depth`-hop neighbourhood.
|
|
403
|
+
|
|
404
|
+
**2. Compact serialization** — render the subgraph as terse triples instead of verbose
|
|
405
|
+
JSON, with optional attribute projection:
|
|
406
|
+
|
|
407
|
+
```python
|
|
408
|
+
from kgzip import to_triples, to_compact
|
|
409
|
+
|
|
410
|
+
res = store.query(["Aspirin"], depth=2, trim=True)
|
|
411
|
+
|
|
412
|
+
print(to_triples(res.subgraph))
|
|
413
|
+
# Aspirin --treats--> Headache
|
|
414
|
+
# Headache --associated_with--> GeneX
|
|
415
|
+
|
|
416
|
+
to_compact(res.subgraph) # ids + types only (leanest)
|
|
417
|
+
to_compact(res.subgraph, attrs=["name"], include_attrs=True) # keep only the 'name' attr
|
|
418
|
+
```
|
|
419
|
+
|
|
420
|
+
Measured on a 1,000-node medical KG, average tokens for a single depth-2 query
|
|
421
|
+
(`chars/4` estimate):
|
|
422
|
+
|
|
423
|
+
| Strategy | tokens | vs naive |
|
|
424
|
+
|---|---:|---:|
|
|
425
|
+
| Full-capsule result, verbose JSON | 54,114 | 1× |
|
|
426
|
+
| Full-capsule result, compact triples | 26,191 | 2.1× less |
|
|
427
|
+
| **`trim=True` + compact triples (= exact neighbourhood)** | **367** | **~147× less** |
|
|
428
|
+
|
|
429
|
+
The trimmed output equals what a precise Neo4j neighbourhood query would return — so
|
|
430
|
+
you get targeted-query token cost *plus* KGZip's storage/offline benefits. If you need
|
|
431
|
+
more, just widen `depth` or set `trim=False`; nothing is lost, it's your choice.
|
|
432
|
+
|
|
433
|
+
---
|
|
434
|
+
|
|
435
|
+
## When should I (not) use KGZip?
|
|
436
|
+
|
|
437
|
+
- ✅ Your graph is **large** and you mostly read **local neighbourhoods**.
|
|
438
|
+
- ✅ You want a **smaller on-disk** representation than raw JSON.
|
|
439
|
+
- ✅ Your graph **doesn't fit comfortably in memory**, so you must read from storage.
|
|
440
|
+
- ❌ Your graph is small and fits in RAM, and you traverse it repeatedly in-process —
|
|
441
|
+
plain in-memory traversal (e.g. NetworkX) will be faster. KGZip's wins are storage
|
|
442
|
+
size and avoiding full-graph loads, **not** beating RAM-speed traversal.
|
|
443
|
+
|
|
444
|
+
---
|
|
445
|
+
|
|
446
|
+
## How it works (under the hood)
|
|
447
|
+
|
|
448
|
+
KGZip is built as five layers; you only ever touch the last one (`KGZipStore`).
|
|
449
|
+
|
|
450
|
+
1. **Ingestion (L1)** — any input → a clean, immutable `NormalizedGraph` with unique
|
|
451
|
+
string node IDs.
|
|
452
|
+
2. **Decision (L2)** — analyse the graph, detect communities (clusters of densely
|
|
453
|
+
connected nodes via the Louvain algorithm), and plan one capsule per community.
|
|
454
|
+
3. **Encoding (L3)** — write each capsule as a compact binary `.kgzc` file (magic
|
|
455
|
+
bytes, version, header, SHA-256 checksum, compressed payload). The
|
|
456
|
+
`manifest.kgz.json` index is written **last**, as the atomic commit point.
|
|
457
|
+
4. **Query (L4)** — use the manifest to find the right capsules, decode them in
|
|
458
|
+
parallel, verify their checksums, and merge.
|
|
459
|
+
5. **Facade (L5)** — `KGZipStore` ties it together with locking and lazy loading.
|
|
460
|
+
|
|
461
|
+
---
|
|
462
|
+
|
|
463
|
+
## Glossary
|
|
464
|
+
|
|
465
|
+
- **Node** — a thing in the graph (has an ID, a type, and attributes).
|
|
466
|
+
- **Edge / relation** — a directed connection between two nodes (has a type and weight).
|
|
467
|
+
- **Capsule** — a cluster of related nodes, stored as one `.kgzc` file. The unit KGZip
|
|
468
|
+
loads per query.
|
|
469
|
+
- **Manifest** — `manifest.kgz.json`, the index that maps nodes to capsules.
|
|
470
|
+
- **Community** — a group of nodes more densely connected to each other than to the
|
|
471
|
+
rest of the graph; KGZip turns each into a capsule.
|
|
472
|
+
- **Boundary / halo node** — a node on the edge of a capsule that also connects to a
|
|
473
|
+
neighbouring capsule; stored in both so no edge is lost.
|
|
474
|
+
- **Depth** — how many hops outward from your seed nodes a query reaches (`None` = unbounded).
|
|
475
|
+
- **Trim** — prune a query result to the exact depth-hop neighbourhood (token-lean,
|
|
476
|
+
lossless w.r.t. the query). Opt-in via `trim=True`.
|
|
477
|
+
- **Truncated** — a query result flagged `truncated=True` because `max_capsules` capped
|
|
478
|
+
it; the answer is incomplete and you should raise the cap.
|
|
479
|
+
- **Master** — your original source-of-truth graph. KGZip never writes to it.
|
|
480
|
+
- **Lossless** — compressing then querying everything returns the exact original graph.
|
|
481
|
+
- **Bolt** — Neo4j's network protocol; a `bolt://host:port` URL is the database address.
|