kgzip 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. kgzip-0.1.0/LICENSE +21 -0
  2. kgzip-0.1.0/PKG-INFO +481 -0
  3. kgzip-0.1.0/README.md +447 -0
  4. kgzip-0.1.0/kgzip/__init__.py +7 -0
  5. kgzip-0.1.0/kgzip/_version.py +3 -0
  6. kgzip-0.1.0/kgzip/benchmark/__init__.py +5 -0
  7. kgzip-0.1.0/kgzip/benchmark/comparison.py +155 -0
  8. kgzip-0.1.0/kgzip/benchmark/harness.py +22 -0
  9. kgzip-0.1.0/kgzip/benchmark/medical_kg.py +71 -0
  10. kgzip-0.1.0/kgzip/benchmark/quality.py +226 -0
  11. kgzip-0.1.0/kgzip/compat.py +77 -0
  12. kgzip-0.1.0/kgzip/decision/__init__.py +101 -0
  13. kgzip-0.1.0/kgzip/decision/community.py +186 -0
  14. kgzip-0.1.0/kgzip/decision/gcs_scorer.py +84 -0
  15. kgzip-0.1.0/kgzip/decision/mode_selector.py +19 -0
  16. kgzip-0.1.0/kgzip/decision/profiler.py +71 -0
  17. kgzip-0.1.0/kgzip/decision/spectral.py +52 -0
  18. kgzip-0.1.0/kgzip/encoder/__init__.py +3 -0
  19. kgzip-0.1.0/kgzip/encoder/capsule_encoder.py +148 -0
  20. kgzip-0.1.0/kgzip/encoder/file_format.py +6 -0
  21. kgzip-0.1.0/kgzip/encoder/manifest.py +61 -0
  22. kgzip-0.1.0/kgzip/encoder/store_writer.py +263 -0
  23. kgzip-0.1.0/kgzip/exceptions.py +68 -0
  24. kgzip-0.1.0/kgzip/ingestion/__init__.py +3 -0
  25. kgzip-0.1.0/kgzip/ingestion/adapters/__init__.py +0 -0
  26. kgzip-0.1.0/kgzip/ingestion/adapters/csv.py +106 -0
  27. kgzip-0.1.0/kgzip/ingestion/adapters/jsonld.py +10 -0
  28. kgzip-0.1.0/kgzip/ingestion/adapters/neo4j.py +157 -0
  29. kgzip-0.1.0/kgzip/ingestion/adapters/networkx.py +92 -0
  30. kgzip-0.1.0/kgzip/ingestion/adapters/rdf.py +125 -0
  31. kgzip-0.1.0/kgzip/ingestion/normalizer.py +51 -0
  32. kgzip-0.1.0/kgzip/models.py +206 -0
  33. kgzip-0.1.0/kgzip/query/__init__.py +12 -0
  34. kgzip-0.1.0/kgzip/query/decoder.py +216 -0
  35. kgzip-0.1.0/kgzip/query/interface.py +180 -0
  36. kgzip-0.1.0/kgzip/query/merger.py +84 -0
  37. kgzip-0.1.0/kgzip/query/resolver.py +88 -0
  38. kgzip-0.1.0/kgzip/serialize.py +74 -0
  39. kgzip-0.1.0/kgzip/store.py +251 -0
  40. kgzip-0.1.0/kgzip.egg-info/PKG-INFO +481 -0
  41. kgzip-0.1.0/kgzip.egg-info/SOURCES.txt +54 -0
  42. kgzip-0.1.0/kgzip.egg-info/dependency_links.txt +1 -0
  43. kgzip-0.1.0/kgzip.egg-info/requires.txt +16 -0
  44. kgzip-0.1.0/kgzip.egg-info/top_level.txt +1 -0
  45. kgzip-0.1.0/pyproject.toml +48 -0
  46. kgzip-0.1.0/setup.cfg +4 -0
  47. kgzip-0.1.0/tests/test_benchmark.py +138 -0
  48. kgzip-0.1.0/tests/test_cross_layer_invariants.py +800 -0
  49. kgzip-0.1.0/tests/test_decision.py +418 -0
  50. kgzip-0.1.0/tests/test_encoder.py +484 -0
  51. kgzip-0.1.0/tests/test_ingestion.py +184 -0
  52. kgzip-0.1.0/tests/test_models.py +366 -0
  53. kgzip-0.1.0/tests/test_neo4j_adapter.py +226 -0
  54. kgzip-0.1.0/tests/test_query.py +604 -0
  55. kgzip-0.1.0/tests/test_query_trim.py +203 -0
  56. kgzip-0.1.0/tests/test_store.py +396 -0
kgzip-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Ayush Mukherjee
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
kgzip-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,481 @@
1
+ Metadata-Version: 2.1
2
+ Name: kgzip
3
+ Version: 0.1.0
4
+ Summary: Knowledge graph compression engine: parallel-decodable subgraph capsules for fast, storage-efficient KG queries.
5
+ Author: Ayush Mukherjee
6
+ License: MIT
7
+ Keywords: knowledge-graph,graph,compression,rdf,networkx,neo4j
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.8
10
+ Classifier: Programming Language :: Python :: 3.9
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Topic :: Scientific/Engineering
17
+ Requires-Python: >=3.8
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: rdflib>=6.0
21
+ Requires-Dist: networkx>=2.6
22
+ Requires-Dist: pandas>=1.3
23
+ Requires-Dist: python-louvain>=0.16
24
+ Requires-Dist: numpy>=1.21
25
+ Requires-Dist: scipy>=1.7
26
+ Requires-Dist: msgpack>=1.0
27
+ Requires-Dist: zstandard>=0.18
28
+ Requires-Dist: nest_asyncio>=1.5
29
+ Requires-Dist: filelock>=3.4
30
+ Provides-Extra: neo4j
31
+ Requires-Dist: neo4j>=5.0; extra == "neo4j"
32
+ Provides-Extra: dev
33
+ Requires-Dist: pytest>=7.0; extra == "dev"
34
+
35
+ # KGZip
36
+
37
+ **A compression engine for knowledge graphs.** KGZip takes a knowledge graph and
38
+ splits it into small, independently-loadable pieces called *capsules*, so that when
39
+ you ask a question about one part of the graph, you only read that part — not the
40
+ whole thing. The result is a store that is **smaller on disk** and lets you **query
41
+ large graphs without loading them entirely into memory**.
42
+
43
+ This README assumes **no prior knowledge of knowledge graphs**. If you already know
44
+ the basics, jump to [Quickstart](#quickstart) or the [API reference](#api-reference).
45
+
46
+ ---
47
+
48
+ ## New here? Start with the concepts
49
+
50
+ ### What is a knowledge graph?
51
+
52
+ A **knowledge graph (KG)** is just data stored as **things** and the **relationships
53
+ between them**.
54
+
55
+ - A **node** is a thing: a drug, a disease, a person, a movie.
56
+ - An **edge** is a relationship connecting two nodes: *Aspirin* **treats** *Headache*.
57
+
58
+ ```
59
+ (Aspirin) --treats--> (Headache) --associated_with--> (GeneX)
60
+ ```
61
+
62
+ Here `Aspirin`, `Headache`, and `GeneX` are nodes; `treats` and `associated_with`
63
+ are edges (also called *relations*). Each node can also carry **attributes**
64
+ (properties), e.g. Aspirin's `{ "formula": "C9H8O4" }`.
65
+
66
+ That's the whole idea. Real KGs just have many more nodes and edges (thousands to
67
+ billions), often describing a domain like medicine, finance, or social networks.
68
+
69
+ ### What problem does KGZip solve?
70
+
71
+ When a graph is large, two things get painful:
72
+
73
+ 1. **Storage** — keeping the whole graph around costs space.
74
+ 2. **Querying** — to answer "what is near node X?", naive tools load or scan the
75
+ entire graph, even though you only care about a tiny neighbourhood.
76
+
77
+ KGZip pre-organises the graph into **capsules** (clusters of closely-related nodes)
78
+ and writes a small **manifest** (an index). A query then loads only the capsules it
79
+ needs. Think of it like a book with chapters and a table of contents: to read about
80
+ one topic you open one chapter, not the entire book.
81
+
82
+ ### The golden rule: KGZip is a *read replica*
83
+
84
+ Your original graph (in a file, or in a database like Neo4j) is always the **source
85
+ of truth** — the "master". KGZip builds a **compressed copy** from it for fast
86
+ reads. **KGZip never modifies your original data.** If the KGZip store is ever lost
87
+ or corrupted, you can always rebuild it from the master.
88
+
89
+ ### Is it lossless?
90
+
91
+ Yes. KGZip v1 is **lossless**: if you compress a graph and then ask for all of its
92
+ nodes back, you get *every node and every edge* exactly as they were. Capsules store
93
+ boundary-crossing edges (and a small "halo" of neighbouring nodes) precisely so that
94
+ nothing is lost when the pieces are reassembled.
95
+
96
+ ---
97
+
98
+ ## Install
99
+
100
+ ```bash
101
+ pip install kgzip
102
+
103
+ # optional: to read directly from a Neo4j database
104
+ pip install "kgzip[neo4j]"
105
+ ```
106
+
107
+ From source (for development):
108
+
109
+ ```bash
110
+ git clone <repo-url> && cd KGZip
111
+ pip install -e ".[dev]"
112
+ pytest # run the test suite
113
+ ```
114
+
115
+ Requires **Python ≥ 3.8**. Works in plain scripts and in Jupyter notebooks.
116
+
117
+ ---
118
+
119
+ ## Quickstart
120
+
121
+ A few lines of code to compress a graph and query it:
122
+
123
+ ```python
124
+ import networkx as nx
125
+ from kgzip import KGZipStore
126
+
127
+ store = KGZipStore("./my_store") # 1. where the compressed store lives
128
+ store.compress(nx.karate_club_graph()) # 2. build capsules from a graph
129
+ result = store.query(["0", "1"], depth=2) # 3. ask: what's around nodes 0 and 1?
130
+ print(result.subgraph.meta.node_count) # 4. how many nodes came back
131
+ ```
132
+
133
+ What just happened, line by line:
134
+
135
+ 1. `KGZipStore("./my_store")` — open (or prepare to create) a store in that folder.
136
+ Nothing is read or written yet.
137
+ 2. `compress(...)` — read the graph, cluster it into capsules, and write the capsule
138
+ files plus a manifest into `./my_store`.
139
+ 3. `query(["0","1"], depth=2)` — find the capsules containing nodes `"0"` and `"1"`,
140
+ expand outward `depth` hops, decode just those capsules, and merge them.
141
+ 4. The answer is a `QueryResult`; its `.subgraph` is a `NormalizedGraph` you can inspect (here, its node count).
142
+
143
+ ---
144
+
145
+ ## Loading data from different sources
146
+
147
+ `compress()` accepts several common graph formats. You don't convert anything
148
+ yourself — KGZip detects the type and reads it.
149
+
150
+ ```python
151
+ store.compress(nx.karate_club_graph()) # a NetworkX graph object (in memory)
152
+ store.compress("graph.ttl") # RDF / Turtle file (.ttl, .n3, .nt)
153
+ store.compress("graph.jsonld") # JSON-LD file
154
+ store.compress("edges.csv") # CSV edge list (see format below)
155
+ store.compress("bolt://localhost:7687") # a live Neo4j database (see below)
156
+ ```
157
+
158
+ ### CSV edge-list format
159
+
160
+ The simplest way to bring your own data. One row per edge:
161
+
162
+ ```csv
163
+ src,dst,relation,weight
164
+ Aspirin,Headache,treats,1.0
165
+ Headache,GeneX,associated_with,1.0
166
+ ```
167
+
168
+ - `src`, `dst` — **required**: the two node IDs the edge connects.
169
+ - `relation` — optional: the edge type (defaults to `related_to`).
170
+ - `weight` — optional: a number (defaults to `1.0`).
171
+ - `src_type`, `dst_type` — optional: node categories (default `unknown`).
172
+ - Any other columns are kept as edge attributes.
173
+
174
+ ### Reading directly from Neo4j
175
+
176
+ [Neo4j](https://neo4j.com/) is a popular graph **database**. KGZip can read a full
177
+ snapshot of it over **Bolt** (Neo4j's network connection protocol — the `bolt://`
178
+ address is just "where the database is listening"). You supply the connection URL
179
+ and, if the database requires authentication, your login:
180
+
181
+ ```python
182
+ from kgzip import KGZipStore
183
+
184
+ # If your Neo4j has no authentication:
185
+ store = KGZipStore("./my_store")
186
+ store.compress("bolt://localhost:7687")
187
+ ```
188
+
189
+ Most Neo4j databases require a username and password. Pass them via the store's
190
+ `IngestionConfig`:
191
+
192
+ ```python
193
+ from kgzip import KGZipStore
194
+ from kgzip.models import KGZipConfig, IngestionConfig
195
+
196
+ config = KGZipConfig(
197
+ ingestion=IngestionConfig(
198
+ neo4j_auth=("neo4j", "your-password"), # (username, password)
199
+ neo4j_database=None, # database name; None = server default
200
+ neo4j_node_label=None, # only nodes with this label; None = all
201
+ ),
202
+ )
203
+ store = KGZipStore("./my_store", config)
204
+ store.compress("bolt://localhost:7687") # one-time snapshot read + compress
205
+ ```
206
+
207
+ KGZip reads every node (`id(n)` becomes the node ID, the first label becomes the
208
+ node type, properties become attributes) and every relationship. It **only reads** —
209
+ your Neo4j data is never changed.
210
+
211
+ ---
212
+
213
+ ## How a query works (the mental model)
214
+
215
+ ```
216
+ compress() once: query() many times:
217
+
218
+ master graph query(["X"], depth=2)
219
+ │ │
220
+ ▼ ▼
221
+ ┌──────────┐ find capsule holding "X"
222
+ │ capsules │ ◄──── reads only ──── + its neighbour capsules
223
+ │ + manifest │
224
+ └──────────┘ ▼
225
+ (on local disk) decode those capsules, merge
226
+
227
+
228
+ QueryResult.subgraph
229
+ ```
230
+
231
+ - **`depth`** controls how far out from your seed nodes to reach. `depth=1` is "the
232
+ seed nodes and their immediate surroundings"; higher depth pulls in more.
233
+ - KGZip retrieves at **capsule granularity** — it returns whole clusters, so the
234
+ result is a *superset* of the exact neighbourhood (great recall; some extra nodes).
235
+ Asking for *all* nodes always returns the complete original graph (lossless).
236
+
237
+ ---
238
+
239
+ ## API reference
240
+
241
+ `KGZipStore` is the **only class you need**. Everything else is internal.
242
+
243
+ ### Creating a store
244
+
245
+ ```python
246
+ KGZipStore(path, config=None)
247
+ ```
248
+ - `path` — folder for the compressed store (created on first `compress()`).
249
+ - `config` — optional `KGZipConfig` to tune clustering/compression (see below).
250
+ - The manifest is loaded **lazily** (on your first `query()`), so creating a store
251
+ is instant and does no I/O.
252
+ - Works as a **context manager**: `with KGZipStore(path) as store: ...`.
253
+
254
+ ### `compress(graph, *, config=None) → CapsuleStoreRef`
255
+ Builds the compressed store from a graph. Accepts any supported source (NetworkX
256
+ object, file path, or `bolt://` URL). Steps it runs for you: ingest → cluster →
257
+ encode → write capsules → write manifest (written last, as the safe commit point).
258
+
259
+ - Returns a **`CapsuleStoreRef`** describing the new store: `manifest_path`,
260
+ `capsule_count`, `total_bytes`, `gcs_summary`, `store_version`, `created_at`.
261
+ - **Idempotent** by default (`StorageConfig.overwrite=False`): re-compressing the same graph skips
262
+ capsules whose content hasn't changed.
263
+ - Thread/process-safe: takes a file lock so two compresses can't clobber each other.
264
+
265
+ ```python
266
+ ref = store.compress("edges.csv")
267
+ print(ref.capsule_count, ref.total_bytes)
268
+ ```
269
+
270
+ ### `query(node_ids, depth=1, **kwargs) → QueryResult`
271
+ Fetch the subgraph around one or more seed nodes.
272
+
273
+ - `node_ids` — list of node IDs to start from (must be non-empty).
274
+ - `depth` — how many hops to expand. `depth=1` is the seeds and their immediate
275
+ surroundings; higher pulls in more. **`depth=None` = unbounded** (follow the graph
276
+ until nothing new is reachable — the whole connected subgraph).
277
+ - Optional keyword arguments:
278
+ - `trim: bool = False` — **token control.** `False` returns the full capsule
279
+ contents (a *superset* of the neighbourhood — more context). `True` prunes the
280
+ result down to the **exact `depth`-hop neighbourhood** of your seeds. Trimming is
281
+ *lossless relative to the query* (it never drops anything within `depth` hops) and
282
+ can cut output ~100× on large graphs. See [Saving tokens](#saving-tokens).
283
+ - `max_capsules: int = 50` — safety cap on how many capsules one query may load.
284
+ **Set higher, or `None`, to fetch large/complete subgraphs.** If the cap limits a
285
+ result, `QueryResult.truncated` is set to `True` (never a silent partial answer).
286
+ - `relation_filter: list[str]` — keep only edges of these relation types.
287
+ - `consistency: "eventual" | "strict"` — `"strict"` re-fetches stale parts from
288
+ the master via `master_kg_fn` instead of serving possibly-stale capsule data.
289
+ - `timeout_ms: int` — max time to wait for parallel decoding (default 5000).
290
+ - `master_kg_fn: Callable` — required when `consistency="strict"`; you write a
291
+ function `node_ids -> fresh subgraph` that fetches from your master.
292
+
293
+ Returns a **`QueryResult`**:
294
+ | Field | Meaning |
295
+ |---|---|
296
+ | `subgraph` | the merged result graph (a `NormalizedGraph`) |
297
+ | `capsules_loaded` | how many capsules were read |
298
+ | `latency_ms` | how long the query took |
299
+ | `stale_capsules` | IDs of capsules flagged stale |
300
+ | `fallback_used` | `True` if the master was consulted (strict mode) |
301
+ | `query_node_ids_not_found` | seed IDs that weren't in the store |
302
+ | `truncated` | `True` if `max_capsules` limited the result (it's incomplete) |
303
+
304
+ ```python
305
+ # Token-lean: exact 2-hop neighbourhood, only "treats" edges
306
+ res = store.query(["Aspirin"], depth=2, trim=True, relation_filter=["treats"])
307
+
308
+ # Agent escape hatch: not satisfied? fetch everything reachable, no caps
309
+ res = store.query(["Aspirin"], depth=None, max_capsules=None)
310
+ if res.truncated:
311
+ print("result was capped — raise max_capsules")
312
+ ```
313
+
314
+ #### Iterative deepening (for AI agents)
315
+ The defaults are safe (you never *silently* get less than the true neighbourhood — a result capped by `max_capsules` is flagged `truncated`). An agent
316
+ can start cheap and widen only when needed:
317
+
318
+ ```python
319
+ res = store.query(seeds, depth=1, trim=True) # cheap, few tokens
320
+ if not_enough(res):
321
+ res = store.query(seeds, depth=3, trim=True) # go deeper
322
+ if still_not_enough(res):
323
+ res = store.query(seeds, depth=None, max_capsules=None) # the whole reachable graph
324
+ ```
325
+
326
+ ### `sync(master_graph=None) → SyncReport`
327
+ Keep the store consistent with a changed master.
328
+ - `sync()` with no argument → marks all capsules **stale** (they'll be treated as
329
+ out-of-date until rebuilt).
330
+ - `sync(updated_graph)` → re-compresses the store from the updated graph.
331
+ - Returns a **`SyncReport`**: `stale_count`, `re_encoded_count`, `skipped_count`,
332
+ `sync_duration_ms`.
333
+
334
+ ### `status() → StoreStatus`
335
+ A safe, never-raises health check.
336
+ - Returns **`StoreStatus`**: `exists`, `capsule_count`, `stale_count`, `total_bytes`,
337
+ `store_version`, `last_encoded_at`.
338
+ - `exists=False` means nothing has been compressed yet.
339
+
340
+ ```python
341
+ if not store.status().exists:
342
+ store.compress(my_graph)
343
+ ```
344
+
345
+ ### Configuration
346
+
347
+ Tune how KGZip clusters and compresses. Defaults are sensible — change these only if
348
+ you need to.
349
+
350
+ ```python
351
+ from kgzip.models import KGZipConfig, DecisionConfig, StorageConfig
352
+
353
+ config = KGZipConfig(
354
+ decision=DecisionConfig(
355
+ max_capsule_nodes=500, # biggest a capsule may get (bigger ones are split)
356
+ min_capsule_nodes=5, # smallest; tiny clusters merge into a neighbour
357
+ spectral_k=8, # size of each capsule's structural "fingerprint"
358
+ random_seed=42, # makes clustering reproducible
359
+ ),
360
+ storage=StorageConfig(
361
+ base_path="./my_store",
362
+ compression="zstd", # "zstd" (best) | "gzip" | "none"
363
+ compression_level=3, # 1–19 for zstd (higher = smaller, slower)
364
+ overwrite=False, # True = always re-encode, even if unchanged
365
+ ),
366
+ )
367
+ store = KGZipStore("./my_store", config)
368
+ ```
369
+
370
+ ### Errors
371
+
372
+ Every error KGZip raises is a subclass of **`kgzip.KGZipError`** and carries a
373
+ `message` plus a `context` dict for debugging. Common ones:
374
+
375
+ | Exception | When |
376
+ |---|---|
377
+ | `EmptyGraphError` | the input graph has no nodes |
378
+ | `SchemaError` | a CSV is missing required `src`/`dst` columns |
379
+ | `SoftDependencyError` | an optional library (e.g. `neo4j`) isn't installed |
380
+ | `ConnectionError` | a Neo4j database couldn't be reached |
381
+ | `StoreNotFoundError` | you queried before compressing |
382
+ | `CorruptionError` / `VersionError` | a capsule file is damaged or wrong version |
383
+ | `QueryError` | bad query input (e.g. empty `node_ids`) |
384
+
385
+ ```python
386
+ from kgzip import KGZipError
387
+ try:
388
+ store.query([], depth=1)
389
+ except KGZipError as e:
390
+ print(e.message, e.context)
391
+ ```
392
+
393
+ ---
394
+
395
+ ## Saving tokens
396
+
397
+ If you feed query results to an LLM/agent, the number of tokens matters. Two levers,
398
+ both **lossless** (they remove waste, not information you asked for):
399
+
400
+ **1. `trim=True` — return only the exact neighbourhood.** Without it, a query returns
401
+ the seed's whole community capsule (lots of extra context). With it, you get exactly
402
+ the `depth`-hop neighbourhood.
403
+
404
+ **2. Compact serialization** — render the subgraph as terse triples instead of verbose
405
+ JSON, with optional attribute projection:
406
+
407
+ ```python
408
+ from kgzip import to_triples, to_compact
409
+
410
+ res = store.query(["Aspirin"], depth=2, trim=True)
411
+
412
+ print(to_triples(res.subgraph))
413
+ # Aspirin --treats--> Headache
414
+ # Headache --associated_with--> GeneX
415
+
416
+ to_compact(res.subgraph) # ids + types only (leanest)
417
+ to_compact(res.subgraph, attrs=["name"], include_attrs=True) # keep only the 'name' attr
418
+ ```
419
+
420
+ Measured on a 1,000-node medical KG, average tokens for a single depth-2 query
421
+ (`chars/4` estimate):
422
+
423
+ | Strategy | tokens | vs naive |
424
+ |---|---:|---:|
425
+ | Full-capsule result, verbose JSON | 54,114 | 1× |
426
+ | Full-capsule result, compact triples | 26,191 | 2.1× less |
427
+ | **`trim=True` + compact triples (= exact neighbourhood)** | **367** | **~147× less** |
428
+
429
+ The trimmed output equals what a precise Neo4j neighbourhood query would return — so
430
+ you get targeted-query token cost *plus* KGZip's storage/offline benefits. If you need
431
+ more, just widen `depth` or set `trim=False`; nothing is lost, it's your choice.
432
+
433
+ ---
434
+
435
+ ## When should I (not) use KGZip?
436
+
437
+ - ✅ Your graph is **large** and you mostly read **local neighbourhoods**.
438
+ - ✅ You want a **smaller on-disk** representation than raw JSON.
439
+ - ✅ Your graph **doesn't fit comfortably in memory**, so you must read from storage.
440
+ - ❌ Your graph is small and fits in RAM, and you traverse it repeatedly in-process —
441
+ plain in-memory traversal (e.g. NetworkX) will be faster. KGZip's wins are storage
442
+ size and avoiding full-graph loads, **not** beating RAM-speed traversal.
443
+
444
+ ---
445
+
446
+ ## How it works (under the hood)
447
+
448
+ KGZip is built as five layers; you only ever touch the last one (`KGZipStore`).
449
+
450
+ 1. **Ingestion (L1)** — any input → a clean, immutable `NormalizedGraph` with unique
451
+ string node IDs.
452
+ 2. **Decision (L2)** — analyse the graph, detect communities (clusters of densely
453
+ connected nodes via the Louvain algorithm), and plan one capsule per community.
454
+ 3. **Encoding (L3)** — write each capsule as a compact binary `.kgzc` file (magic
455
+ bytes, version, header, SHA-256 checksum, compressed payload). The
456
+ `manifest.kgz.json` index is written **last**, as the atomic commit point.
457
+ 4. **Query (L4)** — use the manifest to find the right capsules, decode them in
458
+ parallel, verify their checksums, and merge.
459
+ 5. **Facade (L5)** — `KGZipStore` ties it together with locking and lazy loading.
460
+
461
+ ---
462
+
463
+ ## Glossary
464
+
465
+ - **Node** — a thing in the graph (has an ID, a type, and attributes).
466
+ - **Edge / relation** — a directed connection between two nodes (has a type and weight).
467
+ - **Capsule** — a cluster of related nodes, stored as one `.kgzc` file. The unit KGZip
468
+ loads per query.
469
+ - **Manifest** — `manifest.kgz.json`, the index that maps nodes to capsules.
470
+ - **Community** — a group of nodes more densely connected to each other than to the
471
+ rest of the graph; KGZip turns each into a capsule.
472
+ - **Boundary / halo node** — a node on the border of a capsule that also connects to a
473
+ neighbouring capsule; stored in both so no edge is lost.
474
+ - **Depth** — how many hops outward from your seed nodes a query reaches (`None` = unbounded).
475
+ - **Trim** — prune a query result to the exact depth-hop neighbourhood (token-lean,
476
+ lossless w.r.t. the query). Opt-in via `trim=True`.
477
+ - **Truncated** — a query result flagged `truncated=True` because `max_capsules` capped
478
+ it; the answer is incomplete and you should raise the cap.
479
+ - **Master** — your original source-of-truth graph. KGZip never writes to it.
480
+ - **Lossless** — compressing then querying everything returns the exact original graph.
481
+ - **Bolt** — Neo4j's network protocol; a `bolt://host:port` URL is the database address.