contextfit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contextfit-0.1.0/.gitignore +62 -0
- contextfit-0.1.0/PKG-INFO +195 -0
- contextfit-0.1.0/README.md +161 -0
- contextfit-0.1.0/benchmarks/agent_memory_eval.py +820 -0
- contextfit-0.1.0/benchmarks/data/agent_memory_eval.json +4940 -0
- contextfit-0.1.0/benchmarks/data/agent_memory_eval_500.json +39424 -0
- contextfit-0.1.0/benchmarks/data/generated_cases.json +34486 -0
- contextfit-0.1.0/benchmarks/generate_eval_cases.py +314 -0
- contextfit-0.1.0/benchmarks/longmemeval_contextfit.py +690 -0
- contextfit-0.1.0/benchmarks/longmemeval_contextfit_report.md +685 -0
- contextfit-0.1.0/benchmarks/longmemeval_local_embed.py +259 -0
- contextfit-0.1.0/benchmarks/longmemeval_local_embed_fusion.json +12545 -0
- contextfit-0.1.0/benchmarks/longmemeval_pool_recency_full.json +13068 -0
- contextfit-0.1.0/benchmarks/longmemeval_structural_rerank.json +13068 -0
- contextfit-0.1.0/benchmarks/mem0_timing_sample.py +28 -0
- contextfit-0.1.0/benchmarks/vs_text_search.py +233 -0
- contextfit-0.1.0/benchmarks/vs_text_search_analysis.md +81 -0
- contextfit-0.1.0/benchmarks/vs_text_search_results.json +362 -0
- contextfit-0.1.0/build_env/.gitignore +2 -0
- contextfit-0.1.0/build_env/bin/Activate.ps1 +248 -0
- contextfit-0.1.0/build_env/bin/activate +76 -0
- contextfit-0.1.0/build_env/bin/activate.csh +27 -0
- contextfit-0.1.0/build_env/bin/activate.fish +69 -0
- contextfit-0.1.0/build_env/bin/docutils +6 -0
- contextfit-0.1.0/build_env/bin/keyring +6 -0
- contextfit-0.1.0/build_env/bin/markdown-it +6 -0
- contextfit-0.1.0/build_env/bin/normalizer +6 -0
- contextfit-0.1.0/build_env/bin/pip +6 -0
- contextfit-0.1.0/build_env/bin/pip3 +6 -0
- contextfit-0.1.0/build_env/bin/pip3.14 +6 -0
- contextfit-0.1.0/build_env/bin/pygmentize +6 -0
- contextfit-0.1.0/build_env/bin/pyproject-build +6 -0
- contextfit-0.1.0/build_env/bin/python +1 -0
- contextfit-0.1.0/build_env/bin/python3 +1 -0
- contextfit-0.1.0/build_env/bin/python3.14 +1 -0
- contextfit-0.1.0/build_env/bin/rst2html +6 -0
- contextfit-0.1.0/build_env/bin/rst2html4 +6 -0
- contextfit-0.1.0/build_env/bin/rst2html5 +6 -0
- contextfit-0.1.0/build_env/bin/rst2latex +6 -0
- contextfit-0.1.0/build_env/bin/rst2man +6 -0
- contextfit-0.1.0/build_env/bin/rst2odt +6 -0
- contextfit-0.1.0/build_env/bin/rst2pseudoxml +6 -0
- contextfit-0.1.0/build_env/bin/rst2s5 +6 -0
- contextfit-0.1.0/build_env/bin/rst2xetex +6 -0
- contextfit-0.1.0/build_env/bin/rst2xml +6 -0
- contextfit-0.1.0/build_env/bin/twine +6 -0
- contextfit-0.1.0/build_env/bin/𝜋thon +1 -0
- contextfit-0.1.0/build_env/pyvenv.cfg +5 -0
- contextfit-0.1.0/docs/ARCHITECTURE.md +588 -0
- contextfit-0.1.0/docs/MACBOOK_CLI_DEPLOY.md +133 -0
- contextfit-0.1.0/docs/OPENCLAW_INTEGRATION.md +98 -0
- contextfit-0.1.0/docs/PERFORMANCE_TESTING.md +89 -0
- contextfit-0.1.0/examples/benchmark_needle_haystack.py +298 -0
- contextfit-0.1.0/examples/benchmark_sample_corpus.py +146 -0
- contextfit-0.1.0/examples/quickstart.py +97 -0
- contextfit-0.1.0/integrations/openclaw/contextfit-search/index.js +247 -0
- contextfit-0.1.0/integrations/openclaw/contextfit-search/openclaw.plugin.json +33 -0
- contextfit-0.1.0/integrations/openclaw/contextfit-search/package.json +10 -0
- contextfit-0.1.0/pyproject.toml +68 -0
- contextfit-0.1.0/scripts/benchmark_semantic_expansion.py +171 -0
- contextfit-0.1.0/scripts/build_embedding_expander.py +85 -0
- contextfit-0.1.0/scripts/convert_sessions.py +156 -0
- contextfit-0.1.0/scripts/index_memory.py +200 -0
- contextfit-0.1.0/scripts/install_openclaw_plugin.sh +24 -0
- contextfit-0.1.0/scripts/start_server.sh +21 -0
- contextfit-0.1.0/scripts/start_server_spark.sh +18 -0
- contextfit-0.1.0/server.py +484 -0
- contextfit-0.1.0/src/contextfit/__init__.py +37 -0
- contextfit-0.1.0/src/contextfit/cli.py +833 -0
- contextfit-0.1.0/src/contextfit/core/__init__.py +6 -0
- contextfit-0.1.0/src/contextfit/core/chunk.py +231 -0
- contextfit-0.1.0/src/contextfit/core/expander.py +213 -0
- contextfit-0.1.0/src/contextfit/core/tokenizer.py +179 -0
- contextfit-0.1.0/src/contextfit/extractors/__init__.py +25 -0
- contextfit-0.1.0/src/contextfit/extractors/auto.py +53 -0
- contextfit-0.1.0/src/contextfit/extractors/base.py +58 -0
- contextfit-0.1.0/src/contextfit/extractors/document.py +72 -0
- contextfit-0.1.0/src/contextfit/extractors/email.py +78 -0
- contextfit-0.1.0/src/contextfit/extractors/tmd.py +171 -0
- contextfit-0.1.0/src/contextfit/graph/__init__.py +6 -0
- contextfit-0.1.0/src/contextfit/graph/community.py +289 -0
- contextfit-0.1.0/src/contextfit/graph/similarity.py +300 -0
- contextfit-0.1.0/src/contextfit/hierarchy/__init__.py +5 -0
- contextfit-0.1.0/src/contextfit/hierarchy/levels.py +296 -0
- contextfit-0.1.0/src/contextfit/index/__init__.py +6 -0
- contextfit-0.1.0/src/contextfit/index/bm25.py +181 -0
- contextfit-0.1.0/src/contextfit/index/inverted.py +830 -0
- contextfit-0.1.0/src/contextfit/index/semantic_expand.py +416 -0
- contextfit-0.1.0/src/contextfit/metadata/__init__.py +3 -0
- contextfit-0.1.0/src/contextfit/metadata/index.py +325 -0
- contextfit-0.1.0/src/contextfit/retrieval/__init__.py +41 -0
- contextfit-0.1.0/src/contextfit/retrieval/engine.py +1233 -0
- contextfit-0.1.0/src/contextfit/retrieval/memory_atoms.py +412 -0
- contextfit-0.1.0/src/contextfit/retrieval/query_router.py +274 -0
- contextfit-0.1.0/src/contextfit/retrieval/token_rerank.py +239 -0
- contextfit-0.1.0/src/contextfit/sid/__init__.py +14 -0
- contextfit-0.1.0/src/contextfit/sid/generator.py +196 -0
- contextfit-0.1.0/src/contextfit/sid/learned.py +382 -0
- contextfit-0.1.0/src/contextfit/sid/semantic.py +317 -0
- contextfit-0.1.0/tests/__init__.py +1 -0
- contextfit-0.1.0/tests/test_binary_storage.py +59 -0
- contextfit-0.1.0/tests/test_cli_json.py +78 -0
- contextfit-0.1.0/tests/test_core.py +127 -0
- contextfit-0.1.0/tests/test_engine_auto.py +155 -0
- contextfit-0.1.0/tests/test_exact_search.py +30 -0
- contextfit-0.1.0/tests/test_memory_atoms.py +76 -0
- contextfit-0.1.0/tests/test_needle_haystack_benchmark.py +39 -0
- contextfit-0.1.0/tests/test_query_router.py +139 -0
- contextfit-0.1.0/tests/test_sample_corpus_performance.py +173 -0
- contextfit-0.1.0/tests/test_semantic_ids.py +176 -0
- contextfit-0.1.0/website/index.html +1393 -0
- contextfit-0.1.0/whitepaper/token-native-agent-memory.md +606 -0
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
*.egg-info/
|
|
20
|
+
.installed.cfg
|
|
21
|
+
*.egg
|
|
22
|
+
|
|
23
|
+
# Virtual environments
|
|
24
|
+
.venv/
|
|
25
|
+
venv/
|
|
26
|
+
ENV/
|
|
27
|
+
env/
|
|
28
|
+
|
|
29
|
+
# IDE
|
|
30
|
+
.idea/
|
|
31
|
+
.vscode/
|
|
32
|
+
*.swp
|
|
33
|
+
*.swo
|
|
34
|
+
*~
|
|
35
|
+
|
|
36
|
+
# Testing
|
|
37
|
+
.pytest_cache/
|
|
38
|
+
.coverage
|
|
39
|
+
htmlcov/
|
|
40
|
+
.tox/
|
|
41
|
+
.nox/
|
|
42
|
+
|
|
43
|
+
# mypy
|
|
44
|
+
.mypy_cache/
|
|
45
|
+
.dmypy.json
|
|
46
|
+
dmypy.json
|
|
47
|
+
|
|
48
|
+
# Ruff
|
|
49
|
+
.ruff_cache/
|
|
50
|
+
|
|
51
|
+
# Knowledge bases (created by examples)
|
|
52
|
+
contextfit_kb/
|
|
53
|
+
*.bin
|
|
54
|
+
index.json
|
|
55
|
+
|
|
56
|
+
# OS
|
|
57
|
+
.DS_Store
|
|
58
|
+
Thumbs.db
|
|
59
|
+
benchmarks/data/longmemeval_*.json
|
|
60
|
+
benchmarks/cache/
|
|
61
|
+
benchmarks/longmemeval_contextfit_*.json
|
|
62
|
+
benchmarks/agent_memory_eval_*.json
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: contextfit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Token-native knowledge base for LLM scale
|
|
5
|
+
Project-URL: Homepage, https://github.com/ContextFit/cf
|
|
6
|
+
Project-URL: Documentation, https://github.com/ContextFit/cf#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/ContextFit/cf
|
|
8
|
+
Author-email: Christophe Ponsart <cponsart@gmail.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
Keywords: graph,knowledge-base,llm,rag,tokens
|
|
11
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Requires-Dist: datasketch>=1.6.0
|
|
21
|
+
Requires-Dist: networkx>=3.0
|
|
22
|
+
Requires-Dist: numpy>=1.24.0
|
|
23
|
+
Requires-Dist: pyroaring>=0.4.0
|
|
24
|
+
Requires-Dist: tiktoken>=0.5.0
|
|
25
|
+
Requires-Dist: tqdm>=4.65.0
|
|
26
|
+
Requires-Dist: zstandard>=0.21.0
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: mypy>=1.0.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
32
|
+
Provides-Extra: rust
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
# ContextFit
|
|
36
|
+
|
|
37
|
+
**A token-native knowledge base designed for LLM scale.**
|
|
38
|
+
|
|
39
|
+
ContextFit keeps everything—storage, indexing, search, relationships, traversal, and commonality detection—inside discrete token-ID space until the very last step, when you decode only the final retrieved token chunks for the LLM's output.
|
|
40
|
+
|
|
41
|
+
## Why Token-Native?
|
|
42
|
+
|
|
43
|
+
- **~2× smaller storage** than raw text (no repeated tokenization)
|
|
44
|
+
- **Blazing-fast integer-only operations** (no float embeddings)
|
|
45
|
+
- **Hierarchical "geo-map-style" traversal** for multi-hop reasoning
|
|
46
|
+
- **Neural-network-like chunk relationships** via token overlap graphs
|
|
47
|
+
- **Automatic commonality discovery** without vector spaces
|
|
48
|
+
- **Direct LLM injection** — feed `input_ids` directly, no conversion
|
|
49
|
+
|
|
50
|
+
## Architecture
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
┌─────────────────────────────────────────────────────────────────┐
|
|
54
|
+
│ ContextFit │
|
|
55
|
+
├─────────────────────────────────────────────────────────────────┤
|
|
56
|
+
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │
|
|
57
|
+
│ │ Storage │ │ Index │ │ Graph │ │
|
|
58
|
+
│ │ │ │ │ │ │ │
|
|
59
|
+
│ │ Token Arrays│ │ Inverted │ │ Chunk Relationships │ │
|
|
60
|
+
│ │ Chunk Store │ │ Suffix/FM │ │ Community Detection │ │
|
|
61
|
+
│ │ Compression │ │ BM25 Tokens │ │ Commonality Mining │ │
|
|
62
|
+
│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │
|
|
63
|
+
│ │
|
|
64
|
+
│ ┌─────────────────────────────┐ ┌─────────────────────────┐ │
|
|
65
|
+
│ │ Hierarchy │ │ Retrieval │ │
|
|
66
|
+
│ │ │ │ │ │
|
|
67
|
+
│ │ Level 0: Raw Chunks │ │ Query Tokenization │ │
|
|
68
|
+
│ │ Level 1+: Summary Clusters │ │ Graph Traversal │ │
|
|
69
|
+
│ │ Geo-Map Navigation │ │ Direct input_ids Output │ │
|
|
70
|
+
│ └─────────────────────────────┘ └─────────────────────────┘ │
|
|
71
|
+
│ │
|
|
72
|
+
│ ┌─────────────────────────────────────────────────────────────┐│
|
|
73
|
+
│ │ Semantic IDs (SIDs) ││
|
|
74
|
+
│ │ ││
|
|
75
|
+
│ │ Hierarchical token sequences → generative retrieval ││
|
|
76
|
+
│ │ Similar chunks share prefixes → trie-like navigation ││
|
|
77
|
+
│ └─────────────────────────────────────────────────────────────┘│
|
|
78
|
+
└─────────────────────────────────────────────────────────────────┘
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Core Components
|
|
82
|
+
|
|
83
|
+
### 1. Storage Layer
|
|
84
|
+
- Token arrays (uint16/uint32 IDs)
|
|
85
|
+
- Memory-mapped files for large corpora
|
|
86
|
+
- Delta encoding + Zstd compression
|
|
87
|
+
- Chunk metadata headers
|
|
88
|
+
|
|
89
|
+
### 2. Index Layer
|
|
90
|
+
- **Inverted Index**: tokenID → [(chunkID, positions)] using Roaring bitmaps
|
|
91
|
+
- **Suffix Array / FM-Index**: Instant exact n-gram search
|
|
92
|
+
- **BM25 on Tokens**: BM25 ranking (TF-IDF-style weighting with term-frequency saturation and length normalization) using token IDs as terms
|
|
93
|
+
- **Binary postings pack**: one compact `postings.bin` instead of JSON-per-token files
|
|
94
|
+
|
|
95
|
+
### 3. Graph Layer
|
|
96
|
+
- Nodes = chunks (or Semantic IDs)
|
|
97
|
+
- Edges = token n-gram overlap, Jaccard similarity, co-occurrence
|
|
98
|
+
- MinHash + LSH for fast similarity without floats
|
|
99
|
+
- Community detection for commonality discovery
|
|
100
|
+
|
|
101
|
+
### 4. Hierarchy Layer
|
|
102
|
+
- Level 0: Raw token chunks (256–1024 tokens each)
|
|
103
|
+
- Level 1+: Clustered summaries as token sequences
|
|
104
|
+
- GraphRAG-style community summaries
|
|
105
|
+
- Integer pointers for zoom navigation
|
|
106
|
+
|
|
107
|
+
### 5. Retrieval Layer
|
|
108
|
+
- Tokenize query → search indexes → traverse graph → collect token IDs
|
|
109
|
+
- Feed directly as `input_ids` to any LLM
|
|
110
|
+
- No detokenization until final generation
|
|
111
|
+
|
|
112
|
+
### 6. Semantic IDs
|
|
113
|
+
- Assign each chunk a short hierarchical SID token sequence
|
|
114
|
+
- Similar chunks share prefixes via MinHash-band residual buckets
|
|
115
|
+
- Resolve generated/predicted SID prefixes through a trie with prefix backoff
|
|
116
|
+
- Retrieval mode: `--method sid` or hybrid SID + BM25
|
|
117
|
+
|
|
118
|
+
### 7. SID Generator
|
|
119
|
+
- Predicts SID prefixes from query tokens without detokenizing
|
|
120
|
+
- Combines BM25 candidate chunks, MinHash similarity, and LSH neighbors
|
|
121
|
+
- Candidate chunks vote for hierarchical SID prefixes
|
|
122
|
+
- Returns generated SID predictions plus resolved chunk IDs
|
|
123
|
+
|
|
124
|
+
### 8. Learned SID Generator
|
|
125
|
+
- Trains a sparse token→SID associative model from stored chunks
|
|
126
|
+
- Uses beam search over valid SID prefixes
|
|
127
|
+
- No neural dependency yet; still token-native and deterministic
|
|
128
|
+
- CLI: `contextfit ingest ./docs --train-sid-generator`
|
|
129
|
+
|
|
130
|
+
## Getting Started
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
# Install ContextFit and its dependencies (editable mode; run from the repository root)
|
|
134
|
+
pip install -e .
|
|
135
|
+
|
|
136
|
+
# Ingest a knowledge base
|
|
137
|
+
contextfit ingest ./documents --tokenizer tiktoken
|
|
138
|
+
|
|
139
|
+
# Query
|
|
140
|
+
contextfit query "What is ContextFit?"
|
|
141
|
+
|
|
142
|
+
# Query through Semantic IDs
|
|
143
|
+
contextfit query "async retrieval" --method sid
|
|
144
|
+
|
|
145
|
+
# Agent-friendly machine-readable output
|
|
146
|
+
contextfit query "What is ContextFit?" --method hybrid --json
|
|
147
|
+
contextfit stats --json
|
|
148
|
+
|
|
149
|
+
# Run a deterministic sample benchmark
|
|
150
|
+
python examples/benchmark_sample_corpus.py --docs-per-topic 100 --json
|
|
151
|
+
|
|
152
|
+
# Run needle-in-a-haystack benchmark
|
|
153
|
+
python examples/benchmark_needle_haystack.py --needles 20 --distractors 200 --top-k 5 --json
|
|
154
|
+
|
|
155
|
+
# Ingest and train the learned SID generator
|
|
156
|
+
contextfit ingest ./documents --train-sid-generator
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
For installing on a MacBook/OpenClaw node, see [`docs/MACBOOK_CLI_DEPLOY.md`](docs/MACBOOK_CLI_DEPLOY.md).
|
|
160
|
+
|
|
161
|
+
For OpenClaw integration, including the `contextfit_search` tool and `contextfit` context engine plugin, see [`docs/OPENCLAW_INTEGRATION.md`](docs/OPENCLAW_INTEGRATION.md).
|
|
162
|
+
|
|
163
|
+
`--json` is intended for OpenClaw/agent use. Query JSON includes `input_ids`, retrieved chunk metadata, SID predictions, semantic IDs, and decoded previews.
|
|
164
|
+
|
|
165
|
+
## Current Storage Layout
|
|
166
|
+
|
|
167
|
+
```text
|
|
168
|
+
contextfit_kb/
|
|
169
|
+
chunks/
|
|
170
|
+
chunks.bin # zstd-compressed token-array records
|
|
171
|
+
index.json # chunk_id → byte offset/length
|
|
172
|
+
inverted/
|
|
173
|
+
meta.json # corpus/index metadata
|
|
174
|
+
postings.bin # compact binary token → roaring bitmap + positions pack
|
|
175
|
+
sid/
|
|
176
|
+
semantic_ids.json
|
|
177
|
+
learned_sid_generator.json
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
The inverted index now saves as a single binary postings pack by default. Legacy JSON-per-token indexes still load for compatibility.
|
|
181
|
+
|
|
182
|
+
## Project Status
|
|
183
|
+
|
|
184
|
+
🚧 **Early Development** — Architecture phase
|
|
185
|
+
|
|
186
|
+
## References
|
|
187
|
+
|
|
188
|
+
- TERAG: Token-Efficient Graph-Based RAG (reported to use only 3–11% of the output tokens of comparable graph-RAG methods)
|
|
189
|
+
- Semantic IDs / Generative Retrieval
|
|
190
|
+
- GraphRAG community detection
|
|
191
|
+
- Letta's token-space learning
|
|
192
|
+
|
|
193
|
+
## License
|
|
194
|
+
|
|
195
|
+
MIT
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# ContextFit
|
|
2
|
+
|
|
3
|
+
**A token-native knowledge base designed for LLM scale.**
|
|
4
|
+
|
|
5
|
+
ContextFit keeps everything—storage, indexing, search, relationships, traversal, and commonality detection—inside discrete token-ID space until the very last step, when you decode only the final retrieved token chunks for the LLM's output.
|
|
6
|
+
|
|
7
|
+
## Why Token-Native?
|
|
8
|
+
|
|
9
|
+
- **~2× smaller storage** than raw text (no repeated tokenization)
|
|
10
|
+
- **Blazing-fast integer-only operations** (no float embeddings)
|
|
11
|
+
- **Hierarchical "geo-map-style" traversal** for multi-hop reasoning
|
|
12
|
+
- **Neural-network-like chunk relationships** via token overlap graphs
|
|
13
|
+
- **Automatic commonality discovery** without vector spaces
|
|
14
|
+
- **Direct LLM injection** — feed `input_ids` directly, no conversion
|
|
15
|
+
|
|
16
|
+
## Architecture
|
|
17
|
+
|
|
18
|
+
```
|
|
19
|
+
┌─────────────────────────────────────────────────────────────────┐
|
|
20
|
+
│ ContextFit │
|
|
21
|
+
├─────────────────────────────────────────────────────────────────┤
|
|
22
|
+
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │
|
|
23
|
+
│ │ Storage │ │ Index │ │ Graph │ │
|
|
24
|
+
│ │ │ │ │ │ │ │
|
|
25
|
+
│ │ Token Arrays│ │ Inverted │ │ Chunk Relationships │ │
|
|
26
|
+
│ │ Chunk Store │ │ Suffix/FM │ │ Community Detection │ │
|
|
27
|
+
│ │ Compression │ │ BM25 Tokens │ │ Commonality Mining │ │
|
|
28
|
+
│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │
|
|
29
|
+
│ │
|
|
30
|
+
│ ┌─────────────────────────────┐ ┌─────────────────────────┐ │
|
|
31
|
+
│ │ Hierarchy │ │ Retrieval │ │
|
|
32
|
+
│ │ │ │ │ │
|
|
33
|
+
│ │ Level 0: Raw Chunks │ │ Query Tokenization │ │
|
|
34
|
+
│ │ Level 1+: Summary Clusters │ │ Graph Traversal │ │
|
|
35
|
+
│ │ Geo-Map Navigation │ │ Direct input_ids Output │ │
|
|
36
|
+
│ └─────────────────────────────┘ └─────────────────────────┘ │
|
|
37
|
+
│ │
|
|
38
|
+
│ ┌─────────────────────────────────────────────────────────────┐│
|
|
39
|
+
│ │ Semantic IDs (SIDs) ││
|
|
40
|
+
│ │ ││
|
|
41
|
+
│ │ Hierarchical token sequences → generative retrieval ││
|
|
42
|
+
│ │ Similar chunks share prefixes → trie-like navigation ││
|
|
43
|
+
│ └─────────────────────────────────────────────────────────────┘│
|
|
44
|
+
└─────────────────────────────────────────────────────────────────┘
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Core Components
|
|
48
|
+
|
|
49
|
+
### 1. Storage Layer
|
|
50
|
+
- Token arrays (uint16/uint32 IDs)
|
|
51
|
+
- Memory-mapped files for large corpora
|
|
52
|
+
- Delta encoding + Zstd compression
|
|
53
|
+
- Chunk metadata headers
|
|
54
|
+
|
|
55
|
+
### 2. Index Layer
|
|
56
|
+
- **Inverted Index**: tokenID → [(chunkID, positions)] using Roaring bitmaps
|
|
57
|
+
- **Suffix Array / FM-Index**: Instant exact n-gram search
|
|
58
|
+
- **BM25 on Tokens**: BM25 ranking (TF-IDF-style weighting with term-frequency saturation and length normalization) using token IDs as terms
|
|
59
|
+
- **Binary postings pack**: one compact `postings.bin` instead of JSON-per-token files
|
|
60
|
+
|
|
61
|
+
### 3. Graph Layer
|
|
62
|
+
- Nodes = chunks (or Semantic IDs)
|
|
63
|
+
- Edges = token n-gram overlap, Jaccard similarity, co-occurrence
|
|
64
|
+
- MinHash + LSH for fast similarity without floats
|
|
65
|
+
- Community detection for commonality discovery
|
|
66
|
+
|
|
67
|
+
### 4. Hierarchy Layer
|
|
68
|
+
- Level 0: Raw token chunks (256–1024 tokens each)
|
|
69
|
+
- Level 1+: Clustered summaries as token sequences
|
|
70
|
+
- GraphRAG-style community summaries
|
|
71
|
+
- Integer pointers for zoom navigation
|
|
72
|
+
|
|
73
|
+
### 5. Retrieval Layer
|
|
74
|
+
- Tokenize query → search indexes → traverse graph → collect token IDs
|
|
75
|
+
- Feed directly as `input_ids` to any LLM
|
|
76
|
+
- No detokenization until final generation
|
|
77
|
+
|
|
78
|
+
### 6. Semantic IDs
|
|
79
|
+
- Assign each chunk a short hierarchical SID token sequence
|
|
80
|
+
- Similar chunks share prefixes via MinHash-band residual buckets
|
|
81
|
+
- Resolve generated/predicted SID prefixes through a trie with prefix backoff
|
|
82
|
+
- Retrieval mode: `--method sid` or hybrid SID + BM25
|
|
83
|
+
|
|
84
|
+
### 7. SID Generator
|
|
85
|
+
- Predicts SID prefixes from query tokens without detokenizing
|
|
86
|
+
- Combines BM25 candidate chunks, MinHash similarity, and LSH neighbors
|
|
87
|
+
- Candidate chunks vote for hierarchical SID prefixes
|
|
88
|
+
- Returns generated SID predictions plus resolved chunk IDs
|
|
89
|
+
|
|
90
|
+
### 8. Learned SID Generator
|
|
91
|
+
- Trains a sparse token→SID associative model from stored chunks
|
|
92
|
+
- Uses beam search over valid SID prefixes
|
|
93
|
+
- No neural dependency yet; still token-native and deterministic
|
|
94
|
+
- CLI: `contextfit ingest ./docs --train-sid-generator`
|
|
95
|
+
|
|
96
|
+
## Getting Started
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
# Install ContextFit and its dependencies (editable mode; run from the repository root)
|
|
100
|
+
pip install -e .
|
|
101
|
+
|
|
102
|
+
# Ingest a knowledge base
|
|
103
|
+
contextfit ingest ./documents --tokenizer tiktoken
|
|
104
|
+
|
|
105
|
+
# Query
|
|
106
|
+
contextfit query "What is ContextFit?"
|
|
107
|
+
|
|
108
|
+
# Query through Semantic IDs
|
|
109
|
+
contextfit query "async retrieval" --method sid
|
|
110
|
+
|
|
111
|
+
# Agent-friendly machine-readable output
|
|
112
|
+
contextfit query "What is ContextFit?" --method hybrid --json
|
|
113
|
+
contextfit stats --json
|
|
114
|
+
|
|
115
|
+
# Run a deterministic sample benchmark
|
|
116
|
+
python examples/benchmark_sample_corpus.py --docs-per-topic 100 --json
|
|
117
|
+
|
|
118
|
+
# Run needle-in-a-haystack benchmark
|
|
119
|
+
python examples/benchmark_needle_haystack.py --needles 20 --distractors 200 --top-k 5 --json
|
|
120
|
+
|
|
121
|
+
# Ingest and train the learned SID generator
|
|
122
|
+
contextfit ingest ./documents --train-sid-generator
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
For installing on a MacBook/OpenClaw node, see [`docs/MACBOOK_CLI_DEPLOY.md`](docs/MACBOOK_CLI_DEPLOY.md).
|
|
126
|
+
|
|
127
|
+
For OpenClaw integration, including the `contextfit_search` tool and `contextfit` context engine plugin, see [`docs/OPENCLAW_INTEGRATION.md`](docs/OPENCLAW_INTEGRATION.md).
|
|
128
|
+
|
|
129
|
+
`--json` is intended for OpenClaw/agent use. Query JSON includes `input_ids`, retrieved chunk metadata, SID predictions, semantic IDs, and decoded previews.
|
|
130
|
+
|
|
131
|
+
## Current Storage Layout
|
|
132
|
+
|
|
133
|
+
```text
|
|
134
|
+
contextfit_kb/
|
|
135
|
+
chunks/
|
|
136
|
+
chunks.bin # zstd-compressed token-array records
|
|
137
|
+
index.json # chunk_id → byte offset/length
|
|
138
|
+
inverted/
|
|
139
|
+
meta.json # corpus/index metadata
|
|
140
|
+
postings.bin # compact binary token → roaring bitmap + positions pack
|
|
141
|
+
sid/
|
|
142
|
+
semantic_ids.json
|
|
143
|
+
learned_sid_generator.json
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
The inverted index now saves as a single binary postings pack by default. Legacy JSON-per-token indexes still load for compatibility.
|
|
147
|
+
|
|
148
|
+
## Project Status
|
|
149
|
+
|
|
150
|
+
🚧 **Early Development** — Architecture phase
|
|
151
|
+
|
|
152
|
+
## References
|
|
153
|
+
|
|
154
|
+
- TERAG: Token-Efficient Graph-Based RAG (reported to use only 3–11% of the output tokens of comparable graph-RAG methods)
|
|
155
|
+
- Semantic IDs / Generative Retrieval
|
|
156
|
+
- GraphRAG community detection
|
|
157
|
+
- Letta's token-space learning
|
|
158
|
+
|
|
159
|
+
## License
|
|
160
|
+
|
|
161
|
+
MIT
|