contextfit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. contextfit-0.1.0/.gitignore +62 -0
  2. contextfit-0.1.0/PKG-INFO +195 -0
  3. contextfit-0.1.0/README.md +161 -0
  4. contextfit-0.1.0/benchmarks/agent_memory_eval.py +820 -0
  5. contextfit-0.1.0/benchmarks/data/agent_memory_eval.json +4940 -0
  6. contextfit-0.1.0/benchmarks/data/agent_memory_eval_500.json +39424 -0
  7. contextfit-0.1.0/benchmarks/data/generated_cases.json +34486 -0
  8. contextfit-0.1.0/benchmarks/generate_eval_cases.py +314 -0
  9. contextfit-0.1.0/benchmarks/longmemeval_contextfit.py +690 -0
  10. contextfit-0.1.0/benchmarks/longmemeval_contextfit_report.md +685 -0
  11. contextfit-0.1.0/benchmarks/longmemeval_local_embed.py +259 -0
  12. contextfit-0.1.0/benchmarks/longmemeval_local_embed_fusion.json +12545 -0
  13. contextfit-0.1.0/benchmarks/longmemeval_pool_recency_full.json +13068 -0
  14. contextfit-0.1.0/benchmarks/longmemeval_structural_rerank.json +13068 -0
  15. contextfit-0.1.0/benchmarks/mem0_timing_sample.py +28 -0
  16. contextfit-0.1.0/benchmarks/vs_text_search.py +233 -0
  17. contextfit-0.1.0/benchmarks/vs_text_search_analysis.md +81 -0
  18. contextfit-0.1.0/benchmarks/vs_text_search_results.json +362 -0
  19. contextfit-0.1.0/build_env/.gitignore +2 -0
  20. contextfit-0.1.0/build_env/bin/Activate.ps1 +248 -0
  21. contextfit-0.1.0/build_env/bin/activate +76 -0
  22. contextfit-0.1.0/build_env/bin/activate.csh +27 -0
  23. contextfit-0.1.0/build_env/bin/activate.fish +69 -0
  24. contextfit-0.1.0/build_env/bin/docutils +6 -0
  25. contextfit-0.1.0/build_env/bin/keyring +6 -0
  26. contextfit-0.1.0/build_env/bin/markdown-it +6 -0
  27. contextfit-0.1.0/build_env/bin/normalizer +6 -0
  28. contextfit-0.1.0/build_env/bin/pip +6 -0
  29. contextfit-0.1.0/build_env/bin/pip3 +6 -0
  30. contextfit-0.1.0/build_env/bin/pip3.14 +6 -0
  31. contextfit-0.1.0/build_env/bin/pygmentize +6 -0
  32. contextfit-0.1.0/build_env/bin/pyproject-build +6 -0
  33. contextfit-0.1.0/build_env/bin/python +1 -0
  34. contextfit-0.1.0/build_env/bin/python3 +1 -0
  35. contextfit-0.1.0/build_env/bin/python3.14 +1 -0
  36. contextfit-0.1.0/build_env/bin/rst2html +6 -0
  37. contextfit-0.1.0/build_env/bin/rst2html4 +6 -0
  38. contextfit-0.1.0/build_env/bin/rst2html5 +6 -0
  39. contextfit-0.1.0/build_env/bin/rst2latex +6 -0
  40. contextfit-0.1.0/build_env/bin/rst2man +6 -0
  41. contextfit-0.1.0/build_env/bin/rst2odt +6 -0
  42. contextfit-0.1.0/build_env/bin/rst2pseudoxml +6 -0
  43. contextfit-0.1.0/build_env/bin/rst2s5 +6 -0
  44. contextfit-0.1.0/build_env/bin/rst2xetex +6 -0
  45. contextfit-0.1.0/build_env/bin/rst2xml +6 -0
  46. contextfit-0.1.0/build_env/bin/twine +6 -0
  47. contextfit-0.1.0/build_env/bin/𝜋thon +1 -0
  48. contextfit-0.1.0/build_env/pyvenv.cfg +5 -0
  49. contextfit-0.1.0/docs/ARCHITECTURE.md +588 -0
  50. contextfit-0.1.0/docs/MACBOOK_CLI_DEPLOY.md +133 -0
  51. contextfit-0.1.0/docs/OPENCLAW_INTEGRATION.md +98 -0
  52. contextfit-0.1.0/docs/PERFORMANCE_TESTING.md +89 -0
  53. contextfit-0.1.0/examples/benchmark_needle_haystack.py +298 -0
  54. contextfit-0.1.0/examples/benchmark_sample_corpus.py +146 -0
  55. contextfit-0.1.0/examples/quickstart.py +97 -0
  56. contextfit-0.1.0/integrations/openclaw/contextfit-search/index.js +247 -0
  57. contextfit-0.1.0/integrations/openclaw/contextfit-search/openclaw.plugin.json +33 -0
  58. contextfit-0.1.0/integrations/openclaw/contextfit-search/package.json +10 -0
  59. contextfit-0.1.0/pyproject.toml +68 -0
  60. contextfit-0.1.0/scripts/benchmark_semantic_expansion.py +171 -0
  61. contextfit-0.1.0/scripts/build_embedding_expander.py +85 -0
  62. contextfit-0.1.0/scripts/convert_sessions.py +156 -0
  63. contextfit-0.1.0/scripts/index_memory.py +200 -0
  64. contextfit-0.1.0/scripts/install_openclaw_plugin.sh +24 -0
  65. contextfit-0.1.0/scripts/start_server.sh +21 -0
  66. contextfit-0.1.0/scripts/start_server_spark.sh +18 -0
  67. contextfit-0.1.0/server.py +484 -0
  68. contextfit-0.1.0/src/contextfit/__init__.py +37 -0
  69. contextfit-0.1.0/src/contextfit/cli.py +833 -0
  70. contextfit-0.1.0/src/contextfit/core/__init__.py +6 -0
  71. contextfit-0.1.0/src/contextfit/core/chunk.py +231 -0
  72. contextfit-0.1.0/src/contextfit/core/expander.py +213 -0
  73. contextfit-0.1.0/src/contextfit/core/tokenizer.py +179 -0
  74. contextfit-0.1.0/src/contextfit/extractors/__init__.py +25 -0
  75. contextfit-0.1.0/src/contextfit/extractors/auto.py +53 -0
  76. contextfit-0.1.0/src/contextfit/extractors/base.py +58 -0
  77. contextfit-0.1.0/src/contextfit/extractors/document.py +72 -0
  78. contextfit-0.1.0/src/contextfit/extractors/email.py +78 -0
  79. contextfit-0.1.0/src/contextfit/extractors/tmd.py +171 -0
  80. contextfit-0.1.0/src/contextfit/graph/__init__.py +6 -0
  81. contextfit-0.1.0/src/contextfit/graph/community.py +289 -0
  82. contextfit-0.1.0/src/contextfit/graph/similarity.py +300 -0
  83. contextfit-0.1.0/src/contextfit/hierarchy/__init__.py +5 -0
  84. contextfit-0.1.0/src/contextfit/hierarchy/levels.py +296 -0
  85. contextfit-0.1.0/src/contextfit/index/__init__.py +6 -0
  86. contextfit-0.1.0/src/contextfit/index/bm25.py +181 -0
  87. contextfit-0.1.0/src/contextfit/index/inverted.py +830 -0
  88. contextfit-0.1.0/src/contextfit/index/semantic_expand.py +416 -0
  89. contextfit-0.1.0/src/contextfit/metadata/__init__.py +3 -0
  90. contextfit-0.1.0/src/contextfit/metadata/index.py +325 -0
  91. contextfit-0.1.0/src/contextfit/retrieval/__init__.py +41 -0
  92. contextfit-0.1.0/src/contextfit/retrieval/engine.py +1233 -0
  93. contextfit-0.1.0/src/contextfit/retrieval/memory_atoms.py +412 -0
  94. contextfit-0.1.0/src/contextfit/retrieval/query_router.py +274 -0
  95. contextfit-0.1.0/src/contextfit/retrieval/token_rerank.py +239 -0
  96. contextfit-0.1.0/src/contextfit/sid/__init__.py +14 -0
  97. contextfit-0.1.0/src/contextfit/sid/generator.py +196 -0
  98. contextfit-0.1.0/src/contextfit/sid/learned.py +382 -0
  99. contextfit-0.1.0/src/contextfit/sid/semantic.py +317 -0
  100. contextfit-0.1.0/tests/__init__.py +1 -0
  101. contextfit-0.1.0/tests/test_binary_storage.py +59 -0
  102. contextfit-0.1.0/tests/test_cli_json.py +78 -0
  103. contextfit-0.1.0/tests/test_core.py +127 -0
  104. contextfit-0.1.0/tests/test_engine_auto.py +155 -0
  105. contextfit-0.1.0/tests/test_exact_search.py +30 -0
  106. contextfit-0.1.0/tests/test_memory_atoms.py +76 -0
  107. contextfit-0.1.0/tests/test_needle_haystack_benchmark.py +39 -0
  108. contextfit-0.1.0/tests/test_query_router.py +139 -0
  109. contextfit-0.1.0/tests/test_sample_corpus_performance.py +173 -0
  110. contextfit-0.1.0/tests/test_semantic_ids.py +176 -0
  111. contextfit-0.1.0/website/index.html +1393 -0
  112. contextfit-0.1.0/whitepaper/token-native-agent-memory.md +606 -0
@@ -0,0 +1,62 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ .venv/
25
+ venv/
26
+ ENV/
27
+ env/
28
+
29
+ # IDE
30
+ .idea/
31
+ .vscode/
32
+ *.swp
33
+ *.swo
34
+ *~
35
+
36
+ # Testing
37
+ .pytest_cache/
38
+ .coverage
39
+ htmlcov/
40
+ .tox/
41
+ .nox/
42
+
43
+ # mypy
44
+ .mypy_cache/
45
+ .dmypy.json
46
+ dmypy.json
47
+
48
+ # Ruff
49
+ .ruff_cache/
50
+
51
+ # Knowledge bases (created by examples)
52
+ contextfit_kb/
53
+ *.bin
54
+ index.json
55
+
56
+ # OS
57
+ .DS_Store
58
+ Thumbs.db
59
+ benchmarks/data/longmemeval_*.json
60
+ benchmarks/cache/
61
+ benchmarks/longmemeval_contextfit_*.json
62
+ benchmarks/agent_memory_eval_*.json
@@ -0,0 +1,195 @@
1
+ Metadata-Version: 2.4
2
+ Name: contextfit
3
+ Version: 0.1.0
4
+ Summary: Token-native knowledge base for LLM scale
5
+ Project-URL: Homepage, https://github.com/ContextFit/cf
6
+ Project-URL: Documentation, https://github.com/ContextFit/cf#readme
7
+ Project-URL: Repository, https://github.com/ContextFit/cf
8
+ Author-email: Christophe Ponsart <cponsart@gmail.com>
9
+ License-Expression: MIT
10
+ Keywords: graph,knowledge-base,llm,rag,tokens
11
+ Classifier: Development Status :: 2 - Pre-Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Requires-Python: >=3.10
20
+ Requires-Dist: datasketch>=1.6.0
21
+ Requires-Dist: networkx>=3.0
22
+ Requires-Dist: numpy>=1.24.0
23
+ Requires-Dist: pyroaring>=0.4.0
24
+ Requires-Dist: tiktoken>=0.5.0
25
+ Requires-Dist: tqdm>=4.65.0
26
+ Requires-Dist: zstandard>=0.21.0
27
+ Provides-Extra: dev
28
+ Requires-Dist: mypy>=1.0.0; extra == 'dev'
29
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
30
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
31
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
32
+ Provides-Extra: rust
33
+ Description-Content-Type: text/markdown
34
+
35
+ # ContextFit
36
+
37
+ **A token-native knowledge base designed for LLM scale.**
38
+
39
+ ContextFit keeps everything—storage, indexing, search, relationships, traversal, and commonality detection—inside discrete token-ID space until the very last step, when you decode only the final retrieved token chunks for the LLM's output.
40
+
41
+ ## Why Token-Native?
42
+
43
+ - **~2× smaller storage** than raw text (no repeated tokenization)
44
+ - **Blazing-fast integer-only operations** (no float embeddings)
45
+ - **Hierarchical "geo-map-style" traversal** for multi-hop reasoning
46
+ - **Neural-network-like chunk relationships** via token overlap graphs
47
+ - **Automatic commonality discovery** without vector spaces
48
+ - **Direct LLM injection** — feed `input_ids` directly, no conversion
49
+
50
+ ## Architecture
51
+
52
+ ```
53
+ ┌─────────────────────────────────────────────────────────────────┐
54
+ │ ContextFit │
55
+ ├─────────────────────────────────────────────────────────────────┤
56
+ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │
57
+ │ │ Storage │ │ Index │ │ Graph │ │
58
+ │ │ │ │ │ │ │ │
59
+ │ │ Token Arrays│ │ Inverted │ │ Chunk Relationships │ │
60
+ │ │ Chunk Store │ │ Suffix/FM │ │ Community Detection │ │
61
+ │ │ Compression │ │ BM25 Tokens │ │ Commonality Mining │ │
62
+ │ └─────────────┘ └─────────────┘ └─────────────────────────┘ │
63
+ │ │
64
+ │ ┌─────────────────────────────┐ ┌─────────────────────────┐ │
65
+ │ │ Hierarchy │ │ Retrieval │ │
66
+ │ │ │ │ │ │
67
+ │ │ Level 0: Raw Chunks │ │ Query Tokenization │ │
68
+ │ │ Level 1+: Summary Clusters │ │ Graph Traversal │ │
69
+ │ │ Geo-Map Navigation │ │ Direct input_ids Output │ │
70
+ │ └─────────────────────────────┘ └─────────────────────────┘ │
71
+ │ │
72
+ │ ┌─────────────────────────────────────────────────────────────┐│
73
+ │ │ Semantic IDs (SIDs) ││
74
+ │ │ ││
75
+ │ │ Hierarchical token sequences → generative retrieval ││
76
+ │ │ Similar chunks share prefixes → trie-like navigation ││
77
+ │ └─────────────────────────────────────────────────────────────┘│
78
+ └─────────────────────────────────────────────────────────────────┘
79
+ ```
80
+
81
+ ## Core Components
82
+
83
+ ### 1. Storage Layer
84
+ - Token arrays (uint16/uint32 IDs)
85
+ - Memory-mapped files for large corpora
86
+ - Delta encoding + Zstd compression
87
+ - Chunk metadata headers
88
+
89
+ ### 2. Index Layer
90
+ - **Inverted Index**: tokenID → [(chunkID, positions)] using Roaring bitmaps
91
+ - **Suffix Array / FM-Index**: Instant exact n-gram search
92
+ - **BM25 on Tokens**: BM25 (TF-IDF-family) relevance scoring with token IDs as terms
93
+ - **Binary postings pack**: one compact `postings.bin` instead of JSON-per-token files
94
+
95
+ ### 3. Graph Layer
96
+ - Nodes = chunks (or Semantic IDs)
97
+ - Edges = token n-gram overlap, Jaccard similarity, co-occurrence
98
+ - MinHash + LSH for fast similarity without floats
99
+ - Community detection for commonality discovery
100
+
101
+ ### 4. Hierarchy Layer
102
+ - Level 0: Raw token chunks (256–1024 tokens each)
103
+ - Level 1+: Clustered summaries as token sequences
104
+ - GraphRAG-style community summaries
105
+ - Integer pointers for zoom navigation
106
+
107
+ ### 5. Retrieval Layer
108
+ - Tokenize query → search indexes → traverse graph → collect token IDs
109
+ - Feed directly as `input_ids` to any LLM
110
+ - No detokenization until final generation
111
+
112
+ ### 6. Semantic IDs
113
+ - Assign each chunk a short hierarchical SID token sequence
114
+ - Similar chunks share prefixes via MinHash-band residual buckets
115
+ - Resolve generated/predicted SID prefixes through a trie with prefix backoff
116
+ - Retrieval mode: `--method sid` or hybrid SID + BM25
117
+
118
+ ### 7. SID Generator
119
+ - Predicts SID prefixes from query tokens without detokenizing
120
+ - Combines BM25 candidate chunks, MinHash similarity, and LSH neighbors
121
+ - Candidate chunks vote for hierarchical SID prefixes
122
+ - Returns generated SID predictions plus resolved chunk IDs
123
+
124
+ ### 8. Learned SID Generator
125
+ - Trains a sparse token→SID associative model from stored chunks
126
+ - Uses beam search over valid SID prefixes
127
+ - No neural dependency yet; still token-native and deterministic
128
+ - CLI: `contextfit ingest ./docs --train-sid-generator`
129
+
130
+ ## Getting Started
131
+
132
+ ```bash
133
+ # Install ContextFit in editable mode (this also installs its dependencies)
134
+ pip install -e .
135
+
136
+ # Ingest a knowledge base
137
+ contextfit ingest ./documents --tokenizer tiktoken
138
+
139
+ # Query
140
+ contextfit query "What is ContextFit?"
141
+
142
+ # Query through Semantic IDs
143
+ contextfit query "async retrieval" --method sid
144
+
145
+ # Agent-friendly machine-readable output
146
+ contextfit query "What is ContextFit?" --method hybrid --json
147
+ contextfit stats --json
148
+
149
+ # Run a deterministic sample benchmark
150
+ python examples/benchmark_sample_corpus.py --docs-per-topic 100 --json
151
+
152
+ # Run needle-in-a-haystack benchmark
153
+ python examples/benchmark_needle_haystack.py --needles 20 --distractors 200 --top-k 5 --json
154
+
155
+ # Ingest and train the learned SID generator
156
+ contextfit ingest ./documents --train-sid-generator
157
+ ```
158
+
159
+ For installing on a MacBook/OpenClaw node, see [`docs/MACBOOK_CLI_DEPLOY.md`](docs/MACBOOK_CLI_DEPLOY.md).
160
+
161
+ For OpenClaw integration, including the `contextfit_search` tool and `contextfit` context engine plugin, see [`docs/OPENCLAW_INTEGRATION.md`](docs/OPENCLAW_INTEGRATION.md).
162
+
163
+ `--json` is intended for OpenClaw/agent use. Query JSON includes `input_ids`, retrieved chunk metadata, SID predictions, semantic IDs, and decoded previews.
164
+
165
+ ## Current Storage Layout
166
+
167
+ ```text
168
+ contextfit_kb/
169
+ chunks/
170
+ chunks.bin # zstd-compressed token-array records
171
+ index.json # chunk_id → byte offset/length
172
+ inverted/
173
+ meta.json # corpus/index metadata
174
+ postings.bin # compact binary token → roaring bitmap + positions pack
175
+ sid/
176
+ semantic_ids.json
177
+ learned_sid_generator.json
178
+ ```
179
+
180
+ The inverted index now saves as a single binary postings pack by default. Legacy JSON-per-token indexes still load for compatibility.
181
+
182
+ ## Project Status
183
+
184
+ 🚧 **Early Development** — Architecture phase
185
+
186
+ ## References
187
+
188
+ - TERAG: Token-Efficient GraphRAG (reported to use only 3–11% of the output tokens of comparable graph-based RAG methods)
189
+ - Semantic IDs / Generative Retrieval
190
+ - GraphRAG community detection
191
+ - Letta's token-space learning
192
+
193
+ ## License
194
+
195
+ MIT
@@ -0,0 +1,161 @@
1
+ # ContextFit
2
+
3
+ **A token-native knowledge base designed for LLM scale.**
4
+
5
+ ContextFit keeps everything—storage, indexing, search, relationships, traversal, and commonality detection—inside discrete token-ID space until the very last step, when you decode only the final retrieved token chunks for the LLM's output.
6
+
7
+ ## Why Token-Native?
8
+
9
+ - **~2× smaller storage** than raw text (no repeated tokenization)
10
+ - **Blazing-fast integer-only operations** (no float embeddings)
11
+ - **Hierarchical "geo-map-style" traversal** for multi-hop reasoning
12
+ - **Neural-network-like chunk relationships** via token overlap graphs
13
+ - **Automatic commonality discovery** without vector spaces
14
+ - **Direct LLM injection** — feed `input_ids` directly, no conversion
15
+
16
+ ## Architecture
17
+
18
+ ```
19
+ ┌─────────────────────────────────────────────────────────────────┐
20
+ │ ContextFit │
21
+ ├─────────────────────────────────────────────────────────────────┤
22
+ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │
23
+ │ │ Storage │ │ Index │ │ Graph │ │
24
+ │ │ │ │ │ │ │ │
25
+ │ │ Token Arrays│ │ Inverted │ │ Chunk Relationships │ │
26
+ │ │ Chunk Store │ │ Suffix/FM │ │ Community Detection │ │
27
+ │ │ Compression │ │ BM25 Tokens │ │ Commonality Mining │ │
28
+ │ └─────────────┘ └─────────────┘ └─────────────────────────┘ │
29
+ │ │
30
+ │ ┌─────────────────────────────┐ ┌─────────────────────────┐ │
31
+ │ │ Hierarchy │ │ Retrieval │ │
32
+ │ │ │ │ │ │
33
+ │ │ Level 0: Raw Chunks │ │ Query Tokenization │ │
34
+ │ │ Level 1+: Summary Clusters │ │ Graph Traversal │ │
35
+ │ │ Geo-Map Navigation │ │ Direct input_ids Output │ │
36
+ │ └─────────────────────────────┘ └─────────────────────────┘ │
37
+ │ │
38
+ │ ┌─────────────────────────────────────────────────────────────┐│
39
+ │ │ Semantic IDs (SIDs) ││
40
+ │ │ ││
41
+ │ │ Hierarchical token sequences → generative retrieval ││
42
+ │ │ Similar chunks share prefixes → trie-like navigation ││
43
+ │ └─────────────────────────────────────────────────────────────┘│
44
+ └─────────────────────────────────────────────────────────────────┘
45
+ ```
46
+
47
+ ## Core Components
48
+
49
+ ### 1. Storage Layer
50
+ - Token arrays (uint16/uint32 IDs)
51
+ - Memory-mapped files for large corpora
52
+ - Delta encoding + Zstd compression
53
+ - Chunk metadata headers
54
+
55
+ ### 2. Index Layer
56
+ - **Inverted Index**: tokenID → [(chunkID, positions)] using Roaring bitmaps
57
+ - **Suffix Array / FM-Index**: Instant exact n-gram search
58
+ - **BM25 on Tokens**: BM25 (TF-IDF-family) relevance scoring with token IDs as terms
59
+ - **Binary postings pack**: one compact `postings.bin` instead of JSON-per-token files
60
+
61
+ ### 3. Graph Layer
62
+ - Nodes = chunks (or Semantic IDs)
63
+ - Edges = token n-gram overlap, Jaccard similarity, co-occurrence
64
+ - MinHash + LSH for fast similarity without floats
65
+ - Community detection for commonality discovery
66
+
67
+ ### 4. Hierarchy Layer
68
+ - Level 0: Raw token chunks (256–1024 tokens each)
69
+ - Level 1+: Clustered summaries as token sequences
70
+ - GraphRAG-style community summaries
71
+ - Integer pointers for zoom navigation
72
+
73
+ ### 5. Retrieval Layer
74
+ - Tokenize query → search indexes → traverse graph → collect token IDs
75
+ - Feed directly as `input_ids` to any LLM
76
+ - No detokenization until final generation
77
+
78
+ ### 6. Semantic IDs
79
+ - Assign each chunk a short hierarchical SID token sequence
80
+ - Similar chunks share prefixes via MinHash-band residual buckets
81
+ - Resolve generated/predicted SID prefixes through a trie with prefix backoff
82
+ - Retrieval mode: `--method sid` or hybrid SID + BM25
83
+
84
+ ### 7. SID Generator
85
+ - Predicts SID prefixes from query tokens without detokenizing
86
+ - Combines BM25 candidate chunks, MinHash similarity, and LSH neighbors
87
+ - Candidate chunks vote for hierarchical SID prefixes
88
+ - Returns generated SID predictions plus resolved chunk IDs
89
+
90
+ ### 8. Learned SID Generator
91
+ - Trains a sparse token→SID associative model from stored chunks
92
+ - Uses beam search over valid SID prefixes
93
+ - No neural dependency yet; still token-native and deterministic
94
+ - CLI: `contextfit ingest ./docs --train-sid-generator`
95
+
96
+ ## Getting Started
97
+
98
+ ```bash
99
+ # Install ContextFit in editable mode (this also installs its dependencies)
100
+ pip install -e .
101
+
102
+ # Ingest a knowledge base
103
+ contextfit ingest ./documents --tokenizer tiktoken
104
+
105
+ # Query
106
+ contextfit query "What is ContextFit?"
107
+
108
+ # Query through Semantic IDs
109
+ contextfit query "async retrieval" --method sid
110
+
111
+ # Agent-friendly machine-readable output
112
+ contextfit query "What is ContextFit?" --method hybrid --json
113
+ contextfit stats --json
114
+
115
+ # Run a deterministic sample benchmark
116
+ python examples/benchmark_sample_corpus.py --docs-per-topic 100 --json
117
+
118
+ # Run needle-in-a-haystack benchmark
119
+ python examples/benchmark_needle_haystack.py --needles 20 --distractors 200 --top-k 5 --json
120
+
121
+ # Ingest and train the learned SID generator
122
+ contextfit ingest ./documents --train-sid-generator
123
+ ```
124
+
125
+ For installing on a MacBook/OpenClaw node, see [`docs/MACBOOK_CLI_DEPLOY.md`](docs/MACBOOK_CLI_DEPLOY.md).
126
+
127
+ For OpenClaw integration, including the `contextfit_search` tool and `contextfit` context engine plugin, see [`docs/OPENCLAW_INTEGRATION.md`](docs/OPENCLAW_INTEGRATION.md).
128
+
129
+ `--json` is intended for OpenClaw/agent use. Query JSON includes `input_ids`, retrieved chunk metadata, SID predictions, semantic IDs, and decoded previews.
130
+
131
+ ## Current Storage Layout
132
+
133
+ ```text
134
+ contextfit_kb/
135
+ chunks/
136
+ chunks.bin # zstd-compressed token-array records
137
+ index.json # chunk_id → byte offset/length
138
+ inverted/
139
+ meta.json # corpus/index metadata
140
+ postings.bin # compact binary token → roaring bitmap + positions pack
141
+ sid/
142
+ semantic_ids.json
143
+ learned_sid_generator.json
144
+ ```
145
+
146
+ The inverted index now saves as a single binary postings pack by default. Legacy JSON-per-token indexes still load for compatibility.
147
+
148
+ ## Project Status
149
+
150
+ 🚧 **Early Development** — Architecture phase
151
+
152
+ ## References
153
+
154
+ - TERAG: Token-Efficient GraphRAG (reported to use only 3–11% of the output tokens of comparable graph-based RAG methods)
155
+ - Semantic IDs / Generative Retrieval
156
+ - GraphRAG community detection
157
+ - Letta's token-space learning
158
+
159
+ ## License
160
+
161
+ MIT