amalfa 0.0.0-reserved → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. package/.biomeignore +19 -0
  2. package/:memory: +0 -0
  3. package/:memory:-shm +0 -0
  4. package/:memory:-wal +0 -0
  5. package/CHANGELOG.md.old +43 -0
  6. package/LICENSE +21 -0
  7. package/README.md +359 -13
  8. package/README.old.md +112 -0
  9. package/ROADMAP.md +316 -0
  10. package/TEST_PLAN.md +561 -0
  11. package/agents.config.json +11 -0
  12. package/amalfa.config.example.ts +102 -0
  13. package/biome.json +49 -0
  14. package/bun.lock +371 -0
  15. package/docs/AGENT_PROTOCOLS.md +28 -0
  16. package/docs/ARCHITECTURAL_OVERVIEW.md +123 -0
  17. package/docs/BENTO_BOXING_DEPRECATION.md +281 -0
  18. package/docs/Bun-SQLite.html +464 -0
  19. package/docs/COMMIT_GUIDELINES.md +367 -0
  20. package/docs/DEVELOPER_ONBOARDING.md +36 -0
  21. package/docs/Graph and Vector Database Best Practices.md +214 -0
  22. package/docs/PERFORMANCE_BASELINES.md +88 -0
  23. package/docs/REPOSITORY_CLEANUP_SUMMARY.md +261 -0
  24. package/docs/edge-generation-methods.md +57 -0
  25. package/docs/elevator-pitch.md +118 -0
  26. package/docs/graph-and-vector-database-playbook.html +480 -0
  27. package/docs/hardened-sqlite.md +85 -0
  28. package/docs/headless-knowledge-management.md +79 -0
  29. package/docs/john-kaye-flux-prompt.md +46 -0
  30. package/docs/keyboard-shortcuts.md +80 -0
  31. package/docs/opinion-proceed-pattern.md +29 -0
  32. package/docs/polyvis-nodes-edges-schema.md +77 -0
  33. package/docs/protocols/lab-protocol.md +30 -0
  34. package/docs/reaction-iquest-loop-coder.md +46 -0
  35. package/docs/services.md +60 -0
  36. package/docs/sqlite-wal-readonly-trap.md +228 -0
  37. package/docs/strategy/css-architecture.md +40 -0
  38. package/docs/test-document-cycle.md +83 -0
  39. package/docs/test_lifecycle_E2E.md +4 -0
  40. package/docs/the-bicameral-graph.md +83 -0
  41. package/docs/user-guide.md +70 -0
  42. package/docs/vision-helper.md +53 -0
  43. package/drizzle/0000_minor_iron_fist.sql +19 -0
  44. package/drizzle/meta/0000_snapshot.json +139 -0
  45. package/drizzle/meta/_journal.json +13 -0
  46. package/example_usage.ts +39 -0
  47. package/experiment.sh +35 -0
  48. package/hello +2 -0
  49. package/index.html +52 -0
  50. package/knowledge/excalibur.md +12 -0
  51. package/package.json +60 -15
  52. package/plans/experience-graph-integration.md +60 -0
  53. package/prompts/gemini-king-mode-prompt.md +46 -0
  54. package/public/docs/MCP_TOOLS.md +372 -0
  55. package/schemas/README.md +20 -0
  56. package/schemas/cda.schema.json +84 -0
  57. package/schemas/conceptual-lexicon.schema.json +75 -0
  58. package/scratchpads/dummy-debrief-boxed.md +39 -0
  59. package/scratchpads/dummy-debrief.md +27 -0
  60. package/scratchpads/scratchpad-design.md +50 -0
  61. package/scratchpads/scratchpad-scrolling.md +20 -0
  62. package/scratchpads/scratchpad-toc-disappearance.md +23 -0
  63. package/scratchpads/scratchpad-toc.md +28 -0
  64. package/scratchpads/test_gardener.md +7 -0
  65. package/src/EnlightenedTriad.ts +146 -0
  66. package/src/JIT_Triad.ts +137 -0
  67. package/src/cli.ts +364 -0
  68. package/src/config/constants.ts +7 -0
  69. package/src/config/defaults.ts +99 -0
  70. package/src/core/BentoNormalizer.ts +113 -0
  71. package/src/core/EdgeWeaver.ts +145 -0
  72. package/src/core/FractureLogic.ts +22 -0
  73. package/src/core/Harvester.ts +73 -0
  74. package/src/core/LLMClient.ts +93 -0
  75. package/src/core/LouvainGate.ts +67 -0
  76. package/src/core/MarkdownMasker.ts +49 -0
  77. package/src/core/README.md +11 -0
  78. package/src/core/SemanticMatcher.ts +89 -0
  79. package/src/core/SemanticWeaver.ts +96 -0
  80. package/src/core/TagEngine.ts +56 -0
  81. package/src/core/TimelineWeaver.ts +61 -0
  82. package/src/core/VectorEngine.ts +232 -0
  83. package/src/daemon/index.ts +225 -0
  84. package/src/data/experience/test_doc_1.md +2 -0
  85. package/src/data/experience/test_doc_2.md +2 -0
  86. package/src/db/schema.ts +46 -0
  87. package/src/demo-triad.ts +45 -0
  88. package/src/gardeners/AutoTagger.ts +116 -0
  89. package/src/gardeners/BaseGardener.ts +55 -0
  90. package/src/llm/EnlightenedProvider.ts +95 -0
  91. package/src/mcp/README.md +6 -0
  92. package/src/mcp/index.ts +341 -0
  93. package/src/pipeline/AmalfaIngestor.ts +272 -0
  94. package/src/pipeline/HarvesterPipeline.ts +101 -0
  95. package/src/pipeline/Ingestor.ts +555 -0
  96. package/src/pipeline/PreFlightAnalyzer.ts +434 -0
  97. package/src/pipeline/README.md +7 -0
  98. package/src/pipeline/SemanticHarvester.ts +222 -0
  99. package/src/resonance/DatabaseFactory.ts +100 -0
  100. package/src/resonance/README.md +148 -0
  101. package/src/resonance/cli/README.md +7 -0
  102. package/src/resonance/cli/ingest.ts +41 -0
  103. package/src/resonance/cli/migrate.ts +54 -0
  104. package/src/resonance/config.ts +40 -0
  105. package/src/resonance/daemon.ts +236 -0
  106. package/src/resonance/db.ts +424 -0
  107. package/src/resonance/pipeline/README.md +7 -0
  108. package/src/resonance/pipeline/extract.ts +89 -0
  109. package/src/resonance/pipeline/transform_docs.ts +60 -0
  110. package/src/resonance/schema.ts +156 -0
  111. package/src/resonance/services/embedder.ts +131 -0
  112. package/src/resonance/services/simpleTokenizer.ts +119 -0
  113. package/src/resonance/services/stats.ts +327 -0
  114. package/src/resonance/services/tokenizer.ts +159 -0
  115. package/src/resonance/transform/cda.ts +393 -0
  116. package/src/resonance/types/enriched-cda.ts +112 -0
  117. package/src/services/README.md +56 -0
  118. package/src/services/llama.ts +59 -0
  119. package/src/services/llamauv.ts +56 -0
  120. package/src/services/olmo3.ts +58 -0
  121. package/src/services/phi.ts +52 -0
  122. package/src/types/artifact.ts +12 -0
  123. package/src/utils/EnvironmentVerifier.ts +67 -0
  124. package/src/utils/Logger.ts +21 -0
  125. package/src/utils/ServiceLifecycle.ts +207 -0
  126. package/src/utils/ZombieDefense.ts +244 -0
  127. package/src/utils/validator.ts +264 -0
  128. package/substack/substack-playbook-1.md +95 -0
  129. package/substack/substack-playbook-2.md +78 -0
  130. package/tasks/ui-investigation.md +26 -0
  131. package/test-db +0 -0
  132. package/test-db-shm +0 -0
  133. package/test-db-wal +0 -0
  134. package/tests/canary/verify_pinch_check.ts +44 -0
  135. package/tests/fixtures/ingest_test.md +12 -0
  136. package/tests/fixtures/ingest_test_boxed.md +13 -0
  137. package/tests/fixtures/safety_test.md +45 -0
  138. package/tests/fixtures/safety_test_boxed.md +49 -0
  139. package/tests/fixtures/tagged_output.md +49 -0
  140. package/tests/fixtures/tagged_test.md +49 -0
  141. package/tests/mcp-server-settings.json +8 -0
  142. package/tsconfig.json +46 -0
  143. package/verify-embedder.ts +54 -0
@@ -0,0 +1,281 @@
1
+ # Bento Boxing Deprecation
2
+
3
+ **Date:** January 5, 2026
4
+ **Status:** ❌ Deprecated and Removed
5
+ **Replaced By:** Whole-document vector embeddings
6
+
7
+ ---
8
+
9
+ ## What Was Bento Boxing?
10
+
11
+ **Bento Boxing** was a markdown chunking system designed to fragment large documents into smaller, semantically meaningful pieces ("bentos") for better vector search precision.
12
+
13
+ ### Components
14
+
15
+ **Code (Removed):**
16
+ - `src/core/BentoBoxer.ts` - Chunking logic (split by H1-H4 headings)
17
+ - `src/data/LocusLedger.ts` - Content deduplication (hash → UUID mapping)
18
+ - `src/index.ts` - CLI tool for processing markdown files
19
+ - `tests/bento_ast.test.ts` - Unit tests
20
+
21
+ **Database (Removed):**
22
+ - `bento_ledger.sqlite` - Deduplication ledger (343 entries)
23
+
24
+ **Playbooks/Briefs (Removed):**
25
+ - `briefs/archive/1-brief-polyvis-bento-implementation.md`
26
+ - `briefs/archive/2-bento-box-core-logic.md`
27
+ - `playbooks/bento-box-playbook-2.md`
28
+
29
+ ---
30
+
31
+ ## Why It Was Deprecated
32
+
33
+ ### 1. Never Integrated with Vector Search
34
+
35
+ **Critical issue:** Bento Boxing was an orphaned CLI tool, not integrated into the main ingestion pipeline.
36
+
37
+ - ✅ Code existed and worked
38
+ - ❌ Never used by `src/pipeline/Ingestor.ts`
39
+ - ❌ Never used by `src/resonance/db.ts`
40
+ - ❌ Not connected to `public/resonance.db`
41
+
42
+ **Result:** Documents were ingested whole, not chunked. Vector search operated on complete documents.
43
+
44
+ ---
45
+
46
+ ### 2. Whole-Document Embeddings Work Excellently
47
+
48
+ **Testing revealed chunking was unnecessary:**
49
+
50
+ | Metric | Value | Assessment |
51
+ |--------|-------|------------|
52
+ | Average best match | 85.2% | Excellent |
53
+ | Average spread | 21.1% | Good differentiation |
54
+ | Corpus size | 489 docs | Manageable |
55
+ | Average doc size | 2.7 KB (~550 words) | Already chunk-sized |
56
+
57
+ **Key insight:** Roughly 80% of documents are under 5KB, so they are already "chunk-sized" for embedding models.
58
+
59
+ ---
60
+
61
+ ### 3. Document Size Distribution
62
+
63
+ **Analysis of 489 documents:**
64
+
65
+ ```
66
+ Size Range | Percentage | Chunking Benefit
67
+ ---------------|------------|------------------
68
+ < 5KB | ~80% | None (already small)
69
+ 5-20KB | ~15% | Minimal
70
+ > 20KB | ~5% | Potential (but not critical)
71
+ ```
72
+
73
+ **Largest document:** 47KB (~9,500 words)
74
+ - Still within LLM context windows (100K+ tokens)
75
+ - Embedding captures main themes well
76
+ - Can use grep for exact phrase search
77
+
78
+ ---
79
+
80
+ ### 4. Complexity vs Benefit
81
+
82
+ **Costs of chunking:**
83
+ - ❌ Chunk logic (where to split?)
84
+ - ❌ Chunk→document mapping
85
+ - ❌ Context loss (chunks lose surrounding context)
86
+ - ❌ Storage overhead (10x nodes for chunked docs)
87
+ - ❌ Search complexity (multiple chunks from same doc in results)
88
+ - ❌ UI complexity (show chunk vs full doc?)
89
+
90
+ **Benefits in this corpus:**
91
+ - ⚠️ Slightly better precision for the ~5% of docs that are large (>20KB)
92
+ - ⚠️ Granular retrieval (already achievable with grep)
93
+
94
+ **Verdict:** Costs > Benefits
95
+
96
+ ---
97
+
98
+ ## Search Architecture (Post-Deprecation)
99
+
100
+ ### Two-Tier Search System
101
+
102
+ **1. Vector Search (Primary)**
103
+ - Purpose: Semantic similarity, concept discovery
104
+ - Accuracy: 85.2% average best match
105
+ - Speed: <10ms per query
106
+ - Handles: "Find documents about CSS patterns"
107
+
108
+ **2. Grep/Ripgrep (Secondary)**
109
+ - Purpose: Exact phrase matches
110
+ - Accuracy: 100% (literal text)
111
+ - Speed: <1ms
112
+ - Handles: "Find exact phrase 'function fooBar'"
113
+
114
+ **No chunking needed:** This two-tier approach covers all search use cases.
115
+
116
+ ---
117
+
118
+ ## Decision Criteria
119
+
120
+ ### When Chunking IS NOT Needed
121
+
122
+ ✅ **Keep whole-document embeddings if:**
123
+ - Average doc size <5KB (most docs already chunk-sized)
124
+ - Vector search accuracy >70% (this corpus measures 85.2%)
125
+ - Documents are well-structured (markdown with headers)
126
+ - Search is semantic (not keyword BM25)
127
+ - Source files are easily searchable with grep
128
+
129
+ **Polyvis meets ALL these criteria.**
130
+
131
+ ---
132
+
133
+ ### When Chunking WOULD Be Needed
134
+
135
+ Consider adding chunking if/when:
136
+
137
+ **1. External large documents**
138
+ - Research papers (30-50 pages)
139
+ - Books, manuals (100+ pages)
140
+ - API documentation (needs endpoint-level chunks)
141
+
142
+ **2. Accuracy degradation**
143
+ - Vector search drops below 70%
144
+ - Users report irrelevant results
145
+ - Long documents dominate search results
146
+
147
+ **3. Specific requirements**
148
+ - RAG system needs paragraph-level context
149
+ - Need to cite specific sections, not whole docs
150
+ - Document structure doesn't match search granularity
151
+
152
+ ---
153
+
154
+ ## Migration Notes
155
+
156
+ ### What Changed
157
+
158
+ **Removed:**
159
+ - All Bento Boxing source code
160
+ - bento_ledger.sqlite database
161
+ - CLI tool (`bun run src/index.ts box`)
162
+ - Related briefs and playbooks
163
+
164
+ **Unchanged:**
165
+ - Vector search pipeline (always used whole docs)
166
+ - Ingestion pipeline
167
+ - Database schema
168
+ - Search accuracy (still 85%)
169
+
170
+ **No migration required:** Bento Boxing was never in production.
171
+
172
+ ---
173
+
174
+ ## Historical Context
175
+
176
+ ### Development Timeline
177
+
178
+ **December 2025:**
179
+ - Bento Boxing designed and implemented
180
+ - CLI tool created for markdown chunking
181
+ - Deduplication ledger built (343 entries)
182
+ - Playbooks and briefs written
183
+
184
+ **January 2026:**
185
+ - Vector search testing revealed 85% accuracy without chunking
186
+ - Discovered Bento Boxing never integrated with main pipeline
187
+ - Analysis showed 80% of docs are already chunk-sized
188
+ - Decision: Deprecate and remove
189
+
190
+ **Lesson:** Test effectiveness before building infrastructure.
191
+
192
+ ---
193
+
194
+ ## Future Considerations
195
+
196
+ ### Recommended Approach: File Splitting (Not Runtime Chunking)
197
+
198
+ **If large documents (>15-20KB) become problematic, use simple file splitting:**
199
+
200
+ **Strategy:**
201
+ 1. Parse document structure with `ast-grep` or `marked`
202
+ 2. Split at natural boundaries (H1/H2 headers)
203
+ 3. Create multiple markdown files (e.g., `agents-part-1.md`, `agents-part-2.md`)
204
+ 4. Add metadata: `<!-- Part 1 of 3 -->`
205
+ 5. Optional: Keep parent file as TOC with links to parts
206
+ 6. Commit split files to version control
207
+
208
+ **Advantages:**
209
+ - ✅ **Simple:** No infrastructure, just split files once
210
+ - ✅ **Git-native:** Diffs are meaningful, history is granular
211
+ - ✅ **Transparent:** Files are the chunks (source of truth)
212
+ - ✅ **Reversible:** Reconstruct with `cat agents-part-*.md > agents.md` (glob order matches part order for up to 9 parts)
213
+ - ✅ **Lazy:** Only split the 5% of docs that need it
214
+
215
+ **When to Split:**
216
+
217
+ | Document Size | Action |
218
+ |---------------|--------|
219
+ | <10KB | Leave as-is |
220
+ | 10-20KB | Consider if natural boundaries exist |
221
+ | >20KB | Strong candidate for splitting |
222
+
223
+ **Example: Splitting AGENTS.md (47KB)**
224
+ ```text
225
+ # Parse and split at H1 boundaries
226
+ AGENTS.md → agents-tier1.md (protocols 1-6)
227
+ → agents-tier2.md (protocols 7-18)
228
+ → agents-tier3.md (playbooks index)
229
+
230
+ # Keep parent as TOC
231
+ AGENTS.md → "# Agent Protocols\n\nSee:\n- [Tier 1](agents-tier1.md)..."
232
+ ```
233
+
234
+ **Anti-Patterns to Avoid:**
235
+ - ❌ Premature splitting ("what if it grows?")
236
+ - ❌ Runtime chunking infrastructure
237
+ - ❌ Artificial boundaries (mid-paragraph splits)
238
+ - ❌ Complex deduplication/mapping systems
239
+
240
+ **Why This Works:**
241
+ - Documents remain markdown files in git
242
+ - Vector search ingests each part as separate node
243
+ - Search results link to specific part files
244
+ - Humans edit parts independently
245
+ - Reconstruction is trivial when needed
246
+
247
+ ---
248
+
249
+ ## References
250
+
251
+ ### Effectiveness Testing
252
+
253
+ See `scripts/test-embeddings.ts` for validation:
254
+ - 85.2% average best match
255
+ - 21.1% spread
256
+ - Tested across 5 query types (CSS, database, graph, debugging, tooling)
257
+
258
+ ### Related Documentation
259
+
260
+ - `src/resonance/README.md` - Search Architecture section
261
+ - `.legacy-databases-README.md` - bento_ledger.sqlite removal
262
+ - `playbooks/README.md` - Updated to remove Bento Boxing references
263
+
264
+ ---
265
+
266
+ ## Summary
267
+
268
+ **Bento Boxing was well-designed but unnecessary:**
269
+ - Never integrated into production pipeline
270
+ - Whole-document embeddings achieve excellent results (85%)
271
+ - Most documents are already chunk-sized (<5KB)
272
+ - Two-tier search (vector + grep) covers all use cases
273
+
274
+ **Decision:** Remove to simplify codebase. Revisit chunking only if:
275
+ - Adding large external documents (books, long PDFs)
276
+ - Vector search accuracy drops significantly
277
+ - Specific use case emerges that requires granular retrieval
278
+
279
+ ---
280
+
281
+ **Last updated:** 2026-01-05