@tekmidian/pai 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/ARCHITECTURE.md +148 -6
  2. package/FEATURE.md +11 -0
  3. package/README.md +79 -0
  4. package/dist/{auto-route-D7W6RE06.mjs → auto-route-JjW3f7pV.mjs} +4 -4
  5. package/dist/{auto-route-D7W6RE06.mjs.map → auto-route-JjW3f7pV.mjs.map} +1 -1
  6. package/dist/chunker-CbnBe0s0.mjs +191 -0
  7. package/dist/chunker-CbnBe0s0.mjs.map +1 -0
  8. package/dist/cli/index.mjs +835 -40
  9. package/dist/cli/index.mjs.map +1 -1
  10. package/dist/{config-DBh1bYM2.mjs → config-DELNqq3Z.mjs} +4 -2
  11. package/dist/{config-DBh1bYM2.mjs.map → config-DELNqq3Z.mjs.map} +1 -1
  12. package/dist/daemon/index.mjs +9 -9
  13. package/dist/{daemon-v5O897D4.mjs → daemon-CeTX4NpF.mjs} +94 -13
  14. package/dist/daemon-CeTX4NpF.mjs.map +1 -0
  15. package/dist/daemon-mcp/index.mjs +3 -3
  16. package/dist/db-Dp8VXIMR.mjs +212 -0
  17. package/dist/db-Dp8VXIMR.mjs.map +1 -0
  18. package/dist/{detect-BHqYcjJ1.mjs → detect-D7gPV3fQ.mjs} +1 -1
  19. package/dist/{detect-BHqYcjJ1.mjs.map → detect-D7gPV3fQ.mjs.map} +1 -1
  20. package/dist/{detector-DKA83aTZ.mjs → detector-cYYhK2Mi.mjs} +2 -2
  21. package/dist/{detector-DKA83aTZ.mjs.map → detector-cYYhK2Mi.mjs.map} +1 -1
  22. package/dist/{embeddings-mfqv-jFu.mjs → embeddings-DGRAPAYb.mjs} +2 -2
  23. package/dist/{embeddings-mfqv-jFu.mjs.map → embeddings-DGRAPAYb.mjs.map} +1 -1
  24. package/dist/{factory-BDAiKtYR.mjs → factory-DZLvRf4m.mjs} +4 -4
  25. package/dist/{factory-BDAiKtYR.mjs.map → factory-DZLvRf4m.mjs.map} +1 -1
  26. package/dist/index.d.mts +1 -1
  27. package/dist/index.d.mts.map +1 -1
  28. package/dist/index.mjs +9 -7
  29. package/dist/{indexer-B20bPHL-.mjs → indexer-CKQcgKsz.mjs} +4 -190
  30. package/dist/indexer-CKQcgKsz.mjs.map +1 -0
  31. package/dist/{indexer-backend-BXaocO5r.mjs → indexer-backend-BHztlJJg.mjs} +4 -3
  32. package/dist/{indexer-backend-BXaocO5r.mjs.map → indexer-backend-BHztlJJg.mjs.map} +1 -1
  33. package/dist/{ipc-client-DPy7s3iu.mjs → ipc-client-CLt2fNlC.mjs} +1 -1
  34. package/dist/ipc-client-CLt2fNlC.mjs.map +1 -0
  35. package/dist/mcp/index.mjs +118 -5
  36. package/dist/mcp/index.mjs.map +1 -1
  37. package/dist/{migrate-Bwj7qPaE.mjs → migrate-jokLenje.mjs} +8 -1
  38. package/dist/migrate-jokLenje.mjs.map +1 -0
  39. package/dist/{pai-marker-DX_mFLum.mjs → pai-marker-CXQPX2P6.mjs} +1 -1
  40. package/dist/{pai-marker-DX_mFLum.mjs.map → pai-marker-CXQPX2P6.mjs.map} +1 -1
  41. package/dist/{postgres-Ccvpc6fC.mjs → postgres-CRBe30Ag.mjs} +1 -1
  42. package/dist/{postgres-Ccvpc6fC.mjs.map → postgres-CRBe30Ag.mjs.map} +1 -1
  43. package/dist/{schemas-DjdwzIQ8.mjs → schemas-BY3Pjvje.mjs} +1 -1
  44. package/dist/{schemas-DjdwzIQ8.mjs.map → schemas-BY3Pjvje.mjs.map} +1 -1
  45. package/dist/{search-PjftDxxs.mjs → search-GK0ibTJy.mjs} +2 -2
  46. package/dist/{search-PjftDxxs.mjs.map → search-GK0ibTJy.mjs.map} +1 -1
  47. package/dist/{sqlite-CHUrNtbI.mjs → sqlite-RyR8Up1v.mjs} +3 -3
  48. package/dist/{sqlite-CHUrNtbI.mjs.map → sqlite-RyR8Up1v.mjs.map} +1 -1
  49. package/dist/{tools-CLK4080-.mjs → tools-CUg0Lyg-.mjs} +175 -11
  50. package/dist/{tools-CLK4080-.mjs.map → tools-CUg0Lyg-.mjs.map} +1 -1
  51. package/dist/{utils-DEWdIFQ0.mjs → utils-QSfKagcj.mjs} +62 -2
  52. package/dist/utils-QSfKagcj.mjs.map +1 -0
  53. package/dist/vault-indexer-Bo2aPSzP.mjs +499 -0
  54. package/dist/vault-indexer-Bo2aPSzP.mjs.map +1 -0
  55. package/dist/zettelkasten-Co-w0XSZ.mjs +901 -0
  56. package/dist/zettelkasten-Co-w0XSZ.mjs.map +1 -0
  57. package/package.json +2 -1
  58. package/src/hooks/README.md +99 -0
  59. package/src/hooks/hooks.md +13 -0
  60. package/src/hooks/pre-compact.sh +95 -0
  61. package/src/hooks/session-stop.sh +93 -0
  62. package/statusline-command.sh +9 -4
  63. package/templates/README.md +7 -0
  64. package/templates/agent-prefs.example.md +7 -0
  65. package/templates/claude-md.template.md +7 -0
  66. package/templates/pai-project.template.md +4 -6
  67. package/templates/pai-skill.template.md +295 -0
  68. package/templates/templates.md +20 -0
  69. package/dist/daemon-v5O897D4.mjs.map +0 -1
  70. package/dist/db-BcDxXVBu.mjs +0 -110
  71. package/dist/db-BcDxXVBu.mjs.map +0 -1
  72. package/dist/indexer-B20bPHL-.mjs.map +0 -1
  73. package/dist/ipc-client-DPy7s3iu.mjs.map +0 -1
  74. package/dist/migrate-Bwj7qPaE.mjs.map +0 -1
  75. package/dist/utils-DEWdIFQ0.mjs.map +0 -1
package/ARCHITECTURE.md CHANGED
@@ -1,8 +1,12 @@
1
+ ---
2
+ links: "[[Ideaverse/AI/PAI/PAI|PAI]]"
3
+ ---
4
+
1
5
  # PAI Knowledge OS — Architecture
2
6
 
3
7
  Technical reference for PAI's architecture, database schema, CLI commands, and development setup.
4
8
 
5
- For user-facing documentation, see [README.md](README.md) and [MANUAL.md](MANUAL.md).
9
+ For user-facing documentation, see [README.md](Ideaverse/AI/PAI/README.md) and [MANUAL.md](MANUAL.md).
6
10
 
7
11
  ---
8
12
 
@@ -28,7 +32,7 @@ Claude Code Session
28
32
 
29
33
  └── CLI (pai)
30
34
  project, session, registry, memory,
31
- daemon, obsidian, backup, restore, setup
35
+ daemon, obsidian, zettel, backup, restore, setup
32
36
  ```
33
37
 
34
38
  ### Key Components
@@ -37,10 +41,17 @@ Claude Code Session
37
41
 
38
42
  **Storage** — Two databases serve different roles:
39
43
 
40
- - **PostgreSQL + pgvector** (`pai` database, Docker): Stores text chunks, vector embeddings (768-dim, Snowflake Arctic), and file metadata. HNSW indexes for fast approximate nearest-neighbor search. GIN indexes for full-text search.
41
- - **SQLite registry** (`~/.pai/registry.db`): Lightweight metadata store for projects, sessions, tags, aliases, and cross-references.
44
+ | Layer | Backend | Location | Purpose |
45
+ |-------|---------|----------|---------|
46
+ | **Registry** | SQLite (always) | `~/.pai/registry.db` | Projects, sessions, tags, aliases, links. Single-writer is fine — only the CLI and daemon write. Uses `better-sqlite3`. |
47
+ | **Memory / Embeddings** | Factory-switchable | PostgreSQL (full) or SQLite (simple) | Text chunks, vector embeddings, file metadata. Chosen at setup time via `~/.config/pai/config.json`. |
48
+
49
+ - **Simple mode (SQLite)**: Zero dependencies. Keyword search (BM25 via FTS5) works immediately. No Docker needed. Best for trying PAI or smaller setups.
50
+ - **Full mode (PostgreSQL + pgvector)**: Semantic search via HNSW vector indexes (768-dim, Snowflake Arctic). GIN indexes for full-text search. Runs in Docker (`pai-pgvector` container, `restart: unless-stopped`). Best for large knowledge bases (100K+ documents).
42
51
 
43
- **Embeddings** Snowflake Arctic Embed produces 768-dimensional embeddings. The daemon generates embeddings asynchronously in the background after initial text indexing, so keyword search is available immediately and semantic search follows within minutes.
52
+ The storage backend is selected during `pai setup` and configured in `~/.config/pai/config.json` (`storageBackend: "sqlite"` or `"postgres"`). The factory pattern (`src/storage/factory.ts`) instantiates the correct backend at runtime. Both backends implement the same `StorageInterface` (`src/storage/interface.ts`), so all higher-level code (indexer, search, MCP tools) is backend-agnostic.
53
+
54
+ **Embeddings** — Snowflake Arctic Embed produces 768-dimensional embeddings (PostgreSQL mode only). The daemon generates embeddings asynchronously in the background after initial text indexing, so keyword search is available immediately and semantic search follows within minutes. The embedding process runs at reduced CPU priority (`setPriority(pid, 10)`).
44
55
 
45
56
  ---
46
57
 
@@ -154,6 +165,12 @@ Claude Code (stdio)
154
165
  | `session_list` | List session notes, optionally filtered by project |
155
166
  | `registry_search` | Search project metadata (names, paths, tags) |
156
167
  | `project_detect` | Identify which project a given path belongs to |
168
+ | `zettel_explore` | BFS traversal of wikilink graph from a seed note |
169
+ | `zettel_surprise` | Find semantically close but graph-distant notes |
170
+ | `zettel_converse` | Hybrid search with graph expansion and cross-domain connections |
171
+ | `zettel_themes` | Cluster vault notes into thematic groups by embedding similarity |
172
+ | `zettel_health` | Audit vault for broken links, orphans, and isolated clusters |
173
+ | `zettel_suggest` | Suggest link targets weighted by semantics, tags, and graph neighborhood |
157
174
 
158
175
  ### Tool Reference
159
176
 
@@ -171,6 +188,18 @@ Claude Code (stdio)
171
188
 
172
189
  **`project_detect(path?)`** — Given a filesystem path (defaults to CWD), returns the matching project.
173
190
 
191
+ **`zettel_explore(note, depth?, direction?)`** — BFS walk from a seed note across `vault_links`. Returns a subgraph of neighboring notes with each edge classified as `sequential` or `associative`. `direction`: `outbound` (default), `inbound`, or `both`.
192
+
193
+ **`zettel_surprise(note, limit?)`** — Returns notes that are semantically similar to `note` but far from it in the link graph. Scored as `cosine_similarity × log2(graph_distance + 1)`. Useful for lateral discovery.
194
+
195
+ **`zettel_converse(query, limit?)`** — Runs a hybrid memory search, expands the result set via graph neighborhood, then surfaces cross-domain connections — notes from unrelated clusters that are semantically close to the query.
196
+
197
+ **`zettel_themes(min_cluster_size?)`** — Clusters all vault embeddings using agglomerative single-linkage clustering. Returns thematic groups with representative note titles and cluster size.
198
+
199
+ **`zettel_health()`** — Full structural audit of the vault. Reports broken links (target not in `vault_files`), orphaned notes (no inbound or outbound edges), notes missing embeddings, and isolated clusters detected via union-find.
200
+
201
+ **`zettel_suggest(note, limit?)`** — Ranks candidate link targets for a given note. Score is a weighted sum: semantic embedding similarity (0.5), shared tags (0.2), graph neighborhood overlap with existing links (0.3).
202
+
174
203
  ### Installation
175
204
 
176
205
  ```bash
@@ -332,6 +361,26 @@ pai obsidian sync
332
361
  pai obsidian status
333
362
  ```
334
363
 
364
+ ### Zettelkasten
365
+
366
+ | Subcommand | Description |
367
+ |------------|-------------|
368
+ | `zettel explore <note>` | BFS traversal of wikilink graph from a seed note |
369
+ | `zettel surprise <note>` | Find semantically close but graph-distant notes |
370
+ | `zettel converse <query>` | Hybrid search with graph expansion and cross-domain connections |
371
+ | `zettel themes` | Cluster vault notes into thematic groups |
372
+ | `zettel health` | Audit vault for broken links, orphans, and isolated clusters |
373
+ | `zettel suggest <note>` | Suggest link targets weighted by semantics, tags, and graph neighborhood |
374
+
375
+ ```bash
376
+ pai zettel explore "My Seed Note" --depth 3 --direction both
377
+ pai zettel surprise "My Seed Note" --limit 10
378
+ pai zettel converse "distributed systems tradeoffs"
379
+ pai zettel themes --min-cluster-size 3
380
+ pai zettel health
381
+ pai zettel suggest "My Seed Note" --limit 5
382
+ ```
383
+
335
384
  ### Other Commands
336
385
 
337
386
  ```bash
@@ -412,6 +461,37 @@ PAI can expose your project memory as an Obsidian vault. The vault contains no a
412
461
 
413
462
  ---
414
463
 
464
+ ## Zettelkasten Intelligence
465
+
466
+ PAI implements six Luhmann-inspired operations on the vault's dual representation: a wikilink graph stored in `vault_links` and semantic embeddings stored alongside the vault file records. Together these two layers enable graph-based navigation, serendipitous discovery, and structural health analysis.
467
+
468
+ ### Operations
469
+
470
+ | Operation | Module | Algorithm |
471
+ |-----------|--------|-----------|
472
+ | Explore | `src/zettelkasten/explore.ts` | BFS on vault_links, classifies sequential vs associative edges |
473
+ | Surprise | `src/zettelkasten/surprise.ts` | Cosine similarity × log2(graph_distance + 1) |
474
+ | Converse | `src/zettelkasten/converse.ts` | Hybrid search → graph expansion → cross-domain connections |
475
+ | Themes | `src/zettelkasten/themes.ts` | Agglomerative single-linkage clustering of embeddings |
476
+ | Health | `src/zettelkasten/health.ts` | SQL-driven audit with union-find for cluster detection |
477
+ | Suggest | `src/zettelkasten/suggest.ts` | Weighted: semantic (0.5) + tags (0.2) + graph neighborhood (0.3) |
478
+
479
+ ### Design Notes
480
+
481
+ **Explore** performs a BFS walk from a seed note across `vault_links`. Each edge is classified as sequential (the linked note shares a common tag or is a direct sequence continuation) or associative (a lateral connection between different topics). The result is a subgraph that exposes the local neighborhood of a note.
482
+
483
+ **Surprise** finds notes that are semantically close to a seed note in embedding space but distant in the link graph — the "surprising bridge" pattern Luhmann valued. The score `cosine_similarity × log2(graph_distance + 1)` rewards notes that are conceptually related yet structurally far apart.
484
+
485
+ **Converse** treats the vault as a conversation partner. It runs a hybrid memory search, expands results via the graph to pull in neighboring notes, then identifies cross-domain connections — notes from unrelated topic clusters that share embedding proximity with the query.
486
+
487
+ **Themes** clusters vault embeddings using agglomerative single-linkage clustering. The output is a flat list of thematic groups with representative note titles. Useful for detecting topic drift, finding redundancy, or building a high-level map of the vault.
488
+
489
+ **Health** runs a SQL-driven structural audit: broken links, orphaned notes (no inbound or outbound links), notes with no embedding, and isolated clusters detected via union-find on the `vault_links` graph.
490
+
491
+ **Suggest** ranks candidate link targets for a given note using a weighted sum of three signals: semantic similarity of embeddings (weight 0.5), shared tags (weight 0.2), and presence in the graph neighborhood of already-linked notes (weight 0.3).
492
+
493
+ ---
494
+
415
495
  ## Templates
416
496
 
417
497
  PAI ships three templates used during setup and customizable for your workflow.
@@ -484,6 +564,54 @@ Copy to `~/.config/pai/voices.json` and configure your preferred backend.
484
564
 
485
565
  **Indexes:** HNSW on embedding (cosine), GIN on text (tsvector), B-tree on project_id/path.
486
566
 
567
+ ### Vault Tables (v3 — PostgreSQL)
568
+
569
+ These tables are populated by `src/memory/vault-indexer.ts` and queried by all six zettelkasten operations.
570
+
571
+ **`vault_files`** — One row per Obsidian note:
572
+
573
+ | Column | Type | Description |
574
+ |--------|------|-------------|
575
+ | `id` | SERIAL | Surrogate key |
576
+ | `vault_path` | TEXT | Path relative to vault root |
577
+ | `title` | TEXT | Note title (H1 or filename) |
578
+ | `tags` | TEXT[] | Frontmatter tags |
579
+ | `embedding` | vector(768) | Snowflake Arctic embedding |
580
+ | `mtime` | BIGINT | Modification time |
581
+ | `hash` | TEXT | SHA-256 of file content |
582
+
583
+ **`vault_aliases`** — Obsidian alias metadata:
584
+
585
+ | Column | Type | Description |
586
+ |--------|------|-------------|
587
+ | `file_id` | INTEGER | FK → vault_files.id |
588
+ | `alias` | TEXT | Alias string from frontmatter |
589
+
590
+ **`vault_links`** — Directed wikilink edges:
591
+
592
+ | Column | Type | Description |
593
+ |--------|------|-------------|
594
+ | `source_id` | INTEGER | FK → vault_files.id (linking note) |
595
+ | `target_id` | INTEGER | FK → vault_files.id (linked note) |
596
+ | `link_text` | TEXT | Display text of the link |
597
+ | `link_type` | TEXT | `sequential` or `associative` |
598
+
599
+ **`vault_name_index`** — Reverse lookup for wikilink resolution:
600
+
601
+ | Column | Type | Description |
602
+ |--------|------|-------------|
603
+ | `name` | TEXT | Lowercased title or alias |
604
+ | `file_id` | INTEGER | FK → vault_files.id |
605
+
606
+ **`vault_health`** — Cached audit results from the Health operation:
607
+
608
+ | Column | Type | Description |
609
+ |--------|------|-------------|
610
+ | `file_id` | INTEGER | FK → vault_files.id |
611
+ | `issue_type` | TEXT | `broken_link`, `orphan`, `no_embedding`, `isolated_cluster` |
612
+ | `detail` | TEXT | Human-readable description |
613
+ | `checked_at` | BIGINT | Timestamp of the audit run |
614
+
487
615
  **Content Tiers:**
488
616
 
489
617
  | Tier | Description | Example |
@@ -541,16 +669,27 @@ bun run lint # tsc --noEmit
541
669
  ```
542
670
  src/
543
671
  ├── cli/commands/ # CLI command implementations
672
+ │ └── zettel.ts # `pai zettel` with 6 subcommands
544
673
  ├── daemon/ # Daemon server and index scheduler
545
674
  ├── daemon-mcp/ # MCP shim (stdio → daemon socket)
546
675
  ├── federation/ # Federation schema definitions
547
676
  ├── hooks/ # Lifecycle hooks (pre-compact, session-stop)
548
677
  ├── mcp/ # Direct MCP server (legacy)
549
678
  ├── memory/ # Indexer, chunker, embeddings, search
679
+ │ └── vault-indexer.ts # Obsidian vault indexing into v3 vault tables
550
680
  ├── obsidian/ # Obsidian vault bridge
681
+ │ └── vault-fixer.ts # Repairs broken wikilinks and orphaned entries
551
682
  ├── registry/ # Registry migrations and queries
552
683
  ├── session/ # Session slug generator
553
- └── storage/ # Storage backend interface (SQLite/Postgres)
684
+ ├── storage/ # Storage backend interface (SQLite/Postgres)
685
+ └── zettelkasten/ # Luhmann-inspired graph + semantic operations
686
+ ├── explore.ts # BFS traversal classifying sequential/associative edges
687
+ ├── surprise.ts # Serendipitous bridge discovery via cosine × graph distance
688
+ ├── converse.ts # Hybrid search → graph expansion → cross-domain connections
689
+ ├── themes.ts # Agglomerative embedding clustering for thematic groups
690
+ ├── health.ts # SQL-driven vault audit with union-find cluster detection
691
+ ├── suggest.ts # Weighted link suggestions (semantic + tags + graph)
692
+ └── index.ts # Barrel export for all zettelkasten operations
554
693
  ```
555
694
 
556
695
  ### Important Notes
@@ -565,3 +704,6 @@ src/
565
704
  ## License
566
705
 
567
706
  MIT
707
+
708
+ ---
709
+ *Links:* [[Ideaverse/AI/PAI/PAI|PAI]]
package/FEATURE.md CHANGED
@@ -1,3 +1,7 @@
1
+ ---
2
+ links: "[[Ideaverse/AI/PAI/PAI|PAI]]"
3
+ ---
4
+
1
5
  # PAI Feature Comparison
2
6
 
3
7
  ## Credit
@@ -20,6 +24,10 @@ different direction: persistent memory, session continuity, and deep Claude Code
20
24
  | **Primary interface** | CLI pipe (`echo "..." \| fabric -p pattern`) | MCP server + CLI (`pai`) |
21
25
  | **Prompt templates** | Yes — 200+ community "patterns" | No (out of scope) |
22
26
  | **YouTube transcript extraction** | Yes (built-in) | Yes — via [Scribe MCP](https://github.com/mnott/Scribe) |
27
+ | **WhatsApp integration** | No | Yes — via [Whazaa MCP](https://github.com/mnott/Whazaa) |
28
+ | **Google Workspace integration** | No | Yes — via [Coogle MCP](https://github.com/mnott/Coogle) |
29
+ | **DEVONthink integration** | No | Yes — via [devonthink-mcp](https://github.com/mnott/Devon) |
30
+ | **Hookmark integration** | No | Yes — via [Hook MCP](https://github.com/mnott/Hook) |
23
31
  | **LLM pipe-through workflow** | Yes — core feature | No |
24
32
  | **Persistent session memory** | No | Yes — auto-indexed, 449K+ chunks |
25
33
  | **Session registry** | No | Yes — SQLite, tracks 77+ projects |
@@ -109,3 +117,6 @@ Code to remember everything across sessions, use this.
109
117
 
110
118
  They're not mutually exclusive. Fabric handles one-shot prompt workflows. PAI Knowledge OS
111
119
  handles persistent memory for Claude Code. Many people will want both.
120
+
121
+ ---
122
+ *Links:* [[Ideaverse/AI/PAI/PAI|PAI]]
package/README.md CHANGED
@@ -1,3 +1,7 @@
1
+ ---
2
+ links: "[[Ideaverse/AI/PAI/PAI|PAI]]"
3
+ ---
4
+
1
5
  # PAI Knowledge OS
2
6
 
3
7
  Claude Code has a memory problem. Every new session starts cold — no idea what you built yesterday, what decisions you made, or where you left off. You re-explain everything, every time. PAI fixes this.
@@ -39,6 +43,15 @@ Install PAI and Claude remembers. Ask it what you were working on. Ask it to fin
39
43
  - "Sync my Obsidian vault" — updates your linked vault with the latest notes
40
44
  - "Open my notes in Obsidian" — launches Obsidian with your full knowledge graph
41
45
 
46
+ ### Zettelkasten Intelligence
47
+
48
+ - "Explore notes linked to PAI" — follow trains of thought through wikilink chains
49
+ - "Find surprising connections to this note" — discover semantically similar but graph-distant notes
50
+ - "What themes are emerging in my vault?" — detect clusters of related notes forming new ideas
51
+ - "How healthy is my vault?" — structural audit: dead links, orphans, disconnected clusters
52
+ - "Suggest connections for this note" — proactive link suggestions using semantic + graph signals
53
+ - "What does my vault say about knowledge management?" — use the vault as a thinking partner
54
+
42
55
  ---
43
56
 
44
57
  ## Quick Start
@@ -51,6 +64,40 @@ Claude finds the setup skill, checks your system, runs the interactive wizard, a
51
64
 
52
65
  ---
53
66
 
67
+ ## Auto-Compact Context Window
68
+
69
+ Claude Code can automatically compact your context window when it fills up, preventing session interruptions mid-task. PAI's statusline shows you at a glance whether auto-compact is active.
70
+
71
+ ### Why the GUI setting doesn't work
72
+
73
+ Claude Code has an `autoCompactEnabled` setting in `~/.claude.json`, but it gets overwritten on every restart. Do not use it — changes don't survive.
74
+
75
+ ### The durable approach: environment variable
76
+
77
+ Set `CLAUDE_AUTOCOMPACT_PCT_OVERRIDE` in your `~/.claude/settings.json` under the `env` block. This survives restarts, `/clear`, and Claude Code updates.
78
+
79
+ ```json
80
+ {
81
+ "env": {
82
+ "CLAUDE_AUTOCOMPACT_PCT_OVERRIDE": "80"
83
+ }
84
+ }
85
+ ```
86
+
87
+ The value is the context percentage at which compaction triggers. `80` means compact when the context window reaches 80% full. Restart Claude Code after saving.
88
+
89
+ ### Statusline indicator
90
+
91
+ Once set, PAI's statusline shows `[auto-compact:80%]` next to the context meter on line 3, so you always know auto-compact is active and at what threshold.
92
+
93
+ ### Set it up with one prompt
94
+
95
+ Give Claude Code this prompt and it handles everything:
96
+
97
+ > Add `CLAUDE_AUTOCOMPACT_PCT_OVERRIDE` set to `80` to the `env` block in `~/.claude/settings.json`. This enables durable auto-compact that survives restarts. Do not touch `~/.claude.json` — that file gets overwritten on startup. After saving, confirm the setting is in place and tell me to restart Claude Code.
98
+
99
+ ---
100
+
54
101
  ## Storage Options
55
102
 
56
103
  PAI offers two modes, and the setup wizard asks which you prefer.
@@ -80,6 +127,35 @@ For the technical deep-dive — architecture, database schema, CLI reference, an
80
127
 
81
128
  ---
82
129
 
130
+ ## Zettelkasten Intelligence
131
+
132
+ PAI implements Niklas Luhmann's Zettelkasten principles as six computational operations on your Obsidian vault.
133
+
134
+ ### How it works
135
+
136
+ PAI indexes your entire vault — following symlinks, deduplicating by inode, parsing every wikilink — and builds a graph database alongside semantic embeddings. Six tools then operate on this dual representation:
137
+
138
+ | Tool | What it does |
139
+ |------|-------------|
140
+ | `pai zettel explore` | Follow trains of thought through link chains (Folgezettel traversal) |
141
+ | `pai zettel surprise` | Find notes that are semantically close but far apart in the link graph |
142
+ | `pai zettel converse` | Ask questions and let the vault "talk back" with unexpected connections |
143
+ | `pai zettel themes` | Detect emerging clusters of related notes across folders |
144
+ | `pai zettel health` | Structural audit — dead links, orphans, disconnected clusters, health score |
145
+ | `pai zettel suggest` | Proactive connection suggestions combining semantic similarity, tags, and graph proximity |
146
+
147
+ All tools work as CLI commands (`pai zettel <command>`) and MCP tools (`zettel_*`) accessible through the daemon.
148
+
149
+ ### Vault Indexing
150
+
151
+ The vault indexer follows symlinks (critical for vaults built on symlinks), deduplicates files by inode to handle multiple paths to the same file, and builds a complete wikilink graph with Obsidian-compatible shortest-match resolution.
152
+
153
+ - Full index: ~10 seconds for ~1,000 files
154
+ - Incremental: ~2 seconds (hash-based change detection)
155
+ - Runs automatically via the daemon scheduler
156
+
157
+ ---
158
+
83
159
  ## Companion Projects
84
160
 
85
161
  PAI works great alongside these tools (also by the same author):
@@ -99,3 +175,6 @@ PAI Knowledge OS is inspired by [Daniel Miessler](https://github.com/danielmiess
99
175
  ## License
100
176
 
101
177
  MIT
178
+
179
+ ---
180
+ *Links:* [[Ideaverse/AI/PAI/PAI|PAI]]
@@ -1,5 +1,5 @@
1
- import { r as readPaiMarker } from "./pai-marker-DX_mFLum.mjs";
2
- import { t as detectProject } from "./detect-BHqYcjJ1.mjs";
1
+ import { r as readPaiMarker } from "./pai-marker-CXQPX2P6.mjs";
2
+ import { t as detectProject } from "./detect-D7gPV3fQ.mjs";
3
3
  import { existsSync } from "node:fs";
4
4
  import { dirname, resolve } from "node:path";
5
5
 
@@ -26,7 +26,7 @@ async function autoRoute(registryDb, federation, cwd, context) {
26
26
  const markerResult = findMarkerUpward(registryDb, target);
27
27
  if (markerResult) return markerResult;
28
28
  if (context && context.trim().length > 0) {
29
- const { detectTopicShift } = await import("./detector-DKA83aTZ.mjs").then((n) => n.n);
29
+ const { detectTopicShift } = await import("./detector-cYYhK2Mi.mjs").then((n) => n.n);
30
30
  const topicResult = await detectTopicShift(registryDb, federation, {
31
31
  context,
32
32
  threshold: .5
@@ -83,4 +83,4 @@ function formatAutoRouteJson(result) {
83
83
 
84
84
  //#endregion
85
85
  export { autoRoute, formatAutoRouteJson };
86
- //# sourceMappingURL=auto-route-D7W6RE06.mjs.map
86
+ //# sourceMappingURL=auto-route-JjW3f7pV.mjs.map
@@ -1 +1 @@
1
- {"version":3,"file":"auto-route-D7W6RE06.mjs","names":[],"sources":["../src/session/auto-route.ts"],"sourcesContent":["/**\n * Auto-route: automatic project routing suggestion on session start.\n *\n * Given a working directory (and optional conversation context), determine\n * which registered project the session belongs to.\n *\n * Strategy (in priority order):\n * 1. Path match — exact or parent-directory match in the project registry\n * 2. Marker walk — walk up from cwd looking for Notes/PAI.md, resolve slug\n * 3. Topic match — BM25 keyword search against memory (requires context text)\n *\n * The function is stateless and works with direct DB access (no daemon\n * required), making it fast and safe to call during session startup.\n */\n\nimport type { Database } from \"better-sqlite3\";\nimport type { StorageBackend } from \"../storage/interface.js\";\nimport { resolve, dirname } from \"node:path\";\nimport { existsSync } from \"node:fs\";\nimport { readPaiMarker } from \"../registry/pai-marker.js\";\nimport { detectProject } from \"../cli/commands/detect.js\";\n\n// ---------------------------------------------------------------------------\n// Types\n// ---------------------------------------------------------------------------\n\nexport type AutoRouteMethod = \"path\" | \"marker\" | \"topic\";\n\nexport interface AutoRouteResult {\n /** Project slug */\n slug: string;\n /** Human-readable project name */\n display_name: string;\n /** Absolute path to the project root */\n root_path: string;\n /** How the project was detected */\n method: AutoRouteMethod;\n /** Confidence [0,1]: 1.0 for path/marker matches, BM25 fraction for topic */\n confidence: number;\n}\n\n// ---------------------------------------------------------------------------\n// Core function\n// ---------------------------------------------------------------------------\n\n/**\n * Determine which project a session should be routed to.\n *\n * @param registryDb Open PAI registry database\n 
* @param federation Memory storage backend (needed only for topic fallback)\n * @param cwd Working directory to detect from (defaults to process.cwd())\n * @param context Optional conversation text for topic-based fallback\n * @returns Best project match, or null if nothing matched\n */\nexport async function autoRoute(\n registryDb: Database,\n federation: Database | StorageBackend,\n cwd?: string,\n context?: string\n): Promise<AutoRouteResult | null> {\n const target = resolve(cwd ?? process.cwd());\n\n // -------------------------------------------------------------------------\n // Strategy 1: Path match via registry\n // -------------------------------------------------------------------------\n\n const pathMatch = detectProject(registryDb, target);\n\n if (pathMatch) {\n return {\n slug: pathMatch.slug,\n display_name: pathMatch.display_name,\n root_path: pathMatch.root_path,\n method: \"path\",\n confidence: 1.0,\n };\n }\n\n // -------------------------------------------------------------------------\n // Strategy 2: PAI.md marker file walk\n //\n // Walk up from cwd, checking <dir>/Notes/PAI.md at each level.\n // Once found, resolve the slug against the registry to get full project info.\n // -------------------------------------------------------------------------\n\n const markerResult = findMarkerUpward(registryDb, target);\n if (markerResult) {\n return markerResult;\n }\n\n // -------------------------------------------------------------------------\n // Strategy 3: Topic detection (requires context text)\n // -------------------------------------------------------------------------\n\n if (context && context.trim().length > 0) {\n // Lazy import to avoid bundler pulling in daemon/index.mjs at module load time\n const { detectTopicShift } = await import(\"../topics/detector.js\");\n const topicResult = await detectTopicShift(registryDb, federation, {\n context,\n threshold: 0.5, // Lower threshold for initial routing (vs shift detection)\n });\n\n 
if (topicResult.suggestedProject && topicResult.confidence > 0) {\n // Look up the full project info from the registry\n const projectRow = registryDb\n .prepare(\n \"SELECT slug, display_name, root_path FROM projects WHERE slug = ? AND status != 'archived'\"\n )\n .get(topicResult.suggestedProject) as\n | { slug: string; display_name: string; root_path: string }\n | undefined;\n\n if (projectRow) {\n return {\n slug: projectRow.slug,\n display_name: projectRow.display_name,\n root_path: projectRow.root_path,\n method: \"topic\",\n confidence: topicResult.confidence,\n };\n }\n }\n }\n\n return null;\n}\n\n// ---------------------------------------------------------------------------\n// Marker walk helper\n// ---------------------------------------------------------------------------\n\n/**\n * Walk up the directory tree from `startDir`, checking each level for a\n * `Notes/PAI.md` file. If found, read the slug and look up the project.\n *\n * Stops at the filesystem root or after 20 levels (safety guard).\n */\nfunction findMarkerUpward(\n registryDb: Database,\n startDir: string\n): AutoRouteResult | null {\n let current = startDir;\n let depth = 0;\n\n while (depth < 20) {\n const markerPath = `${current}/Notes/PAI.md`;\n\n if (existsSync(markerPath)) {\n const marker = readPaiMarker(current);\n\n if (marker && marker.status !== \"archived\") {\n // Resolve slug to full project info in the registry\n const projectRow = registryDb\n .prepare(\n \"SELECT slug, display_name, root_path FROM projects WHERE slug = ? 
AND status != 'archived'\"\n )\n .get(marker.slug) as\n | { slug: string; display_name: string; root_path: string }\n | undefined;\n\n if (projectRow) {\n return {\n slug: projectRow.slug,\n display_name: projectRow.display_name,\n root_path: projectRow.root_path,\n method: \"marker\",\n confidence: 1.0,\n };\n }\n }\n }\n\n const parent = dirname(current);\n if (parent === current) break; // Reached filesystem root\n current = parent;\n depth++;\n }\n\n return null;\n}\n\n// ---------------------------------------------------------------------------\n// Format helpers\n// ---------------------------------------------------------------------------\n\n/**\n * Format an AutoRouteResult as a human-readable string for CLI output.\n */\nexport function formatAutoRoute(result: AutoRouteResult): string {\n const lines: string[] = [\n `slug: ${result.slug}`,\n `display_name: ${result.display_name}`,\n `root_path: ${result.root_path}`,\n `method: ${result.method}`,\n `confidence: ${(result.confidence * 100).toFixed(0)}%`,\n ];\n return lines.join(\"\\n\");\n}\n\n/**\n * Format an AutoRouteResult as JSON for machine consumption.\n */\nexport function formatAutoRouteJson(result: AutoRouteResult): string {\n return JSON.stringify(result, null, 
2);\n}\n"],"mappings":";;;;;;;;;;;;;;;AAsDA,eAAsB,UACpB,YACA,YACA,KACA,SACiC;CACjC,MAAM,SAAS,QAAQ,OAAO,QAAQ,KAAK,CAAC;CAM5C,MAAM,YAAY,cAAc,YAAY,OAAO;AAEnD,KAAI,UACF,QAAO;EACL,MAAM,UAAU;EAChB,cAAc,UAAU;EACxB,WAAW,UAAU;EACrB,QAAQ;EACR,YAAY;EACb;CAUH,MAAM,eAAe,iBAAiB,YAAY,OAAO;AACzD,KAAI,aACF,QAAO;AAOT,KAAI,WAAW,QAAQ,MAAM,CAAC,SAAS,GAAG;EAExC,MAAM,EAAE,qBAAqB,MAAM,OAAO;EAC1C,MAAM,cAAc,MAAM,iBAAiB,YAAY,YAAY;GACjE;GACA,WAAW;GACZ,CAAC;AAEF,MAAI,YAAY,oBAAoB,YAAY,aAAa,GAAG;GAE9D,MAAM,aAAa,WAChB,QACC,6FACD,CACA,IAAI,YAAY,iBAAiB;AAIpC,OAAI,WACF,QAAO;IACL,MAAM,WAAW;IACjB,cAAc,WAAW;IACzB,WAAW,WAAW;IACtB,QAAQ;IACR,YAAY,YAAY;IACzB;;;AAKP,QAAO;;;;;;;;AAaT,SAAS,iBACP,YACA,UACwB;CACxB,IAAI,UAAU;CACd,IAAI,QAAQ;AAEZ,QAAO,QAAQ,IAAI;AAGjB,MAAI,WAFe,GAAG,QAAQ,eAEJ,EAAE;GAC1B,MAAM,SAAS,cAAc,QAAQ;AAErC,OAAI,UAAU,OAAO,WAAW,YAAY;IAE1C,MAAM,aAAa,WAChB,QACC,6FACD,CACA,IAAI,OAAO,KAAK;AAInB,QAAI,WACF,QAAO;KACL,MAAM,WAAW;KACjB,cAAc,WAAW;KACzB,WAAW,WAAW;KACtB,QAAQ;KACR,YAAY;KACb;;;EAKP,MAAM,SAAS,QAAQ,QAAQ;AAC/B,MAAI,WAAW,QAAS;AACxB,YAAU;AACV;;AAGF,QAAO;;;;;AAwBT,SAAgB,oBAAoB,QAAiC;AACnE,QAAO,KAAK,UAAU,QAAQ,MAAM,EAAE"}
1
+ {"version":3,"file":"auto-route-JjW3f7pV.mjs","names":[],"sources":["../src/session/auto-route.ts"],"sourcesContent":["/**\n * Auto-route: automatic project routing suggestion on session start.\n *\n * Given a working directory (and optional conversation context), determine\n * which registered project the session belongs to.\n *\n * Strategy (in priority order):\n * 1. Path match — exact or parent-directory match in the project registry\n * 2. Marker walk — walk up from cwd looking for Notes/PAI.md, resolve slug\n * 3. Topic match — BM25 keyword search against memory (requires context text)\n *\n * The function is stateless and works with direct DB access (no daemon\n * required), making it fast and safe to call during session startup.\n */\n\nimport type { Database } from \"better-sqlite3\";\nimport type { StorageBackend } from \"../storage/interface.js\";\nimport { resolve, dirname } from \"node:path\";\nimport { existsSync } from \"node:fs\";\nimport { readPaiMarker } from \"../registry/pai-marker.js\";\nimport { detectProject } from \"../cli/commands/detect.js\";\n\n// ---------------------------------------------------------------------------\n// Types\n// ---------------------------------------------------------------------------\n\nexport type AutoRouteMethod = \"path\" | \"marker\" | \"topic\";\n\nexport interface AutoRouteResult {\n /** Project slug */\n slug: string;\n /** Human-readable project name */\n display_name: string;\n /** Absolute path to the project root */\n root_path: string;\n /** How the project was detected */\n method: AutoRouteMethod;\n /** Confidence [0,1]: 1.0 for path/marker matches, BM25 fraction for topic */\n confidence: number;\n}\n\n// ---------------------------------------------------------------------------\n// Core function\n// ---------------------------------------------------------------------------\n\n/**\n * Determine which project a session should be routed to.\n *\n * @param registryDb Open PAI registry database\n 
* @param federation Memory storage backend (needed only for topic fallback)\n * @param cwd Working directory to detect from (defaults to process.cwd())\n * @param context Optional conversation text for topic-based fallback\n * @returns Best project match, or null if nothing matched\n */\nexport async function autoRoute(\n registryDb: Database,\n federation: Database | StorageBackend,\n cwd?: string,\n context?: string\n): Promise<AutoRouteResult | null> {\n const target = resolve(cwd ?? process.cwd());\n\n // -------------------------------------------------------------------------\n // Strategy 1: Path match via registry\n // -------------------------------------------------------------------------\n\n const pathMatch = detectProject(registryDb, target);\n\n if (pathMatch) {\n return {\n slug: pathMatch.slug,\n display_name: pathMatch.display_name,\n root_path: pathMatch.root_path,\n method: \"path\",\n confidence: 1.0,\n };\n }\n\n // -------------------------------------------------------------------------\n // Strategy 2: PAI.md marker file walk\n //\n // Walk up from cwd, checking <dir>/Notes/PAI.md at each level.\n // Once found, resolve the slug against the registry to get full project info.\n // -------------------------------------------------------------------------\n\n const markerResult = findMarkerUpward(registryDb, target);\n if (markerResult) {\n return markerResult;\n }\n\n // -------------------------------------------------------------------------\n // Strategy 3: Topic detection (requires context text)\n // -------------------------------------------------------------------------\n\n if (context && context.trim().length > 0) {\n // Lazy import to avoid bundler pulling in daemon/index.mjs at module load time\n const { detectTopicShift } = await import(\"../topics/detector.js\");\n const topicResult = await detectTopicShift(registryDb, federation, {\n context,\n threshold: 0.5, // Lower threshold for initial routing (vs shift detection)\n });\n\n 
if (topicResult.suggestedProject && topicResult.confidence > 0) {\n // Look up the full project info from the registry\n const projectRow = registryDb\n .prepare(\n \"SELECT slug, display_name, root_path FROM projects WHERE slug = ? AND status != 'archived'\"\n )\n .get(topicResult.suggestedProject) as\n | { slug: string; display_name: string; root_path: string }\n | undefined;\n\n if (projectRow) {\n return {\n slug: projectRow.slug,\n display_name: projectRow.display_name,\n root_path: projectRow.root_path,\n method: \"topic\",\n confidence: topicResult.confidence,\n };\n }\n }\n }\n\n return null;\n}\n\n// ---------------------------------------------------------------------------\n// Marker walk helper\n// ---------------------------------------------------------------------------\n\n/**\n * Walk up the directory tree from `startDir`, checking each level for a\n * `Notes/PAI.md` file. If found, read the slug and look up the project.\n *\n * Stops at the filesystem root or after 20 levels (safety guard).\n */\nfunction findMarkerUpward(\n registryDb: Database,\n startDir: string\n): AutoRouteResult | null {\n let current = startDir;\n let depth = 0;\n\n while (depth < 20) {\n const markerPath = `${current}/Notes/PAI.md`;\n\n if (existsSync(markerPath)) {\n const marker = readPaiMarker(current);\n\n if (marker && marker.status !== \"archived\") {\n // Resolve slug to full project info in the registry\n const projectRow = registryDb\n .prepare(\n \"SELECT slug, display_name, root_path FROM projects WHERE slug = ? 
AND status != 'archived'\"\n )\n .get(marker.slug) as\n | { slug: string; display_name: string; root_path: string }\n | undefined;\n\n if (projectRow) {\n return {\n slug: projectRow.slug,\n display_name: projectRow.display_name,\n root_path: projectRow.root_path,\n method: \"marker\",\n confidence: 1.0,\n };\n }\n }\n }\n\n const parent = dirname(current);\n if (parent === current) break; // Reached filesystem root\n current = parent;\n depth++;\n }\n\n return null;\n}\n\n// ---------------------------------------------------------------------------\n// Format helpers\n// ---------------------------------------------------------------------------\n\n/**\n * Format an AutoRouteResult as a human-readable string for CLI output.\n */\nexport function formatAutoRoute(result: AutoRouteResult): string {\n const lines: string[] = [\n `slug: ${result.slug}`,\n `display_name: ${result.display_name}`,\n `root_path: ${result.root_path}`,\n `method: ${result.method}`,\n `confidence: ${(result.confidence * 100).toFixed(0)}%`,\n ];\n return lines.join(\"\\n\");\n}\n\n/**\n * Format an AutoRouteResult as JSON for machine consumption.\n */\nexport function formatAutoRouteJson(result: AutoRouteResult): string {\n return JSON.stringify(result, null, 
2);\n}\n"],"mappings":";;;;;;;;;;;;;;;AAsDA,eAAsB,UACpB,YACA,YACA,KACA,SACiC;CACjC,MAAM,SAAS,QAAQ,OAAO,QAAQ,KAAK,CAAC;CAM5C,MAAM,YAAY,cAAc,YAAY,OAAO;AAEnD,KAAI,UACF,QAAO;EACL,MAAM,UAAU;EAChB,cAAc,UAAU;EACxB,WAAW,UAAU;EACrB,QAAQ;EACR,YAAY;EACb;CAUH,MAAM,eAAe,iBAAiB,YAAY,OAAO;AACzD,KAAI,aACF,QAAO;AAOT,KAAI,WAAW,QAAQ,MAAM,CAAC,SAAS,GAAG;EAExC,MAAM,EAAE,qBAAqB,MAAM,OAAO;EAC1C,MAAM,cAAc,MAAM,iBAAiB,YAAY,YAAY;GACjE;GACA,WAAW;GACZ,CAAC;AAEF,MAAI,YAAY,oBAAoB,YAAY,aAAa,GAAG;GAE9D,MAAM,aAAa,WAChB,QACC,6FACD,CACA,IAAI,YAAY,iBAAiB;AAIpC,OAAI,WACF,QAAO;IACL,MAAM,WAAW;IACjB,cAAc,WAAW;IACzB,WAAW,WAAW;IACtB,QAAQ;IACR,YAAY,YAAY;IACzB;;;AAKP,QAAO;;;;;;;;AAaT,SAAS,iBACP,YACA,UACwB;CACxB,IAAI,UAAU;CACd,IAAI,QAAQ;AAEZ,QAAO,QAAQ,IAAI;AAGjB,MAAI,WAFe,GAAG,QAAQ,eAEJ,EAAE;GAC1B,MAAM,SAAS,cAAc,QAAQ;AAErC,OAAI,UAAU,OAAO,WAAW,YAAY;IAE1C,MAAM,aAAa,WAChB,QACC,6FACD,CACA,IAAI,OAAO,KAAK;AAInB,QAAI,WACF,QAAO;KACL,MAAM,WAAW;KACjB,cAAc,WAAW;KACzB,WAAW,WAAW;KACtB,QAAQ;KACR,YAAY;KACb;;;EAKP,MAAM,SAAS,QAAQ,QAAQ;AAC/B,MAAI,WAAW,QAAS;AACxB,YAAU;AACV;;AAGF,QAAO;;;;;AAwBT,SAAgB,oBAAoB,QAAiC;AACnE,QAAO,KAAK,UAAU,QAAQ,MAAM,EAAE"}
@@ -0,0 +1,191 @@
1
+ import { createHash } from "node:crypto";
2
+
3
+ //#region src/memory/chunker.ts
4
+ /**
5
+ * Markdown text chunker for the PAI memory engine.
6
+ *
7
+ * Splits markdown files into overlapping text segments suitable for BM25
8
+ * full-text indexing. Respects heading boundaries where possible, falling
9
+ * back to paragraph and sentence splitting when sections are large.
10
+ */
11
+ const DEFAULT_MAX_TOKENS = 400;
12
+ const DEFAULT_OVERLAP = 80;
13
+ /**
14
+ * Approximate token count using a words * 1.3 heuristic.
15
+ * Matches the OpenClaw estimate approach.
16
+ */
17
+ function estimateTokens(text) {
18
+ const wordCount = text.split(/\s+/).filter(Boolean).length;
19
+ return Math.ceil(wordCount * 1.3);
20
+ }
21
+ /**
22
+ * Compute SHA-256 hash of a string, returning a hex string.
23
+ */
24
+ function sha256(text) {
25
+ return createHash("sha256").update(text).digest("hex");
26
+ }
27
+ /**
28
+ * Split content into sections delimited by ## or ### headings.
29
+ * Each section starts at its heading line (or at line 1 for a preamble).
30
+ */
31
+ function splitBySections(lines) {
32
+ const sections = [];
33
+ let current = [];
34
+ for (const line of lines) {
35
+ if (/^#{1,3}\s/.test(line.text) && current.length > 0) {
36
+ const text = current.map((l) => l.text).join("\n");
37
+ sections.push({
38
+ lines: current,
39
+ tokens: estimateTokens(text)
40
+ });
41
+ current = [];
42
+ }
43
+ current.push(line);
44
+ }
45
+ if (current.length > 0) {
46
+ const text = current.map((l) => l.text).join("\n");
47
+ sections.push({
48
+ lines: current,
49
+ tokens: estimateTokens(text)
50
+ });
51
+ }
52
+ return sections;
53
+ }
54
+ /**
55
+ * Split a LineBlock by double-newline paragraph boundaries.
56
+ */
57
+ function splitByParagraphs(block) {
58
+ const paragraphs = [];
59
+ let current = [];
60
+ for (const line of block.lines) if (line.text.trim() === "" && current.length > 0) {
61
+ const text = current.map((l) => l.text).join("\n");
62
+ paragraphs.push({
63
+ lines: [...current],
64
+ tokens: estimateTokens(text)
65
+ });
66
+ current = [];
67
+ } else current.push(line);
68
+ if (current.length > 0) {
69
+ const text = current.map((l) => l.text).join("\n");
70
+ paragraphs.push({
71
+ lines: current,
72
+ tokens: estimateTokens(text)
73
+ });
74
+ }
75
+ return paragraphs.length > 0 ? paragraphs : [block];
76
+ }
77
+ /**
78
+ * Split a LineBlock by sentence boundaries (. ! ?) when even paragraphs are
79
+ * too large. Works character-by-character within joined lines.
80
+ */
81
+ function splitBySentences(block, maxTokens) {
82
+ const sentences = block.lines.map((l) => l.text).join(" ").split(/(?<=[.!?])\s+(?=[A-Z"'])/g);
83
+ const result = [];
84
+ let accText = "";
85
+ const startLine = block.lines[0]?.lineNo ?? 1;
86
+ const endLine = block.lines[block.lines.length - 1]?.lineNo ?? startLine;
87
+ const totalLines = endLine - startLine + 1;
88
+ const linesPerSentence = Math.max(1, Math.floor(totalLines / Math.max(1, sentences.length)));
89
+ let sentenceIdx = 0;
90
+ let approxLine = startLine;
91
+ const flush = () => {
92
+ if (!accText.trim()) return;
93
+ const endApprox = Math.min(approxLine + linesPerSentence - 1, endLine);
94
+ result.push({
95
+ lines: [{
96
+ text: accText.trim(),
97
+ lineNo: approxLine
98
+ }],
99
+ tokens: estimateTokens(accText)
100
+ });
101
+ approxLine = endApprox + 1;
102
+ accText = "";
103
+ };
104
+ for (const sentence of sentences) {
105
+ sentenceIdx++;
106
+ const candidateText = accText ? accText + " " + sentence : sentence;
107
+ if (estimateTokens(candidateText) > maxTokens && accText) {
108
+ flush();
109
+ accText = sentence;
110
+ } else accText = candidateText;
111
+ }
112
+ flush();
113
+ return result.length > 0 ? result : [block];
114
+ }
115
+ /**
116
+ * Extract the last `overlapTokens` worth of text from a list of previously
117
+ * emitted chunks to prepend to the next chunk.
118
+ */
119
+ function buildOverlapPrefix(chunks, overlapTokens) {
120
+ if (overlapTokens <= 0 || chunks.length === 0) return [];
121
+ const lastChunk = chunks[chunks.length - 1];
122
+ if (!lastChunk) return [];
123
+ const lines = lastChunk.text.split("\n");
124
+ const kept = [];
125
+ let acc = 0;
126
+ for (let i = lines.length - 1; i >= 0; i--) {
127
+ const lineTokens = estimateTokens(lines[i] ?? "");
128
+ acc += lineTokens;
129
+ kept.unshift(lines[i] ?? "");
130
+ if (acc >= overlapTokens) break;
131
+ }
132
+ const startLine = lastChunk.endLine - kept.length + 1;
133
+ return kept.map((text, idx) => ({
134
+ text,
135
+ lineNo: Math.max(lastChunk.startLine, startLine + idx)
136
+ }));
137
+ }
138
+ /**
139
+ * Chunk a markdown file into overlapping segments for BM25 indexing.
140
+ *
141
+ * Strategy:
142
+ * 1. Split by headings (##, ###) as natural boundaries.
143
+ * 2. If a section exceeds maxTokens, split by paragraphs.
144
+ * 3. If a paragraph still exceeds maxTokens, split by sentences.
145
+ * 4. Apply overlap: each chunk includes the last `overlap` tokens from the
146
+ * previous chunk.
147
+ */
148
+ function chunkMarkdown(content, opts) {
149
+ const maxTokens = opts?.maxTokens ?? DEFAULT_MAX_TOKENS;
150
+ const overlapTokens = opts?.overlap ?? DEFAULT_OVERLAP;
151
+ if (!content.trim()) return [];
152
+ const sections = splitBySections(content.split("\n").map((text, idx) => ({
153
+ text,
154
+ lineNo: idx + 1
155
+ })));
156
+ const finalBlocks = [];
157
+ for (const section of sections) {
158
+ if (section.tokens <= maxTokens) {
159
+ finalBlocks.push(section);
160
+ continue;
161
+ }
162
+ const paras = splitByParagraphs(section);
163
+ for (const para of paras) {
164
+ if (para.tokens <= maxTokens) {
165
+ finalBlocks.push(para);
166
+ continue;
167
+ }
168
+ const sentences = splitBySentences(para, maxTokens);
169
+ finalBlocks.push(...sentences);
170
+ }
171
+ }
172
+ const chunks = [];
173
+ for (const block of finalBlocks) {
174
+ if (block.lines.length === 0) continue;
175
+ const text = [...buildOverlapPrefix(chunks, overlapTokens), ...block.lines].map((l) => l.text).join("\n").trim();
176
+ if (!text) continue;
177
+ const startLine = block.lines[0]?.lineNo ?? 1;
178
+ const endLine = block.lines[block.lines.length - 1]?.lineNo ?? startLine;
179
+ chunks.push({
180
+ text,
181
+ startLine,
182
+ endLine,
183
+ hash: sha256(text)
184
+ });
185
+ }
186
+ return chunks;
187
+ }
188
+
189
+ //#endregion
190
+ export { estimateTokens as n, chunkMarkdown as t };
191
+ //# sourceMappingURL=chunker-CbnBe0s0.mjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunker-CbnBe0s0.mjs","names":[],"sources":["../src/memory/chunker.ts"],"sourcesContent":["/**\n * Markdown text chunker for the PAI memory engine.\n *\n * Splits markdown files into overlapping text segments suitable for BM25\n * full-text indexing. Respects heading boundaries where possible, falling\n * back to paragraph and sentence splitting when sections are large.\n */\n\nimport { createHash } from \"node:crypto\";\n\nexport interface Chunk {\n text: string;\n startLine: number; // 1-indexed\n endLine: number; // 1-indexed, inclusive\n hash: string; // SHA-256 of text\n}\n\nexport interface ChunkOptions {\n /** Approximate maximum tokens per chunk. Default 400. */\n maxTokens?: number;\n /** Overlap in tokens from the previous chunk. Default 80. */\n overlap?: number;\n}\n\nconst DEFAULT_MAX_TOKENS = 400;\nconst DEFAULT_OVERLAP = 80;\n\n/**\n * Approximate token count using a words * 1.3 heuristic.\n * Matches the OpenClaw estimate approach.\n */\nexport function estimateTokens(text: string): number {\n const wordCount = text.split(/\\s+/).filter(Boolean).length;\n return Math.ceil(wordCount * 1.3);\n}\n\n/**\n * Compute SHA-256 hash of a string, returning a hex string.\n */\nfunction sha256(text: string): string {\n return createHash(\"sha256\").update(text).digest(\"hex\");\n}\n\n// ---------------------------------------------------------------------------\n// Internal section / paragraph / sentence splitters\n// ---------------------------------------------------------------------------\n\n/**\n * A contiguous block of lines associated with an approximate token count.\n */\ninterface LineBlock {\n lines: Array<{ text: string; lineNo: number }>;\n tokens: number;\n}\n\n/**\n * Split content into sections delimited by ## or ### headings.\n * Each section starts at its heading line (or at line 1 for a preamble).\n */\nfunction splitBySections(\n lines: Array<{ text: string; lineNo: number }>,\n): LineBlock[] {\n const sections: 
LineBlock[] = [];\n let current: Array<{ text: string; lineNo: number }> = [];\n\n for (const line of lines) {\n const isHeading = /^#{1,3}\\s/.test(line.text);\n if (isHeading && current.length > 0) {\n const text = current.map((l) => l.text).join(\"\\n\");\n sections.push({ lines: current, tokens: estimateTokens(text) });\n current = [];\n }\n current.push(line);\n }\n\n if (current.length > 0) {\n const text = current.map((l) => l.text).join(\"\\n\");\n sections.push({ lines: current, tokens: estimateTokens(text) });\n }\n\n return sections;\n}\n\n/**\n * Split a LineBlock by double-newline paragraph boundaries.\n */\nfunction splitByParagraphs(block: LineBlock): LineBlock[] {\n const paragraphs: LineBlock[] = [];\n let current: Array<{ text: string; lineNo: number }> = [];\n\n for (const line of block.lines) {\n if (line.text.trim() === \"\" && current.length > 0) {\n // Empty line — potential paragraph boundary\n const text = current.map((l) => l.text).join(\"\\n\");\n paragraphs.push({ lines: [...current], tokens: estimateTokens(text) });\n current = [];\n } else {\n current.push(line);\n }\n }\n\n if (current.length > 0) {\n const text = current.map((l) => l.text).join(\"\\n\");\n paragraphs.push({ lines: current, tokens: estimateTokens(text) });\n }\n\n return paragraphs.length > 0 ? paragraphs : [block];\n}\n\n/**\n * Split a LineBlock by sentence boundaries (. ! ?) when even paragraphs are\n * too large. Works character-by-character within joined lines.\n */\nfunction splitBySentences(block: LineBlock, maxTokens: number): LineBlock[] {\n const fullText = block.lines.map((l) => l.text).join(\" \");\n // Very rough sentence split — split on '. ', '! ', '? 
' followed by uppercase\n const sentenceRe = /(?<=[.!?])\\s+(?=[A-Z\"'])/g;\n const sentences = fullText.split(sentenceRe);\n\n const result: LineBlock[] = [];\n let accText = \"\";\n // We can't recover exact line numbers inside a single oversized paragraph,\n // so we approximate using the block's start/end lines distributed evenly.\n const startLine = block.lines[0]?.lineNo ?? 1;\n const endLine = block.lines[block.lines.length - 1]?.lineNo ?? startLine;\n const totalLines = endLine - startLine + 1;\n const linesPerSentence = Math.max(1, Math.floor(totalLines / Math.max(1, sentences.length)));\n\n let sentenceIdx = 0;\n let approxLine = startLine;\n\n const flush = () => {\n if (!accText.trim()) return;\n const endApprox = Math.min(approxLine + linesPerSentence - 1, endLine);\n result.push({\n lines: [{ text: accText.trim(), lineNo: approxLine }],\n tokens: estimateTokens(accText),\n });\n approxLine = endApprox + 1;\n accText = \"\";\n };\n\n for (const sentence of sentences) {\n sentenceIdx++;\n const candidateText = accText ? accText + \" \" + sentence : sentence;\n if (estimateTokens(candidateText) > maxTokens && accText) {\n flush();\n accText = sentence;\n } else {\n accText = candidateText;\n }\n }\n void sentenceIdx; // used only for iteration count\n flush();\n\n return result.length > 0 ? 
result : [block];\n}\n\n// ---------------------------------------------------------------------------\n// Overlap helper\n// ---------------------------------------------------------------------------\n\n/**\n * Extract the last `overlapTokens` worth of text from a list of previously\n * emitted chunks to prepend to the next chunk.\n */\nfunction buildOverlapPrefix(\n chunks: Chunk[],\n overlapTokens: number,\n): Array<{ text: string; lineNo: number }> {\n if (overlapTokens <= 0 || chunks.length === 0) return [];\n\n const lastChunk = chunks[chunks.length - 1];\n if (!lastChunk) return [];\n\n const lines = lastChunk.text.split(\"\\n\");\n const kept: string[] = [];\n let acc = 0;\n\n for (let i = lines.length - 1; i >= 0; i--) {\n const lineTokens = estimateTokens(lines[i] ?? \"\");\n acc += lineTokens;\n kept.unshift(lines[i] ?? \"\");\n if (acc >= overlapTokens) break;\n }\n\n // Distribute overlap lines across the lastChunk's line range\n const startLine = lastChunk.endLine - kept.length + 1;\n return kept.map((text, idx) => ({ text, lineNo: Math.max(lastChunk.startLine, startLine + idx) }));\n}\n\n// ---------------------------------------------------------------------------\n// Public API\n// ---------------------------------------------------------------------------\n\n/**\n * Chunk a markdown file into overlapping segments for BM25 indexing.\n *\n * Strategy:\n * 1. Split by headings (##, ###) as natural boundaries.\n * 2. If a section exceeds maxTokens, split by paragraphs.\n * 3. If a paragraph still exceeds maxTokens, split by sentences.\n * 4. Apply overlap: each chunk includes the last `overlap` tokens from the\n * previous chunk.\n */\nexport function chunkMarkdown(content: string, opts?: ChunkOptions): Chunk[] {\n const maxTokens = opts?.maxTokens ?? DEFAULT_MAX_TOKENS;\n const overlapTokens = opts?.overlap ?? 
DEFAULT_OVERLAP;\n\n if (!content.trim()) return [];\n\n const rawLines = content.split(\"\\n\");\n const lines: Array<{ text: string; lineNo: number }> = rawLines.map((text, idx) => ({\n text,\n lineNo: idx + 1, // 1-indexed\n }));\n\n // Step 1: section split\n const sections = splitBySections(lines);\n\n // Step 2 & 3: further split oversized sections\n const finalBlocks: LineBlock[] = [];\n for (const section of sections) {\n if (section.tokens <= maxTokens) {\n finalBlocks.push(section);\n continue;\n }\n // Too big — split by paragraphs\n const paras = splitByParagraphs(section);\n for (const para of paras) {\n if (para.tokens <= maxTokens) {\n finalBlocks.push(para);\n continue;\n }\n // Still too big — split by sentences\n const sentences = splitBySentences(para, maxTokens);\n finalBlocks.push(...sentences);\n }\n }\n\n // Step 4: build final chunks with overlap\n const chunks: Chunk[] = [];\n\n for (const block of finalBlocks) {\n if (block.lines.length === 0) continue;\n\n // Build overlap prefix from previous chunks\n const overlapLines = buildOverlapPrefix(chunks, overlapTokens);\n\n // Combine overlap + block lines\n const allLines = [...overlapLines, ...block.lines];\n const text = allLines.map((l) => l.text).join(\"\\n\").trim();\n\n if (!text) continue;\n\n const startLine = block.lines[0]?.lineNo ?? 1;\n const endLine = block.lines[block.lines.length - 1]?.lineNo ?? 
startLine;\n\n chunks.push({\n text,\n startLine,\n endLine,\n hash: sha256(text),\n });\n }\n\n return chunks;\n}\n"],"mappings":";;;;;;;;;;AAwBA,MAAM,qBAAqB;AAC3B,MAAM,kBAAkB;;;;;AAMxB,SAAgB,eAAe,MAAsB;CACnD,MAAM,YAAY,KAAK,MAAM,MAAM,CAAC,OAAO,QAAQ,CAAC;AACpD,QAAO,KAAK,KAAK,YAAY,IAAI;;;;;AAMnC,SAAS,OAAO,MAAsB;AACpC,QAAO,WAAW,SAAS,CAAC,OAAO,KAAK,CAAC,OAAO,MAAM;;;;;;AAmBxD,SAAS,gBACP,OACa;CACb,MAAM,WAAwB,EAAE;CAChC,IAAI,UAAmD,EAAE;AAEzD,MAAK,MAAM,QAAQ,OAAO;AAExB,MADkB,YAAY,KAAK,KAAK,KAAK,IAC5B,QAAQ,SAAS,GAAG;GACnC,MAAM,OAAO,QAAQ,KAAK,MAAM,EAAE,KAAK,CAAC,KAAK,KAAK;AAClD,YAAS,KAAK;IAAE,OAAO;IAAS,QAAQ,eAAe,KAAK;IAAE,CAAC;AAC/D,aAAU,EAAE;;AAEd,UAAQ,KAAK,KAAK;;AAGpB,KAAI,QAAQ,SAAS,GAAG;EACtB,MAAM,OAAO,QAAQ,KAAK,MAAM,EAAE,KAAK,CAAC,KAAK,KAAK;AAClD,WAAS,KAAK;GAAE,OAAO;GAAS,QAAQ,eAAe,KAAK;GAAE,CAAC;;AAGjE,QAAO;;;;;AAMT,SAAS,kBAAkB,OAA+B;CACxD,MAAM,aAA0B,EAAE;CAClC,IAAI,UAAmD,EAAE;AAEzD,MAAK,MAAM,QAAQ,MAAM,MACvB,KAAI,KAAK,KAAK,MAAM,KAAK,MAAM,QAAQ,SAAS,GAAG;EAEjD,MAAM,OAAO,QAAQ,KAAK,MAAM,EAAE,KAAK,CAAC,KAAK,KAAK;AAClD,aAAW,KAAK;GAAE,OAAO,CAAC,GAAG,QAAQ;GAAE,QAAQ,eAAe,KAAK;GAAE,CAAC;AACtE,YAAU,EAAE;OAEZ,SAAQ,KAAK,KAAK;AAItB,KAAI,QAAQ,SAAS,GAAG;EACtB,MAAM,OAAO,QAAQ,KAAK,MAAM,EAAE,KAAK,CAAC,KAAK,KAAK;AAClD,aAAW,KAAK;GAAE,OAAO;GAAS,QAAQ,eAAe,KAAK;GAAE,CAAC;;AAGnE,QAAO,WAAW,SAAS,IAAI,aAAa,CAAC,MAAM;;;;;;AAOrD,SAAS,iBAAiB,OAAkB,WAAgC;CAI1E,MAAM,YAHW,MAAM,MAAM,KAAK,MAAM,EAAE,KAAK,CAAC,KAAK,IAAI,CAG9B,MADR,4BACyB;CAE5C,MAAM,SAAsB,EAAE;CAC9B,IAAI,UAAU;CAGd,MAAM,YAAY,MAAM,MAAM,IAAI,UAAU;CAC5C,MAAM,UAAU,MAAM,MAAM,MAAM,MAAM,SAAS,IAAI,UAAU;CAC/D,MAAM,aAAa,UAAU,YAAY;CACzC,MAAM,mBAAmB,KAAK,IAAI,GAAG,KAAK,MAAM,aAAa,KAAK,IAAI,GAAG,UAAU,OAAO,CAAC,CAAC;CAE5F,IAAI,cAAc;CAClB,IAAI,aAAa;CAEjB,MAAM,cAAc;AAClB,MAAI,CAAC,QAAQ,MAAM,CAAE;EACrB,MAAM,YAAY,KAAK,IAAI,aAAa,mBAAmB,GAAG,QAAQ;AACtE,SAAO,KAAK;GACV,OAAO,CAAC;IAAE,MAAM,QAAQ,MAAM;IAAE,QAAQ;IAAY,CAAC;GACrD,QAAQ,eAAe,QAAQ;GAChC,CAAC;AACF,eAAa,YAAY;AACzB,YAAU;;AAGZ,MAAK,MAAM,YAAY,WAAW;AAChC;EACA,MAAM,gBAAgB,UAAU,UAAU,MAAM,WAAW;AAC3D,MAAI,eAAe,cAAc,GAA
G,aAAa,SAAS;AACxD,UAAO;AACP,aAAU;QAEV,WAAU;;AAId,QAAO;AAEP,QAAO,OAAO,SAAS,IAAI,SAAS,CAAC,MAAM;;;;;;AAW7C,SAAS,mBACP,QACA,eACyC;AACzC,KAAI,iBAAiB,KAAK,OAAO,WAAW,EAAG,QAAO,EAAE;CAExD,MAAM,YAAY,OAAO,OAAO,SAAS;AACzC,KAAI,CAAC,UAAW,QAAO,EAAE;CAEzB,MAAM,QAAQ,UAAU,KAAK,MAAM,KAAK;CACxC,MAAM,OAAiB,EAAE;CACzB,IAAI,MAAM;AAEV,MAAK,IAAI,IAAI,MAAM,SAAS,GAAG,KAAK,GAAG,KAAK;EAC1C,MAAM,aAAa,eAAe,MAAM,MAAM,GAAG;AACjD,SAAO;AACP,OAAK,QAAQ,MAAM,MAAM,GAAG;AAC5B,MAAI,OAAO,cAAe;;CAI5B,MAAM,YAAY,UAAU,UAAU,KAAK,SAAS;AACpD,QAAO,KAAK,KAAK,MAAM,SAAS;EAAE;EAAM,QAAQ,KAAK,IAAI,UAAU,WAAW,YAAY,IAAI;EAAE,EAAE;;;;;;;;;;;;AAiBpG,SAAgB,cAAc,SAAiB,MAA8B;CAC3E,MAAM,YAAY,MAAM,aAAa;CACrC,MAAM,gBAAgB,MAAM,WAAW;AAEvC,KAAI,CAAC,QAAQ,MAAM,CAAE,QAAO,EAAE;CAS9B,MAAM,WAAW,gBAPA,QAAQ,MAAM,KAAK,CAC4B,KAAK,MAAM,SAAS;EAClF;EACA,QAAQ,MAAM;EACf,EAAE,CAGoC;CAGvC,MAAM,cAA2B,EAAE;AACnC,MAAK,MAAM,WAAW,UAAU;AAC9B,MAAI,QAAQ,UAAU,WAAW;AAC/B,eAAY,KAAK,QAAQ;AACzB;;EAGF,MAAM,QAAQ,kBAAkB,QAAQ;AACxC,OAAK,MAAM,QAAQ,OAAO;AACxB,OAAI,KAAK,UAAU,WAAW;AAC5B,gBAAY,KAAK,KAAK;AACtB;;GAGF,MAAM,YAAY,iBAAiB,MAAM,UAAU;AACnD,eAAY,KAAK,GAAG,UAAU;;;CAKlC,MAAM,SAAkB,EAAE;AAE1B,MAAK,MAAM,SAAS,aAAa;AAC/B,MAAI,MAAM,MAAM,WAAW,EAAG;EAO9B,MAAM,OADW,CAAC,GAHG,mBAAmB,QAAQ,cAAc,EAG3B,GAAG,MAAM,MAAM,CAC5B,KAAK,MAAM,EAAE,KAAK,CAAC,KAAK,KAAK,CAAC,MAAM;AAE1D,MAAI,CAAC,KAAM;EAEX,MAAM,YAAY,MAAM,MAAM,IAAI,UAAU;EAC5C,MAAM,UAAU,MAAM,MAAM,MAAM,MAAM,SAAS,IAAI,UAAU;AAE/D,SAAO,KAAK;GACV;GACA;GACA;GACA,MAAM,OAAO,KAAK;GACnB,CAAC;;AAGJ,QAAO"}