@tekmidian/pai 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/ARCHITECTURE.md +148 -6
  2. package/FEATURE.md +11 -0
  3. package/README.md +79 -0
  4. package/dist/{auto-route-D7W6RE06.mjs → auto-route-JjW3f7pV.mjs} +4 -4
  5. package/dist/{auto-route-D7W6RE06.mjs.map → auto-route-JjW3f7pV.mjs.map} +1 -1
  6. package/dist/chunker-CbnBe0s0.mjs +191 -0
  7. package/dist/chunker-CbnBe0s0.mjs.map +1 -0
  8. package/dist/cli/index.mjs +835 -40
  9. package/dist/cli/index.mjs.map +1 -1
  10. package/dist/{config-DBh1bYM2.mjs → config-DELNqq3Z.mjs} +4 -2
  11. package/dist/{config-DBh1bYM2.mjs.map → config-DELNqq3Z.mjs.map} +1 -1
  12. package/dist/daemon/index.mjs +9 -9
  13. package/dist/{daemon-v5O897D4.mjs → daemon-CeTX4NpF.mjs} +94 -13
  14. package/dist/daemon-CeTX4NpF.mjs.map +1 -0
  15. package/dist/daemon-mcp/index.mjs +3 -3
  16. package/dist/db-Dp8VXIMR.mjs +212 -0
  17. package/dist/db-Dp8VXIMR.mjs.map +1 -0
  18. package/dist/{detect-BHqYcjJ1.mjs → detect-D7gPV3fQ.mjs} +1 -1
  19. package/dist/{detect-BHqYcjJ1.mjs.map → detect-D7gPV3fQ.mjs.map} +1 -1
  20. package/dist/{detector-DKA83aTZ.mjs → detector-cYYhK2Mi.mjs} +2 -2
  21. package/dist/{detector-DKA83aTZ.mjs.map → detector-cYYhK2Mi.mjs.map} +1 -1
  22. package/dist/{embeddings-mfqv-jFu.mjs → embeddings-DGRAPAYb.mjs} +2 -2
  23. package/dist/{embeddings-mfqv-jFu.mjs.map → embeddings-DGRAPAYb.mjs.map} +1 -1
  24. package/dist/{factory-BDAiKtYR.mjs → factory-DZLvRf4m.mjs} +4 -4
  25. package/dist/{factory-BDAiKtYR.mjs.map → factory-DZLvRf4m.mjs.map} +1 -1
  26. package/dist/index.d.mts +1 -1
  27. package/dist/index.d.mts.map +1 -1
  28. package/dist/index.mjs +9 -7
  29. package/dist/{indexer-B20bPHL-.mjs → indexer-CKQcgKsz.mjs} +4 -190
  30. package/dist/indexer-CKQcgKsz.mjs.map +1 -0
  31. package/dist/{indexer-backend-BXaocO5r.mjs → indexer-backend-BHztlJJg.mjs} +4 -3
  32. package/dist/{indexer-backend-BXaocO5r.mjs.map → indexer-backend-BHztlJJg.mjs.map} +1 -1
  33. package/dist/{ipc-client-DPy7s3iu.mjs → ipc-client-CLt2fNlC.mjs} +1 -1
  34. package/dist/ipc-client-CLt2fNlC.mjs.map +1 -0
  35. package/dist/mcp/index.mjs +118 -5
  36. package/dist/mcp/index.mjs.map +1 -1
  37. package/dist/{migrate-Bwj7qPaE.mjs → migrate-jokLenje.mjs} +8 -1
  38. package/dist/migrate-jokLenje.mjs.map +1 -0
  39. package/dist/{pai-marker-DX_mFLum.mjs → pai-marker-CXQPX2P6.mjs} +1 -1
  40. package/dist/{pai-marker-DX_mFLum.mjs.map → pai-marker-CXQPX2P6.mjs.map} +1 -1
  41. package/dist/{postgres-Ccvpc6fC.mjs → postgres-CRBe30Ag.mjs} +1 -1
  42. package/dist/{postgres-Ccvpc6fC.mjs.map → postgres-CRBe30Ag.mjs.map} +1 -1
  43. package/dist/{schemas-DjdwzIQ8.mjs → schemas-BY3Pjvje.mjs} +1 -1
  44. package/dist/{schemas-DjdwzIQ8.mjs.map → schemas-BY3Pjvje.mjs.map} +1 -1
  45. package/dist/{search-PjftDxxs.mjs → search-GK0ibTJy.mjs} +2 -2
  46. package/dist/{search-PjftDxxs.mjs.map → search-GK0ibTJy.mjs.map} +1 -1
  47. package/dist/{sqlite-CHUrNtbI.mjs → sqlite-RyR8Up1v.mjs} +3 -3
  48. package/dist/{sqlite-CHUrNtbI.mjs.map → sqlite-RyR8Up1v.mjs.map} +1 -1
  49. package/dist/{tools-CLK4080-.mjs → tools-CUg0Lyg-.mjs} +175 -11
  50. package/dist/{tools-CLK4080-.mjs.map → tools-CUg0Lyg-.mjs.map} +1 -1
  51. package/dist/{utils-DEWdIFQ0.mjs → utils-QSfKagcj.mjs} +62 -2
  52. package/dist/utils-QSfKagcj.mjs.map +1 -0
  53. package/dist/vault-indexer-Bo2aPSzP.mjs +499 -0
  54. package/dist/vault-indexer-Bo2aPSzP.mjs.map +1 -0
  55. package/dist/zettelkasten-Co-w0XSZ.mjs +901 -0
  56. package/dist/zettelkasten-Co-w0XSZ.mjs.map +1 -0
  57. package/package.json +2 -1
  58. package/src/hooks/README.md +99 -0
  59. package/src/hooks/hooks.md +13 -0
  60. package/src/hooks/pre-compact.sh +95 -0
  61. package/src/hooks/session-stop.sh +93 -0
  62. package/statusline-command.sh +9 -4
  63. package/templates/README.md +7 -0
  64. package/templates/agent-prefs.example.md +7 -0
  65. package/templates/claude-md.template.md +7 -0
  66. package/templates/pai-project.template.md +4 -6
  67. package/templates/pai-skill.template.md +295 -0
  68. package/templates/templates.md +20 -0
  69. package/dist/daemon-v5O897D4.mjs.map +0 -1
  70. package/dist/db-BcDxXVBu.mjs +0 -110
  71. package/dist/db-BcDxXVBu.mjs.map +0 -1
  72. package/dist/indexer-B20bPHL-.mjs.map +0 -1
  73. package/dist/ipc-client-DPy7s3iu.mjs.map +0 -1
  74. package/dist/migrate-Bwj7qPaE.mjs.map +0 -1
  75. package/dist/utils-DEWdIFQ0.mjs.map +0 -1
package/ARCHITECTURE.md CHANGED
@@ -1,8 +1,12 @@
1
+ ---
2
+ links: "[[Ideaverse/AI/PAI/PAI|PAI]]"
3
+ ---
4
+
1
5
  # PAI Knowledge OS — Architecture
2
6
 
3
7
  Technical reference for PAI's architecture, database schema, CLI commands, and development setup.
4
8
 
5
- For user-facing documentation, see [README.md](README.md) and [MANUAL.md](MANUAL.md).
9
+ For user-facing documentation, see [README.md](Ideaverse/AI/PAI/README.md) and [MANUAL.md](MANUAL.md).
6
10
 
7
11
  ---
8
12
 
@@ -28,7 +32,7 @@ Claude Code Session
28
32
 
29
33
  └── CLI (pai)
30
34
  project, session, registry, memory,
31
- daemon, obsidian, backup, restore, setup
35
+ daemon, obsidian, zettel, backup, restore, setup
32
36
  ```
33
37
 
34
38
  ### Key Components
@@ -37,10 +41,17 @@ Claude Code Session
37
41
 
38
42
  **Storage** — Two databases serve different roles:
39
43
 
40
- - **PostgreSQL + pgvector** (`pai` database, Docker): Stores text chunks, vector embeddings (768-dim, Snowflake Arctic), and file metadata. HNSW indexes for fast approximate nearest-neighbor search. GIN indexes for full-text search.
41
- - **SQLite registry** (`~/.pai/registry.db`): Lightweight metadata store for projects, sessions, tags, aliases, and cross-references.
44
+ | Layer | Backend | Location | Purpose |
45
+ |-------|---------|----------|---------|
46
+ | **Registry** | SQLite (always) | `~/.pai/registry.db` | Projects, sessions, tags, aliases, links. Single-writer is fine — only the CLI and daemon write. Uses `better-sqlite3`. |
47
+ | **Memory / Embeddings** | Factory-switchable | PostgreSQL (full) or SQLite (simple) | Text chunks, vector embeddings, file metadata. Chosen at setup time via `~/.config/pai/config.json`. |
48
+
49
+ - **Simple mode (SQLite)**: Zero dependencies. Keyword search (BM25 via FTS5) works immediately. No Docker needed. Best for trying PAI or smaller setups.
50
+ - **Full mode (PostgreSQL + pgvector)**: Semantic search via HNSW vector indexes (768-dim, Snowflake Arctic). GIN indexes for full-text search. Runs in Docker (`pai-pgvector` container, `restart: unless-stopped`). Best for large knowledge bases (100K+ documents).
42
51
 
43
- **Embeddings** Snowflake Arctic Embed produces 768-dimensional embeddings. The daemon generates embeddings asynchronously in the background after initial text indexing, so keyword search is available immediately and semantic search follows within minutes.
52
+ The storage backend is selected during `pai setup` and configured in `~/.config/pai/config.json` (`storageBackend: "sqlite"` or `"postgres"`). The factory pattern (`src/storage/factory.ts`) instantiates the correct backend at runtime. Both backends implement the same `StorageInterface` (`src/storage/interface.ts`), so all higher-level code (indexer, search, MCP tools) is backend-agnostic.
53
+
54
+ **Embeddings** — Snowflake Arctic Embed produces 768-dimensional embeddings (PostgreSQL mode only). The daemon generates embeddings asynchronously in the background after initial text indexing, so keyword search is available immediately and semantic search follows within minutes. The embedding process runs at reduced CPU priority (`setPriority(pid, 10)`).
44
55
 
45
56
  ---
46
57
 
@@ -154,6 +165,12 @@ Claude Code (stdio)
154
165
  | `session_list` | List session notes, optionally filtered by project |
155
166
  | `registry_search` | Search project metadata (names, paths, tags) |
156
167
  | `project_detect` | Identify which project a given path belongs to |
168
+ | `zettel_explore` | BFS traversal of wikilink graph from a seed note |
169
+ | `zettel_surprise` | Find semantically close but graph-distant notes |
170
+ | `zettel_converse` | Hybrid search with graph expansion and cross-domain connections |
171
+ | `zettel_themes` | Cluster vault notes into thematic groups by embedding similarity |
172
+ | `zettel_health` | Audit vault for broken links, orphans, and isolated clusters |
173
+ | `zettel_suggest` | Suggest link targets weighted by semantics, tags, and graph neighborhood |
157
174
 
158
175
  ### Tool Reference
159
176
 
@@ -171,6 +188,18 @@ Claude Code (stdio)
171
188
 
172
189
  **`project_detect(path?)`** — Given a filesystem path (defaults to CWD), returns the matching project.
173
190
 
191
+ **`zettel_explore(note, depth?, direction?)`** — BFS walk from a seed note across `vault_links`. Returns a subgraph of neighboring notes with each edge classified as `sequential` or `associative`. `direction`: `outbound` (default), `inbound`, or `both`.
192
+
193
+ **`zettel_surprise(note, limit?)`** — Returns notes that are semantically similar to `note` but far from it in the link graph. Scored as `cosine_similarity × log2(graph_distance + 1)`. Useful for lateral discovery.
194
+
195
+ **`zettel_converse(query, limit?)`** — Runs a hybrid memory search, expands the result set via graph neighborhood, then surfaces cross-domain connections — notes from unrelated clusters that are semantically close to the query.
196
+
197
+ **`zettel_themes(min_cluster_size?)`** — Clusters all vault embeddings using agglomerative single-linkage clustering. Returns thematic groups with representative note titles and cluster size.
198
+
199
+ **`zettel_health()`** — Full structural audit of the vault. Reports broken links (target not in `vault_files`), orphaned notes (no inbound or outbound edges), notes missing embeddings, and isolated clusters detected via union-find.
200
+
201
+ **`zettel_suggest(note, limit?)`** — Ranks candidate link targets for a given note. Score is a weighted sum: semantic embedding similarity (0.5), shared tags (0.2), graph neighborhood overlap with existing links (0.3).
202
+
174
203
  ### Installation
175
204
 
176
205
  ```bash
@@ -332,6 +361,26 @@ pai obsidian sync
332
361
  pai obsidian status
333
362
  ```
334
363
 
364
+ ### Zettelkasten
365
+
366
+ | Subcommand | Description |
367
+ |------------|-------------|
368
+ | `zettel explore <note>` | BFS traversal of wikilink graph from a seed note |
369
+ | `zettel surprise <note>` | Find semantically close but graph-distant notes |
370
+ | `zettel converse <query>` | Hybrid search with graph expansion and cross-domain connections |
371
+ | `zettel themes` | Cluster vault notes into thematic groups |
372
+ | `zettel health` | Audit vault for broken links, orphans, and isolated clusters |
373
+ | `zettel suggest <note>` | Suggest link targets weighted by semantics, tags, and graph neighborhood |
374
+
375
+ ```bash
376
+ pai zettel explore "My Seed Note" --depth 3 --direction both
377
+ pai zettel surprise "My Seed Note" --limit 10
378
+ pai zettel converse "distributed systems tradeoffs"
379
+ pai zettel themes --min-cluster-size 3
380
+ pai zettel health
381
+ pai zettel suggest "My Seed Note" --limit 5
382
+ ```
383
+
335
384
  ### Other Commands
336
385
 
337
386
  ```bash
@@ -412,6 +461,37 @@ PAI can expose your project memory as an Obsidian vault. The vault contains no a
412
461
 
413
462
  ---
414
463
 
464
+ ## Zettelkasten Intelligence
465
+
466
+ PAI implements six Luhmann-inspired operations on the vault's dual representation: a wikilink graph stored in `vault_links` and semantic embeddings stored alongside the vault file records. Together these two layers enable graph-based navigation, serendipitous discovery, and structural health analysis.
467
+
468
+ ### Operations
469
+
470
+ | Operation | Module | Algorithm |
471
+ |-----------|--------|-----------|
472
+ | Explore | `src/zettelkasten/explore.ts` | BFS on vault_links, classifies sequential vs associative edges |
473
+ | Surprise | `src/zettelkasten/surprise.ts` | Cosine similarity × log2(graph_distance + 1) |
474
+ | Converse | `src/zettelkasten/converse.ts` | Hybrid search → graph expansion → cross-domain connections |
475
+ | Themes | `src/zettelkasten/themes.ts` | Agglomerative single-linkage clustering of embeddings |
476
+ | Health | `src/zettelkasten/health.ts` | SQL-driven audit with union-find for cluster detection |
477
+ | Suggest | `src/zettelkasten/suggest.ts` | Weighted: semantic (0.5) + tags (0.2) + graph neighborhood (0.3) |
478
+
479
+ ### Design Notes
480
+
481
+ **Explore** performs a BFS walk from a seed note across `vault_links`. Each edge is classified as sequential (the linked note shares a common tag or is a direct sequence continuation) or associative (a lateral connection between different topics). The result is a subgraph that exposes the local neighborhood of a note.
482
+
483
+ **Surprise** finds notes that are semantically close to a seed note in embedding space but distant in the link graph — the "surprising bridge" pattern Luhmann valued. The score `cosine_similarity × log2(graph_distance + 1)` rewards notes that are conceptually related yet structurally far apart.
484
+
485
+ **Converse** treats the vault as a conversation partner. It runs a hybrid memory search, expands results via the graph to pull in neighboring notes, then identifies cross-domain connections — notes from unrelated topic clusters that share embedding proximity with the query.
486
+
487
+ **Themes** clusters vault embeddings using agglomerative single-linkage clustering. The output is a flat list of thematic groups with representative note titles. Useful for detecting topic drift, finding redundancy, or building a high-level map of the vault.
488
+
489
+ **Health** runs a SQL-driven structural audit: broken links, orphaned notes (no inbound or outbound links), notes with no embedding, and isolated clusters detected via union-find on the `vault_links` graph.
490
+
491
+ **Suggest** ranks candidate link targets for a given note using a weighted sum of three signals: semantic similarity of embeddings (weight 0.5), shared tags (weight 0.2), and presence in the graph neighborhood of already-linked notes (weight 0.3).
492
+
493
+ ---
494
+
415
495
  ## Templates
416
496
 
417
497
  PAI ships three templates used during setup and customizable for your workflow.
@@ -484,6 +564,54 @@ Copy to `~/.config/pai/voices.json` and configure your preferred backend.
484
564
 
485
565
  **Indexes:** HNSW on embedding (cosine), GIN on text (tsvector), B-tree on project_id/path.
486
566
 
567
+ ### Vault Tables (v3 — PostgreSQL)
568
+
569
+ These tables are populated by `src/memory/vault-indexer.ts` and queried by all six zettelkasten operations.
570
+
571
+ **`vault_files`** — One row per Obsidian note:
572
+
573
+ | Column | Type | Description |
574
+ |--------|------|-------------|
575
+ | `id` | SERIAL | Surrogate key |
576
+ | `vault_path` | TEXT | Path relative to vault root |
577
+ | `title` | TEXT | Note title (H1 or filename) |
578
+ | `tags` | TEXT[] | Frontmatter tags |
579
+ | `embedding` | vector(768) | Snowflake Arctic embedding |
580
+ | `mtime` | BIGINT | Modification time |
581
+ | `hash` | TEXT | SHA-256 of file content |
582
+
583
+ **`vault_aliases`** — Obsidian alias metadata:
584
+
585
+ | Column | Type | Description |
586
+ |--------|------|-------------|
587
+ | `file_id` | INTEGER | FK → vault_files.id |
588
+ | `alias` | TEXT | Alias string from frontmatter |
589
+
590
+ **`vault_links`** — Directed wikilink edges:
591
+
592
+ | Column | Type | Description |
593
+ |--------|------|-------------|
594
+ | `source_id` | INTEGER | FK → vault_files.id (linking note) |
595
+ | `target_id` | INTEGER | FK → vault_files.id (linked note) |
596
+ | `link_text` | TEXT | Display text of the link |
597
+ | `link_type` | TEXT | `sequential` or `associative` |
598
+
599
+ **`vault_name_index`** — Reverse lookup for wikilink resolution:
600
+
601
+ | Column | Type | Description |
602
+ |--------|------|-------------|
603
+ | `name` | TEXT | Lowercased title or alias |
604
+ | `file_id` | INTEGER | FK → vault_files.id |
605
+
606
+ **`vault_health`** — Cached audit results from the Health operation:
607
+
608
+ | Column | Type | Description |
609
+ |--------|------|-------------|
610
+ | `file_id` | INTEGER | FK → vault_files.id |
611
+ | `issue_type` | TEXT | `broken_link`, `orphan`, `no_embedding`, `isolated_cluster` |
612
+ | `detail` | TEXT | Human-readable description |
613
+ | `checked_at` | BIGINT | Timestamp of the audit run |
614
+
487
615
  **Content Tiers:**
488
616
 
489
617
  | Tier | Description | Example |
@@ -541,16 +669,27 @@ bun run lint # tsc --noEmit
541
669
  ```
542
670
  src/
543
671
  ├── cli/commands/ # CLI command implementations
672
+ │ └── zettel.ts # `pai zettel` with 6 subcommands
544
673
  ├── daemon/ # Daemon server and index scheduler
545
674
  ├── daemon-mcp/ # MCP shim (stdio → daemon socket)
546
675
  ├── federation/ # Federation schema definitions
547
676
  ├── hooks/ # Lifecycle hooks (pre-compact, session-stop)
548
677
  ├── mcp/ # Direct MCP server (legacy)
549
678
  ├── memory/ # Indexer, chunker, embeddings, search
679
+ │ └── vault-indexer.ts # Obsidian vault indexing into v3 vault tables
550
680
  ├── obsidian/ # Obsidian vault bridge
681
+ │ └── vault-fixer.ts # Repairs broken wikilinks and orphaned entries
551
682
  ├── registry/ # Registry migrations and queries
552
683
  ├── session/ # Session slug generator
553
- └── storage/ # Storage backend interface (SQLite/Postgres)
684
+ ├── storage/ # Storage backend interface (SQLite/Postgres)
685
+ └── zettelkasten/ # Luhmann-inspired graph + semantic operations
686
+ ├── explore.ts # BFS traversal classifying sequential/associative edges
687
+ ├── surprise.ts # Serendipitous bridge discovery via cosine × graph distance
688
+ ├── converse.ts # Hybrid search → graph expansion → cross-domain connections
689
+ ├── themes.ts # Agglomerative embedding clustering for thematic groups
690
+ ├── health.ts # SQL-driven vault audit with union-find cluster detection
691
+ ├── suggest.ts # Weighted link suggestions (semantic + tags + graph)
692
+ └── index.ts # Barrel export for all zettelkasten operations
554
693
  ```
555
694
 
556
695
  ### Important Notes
@@ -565,3 +704,6 @@ src/
565
704
  ## License
566
705
 
567
706
  MIT
707
+
708
+ ---
709
+ *Links:* [[Ideaverse/AI/PAI/PAI|PAI]]
package/FEATURE.md CHANGED
@@ -1,3 +1,7 @@
1
+ ---
2
+ links: "[[Ideaverse/AI/PAI/PAI|PAI]]"
3
+ ---
4
+
1
5
  # PAI Feature Comparison
2
6
 
3
7
  ## Credit
@@ -20,6 +24,10 @@ different direction: persistent memory, session continuity, and deep Claude Code
20
24
  | **Primary interface** | CLI pipe (`echo "..." \| fabric -p pattern`) | MCP server + CLI (`pai`) |
21
25
  | **Prompt templates** | Yes — 200+ community "patterns" | No (out of scope) |
22
26
  | **YouTube transcript extraction** | Yes (built-in) | Yes — via [Scribe MCP](https://github.com/mnott/Scribe) |
27
+ | **WhatsApp integration** | No | Yes — via [Whazaa MCP](https://github.com/mnott/Whazaa) |
28
+ | **Google Workspace integration** | No | Yes — via [Coogle MCP](https://github.com/mnott/Coogle) |
29
+ | **DEVONthink integration** | No | Yes — via [devonthink-mcp](https://github.com/mnott/Devon) |
30
+ | **Hookmark integration** | No | Yes — via [Hook MCP](https://github.com/mnott/Hook) |
23
31
  | **LLM pipe-through workflow** | Yes — core feature | No |
24
32
  | **Persistent session memory** | No | Yes — auto-indexed, 449K+ chunks |
25
33
  | **Session registry** | No | Yes — SQLite, tracks 77+ projects |
@@ -109,3 +117,6 @@ Code to remember everything across sessions, use this.
109
117
 
110
118
  They're not mutually exclusive. Fabric handles one-shot prompt workflows. PAI Knowledge OS
111
119
  handles persistent memory for Claude Code. Many people will want both.
120
+
121
+ ---
122
+ *Links:* [[Ideaverse/AI/PAI/PAI|PAI]]
package/README.md CHANGED
@@ -1,3 +1,7 @@
1
+ ---
2
+ links: "[[Ideaverse/AI/PAI/PAI|PAI]]"
3
+ ---
4
+
1
5
  # PAI Knowledge OS
2
6
 
3
7
  Claude Code has a memory problem. Every new session starts cold — no idea what you built yesterday, what decisions you made, or where you left off. You re-explain everything, every time. PAI fixes this.
@@ -39,6 +43,15 @@ Install PAI and Claude remembers. Ask it what you were working on. Ask it to fin
39
43
  - "Sync my Obsidian vault" — updates your linked vault with the latest notes
40
44
  - "Open my notes in Obsidian" — launches Obsidian with your full knowledge graph
41
45
 
46
+ ### Zettelkasten Intelligence
47
+
48
+ - "Explore notes linked to PAI" — follow trains of thought through wikilink chains
49
+ - "Find surprising connections to this note" — discover semantically similar but graph-distant notes
50
+ - "What themes are emerging in my vault?" — detect clusters of related notes forming new ideas
51
+ - "How healthy is my vault?" — structural audit: dead links, orphans, disconnected clusters
52
+ - "Suggest connections for this note" — proactive link suggestions using semantic + graph signals
53
+ - "What does my vault say about knowledge management?" — use the vault as a thinking partner
54
+
42
55
  ---
43
56
 
44
57
  ## Quick Start
@@ -51,6 +64,40 @@ Claude finds the setup skill, checks your system, runs the interactive wizard, a
51
64
 
52
65
  ---
53
66
 
67
+ ## Auto-Compact Context Window
68
+
69
+ Claude Code can automatically compact your context window when it fills up, preventing session interruptions mid-task. PAI's statusline shows you at a glance whether auto-compact is active.
70
+
71
+ ### Why the GUI setting doesn't work
72
+
73
+ Claude Code has an `autoCompactEnabled` setting in `~/.claude.json`, but it gets overwritten on every restart. Do not use it — changes don't survive.
74
+
75
+ ### The durable approach: environment variable
76
+
77
+ Set `CLAUDE_AUTOCOMPACT_PCT_OVERRIDE` in your `~/.claude/settings.json` under the `env` block. This survives restarts, `/clear`, and Claude Code updates.
78
+
79
+ ```json
80
+ {
81
+ "env": {
82
+ "CLAUDE_AUTOCOMPACT_PCT_OVERRIDE": "80"
83
+ }
84
+ }
85
+ ```
86
+
87
+ The value is the context percentage at which compaction triggers. `80` means compact when the context window reaches 80% full. Restart Claude Code after saving.
88
+
89
+ ### Statusline indicator
90
+
91
+ Once set, PAI's statusline shows `[auto-compact:80%]` next to the context meter on line 3, so you always know auto-compact is active and at what threshold.
92
+
93
+ ### Set it up with one prompt
94
+
95
+ Give Claude Code this prompt and it handles everything:
96
+
97
+ > Add `CLAUDE_AUTOCOMPACT_PCT_OVERRIDE` set to `80` to the `env` block in `~/.claude/settings.json`. This enables durable auto-compact that survives restarts. Do not touch `~/.claude.json` — that file gets overwritten on startup. After saving, confirm the setting is in place and tell me to restart Claude Code.
98
+
99
+ ---
100
+
54
101
  ## Storage Options
55
102
 
56
103
  PAI offers two modes, and the setup wizard asks which you prefer.
@@ -80,6 +127,35 @@ For the technical deep-dive — architecture, database schema, CLI reference, an
80
127
 
81
128
  ---
82
129
 
130
+ ## Zettelkasten Intelligence
131
+
132
+ PAI implements Niklas Luhmann's Zettelkasten principles as six computational operations on your Obsidian vault.
133
+
134
+ ### How it works
135
+
136
+ PAI indexes your entire vault — following symlinks, deduplicating by inode, parsing every wikilink — and builds a graph database alongside semantic embeddings. Six tools then operate on this dual representation:
137
+
138
+ | Tool | What it does |
139
+ |------|-------------|
140
+ | `pai zettel explore` | Follow trains of thought through link chains (Folgezettel traversal) |
141
+ | `pai zettel surprise` | Find notes that are semantically close but far apart in the link graph |
142
+ | `pai zettel converse` | Ask questions and let the vault "talk back" with unexpected connections |
143
+ | `pai zettel themes` | Detect emerging clusters of related notes across folders |
144
+ | `pai zettel health` | Structural audit — dead links, orphans, disconnected clusters, health score |
145
+ | `pai zettel suggest` | Proactive connection suggestions combining semantic similarity, tags, and graph proximity |
146
+
147
+ All tools work as CLI commands (`pai zettel <command>`) and MCP tools (`zettel_*`) accessible through the daemon.
148
+
149
+ ### Vault Indexing
150
+
151
+ The vault indexer follows symlinks (critical for vaults built on symlinks), deduplicates files by inode to handle multiple paths to the same file, and builds a complete wikilink graph with Obsidian-compatible shortest-match resolution.
152
+
153
+ - Full index: ~10 seconds for ~1,000 files
154
+ - Incremental: ~2 seconds (hash-based change detection)
155
+ - Runs automatically via the daemon scheduler
156
+
157
+ ---
158
+
83
159
  ## Companion Projects
84
160
 
85
161
  PAI works great alongside these tools (also by the same author):
@@ -99,3 +175,6 @@ PAI Knowledge OS is inspired by [Daniel Miessler](https://github.com/danielmiess
99
175
  ## License
100
176
 
101
177
  MIT
178
+
179
+ ---
180
+ *Links:* [[Ideaverse/AI/PAI/PAI|PAI]]
@@ -1,5 +1,5 @@
1
- import { r as readPaiMarker } from "./pai-marker-DX_mFLum.mjs";
2
- import { t as detectProject } from "./detect-BHqYcjJ1.mjs";
1
+ import { r as readPaiMarker } from "./pai-marker-CXQPX2P6.mjs";
2
+ import { t as detectProject } from "./detect-D7gPV3fQ.mjs";
3
3
  import { existsSync } from "node:fs";
4
4
  import { dirname, resolve } from "node:path";
5
5
 
@@ -26,7 +26,7 @@ async function autoRoute(registryDb, federation, cwd, context) {
26
26
  const markerResult = findMarkerUpward(registryDb, target);
27
27
  if (markerResult) return markerResult;
28
28
  if (context && context.trim().length > 0) {
29
- const { detectTopicShift } = await import("./detector-DKA83aTZ.mjs").then((n) => n.n);
29
+ const { detectTopicShift } = await import("./detector-cYYhK2Mi.mjs").then((n) => n.n);
30
30
  const topicResult = await detectTopicShift(registryDb, federation, {
31
31
  context,
32
32
  threshold: .5
@@ -83,4 +83,4 @@ function formatAutoRouteJson(result) {
83
83
 
84
84
  //#endregion
85
85
  export { autoRoute, formatAutoRouteJson };
86
- //# sourceMappingURL=auto-route-D7W6RE06.mjs.map
86
+ //# sourceMappingURL=auto-route-JjW3f7pV.mjs.map
@@ -1 +1 @@
1
- {"version":3,"file":"auto-route-D7W6RE06.mjs","names":[],"sources":["../src/session/auto-route.ts"],"sourcesContent":["/**\n * Auto-route: automatic project routing suggestion on session start.\n *\n * Given a working directory (and optional conversation context), determine\n * which registered project the session belongs to.\n *\n * Strategy (in priority order):\n * 1. Path match — exact or parent-directory match in the project registry\n * 2. Marker walk — walk up from cwd looking for Notes/PAI.md, resolve slug\n * 3. Topic match — BM25 keyword search against memory (requires context text)\n *\n * The function is stateless and works with direct DB access (no daemon\n * required), making it fast and safe to call during session startup.\n */\n\nimport type { Database } from \"better-sqlite3\";\nimport type { StorageBackend } from \"../storage/interface.js\";\nimport { resolve, dirname } from \"node:path\";\nimport { existsSync } from \"node:fs\";\nimport { readPaiMarker } from \"../registry/pai-marker.js\";\nimport { detectProject } from \"../cli/commands/detect.js\";\n\n// ---------------------------------------------------------------------------\n// Types\n// ---------------------------------------------------------------------------\n\nexport type AutoRouteMethod = \"path\" | \"marker\" | \"topic\";\n\nexport interface AutoRouteResult {\n /** Project slug */\n slug: string;\n /** Human-readable project name */\n display_name: string;\n /** Absolute path to the project root */\n root_path: string;\n /** How the project was detected */\n method: AutoRouteMethod;\n /** Confidence [0,1]: 1.0 for path/marker matches, BM25 fraction for topic */\n confidence: number;\n}\n\n// ---------------------------------------------------------------------------\n// Core function\n// ---------------------------------------------------------------------------\n\n/**\n * Determine which project a session should be routed to.\n *\n * @param registryDb Open PAI registry database\n 
* @param federation Memory storage backend (needed only for topic fallback)\n * @param cwd Working directory to detect from (defaults to process.cwd())\n * @param context Optional conversation text for topic-based fallback\n * @returns Best project match, or null if nothing matched\n */\nexport async function autoRoute(\n registryDb: Database,\n federation: Database | StorageBackend,\n cwd?: string,\n context?: string\n): Promise<AutoRouteResult | null> {\n const target = resolve(cwd ?? process.cwd());\n\n // -------------------------------------------------------------------------\n // Strategy 1: Path match via registry\n // -------------------------------------------------------------------------\n\n const pathMatch = detectProject(registryDb, target);\n\n if (pathMatch) {\n return {\n slug: pathMatch.slug,\n display_name: pathMatch.display_name,\n root_path: pathMatch.root_path,\n method: \"path\",\n confidence: 1.0,\n };\n }\n\n // -------------------------------------------------------------------------\n // Strategy 2: PAI.md marker file walk\n //\n // Walk up from cwd, checking <dir>/Notes/PAI.md at each level.\n // Once found, resolve the slug against the registry to get full project info.\n // -------------------------------------------------------------------------\n\n const markerResult = findMarkerUpward(registryDb, target);\n if (markerResult) {\n return markerResult;\n }\n\n // -------------------------------------------------------------------------\n // Strategy 3: Topic detection (requires context text)\n // -------------------------------------------------------------------------\n\n if (context && context.trim().length > 0) {\n // Lazy import to avoid bundler pulling in daemon/index.mjs at module load time\n const { detectTopicShift } = await import(\"../topics/detector.js\");\n const topicResult = await detectTopicShift(registryDb, federation, {\n context,\n threshold: 0.5, // Lower threshold for initial routing (vs shift detection)\n });\n\n 
if (topicResult.suggestedProject && topicResult.confidence > 0) {\n // Look up the full project info from the registry\n const projectRow = registryDb\n .prepare(\n \"SELECT slug, display_name, root_path FROM projects WHERE slug = ? AND status != 'archived'\"\n )\n .get(topicResult.suggestedProject) as\n | { slug: string; display_name: string; root_path: string }\n | undefined;\n\n if (projectRow) {\n return {\n slug: projectRow.slug,\n display_name: projectRow.display_name,\n root_path: projectRow.root_path,\n method: \"topic\",\n confidence: topicResult.confidence,\n };\n }\n }\n }\n\n return null;\n}\n\n// ---------------------------------------------------------------------------\n// Marker walk helper\n// ---------------------------------------------------------------------------\n\n/**\n * Walk up the directory tree from `startDir`, checking each level for a\n * `Notes/PAI.md` file. If found, read the slug and look up the project.\n *\n * Stops at the filesystem root or after 20 levels (safety guard).\n */\nfunction findMarkerUpward(\n registryDb: Database,\n startDir: string\n): AutoRouteResult | null {\n let current = startDir;\n let depth = 0;\n\n while (depth < 20) {\n const markerPath = `${current}/Notes/PAI.md`;\n\n if (existsSync(markerPath)) {\n const marker = readPaiMarker(current);\n\n if (marker && marker.status !== \"archived\") {\n // Resolve slug to full project info in the registry\n const projectRow = registryDb\n .prepare(\n \"SELECT slug, display_name, root_path FROM projects WHERE slug = ? 
AND status != 'archived'\"\n )\n .get(marker.slug) as\n | { slug: string; display_name: string; root_path: string }\n | undefined;\n\n if (projectRow) {\n return {\n slug: projectRow.slug,\n display_name: projectRow.display_name,\n root_path: projectRow.root_path,\n method: \"marker\",\n confidence: 1.0,\n };\n }\n }\n }\n\n const parent = dirname(current);\n if (parent === current) break; // Reached filesystem root\n current = parent;\n depth++;\n }\n\n return null;\n}\n\n// ---------------------------------------------------------------------------\n// Format helpers\n// ---------------------------------------------------------------------------\n\n/**\n * Format an AutoRouteResult as a human-readable string for CLI output.\n */\nexport function formatAutoRoute(result: AutoRouteResult): string {\n const lines: string[] = [\n `slug: ${result.slug}`,\n `display_name: ${result.display_name}`,\n `root_path: ${result.root_path}`,\n `method: ${result.method}`,\n `confidence: ${(result.confidence * 100).toFixed(0)}%`,\n ];\n return lines.join(\"\\n\");\n}\n\n/**\n * Format an AutoRouteResult as JSON for machine consumption.\n */\nexport function formatAutoRouteJson(result: AutoRouteResult): string {\n return JSON.stringify(result, null, 
2);\n}\n"],"mappings":";;;;;;;;;;;;;;;AAsDA,eAAsB,UACpB,YACA,YACA,KACA,SACiC;CACjC,MAAM,SAAS,QAAQ,OAAO,QAAQ,KAAK,CAAC;CAM5C,MAAM,YAAY,cAAc,YAAY,OAAO;AAEnD,KAAI,UACF,QAAO;EACL,MAAM,UAAU;EAChB,cAAc,UAAU;EACxB,WAAW,UAAU;EACrB,QAAQ;EACR,YAAY;EACb;CAUH,MAAM,eAAe,iBAAiB,YAAY,OAAO;AACzD,KAAI,aACF,QAAO;AAOT,KAAI,WAAW,QAAQ,MAAM,CAAC,SAAS,GAAG;EAExC,MAAM,EAAE,qBAAqB,MAAM,OAAO;EAC1C,MAAM,cAAc,MAAM,iBAAiB,YAAY,YAAY;GACjE;GACA,WAAW;GACZ,CAAC;AAEF,MAAI,YAAY,oBAAoB,YAAY,aAAa,GAAG;GAE9D,MAAM,aAAa,WAChB,QACC,6FACD,CACA,IAAI,YAAY,iBAAiB;AAIpC,OAAI,WACF,QAAO;IACL,MAAM,WAAW;IACjB,cAAc,WAAW;IACzB,WAAW,WAAW;IACtB,QAAQ;IACR,YAAY,YAAY;IACzB;;;AAKP,QAAO;;;;;;;;AAaT,SAAS,iBACP,YACA,UACwB;CACxB,IAAI,UAAU;CACd,IAAI,QAAQ;AAEZ,QAAO,QAAQ,IAAI;AAGjB,MAAI,WAFe,GAAG,QAAQ,eAEJ,EAAE;GAC1B,MAAM,SAAS,cAAc,QAAQ;AAErC,OAAI,UAAU,OAAO,WAAW,YAAY;IAE1C,MAAM,aAAa,WAChB,QACC,6FACD,CACA,IAAI,OAAO,KAAK;AAInB,QAAI,WACF,QAAO;KACL,MAAM,WAAW;KACjB,cAAc,WAAW;KACzB,WAAW,WAAW;KACtB,QAAQ;KACR,YAAY;KACb;;;EAKP,MAAM,SAAS,QAAQ,QAAQ;AAC/B,MAAI,WAAW,QAAS;AACxB,YAAU;AACV;;AAGF,QAAO;;;;;AAwBT,SAAgB,oBAAoB,QAAiC;AACnE,QAAO,KAAK,UAAU,QAAQ,MAAM,EAAE"}
1
+ {"version":3,"file":"auto-route-JjW3f7pV.mjs","names":[],"sources":["../src/session/auto-route.ts"],"sourcesContent":["/**\n * Auto-route: automatic project routing suggestion on session start.\n *\n * Given a working directory (and optional conversation context), determine\n * which registered project the session belongs to.\n *\n * Strategy (in priority order):\n * 1. Path match — exact or parent-directory match in the project registry\n * 2. Marker walk — walk up from cwd looking for Notes/PAI.md, resolve slug\n * 3. Topic match — BM25 keyword search against memory (requires context text)\n *\n * The function is stateless and works with direct DB access (no daemon\n * required), making it fast and safe to call during session startup.\n */\n\nimport type { Database } from \"better-sqlite3\";\nimport type { StorageBackend } from \"../storage/interface.js\";\nimport { resolve, dirname } from \"node:path\";\nimport { existsSync } from \"node:fs\";\nimport { readPaiMarker } from \"../registry/pai-marker.js\";\nimport { detectProject } from \"../cli/commands/detect.js\";\n\n// ---------------------------------------------------------------------------\n// Types\n// ---------------------------------------------------------------------------\n\nexport type AutoRouteMethod = \"path\" | \"marker\" | \"topic\";\n\nexport interface AutoRouteResult {\n /** Project slug */\n slug: string;\n /** Human-readable project name */\n display_name: string;\n /** Absolute path to the project root */\n root_path: string;\n /** How the project was detected */\n method: AutoRouteMethod;\n /** Confidence [0,1]: 1.0 for path/marker matches, BM25 fraction for topic */\n confidence: number;\n}\n\n// ---------------------------------------------------------------------------\n// Core function\n// ---------------------------------------------------------------------------\n\n/**\n * Determine which project a session should be routed to.\n *\n * @param registryDb Open PAI registry database\n 
* @param federation Memory storage backend (needed only for topic fallback)\n * @param cwd Working directory to detect from (defaults to process.cwd())\n * @param context Optional conversation text for topic-based fallback\n * @returns Best project match, or null if nothing matched\n */\nexport async function autoRoute(\n registryDb: Database,\n federation: Database | StorageBackend,\n cwd?: string,\n context?: string\n): Promise<AutoRouteResult | null> {\n const target = resolve(cwd ?? process.cwd());\n\n // -------------------------------------------------------------------------\n // Strategy 1: Path match via registry\n // -------------------------------------------------------------------------\n\n const pathMatch = detectProject(registryDb, target);\n\n if (pathMatch) {\n return {\n slug: pathMatch.slug,\n display_name: pathMatch.display_name,\n root_path: pathMatch.root_path,\n method: \"path\",\n confidence: 1.0,\n };\n }\n\n // -------------------------------------------------------------------------\n // Strategy 2: PAI.md marker file walk\n //\n // Walk up from cwd, checking <dir>/Notes/PAI.md at each level.\n // Once found, resolve the slug against the registry to get full project info.\n // -------------------------------------------------------------------------\n\n const markerResult = findMarkerUpward(registryDb, target);\n if (markerResult) {\n return markerResult;\n }\n\n // -------------------------------------------------------------------------\n // Strategy 3: Topic detection (requires context text)\n // -------------------------------------------------------------------------\n\n if (context && context.trim().length > 0) {\n // Lazy import to avoid bundler pulling in daemon/index.mjs at module load time\n const { detectTopicShift } = await import(\"../topics/detector.js\");\n const topicResult = await detectTopicShift(registryDb, federation, {\n context,\n threshold: 0.5, // Lower threshold for initial routing (vs shift detection)\n });\n\n 
if (topicResult.suggestedProject && topicResult.confidence > 0) {\n // Look up the full project info from the registry\n const projectRow = registryDb\n .prepare(\n \"SELECT slug, display_name, root_path FROM projects WHERE slug = ? AND status != 'archived'\"\n )\n .get(topicResult.suggestedProject) as\n | { slug: string; display_name: string; root_path: string }\n | undefined;\n\n if (projectRow) {\n return {\n slug: projectRow.slug,\n display_name: projectRow.display_name,\n root_path: projectRow.root_path,\n method: \"topic\",\n confidence: topicResult.confidence,\n };\n }\n }\n }\n\n return null;\n}\n\n// ---------------------------------------------------------------------------\n// Marker walk helper\n// ---------------------------------------------------------------------------\n\n/**\n * Walk up the directory tree from `startDir`, checking each level for a\n * `Notes/PAI.md` file. If found, read the slug and look up the project.\n *\n * Stops at the filesystem root or after 20 levels (safety guard).\n */\nfunction findMarkerUpward(\n registryDb: Database,\n startDir: string\n): AutoRouteResult | null {\n let current = startDir;\n let depth = 0;\n\n while (depth < 20) {\n const markerPath = `${current}/Notes/PAI.md`;\n\n if (existsSync(markerPath)) {\n const marker = readPaiMarker(current);\n\n if (marker && marker.status !== \"archived\") {\n // Resolve slug to full project info in the registry\n const projectRow = registryDb\n .prepare(\n \"SELECT slug, display_name, root_path FROM projects WHERE slug = ? 
AND status != 'archived'\"\n )\n .get(marker.slug) as\n | { slug: string; display_name: string; root_path: string }\n | undefined;\n\n if (projectRow) {\n return {\n slug: projectRow.slug,\n display_name: projectRow.display_name,\n root_path: projectRow.root_path,\n method: \"marker\",\n confidence: 1.0,\n };\n }\n }\n }\n\n const parent = dirname(current);\n if (parent === current) break; // Reached filesystem root\n current = parent;\n depth++;\n }\n\n return null;\n}\n\n// ---------------------------------------------------------------------------\n// Format helpers\n// ---------------------------------------------------------------------------\n\n/**\n * Format an AutoRouteResult as a human-readable string for CLI output.\n */\nexport function formatAutoRoute(result: AutoRouteResult): string {\n const lines: string[] = [\n `slug: ${result.slug}`,\n `display_name: ${result.display_name}`,\n `root_path: ${result.root_path}`,\n `method: ${result.method}`,\n `confidence: ${(result.confidence * 100).toFixed(0)}%`,\n ];\n return lines.join(\"\\n\");\n}\n\n/**\n * Format an AutoRouteResult as JSON for machine consumption.\n */\nexport function formatAutoRouteJson(result: AutoRouteResult): string {\n return JSON.stringify(result, null, 
2);\n}\n"],"mappings":";;;;;;;;;;;;;;;AAsDA,eAAsB,UACpB,YACA,YACA,KACA,SACiC;CACjC,MAAM,SAAS,QAAQ,OAAO,QAAQ,KAAK,CAAC;CAM5C,MAAM,YAAY,cAAc,YAAY,OAAO;AAEnD,KAAI,UACF,QAAO;EACL,MAAM,UAAU;EAChB,cAAc,UAAU;EACxB,WAAW,UAAU;EACrB,QAAQ;EACR,YAAY;EACb;CAUH,MAAM,eAAe,iBAAiB,YAAY,OAAO;AACzD,KAAI,aACF,QAAO;AAOT,KAAI,WAAW,QAAQ,MAAM,CAAC,SAAS,GAAG;EAExC,MAAM,EAAE,qBAAqB,MAAM,OAAO;EAC1C,MAAM,cAAc,MAAM,iBAAiB,YAAY,YAAY;GACjE;GACA,WAAW;GACZ,CAAC;AAEF,MAAI,YAAY,oBAAoB,YAAY,aAAa,GAAG;GAE9D,MAAM,aAAa,WAChB,QACC,6FACD,CACA,IAAI,YAAY,iBAAiB;AAIpC,OAAI,WACF,QAAO;IACL,MAAM,WAAW;IACjB,cAAc,WAAW;IACzB,WAAW,WAAW;IACtB,QAAQ;IACR,YAAY,YAAY;IACzB;;;AAKP,QAAO;;;;;;;;AAaT,SAAS,iBACP,YACA,UACwB;CACxB,IAAI,UAAU;CACd,IAAI,QAAQ;AAEZ,QAAO,QAAQ,IAAI;AAGjB,MAAI,WAFe,GAAG,QAAQ,eAEJ,EAAE;GAC1B,MAAM,SAAS,cAAc,QAAQ;AAErC,OAAI,UAAU,OAAO,WAAW,YAAY;IAE1C,MAAM,aAAa,WAChB,QACC,6FACD,CACA,IAAI,OAAO,KAAK;AAInB,QAAI,WACF,QAAO;KACL,MAAM,WAAW;KACjB,cAAc,WAAW;KACzB,WAAW,WAAW;KACtB,QAAQ;KACR,YAAY;KACb;;;EAKP,MAAM,SAAS,QAAQ,QAAQ;AAC/B,MAAI,WAAW,QAAS;AACxB,YAAU;AACV;;AAGF,QAAO;;;;;AAwBT,SAAgB,oBAAoB,QAAiC;AACnE,QAAO,KAAK,UAAU,QAAQ,MAAM,EAAE"}
@@ -0,0 +1,191 @@
1
+ import { createHash } from "node:crypto";
2
+
3
+ //#region src/memory/chunker.ts
4
+ /**
5
+ * Markdown text chunker for the PAI memory engine.
6
+ *
7
+ * Splits markdown files into overlapping text segments suitable for BM25
8
+ * full-text indexing. Respects heading boundaries where possible, falling
9
+ * back to paragraph and sentence splitting when sections are large.
10
+ */
11
+ const DEFAULT_MAX_TOKENS = 400;
12
+ const DEFAULT_OVERLAP = 80;
13
+ /**
14
+ * Approximate token count using a words * 1.3 heuristic.
15
+ * Matches the OpenClaw estimate approach.
16
+ */
17
+ function estimateTokens(text) {
18
+ const wordCount = text.split(/\s+/).filter(Boolean).length;
19
+ return Math.ceil(wordCount * 1.3);
20
+ }
21
+ /**
22
+ * Compute SHA-256 hash of a string, returning a hex string.
23
+ */
24
+ function sha256(text) {
25
+ return createHash("sha256").update(text).digest("hex");
26
+ }
27
+ /**
28
+ * Split content into sections delimited by ## or ### headings.
29
+ * Each section starts at its heading line (or at line 1 for a preamble).
30
+ */
31
+ function splitBySections(lines) {
32
+ const sections = [];
33
+ let current = [];
34
+ for (const line of lines) {
35
+ if (/^#{1,3}\s/.test(line.text) && current.length > 0) {
36
+ const text = current.map((l) => l.text).join("\n");
37
+ sections.push({
38
+ lines: current,
39
+ tokens: estimateTokens(text)
40
+ });
41
+ current = [];
42
+ }
43
+ current.push(line);
44
+ }
45
+ if (current.length > 0) {
46
+ const text = current.map((l) => l.text).join("\n");
47
+ sections.push({
48
+ lines: current,
49
+ tokens: estimateTokens(text)
50
+ });
51
+ }
52
+ return sections;
53
+ }
54
+ /**
55
+ * Split a LineBlock by double-newline paragraph boundaries.
56
+ */
57
+ function splitByParagraphs(block) {
58
+ const paragraphs = [];
59
+ let current = [];
60
+ for (const line of block.lines) if (line.text.trim() === "" && current.length > 0) {
61
+ const text = current.map((l) => l.text).join("\n");
62
+ paragraphs.push({
63
+ lines: [...current],
64
+ tokens: estimateTokens(text)
65
+ });
66
+ current = [];
67
+ } else current.push(line);
68
+ if (current.length > 0) {
69
+ const text = current.map((l) => l.text).join("\n");
70
+ paragraphs.push({
71
+ lines: current,
72
+ tokens: estimateTokens(text)
73
+ });
74
+ }
75
+ return paragraphs.length > 0 ? paragraphs : [block];
76
+ }
77
+ /**
78
+ * Split a LineBlock by sentence boundaries (. ! ?) when even paragraphs are
79
+ * too large. Works character-by-character within joined lines.
80
+ */
81
+ function splitBySentences(block, maxTokens) {
82
+ const sentences = block.lines.map((l) => l.text).join(" ").split(/(?<=[.!?])\s+(?=[A-Z"'])/g);
83
+ const result = [];
84
+ let accText = "";
85
+ const startLine = block.lines[0]?.lineNo ?? 1;
86
+ const endLine = block.lines[block.lines.length - 1]?.lineNo ?? startLine;
87
+ const totalLines = endLine - startLine + 1;
88
+ const linesPerSentence = Math.max(1, Math.floor(totalLines / Math.max(1, sentences.length)));
89
+ let sentenceIdx = 0;
90
+ let approxLine = startLine;
91
+ const flush = () => {
92
+ if (!accText.trim()) return;
93
+ const endApprox = Math.min(approxLine + linesPerSentence - 1, endLine);
94
+ result.push({
95
+ lines: [{
96
+ text: accText.trim(),
97
+ lineNo: approxLine
98
+ }],
99
+ tokens: estimateTokens(accText)
100
+ });
101
+ approxLine = endApprox + 1;
102
+ accText = "";
103
+ };
104
+ for (const sentence of sentences) {
105
+ sentenceIdx++;
106
+ const candidateText = accText ? accText + " " + sentence : sentence;
107
+ if (estimateTokens(candidateText) > maxTokens && accText) {
108
+ flush();
109
+ accText = sentence;
110
+ } else accText = candidateText;
111
+ }
112
+ flush();
113
+ return result.length > 0 ? result : [block];
114
+ }
115
+ /**
116
+ * Extract the last `overlapTokens` worth of text from a list of previously
117
+ * emitted chunks to prepend to the next chunk.
118
+ */
119
+ function buildOverlapPrefix(chunks, overlapTokens) {
120
+ if (overlapTokens <= 0 || chunks.length === 0) return [];
121
+ const lastChunk = chunks[chunks.length - 1];
122
+ if (!lastChunk) return [];
123
+ const lines = lastChunk.text.split("\n");
124
+ const kept = [];
125
+ let acc = 0;
126
+ for (let i = lines.length - 1; i >= 0; i--) {
127
+ const lineTokens = estimateTokens(lines[i] ?? "");
128
+ acc += lineTokens;
129
+ kept.unshift(lines[i] ?? "");
130
+ if (acc >= overlapTokens) break;
131
+ }
132
+ const startLine = lastChunk.endLine - kept.length + 1;
133
+ return kept.map((text, idx) => ({
134
+ text,
135
+ lineNo: Math.max(lastChunk.startLine, startLine + idx)
136
+ }));
137
+ }
138
+ /**
139
+ * Chunk a markdown file into overlapping segments for BM25 indexing.
140
+ *
141
+ * Strategy:
142
+ * 1. Split by headings (##, ###) as natural boundaries.
143
+ * 2. If a section exceeds maxTokens, split by paragraphs.
144
+ * 3. If a paragraph still exceeds maxTokens, split by sentences.
145
+ * 4. Apply overlap: each chunk includes the last `overlap` tokens from the
146
+ * previous chunk.
147
+ */
148
+ function chunkMarkdown(content, opts) {
149
+ const maxTokens = opts?.maxTokens ?? DEFAULT_MAX_TOKENS;
150
+ const overlapTokens = opts?.overlap ?? DEFAULT_OVERLAP;
151
+ if (!content.trim()) return [];
152
+ const sections = splitBySections(content.split("\n").map((text, idx) => ({
153
+ text,
154
+ lineNo: idx + 1
155
+ })));
156
+ const finalBlocks = [];
157
+ for (const section of sections) {
158
+ if (section.tokens <= maxTokens) {
159
+ finalBlocks.push(section);
160
+ continue;
161
+ }
162
+ const paras = splitByParagraphs(section);
163
+ for (const para of paras) {
164
+ if (para.tokens <= maxTokens) {
165
+ finalBlocks.push(para);
166
+ continue;
167
+ }
168
+ const sentences = splitBySentences(para, maxTokens);
169
+ finalBlocks.push(...sentences);
170
+ }
171
+ }
172
+ const chunks = [];
173
+ for (const block of finalBlocks) {
174
+ if (block.lines.length === 0) continue;
175
+ const text = [...buildOverlapPrefix(chunks, overlapTokens), ...block.lines].map((l) => l.text).join("\n").trim();
176
+ if (!text) continue;
177
+ const startLine = block.lines[0]?.lineNo ?? 1;
178
+ const endLine = block.lines[block.lines.length - 1]?.lineNo ?? startLine;
179
+ chunks.push({
180
+ text,
181
+ startLine,
182
+ endLine,
183
+ hash: sha256(text)
184
+ });
185
+ }
186
+ return chunks;
187
+ }
188
+
189
+ //#endregion
190
+ export { estimateTokens as n, chunkMarkdown as t };
191
+ //# sourceMappingURL=chunker-CbnBe0s0.mjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunker-CbnBe0s0.mjs","names":[],"sources":["../src/memory/chunker.ts"],"sourcesContent":["/**\n * Markdown text chunker for the PAI memory engine.\n *\n * Splits markdown files into overlapping text segments suitable for BM25\n * full-text indexing. Respects heading boundaries where possible, falling\n * back to paragraph and sentence splitting when sections are large.\n */\n\nimport { createHash } from \"node:crypto\";\n\nexport interface Chunk {\n text: string;\n startLine: number; // 1-indexed\n endLine: number; // 1-indexed, inclusive\n hash: string; // SHA-256 of text\n}\n\nexport interface ChunkOptions {\n /** Approximate maximum tokens per chunk. Default 400. */\n maxTokens?: number;\n /** Overlap in tokens from the previous chunk. Default 80. */\n overlap?: number;\n}\n\nconst DEFAULT_MAX_TOKENS = 400;\nconst DEFAULT_OVERLAP = 80;\n\n/**\n * Approximate token count using a words * 1.3 heuristic.\n * Matches the OpenClaw estimate approach.\n */\nexport function estimateTokens(text: string): number {\n const wordCount = text.split(/\\s+/).filter(Boolean).length;\n return Math.ceil(wordCount * 1.3);\n}\n\n/**\n * Compute SHA-256 hash of a string, returning a hex string.\n */\nfunction sha256(text: string): string {\n return createHash(\"sha256\").update(text).digest(\"hex\");\n}\n\n// ---------------------------------------------------------------------------\n// Internal section / paragraph / sentence splitters\n// ---------------------------------------------------------------------------\n\n/**\n * A contiguous block of lines associated with an approximate token count.\n */\ninterface LineBlock {\n lines: Array<{ text: string; lineNo: number }>;\n tokens: number;\n}\n\n/**\n * Split content into sections delimited by ## or ### headings.\n * Each section starts at its heading line (or at line 1 for a preamble).\n */\nfunction splitBySections(\n lines: Array<{ text: string; lineNo: number }>,\n): LineBlock[] {\n const sections: 
LineBlock[] = [];\n let current: Array<{ text: string; lineNo: number }> = [];\n\n for (const line of lines) {\n const isHeading = /^#{1,3}\\s/.test(line.text);\n if (isHeading && current.length > 0) {\n const text = current.map((l) => l.text).join(\"\\n\");\n sections.push({ lines: current, tokens: estimateTokens(text) });\n current = [];\n }\n current.push(line);\n }\n\n if (current.length > 0) {\n const text = current.map((l) => l.text).join(\"\\n\");\n sections.push({ lines: current, tokens: estimateTokens(text) });\n }\n\n return sections;\n}\n\n/**\n * Split a LineBlock by double-newline paragraph boundaries.\n */\nfunction splitByParagraphs(block: LineBlock): LineBlock[] {\n const paragraphs: LineBlock[] = [];\n let current: Array<{ text: string; lineNo: number }> = [];\n\n for (const line of block.lines) {\n if (line.text.trim() === \"\" && current.length > 0) {\n // Empty line — potential paragraph boundary\n const text = current.map((l) => l.text).join(\"\\n\");\n paragraphs.push({ lines: [...current], tokens: estimateTokens(text) });\n current = [];\n } else {\n current.push(line);\n }\n }\n\n if (current.length > 0) {\n const text = current.map((l) => l.text).join(\"\\n\");\n paragraphs.push({ lines: current, tokens: estimateTokens(text) });\n }\n\n return paragraphs.length > 0 ? paragraphs : [block];\n}\n\n/**\n * Split a LineBlock by sentence boundaries (. ! ?) when even paragraphs are\n * too large. Works character-by-character within joined lines.\n */\nfunction splitBySentences(block: LineBlock, maxTokens: number): LineBlock[] {\n const fullText = block.lines.map((l) => l.text).join(\" \");\n // Very rough sentence split — split on '. ', '! ', '? 
' followed by uppercase\n const sentenceRe = /(?<=[.!?])\\s+(?=[A-Z\"'])/g;\n const sentences = fullText.split(sentenceRe);\n\n const result: LineBlock[] = [];\n let accText = \"\";\n // We can't recover exact line numbers inside a single oversized paragraph,\n // so we approximate using the block's start/end lines distributed evenly.\n const startLine = block.lines[0]?.lineNo ?? 1;\n const endLine = block.lines[block.lines.length - 1]?.lineNo ?? startLine;\n const totalLines = endLine - startLine + 1;\n const linesPerSentence = Math.max(1, Math.floor(totalLines / Math.max(1, sentences.length)));\n\n let sentenceIdx = 0;\n let approxLine = startLine;\n\n const flush = () => {\n if (!accText.trim()) return;\n const endApprox = Math.min(approxLine + linesPerSentence - 1, endLine);\n result.push({\n lines: [{ text: accText.trim(), lineNo: approxLine }],\n tokens: estimateTokens(accText),\n });\n approxLine = endApprox + 1;\n accText = \"\";\n };\n\n for (const sentence of sentences) {\n sentenceIdx++;\n const candidateText = accText ? accText + \" \" + sentence : sentence;\n if (estimateTokens(candidateText) > maxTokens && accText) {\n flush();\n accText = sentence;\n } else {\n accText = candidateText;\n }\n }\n void sentenceIdx; // used only for iteration count\n flush();\n\n return result.length > 0 ? 
result : [block];\n}\n\n// ---------------------------------------------------------------------------\n// Overlap helper\n// ---------------------------------------------------------------------------\n\n/**\n * Extract the last `overlapTokens` worth of text from a list of previously\n * emitted chunks to prepend to the next chunk.\n */\nfunction buildOverlapPrefix(\n chunks: Chunk[],\n overlapTokens: number,\n): Array<{ text: string; lineNo: number }> {\n if (overlapTokens <= 0 || chunks.length === 0) return [];\n\n const lastChunk = chunks[chunks.length - 1];\n if (!lastChunk) return [];\n\n const lines = lastChunk.text.split(\"\\n\");\n const kept: string[] = [];\n let acc = 0;\n\n for (let i = lines.length - 1; i >= 0; i--) {\n const lineTokens = estimateTokens(lines[i] ?? \"\");\n acc += lineTokens;\n kept.unshift(lines[i] ?? \"\");\n if (acc >= overlapTokens) break;\n }\n\n // Distribute overlap lines across the lastChunk's line range\n const startLine = lastChunk.endLine - kept.length + 1;\n return kept.map((text, idx) => ({ text, lineNo: Math.max(lastChunk.startLine, startLine + idx) }));\n}\n\n// ---------------------------------------------------------------------------\n// Public API\n// ---------------------------------------------------------------------------\n\n/**\n * Chunk a markdown file into overlapping segments for BM25 indexing.\n *\n * Strategy:\n * 1. Split by headings (##, ###) as natural boundaries.\n * 2. If a section exceeds maxTokens, split by paragraphs.\n * 3. If a paragraph still exceeds maxTokens, split by sentences.\n * 4. Apply overlap: each chunk includes the last `overlap` tokens from the\n * previous chunk.\n */\nexport function chunkMarkdown(content: string, opts?: ChunkOptions): Chunk[] {\n const maxTokens = opts?.maxTokens ?? DEFAULT_MAX_TOKENS;\n const overlapTokens = opts?.overlap ?? 
DEFAULT_OVERLAP;\n\n if (!content.trim()) return [];\n\n const rawLines = content.split(\"\\n\");\n const lines: Array<{ text: string; lineNo: number }> = rawLines.map((text, idx) => ({\n text,\n lineNo: idx + 1, // 1-indexed\n }));\n\n // Step 1: section split\n const sections = splitBySections(lines);\n\n // Step 2 & 3: further split oversized sections\n const finalBlocks: LineBlock[] = [];\n for (const section of sections) {\n if (section.tokens <= maxTokens) {\n finalBlocks.push(section);\n continue;\n }\n // Too big — split by paragraphs\n const paras = splitByParagraphs(section);\n for (const para of paras) {\n if (para.tokens <= maxTokens) {\n finalBlocks.push(para);\n continue;\n }\n // Still too big — split by sentences\n const sentences = splitBySentences(para, maxTokens);\n finalBlocks.push(...sentences);\n }\n }\n\n // Step 4: build final chunks with overlap\n const chunks: Chunk[] = [];\n\n for (const block of finalBlocks) {\n if (block.lines.length === 0) continue;\n\n // Build overlap prefix from previous chunks\n const overlapLines = buildOverlapPrefix(chunks, overlapTokens);\n\n // Combine overlap + block lines\n const allLines = [...overlapLines, ...block.lines];\n const text = allLines.map((l) => l.text).join(\"\\n\").trim();\n\n if (!text) continue;\n\n const startLine = block.lines[0]?.lineNo ?? 1;\n const endLine = block.lines[block.lines.length - 1]?.lineNo ?? 
startLine;\n\n chunks.push({\n text,\n startLine,\n endLine,\n hash: sha256(text),\n });\n }\n\n return chunks;\n}\n"],"mappings":";;;;;;;;;;AAwBA,MAAM,qBAAqB;AAC3B,MAAM,kBAAkB;;;;;AAMxB,SAAgB,eAAe,MAAsB;CACnD,MAAM,YAAY,KAAK,MAAM,MAAM,CAAC,OAAO,QAAQ,CAAC;AACpD,QAAO,KAAK,KAAK,YAAY,IAAI;;;;;AAMnC,SAAS,OAAO,MAAsB;AACpC,QAAO,WAAW,SAAS,CAAC,OAAO,KAAK,CAAC,OAAO,MAAM;;;;;;AAmBxD,SAAS,gBACP,OACa;CACb,MAAM,WAAwB,EAAE;CAChC,IAAI,UAAmD,EAAE;AAEzD,MAAK,MAAM,QAAQ,OAAO;AAExB,MADkB,YAAY,KAAK,KAAK,KAAK,IAC5B,QAAQ,SAAS,GAAG;GACnC,MAAM,OAAO,QAAQ,KAAK,MAAM,EAAE,KAAK,CAAC,KAAK,KAAK;AAClD,YAAS,KAAK;IAAE,OAAO;IAAS,QAAQ,eAAe,KAAK;IAAE,CAAC;AAC/D,aAAU,EAAE;;AAEd,UAAQ,KAAK,KAAK;;AAGpB,KAAI,QAAQ,SAAS,GAAG;EACtB,MAAM,OAAO,QAAQ,KAAK,MAAM,EAAE,KAAK,CAAC,KAAK,KAAK;AAClD,WAAS,KAAK;GAAE,OAAO;GAAS,QAAQ,eAAe,KAAK;GAAE,CAAC;;AAGjE,QAAO;;;;;AAMT,SAAS,kBAAkB,OAA+B;CACxD,MAAM,aAA0B,EAAE;CAClC,IAAI,UAAmD,EAAE;AAEzD,MAAK,MAAM,QAAQ,MAAM,MACvB,KAAI,KAAK,KAAK,MAAM,KAAK,MAAM,QAAQ,SAAS,GAAG;EAEjD,MAAM,OAAO,QAAQ,KAAK,MAAM,EAAE,KAAK,CAAC,KAAK,KAAK;AAClD,aAAW,KAAK;GAAE,OAAO,CAAC,GAAG,QAAQ;GAAE,QAAQ,eAAe,KAAK;GAAE,CAAC;AACtE,YAAU,EAAE;OAEZ,SAAQ,KAAK,KAAK;AAItB,KAAI,QAAQ,SAAS,GAAG;EACtB,MAAM,OAAO,QAAQ,KAAK,MAAM,EAAE,KAAK,CAAC,KAAK,KAAK;AAClD,aAAW,KAAK;GAAE,OAAO;GAAS,QAAQ,eAAe,KAAK;GAAE,CAAC;;AAGnE,QAAO,WAAW,SAAS,IAAI,aAAa,CAAC,MAAM;;;;;;AAOrD,SAAS,iBAAiB,OAAkB,WAAgC;CAI1E,MAAM,YAHW,MAAM,MAAM,KAAK,MAAM,EAAE,KAAK,CAAC,KAAK,IAAI,CAG9B,MADR,4BACyB;CAE5C,MAAM,SAAsB,EAAE;CAC9B,IAAI,UAAU;CAGd,MAAM,YAAY,MAAM,MAAM,IAAI,UAAU;CAC5C,MAAM,UAAU,MAAM,MAAM,MAAM,MAAM,SAAS,IAAI,UAAU;CAC/D,MAAM,aAAa,UAAU,YAAY;CACzC,MAAM,mBAAmB,KAAK,IAAI,GAAG,KAAK,MAAM,aAAa,KAAK,IAAI,GAAG,UAAU,OAAO,CAAC,CAAC;CAE5F,IAAI,cAAc;CAClB,IAAI,aAAa;CAEjB,MAAM,cAAc;AAClB,MAAI,CAAC,QAAQ,MAAM,CAAE;EACrB,MAAM,YAAY,KAAK,IAAI,aAAa,mBAAmB,GAAG,QAAQ;AACtE,SAAO,KAAK;GACV,OAAO,CAAC;IAAE,MAAM,QAAQ,MAAM;IAAE,QAAQ;IAAY,CAAC;GACrD,QAAQ,eAAe,QAAQ;GAChC,CAAC;AACF,eAAa,YAAY;AACzB,YAAU;;AAGZ,MAAK,MAAM,YAAY,WAAW;AAChC;EACA,MAAM,gBAAgB,UAAU,UAAU,MAAM,WAAW;AAC3D,MAAI,eAAe,cAAc,GAA
G,aAAa,SAAS;AACxD,UAAO;AACP,aAAU;QAEV,WAAU;;AAId,QAAO;AAEP,QAAO,OAAO,SAAS,IAAI,SAAS,CAAC,MAAM;;;;;;AAW7C,SAAS,mBACP,QACA,eACyC;AACzC,KAAI,iBAAiB,KAAK,OAAO,WAAW,EAAG,QAAO,EAAE;CAExD,MAAM,YAAY,OAAO,OAAO,SAAS;AACzC,KAAI,CAAC,UAAW,QAAO,EAAE;CAEzB,MAAM,QAAQ,UAAU,KAAK,MAAM,KAAK;CACxC,MAAM,OAAiB,EAAE;CACzB,IAAI,MAAM;AAEV,MAAK,IAAI,IAAI,MAAM,SAAS,GAAG,KAAK,GAAG,KAAK;EAC1C,MAAM,aAAa,eAAe,MAAM,MAAM,GAAG;AACjD,SAAO;AACP,OAAK,QAAQ,MAAM,MAAM,GAAG;AAC5B,MAAI,OAAO,cAAe;;CAI5B,MAAM,YAAY,UAAU,UAAU,KAAK,SAAS;AACpD,QAAO,KAAK,KAAK,MAAM,SAAS;EAAE;EAAM,QAAQ,KAAK,IAAI,UAAU,WAAW,YAAY,IAAI;EAAE,EAAE;;;;;;;;;;;;AAiBpG,SAAgB,cAAc,SAAiB,MAA8B;CAC3E,MAAM,YAAY,MAAM,aAAa;CACrC,MAAM,gBAAgB,MAAM,WAAW;AAEvC,KAAI,CAAC,QAAQ,MAAM,CAAE,QAAO,EAAE;CAS9B,MAAM,WAAW,gBAPA,QAAQ,MAAM,KAAK,CAC4B,KAAK,MAAM,SAAS;EAClF;EACA,QAAQ,MAAM;EACf,EAAE,CAGoC;CAGvC,MAAM,cAA2B,EAAE;AACnC,MAAK,MAAM,WAAW,UAAU;AAC9B,MAAI,QAAQ,UAAU,WAAW;AAC/B,eAAY,KAAK,QAAQ;AACzB;;EAGF,MAAM,QAAQ,kBAAkB,QAAQ;AACxC,OAAK,MAAM,QAAQ,OAAO;AACxB,OAAI,KAAK,UAAU,WAAW;AAC5B,gBAAY,KAAK,KAAK;AACtB;;GAGF,MAAM,YAAY,iBAAiB,MAAM,UAAU;AACnD,eAAY,KAAK,GAAG,UAAU;;;CAKlC,MAAM,SAAkB,EAAE;AAE1B,MAAK,MAAM,SAAS,aAAa;AAC/B,MAAI,MAAM,MAAM,WAAW,EAAG;EAO9B,MAAM,OADW,CAAC,GAHG,mBAAmB,QAAQ,cAAc,EAG3B,GAAG,MAAM,MAAM,CAC5B,KAAK,MAAM,EAAE,KAAK,CAAC,KAAK,KAAK,CAAC,MAAM;AAE1D,MAAI,CAAC,KAAM;EAEX,MAAM,YAAY,MAAM,MAAM,IAAI,UAAU;EAC5C,MAAM,UAAU,MAAM,MAAM,MAAM,MAAM,SAAS,IAAI,UAAU;AAE/D,SAAO,KAAK;GACV;GACA;GACA;GACA,MAAM,OAAO,KAAK;GACnB,CAAC;;AAGJ,QAAO"}