@astrofoundry/grimoire 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +148 -0
- package/dist/apikey.d.ts +5 -0
- package/dist/apikey.d.ts.map +1 -0
- package/dist/apikey.js +85 -0
- package/dist/apikey.js.map +1 -0
- package/dist/chunker.d.ts +7 -0
- package/dist/chunker.d.ts.map +1 -0
- package/dist/chunker.js +153 -0
- package/dist/chunker.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +496 -0
- package/dist/cli.js.map +1 -0
- package/dist/config.d.ts +18 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +76 -0
- package/dist/config.js.map +1 -0
- package/dist/consumer-config.d.ts +11 -0
- package/dist/consumer-config.d.ts.map +1 -0
- package/dist/consumer-config.js +58 -0
- package/dist/consumer-config.js.map +1 -0
- package/dist/consumer.d.ts +8 -0
- package/dist/consumer.d.ts.map +1 -0
- package/dist/consumer.js +71 -0
- package/dist/consumer.js.map +1 -0
- package/dist/converter.d.ts +12 -0
- package/dist/converter.d.ts.map +1 -0
- package/dist/converter.js +95 -0
- package/dist/converter.js.map +1 -0
- package/dist/embedder.d.ts +3 -0
- package/dist/embedder.d.ts.map +1 -0
- package/dist/embedder.js +38 -0
- package/dist/embedder.js.map +1 -0
- package/dist/format.d.ts +5 -0
- package/dist/format.d.ts.map +1 -0
- package/dist/format.js +6 -0
- package/dist/format.js.map +1 -0
- package/dist/reranker.d.ts +6 -0
- package/dist/reranker.d.ts.map +1 -0
- package/dist/reranker.js +21 -0
- package/dist/reranker.js.map +1 -0
- package/dist/scraper.d.ts +9 -0
- package/dist/scraper.d.ts.map +1 -0
- package/dist/scraper.js +77 -0
- package/dist/scraper.js.map +1 -0
- package/dist/search.d.ts +8 -0
- package/dist/search.d.ts.map +1 -0
- package/dist/search.js +43 -0
- package/dist/search.js.map +1 -0
- package/dist/store.d.ts +11 -0
- package/dist/store.d.ts.map +1 -0
- package/dist/store.js +102 -0
- package/dist/store.js.map +1 -0
- package/dist/types.d.ts +25 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +47 -0
package/README.md
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# grimoire
|
|
2
|
+
|
|
3
|
+
Documentation RAG System — scrape docs, embed, search with reranking.
|
|
4
|
+
|
|
5
|
+
## Consumer Setup
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install -g @astrofoundry/grimoire
|
|
9
|
+
grimoire init
|
|
10
|
+
# Enter API URL and API key (provided by admin)
|
|
11
|
+
grimoire search "how to query firestore"
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Admin Setup
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pnpm install
|
|
18
|
+
pnpm build
|
|
19
|
+
pnpm link --global
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
### Firebase / GCP
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
# Authenticate for Firestore access (grimoire-docs project)
|
|
26
|
+
gcloud auth application-default login --project=grimoire-docs
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
### Vector indexes (one-time, before first search)
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
gcloud firestore indexes composite create \
|
|
33
|
+
--collection-group=grimoire_chunks \
|
|
34
|
+
--query-scope=COLLECTION \
|
|
35
|
+
--field-config='field-path=embedding,vector-config={"dimension":"768","flat":{}}' \
|
|
36
|
+
--database="(default)" \
|
|
37
|
+
--project=grimoire-docs
|
|
38
|
+
|
|
39
|
+
gcloud firestore indexes composite create \
|
|
40
|
+
--collection-group=grimoire_chunks \
|
|
41
|
+
--query-scope=COLLECTION \
|
|
42
|
+
--field-config='field-path=source,order=ASCENDING' \
|
|
43
|
+
--field-config='field-path=embedding,vector-config={"dimension":"768","flat":{}}' \
|
|
44
|
+
--database="(default)" \
|
|
45
|
+
--project=grimoire-docs
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Commands
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
# Add a documentation source (interactive)
|
|
52
|
+
grimoire add <name> --url <start_url>
|
|
53
|
+
|
|
54
|
+
# Refresh a source (scrape → convert → chunk → embed → store)
|
|
55
|
+
grimoire refresh <source>
|
|
56
|
+
|
|
57
|
+
# Full refresh (purge all data, re-scrape everything)
|
|
58
|
+
grimoire refresh <source> --full
|
|
59
|
+
|
|
60
|
+
# Re-run from cached HTML (skip scraping)
|
|
61
|
+
grimoire refresh <source> --from-raw
|
|
62
|
+
|
|
63
|
+
# Re-store from cached embeddings (skip scraping + embedding)
|
|
64
|
+
grimoire refresh <source> --from-store
|
|
65
|
+
|
|
66
|
+
# Override concurrency (default: 10)
|
|
67
|
+
grimoire refresh <source> --concurrency 20
|
|
68
|
+
|
|
69
|
+
# Refresh all sources
|
|
70
|
+
grimoire refresh --all
|
|
71
|
+
|
|
72
|
+
# Search across all sources
|
|
73
|
+
grimoire search "<query>"
|
|
74
|
+
|
|
75
|
+
# Search within a specific source
|
|
76
|
+
grimoire search "<query>" --source <name>
|
|
77
|
+
|
|
78
|
+
# List all configured sources
|
|
79
|
+
grimoire list
|
|
80
|
+
|
|
81
|
+
# Show statistics
|
|
82
|
+
grimoire stats
|
|
83
|
+
|
|
84
|
+
# Export source as JSON
|
|
85
|
+
grimoire export <source>
|
|
86
|
+
|
|
87
|
+
# API key management (admin only)
|
|
88
|
+
grimoire apikey create <name>
|
|
89
|
+
grimoire apikey list
|
|
90
|
+
grimoire apikey revoke <name>
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Configuration
|
|
94
|
+
|
|
95
|
+
Sources are defined in `config/sources.yaml`. Each source needs site-specific cleanup config.
|
|
96
|
+
|
|
97
|
+
```yaml
|
|
98
|
+
sources:
|
|
99
|
+
my-source:
|
|
100
|
+
name: My Docs # Display name
|
|
101
|
+
start_url: https://example.com/docs
|
|
102
|
+
nav_selector: nav # CSS selector for navigation element
|
|
103
|
+
content_selector: article # CSS selector for main content
|
|
104
|
+
include_patterns: # URL patterns to include
|
|
105
|
+
- /docs
|
|
106
|
+
exclude_patterns: # URL patterns to exclude (optional)
|
|
107
|
+
- /docs/legacy
|
|
108
|
+
remove_selectors: # CSS selectors to strip from content (site-specific)
|
|
109
|
+
- footer
|
|
110
|
+
- nav
|
|
111
|
+
- .sidebar
|
|
112
|
+
remove_text_patterns: # Regex patterns to strip from markdown (site-specific)
|
|
113
|
+
- "^Cookie notice.*$"
|
|
114
|
+
concurrency: 10 # Parallel browser tabs (default: 10)
|
|
115
|
+
rate_limit_ms: 1000 # Delay between requests (optional)
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
By default, the converter strips only `style`, `script`, `noscript`, `iframe`, and `svg` elements. All other cleanup (nav, footer, banners, site-specific UI elements) must be configured per source via `remove_selectors` and `remove_text_patterns`.
|
|
119
|
+
|
|
120
|
+
See `config/sources.yaml` for the Firebase Firestore example with full cleanup config.
|
|
121
|
+
|
|
122
|
+
## Environment Variables
|
|
123
|
+
|
|
124
|
+
Set in `.env` at project root (auto-loaded by CLI):
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
GOOGLE_CLOUD_PROJECT=grimoire-docs # Firebase/GCP project ID
|
|
128
|
+
GEMINI_API_KEY=... # Google Gemini API key
|
|
129
|
+
RERANKER_URL=... # llama-cpp reranker endpoint
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Releasing
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
pnpm release:patch # bump, commit, tag, push → GH Actions deploys functions + publishes npm
|
|
136
|
+
pnpm release:minor
|
|
137
|
+
pnpm release:major
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Development
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
pnpm test # Run tests
|
|
144
|
+
pnpm lint # ESLint
|
|
145
|
+
pnpm check # Typecheck + lint + test
|
|
146
|
+
pnpm build # Compile TypeScript
|
|
147
|
+
pnpm build:watch # Watch mode
|
|
148
|
+
```
|
package/dist/apikey.d.ts
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
export declare function createApiKey(name: string): Promise<string>;
|
|
2
|
+
export declare function listApiKeys(): Promise<void>;
|
|
3
|
+
export declare function revokeApiKey(name: string): Promise<void>;
|
|
4
|
+
export declare function cmdApiKey(): Promise<void>;
|
|
5
|
+
//# sourceMappingURL=apikey.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"apikey.d.ts","sourceRoot":"","sources":["../src/apikey.ts"],"names":[],"mappings":"AA2BA,wBAAsB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAYhE;AAED,wBAAsB,WAAW,IAAI,OAAO,CAAC,IAAI,CAAC,CAiBjD;AAED,wBAAsB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAY9D;AAED,wBAAsB,SAAS,IAAI,OAAO,CAAC,IAAI,CAAC,CAyB/C"}
|
package/dist/apikey.js
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import { randomBytes, createHash } from "node:crypto";
|
|
2
|
+
import { getFirestore, } from "firebase-admin/firestore";
|
|
3
|
+
import { initializeApp, applicationDefault } from "firebase-admin/app";
|
|
4
|
+
import { bold } from "./format.js";
|
|
5
|
+
let app;
|
|
6
|
+
let db;
|
|
7
|
+
/**
 * Returns the module-wide Firestore handle, creating it on first use.
 *
 * On the first call the Firebase app is initialised with application-default
 * credentials (unless `app` is already set) and the resulting Firestore
 * instance is cached in `db`; later calls return the cached instance.
 */
function getDb() {
    if (db) {
        return db;
    }
    if (app === undefined || app === null) {
        app = initializeApp({ credential: applicationDefault() });
    }
    db = getFirestore(app);
    return db;
}
|
|
14
|
+
/**
 * Computes the SHA-256 digest of `key` and returns it as a hex string.
 *
 * @param key Value to hash.
 * @returns Hex-encoded SHA-256 digest.
 */
function hashKey(key) {
    const hasher = createHash("sha256");
    hasher.update(key);
    return hasher.digest("hex");
}
|
|
17
|
+
/**
 * Returns a reference to the `grimoire_api_keys` Firestore collection.
 */
function apiKeysCol() {
    const firestore = getDb();
    return firestore.collection("grimoire_api_keys");
}
|
|
20
|
+
/**
 * Generates a new API key and records it in Firestore.
 *
 * The key is `grim_` followed by 32 random bytes in base64url. The document
 * id is the SHA-256 hex digest of the key; the key itself is not written to
 * the document.
 *
 * @param name Label stored on the key document.
 * @returns The newly generated raw key.
 */
export async function createApiKey(name) {
    const secret = randomBytes(32).toString("base64url");
    const raw = `grim_${secret}`;
    const hash = hashKey(raw);
    const keyDoc = apiKeysCol().doc(hash);
    await keyDoc.set({
        name,
        created_at: new Date().toISOString(),
        last_used_at: null,
        active: true,
    });
    return raw;
}
|
|
31
|
+
/**
 * Prints every document in the API key collection to stdout: name, status
 * (active or revoked), creation timestamp and last-used timestamp.
 *
 * Prints "No API keys found." when the collection is empty.
 */
export async function listApiKeys() {
    const snapshot = await apiKeysCol().get();
    if (snapshot.empty) {
        console.log("No API keys found.");
        return;
    }
    console.log("\nAPI Keys:\n");
    snapshot.docs.forEach((entry) => {
        const data = entry.data();
        // A key that has never been used stores null in last_used_at.
        const lastUsed = data.last_used_at ?? "never";
        let status = "revoked";
        if (data.active) {
            status = "active";
        }
        console.log(` ${bold(data.name)} (${status})`);
        console.log(` Created: ${data.created_at}`);
        console.log(` Last used: ${lastUsed}`);
    });
}
|
|
47
|
+
/**
 * Marks every active key document whose `name` equals the argument as
 * inactive, then prints a confirmation.
 *
 * @param name Label of the key(s) to revoke.
 * @throws Error when no active key document has that name.
 */
export async function revokeApiKey(name) {
    const activeByName = apiKeysCol()
        .where("name", "==", name)
        .where("active", "==", true);
    const snapshot = await activeByName.get();
    if (snapshot.empty) {
        throw new Error(`No active API key found with name "${name}".`);
    }
    // Updates are awaited one at a time, in snapshot order.
    const matches = snapshot.docs;
    for (let i = 0; i < matches.length; i++) {
        await matches[i].ref.update({ active: false });
    }
    console.log(`API key "${name}" revoked.`);
}
|
|
57
|
+
/**
 * Entry point for `grimoire apikey <create|list|revoke> [name]`.
 *
 * Reads the subcommand from `process.argv[3]` and its argument from
 * `process.argv[4]`. Prints a usage line to stderr and exits with status 1
 * when the subcommand is unknown or a required name is missing.
 */
export async function cmdApiKey() {
    const [, , , subcommand, arg] = process.argv;
    switch (subcommand) {
        case "create": {
            if (!arg) {
                console.error("Usage: grimoire apikey create <name>");
                process.exit(1);
            }
            const key = await createApiKey(arg);
            console.log(`\nAPI key created for "${arg}":\n`);
            console.log(` ${key}\n`);
            console.log("Save this key — it will not be shown again.");
            break;
        }
        case "list":
            await listApiKeys();
            break;
        case "revoke":
            if (!arg) {
                console.error("Usage: grimoire apikey revoke <name>");
                process.exit(1);
            }
            await revokeApiKey(arg);
            break;
        default:
            console.error("Usage: grimoire apikey <create|list|revoke> [name]");
            process.exit(1);
    }
}
|
|
85
|
+
//# sourceMappingURL=apikey.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"apikey.js","sourceRoot":"","sources":["../src/apikey.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACtD,OAAO,EACL,YAAY,GAEb,MAAM,0BAA0B,CAAC;AAClC,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAY,MAAM,oBAAoB,CAAC;AACjF,OAAO,EAAE,IAAI,EAAE,MAAM,aAAa,CAAC;AAEnC,IAAI,GAAoB,CAAC;AACzB,IAAI,EAAyB,CAAC;AAE9B,SAAS,KAAK;IACZ,IAAI,CAAC,EAAE,EAAE,CAAC;QACR,GAAG,GAAG,GAAG,IAAI,aAAa,CAAC,EAAE,UAAU,EAAE,kBAAkB,EAAE,EAAE,CAAC,CAAC;QACjE,EAAE,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC;IACzB,CAAC;IACD,OAAO,EAAE,CAAC;AACZ,CAAC;AAED,SAAS,OAAO,CAAC,GAAW;IAC1B,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AACxD,CAAC;AAED,SAAS,UAAU;IACjB,OAAO,KAAK,EAAE,CAAC,UAAU,CAAC,mBAAmB,CAAC,CAAC;AACjD,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,IAAY;IAC7C,MAAM,GAAG,GAAG,QAAQ,WAAW,CAAC,EAAE,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;IAC5D,MAAM,IAAI,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC;IAE1B,MAAM,UAAU,EAAE,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC;QAC/B,IAAI;QACJ,UAAU,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACpC,YAAY,EAAE,IAAI;QAClB,MAAM,EAAE,IAAI;KACb,CAAC,CAAC;IAEH,OAAO,GAAG,CAAC;AACb,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,WAAW;IAC/B,MAAM,QAAQ,GAAG,MAAM,UAAU,EAAE,CAAC,GAAG,EAAE,CAAC;IAE1C,IAAI,QAAQ,CAAC,KAAK,EAAE,CAAC;QACnB,OAAO,CAAC,GAAG,CAAC,oBAAoB,CAAC,CAAC;QAClC,OAAO;IACT,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,eAAe,CAAC,CAAC;IAC7B,KAAK,MAAM,GAAG,IAAI,QAAQ,CAAC,IAAI,EAAE,CAAC;QAChC,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC;QACxB,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC;QAClD,MAAM,QAAQ,GAAG,IAAI,CAAC,YAAY,IAAI,OAAO,CAAC;QAC9C,OAAO,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,MAAM,GAAG,CAAC,CAAC;QAChD,OAAO,CAAC,GAAG,CAAC,gBAAgB,IAAI,CAAC,UAAU,EAAE,CAAC,CAAC;QAC/C,OAAO,CAAC,GAAG,CAAC,kBAAkB,QAAQ,EAAE,CAAC,CAAC;IAC5C,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,IAAY;IAC7C,MAAM,QAAQ,GAAG,MAAM,UAAU,EAAE,CAAC,KAAK,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC,KAAK,CAAC,QAAQ,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC,GAAG,EAAE,CAAC;IAEhG,IAAI,QAAQ,CAAC,KAA
K,EAAE,CAAC;QACnB,MAAM,IAAI,KAAK,CAAC,sCAAsC,IAAI,IAAI,CAAC,CAAC;IAClE,CAAC;IAED,KAAK,MAAM,GAAG,IAAI,QAAQ,CAAC,IAAI,EAAE,CAAC;QAChC,MAAM,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC,CAAC;IAC1C,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,YAAY,IAAI,YAAY,CAAC,CAAC;AAC5C,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,SAAS;IAC7B,MAAM,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACnC,MAAM,GAAG,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAE5B,IAAI,UAAU,KAAK,QAAQ,EAAE,CAAC;QAC5B,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,OAAO,CAAC,KAAK,CAAC,sCAAsC,CAAC,CAAC;YACtD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QACD,MAAM,GAAG,GAAG,MAAM,YAAY,CAAC,GAAG,CAAC,CAAC;QACpC,OAAO,CAAC,GAAG,CAAC,0BAA0B,GAAG,MAAM,CAAC,CAAC;QACjD,OAAO,CAAC,GAAG,CAAC,KAAK,GAAG,IAAI,CAAC,CAAC;QAC1B,OAAO,CAAC,GAAG,CAAC,6CAA6C,CAAC,CAAC;IAC7D,CAAC;SAAM,IAAI,UAAU,KAAK,MAAM,EAAE,CAAC;QACjC,MAAM,WAAW,EAAE,CAAC;IACtB,CAAC;SAAM,IAAI,UAAU,KAAK,QAAQ,EAAE,CAAC;QACnC,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,OAAO,CAAC,KAAK,CAAC,sCAAsC,CAAC,CAAC;YACtD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QACD,MAAM,YAAY,CAAC,GAAG,CAAC,CAAC;IAC1B,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,KAAK,CAAC,oDAAoD,CAAC,CAAC;QACpE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { Chunk } from "./types.js";
|
|
2
|
+
export type { Chunk };
|
|
3
|
+
export declare function estimateTokens(text: string): number;
|
|
4
|
+
export declare function slugifyHeading(heading: string): string;
|
|
5
|
+
export declare function buildChunkId(source: string, url: string, headingSlug: string, index?: number): string;
|
|
6
|
+
export declare function chunkMarkdown(markdown: string, source: string, url: string, title: string): Chunk[];
|
|
7
|
+
//# sourceMappingURL=chunker.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunker.d.ts","sourceRoot":"","sources":["../src/chunker.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AAExC,YAAY,EAAE,KAAK,EAAE,CAAC;AAItB,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEnD;AAED,wBAAgB,cAAc,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,CAOtD;AAED,wBAAgB,YAAY,CAC1B,MAAM,EAAE,MAAM,EACd,GAAG,EAAE,MAAM,EACX,WAAW,EAAE,MAAM,EACnB,KAAK,CAAC,EAAE,MAAM,GACb,MAAM,CAIR;AA+FD,wBAAgB,aAAa,CAC3B,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,MAAM,EACd,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,GACZ,KAAK,EAAE,CAkET"}
|
package/dist/chunker.js
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
import { slugifyUrl } from "./scraper.js";
|
|
2
|
+
const MAX_TOKENS = 500;
|
|
3
|
+
/**
 * Estimates a token count as one token per four characters, rounded up.
 *
 * @param text Text to measure.
 * @returns `ceil(text.length / 4)`.
 */
export function estimateTokens(text) {
    const quarters = text.length / 4;
    return Math.ceil(quarters);
}
|
|
6
|
+
/**
 * Turns a heading into a lowercase, dash-separated slug.
 *
 * Steps: lowercase; drop every character that is not a-z, 0-9, whitespace or
 * a dash; turn whitespace runs into a dash; collapse repeated dashes; strip
 * one leading and one trailing dash.
 *
 * @param heading Heading text.
 * @returns The slug (may be empty).
 */
export function slugifyHeading(heading) {
    const lowered = heading.toLowerCase();
    const allowedOnly = lowered.replace(/[^a-z0-9\s-]/g, "");
    const dashed = allowedOnly.replace(/\s+/g, "-");
    const collapsed = dashed.replace(/-+/g, "-");
    return collapsed.replace(/^-|-$/g, "");
}
|
|
14
|
+
/**
 * Builds a chunk id of the form `source::urlSlug::headingSlug`, with
 * `-index` appended when an index is supplied.
 *
 * @param source Source name.
 * @param url Page URL; passed through `slugifyUrl`.
 * @param headingSlug Slug of the section heading.
 * @param index Optional numeric suffix.
 * @returns The composed id.
 */
export function buildChunkId(source, url, headingSlug, index) {
    const base = [source, slugifyUrl(url), headingSlug].join("::");
    if (index === undefined) {
        return base;
    }
    return `${base}-${index}`;
}
|
|
19
|
+
/**
 * Splits markdown into sections, one per ATX heading (`#` to `######`).
 *
 * Text before the first heading becomes a section with level 0, an empty
 * heading and an empty heading path. Each section records the chain of
 * enclosing headings in `headingPath`.
 *
 * Lines inside fenced code blocks (``` or ~~~) are never treated as headings,
 * so a shell or Python comment such as `# install deps` stays part of its
 * code sample instead of starting a bogus section.
 *
 * @param markdown Markdown text using "\n" line breaks.
 * @returns Array of `{ level, heading, headingPath, lines }` in document order.
 */
function parseHeadingSections(markdown) {
    const lines = markdown.split("\n");
    const sections = [];
    const headingStack = [];
    const levelStack = [];
    // Marker (e.g. "```" or "~~~~") of the fence currently open, or null.
    let openFence = null;
    let currentSection = {
        level: 0,
        heading: "",
        headingPath: [],
        lines: [],
    };
    for (const line of lines) {
        const fenceMatch = line.match(/^ {0,3}(`{3,}|~{3,})/);
        if (fenceMatch) {
            const marker = fenceMatch[1];
            const rest = line.slice(fenceMatch[0].length);
            if (openFence === null) {
                openFence = marker;
            }
            else if (marker[0] === openFence[0] &&
                marker.length >= openFence.length &&
                rest.trim() === "") {
                // A closing fence uses the same character, is at least as long
                // as the opener, and carries no info string.
                openFence = null;
            }
            currentSection.lines.push(line);
            continue;
        }
        const headingMatch = openFence === null ? line.match(/^(#{1,6})\s+(.+)$/) : null;
        if (headingMatch) {
            if (currentSection.lines.length > 0 || currentSection.heading !== "") {
                sections.push(currentSection);
            }
            const level = headingMatch[1].length;
            const heading = headingMatch[2].trim();
            // Drop headings at the same or a deeper level: they are siblings or
            // descendants of a sibling, not ancestors of the new heading.
            while (levelStack.length > 0 && levelStack[levelStack.length - 1] >= level) {
                levelStack.pop();
                headingStack.pop();
            }
            headingStack.push(heading);
            levelStack.push(level);
            currentSection = {
                level,
                heading,
                headingPath: [...headingStack],
                lines: [],
            };
        }
        else {
            currentSection.lines.push(line);
        }
    }
    if (currentSection.lines.length > 0 || currentSection.heading !== "") {
        sections.push(currentSection);
    }
    return sections;
}
|
|
60
|
+
/**
 * Groups the paragraphs of `text` (separated by blank lines) into parts whose
 * estimated token count stays within `maxTokens` where possible.
 *
 * A part is closed only when it already holds at least one paragraph, so a
 * single paragraph larger than `maxTokens` is emitted as its own oversized
 * part rather than being cut.
 *
 * @param text Text to split.
 * @param maxTokens Token budget per part.
 * @returns Parts, each joined back together with a blank line.
 */
function splitAtParagraphBoundaries(text, maxTokens) {
    const paragraphs = text.split(/\n\n+/);
    const parts = [];
    let pending = [];
    let pendingTokens = 0;
    for (let i = 0; i < paragraphs.length; i++) {
        const para = paragraphs[i];
        const cost = estimateTokens(para);
        const fits = pendingTokens + cost <= maxTokens;
        if (fits || pending.length === 0) {
            pending.push(para);
            pendingTokens += cost;
            continue;
        }
        // Budget exceeded: close the current part and start a new one.
        parts.push(pending.join("\n\n"));
        pending = [para];
        pendingTokens = cost;
    }
    if (pending.length > 0) {
        parts.push(pending.join("\n\n"));
    }
    return parts;
}
|
|
82
|
+
/**
 * Removes a leading front-matter block from markdown.
 *
 * The block must open with `---` on the first line and close with `---` on a
 * line of its own. Requiring the closing delimiter to occupy a whole line
 * keeps a `---` that appears inside a front-matter value (for example
 * `title: a --- b`) from being mistaken for the end of the block.
 *
 * @param markdown Markdown text, possibly starting with front matter.
 * @returns The trimmed remainder when a front-matter block was found;
 *          otherwise the input unchanged.
 */
function stripFrontmatter(markdown) {
    const block = markdown.match(/^---[ \t]*\r?\n(?:[\s\S]*?\r?\n)?---[ \t]*(?:\r?\n|$)/);
    if (!block) {
        return markdown;
    }
    return markdown.slice(block[0].length).trim();
}
|
|
91
|
+
/**
 * Converts a markdown page into chunks, one or more per heading section.
 *
 * Front matter is stripped first. Each section is rendered as its heading
 * line followed by its body; sections whose estimated size is within
 * `MAX_TOKENS` become one chunk, larger ones are split at paragraph
 * boundaries and each part gets a `-<partIndex>` slug suffix. Chunk ids are
 * made unique within the page by appending a counter on collision.
 *
 * @param markdown Page markdown.
 * @param source Source name stored on every chunk and used in ids.
 * @param url Page URL stored on every chunk and used in ids.
 * @param title Page title stored on every chunk.
 * @returns Chunks in document order.
 */
export function chunkMarkdown(markdown, source, url, title) {
    const sections = parseHeadingSections(stripFrontmatter(markdown));
    const chunks = [];
    const usedIds = new Set();
    // Reserves and returns the first free id for `slug`: the bare id, then
    // the id suffixed with 1, 2, 3, ...
    const claimId = (slug) => {
        let candidate = buildChunkId(source, url, slug);
        for (let n = 1; usedIds.has(candidate); n++) {
            candidate = buildChunkId(source, url, slug, n);
        }
        usedIds.add(candidate);
        return candidate;
    };
    const emit = (slug, body, headingPath) => {
        chunks.push({
            id: claimId(slug),
            source,
            url,
            title,
            heading_path: headingPath,
            content: body,
            token_count: estimateTokens(body),
        });
    };
    sections.forEach((section) => {
        const prefix = section.heading
            ? `${"#".repeat(section.level)} ${section.heading}\n\n`
            : "";
        const content = prefix + section.lines.join("\n").trim();
        if (!content.trim()) {
            return;
        }
        // Text before the first heading has no heading to slugify.
        const headingSlug = section.heading
            ? slugifyHeading(section.heading)
            : "intro";
        if (estimateTokens(content) <= MAX_TOKENS) {
            emit(headingSlug, content, section.headingPath);
            return;
        }
        const parts = splitAtParagraphBoundaries(content, MAX_TOKENS);
        parts.forEach((part, i) => {
            const partContent = part.trim();
            if (!partContent) {
                return;
            }
            const partSlug = parts.length > 1 ? `${headingSlug}-${i}` : headingSlug;
            emit(partSlug, partContent, section.headingPath);
        });
    });
    return chunks;
}
|
|
153
|
+
//# sourceMappingURL=chunker.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunker.js","sourceRoot":"","sources":["../src/chunker.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC;AAK1C,MAAM,UAAU,GAAG,GAAG,CAAC;AAEvB,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;AACpC,CAAC;AAED,MAAM,UAAU,cAAc,CAAC,OAAe;IAC5C,OAAO,OAAO;SACX,WAAW,EAAE;SACb,OAAO,CAAC,eAAe,EAAE,EAAE,CAAC;SAC5B,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC;SACnB,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;AAC3B,CAAC;AAED,MAAM,UAAU,YAAY,CAC1B,MAAc,EACd,GAAW,EACX,WAAmB,EACnB,KAAc;IAEd,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC;IAChC,MAAM,IAAI,GAAG,GAAG,MAAM,KAAK,OAAO,KAAK,WAAW,EAAE,CAAC;IACrD,OAAO,KAAK,KAAK,SAAS,CAAC,CAAC,CAAC,GAAG,IAAI,IAAI,KAAK,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;AACzD,CAAC;AASD,SAAS,oBAAoB,CAAC,QAAgB;IAC5C,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACnC,MAAM,QAAQ,GAAqB,EAAE,CAAC;IACtC,MAAM,YAAY,GAAa,EAAE,CAAC;IAClC,MAAM,UAAU,GAAa,EAAE,CAAC;IAEhC,IAAI,cAAc,GAAmB;QACnC,KAAK,EAAE,CAAC;QACR,OAAO,EAAE,EAAE;QACX,WAAW,EAAE,EAAE;QACf,KAAK,EAAE,EAAE;KACV,CAAC;IAEF,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;QAErD,IAAI,YAAY,EAAE,CAAC;YACjB,IAAI,cAAc,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,cAAc,CAAC,OAAO,KAAK,EAAE,EAAE,CAAC;gBACrE,QAAQ,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YAChC,CAAC;YAED,MAAM,KAAK,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;YACrC,MAAM,OAAO,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAEvC,OAAO,UAAU,CAAC,MAAM,GAAG,CAAC,IAAI,UAAU,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,IAAI,KAAK,EAAE,CAAC;gBAC3E,UAAU,CAAC,GAAG,EAAE,CAAC;gBACjB,YAAY,CAAC,GAAG,EAAE,CAAC;YACrB,CAAC;YAED,YAAY,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YAC3B,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAEvB,cAAc,GAAG;gBACf,KAAK;gBACL,OAAO;gBACP,WAAW,EAAE,CAAC,GAAG,YAAY,CAAC;gBAC9B,KAAK,EAAE,EAAE;aACV,CAAC;QACJ,CAAC;aAAM,CAAC;YACN,cAAc,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAClC,CAAC;IACH,CAAC;IAED,IAAI,cAAc,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,cAAc,CAAC,OAAO,KAAK,EAAE,EAAE,CAAC;QACrE,QAAQ,CAAC,I
AAI,CAAC,cAAc,CAAC,CAAC;IAChC,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,SAAS,0BAA0B,CAAC,IAAY,EAAE,SAAiB;IACjE,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IACvC,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,OAAO,GAAa,EAAE,CAAC;IAC3B,IAAI,aAAa,GAAG,CAAC,CAAC;IAEtB,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;QAC9B,MAAM,UAAU,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;QAExC,IAAI,aAAa,GAAG,UAAU,GAAG,SAAS,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACjE,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC;YACjC,OAAO,GAAG,CAAC,IAAI,CAAC,CAAC;YACjB,aAAa,GAAG,UAAU,CAAC;QAC7B,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACnB,aAAa,IAAI,UAAU,CAAC;QAC9B,CAAC;IACH,CAAC;IAED,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvB,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC;IACnC,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,gBAAgB,CAAC,QAAgB;IACxC,IAAI,QAAQ,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;QAC/B,MAAM,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QAC5C,IAAI,QAAQ,KAAK,CAAC,CAAC,EAAE,CAAC;YACpB,OAAO,QAAQ,CAAC,KAAK,CAAC,QAAQ,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAC7C,CAAC;IACH,CAAC;IACD,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,MAAM,UAAU,aAAa,CAC3B,QAAgB,EAChB,MAAc,EACd,GAAW,EACX,KAAa;IAEb,MAAM,QAAQ,GAAG,gBAAgB,CAAC,QAAQ,CAAC,CAAC;IAC5C,MAAM,QAAQ,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IAChD,MAAM,MAAM,GAAY,EAAE,CAAC;IAC3B,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;IAElC,SAAS,QAAQ,CAAC,QAAgB;QAChC,IAAI,EAAE,GAAG,YAAY,CAAC,MAAM,EAAE,GAAG,EAAE,QAAQ,CAAC,CAAC;QAC7C,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC;YACrB,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAChB,OAAO,EAAE,CAAC;QACZ,CAAC;QACD,IAAI,OAAO,GAAG,CAAC,CAAC;QAChB,OAAO,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,MAAM,EAAE,GAAG,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC,EAAE,CAAC;YACjE,OAAO,EAAE,CAAC;QACZ,CAAC;QACD,EAAE,GAAG,YAAY,CAAC,MAAM,EAAE,GAAG,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC;QAClD,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QAChB,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,MAAM,WAAW,GAAG,OAAO,CAAC,OAAO;YACjC,CAAC,CAAC,GAAG,GAAG,CAAC,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,
IAAI,OAAO,CAAC,OAAO,MAAM;YACvD,CAAC,CAAC,EAAE,CAAC;QACP,MAAM,OAAO,GAAG,WAAW,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;QAE9D,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE;YAAE,SAAS;QAE9B,MAAM,WAAW,GAAG,OAAO,CAAC,OAAO;YACjC,CAAC,CAAC,cAAc,CAAC,OAAO,CAAC,OAAO,CAAC;YACjC,CAAC,CAAC,OAAO,CAAC;QAEZ,MAAM,MAAM,GAAG,cAAc,CAAC,OAAO,CAAC,CAAC;QAEvC,IAAI,MAAM,IAAI,UAAU,EAAE,CAAC;YACzB,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,QAAQ,CAAC,WAAW,CAAC;gBACzB,MAAM;gBACN,GAAG;gBACH,KAAK;gBACL,YAAY,EAAE,OAAO,CAAC,WAAW;gBACjC,OAAO;gBACP,WAAW,EAAE,MAAM;aACpB,CAAC,CAAC;QACL,CAAC;aAAM,CAAC;YACN,MAAM,KAAK,GAAG,0BAA0B,CAAC,OAAO,EAAE,UAAU,CAAC,CAAC;YAC9D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACtC,MAAM,WAAW,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBACpC,IAAI,CAAC,WAAW;oBAAE,SAAS;gBAE3B,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,WAAW,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC;gBACxE,MAAM,CAAC,IAAI,CAAC;oBACV,EAAE,EAAE,QAAQ,CAAC,QAAQ,CAAC;oBACtB,MAAM;oBACN,GAAG;oBACH,KAAK;oBACL,YAAY,EAAE,OAAO,CAAC,WAAW;oBACjC,OAAO,EAAE,WAAW;oBACpB,WAAW,EAAE,cAAc,CAAC,WAAW,CAAC;iBACzC,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
package/dist/cli.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":""}
|