@astrofoundry/grimoire 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +148 -0
- package/dist/apikey.d.ts +5 -0
- package/dist/apikey.d.ts.map +1 -0
- package/dist/apikey.js +85 -0
- package/dist/apikey.js.map +1 -0
- package/dist/chunker.d.ts +7 -0
- package/dist/chunker.d.ts.map +1 -0
- package/dist/chunker.js +153 -0
- package/dist/chunker.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +496 -0
- package/dist/cli.js.map +1 -0
- package/dist/config.d.ts +18 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +76 -0
- package/dist/config.js.map +1 -0
- package/dist/consumer-config.d.ts +11 -0
- package/dist/consumer-config.d.ts.map +1 -0
- package/dist/consumer-config.js +58 -0
- package/dist/consumer-config.js.map +1 -0
- package/dist/consumer.d.ts +8 -0
- package/dist/consumer.d.ts.map +1 -0
- package/dist/consumer.js +71 -0
- package/dist/consumer.js.map +1 -0
- package/dist/converter.d.ts +12 -0
- package/dist/converter.d.ts.map +1 -0
- package/dist/converter.js +95 -0
- package/dist/converter.js.map +1 -0
- package/dist/embedder.d.ts +3 -0
- package/dist/embedder.d.ts.map +1 -0
- package/dist/embedder.js +38 -0
- package/dist/embedder.js.map +1 -0
- package/dist/format.d.ts +5 -0
- package/dist/format.d.ts.map +1 -0
- package/dist/format.js +6 -0
- package/dist/format.js.map +1 -0
- package/dist/reranker.d.ts +6 -0
- package/dist/reranker.d.ts.map +1 -0
- package/dist/reranker.js +21 -0
- package/dist/reranker.js.map +1 -0
- package/dist/scraper.d.ts +9 -0
- package/dist/scraper.d.ts.map +1 -0
- package/dist/scraper.js +77 -0
- package/dist/scraper.js.map +1 -0
- package/dist/search.d.ts +8 -0
- package/dist/search.d.ts.map +1 -0
- package/dist/search.js +43 -0
- package/dist/search.js.map +1 -0
- package/dist/store.d.ts +11 -0
- package/dist/store.d.ts.map +1 -0
- package/dist/store.js +102 -0
- package/dist/store.js.map +1 -0
- package/dist/types.d.ts +25 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +47 -0
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import { readFile, writeFile, mkdir } from "node:fs/promises";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import { homedir } from "node:os";
|
|
4
|
+
import { createInterface } from "node:readline";
|
|
5
|
+
// Per-user configuration directory: ".grimoire" inside the current user's home directory.
const CONFIG_DIR = join(homedir(), ".grimoire");
|
|
6
|
+
// JSON file inside CONFIG_DIR that stores the consumer settings ({ apiUrl, apiKey }).
const CONFIG_FILE = join(CONFIG_DIR, "config.json");
|
|
7
|
+
/**
 * Loads the saved consumer configuration from CONFIG_FILE.
 *
 * @returns {Promise<{ apiUrl: string, apiKey: string } | null>} The stored
 *   settings, or `null` when the file is missing, unreadable, empty, not valid
 *   JSON, or does not contain string `apiUrl` and `apiKey` fields.
 */
export async function loadConsumerConfig() {
    // A missing or unreadable file is treated the same as "not configured".
    const raw = await readFile(CONFIG_FILE, "utf-8").catch(() => null);
    if (!raw)
        return null;
    let data;
    try {
        data = JSON.parse(raw);
    }
    catch {
        // A corrupt file must not crash callers; in particular `cmdInit` calls
        // this function first and needs to be able to overwrite a bad file.
        return null;
    }
    // JSON.parse can legally yield `null` or a primitive; guard before reading fields.
    if (data !== null &&
        typeof data === "object" &&
        typeof data.apiUrl === "string" &&
        typeof data.apiKey === "string") {
        return { apiUrl: data.apiUrl, apiKey: data.apiKey };
    }
    return null;
}
|
|
17
|
+
/**
 * Persists the consumer configuration to CONFIG_FILE as pretty-printed JSON,
 * creating CONFIG_DIR first when necessary.
 *
 * The file contains the API key, so newly created files and directories are
 * restricted to the current user (0600 / 0700). The `mode` options only take
 * effect when the file or directory is created; existing permissions are left
 * untouched.
 *
 * @param config Object with `apiUrl` and `apiKey` to store.
 * @returns {Promise<void>}
 */
export async function saveConsumerConfig(config) {
    await mkdir(CONFIG_DIR, { recursive: true, mode: 0o700 });
    const serialized = JSON.stringify(config, null, 2) + "\n";
    await writeFile(CONFIG_FILE, serialized, { encoding: "utf-8", mode: 0o600 });
}
|
|
21
|
+
/**
 * Resolves the consumer configuration to use for API calls.
 *
 * The GRIMOIRE_API_URL and GRIMOIRE_API_KEY environment variables win when
 * both are set; otherwise the saved config file is used.
 *
 * @returns {Promise<{ apiUrl: string, apiKey: string }>}
 * @throws {Error} When neither the environment nor the config file provides a configuration.
 */
export async function resolveConsumerConfig() {
    const { GRIMOIRE_API_URL: apiUrl, GRIMOIRE_API_KEY: apiKey } = process.env;
    if (apiUrl && apiKey) {
        return { apiUrl, apiKey };
    }
    const saved = await loadConsumerConfig();
    if (!saved) {
        throw new Error("Grimoire is not configured. Run 'grimoire init' to set up.");
    }
    return saved;
}
|
|
32
|
+
/**
 * Synchronous consumer-mode check based only on the environment.
 *
 * @returns {boolean} `true` when GRIMOIRE_API_URL is set to a non-empty value.
 */
export function isConsumerMode() {
    return Boolean(process.env.GRIMOIRE_API_URL);
}
|
|
35
|
+
/**
 * Asynchronous consumer-mode check: the environment first, then the saved config file.
 *
 * @returns {Promise<boolean>} `true` when GRIMOIRE_API_URL is set, or when
 *   `loadConsumerConfig()` yields a configuration.
 */
export async function detectConsumerMode() {
    return process.env.GRIMOIRE_API_URL
        ? true
        : (await loadConsumerConfig()) !== null;
}
|
|
41
|
+
/**
 * Interactive `init` command: prompts for the API URL and API key on
 * stdin/stdout and saves them to the config file.
 *
 * Existing values are offered as defaults (the key is masked in the prompt);
 * pressing Enter keeps them.
 *
 * @returns {Promise<void>}
 * @throws {Error} When either value is still empty after prompting.
 */
export async function cmdInit() {
    const rl = createInterface({ input: process.stdin, output: process.stdout });
    const ask = (q) => new Promise((resolve) => rl.question(q, resolve));
    let config;
    try {
        const existing = await loadConsumerConfig();
        const apiUrl = await ask(`API URL${existing ? ` [${existing.apiUrl}]` : ""}: `);
        const apiKey = await ask(`API Key${existing ? " [****]" : ""}: `);
        config = {
            apiUrl: apiUrl.trim() || existing?.apiUrl || "",
            apiKey: apiKey.trim() || existing?.apiKey || "",
        };
    }
    finally {
        // Always release stdin, even if loading or prompting fails; an open
        // readline interface would otherwise keep the process alive.
        rl.close();
    }
    if (!config.apiUrl || !config.apiKey) {
        throw new Error("Both API URL and API Key are required.");
    }
    await saveConsumerConfig(config);
    console.log(`\nSaved to ${CONFIG_FILE}`);
}
|
|
58
|
+
//# sourceMappingURL=consumer-config.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"consumer-config.js","sourceRoot":"","sources":["../src/consumer-config.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AAC9D,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAClC,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAEhD,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,EAAE,EAAE,WAAW,CAAC,CAAC;AAChD,MAAM,WAAW,GAAG,IAAI,CAAC,UAAU,EAAE,aAAa,CAAC,CAAC;AAOpD,MAAM,CAAC,KAAK,UAAU,kBAAkB;IACtC,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,CAAC;IACnE,IAAI,CAAC,GAAG;QAAE,OAAO,IAAI,CAAC;IACtB,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAC7B,IAAI,OAAO,IAAI,CAAC,MAAM,KAAK,QAAQ,IAAI,OAAO,IAAI,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;QACvE,OAAO,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,CAAC;IACtD,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,kBAAkB,CAAC,MAAsB;IAC7D,MAAM,KAAK,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC7C,MAAM,SAAS,CAAC,WAAW,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,GAAG,IAAI,EAAE,OAAO,CAAC,CAAC;AAChF,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,qBAAqB;IACzC,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC;IAC5C,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC;IAE5C,IAAI,MAAM,IAAI,MAAM,EAAE,CAAC;QACrB,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC;IAC5C,CAAC;IAED,MAAM,UAAU,GAAG,MAAM,kBAAkB,EAAE,CAAC;IAC9C,IAAI,UAAU;QAAE,OAAO,UAAU,CAAC;IAElC,MAAM,IAAI,KAAK,CAAC,4DAA4D,CAAC,CAAC;AAChF,CAAC;AAED,MAAM,UAAU,cAAc;IAC5B,OAAO,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC;AACxC,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,kBAAkB;IACtC,IAAI,OAAO,CAAC,GAAG,CAAC,gBAAgB;QAAE,OAAO,IAAI,CAAC;IAC9C,MAAM,MAAM,GAAG,MAAM,kBAAkB,EAAE,CAAC;IAC1C,OAAO,MAAM,KAAK,IAAI,CAAC;AACzB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,OAAO;IAC3B,MAAM,EAAE,GAAG,eAAe,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;IAC7E,MAAM,GAAG,GAAG,CAAC,CAAS,EAAmB,EAAE,CACzC,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,EAAE,CAAC,QAAQ,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CA
AC;IAEpD,MAAM,QAAQ,GAAG,MAAM,kBAAkB,EAAE,CAAC;IAE5C,MAAM,MAAM,GAAG,MAAM,GAAG,CAAC,UAAU,QAAQ,CAAC,CAAC,CAAC,KAAK,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;IAChF,MAAM,MAAM,GAAG,MAAM,GAAG,CAAC,UAAU,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;IAElE,MAAM,MAAM,GAAmB;QAC7B,MAAM,EAAE,MAAM,CAAC,IAAI,EAAE,IAAI,QAAQ,EAAE,MAAM,IAAI,EAAE;QAC/C,MAAM,EAAE,MAAM,CAAC,IAAI,EAAE,IAAI,QAAQ,EAAE,MAAM,IAAI,EAAE;KAChD,CAAC;IAEF,EAAE,CAAC,KAAK,EAAE,CAAC;IAEX,IAAI,CAAC,MAAM,CAAC,MAAM,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;QACrC,MAAM,IAAI,KAAK,CAAC,wCAAwC,CAAC,CAAC;IAC5D,CAAC;IAED,MAAM,kBAAkB,CAAC,MAAM,CAAC,CAAC;IACjC,OAAO,CAAC,GAAG,CAAC,cAAc,WAAW,EAAE,CAAC,CAAC;AAC3C,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { ConsumerConfig } from "./consumer-config.js";
|
|
2
|
+
/**
 * Sends `query` to the remote API's `/search` endpoint and prints the results to stdout.
 *
 * @param config API URL and key used for the request.
 * @param query Search text.
 * @param options Optional `source` and `topN` values forwarded in the request body.
 */
export declare function cmdConsumerSearch(
    config: ConsumerConfig,
    query: string,
    options: { source?: string; topN?: number },
): Promise<void>;
|
|
6
|
+
/** Fetches `/list` from the remote API and prints each source with its chunk count, URL count and last refresh. */
export declare function cmdConsumerList(config: ConsumerConfig): Promise<void>;
|
|
7
|
+
/** Fetches `/stats` from the remote API and prints per-source statistics followed by overall totals. */
export declare function cmdConsumerStats(config: ConsumerConfig): Promise<void>;
|
|
8
|
+
//# sourceMappingURL=consumer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"consumer.d.ts","sourceRoot":"","sources":["../src/consumer.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AA+B3D,wBAAsB,iBAAiB,CACrC,MAAM,EAAE,cAAc,EACtB,KAAK,EAAE,MAAM,EACb,OAAO,EAAE;IAAE,MAAM,CAAC,EAAE,MAAM,CAAC;IAAC,IAAI,CAAC,EAAE,MAAM,CAAA;CAAE,GAC1C,OAAO,CAAC,IAAI,CAAC,CAmBf;AAED,wBAAsB,eAAe,CAAC,MAAM,EAAE,cAAc,GAAG,OAAO,CAAC,IAAI,CAAC,CAa3E;AAED,wBAAsB,gBAAgB,CAAC,MAAM,EAAE,cAAc,GAAG,OAAO,CAAC,IAAI,CAAC,CAiB5E"}
|
package/dist/consumer.js
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import { bold, cyan, yellow } from "./format.js";
|
|
2
|
+
/**
 * Performs an authenticated JSON request against the Grimoire API.
 *
 * @param config Object with `apiUrl` (one trailing slash is stripped) and `apiKey`.
 * @param path Path appended to the base URL, e.g. "/search".
 * @param options Optional fetch init; its `headers` override the defaults.
 * @returns The parsed JSON response body.
 * @throws {Error} When the request cannot be sent, the key is rejected
 *   (401/403), or the response status is otherwise not OK.
 */
async function apiRequest(config, path, options) {
    const base = config.apiUrl.replace(/\/$/, "");
    const endpoint = `${base}${path}`;
    const send = async () => {
        try {
            return await fetch(endpoint, {
                ...options,
                headers: {
                    "Content-Type": "application/json",
                    "x-api-key": config.apiKey,
                    ...options?.headers,
                },
            });
        }
        catch {
            throw new Error(`Cannot reach Grimoire API at ${config.apiUrl}. Check your GRIMOIRE_API_URL.`);
        }
    };
    const res = await send();
    const rejectedKey = res.status === 401 || res.status === 403;
    if (rejectedKey) {
        throw new Error("Invalid API key. Check your GRIMOIRE_API_KEY or run 'grimoire init'.");
    }
    if (!res.ok) {
        throw new Error(`API error: ${res.status} ${res.statusText}`);
    }
    return res.json();
}
|
|
26
|
+
/**
 * Runs a search against the remote API and prints the ranked hits.
 *
 * For each hit it prints the 1-based rank, title and relevance score, then the
 * URL, the heading path joined with " > ", and the first 200 characters of the
 * content with newlines replaced by spaces.
 *
 * @param config API URL and key.
 * @param query Search text.
 * @param options `source` and `topN` values forwarded to the API.
 */
export async function cmdConsumerSearch(config, query, options) {
    const payload = { query, source: options.source, topN: options.topN };
    const { results } = await apiRequest(config, "/search", {
        method: "POST",
        body: JSON.stringify(payload),
    });
    if (results.length === 0) {
        console.log("No results found.");
        return;
    }
    for (const [index, hit] of results.entries()) {
        const relevance = hit.relevance_score;
        // Very small scores are shown in exponential notation so they do not print as 0.0000.
        const score = relevance < 0.001 ? relevance.toExponential(2) : relevance.toFixed(4);
        const preview = hit.content.slice(0, 200).replace(/\n/g, " ");
        console.log(`\n${bold(`[${index + 1}] ${hit.title}`)} (${score})`);
        console.log(` ${cyan(hit.url)}`);
        console.log(` ${yellow(hit.heading_path.join(" > "))}`);
        console.log(` ${preview}...`);
    }
}
|
|
44
|
+
/**
 * Prints every source known to the remote API, with chunk count, URL count and
 * last-refreshed value.
 *
 * @param config API URL and key.
 */
export async function cmdConsumerList(config) {
    const { sources } = await apiRequest(config, "/list");
    if (sources.length === 0) {
        console.log("No sources available.");
        return;
    }
    console.log("\nSources:\n");
    for (const entry of sources) {
        const { source, chunk_count, url_count, last_refreshed } = entry;
        console.log(` ${bold(source)}`);
        console.log(` ${chunk_count} chunks, ${url_count} URLs, last refreshed ${last_refreshed}`);
    }
}
|
|
56
|
+
/**
 * Prints per-source statistics from the remote API followed by a totals line.
 *
 * @param config API URL and key.
 */
export async function cmdConsumerStats(config) {
    const stats = await apiRequest(config, "/stats");
    const sourceCount = stats.sources.length;
    if (sourceCount === 0) {
        console.log("No sources have been refreshed yet.");
        return;
    }
    console.log("\nSource Statistics:\n");
    for (const entry of stats.sources) {
        console.log(` ${bold(entry.source)}`);
        console.log(` Chunks: ${entry.chunk_count}`);
        console.log(` URLs: ${entry.url_count}`);
        console.log(` Last refreshed: ${entry.last_refreshed}`);
    }
    console.log(`\n Total: ${stats.totalChunks} chunks across ${stats.totalUrls} URLs from ${sourceCount} sources`);
}
|
|
71
|
+
//# sourceMappingURL=consumer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"consumer.js","sourceRoot":"","sources":["../src/consumer.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAEjD,KAAK,UAAU,UAAU,CAAI,MAAsB,EAAE,IAAY,EAAE,OAAqB;IACtF,MAAM,GAAG,GAAG,GAAG,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,GAAG,IAAI,EAAE,CAAC;IAEzD,IAAI,QAAkB,CAAC;IACvB,IAAI,CAAC;QACH,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAC1B,GAAG,OAAO;YACV,OAAO,EAAE;gBACP,cAAc,EAAE,kBAAkB;gBAClC,WAAW,EAAE,MAAM,CAAC,MAAM;gBAC1B,GAAG,OAAO,EAAE,OAAO;aACpB;SACF,CAAC,CAAC;IACL,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CAAC,gCAAgC,MAAM,CAAC,MAAM,gCAAgC,CAAC,CAAC;IACjG,CAAC;IAED,IAAI,QAAQ,CAAC,MAAM,KAAK,GAAG,IAAI,QAAQ,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;QACvD,MAAM,IAAI,KAAK,CAAC,sEAAsE,CAAC,CAAC;IAC1F,CAAC;IAED,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,MAAM,IAAI,KAAK,CAAC,cAAc,QAAQ,CAAC,MAAM,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;IAC1E,CAAC;IAED,OAAO,QAAQ,CAAC,IAAI,EAAgB,CAAC;AACvC,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,MAAsB,EACtB,KAAa,EACb,OAA2C;IAE3C,MAAM,IAAI,GAAG,MAAM,UAAU,CAA8B,MAAM,EAAE,SAAS,EAAE;QAC5E,MAAM,EAAE,MAAM;QACd,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO,CAAC,MAAM,EAAE,IAAI,EAAE,OAAO,CAAC,IAAI,EAAE,CAAC;KAC5E,CAAC,CAAC;IAEH,IAAI,IAAI,CAAC,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC9B,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,CAAC;QACjC,OAAO;IACT,CAAC;IAED,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC7C,MAAM,CAAC,GAAG,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QAC1B,MAAM,KAAK,GAAG,CAAC,CAAC,eAAe,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,eAAe,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,eAAe,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QAC5G,OAAO,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE,CAAC,KAAK,KAAK,GAAG,CAAC,CAAC;QAC7D,OAAO,CAAC,GAAG,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QAClC,OAAO,CAAC,GAAG,CAAC,OAAO,MAAM,CAAC,CAAC,CAAC,YAAY,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC;QACzD,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,
EAAE,GAAG,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,KAAK,CAAC,CAAC;IACvE,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,MAAsB;IAC1D,MAAM,IAAI,GAAG,MAAM,UAAU,CAA4B,MAAM,EAAE,OAAO,CAAC,CAAC;IAE1E,IAAI,IAAI,CAAC,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC9B,OAAO,CAAC,GAAG,CAAC,uBAAuB,CAAC,CAAC;QACrC,OAAO;IACT,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC;IAC5B,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;QAC7B,OAAO,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QACnC,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,WAAW,YAAY,CAAC,CAAC,SAAS,yBAAyB,CAAC,CAAC,cAAc,EAAE,CAAC,CAAC;IACtG,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,gBAAgB,CAAC,MAAsB;IAC3D,MAAM,IAAI,GAAG,MAAM,UAAU,CAAoE,MAAM,EAAE,QAAQ,CAAC,CAAC;IAEnH,IAAI,IAAI,CAAC,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC9B,OAAO,CAAC,GAAG,CAAC,qCAAqC,CAAC,CAAC;QACnD,OAAO;IACT,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,wBAAwB,CAAC,CAAC;IACtC,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;QAC7B,OAAO,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QACnC,OAAO,CAAC,GAAG,CAAC,eAAe,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC;QAC5C,OAAO,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC;QACxC,OAAO,CAAC,GAAG,CAAC,uBAAuB,CAAC,CAAC,cAAc,EAAE,CAAC,CAAC;IACzD,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,cAAc,IAAI,CAAC,WAAW,kBAAkB,IAAI,CAAC,SAAS,cAAc,IAAI,CAAC,OAAO,CAAC,MAAM,UAAU,CAAC,CAAC;AACzH,CAAC"}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/** Result of converting one scraped HTML page to Markdown. */
export interface ConvertedPage {
    /** Name of the source the page belongs to. */
    source: string;
    /** URL the page was fetched from. */
    url: string;
    /** Page title as returned by `extractTitle`. */
    title: string;
    /** Frontmatter block followed by a blank line and the converted content. */
    markdown: string;
}
|
|
7
|
+
/**
 * Converts the element matching `contentSelector` to cleaned Markdown; when the
 * selector matches nothing the document body is converted instead.
 * `removeSelectors` are CSS selectors to strip and `removeTextPatterns` are
 * regular-expression sources removed from the resulting Markdown.
 */
export declare function extractContent(html: string, contentSelector: string, removeSelectors?: string[], removeTextPatterns?: string[]): string;
|
|
8
|
+
/** Returns the text of the document's `<title>` with a trailing separator-delimited suffix removed, or "Untitled" when there is no title element. */
export declare function extractTitle(html: string): string;
|
|
9
|
+
/** Builds a `---`-delimited frontmatter block with `source`, `url`, `title` and a `fetched_at` timestamp taken at call time. */
export declare function buildFrontmatter(source: string, url: string, title: string): string;
|
|
10
|
+
/** Converts one HTML document into a `ConvertedPage`: extracts the title and content, then prefixes the content with frontmatter. */
export declare function convertPage(html: string, source: string, url: string, contentSelector: string, removeSelectors?: string[], removeTextPatterns?: string[]): ConvertedPage;
|
|
11
|
+
/**
 * Converts the raw HTML files stored under `<dataDir>/raw/<sourceName>` for the
 * given `urls` into Markdown files under `<dataDir>/markdown/<sourceName>`,
 * using up to `concurrency` parallel workers. `onProgress` is called after each
 * page; the result is in the same order as `urls`.
 */
export declare function convertSource(sourceName: string, urls: string[], contentSelector: string, removeSelectors: string[] | undefined, removeTextPatterns: string[] | undefined, dataDir: string, concurrency?: number, onProgress?: (current: number, total: number, url: string) => void): Promise<ConvertedPage[]>;
|
|
12
|
+
//# sourceMappingURL=converter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"converter.d.ts","sourceRoot":"","sources":["../src/converter.ts"],"names":[],"mappings":"AAYA,MAAM,WAAW,aAAa;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,MAAM,CAAC;CAClB;AAyBD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,eAAe,EAAE,MAAM,EACvB,eAAe,CAAC,EAAE,MAAM,EAAE,EAC1B,kBAAkB,CAAC,EAAE,MAAM,EAAE,GAC5B,MAAM,CAiBR;AAED,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAKjD;AAED,wBAAgB,gBAAgB,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,GAAG,MAAM,CASnF;AAED,wBAAgB,WAAW,CACzB,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,MAAM,EACd,GAAG,EAAE,MAAM,EACX,eAAe,EAAE,MAAM,EACvB,eAAe,CAAC,EAAE,MAAM,EAAE,EAC1B,kBAAkB,CAAC,EAAE,MAAM,EAAE,GAC5B,aAAa,CAOf;AAID,wBAAsB,aAAa,CACjC,UAAU,EAAE,MAAM,EAClB,IAAI,EAAE,MAAM,EAAE,EACd,eAAe,EAAE,MAAM,EACvB,eAAe,EAAE,MAAM,EAAE,GAAG,SAAS,EACrC,kBAAkB,EAAE,MAAM,EAAE,GAAG,SAAS,EACxC,OAAO,EAAE,MAAM,EACf,WAAW,SAAsB,EACjC,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,KAAK,IAAI,GACjE,OAAO,CAAC,aAAa,EAAE,CAAC,CAgC1B"}
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import { readFile, writeFile, mkdir } from "node:fs/promises";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import { JSDOM } from "jsdom";
|
|
4
|
+
import TurndownService from "turndown";
|
|
5
|
+
import { slugifyUrl } from "./scraper.js";
|
|
6
|
+
// Shared HTML-to-Markdown converter: ATX ("#") headings, fenced code blocks, "-" bullets.
const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced", bulletListMarker: "-" });
|
|
11
|
+
// Selectors for elements that are always stripped from the content before conversion.
const GENERIC_REMOVE = ["style", "script", "noscript", "iframe", "svg"];
|
|
18
|
+
/**
 * Tidies Markdown produced by Turndown.
 *
 * Drops heading lines that contain only "#" markers, collapses runs of three
 * or more newlines to a single blank line, removes every match of the given
 * patterns (compiled with the "gm" flags), collapses again, and trims.
 *
 * @param md Markdown text to clean.
 * @param textPatterns Optional regular-expression sources to delete.
 * @returns The cleaned, trimmed Markdown.
 */
function cleanMarkdown(md, textPatterns) {
    const collapseBlankRuns = (text) => text.replace(/\n{3,}/g, "\n\n");
    const withoutEmptyHeadings = md.replace(/^(#+)\s*$/gm, "");
    let result = collapseBlankRuns(withoutEmptyHeadings);
    if (textPatterns) {
        result = textPatterns.reduce((acc, source) => acc.replace(new RegExp(source, "gm"), ""), result);
        // Deleting text can leave new blank-line runs behind.
        result = collapseBlankRuns(result);
    }
    return result.trim();
}
|
|
30
|
+
/**
 * Extracts the main content of an HTML document as cleaned Markdown.
 *
 * The element matching `contentSelector` is used as the root; when nothing
 * matches, the document body is used instead. In both cases the generic noise
 * elements (GENERIC_REMOVE) and any `removeSelectors` matches are stripped
 * before conversion. Previously the body fallback skipped this step.
 *
 * @param html Raw HTML document.
 * @param contentSelector CSS selector for the main content element.
 * @param removeSelectors Optional extra CSS selectors to remove from the root.
 * @param removeTextPatterns Optional regular-expression sources removed from the Markdown.
 * @returns Cleaned Markdown for the selected content.
 */
export function extractContent(html, contentSelector, removeSelectors, removeTextPatterns) {
    const dom = new JSDOM(html);
    const doc = dom.window.document;
    // Fall back to the whole body so a wrong selector still yields content.
    const root = doc.querySelector(contentSelector) ?? doc.body;
    const allSelectors = [...GENERIC_REMOVE, ...(removeSelectors ?? [])];
    for (const selector of allSelectors) {
        for (const el of root.querySelectorAll(selector)) {
            el.remove();
        }
    }
    return cleanMarkdown(turndown.turndown(root.innerHTML), removeTextPatterns);
}
|
|
45
|
+
/**
 * Extracts a page title from the document's `<title>` element.
 *
 * A trailing site-name suffix is stripped: everything from the first "|", en
 * dash or em dash, or from the first hyphen that has whitespace on both sides.
 * A hyphen inside a word ("Server-side rendering") is not a separator.
 *
 * @param html Raw HTML document.
 * @returns The cleaned title, or "Untitled" when the element is missing or
 *   nothing is left after stripping.
 */
export function extractTitle(html) {
    const dom = new JSDOM(html);
    const titleEl = dom.window.document.querySelector("title");
    const rawTitle = titleEl?.textContent ?? "";
    const title = rawTitle.replace(/\s*(?:[|–—]\s*|\s-\s+).+$/, "").trim();
    // `||` rather than `??`: an empty string must also fall back to "Untitled".
    return title || "Untitled";
}
|
|
52
|
+
/**
 * Builds the frontmatter block placed at the top of every converted page.
 *
 * `url` and `title` are written as double-quoted scalars via JSON.stringify,
 * so quotes, backslashes and line breaks are escaped and each value stays on
 * one line. For values without such characters the output is unchanged.
 * NOTE(review): any code that parses this frontmatter by hand should unescape
 * `\\` and `\n` as well as `\"` — confirm against the reader.
 *
 * @param source Source name, written unquoted.
 * @param url Page URL.
 * @param title Page title.
 * @returns The frontmatter text, delimited by "---" lines, without a trailing newline.
 */
export function buildFrontmatter(source, url, title) {
    return [
        "---",
        `source: ${source}`,
        `url: ${JSON.stringify(url)}`,
        `title: ${JSON.stringify(title)}`,
        `fetched_at: "${new Date().toISOString()}"`,
        "---",
    ].join("\n");
}
|
|
62
|
+
/**
 * Converts one HTML document into a page record.
 *
 * @param html Raw HTML document.
 * @param source Source name recorded in the frontmatter.
 * @param url Page URL recorded in the frontmatter.
 * @param contentSelector CSS selector for the main content element.
 * @param removeSelectors Optional CSS selectors to strip.
 * @param removeTextPatterns Optional regular-expression sources to delete from the Markdown.
 * @returns `{ source, url, title, markdown }`, where `markdown` is the
 *   frontmatter, a blank line, then the converted content.
 */
export function convertPage(html, source, url, contentSelector, removeSelectors, removeTextPatterns) {
    const title = extractTitle(html);
    const body = extractContent(html, contentSelector, removeSelectors, removeTextPatterns);
    const header = buildFrontmatter(source, url, title);
    return {
        source,
        url,
        title,
        markdown: header + "\n\n" + body,
    };
}
|
|
69
|
+
// Default number of parallel conversion workers used by convertSource.
const DEFAULT_CONCURRENCY = 10;
|
|
70
|
+
/**
 * Converts every scraped page of a source from raw HTML to Markdown.
 *
 * Reads `<dataDir>/raw/<sourceName>/<slug>.html` for each URL, converts it and
 * writes `<dataDir>/markdown/<sourceName>/<slug>.md`. Work is shared between a
 * pool of workers that pull the next index from a common counter.
 *
 * @param sourceName Source name; also the sub-directory name.
 * @param urls URLs whose raw HTML has already been saved.
 * @param contentSelector CSS selector for the main content element.
 * @param removeSelectors Optional CSS selectors to strip.
 * @param removeTextPatterns Optional regular-expression sources to delete from the Markdown.
 * @param dataDir Root data directory.
 * @param concurrency Maximum number of parallel workers. Values below 1, or
 *   NaN, are treated as 1; previously they started no workers and the result
 *   was an array of empty slots.
 * @param onProgress Optional callback `(completed, total, url)` invoked after each page.
 * @returns The converted pages, in the same order as `urls`.
 * @throws Rejects when a raw HTML file cannot be read or a Markdown file cannot be written.
 */
export async function convertSource(sourceName, urls, contentSelector, removeSelectors, removeTextPatterns, dataDir, concurrency = DEFAULT_CONCURRENCY, onProgress) {
    const rawDir = join(dataDir, "raw", sourceName);
    const mdDir = join(dataDir, "markdown", sourceName);
    await mkdir(mdDir, { recursive: true });
    const pages = new Array(urls.length);
    let completed = 0;
    let nextIndex = 0;
    async function worker() {
        // Claiming the index is synchronous, so two workers never take the same URL.
        while (nextIndex < urls.length) {
            const i = nextIndex++;
            const url = urls[i];
            const slug = slugifyUrl(url);
            const htmlPath = join(rawDir, `${slug}.html`);
            const html = await readFile(htmlPath, "utf-8");
            const page = convertPage(html, sourceName, url, contentSelector, removeSelectors, removeTextPatterns);
            await writeFile(join(mdDir, `${slug}.md`), page.markdown, "utf-8");
            pages[i] = page;
            completed++;
            onProgress?.(completed, urls.length, url);
        }
    }
    // At least one worker whenever there is work; never more workers than URLs.
    const requested = Math.max(1, Math.floor(concurrency) || 1);
    const workerCount = Math.min(requested, urls.length);
    const workers = Array.from({ length: workerCount }, () => worker());
    await Promise.all(workers);
    return pages;
}
|
|
95
|
+
//# sourceMappingURL=converter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"converter.js","sourceRoot":"","sources":["../src/converter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AAC9D,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,KAAK,EAAE,MAAM,OAAO,CAAC;AAC9B,OAAO,eAAe,MAAM,UAAU,CAAC;AACvC,OAAO,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC;AAE1C,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC;IACnC,YAAY,EAAE,KAAK;IACnB,cAAc,EAAE,QAAQ;IACxB,gBAAgB,EAAE,GAAG;CACtB,CAAC,CAAC;AASH,MAAM,cAAc,GAAG;IACrB,OAAO;IACP,QAAQ;IACR,UAAU;IACV,QAAQ;IACR,KAAK;CACN,CAAC;AAEF,SAAS,aAAa,CAAC,EAAU,EAAE,YAAuB;IACxD,IAAI,OAAO,GAAG,EAAE;SACb,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC;SAC1B,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAE9B,IAAI,YAAY,EAAE,CAAC;QACjB,KAAK,MAAM,OAAO,IAAI,YAAY,EAAE,CAAC;YACnC,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,OAAO,EAAE,IAAI,CAAC,EAAE,EAAE,CAAC,CAAC;QAC3D,CAAC;QACD,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAC/C,CAAC;IAED,OAAO,OAAO,CAAC,IAAI,EAAE,CAAC;AACxB,CAAC;AAED,MAAM,UAAU,cAAc,CAC5B,IAAY,EACZ,eAAuB,EACvB,eAA0B,EAC1B,kBAA6B;IAE7B,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,CAAC,CAAC;IAC5B,MAAM,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC;IAEhC,MAAM,SAAS,GAAG,GAAG,CAAC,aAAa,CAAC,eAAe,CAAC,CAAC;IACrD,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,OAAO,aAAa,CAAC,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,kBAAkB,CAAC,CAAC;IAClF,CAAC;IAED,MAAM,YAAY,GAAG,CAAC,GAAG,cAAc,EAAE,GAAG,CAAC,eAAe,IAAI,EAAE,CAAC,CAAC,CAAC;IACrE,KAAK,MAAM,QAAQ,IAAI,YAAY,EAAE,CAAC;QACpC,KAAK,MAAM,EAAE,IAAI,SAAS,CAAC,gBAAgB,CAAC,QAAQ,CAAC,EAAE,CAAC;YACtD,EAAE,CAAC,MAAM,EAAE,CAAC;QACd,CAAC;IACH,CAAC;IAED,OAAO,aAAa,CAAC,QAAQ,CAAC,QAAQ,CAAC,SAAS,CAAC,SAAS,CAAC,EAAE,kBAAkB,CAAC,CAAC;AACnF,CAAC;AAED,MAAM,UAAU,YAAY,CAAC,IAAY;IACvC,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,CAAC,CAAC;IAC5B,MAAM,OAAO,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;IAC3D,IAAI,CAAC,OAAO;QAAE,OAAO,UAAU,CAAC;IAChC,OAAO,OAAO,CAAC,WAAW,EAAE,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,IAAI,UAAU,CAAC;AAClF,CAAC;AAED,MAAM,UAAU,gBAAgB,CAAC,MAAc,EAAE,GAAW,EAAE,KAAa;IACzE,OA
AO;QACL,KAAK;QACL,WAAW,MAAM,EAAE;QACnB,SAAS,GAAG,GAAG;QACf,WAAW,KAAK,CAAC,OAAO,CAAC,IAAI,EAAE,KAAK,CAAC,GAAG;QACxC,gBAAgB,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,GAAG;QAC3C,KAAK;KACN,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACf,CAAC;AAED,MAAM,UAAU,WAAW,CACzB,IAAY,EACZ,MAAc,EACd,GAAW,EACX,eAAuB,EACvB,eAA0B,EAC1B,kBAA6B;IAE7B,MAAM,KAAK,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;IACjC,MAAM,OAAO,GAAG,cAAc,CAAC,IAAI,EAAE,eAAe,EAAE,eAAe,EAAE,kBAAkB,CAAC,CAAC;IAC3F,MAAM,WAAW,GAAG,gBAAgB,CAAC,MAAM,EAAE,GAAG,EAAE,KAAK,CAAC,CAAC;IACzD,MAAM,QAAQ,GAAG,GAAG,WAAW,OAAO,OAAO,EAAE,CAAC;IAEhD,OAAO,EAAE,MAAM,EAAE,GAAG,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;AAC1C,CAAC;AAED,MAAM,mBAAmB,GAAG,EAAE,CAAC;AAE/B,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,UAAkB,EAClB,IAAc,EACd,eAAuB,EACvB,eAAqC,EACrC,kBAAwC,EACxC,OAAe,EACf,WAAW,GAAG,mBAAmB,EACjC,UAAkE;IAElE,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,EAAE,KAAK,EAAE,UAAU,CAAC,CAAC;IAChD,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,EAAE,UAAU,EAAE,UAAU,CAAC,CAAC;IACpD,MAAM,KAAK,CAAC,KAAK,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAExC,MAAM,KAAK,GAAoB,IAAI,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACtD,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,KAAK,UAAU,MAAM;QACnB,OAAO,SAAS,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;YAC/B,MAAM,CAAC,GAAG,SAAS,EAAE,CAAC;YACtB,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;YACpB,MAAM,IAAI,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC;YAC7B,MAAM,QAAQ,GAAG,IAAI,CAAC,MAAM,EAAE,GAAG,IAAI,OAAO,CAAC,CAAC;YAC9C,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;YAE/C,MAAM,IAAI,GAAG,WAAW,CAAC,IAAI,EAAE,UAAU,EAAE,GAAG,EAAE,eAAe,EAAE,eAAe,EAAE,kBAAkB,CAAC,CAAC;YACtG,MAAM,SAAS,CAAC,IAAI,CAAC,KAAK,EAAE,GAAG,IAAI,KAAK,CAAC,EAAE,IAAI,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;YACnE,KAAK,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC;YAChB,SAAS,EAAE,CAAC;YACZ,UAAU,EAAE,CAAC,SAAS,EAAE,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QAC5C,CAAC;IACH,CAAC;IAED,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CACxB,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,IAAI,CAAC,MAAM,CAAC,EAAE,EAC9C,GAAG,EAAE,CAAC,MAAM,EAAE,CACf,CAAC;IACF,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;IAE3B,OAAO,KAAK,CAAC;AACf,CAAC
"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"embedder.d.ts","sourceRoot":"","sources":["../src/embedder.ts"],"names":[],"mappings":"AAmBA,wBAAsB,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAqBrE;AAED,wBAAsB,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,CAG/D"}
|
package/dist/embedder.js
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import { GoogleGenerativeAI } from "@google/generative-ai";
|
|
2
|
+
// Number of texts sent in a single batchEmbedContents request.
const BATCH_SIZE = 100;
|
|
3
|
+
// Model name passed to getGenerativeModel for embedding requests.
const MODEL = "gemini-embedding-001";
|
|
4
|
+
// Value sent as `outputDimensionality` with every embedding request.
const OUTPUT_DIMENSIONALITY = 768;
|
|
5
|
+
// Cached GoogleGenerativeAI client; created on the first getClient() call.
let genAI;
|
|
6
|
+
/**
 * Returns the shared GoogleGenerativeAI client, creating it on first use from
 * the GEMINI_API_KEY environment variable.
 *
 * @throws {Error} When GEMINI_API_KEY is not set and no client exists yet.
 */
function getClient() {
    if (genAI) {
        return genAI;
    }
    const apiKey = process.env.GEMINI_API_KEY;
    if (!apiKey) {
        throw new Error("GEMINI_API_KEY environment variable is not set");
    }
    genAI = new GoogleGenerativeAI(apiKey);
    return genAI;
}
|
|
16
|
+
/**
 * Embeds a list of texts, sending them in sequential batches of BATCH_SIZE.
 *
 * @param texts Texts to embed.
 * @returns One embedding (`values` array) per response entry, in batch order.
 */
export async function embedTexts(texts) {
    const model = getClient().getGenerativeModel({ model: MODEL });
    const embeddings = [];
    let offset = 0;
    while (offset < texts.length) {
        const requests = texts.slice(offset, offset + BATCH_SIZE).map((text) => ({
            content: { role: "user", parts: [{ text }] },
            outputDimensionality: OUTPUT_DIMENSIONALITY,
        }));
        const response = await model.batchEmbedContents({ requests });
        for (const { values } of response.embeddings) {
            embeddings.push(values);
        }
        offset += BATCH_SIZE;
    }
    return embeddings;
}
|
|
34
|
+
/**
 * Embeds a single text by delegating to `embedTexts`.
 *
 * @param text Text to embed.
 * @returns The first (and only) embedding returned for the text.
 */
export async function embedText(text) {
    const vectors = await embedTexts([text]);
    return vectors[0];
}
|
|
38
|
+
//# sourceMappingURL=embedder.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"embedder.js","sourceRoot":"","sources":["../src/embedder.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,kBAAkB,EAAE,MAAM,uBAAuB,CAAC;AAE3D,MAAM,UAAU,GAAG,GAAG,CAAC;AACvB,MAAM,KAAK,GAAG,sBAAsB,CAAC;AACrC,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAElC,IAAI,KAAqC,CAAC;AAE1C,SAAS,SAAS;IAChB,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC;QAC1C,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,MAAM,IAAI,KAAK,CAAC,gDAAgD,CAAC,CAAC;QACpE,CAAC;QACD,KAAK,GAAG,IAAI,kBAAkB,CAAC,MAAM,CAAC,CAAC;IACzC,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,KAAe;IAC9C,MAAM,MAAM,GAAG,SAAS,EAAE,CAAC;IAC3B,MAAM,KAAK,GAAG,MAAM,CAAC,kBAAkB,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,CAAC;IAE1D,MAAM,UAAU,GAAe,EAAE,CAAC;IAElC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,IAAI,UAAU,EAAE,CAAC;QAClD,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,CAAC;QAC7C,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,kBAAkB,CAAC;YAC5C,QAAQ,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;gBAC7B,OAAO,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE;gBAC5C,oBAAoB,EAAE,qBAAqB;aAC5C,CAAC,CAAC;SACJ,CAAC,CAAC;QAEH,KAAK,MAAM,SAAS,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;YAC1C,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QACpC,CAAC;IACH,CAAC;IAED,OAAO,UAAU,CAAC;AACpB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,SAAS,CAAC,IAAY;IAC1C,MAAM,CAAC,SAAS,CAAC,GAAG,MAAM,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;IAC7C,OAAO,SAAS,CAAC;AACnB,CAAC"}
|
package/dist/format.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"format.d.ts","sourceRoot":"","sources":["../src/format.ts"],"names":[],"mappings":"AAEA,eAAO,MAAM,IAAI,GAAI,GAAG,MAAM,KAAG,MAA0C,CAAC;AAC5E,eAAO,MAAM,IAAI,GAAI,GAAG,MAAM,KAAG,MAA2C,CAAC;AAC7E,eAAO,MAAM,MAAM,GAAI,GAAG,MAAM,KAAG,MAA2C,CAAC;AAC/E,eAAO,MAAM,GAAG,GAAI,GAAG,MAAM,KAAG,MAA2C,CAAC"}
|
package/dist/format.js
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
// ANSI colour helpers. Escape codes are emitted only when stdout is a terminal,
// so piped or redirected output stays plain text.
const isTTY = process.stdout.isTTY ?? false;
/** Wraps `s` in the ANSI bold sequence when stdout is a TTY. */
export const bold = (s) => {
    if (!isTTY)
        return s;
    return `\x1b[1m${s}\x1b[0m`;
};
/** Wraps `s` in the ANSI cyan sequence when stdout is a TTY. */
export const cyan = (s) => {
    if (!isTTY)
        return s;
    return `\x1b[36m${s}\x1b[0m`;
};
/** Wraps `s` in the ANSI yellow sequence when stdout is a TTY. */
export const yellow = (s) => {
    if (!isTTY)
        return s;
    return `\x1b[33m${s}\x1b[0m`;
};
/** Wraps `s` in the ANSI red sequence when stdout is a TTY. */
export const red = (s) => {
    if (!isTTY)
        return s;
    return `\x1b[31m${s}\x1b[0m`;
};
|
|
6
|
+
//# sourceMappingURL=format.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"format.js","sourceRoot":"","sources":["../src/format.ts"],"names":[],"mappings":"AAAA,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,IAAI,KAAK,CAAC;AAE5C,MAAM,CAAC,MAAM,IAAI,GAAG,CAAC,CAAS,EAAU,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC;AAC5E,MAAM,CAAC,MAAM,IAAI,GAAG,CAAC,CAAS,EAAU,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,WAAW,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC;AAC7E,MAAM,CAAC,MAAM,MAAM,GAAG,CAAC,CAAS,EAAU,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,WAAW,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC;AAC/E,MAAM,CAAC,MAAM,GAAG,GAAG,CAAC,CAAS,EAAU,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,WAAW,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"reranker.d.ts","sourceRoot":"","sources":["../src/reranker.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,YAAY;IAC3B,KAAK,EAAE,MAAM,CAAC;IACd,eAAe,EAAE,MAAM,CAAC;CACzB;AAcD,wBAAsB,MAAM,CAC1B,KAAK,EAAE,MAAM,EACb,SAAS,EAAE,MAAM,EAAE,EACnB,IAAI,SAAI,GACP,OAAO,CAAC,YAAY,EAAE,CAAC,CAczB"}
|
package/dist/reranker.js
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
 * Reads the reranker base URL from the RERANKER_URL environment variable.
 *
 * @returns {string} The configured URL.
 * @throws {Error} When RERANKER_URL is unset or empty.
 */
function getRerankerUrl() {
    const { RERANKER_URL: configured } = process.env;
    if (configured) {
        return configured;
    }
    throw new Error("RERANKER_URL environment variable is not set");
}
|
|
8
|
+
/**
 * Reranks `documents` against `query` by POSTing to `<RERANKER_URL>/v1/rerank`
 * with the "zerank-2" model.
 *
 * @param query Search query.
 * @param documents Candidate document texts.
 * @param topN Number of results requested (`top_n`); defaults to 5.
 * @returns The `results` array from the service response.
 * @throws {Error} When RERANKER_URL is not set or the service responds with a non-OK status.
 */
export async function rerank(query, documents, topN = 5) {
    // Strip one trailing slash so "http://host/" does not produce "//v1/rerank".
    const baseUrl = getRerankerUrl().replace(/\/$/, "");
    const response = await fetch(`${baseUrl}/v1/rerank`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ model: "zerank-2", query, documents, top_n: topN }),
    });
    if (!response.ok) {
        throw new Error(`Reranker request failed: ${response.status} ${response.statusText}`);
    }
    const data = (await response.json());
    return data.results;
}
|
|
21
|
+
//# sourceMappingURL=reranker.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"reranker.js","sourceRoot":"","sources":["../src/reranker.ts"],"names":[],"mappings":"AASA,SAAS,cAAc;IACrB,MAAM,GAAG,GAAG,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC;IACrC,IAAI,CAAC,GAAG,EAAE,CAAC;QACT,MAAM,IAAI,KAAK,CAAC,8CAA8C,CAAC,CAAC;IAClE,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,MAAM,CAC1B,KAAa,EACb,SAAmB,EACnB,IAAI,GAAG,CAAC;IAER,MAAM,OAAO,GAAG,cAAc,EAAE,CAAC;IACjC,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,OAAO,YAAY,EAAE;QACnD,MAAM,EAAE,MAAM;QACd,OAAO,EAAE,EAAE,cAAc,EAAE,kBAAkB,EAAE;QAC/C,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,EAAE,KAAK,EAAE,UAAU,EAAE,KAAK,EAAE,SAAS,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC;KAC3E,CAAC,CAAC;IAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,MAAM,IAAI,KAAK,CAAC,4BAA4B,QAAQ,CAAC,MAAM,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;IACxF,CAAC;IAED,MAAM,IAAI,GAAG,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAA2B,CAAC;IAC/D,OAAO,IAAI,CAAC,OAAO,CAAC;AACtB,CAAC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { type Browser, type Page } from "playwright";
|
|
2
|
+
import type { SourceConfig } from "./config.js";
|
|
3
|
+
/** Derive a slug from the path component of `url`: slashes become hyphens and every character other than ASCII letters, digits and hyphens is removed. */
export declare function slugifyUrl(url: string): string;
|
|
4
|
+
/**
 * Filter `urls` by plain substring patterns, then de-duplicate and sort.
 * When `includePatterns` is non-empty a URL must contain at least one of them;
 * a URL containing any of `excludePatterns` is dropped.
 */
export declare function filterUrls(urls: string[], includePatterns?: string[], excludePatterns?: string[]): string[];
|
|
5
|
+
/**
 * Navigate `page` to the source's start URL and collect the `href` of every
 * link found under the source's navigation selector, filtered by the source's
 * include/exclude patterns.
 */
export declare function discoverUrls(page: Page, source: SourceConfig): Promise<string[]>;
|
|
6
|
+
/** Navigate `page` to `url` and resolve to the result of `page.content()`. */
export declare function fetchPage(page: Page, url: string): Promise<string>;
|
|
7
|
+
/**
 * Discover the URLs of `source`, fetch each one and write its content to
 * `dataDir/raw/sourceName/slug.html`. `onProgress` is called after each page
 * has been written. Resolves to the list of discovered URLs.
 */
export declare function scrapeSource(source: SourceConfig, sourceName: string, dataDir: string, onProgress?: (current: number, total: number, url: string) => void): Promise<string[]>;
|
|
8
|
+
/** Launch a Chromium browser through Playwright using the "chrome" channel. */
export declare function createBrowser(): Promise<Browser>;
|
|
9
|
+
//# sourceMappingURL=scraper.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scraper.d.ts","sourceRoot":"","sources":["../src/scraper.ts"],"names":[],"mappings":"AAEA,OAAO,EAAY,KAAK,OAAO,EAAE,KAAK,IAAI,EAAE,MAAM,YAAY,CAAC;AAC/D,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAEhD,wBAAgB,UAAU,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAO9C;AAED,wBAAgB,UAAU,CACxB,IAAI,EAAE,MAAM,EAAE,EACd,eAAe,CAAC,EAAE,MAAM,EAAE,EAC1B,eAAe,CAAC,EAAE,MAAM,EAAE,GACzB,MAAM,EAAE,CAgBV;AAED,wBAAsB,YAAY,CAChC,IAAI,EAAE,IAAI,EACV,MAAM,EAAE,YAAY,GACnB,OAAO,CAAC,MAAM,EAAE,CAAC,CAanB;AAED,wBAAsB,SAAS,CAC7B,IAAI,EAAE,IAAI,EACV,GAAG,EAAE,MAAM,GACV,OAAO,CAAC,MAAM,CAAC,CAGjB;AAsBD,wBAAsB,YAAY,CAChC,MAAM,EAAE,YAAY,EACpB,UAAU,EAAE,MAAM,EAClB,OAAO,EAAE,MAAM,EACf,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,KAAK,IAAI,GACjE,OAAO,CAAC,MAAM,EAAE,CAAC,CAgCnB;AAED,wBAAsB,aAAa,IAAI,OAAO,CAAC,OAAO,CAAC,CAEtD"}
|
package/dist/scraper.js
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import { chromium } from "playwright";
|
|
4
|
+
/**
 * Build a filename-safe slug from the path component of a URL.
 *
 * One leading and one trailing slash are dropped, remaining slashes become
 * hyphens, and every character other than ASCII letters, digits and hyphens
 * is removed. Host, query string and fragment are ignored.
 *
 * @param url - Absolute URL to slugify.
 * @returns The slug, or "index" when the path yields no usable characters
 *   (for example the site root).
 * @throws TypeError when `url` cannot be parsed by the `URL` constructor.
 */
export function slugifyUrl(url) {
    const { pathname } = new URL(url);
    const slug = pathname
        .replace(/^\//, "")
        .replace(/\/$/, "")
        .replace(/\//g, "-")
        .replace(/[^a-zA-Z0-9-]/g, "");
    // A root URL would otherwise produce "", which callers turn into a file
    // literally named ".html".
    return slug || "index";
}
|
|
12
|
+
/**
 * Narrow a URL list with substring-based include and exclude patterns.
 *
 * A URL survives when it contains at least one include pattern (only checked
 * if include patterns were supplied) and none of the exclude patterns. The
 * survivors are de-duplicated and returned in default sort order; the input
 * array is left untouched.
 *
 * @param urls - Candidate URLs.
 * @param includePatterns - Optional substrings, one of which must be present.
 * @param excludePatterns - Optional substrings, none of which may be present.
 * @returns A new sorted array of unique URLs.
 */
export function filterUrls(urls, includePatterns, excludePatterns) {
    const required = includePatterns ?? [];
    const forbidden = excludePatterns ?? [];
    const containsAny = (candidate, patterns) => patterns.some((fragment) => candidate.includes(fragment));
    const unique = new Set();
    for (const candidate of urls) {
        if (required.length > 0 && !containsAny(candidate, required)) {
            continue;
        }
        if (forbidden.length > 0 && containsAny(candidate, forbidden)) {
            continue;
        }
        unique.add(candidate);
    }
    return Array.from(unique).sort();
}
|
|
22
|
+
/**
 * Collect the link targets listed in a source's navigation element.
 *
 * Loads `source.start_url`, reads the `href` of every anchor matched by
 * `source.nav_selector`, discards links that do not start with "http",
 * contain "?hl=" or end in "#", and finally applies the source's
 * include/exclude patterns via `filterUrls`.
 *
 * @param page - Page used for navigation and DOM evaluation.
 * @param source - Source configuration providing `start_url`, `nav_selector`
 *   and the optional `include_patterns` / `exclude_patterns`.
 * @returns The filtered list of discovered URLs.
 */
export async function discoverUrls(page, source) {
    await page.goto(source.start_url, { waitUntil: "domcontentloaded" });
    const linkSelector = `${source.nav_selector} a[href]`;
    const hrefs = await page.$$eval(linkSelector, (anchors) => anchors.map((anchor) => anchor.href));
    const candidates = [];
    for (const href of hrefs) {
        if (!href.startsWith("http")) {
            continue;
        }
        if (href.includes("?hl=") || href.endsWith("#")) {
            continue;
        }
        candidates.push(href);
    }
    return filterUrls(candidates, source.include_patterns, source.exclude_patterns);
}
|
|
28
|
+
/**
 * Load a URL in the given page and return its content.
 *
 * @param page - Page to navigate; navigation waits for "domcontentloaded".
 * @param url - Address to load.
 * @returns Whatever `page.content()` resolves to after navigation.
 */
export async function fetchPage(page, url) {
    const navigation = { waitUntil: "domcontentloaded" };
    await page.goto(url, navigation);
    const markup = await page.content();
    return markup;
}
|
|
32
|
+
// Number of pages fetched in parallel when the source config does not set `concurrency`.
const DEFAULT_CONCURRENCY = 10;
|
|
33
|
+
/**
 * Run `fn` over every item with a bounded number of calls in flight.
 *
 * Workers pull the next unprocessed index from a shared counter until the
 * list is exhausted, so items are started in order but may finish out of
 * order. The returned promise rejects as soon as any call to `fn` rejects.
 *
 * @param items - Items to process.
 * @param concurrency - Maximum number of simultaneous `fn` calls. Values below
 *   1 and non-numeric values are treated as 1.
 * @param fn - Async callback invoked as `fn(item, index)`.
 */
async function runPool(items, concurrency, fn) {
    // A limit of 0, a negative number or NaN would start no workers at all and
    // silently skip every item, so always run at least one worker.
    const limit = Math.max(1, Math.floor(Number(concurrency)) || 1);
    let nextIndex = 0;
    async function worker() {
        while (nextIndex < items.length) {
            const index = nextIndex++;
            await fn(items[index], index);
        }
    }
    const workers = Array.from({ length: Math.min(limit, items.length) }, () => worker());
    await Promise.all(workers);
}
|
|
44
|
+
/**
 * Scrape every page of a documentation source into the raw data directory.
 *
 * Creates `dataDir/raw/sourceName`, launches a browser, discovers the source's
 * URLs and then fetches them through a bounded worker pool, writing each page
 * to a file named after its slug with an ".html" extension.
 *
 * @param source - Source configuration; `source.concurrency` overrides the
 *   default number of parallel page fetches.
 * @param sourceName - Name of the source, used as the output sub-directory.
 * @param dataDir - Root data directory.
 * @param onProgress - Optional callback invoked after each page is written
 *   with `(completedCount, totalCount, url)`.
 * @returns The list of discovered URLs.
 */
export async function scrapeSource(source, sourceName, dataDir, onProgress) {
    const rawDir = join(dataDir, "raw", sourceName);
    await mkdir(rawDir, { recursive: true });
    const concurrency = source.concurrency ?? DEFAULT_CONCURRENCY;
    const browser = await chromium.launch({ channel: "chrome" });
    // Everything after the launch runs inside try/finally so the browser
    // process is closed even when context creation or URL discovery fails.
    try {
        const context = await browser.newContext();
        const discoveryPage = await context.newPage();
        const urls = await discoverUrls(discoveryPage, source);
        await discoveryPage.close();
        let completed = 0;
        await runPool(urls, concurrency, async (url) => {
            const page = await context.newPage();
            try {
                const html = await fetchPage(page, url);
                const slug = slugifyUrl(url);
                await writeFile(join(rawDir, `${slug}.html`), html, "utf-8");
                completed++;
                onProgress?.(completed, urls.length, url);
            }
            finally {
                // Release the tab whether or not the fetch succeeded.
                await page.close();
            }
        });
        return urls;
    }
    finally {
        await browser.close();
    }
}
|
|
74
|
+
/**
 * Start a Chromium browser through the "chrome" channel.
 *
 * @returns The launched browser instance; the caller is responsible for
 *   closing it.
 */
export async function createBrowser() {
    const launchOptions = { channel: "chrome" };
    const browser = await chromium.launch(launchOptions);
    return browser;
}
|
|
77
|
+
//# sourceMappingURL=scraper.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scraper.js","sourceRoot":"","sources":["../src/scraper.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,QAAQ,EAA2B,MAAM,YAAY,CAAC;AAG/D,MAAM,UAAU,UAAU,CAAC,GAAW;IACpC,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IAC5B,OAAO,MAAM,CAAC,QAAQ;SACnB,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC;SAClB,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC;SAClB,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC;SACnB,OAAO,CAAC,gBAAgB,EAAE,EAAE,CAAC,CAAC;AACnC,CAAC;AAED,MAAM,UAAU,UAAU,CACxB,IAAc,EACd,eAA0B,EAC1B,eAA0B;IAE1B,IAAI,QAAQ,GAAG,IAAI,CAAC;IAEpB,IAAI,eAAe,IAAI,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClD,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CACjC,eAAe,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CACzD,CAAC;IACJ,CAAC;IAED,IAAI,eAAe,IAAI,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClD,QAAQ,GAAG,QAAQ,CAAC,MAAM,CACxB,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CACnE,CAAC;IACJ,CAAC;IAED,OAAO,CAAC,GAAG,IAAI,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;AACvC,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,IAAU,EACV,MAAoB;IAEpB,MAAM,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,kBAAkB,EAAE,CAAC,CAAC;IAErE,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,MAAM,CAC/B,GAAG,MAAM,CAAC,YAAY,UAAU,EAChC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAE,CAAuB,CAAC,IAAI,CAAC,CAC3D,CAAC;IAEF,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,CAC9B,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAC/E,CAAC;IAEF,OAAO,UAAU,CAAC,SAAS,EAAE,MAAM,CAAC,gBAAgB,EAAE,MAAM,CAAC,gBAAgB,CAAC,CAAC;AACjF,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,IAAU,EACV,GAAW;IAEX,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,kBAAkB,EAAE,CAAC,CAAC;IACxD,OAAO,IAAI,CAAC,OAAO,EAAE,CAAC;AACxB,CAAC;AAED,MAAM,mBAAmB,GAAG,EAAE,CAAC;AAE/B,KAAK,UAAU,OAAO,CACpB,KAAU,EACV,WAAmB,EACnB,EAA6C;IAE7C,IAAI,SAAS,GAAG,CAA
C,CAAC;IAElB,KAAK,UAAU,MAAM;QACnB,OAAO,SAAS,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YAChC,MAAM,KAAK,GAAG,SAAS,EAAE,CAAC;YAC1B,MAAM,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,KAAK,CAAC,CAAC;QAChC,CAAC;IACH,CAAC;IAED,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,KAAK,CAAC,MAAM,CAAC,EAAE,EAAE,GAAG,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;IAC5F,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;AAC7B,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,MAAoB,EACpB,UAAkB,EAClB,OAAe,EACf,UAAkE;IAElE,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,EAAE,KAAK,EAAE,UAAU,CAAC,CAAC;IAChD,MAAM,KAAK,CAAC,MAAM,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAEzC,MAAM,WAAW,GAAG,MAAM,CAAC,WAAW,IAAI,mBAAmB,CAAC;IAC9D,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC;IAC7D,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC;IAE3C,MAAM,aAAa,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;IAC9C,MAAM,IAAI,GAAG,MAAM,YAAY,CAAC,aAAa,EAAE,MAAM,CAAC,CAAC;IACvD,MAAM,aAAa,CAAC,KAAK,EAAE,CAAC;IAE5B,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,IAAI,CAAC;QACH,MAAM,OAAO,CAAC,IAAI,EAAE,WAAW,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE;YAC7C,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;YACrC,IAAI,CAAC;gBACH,MAAM,IAAI,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;gBACxC,MAAM,IAAI,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC;gBAC7B,MAAM,SAAS,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,IAAI,OAAO,CAAC,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC;gBAC7D,SAAS,EAAE,CAAC;gBACZ,UAAU,EAAE,CAAC,SAAS,EAAE,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;YAC5C,CAAC;oBAAS,CAAC;gBACT,MAAM,IAAI,CAAC,KAAK,EAAE,CAAC;YACrB,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,OAAO,IAAI,CAAC;IACd,CAAC;YAAS,CAAC;QACT,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC;IACxB,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa;IACjC,OAAO,QAAQ,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC;AAChD,CAAC"}
|
package/dist/search.d.ts
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { SearchResult } from "./types.js";
|
|
2
|
+
export type { SearchResult };
|
|
3
|
+
export declare function search(query: string, options?: {
|
|
4
|
+
source?: string;
|
|
5
|
+
candidates?: number;
|
|
6
|
+
topN?: number;
|
|
7
|
+
}): Promise<SearchResult[]>;
|
|
8
|
+
//# sourceMappingURL=search.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"search.d.ts","sourceRoot":"","sources":["../src/search.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAE/C,YAAY,EAAE,YAAY,EAAE,CAAC;AAM7B,wBAAsB,MAAM,CAC1B,KAAK,EAAE,MAAM,EACb,OAAO,GAAE;IAAE,MAAM,CAAC,EAAE,MAAM,CAAC;IAAC,UAAU,CAAC,EAAE,MAAM,CAAC;IAAC,IAAI,CAAC,EAAE,MAAM,CAAA;CAAO,GACpE,OAAO,CAAC,YAAY,EAAE,CAAC,CAuCzB"}
|
package/dist/search.js
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import { embedText } from "./embedder.js";
|
|
2
|
+
import { vectorSearch } from "./store.js";
|
|
3
|
+
import { rerank } from "./reranker.js";
|
|
4
|
+
/**
 * Report whether a reranker service is configured.
 *
 * @returns `true` when the `RERANKER_URL` environment variable is set to a
 *   non-empty value, otherwise `false`.
 */
function hasReranker() {
    const configured = process.env.RERANKER_URL;
    return Boolean(configured);
}
|
|
7
|
+
/**
 * Search the stored chunks for `query`.
 *
 * The query is embedded and used for a vector search limited to `candidates`
 * hits (optionally restricted to one `source`). When a reranker is configured
 * the candidates are re-scored by it; otherwise the first `topN` vector hits
 * are returned with a purely positional score.
 *
 * @param query - Free-text query.
 * @param options - Optional settings: `source` restricts the vector search,
 *   `candidates` (default 20) is the number of vector hits fetched, `topN`
 *   (default 5) is the number of results returned.
 * @returns Result objects with `id`, `source`, `url`, `title`,
 *   `heading_path`, `content` and `relevance_score`.
 * @throws Error when the reranker refers to a document index that is not
 *   among the candidates.
 */
export async function search(query, options = {}) {
    const { source, candidates = 20, topN = 5 } = options;
    const queryEmbedding = await embedText(query);
    const rawResults = await vectorSearch(queryEmbedding, candidates, source);
    if (rawResults.length === 0) {
        return [];
    }
    // Single projection from a stored hit to the public result shape, shared
    // by the reranked and the fallback path.
    const toResult = (hit, relevanceScore) => {
        const data = hit.data;
        return {
            id: hit.id,
            source: data.source,
            url: data.url,
            title: data.title,
            heading_path: data.heading_path,
            content: data.content,
            relevance_score: relevanceScore,
        };
    };
    if (hasReranker()) {
        const documents = rawResults.map((hit) => hit.data.content);
        const reranked = await rerank(query, documents, topN);
        return reranked.map((entry) => {
            const original = rawResults[entry.index];
            if (original === undefined) {
                // Surface a misbehaving reranker explicitly instead of failing
                // with "cannot read properties of undefined".
                throw new Error(`Reranker returned index ${entry.index} but only ${rawResults.length} documents were sent`);
            }
            return toResult(original, entry.relevance_score);
        });
    }
    // Without a reranker the score only reflects the position in the vector
    // search order; clamp at 0 so it never goes negative past the 11th hit.
    return rawResults
        .slice(0, topN)
        .map((hit, position) => toResult(hit, Math.max(0, 1 - position * 0.1)));
}
|
|
43
|
+
//# sourceMappingURL=search.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"search.js","sourceRoot":"","sources":["../src/search.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,eAAe,CAAC;AAC1C,OAAO,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAC1C,OAAO,EAAE,MAAM,EAAE,MAAM,eAAe,CAAC;AAKvC,SAAS,WAAW;IAClB,OAAO,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC;AACpC,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,MAAM,CAC1B,KAAa,EACb,UAAmE,EAAE;IAErE,MAAM,EAAE,MAAM,EAAE,UAAU,GAAG,EAAE,EAAE,IAAI,GAAG,CAAC,EAAE,GAAG,OAAO,CAAC;IAEtD,MAAM,cAAc,GAAG,MAAM,SAAS,CAAC,KAAK,CAAC,CAAC;IAC9C,MAAM,UAAU,GAAG,MAAM,YAAY,CAAC,cAAc,EAAE,UAAU,EAAE,MAAM,CAAC,CAAC;IAE1E,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEvC,IAAI,WAAW,EAAE,EAAE,CAAC;QAClB,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,OAAiB,CAAC,CAAC;QAClE,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,KAAK,EAAE,SAAS,EAAE,IAAI,CAAC,CAAC;QAEtD,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;YACxB,MAAM,QAAQ,GAAG,UAAU,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;YACrC,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC;YAC3B,OAAO;gBACL,EAAE,EAAE,QAAQ,CAAC,EAAE;gBACf,MAAM,EAAE,IAAI,CAAC,MAAgB;gBAC7B,GAAG,EAAE,IAAI,CAAC,GAAa;gBACvB,KAAK,EAAE,IAAI,CAAC,KAAe;gBAC3B,YAAY,EAAE,IAAI,CAAC,YAAwB;gBAC3C,OAAO,EAAE,IAAI,CAAC,OAAiB;gBAC/B,eAAe,EAAE,CAAC,CAAC,eAAe;aACnC,CAAC;QACJ,CAAC,CAAC,CAAC;IACL,CAAC;IAED,OAAO,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QAC5C,MAAM,IAAI,GAAG,CAAC,CAAC,IAAI,CAAC;QACpB,OAAO;YACL,EAAE,EAAE,CAAC,CAAC,EAAE;YACR,MAAM,EAAE,IAAI,CAAC,MAAgB;YAC7B,GAAG,EAAE,IAAI,CAAC,GAAa;YACvB,KAAK,EAAE,IAAI,CAAC,KAAe;YAC3B,YAAY,EAAE,IAAI,CAAC,YAAwB;YAC3C,OAAO,EAAE,IAAI,CAAC,OAAiB;YAC/B,eAAe,EAAE,CAAC,GAAG,CAAC,GAAG,GAAG;SAC7B,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC"}
|
package/dist/store.d.ts
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { Chunk, SourceMeta } from "./types.js";
|
|
2
|
+
export declare function storeChunks(chunks: Chunk[], embeddings: number[][], onProgress?: (current: number, total: number) => void): Promise<void>;
|
|
3
|
+
export declare function purgeSource(sourceName: string): Promise<number>;
|
|
4
|
+
export declare function updateSourceMeta(sourceName: string, chunkCount: number, urlCount: number): Promise<void>;
|
|
5
|
+
export declare function getSourceMeta(sourceName: string): Promise<SourceMeta | null>;
|
|
6
|
+
export declare function getAllSourcesMeta(): Promise<SourceMeta[]>;
|
|
7
|
+
export declare function vectorSearch(queryEmbedding: number[], limit: number, source?: string): Promise<{
|
|
8
|
+
id: string;
|
|
9
|
+
data: Record<string, unknown>;
|
|
10
|
+
}[]>;
|
|
11
|
+
//# sourceMappingURL=store.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"store.d.ts","sourceRoot":"","sources":["../src/store.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,EAAE,KAAK,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AAwBpD,wBAAsB,WAAW,CAC/B,MAAM,EAAE,KAAK,EAAE,EACf,UAAU,EAAE,MAAM,EAAE,EAAE,EACtB,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,GACpD,OAAO,CAAC,IAAI,CAAC,CA0Bf;AAED,wBAAsB,WAAW,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CA0BrE;AAED,wBAAsB,gBAAgB,CACpC,UAAU,EAAE,MAAM,EAClB,UAAU,EAAE,MAAM,EAClB,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,IAAI,CAAC,CAOf;AAED,wBAAsB,aAAa,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC,CAIlF;AAED,wBAAsB,iBAAiB,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC,CAG/D;AAED,wBAAsB,YAAY,CAChC,cAAc,EAAE,MAAM,EAAE,EACxB,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,MAAM,GACd,OAAO,CAAC;IAAE,EAAE,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;CAAE,EAAE,CAAC,CAmB1D"}
|