@de-otio/chaoskb-client 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/agent-registry/config-merger.d.ts +28 -0
- package/dist/cli/agent-registry/config-merger.d.ts.map +1 -0
- package/dist/cli/agent-registry/config-merger.js +90 -0
- package/dist/cli/agent-registry/config-merger.js.map +1 -0
- package/dist/cli/agent-registry/detector.d.ts +7 -0
- package/dist/cli/agent-registry/detector.d.ts.map +1 -0
- package/dist/cli/agent-registry/detector.js +100 -0
- package/dist/cli/agent-registry/detector.js.map +1 -0
- package/dist/cli/agent-registry/index.d.ts +26 -0
- package/dist/cli/agent-registry/index.d.ts.map +1 -0
- package/dist/cli/agent-registry/index.js +77 -0
- package/dist/cli/agent-registry/index.js.map +1 -0
- package/dist/cli/agent-registry/path-validator.d.ts +11 -0
- package/dist/cli/agent-registry/path-validator.d.ts.map +1 -0
- package/dist/cli/agent-registry/path-validator.js +69 -0
- package/dist/cli/agent-registry/path-validator.js.map +1 -0
- package/dist/cli/agent-registry/registry.json +108 -0
- package/dist/cli/agent-registry/types.d.ts +29 -0
- package/dist/cli/agent-registry/types.d.ts.map +1 -0
- package/dist/cli/agent-registry/types.js +2 -0
- package/dist/cli/agent-registry/types.js.map +1 -0
- package/dist/cli/bootstrap-lock.d.ts +7 -0
- package/dist/cli/bootstrap-lock.d.ts.map +1 -0
- package/dist/cli/bootstrap-lock.js +62 -0
- package/dist/cli/bootstrap-lock.js.map +1 -0
- package/dist/cli/bootstrap.d.ts +23 -0
- package/dist/cli/bootstrap.d.ts.map +1 -0
- package/dist/cli/bootstrap.js +438 -0
- package/dist/cli/bootstrap.js.map +1 -0
- package/dist/cli/commands/config.d.ts +13 -0
- package/dist/cli/commands/config.d.ts.map +1 -0
- package/dist/cli/commands/config.js +244 -0
- package/dist/cli/commands/config.js.map +1 -0
- package/dist/cli/commands/devices.d.ts +21 -0
- package/dist/cli/commands/devices.d.ts.map +1 -0
- package/dist/cli/commands/devices.js +229 -0
- package/dist/cli/commands/devices.js.map +1 -0
- package/dist/cli/commands/export.d.ts +12 -0
- package/dist/cli/commands/export.d.ts.map +1 -0
- package/dist/cli/commands/export.js +183 -0
- package/dist/cli/commands/export.js.map +1 -0
- package/dist/cli/commands/import.d.ts +26 -0
- package/dist/cli/commands/import.d.ts.map +1 -0
- package/dist/cli/commands/import.js +311 -0
- package/dist/cli/commands/import.js.map +1 -0
- package/dist/cli/commands/kb.d.ts +39 -0
- package/dist/cli/commands/kb.d.ts.map +1 -0
- package/dist/cli/commands/kb.js +138 -0
- package/dist/cli/commands/kb.js.map +1 -0
- package/dist/cli/commands/project.d.ts +6 -0
- package/dist/cli/commands/project.d.ts.map +1 -0
- package/dist/cli/commands/project.js +115 -0
- package/dist/cli/commands/project.js.map +1 -0
- package/dist/cli/commands/projects.d.ts +33 -0
- package/dist/cli/commands/projects.d.ts.map +1 -0
- package/dist/cli/commands/projects.js +189 -0
- package/dist/cli/commands/projects.js.map +1 -0
- package/dist/cli/commands/register.d.ts +8 -0
- package/dist/cli/commands/register.d.ts.map +1 -0
- package/dist/cli/commands/register.js +146 -0
- package/dist/cli/commands/register.js.map +1 -0
- package/dist/cli/commands/rotate-key.d.ts +16 -0
- package/dist/cli/commands/rotate-key.d.ts.map +1 -0
- package/dist/cli/commands/rotate-key.js +197 -0
- package/dist/cli/commands/rotate-key.js.map +1 -0
- package/dist/cli/commands/setup-sync.d.ts +2 -0
- package/dist/cli/commands/setup-sync.d.ts.map +1 -0
- package/dist/cli/commands/setup-sync.js +165 -0
- package/dist/cli/commands/setup-sync.js.map +1 -0
- package/dist/cli/commands/setup.d.ts +12 -0
- package/dist/cli/commands/setup.d.ts.map +1 -0
- package/dist/cli/commands/setup.js +39 -0
- package/dist/cli/commands/setup.js.map +1 -0
- package/dist/cli/commands/status.d.ts +5 -0
- package/dist/cli/commands/status.d.ts.map +1 -0
- package/dist/cli/commands/status.js +96 -0
- package/dist/cli/commands/status.js.map +1 -0
- package/dist/cli/commands/uninstall.d.ts +4 -0
- package/dist/cli/commands/uninstall.d.ts.map +1 -0
- package/dist/cli/commands/uninstall.js +85 -0
- package/dist/cli/commands/uninstall.js.map +1 -0
- package/dist/cli/commands/unregister.d.ts +2 -0
- package/dist/cli/commands/unregister.d.ts.map +1 -0
- package/dist/cli/commands/unregister.js +46 -0
- package/dist/cli/commands/unregister.js.map +1 -0
- package/dist/cli/device-metadata.d.ts +15 -0
- package/dist/cli/device-metadata.d.ts.map +1 -0
- package/dist/cli/device-metadata.js +58 -0
- package/dist/cli/device-metadata.js.map +1 -0
- package/dist/cli/github.d.ts +38 -0
- package/dist/cli/github.d.ts.map +1 -0
- package/dist/cli/github.js +159 -0
- package/dist/cli/github.js.map +1 -0
- package/dist/cli/guide-hashes.json +13 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +226 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/mcp-server.d.ts +205 -0
- package/dist/cli/mcp-server.d.ts.map +1 -0
- package/dist/cli/mcp-server.js +366 -0
- package/dist/cli/mcp-server.js.map +1 -0
- package/dist/cli/tools/kb-delete.d.ts +10 -0
- package/dist/cli/tools/kb-delete.d.ts.map +1 -0
- package/dist/cli/tools/kb-delete.js +28 -0
- package/dist/cli/tools/kb-delete.js.map +1 -0
- package/dist/cli/tools/kb-ingest.d.ts +13 -0
- package/dist/cli/tools/kb-ingest.d.ts.map +1 -0
- package/dist/cli/tools/kb-ingest.js +72 -0
- package/dist/cli/tools/kb-ingest.js.map +1 -0
- package/dist/cli/tools/kb-list.d.ts +20 -0
- package/dist/cli/tools/kb-list.d.ts.map +1 -0
- package/dist/cli/tools/kb-list.js +24 -0
- package/dist/cli/tools/kb-list.js.map +1 -0
- package/dist/cli/tools/kb-query-shared.d.ts +27 -0
- package/dist/cli/tools/kb-query-shared.d.ts.map +1 -0
- package/dist/cli/tools/kb-query-shared.js +28 -0
- package/dist/cli/tools/kb-query-shared.js.map +1 -0
- package/dist/cli/tools/kb-query.d.ts +20 -0
- package/dist/cli/tools/kb-query.d.ts.map +1 -0
- package/dist/cli/tools/kb-query.js +109 -0
- package/dist/cli/tools/kb-query.js.map +1 -0
- package/dist/cli/tools/kb-summary.d.ts +29 -0
- package/dist/cli/tools/kb-summary.d.ts.map +1 -0
- package/dist/cli/tools/kb-summary.js +89 -0
- package/dist/cli/tools/kb-summary.js.map +1 -0
- package/dist/cli/tools/kb-sync-status.d.ts +7 -0
- package/dist/cli/tools/kb-sync-status.d.ts.map +1 -0
- package/dist/cli/tools/kb-sync-status.js +48 -0
- package/dist/cli/tools/kb-sync-status.js.map +1 -0
- package/dist/crypto/aad.d.ts +8 -0
- package/dist/crypto/aad.d.ts.map +1 -0
- package/dist/crypto/aad.js +11 -0
- package/dist/crypto/aad.js.map +1 -0
- package/dist/crypto/aead.d.ts +21 -0
- package/dist/crypto/aead.d.ts.map +1 -0
- package/dist/crypto/aead.js +43 -0
- package/dist/crypto/aead.js.map +1 -0
- package/dist/crypto/argon2.d.ts +11 -0
- package/dist/crypto/argon2.d.ts.map +1 -0
- package/dist/crypto/argon2.js +33 -0
- package/dist/crypto/argon2.js.map +1 -0
- package/dist/crypto/blob-id.d.ts +6 -0
- package/dist/crypto/blob-id.d.ts.map +1 -0
- package/dist/crypto/blob-id.js +33 -0
- package/dist/crypto/blob-id.js.map +1 -0
- package/dist/crypto/canonical-json.d.ts +6 -0
- package/dist/crypto/canonical-json.d.ts.map +1 -0
- package/dist/crypto/canonical-json.js +88 -0
- package/dist/crypto/canonical-json.js.map +1 -0
- package/dist/crypto/commitment.d.ts +12 -0
- package/dist/crypto/commitment.d.ts.map +1 -0
- package/dist/crypto/commitment.js +37 -0
- package/dist/crypto/commitment.js.map +1 -0
- package/dist/crypto/encryption-service.d.ts +19 -0
- package/dist/crypto/encryption-service.d.ts.map +1 -0
- package/dist/crypto/encryption-service.js +38 -0
- package/dist/crypto/encryption-service.js.map +1 -0
- package/dist/crypto/envelope-cbor.d.ts +37 -0
- package/dist/crypto/envelope-cbor.d.ts.map +1 -0
- package/dist/crypto/envelope-cbor.js +124 -0
- package/dist/crypto/envelope-cbor.js.map +1 -0
- package/dist/crypto/envelope.d.ts +34 -0
- package/dist/crypto/envelope.d.ts.map +1 -0
- package/dist/crypto/envelope.js +160 -0
- package/dist/crypto/envelope.js.map +1 -0
- package/dist/crypto/hkdf.d.ts +16 -0
- package/dist/crypto/hkdf.d.ts.map +1 -0
- package/dist/crypto/hkdf.js +33 -0
- package/dist/crypto/hkdf.js.map +1 -0
- package/dist/crypto/index.d.ts +15 -0
- package/dist/crypto/index.d.ts.map +1 -0
- package/dist/crypto/index.js +15 -0
- package/dist/crypto/index.js.map +1 -0
- package/dist/crypto/invite.d.ts +31 -0
- package/dist/crypto/invite.d.ts.map +1 -0
- package/dist/crypto/invite.js +137 -0
- package/dist/crypto/invite.js.map +1 -0
- package/dist/crypto/keyring.d.ts +37 -0
- package/dist/crypto/keyring.d.ts.map +1 -0
- package/dist/crypto/keyring.js +219 -0
- package/dist/crypto/keyring.js.map +1 -0
- package/dist/crypto/known-keys.d.ts +34 -0
- package/dist/crypto/known-keys.d.ts.map +1 -0
- package/dist/crypto/known-keys.js +106 -0
- package/dist/crypto/known-keys.js.map +1 -0
- package/dist/crypto/project-keys.d.ts +26 -0
- package/dist/crypto/project-keys.d.ts.map +1 -0
- package/dist/crypto/project-keys.js +69 -0
- package/dist/crypto/project-keys.js.map +1 -0
- package/dist/crypto/secure-buffer.d.ts +31 -0
- package/dist/crypto/secure-buffer.d.ts.map +1 -0
- package/dist/crypto/secure-buffer.js +61 -0
- package/dist/crypto/secure-buffer.js.map +1 -0
- package/dist/crypto/ssh-agent.d.ts +16 -0
- package/dist/crypto/ssh-agent.d.ts.map +1 -0
- package/dist/crypto/ssh-agent.js +225 -0
- package/dist/crypto/ssh-agent.js.map +1 -0
- package/dist/crypto/ssh-keys.d.ts +19 -0
- package/dist/crypto/ssh-keys.d.ts.map +1 -0
- package/dist/crypto/ssh-keys.js +121 -0
- package/dist/crypto/ssh-keys.js.map +1 -0
- package/dist/crypto/tiers/enhanced.d.ts +25 -0
- package/dist/crypto/tiers/enhanced.d.ts.map +1 -0
- package/dist/crypto/tiers/enhanced.js +56 -0
- package/dist/crypto/tiers/enhanced.js.map +1 -0
- package/dist/crypto/tiers/maximum.d.ts +19 -0
- package/dist/crypto/tiers/maximum.d.ts.map +1 -0
- package/dist/crypto/tiers/maximum.js +25 -0
- package/dist/crypto/tiers/maximum.js.map +1 -0
- package/dist/crypto/tiers/standard.d.ts +27 -0
- package/dist/crypto/tiers/standard.d.ts.map +1 -0
- package/dist/crypto/tiers/standard.js +147 -0
- package/dist/crypto/tiers/standard.js.map +1 -0
- package/dist/crypto/types.d.ts +169 -0
- package/dist/crypto/types.d.ts.map +1 -0
- package/dist/crypto/types.js +11 -0
- package/dist/crypto/types.js.map +1 -0
- package/dist/pipeline/chunker.d.ts +27 -0
- package/dist/pipeline/chunker.d.ts.map +1 -0
- package/dist/pipeline/chunker.js +96 -0
- package/dist/pipeline/chunker.js.map +1 -0
- package/dist/pipeline/content-pipeline.d.ts +24 -0
- package/dist/pipeline/content-pipeline.d.ts.map +1 -0
- package/dist/pipeline/content-pipeline.js +49 -0
- package/dist/pipeline/content-pipeline.js.map +1 -0
- package/dist/pipeline/embedder.d.ts +49 -0
- package/dist/pipeline/embedder.d.ts.map +1 -0
- package/dist/pipeline/embedder.js +195 -0
- package/dist/pipeline/embedder.js.map +1 -0
- package/dist/pipeline/extract.d.ts +17 -0
- package/dist/pipeline/extract.d.ts.map +1 -0
- package/dist/pipeline/extract.js +70 -0
- package/dist/pipeline/extract.js.map +1 -0
- package/dist/pipeline/fetch.d.ts +26 -0
- package/dist/pipeline/fetch.d.ts.map +1 -0
- package/dist/pipeline/fetch.js +91 -0
- package/dist/pipeline/fetch.js.map +1 -0
- package/dist/pipeline/index.d.ts +10 -0
- package/dist/pipeline/index.d.ts.map +1 -0
- package/dist/pipeline/index.js +10 -0
- package/dist/pipeline/index.js.map +1 -0
- package/dist/pipeline/model-manager.d.ts +57 -0
- package/dist/pipeline/model-manager.d.ts.map +1 -0
- package/dist/pipeline/model-manager.js +234 -0
- package/dist/pipeline/model-manager.js.map +1 -0
- package/dist/pipeline/search.d.ts +37 -0
- package/dist/pipeline/search.d.ts.map +1 -0
- package/dist/pipeline/search.js +65 -0
- package/dist/pipeline/search.js.map +1 -0
- package/dist/pipeline/tokenizer.d.ts +29 -0
- package/dist/pipeline/tokenizer.d.ts.map +1 -0
- package/dist/pipeline/tokenizer.js +54 -0
- package/dist/pipeline/tokenizer.js.map +1 -0
- package/dist/pipeline/types.d.ts +86 -0
- package/dist/pipeline/types.d.ts.map +1 -0
- package/dist/pipeline/types.js +2 -0
- package/dist/pipeline/types.js.map +1 -0
- package/dist/pipeline/wordpiece-tokenizer.d.ts +60 -0
- package/dist/pipeline/wordpiece-tokenizer.d.ts.map +1 -0
- package/dist/pipeline/wordpiece-tokenizer.js +251 -0
- package/dist/pipeline/wordpiece-tokenizer.js.map +1 -0
- package/dist/storage/chunk-repo.d.ts +29 -0
- package/dist/storage/chunk-repo.d.ts.map +1 -0
- package/dist/storage/chunk-repo.js +115 -0
- package/dist/storage/chunk-repo.js.map +1 -0
- package/dist/storage/database-manager.d.ts +17 -0
- package/dist/storage/database-manager.d.ts.map +1 -0
- package/dist/storage/database-manager.js +100 -0
- package/dist/storage/database-manager.js.map +1 -0
- package/dist/storage/database.d.ts +10 -0
- package/dist/storage/database.d.ts.map +1 -0
- package/dist/storage/database.js +34 -0
- package/dist/storage/database.js.map +1 -0
- package/dist/storage/embedding-index.d.ts +22 -0
- package/dist/storage/embedding-index.d.ts.map +1 -0
- package/dist/storage/embedding-index.js +78 -0
- package/dist/storage/embedding-index.js.map +1 -0
- package/dist/storage/index.d.ts +10 -0
- package/dist/storage/index.d.ts.map +1 -0
- package/dist/storage/index.js +10 -0
- package/dist/storage/index.js.map +1 -0
- package/dist/storage/kb-database.d.ts +11 -0
- package/dist/storage/kb-database.d.ts.map +1 -0
- package/dist/storage/kb-database.js +24 -0
- package/dist/storage/kb-database.js.map +1 -0
- package/dist/storage/schema.d.ts +6 -0
- package/dist/storage/schema.d.ts.map +1 -0
- package/dist/storage/schema.js +122 -0
- package/dist/storage/schema.js.map +1 -0
- package/dist/storage/source-repo.d.ts +20 -0
- package/dist/storage/source-repo.d.ts.map +1 -0
- package/dist/storage/source-repo.js +120 -0
- package/dist/storage/source-repo.js.map +1 -0
- package/dist/storage/sync-status-repo.d.ts +15 -0
- package/dist/storage/sync-status-repo.d.ts.map +1 -0
- package/dist/storage/sync-status-repo.js +40 -0
- package/dist/storage/sync-status-repo.js.map +1 -0
- package/dist/storage/types.d.ts +139 -0
- package/dist/storage/types.d.ts.map +1 -0
- package/dist/storage/types.js +9 -0
- package/dist/storage/types.js.map +1 -0
- package/dist/sync/canary.d.ts +14 -0
- package/dist/sync/canary.d.ts.map +1 -0
- package/dist/sync/canary.js +53 -0
- package/dist/sync/canary.js.map +1 -0
- package/dist/sync/full-sync.d.ts +16 -0
- package/dist/sync/full-sync.d.ts.map +1 -0
- package/dist/sync/full-sync.js +91 -0
- package/dist/sync/full-sync.js.map +1 -0
- package/dist/sync/http-client.d.ts +28 -0
- package/dist/sync/http-client.d.ts.map +1 -0
- package/dist/sync/http-client.js +90 -0
- package/dist/sync/http-client.js.map +1 -0
- package/dist/sync/incremental-sync.d.ts +17 -0
- package/dist/sync/incremental-sync.d.ts.map +1 -0
- package/dist/sync/incremental-sync.js +155 -0
- package/dist/sync/incremental-sync.js.map +1 -0
- package/dist/sync/index.d.ts +12 -0
- package/dist/sync/index.d.ts.map +1 -0
- package/dist/sync/index.js +12 -0
- package/dist/sync/index.js.map +1 -0
- package/dist/sync/quota.d.ts +17 -0
- package/dist/sync/quota.d.ts.map +1 -0
- package/dist/sync/quota.js +48 -0
- package/dist/sync/quota.js.map +1 -0
- package/dist/sync/sequence.d.ts +21 -0
- package/dist/sync/sequence.d.ts.map +1 -0
- package/dist/sync/sequence.js +49 -0
- package/dist/sync/sequence.js.map +1 -0
- package/dist/sync/ssh-signer.d.ts +59 -0
- package/dist/sync/ssh-signer.d.ts.map +1 -0
- package/dist/sync/ssh-signer.js +241 -0
- package/dist/sync/ssh-signer.js.map +1 -0
- package/dist/sync/sync-service.d.ts +48 -0
- package/dist/sync/sync-service.d.ts.map +1 -0
- package/dist/sync/sync-service.js +116 -0
- package/dist/sync/sync-service.js.map +1 -0
- package/dist/sync/types.d.ts +106 -0
- package/dist/sync/types.d.ts.map +1 -0
- package/dist/sync/types.js +2 -0
- package/dist/sync/types.js.map +1 -0
- package/dist/sync/upload-queue.d.ts +40 -0
- package/dist/sync/upload-queue.d.ts.map +1 -0
- package/dist/sync/upload-queue.js +148 -0
- package/dist/sync/upload-queue.js.map +1 -0
- package/dist/sync/verification.d.ts +17 -0
- package/dist/sync/verification.d.ts.map +1 -0
- package/dist/sync/verification.js +25 -0
- package/dist/sync/verification.js.map +1 -0
- package/dist/vitest.config.d.ts +3 -0
- package/dist/vitest.config.d.ts.map +1 -0
- package/dist/vitest.config.js +16 -0
- package/dist/vitest.config.js.map +1 -0
- package/package.json +68 -0
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* URL fetching for the content pipeline.
|
|
3
|
+
*
|
|
4
|
+
* Uses Node.js built-in `fetch` (available since Node 18) with
|
|
5
|
+
* configurable timeout and user-agent; redirects are followed automatically by `fetch`.
|
|
6
|
+
*/
|
|
7
|
+
/**
 * Default pipeline configuration values relevant to fetching.
 *
 * `maxRedirects` is kept for configuration compatibility but is not read by
 * `fetchUrl`: redirects are followed by `fetch` itself (`redirect: 'follow'`).
 */
const DEFAULTS = {
    fetchTimeoutMs: 30_000,
    maxRedirects: 5,
    userAgent: 'ChaosKB/0.1',
};

/**
 * Convert a failure raised by `fetch` (or while reading the response body)
 * into an Error with a descriptive message. The original error is attached
 * as `cause` so callers can still inspect it.
 *
 * @param error - Whatever was thrown.
 * @param url - The URL that was being fetched (used in the message).
 * @param timeoutMs - The timeout in effect (used in the timeout message).
 * @returns A new Error describing the failure.
 */
function toFetchError(error, url, timeoutMs) {
    if (!(error instanceof Error)) {
        return new Error(`Failed to fetch ${url}: unknown error`, { cause: error });
    }
    // Our own AbortController fired: the only reason we abort is the timeout.
    if (error.name === 'AbortError') {
        return new Error(`Fetch timed out after ${timeoutMs}ms: ${url}`, { cause: error });
    }
    // Network-level failures carry a system error code on `error.cause`.
    if (error.cause && typeof error.cause === 'object' && 'code' in error.cause) {
        const code = error.cause.code;
        if (code === 'ENOTFOUND') {
            return new Error(`DNS resolution failed for ${url}: host not found`, { cause: error });
        }
        if (code === 'ECONNREFUSED') {
            return new Error(`Connection refused for ${url}`, { cause: error });
        }
        if (code === 'UNABLE_TO_VERIFY_LEAF_SIGNATURE' || code === 'ERR_TLS_CERT_ALTNAME_INVALID') {
            return new Error(`TLS certificate error for ${url}: ${code}`, { cause: error });
        }
    }
    // TLS errors often show up only in the message.
    const { message } = error;
    if (message.includes('SSL') || message.includes('TLS') || message.includes('certificate')) {
        return new Error(`TLS error fetching ${url}: ${message}`, { cause: error });
    }
    return new Error(`Failed to fetch ${url}: ${message}`, { cause: error });
}

/**
 * Fetch the HTML content of a URL.
 *
 * The timeout covers the whole operation, including downloading the response
 * body, so a server that sends headers and then stalls cannot hang the caller.
 *
 * @param url - The URL to fetch.
 * @param config - Optional partial pipeline config overrides
 *   (`fetchTimeoutMs`, `userAgent`).
 * @returns The HTML content, final URL (after redirects), and content type
 *   exactly as sent by the server.
 * @throws On timeout, network errors, non-2xx status codes, or non-HTML content.
 *   Errors derived from an underlying failure expose it as `error.cause`.
 */
export async function fetchUrl(url, config) {
    const timeoutMs = config?.fetchTimeoutMs ?? DEFAULTS.fetchTimeoutMs;
    const userAgent = config?.userAgent ?? DEFAULTS.userAgent;
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), timeoutMs);
    try {
        let response;
        try {
            response = await fetch(url, {
                method: 'GET',
                headers: {
                    'User-Agent': userAgent,
                    Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                },
                signal: controller.signal,
                redirect: 'follow',
            });
        }
        catch (error) {
            throw toFetchError(error, url, timeoutMs);
        }

        // Check HTTP status
        if (!response.ok) {
            const { status, statusText } = response;
            if (status >= 400 && status < 500) {
                throw new Error(`HTTP ${status} Client Error for ${url}: ${statusText}`);
            }
            if (status >= 500) {
                throw new Error(`HTTP ${status} Server Error for ${url}: ${statusText}`);
            }
            throw new Error(`HTTP ${status} for ${url}: ${statusText}`);
        }

        // Verify content type is HTML-like. Media types are case-insensitive,
        // so compare on a lower-cased copy but report the header as received.
        const contentType = response.headers.get('content-type') ?? '';
        const normalizedType = contentType.toLowerCase();
        const isHtml = normalizedType.includes('text/html') ||
            normalizedType.includes('application/xhtml+xml') ||
            normalizedType.includes('application/xml');
        if (!isHtml) {
            throw new Error(`Non-HTML content type "${contentType}" for ${url}. Only text/html is supported.`);
        }

        // The abort timer is still armed here, so a stalled body is cut off too.
        let html;
        try {
            html = await response.text();
        }
        catch (error) {
            throw toFetchError(error, url, timeoutMs);
        }
        const finalUrl = response.url || url;
        return { html, finalUrl, contentType };
    }
    finally {
        clearTimeout(timer);
    }
}
|
|
91
|
+
//# sourceMappingURL=fetch.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fetch.js","sourceRoot":"","sources":["../../pipeline/fetch.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,kEAAkE;AAClE,MAAM,QAAQ,GAA0E;IACtF,cAAc,EAAE,MAAM;IACtB,YAAY,EAAE,CAAC;IACf,SAAS,EAAE,aAAa;CACzB,CAAC;AAYF;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAC5B,GAAW,EACX,MAAgC;IAEhC,MAAM,SAAS,GAAG,MAAM,EAAE,cAAc,IAAI,QAAQ,CAAC,cAAc,CAAC;IACpE,MAAM,SAAS,GAAG,MAAM,EAAE,SAAS,IAAI,QAAQ,CAAC,SAAS,CAAC;IAE1D,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;IACzC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,SAAS,CAAC,CAAC;IAE9D,IAAI,QAAkB,CAAC;IACvB,IAAI,CAAC;QACH,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAC1B,MAAM,EAAE,KAAK;YACb,OAAO,EAAE;gBACP,YAAY,EAAE,SAAS;gBACvB,MAAM,EAAE,iEAAiE;aAC1E;YACD,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,QAAQ,EAAE,QAAQ;SACnB,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,KAAc,EAAE,CAAC;QACxB,YAAY,CAAC,KAAK,CAAC,CAAC;QACpB,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,IAAI,KAAK,CAAC,IAAI,KAAK,YAAY,EAAE,CAAC;gBAChC,MAAM,IAAI,KAAK,CAAC,yBAAyB,SAAS,OAAO,GAAG,EAAE,CAAC,CAAC;YAClE,CAAC;YACD,0BAA0B;YAC1B,IAAI,KAAK,CAAC,KAAK,IAAI,OAAO,KAAK,CAAC,KAAK,KAAK,QAAQ,IAAI,MAAM,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC;gBAC5E,MAAM,IAAI,GAAI,KAAK,CAAC,KAA2B,CAAC,IAAI,CAAC;gBACrD,IAAI,IAAI,KAAK,WAAW,EAAE,CAAC;oBACzB,MAAM,IAAI,KAAK,CAAC,6BAA6B,GAAG,kBAAkB,CAAC,CAAC;gBACtE,CAAC;gBACD,IAAI,IAAI,KAAK,cAAc,EAAE,CAAC;oBAC5B,MAAM,IAAI,KAAK,CAAC,0BAA0B,GAAG,EAAE,CAAC,CAAC;gBACnD,CAAC;gBACD,IAAI,IAAI,KAAK,iCAAiC,IAAI,IAAI,KAAK,8BAA8B,EAAE,CAAC;oBAC1F,MAAM,IAAI,KAAK,CAAC,6BAA6B,GAAG,KAAK,IAAI,EAAE,CAAC,CAAC;gBAC/D,CAAC;YACH,CAAC;YACD,0CAA0C;YAC1C,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;gBAC5G,MAAM,IAAI,KAAK,CAAC,sBAAsB,GAAG,KAAK,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;YACjE,CAAC;YACD,MAAM,IAAI,KAAK,CAAC,mBAAmB,GAAG,KAAK,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QAC9D,CAAC;QACD,MAAM,IAAI,KAAK,CAAC,mBAAmB,GAAG,iBAAiB,CAAC,CAAC;IAC3D,CAAC;YAAS,CAAC;QACT,YAAY,CAAC,KAAK,CAAC,CAAC;IACtB,CAAC;IAED,oBAAoB;IACpB,IAAI,CAAC,QAAQ,
CAAC,EAAE,EAAE,CAAC;QACjB,MAAM,MAAM,GAAG,QAAQ,CAAC,MAAM,CAAC;QAC/B,IAAI,MAAM,IAAI,GAAG,IAAI,MAAM,GAAG,GAAG,EAAE,CAAC;YAClC,MAAM,IAAI,KAAK,CAAC,QAAQ,MAAM,qBAAqB,GAAG,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;QACpF,CAAC;QACD,IAAI,MAAM,IAAI,GAAG,EAAE,CAAC;YAClB,MAAM,IAAI,KAAK,CAAC,QAAQ,MAAM,qBAAqB,GAAG,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;QACpF,CAAC;QACD,MAAM,IAAI,KAAK,CAAC,QAAQ,MAAM,QAAQ,GAAG,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;IACvE,CAAC;IAED,mCAAmC;IACnC,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;IAC/D,MAAM,MAAM,GACV,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC;QACjC,WAAW,CAAC,QAAQ,CAAC,uBAAuB,CAAC;QAC7C,WAAW,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC;IAE1C,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,KAAK,CACb,0BAA0B,WAAW,SAAS,GAAG,gCAAgC,CAClF,CAAC;IACJ,CAAC;IAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;IACnC,MAAM,QAAQ,GAAG,QAAQ,CAAC,GAAG,IAAI,GAAG,CAAC;IAErC,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,CAAC;AACzC,CAAC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
export * from './types.js';
|
|
2
|
+
export { fetchUrl } from './fetch.js';
|
|
3
|
+
export { extractContent } from './extract.js';
|
|
4
|
+
export { chunkText } from './chunker.js';
|
|
5
|
+
export { countTokens } from './tokenizer.js';
|
|
6
|
+
export { Embedder } from './embedder.js';
|
|
7
|
+
export { ModelManager } from './model-manager.js';
|
|
8
|
+
export { ContentPipeline } from './content-pipeline.js';
|
|
9
|
+
export { cosineSimilarity, searchEmbeddings } from './search.js';
|
|
10
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../pipeline/index.ts"],"names":[],"mappings":"AAAA,cAAc,YAAY,CAAC;AAC3B,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AACtC,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAC7C,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
export * from './types.js';
|
|
2
|
+
export { fetchUrl } from './fetch.js';
|
|
3
|
+
export { extractContent } from './extract.js';
|
|
4
|
+
export { chunkText } from './chunker.js';
|
|
5
|
+
export { countTokens } from './tokenizer.js';
|
|
6
|
+
export { Embedder } from './embedder.js';
|
|
7
|
+
export { ModelManager } from './model-manager.js';
|
|
8
|
+
export { ContentPipeline } from './content-pipeline.js';
|
|
9
|
+
export { cosineSimilarity, searchEmbeddings } from './search.js';
|
|
10
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../pipeline/index.ts"],"names":[],"mappings":"AAAA,cAAc,YAAY,CAAC;AAC3B,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AACtC,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAC7C,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC"}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ONNX model download and verification manager.
|
|
3
|
+
*
|
|
4
|
+
* Handles downloading the snowflake-arctic-embed-s ONNX model from
|
|
5
|
+
* HuggingFace, verifying its SHA-256 hash, and managing the local
|
|
6
|
+
* model cache directory.
|
|
7
|
+
*/
|
|
8
|
+
import type { DownloadProgressCallback, IModelManager } from './types.js';
|
|
9
|
+
/**
|
|
10
|
+
* Manages downloading and verifying the ONNX embedding model.
|
|
11
|
+
*/
|
|
12
|
+
export declare class ModelManager implements IModelManager {
    /** Directory in which the model, vocabulary and hash sidecar files live. */
    private readonly modelsDir;
    /**
     * @param modelsDir - Directory used to store model files. When omitted,
     *   `~/.chaoskb/models/` is used.
     */
    constructor(modelsDir?: string);
    /**
     * Path where the ONNX model file is expected (`model.onnx` inside the
     * models directory). The file is not guaranteed to exist.
     */
    getModelPath(): string;
    /**
     * Path where the vocabulary file is expected (`vocab.txt` inside the
     * models directory). The file is not guaranteed to exist.
     */
    getVocabPath(): string;
    /**
     * Report whether the model file is present and its computed hash equals
     * the value stored in the `model.onnx.sha256` sidecar. Resolves to `false`
     * (never rejects) when either file is missing or cannot be read.
     */
    isModelReady(): Promise<boolean>;
    /**
     * Make sure the model (and the vocabulary file) are available locally,
     * downloading them when missing or when the stored hash no longer matches.
     *
     * NOTE(review): in the accompanying implementation the sidecar hash is
     * computed from the downloaded file itself, so it detects later local
     * corruption rather than authenticating the download — confirm intent.
     *
     * @param onProgress - Optional callback for download progress.
     * @returns Path to the model file.
     */
    ensureModel(onProgress?: DownloadProgressCallback): Promise<string>;
    /**
     * Report whether the vocabulary file is present and its computed hash
     * equals the value stored in the `vocab.txt.sha256` sidecar. Resolves to
     * `false` when either file is missing or cannot be read.
     */
    isVocabReady(): Promise<boolean>;
    /**
     * Make sure the vocabulary file is available locally, downloading it when
     * it is not ready.
     *
     * @returns Path to the vocabulary file.
     */
    ensureVocab(): Promise<string>;
    /**
     * Download the model file with progress reporting.
     */
    private downloadModel;
    /**
     * Download a file from a URL with optional progress reporting and resume support.
     */
    private downloadFile;
}
|
|
57
|
+
//# sourceMappingURL=model-manager.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"model-manager.d.ts","sourceRoot":"","sources":["../../pipeline/model-manager.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AASH,OAAO,KAAK,EAAE,wBAAwB,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAyB1E;;GAEG;AACH,qBAAa,YAAa,YAAW,aAAa;IAChD,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IAEnC;;;OAGG;gBACS,SAAS,CAAC,EAAE,MAAM;IAI9B;;OAEG;IACH,YAAY,IAAI,MAAM;IAItB;;OAEG;IACH,YAAY,IAAI,MAAM;IAItB;;OAEG;IACG,YAAY,IAAI,OAAO,CAAC,OAAO,CAAC;IAoBtC;;;;;;;OAOG;IACG,WAAW,CAAC,UAAU,CAAC,EAAE,wBAAwB,GAAG,OAAO,CAAC,MAAM,CAAC;IA6BzE;;OAEG;IACG,YAAY,IAAI,OAAO,CAAC,OAAO,CAAC;IAoBtC;;OAEG;IACG,WAAW,IAAI,OAAO,CAAC,MAAM,CAAC;IAmBpC;;OAEG;YACW,aAAa;IAO3B;;OAEG;YACW,YAAY;CA6E3B"}
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ONNX model download and verification manager.
|
|
3
|
+
*
|
|
4
|
+
* Handles downloading the snowflake-arctic-embed-s ONNX model from
|
|
5
|
+
* HuggingFace, verifying its SHA-256 hash, and managing the local
|
|
6
|
+
* model cache directory.
|
|
7
|
+
*/
|
|
8
|
+
import { createHash } from 'node:crypto';
|
|
9
|
+
import { createReadStream, createWriteStream } from 'node:fs';
|
|
10
|
+
import { access, mkdir, readFile, rename, unlink, writeFile } from 'node:fs/promises';
|
|
11
|
+
import { homedir } from 'node:os';
|
|
12
|
+
import { join } from 'node:path';
|
|
13
|
+
import { pipeline } from 'node:stream/promises';
|
|
14
|
+
import { Readable } from 'node:stream';
|
|
15
|
+
/** Where the snowflake-arctic-embed-s ONNX model is downloaded from. */
const MODEL_URL = 'https://huggingface.co/Snowflake/snowflake-arctic-embed-s/resolve/main/onnx/model.onnx';

/** Where the matching vocabulary file is downloaded from. */
const VOCAB_URL = 'https://huggingface.co/Snowflake/snowflake-arctic-embed-s/resolve/main/vocab.txt';

/** Local filename of the model inside the models directory. */
const MODEL_FILENAME = 'model.onnx';

/** Local filename of the vocabulary inside the models directory. */
const VOCAB_FILENAME = 'vocab.txt';

/** Sidecar file holding the model's SHA-256 hash. */
const HASH_FILENAME = 'model.onnx.sha256';

/** Sidecar file holding the vocabulary's SHA-256 hash. */
const VOCAB_HASH_FILENAME = 'vocab.txt.sha256';

/** Suffix appended to a target path while its download is in progress. */
const TEMP_SUFFIX = '.download';
|
|
29
|
+
/**
 * Manages downloading and verifying the ONNX embedding model and its
 * vocabulary file.
 *
 * Each managed file is stored in `modelsDir` next to a sidecar file that
 * holds the hex SHA-256 recorded right after the file was downloaded.
 * A file counts as "ready" when it exists and its current hash equals the
 * sidecar content.
 *
 * NOTE(review): the recorded hash is computed from the bytes that were just
 * downloaded, so it detects later changes to the file on disk; it does not
 * authenticate the download itself. Pinning known-good hashes would be
 * needed for that.
 */
export class ModelManager {
    /** Directory holding the model, the vocabulary and their hash sidecars. */
    modelsDir;

    /**
     * @param modelsDir - Directory to store model files.
     *   Defaults to `~/.chaoskb/models/`.
     */
    constructor(modelsDir) {
        this.modelsDir = modelsDir ?? join(homedir(), '.chaoskb', 'models');
    }

    /**
     * Get the expected path to the model file.
     */
    getModelPath() {
        return join(this.modelsDir, MODEL_FILENAME);
    }

    /**
     * Get the expected path to the vocabulary file.
     */
    getVocabPath() {
        return join(this.modelsDir, VOCAB_FILENAME);
    }

    /**
     * Check if the model file exists and its SHA-256 matches the stored hash.
     *
     * @returns `true` only when both the model and its sidecar are readable
     *   and the hashes agree; any I/O failure yields `false`.
     */
    async isModelReady() {
        const modelPath = this.getModelPath();
        const hashPath = join(this.modelsDir, HASH_FILENAME);
        try {
            await access(modelPath);
            const storedHash = (await readFile(hashPath, 'utf-8')).trim();
            const actualHash = await computeFileHash(modelPath);
            return storedHash === actualHash;
        }
        catch {
            // Missing or unreadable file/sidecar simply means "not ready".
            return false;
        }
    }

    /**
     * Ensure the model is downloaded and verified. Downloads if missing
     * or hash mismatch. Also downloads the vocabulary file.
     *
     * @param onProgress - Optional callback invoked as
     *   `(downloadedBytes, totalBytes)` while the model downloads;
     *   `totalBytes` is 0 when the server did not report a usable size.
     * @returns Absolute path to the verified model file.
     * @throws Error when a download fails or is interrupted.
     */
    async ensureModel(onProgress) {
        const modelPath = this.getModelPath();
        await mkdir(this.modelsDir, { recursive: true });
        // Vocabulary first: it is the smaller download.
        await this.ensureVocab();
        if (await this.isModelReady()) {
            return modelPath;
        }
        // Download next to the final path, then move into place so a
        // half-written file never sits at the real model path.
        const tempPath = modelPath + TEMP_SUFFIX;
        await this.downloadModel(tempPath, onProgress);
        const hash = await computeFileHash(tempPath);
        await rename(tempPath, modelPath);
        await writeFile(join(this.modelsDir, HASH_FILENAME), hash + '\n', 'utf-8');
        return modelPath;
    }

    /**
     * Check if the vocabulary file exists and its hash matches.
     *
     * @returns `true` only when both the vocabulary and its sidecar are
     *   readable and the hashes agree; any I/O failure yields `false`.
     */
    async isVocabReady() {
        const vocabPath = this.getVocabPath();
        const hashPath = join(this.modelsDir, VOCAB_HASH_FILENAME);
        try {
            await access(vocabPath);
            const storedHash = (await readFile(hashPath, 'utf-8')).trim();
            const actualHash = await computeFileHash(vocabPath);
            return storedHash === actualHash;
        }
        catch {
            // Missing or unreadable file/sidecar simply means "not ready".
            return false;
        }
    }

    /**
     * Ensure the vocabulary file is downloaded and verified.
     *
     * @returns Path to the vocabulary file.
     * @throws Error when the download fails or is interrupted.
     */
    async ensureVocab() {
        const vocabPath = this.getVocabPath();
        if (await this.isVocabReady()) {
            return vocabPath;
        }
        await mkdir(this.modelsDir, { recursive: true });
        const tempPath = vocabPath + TEMP_SUFFIX;
        await this.downloadFile(VOCAB_URL, tempPath);
        const hash = await computeFileHash(tempPath);
        await rename(tempPath, vocabPath);
        await writeFile(join(this.modelsDir, VOCAB_HASH_FILENAME), hash + '\n', 'utf-8');
        return vocabPath;
    }

    /**
     * Download the model file with progress reporting.
     *
     * @param destPath - Where to write the downloaded bytes.
     * @param onProgress - Optional `(downloadedBytes, totalBytes)` callback.
     */
    async downloadModel(destPath, onProgress) {
        return this.downloadFile(MODEL_URL, destPath, onProgress);
    }

    /**
     * Download a file from a URL with optional progress reporting and resume support.
     *
     * If `destPath` already exists, its size is sent as a `Range` offset and
     * the response is appended when the server answers 206. When the server
     * answers 416 (the offset is not satisfiable, e.g. the leftover file is
     * already complete or belongs to an older revision), the leftover is
     * discarded and the download restarts from byte 0 instead of failing on
     * every subsequent call.
     *
     * On a stream failure the destination file is removed, so resuming only
     * applies to a partial file left behind by a process that stopped
     * without reaching that cleanup.
     *
     * @param url - Source URL.
     * @param destPath - Destination file path.
     * @param onProgress - Optional `(downloadedBytes, totalBytes)` callback;
     *   `totalBytes` is 0 when the size is unknown.
     * @throws Error on network failure, non-success HTTP status, missing
     *   response body, or an interrupted transfer. The original error is
     *   attached as `cause` where one exists.
     */
    async downloadFile(url, destPath, onProgress) {
        // Size of any partial file from an earlier attempt; 0 means start fresh.
        let existingSize = 0;
        try {
            const { stat } = await import('node:fs/promises');
            existingSize = (await stat(destPath)).size;
        }
        catch {
            // No partial download exists
        }

        // Issues the request, asking for a byte range only when resuming.
        const request = async (offset) => {
            const headers = {
                'User-Agent': 'ChaosKB/0.1',
            };
            if (offset > 0) {
                headers['Range'] = `bytes=${offset}-`;
            }
            try {
                return await fetch(url, { headers, redirect: 'follow' });
            }
            catch (error) {
                const msg = error instanceof Error ? error.message : String(error);
                throw new Error(`Failed to download ${url}: ${msg}`, { cause: error });
            }
        };

        let response = await request(existingSize);

        if (response.status === 416 && existingSize > 0) {
            // The leftover file cannot be extended. Drop it and start over.
            try {
                await response.body?.cancel();
            }
            catch {
                // Nothing useful to do if the rejected response cannot be cancelled.
            }
            try {
                await unlink(destPath);
            }
            catch {
                // The file is truncated on open below anyway.
            }
            existingSize = 0;
            response = await request(0);
        }

        if (!response.ok) {
            throw new Error(`Failed to download ${url}: HTTP ${response.status} ${response.statusText}`);
        }

        // Only a 206 answer to a ranged request continues the existing file;
        // every other success carries the full content and starts from zero.
        const resuming = response.status === 206 && existingSize > 0;
        if (!resuming) {
            existingSize = 0;
        }

        const contentLength = Number.parseInt(response.headers.get('content-length') ?? '', 10);
        const totalSize = Number.isFinite(contentLength) ? existingSize + contentLength : 0;

        if (!response.body) {
            throw new Error('Response body is null');
        }

        // Convert web ReadableStream to Node.js Readable
        const nodeStream = Readable.fromWeb(response.body);
        const writeStream = createWriteStream(destPath, {
            flags: resuming ? 'a' : 'w',
        });

        let downloaded = existingSize;
        nodeStream.on('data', (chunk) => {
            downloaded += chunk.length;
            onProgress?.(downloaded, totalSize);
        });

        try {
            await pipeline(nodeStream, writeStream);
        }
        catch (error) {
            // Clean up partial download on error
            try {
                await unlink(destPath);
            }
            catch {
                // Ignore cleanup errors
            }
            const msg = error instanceof Error ? error.message : String(error);
            throw new Error(`Download interrupted: ${msg}`, { cause: error });
        }
    }
}
|
|
219
|
+
/**
 * Stream a file through SHA-256 and return the digest.
 *
 * The file is read chunk by chunk, so memory use does not grow with file
 * size.
 *
 * @param filePath - Path to the file.
 * @returns Hex-encoded SHA-256 hash string. The returned promise rejects if
 *   the file cannot be read.
 */
async function computeFileHash(filePath) {
    const digest = createHash('sha256');
    for await (const piece of createReadStream(filePath)) {
        digest.update(piece);
    }
    return digest.digest('hex');
}
|
|
234
|
+
//# sourceMappingURL=model-manager.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"model-manager.js","sourceRoot":"","sources":["../../pipeline/model-manager.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,SAAS,CAAC;AAC9D,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACtF,OAAO,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAClC,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAC;AAChD,OAAO,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAGvC,kCAAkC;AAClC,MAAM,SAAS,GACb,wFAAwF,CAAC;AAE3F,oCAAoC;AACpC,MAAM,SAAS,GACb,kFAAkF,CAAC;AAErF,sBAAsB;AACtB,MAAM,cAAc,GAAG,YAAY,CAAC;AAEpC,2BAA2B;AAC3B,MAAM,cAAc,GAAG,WAAW,CAAC;AAEnC,gCAAgC;AAChC,MAAM,aAAa,GAAG,mBAAmB,CAAC;AAE1C,2CAA2C;AAC3C,MAAM,mBAAmB,GAAG,kBAAkB,CAAC;AAE/C,iCAAiC;AACjC,MAAM,WAAW,GAAG,WAAW,CAAC;AAEhC;;GAEG;AACH,MAAM,OAAO,YAAY;IACN,SAAS,CAAS;IAEnC;;;OAGG;IACH,YAAY,SAAkB;QAC5B,IAAI,CAAC,SAAS,GAAG,SAAS,IAAI,IAAI,CAAC,OAAO,EAAE,EAAE,UAAU,EAAE,QAAQ,CAAC,CAAC;IACtE,CAAC;IAED;;OAEG;IACH,YAAY;QACV,OAAO,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,cAAc,CAAC,CAAC;IAC9C,CAAC;IAED;;OAEG;IACH,YAAY;QACV,OAAO,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,cAAc,CAAC,CAAC;IAC9C,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,YAAY;QAChB,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;QACtC,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;QAErD,IAAI,CAAC;YACH,MAAM,MAAM,CAAC,SAAS,CAAC,CAAC;YACxB,MAAM,MAAM,CAAC,QAAQ,CAAC,CAAC;QACzB,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;QAED,IAAI,CAAC;YACH,MAAM,UAAU,GAAG,CAAC,MAAM,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAC9D,MAAM,UAAU,GAAG,MAAM,eAAe,CAAC,SAAS,CAAC,CAAC;YACpD,OAAO,UAAU,KAAK,UAAU,CAAC;QACnC,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,WAAW,CAAC,UAAqC;QACrD,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;QAEtC,mBAAmB;QACnB,MAAM,KAAK,CAAC,IAAI,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAEjD,oDAAoD;QACpD,MAAM,IAAI,CAAC,WAAW,EAAE,CAAC;QAEzB,IAAI,MAAM,IAAI,CAAC,YAAY,EAAE,EAAE,CAAC;YAC9B,OAAO,SAAS,CAAC;QACnB,CAAC
;QAED,6BAA6B;QAC7B,MAAM,QAAQ,GAAG,SAAS,GAAG,WAAW,CAAC;QACzC,MAAM,IAAI,CAAC,aAAa,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;QAE/C,eAAe;QACf,MAAM,IAAI,GAAG,MAAM,eAAe,CAAC,QAAQ,CAAC,CAAC;QAE7C,yBAAyB;QACzB,MAAM,MAAM,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;QAElC,qBAAqB;QACrB,MAAM,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,aAAa,CAAC,EAAE,IAAI,GAAG,IAAI,EAAE,OAAO,CAAC,CAAC;QAE3E,OAAO,SAAS,CAAC;IACnB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,YAAY;QAChB,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;QACtC,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,mBAAmB,CAAC,CAAC;QAE3D,IAAI,CAAC;YACH,MAAM,MAAM,CAAC,SAAS,CAAC,CAAC;YACxB,MAAM,MAAM,CAAC,QAAQ,CAAC,CAAC;QACzB,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;QAED,IAAI,CAAC;YACH,MAAM,UAAU,GAAG,CAAC,MAAM,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAC9D,MAAM,UAAU,GAAG,MAAM,eAAe,CAAC,SAAS,CAAC,CAAC;YACpD,OAAO,UAAU,KAAK,UAAU,CAAC;QACnC,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,WAAW;QACf,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;QAEtC,IAAI,MAAM,IAAI,CAAC,YAAY,EAAE,EAAE,CAAC;YAC9B,OAAO,SAAS,CAAC;QACnB,CAAC;QAED,MAAM,KAAK,CAAC,IAAI,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAEjD,MAAM,QAAQ,GAAG,SAAS,GAAG,WAAW,CAAC;QACzC,MAAM,IAAI,CAAC,YAAY,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;QAE7C,MAAM,IAAI,GAAG,MAAM,eAAe,CAAC,QAAQ,CAAC,CAAC;QAC7C,MAAM,MAAM,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;QAClC,MAAM,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,mBAAmB,CAAC,EAAE,IAAI,GAAG,IAAI,EAAE,OAAO,CAAC,CAAC;QAEjF,OAAO,SAAS,CAAC;IACnB,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,aAAa,CACzB,QAAgB,EAChB,UAAqC;QAErC,OAAO,IAAI,CAAC,YAAY,CAAC,SAAS,EAAE,QAAQ,EAAE,UAAU,CAAC,CAAC;IAC5D,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,YAAY,CACxB,GAAW,EACX,QAAgB,EAChB,UAAqC;QAErC,2DAA2D;QAC3D,IAAI,YAAY,GAAG,CAAC,CAAC;QACrB,IAAI,CAAC;YACH,MAAM,EAAE,IAAI,EAAE,GAAG,MAAM,MAAM,CAAC,kBAAkB,CAAC,CAAC;YAClD,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,CAAC;YACnC,YAAY,GAAG,KAAK,CAAC,IAAI,CAAC;QAC5B,CAAC;QAAC,MAAM,CAAC;YACP,6BAA6B;QAC/B,CAAC;QAED,MAAM,OAAO,GAA2B;YACtC,YAAY,EAAE,aAAa;SAC5B,CAAC;QAEF,wCAAwC;QACxC,IAAI
,YAAY,GAAG,CAAC,EAAE,CAAC;YACrB,OAAO,CAAC,OAAO,CAAC,GAAG,SAAS,YAAY,GAAG,CAAC;QAC9C,CAAC;QAED,IAAI,QAAkB,CAAC;QACvB,IAAI,CAAC;YACH,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,CAAC,CAAC;QAC/D,CAAC;QAAC,OAAO,KAAc,EAAE,CAAC;YACxB,MAAM,GAAG,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACnE,MAAM,IAAI,KAAK,CAAC,sBAAsB,GAAG,KAAK,GAAG,EAAE,CAAC,CAAC;QACvD,CAAC;QAED,wEAAwE;QACxE,IAAI,QAAQ,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;YAC5B,YAAY,GAAG,CAAC,CAAC;QACnB,CAAC;aAAM,IAAI,QAAQ,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;YACnC,qCAAqC;QACvC,CAAC;aAAM,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACxB,MAAM,IAAI,KAAK,CAAC,sBAAsB,GAAG,UAAU,QAAQ,CAAC,MAAM,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;QAC/F,CAAC;QAED,MAAM,aAAa,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC;QAC7D,MAAM,SAAS,GAAG,aAAa;YAC7B,CAAC,CAAC,YAAY,GAAG,QAAQ,CAAC,aAAa,EAAE,EAAE,CAAC;YAC5C,CAAC,CAAC,CAAC,CAAC;QAEN,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnB,MAAM,IAAI,KAAK,CAAC,uBAAuB,CAAC,CAAC;QAC3C,CAAC;QAED,iDAAiD;QACjD,MAAM,UAAU,GAAG,QAAQ,CAAC,OAAO,CAAC,QAAQ,CAAC,IAAgD,CAAC,CAAC;QAE/F,MAAM,WAAW,GAAG,iBAAiB,CAAC,QAAQ,EAAE;YAC9C,KAAK,EAAE,YAAY,GAAG,CAAC,IAAI,QAAQ,CAAC,MAAM,KAAK,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG;SAC/D,CAAC,CAAC;QAEH,IAAI,UAAU,GAAG,YAAY,CAAC;QAE9B,UAAU,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,KAAa,EAAE,EAAE;YACtC,UAAU,IAAI,KAAK,CAAC,MAAM,CAAC;YAC3B,UAAU,EAAE,CAAC,UAAU,EAAE,SAAS,CAAC,CAAC;QACtC,CAAC,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,QAAQ,CAAC,UAAU,EAAE,WAAW,CAAC,CAAC;QAC1C,CAAC;QAAC,OAAO,KAAc,EAAE,CAAC;YACxB,qCAAqC;YACrC,IAAI,CAAC;gBACH,MAAM,MAAM,CAAC,QAAQ,CAAC,CAAC;YACzB,CAAC;YAAC,MAAM,CAAC;gBACP,wBAAwB;YAC1B,CAAC;YACD,MAAM,GAAG,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACnE,MAAM,IAAI,KAAK,CAAC,yBAAyB,GAAG,EAAE,CAAC,CAAC;QAClD,CAAC;IACH,CAAC;CACF;AAED;;;;;GAKG;AACH,KAAK,UAAU,eAAe,CAAC,QAAgB;IAC7C,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACrC,MAAM,IAAI,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC;QAClC,MAAM,MAAM,GAAG,gBAAgB
,CAAC,QAAQ,CAAC,CAAC;QAC1C,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC;QACjD,MAAM,CAAC,EAAE,CAAC,KAAK,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACpD,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;IAC7B,CAAC,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Brute-force cosine similarity search over embedding vectors.
|
|
3
|
+
*
|
|
4
|
+
* Designed for in-memory search of up to ~50k 384-dimensional embeddings.
|
|
5
|
+
* At that scale, brute-force cosine similarity is fast enough (<50ms)
|
|
6
|
+
* and avoids the complexity of approximate nearest-neighbor indices.
|
|
7
|
+
*/
|
|
8
|
+
/**
|
|
9
|
+
* Compute the cosine similarity between two vectors.
|
|
10
|
+
*
|
|
11
|
+
* cosine_similarity = (a . b) / (|a| * |b|)
|
|
12
|
+
*
|
|
13
|
+
* Optimized for Float32Array — no intermediate allocations.
|
|
14
|
+
*
|
|
15
|
+
* @param a - First vector.
|
|
16
|
+
* @param b - Second vector (must be same length as `a`).
|
|
17
|
+
* @returns Cosine similarity in the range [-1, 1]. Returns 0 if either
|
|
18
|
+
* vector has zero magnitude.
|
|
19
|
+
*/
|
|
20
|
+
export declare function cosineSimilarity(a: Float32Array, b: Float32Array): number;
|
|
21
|
+
/** A search result with index and similarity score. */
|
|
22
|
+
export interface ScoredResult {
|
|
23
|
+
/** Index of the embedding in the input array. */
|
|
24
|
+
index: number;
|
|
25
|
+
/** Cosine similarity score. */
|
|
26
|
+
score: number;
|
|
27
|
+
}
|
|
28
|
+
/**
|
|
29
|
+
* Search a collection of embeddings for the top-K most similar to a query.
|
|
30
|
+
*
|
|
31
|
+
* @param query - The query embedding vector.
|
|
32
|
+
* @param embeddings - Array of embedding vectors to search.
|
|
33
|
+
* @param topK - Number of top results to return.
|
|
34
|
+
* @returns Array of `{ index, score }` sorted by score descending, length <= topK.
|
|
35
|
+
*/
|
|
36
|
+
export declare function searchEmbeddings(query: Float32Array, embeddings: Float32Array[], topK: number): ScoredResult[];
|
|
37
|
+
//# sourceMappingURL=search.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"search.d.ts","sourceRoot":"","sources":["../../pipeline/search.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH;;;;;;;;;;;GAWG;AACH,wBAAgB,gBAAgB,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,GAAG,MAAM,CAwBzE;AAED,uDAAuD;AACvD,MAAM,WAAW,YAAY;IAC3B,iDAAiD;IACjD,KAAK,EAAE,MAAM,CAAC;IACd,+BAA+B;IAC/B,KAAK,EAAE,MAAM,CAAC;CACf;AAED;;;;;;;GAOG;AACH,wBAAgB,gBAAgB,CAC9B,KAAK,EAAE,YAAY,EACnB,UAAU,EAAE,YAAY,EAAE,EAC1B,IAAI,EAAE,MAAM,GACX,YAAY,EAAE,CAmBhB"}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Brute-force cosine similarity search over embedding vectors.
|
|
3
|
+
*
|
|
4
|
+
* Designed for in-memory search of up to ~50k 384-dimensional embeddings.
|
|
5
|
+
* At that scale, brute-force cosine similarity is fast enough (<50ms)
|
|
6
|
+
* and avoids the complexity of approximate nearest-neighbor indices.
|
|
7
|
+
*/
|
|
8
|
+
/**
 * Cosine of the angle between two equal-length vectors:
 *
 *   (a . b) / (|a| * |b|)
 *
 * The dot product and both squared norms are accumulated in a single pass
 * using plain numeric locals, so no arrays are allocated.
 *
 * @param a - First vector.
 * @param b - Second vector (must be same length as `a`).
 * @returns The cosine similarity, or 0 when either vector has zero
 *   magnitude.
 * @throws Error if the vectors differ in length.
 */
export function cosineSimilarity(a, b) {
    const dims = a.length;
    if (dims !== b.length) {
        throw new Error(`Vector dimension mismatch: ${a.length} vs ${b.length}`);
    }
    let dotProduct = 0;
    let sumSquaresA = 0;
    let sumSquaresB = 0;
    let idx = 0;
    while (idx < dims) {
        const x = a[idx];
        const y = b[idx];
        dotProduct += x * y;
        sumSquaresA += x * x;
        sumSquaresB += y * y;
        idx += 1;
    }
    const norms = Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB);
    // A zero-magnitude vector has no direction; report "no similarity".
    return norms === 0 ? 0 : dotProduct / norms;
}
|
|
41
|
+
/**
 * Search a collection of embeddings for the top-K most similar to a query.
 *
 * Every embedding is scored against the query, the scores are fully sorted,
 * and the first `topK` entries are returned. Entries whose score is `NaN`
 * (for example because an embedding contains `NaN`) are ranked after all
 * numeric scores, so one bad vector cannot disturb the order of the rest.
 * Equal scores keep their original index order.
 *
 * @param query - The query embedding vector.
 * @param embeddings - Array of embedding vectors to search.
 * @param topK - Number of top results to return.
 * @returns Array of `{ index, score }` sorted by score descending, length <= topK.
 * @throws Error if an embedding's length differs from the query's length.
 */
export function searchEmbeddings(query, embeddings, topK) {
    if (embeddings.length === 0 || topK <= 0) {
        return [];
    }
    const k = Math.min(topK, embeddings.length);

    // Score every embedding against the query.
    const scored = Array.from(embeddings, (vector, index) => ({
        index,
        score: cosineSimilarity(query, vector),
    }));

    // `NaN - x` is NaN, which a sort comparator treats as "equal" and which
    // makes the ordering inconsistent. Rank NaN below every real score.
    const sortKey = (score) => (Number.isNaN(score) ? Number.NEGATIVE_INFINITY : score);

    // Full sort, then slice. A top-K selection would do less work for small
    // K, but sorting keeps the code simple for the intended data sizes.
    scored.sort((left, right) => {
        const leftKey = sortKey(left.score);
        const rightKey = sortKey(right.score);
        if (leftKey === rightKey) {
            return 0;
        }
        return rightKey > leftKey ? 1 : -1;
    });

    return scored.slice(0, k);
}
|
|
65
|
+
//# sourceMappingURL=search.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"search.js","sourceRoot":"","sources":["../../pipeline/search.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH;;;;;;;;;;;GAWG;AACH,MAAM,UAAU,gBAAgB,CAAC,CAAe,EAAE,CAAe;IAC/D,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,MAAM,EAAE,CAAC;QAC1B,MAAM,IAAI,KAAK,CAAC,8BAA8B,CAAC,CAAC,MAAM,OAAO,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;IAC3E,CAAC;IAED,MAAM,GAAG,GAAG,CAAC,CAAC,MAAM,CAAC;IACrB,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,IAAI,IAAI,GAAG,CAAC,CAAC;IACb,IAAI,IAAI,GAAG,CAAC,CAAC;IAEb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;QAC7B,MAAM,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAChB,MAAM,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAChB,GAAG,IAAI,EAAE,GAAG,EAAE,CAAC;QACf,IAAI,IAAI,EAAE,GAAG,EAAE,CAAC;QAChB,IAAI,IAAI,EAAE,GAAG,EAAE,CAAC;IAClB,CAAC;IAED,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAChD,IAAI,KAAK,KAAK,CAAC,EAAE,CAAC;QAChB,OAAO,CAAC,CAAC;IACX,CAAC;IAED,OAAO,GAAG,GAAG,KAAK,CAAC;AACrB,CAAC;AAUD;;;;;;;GAOG;AACH,MAAM,UAAU,gBAAgB,CAC9B,KAAmB,EACnB,UAA0B,EAC1B,IAAY;IAEZ,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,IAAI,IAAI,IAAI,CAAC,EAAE,CAAC;QACzC,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,UAAU,CAAC,MAAM,CAAC,CAAC;IAE5C,2BAA2B;IAC3B,MAAM,MAAM,GAAmB,IAAI,KAAK,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;IAC5D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,MAAM,CAAC,CAAC,CAAC,GAAG,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,gBAAgB,CAAC,KAAK,EAAE,UAAU,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IAC1E,CAAC;IAED,4DAA4D;IAC5D,qEAAqE;IACrE,sEAAsE;IACtE,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;IAEzC,OAAO,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;AAC5B,CAAC"}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Token counting for chunking decisions.
|
|
3
|
+
*
|
|
4
|
+
* Uses the real WordPiece tokenizer with the model vocabulary when available,
|
|
5
|
+
* falling back to a lightweight heuristic when the vocab hasn't been loaded.
|
|
6
|
+
*/
|
|
7
|
+
/**
|
|
8
|
+
* Set the vocabulary for accurate token counting.
|
|
9
|
+
* Call this once after the model/vocab has been downloaded.
|
|
10
|
+
*
|
|
11
|
+
* @param vocabPath - Absolute path to vocab.txt.
|
|
12
|
+
*/
|
|
13
|
+
export declare function setTokenizerVocab(vocabPath: string): void;
|
|
14
|
+
/**
|
|
15
|
+
* Clear the tokenizer vocabulary (for testing).
|
|
16
|
+
*/
|
|
17
|
+
export declare function clearTokenizerVocab(): void;
|
|
18
|
+
/**
|
|
19
|
+
* Count the number of tokens in a text string.
|
|
20
|
+
*
|
|
21
|
+
* If a vocabulary has been loaded via `setTokenizerVocab()`, returns
|
|
22
|
+
* an accurate WordPiece token count. Otherwise, falls back to the
|
|
23
|
+
* heuristic estimate (suitable for chunking decisions).
|
|
24
|
+
*
|
|
25
|
+
* @param text - The input text to count tokens for.
|
|
26
|
+
* @returns Token count (always >= 0).
|
|
27
|
+
*/
|
|
28
|
+
export declare function countTokens(text: string): number;
|
|
29
|
+
//# sourceMappingURL=tokenizer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../../pipeline/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAOH;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,GAAG,IAAI,CAEzD;AAED;;GAEG;AACH,wBAAgB,mBAAmB,IAAI,IAAI,CAE1C;AAYD;;;;;;;;;GASG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAYhD"}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Token counting for chunking decisions.
|
|
3
|
+
*
|
|
4
|
+
* Uses the real WordPiece tokenizer with the model vocabulary when available,
|
|
5
|
+
* falling back to a lightweight heuristic when the vocab hasn't been loaded.
|
|
6
|
+
*/
|
|
7
|
+
import { countWordPieceTokens, loadVocabulary } from './wordpiece-tokenizer.js';
|
|
8
|
+
/** Cached vocabulary reference for token counting. */
|
|
9
|
+
let cachedVocab = null;
|
|
10
|
+
/**
 * Load a vocabulary file and cache it for subsequent `countTokens()` calls.
 *
 * Intended to be called once the model/vocab files are on disk. If loading
 * throws, the previously cached vocabulary (if any) is left untouched.
 *
 * @param vocabPath - Absolute path to vocab.txt.
 */
export function setTokenizerVocab(vocabPath) {
    const loaded = loadVocabulary(vocabPath);
    cachedVocab = loaded;
}
|
|
19
|
+
/**
 * Drop the cached vocabulary so `countTokens()` reverts to its heuristic
 * estimate. Provided for tests.
 */
export function clearTokenizerVocab() {
    cachedVocab = null;
}
|
|
25
|
+
/** Runs of whitespace and punctuation that act as token boundaries (for heuristic fallback). */
|
|
26
|
+
const PUNCTUATION_RE = /[\s.,;:!?()\[\]{}'"\-\u2014\u2013\u2026]+/;
|
|
27
|
+
/**
|
|
28
|
+
* Approximate multiplier to account for sub-word splitting.
|
|
29
|
+
* WordPiece / BPE tokenizers typically produce ~1.3x the number of
|
|
30
|
+
* whitespace-delimited words.
|
|
31
|
+
*/
|
|
32
|
+
const SUBWORD_MULTIPLIER = 1.3;
|
|
33
|
+
/**
 * Count the tokens in a piece of text.
 *
 * With a vocabulary cached via `setTokenizerVocab()`, the count comes from
 * the WordPiece tokenizer. Without one, the text is split on the boundary
 * pattern, the non-empty pieces are counted, and that count is scaled by
 * the sub-word multiplier and rounded up. The estimate is meant for
 * chunking decisions.
 *
 * @param text - The input text to count tokens for.
 * @returns Token count (always >= 0); 0 for empty or whitespace-only text.
 */
export function countTokens(text) {
    const isBlank = !text || text.trim().length === 0;
    if (isBlank) {
        return 0;
    }
    if (!cachedVocab) {
        // Heuristic fallback when vocab is not loaded
        let wordCount = 0;
        for (const piece of text.split(PUNCTUATION_RE)) {
            if (piece.length > 0) {
                wordCount += 1;
            }
        }
        return Math.ceil(wordCount * SUBWORD_MULTIPLIER);
    }
    return countWordPieceTokens(text, cachedVocab);
}
|
|
54
|
+
//# sourceMappingURL=tokenizer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../../pipeline/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,oBAAoB,EAAE,cAAc,EAAmB,MAAM,0BAA0B,CAAC;AAEjG,sDAAsD;AACtD,IAAI,WAAW,GAAsB,IAAI,CAAC;AAE1C;;;;;GAKG;AACH,MAAM,UAAU,iBAAiB,CAAC,SAAiB;IACjD,WAAW,GAAG,cAAc,CAAC,SAAS,CAAC,CAAC;AAC1C,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,mBAAmB;IACjC,WAAW,GAAG,IAAI,CAAC;AACrB,CAAC;AAED,oFAAoF;AACpF,MAAM,cAAc,GAAG,2CAA2C,CAAC;AAEnE;;;;GAIG;AACH,MAAM,kBAAkB,GAAG,GAAG,CAAC;AAE/B;;;;;;;;;GASG;AACH,MAAM,UAAU,WAAW,CAAC,IAAY;IACtC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtC,OAAO,CAAC,CAAC;IACX,CAAC;IAED,IAAI,WAAW,EAAE,CAAC;QAChB,OAAO,oBAAoB,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;IACjD,CAAC;IAED,8CAA8C;IAC9C,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IACrE,OAAO,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,kBAAkB,CAAC,CAAC;AACtD,CAAC"}
|