@de-otio/chaoskb-client 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/agent-registry/config-merger.d.ts +28 -0
- package/dist/cli/agent-registry/config-merger.d.ts.map +1 -0
- package/dist/cli/agent-registry/config-merger.js +90 -0
- package/dist/cli/agent-registry/config-merger.js.map +1 -0
- package/dist/cli/agent-registry/detector.d.ts +7 -0
- package/dist/cli/agent-registry/detector.d.ts.map +1 -0
- package/dist/cli/agent-registry/detector.js +100 -0
- package/dist/cli/agent-registry/detector.js.map +1 -0
- package/dist/cli/agent-registry/index.d.ts +26 -0
- package/dist/cli/agent-registry/index.d.ts.map +1 -0
- package/dist/cli/agent-registry/index.js +77 -0
- package/dist/cli/agent-registry/index.js.map +1 -0
- package/dist/cli/agent-registry/path-validator.d.ts +11 -0
- package/dist/cli/agent-registry/path-validator.d.ts.map +1 -0
- package/dist/cli/agent-registry/path-validator.js +69 -0
- package/dist/cli/agent-registry/path-validator.js.map +1 -0
- package/dist/cli/agent-registry/registry.json +108 -0
- package/dist/cli/agent-registry/types.d.ts +29 -0
- package/dist/cli/agent-registry/types.d.ts.map +1 -0
- package/dist/cli/agent-registry/types.js +2 -0
- package/dist/cli/agent-registry/types.js.map +1 -0
- package/dist/cli/bootstrap-lock.d.ts +7 -0
- package/dist/cli/bootstrap-lock.d.ts.map +1 -0
- package/dist/cli/bootstrap-lock.js +62 -0
- package/dist/cli/bootstrap-lock.js.map +1 -0
- package/dist/cli/bootstrap.d.ts +23 -0
- package/dist/cli/bootstrap.d.ts.map +1 -0
- package/dist/cli/bootstrap.js +438 -0
- package/dist/cli/bootstrap.js.map +1 -0
- package/dist/cli/commands/config.d.ts +13 -0
- package/dist/cli/commands/config.d.ts.map +1 -0
- package/dist/cli/commands/config.js +244 -0
- package/dist/cli/commands/config.js.map +1 -0
- package/dist/cli/commands/devices.d.ts +21 -0
- package/dist/cli/commands/devices.d.ts.map +1 -0
- package/dist/cli/commands/devices.js +229 -0
- package/dist/cli/commands/devices.js.map +1 -0
- package/dist/cli/commands/export.d.ts +12 -0
- package/dist/cli/commands/export.d.ts.map +1 -0
- package/dist/cli/commands/export.js +183 -0
- package/dist/cli/commands/export.js.map +1 -0
- package/dist/cli/commands/import.d.ts +26 -0
- package/dist/cli/commands/import.d.ts.map +1 -0
- package/dist/cli/commands/import.js +311 -0
- package/dist/cli/commands/import.js.map +1 -0
- package/dist/cli/commands/kb.d.ts +39 -0
- package/dist/cli/commands/kb.d.ts.map +1 -0
- package/dist/cli/commands/kb.js +138 -0
- package/dist/cli/commands/kb.js.map +1 -0
- package/dist/cli/commands/project.d.ts +6 -0
- package/dist/cli/commands/project.d.ts.map +1 -0
- package/dist/cli/commands/project.js +115 -0
- package/dist/cli/commands/project.js.map +1 -0
- package/dist/cli/commands/projects.d.ts +33 -0
- package/dist/cli/commands/projects.d.ts.map +1 -0
- package/dist/cli/commands/projects.js +189 -0
- package/dist/cli/commands/projects.js.map +1 -0
- package/dist/cli/commands/register.d.ts +8 -0
- package/dist/cli/commands/register.d.ts.map +1 -0
- package/dist/cli/commands/register.js +146 -0
- package/dist/cli/commands/register.js.map +1 -0
- package/dist/cli/commands/rotate-key.d.ts +16 -0
- package/dist/cli/commands/rotate-key.d.ts.map +1 -0
- package/dist/cli/commands/rotate-key.js +197 -0
- package/dist/cli/commands/rotate-key.js.map +1 -0
- package/dist/cli/commands/setup-sync.d.ts +2 -0
- package/dist/cli/commands/setup-sync.d.ts.map +1 -0
- package/dist/cli/commands/setup-sync.js +165 -0
- package/dist/cli/commands/setup-sync.js.map +1 -0
- package/dist/cli/commands/setup.d.ts +12 -0
- package/dist/cli/commands/setup.d.ts.map +1 -0
- package/dist/cli/commands/setup.js +39 -0
- package/dist/cli/commands/setup.js.map +1 -0
- package/dist/cli/commands/status.d.ts +5 -0
- package/dist/cli/commands/status.d.ts.map +1 -0
- package/dist/cli/commands/status.js +96 -0
- package/dist/cli/commands/status.js.map +1 -0
- package/dist/cli/commands/uninstall.d.ts +4 -0
- package/dist/cli/commands/uninstall.d.ts.map +1 -0
- package/dist/cli/commands/uninstall.js +85 -0
- package/dist/cli/commands/uninstall.js.map +1 -0
- package/dist/cli/commands/unregister.d.ts +2 -0
- package/dist/cli/commands/unregister.d.ts.map +1 -0
- package/dist/cli/commands/unregister.js +46 -0
- package/dist/cli/commands/unregister.js.map +1 -0
- package/dist/cli/device-metadata.d.ts +15 -0
- package/dist/cli/device-metadata.d.ts.map +1 -0
- package/dist/cli/device-metadata.js +58 -0
- package/dist/cli/device-metadata.js.map +1 -0
- package/dist/cli/github.d.ts +38 -0
- package/dist/cli/github.d.ts.map +1 -0
- package/dist/cli/github.js +159 -0
- package/dist/cli/github.js.map +1 -0
- package/dist/cli/guide-hashes.json +13 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +226 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/mcp-server.d.ts +205 -0
- package/dist/cli/mcp-server.d.ts.map +1 -0
- package/dist/cli/mcp-server.js +366 -0
- package/dist/cli/mcp-server.js.map +1 -0
- package/dist/cli/tools/kb-delete.d.ts +10 -0
- package/dist/cli/tools/kb-delete.d.ts.map +1 -0
- package/dist/cli/tools/kb-delete.js +28 -0
- package/dist/cli/tools/kb-delete.js.map +1 -0
- package/dist/cli/tools/kb-ingest.d.ts +13 -0
- package/dist/cli/tools/kb-ingest.d.ts.map +1 -0
- package/dist/cli/tools/kb-ingest.js +72 -0
- package/dist/cli/tools/kb-ingest.js.map +1 -0
- package/dist/cli/tools/kb-list.d.ts +20 -0
- package/dist/cli/tools/kb-list.d.ts.map +1 -0
- package/dist/cli/tools/kb-list.js +24 -0
- package/dist/cli/tools/kb-list.js.map +1 -0
- package/dist/cli/tools/kb-query-shared.d.ts +27 -0
- package/dist/cli/tools/kb-query-shared.d.ts.map +1 -0
- package/dist/cli/tools/kb-query-shared.js +28 -0
- package/dist/cli/tools/kb-query-shared.js.map +1 -0
- package/dist/cli/tools/kb-query.d.ts +20 -0
- package/dist/cli/tools/kb-query.d.ts.map +1 -0
- package/dist/cli/tools/kb-query.js +109 -0
- package/dist/cli/tools/kb-query.js.map +1 -0
- package/dist/cli/tools/kb-summary.d.ts +29 -0
- package/dist/cli/tools/kb-summary.d.ts.map +1 -0
- package/dist/cli/tools/kb-summary.js +89 -0
- package/dist/cli/tools/kb-summary.js.map +1 -0
- package/dist/cli/tools/kb-sync-status.d.ts +7 -0
- package/dist/cli/tools/kb-sync-status.d.ts.map +1 -0
- package/dist/cli/tools/kb-sync-status.js +48 -0
- package/dist/cli/tools/kb-sync-status.js.map +1 -0
- package/dist/crypto/aad.d.ts +8 -0
- package/dist/crypto/aad.d.ts.map +1 -0
- package/dist/crypto/aad.js +11 -0
- package/dist/crypto/aad.js.map +1 -0
- package/dist/crypto/aead.d.ts +21 -0
- package/dist/crypto/aead.d.ts.map +1 -0
- package/dist/crypto/aead.js +43 -0
- package/dist/crypto/aead.js.map +1 -0
- package/dist/crypto/argon2.d.ts +11 -0
- package/dist/crypto/argon2.d.ts.map +1 -0
- package/dist/crypto/argon2.js +33 -0
- package/dist/crypto/argon2.js.map +1 -0
- package/dist/crypto/blob-id.d.ts +6 -0
- package/dist/crypto/blob-id.d.ts.map +1 -0
- package/dist/crypto/blob-id.js +33 -0
- package/dist/crypto/blob-id.js.map +1 -0
- package/dist/crypto/canonical-json.d.ts +6 -0
- package/dist/crypto/canonical-json.d.ts.map +1 -0
- package/dist/crypto/canonical-json.js +88 -0
- package/dist/crypto/canonical-json.js.map +1 -0
- package/dist/crypto/commitment.d.ts +12 -0
- package/dist/crypto/commitment.d.ts.map +1 -0
- package/dist/crypto/commitment.js +37 -0
- package/dist/crypto/commitment.js.map +1 -0
- package/dist/crypto/encryption-service.d.ts +19 -0
- package/dist/crypto/encryption-service.d.ts.map +1 -0
- package/dist/crypto/encryption-service.js +38 -0
- package/dist/crypto/encryption-service.js.map +1 -0
- package/dist/crypto/envelope-cbor.d.ts +37 -0
- package/dist/crypto/envelope-cbor.d.ts.map +1 -0
- package/dist/crypto/envelope-cbor.js +124 -0
- package/dist/crypto/envelope-cbor.js.map +1 -0
- package/dist/crypto/envelope.d.ts +34 -0
- package/dist/crypto/envelope.d.ts.map +1 -0
- package/dist/crypto/envelope.js +160 -0
- package/dist/crypto/envelope.js.map +1 -0
- package/dist/crypto/hkdf.d.ts +16 -0
- package/dist/crypto/hkdf.d.ts.map +1 -0
- package/dist/crypto/hkdf.js +33 -0
- package/dist/crypto/hkdf.js.map +1 -0
- package/dist/crypto/index.d.ts +15 -0
- package/dist/crypto/index.d.ts.map +1 -0
- package/dist/crypto/index.js +15 -0
- package/dist/crypto/index.js.map +1 -0
- package/dist/crypto/invite.d.ts +31 -0
- package/dist/crypto/invite.d.ts.map +1 -0
- package/dist/crypto/invite.js +137 -0
- package/dist/crypto/invite.js.map +1 -0
- package/dist/crypto/keyring.d.ts +37 -0
- package/dist/crypto/keyring.d.ts.map +1 -0
- package/dist/crypto/keyring.js +219 -0
- package/dist/crypto/keyring.js.map +1 -0
- package/dist/crypto/known-keys.d.ts +34 -0
- package/dist/crypto/known-keys.d.ts.map +1 -0
- package/dist/crypto/known-keys.js +106 -0
- package/dist/crypto/known-keys.js.map +1 -0
- package/dist/crypto/project-keys.d.ts +26 -0
- package/dist/crypto/project-keys.d.ts.map +1 -0
- package/dist/crypto/project-keys.js +69 -0
- package/dist/crypto/project-keys.js.map +1 -0
- package/dist/crypto/secure-buffer.d.ts +31 -0
- package/dist/crypto/secure-buffer.d.ts.map +1 -0
- package/dist/crypto/secure-buffer.js +61 -0
- package/dist/crypto/secure-buffer.js.map +1 -0
- package/dist/crypto/ssh-agent.d.ts +16 -0
- package/dist/crypto/ssh-agent.d.ts.map +1 -0
- package/dist/crypto/ssh-agent.js +225 -0
- package/dist/crypto/ssh-agent.js.map +1 -0
- package/dist/crypto/ssh-keys.d.ts +19 -0
- package/dist/crypto/ssh-keys.d.ts.map +1 -0
- package/dist/crypto/ssh-keys.js +121 -0
- package/dist/crypto/ssh-keys.js.map +1 -0
- package/dist/crypto/tiers/enhanced.d.ts +25 -0
- package/dist/crypto/tiers/enhanced.d.ts.map +1 -0
- package/dist/crypto/tiers/enhanced.js +56 -0
- package/dist/crypto/tiers/enhanced.js.map +1 -0
- package/dist/crypto/tiers/maximum.d.ts +19 -0
- package/dist/crypto/tiers/maximum.d.ts.map +1 -0
- package/dist/crypto/tiers/maximum.js +25 -0
- package/dist/crypto/tiers/maximum.js.map +1 -0
- package/dist/crypto/tiers/standard.d.ts +27 -0
- package/dist/crypto/tiers/standard.d.ts.map +1 -0
- package/dist/crypto/tiers/standard.js +147 -0
- package/dist/crypto/tiers/standard.js.map +1 -0
- package/dist/crypto/types.d.ts +169 -0
- package/dist/crypto/types.d.ts.map +1 -0
- package/dist/crypto/types.js +11 -0
- package/dist/crypto/types.js.map +1 -0
- package/dist/pipeline/chunker.d.ts +27 -0
- package/dist/pipeline/chunker.d.ts.map +1 -0
- package/dist/pipeline/chunker.js +96 -0
- package/dist/pipeline/chunker.js.map +1 -0
- package/dist/pipeline/content-pipeline.d.ts +24 -0
- package/dist/pipeline/content-pipeline.d.ts.map +1 -0
- package/dist/pipeline/content-pipeline.js +49 -0
- package/dist/pipeline/content-pipeline.js.map +1 -0
- package/dist/pipeline/embedder.d.ts +49 -0
- package/dist/pipeline/embedder.d.ts.map +1 -0
- package/dist/pipeline/embedder.js +195 -0
- package/dist/pipeline/embedder.js.map +1 -0
- package/dist/pipeline/extract.d.ts +17 -0
- package/dist/pipeline/extract.d.ts.map +1 -0
- package/dist/pipeline/extract.js +70 -0
- package/dist/pipeline/extract.js.map +1 -0
- package/dist/pipeline/fetch.d.ts +26 -0
- package/dist/pipeline/fetch.d.ts.map +1 -0
- package/dist/pipeline/fetch.js +91 -0
- package/dist/pipeline/fetch.js.map +1 -0
- package/dist/pipeline/index.d.ts +10 -0
- package/dist/pipeline/index.d.ts.map +1 -0
- package/dist/pipeline/index.js +10 -0
- package/dist/pipeline/index.js.map +1 -0
- package/dist/pipeline/model-manager.d.ts +57 -0
- package/dist/pipeline/model-manager.d.ts.map +1 -0
- package/dist/pipeline/model-manager.js +234 -0
- package/dist/pipeline/model-manager.js.map +1 -0
- package/dist/pipeline/search.d.ts +37 -0
- package/dist/pipeline/search.d.ts.map +1 -0
- package/dist/pipeline/search.js +65 -0
- package/dist/pipeline/search.js.map +1 -0
- package/dist/pipeline/tokenizer.d.ts +29 -0
- package/dist/pipeline/tokenizer.d.ts.map +1 -0
- package/dist/pipeline/tokenizer.js +54 -0
- package/dist/pipeline/tokenizer.js.map +1 -0
- package/dist/pipeline/types.d.ts +86 -0
- package/dist/pipeline/types.d.ts.map +1 -0
- package/dist/pipeline/types.js +2 -0
- package/dist/pipeline/types.js.map +1 -0
- package/dist/pipeline/wordpiece-tokenizer.d.ts +60 -0
- package/dist/pipeline/wordpiece-tokenizer.d.ts.map +1 -0
- package/dist/pipeline/wordpiece-tokenizer.js +251 -0
- package/dist/pipeline/wordpiece-tokenizer.js.map +1 -0
- package/dist/storage/chunk-repo.d.ts +29 -0
- package/dist/storage/chunk-repo.d.ts.map +1 -0
- package/dist/storage/chunk-repo.js +115 -0
- package/dist/storage/chunk-repo.js.map +1 -0
- package/dist/storage/database-manager.d.ts +17 -0
- package/dist/storage/database-manager.d.ts.map +1 -0
- package/dist/storage/database-manager.js +100 -0
- package/dist/storage/database-manager.js.map +1 -0
- package/dist/storage/database.d.ts +10 -0
- package/dist/storage/database.d.ts.map +1 -0
- package/dist/storage/database.js +34 -0
- package/dist/storage/database.js.map +1 -0
- package/dist/storage/embedding-index.d.ts +22 -0
- package/dist/storage/embedding-index.d.ts.map +1 -0
- package/dist/storage/embedding-index.js +78 -0
- package/dist/storage/embedding-index.js.map +1 -0
- package/dist/storage/index.d.ts +10 -0
- package/dist/storage/index.d.ts.map +1 -0
- package/dist/storage/index.js +10 -0
- package/dist/storage/index.js.map +1 -0
- package/dist/storage/kb-database.d.ts +11 -0
- package/dist/storage/kb-database.d.ts.map +1 -0
- package/dist/storage/kb-database.js +24 -0
- package/dist/storage/kb-database.js.map +1 -0
- package/dist/storage/schema.d.ts +6 -0
- package/dist/storage/schema.d.ts.map +1 -0
- package/dist/storage/schema.js +122 -0
- package/dist/storage/schema.js.map +1 -0
- package/dist/storage/source-repo.d.ts +20 -0
- package/dist/storage/source-repo.d.ts.map +1 -0
- package/dist/storage/source-repo.js +120 -0
- package/dist/storage/source-repo.js.map +1 -0
- package/dist/storage/sync-status-repo.d.ts +15 -0
- package/dist/storage/sync-status-repo.d.ts.map +1 -0
- package/dist/storage/sync-status-repo.js +40 -0
- package/dist/storage/sync-status-repo.js.map +1 -0
- package/dist/storage/types.d.ts +139 -0
- package/dist/storage/types.d.ts.map +1 -0
- package/dist/storage/types.js +9 -0
- package/dist/storage/types.js.map +1 -0
- package/dist/sync/canary.d.ts +14 -0
- package/dist/sync/canary.d.ts.map +1 -0
- package/dist/sync/canary.js +53 -0
- package/dist/sync/canary.js.map +1 -0
- package/dist/sync/full-sync.d.ts +16 -0
- package/dist/sync/full-sync.d.ts.map +1 -0
- package/dist/sync/full-sync.js +91 -0
- package/dist/sync/full-sync.js.map +1 -0
- package/dist/sync/http-client.d.ts +28 -0
- package/dist/sync/http-client.d.ts.map +1 -0
- package/dist/sync/http-client.js +90 -0
- package/dist/sync/http-client.js.map +1 -0
- package/dist/sync/incremental-sync.d.ts +17 -0
- package/dist/sync/incremental-sync.d.ts.map +1 -0
- package/dist/sync/incremental-sync.js +155 -0
- package/dist/sync/incremental-sync.js.map +1 -0
- package/dist/sync/index.d.ts +12 -0
- package/dist/sync/index.d.ts.map +1 -0
- package/dist/sync/index.js +12 -0
- package/dist/sync/index.js.map +1 -0
- package/dist/sync/quota.d.ts +17 -0
- package/dist/sync/quota.d.ts.map +1 -0
- package/dist/sync/quota.js +48 -0
- package/dist/sync/quota.js.map +1 -0
- package/dist/sync/sequence.d.ts +21 -0
- package/dist/sync/sequence.d.ts.map +1 -0
- package/dist/sync/sequence.js +49 -0
- package/dist/sync/sequence.js.map +1 -0
- package/dist/sync/ssh-signer.d.ts +59 -0
- package/dist/sync/ssh-signer.d.ts.map +1 -0
- package/dist/sync/ssh-signer.js +241 -0
- package/dist/sync/ssh-signer.js.map +1 -0
- package/dist/sync/sync-service.d.ts +48 -0
- package/dist/sync/sync-service.d.ts.map +1 -0
- package/dist/sync/sync-service.js +116 -0
- package/dist/sync/sync-service.js.map +1 -0
- package/dist/sync/types.d.ts +106 -0
- package/dist/sync/types.d.ts.map +1 -0
- package/dist/sync/types.js +2 -0
- package/dist/sync/types.js.map +1 -0
- package/dist/sync/upload-queue.d.ts +40 -0
- package/dist/sync/upload-queue.d.ts.map +1 -0
- package/dist/sync/upload-queue.js +148 -0
- package/dist/sync/upload-queue.js.map +1 -0
- package/dist/sync/verification.d.ts +17 -0
- package/dist/sync/verification.d.ts.map +1 -0
- package/dist/sync/verification.js +25 -0
- package/dist/sync/verification.js.map +1 -0
- package/dist/vitest.config.d.ts +3 -0
- package/dist/vitest.config.d.ts.map +1 -0
- package/dist/vitest.config.js +16 -0
- package/dist/vitest.config.js.map +1 -0
- package/package.json +68 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../crypto/types.ts"],"names":[],"mappings":"AAqBA,uCAAuC;AACvC,MAAM,CAAN,IAAY,YAOX;AAPD,WAAY,YAAY;IACtB,+EAA+E;IAC/E,qCAAqB,CAAA;IACrB,yFAAyF;IACzF,qCAAqB,CAAA;IACrB,kDAAkD;IAClD,mCAAmB,CAAA;AACrB,CAAC,EAPW,YAAY,KAAZ,YAAY,QAOvB"}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
 * Text chunking for the content pipeline.
 *
 * Splits extracted text into overlapping chunks of approximately
 * `maxTokens` tokens, breaking on sentence boundaries where possible.
 */
import type { Chunk } from './types.js';
/** Configuration for the chunker. */
export interface ChunkConfig {
    /** Maximum tokens per chunk (default: 500). */
    maxTokens: number;
    /** Token overlap between consecutive chunks (default: 50). */
    overlapTokens: number;
}
/**
 * Split text into overlapping chunks of approximately `maxTokens` tokens.
 *
 * Splitting is performed on sentence boundaries where possible. If a
 * single sentence exceeds `maxTokens`, it is included as-is in its own
 * chunk (no mid-sentence splitting).
 *
 * @param text - The text to split into chunks.
 * @param config - Optional chunking configuration; omitted fields fall
 *   back to the defaults documented on {@link ChunkConfig}.
 * @returns Array of chunks with content, index, token count, and byte offset.
 *   Empty or whitespace-only input yields an empty array.
 */
export declare function chunkText(text: string, config?: Partial<ChunkConfig>): Chunk[];
//# sourceMappingURL=chunker.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunker.d.ts","sourceRoot":"","sources":["../../pipeline/chunker.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AAExC,qCAAqC;AACrC,MAAM,WAAW,WAAW;IAC1B,+CAA+C;IAC/C,SAAS,EAAE,MAAM,CAAC;IAClB,8DAA8D;IAC9D,aAAa,EAAE,MAAM,CAAC;CACvB;AAaD;;;;;;;;;;GAUG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,OAAO,CAAC,WAAW,CAAC,GAAG,KAAK,EAAE,CAoE9E"}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Text chunking for the content pipeline.
|
|
3
|
+
*
|
|
4
|
+
* Splits extracted text into overlapping chunks of approximately
|
|
5
|
+
* `maxTokens` tokens, breaking on sentence boundaries where possible.
|
|
6
|
+
*/
|
|
7
|
+
import { countTokens } from './tokenizer.js';
|
|
8
|
+
/**
 * Default chunking configuration: ~500-token chunks with a 50-token
 * overlap between consecutive chunks.
 */
const DEFAULT_CONFIG = {
    maxTokens: 500,
    overlapTokens: 50,
};
/**
 * Sentence-boundary regex. Matches a run of whitespace that immediately
 * follows `.`, `!`, or `?`; the lookbehind keeps the punctuation attached
 * to the preceding sentence when the text is split on this pattern.
 */
const SENTENCE_BOUNDARY_RE = /(?<=[.!?])\s+/;
|
|
17
|
+
/**
 * Split text into overlapping chunks of approximately `maxTokens` tokens.
 *
 * Splitting is performed on sentence boundaries where possible. If a
 * single sentence exceeds `maxTokens`, it is included as-is in its own
 * chunk (no mid-sentence splitting).
 *
 * @param text - The text to split into chunks.
 * @param config - Optional chunking configuration.
 * @returns Array of chunks with content, index, token count, and byte offset.
 */
export function chunkText(text, config) {
    const maxTokens = config?.maxTokens ?? DEFAULT_CONFIG.maxTokens;
    const overlapTokens = config?.overlapTokens ?? DEFAULT_CONFIG.overlapTokens;
    if (!text || text.trim().length === 0) {
        return [];
    }
    // Split into sentences
    const sentences = text.split(SENTENCE_BOUNDARY_RE).filter((s) => s.length > 0);
    if (sentences.length === 0) {
        return [];
    }
    // Pre-compute token counts for each sentence
    const sentenceTokens = sentences.map((s) => countTokens(s));
    // Pre-compute each sentence's byte offset by locating it in the original
    // text. This stays exact even when the separator consumed by the split
    // regex (`\s+`) is wider than one character (e.g. "\n\n" or multiple
    // spaces), which the old "+1 per separator" accounting got wrong.
    const sentenceByteOffsets = [];
    let searchFrom = 0;
    for (const sentence of sentences) {
        const found = text.indexOf(sentence, searchFrom);
        // Defensive: every sentence is a verbatim slice of `text`, so `found`
        // should never be -1; fall back to the search cursor if it ever is.
        const charIndex = found === -1 ? searchFrom : found;
        sentenceByteOffsets.push(Buffer.byteLength(text.slice(0, charIndex), 'utf-8'));
        searchFrom = charIndex + sentence.length;
    }
    const chunks = [];
    let sentenceIdx = 0;
    while (sentenceIdx < sentences.length) {
        // Build a chunk by accumulating sentences up to maxTokens
        const chunkSentences = [];
        let chunkTokenCount = 0;
        const startSentenceIdx = sentenceIdx;
        while (sentenceIdx < sentences.length) {
            const stc = sentenceTokens[sentenceIdx];
            // If adding this sentence would exceed max and we already have content,
            // stop (unless the chunk is empty — always include at least one sentence).
            if (chunkTokenCount + stc > maxTokens && chunkSentences.length > 0) {
                break;
            }
            chunkSentences.push(sentences[sentenceIdx]);
            chunkTokenCount += stc;
            sentenceIdx++;
        }
        const content = chunkSentences.join(' ');
        chunks.push({
            content,
            index: chunks.length,
            tokenCount: countTokens(content),
            // Byte offset of this chunk's first sentence in the original text.
            byteOffset: sentenceByteOffsets[startSentenceIdx],
        });
        // Apply overlap: back up by enough sentences to cover overlapTokens.
        // `backtrack` never drops to startSentenceIdx, so the loop always
        // advances by at least one sentence (no infinite loop).
        if (sentenceIdx < sentences.length) {
            let overlapCount = 0;
            let backtrack = sentenceIdx - 1;
            while (backtrack > startSentenceIdx && overlapCount < overlapTokens) {
                overlapCount += sentenceTokens[backtrack];
                backtrack--;
            }
            // sentenceIdx should start from backtrack + 1 (the first overlap sentence)
            sentenceIdx = backtrack + 1;
        }
    }
    return chunks;
}
|
|
83
|
+
/**
 * Compute the byte offset of the sentence at `targetIdx` within the
 * original text, assuming consecutive sentences were separated by a
 * single whitespace character.
 *
 * @param sentences - Sentence strings produced by splitting the text.
 * @param targetIdx - Index of the sentence whose offset is wanted.
 * @returns UTF-8 byte offset of that sentence.
 */
function computeByteOffset(sentences, targetIdx) {
    let total = 0;
    for (const sentence of sentences.slice(0, targetIdx)) {
        // Each preceding sentence contributes its UTF-8 byte length plus
        // the single separator character that the split consumed.
        total += Buffer.byteLength(sentence, 'utf-8') + 1;
    }
    return total;
}
|
|
96
|
+
//# sourceMappingURL=chunker.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunker.js","sourceRoot":"","sources":["../../pipeline/chunker.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAW7C,MAAM,cAAc,GAAgB;IAClC,SAAS,EAAE,GAAG;IACd,aAAa,EAAE,EAAE;CAClB,CAAC;AAEF;;;GAGG;AACH,MAAM,oBAAoB,GAAG,eAAe,CAAC;AAE7C;;;;;;;;;;GAUG;AACH,MAAM,UAAU,SAAS,CAAC,IAAY,EAAE,MAA6B;IACnE,MAAM,SAAS,GAAG,MAAM,EAAE,SAAS,IAAI,cAAc,CAAC,SAAS,CAAC;IAChE,MAAM,aAAa,GAAG,MAAM,EAAE,aAAa,IAAI,cAAc,CAAC,aAAa,CAAC;IAE5E,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtC,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,uBAAuB;IACvB,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,oBAAoB,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAE/E,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC3B,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,6CAA6C;IAC7C,MAAM,cAAc,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;IAE5D,MAAM,MAAM,GAAY,EAAE,CAAC;IAC3B,IAAI,WAAW,GAAG,CAAC,CAAC;IAEpB,OAAO,WAAW,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC;QACtC,0DAA0D;QAC1D,MAAM,cAAc,GAAa,EAAE,CAAC;QACpC,IAAI,eAAe,GAAG,CAAC,CAAC;QACxB,MAAM,gBAAgB,GAAG,WAAW,CAAC;QAErC,OAAO,WAAW,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC;YACtC,MAAM,GAAG,GAAG,cAAc,CAAC,WAAW,CAAC,CAAC;YAExC,wEAAwE;YACxE,2EAA2E;YAC3E,IAAI,eAAe,GAAG,GAAG,GAAG,SAAS,IAAI,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACnE,MAAM;YACR,CAAC;YAED,cAAc,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC,CAAC;YAC5C,eAAe,IAAI,GAAG,CAAC;YACvB,WAAW,EAAE,CAAC;QAChB,CAAC;QAED,MAAM,OAAO,GAAG,cAAc,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAEzC,oFAAoF;QACpF,yCAAyC;QACzC,MAAM,UAAU,GAAG,iBAAiB,CAAC,SAAS,EAAE,gBAAgB,CAAC,CAAC;QAElE,MAAM,CAAC,IAAI,CAAC;YACV,OAAO;YACP,KAAK,EAAE,MAAM,CAAC,MAAM;YACpB,UAAU,EAAE,WAAW,CAAC,OAAO,CAAC;YAChC,UAAU;SACX,CAAC,CAAC;QAEH,oEAAoE;QACpE,IAAI,WAAW,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC;YACnC,IAAI,YAAY,GAAG,CAAC,CAAC;YACrB,IAAI,SAAS,GAAG,WAAW,GAAG,CAAC,CAAC;YAChC,OAAO,SAAS,GAAG,gBAAgB,IAAI,YAAY,GAAG,aAAa,EAAE,CAAC;gBACpE,YAAY,IAAI,cAAc,CAAC,SAAS,CAAC,CAAC;gBAC1C,SAAS,EAAE,CAAC;YACd,CAAC;YACD,2EAA2E;YAC3E,WAAW,
GAAG,SAAS,GAAG,CAAC,CAAC;QAC9B,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;GAGG;AACH,SAAS,iBAAiB,CAAC,SAAmB,EAAE,SAAiB;IAC/D,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;QACnC,MAAM,IAAI,MAAM,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;QACnD,yDAAyD;QACzD,MAAM,IAAI,CAAC,CAAC,CAAC,8BAA8B;IAC7C,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import type { Embedder } from './embedder.js';
import type { Chunk, EmbeddedChunk, EmbeddingVector, ExtractedContent, IContentPipeline, PipelineConfig } from './types.js';
/**
 * Concrete implementation of IContentPipeline.
 *
 * Orchestrates fetching, extraction, chunking, embedding, and search
 * by delegating to the standalone pipeline functions and an injected Embedder.
 */
export declare class ContentPipeline implements IContentPipeline {
    /** Pipeline configuration; omitted fields fall back to defaults. */
    private readonly config;
    /** Injected embedding backend used by embed() and embedChunks(). */
    private readonly embedder;
    constructor(config: Partial<PipelineConfig>, embedder: Embedder);
    /** Fetch a URL and extract its main article content. */
    fetchAndExtract(url: string): Promise<ExtractedContent>;
    /** Split text into overlapping chunks. */
    chunk(text: string): Chunk[];
    /** Embed a single text string. */
    embed(text: string): Promise<EmbeddingVector>;
    /** Embed multiple chunks and zip results into EmbeddedChunk[]. */
    embedChunks(chunks: Chunk[]): Promise<EmbeddedChunk[]>;
    /** Search embeddings for the top-K most similar, returning their indices. */
    search(query: EmbeddingVector, embeddings: EmbeddingVector[], topK: number): number[];
}
//# sourceMappingURL=content-pipeline.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"content-pipeline.d.ts","sourceRoot":"","sources":["../../pipeline/content-pipeline.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AAI9C,OAAO,KAAK,EACV,KAAK,EACL,aAAa,EACb,eAAe,EACf,gBAAgB,EAChB,gBAAgB,EAChB,cAAc,EACf,MAAM,YAAY,CAAC;AAEpB;;;;;GAKG;AACH,qBAAa,eAAgB,YAAW,gBAAgB;IACtD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAA0B;IACjD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAW;gBAExB,MAAM,EAAE,OAAO,CAAC,cAAc,CAAC,EAAE,QAAQ,EAAE,QAAQ;IAK/D,wDAAwD;IAClD,eAAe,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC;IAK7D,0CAA0C;IAC1C,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,KAAK,EAAE;IAO5B,kCAAkC;IAC5B,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC;IAInD,kEAAkE;IAC5D,WAAW,CAAC,MAAM,EAAE,KAAK,EAAE,GAAG,OAAO,CAAC,aAAa,EAAE,CAAC;IAS5D,6EAA6E;IAC7E,MAAM,CAAC,KAAK,EAAE,eAAe,EAAE,UAAU,EAAE,eAAe,EAAE,EAAE,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE;CAItF"}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import { chunkText } from './chunker.js';
|
|
2
|
+
import { extractContent } from './extract.js';
|
|
3
|
+
import { fetchUrl } from './fetch.js';
|
|
4
|
+
import { searchEmbeddings } from './search.js';
|
|
5
|
+
/**
 * Concrete implementation of IContentPipeline.
 *
 * Orchestrates fetching, extraction, chunking, embedding, and search
 * by delegating to the standalone pipeline functions and an injected Embedder.
 */
export class ContentPipeline {
    config;
    embedder;
    constructor(config, embedder) {
        this.config = config;
        this.embedder = embedder;
    }
    /** Fetch a URL and extract its main article content. */
    async fetchAndExtract(url) {
        const { html, finalUrl } = await fetchUrl(url, this.config);
        return extractContent(html, finalUrl);
    }
    /** Split text into overlapping chunks. */
    chunk(text) {
        const chunkOptions = {
            maxTokens: this.config.maxChunkTokens ?? 500,
            overlapTokens: this.config.overlapTokens ?? 50,
        };
        return chunkText(text, chunkOptions);
    }
    /** Embed a single text string. */
    async embed(text) {
        return this.embedder.embed(text);
    }
    /** Embed multiple chunks and zip results into EmbeddedChunk[]. */
    async embedChunks(chunks) {
        const texts = chunks.map((chunk) => chunk.content);
        const vectors = await this.embedder.embedBatch(texts);
        return chunks.map((chunk, position) => ({
            ...chunk,
            embedding: vectors[position],
            model: 'snowflake-arctic-embed-s@384',
        }));
    }
    /** Search embeddings for the top-K most similar, returning their indices. */
    search(query, embeddings, topK) {
        return searchEmbeddings(query, embeddings, topK).map((hit) => hit.index);
    }
}
|
|
49
|
+
//# sourceMappingURL=content-pipeline.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"content-pipeline.js","sourceRoot":"","sources":["../../pipeline/content-pipeline.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAEzC,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AACtC,OAAO,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAU/C;;;;;GAKG;AACH,MAAM,OAAO,eAAe;IACT,MAAM,CAA0B;IAChC,QAAQ,CAAW;IAEpC,YAAY,MAA+B,EAAE,QAAkB;QAC7D,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;QACrB,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;IAC3B,CAAC;IAED,wDAAwD;IACxD,KAAK,CAAC,eAAe,CAAC,GAAW;QAC/B,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,GAAG,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;QAChD,OAAO,cAAc,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,QAAQ,CAAC,CAAC;IACtD,CAAC;IAED,0CAA0C;IAC1C,KAAK,CAAC,IAAY;QAChB,OAAO,SAAS,CAAC,IAAI,EAAE;YACrB,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,cAAc,IAAI,GAAG;YAC5C,aAAa,EAAE,IAAI,CAAC,MAAM,CAAC,aAAa,IAAI,EAAE;SAC/C,CAAC,CAAC;IACL,CAAC;IAED,kCAAkC;IAClC,KAAK,CAAC,KAAK,CAAC,IAAY;QACtB,OAAO,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACnC,CAAC;IAED,kEAAkE;IAClE,KAAK,CAAC,WAAW,CAAC,MAAe;QAC/B,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC;QAC7E,OAAO,MAAM,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;YAC/B,GAAG,KAAK;YACR,SAAS,EAAE,OAAO,CAAC,CAAC,CAAC;YACrB,KAAK,EAAE,8BAA8B;SACtC,CAAC,CAAC,CAAC;IACN,CAAC;IAED,6EAA6E;IAC7E,MAAM,CAAC,KAAsB,EAAE,UAA6B,EAAE,IAAY;QACxE,MAAM,OAAO,GAAG,gBAAgB,CAAC,KAAK,EAAE,UAAU,EAAE,IAAI,CAAC,CAAC;QAC1D,OAAO,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;IACrC,CAAC;CACF"}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ONNX Runtime embedding using snowflake-arctic-embed-s.
|
|
3
|
+
*
|
|
4
|
+
* Provides a high-level Embedder class that loads an ONNX model and
|
|
5
|
+
* produces 384-dimensional embedding vectors from text input.
|
|
6
|
+
*
|
|
7
|
+
* Uses a real BERT WordPiece tokenizer with the model's vocabulary
|
|
8
|
+
* for proper subword tokenization and meaningful embeddings.
|
|
9
|
+
*/
|
|
10
|
+
import type { EmbeddingVector } from './types.js';
|
|
11
|
+
/**
 * Embedder wraps an ONNX inference session for producing text embeddings.
 * The model and its WordPiece vocabulary are loaded lazily on first use.
 */
export declare class Embedder {
    private session;
    private vocab;
    readonly modelPath: string;
    private readonly vocabPath;
    /**
     * @param modelPath - Absolute path to the ONNX model file.
     * @param vocabPath - Absolute path to the vocab.txt file. When omitted,
     *   a file named vocab.txt next to the model file is used.
     */
    constructor(modelPath: string, vocabPath?: string);
    /**
     * Load the ONNX model into memory. Called lazily on first embed() call,
     * but can be called explicitly to pre-warm.
     */
    initialize(): Promise<void>;
    /**
     * Embed a single text string into a 384-dimensional vector.
     *
     * @param text - The text to embed.
     * @param prefix - Optional prefix (e.g., "query: " for search queries),
     *   prepended verbatim to the text before embedding.
     * @returns A Float32Array of 384 dimensions (L2-normalized).
     */
    embed(text: string, prefix?: string): Promise<EmbeddingVector>;
    /**
     * Embed multiple texts in a single batch.
     *
     * @param texts - Array of text strings to embed.
     * @returns Array of Float32Array embeddings, one per input text.
     */
    embedBatch(texts: string[]): Promise<EmbeddingVector[]>;
    /**
     * Release the ONNX session and free memory. Subsequent embed() calls
     * will lazily re-initialize the session.
     */
    dispose(): void;
}
|
|
49
|
+
//# sourceMappingURL=embedder.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"embedder.d.ts","sourceRoot":"","sources":["../../pipeline/embedder.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAGH,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAMlD;;GAEG;AACH,qBAAa,QAAQ;IACnB,OAAO,CAAC,OAAO,CAAqC;IACpD,OAAO,CAAC,KAAK,CAA2B;IACxC,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IAEnC;;;OAGG;gBACS,SAAS,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM;IAMjD;;;OAGG;IACG,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAqBjC;;;;;;OAMG;IACG,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC;IAKpE;;;;;OAKG;IACG,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;IAoH7D;;OAEG;IACH,OAAO,IAAI,IAAI;CAQhB"}
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ONNX Runtime embedding using snowflake-arctic-embed-s.
|
|
3
|
+
*
|
|
4
|
+
* Provides a high-level Embedder class that loads an ONNX model and
|
|
5
|
+
* produces 384-dimensional embedding vectors from text input.
|
|
6
|
+
*
|
|
7
|
+
* Uses a real BERT WordPiece tokenizer with the model's vocabulary
|
|
8
|
+
* for proper subword tokenization and meaningful embeddings.
|
|
9
|
+
*/
|
|
10
|
+
import * as ort from 'onnxruntime-node';
|
|
11
|
+
import { loadVocabulary, tokenize } from './wordpiece-tokenizer.js';
|
|
12
|
+
/** Maximum sequence length for the model. */
|
|
13
|
+
const MAX_SEQ_LENGTH = 512;
|
|
14
|
+
/**
 * Embedder wraps an ONNX inference session for producing text embeddings.
 * The model and its WordPiece vocabulary are loaded lazily on first use.
 */
export class Embedder {
    session = null;
    vocab = null;
    modelPath;
    vocabPath;
    /**
     * @param modelPath - Absolute path to the ONNX model file.
     * @param vocabPath - Absolute path to the vocab.txt file. Defaults to a
     *   file named vocab.txt in the same directory as the model.
     */
    constructor(modelPath, vocabPath) {
        this.modelPath = modelPath;
        // Default: vocab.txt in the same directory as the model
        this.vocabPath = vocabPath ?? modelPath.replace(/[^/\\]+$/, 'vocab.txt');
    }
    /**
     * Load the ONNX model and vocabulary into memory. Called lazily on first
     * embed() call, but can be called explicitly to pre-warm.
     *
     * The vocabulary is loaded BEFORE `this.session` is assigned: assigning
     * the session first meant a vocab-load failure left a half-initialized
     * instance (truthy session, null vocab), so later embedBatch() calls
     * skipped initialization and crashed inside tokenize().
     */
    async initialize() {
        if (this.session) {
            return;
        }
        // Load vocabulary first: if this throws, no instance state has been
        // mutated and a later initialize() retry starts from a clean slate.
        const vocab = loadVocabulary(this.vocabPath);
        let session;
        try {
            session = await ort.InferenceSession.create(this.modelPath, {
                executionProviders: ['cpu'],
            });
        }
        catch (error) {
            const msg = error instanceof Error ? error.message : String(error);
            throw new Error(`Failed to load ONNX model at ${this.modelPath}: ${msg}. ` +
                'Ensure the model file exists and is a valid ONNX model.');
        }
        // Commit both only after both loads succeeded.
        this.vocab = vocab;
        this.session = session;
    }
    /**
     * Embed a single text string into a 384-dimensional vector.
     *
     * @param text - The text to embed.
     * @param prefix - Optional prefix (e.g., "query: " for search queries).
     * @returns A Float32Array of 384 dimensions.
     */
    async embed(text, prefix) {
        const results = await this.embedBatch([prefix ? `${prefix}${text}` : text]);
        return results[0];
    }
    /**
     * Embed multiple texts in a single batch.
     *
     * @param texts - Array of text strings to embed.
     * @returns Array of Float32Array embeddings, one per input text.
     */
    async embedBatch(texts) {
        // An empty batch would otherwise produce zero-length tensors for the
        // runtime; return early instead.
        if (texts.length === 0) {
            return [];
        }
        if (!this.session) {
            await this.initialize();
        }
        const session = this.session;
        const vocab = this.vocab;
        const batchSize = texts.length;
        // Tokenize each text into input IDs using real WordPiece tokenizer
        const allInputIds = [];
        const allAttentionMask = [];
        let maxLen = 0;
        for (const text of texts) {
            const ids = tokenize(text, vocab, MAX_SEQ_LENGTH);
            allInputIds.push(ids);
            allAttentionMask.push(ids.map(() => 1n));
            maxLen = Math.max(maxLen, ids.length);
        }
        // Pad to uniform length
        for (let i = 0; i < batchSize; i++) {
            while (allInputIds[i].length < maxLen) {
                allInputIds[i].push(0n); // PAD token
                allAttentionMask[i].push(0n);
            }
        }
        // Flatten into typed arrays (int64 inputs, as BigInt64Array)
        const inputIdsFlat = new BigInt64Array(batchSize * maxLen);
        const attentionMaskFlat = new BigInt64Array(batchSize * maxLen);
        for (let i = 0; i < batchSize; i++) {
            for (let j = 0; j < maxLen; j++) {
                inputIdsFlat[i * maxLen + j] = allInputIds[i][j];
                attentionMaskFlat[i * maxLen + j] = allAttentionMask[i][j];
            }
        }
        // Create ONNX tensors
        const feeds = {
            input_ids: new ort.Tensor('int64', inputIdsFlat, [batchSize, maxLen]),
            attention_mask: new ort.Tensor('int64', attentionMaskFlat, [batchSize, maxLen]),
        };
        // Some models expect token_type_ids
        const inputNames = session.inputNames;
        if (inputNames.includes('token_type_ids')) {
            const tokenTypeIds = new BigInt64Array(batchSize * maxLen); // all zeros
            feeds['token_type_ids'] = new ort.Tensor('int64', tokenTypeIds, [batchSize, maxLen]);
        }
        // Run inference
        const results = await session.run(feeds);
        // Extract embeddings from output.
        // The model may output under various names; try common ones.
        const outputKey = results['sentence_embedding'] ? 'sentence_embedding' :
            results['last_hidden_state'] ? 'last_hidden_state' :
                Object.keys(results)[0];
        const outputTensor = results[outputKey];
        const outputData = outputTensor.data;
        const outputDims = outputTensor.dims;
        // If output is [batch, seq_len, dim], mean-pool over seq_len.
        // If output is [batch, dim], use directly.
        const embeddings = [];
        if (outputDims.length === 3) {
            // [batch, seq_len, dim] — mean pooling with attention mask
            const seqLen = outputDims[1];
            const dim = outputDims[2];
            for (let b = 0; b < batchSize; b++) {
                const embedding = new Float32Array(dim);
                let tokenCount = 0;
                for (let s = 0; s < seqLen; s++) {
                    if (allAttentionMask[b][s] === 1n) {
                        tokenCount++;
                        for (let d = 0; d < dim; d++) {
                            embedding[d] += outputData[b * seqLen * dim + s * dim + d];
                        }
                    }
                }
                // Average over the non-padding tokens
                if (tokenCount > 0) {
                    for (let d = 0; d < dim; d++) {
                        embedding[d] /= tokenCount;
                    }
                }
                // L2 normalize
                embeddings.push(l2Normalize(embedding));
            }
        }
        else if (outputDims.length === 2) {
            // [batch, dim] — already pooled
            const dim = outputDims[1];
            for (let b = 0; b < batchSize; b++) {
                const embedding = new Float32Array(dim);
                for (let d = 0; d < dim; d++) {
                    embedding[d] = outputData[b * dim + d];
                }
                embeddings.push(l2Normalize(embedding));
            }
        }
        else {
            throw new Error(`Unexpected output tensor shape: [${outputDims.join(', ')}]`);
        }
        return embeddings;
    }
    /**
     * Release the ONNX session and free memory. Safe to call repeatedly;
     * subsequent embed() calls lazily re-initialize.
     */
    dispose() {
        if (this.session) {
            // InferenceSession doesn't have a sync dispose in all versions;
            // release is best-effort.
            this.session.release?.();
            this.session = null;
            // Drop the vocabulary too so a disposed instance is fully reset.
            this.vocab = null;
        }
    }
}
|
|
179
|
+
/**
 * L2-normalize a vector in place and return the same array.
 * A zero vector (norm 0) is returned unchanged to avoid division by zero.
 */
function l2Normalize(vec) {
    // Accumulate the squared magnitude.
    let squaredSum = 0;
    for (const value of vec) {
        squaredSum += value * value;
    }
    const magnitude = Math.sqrt(squaredSum);
    if (magnitude > 0) {
        for (let idx = 0; idx < vec.length; idx++) {
            vec[idx] /= magnitude;
        }
    }
    return vec;
}
|
|
195
|
+
//# sourceMappingURL=embedder.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"embedder.js","sourceRoot":"","sources":["../../pipeline/embedder.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,KAAK,GAAG,MAAM,kBAAkB,CAAC;AAExC,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAmB,MAAM,0BAA0B,CAAC;AAErF,6CAA6C;AAC7C,MAAM,cAAc,GAAG,GAAG,CAAC;AAE3B;;GAEG;AACH,MAAM,OAAO,QAAQ;IACX,OAAO,GAAgC,IAAI,CAAC;IAC5C,KAAK,GAAsB,IAAI,CAAC;IAC/B,SAAS,CAAS;IACV,SAAS,CAAS;IAEnC;;;OAGG;IACH,YAAY,SAAiB,EAAE,SAAkB;QAC/C,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;QAC3B,wDAAwD;QACxD,IAAI,CAAC,SAAS,GAAG,SAAS,IAAI,SAAS,CAAC,OAAO,CAAC,UAAU,EAAE,WAAW,CAAC,CAAC;IAC3E,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,UAAU;QACd,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,OAAO;QACT,CAAC;QAED,IAAI,CAAC;YACH,IAAI,CAAC,OAAO,GAAG,MAAM,GAAG,CAAC,gBAAgB,CAAC,MAAM,CAAC,IAAI,CAAC,SAAS,EAAE;gBAC/D,kBAAkB,EAAE,CAAC,KAAK,CAAC;aAC5B,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,KAAc,EAAE,CAAC;YACxB,MAAM,GAAG,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACnE,MAAM,IAAI,KAAK,CACb,gCAAgC,IAAI,CAAC,SAAS,KAAK,GAAG,IAAI;gBACxD,yDAAyD,CAC5D,CAAC;QACJ,CAAC;QAED,kBAAkB;QAClB,IAAI,CAAC,KAAK,GAAG,cAAc,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC9C,CAAC;IAED;;;;;;OAMG;IACH,KAAK,CAAC,KAAK,CAAC,IAAY,EAAE,MAAe;QACvC,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,MAAM,GAAG,IAAI,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;QAC5E,OAAO,OAAO,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC;IAED;;;;;OAKG;IACH,KAAK,CAAC,UAAU,CAAC,KAAe;QAC9B,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAClB,MAAM,IAAI,CAAC,UAAU,EAAE,CAAC;QAC1B,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,CAAC,OAAQ,CAAC;QAC9B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAM,CAAC;QAC1B,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,CAAC;QAE/B,mEAAmE;QACnE,MAAM,WAAW,GAAe,EAAE,CAAC;QACnC,MAAM,gBAAgB,GAAe,EAAE,CAAC;QACxC,IAAI,MAAM,GAAG,CAAC,CAAC;QAEf,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,GAAG,GAAG,QAAQ,CAAC,IAAI,EAAE,KAAK,EAAE,cAAc,CAAC,CAAC;YAClD,WAAW,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACtB,gBAAgB,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;YACzC,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,EAAE,GAAG,CAA
C,MAAM,CAAC,CAAC;QACxC,CAAC;QAED,wBAAwB;QACxB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;YACnC,OAAO,WAAW,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,MAAM,EAAE,CAAC;gBACtC,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,YAAY;gBACrC,gBAAgB,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YAC/B,CAAC;QACH,CAAC;QAED,4BAA4B;QAC5B,MAAM,YAAY,GAAG,IAAI,aAAa,CAAC,SAAS,GAAG,MAAM,CAAC,CAAC;QAC3D,MAAM,iBAAiB,GAAG,IAAI,aAAa,CAAC,SAAS,GAAG,MAAM,CAAC,CAAC;QAEhE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;YACnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAChC,YAAY,CAAC,CAAC,GAAG,MAAM,GAAG,CAAC,CAAC,GAAG,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;gBACjD,iBAAiB,CAAC,CAAC,GAAG,MAAM,GAAG,CAAC,CAAC,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAC7D,CAAC;QACH,CAAC;QAED,sBAAsB;QACtB,MAAM,KAAK,GAA+B;YACxC,SAAS,EAAE,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,EAAE,YAAY,EAAE,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;YACrE,cAAc,EAAE,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,EAAE,iBAAiB,EAAE,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;SAChF,CAAC;QAEF,oCAAoC;QACpC,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;QACtC,IAAI,UAAU,CAAC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,CAAC;YAC1C,MAAM,YAAY,GAAG,IAAI,aAAa,CAAC,SAAS,GAAG,MAAM,CAAC,CAAC,CAAC,YAAY;YACxE,KAAK,CAAC,gBAAgB,CAAC,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,EAAE,YAAY,EAAE,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC,CAAC;QACvF,CAAC;QAED,gBAAgB;QAChB,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QAEzC,iCAAiC;QACjC,4DAA4D;QAC5D,MAAM,SAAS,GACb,OAAO,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC;YACtD,OAAO,CAAC,mBAAmB,CAAC,CAAC,CAAC,CAAC,mBAAmB,CAAC,CAAC;gBACpD,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;QAE1B,MAAM,YAAY,GAAG,OAAO,CAAC,SAAS,CAAC,CAAC;QACxC,MAAM,UAAU,GAAG,YAAY,CAAC,IAAoB,CAAC;QACrD,MAAM,UAAU,GAAG,YAAY,CAAC,IAAyB,CAAC;QAE1D,6DAA6D;QAC7D,0CAA0C;QAC1C,MAAM,UAAU,GAAsB,EAAE,CAAC;QAEzC,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC5B,2DAA2D;YAC3D,MAAM,MAAM,GAAG,UAAU,CAAC,CAAC,CAAW,CAAC;YACvC,MAAM,GAAG,GAAG,UAAU,CAAC,CAAC,CAAW,CAAC;YAEpC,KAAK,IAAI,CAA
C,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;gBACnC,MAAM,SAAS,GAAG,IAAI,YAAY,CAAC,GAAG,CAAC,CAAC;gBACxC,IAAI,UAAU,GAAG,CAAC,CAAC;gBAEnB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;oBAChC,IAAI,gBAAgB,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC;wBAClC,UAAU,EAAE,CAAC;wBACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;4BAC7B,SAAS,CAAC,CAAC,CAAC,IAAI,UAAU,CAAC,CAAC,GAAG,MAAM,GAAG,GAAG,GAAG,CAAC,GAAG,GAAG,GAAG,CAAC,CAAC,CAAC;wBAC7D,CAAC;oBACH,CAAC;gBACH,CAAC;gBAED,UAAU;gBACV,IAAI,UAAU,GAAG,CAAC,EAAE,CAAC;oBACnB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;wBAC7B,SAAS,CAAC,CAAC,CAAC,IAAI,UAAU,CAAC;oBAC7B,CAAC;gBACH,CAAC;gBAED,eAAe;gBACf,UAAU,CAAC,IAAI,CAAC,WAAW,CAAC,SAAS,CAAC,CAAC,CAAC;YAC1C,CAAC;QACH,CAAC;aAAM,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACnC,gCAAgC;YAChC,MAAM,GAAG,GAAG,UAAU,CAAC,CAAC,CAAW,CAAC;YACpC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;gBACnC,MAAM,SAAS,GAAG,IAAI,YAAY,CAAC,GAAG,CAAC,CAAC;gBACxC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;oBAC7B,SAAS,CAAC,CAAC,CAAC,GAAG,UAAU,CAAC,CAAC,GAAG,GAAG,GAAG,CAAC,CAAC,CAAC;gBACzC,CAAC;gBACD,UAAU,CAAC,IAAI,CAAC,WAAW,CAAC,SAAS,CAAC,CAAC,CAAC;YAC1C,CAAC;QACH,CAAC;aAAM,CAAC;YACN,MAAM,IAAI,KAAK,CAAC,oCAAoC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAChF,CAAC;QAED,OAAO,UAAU,CAAC;IACpB,CAAC;IAED;;OAEG;IACH,OAAO;QACL,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,gEAAgE;YAChE,0BAA0B;YAC1B,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC;YACzB,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;IACH,CAAC;CACF;AAED;;GAEG;AACH,SAAS,WAAW,CAAC,GAAiB;IACpC,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACpC,KAAK,IAAI,GAAG,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC;IAC3B,CAAC;IACD,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAC9B,IAAI,IAAI,GAAG,CAAC,EAAE,CAAC;QACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CA
AC;YACpC,GAAG,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC;QACjB,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content extraction from HTML using Mozilla Readability.
|
|
3
|
+
*
|
|
4
|
+
* Parses HTML with `linkedom` and runs it through Readability to pull out
|
|
5
|
+
* the main article content, stripped of navigation, ads, and boilerplate.
|
|
6
|
+
*/
|
|
7
|
+
import type { ExtractedContent } from './types.js';
|
|
8
|
+
/**
 * Extract the main article content from an HTML string.
 *
 * @param html - The raw HTML string to extract content from.
 * @param url - The source URL; used in error messages and echoed back on the
 *   result. It is not fetched and not used to resolve relative links.
 * @returns Extracted content with title, plain text, URL, and byte length.
 * @throws If no article content can be extracted.
 */
export declare function extractContent(html: string, url: string): ExtractedContent;
|
|
17
|
+
//# sourceMappingURL=extract.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extract.d.ts","sourceRoot":"","sources":["../../pipeline/extract.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AAEnD;;;;;;;GAOG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,gBAAgB,CAoD1E"}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content extraction from HTML using Mozilla Readability.
|
|
3
|
+
*
|
|
4
|
+
* Parses HTML with `linkedom` and runs it through Readability to pull out
|
|
5
|
+
* the main article content, stripped of navigation, ads, and boilerplate.
|
|
6
|
+
*/
|
|
7
|
+
import { Readability } from '@mozilla/readability';
|
|
8
|
+
import { parseHTML } from 'linkedom';
|
|
9
|
+
/**
 * Extract the main article content from an HTML string.
 *
 * @param html - The raw HTML string to extract content from.
 * @param url - The source URL; used in error messages and echoed back on the
 *   result (it is not fetched and not used to resolve relative links here).
 * @returns Extracted content with title, plain text, URL, and byte length.
 * @throws If no article content can be extracted.
 */
export function extractContent(html, url) {
    if (!html || html.trim().length === 0) {
        throw new Error(`Empty HTML content from ${url}`);
    }
    const { document } = parseHTML(html);
    // Attempt Readability extraction. Readability can throw on documents it
    // cannot handle; previously that throw escaped and the plain-text
    // fallback below was unreachable. Treat a throw the same as "no article".
    let article = null;
    try {
        article = new Readability(document).parse();
    }
    catch {
        article = null;
    }
    let title;
    let rawContent;
    if (article && article.textContent && article.textContent.trim().length > 0) {
        title = article.title || '';
        rawContent = article.textContent;
    }
    else {
        // Fallback: extract text from body (strip script/style first)
        // Wrap in a full HTML document to ensure linkedom creates a body element
        const wrappedHtml = html.includes('<body') ? html : `<html><body>${html}</body></html>`;
        const { document: fallbackDoc } = parseHTML(wrappedHtml);
        for (const el of fallbackDoc.querySelectorAll('script, style')) {
            el.remove();
        }
        const body = fallbackDoc.querySelector('body');
        rawContent = body ? body.textContent ?? '' : '';
        if (rawContent.trim().length === 0) {
            throw new Error(`No extractable content from ${url}`);
        }
        title = '';
    }
    // Fallback title: try <title> tag from a fresh parse (Readability mutates
    // the document it was given, so re-parse the original HTML).
    if (!title) {
        const { document: titleDoc } = parseHTML(html);
        const titleEl = titleDoc.querySelector('title');
        title = titleEl?.textContent?.trim() ?? '';
    }
    // Clean up the text: collapse whitespace runs, trim lines
    const content = cleanText(rawContent);
    if (content.length === 0) {
        throw new Error(`No extractable content from ${url}`);
    }
    const byteLength = Buffer.byteLength(content, 'utf-8');
    return { title, content, url, byteLength };
}
|
|
60
|
+
/**
 * Clean extracted text: collapse runs of horizontal whitespace to a single
 * space, limit blank-line runs to one blank line, strip leading/trailing
 * spaces on each line, and trim the whole string.
 */
function cleanText(text) {
    const singleSpaced = text.replace(/[\t ]+/g, ' ');
    const limitedBlankLines = singleSpaced.replace(/\n{3,}/g, '\n\n');
    const trimmedLines = limitedBlankLines.replace(/^ +| +$/gm, '');
    return trimmedLines.trim();
}
|
|
70
|
+
//# sourceMappingURL=extract.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extract.js","sourceRoot":"","sources":["../../pipeline/extract.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AACnD,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AAGrC;;;;;;;GAOG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY,EAAE,GAAW;IACtD,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtC,MAAM,IAAI,KAAK,CAAC,2BAA2B,GAAG,EAAE,CAAC,CAAC;IACpD,CAAC;IAED,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAErC,iCAAiC;IACjC,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,QAAe,CAAC,CAAC;IAChD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;IAE/B,IAAI,KAAa,CAAC;IAClB,IAAI,UAAkB,CAAC;IAEvB,IAAI,OAAO,IAAI,OAAO,CAAC,WAAW,IAAI,OAAO,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5E,KAAK,GAAG,OAAO,CAAC,KAAK,IAAI,EAAE,CAAC;QAC5B,UAAU,GAAG,OAAO,CAAC,WAAW,CAAC;IACnC,CAAC;SAAM,CAAC;QACN,8DAA8D;QAC9D,yEAAyE;QACzE,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,eAAe,IAAI,gBAAgB,CAAC;QACxF,MAAM,EAAE,QAAQ,EAAE,WAAW,EAAE,GAAG,SAAS,CAAC,WAAW,CAAC,CAAC;QACzD,KAAK,MAAM,EAAE,IAAI,WAAW,CAAC,gBAAgB,CAAC,eAAe,CAAC,EAAE,CAAC;YAC/D,EAAE,CAAC,MAAM,EAAE,CAAC;QACd,CAAC;QACD,MAAM,IAAI,GAAG,WAAW,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;QAC/C,UAAU,GAAG,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAEhD,IAAI,UAAU,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACnC,MAAM,IAAI,KAAK,CAAC,+BAA+B,GAAG,EAAE,CAAC,CAAC;QACxD,CAAC;QAED,KAAK,GAAG,EAAE,CAAC;IACb,CAAC;IAED,qDAAqD;IACrD,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAC/C,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;QAChD,KAAK,GAAG,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAC7C,CAAC;IAED,0DAA0D;IAC1D,MAAM,OAAO,GAAG,SAAS,CAAC,UAAU,CAAC,CAAC;IAEtC,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,MAAM,IAAI,KAAK,CAAC,+BAA+B,GAAG,EAAE,CAAC,CAAC;IACxD,CAAC;IAED,MAAM,UAAU,GAAG,MAAM,CAAC,UAAU,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;IAEvD,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,EAAE,UAAU,EAAE,CAAC;AAC7C,CAAC;AAED;;GAEG;AACH,SAAS,SAAS,CAAC,I
AAY;IAC7B,OAAO,IAAI;SACR,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC,CAAO,iCAAiC;SAC/D,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAI,8BAA8B;SAC5D,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,CAAM,iBAAiB;SAC/C,IAAI,EAAE,CAAC;AACZ,CAAC"}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* URL fetching for the content pipeline.
|
|
3
|
+
*
|
|
4
|
+
* Uses Node.js built-in `fetch` (available since Node 18) with
|
|
5
|
+
* configurable timeout, redirect limits, and user-agent.
|
|
6
|
+
*/
|
|
7
|
+
import type { PipelineConfig } from './types.js';
|
|
8
|
+
/** Result of a successful URL fetch. */
export interface FetchResult {
    /** Raw HTML response body. */
    html: string;
    /** Final URL after following any redirects. */
    finalUrl: string;
    /** Value of the response Content-Type header. */
    contentType: string;
}
|
|
17
|
+
/**
 * Fetch the HTML content of a URL.
 *
 * @param url - The URL to fetch.
 * @param config - Optional partial pipeline config overrides (per the module
 *   header: timeout, redirect limits, and user-agent are configurable).
 * @returns The HTML content, final URL, and content type.
 * @throws On network errors, non-2xx status codes, or non-HTML content.
 */
export declare function fetchUrl(url: string, config?: Partial<PipelineConfig>): Promise<FetchResult>;
|
|
26
|
+
//# sourceMappingURL=fetch.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fetch.d.ts","sourceRoot":"","sources":["../../pipeline/fetch.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AASjD,wCAAwC;AACxC,MAAM,WAAW,WAAW;IAC1B,qBAAqB;IACrB,IAAI,EAAE,MAAM,CAAC;IACb,qCAAqC;IACrC,QAAQ,EAAE,MAAM,CAAC;IACjB,iCAAiC;IACjC,WAAW,EAAE,MAAM,CAAC;CACrB;AAED;;;;;;;GAOG;AACH,wBAAsB,QAAQ,CAC5B,GAAG,EAAE,MAAM,EACX,MAAM,CAAC,EAAE,OAAO,CAAC,cAAc,CAAC,GAC/B,OAAO,CAAC,WAAW,CAAC,CA6EtB"}
|