@sprinterai/runtime 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/adapters/a2a-adapter.d.ts +37 -9
- package/dist/adapters/a2a-adapter.d.ts.map +1 -1
- package/dist/adapters/a2a-adapter.js +132 -9
- package/dist/adapters/a2a-adapter.js.map +1 -1
- package/dist/adapters/a2a-adapter.test.d.ts +2 -0
- package/dist/adapters/a2a-adapter.test.d.ts.map +1 -0
- package/dist/adapters/a2a-adapter.test.js +295 -0
- package/dist/adapters/a2a-adapter.test.js.map +1 -0
- package/dist/adapters/http-agent-adapter.d.ts +19 -9
- package/dist/adapters/http-agent-adapter.d.ts.map +1 -1
- package/dist/adapters/http-agent-adapter.js +55 -9
- package/dist/adapters/http-agent-adapter.js.map +1 -1
- package/dist/adapters/http-agent-adapter.test.d.ts +2 -0
- package/dist/adapters/http-agent-adapter.test.d.ts.map +1 -0
- package/dist/adapters/http-agent-adapter.test.js +164 -0
- package/dist/adapters/http-agent-adapter.test.js.map +1 -0
- package/dist/adapters/index.d.ts +8 -8
- package/dist/adapters/index.d.ts.map +1 -1
- package/dist/adapters/index.js +4 -4
- package/dist/adapters/index.js.map +1 -1
- package/dist/adapters/mcp-adapter.d.ts +29 -9
- package/dist/adapters/mcp-adapter.d.ts.map +1 -1
- package/dist/adapters/mcp-adapter.js +43 -8
- package/dist/adapters/mcp-adapter.js.map +1 -1
- package/dist/adapters/mcp-adapter.test.d.ts +2 -0
- package/dist/adapters/mcp-adapter.test.d.ts.map +1 -0
- package/dist/adapters/mcp-adapter.test.js +118 -0
- package/dist/adapters/mcp-adapter.test.js.map +1 -0
- package/dist/adapters/openclaw-adapter.d.ts +34 -8
- package/dist/adapters/openclaw-adapter.d.ts.map +1 -1
- package/dist/adapters/openclaw-adapter.js +38 -8
- package/dist/adapters/openclaw-adapter.js.map +1 -1
- package/dist/adapters/openclaw-adapter.test.d.ts +2 -0
- package/dist/adapters/openclaw-adapter.test.d.ts.map +1 -0
- package/dist/adapters/openclaw-adapter.test.js +77 -0
- package/dist/adapters/openclaw-adapter.test.js.map +1 -0
- package/dist/agent/agent-registry.test.js +1 -1
- package/dist/agent/agent-resolver.d.ts +8 -2
- package/dist/agent/agent-resolver.d.ts.map +1 -1
- package/dist/agent/agent-resolver.js +7 -1
- package/dist/agent/agent-resolver.js.map +1 -1
- package/dist/agent/agent-resolver.test.js +21 -3
- package/dist/agent/agent-resolver.test.js.map +1 -1
- package/dist/agent/delegate.test.js +1 -1
- package/dist/agent/execute-agent.d.ts +32 -8
- package/dist/agent/execute-agent.d.ts.map +1 -1
- package/dist/agent/execute-agent.js +40 -3
- package/dist/agent/execute-agent.js.map +1 -1
- package/dist/agent/index.d.ts +9 -9
- package/dist/agent/index.d.ts.map +1 -1
- package/dist/agent/index.js +5 -5
- package/dist/agent/index.js.map +1 -1
- package/dist/agent/prompt-builder.test.js +1 -1
- package/dist/approval/approval-manager.d.ts +19 -0
- package/dist/approval/approval-manager.d.ts.map +1 -0
- package/dist/approval/approval-manager.js +36 -0
- package/dist/approval/approval-manager.js.map +1 -0
- package/dist/approval/approval-manager.test.d.ts +2 -0
- package/dist/approval/approval-manager.test.d.ts.map +1 -0
- package/dist/approval/approval-manager.test.js +239 -0
- package/dist/approval/approval-manager.test.js.map +1 -0
- package/dist/approval/index.d.ts +3 -0
- package/dist/approval/index.d.ts.map +1 -0
- package/dist/approval/index.js +2 -0
- package/dist/approval/index.js.map +1 -0
- package/dist/chat/chat-handler.d.ts +14 -7
- package/dist/chat/chat-handler.d.ts.map +1 -1
- package/dist/chat/chat-handler.js +56 -11
- package/dist/chat/chat-handler.js.map +1 -1
- package/dist/chat/chat-handler.test.js +100 -11
- package/dist/chat/chat-handler.test.js.map +1 -1
- package/dist/chat/index.d.ts +3 -3
- package/dist/chat/index.js +2 -2
- package/dist/chat/message-utils.d.ts +15 -7
- package/dist/chat/message-utils.d.ts.map +1 -1
- package/dist/chat/message-utils.js +94 -22
- package/dist/chat/message-utils.js.map +1 -1
- package/dist/chat/message-utils.test.js +71 -1
- package/dist/chat/message-utils.test.js.map +1 -1
- package/dist/document/chunk-generator.d.ts +6 -0
- package/dist/document/chunk-generator.d.ts.map +1 -0
- package/dist/document/chunk-generator.js +107 -0
- package/dist/document/chunk-generator.js.map +1 -0
- package/dist/document/chunk-generator.test.d.ts +2 -0
- package/dist/document/chunk-generator.test.d.ts.map +1 -0
- package/dist/document/chunk-generator.test.js +166 -0
- package/dist/document/chunk-generator.test.js.map +1 -0
- package/dist/document/document-processor.d.ts +27 -0
- package/dist/document/document-processor.d.ts.map +1 -0
- package/dist/document/document-processor.js +44 -0
- package/dist/document/document-processor.js.map +1 -0
- package/dist/document/document-processor.test.d.ts +2 -0
- package/dist/document/document-processor.test.d.ts.map +1 -0
- package/dist/document/document-processor.test.js +197 -0
- package/dist/document/document-processor.test.js.map +1 -0
- package/dist/document/index.d.ts +5 -0
- package/dist/document/index.d.ts.map +1 -0
- package/dist/document/index.js +4 -0
- package/dist/document/index.js.map +1 -0
- package/dist/document/parsers/index.d.ts +2 -0
- package/dist/document/parsers/index.d.ts.map +1 -0
- package/dist/document/parsers/index.js +2 -0
- package/dist/document/parsers/index.js.map +1 -0
- package/dist/document/parsers/text-parser.d.ts +4 -0
- package/dist/document/parsers/text-parser.d.ts.map +1 -0
- package/dist/document/parsers/text-parser.js +23 -0
- package/dist/document/parsers/text-parser.js.map +1 -0
- package/dist/document/parsers/text-parser.test.d.ts +2 -0
- package/dist/document/parsers/text-parser.test.d.ts.map +1 -0
- package/dist/document/parsers/text-parser.test.js +64 -0
- package/dist/document/parsers/text-parser.test.js.map +1 -0
- package/dist/eval/eval-runner.test.js +2 -2
- package/dist/eval/index.d.ts +3 -3
- package/dist/eval/index.js +2 -2
- package/dist/eval/scorers.test.js +1 -1
- package/dist/events/event-bus.d.ts +12 -0
- package/dist/events/event-bus.d.ts.map +1 -0
- package/dist/events/event-bus.js +77 -0
- package/dist/events/event-bus.js.map +1 -0
- package/dist/events/event-bus.test.d.ts +2 -0
- package/dist/events/event-bus.test.d.ts.map +1 -0
- package/dist/events/event-bus.test.js +155 -0
- package/dist/events/event-bus.test.js.map +1 -0
- package/dist/events/index.d.ts +2 -0
- package/dist/events/index.d.ts.map +1 -0
- package/dist/events/index.js +2 -0
- package/dist/events/index.js.map +1 -0
- package/dist/guardrail/guardrail-pipeline.test.js +1 -1
- package/dist/guardrail/index.d.ts +2 -2
- package/dist/guardrail/index.js +1 -1
- package/dist/index.d.ts +81 -45
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +55 -27
- package/dist/index.js.map +1 -1
- package/dist/jobs/in-memory-job-queue.d.ts +20 -0
- package/dist/jobs/in-memory-job-queue.d.ts.map +1 -0
- package/dist/jobs/in-memory-job-queue.js +120 -0
- package/dist/jobs/in-memory-job-queue.js.map +1 -0
- package/dist/jobs/in-memory-job-queue.test.d.ts +2 -0
- package/dist/jobs/in-memory-job-queue.test.d.ts.map +1 -0
- package/dist/jobs/in-memory-job-queue.test.js +146 -0
- package/dist/jobs/in-memory-job-queue.test.js.map +1 -0
- package/dist/jobs/index.d.ts +4 -0
- package/dist/jobs/index.d.ts.map +1 -0
- package/dist/jobs/index.js +3 -0
- package/dist/jobs/index.js.map +1 -0
- package/dist/jobs/job-runner.d.ts +42 -0
- package/dist/jobs/job-runner.d.ts.map +1 -0
- package/dist/jobs/job-runner.js +119 -0
- package/dist/jobs/job-runner.js.map +1 -0
- package/dist/jobs/job-runner.test.d.ts +2 -0
- package/dist/jobs/job-runner.test.d.ts.map +1 -0
- package/dist/jobs/job-runner.test.js +190 -0
- package/dist/jobs/job-runner.test.js.map +1 -0
- package/dist/memory/index.d.ts +3 -3
- package/dist/memory/index.js +2 -2
- package/dist/memory/memory-prompt.d.ts +1 -1
- package/dist/module/create-runtime.d.ts +21 -6
- package/dist/module/create-runtime.d.ts.map +1 -1
- package/dist/module/create-runtime.js +7 -7
- package/dist/module/create-runtime.js.map +1 -1
- package/dist/module/create-runtime.test.js +8 -8
- package/dist/module/index.d.ts +5 -4
- package/dist/module/index.d.ts.map +1 -1
- package/dist/module/index.js +3 -2
- package/dist/module/index.js.map +1 -1
- package/dist/module/map-supabase-stores.d.ts +33 -0
- package/dist/module/map-supabase-stores.d.ts.map +1 -0
- package/dist/module/map-supabase-stores.js +28 -0
- package/dist/module/map-supabase-stores.js.map +1 -0
- package/dist/module/module-loader.d.ts +9 -1
- package/dist/module/module-loader.d.ts.map +1 -1
- package/dist/module/module-loader.js +40 -1
- package/dist/module/module-loader.js.map +1 -1
- package/dist/module/module-loader.test.js +38 -1
- package/dist/module/module-loader.test.js.map +1 -1
- package/dist/notification/digest-scheduler.d.ts +18 -0
- package/dist/notification/digest-scheduler.d.ts.map +1 -0
- package/dist/notification/digest-scheduler.js +44 -0
- package/dist/notification/digest-scheduler.js.map +1 -0
- package/dist/notification/digest-scheduler.test.d.ts +2 -0
- package/dist/notification/digest-scheduler.test.d.ts.map +1 -0
- package/dist/notification/digest-scheduler.test.js +306 -0
- package/dist/notification/digest-scheduler.test.js.map +1 -0
- package/dist/notification/index.d.ts +5 -0
- package/dist/notification/index.d.ts.map +1 -0
- package/dist/notification/index.js +3 -0
- package/dist/notification/index.js.map +1 -0
- package/dist/notification/notification-engine.d.ts +20 -0
- package/dist/notification/notification-engine.d.ts.map +1 -0
- package/dist/notification/notification-engine.js +78 -0
- package/dist/notification/notification-engine.js.map +1 -0
- package/dist/notification/notification-engine.test.d.ts +2 -0
- package/dist/notification/notification-engine.test.d.ts.map +1 -0
- package/dist/notification/notification-engine.test.js +364 -0
- package/dist/notification/notification-engine.test.js.map +1 -0
- package/dist/providers/index.d.ts +5 -0
- package/dist/providers/index.d.ts.map +1 -0
- package/dist/providers/index.js +3 -0
- package/dist/providers/index.js.map +1 -0
- package/dist/providers/model-resolver.d.ts +93 -0
- package/dist/providers/model-resolver.d.ts.map +1 -0
- package/dist/providers/model-resolver.js +152 -0
- package/dist/providers/model-resolver.js.map +1 -0
- package/dist/providers/model-resolver.test.d.ts +2 -0
- package/dist/providers/model-resolver.test.d.ts.map +1 -0
- package/dist/providers/model-resolver.test.js +199 -0
- package/dist/providers/model-resolver.test.js.map +1 -0
- package/dist/providers/noop-providers.d.ts +6 -0
- package/dist/providers/noop-providers.d.ts.map +1 -0
- package/dist/providers/noop-providers.js +15 -0
- package/dist/providers/noop-providers.js.map +1 -0
- package/dist/providers/noop-providers.test.d.ts +2 -0
- package/dist/providers/noop-providers.test.d.ts.map +1 -0
- package/dist/providers/noop-providers.test.js +63 -0
- package/dist/providers/noop-providers.test.js.map +1 -0
- package/dist/providers/provider-interfaces.d.ts +65 -0
- package/dist/providers/provider-interfaces.d.ts.map +1 -0
- package/dist/providers/provider-interfaces.js +2 -0
- package/dist/providers/provider-interfaces.js.map +1 -0
- package/dist/realtime/index.d.ts +3 -0
- package/dist/realtime/index.d.ts.map +1 -0
- package/dist/realtime/index.js +2 -0
- package/dist/realtime/index.js.map +1 -0
- package/dist/realtime/realtime-manager.d.ts +20 -0
- package/dist/realtime/realtime-manager.d.ts.map +1 -0
- package/dist/realtime/realtime-manager.js +110 -0
- package/dist/realtime/realtime-manager.js.map +1 -0
- package/dist/realtime/realtime-manager.test.d.ts +2 -0
- package/dist/realtime/realtime-manager.test.d.ts.map +1 -0
- package/dist/realtime/realtime-manager.test.js +273 -0
- package/dist/realtime/realtime-manager.test.js.map +1 -0
- package/dist/scoring/compute-score.test.js +1 -1
- package/dist/scoring/index.d.ts +2 -2
- package/dist/scoring/index.js +1 -1
- package/dist/search/cosine-similarity.d.ts +8 -0
- package/dist/search/cosine-similarity.d.ts.map +1 -0
- package/dist/search/cosine-similarity.js +28 -0
- package/dist/search/cosine-similarity.js.map +1 -0
- package/dist/search/cosine-similarity.test.d.ts +2 -0
- package/dist/search/cosine-similarity.test.d.ts.map +1 -0
- package/dist/search/cosine-similarity.test.js +49 -0
- package/dist/search/cosine-similarity.test.js.map +1 -0
- package/dist/search/hybrid-search.d.ts +47 -0
- package/dist/search/hybrid-search.d.ts.map +1 -0
- package/dist/search/hybrid-search.js +111 -0
- package/dist/search/hybrid-search.js.map +1 -0
- package/dist/search/hybrid-search.test.d.ts +2 -0
- package/dist/search/hybrid-search.test.d.ts.map +1 -0
- package/dist/search/hybrid-search.test.js +238 -0
- package/dist/search/hybrid-search.test.js.map +1 -0
- package/dist/search/in-memory-search.d.ts +17 -0
- package/dist/search/in-memory-search.d.ts.map +1 -0
- package/dist/search/in-memory-search.js +59 -0
- package/dist/search/in-memory-search.js.map +1 -0
- package/dist/search/in-memory-search.test.d.ts +2 -0
- package/dist/search/in-memory-search.test.d.ts.map +1 -0
- package/dist/search/in-memory-search.test.js +169 -0
- package/dist/search/in-memory-search.test.js.map +1 -0
- package/dist/search/index.d.ts +6 -0
- package/dist/search/index.d.ts.map +1 -0
- package/dist/search/index.js +4 -0
- package/dist/search/index.js.map +1 -0
- package/dist/testing/in-memory-agent-store.d.ts +5 -1
- package/dist/testing/in-memory-agent-store.d.ts.map +1 -1
- package/dist/testing/in-memory-agent-store.js +19 -0
- package/dist/testing/in-memory-agent-store.js.map +1 -1
- package/dist/testing/in-memory-agent-store.test.js +1 -1
- package/dist/testing/in-memory-chat-store.d.ts.map +1 -1
- package/dist/testing/in-memory-chat-store.js +2 -1
- package/dist/testing/in-memory-chat-store.js.map +1 -1
- package/dist/testing/in-memory-entity-store.d.ts +5 -1
- package/dist/testing/in-memory-entity-store.d.ts.map +1 -1
- package/dist/testing/in-memory-entity-store.js +69 -4
- package/dist/testing/in-memory-entity-store.js.map +1 -1
- package/dist/testing/in-memory-entity-store.test.js +1 -1
- package/dist/testing/in-memory-memory-store.d.ts +3 -1
- package/dist/testing/in-memory-memory-store.d.ts.map +1 -1
- package/dist/testing/in-memory-memory-store.js +20 -0
- package/dist/testing/in-memory-memory-store.js.map +1 -1
- package/dist/testing/in-memory-tool-store.d.ts +13 -2
- package/dist/testing/in-memory-tool-store.d.ts.map +1 -1
- package/dist/testing/in-memory-tool-store.js +51 -0
- package/dist/testing/in-memory-tool-store.js.map +1 -1
- package/dist/testing/index.d.ts +7 -7
- package/dist/testing/index.js +7 -7
- package/dist/tool/ai-bridge.d.ts +1 -0
- package/dist/tool/ai-bridge.d.ts.map +1 -1
- package/dist/tool/ai-bridge.js +1 -0
- package/dist/tool/ai-bridge.js.map +1 -1
- package/dist/tool/ai-bridge.test.js +1 -1
- package/dist/tool/catalog.d.ts +19 -0
- package/dist/tool/catalog.d.ts.map +1 -0
- package/dist/tool/catalog.js +88 -0
- package/dist/tool/catalog.js.map +1 -0
- package/dist/tool/catalog.test.d.ts +2 -0
- package/dist/tool/catalog.test.d.ts.map +1 -0
- package/dist/tool/catalog.test.js +129 -0
- package/dist/tool/catalog.test.js.map +1 -0
- package/dist/tool/entity-tools-factory.test.js +2 -2
- package/dist/tool/execute-registered-tool.d.ts +17 -0
- package/dist/tool/execute-registered-tool.d.ts.map +1 -0
- package/dist/tool/execute-registered-tool.js +24 -0
- package/dist/tool/execute-registered-tool.js.map +1 -0
- package/dist/tool/execute-registered-tool.test.d.ts +2 -0
- package/dist/tool/execute-registered-tool.test.d.ts.map +1 -0
- package/dist/tool/execute-registered-tool.test.js +73 -0
- package/dist/tool/execute-registered-tool.test.js.map +1 -0
- package/dist/tool/index.d.ts +11 -7
- package/dist/tool/index.d.ts.map +1 -1
- package/dist/tool/index.js +7 -5
- package/dist/tool/index.js.map +1 -1
- package/dist/tool/resolve-tools.d.ts +3 -1
- package/dist/tool/resolve-tools.d.ts.map +1 -1
- package/dist/tool/resolve-tools.js +1 -1
- package/dist/tool/resolve-tools.js.map +1 -1
- package/dist/tool/resolve-tools.test.js +1 -1
- package/dist/tool/tool-executor.d.ts +3 -2
- package/dist/tool/tool-executor.d.ts.map +1 -1
- package/dist/tool/tool-executor.js +4 -2
- package/dist/tool/tool-executor.js.map +1 -1
- package/dist/tool/tool-executor.test.js +2 -2
- package/dist/tool/tool-registry.test.js +1 -1
- package/dist/tool/zod-to-json-schema.d.ts +7 -0
- package/dist/tool/zod-to-json-schema.d.ts.map +1 -0
- package/dist/tool/zod-to-json-schema.js +12 -0
- package/dist/tool/zod-to-json-schema.js.map +1 -0
- package/dist/tool/zod-to-json-schema.test.d.ts +2 -0
- package/dist/tool/zod-to-json-schema.test.d.ts.map +1 -0
- package/dist/tool/zod-to-json-schema.test.js +127 -0
- package/dist/tool/zod-to-json-schema.test.js.map +1 -0
- package/dist/webhook/index.d.ts +4 -0
- package/dist/webhook/index.d.ts.map +1 -0
- package/dist/webhook/index.js +3 -0
- package/dist/webhook/index.js.map +1 -0
- package/dist/webhook/webhook-delivery.d.ts +12 -0
- package/dist/webhook/webhook-delivery.d.ts.map +1 -0
- package/dist/webhook/webhook-delivery.js +102 -0
- package/dist/webhook/webhook-delivery.js.map +1 -0
- package/dist/webhook/webhook-delivery.test.d.ts +2 -0
- package/dist/webhook/webhook-delivery.test.d.ts.map +1 -0
- package/dist/webhook/webhook-delivery.test.js +284 -0
- package/dist/webhook/webhook-delivery.test.js.map +1 -0
- package/dist/webhook/webhook-signer.d.ts +14 -0
- package/dist/webhook/webhook-signer.d.ts.map +1 -0
- package/dist/webhook/webhook-signer.js +31 -0
- package/dist/webhook/webhook-signer.js.map +1 -0
- package/dist/webhook/webhook-signer.test.d.ts +2 -0
- package/dist/webhook/webhook-signer.test.d.ts.map +1 -0
- package/dist/webhook/webhook-signer.test.js +74 -0
- package/dist/webhook/webhook-signer.test.js.map +1 -0
- package/dist/workflow/compile.js +4 -1
- package/dist/workflow/compile.js.map +1 -1
- package/dist/workflow/compile.test.js +1 -1
- package/dist/workflow/evaluate-nodes.d.ts +24 -0
- package/dist/workflow/evaluate-nodes.d.ts.map +1 -0
- package/dist/workflow/evaluate-nodes.js +126 -0
- package/dist/workflow/evaluate-nodes.js.map +1 -0
- package/dist/workflow/evaluate-nodes.test.d.ts +2 -0
- package/dist/workflow/evaluate-nodes.test.d.ts.map +1 -0
- package/dist/workflow/evaluate-nodes.test.js +363 -0
- package/dist/workflow/evaluate-nodes.test.js.map +1 -0
- package/dist/workflow/index.d.ts +8 -3
- package/dist/workflow/index.d.ts.map +1 -1
- package/dist/workflow/index.js +4 -2
- package/dist/workflow/index.js.map +1 -1
- package/dist/workflow/node-executor.d.ts +40 -0
- package/dist/workflow/node-executor.d.ts.map +1 -0
- package/dist/workflow/node-executor.js +2 -0
- package/dist/workflow/node-executor.js.map +1 -0
- package/dist/workflow/node-executor.test.d.ts +2 -0
- package/dist/workflow/node-executor.test.d.ts.map +1 -0
- package/dist/workflow/node-executor.test.js +91 -0
- package/dist/workflow/node-executor.test.js.map +1 -0
- package/dist/workflow/status.test.js +1 -1
- package/dist/workflow/workflow-runner.d.ts +57 -0
- package/dist/workflow/workflow-runner.d.ts.map +1 -0
- package/dist/workflow/workflow-runner.js +263 -0
- package/dist/workflow/workflow-runner.js.map +1 -0
- package/dist/workflow/workflow-runner.test.d.ts +2 -0
- package/dist/workflow/workflow-runner.test.d.ts.map +1 -0
- package/dist/workflow/workflow-runner.test.js +657 -0
- package/dist/workflow/workflow-runner.test.js.map +1 -0
- package/package.json +19 -14
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { chunkText, chunkPages } from './chunk-generator.js';
|
|
3
|
+
describe('chunkText', () => {
|
|
4
|
+
it('returns empty array for empty input', () => {
|
|
5
|
+
expect(chunkText('')).toEqual([]);
|
|
6
|
+
});
|
|
7
|
+
it('returns empty array for undefined-like empty string', () => {
|
|
8
|
+
expect(chunkText('', { chunkSize: 100, overlap: 0 })).toEqual([]);
|
|
9
|
+
});
|
|
10
|
+
it('returns single chunk for short text', () => {
|
|
11
|
+
const result = chunkText('Hello, world!', { chunkSize: 100, overlap: 0 });
|
|
12
|
+
expect(result).toHaveLength(1);
|
|
13
|
+
expect(result[0]).toBe('Hello, world!');
|
|
14
|
+
});
|
|
15
|
+
it('splits at paragraph boundaries (double newline)', () => {
|
|
16
|
+
const text = 'Paragraph one.\n\nParagraph two.\n\nParagraph three.';
|
|
17
|
+
const result = chunkText(text, { chunkSize: 25, overlap: 0 });
|
|
18
|
+
// Each paragraph should be its own chunk
|
|
19
|
+
expect(result.length).toBeGreaterThanOrEqual(2);
|
|
20
|
+
expect(result[0]).toContain('Paragraph one.');
|
|
21
|
+
});
|
|
22
|
+
it('splits at single newline boundaries', () => {
|
|
23
|
+
const text = 'Line one.\nLine two.\nLine three.\nLine four.\nLine five.';
|
|
24
|
+
const result = chunkText(text, { chunkSize: 25, overlap: 0 });
|
|
25
|
+
expect(result.length).toBeGreaterThan(1);
|
|
26
|
+
// All original content should be preserved across chunks
|
|
27
|
+
const combined = result.join('');
|
|
28
|
+
expect(combined).toContain('Line one.');
|
|
29
|
+
expect(combined).toContain('Line five.');
|
|
30
|
+
});
|
|
31
|
+
it('splits at period boundaries', () => {
|
|
32
|
+
const text = 'First sentence. Second sentence. Third sentence. Fourth sentence.';
|
|
33
|
+
const result = chunkText(text, { chunkSize: 40, overlap: 0 });
|
|
34
|
+
expect(result.length).toBeGreaterThan(1);
|
|
35
|
+
});
|
|
36
|
+
it('splits at space boundaries', () => {
|
|
37
|
+
const text = 'word1 word2 word3 word4 word5 word6 word7 word8 word9 word10';
|
|
38
|
+
const result = chunkText(text, { chunkSize: 20, overlap: 0 });
|
|
39
|
+
expect(result.length).toBeGreaterThan(1);
|
|
40
|
+
for (const chunk of result) {
|
|
41
|
+
// Chunks should not exceed the limit by too much (spaces may cause slight variation)
|
|
42
|
+
expect(chunk.length).toBeLessThanOrEqual(25);
|
|
43
|
+
}
|
|
44
|
+
});
|
|
45
|
+
it('hard splits when no natural boundaries exist', () => {
|
|
46
|
+
const text = 'a'.repeat(50);
|
|
47
|
+
const result = chunkText(text, { chunkSize: 20, overlap: 0 });
|
|
48
|
+
expect(result.length).toBe(3); // 20 + 20 + 10
|
|
49
|
+
expect(result[0]).toBe('a'.repeat(20));
|
|
50
|
+
expect(result[1]).toBe('a'.repeat(20));
|
|
51
|
+
expect(result[2]).toBe('a'.repeat(10));
|
|
52
|
+
});
|
|
53
|
+
it('applies overlap correctly', () => {
|
|
54
|
+
const text = 'AAAAAAAAAA\n\nBBBBBBBBBB\n\nCCCCCCCCCC';
|
|
55
|
+
const result = chunkText(text, { chunkSize: 15, overlap: 5 });
|
|
56
|
+
expect(result.length).toBeGreaterThanOrEqual(2);
|
|
57
|
+
// Second chunk should start with last 5 chars of first chunk
|
|
58
|
+
if (result.length >= 2) {
|
|
59
|
+
const firstChunkEnd = result[0].slice(-5);
|
|
60
|
+
expect(result[1].startsWith(firstChunkEnd)).toBe(true);
|
|
61
|
+
}
|
|
62
|
+
});
|
|
63
|
+
it('uses default chunk size and overlap', () => {
|
|
64
|
+
// Create text that's just over 1000 chars (default chunkSize)
|
|
65
|
+
const text = 'word '.repeat(250); // 1250 chars
|
|
66
|
+
const result = chunkText(text);
|
|
67
|
+
expect(result.length).toBeGreaterThan(1);
|
|
68
|
+
});
|
|
69
|
+
it('handles text exactly at chunk size', () => {
|
|
70
|
+
const text = 'x'.repeat(100);
|
|
71
|
+
const result = chunkText(text, { chunkSize: 100, overlap: 0 });
|
|
72
|
+
expect(result).toHaveLength(1);
|
|
73
|
+
expect(result[0]).toBe(text);
|
|
74
|
+
});
|
|
75
|
+
it('preserves content integrity without overlap', () => {
|
|
76
|
+
const paragraphs = ['Alpha paragraph.', 'Beta paragraph.', 'Gamma paragraph.'];
|
|
77
|
+
const text = paragraphs.join('\n\n');
|
|
78
|
+
const result = chunkText(text, { chunkSize: 20, overlap: 0 });
|
|
79
|
+
// Every paragraph should appear in at least one chunk
|
|
80
|
+
for (const para of paragraphs) {
|
|
81
|
+
const found = result.some((chunk) => chunk.includes(para));
|
|
82
|
+
expect(found).toBe(true);
|
|
83
|
+
}
|
|
84
|
+
});
|
|
85
|
+
it('handles zero overlap', () => {
|
|
86
|
+
const text = 'Chunk A.\n\nChunk B.\n\nChunk C.';
|
|
87
|
+
const result = chunkText(text, { chunkSize: 15, overlap: 0 });
|
|
88
|
+
expect(result.length).toBeGreaterThan(1);
|
|
89
|
+
// With no overlap, second chunk should not start with content from first
|
|
90
|
+
});
|
|
91
|
+
});
|
|
92
|
+
describe('chunkPages', () => {
|
|
93
|
+
it('returns empty array for empty pages', () => {
|
|
94
|
+
expect(chunkPages([])).toEqual([]);
|
|
95
|
+
});
|
|
96
|
+
it('preserves page numbers in chunks', () => {
|
|
97
|
+
const pages = [
|
|
98
|
+
{ pageNumber: 1, content: 'Page one content' },
|
|
99
|
+
{ pageNumber: 2, content: 'Page two content' },
|
|
100
|
+
];
|
|
101
|
+
const result = chunkPages(pages, { chunkSize: 1000, overlap: 0 });
|
|
102
|
+
expect(result).toHaveLength(2);
|
|
103
|
+
expect(result[0].pageNumber).toBe(1);
|
|
104
|
+
expect(result[0].content).toBe('Page one content');
|
|
105
|
+
expect(result[1].pageNumber).toBe(2);
|
|
106
|
+
expect(result[1].content).toBe('Page two content');
|
|
107
|
+
});
|
|
108
|
+
it('assigns sequential chunk indexes across pages', () => {
|
|
109
|
+
const pages = [
|
|
110
|
+
{ pageNumber: 1, content: 'Short content' },
|
|
111
|
+
{ pageNumber: 2, content: 'More content' },
|
|
112
|
+
{ pageNumber: 3, content: 'Final content' },
|
|
113
|
+
];
|
|
114
|
+
const result = chunkPages(pages, { chunkSize: 1000, overlap: 0 });
|
|
115
|
+
expect(result.map((c) => c.chunkIndex)).toEqual([0, 1, 2]);
|
|
116
|
+
});
|
|
117
|
+
it('splits long pages into multiple chunks', () => {
|
|
118
|
+
const pages = [
|
|
119
|
+
{ pageNumber: 1, content: 'word '.repeat(300) }, // 1500 chars
|
|
120
|
+
];
|
|
121
|
+
const result = chunkPages(pages, { chunkSize: 500, overlap: 0 });
|
|
122
|
+
expect(result.length).toBeGreaterThan(1);
|
|
123
|
+
// All chunks should reference page 1
|
|
124
|
+
for (const chunk of result) {
|
|
125
|
+
expect(chunk.pageNumber).toBe(1);
|
|
126
|
+
}
|
|
127
|
+
});
|
|
128
|
+
it('handles empty page content', () => {
|
|
129
|
+
const pages = [
|
|
130
|
+
{ pageNumber: 1, content: '' },
|
|
131
|
+
{ pageNumber: 2, content: 'Has content' },
|
|
132
|
+
];
|
|
133
|
+
const result = chunkPages(pages, { chunkSize: 1000, overlap: 0 });
|
|
134
|
+
// Empty page produces no chunks
|
|
135
|
+
expect(result).toHaveLength(1);
|
|
136
|
+
expect(result[0].pageNumber).toBe(2);
|
|
137
|
+
expect(result[0].chunkIndex).toBe(0);
|
|
138
|
+
});
|
|
139
|
+
it('propagates page metadata to chunks', () => {
|
|
140
|
+
const pages = [
|
|
141
|
+
{ pageNumber: 1, content: 'Content', metadata: { source: 'pdf', font: 'Arial' } },
|
|
142
|
+
];
|
|
143
|
+
const result = chunkPages(pages, { chunkSize: 1000, overlap: 0 });
|
|
144
|
+
expect(result[0].metadata).toEqual({ source: 'pdf', font: 'Arial' });
|
|
145
|
+
});
|
|
146
|
+
it('handles multiple pages with varying sizes', () => {
|
|
147
|
+
const pages = [
|
|
148
|
+
{ pageNumber: 1, content: 'Short' },
|
|
149
|
+
{ pageNumber: 2, content: 'A much longer page. '.repeat(100) }, // 2000 chars
|
|
150
|
+
{ pageNumber: 3, content: 'Also short' },
|
|
151
|
+
];
|
|
152
|
+
const result = chunkPages(pages, { chunkSize: 500, overlap: 0 });
|
|
153
|
+
// Page 1: 1 chunk, page 2: multiple chunks, page 3: 1 chunk
|
|
154
|
+
const page1Chunks = result.filter((c) => c.pageNumber === 1);
|
|
155
|
+
const page2Chunks = result.filter((c) => c.pageNumber === 2);
|
|
156
|
+
const page3Chunks = result.filter((c) => c.pageNumber === 3);
|
|
157
|
+
expect(page1Chunks).toHaveLength(1);
|
|
158
|
+
expect(page2Chunks.length).toBeGreaterThan(1);
|
|
159
|
+
expect(page3Chunks).toHaveLength(1);
|
|
160
|
+
// Chunk indexes should be sequential across all pages
|
|
161
|
+
for (let i = 0; i < result.length; i++) {
|
|
162
|
+
expect(result[i].chunkIndex).toBe(i);
|
|
163
|
+
}
|
|
164
|
+
});
|
|
165
|
+
});
|
|
166
|
+
//# sourceMappingURL=chunk-generator.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunk-generator.test.js","sourceRoot":"","sources":["../../src/document/chunk-generator.test.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAE9C,OAAO,EAAE,SAAS,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAE1D,QAAQ,CAAC,WAAW,EAAE,GAAG,EAAE;IACzB,EAAE,CAAC,qCAAqC,EAAE,GAAG,EAAE;QAC7C,MAAM,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;IACpC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,qDAAqD,EAAE,GAAG,EAAE;QAC7D,MAAM,CAAC,SAAS,CAAC,EAAE,EAAE,EAAE,SAAS,EAAE,GAAG,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;IACpE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,qCAAqC,EAAE,GAAG,EAAE;QAC7C,MAAM,MAAM,GAAG,SAAS,CAAC,eAAe,EAAE,EAAE,SAAS,EAAE,GAAG,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC;QAC1E,MAAM,CAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAC/B,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;IAC1C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,iDAAiD,EAAE,GAAG,EAAE;QACzD,MAAM,IAAI,GAAG,sDAAsD,CAAC;QACpE,MAAM,MAAM,GAAG,SAAS,CAAC,IAAI,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC;QAE9D,yCAAyC;QACzC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;QAChD,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,gBAAgB,CAAC,CAAC;IAChD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,qCAAqC,EAAE,GAAG,EAAE;QAC7C,MAAM,IAAI,GAAG,2DAA2D,CAAC;QACzE,MAAM,MAAM,GAAG,SAAS,CAAC,IAAI,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC;QAE9D,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QACzC,yDAAyD;QACzD,MAAM,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACjC,MAAM,CAAC,QAAQ,CAAC,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC;QACxC,MAAM,CAAC,QAAQ,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6BAA6B,EAAE,GAAG,EAAE;QACrC,MAAM,IAAI,GAAG,mEAAmE,CAAC;QACjF,MAAM,MAAM,GAAG,SAAS,CAAC,IAAI,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC;QAE9D,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,4BAA4B,EAAE,GAAG,EAAE;QACpC,MAAM,IAAI,GAAG,8DAA8D,CAAC;QAC5E,MAAM,MAAM,GAAG,
SAAS,CAAC,IAAI,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC;QAE9D,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QACzC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,qFAAqF;YACrF,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,mBAAmB,CAAC,EAAE,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,8CAA8C,EAAE,GAAG,EAAE;QACtD,MAAM,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QAC5B,MAAM,MAAM,GAAG,SAAS,CAAC,IAAI,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC;QAE9D,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,eAAe;QAC9C,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IACzC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,2BAA2B,EAAE,GAAG,EAAE;QACnC,MAAM,IAAI,GAAG,wCAAwC,CAAC;QACtD,MAAM,MAAM,GAAG,SAAS,CAAC,IAAI,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC;QAE9D,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;QAChD,6DAA6D;QAC7D,IAAI,MAAM,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;YACvB,MAAM,aAAa,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;YAC1C,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACzD,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,qCAAqC,EAAE,GAAG,EAAE;QAC7C,8DAA8D;QAC9D,MAAM,IAAI,GAAG,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,aAAa;QAC/C,MAAM,MAAM,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAE/B,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;QAC7B,MAAM,MAAM,GAAG,SAAS,CAAC,IAAI,EAAE,EAAE,SAAS,EAAE,GAAG,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC;QAE/D,MAAM,CAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAC/B,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC/B,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6CAA6C,EAAE,GAAG,EAAE;QACrD,MAAM,UA
AU,GAAG,CAAC,kBAAkB,EAAE,iBAAiB,EAAE,kBAAkB,CAAC,CAAC;QAC/E,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACrC,MAAM,MAAM,GAAG,SAAS,CAAC,IAAI,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC;QAE9D,sDAAsD;QACtD,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;YAC9B,MAAM,KAAK,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC;YAC3D,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC3B,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,sBAAsB,EAAE,GAAG,EAAE;QAC9B,MAAM,IAAI,GAAG,kCAAkC,CAAC;QAChD,MAAM,MAAM,GAAG,SAAS,CAAC,IAAI,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC;QAE9D,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QACzC,yEAAyE;IAC3E,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,YAAY,EAAE,GAAG,EAAE;IAC1B,EAAE,CAAC,qCAAqC,EAAE,GAAG,EAAE;QAC7C,MAAM,CAAC,UAAU,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;IACrC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kCAAkC,EAAE,GAAG,EAAE;QAC1C,MAAM,KAAK,GAAiB;YAC1B,EAAE,UAAU,EAAE,CAAC,EAAE,OAAO,EAAE,kBAAkB,EAAE;YAC9C,EAAE,UAAU,EAAE,CAAC,EAAE,OAAO,EAAE,kBAAkB,EAAE;SAC/C,CAAC;QACF,MAAM,MAAM,GAAG,UAAU,CAAC,KAAK,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC;QAElE,MAAM,CAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAC/B,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC;QACnD,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC;IACrD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+CAA+C,EAAE,GAAG,EAAE;QACvD,MAAM,KAAK,GAAiB;YAC1B,EAAE,UAAU,EAAE,CAAC,EAAE,OAAO,EAAE,eAAe,EAAE;YAC3C,EAAE,UAAU,EAAE,CAAC,EAAE,OAAO,EAAE,cAAc,EAAE;YAC1C,EAAE,UAAU,EAAE,CAAC,EAAE,OAAO,EAAE,eAAe,EAAE;SAC5C,CAAC;QACF,MAAM,MAAM,GAAG,UAAU,CAAC,KAAK,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC;QAElE,MAAM,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,C
AAC,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;IAC7D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,wCAAwC,EAAE,GAAG,EAAE;QAChD,MAAM,KAAK,GAAiB;YAC1B,EAAE,UAAU,EAAE,CAAC,EAAE,OAAO,EAAE,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,EAAE,aAAa;SAC/D,CAAC;QACF,MAAM,MAAM,GAAG,UAAU,CAAC,KAAK,EAAE,EAAE,SAAS,EAAE,GAAG,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC;QAEjE,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QACzC,qCAAqC;QACrC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,MAAM,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,4BAA4B,EAAE,GAAG,EAAE;QACpC,MAAM,KAAK,GAAiB;YAC1B,EAAE,UAAU,EAAE,CAAC,EAAE,OAAO,EAAE,EAAE,EAAE;YAC9B,EAAE,UAAU,EAAE,CAAC,EAAE,OAAO,EAAE,aAAa,EAAE;SAC1C,CAAC;QACF,MAAM,MAAM,GAAG,UAAU,CAAC,KAAK,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC;QAElE,gCAAgC;QAChC,MAAM,CAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAC/B,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,KAAK,GAAiB;YAC1B,EAAE,UAAU,EAAE,CAAC,EAAE,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE;SAClF,CAAC;QACF,MAAM,MAAM,GAAG,UAAU,CAAC,KAAK,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC;QAElE,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,OAAO,CAAC,EAAE,MAAM,EAAE,KAAK,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC,CAAC;IACvE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,2CAA2C,EAAE,GAAG,EAAE;QACnD,MAAM,KAAK,GAAiB;YAC1B,EAAE,UAAU,EAAE,CAAC,EAAE,OAAO,EAAE,OAAO,EAAE;YACnC,EAAE,UAAU,EAAE,CAAC,EAAE,OAAO,EAAE,sBAAsB,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,EAAE,aAAa;YAC7E,EAAE,UAAU,EAAE,CAAC,EAAE,OAAO,EAAE,YAAY,EAAE;SACzC,CAAC;QACF,MAAM,MAAM,GAAG,UAAU,CAAC,KAAK,EAAE,EAAE,SAAS,EAAE,GAAG,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC;QAEjE,4DAA4D;QAC5D,MAAM,WAAW,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,KAAK,CAAC,CAAC,CAAC;QAC7D,MAAM,WAAW,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,
CAAC,CAAC,CAAC,UAAU,KAAK,CAAC,CAAC,CAAC;QAC7D,MAAM,WAAW,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,KAAK,CAAC,CAAC,CAAC;QAE7D,MAAM,CAAC,WAAW,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QACpC,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QAC9C,MAAM,CAAC,WAAW,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAEpC,sDAAsD;QACtD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACvC,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import type { ChunkOptions, DocumentChunk, DocumentParser, EmbeddingProvider, ParsedDocument, ParsedPage } from '@sprinterai/core';
|
|
2
|
+
/**
 * Input accepted by the `process` method of a document processor.
 */
export interface ProcessDocumentOptions {
    /** NOTE(review): not read by `process` itself; presumably identifies the document for callers — confirm. */
    documentId: string;
    /** NOTE(review): not read by `process` itself; presumably scopes the document to a tenant — confirm. */
    tenantId: string;
    /** NOTE(review): not read by `process` itself; parser selection uses `mimeType`, not the file name. */
    fileName: string;
    /** MIME type used to pick a parser; matching ignores case and anything after a `;`. */
    mimeType: string;
    /** Raw file bytes handed to the selected parser. */
    fileData: Uint8Array | Buffer;
    /** Forwarded unchanged to the chunker together with the parsed pages. */
    chunkOptions?: ChunkOptions;
    /** When supplied (and at least one chunk exists) it is asked to embed every chunk's content. */
    embeddingProvider?: EmbeddingProvider;
}
|
|
12
|
+
/**
 * Output of one run of the parse -> chunk -> (optional) embed pipeline.
 */
export interface ProcessDocumentResult {
    /** Pages exactly as returned by the parser. */
    pages: Array<ParsedPage>;
    /** Chunks produced from `pages`. */
    chunks: Array<DocumentChunk>;
    /**
     * Vectors returned by the embedding provider. Left undefined when no
     * provider was given, when there were no chunks, or when the provider
     * returned `null`.
     */
    embeddings?: Array<Array<number>>;
    /** Metadata reported by the parser. */
    metadata: ParsedDocument['metadata'];
}
|
|
19
|
+
/**
 * Build a document processor around a registry of parsers.
 *
 * `process` falls back to the built-in text parser when no registered parser
 * claims the MIME type; `getParser` reports `null` in that situation.
 *
 * @param parsers Optional extra parsers, consulted before the built-in text parser.
 */
export declare function createDocumentProcessor(parsers?: Array<DocumentParser>): {
    process(options: ProcessDocumentOptions): Promise<ProcessDocumentResult>;
    getParser(mimeType: string): DocumentParser | null;
};
|
|
27
|
+
//# sourceMappingURL=document-processor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"document-processor.d.ts","sourceRoot":"","sources":["../../src/document/document-processor.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,YAAY,EACZ,aAAa,EACb,cAAc,EACd,iBAAiB,EACjB,cAAc,EACd,UAAU,EACX,MAAM,kBAAkB,CAAC;AAK1B,yCAAyC;AACzC,MAAM,WAAW,sBAAsB;IACrC,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,MAAM,GAAG,UAAU,CAAC;IAC9B,YAAY,CAAC,EAAE,YAAY,CAAC;IAC5B,iBAAiB,CAAC,EAAE,iBAAiB,CAAC;CACvC;AAED,4DAA4D;AAC5D,MAAM,WAAW,qBAAqB;IACpC,KAAK,EAAE,UAAU,EAAE,CAAC;IACpB,MAAM,EAAE,aAAa,EAAE,CAAC;IACxB,UAAU,CAAC,EAAE,MAAM,EAAE,EAAE,CAAC;IACxB,QAAQ,EAAE,cAAc,CAAC,UAAU,CAAC,CAAC;CACtC;AAED;;;GAGG;AACH,wBAAgB,uBAAuB,CAAC,OAAO,CAAC,EAAE,cAAc,EAAE,GAAG;IACnE,OAAO,CAAC,OAAO,EAAE,sBAAsB,GAAG,OAAO,CAAC,qBAAqB,CAAC,CAAC;IACzE,SAAS,CAAC,QAAQ,EAAE,MAAM,GAAG,cAAc,GAAG,IAAI,CAAC;CACpD,CA6CA"}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import { chunkPages } from './chunk-generator.js';
|
|
2
|
+
import { textParser } from './parsers/index.js';
|
|
3
|
+
/**
 * Create a document processor with a registry of parsers.
 *
 * Caller-supplied parsers are searched before the built-in text parser, so a
 * custom parser wins for any MIME type that both claim. `process` falls back
 * to the text parser when no parser matches; `getParser` returns `null`.
 *
 * @param {Array<{mimeTypes: string[], parse: Function}>} [parsers] Extra parsers to register.
 * @returns {{process: Function, getParser: Function}} The processor API.
 */
export function createDocumentProcessor(parsers) {
    // Registration order is lookup order: custom parsers first, text parser last.
    const allParsers = [...(parsers ?? []), textParser];

    /**
     * Find the parser registered for a MIME type.
     *
     * @param {string} mimeType MIME type, optionally with parameters (e.g. `; charset=utf-8`).
     * @returns {object|null} The first matching parser, or `null` when none matches
     *   or the MIME type is missing / not a string.
     */
    function getParser(mimeType) {
        // A missing content type is "unrecognized", not a crash: let callers
        // (including `process`) apply their fallback instead of throwing.
        if (typeof mimeType !== 'string') {
            return null;
        }
        // Drop parameters and normalize case before matching.
        const normalized = mimeType.toLowerCase().split(';')[0].trim();
        return allParsers.find((parser) => parser.mimeTypes.includes(normalized)) ?? null;
    }

    /**
     * View the input as a Buffer without copying when it is already byte data.
     *
     * @param {Buffer|Uint8Array} data Raw file bytes.
     * @returns {Buffer} `data` itself, or a Buffer sharing its memory.
     */
    function toBuffer(data) {
        if (data instanceof Buffer) {
            return data;
        }
        if (data instanceof Uint8Array) {
            // Share the underlying ArrayBuffer; honor the view's offset and length.
            return Buffer.from(data.buffer, data.byteOffset, data.byteLength);
        }
        // Anything else keeps the previous behavior.
        return Buffer.from(data);
    }

    /**
     * Run the parse -> chunk -> (optional) embed pipeline for one document.
     *
     * @param {object} options See `ProcessDocumentOptions`; only `mimeType`,
     *   `fileData`, `chunkOptions` and `embeddingProvider` are read here.
     * @returns {Promise<{pages: Array, chunks: Array, embeddings: (Array|undefined), metadata: object}>}
     */
    async function process(options) {
        const { mimeType, fileData, chunkOptions, embeddingProvider } = options;

        // 1. Parse: find a parser or fall back to the text parser.
        const parser = getParser(mimeType) ?? textParser;
        const parsed = await parser.parse(toBuffer(fileData));

        // 2. Chunk: split parsed pages into chunks.
        const chunks = chunkPages(parsed.pages, chunkOptions);

        // 3. Embed (optional): skipped when there is no provider or nothing to embed.
        let embeddings;
        if (embeddingProvider && chunks.length > 0) {
            const texts = chunks.map((chunk) => chunk.content);
            const result = await embeddingProvider.generateEmbeddings(texts);
            // A null result means the provider produced nothing; leave embeddings undefined.
            if (result !== null) {
                embeddings = result;
            }
        }

        return {
            pages: parsed.pages,
            chunks,
            embeddings,
            metadata: parsed.metadata,
        };
    }

    return { process, getParser };
}
|
|
44
|
+
//# sourceMappingURL=document-processor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"document-processor.js","sourceRoot":"","sources":["../../src/document/document-processor.ts"],"names":[],"mappings":"AASA,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAC/C,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AAqBvC;;;GAGG;AACH,MAAM,UAAU,uBAAuB,CAAC,OAA0B;IAIhE,kFAAkF;IAClF,MAAM,UAAU,GAAqB,CAAC,GAAG,CAAC,OAAO,IAAI,EAAE,CAAC,EAAE,UAAU,CAAC,CAAC;IAEtE,SAAS,SAAS,CAAC,QAAgB;QACjC,MAAM,UAAU,GAAG,QAAQ,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAC/D,KAAK,MAAM,MAAM,IAAI,UAAU,EAAE,CAAC;YAChC,IAAI,MAAM,CAAC,SAAS,CAAC,QAAQ,CAAC,UAAU,CAAC,EAAE,CAAC;gBAC1C,OAAO,MAAM,CAAC;YAChB,CAAC;QACH,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IAED,KAAK,UAAU,OAAO,CAAC,OAA+B;QACpD,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,YAAY,EAAE,iBAAiB,EAAE,GAAG,OAAO,CAAC;QAExE,sDAAsD;QACtD,MAAM,MAAM,GAAG,SAAS,CAAC,QAAQ,CAAC,IAAI,UAAU,CAAC;QACjD,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,KAAK,CAC/B,QAAQ,YAAY,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC,CAC9D,CAAC;QAEF,2CAA2C;QAC3C,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,KAAK,EAAE,YAAY,CAAC,CAAC;QAEtD,oEAAoE;QACpE,IAAI,UAAkC,CAAC;QACvC,IAAI,iBAAiB,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC3C,MAAM,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;YAC3C,MAAM,MAAM,GAAG,MAAM,iBAAiB,CAAC,kBAAkB,CAAC,KAAK,CAAC,CAAC;YACjE,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;gBACpB,UAAU,GAAG,MAAM,CAAC;YACtB,CAAC;QACH,CAAC;QAED,OAAO;YACL,KAAK,EAAE,MAAM,CAAC,KAAK;YACnB,MAAM;YACN,UAAU;YACV,QAAQ,EAAE,MAAM,CAAC,QAAQ;SAC1B,CAAC;IACJ,CAAC;IAED,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,CAAC;AAChC,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"document-processor.test.d.ts","sourceRoot":"","sources":["../../src/document/document-processor.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
import { describe, it, expect, vi } from 'vitest';
|
|
2
|
+
import { createDocumentProcessor } from './document-processor.js';
|
|
3
|
+
describe('createDocumentProcessor', () => {
    const processor = createDocumentProcessor();

    // Options for a plain-text document; each test overrides only what it cares about.
    const textDoc = (documentId, fileData, overrides = {}) => ({
        documentId,
        tenantId: 'tenant-1',
        fileName: 'test.txt',
        mimeType: 'text/plain',
        fileData,
        ...overrides,
    });

    // Embedding provider whose generateEmbeddings mock resolves to `resolved`.
    const embeddingStub = (resolved) => ({
        dimensions: 384,
        generateEmbeddings: vi.fn().mockResolvedValue(resolved),
    });

    describe('getParser', () => {
        it('returns text parser for text/plain', () => {
            const found = processor.getParser('text/plain');
            expect(found).not.toBeNull();
            expect(found.mimeTypes).toContain('text/plain');
        });

        it('returns text parser for text/markdown', () => {
            expect(processor.getParser('text/markdown')).not.toBeNull();
        });

        it('returns text parser for application/json', () => {
            expect(processor.getParser('application/json')).not.toBeNull();
        });

        it('returns null for unsupported MIME types', () => {
            expect(processor.getParser('application/pdf')).toBeNull();
        });

        it('handles MIME type with charset parameter', () => {
            expect(processor.getParser('text/plain; charset=utf-8')).not.toBeNull();
        });

        it('is case-insensitive', () => {
            expect(processor.getParser('TEXT/PLAIN')).not.toBeNull();
        });
    });

    describe('process', () => {
        it('processes a text document end-to-end', async () => {
            const { pages, chunks, metadata, embeddings } = await processor.process(
                textDoc('doc-1', Buffer.from('Hello, world!')),
            );
            expect(pages).toHaveLength(1);
            expect(pages[0].content).toBe('Hello, world!');
            expect(chunks.length).toBeGreaterThanOrEqual(1);
            expect(chunks[0].content).toContain('Hello, world!');
            expect(metadata.pageCount).toBe(1);
            expect(embeddings).toBeUndefined();
        });

        it('processes Uint8Array input', async () => {
            const bytes = new TextEncoder().encode('From Uint8Array');
            const outcome = await processor.process(textDoc('doc-2', bytes));
            expect(outcome.pages[0].content).toBe('From Uint8Array');
        });

        it('falls back to text parser for unknown MIME types', async () => {
            const outcome = await processor.process(
                textDoc('doc-3', Buffer.from('Unknown format but readable text'), {
                    fileName: 'unknown.xyz',
                    mimeType: 'application/x-unknown',
                }),
            );
            expect(outcome.pages).toHaveLength(1);
            expect(outcome.pages[0].content).toBe('Unknown format but readable text');
        });

        it('reports correct page and chunk counts', async () => {
            // ~2000 chars, so a 500-char chunk size must yield several chunks.
            const longText = 'This is a sentence. '.repeat(100);
            const outcome = await processor.process(
                textDoc('doc-4', Buffer.from(longText), {
                    fileName: 'long.txt',
                    chunkOptions: { chunkSize: 500, overlap: 0 },
                }),
            );
            expect(outcome.pages).toHaveLength(1);
            expect(outcome.chunks.length).toBeGreaterThan(1);
            expect(outcome.metadata.pageCount).toBe(1);
        });

        it('generates embeddings when provider is given', async () => {
            const provider = embeddingStub([
                [0.1, 0.2, 0.3],
                [0.4, 0.5, 0.6],
            ]);
            const body = 'First paragraph.\n\nSecond paragraph.';
            const outcome = await processor.process(
                textDoc('doc-5', Buffer.from(body), {
                    chunkOptions: { chunkSize: 20, overlap: 0 },
                    embeddingProvider: provider,
                }),
            );
            expect(outcome.embeddings).toBeDefined();
            expect(provider.generateEmbeddings).toHaveBeenCalledTimes(1);
        });

        it('returns undefined embeddings when provider returns null', async () => {
            const provider = embeddingStub(null);
            const outcome = await processor.process(
                textDoc('doc-6', Buffer.from('Some text'), { embeddingProvider: provider }),
            );
            expect(outcome.embeddings).toBeUndefined();
        });

        it('skips embedding for empty content', async () => {
            const provider = embeddingStub([]);
            const outcome = await processor.process(
                textDoc('doc-7', Buffer.from(''), {
                    fileName: 'empty.txt',
                    embeddingProvider: provider,
                }),
            );
            // Empty content produces no chunks, so embedding should not be called
            expect(provider.generateEmbeddings).not.toHaveBeenCalled();
            expect(outcome.embeddings).toBeUndefined();
        });

        it('respects chunk options', async () => {
            // 1000 chars split at 200 with 50 overlap must give more than one chunk.
            const body = 'word '.repeat(200);
            const outcome = await processor.process(
                textDoc('doc-8', Buffer.from(body), {
                    chunkOptions: { chunkSize: 200, overlap: 50 },
                }),
            );
            expect(outcome.chunks.length).toBeGreaterThan(1);
        });
    });

    describe('custom parsers', () => {
        // Parser stub claiming `mimeTypes` whose parse() resolves to `parsed`.
        const parserStub = (mimeTypes, parsed) => ({
            mimeTypes,
            parse: vi.fn().mockResolvedValue(parsed),
        });

        it('uses custom parser when registered', async () => {
            const pdfParser = parserStub(['application/pdf'], {
                pages: [
                    { pageNumber: 1, content: 'PDF page 1' },
                    { pageNumber: 2, content: 'PDF page 2' },
                ],
                fullText: 'PDF page 1\nPDF page 2',
                metadata: { pageCount: 2, title: 'Test PDF' },
            });
            const outcome = await createDocumentProcessor([pdfParser]).process(
                textDoc('doc-pdf', Buffer.from('fake pdf data'), {
                    fileName: 'test.pdf',
                    mimeType: 'application/pdf',
                }),
            );
            expect(pdfParser.parse).toHaveBeenCalledTimes(1);
            expect(outcome.pages).toHaveLength(2);
            expect(outcome.metadata.pageCount).toBe(2);
            expect(outcome.metadata.title).toBe('Test PDF');
        });

        it('custom parser takes precedence over built-in', async () => {
            const overridingParser = parserStub(['text/plain'], {
                pages: [{ pageNumber: 1, content: 'CUSTOM' }],
                fullText: 'CUSTOM',
                metadata: { pageCount: 1 },
            });
            const outcome = await createDocumentProcessor([overridingParser]).process(
                textDoc('doc-custom', Buffer.from('original')),
            );
            expect(outcome.pages[0].content).toBe('CUSTOM');
        });
    });
});
|
|
197
|
+
//# sourceMappingURL=document-processor.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"document-processor.test.js","sourceRoot":"","sources":["../../src/document/document-processor.test.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,QAAQ,CAAC;AAElD,OAAO,EAAE,uBAAuB,EAAE,MAAM,sBAAsB,CAAC;AAE/D,QAAQ,CAAC,yBAAyB,EAAE,GAAG,EAAE;IACvC,MAAM,SAAS,GAAG,uBAAuB,EAAE,CAAC;IAE5C,QAAQ,CAAC,WAAW,EAAE,GAAG,EAAE;QACzB,EAAE,CAAC,oCAAoC,EAAE,GAAG,EAAE;YAC5C,MAAM,MAAM,GAAG,SAAS,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;YACjD,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;YAC9B,MAAM,CAAC,MAAO,CAAC,SAAS,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;QACpD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;YAC/C,MAAM,MAAM,GAAG,SAAS,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC;YACpD,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;QAChC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;YAClD,MAAM,MAAM,GAAG,SAAS,CAAC,SAAS,CAAC,kBAAkB,CAAC,CAAC;YACvD,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;QAChC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,yCAAyC,EAAE,GAAG,EAAE;YACjD,MAAM,MAAM,GAAG,SAAS,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;YACtD,MAAM,CAAC,MAAM,CAAC,CAAC,QAAQ,EAAE,CAAC;QAC5B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;YAClD,MAAM,MAAM,GAAG,SAAS,CAAC,SAAS,CAAC,2BAA2B,CAAC,CAAC;YAChE,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;QAChC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,qBAAqB,EAAE,GAAG,EAAE;YAC7B,MAAM,MAAM,GAAG,SAAS,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;YACjD,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;QAChC,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,SAAS,EAAE,GAAG,EAAE;QACvB,EAAE,CAAC,sCAAsC,EAAE,KAAK,IAAI,EAAE;YACpD,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,OAAO,CAAC;gBACrC,UAAU,EAAE,OAAO;gBACnB,QAAQ,EAAE,UAAU;gBACpB,QAAQ,EAAE,UAAU;gBACpB,QAAQ,EAAE,YAAY;gBACtB,QAAQ,EAAE,MAAM,CAAC,IAAI,CAAC,eAAe,CAAC;aACvC,CAAC,CAAC;YAEH,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YACrC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;YACtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;YACvD,MAAM,CA
AC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC;YAC5D,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC1C,MAAM,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,aAAa,EAAE,CAAC;QAC5C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4BAA4B,EAAE,KAAK,IAAI,EAAE;YAC1C,MAAM,OAAO,GAAG,IAAI,WAAW,EAAE,CAAC;YAClC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,OAAO,CAAC;gBACrC,UAAU,EAAE,OAAO;gBACnB,QAAQ,EAAE,UAAU;gBACpB,QAAQ,EAAE,UAAU;gBACpB,QAAQ,EAAE,YAAY;gBACtB,QAAQ,EAAE,OAAO,CAAC,MAAM,CAAC,iBAAiB,CAAC;aAC5C,CAAC,CAAC;YAEH,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;QAC1D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,kDAAkD,EAAE,KAAK,IAAI,EAAE;YAChE,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,OAAO,CAAC;gBACrC,UAAU,EAAE,OAAO;gBACnB,QAAQ,EAAE,UAAU;gBACpB,QAAQ,EAAE,aAAa;gBACvB,QAAQ,EAAE,uBAAuB;gBACjC,QAAQ,EAAE,MAAM,CAAC,IAAI,CAAC,kCAAkC,CAAC;aAC1D,CAAC,CAAC;YAEH,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YACrC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,kCAAkC,CAAC,CAAC;QAC3E,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,uCAAuC,EAAE,KAAK,IAAI,EAAE;YACrD,MAAM,QAAQ,GAAG,sBAAsB,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,cAAc;YACnE,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,OAAO,CAAC;gBACrC,UAAU,EAAE,OAAO;gBACnB,QAAQ,EAAE,UAAU;gBACpB,QAAQ,EAAE,UAAU;gBACpB,QAAQ,EAAE,YAAY;gBACtB,QAAQ,EAAE,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC;gBAC/B,YAAY,EAAE,EAAE,SAAS,EAAE,GAAG,EAAE,OAAO,EAAE,CAAC,EAAE;aAC7C,CAAC,CAAC;YAEH,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YACrC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;YAChD,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC5C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;YAC3D,MAAM,YAAY,GAAsB;gBACtC,UAAU,EAAE,GAAG;gBACf,kBAAkB,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC;oBAC5C,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC;oBACf,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC;iBAChB,CAAC;aACH,CAAC;YAEF,MAAM,IAAI,GAAG,uCAAuC,CAAC;YACrD,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,OAAO,CAAC;gBACrC,UAAU,EAAE,OAAO
;gBACnB,QAAQ,EAAE,UAAU;gBACpB,QAAQ,EAAE,UAAU;gBACpB,QAAQ,EAAE,YAAY;gBACtB,QAAQ,EAAE,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC;gBAC3B,YAAY,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,OAAO,EAAE,CAAC,EAAE;gBAC3C,iBAAiB,EAAE,YAAY;aAChC,CAAC,CAAC;YAEH,MAAM,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,WAAW,EAAE,CAAC;YACxC,MAAM,CAAC,YAAY,CAAC,kBAAkB,CAAC,CAAC,qBAAqB,CAAC,CAAC,CAAC,CAAC;QACnE,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,yDAAyD,EAAE,KAAK,IAAI,EAAE;YACvE,MAAM,YAAY,GAAsB;gBACtC,UAAU,EAAE,GAAG;gBACf,kBAAkB,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,IAAI,CAAC;aACpD,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,OAAO,CAAC;gBACrC,UAAU,EAAE,OAAO;gBACnB,QAAQ,EAAE,UAAU;gBACpB,QAAQ,EAAE,UAAU;gBACpB,QAAQ,EAAE,YAAY;gBACtB,QAAQ,EAAE,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC;gBAClC,iBAAiB,EAAE,YAAY;aAChC,CAAC,CAAC;YAEH,MAAM,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,aAAa,EAAE,CAAC;QAC5C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,mCAAmC,EAAE,KAAK,IAAI,EAAE;YACjD,MAAM,YAAY,GAAsB;gBACtC,UAAU,EAAE,GAAG;gBACf,kBAAkB,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,EAAE,CAAC;aAClD,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,OAAO,CAAC;gBACrC,UAAU,EAAE,OAAO;gBACnB,QAAQ,EAAE,UAAU;gBACpB,QAAQ,EAAE,WAAW;gBACrB,QAAQ,EAAE,YAAY;gBACtB,QAAQ,EAAE,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC;gBACzB,iBAAiB,EAAE,YAAY;aAChC,CAAC,CAAC;YAEH,sEAAsE;YACtE,MAAM,CAAC,YAAY,CAAC,kBAAkB,CAAC,CAAC,GAAG,CAAC,gBAAgB,EAAE,CAAC;YAC/D,MAAM,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,aAAa,EAAE,CAAC;QAC5C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,wBAAwB,EAAE,KAAK,IAAI,EAAE;YACtC,MAAM,IAAI,GAAG,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,aAAa;YAC/C,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,OAAO,CAAC;gBACrC,UAAU,EAAE,OAAO;gBACnB,QAAQ,EAAE,UAAU;gBACpB,QAAQ,EAAE,UAAU;gBACpB,QAAQ,EAAE,YAAY;gBACtB,QAAQ,EAAE,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC;gBAC3B,YAAY,EAAE,EAAE,SAAS,EAAE,GAAG,EAAE,OAAO,EAAE,EAAE,EAAE;aAC9C,CAAC,CAAC;YAEH,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QAClD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;QAC9B,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;YAClD,MAAM,YAAY,GAAmB;gBACnC,SAAS,EAAE,CAAC,iBAAiB,CAAC;gBAC9B,KAAK,EAAE,EAA
E,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC;oBAC/B,KAAK,EAAE;wBACL,EAAE,UAAU,EAAE,CAAC,EAAE,OAAO,EAAE,YAAY,EAAE;wBACxC,EAAE,UAAU,EAAE,CAAC,EAAE,OAAO,EAAE,YAAY,EAAE;qBACzC;oBACD,QAAQ,EAAE,wBAAwB;oBAClC,QAAQ,EAAE,EAAE,SAAS,EAAE,CAAC,EAAE,KAAK,EAAE,UAAU,EAAE;iBAC9C,CAAC;aACH,CAAC;YAEF,MAAM,eAAe,GAAG,uBAAuB,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC;YAEhE,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,OAAO,CAAC;gBAC3C,UAAU,EAAE,SAAS;gBACrB,QAAQ,EAAE,UAAU;gBACpB,QAAQ,EAAE,UAAU;gBACpB,QAAQ,EAAE,iBAAiB;gBAC3B,QAAQ,EAAE,MAAM,CAAC,IAAI,CAAC,eAAe,CAAC;aACvC,CAAC,CAAC;YAEH,MAAM,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC,qBAAqB,CAAC,CAAC,CAAC,CAAC;YACpD,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YACrC,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC1C,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;YAC5D,MAAM,gBAAgB,GAAmB;gBACvC,SAAS,EAAE,CAAC,YAAY,CAAC;gBACzB,KAAK,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC;oBAC/B,KAAK,EAAE,CAAC,EAAE,UAAU,EAAE,CAAC,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC;oBAC7C,QAAQ,EAAE,QAAQ;oBAClB,QAAQ,EAAE,EAAE,SAAS,EAAE,CAAC,EAAE;iBAC3B,CAAC;aACH,CAAC;YAEF,MAAM,eAAe,GAAG,uBAAuB,CAAC,CAAC,gBAAgB,CAAC,CAAC,CAAC;YAEpE,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,OAAO,CAAC;gBAC3C,UAAU,EAAE,YAAY;gBACxB,QAAQ,EAAE,UAAU;gBACpB,QAAQ,EAAE,UAAU;gBACpB,QAAQ,EAAE,YAAY;gBACtB,QAAQ,EAAE,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC;aAClC,CAAC,CAAC;YAEH,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
export { chunkText, chunkPages } from './chunk-generator.js';
|
|
2
|
+
export { createDocumentProcessor } from './document-processor.js';
|
|
3
|
+
export type { ProcessDocumentOptions, ProcessDocumentResult } from './document-processor.js';
|
|
4
|
+
export { textParser } from './parsers/index.js';
|
|
5
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/document/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAC1D,OAAO,EAAE,uBAAuB,EAAE,MAAM,sBAAsB,CAAC;AAC/D,YAAY,EAAE,sBAAsB,EAAE,qBAAqB,EAAE,MAAM,sBAAsB,CAAC;AAC1F,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/document/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAC1D,OAAO,EAAE,uBAAuB,EAAE,MAAM,sBAAsB,CAAC;AAE/D,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/document/parsers/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,eAAe,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/document/parsers/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,eAAe,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"text-parser.d.ts","sourceRoot":"","sources":["../../../src/document/parsers/text-parser.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAkB,MAAM,kBAAkB,CAAC;AAWvE,qFAAqF;AACrF,eAAO,MAAM,UAAU,EAAE,cAexB,CAAC"}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
// MIME types the built-in text parser claims.
const TEXT_MIME_TYPES = [
    'text/plain',
    'text/csv',
    'text/markdown',
    'application/json',
    'text/xml',
    'application/xml',
];

/**
 * Parser for text-based document formats (plain text, CSV, Markdown, JSON, XML).
 *
 * The whole input is decoded as UTF-8 and returned as a single page.
 */
export const textParser = {
    mimeTypes: TEXT_MIME_TYPES,

    /**
     * Decode raw bytes as UTF-8 text.
     *
     * @param {Buffer|Uint8Array} buffer Raw file bytes.
     * @returns {Promise<{pages: Array<{pageNumber: number, content: string}>, fullText: string, metadata: {pageCount: number}}>}
     *   One page holding the full decoded text.
     */
    async parse(buffer) {
        // Buffer is a Uint8Array subclass, so byte inputs are decoded in place
        // without the defensive copy; anything else is converted first.
        const bytes = buffer instanceof Uint8Array ? buffer : Buffer.from(buffer);
        // One decoder for every input type: TextDecoder drops a leading UTF-8
        // BOM, so Buffer and Uint8Array inputs with the same bytes now yield the
        // same text (Buffer#toString used to keep the BOM as U+FEFF).
        const text = new TextDecoder('utf-8').decode(bytes);
        return {
            pages: [{ pageNumber: 1, content: text }],
            fullText: text,
            metadata: { pageCount: 1 },
        };
    },
};
|
|
23
|
+
//# sourceMappingURL=text-parser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"text-parser.js","sourceRoot":"","sources":["../../../src/document/parsers/text-parser.ts"],"names":[],"mappings":"AAEA,MAAM,eAAe,GAAG;IACtB,YAAY;IACZ,UAAU;IACV,eAAe;IACf,kBAAkB;IAClB,UAAU;IACV,iBAAiB;CAClB,CAAC;AAEF,qFAAqF;AACrF,MAAM,CAAC,MAAM,UAAU,GAAmB;IACxC,SAAS,EAAE,eAAe;IAE1B,KAAK,CAAC,KAAK,CAAC,MAA2B;QACrC,MAAM,IAAI,GACR,MAAM,YAAY,UAAU,IAAI,CAAC,CAAC,MAAM,YAAY,MAAM,CAAC;YACzD,CAAC,CAAC,IAAI,WAAW,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC;YACzC,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;QAE5C,OAAO;YACL,KAAK,EAAE,CAAC,EAAE,UAAU,EAAE,CAAC,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;YACzC,QAAQ,EAAE,IAAI;YACd,QAAQ,EAAE,EAAE,SAAS,EAAE,CAAC,EAAE;SAC3B,CAAC;IACJ,CAAC;CACF,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"text-parser.test.d.ts","sourceRoot":"","sources":["../../../src/document/parsers/text-parser.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { textParser } from './text-parser.js';
|
|
3
|
+
describe('textParser', () => {
    // Parse a string by first encoding it into a Buffer.
    const parseString = (text) => textParser.parse(Buffer.from(text));

    it('lists supported MIME types', () => {
        const supported = [
            'text/plain',
            'text/csv',
            'text/markdown',
            'application/json',
            'text/xml',
            'application/xml',
        ];
        for (const mimeType of supported) {
            expect(textParser.mimeTypes).toContain(mimeType);
        }
    });

    it('parses plain text from Buffer', async () => {
        const { fullText, pages, metadata } = await parseString('Hello, world!');
        expect(fullText).toBe('Hello, world!');
        expect(pages).toHaveLength(1);
        expect(pages[0].pageNumber).toBe(1);
        expect(pages[0].content).toBe('Hello, world!');
        expect(metadata.pageCount).toBe(1);
    });

    it('parses plain text from Uint8Array', async () => {
        const bytes = new TextEncoder().encode('Uint8Array content');
        const { fullText, pages } = await textParser.parse(bytes);
        expect(fullText).toBe('Uint8Array content');
        expect(pages).toHaveLength(1);
        expect(pages[0].content).toBe('Uint8Array content');
    });

    it('parses JSON content', async () => {
        const json = JSON.stringify({ key: 'value', nested: { a: 1 } });
        const { fullText, pages } = await parseString(json);
        expect(fullText).toBe(json);
        expect(pages).toHaveLength(1);
    });

    it('handles empty input', async () => {
        const { fullText, pages, metadata } = await parseString('');
        expect(fullText).toBe('');
        expect(pages).toHaveLength(1);
        expect(pages[0].content).toBe('');
        expect(metadata.pageCount).toBe(1);
    });

    it('handles multi-line text', async () => {
        const text = 'Line 1\nLine 2\nLine 3';
        const { fullText, pages } = await parseString(text);
        expect(fullText).toBe(text);
        expect(pages[0].content).toBe(text);
    });

    it('handles UTF-8 characters', async () => {
        const text = 'Héllo wörld 你好 🌍';
        const { fullText } = await textParser.parse(Buffer.from(text, 'utf-8'));
        expect(fullText).toBe(text);
    });

    it('returns single page for all content', async () => {
        const { pages } = await parseString('Page content here');
        expect(pages).toHaveLength(1);
        expect(pages[0].pageNumber).toBe(1);
    });
});
|
|
64
|
+
//# sourceMappingURL=text-parser.test.js.map
|