@revealui/ai 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/LICENSE.commercial +112 -0
- package/README.md +314 -0
- package/dist/a2a/card.d.ts +26 -0
- package/dist/a2a/card.d.ts.map +1 -0
- package/dist/a2a/card.js +173 -0
- package/dist/a2a/handler.d.ts +26 -0
- package/dist/a2a/handler.d.ts.map +1 -0
- package/dist/a2a/handler.js +170 -0
- package/dist/a2a/index.d.ts +10 -0
- package/dist/a2a/index.d.ts.map +1 -0
- package/dist/a2a/index.js +9 -0
- package/dist/a2a/task-store.d.ts +42 -0
- package/dist/a2a/task-store.d.ts.map +1 -0
- package/dist/a2a/task-store.js +99 -0
- package/dist/audit/emitter.d.ts +34 -0
- package/dist/audit/emitter.d.ts.map +1 -0
- package/dist/audit/emitter.js +34 -0
- package/dist/audit/index.d.ts +44 -0
- package/dist/audit/index.d.ts.map +1 -0
- package/dist/audit/index.js +48 -0
- package/dist/audit/observer.d.ts +108 -0
- package/dist/audit/observer.d.ts.map +1 -0
- package/dist/audit/observer.js +271 -0
- package/dist/audit/policy.d.ts +70 -0
- package/dist/audit/policy.d.ts.map +1 -0
- package/dist/audit/policy.js +209 -0
- package/dist/audit/store.d.ts +42 -0
- package/dist/audit/store.d.ts.map +1 -0
- package/dist/audit/store.js +80 -0
- package/dist/audit/types.d.ts +169 -0
- package/dist/audit/types.d.ts.map +1 -0
- package/dist/audit/types.js +80 -0
- package/dist/client/hooks/index.d.ts +22 -0
- package/dist/client/hooks/index.d.ts.map +1 -0
- package/dist/client/hooks/index.js +21 -0
- package/dist/client/hooks/useAgentContext.d.ts +30 -0
- package/dist/client/hooks/useAgentContext.d.ts.map +1 -0
- package/dist/client/hooks/useAgentContext.js +161 -0
- package/dist/client/hooks/useAgentEvents.d.ts +126 -0
- package/dist/client/hooks/useAgentEvents.d.ts.map +1 -0
- package/dist/client/hooks/useAgentEvents.js +232 -0
- package/dist/client/hooks/useAgentStream.d.ts +44 -0
- package/dist/client/hooks/useAgentStream.d.ts.map +1 -0
- package/dist/client/hooks/useAgentStream.js +101 -0
- package/dist/client/hooks/useEpisodicMemory.d.ts +25 -0
- package/dist/client/hooks/useEpisodicMemory.d.ts.map +1 -0
- package/dist/client/hooks/useEpisodicMemory.js +174 -0
- package/dist/client/hooks/useWorkingMemory.d.ts +57 -0
- package/dist/client/hooks/useWorkingMemory.d.ts.map +1 -0
- package/dist/client/hooks/useWorkingMemory.js +276 -0
- package/dist/client/index.d.ts +14 -0
- package/dist/client/index.d.ts.map +1 -0
- package/dist/client/index.js +13 -0
- package/dist/embeddings/index.d.ts +51 -0
- package/dist/embeddings/index.d.ts.map +1 -0
- package/dist/embeddings/index.js +73 -0
- package/dist/index.d.ts +83 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +103 -0
- package/dist/inference/context-assembly.d.ts +27 -0
- package/dist/inference/context-assembly.d.ts.map +1 -0
- package/dist/inference/context-assembly.js +81 -0
- package/dist/inference/overflow-compressor.d.ts +17 -0
- package/dist/inference/overflow-compressor.d.ts.map +1 -0
- package/dist/inference/overflow-compressor.js +40 -0
- package/dist/inference/runRag.d.ts +35 -0
- package/dist/inference/runRag.d.ts.map +1 -0
- package/dist/inference/runRag.js +53 -0
- package/dist/ingestion/bm25.d.ts +29 -0
- package/dist/ingestion/bm25.d.ts.map +1 -0
- package/dist/ingestion/bm25.js +161 -0
- package/dist/ingestion/cms-indexer.d.ts +39 -0
- package/dist/ingestion/cms-indexer.d.ts.map +1 -0
- package/dist/ingestion/cms-indexer.js +74 -0
- package/dist/ingestion/file-parsers.d.ts +51 -0
- package/dist/ingestion/file-parsers.d.ts.map +1 -0
- package/dist/ingestion/file-parsers.js +247 -0
- package/dist/ingestion/hybrid-search.d.ts +22 -0
- package/dist/ingestion/hybrid-search.d.ts.map +1 -0
- package/dist/ingestion/hybrid-search.js +63 -0
- package/dist/ingestion/index.d.ts +9 -0
- package/dist/ingestion/index.d.ts.map +1 -0
- package/dist/ingestion/index.js +8 -0
- package/dist/ingestion/pipeline.d.ts +35 -0
- package/dist/ingestion/pipeline.d.ts.map +1 -0
- package/dist/ingestion/pipeline.js +114 -0
- package/dist/ingestion/rag-vector-service.d.ts +34 -0
- package/dist/ingestion/rag-vector-service.d.ts.map +1 -0
- package/dist/ingestion/rag-vector-service.js +98 -0
- package/dist/ingestion/reranker.d.ts +10 -0
- package/dist/ingestion/reranker.d.ts.map +1 -0
- package/dist/ingestion/reranker.js +41 -0
- package/dist/ingestion/text-splitter.d.ts +25 -0
- package/dist/ingestion/text-splitter.d.ts.map +1 -0
- package/dist/ingestion/text-splitter.js +119 -0
- package/dist/llm/cache-utils.d.ts +146 -0
- package/dist/llm/cache-utils.d.ts.map +1 -0
- package/dist/llm/cache-utils.js +204 -0
- package/dist/llm/client.d.ts +134 -0
- package/dist/llm/client.d.ts.map +1 -0
- package/dist/llm/client.js +497 -0
- package/dist/llm/key-validator.d.ts +25 -0
- package/dist/llm/key-validator.d.ts.map +1 -0
- package/dist/llm/key-validator.js +101 -0
- package/dist/llm/provider-health.d.ts +40 -0
- package/dist/llm/provider-health.d.ts.map +1 -0
- package/dist/llm/provider-health.js +97 -0
- package/dist/llm/providers/anthropic.d.ts +31 -0
- package/dist/llm/providers/anthropic.d.ts.map +1 -0
- package/dist/llm/providers/anthropic.js +248 -0
- package/dist/llm/providers/base.d.ts +111 -0
- package/dist/llm/providers/base.d.ts.map +1 -0
- package/dist/llm/providers/base.js +6 -0
- package/dist/llm/providers/groq.d.ts +23 -0
- package/dist/llm/providers/groq.d.ts.map +1 -0
- package/dist/llm/providers/groq.js +27 -0
- package/dist/llm/providers/ollama.d.ts +27 -0
- package/dist/llm/providers/ollama.d.ts.map +1 -0
- package/dist/llm/providers/ollama.js +48 -0
- package/dist/llm/providers/openai.d.ts +19 -0
- package/dist/llm/providers/openai.d.ts.map +1 -0
- package/dist/llm/providers/openai.js +245 -0
- package/dist/llm/providers/vultr.d.ts +18 -0
- package/dist/llm/providers/vultr.d.ts.map +1 -0
- package/dist/llm/providers/vultr.js +168 -0
- package/dist/llm/response-cache.d.ts +166 -0
- package/dist/llm/response-cache.d.ts.map +1 -0
- package/dist/llm/response-cache.js +233 -0
- package/dist/llm/semantic-cache.d.ts +179 -0
- package/dist/llm/semantic-cache.d.ts.map +1 -0
- package/dist/llm/semantic-cache.js +306 -0
- package/dist/llm/server.d.ts +14 -0
- package/dist/llm/server.d.ts.map +1 -0
- package/dist/llm/server.js +15 -0
- package/dist/llm/token-counter.d.ts +48 -0
- package/dist/llm/token-counter.d.ts.map +1 -0
- package/dist/llm/token-counter.js +77 -0
- package/dist/llm/workspace-provider-config.d.ts +38 -0
- package/dist/llm/workspace-provider-config.d.ts.map +1 -0
- package/dist/llm/workspace-provider-config.js +47 -0
- package/dist/memory/agent/context-manager.d.ts +148 -0
- package/dist/memory/agent/context-manager.d.ts.map +1 -0
- package/dist/memory/agent/context-manager.js +284 -0
- package/dist/memory/agent/index.d.ts +7 -0
- package/dist/memory/agent/index.d.ts.map +1 -0
- package/dist/memory/agent/index.js +6 -0
- package/dist/memory/crdt/index.d.ts +13 -0
- package/dist/memory/crdt/index.d.ts.map +1 -0
- package/dist/memory/crdt/index.js +12 -0
- package/dist/memory/crdt/lww-register.d.ts +108 -0
- package/dist/memory/crdt/lww-register.d.ts.map +1 -0
- package/dist/memory/crdt/lww-register.js +169 -0
- package/dist/memory/crdt/or-set.d.ts +141 -0
- package/dist/memory/crdt/or-set.d.ts.map +1 -0
- package/dist/memory/crdt/or-set.js +291 -0
- package/dist/memory/crdt/pn-counter.d.ts +116 -0
- package/dist/memory/crdt/pn-counter.d.ts.map +1 -0
- package/dist/memory/crdt/pn-counter.js +174 -0
- package/dist/memory/crdt/vector-clock.d.ts +115 -0
- package/dist/memory/crdt/vector-clock.d.ts.map +1 -0
- package/dist/memory/crdt/vector-clock.js +179 -0
- package/dist/memory/errors/index.d.ts +56 -0
- package/dist/memory/errors/index.d.ts.map +1 -0
- package/dist/memory/errors/index.js +85 -0
- package/dist/memory/index.d.ts +21 -0
- package/dist/memory/index.d.ts.map +1 -0
- package/dist/memory/index.js +20 -0
- package/dist/memory/persistence/crdt-persistence.d.ts +85 -0
- package/dist/memory/persistence/crdt-persistence.d.ts.map +1 -0
- package/dist/memory/persistence/crdt-persistence.js +204 -0
- package/dist/memory/persistence/index.d.ts +7 -0
- package/dist/memory/persistence/index.d.ts.map +1 -0
- package/dist/memory/persistence/index.js +6 -0
- package/dist/memory/preferences/index.d.ts +7 -0
- package/dist/memory/preferences/index.d.ts.map +1 -0
- package/dist/memory/preferences/index.js +6 -0
- package/dist/memory/preferences/user-preferences-manager.d.ts +133 -0
- package/dist/memory/preferences/user-preferences-manager.d.ts.map +1 -0
- package/dist/memory/preferences/user-preferences-manager.js +342 -0
- package/dist/memory/services/index.d.ts +8 -0
- package/dist/memory/services/index.d.ts.map +1 -0
- package/dist/memory/services/index.js +6 -0
- package/dist/memory/services/node-id-service.d.ts +75 -0
- package/dist/memory/services/node-id-service.d.ts.map +1 -0
- package/dist/memory/services/node-id-service.js +190 -0
- package/dist/memory/stores/episodic-memory.d.ts +182 -0
- package/dist/memory/stores/episodic-memory.d.ts.map +1 -0
- package/dist/memory/stores/episodic-memory.js +378 -0
- package/dist/memory/stores/index.d.ts +16 -0
- package/dist/memory/stores/index.d.ts.map +1 -0
- package/dist/memory/stores/index.js +15 -0
- package/dist/memory/stores/procedural-memory.d.ts +89 -0
- package/dist/memory/stores/procedural-memory.d.ts.map +1 -0
- package/dist/memory/stores/procedural-memory.js +152 -0
- package/dist/memory/stores/semantic-memory.d.ts +92 -0
- package/dist/memory/stores/semantic-memory.d.ts.map +1 -0
- package/dist/memory/stores/semantic-memory.js +155 -0
- package/dist/memory/stores/working-memory.d.ts +225 -0
- package/dist/memory/stores/working-memory.d.ts.map +1 -0
- package/dist/memory/stores/working-memory.js +336 -0
- package/dist/memory/utils/deep-clone.d.ts +10 -0
- package/dist/memory/utils/deep-clone.d.ts.map +1 -0
- package/dist/memory/utils/deep-clone.js +9 -0
- package/dist/memory/utils/index.d.ts +8 -0
- package/dist/memory/utils/index.d.ts.map +1 -0
- package/dist/memory/utils/index.js +7 -0
- package/dist/memory/utils/logger.d.ts +21 -0
- package/dist/memory/utils/logger.d.ts.map +1 -0
- package/dist/memory/utils/logger.js +62 -0
- package/dist/memory/utils/sql-helpers.d.ts +97 -0
- package/dist/memory/utils/sql-helpers.d.ts.map +1 -0
- package/dist/memory/utils/sql-helpers.js +214 -0
- package/dist/memory/utils/validation.d.ts +62 -0
- package/dist/memory/utils/validation.d.ts.map +1 -0
- package/dist/memory/utils/validation.js +244 -0
- package/dist/memory/vector/index.d.ts +12 -0
- package/dist/memory/vector/index.d.ts.map +1 -0
- package/dist/memory/vector/index.js +14 -0
- package/dist/memory/vector/vector-memory-service.d.ts +88 -0
- package/dist/memory/vector/vector-memory-service.d.ts.map +1 -0
- package/dist/memory/vector/vector-memory-service.js +335 -0
- package/dist/observability/logger.d.ts +79 -0
- package/dist/observability/logger.d.ts.map +1 -0
- package/dist/observability/logger.js +165 -0
- package/dist/observability/metrics.d.ts +43 -0
- package/dist/observability/metrics.d.ts.map +1 -0
- package/dist/observability/metrics.js +197 -0
- package/dist/observability/query.d.ts +150 -0
- package/dist/observability/query.d.ts.map +1 -0
- package/dist/observability/query.js +339 -0
- package/dist/observability/types.d.ts +140 -0
- package/dist/observability/types.d.ts.map +1 -0
- package/dist/observability/types.js +6 -0
- package/dist/orchestration/agent.d.ts +98 -0
- package/dist/orchestration/agent.d.ts.map +1 -0
- package/dist/orchestration/agent.js +6 -0
- package/dist/orchestration/defaults.d.ts +21 -0
- package/dist/orchestration/defaults.d.ts.map +1 -0
- package/dist/orchestration/defaults.js +22 -0
- package/dist/orchestration/memory-integration.d.ts +58 -0
- package/dist/orchestration/memory-integration.d.ts.map +1 -0
- package/dist/orchestration/memory-integration.js +130 -0
- package/dist/orchestration/orchestrator.d.ts +67 -0
- package/dist/orchestration/orchestrator.d.ts.map +1 -0
- package/dist/orchestration/orchestrator.js +174 -0
- package/dist/orchestration/runtime.d.ts +82 -0
- package/dist/orchestration/runtime.d.ts.map +1 -0
- package/dist/orchestration/runtime.js +251 -0
- package/dist/orchestration/streaming-runtime.d.ts +36 -0
- package/dist/orchestration/streaming-runtime.d.ts.map +1 -0
- package/dist/orchestration/streaming-runtime.js +175 -0
- package/dist/orchestration/ticket-agent.d.ts +70 -0
- package/dist/orchestration/ticket-agent.d.ts.map +1 -0
- package/dist/orchestration/ticket-agent.js +146 -0
- package/dist/skills/activation/index.d.ts +7 -0
- package/dist/skills/activation/index.d.ts.map +1 -0
- package/dist/skills/activation/index.js +6 -0
- package/dist/skills/activation/skill-activator.d.ts +68 -0
- package/dist/skills/activation/skill-activator.d.ts.map +1 -0
- package/dist/skills/activation/skill-activator.js +224 -0
- package/dist/skills/catalog/catalog-search.d.ts +55 -0
- package/dist/skills/catalog/catalog-search.d.ts.map +1 -0
- package/dist/skills/catalog/catalog-search.js +111 -0
- package/dist/skills/catalog/catalog-types.d.ts +81 -0
- package/dist/skills/catalog/catalog-types.d.ts.map +1 -0
- package/dist/skills/catalog/catalog-types.js +66 -0
- package/dist/skills/catalog/index.d.ts +9 -0
- package/dist/skills/catalog/index.d.ts.map +1 -0
- package/dist/skills/catalog/index.js +7 -0
- package/dist/skills/catalog/vercel-catalog.d.ts +42 -0
- package/dist/skills/catalog/vercel-catalog.d.ts.map +1 -0
- package/dist/skills/catalog/vercel-catalog.js +189 -0
- package/dist/skills/compat/index.d.ts +9 -0
- package/dist/skills/compat/index.d.ts.map +1 -0
- package/dist/skills/compat/index.js +8 -0
- package/dist/skills/compat/skill-enhancer.d.ts +37 -0
- package/dist/skills/compat/skill-enhancer.d.ts.map +1 -0
- package/dist/skills/compat/skill-enhancer.js +76 -0
- package/dist/skills/compat/tool-mapper.d.ts +61 -0
- package/dist/skills/compat/tool-mapper.d.ts.map +1 -0
- package/dist/skills/compat/tool-mapper.js +168 -0
- package/dist/skills/compat/vercel-compat.d.ts +33 -0
- package/dist/skills/compat/vercel-compat.d.ts.map +1 -0
- package/dist/skills/compat/vercel-compat.js +132 -0
- package/dist/skills/index.d.ts +40 -0
- package/dist/skills/index.d.ts.map +1 -0
- package/dist/skills/index.js +47 -0
- package/dist/skills/integration/agent-skill-provider.d.ts +94 -0
- package/dist/skills/integration/agent-skill-provider.d.ts.map +1 -0
- package/dist/skills/integration/agent-skill-provider.js +161 -0
- package/dist/skills/integration/index.d.ts +7 -0
- package/dist/skills/integration/index.d.ts.map +1 -0
- package/dist/skills/integration/index.js +6 -0
- package/dist/skills/loader/github-loader.d.ts +61 -0
- package/dist/skills/loader/github-loader.d.ts.map +1 -0
- package/dist/skills/loader/github-loader.js +176 -0
- package/dist/skills/loader/index.d.ts +10 -0
- package/dist/skills/loader/index.d.ts.map +1 -0
- package/dist/skills/loader/index.js +9 -0
- package/dist/skills/loader/local-loader.d.ts +56 -0
- package/dist/skills/loader/local-loader.d.ts.map +1 -0
- package/dist/skills/loader/local-loader.js +186 -0
- package/dist/skills/loader/vercel-loader.d.ts +64 -0
- package/dist/skills/loader/vercel-loader.d.ts.map +1 -0
- package/dist/skills/loader/vercel-loader.js +313 -0
- package/dist/skills/loader/vercel-types.d.ts +64 -0
- package/dist/skills/loader/vercel-types.d.ts.map +1 -0
- package/dist/skills/loader/vercel-types.js +55 -0
- package/dist/skills/parser/index.d.ts +7 -0
- package/dist/skills/parser/index.d.ts.map +1 -0
- package/dist/skills/parser/index.js +6 -0
- package/dist/skills/parser/skill-md-parser.d.ts +64 -0
- package/dist/skills/parser/skill-md-parser.d.ts.map +1 -0
- package/dist/skills/parser/skill-md-parser.js +242 -0
- package/dist/skills/registry/index.d.ts +7 -0
- package/dist/skills/registry/index.d.ts.map +1 -0
- package/dist/skills/registry/index.js +6 -0
- package/dist/skills/registry/skill-registry.d.ts +133 -0
- package/dist/skills/registry/skill-registry.d.ts.map +1 -0
- package/dist/skills/registry/skill-registry.js +373 -0
- package/dist/skills/types.d.ts +216 -0
- package/dist/skills/types.d.ts.map +1 -0
- package/dist/skills/types.js +176 -0
- package/dist/templates/agent-spec.d.ts +138 -0
- package/dist/templates/agent-spec.d.ts.map +1 -0
- package/dist/templates/agent-spec.js +138 -0
- package/dist/templates/index.d.ts +56 -0
- package/dist/templates/index.d.ts.map +1 -0
- package/dist/templates/index.js +58 -0
- package/dist/templates/prompt-spec.d.ts +140 -0
- package/dist/templates/prompt-spec.d.ts.map +1 -0
- package/dist/templates/prompt-spec.js +210 -0
- package/dist/templates/skill-spec.d.ts +106 -0
- package/dist/templates/skill-spec.d.ts.map +1 -0
- package/dist/templates/skill-spec.js +119 -0
- package/dist/tools/base.d.ts +74 -0
- package/dist/tools/base.d.ts.map +1 -0
- package/dist/tools/base.js +6 -0
- package/dist/tools/cms/collection-tools.d.ts +36 -0
- package/dist/tools/cms/collection-tools.d.ts.map +1 -0
- package/dist/tools/cms/collection-tools.js +178 -0
- package/dist/tools/cms/factory.d.ts +89 -0
- package/dist/tools/cms/factory.d.ts.map +1 -0
- package/dist/tools/cms/factory.js +462 -0
- package/dist/tools/cms/global-tools.d.ts +21 -0
- package/dist/tools/cms/global-tools.d.ts.map +1 -0
- package/dist/tools/cms/global-tools.js +92 -0
- package/dist/tools/cms/index.d.ts +11 -0
- package/dist/tools/cms/index.d.ts.map +1 -0
- package/dist/tools/cms/index.js +11 -0
- package/dist/tools/cms/media-tools.d.ts +31 -0
- package/dist/tools/cms/media-tools.d.ts.map +1 -0
- package/dist/tools/cms/media-tools.js +140 -0
- package/dist/tools/cms/user-tools.d.ts +31 -0
- package/dist/tools/cms/user-tools.d.ts.map +1 -0
- package/dist/tools/cms/user-tools.js +135 -0
- package/dist/tools/deduplicator.d.ts +19 -0
- package/dist/tools/deduplicator.d.ts.map +1 -0
- package/dist/tools/deduplicator.js +53 -0
- package/dist/tools/document-summarizer.d.ts +11 -0
- package/dist/tools/document-summarizer.d.ts.map +1 -0
- package/dist/tools/document-summarizer.js +82 -0
- package/dist/tools/mcp-adapter.d.ts +66 -0
- package/dist/tools/mcp-adapter.d.ts.map +1 -0
- package/dist/tools/mcp-adapter.js +152 -0
- package/dist/tools/memory/index.d.ts +3 -0
- package/dist/tools/memory/index.d.ts.map +1 -0
- package/dist/tools/memory/index.js +1 -0
- package/dist/tools/memory/store-memory.d.ts +39 -0
- package/dist/tools/memory/store-memory.d.ts.map +1 -0
- package/dist/tools/memory/store-memory.js +94 -0
- package/dist/tools/registry.d.ts +14 -0
- package/dist/tools/registry.d.ts.map +1 -0
- package/dist/tools/registry.js +48 -0
- package/dist/tools/ticket-tools.d.ts +31 -0
- package/dist/tools/ticket-tools.d.ts.map +1 -0
- package/dist/tools/ticket-tools.js +74 -0
- package/dist/tools/web/duck-duck-go.d.ts +52 -0
- package/dist/tools/web/duck-duck-go.d.ts.map +1 -0
- package/dist/tools/web/duck-duck-go.js +202 -0
- package/dist/tools/web/exa.d.ts +34 -0
- package/dist/tools/web/exa.d.ts.map +1 -0
- package/dist/tools/web/exa.js +80 -0
- package/dist/tools/web/index.d.ts +6 -0
- package/dist/tools/web/index.d.ts.map +1 -0
- package/dist/tools/web/index.js +4 -0
- package/dist/tools/web/scraper.d.ts +9 -0
- package/dist/tools/web/scraper.d.ts.map +1 -0
- package/dist/tools/web/scraper.js +118 -0
- package/dist/tools/web/tavily.d.ts +32 -0
- package/dist/tools/web/tavily.d.ts.map +1 -0
- package/dist/tools/web/tavily.js +73 -0
- package/dist/tools/web/types.d.ts +31 -0
- package/dist/tools/web/types.d.ts.map +1 -0
- package/dist/tools/web/types.js +9 -0
- package/package.json +143 -0
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* File Parsers
|
|
3
|
+
*
|
|
4
|
+
* Zero external dependencies — pure Node.js + regex/string manipulation.
|
|
5
|
+
* PDF extraction uses a hand-rolled parser that reads text streams directly
|
|
6
|
+
* from the binary PDF format (BT/ET blocks, Tj/TJ operators).
|
|
7
|
+
*/
|
|
8
|
+
// =============================================================================
|
|
9
|
+
// Plain Text
|
|
10
|
+
// =============================================================================
|
|
11
|
+
export class PlainTextParser {
    /**
     * Pass text through unchanged except for newline normalization:
     * CRLF and lone CR both become LF, and surrounding whitespace is trimmed.
     */
    parse(input) {
        const normalized = input.replace(/\r\n?/g, '\n');
        return { text: normalized.trim() };
    }
}
|
|
16
|
+
// =============================================================================
|
|
17
|
+
// Markdown
|
|
18
|
+
// =============================================================================
|
|
19
|
+
const FRONTMATTER_RE = /^---[\s\S]*?---\n?/;
const CODE_FENCE_RE = /```[\s\S]*?```/g;
const INLINE_CODE_RE = /`[^`]*`/g;
const HEADING_RE = /^#{1,6}\s+/gm;
const EMPHASIS_RE = /[*_]{1,3}([^*_]+)[*_]{1,3}/g;
const LINK_RE = /\[([^\]]*)\]\([^)]*\)/g;
const IMAGE_RE = /!\[[^\]]*\]\([^)]*\)/g;
const HTML_TAG_IN_MD_RE = /<[^>]+>/g;
const HORIZONTAL_RULE_RE = /^[-*_]{3,}\s*$/gm;
export class MarkdownParser {
    /**
     * Convert Markdown source to plain text.
     *
     * Strips frontmatter, code-fence delimiters (keeping the code inside),
     * heading/emphasis markers, images, inline HTML, and horizontal rules;
     * unwraps links to their text; normalizes newlines and collapses runs
     * of 3+ blank lines down to one blank line.
     */
    parse(input) {
        const text = input
            // Strip frontmatter
            .replace(FRONTMATTER_RE, '')
            // Strip code fences (preserve content inside)
            .replace(CODE_FENCE_RE, (m) => m.replace(/```\w*\n?/g, '').replace(/```/g, ''))
            // Strip inline code backticks but keep content
            .replace(INLINE_CODE_RE, (m) => m.slice(1, -1))
            // Strip heading markers
            .replace(HEADING_RE, '')
            // Unwrap bold/italic — keep inner text
            .replace(EMPHASIS_RE, '$1')
            // Remove images entirely — MUST run before LINK_RE, which would
            // otherwise consume the `[alt](url)` part and leave a stray `!alt`
            .replace(IMAGE_RE, '')
            // Unwrap links — keep link text
            .replace(LINK_RE, '$1')
            // Strip inline HTML
            .replace(HTML_TAG_IN_MD_RE, '')
            // Remove horizontal rules
            .replace(HORIZONTAL_RULE_RE, '')
            // Normalize CRLF
            .replace(/\r\n/g, '\n')
            .replace(/\r/g, '\n')
            // Collapse 3+ blank lines to 2
            .replace(/\n{3,}/g, '\n\n')
            .trim();
        return { text };
    }
}
|
|
58
|
+
// =============================================================================
|
|
59
|
+
// HTML
|
|
60
|
+
// =============================================================================
|
|
61
|
+
const SCRIPT_STYLE_RE = /<(script|style)[^>]*>[\s\S]*?<\/\1>/gi;
const HTML_COMMENT_RE = /<!--[\s\S]*?-->/g;
const HTML_ALL_TAGS_RE = /<[^>]+>/g;
// Named/numeric entities decoded without a full HTML parser.
// Keys must be the *encoded* entity text — mapping a bare character to
// itself would make the lookup in decodeEntities a no-op.
const HTML_ENTITIES = {
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#39;': "'",
    '&nbsp;': ' ',
    '&apos;': "'",
};
/**
 * Decode common HTML entities. Named entities come from the table above;
 * decimal (&#65;) and hex (&#x41;) numeric references are decoded
 * generically. Unknown named entities are left untouched.
 */
function decodeEntities(text) {
    return text
        .replace(/&[a-z]+;|&#\d+;/gi, (entity) => HTML_ENTITIES[entity] ?? entity)
        .replace(/&#(\d+);/g, (_, code) => String.fromCharCode(parseInt(code, 10)))
        .replace(/&#x([0-9a-f]+);/gi, (_, code) => String.fromCharCode(parseInt(code, 16)));
}
export class HtmlParser {
    /**
     * Extract readable text from HTML: drops <script>/<style> bodies and
     * comments, replaces every tag with a space, normalizes newlines,
     * collapses runs of spaces/tabs and blank lines, then decodes entities.
     */
    parse(input) {
        const text = decodeEntities(input
            .replace(SCRIPT_STYLE_RE, '')
            .replace(HTML_COMMENT_RE, '')
            .replace(HTML_ALL_TAGS_RE, ' ')
            .replace(/\r\n/g, '\n')
            .replace(/\r/g, '\n')
            .replace(/[ \t]+/g, ' ')
            .replace(/\n{3,}/g, '\n\n')
            .trim());
        return { text };
    }
}
|
|
92
|
+
// =============================================================================
|
|
93
|
+
// PDF
|
|
94
|
+
// =============================================================================
|
|
95
|
+
/**
 * Decode a PDF literal string token.
 *
 * PDF literal strings are enclosed in `(…)` and support:
 * - Escape sequences: \n \r \t \b \f \\ \( \)
 * - Octal escapes: \nnn
 * - Line continuation: \ at end of line (ignored) — per PDF 32000-1
 *   §7.3.4.2 the end-of-line may be LF, CR, or CRLF.
 */
function decodePdfLiteral(raw) {
    // Named single-character escapes.
    const SIMPLE_ESCAPES = {
        n: '\n',
        r: '\r',
        t: '\t',
        b: '\b',
        f: '\f',
        '\\': '\\',
        '(': '(',
        ')': ')',
    };
    // \r\n is tried before \r so a CRLF continuation is consumed whole.
    return raw.replace(/\\(\r\n|\r|\n|[\\()nrtbf]|[0-7]{1,3})/g, (_, esc) => {
        // Backslash followed by an end-of-line is a line continuation:
        // it produces no output at all.
        if (esc === '\n' || esc === '\r' || esc === '\r\n')
            return '';
        const simple = SIMPLE_ESCAPES[esc];
        if (simple !== undefined)
            return simple;
        // Octal escape \nnn
        return String.fromCharCode(Number.parseInt(esc, 8));
    });
}
|
|
127
|
+
/**
 * Extract text from a single BT…ET block.
 * Handles: (string)Tj [(array)]TJ (string)' (string)"
 */
function extractTextFromBlock(block) {
    const pieces = [];
    // Run a regex with one string capture over `source`, decoding every hit.
    const collect = (re, source) => {
        let match;
        // biome-ignore lint/suspicious/noAssignInExpressions: standard regex loop
        while ((match = re.exec(source)) !== null) {
            if (match[1] !== undefined)
                pieces.push(decodePdfLiteral(match[1]));
        }
    };
    // Tj — show string: (text) Tj
    collect(/\(([^)]*(?:\\.[^)]*)*)\)\s*Tj/g, block);
    // TJ — show array: [(t1) num (t2) …] TJ; strings inside the brackets
    // are interleaved with kerning numbers we ignore.
    const arrayRe = /\[([^\]]*)\]\s*TJ/g;
    let arr;
    // biome-ignore lint/suspicious/noAssignInExpressions: standard regex loop
    while ((arr = arrayRe.exec(block)) !== null) {
        collect(/\(([^)]*(?:\\.[^)]*)*)\)/g, arr[1] ?? '');
    }
    // ' and " operators (move-to-next-line-then-show)
    collect(/\(([^)]*(?:\\.[^)]*)*)\)\s*['"]/g, block);
    return pieces.join(' ');
}
|
|
162
|
+
/**
 * Hand-rolled PDF text extractor.
 *
 * Reads text content directly from the binary PDF format without any
 * external dependencies. Works for text-based PDFs (the vast majority).
 * Encrypted or image-only PDFs return an empty string.
 *
 * Supports:
 * - BT/ET text blocks with Tj, TJ, ' and " operators
 * - Literal string decoding (escape sequences + octal)
 * - Page count from /Count dictionary entry
 * - Document title from /Title dictionary entry
 */
export class PdfParser {
    /** Parse a PDF string (for API compatibility; use parseBuffer for binary data). */
    parse(input) {
        return this.parseBuffer(Buffer.from(input, 'binary'));
    }
    /** Parse a PDF from a Buffer. */
    parseBuffer(input) {
        const src = input.toString('binary');
        // Pull every BT…ET text block out of the raw byte stream.
        const blockRe = /BT\s([\s\S]*?)\sET/g;
        const blocks = [];
        let hit;
        // biome-ignore lint/suspicious/noAssignInExpressions: standard regex loop
        while ((hit = blockRe.exec(src)) !== null) {
            if (hit[1] !== undefined)
                blocks.push(hit[1]);
        }
        // Decode each block, drop empties, then tidy up whitespace.
        const text = blocks
            .map(extractTextFromBlock)
            .filter(Boolean)
            .join('\n')
            .replace(/[ \t]+/g, ' ')
            .replace(/\n{3,}/g, '\n\n')
            .trim();
        // /Count N in the page-tree dictionary gives the page count.
        const countMatch = /\/Count\s+(\d+)/.exec(src);
        const pageCount = countMatch ? Number.parseInt(countMatch[1] ?? '0', 10) : 0;
        // /Title (literal string) in the info dictionary, when present.
        const titleMatch = /\/Title\s*\(([^)]*(?:\\.[^)]*)*)\)/.exec(src);
        const title = titleMatch
            ? decodePdfLiteral(titleMatch[1] ?? '').trim() || undefined
            : undefined;
        return {
            text,
            metadata: {
                pageCount,
                ...(title ? { title } : {}),
            },
        };
    }
}
|
|
216
|
+
// =============================================================================
|
|
217
|
+
// JSON
|
|
218
|
+
// =============================================================================
|
|
219
|
+
export class JsonParser {
    /**
     * Pretty-print valid JSON (2-space indent); if the input does not
     * parse, fall back to the trimmed raw text.
     */
    parse(input) {
        let text;
        try {
            text = JSON.stringify(JSON.parse(input), null, 2);
        }
        catch {
            // Not valid JSON — return as-is
            text = input.trim();
        }
        return { text };
    }
}
|
|
231
|
+
/**
 * Pick a parser for a MIME type. Any ";charset=…" suffix is ignored and
 * matching is case-insensitive. Unknown types fall back to plain text.
 */
export function createParser(mimeType) {
    const normalized = mimeType.toLowerCase().split(';')[0]?.trim() ?? '';
    switch (normalized) {
        case 'text/markdown':
        case 'text/x-markdown':
            return new MarkdownParser();
        case 'text/html':
        case 'application/xhtml+xml':
            return new HtmlParser();
        case 'application/json':
            return new JsonParser();
        case 'application/pdf':
            return new PdfParser();
        default:
            // Plain text covers text/plain and anything unrecognized.
            return new PlainTextParser();
    }
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Hybrid Search — Vector + BM25 with Reciprocal Rank Fusion
|
|
3
|
+
*
|
|
4
|
+
* speed mode: cosine vector search only (fast, default)
|
|
5
|
+
* accuracy mode: fetch 50 vector candidates → BM25 on those → RRF fusion → optional LLM rerank
|
|
6
|
+
*
|
|
7
|
+
* RRF formula: score(d) = Σ 1 / (k + rank(d)) where k = 60
|
|
8
|
+
*/
|
|
9
|
+
import type { Database } from '@revealui/db/client';
|
|
10
|
+
import type { LLMClient } from '../llm/client.js';
|
|
11
|
+
import type { RagSearchOptions, RagSearchResult } from './rag-vector-service.js';
|
|
12
|
+
export interface HybridSearchOptions extends RagSearchOptions {
    /** speed = vector only, accuracy = BM25 hybrid + optional rerank (default: 'speed') */
    mode?: 'speed' | 'accuracy';
    /** Number of vector candidates to fetch before BM25 filtering (accuracy mode only) */
    vectorCandidates?: number;
    /** Whether to apply LLM re-ranking in accuracy mode (default: false) */
    rerank?: boolean;
    /** LLM client presumably consumed by the rerank step when `rerank` is true — confirm against the implementation */
    llmClient?: LLMClient;
}
/**
 * Run a retrieval query. In 'speed' mode this embeds the query and does a
 * pure vector search; in 'accuracy' mode it over-fetches vector candidates,
 * scores them with BM25, and fuses ranks via RRF (k = 60).
 */
export declare function hybridSearch(query: string, _db: Database, embeddingFn: (text: string) => Promise<number[]>, options: HybridSearchOptions): Promise<RagSearchResult[]>;
|
|
22
|
+
//# sourceMappingURL=hybrid-search.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"hybrid-search.d.ts","sourceRoot":"","sources":["../../src/ingestion/hybrid-search.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,qBAAqB,CAAA;AACnD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAA;AAEjD,OAAO,KAAK,EAAE,gBAAgB,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAA;AAIhF,MAAM,WAAW,mBAAoB,SAAQ,gBAAgB;IAC3D,uFAAuF;IACvF,IAAI,CAAC,EAAE,OAAO,GAAG,UAAU,CAAA;IAC3B,sFAAsF;IACtF,gBAAgB,CAAC,EAAE,MAAM,CAAA;IACzB,wEAAwE;IACxE,MAAM,CAAC,EAAE,OAAO,CAAA;IAChB,SAAS,CAAC,EAAE,SAAS,CAAA;CACtB;AAID,wBAAsB,YAAY,CAChC,KAAK,EAAE,MAAM,EACb,GAAG,EAAE,QAAQ,EACb,WAAW,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,OAAO,CAAC,MAAM,EAAE,CAAC,EAChD,OAAO,EAAE,mBAAmB,GAC3B,OAAO,CAAC,eAAe,EAAE,CAAC,CA+D5B"}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Hybrid Search — Vector + BM25 with Reciprocal Rank Fusion
|
|
3
|
+
*
|
|
4
|
+
* speed mode: cosine vector search only (fast, default)
|
|
5
|
+
* accuracy mode: fetch 50 vector candidates → BM25 on those → RRF fusion → optional LLM rerank
|
|
6
|
+
*
|
|
7
|
+
* RRF formula: score(d) = Σ 1 / (k + rank(d)) where k = 60
|
|
8
|
+
*/
|
|
9
|
+
import { BM25 } from './bm25.js';
|
|
10
|
+
import { RagVectorService } from './rag-vector-service.js';
|
|
11
|
+
import { rerankChunks } from './reranker.js';
|
|
12
|
+
/** RRF smoothing constant (k in score(d) = Σ 1 / (k + rank(d))). */
const RRF_K = 60;
/**
 * Hybrid RAG search.
 *
 * speed mode (default): a single cosine vector search.
 * accuracy mode: over-fetch vector candidates, score the same pool lexically
 * with BM25, fuse both rankings with Reciprocal Rank Fusion, then optionally
 * re-rank the top results with an LLM.
 *
 * @param query       Natural-language query text.
 * @param _db         Unused; kept for interface compatibility.
 * @param embeddingFn Maps the query text to an embedding vector.
 * @param options     Search options (mode, limit, rerank, llmClient, ...).
 */
export async function hybridSearch(query, _db, embeddingFn, options) {
    const vectorService = new RagVectorService();
    const topK = options.limit ?? 5;
    // Speed mode: one vector search, nothing else.
    if ((options.mode ?? 'speed') === 'speed') {
        const queryVector = await embeddingFn(query);
        return vectorService.searchSimilar(queryVector, { ...options, limit: topK });
    }
    // Accuracy mode: over-fetch a candidate pool for lexical re-scoring.
    const poolSize = options.vectorCandidates ?? 50;
    const queryVector = await embeddingFn(query);
    const candidates = await vectorService.searchSimilar(queryVector, {
        ...options,
        limit: poolSize,
        threshold: 0.3, // relaxed threshold so the pool is large enough
    });
    if (candidates.length === 0) {
        return [];
    }
    // Score the same candidate pool lexically with BM25.
    const lexicalIndex = new BM25();
    lexicalIndex.index(candidates.map((c) => ({ id: c.chunk.id, text: c.chunk.content })));
    const lexicalHits = lexicalIndex.search(query, poolSize);
    // 1-based rank positions per ranking.
    const denseRank = new Map(candidates.map((c, i) => [c.chunk.id, i + 1]));
    const lexicalRank = new Map(lexicalHits.map((hit, i) => [hit.id, i + 1]));
    // Reciprocal Rank Fusion; ids missing from one ranking are treated as
    // ranked just past the end of the pool.
    const fallbackRank = poolSize + 1;
    const fused = new Map();
    for (const id of new Set([...denseRank.keys(), ...lexicalRank.keys()])) {
        const score = 1 / (RRF_K + (denseRank.get(id) ?? fallbackRank)) +
            1 / (RRF_K + (lexicalRank.get(id) ?? fallbackRank));
        fused.set(id, score);
    }
    // Highest fused score first; map ids back to their full search results.
    const byId = new Map(candidates.map((c) => [c.chunk.id, c]));
    const ranked = Array.from(fused.entries())
        .sort(([, a], [, b]) => b - a)
        .slice(0, topK)
        .map(([id]) => byId.get(id))
        .filter((c) => c !== undefined);
    // Optional LLM re-ranking pass on the fused top results.
    if (options.rerank && options.llmClient) {
        return rerankChunks(query, ranked, options.llmClient, topK);
    }
    return ranked;
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
export * from './bm25.js';
|
|
2
|
+
export * from './cms-indexer.js';
|
|
3
|
+
export * from './file-parsers.js';
|
|
4
|
+
export * from './hybrid-search.js';
|
|
5
|
+
export * from './pipeline.js';
|
|
6
|
+
export * from './rag-vector-service.js';
|
|
7
|
+
export * from './reranker.js';
|
|
8
|
+
export * from './text-splitter.js';
|
|
9
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/ingestion/index.ts"],"names":[],"mappings":"AAAA,cAAc,WAAW,CAAA;AACzB,cAAc,kBAAkB,CAAA;AAChC,cAAc,mBAAmB,CAAA;AACjC,cAAc,oBAAoB,CAAA;AAClC,cAAc,eAAe,CAAA;AAC7B,cAAc,yBAAyB,CAAA;AACvC,cAAc,eAAe,CAAA;AAC7B,cAAc,oBAAoB,CAAA"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
export * from './bm25.js';
|
|
2
|
+
export * from './cms-indexer.js';
|
|
3
|
+
export * from './file-parsers.js';
|
|
4
|
+
export * from './hybrid-search.js';
|
|
5
|
+
export * from './pipeline.js';
|
|
6
|
+
export * from './rag-vector-service.js';
|
|
7
|
+
export * from './reranker.js';
|
|
8
|
+
export * from './text-splitter.js';
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Ingestion Pipeline
|
|
3
|
+
*
|
|
4
|
+
* Orchestrates: parse → split → embed → store.
|
|
5
|
+
* One document produces N chunks, each with a 768-dim embedding.
|
|
6
|
+
*/
|
|
7
|
+
import type { Database } from '@revealui/db/client';
|
|
8
|
+
/**
 * Input describing one document to ingest.
 */
export interface IngestRequest {
    /** Workspace that owns the resulting document and its chunks. */
    workspaceId: string;
    /** Where the raw content came from. */
    sourceType: 'cms_collection' | 'url' | 'file' | 'text';
    /** Identifier of the originating record, if any. */
    sourceId?: string;
    /** Collection the originating record belongs to, if any. */
    sourceCollection?: string;
    /** Human-readable document title. */
    title?: string;
    /** MIME type used to pick a parser (implementation defaults to 'text/plain'). */
    mimeType?: string;
    /** Full raw text/markup to parse, split, and embed. */
    rawContent: string;
    /** Target chunk size passed to the splitter (implementation defaults to 512). */
    chunkSize?: number;
    /** Overlap between consecutive chunks (implementation defaults to 64). */
    chunkOverlap?: number;
}
/**
 * Outcome of ingesting one document.
 */
export interface IngestResult {
    /** Id of the created document row (present even on failure). */
    documentId: string;
    /** Number of chunks stored (0 on failure). */
    chunkCount: number;
    /** 'indexed' on success; 'failed' if parsing/embedding/storage threw. */
    status: 'indexed' | 'failed';
    /** Error message when status is 'failed'. */
    error?: string;
}
/**
 * Orchestrates parse → split → embed → store for RAG documents.
 */
export declare class IngestionPipeline {
    private db;
    private embeddingFn;
    private splitter;
    constructor(db: Database, embeddingFn: (text: string) => Promise<number[]>);
    /** Ingest one document; failures are reported via the result status, not thrown. */
    ingest(req: IngestRequest): Promise<IngestResult>;
    /** Ingest several documents with bounded concurrency. */
    ingestBatch(docs: IngestRequest[]): Promise<IngestResult[]>;
    /** Delete a document (its chunks are removed via FK cascade). */
    deleteDocument(documentId: string): Promise<void>;
    /** Delete all documents matching a source identity within a workspace. */
    deleteBySource(workspaceId: string, sourceCollection: string, sourceId: string): Promise<void>;
}
|
|
35
|
+
//# sourceMappingURL=pipeline.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../src/ingestion/pipeline.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,qBAAqB,CAAA;AAMnD,MAAM,WAAW,aAAa;IAC5B,WAAW,EAAE,MAAM,CAAA;IACnB,UAAU,EAAE,gBAAgB,GAAG,KAAK,GAAG,MAAM,GAAG,MAAM,CAAA;IACtD,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,gBAAgB,CAAC,EAAE,MAAM,CAAA;IACzB,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,UAAU,EAAE,MAAM,CAAA;IAClB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,YAAY,CAAC,EAAE,MAAM,CAAA;CACtB;AAED,MAAM,WAAW,YAAY;IAC3B,UAAU,EAAE,MAAM,CAAA;IAClB,UAAU,EAAE,MAAM,CAAA;IAClB,MAAM,EAAE,SAAS,GAAG,QAAQ,CAAA;IAC5B,KAAK,CAAC,EAAE,MAAM,CAAA;CACf;AAcD,qBAAa,iBAAiB;IAC5B,OAAO,CAAC,EAAE,CAAU;IACpB,OAAO,CAAC,WAAW,CAAqC;IACxD,OAAO,CAAC,QAAQ,CAA4B;gBAEhC,EAAE,EAAE,QAAQ,EAAE,WAAW,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,OAAO,CAAC,MAAM,EAAE,CAAC;IAMpE,MAAM,CAAC,GAAG,EAAE,aAAa,GAAG,OAAO,CAAC,YAAY,CAAC;IAsEjD,WAAW,CAAC,IAAI,EAAE,aAAa,EAAE,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC;IAa3D,cAAc,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAKjD,cAAc,CAClB,WAAW,EAAE,MAAM,EACnB,gBAAgB,EAAE,MAAM,EACxB,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,IAAI,CAAC;CAiBjB"}
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Ingestion Pipeline
|
|
3
|
+
*
|
|
4
|
+
* Orchestrates: parse → split → embed → store.
|
|
5
|
+
* One document produces N chunks, each with a 768-dim embedding.
|
|
6
|
+
*/
|
|
7
|
+
import { ragChunks, ragDocuments } from '@revealui/db/schema/rag';
|
|
8
|
+
import { and, eq } from 'drizzle-orm';
|
|
9
|
+
import { createParser } from './file-parsers.js';
|
|
10
|
+
import { RecursiveCharacterSplitter } from './text-splitter.js';
|
|
11
|
+
// Build a loosely-unique id of the form "<prefix>-<epoch ms>-<random base36>".
function generateId(prefix) {
    const stamp = Date.now();
    const noise = Math.random().toString(36).slice(2, 9);
    return [prefix, stamp, noise].join('-');
}
|
|
14
|
+
// Count whitespace-delimited words (runs of non-whitespace characters).
function estimateWordCount(text) {
    const words = text.match(/\S+/g);
    return words === null ? 0 : words.length;
}
|
|
17
|
+
// Rough token estimate using the common ~4-characters-per-token heuristic.
function estimateTokens(text) {
    const CHARS_PER_TOKEN = 4;
    return Math.ceil(text.length / CHARS_PER_TOKEN);
}
|
|
20
|
+
/**
 * Orchestrates document ingestion for RAG: parse → split → embed → store.
 *
 * One document produces N chunks; each chunk is embedded individually via the
 * injected `embeddingFn` and stored in rag_chunks.
 */
export class IngestionPipeline {
    db;
    embeddingFn;
    splitter;
    /**
     * @param db          Database handle used for all document/chunk writes.
     * @param embeddingFn Maps chunk text to an embedding vector (one call per chunk).
     */
    constructor(db, embeddingFn) {
        this.db = db;
        this.embeddingFn = embeddingFn;
        this.splitter = new RecursiveCharacterSplitter();
    }
    /**
     * Ingest a single document.
     *
     * The document row is created with status='processing' up front, then
     * flipped to 'indexed' or 'failed' at the end, so a crash mid-ingest
     * leaves an inspectable row. Failures are reported via the returned
     * status ('failed' + error message) rather than thrown.
     */
    async ingest(req) {
        const docId = generateId('rdoc');
        const now = new Date();
        // 1. Insert document row with status='processing'
        await this.db.insert(ragDocuments).values({
            id: docId,
            workspaceId: req.workspaceId,
            sourceType: req.sourceType,
            sourceId: req.sourceId ?? null,
            sourceCollection: req.sourceCollection ?? null,
            title: req.title ?? null,
            mimeType: req.mimeType ?? 'text/plain',
            rawContent: req.rawContent,
            wordCount: estimateWordCount(req.rawContent),
            tokenEstimate: estimateTokens(req.rawContent),
            status: 'processing',
            createdAt: now,
            updatedAt: now,
        });
        try {
            // 2. Parse raw content using a parser chosen by MIME type
            const parser = createParser(req.mimeType ?? 'text/plain');
            const { text } = parser.parse(req.rawContent);
            // 3. Split into overlapping chunks
            const chunks = this.splitter.split(text, {
                chunkSize: req.chunkSize ?? 512,
                overlap: req.chunkOverlap ?? 64,
            });
            // 4. Embed and insert each chunk sequentially — one embedding call
            //    and one insert per chunk, keeping provider load bounded
            for (const chunk of chunks) {
                const embedding = await this.embeddingFn(chunk.content);
                const chunkId = generateId('rchk');
                await this.db.insert(ragChunks).values({
                    id: chunkId,
                    documentId: docId,
                    workspaceId: req.workspaceId,
                    content: chunk.content,
                    tokenCount: chunk.tokenCount,
                    chunkIndex: chunk.index,
                    embedding,
                    embeddingModel: 'nomic-embed-text',
                    metadata: chunk.metadata ?? {},
                    createdAt: now,
                });
            }
            // 5. Mark as indexed
            await this.db
                .update(ragDocuments)
                .set({ status: 'indexed', indexedAt: now, updatedAt: now })
                .where(eq(ragDocuments.id, docId));
            return { documentId: docId, chunkCount: chunks.length, status: 'indexed' };
        }
        catch (error) {
            // Record the failure on the document row; never rethrow to callers.
            const message = error instanceof Error ? error.message : String(error);
            await this.db
                .update(ragDocuments)
                .set({ status: 'failed', errorMessage: message, updatedAt: new Date() })
                .where(eq(ragDocuments.id, docId));
            return { documentId: docId, chunkCount: 0, status: 'failed', error: message };
        }
    }
    /**
     * Ingest several documents, processing at most 3 concurrently.
     * Results are returned in the same order as the input.
     */
    async ingestBatch(docs) {
        const results = [];
        const concurrencyLimit = 3;
        for (let i = 0; i < docs.length; i += concurrencyLimit) {
            const batch = docs.slice(i, i + concurrencyLimit);
            const batchResults = await Promise.all(batch.map((doc) => this.ingest(doc)));
            results.push(...batchResults);
        }
        return results;
    }
    /**
     * Delete a document by id; its chunks are removed by the FK cascade.
     */
    async deleteDocument(documentId) {
        // Chunks cascade-delete via FK
        await this.db.delete(ragDocuments).where(eq(ragDocuments.id, documentId));
    }
    /**
     * Delete every document matching (workspaceId, sourceCollection, sourceId).
     */
    async deleteBySource(workspaceId, sourceCollection, sourceId) {
        // Find and delete documents by source identity
        const docs = await this.db
            .select({ id: ragDocuments.id })
            .from(ragDocuments)
            .where(and(eq(ragDocuments.workspaceId, workspaceId), eq(ragDocuments.sourceCollection, sourceCollection), eq(ragDocuments.sourceId, sourceId)));
        for (const doc of docs) {
            await this.deleteDocument(doc.id);
        }
    }
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RAG Vector Service
|
|
3
|
+
*
|
|
4
|
+
* Semantic similarity search over rag_chunks using pgvector cosine distance.
|
|
5
|
+
* Modeled after VectorMemoryService — same Drizzle cosine-distance pattern.
|
|
6
|
+
*/
|
|
7
|
+
import type { RagChunk, RagDocument } from '@revealui/db/schema/rag';
|
|
8
|
+
/**
 * Options controlling a RAG similarity search.
 */
export interface RagSearchOptions {
    /** Workspace whose chunks are searched. */
    workspaceId: string;
    /** Maximum number of results (implementation defaults to 5). */
    limit?: number;
    /** Minimum similarity in [0, 1] (implementation defaults to 0.6). */
    threshold?: number;
    /** Restrict results to documents from this source collection. */
    sourceCollection?: string;
}
/**
 * One similarity hit: the chunk, a slim view of its parent document,
 * and the similarity score.
 */
export interface RagSearchResult {
    /** The matching chunk row. */
    chunk: RagChunk;
    /** Parent document metadata (subset of RagDocument). */
    document: Pick<RagDocument, 'id' | 'title' | 'sourceType' | 'sourceCollection' | 'sourceId' | 'createdAt'>;
    /** Similarity in [0, 1]; higher is more similar. */
    similarity: number;
}
/**
 * Semantic similarity search over rag_chunks backed by pgvector.
 */
export declare class RagVectorService {
    private _db;
    private get db();
    /**
     * Search for RAG chunks similar to the query embedding.
     *
     * Orders by pgvector distance; similarity = 1 - distance/2.
     * Validates that the embedding is 768-dimensional (nomic-embed-text).
     */
    searchSimilar(queryEmbedding: number[], options: RagSearchOptions): Promise<RagSearchResult[]>;
    /**
     * Get all chunks for a document ordered by chunk index.
     */
    getChunksByDocument(documentId: string): Promise<RagChunk[]>;
}
|
|
34
|
+
//# sourceMappingURL=rag-vector-service.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"rag-vector-service.d.ts","sourceRoot":"","sources":["../../src/ingestion/rag-vector-service.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,KAAK,EAAE,QAAQ,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAA;AAIpE,MAAM,WAAW,gBAAgB;IAC/B,WAAW,EAAE,MAAM,CAAA;IACnB,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,gBAAgB,CAAC,EAAE,MAAM,CAAA;CAC1B;AAED,MAAM,WAAW,eAAe;IAC9B,KAAK,EAAE,QAAQ,CAAA;IACf,QAAQ,EAAE,IAAI,CACZ,WAAW,EACX,IAAI,GAAG,OAAO,GAAG,YAAY,GAAG,kBAAkB,GAAG,UAAU,GAAG,WAAW,CAC9E,CAAA;IACD,UAAU,EAAE,MAAM,CAAA;CACnB;AAED,qBAAa,gBAAgB;IAC3B,OAAO,CAAC,GAAG,CAAkD;IAE7D,OAAO,KAAK,EAAE,GAKb;IAED;;;;;OAKG;IACG,aAAa,CACjB,cAAc,EAAE,MAAM,EAAE,EACxB,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,eAAe,EAAE,CAAC;IA8E7B;;OAEG;IACG,mBAAmB,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC;CAOnE"}
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RAG Vector Service
|
|
3
|
+
*
|
|
4
|
+
* Semantic similarity search over rag_chunks using pgvector cosine distance.
|
|
5
|
+
* Modeled after VectorMemoryService — same Drizzle cosine-distance pattern.
|
|
6
|
+
*/
|
|
7
|
+
import { getVectorClient } from '@revealui/db/client';
|
|
8
|
+
import { ragChunks, ragDocuments } from '@revealui/db/schema/rag';
|
|
9
|
+
import { and, asc, eq, sql } from 'drizzle-orm';
|
|
10
|
+
/**
 * Semantic similarity search over rag_chunks using pgvector cosine distance.
 */
export class RagVectorService {
    _db = null;
    // Lazily acquire the vector-enabled client on first use so constructing
    // the service never touches the database.
    get db() {
        if (!this._db) {
            this._db = getVectorClient();
        }
        return this._db;
    }
    /**
     * Search for RAG chunks similar to the query embedding.
     *
     * Uses pgvector cosine distance (`<=>`). Fix: this previously used `<->`,
     * which is pgvector's L2 (Euclidean) distance operator, while the
     * similarity mapping `1 - d/2` and the documentation assume cosine
     * distance in [0, 2]; `<=>` matches that documented contract.
     *
     * Validates that the embedding is 768-dimensional (nomic-embed-text).
     *
     * @param queryEmbedding 768-dim query vector.
     * @param options        Workspace scope, limit, threshold, and optional
     *                       source-collection filter.
     * @throws Error when the embedding has the wrong dimension.
     */
    async searchSimilar(queryEmbedding, options) {
        if (queryEmbedding.length !== 768) {
            throw new Error(`Invalid embedding dimension: expected 768, got ${queryEmbedding.length}. ` +
                'RAG uses nomic-embed-text (768-dim). Ensure OLLAMA_BASE_URL is set.');
        }
        // Base filters: only embedded chunks inside the caller's workspace.
        const conditions = [
            sql `${ragChunks}.embedding IS NOT NULL`,
            eq(ragChunks.workspaceId, options.workspaceId),
        ];
        if (options.sourceCollection) {
            conditions.push(eq(ragDocuments.sourceCollection, options.sourceCollection));
        }
        const threshold = options.threshold ?? 0.6;
        const limit = options.limit ?? 5;
        const embeddingJson = JSON.stringify(queryEmbedding);
        const rows = await this.db
            .select({
            chunkId: ragChunks.id,
            documentId: ragChunks.documentId,
            workspaceId: ragChunks.workspaceId,
            content: ragChunks.content,
            tokenCount: ragChunks.tokenCount,
            chunkIndex: ragChunks.chunkIndex,
            embedding: ragChunks.embedding,
            embeddingModel: ragChunks.embeddingModel,
            metadata: ragChunks.metadata,
            chunkCreatedAt: ragChunks.createdAt,
            docTitle: ragDocuments.title,
            docSourceType: ragDocuments.sourceType,
            docSourceCollection: ragDocuments.sourceCollection,
            docSourceId: ragDocuments.sourceId,
            docCreatedAt: ragDocuments.createdAt,
            // 1 - (cosine_distance / 2) maps [0,2] distance to [0,1] similarity
            similarity: sql `1 - (${ragChunks}.embedding <=> ${embeddingJson}::vector) / 2`,
        })
            .from(ragChunks)
            .innerJoin(ragDocuments, eq(ragChunks.documentId, ragDocuments.id))
            .where(and(...conditions, sql `1 - (${ragChunks}.embedding <=> ${embeddingJson}::vector) / 2 >= ${threshold}`))
            .orderBy(sql `${ragChunks}.embedding <=> ${embeddingJson}::vector`)
            .limit(limit);
        // Reassemble flat rows into { chunk, document, similarity } results.
        return rows.map((row) => ({
            chunk: {
                id: row.chunkId,
                documentId: row.documentId,
                workspaceId: row.workspaceId,
                content: row.content,
                tokenCount: row.tokenCount,
                chunkIndex: row.chunkIndex,
                embedding: row.embedding,
                embeddingModel: row.embeddingModel,
                metadata: row.metadata,
                createdAt: row.chunkCreatedAt,
            },
            document: {
                id: row.documentId,
                title: row.docTitle,
                sourceType: row.docSourceType,
                sourceCollection: row.docSourceCollection,
                sourceId: row.docSourceId,
                createdAt: row.docCreatedAt,
            },
            similarity: row.similarity ?? 0,
        }));
    }
    /**
     * Get all chunks for a document ordered by chunk index.
     */
    async getChunksByDocument(documentId) {
        return this.db
            .select()
            .from(ragChunks)
            .where(eq(ragChunks.documentId, documentId))
            .orderBy(asc(ragChunks.chunkIndex));
    }
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM-based Re-ranker
|
|
3
|
+
*
|
|
4
|
+
* Scores chunks 0–10 for relevance to the query using the configured LLMClient.
|
|
5
|
+
* No cross-encoder model required — uses the existing LLM.
|
|
6
|
+
*/
|
|
7
|
+
import type { LLMClient } from '../llm/client.js';
|
|
8
|
+
import type { RagSearchResult } from './rag-vector-service.js';
|
|
9
|
+
/**
 * Re-rank candidate chunks by LLM-judged relevance (0–10) to the query.
 *
 * @param query     User query the chunks are scored against.
 * @param chunks    Candidate results to re-order.
 * @param llmClient LLM client used to score the chunks.
 * @param topK      Number of results to keep after re-ranking.
 */
export declare function rerankChunks(query: string, chunks: RagSearchResult[], llmClient: LLMClient, topK?: number): Promise<RagSearchResult[]>;
|
|
10
|
+
//# sourceMappingURL=reranker.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"reranker.d.ts","sourceRoot":"","sources":["../../src/ingestion/reranker.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAA;AACjD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAA;AAI9D,wBAAsB,YAAY,CAChC,KAAK,EAAE,MAAM,EACb,MAAM,EAAE,eAAe,EAAE,EACzB,SAAS,EAAE,SAAS,EACpB,IAAI,CAAC,EAAE,MAAM,GACZ,OAAO,CAAC,eAAe,EAAE,CAAC,CAqC5B"}
|