@wanshi-kg/wanshi 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +458 -0
- package/dist/__tests__/helpers.js +27 -0
- package/dist/__tests__/helpers.js.map +1 -0
- package/dist/cli/commands/export.command.js +99 -0
- package/dist/cli/commands/export.command.js.map +1 -0
- package/dist/cli/commands/index.js +22 -0
- package/dist/cli/commands/index.js.map +1 -0
- package/dist/cli/commands/inspectMerges.command.js +84 -0
- package/dist/cli/commands/inspectMerges.command.js.map +1 -0
- package/dist/cli/commands/metrics.command.js +196 -0
- package/dist/cli/commands/metrics.command.js.map +1 -0
- package/dist/cli/commands/process.command.js +82 -0
- package/dist/cli/commands/process.command.js.map +1 -0
- package/dist/cli/commands/watch.command.js +91 -0
- package/dist/cli/commands/watch.command.js.map +1 -0
- package/dist/cli/index.js +269 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/optionsToConfig.js +160 -0
- package/dist/cli/optionsToConfig.js.map +1 -0
- package/dist/config/index.js +59 -0
- package/dist/config/index.js.map +1 -0
- package/dist/config/legacyHints.js +113 -0
- package/dist/config/legacyHints.js.map +1 -0
- package/dist/config/schema.js +803 -0
- package/dist/config/schema.js.map +1 -0
- package/dist/config/ui.js +221 -0
- package/dist/config/ui.js.map +1 -0
- package/dist/core/DirectoryProcessor.js +725 -0
- package/dist/core/DirectoryProcessor.js.map +1 -0
- package/dist/core/adapters/IStructuredAdapter.js +3 -0
- package/dist/core/adapters/IStructuredAdapter.js.map +1 -0
- package/dist/core/adapters/SqliteAdapter.js +267 -0
- package/dist/core/adapters/SqliteAdapter.js.map +1 -0
- package/dist/core/adapters/StructuredAdapterRegistry.js +31 -0
- package/dist/core/adapters/StructuredAdapterRegistry.js.map +1 -0
- package/dist/core/adapters/index.js +20 -0
- package/dist/core/adapters/index.js.map +1 -0
- package/dist/core/checkpoint/CheckpointService.js +188 -0
- package/dist/core/checkpoint/CheckpointService.js.map +1 -0
- package/dist/core/checkpoint/index.js +18 -0
- package/dist/core/checkpoint/index.js.map +1 -0
- package/dist/core/corpus/CorpusAnalyzer.js +266 -0
- package/dist/core/corpus/CorpusAnalyzer.js.map +1 -0
- package/dist/core/corpus/CorpusProfileStore.js +92 -0
- package/dist/core/corpus/CorpusProfileStore.js.map +1 -0
- package/dist/core/corpus/index.js +21 -0
- package/dist/core/corpus/index.js.map +1 -0
- package/dist/core/corpus/normalizeGlossary.js +60 -0
- package/dist/core/corpus/normalizeGlossary.js.map +1 -0
- package/dist/core/corpus/relPath.js +52 -0
- package/dist/core/corpus/relPath.js.map +1 -0
- package/dist/core/corpus/termFrequency.js +86 -0
- package/dist/core/corpus/termFrequency.js.map +1 -0
- package/dist/core/cost/CostMeter.js +235 -0
- package/dist/core/cost/CostMeter.js.map +1 -0
- package/dist/core/cost/index.js +19 -0
- package/dist/core/cost/index.js.map +1 -0
- package/dist/core/cost/prices.js +38 -0
- package/dist/core/cost/prices.js.map +1 -0
- package/dist/core/cv/ObjectDetectionService.js +119 -0
- package/dist/core/cv/ObjectDetectionService.js.map +1 -0
- package/dist/core/di/ContainerFactory.js +670 -0
- package/dist/core/di/ContainerFactory.js.map +1 -0
- package/dist/core/di/DIContainer.js +103 -0
- package/dist/core/di/DIContainer.js.map +1 -0
- package/dist/core/di/index.js +19 -0
- package/dist/core/di/index.js.map +1 -0
- package/dist/core/errors/CustomErrors.js +342 -0
- package/dist/core/errors/CustomErrors.js.map +1 -0
- package/dist/core/errors/index.js +18 -0
- package/dist/core/errors/index.js.map +1 -0
- package/dist/core/export/KnowledgeGraphExportService.js +56 -0
- package/dist/core/export/KnowledgeGraphExportService.js.map +1 -0
- package/dist/core/export/index.js +19 -0
- package/dist/core/export/index.js.map +1 -0
- package/dist/core/export/strategies/GraphitiExportStrategy.js +115 -0
- package/dist/core/export/strategies/GraphitiExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/GraphvizDotExportStrategy.js +331 -0
- package/dist/core/export/strategies/GraphvizDotExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/IExportStrategy.js +3 -0
- package/dist/core/export/strategies/IExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/JsonExportStrategy.js +19 -0
- package/dist/core/export/strategies/JsonExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/JsonlExportStrategy.js +69 -0
- package/dist/core/export/strategies/JsonlExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/KblamExportStrategy.js +36 -0
- package/dist/core/export/strategies/KblamExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/LoraExportStrategy.js +46 -0
- package/dist/core/export/strategies/LoraExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/McpExportStrategy.js +67 -0
- package/dist/core/export/strategies/McpExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/index.js +25 -0
- package/dist/core/export/strategies/index.js.map +1 -0
- package/dist/core/export/strategies/kbTriples.js +60 -0
- package/dist/core/export/strategies/kbTriples.js.map +1 -0
- package/dist/core/index.js +22 -0
- package/dist/core/index.js.map +1 -0
- package/dist/core/knowledge/KnowledgeGraphBuilder.js +627 -0
- package/dist/core/knowledge/KnowledgeGraphBuilder.js.map +1 -0
- package/dist/core/knowledge/MergeRecord.js +3 -0
- package/dist/core/knowledge/MergeRecord.js.map +1 -0
- package/dist/core/knowledge/canon/Canonicalizer.js +414 -0
- package/dist/core/knowledge/canon/Canonicalizer.js.map +1 -0
- package/dist/core/knowledge/canon/index.js +18 -0
- package/dist/core/knowledge/canon/index.js.map +1 -0
- package/dist/core/knowledge/contradiction/HeuristicContradictionChecker.js +92 -0
- package/dist/core/knowledge/contradiction/HeuristicContradictionChecker.js.map +1 -0
- package/dist/core/knowledge/contradiction/LlmContradictionChecker.js +52 -0
- package/dist/core/knowledge/contradiction/LlmContradictionChecker.js.map +1 -0
- package/dist/core/knowledge/contradiction/index.js +19 -0
- package/dist/core/knowledge/contradiction/index.js.map +1 -0
- package/dist/core/knowledge/grounding/KeywordGroundingChecker.js +33 -0
- package/dist/core/knowledge/grounding/KeywordGroundingChecker.js.map +1 -0
- package/dist/core/knowledge/grounding/MiniCheckGroundingChecker.js +82 -0
- package/dist/core/knowledge/grounding/MiniCheckGroundingChecker.js.map +1 -0
- package/dist/core/knowledge/grounding/index.js +20 -0
- package/dist/core/knowledge/grounding/index.js.map +1 -0
- package/dist/core/knowledge/grounding/verbalize.js +38 -0
- package/dist/core/knowledge/grounding/verbalize.js.map +1 -0
- package/dist/core/knowledge/images/imageMetaGraph.js +136 -0
- package/dist/core/knowledge/images/imageMetaGraph.js.map +1 -0
- package/dist/core/knowledge/index.js +20 -0
- package/dist/core/knowledge/index.js.map +1 -0
- package/dist/core/knowledge/merging/KnowledgeMerger.js +624 -0
- package/dist/core/knowledge/merging/KnowledgeMerger.js.map +1 -0
- package/dist/core/knowledge/references/ReferenceResolver.js +184 -0
- package/dist/core/knowledge/references/ReferenceResolver.js.map +1 -0
- package/dist/core/knowledge/references/citations/CitationEvidenceProcessor.js +401 -0
- package/dist/core/knowledge/references/citations/CitationEvidenceProcessor.js.map +1 -0
- package/dist/core/knowledge/references/citations/CitationResolver.js +95 -0
- package/dist/core/knowledge/references/citations/CitationResolver.js.map +1 -0
- package/dist/core/knowledge/references/citations/GrobidClient.js +143 -0
- package/dist/core/knowledge/references/citations/GrobidClient.js.map +1 -0
- package/dist/core/knowledge/references/citations/TitleIdResolver.js +101 -0
- package/dist/core/knowledge/references/citations/TitleIdResolver.js.map +1 -0
- package/dist/core/knowledge/references/web/FetchCacheService.js +114 -0
- package/dist/core/knowledge/references/web/FetchCacheService.js.map +1 -0
- package/dist/core/knowledge/references/web/GatedFetcher.js +228 -0
- package/dist/core/knowledge/references/web/GatedFetcher.js.map +1 -0
- package/dist/core/knowledge/references/web/WebReferenceProcessor.js +164 -0
- package/dist/core/knowledge/references/web/WebReferenceProcessor.js.map +1 -0
- package/dist/core/knowledge/search/KnowledgeGraphSearch.js +261 -0
- package/dist/core/knowledge/search/KnowledgeGraphSearch.js.map +1 -0
- package/dist/core/knowledge/vocabulary.js +162 -0
- package/dist/core/knowledge/vocabulary.js.map +1 -0
- package/dist/core/llm/EmbeddingService.js +113 -0
- package/dist/core/llm/EmbeddingService.js.map +1 -0
- package/dist/core/llm/OllamaService.js +146 -0
- package/dist/core/llm/OllamaService.js.map +1 -0
- package/dist/core/llm/OpenAICompatibleService.js +190 -0
- package/dist/core/llm/OpenAICompatibleService.js.map +1 -0
- package/dist/core/llm/OpenAIEmbeddingService.js +129 -0
- package/dist/core/llm/OpenAIEmbeddingService.js.map +1 -0
- package/dist/core/llm/embeddingUtils.js +25 -0
- package/dist/core/llm/embeddingUtils.js.map +1 -0
- package/dist/core/llm/index.js +23 -0
- package/dist/core/llm/index.js.map +1 -0
- package/dist/core/llm/prompts/PromptManager.js +388 -0
- package/dist/core/llm/prompts/PromptManager.js.map +1 -0
- package/dist/core/llm/prompts/PromptTemplateEngine.js +257 -0
- package/dist/core/llm/prompts/PromptTemplateEngine.js.map +1 -0
- package/dist/core/llm/prompts/templates/partials/examples/EXAMPLE_STYLE_GUIDE.md +84 -0
- package/dist/core/llm/prompts/templates/partials/examples/article.md +187 -0
- package/dist/core/llm/prompts/templates/partials/examples/code.md +229 -0
- package/dist/core/llm/prompts/templates/partials/examples/communication.md +205 -0
- package/dist/core/llm/prompts/templates/partials/examples/documentation.md +262 -0
- package/dist/core/llm/prompts/templates/partials/examples/financial.md +157 -0
- package/dist/core/llm/prompts/templates/partials/examples/legal.md +153 -0
- package/dist/core/llm/prompts/templates/partials/examples/logs.md +127 -0
- package/dist/core/llm/prompts/templates/partials/examples/medical.md +218 -0
- package/dist/core/llm/prompts/templates/partials/examples/notes.md +201 -0
- package/dist/core/llm/prompts/templates/partials/examples/research.md +208 -0
- package/dist/core/llm/prompts/templates/partials/examples/tabular.md +178 -0
- package/dist/core/llm/prompts/templates/partials/examples/transcript.md +204 -0
- package/dist/core/llm/prompts/templates/partials/retrieved-context.hbs +18 -0
- package/dist/core/llm/prompts/templates/v1/system.hbs +371 -0
- package/dist/core/llm/prompts/templates/v1/user.hbs +20 -0
- package/dist/core/llm/prompts/templates/v2/system.hbs +573 -0
- package/dist/core/llm/prompts/templates/v2/user.hbs +20 -0
- package/dist/core/llm/prompts/templates/v3/system.hbs +861 -0
- package/dist/core/llm/prompts/templates/v3/user.hbs +16 -0
- package/dist/core/llm/prompts/templates/v4/system.hbs +800 -0
- package/dist/core/llm/prompts/templates/v4/user.hbs +40 -0
- package/dist/core/llm/prompts/templates/v4.5/system.hbs +71 -0
- package/dist/core/llm/prompts/templates/v4.5/user.hbs +46 -0
- package/dist/core/llm/prompts/templates/v5/glossary/system.hbs +40 -0
- package/dist/core/llm/prompts/templates/v5/glossary/user.hbs +11 -0
- package/dist/core/llm/prompts/templates/v5/system.hbs +163 -0
- package/dist/core/llm/prompts/templates/v5/user.hbs +55 -0
- package/dist/core/pipeline/GroundingTransform.js +52 -0
- package/dist/core/pipeline/GroundingTransform.js.map +1 -0
- package/dist/core/pipeline/PipelineRunner.js +51 -0
- package/dist/core/pipeline/PipelineRunner.js.map +1 -0
- package/dist/core/pipeline/RelationFilterTransform.js +72 -0
- package/dist/core/pipeline/RelationFilterTransform.js.map +1 -0
- package/dist/core/pipeline/index.js +20 -0
- package/dist/core/pipeline/index.js.map +1 -0
- package/dist/core/processor/FileProcessor.js +184 -0
- package/dist/core/processor/FileProcessor.js.map +1 -0
- package/dist/core/processor/ProcessedRegistry.js +38 -0
- package/dist/core/processor/ProcessedRegistry.js.map +1 -0
- package/dist/core/processor/ast/AstSeedService.js +0 -0
- package/dist/core/processor/ast/AstSeedService.js.map +1 -0
- package/dist/core/processor/ast/AstSymbolStore.js +110 -0
- package/dist/core/processor/ast/AstSymbolStore.js.map +1 -0
- package/dist/core/processor/ast/index.js +19 -0
- package/dist/core/processor/ast/index.js.map +1 -0
- package/dist/core/processor/chunking/TextChunker.js +98 -0
- package/dist/core/processor/chunking/TextChunker.js.map +1 -0
- package/dist/core/processor/chunking/index.js +18 -0
- package/dist/core/processor/chunking/index.js.map +1 -0
- package/dist/core/processor/classifier/CONTENT_CLASSES.js +294 -0
- package/dist/core/processor/classifier/CONTENT_CLASSES.js.map +1 -0
- package/dist/core/processor/classifier/CascadeContentClassifier.js +107 -0
- package/dist/core/processor/classifier/CascadeContentClassifier.js.map +1 -0
- package/dist/core/processor/classifier/HeuristicContentClassifier.js +113 -0
- package/dist/core/processor/classifier/HeuristicContentClassifier.js.map +1 -0
- package/dist/core/processor/classifier/IContentTypeClassifier.js +3 -0
- package/dist/core/processor/classifier/IContentTypeClassifier.js.map +1 -0
- package/dist/core/processor/classifier/LlmContentClassifier.js +107 -0
- package/dist/core/processor/classifier/LlmContentClassifier.js.map +1 -0
- package/dist/core/processor/classifier/NER_DOMAIN_EXAMPLES.js +498 -0
- package/dist/core/processor/classifier/NER_DOMAIN_EXAMPLES.js.map +1 -0
- package/dist/core/processor/classifier/index.js +21 -0
- package/dist/core/processor/classifier/index.js.map +1 -0
- package/dist/core/processor/classifier/mergeClassifications.js +32 -0
- package/dist/core/processor/classifier/mergeClassifications.js.map +1 -0
- package/dist/core/processor/index.js +20 -0
- package/dist/core/processor/index.js.map +1 -0
- package/dist/core/processor/readers/AudioReader.js +462 -0
- package/dist/core/processor/readers/AudioReader.js.map +1 -0
- package/dist/core/processor/readers/BinaryReader.js +90 -0
- package/dist/core/processor/readers/BinaryReader.js.map +1 -0
- package/dist/core/processor/readers/ChandraPdfReader.js +187 -0
- package/dist/core/processor/readers/ChandraPdfReader.js.map +1 -0
- package/dist/core/processor/readers/ChatExportReader.js +365 -0
- package/dist/core/processor/readers/ChatExportReader.js.map +1 -0
- package/dist/core/processor/readers/DoclingReader.js +445 -0
- package/dist/core/processor/readers/DoclingReader.js.map +1 -0
- package/dist/core/processor/readers/EmailReader.js +259 -0
- package/dist/core/processor/readers/EmailReader.js.map +1 -0
- package/dist/core/processor/readers/EpubReader.js +175 -0
- package/dist/core/processor/readers/EpubReader.js.map +1 -0
- package/dist/core/processor/readers/FileReader.js +90 -0
- package/dist/core/processor/readers/FileReader.js.map +1 -0
- package/dist/core/processor/readers/FileReaderFactory.js +49 -0
- package/dist/core/processor/readers/FileReaderFactory.js.map +1 -0
- package/dist/core/processor/readers/HtmlReader.js +371 -0
- package/dist/core/processor/readers/HtmlReader.js.map +1 -0
- package/dist/core/processor/readers/ImageReader.js +162 -0
- package/dist/core/processor/readers/ImageReader.js.map +1 -0
- package/dist/core/processor/readers/JsonFileReader.js +232 -0
- package/dist/core/processor/readers/JsonFileReader.js.map +1 -0
- package/dist/core/processor/readers/JupyterReader.js +178 -0
- package/dist/core/processor/readers/JupyterReader.js.map +1 -0
- package/dist/core/processor/readers/LatexReader.js +176 -0
- package/dist/core/processor/readers/LatexReader.js.map +1 -0
- package/dist/core/processor/readers/MarkdownReader.js +289 -0
- package/dist/core/processor/readers/MarkdownReader.js.map +1 -0
- package/dist/core/processor/readers/MarkerPdfReader.js +193 -0
- package/dist/core/processor/readers/MarkerPdfReader.js.map +1 -0
- package/dist/core/processor/readers/MistralOcrReader.js +198 -0
- package/dist/core/processor/readers/MistralOcrReader.js.map +1 -0
- package/dist/core/processor/readers/OfficeReader.js +174 -0
- package/dist/core/processor/readers/OfficeReader.js.map +1 -0
- package/dist/core/processor/readers/PdfReader.js +116 -0
- package/dist/core/processor/readers/PdfReader.js.map +1 -0
- package/dist/core/processor/readers/RtfReader.js +107 -0
- package/dist/core/processor/readers/RtfReader.js.map +1 -0
- package/dist/core/processor/readers/SubtitleReader.js +145 -0
- package/dist/core/processor/readers/SubtitleReader.js.map +1 -0
- package/dist/core/processor/readers/TesseractPdfReader.js +183 -0
- package/dist/core/processor/readers/TesseractPdfReader.js.map +1 -0
- package/dist/core/processor/readers/TextReader.js +129 -0
- package/dist/core/processor/readers/TextReader.js.map +1 -0
- package/dist/core/processor/readers/TranscriptReader.js +234 -0
- package/dist/core/processor/readers/TranscriptReader.js.map +1 -0
- package/dist/core/processor/readers/image/imageMetadata.js +155 -0
- package/dist/core/processor/readers/image/imageMetadata.js.map +1 -0
- package/dist/core/processor/readers/index.js +41 -0
- package/dist/core/processor/readers/index.js.map +1 -0
- package/dist/core/processor/readers/referenceExtraction.js +198 -0
- package/dist/core/processor/readers/referenceExtraction.js.map +1 -0
- package/dist/core/processor/readers/stripReferences.js +59 -0
- package/dist/core/processor/readers/stripReferences.js.map +1 -0
- package/dist/core/processor/readers/transcript/turnPacking.js +81 -0
- package/dist/core/processor/readers/transcript/turnPacking.js.map +1 -0
- package/dist/core/progress/NdjsonProgressEmitter.js +30 -0
- package/dist/core/progress/NdjsonProgressEmitter.js.map +1 -0
- package/dist/core/progress/NoopProgressEmitter.js +15 -0
- package/dist/core/progress/NoopProgressEmitter.js.map +1 -0
- package/dist/core/progress/index.js +19 -0
- package/dist/core/progress/index.js.map +1 -0
- package/dist/core/trace/TraceWriter.js +100 -0
- package/dist/core/trace/TraceWriter.js.map +1 -0
- package/dist/core/trace/events.js +13 -0
- package/dist/core/trace/events.js.map +1 -0
- package/dist/core/trace/index.js +20 -0
- package/dist/core/trace/index.js.map +1 -0
- package/dist/core/trace/lineage.js +97 -0
- package/dist/core/trace/lineage.js.map +1 -0
- package/dist/evaluation/BenchmarkRunner.js +171 -0
- package/dist/evaluation/BenchmarkRunner.js.map +1 -0
- package/dist/evaluation/classifier/ClassifierAccuracy.js +185 -0
- package/dist/evaluation/classifier/ClassifierAccuracy.js.map +1 -0
- package/dist/evaluation/classifier/labeledSamples.js +379 -0
- package/dist/evaluation/classifier/labeledSamples.js.map +1 -0
- package/dist/evaluation/compare/goldCompare.js +126 -0
- package/dist/evaluation/compare/goldCompare.js.map +1 -0
- package/dist/evaluation/crossre/compareScoring.js +30 -0
- package/dist/evaluation/crossre/compareScoring.js.map +1 -0
- package/dist/evaluation/datasets/CrossREDataset.js +170 -0
- package/dist/evaluation/datasets/CrossREDataset.js.map +1 -0
- package/dist/evaluation/datasets/IDataset.js +3 -0
- package/dist/evaluation/datasets/IDataset.js.map +1 -0
- package/dist/evaluation/datasets/RebelDataset.js +117 -0
- package/dist/evaluation/datasets/RebelDataset.js.map +1 -0
- package/dist/evaluation/datasets/RedocredDataset.js +218 -0
- package/dist/evaluation/datasets/RedocredDataset.js.map +1 -0
- package/dist/evaluation/datasets/SemEval2010Dataset.js +150 -0
- package/dist/evaluation/datasets/SemEval2010Dataset.js.map +1 -0
- package/dist/evaluation/index.js +33 -0
- package/dist/evaluation/index.js.map +1 -0
- package/dist/evaluation/matching/ExactMatcher.js +75 -0
- package/dist/evaluation/matching/ExactMatcher.js.map +1 -0
- package/dist/evaluation/matching/SemanticMatcher.js +143 -0
- package/dist/evaluation/matching/SemanticMatcher.js.map +1 -0
- package/dist/evaluation/metrics/TripleMetrics.js +64 -0
- package/dist/evaluation/metrics/TripleMetrics.js.map +1 -0
- package/dist/evaluation/mine/MineCheckpoint.js +114 -0
- package/dist/evaluation/mine/MineCheckpoint.js.map +1 -0
- package/dist/evaluation/mine/MineDataset.js +208 -0
- package/dist/evaluation/mine/MineDataset.js.map +1 -0
- package/dist/evaluation/mine/MineReporter.js +98 -0
- package/dist/evaluation/mine/MineReporter.js.map +1 -0
- package/dist/evaluation/mine/MineRunner.js +148 -0
- package/dist/evaluation/mine/MineRunner.js.map +1 -0
- package/dist/evaluation/mine/MineScorer.js +127 -0
- package/dist/evaluation/mine/MineScorer.js.map +1 -0
- package/dist/evaluation/mine/types.js +12 -0
- package/dist/evaluation/mine/types.js.map +1 -0
- package/dist/evaluation/reporters/ConsoleReporter.js +55 -0
- package/dist/evaluation/reporters/ConsoleReporter.js.map +1 -0
- package/dist/evaluation/reporters/JsonReporter.js +50 -0
- package/dist/evaluation/reporters/JsonReporter.js.map +1 -0
- package/dist/index.js +28 -0
- package/dist/index.js.map +1 -0
- package/dist/quality/CompositeScore.js +61 -0
- package/dist/quality/CompositeScore.js.map +1 -0
- package/dist/quality/ConsistencyMetrics.js +70 -0
- package/dist/quality/ConsistencyMetrics.js.map +1 -0
- package/dist/quality/FactualMetrics.js +76 -0
- package/dist/quality/FactualMetrics.js.map +1 -0
- package/dist/quality/GraphHealthMetrics.js +68 -0
- package/dist/quality/GraphHealthMetrics.js.map +1 -0
- package/dist/quality/SemanticMetrics.js +102 -0
- package/dist/quality/SemanticMetrics.js.map +1 -0
- package/dist/quality/StructuralMetrics.js +60 -0
- package/dist/quality/StructuralMetrics.js.map +1 -0
- package/dist/quality/index.js +23 -0
- package/dist/quality/index.js.map +1 -0
- package/dist/shared/index.js +20 -0
- package/dist/shared/index.js.map +1 -0
- package/dist/shared/logger/Logger.js +3 -0
- package/dist/shared/logger/Logger.js.map +1 -0
- package/dist/shared/logger/LoggerFactory.js +75 -0
- package/dist/shared/logger/LoggerFactory.js.map +1 -0
- package/dist/shared/logger/index.js +19 -0
- package/dist/shared/logger/index.js.map +1 -0
- package/dist/shared/shutdown.js +30 -0
- package/dist/shared/shutdown.js.map +1 -0
- package/dist/shared/utils/agglomerativeCluster.js +269 -0
- package/dist/shared/utils/agglomerativeCluster.js.map +1 -0
- package/dist/shared/utils/astSymbols.js +69 -0
- package/dist/shared/utils/astSymbols.js.map +1 -0
- package/dist/shared/utils/cosineSimilarity.js +18 -0
- package/dist/shared/utils/cosineSimilarity.js.map +1 -0
- package/dist/shared/utils/directoryTree.js +184 -0
- package/dist/shared/utils/directoryTree.js.map +1 -0
- package/dist/shared/utils/documentOutline.js +74 -0
- package/dist/shared/utils/documentOutline.js.map +1 -0
- package/dist/shared/utils/index.js +24 -0
- package/dist/shared/utils/index.js.map +1 -0
- package/dist/shared/utils/jaroWinklerSimilarity.js +60 -0
- package/dist/shared/utils/jaroWinklerSimilarity.js.map +1 -0
- package/dist/shared/utils/parseJsonLenient.js +27 -0
- package/dist/shared/utils/parseJsonLenient.js.map +1 -0
- package/dist/shared/utils/readConfig.js +42 -0
- package/dist/shared/utils/readConfig.js.map +1 -0
- package/dist/shared/utils/readRtf.js +216 -0
- package/dist/shared/utils/readRtf.js.map +1 -0
- package/dist/shared/utils/softmax.js +26 -0
- package/dist/shared/utils/softmax.js.map +1 -0
- package/dist/types/ContentClass.js +3 -0
- package/dist/types/ContentClass.js.map +1 -0
- package/dist/types/CorpusProfile.js +3 -0
- package/dist/types/CorpusProfile.js.map +1 -0
- package/dist/types/IContradictionChecker.js +3 -0
- package/dist/types/IContradictionChecker.js.map +1 -0
- package/dist/types/ICorpusAnalyzer.js +3 -0
- package/dist/types/ICorpusAnalyzer.js.map +1 -0
- package/dist/types/IDirectoryProcessor.js +3 -0
- package/dist/types/IDirectoryProcessor.js.map +1 -0
- package/dist/types/IEmbeddingProvider.js +3 -0
- package/dist/types/IEmbeddingProvider.js.map +1 -0
- package/dist/types/IEmbeddingService.js +6 -0
- package/dist/types/IEmbeddingService.js.map +1 -0
- package/dist/types/IFileProcessor.js +3 -0
- package/dist/types/IFileProcessor.js.map +1 -0
- package/dist/types/IGroundingChecker.js +3 -0
- package/dist/types/IGroundingChecker.js.map +1 -0
- package/dist/types/IKnowledgeGraphBuilder.js +3 -0
- package/dist/types/IKnowledgeGraphBuilder.js.map +1 -0
- package/dist/types/IKnowledgeGraphExporter.js +3 -0
- package/dist/types/IKnowledgeGraphExporter.js.map +1 -0
- package/dist/types/IKnowledgeGraphMerger.js +3 -0
- package/dist/types/IKnowledgeGraphMerger.js.map +1 -0
- package/dist/types/IKnowledgeGraphSearch.js +3 -0
- package/dist/types/IKnowledgeGraphSearch.js.map +1 -0
- package/dist/types/ILLMProvider.js +3 -0
- package/dist/types/ILLMProvider.js.map +1 -0
- package/dist/types/ILLMService.js +3 -0
- package/dist/types/ILLMService.js.map +1 -0
- package/dist/types/IObjectDetector.js +3 -0
- package/dist/types/IObjectDetector.js.map +1 -0
- package/dist/types/IProcessingService.js +3 -0
- package/dist/types/IProcessingService.js.map +1 -0
- package/dist/types/IProgressEmitter.js +3 -0
- package/dist/types/IProgressEmitter.js.map +1 -0
- package/dist/types/IPromptManager.js +3 -0
- package/dist/types/IPromptManager.js.map +1 -0
- package/dist/types/KnowledgeGraph.js +3 -0
- package/dist/types/KnowledgeGraph.js.map +1 -0
- package/dist/types/MCPKnowledgeGraph.js +3 -0
- package/dist/types/MCPKnowledgeGraph.js.map +1 -0
- package/dist/types/Observation.js +21 -0
- package/dist/types/Observation.js.map +1 -0
- package/dist/types/ProcessingOptions.js +3 -0
- package/dist/types/ProcessingOptions.js.map +1 -0
- package/dist/types/index.js +40 -0
- package/dist/types/index.js.map +1 -0
- package/package.json +122 -0
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Compiler-emitted module-interop helper: exposes property `k` of module `m` on object `o`
// under the name `k2` (`k2` defaults to `k` when undefined).
// An existing `this.__createBinding` is reused when the surrounding `this` provides one.
// When `Object.create` is available, the property is installed with Object.defineProperty:
// `m`'s own descriptor for `k` is used as-is unless it is missing, is an accessor on a module
// not flagged `__esModule`, or is a writable/configurable data property; in those cases an
// enumerable getter returning `m[k]` is installed instead, so later changes to `m[k]` stay visible.
// Without `Object.create`, the value is simply copied (`o[k2] = m[k]`).
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
// Compiler-emitted module-interop helper: stores `v` as the `default` property of `o`.
// An existing `this.__setModuleDefault` is reused when present. With `Object.create` available
// the property is defined as enumerable via Object.defineProperty; otherwise it is assigned directly.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
// Compiler-emitted module-interop helper used for namespace-style imports of `require`d modules.
// An existing `this.__importStar` is reused when present.
// The returned function hands back `mod` unchanged when `mod.__esModule` is truthy. Otherwise it
// builds a fresh object, binds every own property name of `mod` except "default" onto it through
// __createBinding, and finally stores `mod` itself as the `default` property via __setModuleDefault.
// `ownKeys` replaces itself on its first call with Object.getOwnPropertyNames, or with a
// for-in + hasOwnProperty fallback when that function is unavailable.
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
// Compiler-emitted helper that drives a generator function as an asynchronous routine.
// An existing `this.__awaiter` is reused when present.
// `generator` is invoked with `thisArg` and `_arguments` (or an empty list), and the helper returns
// a new `P` (falling back to the global Promise when `P` is falsy).
// Each yielded value is wrapped into a `P` by `adopt` unless it already is one; its fulfilment is
// fed back through `generator.next`, its rejection through `generator.throw`.
// The returned promise resolves with the generator's final value and rejects with any exception
// thrown while stepping the generator.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
36
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
37
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
38
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
39
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
40
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
41
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
42
|
+
});
|
|
43
|
+
};
|
|
44
|
+
// Flag this CommonJS module with `__esModule` so interop helpers treat it as an ES-module namespace.
Object.defineProperty(exports, "__esModule", { value: true });
|
|
45
|
+
// Pre-declare the four public exports as undefined; the real values are assigned further down the file.
exports.ImageBufferResolver = exports.MarkdownProcessor = exports.MarkdownReader = exports.MarkdownReadError = void 0;
|
|
46
|
+
const fs = __importStar(require("fs"));
|
|
47
|
+
const FileReader_1 = require("./FileReader");
|
|
48
|
+
const stripReferences_1 = require("./stripReferences");
|
|
49
|
+
const referenceExtraction_1 = require("./referenceExtraction");
|
|
50
|
+
/**
 * Custom error for markdown reading failures.
 *
 * @param {string} message - Human-readable description of the failure.
 * @param {{ cause?: unknown }} [options] - Standard `Error` options. `cause`
 *   carries the underlying error so callers can still inspect it after wrapping.
 */
class MarkdownReadError extends Error {
    constructor(message, options) {
        // Forward `options` so the native Error constructor records `cause`;
        // previously the argument was accepted but silently discarded.
        super(message, options);
        this.name = 'MarkdownReadError';
        // Runtimes without Error-cause support ignore the second argument;
        // backfill `cause` there, non-enumerable like the native property.
        if (options != null && typeof options === 'object' && 'cause' in options && !('cause' in this)) {
            Object.defineProperty(this, 'cause', {
                value: options.cause,
                writable: true,
                enumerable: false,
                configurable: true,
            });
        }
    }
}
|
|
59
|
+
exports.MarkdownReadError = MarkdownReadError;
|
|
60
|
+
/**
|
|
61
|
+
* Reader for markdown text files with image extraction support
|
|
62
|
+
*/
|
|
63
|
+
class MarkdownReader extends FileReader_1.FileReader {
|
|
64
|
+
constructor(chunker, logger, stripReferences = false, extractLinks = false, extractCites = false) {
|
|
65
|
+
super([".md", ".markdown"], chunker, logger);
|
|
66
|
+
this.stripReferences = stripReferences;
|
|
67
|
+
this.extractLinks = extractLinks;
|
|
68
|
+
this.extractCites = extractCites;
|
|
69
|
+
}
|
|
70
|
+
getName() {
|
|
71
|
+
return "MarkdownReader";
|
|
72
|
+
}
|
|
73
|
+
read(filePath) {
|
|
74
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
75
|
+
yield this.validateFile(filePath);
|
|
76
|
+
try {
|
|
77
|
+
this.logger.debug(`Reading markdown file: ${filePath}`);
|
|
78
|
+
const markdownContent = yield MarkdownProcessor.processFile(filePath);
|
|
79
|
+
const fullText = markdownContent.text; // pre-strip, for reference extraction
|
|
80
|
+
// Detect the trailing bibliography once if either consumer needs it.
|
|
81
|
+
const split = this.stripReferences || this.extractCites
|
|
82
|
+
? (0, stripReferences_1.splitTrailingReferences)(markdownContent.text)
|
|
83
|
+
: undefined;
|
|
84
|
+
if (this.stripReferences && (split === null || split === void 0 ? void 0 : split.references)) {
|
|
85
|
+
this.logger.info(`Quarantined trailing references section of ${filePath} (${split.references.length} chars)`);
|
|
86
|
+
markdownContent.text = split.body;
|
|
87
|
+
}
|
|
88
|
+
// Reference extraction, gated by config. Markdown `[t](u)`/`[[wiki]]` links
|
|
89
|
+
// PLUS bare URLs (web-clip `> source:` headers etc.); external ones feed the
|
|
90
|
+
// Phase-1 fetcher, internal ones the Phase-0 resolver.
|
|
91
|
+
const references = {};
|
|
92
|
+
if (this.extractLinks) {
|
|
93
|
+
const seen = new Set();
|
|
94
|
+
const links = [...(0, referenceExtraction_1.extractMarkdownLinks)(fullText), ...(0, referenceExtraction_1.extractBareUrls)(fullText)].filter((l) => (seen.has(l.target) ? false : (seen.add(l.target), true)));
|
|
95
|
+
if (links.length)
|
|
96
|
+
references.links = links;
|
|
97
|
+
}
|
|
98
|
+
if (this.extractCites) {
|
|
99
|
+
const cites = (0, referenceExtraction_1.extractCitations)(split === null || split === void 0 ? void 0 : split.references, fullText);
|
|
100
|
+
if (cites.length)
|
|
101
|
+
references.citations = cites;
|
|
102
|
+
}
|
|
103
|
+
const hasRefs = !!(references.links || references.citations);
|
|
104
|
+
const chunks = yield this.chunker.chunk(markdownContent.text);
|
|
105
|
+
const enrichedChunks = yield this.enrichChunksWithImages(chunks, markdownContent.images);
|
|
106
|
+
this.logImageExtractionResults(markdownContent.images.length);
|
|
107
|
+
return {
|
|
108
|
+
chunks: enrichedChunks,
|
|
109
|
+
metadata: Object.assign({ type: "text", encoding: "utf-8", size: markdownContent.text.length, imageCount: markdownContent.images.length }, (hasRefs ? { references } : {})),
|
|
110
|
+
};
|
|
111
|
+
}
|
|
112
|
+
catch (error) {
|
|
113
|
+
const errorMessage = `Failed to read markdown file ${filePath}: ${error}`;
|
|
114
|
+
this.logger.error(errorMessage);
|
|
115
|
+
throw new MarkdownReadError(errorMessage, { cause: error });
|
|
116
|
+
}
|
|
117
|
+
});
|
|
118
|
+
}
|
|
119
|
+
enrichChunksWithImages(chunks, images) {
|
|
120
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
121
|
+
return Promise.all(chunks.map((chunk) => __awaiter(this, void 0, void 0, function* () {
|
|
122
|
+
const imageReferences = this.extractImageReferences(chunk.content);
|
|
123
|
+
const imageBuffers = yield this.resolveImageBuffers(imageReferences, images);
|
|
124
|
+
return {
|
|
125
|
+
content: chunk.content,
|
|
126
|
+
images: imageBuffers,
|
|
127
|
+
startOffset: chunk.startOffset,
|
|
128
|
+
endOffset: chunk.endOffset,
|
|
129
|
+
index: chunk.index,
|
|
130
|
+
totalChunks: chunk.totalChunks,
|
|
131
|
+
};
|
|
132
|
+
})));
|
|
133
|
+
});
|
|
134
|
+
}
|
|
135
|
+
extractImageReferences(content) {
|
|
136
|
+
const imagePattern = /!\[.*?\]\((\d+)\)/g;
|
|
137
|
+
const references = [];
|
|
138
|
+
let match;
|
|
139
|
+
while ((match = imagePattern.exec(content)) !== null) {
|
|
140
|
+
const index = parseInt(match[1], 10);
|
|
141
|
+
if (!isNaN(index)) {
|
|
142
|
+
references.push(index);
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
return references;
|
|
146
|
+
}
|
|
147
|
+
resolveImageBuffers(references, images) {
|
|
148
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
149
|
+
const results = [];
|
|
150
|
+
for (const ref of references) {
|
|
151
|
+
if (ref >= 0 && ref < images.length) {
|
|
152
|
+
try {
|
|
153
|
+
const buffer = yield ImageBufferResolver.resolve(images[ref]);
|
|
154
|
+
results.push({
|
|
155
|
+
buffer: buffer,
|
|
156
|
+
alt: images[ref].alt,
|
|
157
|
+
path: images[ref].path || images[ref].url,
|
|
158
|
+
});
|
|
159
|
+
}
|
|
160
|
+
catch (error) {
|
|
161
|
+
this.logger.warn(`Failed to resolve image at index ${ref}: ${error}`);
|
|
162
|
+
// Continue processing other images instead of failing completely
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
else {
|
|
166
|
+
this.logger.warn(`Invalid image reference: ${ref}`);
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
return results;
|
|
170
|
+
});
|
|
171
|
+
}
|
|
172
|
+
logImageExtractionResults(imageCount) {
|
|
173
|
+
if (imageCount > 0) {
|
|
174
|
+
this.logger.debug(`Extracted ${imageCount} images from markdown content`);
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
exports.MarkdownReader = MarkdownReader;
|
|
179
|
+
/**
 * Handles processing of markdown files and image extraction.
 *
 * Every markdown image (`![alt](source)`) is pulled out of the text into an
 * `images` array and replaced in place by an indexed placeholder
 * `![alt](<position in images>)`. Code that later scans the text for
 * `![...](<digits>)` can then map each placeholder back to its metadata.
 *
 * `IMAGE_PATTERN` is assigned as a static property right after the class body.
 */
class MarkdownProcessor {
    /**
     * Read a markdown file as UTF-8 and extract its images.
     *
     * @param filePath Path of the markdown file.
     * @returns Promise of `{ text, images }` (see `extractImagesFromContent`).
     * @throws MarkdownReadError wrapping the underlying error as `cause`.
     */
    static processFile(filePath) {
        return __awaiter(this, void 0, void 0, function* () {
            try {
                const content = yield fs.promises.readFile(filePath, "utf-8");
                return this.extractImagesFromContent(content);
            }
            catch (error) {
                throw new MarkdownReadError(`Failed to process markdown file: ${filePath}`, { cause: error });
            }
        });
    }
    /**
     * Parse the first image found in a markdown snippet.
     *
     * @param imageMarkdown Text expected to contain `![alt](source)`.
     * @returns Image metadata as produced by `parseImageSource`.
     * @throws MarkdownReadError when the snippet contains no image syntax.
     */
    static extractImageMetadata(imageMarkdown) {
        const matches = Array.from(imageMarkdown.matchAll(this.IMAGE_PATTERN));
        if (matches.length === 0) {
            throw new MarkdownReadError(`Invalid image markdown syntax: ${imageMarkdown}`);
        }
        const [, alt, source] = matches[0];
        return this.parseImageSource(alt, source);
    }
    /**
     * Classify an image source string.
     *
     * @param alt Alt text of the image.
     * @param source Text between the parentheses of the image markdown.
     * @returns `{ alt, url }` for http(s) sources, `{ alt, base64 }` for
     *   base64 `data:image/...` URIs, otherwise `{ alt, path }`.
     */
    static parseImageSource(alt, source) {
        if (source.startsWith("http://") || source.startsWith("https://")) {
            return { alt, url: source };
        }
        if (source.startsWith("data:image/")) {
            const base64Match = source.match(/^data:image\/[^;]+;base64,(.+)$/);
            if (base64Match) {
                return { alt, base64: base64Match[1] };
            }
        }
        // Assume it's a file path
        return { alt, path: source };
    }
    /**
     * Replace every image in `content` with an indexed placeholder and collect
     * the image metadata.
     *
     * @param content Raw markdown text.
     * @returns `{ text, images }` where `text` contains `![alt](i)` for the
     *   i-th entry of `images`.
     */
    static extractImagesFromContent(content) {
        const images = [];
        const processedText = content.replace(this.IMAGE_PATTERN, (match, alt, source) => {
            try {
                const imageMetadata = this.parseImageSource(alt, source);
                images.push(imageMetadata);
                // Keep the alt text and point at the slot just filled. Returning an
                // empty string here would drop the image from the text, leaving
                // nothing for the index-based reference scan to find.
                return `![${alt}](${images.length - 1})`;
            }
            catch (error) {
                // If we can't parse the image, leave it as-is
                return match;
            }
        });
        return {
            text: processedText,
            images,
        };
    }
}
|
|
234
|
+
exports.MarkdownProcessor = MarkdownProcessor;
|
|
235
|
+
MarkdownProcessor.IMAGE_PATTERN = /!\[(.*?)\]\((.*?)\)/gim;
|
|
236
|
+
/**
 * Loads the bytes behind an image metadata record.
 *
 * A record may carry `base64`, `url` or `path`; the first one present, in
 * that order, decides how the bytes are obtained.
 */
class ImageBufferResolver {
    /**
     * @param imageMetadata Record with optional `base64`, `url`, `path`, `alt`.
     * @returns Promise of a Buffer with the image bytes.
     * @throws MarkdownReadError when no source is set or loading fails.
     */
    static resolve(imageMetadata) {
        return __awaiter(this, void 0, void 0, function* () {
            const inlineData = imageMetadata.base64;
            if (inlineData) {
                return this.resolveBase64(inlineData);
            }
            const remote = imageMetadata.url;
            if (remote) {
                return this.resolveUrl(remote);
            }
            const local = imageMetadata.path;
            if (local) {
                return this.resolvePath(local);
            }
            throw new MarkdownReadError(`No valid image source found for image: ${imageMetadata.alt || 'unknown'}`);
        });
    }
    /** Decode a base64 payload into a Buffer. */
    static resolveBase64(base64Data) {
        let decoded;
        try {
            decoded = Buffer.from(base64Data, "base64");
        }
        catch (error) {
            throw new MarkdownReadError("Invalid base64 image data", { cause: error });
        }
        return decoded;
    }
    /** Download an image; any non-OK HTTP status is reported as a failure. */
    static resolveUrl(url) {
        return __awaiter(this, void 0, void 0, function* () {
            try {
                const response = yield fetch(url);
                if (response.ok) {
                    const bytes = yield response.arrayBuffer();
                    return Buffer.from(bytes);
                }
                throw new Error(`HTTP ${response.status}: ${response.statusText}`);
            }
            catch (error) {
                throw new MarkdownReadError(`Failed to fetch image from URL: ${url}`, { cause: error });
            }
        });
    }
    /** Read an image file from disk. */
    static resolvePath(filePath) {
        return __awaiter(this, void 0, void 0, function* () {
            try {
                const bytes = yield fs.promises.readFile(filePath);
                return bytes;
            }
            catch (error) {
                throw new MarkdownReadError(`Failed to read image file: ${filePath}`, { cause: error });
            }
        });
    }
}
|
|
288
|
+
exports.ImageBufferResolver = ImageBufferResolver;
|
|
289
|
+
//# sourceMappingURL=MarkdownReader.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"MarkdownReader.js","sourceRoot":"","sources":["../../../../src/core/processor/readers/MarkdownReader.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,uCAAyB;AACzB,6CAAoF;AAIpF,uDAA4D;AAC5D,+DAK+B;AAoB/B;;GAEG;AACH,MAAa,iBAAkB,SAAQ,KAAK;IAC1C,YAAY,OAAe,EAAE,OAAa;QACxC,KAAK,CAAC,OAAO,CAAC,CAAC;QACf,IAAI,CAAC,IAAI,GAAG,mBAAmB,CAAC;IAClC,CAAC;CACF;AALD,8CAKC;AAED;;GAEG;AACH,MAAa,cAAe,SAAQ,uBAAU;IAC5C,YACE,OAAoB,EACpB,MAAc,EACG,kBAA2B,KAAK,EAChC,eAAwB,KAAK,EAC7B,eAAwB,KAAK;QAE9C,KAAK,CAAC,CAAC,KAAK,EAAE,WAAW,CAAC,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;QAJ5B,oBAAe,GAAf,eAAe,CAAiB;QAChC,iBAAY,GAAZ,YAAY,CAAiB;QAC7B,iBAAY,GAAZ,YAAY,CAAiB;IAGhD,CAAC;IAED,OAAO;QACL,OAAO,gBAAgB,CAAC;IAC1B,CAAC;IAEK,IAAI,CAAC,QAAgB;;YACzB,MAAM,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC;YAElC,IAAI,CAAC;gBACH,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,0BAA0B,QAAQ,EAAE,CAAC,CAAC;gBAExD,MAAM,eAAe,GAAG,MAAM,iBAAiB,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC;gBACtE,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC,sCAAsC;gBAE7E,qEAAqE;gBACrE,MAAM,KAAK,GACT,IAAI,CAAC,eAAe,IAAI,IAAI,CAAC,YAAY;oBACvC,CAAC,CAAC,IAAA,yCAAuB,EAAC,eAAe,CAAC,IAAI,CAAC;oBAC/C,CAAC,CAAC,SAAS,CAAC;gBAChB,IAAI,IAAI,CAAC,eAAe,KAAI,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,UAAU,CAAA,EAAE,CAAC;oBAC9C,IAAI,CAAC,MAAM,CAAC,IAAI,CACd,8CAA8C,QAAQ,KAAK,KAAK,CAAC,UAAU,CAAC,MAAM,SAAS,CAC5F,CAAC;oBACF,eAAe,CAAC,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC;gBACpC,CAAC;gBAED,4EAA4E;gBAC5E,6EAA6E;gBAC7E,uDAAuD;gBACvD,MAAM,UAAU,GAAkB,EAAE,CAAC;gBACrC,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;oBACtB,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;oBAC/B,MAAM,KAAK,GAAG,CAAC,GAAG,IAAA,0CAAoB,EAAC,QAAQ,CAAC,EAAE,GAAG,IAAA,qCAAe,EAAC,QAAQ,CAAC,CAAC,CAAC,MAAM,CACpF,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,EAAE,IAAI,CAAC,CAAC,CACjE,CAAC;oBACF,IAAI,KAAK,CAAC,MAAM;wBAAE,UAAU,CAAC,KAAK,GAAG,KAAK,CAAC;gBAC7C,CAAC;gBACD,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;oBACtB,MAAM,KAAK,GAAG,IAAA,sCAAgB,EAAC,KAAK,aAAL,KAAK,u
BAAL,KAAK,CAAE,UAAU,EAAE,QAAQ,CAAC,CAAC;oBAC5D,IAAI,KAAK,CAAC,MAAM;wBAAE,UAAU,CAAC,SAAS,GAAG,KAAK,CAAC;gBACjD,CAAC;gBACD,MAAM,OAAO,GAAG,CAAC,CAAC,CAAC,UAAU,CAAC,KAAK,IAAI,UAAU,CAAC,SAAS,CAAC,CAAC;gBAE7D,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC;gBAE9D,MAAM,cAAc,GAAG,MAAM,IAAI,CAAC,sBAAsB,CACtD,MAAM,EACN,eAAe,CAAC,MAAM,CACvB,CAAC;gBAEF,IAAI,CAAC,yBAAyB,CAAC,eAAe,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;gBAE9D,OAAO;oBACL,MAAM,EAAE,cAAc;oBACtB,QAAQ,kBACN,IAAI,EAAE,MAAM,EACZ,QAAQ,EAAE,OAAO,EACjB,IAAI,EAAE,eAAe,CAAC,IAAI,CAAC,MAAM,EACjC,UAAU,EAAE,eAAe,CAAC,MAAM,CAAC,MAAM,IACtC,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,UAAU,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CACnC;iBACF,CAAC;YACJ,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,YAAY,GAAG,gCAAgC,QAAQ,KAAK,KAAK,EAAE,CAAC;gBAC1E,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;gBAChC,MAAM,IAAI,iBAAiB,CAAC,YAAY,EAAE,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,CAAC;YAC9D,CAAC;QACH,CAAC;KAAA;IAEa,sBAAsB,CAClC,MAAwB,EACxB,MAAuB;;YAEvB,OAAO,OAAO,CAAC,GAAG,CAChB,MAAM,CAAC,GAAG,CAAC,CAAO,KAAK,EAAE,EAAE;gBACzB,MAAM,eAAe,GAAG,IAAI,CAAC,sBAAsB,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;gBACnE,MAAM,YAAY,GAAG,MAAM,IAAI,CAAC,mBAAmB,CAAC,eAAe,EAAE,MAAM,CAAC,CAAC;gBAE7E,OAAO;oBACL,OAAO,EAAE,KAAK,CAAC,OAAO;oBACtB,MAAM,EAAE,YAAY;oBACpB,WAAW,EAAE,KAAK,CAAC,WAAW;oBAC9B,SAAS,EAAE,KAAK,CAAC,SAAS;oBAC1B,KAAK,EAAE,KAAK,CAAC,KAAK;oBAClB,WAAW,EAAE,KAAK,CAAC,WAAW;iBAC/B,CAAC;YACJ,CAAC,CAAA,CAAC,CACH,CAAC;QACJ,CAAC;KAAA;IAEO,sBAAsB,CAAC,OAAe;QAC5C,MAAM,YAAY,GAAG,oBAAoB,CAAC;QAC1C,MAAM,UAAU,GAAa,EAAE,CAAC;QAChC,IAAI,KAAK,CAAC;QAEV,OAAO,CAAC,KAAK,GAAG,YAAY,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACrD,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACrC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;gBAClB,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACzB,CAAC;QACH,CAAC;QAED,OAAO,UAAU,CAAC;IACpB,CAAC;IAEa,mBAAmB,CAC/B,UAAoB,EACpB,MAAuB;;YAEvB,MAAM,OAAO,GAAkB,EAAE,CAAC;YAElC,KAAK,MAAM,GAAG,IAAI,UAAU,EAAE,CAAC;gBAC7B,IAAI,GAAG,IAAI,CAAC,IAAI,GAAG,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC;
oBACpC,IAAI,CAAC;wBACH,MAAM,MAAM,GAAG,MAAM,mBAAmB,CAAC,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;wBAC9D,OAAO,CAAC,IAAI,CAAC;4BACX,MAAM,EAAE,MAAM;4BACd,GAAG,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC,GAAG;4BACpB,IAAI,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC,IAAI,IAAI,MAAM,CAAC,GAAG,CAAC,CAAC,GAAG;yBAC1C,CAAC,CAAC;oBACL,CAAC;oBAAC,OAAO,KAAK,EAAE,CAAC;wBACf,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,oCAAoC,GAAG,KAAK,KAAK,EAAE,CAAC,CAAC;wBACtE,iEAAiE;oBACnE,CAAC;gBACH,CAAC;qBAAM,CAAC;oBACN,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,4BAA4B,GAAG,EAAE,CAAC,CAAC;gBACtD,CAAC;YACH,CAAC;YAED,OAAO,OAAO,CAAC;QACjB,CAAC;KAAA;IAEO,yBAAyB,CAAC,UAAkB;QAClD,IAAI,UAAU,GAAG,CAAC,EAAE,CAAC;YACnB,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,aAAa,UAAU,+BAA+B,CAAC,CAAC;QAC5E,CAAC;IACH,CAAC;CACF;AAnJD,wCAmJC;AAED;;GAEG;AACH,MAAa,iBAAiB;IAGrB,MAAM,CAAO,WAAW,CAAC,QAAgB;;YAC9C,IAAI,CAAC;gBACH,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;gBAC9D,OAAO,IAAI,CAAC,wBAAwB,CAAC,OAAO,CAAC,CAAC;YAChD,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,IAAI,iBAAiB,CAAC,oCAAoC,QAAQ,EAAE,EAAE,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,CAAC;YAChG,CAAC;QACH,CAAC;KAAA;IAEM,MAAM,CAAC,oBAAoB,CAAC,aAAqB;QACtD,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,aAAa,CAAC,QAAQ,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC;QAEvE,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACzB,MAAM,IAAI,iBAAiB,CAAC,kCAAkC,aAAa,EAAE,CAAC,CAAC;QACjF,CAAC;QAED,MAAM,CAAC,EAAE,GAAG,EAAE,MAAM,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;QACnC,OAAO,IAAI,CAAC,gBAAgB,CAAC,GAAG,EAAE,MAAM,CAAC,CAAC;IAC5C,CAAC;IAEO,MAAM,CAAC,gBAAgB,CAAC,GAAW,EAAE,MAAc;QACzD,IAAI,MAAM,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,MAAM,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;YAClE,OAAO,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,EAAE,CAAC;QAC9B,CAAC;QAED,IAAI,MAAM,CAAC,UAAU,CAAC,aAAa,CAAC,EAAE,CAAC;YACrC,MAAM,WAAW,GAAG,MAAM,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAC;YACpE,IAAI,WAAW,EAAE,CAAC;gBAChB,OAAO,EAAE,GAAG,EAAE,MAAM,EAAE,WAAW,CAAC,CAAC,CAAC,EAAE,CAAC;YACzC,CAAC;QACH,CAAC;QAED,0BAA0B;QAC1B,OAAO,EAAE,GAAG,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC;IAC/B,CAAC;IAEO,MAAM,CAAC,wBAAwB,CAAC,OAAe;QACrD,MAAM,MAAM,GAAoB,EAA
E,CAAC;QAEnC,MAAM,aAAa,GAAG,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC,aAAa,EAAE,CAAC,KAAK,EAAE,GAAG,EAAE,MAAM,EAAE,EAAE;YAC/E,IAAI,CAAC;gBACH,MAAM,aAAa,GAAG,IAAI,CAAC,gBAAgB,CAAC,GAAG,EAAE,MAAM,CAAC,CAAC;gBACzD,MAAM,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;gBAC3B,OAAO,KAAK,GAAG,KAAK,MAAM,CAAC,MAAM,GAAG,CAAC,GAAG,CAAC;YAC3C,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,8CAA8C;gBAC9C,OAAO,KAAK,CAAC;YACf,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,OAAO;YACL,IAAI,EAAE,aAAa;YACnB,MAAM;SACP,CAAC;IACJ,CAAC;;AAzDH,8CA0DC;AAzDyB,+BAAa,GAAG,wBAAwB,CAAC;AA2DnE;;GAEG;AACH,MAAa,mBAAmB;IACvB,MAAM,CAAO,OAAO,CAAC,aAA4B;;YACtD,IAAI,aAAa,CAAC,MAAM,EAAE,CAAC;gBACzB,OAAO,IAAI,CAAC,aAAa,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;YAClD,CAAC;YAED,IAAI,aAAa,CAAC,GAAG,EAAE,CAAC;gBACtB,OAAO,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC;YAC5C,CAAC;YAED,IAAI,aAAa,CAAC,IAAI,EAAE,CAAC;gBACvB,OAAO,IAAI,CAAC,WAAW,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;YAC9C,CAAC;YAED,MAAM,IAAI,iBAAiB,CACzB,0CAA0C,aAAa,CAAC,GAAG,IAAI,SAAS,EAAE,CAC3E,CAAC;QACJ,CAAC;KAAA;IAEO,MAAM,CAAC,aAAa,CAAC,UAAkB;QAC7C,IAAI,CAAC;YACH,OAAO,MAAM,CAAC,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC;QAC3C,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,iBAAiB,CAAC,2BAA2B,EAAE,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,CAAC;QAC7E,CAAC;IACH,CAAC;IAEO,MAAM,CAAO,UAAU,CAAC,GAAW;;YACzC,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;gBAElC,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;oBACjB,MAAM,IAAI,KAAK,CAAC,QAAQ,QAAQ,CAAC,MAAM,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;gBACrE,CAAC;gBAED,MAAM,WAAW,GAAG,MAAM,QAAQ,CAAC,WAAW,EAAE,CAAC;gBACjD,OAAO,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;YAClC,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,IAAI,iBAAiB,CAAC,mCAAmC,GAAG,EAAE,EAAE,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,CAAC;YAC1F,CAAC;QACH,CAAC;KAAA;IAEO,MAAM,CAAO,WAAW,CAAC,QAAgB;;YAC/C,IAAI,CAAC;gBACH,OAAO,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;YAC9C,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,IAAI,iBAAiB,CAAC,8BAA8B,QAAQ,EAAE,EAAE,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,CAAC;YAC1F,CAAC;QACH,CAAC;KAAA;CACF;AAjDD,kDAiDC"}
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// TypeScript-emitted async helper: drives a generator function as if it were an
// async function. Each yielded value is awaited, its result is fed back into
// the generator, and the returned promise settles with the generator's return
// value (or rejects with whatever it throws).
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap plain values so that every yielded value can be chained with .then().
    function toPromise(value) {
        if (value instanceof P) {
            return value;
        }
        return new P(function (settle) { settle(value); });
    }
    return new (P || (P = Promise))(function (resolve, reject) {
        function advance(result) {
            if (result.done) {
                resolve(result.value);
            }
            else {
                toPromise(result.value).then(onFulfilled, onRejected);
            }
        }
        function onFulfilled(value) {
            try {
                advance(generator.next(value));
            }
            catch (err) {
                reject(err);
            }
        }
        function onRejected(reason) {
            try {
                advance(generator["throw"](reason));
            }
            catch (err) {
                reject(err);
            }
        }
        // `generator` arrives as the generator function; replace it with the
        // iterator it produces and take the first step.
        advance((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
|
|
11
|
+
// TypeScript-emitted interop helper: modules flagged with `__esModule` are
// returned untouched, anything else is wrapped as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
|
|
14
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
15
|
+
exports.MarkerPdfReader = void 0;
|
|
16
|
+
const FileReader_1 = require("./FileReader");
|
|
17
|
+
const path_1 = __importDefault(require("path"));
|
|
18
|
+
const promises_1 = __importDefault(require("fs/promises"));
|
|
19
|
+
const child_process_1 = require("child_process");
|
|
20
|
+
/**
 * PDF reader backed by marker-pdf's `marker_single` CLI (datalab). Spawns the CLI
 * (DoclingReader pattern), reads the markdown it produces, and returns it as a
 * single content chunk (FileProcessor chunks it downstream). Claims `.pdf` only.
 *
 * marker is slow (~1GB models, minutes on CPU) so a `<pdf>.marker.md` sidecar is
 * reused when newer than the source. Any failure — missing CLI, non-zero exit,
 * termination by signal, timeout — **degrades to the injected pdf2json fallback**
 * so a run never dies on a single PDF and the default path stays portable.
 */
class MarkerPdfReader extends FileReader_1.FileReader {
    /**
     * @param opts Marker options; this class reads `command`, `forceOcr`,
     *   `useLlm` and `timeoutMs`.
     * @param llm Optional LLM settings (`apiKey`, `host`, `model`), exported to
     *   the child process as `OPENAI_*` variables when `opts.useLlm` is set.
     * @param fallback Reader whose `read(filePath)` is used when marker fails.
     * @param tempDir Directory under which per-run output folders are created.
     * @param chunker Forwarded to the FileReader base constructor.
     * @param logger Forwarded to the FileReader base constructor.
     */
    constructor(opts, llm, fallback, tempDir, chunker, logger) {
        super([".pdf"], chunker, logger);
        this.opts = opts;
        this.llm = llm;
        this.fallback = fallback;
        this.tempDir = tempDir;
        // Not awaited on purpose: ensureTempDir() handles its own errors, and
        // toMarkdown() creates its per-run directory recursively before use.
        this.ensureTempDir();
    }
    getName() {
        return "MarkerPdfReader";
    }
    adapterId() {
        return "pdf:marker";
    }
    /**
     * Convert a PDF to a single markdown chunk via marker.
     *
     * @param filePath Path of the PDF.
     * @returns Promise of `{ chunks, metadata }`; on any marker failure, the
     *   result of `fallback.read(filePath)` instead.
     */
    read(filePath) {
        return __awaiter(this, void 0, void 0, function* () {
            yield this.validateFile(filePath);
            try {
                const startTime = Date.now();
                const { markdown, cached } = yield this.toMarkdown(filePath);
                const content = markdown.trim();
                if (!content)
                    throw new Error("marker produced empty markdown");
                const stats = yield promises_1.default.stat(filePath);
                return {
                    chunks: [
                        { content, startOffset: 0, endOffset: content.length, index: 1, totalChunks: 1 },
                    ],
                    metadata: {
                        type: "pdf",
                        fileName: filePath,
                        fileSize: stats.size,
                        pdfEngine: "marker",
                        markerUseLlm: this.opts.useLlm,
                        markerCached: cached,
                        contentLength: content.length,
                        processingTimeMs: Date.now() - startTime,
                        status: "success",
                    },
                };
            }
            catch (error) {
                this.logger.warn(`Marker PDF engine failed for ${filePath} (${error.message}); falling back to pdf2json`);
                return this.fallback.read(filePath);
            }
        });
    }
    /** Run marker (or reuse the sidecar) and return its markdown. Throws on failure. */
    toMarkdown(filePath) {
        return __awaiter(this, void 0, void 0, function* () {
            const sidecar = `${filePath}.marker.md`;
            if (yield this.sidecarIsFresh(sidecar, filePath)) {
                this.logger.debug(`Reusing marker sidecar: ${sidecar}`);
                return { markdown: yield promises_1.default.readFile(sidecar, "utf-8"), cached: true };
            }
            // Unique per-run output directory so concurrent conversions never collide.
            const outDir = path_1.default.resolve(path_1.default.join(this.tempDir, `marker_${Date.now()}_${Math.random().toString(36).slice(2, 9)}`));
            yield promises_1.default.mkdir(outDir, { recursive: true });
            try {
                const args = [
                    path_1.default.resolve(filePath),
                    "--output_dir", outDir,
                    "--output_format", "markdown",
                ];
                if (this.opts.forceOcr)
                    args.push("--force_ocr");
                if (this.opts.useLlm) {
                    args.push("--use_llm", "--llm_service", "marker.services.openai.OpenAIService");
                }
                this.logger.info(`Marker: ${this.opts.command} ${args.join(" ")}`);
                const result = yield this.executeCommand(this.opts.command, args);
                if (result.code !== 0) {
                    throw new Error(`marker_single exited ${result.code}${result.stderr ? `: ${result.stderr.trim().slice(-400)}` : ""}`);
                }
                const produced = yield this.findMarkdown(outDir);
                if (!produced)
                    throw new Error(`marker produced no .md under ${outDir}`);
                const markdown = yield promises_1.default.readFile(produced, "utf-8");
                // The sidecar is only a cache. If it cannot be written (for example a
                // read-only source directory), keep the finished conversion instead of
                // throwing it away and dropping to the fallback reader.
                try {
                    yield promises_1.default.writeFile(sidecar, markdown, "utf-8");
                }
                catch (error) {
                    this.logger.warn(`Could not write marker sidecar ${sidecar}: ${error}`);
                }
                return { markdown, cached: false };
            }
            finally {
                // Best-effort cleanup; a leftover temp directory must not mask the result.
                yield promises_1.default.rm(outDir, { recursive: true, force: true }).catch(() => undefined);
            }
        });
    }
    /** First `.md` found under `dir` (marker writes `<dir>/<stem>/<stem>.md`). */
    findMarkdown(dir) {
        return __awaiter(this, void 0, void 0, function* () {
            // An unreadable directory is treated as empty.
            const entries = yield promises_1.default.readdir(dir, { withFileTypes: true }).catch(() => []);
            for (const e of entries) {
                const full = path_1.default.join(dir, e.name);
                if (e.isDirectory()) {
                    const nested = yield this.findMarkdown(full);
                    if (nested)
                        return nested;
                }
                else if (e.name.toLowerCase().endsWith(".md")) {
                    return full;
                }
            }
            return undefined;
        });
    }
    /**
     * True when the sidecar exists and its mtime is not older than the PDF's.
     * Any stat failure (typically a missing sidecar) counts as "not fresh".
     */
    sidecarIsFresh(sidecar, filePath) {
        return __awaiter(this, void 0, void 0, function* () {
            try {
                const [s, a] = yield Promise.all([promises_1.default.stat(sidecar), promises_1.default.stat(filePath)]);
                return s.mtimeMs >= a.mtimeMs;
            }
            catch (_a) {
                return false;
            }
        });
    }
    /**
     * Spawn marker with captured output + timeout (DoclingReader pattern). The
     * openai-compatible LLM config is threaded as env for `--use_llm`.
     *
     * @param command Executable to launch.
     * @param args Argument vector (passed to spawn without a shell).
     * @returns Promise of `{ code, stdout, stderr }` once the process has exited
     *   with a numeric exit code.
     * @throws Error (as rejection) on launch failure, timeout, or when the
     *   process is terminated by a signal.
     */
    executeCommand(command, args) {
        return __awaiter(this, void 0, void 0, function* () {
            return new Promise((resolve, reject) => {
                const env = Object.assign(Object.assign({}, process.env), { PYTHONUNBUFFERED: "1" });
                if (this.opts.useLlm && this.llm) {
                    if (this.llm.apiKey)
                        env.OPENAI_API_KEY = this.llm.apiKey;
                    if (this.llm.host)
                        env.OPENAI_BASE_URL = this.llm.host;
                    if (this.llm.model)
                        env.OPENAI_MODEL = this.llm.model;
                }
                const options = { stdio: ["ignore", "pipe", "pipe"], env };
                const child = (0, child_process_1.spawn)(command, args, options);
                let stdout = "";
                let stderr = "";
                if (child.stdout)
                    child.stdout.on("data", (d) => (stdout += d.toString()));
                if (child.stderr)
                    child.stderr.on("data", (d) => (stderr += d.toString()));
                const timer = setTimeout(() => {
                    child.kill("SIGTERM");
                    reject(new Error(`marker timed out after ${this.opts.timeoutMs}ms`));
                }, this.opts.timeoutMs);
                child.on("error", (error) => {
                    clearTimeout(timer);
                    reject(new Error(`failed to launch marker (${command}): ${error.message}`));
                });
                child.on("close", (code, signal) => {
                    clearTimeout(timer);
                    // A null exit code means the process was ended by a signal. That is
                    // a failure and must not be reported as exit code 0. After a
                    // timeout the promise is already rejected, so this is a no-op there.
                    if (code === null || code === void 0) {
                        reject(new Error(`marker terminated by signal ${signal}`));
                        return;
                    }
                    resolve({ code, stdout, stderr });
                });
            });
        });
    }
    /** Create the temp root if missing; failures are logged, never thrown. */
    ensureTempDir() {
        return __awaiter(this, void 0, void 0, function* () {
            try {
                yield promises_1.default.mkdir(this.tempDir, { recursive: true });
            }
            catch (error) {
                this.logger.warn(`Could not create temp directory ${this.tempDir}: ${error}`);
            }
        });
    }
}
|
|
192
|
+
exports.MarkerPdfReader = MarkerPdfReader;
|
|
193
|
+
//# sourceMappingURL=MarkerPdfReader.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"MarkerPdfReader.js","sourceRoot":"","sources":["../../../../src/core/processor/readers/MarkerPdfReader.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;AAAA,6CAA0D;AAC1D,gDAAwB;AACxB,2DAA6B;AAC7B,iDAAoD;AAyBpD;;;;;;;;;GASG;AACH,MAAa,eAAgB,SAAQ,uBAAU;IAC7C,YACmB,IAAmB,EACnB,GAAgC,EAChC,QAAoB,EACpB,OAAe,EAChC,OAAoB,EACpB,MAAc;QAEd,KAAK,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;QAPhB,SAAI,GAAJ,IAAI,CAAe;QACnB,QAAG,GAAH,GAAG,CAA6B;QAChC,aAAQ,GAAR,QAAQ,CAAY;QACpB,YAAO,GAAP,OAAO,CAAQ;QAKhC,IAAI,CAAC,aAAa,EAAE,CAAC;IACvB,CAAC;IAED,OAAO;QACL,OAAO,iBAAiB,CAAC;IAC3B,CAAC;IAED,SAAS;QACP,OAAO,YAAY,CAAC;IACtB,CAAC;IAEK,IAAI,CAAC,QAAgB;;YACzB,MAAM,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC;YAClC,IAAI,CAAC;gBACH,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;gBAC7B,MAAM,EAAE,QAAQ,EAAE,MAAM,EAAE,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC;gBAC7D,MAAM,OAAO,GAAG,QAAQ,CAAC,IAAI,EAAE,CAAC;gBAChC,IAAI,CAAC,OAAO;oBAAE,MAAM,IAAI,KAAK,CAAC,gCAAgC,CAAC,CAAC;gBAChE,MAAM,KAAK,GAAG,MAAM,kBAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;gBACtC,OAAO;oBACL,MAAM,EAAE;wBACN,EAAE,OAAO,EAAE,WAAW,EAAE,CAAC,EAAE,SAAS,EAAE,OAAO,CAAC,MAAM,EAAE,KAAK,EAAE,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE;qBACjF;oBACD,QAAQ,EAAE;wBACR,IAAI,EAAE,KAAK;wBACX,QAAQ,EAAE,QAAQ;wBAClB,QAAQ,EAAE,KAAK,CAAC,IAAI;wBACpB,SAAS,EAAE,QAAQ;wBACnB,YAAY,EAAE,IAAI,CAAC,IAAI,CAAC,MAAM;wBAC9B,YAAY,EAAE,MAAM;wBACpB,aAAa,EAAE,OAAO,CAAC,MAAM;wBAC7B,gBAAgB,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;wBACxC,MAAM,EAAE,SAAS;qBAClB;iBACF,CAAC;YACJ,CAAC;YAAC,OAAO,KAAU,EAAE,CAAC;gBACpB,IAAI,CAAC,MAAM,CAAC,IAAI,CACd,gCAAgC,QAAQ,KAAK,KAAK,CAAC,OAAO,6BAA6B,CACxF,CAAC;gBACF,OAAO,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YACtC,CAAC;QACH,CAAC;KAAA;IAED,oFAAoF;IACtE,UAAU,CAAC,QAAgB;;YACvC,MAAM,OAAO,GAAG,GAAG,QAAQ,YAAY,CAAC;YACxC,IAAI,MAAM,IAAI,CAAC,cAAc,CAAC,OAAO,EAAE,QAAQ,CAAC,EAAE,CAAC;gBACjD,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,2BAA2B,OAAO,EAAE,CAAC,CAAC;gBACxD,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAE,CAAC,QAAQ,CAAC,OAAO,EAAE,OAAO,CAAC,EAAE,MAAM,EAAE,IAAI,EAAE,CAAC;YACzE,CAAC;YAED,MAAM,MAAM
,GAAG,cAAI,CAAC,OAAO,CACzB,cAAI,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,UAAU,IAAI,CAAC,GAAG,EAAE,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,CAC1F,CAAC;YACF,MAAM,kBAAE,CAAC,KAAK,CAAC,MAAM,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YAC5C,IAAI,CAAC;gBACH,MAAM,IAAI,GAAG;oBACX,cAAI,CAAC,OAAO,CAAC,QAAQ,CAAC;oBACtB,cAAc,EAAE,MAAM;oBACtB,iBAAiB,EAAE,UAAU;iBAC9B,CAAC;gBACF,IAAI,IAAI,CAAC,IAAI,CAAC,QAAQ;oBAAE,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;gBACjD,IAAI,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;oBACrB,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,eAAe,EAAE,sCAAsC,CAAC,CAAC;gBAClF,CAAC;gBAED,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,WAAW,IAAI,CAAC,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;gBACnE,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;gBAClE,IAAI,MAAM,CAAC,IAAI,KAAK,CAAC,EAAE,CAAC;oBACtB,MAAM,IAAI,KAAK,CAAC,wBAAwB,MAAM,CAAC,IAAI,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBACxH,CAAC;gBAED,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;gBACjD,IAAI,CAAC,QAAQ;oBAAE,MAAM,IAAI,KAAK,CAAC,gCAAgC,MAAM,EAAE,CAAC,CAAC;gBACzE,MAAM,QAAQ,GAAG,MAAM,kBAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;gBACtD,MAAM,kBAAE,CAAC,SAAS,CAAC,OAAO,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC;gBAC/C,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC;YACrC,CAAC;oBAAS,CAAC;gBACT,MAAM,kBAAE,CAAC,EAAE,CAAC,MAAM,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,SAAS,CAAC,CAAC;YAC/E,CAAC;QACH,CAAC;KAAA;IAED,8EAA8E;IAChE,YAAY,CAAC,GAAW;;YACpC,MAAM,OAAO,GAAG,MAAM,kBAAE,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC;YAC/E,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;gBACxB,MAAM,IAAI,GAAG,cAAI,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC;gBACpC,IAAI,CAAC,CAAC,WAAW,EAAE,EAAE,CAAC;oBACpB,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;oBAC7C,IAAI,MAAM;wBAAE,OAAO,MAAM,CAAC
;gBAC5B,CAAC;qBAAM,IAAI,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;oBAChD,OAAO,IAAI,CAAC;gBACd,CAAC;YACH,CAAC;YACD,OAAO,SAAS,CAAC;QACnB,CAAC;KAAA;IAEa,cAAc,CAAC,OAAe,EAAE,QAAgB;;YAC5D,IAAI,CAAC;gBACH,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,CAAC,kBAAE,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,kBAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;gBACxE,OAAO,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC;YAChC,CAAC;YAAC,WAAM,CAAC;gBACP,OAAO,KAAK,CAAC;YACf,CAAC;QACH,CAAC;KAAA;IAED;2EACuE;IACzD,cAAc,CAAC,OAAe,EAAE,IAAc;;YAC1D,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;;gBACrC,MAAM,GAAG,mCAA2B,OAAO,CAAC,GAAG,KAAE,gBAAgB,EAAE,GAAG,GAAE,CAAC;gBACzE,IAAI,IAAI,CAAC,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,GAAG,EAAE,CAAC;oBACjC,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM;wBAAE,GAAG,CAAC,cAAc,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC;oBAC1D,IAAI,IAAI,CAAC,GAAG,CAAC,IAAI;wBAAE,GAAG,CAAC,eAAe,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC;oBACvD,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK;wBAAE,GAAG,CAAC,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC;gBACxD,CAAC;gBACD,MAAM,OAAO,GAAiB,EAAE,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC;gBACzE,MAAM,KAAK,GAAG,IAAA,qBAAK,EAAC,OAAO,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC;gBAC5C,IAAI,MAAM,GAAG,EAAE,CAAC;gBAChB,IAAI,MAAM,GAAG,EAAE,CAAC;gBAChB,MAAA,KAAK,CAAC,MAAM,0CAAE,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC;gBAC1D,MAAA,KAAK,CAAC,MAAM,0CAAE,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC;gBAC1D,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE;oBAC5B,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;oBACtB,MAAM,CAAC,IAAI,KAAK,CAAC,0BAA0B,IAAI,CAAC,IAAI,CAAC,SAAS,IAAI,CAAC,CAAC,CAAC;gBACvE,CAAC,EAAE,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;gBACxB,KAAK,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE;oBAC1B,YAAY,CAAC,KAAK,CAAC,CAAC;oBACpB,MAAM,CAAC,IAAI,KAAK,CAAC,4BAA4B,OAAO,MAAM,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;gBAC9E,CAAC,CAAC,CAAC;gBACH,KAAK,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;oBACzB,YAAY,CAAC,KAAK
,CAAC,CAAC;oBACpB,OAAO,CAAC,EAAE,IAAI,EAAE,IAAI,aAAJ,IAAI,cAAJ,IAAI,GAAI,CAAC,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;gBAC/C,CAAC,CAAC,CAAC;YACL,CAAC,CAAC,CAAC;QACL,CAAC;KAAA;IAEa,aAAa;;YACzB,IAAI,CAAC;gBACH,MAAM,kBAAE,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YACpD,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,mCAAmC,IAAI,CAAC,OAAO,KAAK,KAAK,EAAE,CAAC,CAAC;YAChF,CAAC;QACH,CAAC;KAAA;CACF;AA1JD,0CA0JC"}
|