@wanshi-kg/wanshi 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +458 -0
- package/dist/__tests__/helpers.js +27 -0
- package/dist/__tests__/helpers.js.map +1 -0
- package/dist/cli/commands/export.command.js +99 -0
- package/dist/cli/commands/export.command.js.map +1 -0
- package/dist/cli/commands/index.js +22 -0
- package/dist/cli/commands/index.js.map +1 -0
- package/dist/cli/commands/inspectMerges.command.js +84 -0
- package/dist/cli/commands/inspectMerges.command.js.map +1 -0
- package/dist/cli/commands/metrics.command.js +196 -0
- package/dist/cli/commands/metrics.command.js.map +1 -0
- package/dist/cli/commands/process.command.js +82 -0
- package/dist/cli/commands/process.command.js.map +1 -0
- package/dist/cli/commands/watch.command.js +91 -0
- package/dist/cli/commands/watch.command.js.map +1 -0
- package/dist/cli/index.js +269 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/optionsToConfig.js +160 -0
- package/dist/cli/optionsToConfig.js.map +1 -0
- package/dist/config/index.js +59 -0
- package/dist/config/index.js.map +1 -0
- package/dist/config/legacyHints.js +113 -0
- package/dist/config/legacyHints.js.map +1 -0
- package/dist/config/schema.js +803 -0
- package/dist/config/schema.js.map +1 -0
- package/dist/config/ui.js +221 -0
- package/dist/config/ui.js.map +1 -0
- package/dist/core/DirectoryProcessor.js +725 -0
- package/dist/core/DirectoryProcessor.js.map +1 -0
- package/dist/core/adapters/IStructuredAdapter.js +3 -0
- package/dist/core/adapters/IStructuredAdapter.js.map +1 -0
- package/dist/core/adapters/SqliteAdapter.js +267 -0
- package/dist/core/adapters/SqliteAdapter.js.map +1 -0
- package/dist/core/adapters/StructuredAdapterRegistry.js +31 -0
- package/dist/core/adapters/StructuredAdapterRegistry.js.map +1 -0
- package/dist/core/adapters/index.js +20 -0
- package/dist/core/adapters/index.js.map +1 -0
- package/dist/core/checkpoint/CheckpointService.js +188 -0
- package/dist/core/checkpoint/CheckpointService.js.map +1 -0
- package/dist/core/checkpoint/index.js +18 -0
- package/dist/core/checkpoint/index.js.map +1 -0
- package/dist/core/corpus/CorpusAnalyzer.js +266 -0
- package/dist/core/corpus/CorpusAnalyzer.js.map +1 -0
- package/dist/core/corpus/CorpusProfileStore.js +92 -0
- package/dist/core/corpus/CorpusProfileStore.js.map +1 -0
- package/dist/core/corpus/index.js +21 -0
- package/dist/core/corpus/index.js.map +1 -0
- package/dist/core/corpus/normalizeGlossary.js +60 -0
- package/dist/core/corpus/normalizeGlossary.js.map +1 -0
- package/dist/core/corpus/relPath.js +52 -0
- package/dist/core/corpus/relPath.js.map +1 -0
- package/dist/core/corpus/termFrequency.js +86 -0
- package/dist/core/corpus/termFrequency.js.map +1 -0
- package/dist/core/cost/CostMeter.js +235 -0
- package/dist/core/cost/CostMeter.js.map +1 -0
- package/dist/core/cost/index.js +19 -0
- package/dist/core/cost/index.js.map +1 -0
- package/dist/core/cost/prices.js +38 -0
- package/dist/core/cost/prices.js.map +1 -0
- package/dist/core/cv/ObjectDetectionService.js +119 -0
- package/dist/core/cv/ObjectDetectionService.js.map +1 -0
- package/dist/core/di/ContainerFactory.js +670 -0
- package/dist/core/di/ContainerFactory.js.map +1 -0
- package/dist/core/di/DIContainer.js +103 -0
- package/dist/core/di/DIContainer.js.map +1 -0
- package/dist/core/di/index.js +19 -0
- package/dist/core/di/index.js.map +1 -0
- package/dist/core/errors/CustomErrors.js +342 -0
- package/dist/core/errors/CustomErrors.js.map +1 -0
- package/dist/core/errors/index.js +18 -0
- package/dist/core/errors/index.js.map +1 -0
- package/dist/core/export/KnowledgeGraphExportService.js +56 -0
- package/dist/core/export/KnowledgeGraphExportService.js.map +1 -0
- package/dist/core/export/index.js +19 -0
- package/dist/core/export/index.js.map +1 -0
- package/dist/core/export/strategies/GraphitiExportStrategy.js +115 -0
- package/dist/core/export/strategies/GraphitiExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/GraphvizDotExportStrategy.js +331 -0
- package/dist/core/export/strategies/GraphvizDotExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/IExportStrategy.js +3 -0
- package/dist/core/export/strategies/IExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/JsonExportStrategy.js +19 -0
- package/dist/core/export/strategies/JsonExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/JsonlExportStrategy.js +69 -0
- package/dist/core/export/strategies/JsonlExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/KblamExportStrategy.js +36 -0
- package/dist/core/export/strategies/KblamExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/LoraExportStrategy.js +46 -0
- package/dist/core/export/strategies/LoraExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/McpExportStrategy.js +67 -0
- package/dist/core/export/strategies/McpExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/index.js +25 -0
- package/dist/core/export/strategies/index.js.map +1 -0
- package/dist/core/export/strategies/kbTriples.js +60 -0
- package/dist/core/export/strategies/kbTriples.js.map +1 -0
- package/dist/core/index.js +22 -0
- package/dist/core/index.js.map +1 -0
- package/dist/core/knowledge/KnowledgeGraphBuilder.js +627 -0
- package/dist/core/knowledge/KnowledgeGraphBuilder.js.map +1 -0
- package/dist/core/knowledge/MergeRecord.js +3 -0
- package/dist/core/knowledge/MergeRecord.js.map +1 -0
- package/dist/core/knowledge/canon/Canonicalizer.js +414 -0
- package/dist/core/knowledge/canon/Canonicalizer.js.map +1 -0
- package/dist/core/knowledge/canon/index.js +18 -0
- package/dist/core/knowledge/canon/index.js.map +1 -0
- package/dist/core/knowledge/contradiction/HeuristicContradictionChecker.js +92 -0
- package/dist/core/knowledge/contradiction/HeuristicContradictionChecker.js.map +1 -0
- package/dist/core/knowledge/contradiction/LlmContradictionChecker.js +52 -0
- package/dist/core/knowledge/contradiction/LlmContradictionChecker.js.map +1 -0
- package/dist/core/knowledge/contradiction/index.js +19 -0
- package/dist/core/knowledge/contradiction/index.js.map +1 -0
- package/dist/core/knowledge/grounding/KeywordGroundingChecker.js +33 -0
- package/dist/core/knowledge/grounding/KeywordGroundingChecker.js.map +1 -0
- package/dist/core/knowledge/grounding/MiniCheckGroundingChecker.js +82 -0
- package/dist/core/knowledge/grounding/MiniCheckGroundingChecker.js.map +1 -0
- package/dist/core/knowledge/grounding/index.js +20 -0
- package/dist/core/knowledge/grounding/index.js.map +1 -0
- package/dist/core/knowledge/grounding/verbalize.js +38 -0
- package/dist/core/knowledge/grounding/verbalize.js.map +1 -0
- package/dist/core/knowledge/images/imageMetaGraph.js +136 -0
- package/dist/core/knowledge/images/imageMetaGraph.js.map +1 -0
- package/dist/core/knowledge/index.js +20 -0
- package/dist/core/knowledge/index.js.map +1 -0
- package/dist/core/knowledge/merging/KnowledgeMerger.js +624 -0
- package/dist/core/knowledge/merging/KnowledgeMerger.js.map +1 -0
- package/dist/core/knowledge/references/ReferenceResolver.js +184 -0
- package/dist/core/knowledge/references/ReferenceResolver.js.map +1 -0
- package/dist/core/knowledge/references/citations/CitationEvidenceProcessor.js +401 -0
- package/dist/core/knowledge/references/citations/CitationEvidenceProcessor.js.map +1 -0
- package/dist/core/knowledge/references/citations/CitationResolver.js +95 -0
- package/dist/core/knowledge/references/citations/CitationResolver.js.map +1 -0
- package/dist/core/knowledge/references/citations/GrobidClient.js +143 -0
- package/dist/core/knowledge/references/citations/GrobidClient.js.map +1 -0
- package/dist/core/knowledge/references/citations/TitleIdResolver.js +101 -0
- package/dist/core/knowledge/references/citations/TitleIdResolver.js.map +1 -0
- package/dist/core/knowledge/references/web/FetchCacheService.js +114 -0
- package/dist/core/knowledge/references/web/FetchCacheService.js.map +1 -0
- package/dist/core/knowledge/references/web/GatedFetcher.js +228 -0
- package/dist/core/knowledge/references/web/GatedFetcher.js.map +1 -0
- package/dist/core/knowledge/references/web/WebReferenceProcessor.js +164 -0
- package/dist/core/knowledge/references/web/WebReferenceProcessor.js.map +1 -0
- package/dist/core/knowledge/search/KnowledgeGraphSearch.js +261 -0
- package/dist/core/knowledge/search/KnowledgeGraphSearch.js.map +1 -0
- package/dist/core/knowledge/vocabulary.js +162 -0
- package/dist/core/knowledge/vocabulary.js.map +1 -0
- package/dist/core/llm/EmbeddingService.js +113 -0
- package/dist/core/llm/EmbeddingService.js.map +1 -0
- package/dist/core/llm/OllamaService.js +146 -0
- package/dist/core/llm/OllamaService.js.map +1 -0
- package/dist/core/llm/OpenAICompatibleService.js +190 -0
- package/dist/core/llm/OpenAICompatibleService.js.map +1 -0
- package/dist/core/llm/OpenAIEmbeddingService.js +129 -0
- package/dist/core/llm/OpenAIEmbeddingService.js.map +1 -0
- package/dist/core/llm/embeddingUtils.js +25 -0
- package/dist/core/llm/embeddingUtils.js.map +1 -0
- package/dist/core/llm/index.js +23 -0
- package/dist/core/llm/index.js.map +1 -0
- package/dist/core/llm/prompts/PromptManager.js +388 -0
- package/dist/core/llm/prompts/PromptManager.js.map +1 -0
- package/dist/core/llm/prompts/PromptTemplateEngine.js +257 -0
- package/dist/core/llm/prompts/PromptTemplateEngine.js.map +1 -0
- package/dist/core/llm/prompts/templates/partials/examples/EXAMPLE_STYLE_GUIDE.md +84 -0
- package/dist/core/llm/prompts/templates/partials/examples/article.md +187 -0
- package/dist/core/llm/prompts/templates/partials/examples/code.md +229 -0
- package/dist/core/llm/prompts/templates/partials/examples/communication.md +205 -0
- package/dist/core/llm/prompts/templates/partials/examples/documentation.md +262 -0
- package/dist/core/llm/prompts/templates/partials/examples/financial.md +157 -0
- package/dist/core/llm/prompts/templates/partials/examples/legal.md +153 -0
- package/dist/core/llm/prompts/templates/partials/examples/logs.md +127 -0
- package/dist/core/llm/prompts/templates/partials/examples/medical.md +218 -0
- package/dist/core/llm/prompts/templates/partials/examples/notes.md +201 -0
- package/dist/core/llm/prompts/templates/partials/examples/research.md +208 -0
- package/dist/core/llm/prompts/templates/partials/examples/tabular.md +178 -0
- package/dist/core/llm/prompts/templates/partials/examples/transcript.md +204 -0
- package/dist/core/llm/prompts/templates/partials/retrieved-context.hbs +18 -0
- package/dist/core/llm/prompts/templates/v1/system.hbs +371 -0
- package/dist/core/llm/prompts/templates/v1/user.hbs +20 -0
- package/dist/core/llm/prompts/templates/v2/system.hbs +573 -0
- package/dist/core/llm/prompts/templates/v2/user.hbs +20 -0
- package/dist/core/llm/prompts/templates/v3/system.hbs +861 -0
- package/dist/core/llm/prompts/templates/v3/user.hbs +16 -0
- package/dist/core/llm/prompts/templates/v4/system.hbs +800 -0
- package/dist/core/llm/prompts/templates/v4/user.hbs +40 -0
- package/dist/core/llm/prompts/templates/v4.5/system.hbs +71 -0
- package/dist/core/llm/prompts/templates/v4.5/user.hbs +46 -0
- package/dist/core/llm/prompts/templates/v5/glossary/system.hbs +40 -0
- package/dist/core/llm/prompts/templates/v5/glossary/user.hbs +11 -0
- package/dist/core/llm/prompts/templates/v5/system.hbs +163 -0
- package/dist/core/llm/prompts/templates/v5/user.hbs +55 -0
- package/dist/core/pipeline/GroundingTransform.js +52 -0
- package/dist/core/pipeline/GroundingTransform.js.map +1 -0
- package/dist/core/pipeline/PipelineRunner.js +51 -0
- package/dist/core/pipeline/PipelineRunner.js.map +1 -0
- package/dist/core/pipeline/RelationFilterTransform.js +72 -0
- package/dist/core/pipeline/RelationFilterTransform.js.map +1 -0
- package/dist/core/pipeline/index.js +20 -0
- package/dist/core/pipeline/index.js.map +1 -0
- package/dist/core/processor/FileProcessor.js +184 -0
- package/dist/core/processor/FileProcessor.js.map +1 -0
- package/dist/core/processor/ProcessedRegistry.js +38 -0
- package/dist/core/processor/ProcessedRegistry.js.map +1 -0
- package/dist/core/processor/ast/AstSeedService.js +0 -0
- package/dist/core/processor/ast/AstSeedService.js.map +1 -0
- package/dist/core/processor/ast/AstSymbolStore.js +110 -0
- package/dist/core/processor/ast/AstSymbolStore.js.map +1 -0
- package/dist/core/processor/ast/index.js +19 -0
- package/dist/core/processor/ast/index.js.map +1 -0
- package/dist/core/processor/chunking/TextChunker.js +98 -0
- package/dist/core/processor/chunking/TextChunker.js.map +1 -0
- package/dist/core/processor/chunking/index.js +18 -0
- package/dist/core/processor/chunking/index.js.map +1 -0
- package/dist/core/processor/classifier/CONTENT_CLASSES.js +294 -0
- package/dist/core/processor/classifier/CONTENT_CLASSES.js.map +1 -0
- package/dist/core/processor/classifier/CascadeContentClassifier.js +107 -0
- package/dist/core/processor/classifier/CascadeContentClassifier.js.map +1 -0
- package/dist/core/processor/classifier/HeuristicContentClassifier.js +113 -0
- package/dist/core/processor/classifier/HeuristicContentClassifier.js.map +1 -0
- package/dist/core/processor/classifier/IContentTypeClassifier.js +3 -0
- package/dist/core/processor/classifier/IContentTypeClassifier.js.map +1 -0
- package/dist/core/processor/classifier/LlmContentClassifier.js +107 -0
- package/dist/core/processor/classifier/LlmContentClassifier.js.map +1 -0
- package/dist/core/processor/classifier/NER_DOMAIN_EXAMPLES.js +498 -0
- package/dist/core/processor/classifier/NER_DOMAIN_EXAMPLES.js.map +1 -0
- package/dist/core/processor/classifier/index.js +21 -0
- package/dist/core/processor/classifier/index.js.map +1 -0
- package/dist/core/processor/classifier/mergeClassifications.js +32 -0
- package/dist/core/processor/classifier/mergeClassifications.js.map +1 -0
- package/dist/core/processor/index.js +20 -0
- package/dist/core/processor/index.js.map +1 -0
- package/dist/core/processor/readers/AudioReader.js +462 -0
- package/dist/core/processor/readers/AudioReader.js.map +1 -0
- package/dist/core/processor/readers/BinaryReader.js +90 -0
- package/dist/core/processor/readers/BinaryReader.js.map +1 -0
- package/dist/core/processor/readers/ChandraPdfReader.js +187 -0
- package/dist/core/processor/readers/ChandraPdfReader.js.map +1 -0
- package/dist/core/processor/readers/ChatExportReader.js +365 -0
- package/dist/core/processor/readers/ChatExportReader.js.map +1 -0
- package/dist/core/processor/readers/DoclingReader.js +445 -0
- package/dist/core/processor/readers/DoclingReader.js.map +1 -0
- package/dist/core/processor/readers/EmailReader.js +259 -0
- package/dist/core/processor/readers/EmailReader.js.map +1 -0
- package/dist/core/processor/readers/EpubReader.js +175 -0
- package/dist/core/processor/readers/EpubReader.js.map +1 -0
- package/dist/core/processor/readers/FileReader.js +90 -0
- package/dist/core/processor/readers/FileReader.js.map +1 -0
- package/dist/core/processor/readers/FileReaderFactory.js +49 -0
- package/dist/core/processor/readers/FileReaderFactory.js.map +1 -0
- package/dist/core/processor/readers/HtmlReader.js +371 -0
- package/dist/core/processor/readers/HtmlReader.js.map +1 -0
- package/dist/core/processor/readers/ImageReader.js +162 -0
- package/dist/core/processor/readers/ImageReader.js.map +1 -0
- package/dist/core/processor/readers/JsonFileReader.js +232 -0
- package/dist/core/processor/readers/JsonFileReader.js.map +1 -0
- package/dist/core/processor/readers/JupyterReader.js +178 -0
- package/dist/core/processor/readers/JupyterReader.js.map +1 -0
- package/dist/core/processor/readers/LatexReader.js +176 -0
- package/dist/core/processor/readers/LatexReader.js.map +1 -0
- package/dist/core/processor/readers/MarkdownReader.js +289 -0
- package/dist/core/processor/readers/MarkdownReader.js.map +1 -0
- package/dist/core/processor/readers/MarkerPdfReader.js +193 -0
- package/dist/core/processor/readers/MarkerPdfReader.js.map +1 -0
- package/dist/core/processor/readers/MistralOcrReader.js +198 -0
- package/dist/core/processor/readers/MistralOcrReader.js.map +1 -0
- package/dist/core/processor/readers/OfficeReader.js +174 -0
- package/dist/core/processor/readers/OfficeReader.js.map +1 -0
- package/dist/core/processor/readers/PdfReader.js +116 -0
- package/dist/core/processor/readers/PdfReader.js.map +1 -0
- package/dist/core/processor/readers/RtfReader.js +107 -0
- package/dist/core/processor/readers/RtfReader.js.map +1 -0
- package/dist/core/processor/readers/SubtitleReader.js +145 -0
- package/dist/core/processor/readers/SubtitleReader.js.map +1 -0
- package/dist/core/processor/readers/TesseractPdfReader.js +183 -0
- package/dist/core/processor/readers/TesseractPdfReader.js.map +1 -0
- package/dist/core/processor/readers/TextReader.js +129 -0
- package/dist/core/processor/readers/TextReader.js.map +1 -0
- package/dist/core/processor/readers/TranscriptReader.js +234 -0
- package/dist/core/processor/readers/TranscriptReader.js.map +1 -0
- package/dist/core/processor/readers/image/imageMetadata.js +155 -0
- package/dist/core/processor/readers/image/imageMetadata.js.map +1 -0
- package/dist/core/processor/readers/index.js +41 -0
- package/dist/core/processor/readers/index.js.map +1 -0
- package/dist/core/processor/readers/referenceExtraction.js +198 -0
- package/dist/core/processor/readers/referenceExtraction.js.map +1 -0
- package/dist/core/processor/readers/stripReferences.js +59 -0
- package/dist/core/processor/readers/stripReferences.js.map +1 -0
- package/dist/core/processor/readers/transcript/turnPacking.js +81 -0
- package/dist/core/processor/readers/transcript/turnPacking.js.map +1 -0
- package/dist/core/progress/NdjsonProgressEmitter.js +30 -0
- package/dist/core/progress/NdjsonProgressEmitter.js.map +1 -0
- package/dist/core/progress/NoopProgressEmitter.js +15 -0
- package/dist/core/progress/NoopProgressEmitter.js.map +1 -0
- package/dist/core/progress/index.js +19 -0
- package/dist/core/progress/index.js.map +1 -0
- package/dist/core/trace/TraceWriter.js +100 -0
- package/dist/core/trace/TraceWriter.js.map +1 -0
- package/dist/core/trace/events.js +13 -0
- package/dist/core/trace/events.js.map +1 -0
- package/dist/core/trace/index.js +20 -0
- package/dist/core/trace/index.js.map +1 -0
- package/dist/core/trace/lineage.js +97 -0
- package/dist/core/trace/lineage.js.map +1 -0
- package/dist/evaluation/BenchmarkRunner.js +171 -0
- package/dist/evaluation/BenchmarkRunner.js.map +1 -0
- package/dist/evaluation/classifier/ClassifierAccuracy.js +185 -0
- package/dist/evaluation/classifier/ClassifierAccuracy.js.map +1 -0
- package/dist/evaluation/classifier/labeledSamples.js +379 -0
- package/dist/evaluation/classifier/labeledSamples.js.map +1 -0
- package/dist/evaluation/compare/goldCompare.js +126 -0
- package/dist/evaluation/compare/goldCompare.js.map +1 -0
- package/dist/evaluation/crossre/compareScoring.js +30 -0
- package/dist/evaluation/crossre/compareScoring.js.map +1 -0
- package/dist/evaluation/datasets/CrossREDataset.js +170 -0
- package/dist/evaluation/datasets/CrossREDataset.js.map +1 -0
- package/dist/evaluation/datasets/IDataset.js +3 -0
- package/dist/evaluation/datasets/IDataset.js.map +1 -0
- package/dist/evaluation/datasets/RebelDataset.js +117 -0
- package/dist/evaluation/datasets/RebelDataset.js.map +1 -0
- package/dist/evaluation/datasets/RedocredDataset.js +218 -0
- package/dist/evaluation/datasets/RedocredDataset.js.map +1 -0
- package/dist/evaluation/datasets/SemEval2010Dataset.js +150 -0
- package/dist/evaluation/datasets/SemEval2010Dataset.js.map +1 -0
- package/dist/evaluation/index.js +33 -0
- package/dist/evaluation/index.js.map +1 -0
- package/dist/evaluation/matching/ExactMatcher.js +75 -0
- package/dist/evaluation/matching/ExactMatcher.js.map +1 -0
- package/dist/evaluation/matching/SemanticMatcher.js +143 -0
- package/dist/evaluation/matching/SemanticMatcher.js.map +1 -0
- package/dist/evaluation/metrics/TripleMetrics.js +64 -0
- package/dist/evaluation/metrics/TripleMetrics.js.map +1 -0
- package/dist/evaluation/mine/MineCheckpoint.js +114 -0
- package/dist/evaluation/mine/MineCheckpoint.js.map +1 -0
- package/dist/evaluation/mine/MineDataset.js +208 -0
- package/dist/evaluation/mine/MineDataset.js.map +1 -0
- package/dist/evaluation/mine/MineReporter.js +98 -0
- package/dist/evaluation/mine/MineReporter.js.map +1 -0
- package/dist/evaluation/mine/MineRunner.js +148 -0
- package/dist/evaluation/mine/MineRunner.js.map +1 -0
- package/dist/evaluation/mine/MineScorer.js +127 -0
- package/dist/evaluation/mine/MineScorer.js.map +1 -0
- package/dist/evaluation/mine/types.js +12 -0
- package/dist/evaluation/mine/types.js.map +1 -0
- package/dist/evaluation/reporters/ConsoleReporter.js +55 -0
- package/dist/evaluation/reporters/ConsoleReporter.js.map +1 -0
- package/dist/evaluation/reporters/JsonReporter.js +50 -0
- package/dist/evaluation/reporters/JsonReporter.js.map +1 -0
- package/dist/index.js +28 -0
- package/dist/index.js.map +1 -0
- package/dist/quality/CompositeScore.js +61 -0
- package/dist/quality/CompositeScore.js.map +1 -0
- package/dist/quality/ConsistencyMetrics.js +70 -0
- package/dist/quality/ConsistencyMetrics.js.map +1 -0
- package/dist/quality/FactualMetrics.js +76 -0
- package/dist/quality/FactualMetrics.js.map +1 -0
- package/dist/quality/GraphHealthMetrics.js +68 -0
- package/dist/quality/GraphHealthMetrics.js.map +1 -0
- package/dist/quality/SemanticMetrics.js +102 -0
- package/dist/quality/SemanticMetrics.js.map +1 -0
- package/dist/quality/StructuralMetrics.js +60 -0
- package/dist/quality/StructuralMetrics.js.map +1 -0
- package/dist/quality/index.js +23 -0
- package/dist/quality/index.js.map +1 -0
- package/dist/shared/index.js +20 -0
- package/dist/shared/index.js.map +1 -0
- package/dist/shared/logger/Logger.js +3 -0
- package/dist/shared/logger/Logger.js.map +1 -0
- package/dist/shared/logger/LoggerFactory.js +75 -0
- package/dist/shared/logger/LoggerFactory.js.map +1 -0
- package/dist/shared/logger/index.js +19 -0
- package/dist/shared/logger/index.js.map +1 -0
- package/dist/shared/shutdown.js +30 -0
- package/dist/shared/shutdown.js.map +1 -0
- package/dist/shared/utils/agglomerativeCluster.js +269 -0
- package/dist/shared/utils/agglomerativeCluster.js.map +1 -0
- package/dist/shared/utils/astSymbols.js +69 -0
- package/dist/shared/utils/astSymbols.js.map +1 -0
- package/dist/shared/utils/cosineSimilarity.js +18 -0
- package/dist/shared/utils/cosineSimilarity.js.map +1 -0
- package/dist/shared/utils/directoryTree.js +184 -0
- package/dist/shared/utils/directoryTree.js.map +1 -0
- package/dist/shared/utils/documentOutline.js +74 -0
- package/dist/shared/utils/documentOutline.js.map +1 -0
- package/dist/shared/utils/index.js +24 -0
- package/dist/shared/utils/index.js.map +1 -0
- package/dist/shared/utils/jaroWinklerSimilarity.js +60 -0
- package/dist/shared/utils/jaroWinklerSimilarity.js.map +1 -0
- package/dist/shared/utils/parseJsonLenient.js +27 -0
- package/dist/shared/utils/parseJsonLenient.js.map +1 -0
- package/dist/shared/utils/readConfig.js +42 -0
- package/dist/shared/utils/readConfig.js.map +1 -0
- package/dist/shared/utils/readRtf.js +216 -0
- package/dist/shared/utils/readRtf.js.map +1 -0
- package/dist/shared/utils/softmax.js +26 -0
- package/dist/shared/utils/softmax.js.map +1 -0
- package/dist/types/ContentClass.js +3 -0
- package/dist/types/ContentClass.js.map +1 -0
- package/dist/types/CorpusProfile.js +3 -0
- package/dist/types/CorpusProfile.js.map +1 -0
- package/dist/types/IContradictionChecker.js +3 -0
- package/dist/types/IContradictionChecker.js.map +1 -0
- package/dist/types/ICorpusAnalyzer.js +3 -0
- package/dist/types/ICorpusAnalyzer.js.map +1 -0
- package/dist/types/IDirectoryProcessor.js +3 -0
- package/dist/types/IDirectoryProcessor.js.map +1 -0
- package/dist/types/IEmbeddingProvider.js +3 -0
- package/dist/types/IEmbeddingProvider.js.map +1 -0
- package/dist/types/IEmbeddingService.js +6 -0
- package/dist/types/IEmbeddingService.js.map +1 -0
- package/dist/types/IFileProcessor.js +3 -0
- package/dist/types/IFileProcessor.js.map +1 -0
- package/dist/types/IGroundingChecker.js +3 -0
- package/dist/types/IGroundingChecker.js.map +1 -0
- package/dist/types/IKnowledgeGraphBuilder.js +3 -0
- package/dist/types/IKnowledgeGraphBuilder.js.map +1 -0
- package/dist/types/IKnowledgeGraphExporter.js +3 -0
- package/dist/types/IKnowledgeGraphExporter.js.map +1 -0
- package/dist/types/IKnowledgeGraphMerger.js +3 -0
- package/dist/types/IKnowledgeGraphMerger.js.map +1 -0
- package/dist/types/IKnowledgeGraphSearch.js +3 -0
- package/dist/types/IKnowledgeGraphSearch.js.map +1 -0
- package/dist/types/ILLMProvider.js +3 -0
- package/dist/types/ILLMProvider.js.map +1 -0
- package/dist/types/ILLMService.js +3 -0
- package/dist/types/ILLMService.js.map +1 -0
- package/dist/types/IObjectDetector.js +3 -0
- package/dist/types/IObjectDetector.js.map +1 -0
- package/dist/types/IProcessingService.js +3 -0
- package/dist/types/IProcessingService.js.map +1 -0
- package/dist/types/IProgressEmitter.js +3 -0
- package/dist/types/IProgressEmitter.js.map +1 -0
- package/dist/types/IPromptManager.js +3 -0
- package/dist/types/IPromptManager.js.map +1 -0
- package/dist/types/KnowledgeGraph.js +3 -0
- package/dist/types/KnowledgeGraph.js.map +1 -0
- package/dist/types/MCPKnowledgeGraph.js +3 -0
- package/dist/types/MCPKnowledgeGraph.js.map +1 -0
- package/dist/types/Observation.js +21 -0
- package/dist/types/Observation.js.map +1 -0
- package/dist/types/ProcessingOptions.js +3 -0
- package/dist/types/ProcessingOptions.js.map +1 -0
- package/dist/types/index.js +40 -0
- package/dist/types/index.js.map +1 -0
- package/package.json +122 -0
|
@@ -0,0 +1,725 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
 * Compiler-emitted helper: exposes property `k` of module `m` on object `o`
 * under the name `k2` (which defaults to `k`).
 *
 * When `Object.create` exists the property is defined through a descriptor:
 * the source descriptor is reused if it is an accessor on an `__esModule`
 * module or a data property that is neither writable nor configurable;
 * otherwise an enumerable getter that reads `m[k]` on every access is
 * installed. Without `Object.create` the current value is simply copied.
 */
var __createBinding = (this && this.__createBinding) || (function () {
    // Descriptor-based variant.
    function bindWithDescriptor(o, m, k, k2) {
        var exportName = k2 === undefined ? k : k2;
        var descriptor = Object.getOwnPropertyDescriptor(m, k);
        var needsGetter;
        if (!descriptor) {
            needsGetter = true;
        }
        else if ("get" in descriptor) {
            needsGetter = !m.__esModule;
        }
        else {
            needsGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsGetter) {
            descriptor = { enumerable: true, get: function () { return m[k]; } };
        }
        Object.defineProperty(o, exportName, descriptor);
    }
    // Plain-assignment variant: a one-off copy of the current value.
    function bindByCopy(o, m, k, k2) {
        var exportName = k2 === undefined ? k : k2;
        o[exportName] = m[k];
    }
    return Object.create ? bindWithDescriptor : bindByCopy;
})();
|
|
13
|
+
/**
 * Compiler-emitted helper: stores `v` as the `default` member of `o`.
 * Uses an enumerable property definition when `Object.create` exists and a
 * plain assignment otherwise.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (o, v) {
        var descriptor = { enumerable: true, value: v };
        Object.defineProperty(o, "default", descriptor);
    }
    : function (o, v) {
        o["default"] = v;
    });
|
|
18
|
+
/**
 * Compiler-emitted helper behind `import * as x` for CommonJS modules.
 *
 * A module flagged `__esModule` is returned untouched. Anything else is
 * wrapped in a fresh object: every own property except "default" is bound
 * through `__createBinding`, and the module itself is attached through
 * `__setModuleDefault`.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Chosen on first use: the native own-key lister, or a for-in fallback.
    var listKeys = null;
    function ownKeys(target) {
        if (listKeys === null) {
            listKeys = Object.getOwnPropertyNames || function (obj) {
                var collected = [];
                for (var key in obj) {
                    if (Object.prototype.hasOwnProperty.call(obj, key)) {
                        collected[collected.length] = key;
                    }
                }
                return collected;
            };
        }
        return listKeys(target);
    }
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            ownKeys(mod).forEach(function (name) {
                if (name !== "default") {
                    __createBinding(namespace, mod, name);
                }
            });
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
|
|
35
|
+
/**
 * Compiler-emitted helper that drives a generator function as an async
 * function.
 *
 * `generator` is invoked with `thisArg` and `_arguments`; each yielded value
 * is adopted as a `P` promise (default: `Promise`) and its outcome is fed
 * back through `next` / `throw`. The returned promise resolves with the
 * generator's return value and rejects with anything it throws.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    if (!P) {
        P = Promise;
    }
    return new P(function (resolve, reject) {
        // Wrap non-promise values so `.then` is always available.
        function asPromise(value) {
            if (value instanceof P) {
                return value;
            }
            return new P(function (done) { done(value); });
        }
        function advance(result) {
            if (result.done) {
                resolve(result.value);
                return;
            }
            asPromise(result.value).then(onFulfilled, onRejected);
        }
        function onFulfilled(value) {
            try {
                advance(iterator.next(value));
            }
            catch (e) {
                reject(e);
            }
        }
        function onRejected(reason) {
            try {
                advance(iterator["throw"](reason));
            }
            catch (e) {
                reject(e);
            }
        }
        var iterator = generator.apply(thisArg, _arguments || []);
        advance(iterator.next());
    });
};
|
|
44
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
45
|
+
exports.DirectoryProcessor = exports.FileDiscoveryService = void 0;
|
|
46
|
+
const glob_1 = require("glob");
|
|
47
|
+
const fs = __importStar(require("fs"));
|
|
48
|
+
const path = __importStar(require("path"));
|
|
49
|
+
const di_1 = require("./di");
|
|
50
|
+
const corpus_1 = require("./corpus");
|
|
51
|
+
const ReferenceResolver_1 = require("./knowledge/references/ReferenceResolver");
|
|
52
|
+
const imageMetaGraph_1 = require("./knowledge/images/imageMetaGraph");
|
|
53
|
+
const referenceExtraction_1 = require("./processor/readers/referenceExtraction");
|
|
54
|
+
const ProcessedRegistry_1 = require("./processor/ProcessedRegistry");
|
|
55
|
+
const WebReferenceProcessor_1 = require("./knowledge/references/web/WebReferenceProcessor");
|
|
56
|
+
const CitationEvidenceProcessor_1 = require("./knowledge/references/citations/CitationEvidenceProcessor");
|
|
57
|
+
const pipeline_1 = require("./pipeline");
|
|
58
|
+
const canon_1 = require("./knowledge/canon");
|
|
59
|
+
const shared_1 = require("../shared");
|
|
60
|
+
const trace_1 = require("./trace");
|
|
61
|
+
const cost_1 = require("./cost");
|
|
62
|
+
const adapters_1 = require("./adapters");
|
|
63
|
+
/**
 * Finds the files a run should process by expanding the configured glob
 * filters underneath the input directory.
 */
class FileDiscoveryService {
    /**
     * @param {object} options - Run options; `input` (base directory),
     *   `filter` (glob pattern or list of patterns, relative to `input`) and
     *   `exclude` (passed to glob as `ignore`) are read.
     * @param {object} logger - Logger whose `info` and `warn` methods are used.
     */
    constructor(options, logger) {
        this.logger = logger;
        this.dir = options.input;
        this.filter = options.filter;
        this.exclude = options.exclude;
    }
    /**
     * Expands every filter against the input directory and returns the
     * matching files (directories are excluded via `nodir`).
     *
     * @returns {Promise<string[]>} Paths reported by glob.
     * @throws {Error} When nothing matches; the same message is logged as a
     *   warning first.
     */
    discover() {
        return __awaiter(this, void 0, void 0, function* () {
            // Tolerate a single pattern string as well as a list of patterns.
            const filters = typeof this.filter === "string" ? [this.filter] : this.filter;
            // glob treats "\" as an escape character, so patterns must use "/"
            // separators even on Windows, where path.join emits "\". On POSIX
            // path.sep is already "/" and this normalisation is a no-op.
            const patterns = filters.map(f => path.join(this.dir, f).split(path.sep).join("/"));
            const files = yield (0, glob_1.glob)(patterns, { nodir: true, ignore: this.exclude });
            if (files.length === 0) {
                const message = `No files found matching pattern: ${this.filter}`;
                this.logger.warn(message);
                throw new Error(message);
            }
            this.logger.info(`Found ${files.length} files to process`);
            return files;
        });
    }
}
|
|
84
|
+
exports.FileDiscoveryService = FileDiscoveryService;
|
|
85
|
+
/**
|
|
86
|
+
* Refactored DirectoryProcessor using dependency injection
|
|
87
|
+
* Focuses on orchestration while delegating business logic to services
|
|
88
|
+
*/
|
|
89
|
+
class DirectoryProcessor {
|
|
90
|
+
constructor(container) {
|
|
91
|
+
this.container = container;
|
|
92
|
+
}
|
|
93
|
+
/**
|
|
94
|
+
* Process a directory and generate knowledge graphs
|
|
95
|
+
*/
|
|
96
|
+
processDirectory(options) {
|
|
97
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
98
|
+
var _a, _b;
|
|
99
|
+
const logger = yield this.container.resolve(di_1.TYPES.Logger);
|
|
100
|
+
const progress = yield this.container.resolve(di_1.TYPES.ProgressEmitter);
|
|
101
|
+
const fileDiscoveryService = yield this.container.resolve(di_1.TYPES.FileDiscoveryService);
|
|
102
|
+
logger.info(`Starting knowledge graph generation`);
|
|
103
|
+
logger.info(`Input: ${options.input}, Filter: ${options.filter}, Output: ${options.output}, Model: ${options.llm.model}`);
|
|
104
|
+
// Debug trace: open the run. A resumed run skips checkpointed chunks, so its
|
|
105
|
+
// trace is partial — flagged here.
|
|
106
|
+
trace_1.trace.emit({
|
|
107
|
+
stage: "run", type: "run_start",
|
|
108
|
+
output: options.output,
|
|
109
|
+
resumed: !!((_a = options.resume) === null || _a === void 0 ? void 0 : _a.enabled),
|
|
110
|
+
config: { model: options.llm.model, promptVersion: options.llm.promptVersion, grounding: (_b = options.grounding) === null || _b === void 0 ? void 0 : _b.mode },
|
|
111
|
+
});
|
|
112
|
+
// Cost meter: attach the resolved logger (configured in ContainerFactory without one).
|
|
113
|
+
if (cost_1.meter.enabled)
|
|
114
|
+
cost_1.meter.attachLogger(logger);
|
|
115
|
+
try {
|
|
116
|
+
// Orchestrate the workflow
|
|
117
|
+
const files = yield fileDiscoveryService.discover();
|
|
118
|
+
progress.emit({ type: "discovery", totalFiles: files.length });
|
|
119
|
+
// Rough pre-run cost estimate (bill-shock heads-up; the end tally is exact).
|
|
120
|
+
if (cost_1.meter.enabled)
|
|
121
|
+
yield this.logCostEstimate(files, options, logger);
|
|
122
|
+
const knowledgeGraphs = yield this.processFiles(files, options);
|
|
123
|
+
if (shared_1.shutdown.isRequested()) {
|
|
124
|
+
logger.warn("Run interrupted — merging and exporting the partial graph collected so far. Re-run with --resume to continue.");
|
|
125
|
+
}
|
|
126
|
+
progress.emit({ type: "merge", graphCount: knowledgeGraphs.length });
|
|
127
|
+
const mergedKG = yield this.mergeGraphs(knowledgeGraphs, logger);
|
|
128
|
+
const finalKG = yield this.applyGraphTransforms(mergedKG, options, logger);
|
|
129
|
+
const outputPath = yield this.exportKnowledgeGraph(finalKG, options);
|
|
130
|
+
progress.emit({
|
|
131
|
+
type: "export",
|
|
132
|
+
format: options.export.format,
|
|
133
|
+
entities: finalKG.entities.length,
|
|
134
|
+
relations: finalKG.relations.length,
|
|
135
|
+
output: outputPath,
|
|
136
|
+
});
|
|
137
|
+
trace_1.trace.emit({
|
|
138
|
+
stage: "export", type: "export",
|
|
139
|
+
format: options.export.format,
|
|
140
|
+
entities: finalKG.entities.length,
|
|
141
|
+
relations: finalKG.relations.length,
|
|
142
|
+
});
|
|
143
|
+
this.logSuccess(finalKG, outputPath, logger);
|
|
144
|
+
// Cost meter: exact end-of-run tally + persist the resume-safe cumulative ledger.
|
|
145
|
+
if (cost_1.meter.enabled) {
|
|
146
|
+
logger.info(cost_1.meter.summary());
|
|
147
|
+
cost_1.meter.persistLedger();
|
|
148
|
+
}
|
|
149
|
+
progress.emit({
|
|
150
|
+
type: "done",
|
|
151
|
+
entities: finalKG.entities.length,
|
|
152
|
+
relations: finalKG.relations.length,
|
|
153
|
+
output: outputPath,
|
|
154
|
+
interrupted: shared_1.shutdown.isRequested(),
|
|
155
|
+
});
|
|
156
|
+
}
|
|
157
|
+
catch (error) {
|
|
158
|
+
this.handleError(error, options.logging.debug, logger);
|
|
159
|
+
progress.emit({
|
|
160
|
+
type: "error",
|
|
161
|
+
message: error instanceof Error ? error.message : String(error),
|
|
162
|
+
});
|
|
163
|
+
throw error;
|
|
164
|
+
}
|
|
165
|
+
});
|
|
166
|
+
}
|
|
167
|
+
/** Rough pre-run cost projection from discovered file sizes (bytes≈chars; no double read pass). */
|
|
168
|
+
logCostEstimate(files, options, logger) {
|
|
169
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
170
|
+
let totalChars = 0;
|
|
171
|
+
for (const f of files) {
|
|
172
|
+
try {
|
|
173
|
+
totalChars += (yield fs.promises.stat(f)).size;
|
|
174
|
+
}
|
|
175
|
+
catch (_a) {
|
|
176
|
+
/* unreadable/removed — skip */
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
const est = cost_1.meter.estimate(totalChars, options.chunking.size, options.llm.model);
|
|
180
|
+
const tokens = est.estPromptTokens + est.estCompletionTokens;
|
|
181
|
+
const money = est.priced
|
|
182
|
+
? `~${options.cost.currency} ${est.estCost.toFixed(est.estCost < 1 ? 4 : 2)}`
|
|
183
|
+
: `no price set (shown as ${options.cost.currency} 0)`;
|
|
184
|
+
logger.info(`Cost estimate (rough): ~${est.estChunks} chunk(s), ~${tokens.toLocaleString()} tokens for ` +
|
|
185
|
+
`model '${options.llm.model}' — ${money}. Resume-cached chunks reduce actual spend; the ` +
|
|
186
|
+
`end-of-run tally is exact.`);
|
|
187
|
+
});
|
|
188
|
+
}
|
|
189
|
+
/**
|
|
190
|
+
* Process multiple files and generate knowledge graphs
|
|
191
|
+
*/
|
|
192
|
+
processFiles(files, options) {
|
|
193
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
194
|
+
const knowledgeGraphs = [];
|
|
195
|
+
const logger = yield this.container.resolve(di_1.TYPES.Logger);
|
|
196
|
+
const progress = yield this.container.resolve(di_1.TYPES.ProgressEmitter);
|
|
197
|
+
// Load a prior output graph (if any) to seed retrieval CONTEXT only. It must
|
|
198
|
+
// NOT enter the merge set: re-merging already-merged output into a fresh run
|
|
199
|
+
// double-counts entities/observations on a plain (no --resume) re-run.
|
|
200
|
+
const priorGraphs = yield this.loadPriorGraphs(options.output, logger);
|
|
201
|
+
// Optional corpus analysis pre-pass: build/load a corpus-specific glossary
|
|
202
|
+
// (and cached per-file classification) once, before extraction.
|
|
203
|
+
const corpusProfile = yield this.buildCorpusProfile(files, options, logger);
|
|
204
|
+
const fileProcessor = yield this.container.resolve(di_1.TYPES.FileProcessor);
|
|
205
|
+
const kgBuilder = yield this.container.resolve(di_1.TYPES.KnowledgeGraphBuilder);
|
|
206
|
+
// Deterministic AST symbol seed (Phase 8): seed code definitions + exported
|
|
207
|
+
// members (and calls/imports edges) per file so the LLM augments the symbol
|
|
208
|
+
// set rather than originating it. Content-hash cached across the run.
|
|
209
|
+
const astSeed = options.ast.mode === "enabled"
|
|
210
|
+
? yield this.container.resolve(di_1.TYPES.AstSeedService)
|
|
211
|
+
: undefined;
|
|
212
|
+
yield (astSeed === null || astSeed === void 0 ? void 0 : astSeed.loadCache());
|
|
213
|
+
// Structured-emit adapters (data-sink track): a graph-native source (e.g. a
|
|
214
|
+
// .db) maps directly to graph fragments, bypassing the LLM. Empty registry =
|
|
215
|
+
// every file takes the normal read→build path (default).
|
|
216
|
+
const structuredAdapters = yield this.container
|
|
217
|
+
.resolve(di_1.TYPES.StructuredAdapterRegistry)
|
|
218
|
+
.catch(() => new adapters_1.StructuredAdapterRegistry()); // empty = off-path default
|
|
219
|
+
// Reference & link resolution: the corpus-relative path set drives link
|
|
220
|
+
// resolution (resolved-flag + follow targets). In follow mode it spans the
|
|
221
|
+
// WHOLE input tree (links can point outside the glob); otherwise the glob set.
|
|
222
|
+
const follow = options.references.follow.enabled;
|
|
223
|
+
const internalLinksOn = options.references.internalLinks.enabled || follow;
|
|
224
|
+
let corpusRelPaths;
|
|
225
|
+
if (follow) {
|
|
226
|
+
const allInput = yield new FileDiscoveryService({ input: options.input, filter: ["**/*"], exclude: options.exclude }, logger)
|
|
227
|
+
.discover()
|
|
228
|
+
.catch(() => []);
|
|
229
|
+
corpusRelPaths = new Set(allInput.map((f) => (0, corpus_1.toRelPathId)(options.input, f)));
|
|
230
|
+
}
|
|
231
|
+
else {
|
|
232
|
+
corpusRelPaths = internalLinksOn
|
|
233
|
+
? new Set(files.map((f) => (0, corpus_1.toRelPathId)(options.input, f)))
|
|
234
|
+
: new Set();
|
|
235
|
+
}
|
|
236
|
+
// Phase 1 — class-3 external web fetcher (opt-in; constructed only when
|
|
237
|
+
// references.web.enabled, so a default run never builds the network layer).
|
|
238
|
+
const webProc = options.references.web.enabled
|
|
239
|
+
? yield this.buildWebProcessor(options, fileProcessor, kgBuilder, corpusProfile, logger)
|
|
240
|
+
: null;
|
|
241
|
+
// Phase 2 — citation span-fetch (opt-in; constructed only when
|
|
242
|
+
// references.citations.fetch.enabled). Resolves id-bearing cites to OA full
|
|
243
|
+
// text, span-selects the citing claim's evidence, and labels the edge.
|
|
244
|
+
const citeProc = options.references.citations.fetch.enabled
|
|
245
|
+
? yield this.buildCitationProcessor(options, fileProcessor, kgBuilder, corpusProfile, logger)
|
|
246
|
+
: null;
|
|
247
|
+
// Worklist with a processed-file registry: the same file is read/extracted at
|
|
248
|
+
// most once however it's reached (overlapping globs, reference-following). The
|
|
249
|
+
// queue is seeded from follow.seeds (a crawl) or the discovered glob set, and
|
|
250
|
+
// (in follow mode) grows as internal links are resolved to existing files.
|
|
251
|
+
const registry = new ProcessedRegistry_1.ProcessedRegistry();
|
|
252
|
+
const queued = new Set();
|
|
253
|
+
const queue = [];
|
|
254
|
+
const enqueue = (file, depth) => {
|
|
255
|
+
const id = (0, corpus_1.toRelPathId)(options.input, file);
|
|
256
|
+
if (registry.has(id) || queued.has(id))
|
|
257
|
+
return;
|
|
258
|
+
queued.add(id);
|
|
259
|
+
queue.push({ file, depth });
|
|
260
|
+
};
|
|
261
|
+
const seeds = options.references.follow.seeds;
|
|
262
|
+
if (follow && seeds.length) {
|
|
263
|
+
for (const s of seeds) {
|
|
264
|
+
const abs = path.resolve(options.input, s);
|
|
265
|
+
if (fs.existsSync(abs))
|
|
266
|
+
enqueue(abs, 0);
|
|
267
|
+
else
|
|
268
|
+
logger.warn(`reference-follow seed not found, skipping: ${s}`);
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
else {
|
|
272
|
+
for (const f of files)
|
|
273
|
+
enqueue(f, 0);
|
|
274
|
+
}
|
|
275
|
+
const { maxFiles, maxDepth } = options.references.follow;
|
|
276
|
+
let index = 0;
|
|
277
|
+
while (queue.length > 0) {
|
|
278
|
+
// Cooperative interrupt: stop before starting the next file so the
|
|
279
|
+
// partial graph accumulated so far can still be merged and exported.
|
|
280
|
+
if (shared_1.shutdown.isRequested()) {
|
|
281
|
+
logger.warn(`Interrupted — flushing partial graph (${registry.size} files processed)`);
|
|
282
|
+
break;
|
|
283
|
+
}
|
|
284
|
+
if (follow && registry.size >= maxFiles) {
|
|
285
|
+
logger.warn(`reference-follow reached maxFiles=${maxFiles}; stopping discovery`);
|
|
286
|
+
break;
|
|
287
|
+
}
|
|
288
|
+
const { file, depth } = queue.shift();
|
|
289
|
+
const id = (0, corpus_1.toRelPathId)(options.input, file);
|
|
290
|
+
if (registry.has(id))
|
|
291
|
+
continue; // already processed via another path
|
|
292
|
+
index += 1;
|
|
293
|
+
const total = registry.size + queue.length + 1;
|
|
294
|
+
progress.emit({ type: "file_start", index, total, path: file });
|
|
295
|
+
try {
|
|
296
|
+
// Retrieval sees prior output + graphs built so far this run; merge sees
|
|
297
|
+
// only what's built this run (knowledgeGraphs).
|
|
298
|
+
const retrievalContext = [...priorGraphs, ...knowledgeGraphs];
|
|
299
|
+
const { graphs: fileGraphs, links: fileLinks, citations: fileCitations } = yield this.processFile(file, options, fileProcessor, kgBuilder, retrievalContext, logger, corpusProfile, astSeed, corpusRelPaths, structuredAdapters);
|
|
300
|
+
registry.mark(id);
|
|
301
|
+
knowledgeGraphs.push(...fileGraphs);
|
|
302
|
+
// Reference-driven ingestion: enqueue resolved internal-link targets that
|
|
303
|
+
// exist in the corpus and haven't been processed/queued. Network-free —
|
|
304
|
+
// external targets are skipped (that's the web fetcher below).
|
|
305
|
+
if (follow && (maxDepth === 0 || depth < maxDepth)) {
|
|
306
|
+
for (const link of fileLinks) {
|
|
307
|
+
if ((0, referenceExtraction_1.isExternalTarget)(link.target))
|
|
308
|
+
continue;
|
|
309
|
+
const rel = (0, ReferenceResolver_1.resolveInternalTarget)(link, id, corpusRelPaths);
|
|
310
|
+
if (rel && !registry.has(rel) && !queued.has(rel)) {
|
|
311
|
+
enqueue(path.resolve(options.input, rel), depth + 1);
|
|
312
|
+
}
|
|
313
|
+
}
|
|
314
|
+
}
|
|
315
|
+
// Phase 1 — class-3 external web: fetch this file's allowlisted external
|
|
316
|
+
// links (gated), extract, emit `references` edges. Depth-1 (fetched pages
|
|
317
|
+
// are not re-crawled). Offline unless references.web is enabled.
|
|
318
|
+
if (webProc) {
|
|
319
|
+
const webGraph = yield webProc.process(id, fileLinks, options.description);
|
|
320
|
+
if (webGraph)
|
|
321
|
+
knowledgeGraphs.push(webGraph);
|
|
322
|
+
}
|
|
323
|
+
// Phase 2 — citation span-fetch: resolve this file's id-bearing cites to OA
|
|
324
|
+
// full text, fold content + label faithfulness. Offline unless enabled.
|
|
325
|
+
if (citeProc) {
|
|
326
|
+
const citeGraph = yield citeProc.process(id, file, fileCitations);
|
|
327
|
+
if (citeGraph)
|
|
328
|
+
knowledgeGraphs.push(citeGraph);
|
|
329
|
+
}
|
|
330
|
+
const entities = fileGraphs.reduce((n, g) => n + g.entities.length, 0);
|
|
331
|
+
const relations = fileGraphs.reduce((n, g) => n + g.relations.length, 0);
|
|
332
|
+
progress.emit({ type: "file_complete", index, total, path: file, entities, relations });
|
|
333
|
+
if (options.logging.debug) {
|
|
334
|
+
yield this.writeIntermediateResults(knowledgeGraphs, options.output);
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
catch (error) {
|
|
338
|
+
registry.mark(id); // don't retry a hard-failing file in this run
|
|
339
|
+
this.handleFileError(file, error, options.logging.debug, logger);
|
|
340
|
+
progress.emit({ type: "file_complete", index, total: registry.size + queue.length, path: file, entities: 0, relations: 0 });
|
|
341
|
+
}
|
|
342
|
+
}
|
|
343
|
+
// Persist the AST symbol cache so an unchanged file is a no-op next run.
|
|
344
|
+
yield (astSeed === null || astSeed === void 0 ? void 0 : astSeed.saveCache());
|
|
345
|
+
// Surface chunks whose extraction failed: they were left uncheckpointed (so
|
|
346
|
+
// --resume retries them) and must not pass silently as "done-and-empty". The
|
|
347
|
+
// partial graph still merges/exports; the run exits non-zero (KG-02).
|
|
348
|
+
const failedChunks = kgBuilder.getFailedChunks();
|
|
349
|
+
if (failedChunks.length > 0) {
|
|
350
|
+
logger.warn(`${failedChunks.length} chunk(s) failed extraction and were left uncheckpointed — ` +
|
|
351
|
+
`re-run with --resume to retry them:`);
|
|
352
|
+
for (const f of failedChunks) {
|
|
353
|
+
logger.warn(` - ${f.filePath} [chunk ${f.chunkIndex}/${f.totalChunks}]: ${f.error}`);
|
|
354
|
+
}
|
|
355
|
+
process.exitCode = 1;
|
|
356
|
+
}
|
|
357
|
+
// Surface claims the inline grounding gate rejected (WI3 manifest trace):
|
|
358
|
+
// in `drop` mode they were removed from the graph, in `flag` mode annotated
|
|
359
|
+
// and kept — either way they must leave a visible trace, not vanish.
|
|
360
|
+
const rejections = kgBuilder.getGroundingRejections();
|
|
361
|
+
if (rejections.length > 0) {
|
|
362
|
+
const dropped = rejections.filter((r) => r.dropped).length;
|
|
363
|
+
logger.warn(`Grounding gate flagged ${rejections.length} ungrounded claim(s)` +
|
|
364
|
+
(dropped > 0 ? ` (${dropped} dropped, ${rejections.length - dropped} flagged)` : ` (all flagged)`) +
|
|
365
|
+
`:`);
|
|
366
|
+
for (const r of rejections) {
|
|
367
|
+
logger.debug(` - [${r.kind}] ${r.subject} (score ${r.score.toFixed(2)}, ` +
|
|
368
|
+
`${r.dropped ? "dropped" : "flagged"}) in ${r.filePath} [chunk ${r.chunkIndex}]: ${r.claim}`);
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
return knowledgeGraphs;
|
|
372
|
+
});
|
|
373
|
+
}
|
|
374
|
+
/**
|
|
375
|
+
* Load a previously-written output graph for retrieval seeding. Tolerates both
|
|
376
|
+
* the current single-graph object (`{entities, relations}`) and a legacy array
|
|
377
|
+
* of per-file graphs. Returns [] (and warns) when missing/unparseable — the
|
|
378
|
+
* prior graph is a retrieval nicety, never required.
|
|
379
|
+
*/
|
|
380
|
+
loadPriorGraphs(outputPath, logger) {
|
|
381
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
382
|
+
if (!outputPath || !fs.existsSync(outputPath))
|
|
383
|
+
return [];
|
|
384
|
+
const raw = fs.readFileSync(outputPath, "utf-8");
|
|
385
|
+
// JSONL / mcp-jsonl outputs aren't valid JSON documents — parse them
|
|
386
|
+
// line-by-line (KG-11) instead of warning every run. Route by extension, and
|
|
387
|
+
// also fall back to the JSONL reader if a `.json` somehow fails to parse.
|
|
388
|
+
const isJsonl = /\.(jsonl|mcp-jsonl)$/i.test(outputPath);
|
|
389
|
+
if (isJsonl) {
|
|
390
|
+
const { JsonlExportStrategy } = yield Promise.resolve().then(() => __importStar(require("./export/strategies/JsonlExportStrategy")));
|
|
391
|
+
const graph = JsonlExportStrategy.fromJSONL(raw);
|
|
392
|
+
return graph.entities.length || graph.relations.length ? [graph] : [];
|
|
393
|
+
}
|
|
394
|
+
try {
|
|
395
|
+
const parsed = JSON.parse(raw);
|
|
396
|
+
if (Array.isArray(parsed))
|
|
397
|
+
return parsed;
|
|
398
|
+
if (parsed && typeof parsed === "object" && Array.isArray(parsed.entities)) {
|
|
399
|
+
return [parsed];
|
|
400
|
+
}
|
|
401
|
+
return [];
|
|
402
|
+
}
|
|
403
|
+
catch (_a) {
|
|
404
|
+
// Not a JSON document — try JSONL before giving up (covers a mislabeled file).
|
|
405
|
+
const { JsonlExportStrategy } = yield Promise.resolve().then(() => __importStar(require("./export/strategies/JsonlExportStrategy")));
|
|
406
|
+
const graph = JsonlExportStrategy.fromJSONL(raw);
|
|
407
|
+
if (graph.entities.length || graph.relations.length)
|
|
408
|
+
return [graph];
|
|
409
|
+
logger.warn(`Could not load prior graph at ${outputPath} for retrieval context (ignored)`);
|
|
410
|
+
return [];
|
|
411
|
+
}
|
|
412
|
+
});
|
|
413
|
+
}
|
|
414
|
+
/**
|
|
415
|
+
* Process a single file
|
|
416
|
+
*/
|
|
417
|
+
processFile(file_1, options_1, fileProcessor_1, kgBuilder_1, existingGraphs_1, logger_1, corpusProfile_1, astSeed_1) {
|
|
418
|
+
return __awaiter(this, arguments, void 0, function* (file, options, fileProcessor, kgBuilder, existingGraphs, logger, corpusProfile, astSeed, corpusRelPaths = new Set(), structuredAdapters) {
|
|
419
|
+
var _a, _b, _c, _d, _e;
|
|
420
|
+
logger.info(`Processing: ${file}`);
|
|
421
|
+
// Structured-emit path (data-sink track): if an adapter claims this file, it
|
|
422
|
+
// maps the source DIRECTLY to graph fragments (bypassing read→chunk→LLM). The
|
|
423
|
+
// fragment still enters the per-file graphs[] union → merge/canon.
|
|
424
|
+
const adapter = structuredAdapters === null || structuredAdapters === void 0 ? void 0 : structuredAdapters.match(file);
|
|
425
|
+
if (adapter) {
|
|
426
|
+
logger.info(`Structured adapter '${adapter.id}' handling ${file} (graph-native, no LLM)`);
|
|
427
|
+
const graph = yield adapter.extract(file);
|
|
428
|
+
if (trace_1.trace.enabled) {
|
|
429
|
+
trace_1.trace.emit({
|
|
430
|
+
stage: "ingest", type: "chunk", chunkId: `${file}#0`, file,
|
|
431
|
+
chunkIndex: 0, totalChunks: 1, reader: `adapter:${adapter.id}`, contentLength: 0,
|
|
432
|
+
});
|
|
433
|
+
}
|
|
434
|
+
return { graphs: graph ? [graph] : [], links: [], citations: [] };
|
|
435
|
+
}
|
|
436
|
+
// Reuse the pre-pass's cached classification for this file when available.
|
|
437
|
+
const cachedClasses = corpusProfile === null || corpusProfile === void 0 ? void 0 : corpusProfile.perFileClasses[(0, corpus_1.toRelPathId)(options.input, file)];
|
|
438
|
+
const processedFile = yield fileProcessor.processFile(file, cachedClasses);
|
|
439
|
+
// A reader can signal a graceful skip (BinaryReader for binary/unknown
|
|
440
|
+
// files) — honor it before the "no content extracted" guard turns an empty
|
|
441
|
+
// read into a per-file error.
|
|
442
|
+
if ((_a = processedFile.metadata) === null || _a === void 0 ? void 0 : _a.skip) {
|
|
443
|
+
logger.info(`Skipped ${file} (binary / no extractable text)`);
|
|
444
|
+
return { graphs: [], links: [], citations: [] };
|
|
445
|
+
}
|
|
446
|
+
this.validateProcessedFile(processedFile, file, logger);
|
|
447
|
+
const retrieve = yield this.buildRetriever(processedFile, file, existingGraphs, options);
|
|
448
|
+
const promptManager = (yield this.container.resolve(di_1.TYPES.PromptManager));
|
|
449
|
+
const systemPrompt = yield promptManager.getSystemPrompt(options.input, options.filter.join(', '), options.description, (_b = processedFile.metadata) === null || _b === void 0 ? void 0 : _b.classes, corpusProfile === null || corpusProfile === void 0 ? void 0 : corpusProfile.glossary, options.pipeline.extraction.openPredicate);
|
|
450
|
+
const graphs = yield kgBuilder.build(processedFile, systemPrompt, retrieve, corpusProfile === null || corpusProfile === void 0 ? void 0 : corpusProfile.glossary);
|
|
451
|
+
// Append the deterministic AST symbol seed (Phase 8) so it merges with the
|
|
452
|
+
// LLM's per-chunk graphs — the model augments the symbol set, not originates it.
|
|
453
|
+
const seed = astSeed ? yield astSeed.seedGraph(processedFile) : null;
|
|
454
|
+
if (seed)
|
|
455
|
+
graphs.push(seed);
|
|
456
|
+
// Deterministic image metadata (EXIF/C2PA): graph facts that AUGMENT the VLM's
|
|
457
|
+
// read of an image rather than replacing it (sourceAdapter exif/c2pa, confidence).
|
|
458
|
+
// No-op (returns null) unless a reader stashed metadata.exif/metadata.c2pa.
|
|
459
|
+
const imageGraph = (0, imageMetaGraph_1.buildImageMetaGraph)(processedFile, options.input);
|
|
460
|
+
if (imageGraph)
|
|
461
|
+
graphs.push(imageGraph);
|
|
462
|
+
// Deterministic reference edges (Phase 0, network-free): internal links +
|
|
463
|
+
// citations the document already contains, resolved against the corpus.
|
|
464
|
+
// Merges with the LLM graphs like the AST seed above. Following auto-implies
|
|
465
|
+
// internal-link resolution (you can't follow links you didn't extract).
|
|
466
|
+
const internalLinksOn = options.references.internalLinks.enabled || options.references.follow.enabled;
|
|
467
|
+
// When citation-fetch is on (Phase 2), the CitationEvidenceProcessor OWNS the
|
|
468
|
+
// `cites` edges (resolved + faithfulness) — so the network-free resolver stands
|
|
469
|
+
// down on citations to avoid emitting a competing resolved:false edge.
|
|
470
|
+
const fetchOwnsCites = options.references.citations.fetch.enabled;
|
|
471
|
+
const citationsForResolver = options.references.citations.enabled && !fetchOwnsCites;
|
|
472
|
+
if (internalLinksOn || citationsForResolver) {
|
|
473
|
+
const refGraph = (0, ReferenceResolver_1.buildReferenceGraph)(processedFile, corpusRelPaths, options.input, {
|
|
474
|
+
internalLinks: internalLinksOn,
|
|
475
|
+
citations: citationsForResolver,
|
|
476
|
+
});
|
|
477
|
+
if (refGraph)
|
|
478
|
+
graphs.push(refGraph);
|
|
479
|
+
}
|
|
480
|
+
const refs = (_c = processedFile.metadata) === null || _c === void 0 ? void 0 : _c.references;
|
|
481
|
+
return { graphs, links: (_d = refs === null || refs === void 0 ? void 0 : refs.links) !== null && _d !== void 0 ? _d : [], citations: (_e = refs === null || refs === void 0 ? void 0 : refs.citations) !== null && _e !== void 0 ? _e : [] };
|
|
482
|
+
});
|
|
483
|
+
}
|
|
484
|
+
/**
|
|
485
|
+
* Build the Phase-1 web reference processor: the DI-managed gated fetcher +
|
|
486
|
+
* fetch cache, plus an extract closure that runs a fetched page through the
|
|
487
|
+
* normal reader + builder (content only — no reference-resolver/follow on
|
|
488
|
+
* fetched pages = depth-1).
|
|
489
|
+
*/
|
|
490
|
+
buildWebProcessor(options, fileProcessor, kgBuilder, corpusProfile, logger) {
|
|
491
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
492
|
+
const fetcher = yield this.container.resolve(di_1.TYPES.GatedFetcher);
|
|
493
|
+
const cache = yield this.container.resolve(di_1.TYPES.FetchCacheService);
|
|
494
|
+
const promptManager = (yield this.container.resolve(di_1.TYPES.PromptManager));
|
|
495
|
+
const extract = (tempPath) => __awaiter(this, void 0, void 0, function* () {
|
|
496
|
+
var _a, _b;
|
|
497
|
+
const pf = yield fileProcessor.processFile(tempPath);
|
|
498
|
+
if (((_a = pf.metadata) === null || _a === void 0 ? void 0 : _a.skip) || !pf.chunks.length)
|
|
499
|
+
return [];
|
|
500
|
+
const systemPrompt = yield promptManager.getSystemPrompt(options.input, options.filter.join(", "), options.description, (_b = pf.metadata) === null || _b === void 0 ? void 0 : _b.classes, corpusProfile === null || corpusProfile === void 0 ? void 0 : corpusProfile.glossary, options.pipeline.extraction.openPredicate);
|
|
501
|
+
return kgBuilder.build(pf, systemPrompt, undefined, corpusProfile === null || corpusProfile === void 0 ? void 0 : corpusProfile.glossary);
|
|
502
|
+
});
|
|
503
|
+
return new WebReferenceProcessor_1.WebReferenceProcessor(fetcher, cache, extract, logger);
|
|
504
|
+
});
|
|
505
|
+
}
|
|
506
|
+
/**
|
|
507
|
+
* Build the Phase-2 citation evidence processor: a PDF-capable gated fetcher +
|
|
508
|
+
* its own fetch cache + the id→OA resolver, an extract closure that runs a
|
|
509
|
+
* fetched cited PDF through the normal reader (chunks for span-select) + builder
|
|
510
|
+
* (content folded onto the cited-work node), the embedding provider for
|
|
511
|
+
* span-select, and (optionally) GROBID for marker→claim linking + MiniCheck for
|
|
512
|
+
* the faithfulness label.
|
|
513
|
+
*/
|
|
514
|
+
buildCitationProcessor(options, fileProcessor, kgBuilder, corpusProfile, logger) {
|
|
515
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
516
|
+
const { CitationResolver } = yield Promise.resolve().then(() => __importStar(require("./knowledge/references/citations/CitationResolver")));
|
|
517
|
+
const fetcher = yield this.container.resolve(di_1.TYPES.CitationFetcher);
|
|
518
|
+
const cache = yield this.container.resolve(di_1.TYPES.CitationFetchCache);
|
|
519
|
+
const resolver = yield this.container.resolve(di_1.TYPES.CitationResolver);
|
|
520
|
+
const embeddings = yield this.container.resolve(di_1.TYPES.EmbeddingService);
|
|
521
|
+
const promptManager = (yield this.container.resolve(di_1.TYPES.PromptManager));
|
|
522
|
+
const cfg = options.references.citations;
|
|
523
|
+
const grobid = cfg.grobid.enabled
|
|
524
|
+
? yield this.container.resolve(di_1.TYPES.GrobidClient)
|
|
525
|
+
: null;
|
|
526
|
+
if (grobid && !(yield grobid.isAlive())) {
|
|
527
|
+
logger.warn(`GROBID not reachable at ${cfg.grobid.url} — citation span-select/faithfulness disabled (id-bearing fetch still runs). Start it with: docker run -p 8070:8070 lfoppiano/grobid`);
|
|
528
|
+
}
|
|
529
|
+
let faithfulness = null;
|
|
530
|
+
if (cfg.fetch.minicheck) {
|
|
531
|
+
const { MiniCheckGroundingChecker } = yield Promise.resolve().then(() => __importStar(require("./knowledge/grounding")));
|
|
532
|
+
faithfulness = new MiniCheckGroundingChecker({ model: cfg.fetch.minicheckModel, host: cfg.fetch.minicheckHost, min: 0.5, escalateAbove: 1.1 }, logger);
|
|
533
|
+
}
|
|
534
|
+
const extract = (tempPath) => __awaiter(this, void 0, void 0, function* () {
|
|
535
|
+
var _a, _b;
|
|
536
|
+
const pf = yield fileProcessor.processFile(tempPath);
|
|
537
|
+
if (((_a = pf.metadata) === null || _a === void 0 ? void 0 : _a.skip) || !pf.chunks.length)
|
|
538
|
+
return { chunks: [], graphs: [] };
|
|
539
|
+
const chunks = pf.chunks.map((ch) => ch.content);
|
|
540
|
+
const systemPrompt = yield promptManager.getSystemPrompt(options.input, options.filter.join(", "), options.description, (_b = pf.metadata) === null || _b === void 0 ? void 0 : _b.classes, corpusProfile === null || corpusProfile === void 0 ? void 0 : corpusProfile.glossary, options.pipeline.extraction.openPredicate);
|
|
541
|
+
const graphs = yield kgBuilder.build(pf, systemPrompt, undefined, corpusProfile === null || corpusProfile === void 0 ? void 0 : corpusProfile.glossary);
|
|
542
|
+
return { chunks, graphs };
|
|
543
|
+
});
|
|
544
|
+
return new CitationEvidenceProcessor_1.CitationEvidenceProcessor(fetcher, cache, resolver, extract, embeddings, logger, {
|
|
545
|
+
grobid,
|
|
546
|
+
faithfulness,
|
|
547
|
+
uncertainBand: cfg.fetch.uncertainBand,
|
|
548
|
+
});
|
|
549
|
+
});
|
|
550
|
+
}
|
|
551
|
+
/**
|
|
552
|
+
* Run the optional corpus analysis pre-pass (term frequency + cached
|
|
553
|
+
* classification + LLM glossary). Returns undefined when disabled or on
|
|
554
|
+
* failure — profiling is an enhancement, never required.
|
|
555
|
+
*/
|
|
556
|
+
buildCorpusProfile(files, options, logger) {
|
|
557
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
558
|
+
if (options.corpus.profiling !== "enabled")
|
|
559
|
+
return undefined;
|
|
560
|
+
try {
|
|
561
|
+
logger.info("Corpus analysis pre-pass enabled — profiling corpus before extraction");
|
|
562
|
+
const analyzer = yield this.container.resolve(di_1.TYPES.CorpusAnalyzer);
|
|
563
|
+
return yield analyzer.analyzeOrLoad(files, options);
|
|
564
|
+
}
|
|
565
|
+
catch (error) {
|
|
566
|
+
logger.warn(`Corpus pre-pass failed (continuing without a glossary): ${error}`);
|
|
567
|
+
return undefined;
|
|
568
|
+
}
|
|
569
|
+
});
|
|
570
|
+
}
|
|
571
|
+
/**
|
|
572
|
+
* Validate processed file content
|
|
573
|
+
*/
|
|
574
|
+
validateProcessedFile(processedFile, filePath, logger) {
|
|
575
|
+
var _a;
|
|
576
|
+
if (!((_a = processedFile.chunks) === null || _a === void 0 ? void 0 : _a.length)) {
|
|
577
|
+
logger.warn(`No content extracted from: ${filePath}`);
|
|
578
|
+
throw new Error(`No content extracted from file: ${filePath}`);
|
|
579
|
+
}
|
|
580
|
+
}
|
|
581
|
+
/**
|
|
582
|
+
* Build a retrieval function for a file, or undefined when retrieval is
|
|
583
|
+
* disabled / there's no existing graph to search.
|
|
584
|
+
*
|
|
585
|
+
* - `retrievalScope: "chunk"` (default) returns a function that retrieves
|
|
586
|
+
* context per chunk using that chunk's own content.
|
|
587
|
+
* - `retrievalScope: "file"` retrieves once from the first chunk and reuses
|
|
588
|
+
* it for every chunk (legacy behavior).
|
|
589
|
+
*/
|
|
590
|
+
buildRetriever(processedFile, filePath, existingGraphs, options) {
|
|
591
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
592
|
+
if (!this.shouldUseRetrieval(options) || existingGraphs.length === 0) {
|
|
593
|
+
return undefined;
|
|
594
|
+
}
|
|
595
|
+
const searchService = yield this.container.resolve(di_1.TYPES.KnowledgeGraphSearch);
|
|
596
|
+
const searchOptions = {
|
|
597
|
+
limit: options.retrieval.limit,
|
|
598
|
+
includeObservations: true,
|
|
599
|
+
};
|
|
600
|
+
const search = (content) => searchService.searchByFileContent(content, filePath, existingGraphs, searchOptions);
|
|
601
|
+
if (options.retrieval.scope === "file") {
|
|
602
|
+
// Retrieve once from the first chunk, reuse for all chunks.
|
|
603
|
+
const context = yield search(processedFile.chunks[0].content);
|
|
604
|
+
return () => __awaiter(this, void 0, void 0, function* () { return context; });
|
|
605
|
+
}
|
|
606
|
+
// Default: per-chunk retrieval.
|
|
607
|
+
return (chunkContent) => search(chunkContent);
|
|
608
|
+
});
|
|
609
|
+
}
|
|
610
|
+
/**
|
|
611
|
+
* Determine if retrieval should be used
|
|
612
|
+
*/
|
|
613
|
+
shouldUseRetrieval(options) {
|
|
614
|
+
// Fix the conflicting boolean pairs issue
|
|
615
|
+
if (options.retrieval.mode === "disabled")
|
|
616
|
+
return false;
|
|
617
|
+
if (options.retrieval.mode === "enabled")
|
|
618
|
+
return true;
|
|
619
|
+
return true; // Auto to true
|
|
620
|
+
}
|
|
621
|
+
/**
|
|
622
|
+
* Merge multiple knowledge graphs
|
|
623
|
+
*/
|
|
624
|
+
mergeGraphs(graphs, logger) {
|
|
625
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
626
|
+
logger.info(`Merging ${graphs.length} knowledge graphs`);
|
|
627
|
+
const merger = yield this.container.resolve(di_1.TYPES.KnowledgeGraphMerger);
|
|
628
|
+
return yield merger.merge(graphs);
|
|
629
|
+
});
|
|
630
|
+
}
|
|
631
|
+
/**
|
|
632
|
+
* Run the post-extraction graph→graph transform pipeline (grounding gate,
|
|
633
|
+
* canonicalization) over the merged graph, in the order from `pipeline.stages`.
|
|
634
|
+
* A no-op when no transform is enabled — the providers resolved here are the
|
|
635
|
+
* same singletons extraction/merge already built, so the baseline path returns
|
|
636
|
+
* the merged graph unchanged.
|
|
637
|
+
*/
|
|
638
|
+
applyGraphTransforms(graph, options, logger) {
|
|
639
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
640
|
+
const transforms = [
|
|
641
|
+
new pipeline_1.GroundingTransform(),
|
|
642
|
+
new canon_1.Canonicalizer(),
|
|
643
|
+
new pipeline_1.RelationFilterTransform(), // after canon: endpoints are canonical before pairing
|
|
644
|
+
];
|
|
645
|
+
const ctx = {
|
|
646
|
+
options,
|
|
647
|
+
embeddings: yield this.container.resolve(di_1.TYPES.EmbeddingService),
|
|
648
|
+
llm: yield this.container.resolve(di_1.TYPES.LLMService),
|
|
649
|
+
logger,
|
|
650
|
+
};
|
|
651
|
+
const runner = new pipeline_1.PipelineRunner(transforms, ctx);
|
|
652
|
+
if (!runner.hasWork())
|
|
653
|
+
return graph;
|
|
654
|
+
return runner.run(graph);
|
|
655
|
+
});
|
|
656
|
+
}
|
|
657
|
+
/**
|
|
658
|
+
* Export knowledge graph in the requested format
|
|
659
|
+
*/
|
|
660
|
+
exportKnowledgeGraph(knowledgeGraph, options) {
|
|
661
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
662
|
+
yield this.ensureOutputDirectory(options.output);
|
|
663
|
+
const exporter = yield this.container.resolve(di_1.TYPES.KnowledgeGraphExportService);
|
|
664
|
+
const exportFormat = options.export.format;
|
|
665
|
+
if (!exporter.isFormatSupported(exportFormat)) {
|
|
666
|
+
throw new Error(`Unsupported export format: ${exportFormat}. Supported: ${exporter
|
|
667
|
+
.getSupportedFormats()
|
|
668
|
+
.join(", ")}`);
|
|
669
|
+
}
|
|
670
|
+
const outputContent = exporter.export(knowledgeGraph, exportFormat, options);
|
|
671
|
+
const outputPath = this.getOutputPath(options.output, exportFormat);
|
|
672
|
+
yield fs.promises.writeFile(outputPath, outputContent);
|
|
673
|
+
return outputPath;
|
|
674
|
+
});
|
|
675
|
+
}
|
|
676
|
+
/**
|
|
677
|
+
* Ensure output directory exists
|
|
678
|
+
*/
|
|
679
|
+
ensureOutputDirectory(outputPath) {
|
|
680
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
681
|
+
const outputDir = path.dirname(outputPath);
|
|
682
|
+
if (!fs.existsSync(outputDir)) {
|
|
683
|
+
fs.mkdirSync(outputDir, { recursive: true });
|
|
684
|
+
}
|
|
685
|
+
});
|
|
686
|
+
}
|
|
687
|
+
/**
|
|
688
|
+
* Get the final output path with correct extension
|
|
689
|
+
*/
|
|
690
|
+
getOutputPath(originalPath, format) {
|
|
691
|
+
return originalPath.endsWith(`.${format}`)
|
|
692
|
+
? originalPath
|
|
693
|
+
: originalPath.replace(/\.[^.]+$/, `.${format}`);
|
|
694
|
+
}
|
|
695
|
+
/**
|
|
696
|
+
* Write intermediate results for debugging
|
|
697
|
+
*/
|
|
698
|
+
writeIntermediateResults(graphs, outputPath) {
|
|
699
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
700
|
+
const tmpPath = outputPath + ".tmp";
|
|
701
|
+
yield fs.promises.writeFile(tmpPath, JSON.stringify(graphs, null, 2));
|
|
702
|
+
});
|
|
703
|
+
}
|
|
704
|
+
/**
|
|
705
|
+
* Handle file processing errors
|
|
706
|
+
*/
|
|
707
|
+
handleFileError(file, error, debug, logger) {
|
|
708
|
+
logger.error(`Failed to process file ${file}: ${error.message || error}`);
|
|
709
|
+
}
|
|
710
|
+
/**
|
|
711
|
+
* Handle general processing errors
|
|
712
|
+
*/
|
|
713
|
+
handleError(error, debug, logger) {
|
|
714
|
+
logger.error(`Failed to process directory: ${error.message || error}`);
|
|
715
|
+
}
|
|
716
|
+
/**
|
|
717
|
+
* Log successful completion
|
|
718
|
+
*/
|
|
719
|
+
logSuccess(knowledgeGraph, outputPath, logger) {
|
|
720
|
+
logger.info(`Knowledge graph saved to: ${outputPath}`);
|
|
721
|
+
logger.info(`Final graph: ${knowledgeGraph.entities.length} entities, ${knowledgeGraph.relations.length} relations`);
|
|
722
|
+
}
|
|
723
|
+
}
|
|
724
|
+
// CommonJS export: expose the DirectoryProcessor class to `require()` callers.
exports.DirectoryProcessor = DirectoryProcessor;
|
|
725
|
+
//# sourceMappingURL=DirectoryProcessor.js.map
|