@wanshi-kg/wanshi 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +458 -0
- package/dist/__tests__/helpers.js +27 -0
- package/dist/__tests__/helpers.js.map +1 -0
- package/dist/cli/commands/export.command.js +99 -0
- package/dist/cli/commands/export.command.js.map +1 -0
- package/dist/cli/commands/index.js +22 -0
- package/dist/cli/commands/index.js.map +1 -0
- package/dist/cli/commands/inspectMerges.command.js +84 -0
- package/dist/cli/commands/inspectMerges.command.js.map +1 -0
- package/dist/cli/commands/metrics.command.js +196 -0
- package/dist/cli/commands/metrics.command.js.map +1 -0
- package/dist/cli/commands/process.command.js +82 -0
- package/dist/cli/commands/process.command.js.map +1 -0
- package/dist/cli/commands/watch.command.js +91 -0
- package/dist/cli/commands/watch.command.js.map +1 -0
- package/dist/cli/index.js +269 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/optionsToConfig.js +160 -0
- package/dist/cli/optionsToConfig.js.map +1 -0
- package/dist/config/index.js +59 -0
- package/dist/config/index.js.map +1 -0
- package/dist/config/legacyHints.js +113 -0
- package/dist/config/legacyHints.js.map +1 -0
- package/dist/config/schema.js +803 -0
- package/dist/config/schema.js.map +1 -0
- package/dist/config/ui.js +221 -0
- package/dist/config/ui.js.map +1 -0
- package/dist/core/DirectoryProcessor.js +725 -0
- package/dist/core/DirectoryProcessor.js.map +1 -0
- package/dist/core/adapters/IStructuredAdapter.js +3 -0
- package/dist/core/adapters/IStructuredAdapter.js.map +1 -0
- package/dist/core/adapters/SqliteAdapter.js +267 -0
- package/dist/core/adapters/SqliteAdapter.js.map +1 -0
- package/dist/core/adapters/StructuredAdapterRegistry.js +31 -0
- package/dist/core/adapters/StructuredAdapterRegistry.js.map +1 -0
- package/dist/core/adapters/index.js +20 -0
- package/dist/core/adapters/index.js.map +1 -0
- package/dist/core/checkpoint/CheckpointService.js +188 -0
- package/dist/core/checkpoint/CheckpointService.js.map +1 -0
- package/dist/core/checkpoint/index.js +18 -0
- package/dist/core/checkpoint/index.js.map +1 -0
- package/dist/core/corpus/CorpusAnalyzer.js +266 -0
- package/dist/core/corpus/CorpusAnalyzer.js.map +1 -0
- package/dist/core/corpus/CorpusProfileStore.js +92 -0
- package/dist/core/corpus/CorpusProfileStore.js.map +1 -0
- package/dist/core/corpus/index.js +21 -0
- package/dist/core/corpus/index.js.map +1 -0
- package/dist/core/corpus/normalizeGlossary.js +60 -0
- package/dist/core/corpus/normalizeGlossary.js.map +1 -0
- package/dist/core/corpus/relPath.js +52 -0
- package/dist/core/corpus/relPath.js.map +1 -0
- package/dist/core/corpus/termFrequency.js +86 -0
- package/dist/core/corpus/termFrequency.js.map +1 -0
- package/dist/core/cost/CostMeter.js +235 -0
- package/dist/core/cost/CostMeter.js.map +1 -0
- package/dist/core/cost/index.js +19 -0
- package/dist/core/cost/index.js.map +1 -0
- package/dist/core/cost/prices.js +38 -0
- package/dist/core/cost/prices.js.map +1 -0
- package/dist/core/cv/ObjectDetectionService.js +119 -0
- package/dist/core/cv/ObjectDetectionService.js.map +1 -0
- package/dist/core/di/ContainerFactory.js +670 -0
- package/dist/core/di/ContainerFactory.js.map +1 -0
- package/dist/core/di/DIContainer.js +103 -0
- package/dist/core/di/DIContainer.js.map +1 -0
- package/dist/core/di/index.js +19 -0
- package/dist/core/di/index.js.map +1 -0
- package/dist/core/errors/CustomErrors.js +342 -0
- package/dist/core/errors/CustomErrors.js.map +1 -0
- package/dist/core/errors/index.js +18 -0
- package/dist/core/errors/index.js.map +1 -0
- package/dist/core/export/KnowledgeGraphExportService.js +56 -0
- package/dist/core/export/KnowledgeGraphExportService.js.map +1 -0
- package/dist/core/export/index.js +19 -0
- package/dist/core/export/index.js.map +1 -0
- package/dist/core/export/strategies/GraphitiExportStrategy.js +115 -0
- package/dist/core/export/strategies/GraphitiExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/GraphvizDotExportStrategy.js +331 -0
- package/dist/core/export/strategies/GraphvizDotExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/IExportStrategy.js +3 -0
- package/dist/core/export/strategies/IExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/JsonExportStrategy.js +19 -0
- package/dist/core/export/strategies/JsonExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/JsonlExportStrategy.js +69 -0
- package/dist/core/export/strategies/JsonlExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/KblamExportStrategy.js +36 -0
- package/dist/core/export/strategies/KblamExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/LoraExportStrategy.js +46 -0
- package/dist/core/export/strategies/LoraExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/McpExportStrategy.js +67 -0
- package/dist/core/export/strategies/McpExportStrategy.js.map +1 -0
- package/dist/core/export/strategies/index.js +25 -0
- package/dist/core/export/strategies/index.js.map +1 -0
- package/dist/core/export/strategies/kbTriples.js +60 -0
- package/dist/core/export/strategies/kbTriples.js.map +1 -0
- package/dist/core/index.js +22 -0
- package/dist/core/index.js.map +1 -0
- package/dist/core/knowledge/KnowledgeGraphBuilder.js +627 -0
- package/dist/core/knowledge/KnowledgeGraphBuilder.js.map +1 -0
- package/dist/core/knowledge/MergeRecord.js +3 -0
- package/dist/core/knowledge/MergeRecord.js.map +1 -0
- package/dist/core/knowledge/canon/Canonicalizer.js +414 -0
- package/dist/core/knowledge/canon/Canonicalizer.js.map +1 -0
- package/dist/core/knowledge/canon/index.js +18 -0
- package/dist/core/knowledge/canon/index.js.map +1 -0
- package/dist/core/knowledge/contradiction/HeuristicContradictionChecker.js +92 -0
- package/dist/core/knowledge/contradiction/HeuristicContradictionChecker.js.map +1 -0
- package/dist/core/knowledge/contradiction/LlmContradictionChecker.js +52 -0
- package/dist/core/knowledge/contradiction/LlmContradictionChecker.js.map +1 -0
- package/dist/core/knowledge/contradiction/index.js +19 -0
- package/dist/core/knowledge/contradiction/index.js.map +1 -0
- package/dist/core/knowledge/grounding/KeywordGroundingChecker.js +33 -0
- package/dist/core/knowledge/grounding/KeywordGroundingChecker.js.map +1 -0
- package/dist/core/knowledge/grounding/MiniCheckGroundingChecker.js +82 -0
- package/dist/core/knowledge/grounding/MiniCheckGroundingChecker.js.map +1 -0
- package/dist/core/knowledge/grounding/index.js +20 -0
- package/dist/core/knowledge/grounding/index.js.map +1 -0
- package/dist/core/knowledge/grounding/verbalize.js +38 -0
- package/dist/core/knowledge/grounding/verbalize.js.map +1 -0
- package/dist/core/knowledge/images/imageMetaGraph.js +136 -0
- package/dist/core/knowledge/images/imageMetaGraph.js.map +1 -0
- package/dist/core/knowledge/index.js +20 -0
- package/dist/core/knowledge/index.js.map +1 -0
- package/dist/core/knowledge/merging/KnowledgeMerger.js +624 -0
- package/dist/core/knowledge/merging/KnowledgeMerger.js.map +1 -0
- package/dist/core/knowledge/references/ReferenceResolver.js +184 -0
- package/dist/core/knowledge/references/ReferenceResolver.js.map +1 -0
- package/dist/core/knowledge/references/citations/CitationEvidenceProcessor.js +401 -0
- package/dist/core/knowledge/references/citations/CitationEvidenceProcessor.js.map +1 -0
- package/dist/core/knowledge/references/citations/CitationResolver.js +95 -0
- package/dist/core/knowledge/references/citations/CitationResolver.js.map +1 -0
- package/dist/core/knowledge/references/citations/GrobidClient.js +143 -0
- package/dist/core/knowledge/references/citations/GrobidClient.js.map +1 -0
- package/dist/core/knowledge/references/citations/TitleIdResolver.js +101 -0
- package/dist/core/knowledge/references/citations/TitleIdResolver.js.map +1 -0
- package/dist/core/knowledge/references/web/FetchCacheService.js +114 -0
- package/dist/core/knowledge/references/web/FetchCacheService.js.map +1 -0
- package/dist/core/knowledge/references/web/GatedFetcher.js +228 -0
- package/dist/core/knowledge/references/web/GatedFetcher.js.map +1 -0
- package/dist/core/knowledge/references/web/WebReferenceProcessor.js +164 -0
- package/dist/core/knowledge/references/web/WebReferenceProcessor.js.map +1 -0
- package/dist/core/knowledge/search/KnowledgeGraphSearch.js +261 -0
- package/dist/core/knowledge/search/KnowledgeGraphSearch.js.map +1 -0
- package/dist/core/knowledge/vocabulary.js +162 -0
- package/dist/core/knowledge/vocabulary.js.map +1 -0
- package/dist/core/llm/EmbeddingService.js +113 -0
- package/dist/core/llm/EmbeddingService.js.map +1 -0
- package/dist/core/llm/OllamaService.js +146 -0
- package/dist/core/llm/OllamaService.js.map +1 -0
- package/dist/core/llm/OpenAICompatibleService.js +190 -0
- package/dist/core/llm/OpenAICompatibleService.js.map +1 -0
- package/dist/core/llm/OpenAIEmbeddingService.js +129 -0
- package/dist/core/llm/OpenAIEmbeddingService.js.map +1 -0
- package/dist/core/llm/embeddingUtils.js +25 -0
- package/dist/core/llm/embeddingUtils.js.map +1 -0
- package/dist/core/llm/index.js +23 -0
- package/dist/core/llm/index.js.map +1 -0
- package/dist/core/llm/prompts/PromptManager.js +388 -0
- package/dist/core/llm/prompts/PromptManager.js.map +1 -0
- package/dist/core/llm/prompts/PromptTemplateEngine.js +257 -0
- package/dist/core/llm/prompts/PromptTemplateEngine.js.map +1 -0
- package/dist/core/llm/prompts/templates/partials/examples/EXAMPLE_STYLE_GUIDE.md +84 -0
- package/dist/core/llm/prompts/templates/partials/examples/article.md +187 -0
- package/dist/core/llm/prompts/templates/partials/examples/code.md +229 -0
- package/dist/core/llm/prompts/templates/partials/examples/communication.md +205 -0
- package/dist/core/llm/prompts/templates/partials/examples/documentation.md +262 -0
- package/dist/core/llm/prompts/templates/partials/examples/financial.md +157 -0
- package/dist/core/llm/prompts/templates/partials/examples/legal.md +153 -0
- package/dist/core/llm/prompts/templates/partials/examples/logs.md +127 -0
- package/dist/core/llm/prompts/templates/partials/examples/medical.md +218 -0
- package/dist/core/llm/prompts/templates/partials/examples/notes.md +201 -0
- package/dist/core/llm/prompts/templates/partials/examples/research.md +208 -0
- package/dist/core/llm/prompts/templates/partials/examples/tabular.md +178 -0
- package/dist/core/llm/prompts/templates/partials/examples/transcript.md +204 -0
- package/dist/core/llm/prompts/templates/partials/retrieved-context.hbs +18 -0
- package/dist/core/llm/prompts/templates/v1/system.hbs +371 -0
- package/dist/core/llm/prompts/templates/v1/user.hbs +20 -0
- package/dist/core/llm/prompts/templates/v2/system.hbs +573 -0
- package/dist/core/llm/prompts/templates/v2/user.hbs +20 -0
- package/dist/core/llm/prompts/templates/v3/system.hbs +861 -0
- package/dist/core/llm/prompts/templates/v3/user.hbs +16 -0
- package/dist/core/llm/prompts/templates/v4/system.hbs +800 -0
- package/dist/core/llm/prompts/templates/v4/user.hbs +40 -0
- package/dist/core/llm/prompts/templates/v4.5/system.hbs +71 -0
- package/dist/core/llm/prompts/templates/v4.5/user.hbs +46 -0
- package/dist/core/llm/prompts/templates/v5/glossary/system.hbs +40 -0
- package/dist/core/llm/prompts/templates/v5/glossary/user.hbs +11 -0
- package/dist/core/llm/prompts/templates/v5/system.hbs +163 -0
- package/dist/core/llm/prompts/templates/v5/user.hbs +55 -0
- package/dist/core/pipeline/GroundingTransform.js +52 -0
- package/dist/core/pipeline/GroundingTransform.js.map +1 -0
- package/dist/core/pipeline/PipelineRunner.js +51 -0
- package/dist/core/pipeline/PipelineRunner.js.map +1 -0
- package/dist/core/pipeline/RelationFilterTransform.js +72 -0
- package/dist/core/pipeline/RelationFilterTransform.js.map +1 -0
- package/dist/core/pipeline/index.js +20 -0
- package/dist/core/pipeline/index.js.map +1 -0
- package/dist/core/processor/FileProcessor.js +184 -0
- package/dist/core/processor/FileProcessor.js.map +1 -0
- package/dist/core/processor/ProcessedRegistry.js +38 -0
- package/dist/core/processor/ProcessedRegistry.js.map +1 -0
- package/dist/core/processor/ast/AstSeedService.js +0 -0
- package/dist/core/processor/ast/AstSeedService.js.map +1 -0
- package/dist/core/processor/ast/AstSymbolStore.js +110 -0
- package/dist/core/processor/ast/AstSymbolStore.js.map +1 -0
- package/dist/core/processor/ast/index.js +19 -0
- package/dist/core/processor/ast/index.js.map +1 -0
- package/dist/core/processor/chunking/TextChunker.js +98 -0
- package/dist/core/processor/chunking/TextChunker.js.map +1 -0
- package/dist/core/processor/chunking/index.js +18 -0
- package/dist/core/processor/chunking/index.js.map +1 -0
- package/dist/core/processor/classifier/CONTENT_CLASSES.js +294 -0
- package/dist/core/processor/classifier/CONTENT_CLASSES.js.map +1 -0
- package/dist/core/processor/classifier/CascadeContentClassifier.js +107 -0
- package/dist/core/processor/classifier/CascadeContentClassifier.js.map +1 -0
- package/dist/core/processor/classifier/HeuristicContentClassifier.js +113 -0
- package/dist/core/processor/classifier/HeuristicContentClassifier.js.map +1 -0
- package/dist/core/processor/classifier/IContentTypeClassifier.js +3 -0
- package/dist/core/processor/classifier/IContentTypeClassifier.js.map +1 -0
- package/dist/core/processor/classifier/LlmContentClassifier.js +107 -0
- package/dist/core/processor/classifier/LlmContentClassifier.js.map +1 -0
- package/dist/core/processor/classifier/NER_DOMAIN_EXAMPLES.js +498 -0
- package/dist/core/processor/classifier/NER_DOMAIN_EXAMPLES.js.map +1 -0
- package/dist/core/processor/classifier/index.js +21 -0
- package/dist/core/processor/classifier/index.js.map +1 -0
- package/dist/core/processor/classifier/mergeClassifications.js +32 -0
- package/dist/core/processor/classifier/mergeClassifications.js.map +1 -0
- package/dist/core/processor/index.js +20 -0
- package/dist/core/processor/index.js.map +1 -0
- package/dist/core/processor/readers/AudioReader.js +462 -0
- package/dist/core/processor/readers/AudioReader.js.map +1 -0
- package/dist/core/processor/readers/BinaryReader.js +90 -0
- package/dist/core/processor/readers/BinaryReader.js.map +1 -0
- package/dist/core/processor/readers/ChandraPdfReader.js +187 -0
- package/dist/core/processor/readers/ChandraPdfReader.js.map +1 -0
- package/dist/core/processor/readers/ChatExportReader.js +365 -0
- package/dist/core/processor/readers/ChatExportReader.js.map +1 -0
- package/dist/core/processor/readers/DoclingReader.js +445 -0
- package/dist/core/processor/readers/DoclingReader.js.map +1 -0
- package/dist/core/processor/readers/EmailReader.js +259 -0
- package/dist/core/processor/readers/EmailReader.js.map +1 -0
- package/dist/core/processor/readers/EpubReader.js +175 -0
- package/dist/core/processor/readers/EpubReader.js.map +1 -0
- package/dist/core/processor/readers/FileReader.js +90 -0
- package/dist/core/processor/readers/FileReader.js.map +1 -0
- package/dist/core/processor/readers/FileReaderFactory.js +49 -0
- package/dist/core/processor/readers/FileReaderFactory.js.map +1 -0
- package/dist/core/processor/readers/HtmlReader.js +371 -0
- package/dist/core/processor/readers/HtmlReader.js.map +1 -0
- package/dist/core/processor/readers/ImageReader.js +162 -0
- package/dist/core/processor/readers/ImageReader.js.map +1 -0
- package/dist/core/processor/readers/JsonFileReader.js +232 -0
- package/dist/core/processor/readers/JsonFileReader.js.map +1 -0
- package/dist/core/processor/readers/JupyterReader.js +178 -0
- package/dist/core/processor/readers/JupyterReader.js.map +1 -0
- package/dist/core/processor/readers/LatexReader.js +176 -0
- package/dist/core/processor/readers/LatexReader.js.map +1 -0
- package/dist/core/processor/readers/MarkdownReader.js +289 -0
- package/dist/core/processor/readers/MarkdownReader.js.map +1 -0
- package/dist/core/processor/readers/MarkerPdfReader.js +193 -0
- package/dist/core/processor/readers/MarkerPdfReader.js.map +1 -0
- package/dist/core/processor/readers/MistralOcrReader.js +198 -0
- package/dist/core/processor/readers/MistralOcrReader.js.map +1 -0
- package/dist/core/processor/readers/OfficeReader.js +174 -0
- package/dist/core/processor/readers/OfficeReader.js.map +1 -0
- package/dist/core/processor/readers/PdfReader.js +116 -0
- package/dist/core/processor/readers/PdfReader.js.map +1 -0
- package/dist/core/processor/readers/RtfReader.js +107 -0
- package/dist/core/processor/readers/RtfReader.js.map +1 -0
- package/dist/core/processor/readers/SubtitleReader.js +145 -0
- package/dist/core/processor/readers/SubtitleReader.js.map +1 -0
- package/dist/core/processor/readers/TesseractPdfReader.js +183 -0
- package/dist/core/processor/readers/TesseractPdfReader.js.map +1 -0
- package/dist/core/processor/readers/TextReader.js +129 -0
- package/dist/core/processor/readers/TextReader.js.map +1 -0
- package/dist/core/processor/readers/TranscriptReader.js +234 -0
- package/dist/core/processor/readers/TranscriptReader.js.map +1 -0
- package/dist/core/processor/readers/image/imageMetadata.js +155 -0
- package/dist/core/processor/readers/image/imageMetadata.js.map +1 -0
- package/dist/core/processor/readers/index.js +41 -0
- package/dist/core/processor/readers/index.js.map +1 -0
- package/dist/core/processor/readers/referenceExtraction.js +198 -0
- package/dist/core/processor/readers/referenceExtraction.js.map +1 -0
- package/dist/core/processor/readers/stripReferences.js +59 -0
- package/dist/core/processor/readers/stripReferences.js.map +1 -0
- package/dist/core/processor/readers/transcript/turnPacking.js +81 -0
- package/dist/core/processor/readers/transcript/turnPacking.js.map +1 -0
- package/dist/core/progress/NdjsonProgressEmitter.js +30 -0
- package/dist/core/progress/NdjsonProgressEmitter.js.map +1 -0
- package/dist/core/progress/NoopProgressEmitter.js +15 -0
- package/dist/core/progress/NoopProgressEmitter.js.map +1 -0
- package/dist/core/progress/index.js +19 -0
- package/dist/core/progress/index.js.map +1 -0
- package/dist/core/trace/TraceWriter.js +100 -0
- package/dist/core/trace/TraceWriter.js.map +1 -0
- package/dist/core/trace/events.js +13 -0
- package/dist/core/trace/events.js.map +1 -0
- package/dist/core/trace/index.js +20 -0
- package/dist/core/trace/index.js.map +1 -0
- package/dist/core/trace/lineage.js +97 -0
- package/dist/core/trace/lineage.js.map +1 -0
- package/dist/evaluation/BenchmarkRunner.js +171 -0
- package/dist/evaluation/BenchmarkRunner.js.map +1 -0
- package/dist/evaluation/classifier/ClassifierAccuracy.js +185 -0
- package/dist/evaluation/classifier/ClassifierAccuracy.js.map +1 -0
- package/dist/evaluation/classifier/labeledSamples.js +379 -0
- package/dist/evaluation/classifier/labeledSamples.js.map +1 -0
- package/dist/evaluation/compare/goldCompare.js +126 -0
- package/dist/evaluation/compare/goldCompare.js.map +1 -0
- package/dist/evaluation/crossre/compareScoring.js +30 -0
- package/dist/evaluation/crossre/compareScoring.js.map +1 -0
- package/dist/evaluation/datasets/CrossREDataset.js +170 -0
- package/dist/evaluation/datasets/CrossREDataset.js.map +1 -0
- package/dist/evaluation/datasets/IDataset.js +3 -0
- package/dist/evaluation/datasets/IDataset.js.map +1 -0
- package/dist/evaluation/datasets/RebelDataset.js +117 -0
- package/dist/evaluation/datasets/RebelDataset.js.map +1 -0
- package/dist/evaluation/datasets/RedocredDataset.js +218 -0
- package/dist/evaluation/datasets/RedocredDataset.js.map +1 -0
- package/dist/evaluation/datasets/SemEval2010Dataset.js +150 -0
- package/dist/evaluation/datasets/SemEval2010Dataset.js.map +1 -0
- package/dist/evaluation/index.js +33 -0
- package/dist/evaluation/index.js.map +1 -0
- package/dist/evaluation/matching/ExactMatcher.js +75 -0
- package/dist/evaluation/matching/ExactMatcher.js.map +1 -0
- package/dist/evaluation/matching/SemanticMatcher.js +143 -0
- package/dist/evaluation/matching/SemanticMatcher.js.map +1 -0
- package/dist/evaluation/metrics/TripleMetrics.js +64 -0
- package/dist/evaluation/metrics/TripleMetrics.js.map +1 -0
- package/dist/evaluation/mine/MineCheckpoint.js +114 -0
- package/dist/evaluation/mine/MineCheckpoint.js.map +1 -0
- package/dist/evaluation/mine/MineDataset.js +208 -0
- package/dist/evaluation/mine/MineDataset.js.map +1 -0
- package/dist/evaluation/mine/MineReporter.js +98 -0
- package/dist/evaluation/mine/MineReporter.js.map +1 -0
- package/dist/evaluation/mine/MineRunner.js +148 -0
- package/dist/evaluation/mine/MineRunner.js.map +1 -0
- package/dist/evaluation/mine/MineScorer.js +127 -0
- package/dist/evaluation/mine/MineScorer.js.map +1 -0
- package/dist/evaluation/mine/types.js +12 -0
- package/dist/evaluation/mine/types.js.map +1 -0
- package/dist/evaluation/reporters/ConsoleReporter.js +55 -0
- package/dist/evaluation/reporters/ConsoleReporter.js.map +1 -0
- package/dist/evaluation/reporters/JsonReporter.js +50 -0
- package/dist/evaluation/reporters/JsonReporter.js.map +1 -0
- package/dist/index.js +28 -0
- package/dist/index.js.map +1 -0
- package/dist/quality/CompositeScore.js +61 -0
- package/dist/quality/CompositeScore.js.map +1 -0
- package/dist/quality/ConsistencyMetrics.js +70 -0
- package/dist/quality/ConsistencyMetrics.js.map +1 -0
- package/dist/quality/FactualMetrics.js +76 -0
- package/dist/quality/FactualMetrics.js.map +1 -0
- package/dist/quality/GraphHealthMetrics.js +68 -0
- package/dist/quality/GraphHealthMetrics.js.map +1 -0
- package/dist/quality/SemanticMetrics.js +102 -0
- package/dist/quality/SemanticMetrics.js.map +1 -0
- package/dist/quality/StructuralMetrics.js +60 -0
- package/dist/quality/StructuralMetrics.js.map +1 -0
- package/dist/quality/index.js +23 -0
- package/dist/quality/index.js.map +1 -0
- package/dist/shared/index.js +20 -0
- package/dist/shared/index.js.map +1 -0
- package/dist/shared/logger/Logger.js +3 -0
- package/dist/shared/logger/Logger.js.map +1 -0
- package/dist/shared/logger/LoggerFactory.js +75 -0
- package/dist/shared/logger/LoggerFactory.js.map +1 -0
- package/dist/shared/logger/index.js +19 -0
- package/dist/shared/logger/index.js.map +1 -0
- package/dist/shared/shutdown.js +30 -0
- package/dist/shared/shutdown.js.map +1 -0
- package/dist/shared/utils/agglomerativeCluster.js +269 -0
- package/dist/shared/utils/agglomerativeCluster.js.map +1 -0
- package/dist/shared/utils/astSymbols.js +69 -0
- package/dist/shared/utils/astSymbols.js.map +1 -0
- package/dist/shared/utils/cosineSimilarity.js +18 -0
- package/dist/shared/utils/cosineSimilarity.js.map +1 -0
- package/dist/shared/utils/directoryTree.js +184 -0
- package/dist/shared/utils/directoryTree.js.map +1 -0
- package/dist/shared/utils/documentOutline.js +74 -0
- package/dist/shared/utils/documentOutline.js.map +1 -0
- package/dist/shared/utils/index.js +24 -0
- package/dist/shared/utils/index.js.map +1 -0
- package/dist/shared/utils/jaroWinklerSimilarity.js +60 -0
- package/dist/shared/utils/jaroWinklerSimilarity.js.map +1 -0
- package/dist/shared/utils/parseJsonLenient.js +27 -0
- package/dist/shared/utils/parseJsonLenient.js.map +1 -0
- package/dist/shared/utils/readConfig.js +42 -0
- package/dist/shared/utils/readConfig.js.map +1 -0
- package/dist/shared/utils/readRtf.js +216 -0
- package/dist/shared/utils/readRtf.js.map +1 -0
- package/dist/shared/utils/softmax.js +26 -0
- package/dist/shared/utils/softmax.js.map +1 -0
- package/dist/types/ContentClass.js +3 -0
- package/dist/types/ContentClass.js.map +1 -0
- package/dist/types/CorpusProfile.js +3 -0
- package/dist/types/CorpusProfile.js.map +1 -0
- package/dist/types/IContradictionChecker.js +3 -0
- package/dist/types/IContradictionChecker.js.map +1 -0
- package/dist/types/ICorpusAnalyzer.js +3 -0
- package/dist/types/ICorpusAnalyzer.js.map +1 -0
- package/dist/types/IDirectoryProcessor.js +3 -0
- package/dist/types/IDirectoryProcessor.js.map +1 -0
- package/dist/types/IEmbeddingProvider.js +3 -0
- package/dist/types/IEmbeddingProvider.js.map +1 -0
- package/dist/types/IEmbeddingService.js +6 -0
- package/dist/types/IEmbeddingService.js.map +1 -0
- package/dist/types/IFileProcessor.js +3 -0
- package/dist/types/IFileProcessor.js.map +1 -0
- package/dist/types/IGroundingChecker.js +3 -0
- package/dist/types/IGroundingChecker.js.map +1 -0
- package/dist/types/IKnowledgeGraphBuilder.js +3 -0
- package/dist/types/IKnowledgeGraphBuilder.js.map +1 -0
- package/dist/types/IKnowledgeGraphExporter.js +3 -0
- package/dist/types/IKnowledgeGraphExporter.js.map +1 -0
- package/dist/types/IKnowledgeGraphMerger.js +3 -0
- package/dist/types/IKnowledgeGraphMerger.js.map +1 -0
- package/dist/types/IKnowledgeGraphSearch.js +3 -0
- package/dist/types/IKnowledgeGraphSearch.js.map +1 -0
- package/dist/types/ILLMProvider.js +3 -0
- package/dist/types/ILLMProvider.js.map +1 -0
- package/dist/types/ILLMService.js +3 -0
- package/dist/types/ILLMService.js.map +1 -0
- package/dist/types/IObjectDetector.js +3 -0
- package/dist/types/IObjectDetector.js.map +1 -0
- package/dist/types/IProcessingService.js +3 -0
- package/dist/types/IProcessingService.js.map +1 -0
- package/dist/types/IProgressEmitter.js +3 -0
- package/dist/types/IProgressEmitter.js.map +1 -0
- package/dist/types/IPromptManager.js +3 -0
- package/dist/types/IPromptManager.js.map +1 -0
- package/dist/types/KnowledgeGraph.js +3 -0
- package/dist/types/KnowledgeGraph.js.map +1 -0
- package/dist/types/MCPKnowledgeGraph.js +3 -0
- package/dist/types/MCPKnowledgeGraph.js.map +1 -0
- package/dist/types/Observation.js +21 -0
- package/dist/types/Observation.js.map +1 -0
- package/dist/types/ProcessingOptions.js +3 -0
- package/dist/types/ProcessingOptions.js.map +1 -0
- package/dist/types/index.js +40 -0
- package/dist/types/index.js.map +1 -0
- package/package.json +122 -0
|
@@ -0,0 +1,624 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
 * Module-interop helper (this has the shape of the standard TypeScript emit
 * helper — NOTE(review): confirm against the compiler version in use).
 *
 * Exposes property `k` of `m` on `o` under the name `k2` (`k2` defaults to `k`).
 * An already-present `this.__createBinding` is reused instead of redefining it.
 * When `Object.create` is unavailable, the fallback branch does a plain
 * one-time copy `o[k2] = m[k]`.
 */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
// Swap in a live getter when there is no descriptor, when an accessor comes
// from a module not flagged `__esModule`, or when a data property is
// writable/configurable; otherwise the original descriptor is reused as-is.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
/**
 * Module-interop helper: stores `v` as the `default` property of `o`.
 * An already-present `this.__setModuleDefault` is reused. With `Object.create`
 * available the property is defined as an enumerable data property (no
 * `writable`/`configurable` flags are passed); otherwise it is plainly assigned.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
/**
 * Module-interop helper used for namespace-style imports of `require` results.
 * An already-present `this.__importStar` is reused.
 *
 * The returned function hands back `mod` unchanged when it is truthy and
 * flagged `__esModule`. Otherwise it builds a fresh object, binds every own
 * property name of `mod` except "default" onto it via `__createBinding`, and
 * sets `default` to `mod` itself via `__setModuleDefault`.
 */
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
// `ownKeys` overwrites itself on first call: it picks
// `Object.getOwnPropertyNames` when available, else a for-in/hasOwnProperty
// fallback, and later calls go straight to the chosen implementation.
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
// `mod != null` skips both null and undefined; "default" is excluded here
// because it is set separately on the next line.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
/**
 * Generator-driving helper: runs `generator` (applied with `thisArg` and
 * `_arguments`, or an empty argument list when `_arguments` is falsy) and
 * returns a promise for its final value. `P` is the promise constructor and
 * falls back to the global `Promise`. An already-present `this.__awaiter`
 * is reused.
 *
 * Each yielded value is adopted into a `P` instance; its fulfilment resumes
 * the generator with `next`, its rejection resumes it with `throw`. Anything
 * the generator throws while being resumed rejects the returned promise.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
36
|
+
// Values that are already instances of P pass through; anything else is wrapped.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
37
|
+
// `P = Promise` is assigned here, before `adopt` can run, so `adopt` sees the default.
return new (P || (P = Promise))(function (resolve, reject) {
|
|
38
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
39
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
40
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
41
|
+
// `generator` is rebound from the generator function to the iterator it produces.
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
42
|
+
});
|
|
43
|
+
};
|
|
44
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
45
|
+
exports.canonicalizeRelationType = canonicalizeRelationType;
|
|
46
|
+
exports.normalizeEntityName = normalizeEntityName;
|
|
47
|
+
exports.digitSignature = digitSignature;
|
|
48
|
+
exports.mergeKnowledgeGraphs = mergeKnowledgeGraphs;
|
|
49
|
+
const crypto = __importStar(require("crypto"));
|
|
50
|
+
const utils_1 = require("../../../shared/utils");
|
|
51
|
+
// Default similarity thresholds for entity and observation merging.
// NOTE(review): the code that consumes these constants lies outside this view —
// confirm which similarity measure each threshold is compared against.
|
|
52
|
+
const DefaultSimilarityThreshold = 0.7;
|
|
53
|
+
const DefaultObservationThreshold = 0.7;
|
|
54
|
+
// A fuzzy match across two different known entity types must clear this bar —
|
|
55
|
+
// spelling similarity alone is weak evidence of co-reference when types disagree
|
|
56
|
+
// (garlic/concept vs Anthropic/organization sit at JW 0.704; "JW" is presumably
// the Jaro-Winkler score — confirm).
|
|
57
|
+
const CrossTypeThreshold = 0.95;
|
|
58
|
+
/**
 * Build the provenance identity of an observation, used to keep observations
 * from distinct sources/speakers un-merged.
 *
 * @param {{source?: *, speaker?: *}} o - Observation-like object.
 * @returns {string} `source` and `speaker` joined by the "␟" separator; a
 *   `null`/`undefined` field contributes an empty string.
 */
function provenanceKey(o) {
    // Only null/undefined fall back to "" — other falsy values (0, "") are kept.
    const blankIfMissing = (value) => (value === null || value === undefined ? "" : value);
    const source = blankIfMissing(o.source);
    const speaker = blankIfMissing(o.speaker);
    return `${source}␟${speaker}`;
}
|
|
63
|
+
/**
 * Canonicalize a relation's `relationType` array so that semantically identical
 * edges collapse on merge. Each entry is trimmed and lowercased, blank entries
 * are dropped, duplicates are removed, and the remainder is sorted with the
 * default string ordering. The compound predicate therefore becomes
 * order-insensitive: `["uses","calls"]` and `["calls","uses"]` (the
 * "reversed-twin" class that bloats the predicate vocabulary) map to one key.
 * Pure — exported for tests.
 *
 * @param {string[]|null|undefined} types - Raw relation type labels.
 * @returns {string[]} A new sorted array of unique, normalized labels; empty
 *   when `types` is `null` or `undefined`.
 */
function canonicalizeRelationType(types) {
    const rawTypes = types === null || types === undefined ? [] : types;
    const normalized = rawTypes.map((label) => label.trim().toLowerCase());
    const nonBlank = normalized.filter((label) => Boolean(label));
    const unique = [...new Set(nonBlank)];
    unique.sort();
    return unique;
}
|
|
73
|
+
/**
 * Deduplicate observations while PRESERVING per-source attribution: the same
 * fact asserted by two different sources/speakers stays as two observations.
 * Observations are bucketed by `provenanceKey`, and near-duplicates are only
 * collapsed *within* a single bucket (via `dedupWithinProvenance`).
 *
 * @param {Array<object>} observations - Observations to deduplicate.
 * @param {number} threshold - Similarity cutoff forwarded to `dedupWithinProvenance`.
 * @param {object} embeddingService - Embedding provider forwarded to `dedupWithinProvenance`.
 * @param {object} [logger] - Optional logger; `debug` is called when present.
 * @returns {Promise<Array<object>>} The input array itself when it holds at
 *   most one item; otherwise a new array with the survivors of each provenance
 *   bucket, buckets in first-seen order.
 */
function deduplicateObservations(observations, threshold, embeddingService, logger) {
    return __awaiter(this, void 0, void 0, function* () {
        if (observations.length <= 1) {
            return observations;
        }
        const hasLogger = logger !== null && logger !== void 0;
        if (hasLogger) {
            logger.debug(`Deduplicating ${observations.length} observations (provenance-aware)`);
        }
        // A Map keeps buckets in the order their provenance was first seen.
        const byProvenance = new Map();
        for (const observation of observations) {
            const key = provenanceKey(observation);
            const bucket = byProvenance.get(key);
            if (!bucket) {
                byProvenance.set(key, [observation]);
                continue;
            }
            bucket.push(observation);
        }
        const kept = [];
        for (const bucket of byProvenance.values()) {
            const survivors = yield dedupWithinProvenance(bucket, threshold, embeddingService, logger);
            kept.push(...survivors);
        }
        if (hasLogger) {
            logger.debug(`Deduplicated to ${kept.length} observations (removed ${observations.length - kept.length}, across ${byProvenance.size} provenance group(s))`);
        }
        return kept;
    });
}
|
|
101
|
+
/**
 * Collapse near-duplicate observations that share the same provenance.
 *
 * Every observation's `text` is embedded one at a time through
 * `embeddingService.embed`. Pairs whose cosine similarity is at least
 * `threshold` are treated as duplicates, and the observation with the longer
 * text is kept (the earlier one wins ties). An observation whose embedding
 * call fails is logged and always kept — it is never compared.
 *
 * @param {Array<object>} observations - Observations from one provenance group.
 * @param {number} threshold - Minimum cosine similarity to count as a duplicate.
 * @param {object} embeddingService - Object exposing `embed(text)`.
 * @param {object} [logger] - Optional logger; `warn` is called when present.
 * @returns {Promise<Array<object>>} The input array itself when it holds at
 *   most one item; otherwise the surviving observations in their original order.
 */
function dedupWithinProvenance(observations, threshold, embeddingService, logger) {
    return __awaiter(this, void 0, void 0, function* () {
        if (observations.length <= 1) {
            return observations;
        }
        const entries = [];
        for (const observation of observations) {
            let vector;
            try {
                vector = yield embeddingService.embed(observation.text);
            }
            catch (error) {
                if (logger !== null && logger !== void 0) {
                    logger.warn(`Failed to get embedding for observation: ${observation.text}`);
                }
                // An empty vector marks "keep, but never compare".
                vector = [];
            }
            entries.push({ obs: observation, embedding: vector });
        }
        const dropped = new Set();
        const isComparable = (index) => !dropped.has(index) && entries[index].embedding.length !== 0;
        for (let first = 0; first < entries.length; first++) {
            if (!isComparable(first)) {
                continue;
            }
            for (let second = first + 1; second < entries.length; second++) {
                if (!isComparable(second)) {
                    continue;
                }
                const similarity = (0, utils_1.cosineSimilarity)(entries[first].embedding, entries[second].embedding);
                // Negated `>=` (not `<`) so a NaN similarity is never treated as a match.
                if (!(similarity >= threshold)) {
                    continue;
                }
                // Keep the longer/more detailed observation (with its provenance).
                const firstIsAtLeastAsLong = entries[first].obs.text.length >= entries[second].obs.text.length;
                if (firstIsAtLeastAsLong) {
                    dropped.add(second);
                    continue;
                }
                // The earlier observation lost; stop comparing it against anything else.
                dropped.add(first);
                break;
            }
        }
        const survivors = [];
        entries.forEach((entry, index) => {
            if (!dropped.has(index)) {
                survivors.push(entry.obs);
            }
        });
        return survivors;
    });
}
|
|
140
|
+
/**
 * Normalized form of an entity name used by the exact-match fast path:
 * lower-cased, with every run of underscores, hyphens, dash characters in the
 * U+2010–U+2015 range and whitespace folded to one space, then trimmed.
 */
function normalizeEntityName(name) {
    const lowered = name.toLowerCase();
    const separatorsFolded = lowered.replace(/[_\-‐-―\s]+/g, " ");
    return separatorsFolded.trim();
}
|
|
144
|
+
/**
 * Comma-joined runs of digits found in a name, in order of appearance
 * ("Table 12 v2" → "12,2"); the empty string when the name has no digits.
 * Names with different signatures are never fuzzy-merged.
 */
function digitSignature(name) {
    const digitRuns = name.match(/\d+/g);
    if (digitRuns === null || digitRuns === undefined) {
        return "";
    }
    return digitRuns.join(",");
}
|
|
149
|
+
/**
 * Look up the existing entity a candidate should be folded into.
 *
 * Pass 1 returns the first existing entry whose normalized name equals the
 * candidate's (method "string-exact", sim 1). Pass 2 runs only when
 * `enableSimilarityMerging` is truthy and picks the best Jaro-Winkler score
 * (method "string-jw") subject to two guards, because similar spelling is not
 * co-reference:
 *  - entries whose digit signature differs from the candidate's are never
 *    considered (Table 1 ≠ Table 2);
 *  - when both sides carry different, non-"other" entity types, the score must
 *    also reach `CrossTypeThreshold`.
 *
 * @param entity                  Candidate; `name` and `entityType` are read.
 * @param existingEntities        Iterable of [name, entity] pairs (a Map in this module).
 * @param threshold               Minimum Jaro-Winkler score for a fuzzy match.
 * @param enableSimilarityMerging When falsy only the exact pass runs.
 * @param qualifyFileIdentity     When true, existing file-identity entities (file/document)
 *                                are ignored in both passes; the global stage matches those
 *                                by an exact name+file key before calling this (KG-13).
 * @returns `{ name, sim, method }` for the chosen existing entity, or null.
 */
function findSimilarEntity(entity, existingEntities, threshold, enableSimilarityMerging, qualifyFileIdentity = false) {
    const isExcluded = (candidate) => qualifyFileIdentity && FILE_IDENTITY_TYPES.has(candidate.entityType);
    // Pass 1: normalized-exact name match always wins.
    const wantedNorm = normalizeEntityName(entity.name);
    for (const [candidateName, candidate] of existingEntities) {
        if (!isExcluded(candidate) && normalizeEntityName(candidateName) === wantedNorm) {
            return { name: candidateName, sim: 1, method: "string-exact" };
        }
    }
    if (!enableSimilarityMerging) {
        return null;
    }
    // Pass 2: guarded fuzzy match; the first candidate reaching the top score is kept.
    const wantedDigits = digitSignature(entity.name);
    let winner = null;
    for (const [candidateName, candidate] of existingEntities) {
        if (isExcluded(candidate) || digitSignature(candidateName) !== wantedDigits) {
            continue;
        }
        const typesConflict = Boolean(entity.entityType) &&
            Boolean(candidate.entityType) &&
            entity.entityType !== candidate.entityType &&
            entity.entityType !== "other" &&
            candidate.entityType !== "other";
        const bar = typesConflict ? Math.max(threshold, CrossTypeThreshold) : threshold;
        const score = (0, utils_1.jaroWinklerSimilarity)(entity.name, candidateName);
        if (score >= bar && (winner === null || score > winner.sim)) {
            winner = { name: candidateName, sim: score, method: "string-jw" };
        }
    }
    return winner;
}
|
|
193
|
+
/**
 * Merge-time supersession (KG-10, Graphiti "invalidate, don't delete").
 *
 * For every pair of observations that both carry a `validAt`, have different
 * `validAt` values, are not already expired, and that `checker.check` reports
 * as contradictory, the OLDER observation is stamped in place with
 * `invalidAt` (= the newer observation's `validAt`) and `expiredAt` (= `now`).
 * Nothing is removed: history is preserved and the newer observation stays current.
 *
 * `validAt` values are ordered with the `<` operator, so they are assumed to be
 * in a format that sorts chronologically (e.g. same-format ISO-8601 strings)
 * — NOTE(review): confirm against the producers of `validAt`.
 *
 * @param observations Observation list of one entity; mutated in place. A missing
 *                     (non-array) list is treated as empty.
 * @param checker      Object whose `check(textA, textB)` resolves to `{ contradicts }`.
 * @param now          Timestamp written to `expiredAt` of superseded observations.
 * @returns Promise resolving to undefined.
 */
function applySupersession(observations, checker, now) {
    return __awaiter(this, void 0, void 0, function* () {
        // Other code in this module defaults a missing observation list with
        // `|| []`; without this guard such an entity would throw on `.length`.
        if (!Array.isArray(observations) || observations.length < 2) {
            return;
        }
        for (let i = 0; i < observations.length; i++) {
            for (let j = i + 1; j < observations.length; j++) {
                const a = observations[i];
                const b = observations[j];
                // Need two distinct, present timestamps to decide which fact is older.
                if (!a.validAt || !b.validAt || a.validAt === b.validAt) {
                    continue;
                }
                if (a.expiredAt || b.expiredAt) {
                    continue; // already superseded
                }
                const { contradicts } = yield checker.check(a.text, b.text);
                if (!contradicts) {
                    continue;
                }
                const older = a.validAt < b.validAt ? a : b;
                const newer = older === a ? b : a;
                older.invalidAt = newer.validAt;
                older.expiredAt = now;
            }
        }
    });
}
|
|
221
|
+
/**
 * Report one entity fusion to `options.onMergeRecord`, if that callback is set
 * (same JSONL record shape as canon's merges.jsonl). Nothing is emitted when
 * winner and loser are the same name.
 *
 * @param options Merge options; only `onMergeRecord` is used.
 * @param winner  Name that survives the fusion.
 * @param loser   Surface form folded into the winner.
 * @param match   Match descriptor; `method` and `sim` are copied into the record.
 */
function recordFusion(options, winner, loser, match) {
    if (winner === loser || !options.onMergeRecord) {
        return;
    }
    // 12 hex chars of SHA-1 over "winner␟loser" identify the pair.
    const pairDigest = crypto.createHash("sha1").update(`${winner}␟${loser}`).digest("hex");
    const similarity = match.sim;
    const record = {
        cluster_id: pairDigest.slice(0, 12),
        target: "entity",
        surface_forms: [winner, loser],
        canonical_chosen: winner,
        member_count: 2,
        method: match.method,
        intra_cluster_sim: { min: similarity, max: similarity },
        borderline_pairs: [],
        source_spans: [],
    };
    options.onMergeRecord(record);
}
|
|
237
|
+
/**
 * Hierarchically merge a list of knowledge graphs into one.
 *
 * Step 1 splits every input graph into one mini-graph per entity (the entity
 * plus the relations touching it by name) and groups those by the entity's
 * first file ("unknown" when it has none). Step 2 merges each file group with
 * `mergeWithinFile`. Step 3 merges the per-file results with `mergeGlobally`.
 * Finally the merge stats are logged, handed to `options.onMergeStats` when
 * that is set, and the vocabulary-fit metric is logged.
 *
 * @param graphs           Graphs with `entities` and `relations` arrays.
 * @param options          Merge options; passed through to the merge stages.
 * @param embeddingService Passed through to the merge stages.
 * @param logger           Optional logger (`info` / `debug`).
 * @returns Promise resolving to the merged graph `{ entities, relations }`.
 */
function mergeKnowledgeGraphs(graphs, options, embeddingService, logger) {
    return __awaiter(this, void 0, void 0, function* () {
        if (logger != null) {
            logger.info(`Starting hierarchical merge of ${graphs.length} knowledge graphs`);
            logger.info(`Entity similarity threshold: ${options.entitySimilarityThreshold}`);
            logger.info(`Observation similarity threshold: ${options.observationSimilarityThreshold}`);
        }
        // Step 1: Group graphs by file
        const graphsByFile = new Map();
        for (const graph of graphs) {
            // Index this graph's relations by endpoint name once, instead of
            // re-scanning every relation for every entity. Each bucket keeps the
            // relations in their original order, and a relation whose two
            // endpoints are the same name is listed once.
            const relationsByEndpoint = new Map();
            const indexUnder = (endpoint, relation) => {
                const bucket = relationsByEndpoint.get(endpoint);
                if (bucket) {
                    bucket.push(relation);
                }
                else {
                    relationsByEndpoint.set(endpoint, [relation]);
                }
            };
            for (const relation of graph.relations) {
                indexUnder(relation.from, relation);
                if (relation.to !== relation.from) {
                    indexUnder(relation.to, relation);
                }
            }
            for (const entity of graph.entities) {
                const file = entity.files[0] || "unknown";
                if (!graphsByFile.has(file)) {
                    graphsByFile.set(file, []);
                }
                // Create a mini-graph for this entity and related relations
                const entityGraph = {
                    entities: [entity],
                    relations: relationsByEndpoint.get(entity.name) || [],
                };
                graphsByFile.get(file).push(entityGraph);
            }
        }
        if (logger != null) {
            logger.info(`Step 1: Grouped into ${graphsByFile.size} files`);
        }
        // Step 2: Merge entities within each file
        const mergedByFile = new Map();
        for (const [file, fileGraphs] of graphsByFile) {
            if (logger != null) {
                logger.debug(`Step 2: Merging ${fileGraphs.length} entities in file: ${file}`);
            }
            const fileMerged = yield mergeWithinFile(fileGraphs, file, options, embeddingService, logger);
            mergedByFile.set(file, fileMerged);
            if (logger != null) {
                logger.debug(`File ${file}: ${fileMerged.entities.length} entities, ${fileMerged.relations.length} relations`);
            }
        }
        // Step 3: Global merge across files
        if (logger != null) {
            logger.info(`Step 3: Global merge across ${mergedByFile.size} files`);
        }
        const globalGraphs = Array.from(mergedByFile.values());
        const { graph: finalResult, stats } = yield mergeGlobally(globalGraphs, options, embeddingService, logger);
        if (logger != null) {
            logger.info(`Hierarchical merge complete: ${finalResult.entities.length} entities, ${finalResult.relations.length} relations`);
            // Cross-file linking health (KG-04) — the recall signal "0 dangling" used to hide.
            logger.info(`Cross-file linking: ${stats.crossFileEdges} edge(s) link entities across files; ` +
                `${stats.droppedDanglingEdges} relation(s) dropped as dangling at the global stage`);
        }
        const onMergeStats = options.onMergeStats;
        if (onMergeStats != null) {
            onMergeStats.call(options, stats);
        }
        logVocabularyFit(finalResult, logger);
        return finalResult;
    });
}
|
|
281
|
+
/**
 * Closed-vocabulary fit metric (Dove's guardrail for the v5 enums): logs how
 * often extraction fell back to a catch-all rather than a specific value.
 *
 * A relation counts as a fallback when it has at least one type and every type
 * is exactly "related_to"; an entity counts when its `entityType` is "other".
 * A high `related_to` share (north of ~15–20%) suggests the closed predicate
 * set is too tight for the corpus, not that the corpus is weird. Nothing is
 * logged for a graph with no relations and no entities.
 *
 * @param graph  Graph with `relations` and `entities` arrays.
 * @param logger Optional logger; one `info` line is emitted.
 */
function logVocabularyFit(graph, logger) {
    const { relations, entities } = graph;
    if (relations.length === 0 && entities.length === 0) {
        return;
    }
    const isFallbackRelation = (relation) => {
        const predicates = Array.isArray(relation.relationType)
            ? relation.relationType
            : [relation.relationType];
        return predicates.length > 0 && predicates.every((p) => p === "related_to");
    };
    const fallbackRelations = relations.reduce((n, relation) => (isFallbackRelation(relation) ? n + 1 : n), 0);
    const fallbackEntities = entities.reduce((n, entity) => (entity.entityType === "other" ? n + 1 : n), 0);
    // One-decimal percentage; "0.0" when there is nothing to divide by.
    const percent = (part, whole) => (whole ? ((100 * part) / whole).toFixed(1) : "0.0");
    const relationShare = percent(fallbackRelations, relations.length);
    const entityShare = percent(fallbackEntities, entities.length);
    if (logger != null) {
        logger.info(`Vocabulary fit: ${fallbackRelations}/${relations.length} relations → 'related_to' (${relationShare}%), ` +
            `${fallbackEntities}/${entities.length} entities → 'other' (${entityShare}%)`);
    }
}
|
|
302
|
+
/**
 * Merge the mini-graphs that belong to one file into a single graph.
 *
 * Uses the same similarity threshold as the global pass: the old "stricter for
 * same-file" heuristic (×0.7, cap 0.6) fused unrelated short names
 * (garlic↔Anthropic at JW 0.704), and same-file proximity is not evidence of
 * co-reference.
 *
 * Entities are folded together via `findSimilarEntity`; the first entity seen
 * under a name is copied (with `file` set to `fileName`) and later matches are
 * merged into that copy. Relations are then re-keyed through the surface-name →
 * final-name map only, self-loops are dropped, and duplicates (same endpoints
 * and canonical type list) are emitted once.
 *
 * Endpoint existence is deliberately NOT checked here (KG-04): a relation may
 * point at an entity defined in another file, which only becomes visible at
 * the global stage; `mergeGlobally` is the sole endpoint-existence gate.
 *
 * @param fileGraphs       Mini-graphs (`{ entities, relations }`) for this file.
 * @param fileName         File label stored on new entities and used in log lines.
 * @param options          Merge options (thresholds, `enableSimilarityMerging`, `onMergeRecord`).
 * @param embeddingService Passed to `deduplicateObservations`.
 * @param logger           Optional logger.
 * @returns Promise resolving to `{ entities, relations }`.
 */
function mergeWithinFile(fileGraphs, fileName, options, embeddingService, logger) {
    return __awaiter(this, void 0, void 0, function* () {
        const threshold = options.entitySimilarityThreshold || DefaultSimilarityThreshold;
        const enableSimilarity = options.enableSimilarityMerging !== false;
        const entitiesByName = new Map();
        // Every incoming surface form → the entity key it ended up under.
        const surfaceToFinal = new Map();
        for (const graph of fileGraphs) {
            for (const entity of graph.entities) {
                const match = findSimilarEntity(entity, entitiesByName, threshold, enableSimilarity);
                if (!match) {
                    // First sighting: register a copy tagged with this file.
                    surfaceToFinal.set(entity.name, entity.name);
                    entitiesByName.set(entity.name, Object.assign({}, entity, { file: fileName }));
                    continue;
                }
                surfaceToFinal.set(entity.name, match.name);
                recordFusion(options, match.name, entity.name, match);
                const target = entitiesByName.get(match.name);
                if (logger != null) {
                    logger.debug(`[${fileName}] Merging entity "${entity.name}" with existing "${match.name}"`);
                }
                // Pool both observation lists, then de-duplicate via embeddings.
                const pooled = [
                    ...(target.observations || []),
                    ...(entity.observations || []),
                ];
                if (pooled.length > 0) {
                    target.observations = yield deduplicateObservations(pooled, options.observationSimilarityThreshold || DefaultObservationThreshold, embeddingService, logger);
                }
                // The type already on the target wins; the incoming one only fills a gap.
                target.entityType = target.entityType || entity.entityType;
                // Keep the lowest chunk index and the highest chunk count seen.
                if (entity.chunk !== undefined) {
                    if (target.chunk !== undefined) {
                        target.chunk = Math.min(target.chunk, entity.chunk);
                    }
                    else {
                        target.chunk = entity.chunk;
                    }
                }
                if (entity.totalChunks !== undefined) {
                    target.totalChunks = Math.max(target.totalChunks || 0, entity.totalChunks);
                }
            }
        }
        const resolveEndpoint = (name) => {
            const mapped = surfaceToFinal.get(name);
            return mapped != null ? mapped : name;
        };
        const seenRelationKeys = new Set();
        const mergedRelations = [];
        for (const graph of fileGraphs) {
            for (const relation of graph.relations) {
                const from = resolveEndpoint(relation.from);
                const to = resolveEndpoint(relation.to);
                // Self-loops are extraction artifacts, or appear when both
                // endpoints collapsed onto the same entity.
                if (from === to) {
                    continue;
                }
                const relationType = canonicalizeRelationType(relation.relationType);
                const dedupKey = `${from}->${to}:${relationType.join(",")}`;
                if (seenRelationKeys.has(dedupKey)) {
                    continue;
                }
                seenRelationKeys.add(dedupKey);
                // Copy only the optional fields that are actually present.
                const copy = { from: from, to: to, relationType };
                if (relation.sourceSpan) {
                    copy.sourceSpan = relation.sourceSpan;
                }
                if (relation.validAt) {
                    copy.validAt = relation.validAt;
                }
                if (relation.source) {
                    copy.source = relation.source;
                }
                if (relation.resolved !== undefined) {
                    copy.resolved = relation.resolved;
                }
                if (relation.faithfulness) {
                    copy.faithfulness = relation.faithfulness;
                }
                if (relation.faithfulnessScore !== undefined) {
                    copy.faithfulnessScore = relation.faithfulnessScore;
                }
                if (relation.supportingSpan) {
                    copy.supportingSpan = relation.supportingSpan;
                }
                mergedRelations.push(copy);
            }
        }
        return {
            entities: Array.from(entitiesByName.values()),
            relations: mergedRelations,
        };
    });
}
|
|
384
|
+
/** Entity type used as the catch-all when no specific type applies. */
const ENTITY_CATCH_ALL = "other";
/**
 * Entity types that stand for a *file/document artifact* rather than a concept
 * (KG-13). Two `package.json` (or `index.ts`, or one `document` per paper)
 * coming from different files are different artifacts and must NOT fuse, while
 * a `function` or `concept` with the same name across files is the same thing
 * and *should* merge — that is the point of global cross-file linking. For
 * these types only, identity is therefore name+file.
 */
const FILE_IDENTITY_TYPES = new Set(["file", "document"]);
/** Separator placed between name and file in a qualified identity key. */
const ID_SEP = "␟";
|
|
396
|
+
/**
 * Identity key of an entity for the global merge.
 *
 * Conceptual entities are keyed by their bare name, so same-name concepts merge
 * across files. File-identity types (see FILE_IDENTITY_TYPES) are keyed by
 * `name␟primaryFile`, with "unknown" standing in for a missing first file, so
 * same-name artifacts from different files stay distinct. A bare name never
 * contains `␟`, so the two key spaces cannot collide.
 */
function entityIdentityKey(entity) {
    if (!FILE_IDENTITY_TYPES.has(entity.entityType)) {
        return entity.name;
    }
    const primaryFile = entity.files[0];
    const fileTag = primaryFile != null ? primaryFile : "unknown";
    return `${entity.name}${ID_SEP}${fileTag}`;
}
|
|
409
|
+
/**
 * Choose the type of a merged entity from the types of all surface forms that
 * were fused into it (KG-13).
 *
 * Any specific type beats the `other` catch-all; among the remaining
 * candidates the most frequent wins, and a tie goes to the type that occurred
 * first, which keeps the result deterministic. With no usable type at all the
 * catch-all is returned. This replaces the old "longest string wins" heuristic,
 * under which `other`(5) beat `file`(4) and `organization` always beat `person`.
 *
 * @param types Types carried by the fused surface forms; falsy entries are ignored.
 * @returns The elected entity type.
 */
function electEntityType(types) {
    const specific = types.filter((t) => t && t !== ENTITY_CATCH_ALL);
    const candidates = specific.length > 0 ? specific : types.filter(Boolean);
    if (candidates.length === 0) {
        return ENTITY_CATCH_ALL;
    }
    // Tally votes; Map insertion order is first-occurrence order.
    const tally = new Map();
    for (const candidate of candidates) {
        tally.set(candidate, (tally.get(candidate) || 0) + 1);
    }
    // Strict ">" means an earlier type keeps the lead on a tie.
    let winner = candidates[0];
    let topVotes = 0;
    for (const [type, votes] of tally) {
        if (votes > topVotes) {
            topVotes = votes;
            winner = type;
        }
    }
    return winner;
}
|
|
436
|
+
/**
 * Global merge across the per-file graphs. This is the sole referential-integrity
 * gate (KG-04): the within-file pass defers here, where every entity across all
 * files is visible.
 *
 * Entities:
 *  - File-identity entities (file/document) merge only with an entity of the
 *    exact same name+file key; otherwise they get a unique output name, which
 *    is the bare name unless that is already taken.
 *  - Conceptual entities merge by name/similarity via `findSimilarEntity` and
 *    never with a file artifact. A conceptual entity whose bare name is already
 *    held by a file artifact is disambiguated; later occurrences of the same
 *    conceptual name reuse that disambiguated entity instead of creating yet
 *    another one.
 *
 * Relations are re-keyed per graph (KG-13), de-duplicated, stripped of
 * self-loops, and dropped (and counted) when an endpoint resolves to no entity.
 *
 * Finally each entity's type is elected from all votes, its `files` becomes the
 * union of files it was seen in, and — when `options.contradictionChecker` is
 * set — merge-time supersession runs over its observations (KG-10).
 *
 * @param fileGraphs       One merged graph per file.
 * @param options          Merge options (thresholds, `enableSimilarityMerging`,
 *                         `onMergeRecord`, `contradictionChecker`).
 * @param embeddingService Passed to `deduplicateObservations`.
 * @param logger           Optional logger.
 * @returns Promise resolving to `{ graph: { entities, relations },
 *          stats: { crossFileEdges, droppedDanglingEdges } }`.
 */
function mergeGlobally(fileGraphs, options, embeddingService, logger) {
    return __awaiter(this, void 0, void 0, function* () {
        const entityMap = new Map();
        const relationSet = new Set();
        const relations = [];
        // Track which files each entity appears in
        const entityFileMap = new Map();
        // Every entityType each fused surface form carried → elected at end-of-merge (KG-13).
        const entityTypeVotes = new Map();
        const globalSimilarityThreshold = options.entitySimilarityThreshold || DefaultSimilarityThreshold;
        const enableSimilarity = options.enableSimilarityMerging !== false;
        // Relation re-keying is PER GRAPH (KG-13): a file artifact's bare name is
        // ambiguous across files, so each graph's relations resolve endpoints against
        // that graph's own surface-name → output-name map; conceptual names also fall
        // back to a global map for genuine cross-file references.
        const renamePerGraph = [];
        const globalConceptualRename = new Map();
        // For file-identity entities, `name␟file` → the output name already assigned, so
        // the same artifact re-extracted (e.g. across chunks) merges into one entity.
        const idKeyToName = new Map();
        if (logger != null) {
            logger.debug(`Global similarity threshold: ${globalSimilarityThreshold}`);
        }
        // Assign a unique output name, disambiguating only when the bare name is
        // already taken (so the common single-project case keeps the clean
        // `package.json`, but two projects' don't collide → no data loss).
        const uniqueName = (name, file) => {
            if (!entityMap.has(name)) {
                return name;
            }
            const base = file ? `${name} [${file}]` : name;
            let candidate = base;
            let i = 2;
            while (entityMap.has(candidate)) {
                candidate = `${base}#${i++}`;
            }
            return candidate;
        };
        // Files an entity contributes to the cross-file union.
        const filesOf = (entity) => (entity.files.length ? entity.files : ["unknown"]);
        // Merge entities across files
        for (const graph of fileGraphs) {
            const localRename = new Map();
            renamePerGraph.push(localRename);
            for (const entity of graph.entities) {
                const fileIdentity = FILE_IDENTITY_TYPES.has(entity.entityType);
                // Resolve which existing entity (if any) this one merges into, as an
                // output name.
                let outName;
                let isNew;
                let match = null;
                if (fileIdentity) {
                    // Same key construction as everywhere else in the module.
                    const idKey = entityIdentityKey(entity);
                    const claimed = idKeyToName.get(idKey);
                    if (claimed) {
                        outName = claimed;
                        isNew = false;
                    }
                    else {
                        outName = uniqueName(entity.name, entity.files[0]);
                        idKeyToName.set(idKey, outName);
                        isNew = true;
                    }
                }
                else {
                    match = findSimilarEntity(entity, entityMap, globalSimilarityThreshold, enableSimilarity, true);
                    if (match) {
                        outName = match.name;
                        isNew = false;
                    }
                    else {
                        // No name/similarity match. If this exact conceptual name was
                        // seen before, it can only have been disambiguated (e.g. to
                        // `name#2`) because a file artifact holds the bare name; that
                        // disambiguated name can never match again by name (its digit
                        // signature differs), so reuse it explicitly. Without this,
                        // every file would mint a fresh `name#N` and the same concept
                        // would never merge across files.
                        const prior = globalConceptualRename.get(entity.name);
                        if (prior !== undefined && entityMap.has(prior)) {
                            outName = prior;
                            isNew = false;
                        }
                        else {
                            // A conceptual entity that clashes with a file artifact holding
                            // the bare name gets disambiguated rather than overwriting it.
                            outName = uniqueName(entity.name);
                            isNew = true;
                        }
                    }
                }
                localRename.set(entity.name, outName);
                if (!fileIdentity) {
                    globalConceptualRename.set(entity.name, outName);
                }
                if (isNew) {
                    entityMap.set(outName, Object.assign({}, entity, { name: outName }));
                    entityFileMap.set(outName, new Set(filesOf(entity)));
                    entityTypeVotes.set(outName, [entity.entityType]);
                    continue;
                }
                const existing = entityMap.get(outName);
                // Only a genuinely different surface form fused is merge-log-worthy.
                if (match && existing.name !== entity.name) {
                    recordFusion(options, outName, entity.name, match);
                }
                if (logger != null) {
                    logger.debug(`[Global] Merging entity "${entity.name}" (${entity.files[0]}) into "${outName}" (${existing.files[0]})`);
                }
                const allObservations = [
                    ...(existing.observations || []),
                    ...(entity.observations || []),
                ];
                if (allObservations.length > 0) {
                    existing.observations = yield deduplicateObservations(allObservations, options.observationSimilarityThreshold || DefaultObservationThreshold, embeddingService, logger);
                }
                // Vote this surface form's type; the winner is elected at end-of-merge (KG-13).
                entityTypeVotes.get(outName).push(entity.entityType);
                for (const f of filesOf(entity)) {
                    entityFileMap.get(outName).add(f);
                }
                // Keep the lowest chunk index and the highest chunk count seen.
                if (entity.chunk !== undefined) {
                    existing.chunk =
                        existing.chunk !== undefined ? Math.min(existing.chunk, entity.chunk) : entity.chunk;
                }
                if (entity.totalChunks !== undefined) {
                    existing.totalChunks = Math.max(existing.totalChunks || 0, entity.totalChunks);
                }
            }
        }
        // Merge relations across files, re-keying endpoints through the rename maps.
        // This is the sole endpoint-existence gate (KG-04): an endpoint missing here
        // resolved to no entity in ANY file, so it's a true dangler.
        let droppedDanglingEdges = 0;
        let crossFileEdges = 0;
        // First file of a merged node, or "" when it has none.
        const firstFile = (node) => {
            const first = node.files != null ? node.files[0] : undefined;
            return first != null ? first : "";
        };
        fileGraphs.forEach((graph, gi) => {
            const localRename = renamePerGraph[gi];
            // Resolve against THIS graph's name map first (so a file artifact
            // resolves to the right disambiguated entity), then the global
            // conceptual map for genuine cross-file references (KG-13).
            const resolveEndpoint = (name) => {
                const local = localRename.get(name);
                if (local != null) {
                    return local;
                }
                const global = globalConceptualRename.get(name);
                return global != null ? global : name;
            };
            for (const relation of graph.relations) {
                const fromEntity = resolveEndpoint(relation.from);
                const toEntity = resolveEndpoint(relation.to);
                // Drop self-loops (X→X): an extraction artifact, and cross-file name
                // mapping can also collapse both endpoints onto the same entity.
                if (fromEntity === toEntity) {
                    continue;
                }
                const fromNode = entityMap.get(fromEntity);
                const toNode = entityMap.get(toEntity);
                if (!fromNode || !toNode) {
                    droppedDanglingEdges++;
                    continue;
                }
                const relationType = canonicalizeRelationType(relation.relationType);
                const relationKey = `${fromEntity}->${toEntity}:${relationType.join(",")}`;
                if (relationSet.has(relationKey)) {
                    continue;
                }
                relationSet.add(relationKey);
                // Count once per unique surviving edge whose endpoints were first
                // defined in different files.
                if (firstFile(fromNode) !== firstFile(toNode)) {
                    crossFileEdges++;
                }
                // Copy only the optional fields that are actually present.
                const copy = { from: fromEntity, to: toEntity, relationType };
                if (relation.sourceSpan) {
                    copy.sourceSpan = relation.sourceSpan;
                }
                if (relation.validAt) {
                    copy.validAt = relation.validAt;
                }
                if (relation.source) {
                    copy.source = relation.source;
                }
                if (relation.resolved !== undefined) {
                    copy.resolved = relation.resolved;
                }
                if (relation.faithfulness) {
                    copy.faithfulness = relation.faithfulness;
                }
                if (relation.faithfulnessScore !== undefined) {
                    copy.faithfulnessScore = relation.faithfulnessScore;
                }
                if (relation.supportingSpan) {
                    copy.supportingSpan = relation.supportingSpan;
                }
                relations.push(copy);
            }
        });
        if (droppedDanglingEdges > 0 && logger != null) {
            logger.info(`Global merge dropped ${droppedDanglingEdges} relation(s) whose endpoints resolved to no entity (true danglers)`);
        }
        // Log cross-file entity statistics
        const crossFileEntities = Array.from(entityFileMap.entries()).filter(([_, files]) => files.size > 1);
        if (crossFileEntities.length > 0 && logger != null) {
            logger.info(`Found ${crossFileEntities.length} entities appearing across multiple files:`);
            crossFileEntities.forEach(([entityName, files]) => {
                logger.debug(` ${entityName}: ${Array.from(files).join(", ")}`);
            });
        }
        // Finalize each merged entity: elect its type from all votes (specific beats
        // `other`, then majority), write back the cross-file files[] union (KG-13), and
        // run merge-time supersession over its observations when enabled (KG-10).
        const supersessionNow = new Date().toISOString();
        for (const [key, entity] of entityMap) {
            const votes = entityTypeVotes.get(key);
            entity.entityType = electEntityType(votes != null ? votes : [entity.entityType]);
            const files = entityFileMap.get(key);
            if (files && files.size > 0) {
                // Prefer real file names; fall back to the raw set (i.e. "unknown")
                // only when nothing else is known.
                entity.files = Array.from(files).filter((f) => f !== "unknown");
                if (entity.files.length === 0) {
                    entity.files = Array.from(files);
                }
            }
            if (options.contradictionChecker) {
                yield applySupersession(entity.observations, options.contradictionChecker, supersessionNow);
            }
        }
        return {
            graph: {
                entities: Array.from(entityMap.values()),
                relations: relations,
            },
            stats: { crossFileEdges, droppedDanglingEdges },
        };
    });
}
|
|
624
|
+
//# sourceMappingURL=KnowledgeMerger.js.map
|