npm - @wanshi-kg/wanshi - Versions diffs - 0.1.0 - Mend

@wanshi-kg/wanshi 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (443)

package/LICENSE +21 -0
package/README.md +458 -0
package/dist/__tests__/helpers.js +27 -0
package/dist/__tests__/helpers.js.map +1 -0
package/dist/cli/commands/export.command.js +99 -0
package/dist/cli/commands/export.command.js.map +1 -0
package/dist/cli/commands/index.js +22 -0
package/dist/cli/commands/index.js.map +1 -0
package/dist/cli/commands/inspectMerges.command.js +84 -0
package/dist/cli/commands/inspectMerges.command.js.map +1 -0
package/dist/cli/commands/metrics.command.js +196 -0
package/dist/cli/commands/metrics.command.js.map +1 -0
package/dist/cli/commands/process.command.js +82 -0
package/dist/cli/commands/process.command.js.map +1 -0
package/dist/cli/commands/watch.command.js +91 -0
package/dist/cli/commands/watch.command.js.map +1 -0
package/dist/cli/index.js +269 -0
package/dist/cli/index.js.map +1 -0
package/dist/cli/optionsToConfig.js +160 -0
package/dist/cli/optionsToConfig.js.map +1 -0
package/dist/config/index.js +59 -0
package/dist/config/index.js.map +1 -0
package/dist/config/legacyHints.js +113 -0
package/dist/config/legacyHints.js.map +1 -0
package/dist/config/schema.js +803 -0
package/dist/config/schema.js.map +1 -0
package/dist/config/ui.js +221 -0
package/dist/config/ui.js.map +1 -0
package/dist/core/DirectoryProcessor.js +725 -0
package/dist/core/DirectoryProcessor.js.map +1 -0
package/dist/core/adapters/IStructuredAdapter.js +3 -0
package/dist/core/adapters/IStructuredAdapter.js.map +1 -0
package/dist/core/adapters/SqliteAdapter.js +267 -0
package/dist/core/adapters/SqliteAdapter.js.map +1 -0
package/dist/core/adapters/StructuredAdapterRegistry.js +31 -0
package/dist/core/adapters/StructuredAdapterRegistry.js.map +1 -0
package/dist/core/adapters/index.js +20 -0
package/dist/core/adapters/index.js.map +1 -0
package/dist/core/checkpoint/CheckpointService.js +188 -0
package/dist/core/checkpoint/CheckpointService.js.map +1 -0
package/dist/core/checkpoint/index.js +18 -0
package/dist/core/checkpoint/index.js.map +1 -0
package/dist/core/corpus/CorpusAnalyzer.js +266 -0
package/dist/core/corpus/CorpusAnalyzer.js.map +1 -0
package/dist/core/corpus/CorpusProfileStore.js +92 -0
package/dist/core/corpus/CorpusProfileStore.js.map +1 -0
package/dist/core/corpus/index.js +21 -0
package/dist/core/corpus/index.js.map +1 -0
package/dist/core/corpus/normalizeGlossary.js +60 -0
package/dist/core/corpus/normalizeGlossary.js.map +1 -0
package/dist/core/corpus/relPath.js +52 -0
package/dist/core/corpus/relPath.js.map +1 -0
package/dist/core/corpus/termFrequency.js +86 -0
package/dist/core/corpus/termFrequency.js.map +1 -0
package/dist/core/cost/CostMeter.js +235 -0
package/dist/core/cost/CostMeter.js.map +1 -0
package/dist/core/cost/index.js +19 -0
package/dist/core/cost/index.js.map +1 -0
package/dist/core/cost/prices.js +38 -0
package/dist/core/cost/prices.js.map +1 -0
package/dist/core/cv/ObjectDetectionService.js +119 -0
package/dist/core/cv/ObjectDetectionService.js.map +1 -0
package/dist/core/di/ContainerFactory.js +670 -0
package/dist/core/di/ContainerFactory.js.map +1 -0
package/dist/core/di/DIContainer.js +103 -0
package/dist/core/di/DIContainer.js.map +1 -0
package/dist/core/di/index.js +19 -0
package/dist/core/di/index.js.map +1 -0
package/dist/core/errors/CustomErrors.js +342 -0
package/dist/core/errors/CustomErrors.js.map +1 -0
package/dist/core/errors/index.js +18 -0
package/dist/core/errors/index.js.map +1 -0
package/dist/core/export/KnowledgeGraphExportService.js +56 -0
package/dist/core/export/KnowledgeGraphExportService.js.map +1 -0
package/dist/core/export/index.js +19 -0
package/dist/core/export/index.js.map +1 -0
package/dist/core/export/strategies/GraphitiExportStrategy.js +115 -0
package/dist/core/export/strategies/GraphitiExportStrategy.js.map +1 -0
package/dist/core/export/strategies/GraphvizDotExportStrategy.js +331 -0
package/dist/core/export/strategies/GraphvizDotExportStrategy.js.map +1 -0
package/dist/core/export/strategies/IExportStrategy.js +3 -0
package/dist/core/export/strategies/IExportStrategy.js.map +1 -0
package/dist/core/export/strategies/JsonExportStrategy.js +19 -0
package/dist/core/export/strategies/JsonExportStrategy.js.map +1 -0
package/dist/core/export/strategies/JsonlExportStrategy.js +69 -0
package/dist/core/export/strategies/JsonlExportStrategy.js.map +1 -0
package/dist/core/export/strategies/KblamExportStrategy.js +36 -0
package/dist/core/export/strategies/KblamExportStrategy.js.map +1 -0
package/dist/core/export/strategies/LoraExportStrategy.js +46 -0
package/dist/core/export/strategies/LoraExportStrategy.js.map +1 -0
package/dist/core/export/strategies/McpExportStrategy.js +67 -0
package/dist/core/export/strategies/McpExportStrategy.js.map +1 -0
package/dist/core/export/strategies/index.js +25 -0
package/dist/core/export/strategies/index.js.map +1 -0
package/dist/core/export/strategies/kbTriples.js +60 -0
package/dist/core/export/strategies/kbTriples.js.map +1 -0
package/dist/core/index.js +22 -0
package/dist/core/index.js.map +1 -0
package/dist/core/knowledge/KnowledgeGraphBuilder.js +627 -0
package/dist/core/knowledge/KnowledgeGraphBuilder.js.map +1 -0
package/dist/core/knowledge/MergeRecord.js +3 -0
package/dist/core/knowledge/MergeRecord.js.map +1 -0
package/dist/core/knowledge/canon/Canonicalizer.js +414 -0
package/dist/core/knowledge/canon/Canonicalizer.js.map +1 -0
package/dist/core/knowledge/canon/index.js +18 -0
package/dist/core/knowledge/canon/index.js.map +1 -0
package/dist/core/knowledge/contradiction/HeuristicContradictionChecker.js +92 -0
package/dist/core/knowledge/contradiction/HeuristicContradictionChecker.js.map +1 -0
package/dist/core/knowledge/contradiction/LlmContradictionChecker.js +52 -0
package/dist/core/knowledge/contradiction/LlmContradictionChecker.js.map +1 -0
package/dist/core/knowledge/contradiction/index.js +19 -0
package/dist/core/knowledge/contradiction/index.js.map +1 -0
package/dist/core/knowledge/grounding/KeywordGroundingChecker.js +33 -0
package/dist/core/knowledge/grounding/KeywordGroundingChecker.js.map +1 -0
package/dist/core/knowledge/grounding/MiniCheckGroundingChecker.js +82 -0
package/dist/core/knowledge/grounding/MiniCheckGroundingChecker.js.map +1 -0
package/dist/core/knowledge/grounding/index.js +20 -0
package/dist/core/knowledge/grounding/index.js.map +1 -0
package/dist/core/knowledge/grounding/verbalize.js +38 -0
package/dist/core/knowledge/grounding/verbalize.js.map +1 -0
package/dist/core/knowledge/images/imageMetaGraph.js +136 -0
package/dist/core/knowledge/images/imageMetaGraph.js.map +1 -0
package/dist/core/knowledge/index.js +20 -0
package/dist/core/knowledge/index.js.map +1 -0
package/dist/core/knowledge/merging/KnowledgeMerger.js +624 -0
package/dist/core/knowledge/merging/KnowledgeMerger.js.map +1 -0
package/dist/core/knowledge/references/ReferenceResolver.js +184 -0
package/dist/core/knowledge/references/ReferenceResolver.js.map +1 -0
package/dist/core/knowledge/references/citations/CitationEvidenceProcessor.js +401 -0
package/dist/core/knowledge/references/citations/CitationEvidenceProcessor.js.map +1 -0
package/dist/core/knowledge/references/citations/CitationResolver.js +95 -0
package/dist/core/knowledge/references/citations/CitationResolver.js.map +1 -0
package/dist/core/knowledge/references/citations/GrobidClient.js +143 -0
package/dist/core/knowledge/references/citations/GrobidClient.js.map +1 -0
package/dist/core/knowledge/references/citations/TitleIdResolver.js +101 -0
package/dist/core/knowledge/references/citations/TitleIdResolver.js.map +1 -0
package/dist/core/knowledge/references/web/FetchCacheService.js +114 -0
package/dist/core/knowledge/references/web/FetchCacheService.js.map +1 -0
package/dist/core/knowledge/references/web/GatedFetcher.js +228 -0
package/dist/core/knowledge/references/web/GatedFetcher.js.map +1 -0
package/dist/core/knowledge/references/web/WebReferenceProcessor.js +164 -0
package/dist/core/knowledge/references/web/WebReferenceProcessor.js.map +1 -0
package/dist/core/knowledge/search/KnowledgeGraphSearch.js +261 -0
package/dist/core/knowledge/search/KnowledgeGraphSearch.js.map +1 -0
package/dist/core/knowledge/vocabulary.js +162 -0
package/dist/core/knowledge/vocabulary.js.map +1 -0
package/dist/core/llm/EmbeddingService.js +113 -0
package/dist/core/llm/EmbeddingService.js.map +1 -0
package/dist/core/llm/OllamaService.js +146 -0
package/dist/core/llm/OllamaService.js.map +1 -0
package/dist/core/llm/OpenAICompatibleService.js +190 -0
package/dist/core/llm/OpenAICompatibleService.js.map +1 -0
package/dist/core/llm/OpenAIEmbeddingService.js +129 -0
package/dist/core/llm/OpenAIEmbeddingService.js.map +1 -0
package/dist/core/llm/embeddingUtils.js +25 -0
package/dist/core/llm/embeddingUtils.js.map +1 -0
package/dist/core/llm/index.js +23 -0
package/dist/core/llm/index.js.map +1 -0
package/dist/core/llm/prompts/PromptManager.js +388 -0
package/dist/core/llm/prompts/PromptManager.js.map +1 -0
package/dist/core/llm/prompts/PromptTemplateEngine.js +257 -0
package/dist/core/llm/prompts/PromptTemplateEngine.js.map +1 -0
package/dist/core/llm/prompts/templates/partials/examples/EXAMPLE_STYLE_GUIDE.md +84 -0
package/dist/core/llm/prompts/templates/partials/examples/article.md +187 -0
package/dist/core/llm/prompts/templates/partials/examples/code.md +229 -0
package/dist/core/llm/prompts/templates/partials/examples/communication.md +205 -0
package/dist/core/llm/prompts/templates/partials/examples/documentation.md +262 -0
package/dist/core/llm/prompts/templates/partials/examples/financial.md +157 -0
package/dist/core/llm/prompts/templates/partials/examples/legal.md +153 -0
package/dist/core/llm/prompts/templates/partials/examples/logs.md +127 -0
package/dist/core/llm/prompts/templates/partials/examples/medical.md +218 -0
package/dist/core/llm/prompts/templates/partials/examples/notes.md +201 -0
package/dist/core/llm/prompts/templates/partials/examples/research.md +208 -0
package/dist/core/llm/prompts/templates/partials/examples/tabular.md +178 -0
package/dist/core/llm/prompts/templates/partials/examples/transcript.md +204 -0
package/dist/core/llm/prompts/templates/partials/retrieved-context.hbs +18 -0
package/dist/core/llm/prompts/templates/v1/system.hbs +371 -0
package/dist/core/llm/prompts/templates/v1/user.hbs +20 -0
package/dist/core/llm/prompts/templates/v2/system.hbs +573 -0
package/dist/core/llm/prompts/templates/v2/user.hbs +20 -0
package/dist/core/llm/prompts/templates/v3/system.hbs +861 -0
package/dist/core/llm/prompts/templates/v3/user.hbs +16 -0
package/dist/core/llm/prompts/templates/v4/system.hbs +800 -0
package/dist/core/llm/prompts/templates/v4/user.hbs +40 -0
package/dist/core/llm/prompts/templates/v4.5/system.hbs +71 -0
package/dist/core/llm/prompts/templates/v4.5/user.hbs +46 -0
package/dist/core/llm/prompts/templates/v5/glossary/system.hbs +40 -0
package/dist/core/llm/prompts/templates/v5/glossary/user.hbs +11 -0
package/dist/core/llm/prompts/templates/v5/system.hbs +163 -0
package/dist/core/llm/prompts/templates/v5/user.hbs +55 -0
package/dist/core/pipeline/GroundingTransform.js +52 -0
package/dist/core/pipeline/GroundingTransform.js.map +1 -0
package/dist/core/pipeline/PipelineRunner.js +51 -0
package/dist/core/pipeline/PipelineRunner.js.map +1 -0
package/dist/core/pipeline/RelationFilterTransform.js +72 -0
package/dist/core/pipeline/RelationFilterTransform.js.map +1 -0
package/dist/core/pipeline/index.js +20 -0
package/dist/core/pipeline/index.js.map +1 -0
package/dist/core/processor/FileProcessor.js +184 -0
package/dist/core/processor/FileProcessor.js.map +1 -0
package/dist/core/processor/ProcessedRegistry.js +38 -0
package/dist/core/processor/ProcessedRegistry.js.map +1 -0
package/dist/core/processor/ast/AstSeedService.js +0 -0
package/dist/core/processor/ast/AstSeedService.js.map +1 -0
package/dist/core/processor/ast/AstSymbolStore.js +110 -0
package/dist/core/processor/ast/AstSymbolStore.js.map +1 -0
package/dist/core/processor/ast/index.js +19 -0
package/dist/core/processor/ast/index.js.map +1 -0
package/dist/core/processor/chunking/TextChunker.js +98 -0
package/dist/core/processor/chunking/TextChunker.js.map +1 -0
package/dist/core/processor/chunking/index.js +18 -0
package/dist/core/processor/chunking/index.js.map +1 -0
package/dist/core/processor/classifier/CONTENT_CLASSES.js +294 -0
package/dist/core/processor/classifier/CONTENT_CLASSES.js.map +1 -0
package/dist/core/processor/classifier/CascadeContentClassifier.js +107 -0
package/dist/core/processor/classifier/CascadeContentClassifier.js.map +1 -0
package/dist/core/processor/classifier/HeuristicContentClassifier.js +113 -0
package/dist/core/processor/classifier/HeuristicContentClassifier.js.map +1 -0
package/dist/core/processor/classifier/IContentTypeClassifier.js +3 -0
package/dist/core/processor/classifier/IContentTypeClassifier.js.map +1 -0
package/dist/core/processor/classifier/LlmContentClassifier.js +107 -0
package/dist/core/processor/classifier/LlmContentClassifier.js.map +1 -0
package/dist/core/processor/classifier/NER_DOMAIN_EXAMPLES.js +498 -0
package/dist/core/processor/classifier/NER_DOMAIN_EXAMPLES.js.map +1 -0
package/dist/core/processor/classifier/index.js +21 -0
package/dist/core/processor/classifier/index.js.map +1 -0
package/dist/core/processor/classifier/mergeClassifications.js +32 -0
package/dist/core/processor/classifier/mergeClassifications.js.map +1 -0
package/dist/core/processor/index.js +20 -0
package/dist/core/processor/index.js.map +1 -0
package/dist/core/processor/readers/AudioReader.js +462 -0
package/dist/core/processor/readers/AudioReader.js.map +1 -0
package/dist/core/processor/readers/BinaryReader.js +90 -0
package/dist/core/processor/readers/BinaryReader.js.map +1 -0
package/dist/core/processor/readers/ChandraPdfReader.js +187 -0
package/dist/core/processor/readers/ChandraPdfReader.js.map +1 -0
package/dist/core/processor/readers/ChatExportReader.js +365 -0
package/dist/core/processor/readers/ChatExportReader.js.map +1 -0
package/dist/core/processor/readers/DoclingReader.js +445 -0
package/dist/core/processor/readers/DoclingReader.js.map +1 -0
package/dist/core/processor/readers/EmailReader.js +259 -0
package/dist/core/processor/readers/EmailReader.js.map +1 -0
package/dist/core/processor/readers/EpubReader.js +175 -0
package/dist/core/processor/readers/EpubReader.js.map +1 -0
package/dist/core/processor/readers/FileReader.js +90 -0
package/dist/core/processor/readers/FileReader.js.map +1 -0
package/dist/core/processor/readers/FileReaderFactory.js +49 -0
package/dist/core/processor/readers/FileReaderFactory.js.map +1 -0
package/dist/core/processor/readers/HtmlReader.js +371 -0
package/dist/core/processor/readers/HtmlReader.js.map +1 -0
package/dist/core/processor/readers/ImageReader.js +162 -0
package/dist/core/processor/readers/ImageReader.js.map +1 -0
package/dist/core/processor/readers/JsonFileReader.js +232 -0
package/dist/core/processor/readers/JsonFileReader.js.map +1 -0
package/dist/core/processor/readers/JupyterReader.js +178 -0
package/dist/core/processor/readers/JupyterReader.js.map +1 -0
package/dist/core/processor/readers/LatexReader.js +176 -0
package/dist/core/processor/readers/LatexReader.js.map +1 -0
package/dist/core/processor/readers/MarkdownReader.js +289 -0
package/dist/core/processor/readers/MarkdownReader.js.map +1 -0
package/dist/core/processor/readers/MarkerPdfReader.js +193 -0
package/dist/core/processor/readers/MarkerPdfReader.js.map +1 -0
package/dist/core/processor/readers/MistralOcrReader.js +198 -0
package/dist/core/processor/readers/MistralOcrReader.js.map +1 -0
package/dist/core/processor/readers/OfficeReader.js +174 -0
package/dist/core/processor/readers/OfficeReader.js.map +1 -0
package/dist/core/processor/readers/PdfReader.js +116 -0
package/dist/core/processor/readers/PdfReader.js.map +1 -0
package/dist/core/processor/readers/RtfReader.js +107 -0
package/dist/core/processor/readers/RtfReader.js.map +1 -0
package/dist/core/processor/readers/SubtitleReader.js +145 -0
package/dist/core/processor/readers/SubtitleReader.js.map +1 -0
package/dist/core/processor/readers/TesseractPdfReader.js +183 -0
package/dist/core/processor/readers/TesseractPdfReader.js.map +1 -0
package/dist/core/processor/readers/TextReader.js +129 -0
package/dist/core/processor/readers/TextReader.js.map +1 -0
package/dist/core/processor/readers/TranscriptReader.js +234 -0
package/dist/core/processor/readers/TranscriptReader.js.map +1 -0
package/dist/core/processor/readers/image/imageMetadata.js +155 -0
package/dist/core/processor/readers/image/imageMetadata.js.map +1 -0
package/dist/core/processor/readers/index.js +41 -0
package/dist/core/processor/readers/index.js.map +1 -0
package/dist/core/processor/readers/referenceExtraction.js +198 -0
package/dist/core/processor/readers/referenceExtraction.js.map +1 -0
package/dist/core/processor/readers/stripReferences.js +59 -0
package/dist/core/processor/readers/stripReferences.js.map +1 -0
package/dist/core/processor/readers/transcript/turnPacking.js +81 -0
package/dist/core/processor/readers/transcript/turnPacking.js.map +1 -0
package/dist/core/progress/NdjsonProgressEmitter.js +30 -0
package/dist/core/progress/NdjsonProgressEmitter.js.map +1 -0
package/dist/core/progress/NoopProgressEmitter.js +15 -0
package/dist/core/progress/NoopProgressEmitter.js.map +1 -0
package/dist/core/progress/index.js +19 -0
package/dist/core/progress/index.js.map +1 -0
package/dist/core/trace/TraceWriter.js +100 -0
package/dist/core/trace/TraceWriter.js.map +1 -0
package/dist/core/trace/events.js +13 -0
package/dist/core/trace/events.js.map +1 -0
package/dist/core/trace/index.js +20 -0
package/dist/core/trace/index.js.map +1 -0
package/dist/core/trace/lineage.js +97 -0
package/dist/core/trace/lineage.js.map +1 -0
package/dist/evaluation/BenchmarkRunner.js +171 -0
package/dist/evaluation/BenchmarkRunner.js.map +1 -0
package/dist/evaluation/classifier/ClassifierAccuracy.js +185 -0
package/dist/evaluation/classifier/ClassifierAccuracy.js.map +1 -0
package/dist/evaluation/classifier/labeledSamples.js +379 -0
package/dist/evaluation/classifier/labeledSamples.js.map +1 -0
package/dist/evaluation/compare/goldCompare.js +126 -0
package/dist/evaluation/compare/goldCompare.js.map +1 -0
package/dist/evaluation/crossre/compareScoring.js +30 -0
package/dist/evaluation/crossre/compareScoring.js.map +1 -0
package/dist/evaluation/datasets/CrossREDataset.js +170 -0
package/dist/evaluation/datasets/CrossREDataset.js.map +1 -0
package/dist/evaluation/datasets/IDataset.js +3 -0
package/dist/evaluation/datasets/IDataset.js.map +1 -0
package/dist/evaluation/datasets/RebelDataset.js +117 -0
package/dist/evaluation/datasets/RebelDataset.js.map +1 -0
package/dist/evaluation/datasets/RedocredDataset.js +218 -0
package/dist/evaluation/datasets/RedocredDataset.js.map +1 -0
package/dist/evaluation/datasets/SemEval2010Dataset.js +150 -0
package/dist/evaluation/datasets/SemEval2010Dataset.js.map +1 -0
package/dist/evaluation/index.js +33 -0
package/dist/evaluation/index.js.map +1 -0
package/dist/evaluation/matching/ExactMatcher.js +75 -0
package/dist/evaluation/matching/ExactMatcher.js.map +1 -0
package/dist/evaluation/matching/SemanticMatcher.js +143 -0
package/dist/evaluation/matching/SemanticMatcher.js.map +1 -0
package/dist/evaluation/metrics/TripleMetrics.js +64 -0
package/dist/evaluation/metrics/TripleMetrics.js.map +1 -0
package/dist/evaluation/mine/MineCheckpoint.js +114 -0
package/dist/evaluation/mine/MineCheckpoint.js.map +1 -0
package/dist/evaluation/mine/MineDataset.js +208 -0
package/dist/evaluation/mine/MineDataset.js.map +1 -0
package/dist/evaluation/mine/MineReporter.js +98 -0
package/dist/evaluation/mine/MineReporter.js.map +1 -0
package/dist/evaluation/mine/MineRunner.js +148 -0
package/dist/evaluation/mine/MineRunner.js.map +1 -0
package/dist/evaluation/mine/MineScorer.js +127 -0
package/dist/evaluation/mine/MineScorer.js.map +1 -0
package/dist/evaluation/mine/types.js +12 -0
package/dist/evaluation/mine/types.js.map +1 -0
package/dist/evaluation/reporters/ConsoleReporter.js +55 -0
package/dist/evaluation/reporters/ConsoleReporter.js.map +1 -0
package/dist/evaluation/reporters/JsonReporter.js +50 -0
package/dist/evaluation/reporters/JsonReporter.js.map +1 -0
package/dist/index.js +28 -0
package/dist/index.js.map +1 -0
package/dist/quality/CompositeScore.js +61 -0
package/dist/quality/CompositeScore.js.map +1 -0
package/dist/quality/ConsistencyMetrics.js +70 -0
package/dist/quality/ConsistencyMetrics.js.map +1 -0
package/dist/quality/FactualMetrics.js +76 -0
package/dist/quality/FactualMetrics.js.map +1 -0
package/dist/quality/GraphHealthMetrics.js +68 -0
package/dist/quality/GraphHealthMetrics.js.map +1 -0
package/dist/quality/SemanticMetrics.js +102 -0
package/dist/quality/SemanticMetrics.js.map +1 -0
package/dist/quality/StructuralMetrics.js +60 -0
package/dist/quality/StructuralMetrics.js.map +1 -0
package/dist/quality/index.js +23 -0
package/dist/quality/index.js.map +1 -0
package/dist/shared/index.js +20 -0
package/dist/shared/index.js.map +1 -0
package/dist/shared/logger/Logger.js +3 -0
package/dist/shared/logger/Logger.js.map +1 -0
package/dist/shared/logger/LoggerFactory.js +75 -0
package/dist/shared/logger/LoggerFactory.js.map +1 -0
package/dist/shared/logger/index.js +19 -0
package/dist/shared/logger/index.js.map +1 -0
package/dist/shared/shutdown.js +30 -0
package/dist/shared/shutdown.js.map +1 -0
package/dist/shared/utils/agglomerativeCluster.js +269 -0
package/dist/shared/utils/agglomerativeCluster.js.map +1 -0
package/dist/shared/utils/astSymbols.js +69 -0
package/dist/shared/utils/astSymbols.js.map +1 -0
package/dist/shared/utils/cosineSimilarity.js +18 -0
package/dist/shared/utils/cosineSimilarity.js.map +1 -0
package/dist/shared/utils/directoryTree.js +184 -0
package/dist/shared/utils/directoryTree.js.map +1 -0
package/dist/shared/utils/documentOutline.js +74 -0
package/dist/shared/utils/documentOutline.js.map +1 -0
package/dist/shared/utils/index.js +24 -0
package/dist/shared/utils/index.js.map +1 -0
package/dist/shared/utils/jaroWinklerSimilarity.js +60 -0
package/dist/shared/utils/jaroWinklerSimilarity.js.map +1 -0
package/dist/shared/utils/parseJsonLenient.js +27 -0
package/dist/shared/utils/parseJsonLenient.js.map +1 -0
package/dist/shared/utils/readConfig.js +42 -0
package/dist/shared/utils/readConfig.js.map +1 -0
package/dist/shared/utils/readRtf.js +216 -0
package/dist/shared/utils/readRtf.js.map +1 -0
package/dist/shared/utils/softmax.js +26 -0
package/dist/shared/utils/softmax.js.map +1 -0
package/dist/types/ContentClass.js +3 -0
package/dist/types/ContentClass.js.map +1 -0
package/dist/types/CorpusProfile.js +3 -0
package/dist/types/CorpusProfile.js.map +1 -0
package/dist/types/IContradictionChecker.js +3 -0
package/dist/types/IContradictionChecker.js.map +1 -0
package/dist/types/ICorpusAnalyzer.js +3 -0
package/dist/types/ICorpusAnalyzer.js.map +1 -0
package/dist/types/IDirectoryProcessor.js +3 -0
package/dist/types/IDirectoryProcessor.js.map +1 -0
package/dist/types/IEmbeddingProvider.js +3 -0
package/dist/types/IEmbeddingProvider.js.map +1 -0
package/dist/types/IEmbeddingService.js +6 -0
package/dist/types/IEmbeddingService.js.map +1 -0
package/dist/types/IFileProcessor.js +3 -0
package/dist/types/IFileProcessor.js.map +1 -0
package/dist/types/IGroundingChecker.js +3 -0
package/dist/types/IGroundingChecker.js.map +1 -0
package/dist/types/IKnowledgeGraphBuilder.js +3 -0
package/dist/types/IKnowledgeGraphBuilder.js.map +1 -0
package/dist/types/IKnowledgeGraphExporter.js +3 -0
package/dist/types/IKnowledgeGraphExporter.js.map +1 -0
package/dist/types/IKnowledgeGraphMerger.js +3 -0
package/dist/types/IKnowledgeGraphMerger.js.map +1 -0
package/dist/types/IKnowledgeGraphSearch.js +3 -0
package/dist/types/IKnowledgeGraphSearch.js.map +1 -0
package/dist/types/ILLMProvider.js +3 -0
package/dist/types/ILLMProvider.js.map +1 -0
package/dist/types/ILLMService.js +3 -0
package/dist/types/ILLMService.js.map +1 -0
package/dist/types/IObjectDetector.js +3 -0
package/dist/types/IObjectDetector.js.map +1 -0
package/dist/types/IProcessingService.js +3 -0
package/dist/types/IProcessingService.js.map +1 -0
package/dist/types/IProgressEmitter.js +3 -0
package/dist/types/IProgressEmitter.js.map +1 -0
package/dist/types/IPromptManager.js +3 -0
package/dist/types/IPromptManager.js.map +1 -0
package/dist/types/KnowledgeGraph.js +3 -0
package/dist/types/KnowledgeGraph.js.map +1 -0
package/dist/types/MCPKnowledgeGraph.js +3 -0
package/dist/types/MCPKnowledgeGraph.js.map +1 -0
package/dist/types/Observation.js +21 -0
package/dist/types/Observation.js.map +1 -0
package/dist/types/ProcessingOptions.js +3 -0
package/dist/types/ProcessingOptions.js.map +1 -0
package/dist/types/index.js +40 -0
package/dist/types/index.js.map +1 -0
package/package.json +122 -0

package/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Alex Sabaka
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

package/README.md ADDED Viewed

@@ -0,0 +1,458 @@
+# wanshi
+<picture>
+  <source media="(prefers-color-scheme: dark)" srcset="docs/assets/readme-banner-dark.png">
+  <source media="(prefers-color-scheme: light)" srcset="docs/assets/readme-banner-light.png">
+  <img alt="wanshi" src="docs/assets/readme-banner-light.png">
+</picture>
+> A local-first CLI that reads ten thousand things — code, docs, PDFs, audio, transcripts — and builds one knowledge graph that remembers where every fact came from.
+`wanshi` extracts entities and relations from a file tree and merges them into a single graph. It runs on local models via [Ollama](https://ollama.ai) by default, or any OpenAI-compatible endpoint. Facts carry provenance and a bi-temporal axis, an inline grounding gate filters ungrounded claims, and the graph is a drop-in producer for the MCP memory server, Graphiti, and KBLaM/LoRA training exports.
+It's a working CLI and a research platform in equal measure — the long game is domain-tuned extraction feeding knowledge injection into small local models.
+---
+> **Command shorthand:** examples below write `wanshi` for the run command. Until the npm package ships, that's `npx ts-node ./src/index.ts` (dev) or `node ./dist/index.js` (built). Once published, it's literally `wanshi`.
+## What's distinctive
+Most text→KG tools stop at "extract triples." `wanshi` is built around the parts that come after:
+- **Provenance, not just facts.** Every observation records its `source`/`speaker` and a Graphiti-style bi-temporal axis (`validAt`/`invalidAt` for world-time, `createdAt`/`expiredAt` for system-time). The same fact from two speakers stays as two attributed observations, never one flattened string.
+- **A grounding gate.** Each extracted fact is scored against its source chunk and can be flagged or dropped before it reaches the output — keyword overlap as a cheap pre-filter, with an optional local NLI checker (MiniCheck) for the uncertain cases. It won't record what it can't verify against the source.
+- **Closed-vocabulary extraction.** An optional corpus pre-pass builds a glossary of canonical entity/relation types, which then *constrains* extraction — so a large corpus doesn't fragment into hundreds of one-off types.
+- **Transcript-aware ingestion.** Speaker-labeled transcripts and chat exports are split into speaker-pure chunks, so a speaker becomes per-fact provenance rather than a polluting entity.
+- **Memory-store interop.** `mcp-jsonl` output is byte-compatible with the official [MCP memory server](https://github.com/modelcontextprotocol/servers/tree/main/src/memory) — point it at the file and query your graph from Claude Code/Desktop. No store to build.
+- **Training-data exports.** Emit KBLaM `(entity, property, value)` triples or quality-filtered LoRA/SFT chat examples straight from a graph.
+- **Resumable runs.** Per-chunk checkpoints survive interrupts and exhausted API credits; re-run the same command to continue.
+## Supported inputs
+| Format | Extensions | Handling |
+| ------ | ---------- | -------- |
+| Text / source code | `.txt`, `.ts`, `.js`, `.py`, `.go`, `.rs`, … | Direct / code-aware extraction |
+| Markdown | `.md` | Markdown-aware parsing |
+| Transcripts | speaker-labeled `*.parakeet.txt`/`*.whisper.txt`, transcript/turn JSON, Claude/ChatGPT exports | Speaker-pure chunks with per-fact `speaker`/`occurredAt` |
+| JSON | `.json`, `.jsonl`, `.geojson` | Structure-aware chunking (splits on JSON structure, never mid-object) |
+| PDF | `.pdf` | Page text (`pdf2json`), or a richer engine via `--pdf-engine docling\|marker\|mistral` |
+| Office | `.docx`, `.xlsx`, `.pptx` | Via officeparser |
+| HTML / RTF | `.html`, `.htm`, `.rtf` | cheerio / RTF parsing |
+| Images | `.jpg`, `.png`, `.gif`, `.webp`, `.tiff`, `.heic`, `.avif` | Vision model required |
+| Audio / Video | `.mp3`, `.wav`, `.m4a`, `.flac`, `.mp4`, `.mkv`, `.webm`, … | Whisper transcription, or `--asr-engine dual` (VAD + dual-STT + diarization) |
+## Install
+Requires **Node.js 18+** and **[Ollama](https://ollama.ai)** running locally (needed for the default local generation + embeddings path; optional only if you point *both* at an OpenAI-compatible provider).
+```bash
+git clone https://github.com/wanshi-kg/wanshi
+cd wanshi
+npm install
+# Default local models
+ollama pull llama3.2                 # generation
+ollama pull nomic-embed-text   # embeddings
+npm run build   # optional; ts-node works directly
+```
+## Quick start
+```bash
+# Process a directory with defaults
+wanshi -i ./my-project -o knowledge-graph.json
+# Pick a model and output format
+wanshi -i ./src -m qwen3:8b --export-format jsonl -o kg.jsonl
+# Config file (recommended for anything non-trivial)
+wanshi --config config.yaml
+```
+### Configuration
+The config file uses a **nested** shape (the source of truth is the Zod schema in `src/config/`); CLI flags stay flat. Run `wanshi schema` to print the full JSON Schema.
+```yaml
+input: ./my-project
+filter: ["**/*.ts", "**/*.md"]
+exclude: ["**/node_modules/**", "**/dist/**"]
+output: knowledge-graph.jsonl
+description: "TypeScript project source code"
+llm:
+  provider: ollama          # ollama | openai (OpenAI-compatible)
+  model: gemma3:4b
+  host: http://localhost:11434
+  contextLength: 12000
+  temperature: 0.1
+embeddings:                 # independent from generation — keep local & free
+  provider: ollama
+  model: nomic-embed-text
+  host: http://localhost:11434
+chunking: { mode: enabled, size: 4000, overlap: 100 }
+retrieval: { mode: enabled, limit: 3 }
+merging:
+  enableSimilarityMerging: true
+  entitySimilarityThreshold: 0.9
+  observationSimilarityThreshold: 0.7
+export: { format: jsonl }
+```
+### Cloud generation + resume
+Point generation at any OpenAI-compatible endpoint (`provider: openai`, `host` = base URL), keep embeddings local so dedup/merge stays free, and enable `resume` so an interrupted run continues without reprocessing.
+```yaml
+llm:
+  provider: openai
+  host: https://openrouter.ai/api/v1
+  apiKey: sk-or-...          # or $OPENAI_API_KEY / $WANSHI_API_KEY
+  model: google/gemma-3-27b-it
+embeddings:
+  provider: ollama
+  model: nomic-embed-text
+resume:
+  enabled: true             # writes <output>.checkpoint.jsonl
+```
+If the run dies mid-way, just run the same command again — finished chunks are skipped. **Ctrl+C once** finishes the in-flight chunk, checkpoints it, and writes the partial graph before exiting; press again to force-quit.
+A chunk is reused only when its **file content, chunk size/overlap, model, and prompt version** all match — these are folded into the checkpoint key. Files are keyed by path *relative to `--input`*, so relocating the whole tree keeps checkpoints valid; only editing a file re-runs it.
+### Other modes
+```bash
+# Watch: update the graph as files change
+wanshi --config config.yaml --watch
+# Multimedia (images + audio transcription)
+wanshi -i ./media --images enabled --asr enabled --whisper-model medium -m llava:7b
+# GraphViz DOT for visualization
+wanshi -i ./src --export-format dot -o graph.dot && dot -Tsvg graph.dot -o graph.svg
+# Re-export an existing graph (no LLM calls)
+wanshi --export-only -i ./knowledge-graph.json --export-format kblam -o ./kb.jsonl
+```
+## CLI reference
+### Core
+| Option | Default | Description |
+| ------ | ------- | ----------- |
+| `-i, --input <path>` | `.` | Input directory |
+| `-f, --filter <glob>` | `**/*` | Include pattern |
+| `-e, --exclude <glob...>` | — | Exclude patterns |
+| `-o, --output <path>` | `knowledge-graph.json` | Output file |
+| `-d, --description <text>` | — | Content description for LLM context |
+| `--config <file>` | — | YAML/JSON config file |
+### LLM
+| Option | Default | Description |
+| ------ | ------- | ----------- |
+| `--provider <name>` | `ollama` | `ollama` or `openai` (any OpenAI-compatible endpoint) |
+| `-m, --model <name>` | `llama3.2` | Ollama tag or provider model id |
+| `-h, --host <url>` | `http://localhost:11434` | Ollama host, or OpenAI-compatible base URL |
+| `--api-key <key>` | — | Falls back to `$OPENAI_API_KEY` / `$WANSHI_API_KEY` |
+| `--temperature <n>` | `0.1` | Sampling temperature |
+| `--repeat-penalty <n>` | `1.1` | Ollama only (>1.0 discourages repetition) |
+| `--context-length <n>` | `8192` | Context window (Ollama only) |
+| `--max-tokens <n>` | provider default | Raise (or lower `--chunk-size`) if graph JSON truncates mid-output |
+| `--seed <n>` | — | Reproducibility seed (Ollama only) |
+| `-s, --system <prompt\|path>` | — | Custom system prompt or template path |
+### Embeddings (independent from generation)
+| Option | Default | Description |
+| ------ | ------- | ----------- |
+| `--embeddings-provider <name>` | `ollama` | `ollama` or `openai` |
+| `--embeddings-model <name>` | `nomic-embed-text` | Embeddings model |
+| `--embeddings-host <url>` | `http://localhost:11434` | Host / base URL |
+| `--embeddings-max-input-chars <n>` | `1024` | Truncate embedding inputs (safe for 512-token models; raise for cloud) |
+### Processing & retrieval
+| Option | Default | Description |
+| ------ | ------- | ----------- |
+| `--chunking <mode>` | `enabled` | `enabled\|disabled\|auto` |
+| `-c, --chunk-size <n>` | `2000` | Max chunk size (chars) |
+| `--overlap-size <n>` | `100` | Chunk overlap |
+| `--retrieval <mode>` | `enabled` | `enabled\|disabled\|auto` |
+| `--retrieval-limit <n>` | `3` | Retrieved context entities per chunk |
+| `--retrieval-scope <mode>` | `chunk` | `chunk` (per-chunk) or `file` (once, reused) |
+| `--json-strategy <mode>` | `structural` | `structural` (split on JSON structure) or `raw` |
+### Media & classification
+| Option | Default | Description |
+| ------ | ------- | ----------- |
+| `--asr <mode>` | `enabled` | `enabled\|disabled\|auto` |
+| `--whisper-model <name>` | `medium` | `tiny\|base\|small\|medium\|large` |
+| `--language <lang>` | `auto` | Language code or `auto` |
+| `--translate` | `false` | Translate audio to English |
+| `--images <mode>` | `auto` | `enabled\|disabled\|auto` (vision model required) |
+| `--pdf-engine <engine>` | `pdf2json` | `pdf2json\|docling\|marker\|mistral` — PDF reading engine (non-default engines degrade to `pdf2json` on failure) |
+| `--asr-engine <engine>` | `whisper` | `whisper\|dual` — `dual` = vendored Python VAD + Parakeet/Whisper dual-STT + diarization (Apple-Silicon) |
+| `--classifier <mode>` | `disabled` | `disabled\|heuristic\|llm\|cascade` — drives domain prompt hints and scopes `entityType` to a per-domain enum *(experimental)* |
+| `--trace` | `false` | Emit a structured decision run-trace to `<output>.trace.jsonl` *(debug/observability)* |
+### Merging, grounding, corpus glossary
+| Option | Default | Description |
+| ------ | ------- | ----------- |
+| `--entity-similarity-threshold <n>` | `0.9` | Jaro-Winkler entity dedup (0–1) |
+| `--observation-similarity-threshold <n>` | `0.9` | Embedding similarity (0–1) |
+| `--enable-similarity-merging` | `true` | Enable entity deduplication |
+| `--grounding <mode>` | `disabled` | `disabled` · `flag` (annotate `grounded`/`groundingScore`) · `drop` (remove below threshold) |
+| `--grounding-min-score <n>` | `0.5` | Min grounding score; also gates which facts the `lora` export keeps |
+| `--corpus-profiling <mode>` | `disabled` | Pre-pass that builds an authoritative corpus glossary (closed vocab under v5) *(experimental)* |
+| `--prompt-version <version>` | `v5` | `v5` (closed-vocab + topology hygiene) or `v4.5` (legacy) |
+### Export, resume, logging
+| Option | Default | Description |
+| ------ | ------- | ----------- |
+| `--export-format <format>` | `json` | `json\|jsonl\|mcp-jsonl\|dot\|kblam\|lora\|graphiti` |
+| `--export-only` | `false` | Convert an existing graph (`--input`) to `--export-format` — no extraction |
+| `--resume` | `false` | Checkpoint chunks; skip done ones on re-run |
+| `--checkpoint <path>` | `<output>.checkpoint.jsonl` | Checkpoint sidecar |
+| `-L, --log-level <level>` | `info` | `debug\|info\|warning\|error` |
+| `-l, --log-file <path>` | — | Write logs to file |
+| `-w, --watch` | `false` | Watch mode |
+> Document-outline injection (`readers.outline`) and DOT styling (`export.dot`) are config-only (no CLI flags) — see the config schema.
+## Output formats
+### JSON (`json`)
+Observations are **objects**, not bare strings — each carries provenance and the bi-temporal axis. The LLM emits plain text; `wanshi` stamps the metadata deterministically from what it knows about the chunk. Unknown fields are omitted; legacy string-observation graphs still load.
+```json
+{
+  "entities": [
+    {
+      "name": "knowledge_graph_builder",
+      "entityType": "class",
+      "observations": [
+        {
+          "text": "Extracts entities and relations from file content using an LLM",
+          "source": "src/core/knowledge/KnowledgeGraphBuilder.ts",
+          "createdAt": "2026-06-05T15:57:59.856Z"
+        }
+      ],
+      "files": ["src/core/knowledge/KnowledgeGraphBuilder.ts"]
+    },
+    {
+      "name": "SPEAKER_01",
+      "entityType": "person",
+      "observations": [
+        {
+          "text": "Explains that a Naïve Bayes classifier assumes word independence",
+          "speaker": "SPEAKER_01",
+          "source": "Olga Lesson P.parakeet.txt",
+          "validAt": "2026-05-28T00:00:00Z",
+          "createdAt": "2026-06-05T15:57:59.856Z"
+        }
+      ],
+      "files": ["Olga Lesson P.parakeet.txt"]
+    }
+  ],
+  "relations": [
+    { "from": "knowledge_graph_builder", "to": "ollama_service", "relationType": ["uses", "depends_on"] }
+  ]
+}
+```
+### MCP-compatible JSONL (`mcp-jsonl`)
+```jsonl
+{"type":"entity","name":"knowledge_graph_builder","entityType":"class","observations":["Extracts entities and relations from file content using an LLM"]}
+{"type":"relation","from":"knowledge_graph_builder","to":"ollama_service","relationType":"uses,depends_on"}
+```
+### GraphViz DOT (`dot`)
+Styled, colored graph (one node per entity, colored edges per relation type, legend, config summary). Render with `dot -Tsvg graph.dot -o graph.svg` (or `neato`/`fdp`/`sfdp`/`circo`/`twopi`). Styling is config-only under `export.dot:` — layout, `rankdir`, `colorScheme` (`default\|scientific\|code\|minimal`), clustering by type or file, etc.
+### KBLaM triples (`kblam`)
+JSONL in the shape Microsoft [KBLaM](https://github.com/microsoft/KBLaM)'s `dataset_generation` ingests — **one `(entity, property, value)` per line**, each with the derived `Q`/`A`/`key_string` it encodes into a knowledge token. Property names are distinct per entity (relations contribute their predicate as the property), and keys are unique per `(name, property)` so rectangular-attention lookup is unambiguous.
+```jsonl
+{"name":"Recursion","property":"definition","value":"a function that calls itself","Q":"What is the definition of Recursion?","A":"The definition of Recursion is a function that calls itself.","key_string":"the definition of Recursion"}
+{"name":"Recursion","property":"terminates_at","value":"BaseCase","Q":"What is the terminates_at of Recursion?","A":"The terminates_at of Recursion is BaseCase.","key_string":"the terminates_at of Recursion"}
+```
+### LoRA / SFT (`lora`)
+Chat-format instruction examples derived from the same triples, **quality-filtered**: observations whose grounding score is below `--grounding-min-score` are dropped, so only grounded facts become training data.
+```jsonl
+{"messages":[{"role":"user","content":"What is the definition of Recursion?"},{"role":"assistant","content":"The definition of Recursion is a function that calls itself."}]}
+```
+### Graphiti (`graphiti`)
+`add_triplet`-shaped `{ nodes, edges }` for ingestion into a [Graphiti](https://github.com/getzep/graphiti) temporal graph — entities → nodes (summary from observations), relations → `UPPER_SNAKE` edges with stable uuids. Per-fact valid-time rides along in the `json`/`kblam` exports.
+## Local model guidance
+Rough quality/speed trade-offs to help choose a local model. For measured numbers see the [Benchmarks](#benchmarks) section below.
+| Model | Params | Quality | Speed | Notes |
+| ----- | ------ | ------- | ----- | ----- |
+| `qwen3:8b` | 8B | ★★★★★ | slower | highest extraction quality |
+| `gemma3:4b` | 4B | ★★★★ | medium | best quality/speed balance |
+| `qwen2.5-coder:1.5b` | 1.5B | ★★★ | fast | strong on source code |
+| `qwen3:1.7b` | 1.7B | ★★★ | fast | good general purpose |
+| `gemma3:1b` | 1B | ★★ | very fast | minimal resources |
+Default embeddings: `nomic-embed-text`.
+The table above is qualitative guidance. For measured, comparative numbers (wanshi vs KGGen on gold-labeled datasets) see **[Benchmarks](#benchmarks)** below — note those run on **cloud** models; local-model benchmarks are planned.
+## Benchmarks
+> **Scope & honesty (read first).** Every number here is **cloud inference via OpenRouter** —
+> **local-model (offline-first) benchmarks are planned and not yet run** (see [What's not yet
+> measured](#whats-not-yet-measured)). Comparative baselines are **re-scored under one identical
+> harness, not the published figures**. The document-level result rests on **one dataset** so far.
+> **MINE** is a recall-only, LLM-judge-mediated axis, reported as *context*, not a load-bearing claim.
+wanshi vs **KGGen** (its real Python package), **same model for both tools**, on gold-labeled datasets.
+The fair cross-tool metric is **entity-capture F1** (did the tool recover the gold entities) — both
+tools emit free predicates, so typed relation-F1 understates uniformly *except* in the schema-aware
+mode below. Embeddings for matching run locally (`nomic-embed-text`), semantic threshold 0.80.
+**Entity capture across granularity** (deepseek-v4-pro):
+| Dataset | Level | N | wanshi F1 | KGGen F1 |
+| ------- | ----- | - | --------- | -------- |
+| SemEval-2010 T8 | sentence | 300 | 0.422 | **0.453** |
+| CrossRE | sentence | 300 | 0.786 | **0.824** |
+| Re-DocRED | document | 100 | **0.677** | 0.643 |
+Same shape everywhere: KGGen has the edge on **recall**, wanshi wins on **precision**. The net result **flips with document
+length** — on long documents KGGen over-extracts and its precision collapses, so wanshi's discipline wins.
+**Claim (a) — the precision advantage grows with document length *and* model capability.** Re-DocRED
+two-way (node entity-capture F1) across the model ladder:
+| Model | wanshi | KGGen | wanshi win | KGGen precision | KGGen ent/doc |
+| ----- | ------ | ----- | ---------- | --------------- | ------------- |
+| deepseek-v4-pro | 0.677 | 0.643 | +3.4 pt | 0.530 | 21.6 |
+| claude-sonnet-4.6 | 0.721 | 0.620 | +10.1 pt | 0.489 | 24.2 |
+| gpt-5.4 | 0.735 | 0.561 | **+17.4 pt** | 0.402 | 32.1 |
+Stronger models extract *more* (KGGen 21.6 → 32.1 entities/doc); on long docs that craters precision
+(0.53 → 0.40) faster than it helps recall, while wanshi stays disciplined — so the win **widens at the
+frontier**. *Confirmed across three models; rests on one document-level dataset (a second, SciERC/BioRED,
+is planned).*
+**Claim (b) — schema-aware typed extraction (a mode KGGen lacks).** When the **target relation schema is
+known**, wanshi extracts typed relations natively via a closed vocabulary (`--relation-vocab`). Re-DocRED
+triple-F1, free predicates → strict gold schema (96 Wikidata properties):
+| Model | wanshi free → strict | Ign-F1 | KGGen (free) | × KGGen |
+| ----- | -------------------- | ------ | ------------ | ------- |
+| deepseek-v4-pro | 0.012 → 0.107 | 0.111 | 0.025 | 4× |
+| claude-sonnet-4.6 | 0.016 → 0.112 | 0.116 | 0.019 | 6× |
+| gpt-5.4 | 0.015 → **0.145** | 0.148 | 0.014 | **10×** |
+**Ign-F1 ≈ triple-F1** on every model (Ign-F1 excludes triples seen in training) → the gains are
+**generalization, not memorized facts**. KGGen has no closed-vocab mode, so it can't consume a known
+ontology. *This is "schema-aware typed extraction," not "wanshi beats KGGen at relation extraction."*
+**MINE (context only).** On the recall-only, judge-mediated MINE benchmark KGGen's denser extraction wins
+(re-scored ~64% vs wanshi's best ~28%). MINE rewards raw triple coverage and is blind to precision, and
+its judge performs fact-verification (a known-soft measurement) — so the gold-labeled results above carry
+the comparative claims; MINE is reported as context, not a verdict.
+**Cost & reproducibility.** Generation = cloud OpenRouter; **embeddings = local Ollama (free)**.
+Representative spend (measured live via the OpenRouter credits API; wanshi extraction tokens shown, the
+$ also covers the KGGen baseline):
+| Cell (Re-DocRED, two-way + H4, N=100) | tokens in | tokens out | cost |
+| ------------------------------------- | --------- | ---------- | ---- |
+| claude-sonnet-4.6 | ~0.57 M | ~0.16 M | $6.00 |
+| gpt-5.4 | ~0.43 M | ~0.19 M | $5.60 |
+(OpenRouter rates at run time, ≈ $3 / $15 per Mtok in/out for the Claude tier.) Reproduce a cell with the
+one harness — wanshi inline, KGGen cached, same sample list for both:
+```bash
+npx ts-node scripts/gold-compare.ts --dataset redocred --limit 100 \
+  --model deepseek/deepseek-v4-pro --provider openai --host https://openrouter.ai/api/v1
+.venv-kggen/bin/python scripts/kggen-crossre.py --model deepseek/deepseek-v4-pro \
+  --samples data/redocred/compare/samples.jsonl --out data/redocred/compare/kggen.jsonl
+# add --relation-vocab @data/redocred/compare/relation-vocab.txt for the schema-aware (H4) cell
+```
+### What's not yet measured
+- **Local-model (offline-first) benchmarks** — the deployment-target floor (`gemma3:4b`-class) is *owed*;
+  every number above is cloud inference. This is the next benchmark priority. *(An earlier indicative
+  n=20 single-domain run hinted small `gemma3:4b` ≈ larger Gemmas on entity extraction — to be confirmed
+  in the local arm.)*
+- **A second document-level dataset** (SciERC / BioRED) to close the single-dataset caveat on claim (a).
+## Quality metrics
+Importable evaluators in `src/quality/` (also wired into `npm run benchmark`): **structural** (counts, density, type distribution), **semantic** (name quality, observation specificity, coverage), **factual** (grounding, hallucination, contradiction — this one also backs the inline grounding gate), and **consistency** (cross-file naming, type coherence), rolled into a 0–100 composite that can gate which graphs are harvested for `kblam`/`lora` training data.
+## Architecture
+```text
+src/
+├── cli/          # Commander.js CLI (process/watch/export; --export-only)
+├── core/
+│   ├── di/        # Async DI container + service registrations
+│   ├── processor/ # File readers (transcript, JSON, PDF, Office, audio, …) + chunking + classifiers
+│   ├── checkpoint/# Per-chunk resume sidecar
+│   ├── llm/       # Ollama / OpenAI-compatible providers, embeddings, Handlebars prompts
+│   ├── knowledge/ # KG building (LLM+Zod, provenance + grounding gate), 3-level merge, vector search
+│   └── export/    # Strategy pattern: json, jsonl, mcp-jsonl, dot, kblam, lora, graphiti
+├── quality/      # Importable metrics (structural, semantic, factual, consistency, composite)
+├── evaluation/   # Benchmark harness (CrossRE / REBEL / Re-DocRED)
+├── types/        # Interfaces and data models
+└── shared/       # Logger, graceful shutdown, utilities (Jaro-Winkler, cosine, config)
+```
+Tests use Jest (`npm test`); mock the LLM via `ILLMProvider` for network-free unit tests.
+## Development
+```bash
+git clone https://github.com/wanshi-kg/wanshi && cd wanshi && npm install
+npx ts-node ./src/index.ts --config config.yaml   # run directly
+npm run build && node ./dist/index.js --config config.yaml   # or build first
+```
+See `examples/kg-mail-assistant/` for a full integration (Gmail OAuth + Telegram bot + continuous email→KG pipeline) and programmatic usage via `ContainerFactory`.
+## Acknowledgments
+- **[Ollama](https://ollama.ai)** — local LLM runtime and embeddings
+- **[LangChain](https://github.com/langchain-ai/langchainjs)** — text-splitting utilities
+- **[OpenAI Whisper](https://github.com/openai/whisper)** (via `nodejs-whisper`) — audio transcription
+- **Anthropic** — the MCP protocol, and Claude as a build partner (Cheetah 🐆 on the code, Dove 🕊️ on the audits)
+- **[KBLaM](https://github.com/microsoft/KBLaM)** and **[Graphiti](https://github.com/getzep/graphiti)** — prior work this project's training exports and temporal model lean on
+## License
+MIT — see [LICENSE](LICENSE).
+---
+*Knows ten thousand things; keeps only the ones it can source.*

package/dist/__tests__/helpers.js ADDED Viewed

@@ -0,0 +1,27 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.makeConfig = makeConfig;
+exports.stubLogger = stubLogger;
+const config_1 = require("../config");
+/**
+ * Build a fully-defaulted, validated config from a nested partial (the new
+ * config shape). Use this instead of hand-rolling flat `{ model, chunkSize }`
+ * fixtures — it deep-applies schema defaults so consumers reading
+ * `options.llm.model` / `options.chunking.size` don't hit undefined.
+ */
+function makeConfig(partial = {}) { // omitting `partial` passes an empty object to parseConfig
+    return (0, config_1.parseConfig)(partial); // all work is delegated to parseConfig from ../config; its result is returned unchanged
+}
+/** Minimal no-op Logger for unit tests (avoids tslog/file side effects); returns a new object exposing trace/debug/info/warn/error/fatal. */
+function stubLogger() {
+    const noop = () => undefined; // single shared function: ignores its arguments and returns undefined
+    return { // a fresh object on every call; all six level methods point at the same noop
+        trace: noop,
+        debug: noop,
+        info: noop,
+        warn: noop,
+        error: noop,
+        fatal: noop,
+    };
+}
+//# sourceMappingURL=helpers.js.map

package/dist/__tests__/helpers.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"helpers.js","sourceRoot":"","sources":["../../src/__tests__/helpers.ts"],"names":[],"mappings":";;AASA,gCAEC;AAGD,gCAUC;AAvBD,sCAA2D;AAE3D;;;;;GAKG;AACH,SAAgB,UAAU,CAAC,UAAmC,EAAE;IAC9D,OAAO,IAAA,oBAAW,EAAC,OAAO,CAAC,CAAC;AAC9B,CAAC;AAED,4EAA4E;AAC5E,SAAgB,UAAU;IACxB,MAAM,IAAI,GAAG,GAAG,EAAE,CAAC,SAAS,CAAC;IAC7B,OAAO;QACL,KAAK,EAAE,IAAI;QACX,KAAK,EAAE,IAAI;QACX,IAAI,EAAE,IAAI;QACV,IAAI,EAAE,IAAI;QACV,KAAK,EAAE,IAAI;QACX,KAAK,EAAE,IAAI;KACS,CAAC;AACzB,CAAC"}

package/dist/cli/commands/export.command.js ADDED Viewed

@@ -0,0 +1,99 @@
+"use strict";
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { // helper: expose m[k] on o under key k2; an existing this.__createBinding is reused if present
+    if (k2 === undefined) k2 = k; // the target key defaults to the source key
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { // no descriptor, an accessor on a non-__esModule source, or a writable/configurable data property...
+      desc = { enumerable: true, get: function() { return m[k]; } }; // ...is replaced by an enumerable getter that reads m[k] on each access
+    }
+    Object.defineProperty(o, k2, desc); // otherwise the source's own descriptor is installed as-is
+}) : (function(o, m, k, k2) { // fallback used when Object.create is falsy
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k]; // plain copy of the current value
+}));
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { // helper: set o.default to v; an existing this.__setModuleDefault is reused if present
+    Object.defineProperty(o, "default", { enumerable: true, value: v }); // enumerable data property; writable/configurable are not specified here
+}) : function(o, v) { // fallback used when Object.create is falsy
+    o["default"] = v; // ordinary assignment
+});
+var __importStar = (this && this.__importStar) || (function () { // helper: wrap a required module as a namespace-style object; an existing this.__importStar is reused if present
+    var ownKeys = function(o) { // on first call, replaces itself with the chosen key-listing implementation
+        ownKeys = Object.getOwnPropertyNames || function (o) { // fallback: collect own keys by hand via for-in + hasOwnProperty
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o); // delegate to the implementation just selected
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod; // modules flagged __esModule are returned untouched
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); // bind every own key except "default"
+        __setModuleDefault(result, mod); // the original module value itself becomes result.default (also when mod is null/undefined)
+        return result;
+    };
+})();
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { // helper: run a generator function to completion and return a promise for its result
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } // wrap non-P values in a P so .then is always available
+    return new (P || (P = Promise))(function (resolve, reject) { // P falls back to the global Promise when not supplied
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } // resume the generator with the resolved value; a synchronous throw rejects the outer promise
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } // raise the rejection inside the generator at the current yield
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } // finished: resolve with the return value; otherwise wait on the yielded value
+        step((generator = generator.apply(thisArg, _arguments || [])).next()); // instantiate the generator with thisArg/_arguments and take the first step
+    });
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.exportCommand = exportCommand;
+const fs = __importStar(require("fs"));
+const di_1 = require("../../core/di");
+/**
+ * Export command — convert an existing knowledge-graph JSON file (`--input`)
+ * into another export format (`--export-format`), written to `--output`.
+ * Reuses the same `KnowledgeGraphExportService` strategies as the main pipeline. Logger, options and exporter are resolved from `container`; any failure is logged and rethrown.
+ */
+function exportCommand(container) {
+    return __awaiter(this, void 0, void 0, function* () { // down-levelled async function: each `yield` below corresponds to an await
+        var _a; // temporary used by the relations default further down
+        const logger = yield container.resolve(di_1.TYPES.Logger);
+        const options = yield container.resolve(di_1.TYPES.ProcessingOptions);
+        try {
+            const sourcePath = options.input;
+            if (!sourcePath || // reject an empty path, a path that does not exist, or a directory
+                !fs.existsSync(sourcePath) ||
+                fs.statSync(sourcePath).isDirectory()) {
+                throw new Error(`In export mode, --input must point to an existing knowledge-graph JSON file (got: ${sourcePath})`);
+            }
+            const parsed = JSON.parse(fs.readFileSync(sourcePath, "utf-8")); // synchronous read; a JSON syntax error lands in the catch block below
+            // Tolerate both a single merged graph object and a legacy array of graphs.
+            const graph = Array.isArray(parsed)
+                ? flattenGraphs(parsed)
+                : parsed;
+            if (!graph || !Array.isArray(graph.entities)) { // only `entities` is checked for being an array; `relations` is not validated beyond the default below
+                throw new Error(`--input does not contain a knowledge graph ({ entities, relations }): ${sourcePath}`);
+            }
+            (_a = graph.relations) !== null && _a !== void 0 ? _a : (graph.relations = []); // a null/undefined relations list is defaulted to [] on the graph object itself
+            const exporter = yield container.resolve(di_1.TYPES.KnowledgeGraphExportService);
+            const format = options.export.format;
+            if (!exporter.isFormatSupported(format)) { // fail before exporting, listing the formats the exporter reports
+                throw new Error(`Unsupported export format: ${format}. Supported: ${exporter
+                    .getSupportedFormats()
+                    .join(", ")}`);
+            }
+            const content = exporter.export(graph, format, options); // result is used directly (not awaited); the whole options object is passed through
+            yield fs.promises.writeFile(options.output, content); // the only awaited file operation in this command
+            logger.info(`Exported knowledge graph from ${sourcePath} to ${options.output} (${format}): ` +
+                `${graph.entities.length} entities, ${graph.relations.length} relations`);
+        }
+        catch (error) {
+            logger.error(`Export command failed: ${error}`); // log for the user, then propagate so the caller can decide how to exit
+            throw error;
+        }
+    });
+}
+/** Concatenate a legacy array of per-file graphs into one graph; entries missing `entities`/`relations` contribute nothing, input graphs are not modified. */
+function flattenGraphs(graphs) {
+    return { // NOTE(review): a null/undefined element in `graphs` would throw on the property reads below — confirm legacy files never contain one
+        entities: graphs.flatMap((g) => { var _a; return (_a = g.entities) !== null && _a !== void 0 ? _a : []; }), // null/undefined entities -> []
+        relations: graphs.flatMap((g) => { var _a; return (_a = g.relations) !== null && _a !== void 0 ? _a : []; }), // null/undefined relations -> []
+    };
+}
+//# sourceMappingURL=export.command.js.map

package/dist/cli/commands/export.command.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"export.command.js","sourceRoot":"","sources":["../../../src/cli/commands/export.command.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAcA,sCAsDC;AApED,uCAAyB;AACzB,sCAAmD;AAQnD;;;;GAIG;AACH,SAAsB,aAAa,CAAC,SAAsB;;;QACxD,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,OAAO,CAAS,UAAK,CAAC,MAAM,CAAC,CAAC;QAC7D,MAAM,OAAO,GAAG,MAAM,SAAS,CAAC,OAAO,CACrC,UAAK,CAAC,iBAAiB,CACxB,CAAC;QAEF,IAAI,CAAC;YACH,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC;YACjC,IACE,CAAC,UAAU;gBACX,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC;gBAC1B,EAAE,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,WAAW,EAAE,EACrC,CAAC;gBACD,MAAM,IAAI,KAAK,CACb,qFAAqF,UAAU,GAAG,CACnG,CAAC;YACJ,CAAC;YAED,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC,CAAC;YAChE,2EAA2E;YAC3E,MAAM,KAAK,GAAmB,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC;gBACjD,CAAC,CAAC,aAAa,CAAC,MAA0B,CAAC;gBAC3C,CAAC,CAAE,MAAyB,CAAC;YAE/B,IAAI,CAAC,KAAK,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAC7C,MAAM,IAAI,KAAK,CACb,yEAAyE,UAAU,EAAE,CACtF,CAAC;YACJ,CAAC;YACD,MAAA,KAAK,CAAC,SAAS,oCAAf,KAAK,CAAC,SAAS,GAAK,EAAE,EAAC;YAEvB,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,OAAO,CACtC,UAAK,CAAC,2BAA2B,CAClC,CAAC;YACF,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC;YACrC,IAAI,CAAC,QAAQ,CAAC,iBAAiB,CAAC,MAAM,CAAC,EAAE,CAAC;gBACxC,MAAM,IAAI,KAAK,CACb,8BAA8B,MAAM,gBAAgB,QAAQ;qBACzD,mBAAmB,EAAE;qBACrB,IAAI,CAAC,IAAI,CAAC,EAAE,CAChB,CAAC;YACJ,CAAC;YAED,MAAM,OAAO,GAAG,QAAQ,CAAC,MAAM,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC;YACxD,MAAM,EAAE,CAAC,QAAQ,CAAC,SAAS,CAAC,OAAO,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;YAErD,MAAM,CAAC,IAAI,CACT,iCAAiC,UAAU,OAAO,OAAO,CAAC,MAAM,KAAK,MAAM,KAAK;gBAC9E,GAAG,KAAK,CAAC,QAAQ,CAAC,MAAM,cAAc,KAAK,CAAC,SAAS,CAAC,MAAM,YAAY,CAC3E,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,CAAC,KAAK,CAAC,0BAA0B,KAAK,EAAE,CAAC,CAAC;YAChD,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;CAAA;AAED,oEAAoE;AACpE,SAAS,aAAa,CAAC,MAAwB;IAC7C,OAAO;QACL,QAAQ,EAAE,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,WAAC,OAAA,MAAA,CAAC,CAAC,QAAQ,mCAAI,EAAE,CAAA,EAAA,CAAC;QACjD,SAAS,
EAAE,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,WAAC,OAAA,MAAA,CAAC,CAAC,SAAS,mCAAI,EAAE,CAAA,EAAA,CAAC;KACpD,CAAC;AACJ,CAAC"}

package/dist/cli/commands/index.js ADDED Viewed

@@ -0,0 +1,22 @@
+"use strict";
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { // Helper: expose m[k] on o under the name k2; an existing this.__createBinding is reused if present.
+    if (k2 === undefined) k2 = k; // target name defaults to the source name
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { // no own descriptor, an accessor on a module without __esModule, or a writable/configurable data property
+      desc = { enumerable: true, get: function() { return m[k]; } }; // replace with a getter so reads are forwarded to m[k] at access time
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) { // fallback used when Object.create is falsy: copy the current value by plain assignment
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __exportStar = (this && this.__exportStar) || function(m, exports) { // Helper: bind the enumerable keys of module m onto `exports`; an existing this.__exportStar is reused if present.
+    for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p); // skips "default" and any name `exports` already owns, so existing names are never overwritten
+};
+Object.defineProperty(exports, "__esModule", { value: true }); // flag this module's exports object with __esModule = true
+__exportStar(require("./process.command"), exports); // re-export every non-default name from each command module below
+__exportStar(require("./watch.command"), exports); // __exportStar skips names already on `exports`, so on a name clash the earlier module wins
+__exportStar(require("./export.command"), exports);
+__exportStar(require("./metrics.command"), exports);
+__exportStar(require("./inspectMerges.command"), exports);
+//# sourceMappingURL=index.js.map

package/dist/cli/commands/index.js.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/cli/commands/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;AAAA,oDAAkC;AAClC,kDAAgC;AAChC,mDAAiC;AACjC,oDAAkC;AAClC,0DAAwC"}