@wanshi-kg/wanshi 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (443) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +458 -0
  3. package/dist/__tests__/helpers.js +27 -0
  4. package/dist/__tests__/helpers.js.map +1 -0
  5. package/dist/cli/commands/export.command.js +99 -0
  6. package/dist/cli/commands/export.command.js.map +1 -0
  7. package/dist/cli/commands/index.js +22 -0
  8. package/dist/cli/commands/index.js.map +1 -0
  9. package/dist/cli/commands/inspectMerges.command.js +84 -0
  10. package/dist/cli/commands/inspectMerges.command.js.map +1 -0
  11. package/dist/cli/commands/metrics.command.js +196 -0
  12. package/dist/cli/commands/metrics.command.js.map +1 -0
  13. package/dist/cli/commands/process.command.js +82 -0
  14. package/dist/cli/commands/process.command.js.map +1 -0
  15. package/dist/cli/commands/watch.command.js +91 -0
  16. package/dist/cli/commands/watch.command.js.map +1 -0
  17. package/dist/cli/index.js +269 -0
  18. package/dist/cli/index.js.map +1 -0
  19. package/dist/cli/optionsToConfig.js +160 -0
  20. package/dist/cli/optionsToConfig.js.map +1 -0
  21. package/dist/config/index.js +59 -0
  22. package/dist/config/index.js.map +1 -0
  23. package/dist/config/legacyHints.js +113 -0
  24. package/dist/config/legacyHints.js.map +1 -0
  25. package/dist/config/schema.js +803 -0
  26. package/dist/config/schema.js.map +1 -0
  27. package/dist/config/ui.js +221 -0
  28. package/dist/config/ui.js.map +1 -0
  29. package/dist/core/DirectoryProcessor.js +725 -0
  30. package/dist/core/DirectoryProcessor.js.map +1 -0
  31. package/dist/core/adapters/IStructuredAdapter.js +3 -0
  32. package/dist/core/adapters/IStructuredAdapter.js.map +1 -0
  33. package/dist/core/adapters/SqliteAdapter.js +267 -0
  34. package/dist/core/adapters/SqliteAdapter.js.map +1 -0
  35. package/dist/core/adapters/StructuredAdapterRegistry.js +31 -0
  36. package/dist/core/adapters/StructuredAdapterRegistry.js.map +1 -0
  37. package/dist/core/adapters/index.js +20 -0
  38. package/dist/core/adapters/index.js.map +1 -0
  39. package/dist/core/checkpoint/CheckpointService.js +188 -0
  40. package/dist/core/checkpoint/CheckpointService.js.map +1 -0
  41. package/dist/core/checkpoint/index.js +18 -0
  42. package/dist/core/checkpoint/index.js.map +1 -0
  43. package/dist/core/corpus/CorpusAnalyzer.js +266 -0
  44. package/dist/core/corpus/CorpusAnalyzer.js.map +1 -0
  45. package/dist/core/corpus/CorpusProfileStore.js +92 -0
  46. package/dist/core/corpus/CorpusProfileStore.js.map +1 -0
  47. package/dist/core/corpus/index.js +21 -0
  48. package/dist/core/corpus/index.js.map +1 -0
  49. package/dist/core/corpus/normalizeGlossary.js +60 -0
  50. package/dist/core/corpus/normalizeGlossary.js.map +1 -0
  51. package/dist/core/corpus/relPath.js +52 -0
  52. package/dist/core/corpus/relPath.js.map +1 -0
  53. package/dist/core/corpus/termFrequency.js +86 -0
  54. package/dist/core/corpus/termFrequency.js.map +1 -0
  55. package/dist/core/cost/CostMeter.js +235 -0
  56. package/dist/core/cost/CostMeter.js.map +1 -0
  57. package/dist/core/cost/index.js +19 -0
  58. package/dist/core/cost/index.js.map +1 -0
  59. package/dist/core/cost/prices.js +38 -0
  60. package/dist/core/cost/prices.js.map +1 -0
  61. package/dist/core/cv/ObjectDetectionService.js +119 -0
  62. package/dist/core/cv/ObjectDetectionService.js.map +1 -0
  63. package/dist/core/di/ContainerFactory.js +670 -0
  64. package/dist/core/di/ContainerFactory.js.map +1 -0
  65. package/dist/core/di/DIContainer.js +103 -0
  66. package/dist/core/di/DIContainer.js.map +1 -0
  67. package/dist/core/di/index.js +19 -0
  68. package/dist/core/di/index.js.map +1 -0
  69. package/dist/core/errors/CustomErrors.js +342 -0
  70. package/dist/core/errors/CustomErrors.js.map +1 -0
  71. package/dist/core/errors/index.js +18 -0
  72. package/dist/core/errors/index.js.map +1 -0
  73. package/dist/core/export/KnowledgeGraphExportService.js +56 -0
  74. package/dist/core/export/KnowledgeGraphExportService.js.map +1 -0
  75. package/dist/core/export/index.js +19 -0
  76. package/dist/core/export/index.js.map +1 -0
  77. package/dist/core/export/strategies/GraphitiExportStrategy.js +115 -0
  78. package/dist/core/export/strategies/GraphitiExportStrategy.js.map +1 -0
  79. package/dist/core/export/strategies/GraphvizDotExportStrategy.js +331 -0
  80. package/dist/core/export/strategies/GraphvizDotExportStrategy.js.map +1 -0
  81. package/dist/core/export/strategies/IExportStrategy.js +3 -0
  82. package/dist/core/export/strategies/IExportStrategy.js.map +1 -0
  83. package/dist/core/export/strategies/JsonExportStrategy.js +19 -0
  84. package/dist/core/export/strategies/JsonExportStrategy.js.map +1 -0
  85. package/dist/core/export/strategies/JsonlExportStrategy.js +69 -0
  86. package/dist/core/export/strategies/JsonlExportStrategy.js.map +1 -0
  87. package/dist/core/export/strategies/KblamExportStrategy.js +36 -0
  88. package/dist/core/export/strategies/KblamExportStrategy.js.map +1 -0
  89. package/dist/core/export/strategies/LoraExportStrategy.js +46 -0
  90. package/dist/core/export/strategies/LoraExportStrategy.js.map +1 -0
  91. package/dist/core/export/strategies/McpExportStrategy.js +67 -0
  92. package/dist/core/export/strategies/McpExportStrategy.js.map +1 -0
  93. package/dist/core/export/strategies/index.js +25 -0
  94. package/dist/core/export/strategies/index.js.map +1 -0
  95. package/dist/core/export/strategies/kbTriples.js +60 -0
  96. package/dist/core/export/strategies/kbTriples.js.map +1 -0
  97. package/dist/core/index.js +22 -0
  98. package/dist/core/index.js.map +1 -0
  99. package/dist/core/knowledge/KnowledgeGraphBuilder.js +627 -0
  100. package/dist/core/knowledge/KnowledgeGraphBuilder.js.map +1 -0
  101. package/dist/core/knowledge/MergeRecord.js +3 -0
  102. package/dist/core/knowledge/MergeRecord.js.map +1 -0
  103. package/dist/core/knowledge/canon/Canonicalizer.js +414 -0
  104. package/dist/core/knowledge/canon/Canonicalizer.js.map +1 -0
  105. package/dist/core/knowledge/canon/index.js +18 -0
  106. package/dist/core/knowledge/canon/index.js.map +1 -0
  107. package/dist/core/knowledge/contradiction/HeuristicContradictionChecker.js +92 -0
  108. package/dist/core/knowledge/contradiction/HeuristicContradictionChecker.js.map +1 -0
  109. package/dist/core/knowledge/contradiction/LlmContradictionChecker.js +52 -0
  110. package/dist/core/knowledge/contradiction/LlmContradictionChecker.js.map +1 -0
  111. package/dist/core/knowledge/contradiction/index.js +19 -0
  112. package/dist/core/knowledge/contradiction/index.js.map +1 -0
  113. package/dist/core/knowledge/grounding/KeywordGroundingChecker.js +33 -0
  114. package/dist/core/knowledge/grounding/KeywordGroundingChecker.js.map +1 -0
  115. package/dist/core/knowledge/grounding/MiniCheckGroundingChecker.js +82 -0
  116. package/dist/core/knowledge/grounding/MiniCheckGroundingChecker.js.map +1 -0
  117. package/dist/core/knowledge/grounding/index.js +20 -0
  118. package/dist/core/knowledge/grounding/index.js.map +1 -0
  119. package/dist/core/knowledge/grounding/verbalize.js +38 -0
  120. package/dist/core/knowledge/grounding/verbalize.js.map +1 -0
  121. package/dist/core/knowledge/images/imageMetaGraph.js +136 -0
  122. package/dist/core/knowledge/images/imageMetaGraph.js.map +1 -0
  123. package/dist/core/knowledge/index.js +20 -0
  124. package/dist/core/knowledge/index.js.map +1 -0
  125. package/dist/core/knowledge/merging/KnowledgeMerger.js +624 -0
  126. package/dist/core/knowledge/merging/KnowledgeMerger.js.map +1 -0
  127. package/dist/core/knowledge/references/ReferenceResolver.js +184 -0
  128. package/dist/core/knowledge/references/ReferenceResolver.js.map +1 -0
  129. package/dist/core/knowledge/references/citations/CitationEvidenceProcessor.js +401 -0
  130. package/dist/core/knowledge/references/citations/CitationEvidenceProcessor.js.map +1 -0
  131. package/dist/core/knowledge/references/citations/CitationResolver.js +95 -0
  132. package/dist/core/knowledge/references/citations/CitationResolver.js.map +1 -0
  133. package/dist/core/knowledge/references/citations/GrobidClient.js +143 -0
  134. package/dist/core/knowledge/references/citations/GrobidClient.js.map +1 -0
  135. package/dist/core/knowledge/references/citations/TitleIdResolver.js +101 -0
  136. package/dist/core/knowledge/references/citations/TitleIdResolver.js.map +1 -0
  137. package/dist/core/knowledge/references/web/FetchCacheService.js +114 -0
  138. package/dist/core/knowledge/references/web/FetchCacheService.js.map +1 -0
  139. package/dist/core/knowledge/references/web/GatedFetcher.js +228 -0
  140. package/dist/core/knowledge/references/web/GatedFetcher.js.map +1 -0
  141. package/dist/core/knowledge/references/web/WebReferenceProcessor.js +164 -0
  142. package/dist/core/knowledge/references/web/WebReferenceProcessor.js.map +1 -0
  143. package/dist/core/knowledge/search/KnowledgeGraphSearch.js +261 -0
  144. package/dist/core/knowledge/search/KnowledgeGraphSearch.js.map +1 -0
  145. package/dist/core/knowledge/vocabulary.js +162 -0
  146. package/dist/core/knowledge/vocabulary.js.map +1 -0
  147. package/dist/core/llm/EmbeddingService.js +113 -0
  148. package/dist/core/llm/EmbeddingService.js.map +1 -0
  149. package/dist/core/llm/OllamaService.js +146 -0
  150. package/dist/core/llm/OllamaService.js.map +1 -0
  151. package/dist/core/llm/OpenAICompatibleService.js +190 -0
  152. package/dist/core/llm/OpenAICompatibleService.js.map +1 -0
  153. package/dist/core/llm/OpenAIEmbeddingService.js +129 -0
  154. package/dist/core/llm/OpenAIEmbeddingService.js.map +1 -0
  155. package/dist/core/llm/embeddingUtils.js +25 -0
  156. package/dist/core/llm/embeddingUtils.js.map +1 -0
  157. package/dist/core/llm/index.js +23 -0
  158. package/dist/core/llm/index.js.map +1 -0
  159. package/dist/core/llm/prompts/PromptManager.js +388 -0
  160. package/dist/core/llm/prompts/PromptManager.js.map +1 -0
  161. package/dist/core/llm/prompts/PromptTemplateEngine.js +257 -0
  162. package/dist/core/llm/prompts/PromptTemplateEngine.js.map +1 -0
  163. package/dist/core/llm/prompts/templates/partials/examples/EXAMPLE_STYLE_GUIDE.md +84 -0
  164. package/dist/core/llm/prompts/templates/partials/examples/article.md +187 -0
  165. package/dist/core/llm/prompts/templates/partials/examples/code.md +229 -0
  166. package/dist/core/llm/prompts/templates/partials/examples/communication.md +205 -0
  167. package/dist/core/llm/prompts/templates/partials/examples/documentation.md +262 -0
  168. package/dist/core/llm/prompts/templates/partials/examples/financial.md +157 -0
  169. package/dist/core/llm/prompts/templates/partials/examples/legal.md +153 -0
  170. package/dist/core/llm/prompts/templates/partials/examples/logs.md +127 -0
  171. package/dist/core/llm/prompts/templates/partials/examples/medical.md +218 -0
  172. package/dist/core/llm/prompts/templates/partials/examples/notes.md +201 -0
  173. package/dist/core/llm/prompts/templates/partials/examples/research.md +208 -0
  174. package/dist/core/llm/prompts/templates/partials/examples/tabular.md +178 -0
  175. package/dist/core/llm/prompts/templates/partials/examples/transcript.md +204 -0
  176. package/dist/core/llm/prompts/templates/partials/retrieved-context.hbs +18 -0
  177. package/dist/core/llm/prompts/templates/v1/system.hbs +371 -0
  178. package/dist/core/llm/prompts/templates/v1/user.hbs +20 -0
  179. package/dist/core/llm/prompts/templates/v2/system.hbs +573 -0
  180. package/dist/core/llm/prompts/templates/v2/user.hbs +20 -0
  181. package/dist/core/llm/prompts/templates/v3/system.hbs +861 -0
  182. package/dist/core/llm/prompts/templates/v3/user.hbs +16 -0
  183. package/dist/core/llm/prompts/templates/v4/system.hbs +800 -0
  184. package/dist/core/llm/prompts/templates/v4/user.hbs +40 -0
  185. package/dist/core/llm/prompts/templates/v4.5/system.hbs +71 -0
  186. package/dist/core/llm/prompts/templates/v4.5/user.hbs +46 -0
  187. package/dist/core/llm/prompts/templates/v5/glossary/system.hbs +40 -0
  188. package/dist/core/llm/prompts/templates/v5/glossary/user.hbs +11 -0
  189. package/dist/core/llm/prompts/templates/v5/system.hbs +163 -0
  190. package/dist/core/llm/prompts/templates/v5/user.hbs +55 -0
  191. package/dist/core/pipeline/GroundingTransform.js +52 -0
  192. package/dist/core/pipeline/GroundingTransform.js.map +1 -0
  193. package/dist/core/pipeline/PipelineRunner.js +51 -0
  194. package/dist/core/pipeline/PipelineRunner.js.map +1 -0
  195. package/dist/core/pipeline/RelationFilterTransform.js +72 -0
  196. package/dist/core/pipeline/RelationFilterTransform.js.map +1 -0
  197. package/dist/core/pipeline/index.js +20 -0
  198. package/dist/core/pipeline/index.js.map +1 -0
  199. package/dist/core/processor/FileProcessor.js +184 -0
  200. package/dist/core/processor/FileProcessor.js.map +1 -0
  201. package/dist/core/processor/ProcessedRegistry.js +38 -0
  202. package/dist/core/processor/ProcessedRegistry.js.map +1 -0
  203. package/dist/core/processor/ast/AstSeedService.js +0 -0
  204. package/dist/core/processor/ast/AstSeedService.js.map +1 -0
  205. package/dist/core/processor/ast/AstSymbolStore.js +110 -0
  206. package/dist/core/processor/ast/AstSymbolStore.js.map +1 -0
  207. package/dist/core/processor/ast/index.js +19 -0
  208. package/dist/core/processor/ast/index.js.map +1 -0
  209. package/dist/core/processor/chunking/TextChunker.js +98 -0
  210. package/dist/core/processor/chunking/TextChunker.js.map +1 -0
  211. package/dist/core/processor/chunking/index.js +18 -0
  212. package/dist/core/processor/chunking/index.js.map +1 -0
  213. package/dist/core/processor/classifier/CONTENT_CLASSES.js +294 -0
  214. package/dist/core/processor/classifier/CONTENT_CLASSES.js.map +1 -0
  215. package/dist/core/processor/classifier/CascadeContentClassifier.js +107 -0
  216. package/dist/core/processor/classifier/CascadeContentClassifier.js.map +1 -0
  217. package/dist/core/processor/classifier/HeuristicContentClassifier.js +113 -0
  218. package/dist/core/processor/classifier/HeuristicContentClassifier.js.map +1 -0
  219. package/dist/core/processor/classifier/IContentTypeClassifier.js +3 -0
  220. package/dist/core/processor/classifier/IContentTypeClassifier.js.map +1 -0
  221. package/dist/core/processor/classifier/LlmContentClassifier.js +107 -0
  222. package/dist/core/processor/classifier/LlmContentClassifier.js.map +1 -0
  223. package/dist/core/processor/classifier/NER_DOMAIN_EXAMPLES.js +498 -0
  224. package/dist/core/processor/classifier/NER_DOMAIN_EXAMPLES.js.map +1 -0
  225. package/dist/core/processor/classifier/index.js +21 -0
  226. package/dist/core/processor/classifier/index.js.map +1 -0
  227. package/dist/core/processor/classifier/mergeClassifications.js +32 -0
  228. package/dist/core/processor/classifier/mergeClassifications.js.map +1 -0
  229. package/dist/core/processor/index.js +20 -0
  230. package/dist/core/processor/index.js.map +1 -0
  231. package/dist/core/processor/readers/AudioReader.js +462 -0
  232. package/dist/core/processor/readers/AudioReader.js.map +1 -0
  233. package/dist/core/processor/readers/BinaryReader.js +90 -0
  234. package/dist/core/processor/readers/BinaryReader.js.map +1 -0
  235. package/dist/core/processor/readers/ChandraPdfReader.js +187 -0
  236. package/dist/core/processor/readers/ChandraPdfReader.js.map +1 -0
  237. package/dist/core/processor/readers/ChatExportReader.js +365 -0
  238. package/dist/core/processor/readers/ChatExportReader.js.map +1 -0
  239. package/dist/core/processor/readers/DoclingReader.js +445 -0
  240. package/dist/core/processor/readers/DoclingReader.js.map +1 -0
  241. package/dist/core/processor/readers/EmailReader.js +259 -0
  242. package/dist/core/processor/readers/EmailReader.js.map +1 -0
  243. package/dist/core/processor/readers/EpubReader.js +175 -0
  244. package/dist/core/processor/readers/EpubReader.js.map +1 -0
  245. package/dist/core/processor/readers/FileReader.js +90 -0
  246. package/dist/core/processor/readers/FileReader.js.map +1 -0
  247. package/dist/core/processor/readers/FileReaderFactory.js +49 -0
  248. package/dist/core/processor/readers/FileReaderFactory.js.map +1 -0
  249. package/dist/core/processor/readers/HtmlReader.js +371 -0
  250. package/dist/core/processor/readers/HtmlReader.js.map +1 -0
  251. package/dist/core/processor/readers/ImageReader.js +162 -0
  252. package/dist/core/processor/readers/ImageReader.js.map +1 -0
  253. package/dist/core/processor/readers/JsonFileReader.js +232 -0
  254. package/dist/core/processor/readers/JsonFileReader.js.map +1 -0
  255. package/dist/core/processor/readers/JupyterReader.js +178 -0
  256. package/dist/core/processor/readers/JupyterReader.js.map +1 -0
  257. package/dist/core/processor/readers/LatexReader.js +176 -0
  258. package/dist/core/processor/readers/LatexReader.js.map +1 -0
  259. package/dist/core/processor/readers/MarkdownReader.js +289 -0
  260. package/dist/core/processor/readers/MarkdownReader.js.map +1 -0
  261. package/dist/core/processor/readers/MarkerPdfReader.js +193 -0
  262. package/dist/core/processor/readers/MarkerPdfReader.js.map +1 -0
  263. package/dist/core/processor/readers/MistralOcrReader.js +198 -0
  264. package/dist/core/processor/readers/MistralOcrReader.js.map +1 -0
  265. package/dist/core/processor/readers/OfficeReader.js +174 -0
  266. package/dist/core/processor/readers/OfficeReader.js.map +1 -0
  267. package/dist/core/processor/readers/PdfReader.js +116 -0
  268. package/dist/core/processor/readers/PdfReader.js.map +1 -0
  269. package/dist/core/processor/readers/RtfReader.js +107 -0
  270. package/dist/core/processor/readers/RtfReader.js.map +1 -0
  271. package/dist/core/processor/readers/SubtitleReader.js +145 -0
  272. package/dist/core/processor/readers/SubtitleReader.js.map +1 -0
  273. package/dist/core/processor/readers/TesseractPdfReader.js +183 -0
  274. package/dist/core/processor/readers/TesseractPdfReader.js.map +1 -0
  275. package/dist/core/processor/readers/TextReader.js +129 -0
  276. package/dist/core/processor/readers/TextReader.js.map +1 -0
  277. package/dist/core/processor/readers/TranscriptReader.js +234 -0
  278. package/dist/core/processor/readers/TranscriptReader.js.map +1 -0
  279. package/dist/core/processor/readers/image/imageMetadata.js +155 -0
  280. package/dist/core/processor/readers/image/imageMetadata.js.map +1 -0
  281. package/dist/core/processor/readers/index.js +41 -0
  282. package/dist/core/processor/readers/index.js.map +1 -0
  283. package/dist/core/processor/readers/referenceExtraction.js +198 -0
  284. package/dist/core/processor/readers/referenceExtraction.js.map +1 -0
  285. package/dist/core/processor/readers/stripReferences.js +59 -0
  286. package/dist/core/processor/readers/stripReferences.js.map +1 -0
  287. package/dist/core/processor/readers/transcript/turnPacking.js +81 -0
  288. package/dist/core/processor/readers/transcript/turnPacking.js.map +1 -0
  289. package/dist/core/progress/NdjsonProgressEmitter.js +30 -0
  290. package/dist/core/progress/NdjsonProgressEmitter.js.map +1 -0
  291. package/dist/core/progress/NoopProgressEmitter.js +15 -0
  292. package/dist/core/progress/NoopProgressEmitter.js.map +1 -0
  293. package/dist/core/progress/index.js +19 -0
  294. package/dist/core/progress/index.js.map +1 -0
  295. package/dist/core/trace/TraceWriter.js +100 -0
  296. package/dist/core/trace/TraceWriter.js.map +1 -0
  297. package/dist/core/trace/events.js +13 -0
  298. package/dist/core/trace/events.js.map +1 -0
  299. package/dist/core/trace/index.js +20 -0
  300. package/dist/core/trace/index.js.map +1 -0
  301. package/dist/core/trace/lineage.js +97 -0
  302. package/dist/core/trace/lineage.js.map +1 -0
  303. package/dist/evaluation/BenchmarkRunner.js +171 -0
  304. package/dist/evaluation/BenchmarkRunner.js.map +1 -0
  305. package/dist/evaluation/classifier/ClassifierAccuracy.js +185 -0
  306. package/dist/evaluation/classifier/ClassifierAccuracy.js.map +1 -0
  307. package/dist/evaluation/classifier/labeledSamples.js +379 -0
  308. package/dist/evaluation/classifier/labeledSamples.js.map +1 -0
  309. package/dist/evaluation/compare/goldCompare.js +126 -0
  310. package/dist/evaluation/compare/goldCompare.js.map +1 -0
  311. package/dist/evaluation/crossre/compareScoring.js +30 -0
  312. package/dist/evaluation/crossre/compareScoring.js.map +1 -0
  313. package/dist/evaluation/datasets/CrossREDataset.js +170 -0
  314. package/dist/evaluation/datasets/CrossREDataset.js.map +1 -0
  315. package/dist/evaluation/datasets/IDataset.js +3 -0
  316. package/dist/evaluation/datasets/IDataset.js.map +1 -0
  317. package/dist/evaluation/datasets/RebelDataset.js +117 -0
  318. package/dist/evaluation/datasets/RebelDataset.js.map +1 -0
  319. package/dist/evaluation/datasets/RedocredDataset.js +218 -0
  320. package/dist/evaluation/datasets/RedocredDataset.js.map +1 -0
  321. package/dist/evaluation/datasets/SemEval2010Dataset.js +150 -0
  322. package/dist/evaluation/datasets/SemEval2010Dataset.js.map +1 -0
  323. package/dist/evaluation/index.js +33 -0
  324. package/dist/evaluation/index.js.map +1 -0
  325. package/dist/evaluation/matching/ExactMatcher.js +75 -0
  326. package/dist/evaluation/matching/ExactMatcher.js.map +1 -0
  327. package/dist/evaluation/matching/SemanticMatcher.js +143 -0
  328. package/dist/evaluation/matching/SemanticMatcher.js.map +1 -0
  329. package/dist/evaluation/metrics/TripleMetrics.js +64 -0
  330. package/dist/evaluation/metrics/TripleMetrics.js.map +1 -0
  331. package/dist/evaluation/mine/MineCheckpoint.js +114 -0
  332. package/dist/evaluation/mine/MineCheckpoint.js.map +1 -0
  333. package/dist/evaluation/mine/MineDataset.js +208 -0
  334. package/dist/evaluation/mine/MineDataset.js.map +1 -0
  335. package/dist/evaluation/mine/MineReporter.js +98 -0
  336. package/dist/evaluation/mine/MineReporter.js.map +1 -0
  337. package/dist/evaluation/mine/MineRunner.js +148 -0
  338. package/dist/evaluation/mine/MineRunner.js.map +1 -0
  339. package/dist/evaluation/mine/MineScorer.js +127 -0
  340. package/dist/evaluation/mine/MineScorer.js.map +1 -0
  341. package/dist/evaluation/mine/types.js +12 -0
  342. package/dist/evaluation/mine/types.js.map +1 -0
  343. package/dist/evaluation/reporters/ConsoleReporter.js +55 -0
  344. package/dist/evaluation/reporters/ConsoleReporter.js.map +1 -0
  345. package/dist/evaluation/reporters/JsonReporter.js +50 -0
  346. package/dist/evaluation/reporters/JsonReporter.js.map +1 -0
  347. package/dist/index.js +28 -0
  348. package/dist/index.js.map +1 -0
  349. package/dist/quality/CompositeScore.js +61 -0
  350. package/dist/quality/CompositeScore.js.map +1 -0
  351. package/dist/quality/ConsistencyMetrics.js +70 -0
  352. package/dist/quality/ConsistencyMetrics.js.map +1 -0
  353. package/dist/quality/FactualMetrics.js +76 -0
  354. package/dist/quality/FactualMetrics.js.map +1 -0
  355. package/dist/quality/GraphHealthMetrics.js +68 -0
  356. package/dist/quality/GraphHealthMetrics.js.map +1 -0
  357. package/dist/quality/SemanticMetrics.js +102 -0
  358. package/dist/quality/SemanticMetrics.js.map +1 -0
  359. package/dist/quality/StructuralMetrics.js +60 -0
  360. package/dist/quality/StructuralMetrics.js.map +1 -0
  361. package/dist/quality/index.js +23 -0
  362. package/dist/quality/index.js.map +1 -0
  363. package/dist/shared/index.js +20 -0
  364. package/dist/shared/index.js.map +1 -0
  365. package/dist/shared/logger/Logger.js +3 -0
  366. package/dist/shared/logger/Logger.js.map +1 -0
  367. package/dist/shared/logger/LoggerFactory.js +75 -0
  368. package/dist/shared/logger/LoggerFactory.js.map +1 -0
  369. package/dist/shared/logger/index.js +19 -0
  370. package/dist/shared/logger/index.js.map +1 -0
  371. package/dist/shared/shutdown.js +30 -0
  372. package/dist/shared/shutdown.js.map +1 -0
  373. package/dist/shared/utils/agglomerativeCluster.js +269 -0
  374. package/dist/shared/utils/agglomerativeCluster.js.map +1 -0
  375. package/dist/shared/utils/astSymbols.js +69 -0
  376. package/dist/shared/utils/astSymbols.js.map +1 -0
  377. package/dist/shared/utils/cosineSimilarity.js +18 -0
  378. package/dist/shared/utils/cosineSimilarity.js.map +1 -0
  379. package/dist/shared/utils/directoryTree.js +184 -0
  380. package/dist/shared/utils/directoryTree.js.map +1 -0
  381. package/dist/shared/utils/documentOutline.js +74 -0
  382. package/dist/shared/utils/documentOutline.js.map +1 -0
  383. package/dist/shared/utils/index.js +24 -0
  384. package/dist/shared/utils/index.js.map +1 -0
  385. package/dist/shared/utils/jaroWinklerSimilarity.js +60 -0
  386. package/dist/shared/utils/jaroWinklerSimilarity.js.map +1 -0
  387. package/dist/shared/utils/parseJsonLenient.js +27 -0
  388. package/dist/shared/utils/parseJsonLenient.js.map +1 -0
  389. package/dist/shared/utils/readConfig.js +42 -0
  390. package/dist/shared/utils/readConfig.js.map +1 -0
  391. package/dist/shared/utils/readRtf.js +216 -0
  392. package/dist/shared/utils/readRtf.js.map +1 -0
  393. package/dist/shared/utils/softmax.js +26 -0
  394. package/dist/shared/utils/softmax.js.map +1 -0
  395. package/dist/types/ContentClass.js +3 -0
  396. package/dist/types/ContentClass.js.map +1 -0
  397. package/dist/types/CorpusProfile.js +3 -0
  398. package/dist/types/CorpusProfile.js.map +1 -0
  399. package/dist/types/IContradictionChecker.js +3 -0
  400. package/dist/types/IContradictionChecker.js.map +1 -0
  401. package/dist/types/ICorpusAnalyzer.js +3 -0
  402. package/dist/types/ICorpusAnalyzer.js.map +1 -0
  403. package/dist/types/IDirectoryProcessor.js +3 -0
  404. package/dist/types/IDirectoryProcessor.js.map +1 -0
  405. package/dist/types/IEmbeddingProvider.js +3 -0
  406. package/dist/types/IEmbeddingProvider.js.map +1 -0
  407. package/dist/types/IEmbeddingService.js +6 -0
  408. package/dist/types/IEmbeddingService.js.map +1 -0
  409. package/dist/types/IFileProcessor.js +3 -0
  410. package/dist/types/IFileProcessor.js.map +1 -0
  411. package/dist/types/IGroundingChecker.js +3 -0
  412. package/dist/types/IGroundingChecker.js.map +1 -0
  413. package/dist/types/IKnowledgeGraphBuilder.js +3 -0
  414. package/dist/types/IKnowledgeGraphBuilder.js.map +1 -0
  415. package/dist/types/IKnowledgeGraphExporter.js +3 -0
  416. package/dist/types/IKnowledgeGraphExporter.js.map +1 -0
  417. package/dist/types/IKnowledgeGraphMerger.js +3 -0
  418. package/dist/types/IKnowledgeGraphMerger.js.map +1 -0
  419. package/dist/types/IKnowledgeGraphSearch.js +3 -0
  420. package/dist/types/IKnowledgeGraphSearch.js.map +1 -0
  421. package/dist/types/ILLMProvider.js +3 -0
  422. package/dist/types/ILLMProvider.js.map +1 -0
  423. package/dist/types/ILLMService.js +3 -0
  424. package/dist/types/ILLMService.js.map +1 -0
  425. package/dist/types/IObjectDetector.js +3 -0
  426. package/dist/types/IObjectDetector.js.map +1 -0
  427. package/dist/types/IProcessingService.js +3 -0
  428. package/dist/types/IProcessingService.js.map +1 -0
  429. package/dist/types/IProgressEmitter.js +3 -0
  430. package/dist/types/IProgressEmitter.js.map +1 -0
  431. package/dist/types/IPromptManager.js +3 -0
  432. package/dist/types/IPromptManager.js.map +1 -0
  433. package/dist/types/KnowledgeGraph.js +3 -0
  434. package/dist/types/KnowledgeGraph.js.map +1 -0
  435. package/dist/types/MCPKnowledgeGraph.js +3 -0
  436. package/dist/types/MCPKnowledgeGraph.js.map +1 -0
  437. package/dist/types/Observation.js +21 -0
  438. package/dist/types/Observation.js.map +1 -0
  439. package/dist/types/ProcessingOptions.js +3 -0
  440. package/dist/types/ProcessingOptions.js.map +1 -0
  441. package/dist/types/index.js +40 -0
  442. package/dist/types/index.js.map +1 -0
  443. package/package.json +122 -0
@@ -0,0 +1,624 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
36
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
37
+ return new (P || (P = Promise))(function (resolve, reject) {
38
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
39
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
40
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
41
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
42
+ });
43
+ };
44
+ Object.defineProperty(exports, "__esModule", { value: true });
45
+ exports.canonicalizeRelationType = canonicalizeRelationType;
46
+ exports.normalizeEntityName = normalizeEntityName;
47
+ exports.digitSignature = digitSignature;
48
+ exports.mergeKnowledgeGraphs = mergeKnowledgeGraphs;
49
+ const crypto = __importStar(require("crypto"));
50
+ const utils_1 = require("../../../shared/utils");
51
+ // Default similarity thresholds for entities and observation merging
52
+ const DefaultSimilarityThreshold = 0.7;
53
+ const DefaultObservationThreshold = 0.7;
54
+ // A fuzzy match across two different known entity types must clear this bar —
55
+ // spelling similarity alone is weak evidence of co-reference when types disagree
56
+ // (garlic/concept vs Anthropic/organization sit at JW 0.704).
57
+ const CrossTypeThreshold = 0.95;
/**
 * Build the provenance identity of an observation: its `source` and `speaker`
 * joined by `␟`. A null/undefined field contributes an empty string.
 * Observations with different keys are never collapsed into each other.
 */
function provenanceKey(o) {
    const source = o.source != null ? o.source : "";
    const speaker = o.speaker != null ? o.speaker : "";
    return `${source}␟${speaker}`;
}
/**
 * Canonicalize a relation's `relationType` so semantically identical edges
 * collapse on merge: trim → lowercase → drop empties → de-dupe → sort. This makes
 * the compound predicate order-insensitive, so `["uses","calls"]` and
 * `["calls","uses"]` map to one key.
 *
 * Accepts an array of strings, a single bare string (treated as a one-element
 * array), or null/undefined (treated as empty). Non-string elements are skipped
 * instead of throwing.
 * Pure — exported for tests.
 *
 * @param {string[]|string|null|undefined} types
 * @returns {string[]} sorted, de-duplicated, lowercased predicate list
 */
function canonicalizeRelationType(types) {
    if (types == null) {
        return [];
    }
    const list = Array.isArray(types) ? types : [types];
    const cleaned = [];
    for (const t of list) {
        if (typeof t !== "string") {
            continue;
        }
        const canonical = t.trim().toLowerCase();
        if (canonical) {
            cleaned.push(canonical);
        }
    }
    return Array.from(new Set(cleaned)).sort();
}
/**
 * Deduplicate observations while keeping per-source attribution intact: the
 * same fact asserted by two different sources/speakers stays as two observations.
 * Observations are bucketed by their provenance key and near-duplicates are only
 * collapsed inside a single bucket.
 *
 * Resolves to the input array itself when it holds at most one observation.
 */
function deduplicateObservations(observations, threshold, embeddingService, logger) {
    return __awaiter(this, void 0, void 0, function* () {
        if (observations.length <= 1) {
            return observations;
        }
        if (logger != null) {
            logger.debug(`Deduplicating ${observations.length} observations (provenance-aware)`);
        }
        // Bucket by provenance; Map iteration keeps first-seen bucket order.
        const byProvenance = new Map();
        for (const observation of observations) {
            const key = provenanceKey(observation);
            const bucket = byProvenance.get(key);
            if (bucket) {
                bucket.push(observation);
            }
            else {
                byProvenance.set(key, [observation]);
            }
        }
        const kept = [];
        for (const bucket of byProvenance.values()) {
            const survivors = yield dedupWithinProvenance(bucket, threshold, embeddingService, logger);
            kept.push(...survivors);
        }
        if (logger != null) {
            logger.debug(`Deduplicated to ${kept.length} observations (removed ${observations.length - kept.length}, across ${byProvenance.size} provenance group(s))`);
        }
        return kept;
    });
}
/**
 * Collapse near-duplicate observations that share the same provenance.
 *
 * Each observation's text is embedded one at a time. Two observations whose
 * embeddings have cosine similarity >= `threshold` are duplicates; the one with
 * the longer text is kept (the earlier one wins a length tie).
 *
 * An observation whose embedding could not be obtained — `embed` rejected, or
 * resolved to null/undefined — is always kept and never compared.
 *
 * @returns {Promise<Array>} surviving observations in their original order
 */
function dedupWithinProvenance(observations, threshold, embeddingService, logger) {
    return __awaiter(this, void 0, void 0, function* () {
        if (observations.length <= 1) {
            return observations;
        }
        const data = [];
        for (const obs of observations) {
            try {
                const embedding = yield embeddingService.embed(obs.text);
                // A provider that resolves to nothing is treated like a failed
                // embedding rather than crashing the `.length` checks below.
                data.push({ obs, embedding: embedding != null ? embedding : [] });
            }
            catch (error) {
                if (logger != null) {
                    logger.warn(`Failed to get embedding for observation: ${obs.text}`);
                }
                data.push({ obs, embedding: [] }); // keep it even if embedding fails
            }
        }
        const toRemove = new Set();
        for (let i = 0; i < data.length; i++) {
            if (toRemove.has(i) || data[i].embedding.length === 0) {
                continue;
            }
            for (let j = i + 1; j < data.length; j++) {
                if (toRemove.has(j) || data[j].embedding.length === 0) {
                    continue;
                }
                const similarity = (0, utils_1.cosineSimilarity)(data[i].embedding, data[j].embedding);
                if (similarity < threshold) {
                    continue;
                }
                // Keep the longer/more detailed observation (with its provenance).
                if (data[i].obs.text.length >= data[j].obs.text.length) {
                    toRemove.add(j);
                }
                else {
                    // `i` itself is dropped, so stop comparing it against the rest.
                    toRemove.add(i);
                    break;
                }
            }
        }
        return data.filter((_, index) => !toRemove.has(index)).map((d) => d.obs);
    });
}
/**
 * Normalize an entity name for the exact-match fast path.
 *
 * Steps: Unicode NFC normalization (so a precomposed "é" and "e" + combining
 * accent compare equal), lowercase, collapse every run of `_`, `-`, the dash
 * characters U+2010–U+2015 and whitespace into a single space, then trim.
 *
 * @param {string} name
 * @returns {string} normalized comparison key
 */
function normalizeEntityName(name) {
    return name
        .normalize("NFC")
        .toLowerCase()
        .replace(/[_\-\u2010-\u2015\s]+/g, " ")
        .trim();
}
/**
 * Comma-joined digit runs of a name ("Table 12 v2" → "12,2"); the empty string
 * when the name contains no digits. Names with differing signatures are vetoed
 * from fuzzy merging.
 */
function digitSignature(name) {
    const digitRuns = name.match(/\d+/g);
    return digitRuns === null ? "" : digitRuns.join(",");
}
/**
 * Locate an existing entity that `entity` should fold into, or null.
 *
 * Pass 1 — a normalized-exact name match always wins and returns immediately
 * with `sim: 1` and `method: "string-exact"`.
 * Pass 2 — only when `enableSimilarityMerging` is truthy: Jaro-Winkler fuzzy
 * matching, guarded because similar spelling is not co-reference:
 *   - names whose digit signatures differ never merge (Table 1 ≠ Table 2);
 *   - when both sides carry different entity types and neither is "other",
 *     the match must clear max(threshold, CrossTypeThreshold).
 * The highest-scoring candidate wins; the earliest one wins a tie.
 *
 * With `qualifyFileIdentity` set (the global stage), existing file/document
 * entities are skipped in both passes — those are matched by an exact name+file
 * key before this function is called, so a concept never fuses with a file
 * artifact (KG-13). The within-file pass leaves it false.
 */
function findSimilarEntity(entity, existingEntities, threshold, enableSimilarityMerging, qualifyFileIdentity = false) {
    const isExcluded = (candidate) => qualifyFileIdentity && FILE_IDENTITY_TYPES.has(candidate.entityType);
    const wanted = normalizeEntityName(entity.name);
    for (const [existingName, existing] of existingEntities) {
        if (!isExcluded(existing) && normalizeEntityName(existingName) === wanted) {
            return { name: existingName, sim: 1, method: "string-exact" };
        }
    }
    if (!enableSimilarityMerging) {
        return null;
    }
    const wantedDigits = digitSignature(entity.name);
    let best = null;
    for (const [existingName, existing] of existingEntities) {
        if (isExcluded(existing) || digitSignature(existingName) !== wantedDigits) {
            continue;
        }
        const typesDisagree = Boolean(entity.entityType) &&
            Boolean(existing.entityType) &&
            entity.entityType !== existing.entityType &&
            entity.entityType !== "other" &&
            existing.entityType !== "other";
        const required = typesDisagree ? Math.max(threshold, CrossTypeThreshold) : threshold;
        const similarity = (0, utils_1.jaroWinklerSimilarity)(entity.name, existingName);
        const clearsBar = similarity >= required;
        if (clearsBar && (best === null || similarity > best.sim)) {
            best = { name: existingName, sim: similarity, method: "string-jw" };
        }
    }
    return best;
}
/**
 * Merge-time supersession (KG-10, Graphiti "invalidate, don't delete").
 *
 * For each pair of observations that both carry a `validAt`, have different
 * `validAt` values, are not already expired, and that `checker.check` reports
 * as contradictory, the OLDER one is stamped in place:
 *   - `invalidAt` = the newer observation's `validAt`;
 *   - `expiredAt` = `now`.
 * Both observations are kept — nothing is removed from the array.
 *
 * `validAt` values are ordered with `<`; presumably they are ISO-8601 strings
 * in one consistent format — confirm against the producer.
 *
 * A missing or non-array `observations` (an entity that never had any) is a
 * no-op rather than a crash.
 *
 * @param {Array|undefined|null} observations mutated in place
 * @param {{check: Function}} checker `check(textA, textB)` resolves to `{ contradicts }`
 * @param {string} now timestamp written to `expiredAt`
 * @returns {Promise<void>}
 */
function applySupersession(observations, checker, now) {
    return __awaiter(this, void 0, void 0, function* () {
        if (!Array.isArray(observations) || observations.length < 2) {
            return;
        }
        for (let i = 0; i < observations.length; i++) {
            for (let j = i + 1; j < observations.length; j++) {
                const a = observations[i];
                const b = observations[j];
                if (!a.validAt || !b.validAt || a.validAt === b.validAt) {
                    continue; // not orderable in time
                }
                if (a.expiredAt || b.expiredAt) {
                    continue; // already superseded
                }
                const { contradicts } = yield checker.check(a.text, b.text);
                if (!contradicts) {
                    continue;
                }
                const older = a.validAt < b.validAt ? a : b;
                const newer = older === a ? b : a;
                older.invalidAt = newer.validAt;
                older.expiredAt = now;
            }
        }
    });
}
/**
 * Report one entity fusion to `options.onMergeRecord` (same JSONL shape as
 * canon's merges.jsonl). Nothing is emitted when no callback is configured or
 * when `winner` and `loser` are the same name.
 *
 * `cluster_id` is the first 12 hex characters of SHA-1 over `winner␟loser`.
 */
function recordFusion(options, winner, loser, match) {
    if (!options.onMergeRecord || winner === loser) {
        return;
    }
    const clusterId = crypto
        .createHash("sha1")
        .update(`${winner}␟${loser}`)
        .digest("hex")
        .slice(0, 12);
    const record = {
        cluster_id: clusterId,
        target: "entity",
        surface_forms: [winner, loser],
        canonical_chosen: winner,
        member_count: 2,
        method: match.method,
        intra_cluster_sim: { min: match.sim, max: match.sim },
        borderline_pairs: [],
        source_spans: [],
    };
    options.onMergeRecord(record);
}
/**
 * Hierarchically merge many knowledge graphs into one.
 *
 *   Step 1 — split every input graph into per-entity mini-graphs (the entity
 *            plus every relation touching it by name) and group them by the
 *            entity's first file ("unknown" when that is empty).
 *   Step 2 — merge the mini-graphs of each file with `mergeWithinFile`.
 *   Step 3 — merge the per-file results with `mergeGlobally`.
 *
 * After merging it logs cross-file linking stats, passes them to
 * `options.onMergeStats` when provided, and logs the vocabulary-fit metric.
 *
 * @returns {Promise<{entities: Array, relations: Array}>} the merged graph
 */
function mergeKnowledgeGraphs(graphs, options, embeddingService, logger) {
    return __awaiter(this, void 0, void 0, function* () {
        if (logger != null) {
            logger.info(`Starting hierarchical merge of ${graphs.length} knowledge graphs`);
            logger.info(`Entity similarity threshold: ${options.entitySimilarityThreshold}`);
            logger.info(`Observation similarity threshold: ${options.observationSimilarityThreshold}`);
        }
        // Step 1: Group graphs by file
        const graphsByFile = new Map();
        for (const graph of graphs) {
            // Index this graph's relations by endpoint name once, instead of
            // re-scanning every relation for every entity. Each list keeps the
            // relations in their original order, and a self-referencing relation
            // is listed once.
            const relationsByEndpoint = new Map();
            const indexUnder = (endpoint, relation) => {
                const list = relationsByEndpoint.get(endpoint);
                if (list) {
                    list.push(relation);
                }
                else {
                    relationsByEndpoint.set(endpoint, [relation]);
                }
            };
            for (const relation of graph.relations) {
                indexUnder(relation.from, relation);
                if (relation.to !== relation.from) {
                    indexUnder(relation.to, relation);
                }
            }
            for (const entity of graph.entities) {
                const file = entity.files[0] || "unknown";
                if (!graphsByFile.has(file)) {
                    graphsByFile.set(file, []);
                }
                // Create a mini-graph for this entity and related relations
                const touching = relationsByEndpoint.get(entity.name);
                graphsByFile.get(file).push({
                    entities: [entity],
                    relations: touching ? touching.slice() : [],
                });
            }
        }
        if (logger != null) {
            logger.info(`Step 1: Grouped into ${graphsByFile.size} files`);
        }
        // Step 2: Merge entities within each file
        const mergedByFile = new Map();
        for (const [file, fileGraphs] of graphsByFile) {
            if (logger != null) {
                logger.debug(`Step 2: Merging ${fileGraphs.length} entities in file: ${file}`);
            }
            const fileMerged = yield mergeWithinFile(fileGraphs, file, options, embeddingService, logger);
            mergedByFile.set(file, fileMerged);
            if (logger != null) {
                logger.debug(`File ${file}: ${fileMerged.entities.length} entities, ${fileMerged.relations.length} relations`);
            }
        }
        // Step 3: Global merge across files
        if (logger != null) {
            logger.info(`Step 3: Global merge across ${mergedByFile.size} files`);
        }
        const globalGraphs = Array.from(mergedByFile.values());
        const { graph: finalResult, stats } = yield mergeGlobally(globalGraphs, options, embeddingService, logger);
        if (logger != null) {
            logger.info(`Hierarchical merge complete: ${finalResult.entities.length} entities, ${finalResult.relations.length} relations`);
            // Cross-file linking health (KG-04) — the recall signal "0 dangling" used to hide.
            logger.info(`Cross-file linking: ${stats.crossFileEdges} edge(s) link entities across files; ` +
                `${stats.droppedDanglingEdges} relation(s) dropped as dangling at the global stage`);
        }
        if (options.onMergeStats != null) {
            options.onMergeStats(stats);
        }
        logVocabularyFit(finalResult, logger);
        return finalResult;
    });
}
/**
 * Closed-vocabulary fit metric (Dove's guardrail for the v5 enums): how often
 * the model fell back to a catch-all instead of a specific type/predicate.
 *
 * Counts relations whose every predicate is `related_to` (a bare, non-array
 * `relationType` is treated as a one-element list) and entities typed `other`,
 * then logs both counts with one-decimal percentages at info level. Logs
 * nothing for a graph with no relations and no entities.
 *
 * A high `related_to` fraction (north of ~15–20%) suggests the closed predicate
 * set is too tight for this corpus, not that the corpus is weird.
 */
function logVocabularyFit(graph, logger) {
    const { relations, entities } = graph;
    if (relations.length === 0 && entities.length === 0) {
        return;
    }
    let catchAllRelations = 0;
    for (const relation of relations) {
        const predicates = Array.isArray(relation.relationType)
            ? relation.relationType
            : [relation.relationType];
        if (predicates.length > 0 && predicates.every((p) => p === "related_to")) {
            catchAllRelations += 1;
        }
    }
    let catchAllEntities = 0;
    for (const entity of entities) {
        if (entity.entityType === "other") {
            catchAllEntities += 1;
        }
    }
    // Percentage with one decimal; an empty denominator reports "0.0".
    const percent = (part, whole) => (whole ? ((100 * part) / whole).toFixed(1) : "0.0");
    const relationPct = percent(catchAllRelations, relations.length);
    const entityPct = percent(catchAllEntities, entities.length);
    if (logger != null) {
        logger.info(`Vocabulary fit: ${catchAllRelations}/${relations.length} relations → 'related_to' (${relationPct}%), ` +
            `${catchAllEntities}/${entities.length} entities → 'other' (${entityPct}%)`);
    }
}
/**
 * Merge the mini-graphs extracted from a single file into one graph.
 *
 * Uses the same similarity threshold as the global pass — the old "stricter for
 * same-file" heuristic (×0.7, cap 0.6) fused unrelated short names
 * (garlic↔Anthropic at JW 0.704); same-file proximity is not evidence of
 * co-reference.
 *
 * Entities: each incoming entity either folds into an existing one (found via
 * `findSimilarEntity`) or is added as a shallow copy with `file: fileName`.
 * On a fold the observations are combined and de-duplicated, `chunk` keeps the
 * minimum, `totalChunks` keeps the maximum, and a merge record is emitted.
 * Entity type on a fold: a missing or catch-all (`other`) type on the existing
 * entity is replaced by the incoming type, so a specific type is never masked
 * by an earlier `other` — the same rule the global stage applies.
 *
 * Relations: endpoints are re-keyed through the rename map only; self-loops are
 * dropped and duplicates (same endpoints + canonical predicate list) collapse.
 * Referential integrity is NOT enforced here (KG-04): a relation may point at an
 * entity defined in another file, which is only visible at the global stage.
 *
 * @returns {Promise<{entities: Array, relations: Array}>}
 */
function mergeWithinFile(fileGraphs, fileName, options, embeddingService, logger) {
    return __awaiter(this, void 0, void 0, function* () {
        const entityMap = new Map();
        const relationSet = new Set();
        const relations = [];
        const threshold = options.entitySimilarityThreshold || DefaultSimilarityThreshold;
        const enableSimilarity = options.enableSimilarityMerging !== false;
        // Every incoming surface form → its final entity key; relations re-key through
        // this map only (never through an independent fuzzy lookup).
        const rename = new Map();
        // Merge entities within the file
        for (const graph of fileGraphs) {
            for (const entity of graph.entities) {
                const match = findSimilarEntity(entity, entityMap, threshold, enableSimilarity);
                if (!match) {
                    // Add as new entity
                    rename.set(entity.name, entity.name);
                    entityMap.set(entity.name, Object.assign({}, entity, { file: fileName }));
                    continue;
                }
                rename.set(entity.name, match.name);
                recordFusion(options, match.name, entity.name, match);
                const existing = entityMap.get(match.name);
                if (logger != null) {
                    logger.debug(`[${fileName}] Merging entity "${entity.name}" with existing "${match.name}"`);
                }
                // Combine observations, then deduplicate them using embeddings
                const allObservations = [
                    ...(existing.observations || []),
                    ...(entity.observations || []),
                ];
                if (allObservations.length > 0) {
                    existing.observations = yield deduplicateObservations(allObservations, options.observationSimilarityThreshold || DefaultObservationThreshold, embeddingService, logger);
                }
                // A specific type beats a missing or catch-all one; otherwise the
                // first-seen type is kept.
                const existingIsWeak = !existing.entityType || existing.entityType === ENTITY_CATCH_ALL;
                if (existingIsWeak && (entity.entityType || !existing.entityType)) {
                    existing.entityType = entity.entityType;
                }
                // Merge chunk information (keep the earliest chunk index)
                if (entity.chunk !== undefined) {
                    existing.chunk =
                        existing.chunk !== undefined
                            ? Math.min(existing.chunk, entity.chunk)
                            : entity.chunk;
                }
                if (entity.totalChunks !== undefined) {
                    existing.totalChunks = Math.max(existing.totalChunks || 0, entity.totalChunks);
                }
            }
        }
        // Merge relations within the file, re-keying endpoints through the rename map.
        // All re-keyed, non-self-loop relations pass through; mergeGlobally is the sole
        // endpoint-existence gate (KG-04).
        for (const graph of fileGraphs) {
            for (const relation of graph.relations) {
                const renamedFrom = rename.get(relation.from);
                const renamedTo = rename.get(relation.to);
                const fromEntity = renamedFrom != null ? renamedFrom : relation.from;
                const toEntity = renamedTo != null ? renamedTo : relation.to;
                // Drop self-loops (X→X): an extraction artifact, and merging names can
                // also create one when both endpoints collapse to the same entity.
                if (fromEntity === toEntity) {
                    continue;
                }
                const relationType = canonicalizeRelationType(relation.relationType);
                const relationKey = `${fromEntity}->${toEntity}:${relationType.join(",")}`;
                if (relationSet.has(relationKey)) {
                    continue;
                }
                relationSet.add(relationKey);
                // Carry over only the optional fields the source relation actually has.
                const edge = { from: fromEntity, to: toEntity, relationType };
                if (relation.sourceSpan) {
                    edge.sourceSpan = relation.sourceSpan;
                }
                if (relation.validAt) {
                    edge.validAt = relation.validAt;
                }
                if (relation.source) {
                    edge.source = relation.source;
                }
                if (relation.resolved !== undefined) {
                    edge.resolved = relation.resolved;
                }
                if (relation.faithfulness) {
                    edge.faithfulness = relation.faithfulness;
                }
                if (relation.faithfulnessScore !== undefined) {
                    edge.faithfulnessScore = relation.faithfulnessScore;
                }
                if (relation.supportingSpan) {
                    edge.supportingSpan = relation.supportingSpan;
                }
                relations.push(edge);
            }
        }
        return {
            entities: Array.from(entityMap.values()),
            relations: relations,
        };
    });
}
384
+ const ENTITY_CATCH_ALL = "other";
385
+ /**
386
+ * Entity types that denote a *file/document artifact* rather than a concept
387
+ * (KG-13). Two `package.json` (or `index.ts`, or a `document` per paper) in
388
+ * different files are distinct artifacts that must NOT fuse, whereas a `function`
389
+ * or `concept` of the same name across files is the same thing and *should* merge
390
+ * (the whole point of global cross-file linking). So identity is name+file for
391
+ * these types only.
392
+ */
393
+ const FILE_IDENTITY_TYPES = new Set(["file", "document"]);
394
+ /** Field separator for a name+file qualified identity key: `␟` (U+241F, the visible "symbol for unit separator" glyph, not the U+001F control character). */
395
+ const ID_SEP = "␟";
/**
 * Global-merge identity key for an entity.
 *
 * Conceptual entities use their bare name, so same-name concepts merge across
 * files. File-identity types (see FILE_IDENTITY_TYPES) use
 * `name␟primaryFile`, with "unknown" standing in for a missing first file, so
 * same-name file artifacts in different files stay distinct. The two key spaces
 * only stay disjoint as long as bare names never contain `␟`.
 */
function entityIdentityKey(entity) {
    if (!FILE_IDENTITY_TYPES.has(entity.entityType)) {
        return entity.name;
    }
    const primaryFile = entity.files[0];
    const fileKey = primaryFile != null ? primaryFile : "unknown";
    return `${entity.name}${ID_SEP}${fileKey}`;
}
/**
 * Elect a merged entity's type from every type its fused surface forms carried
 * (KG-13).
 *
 * Rules, in order:
 *   1. falsy votes are ignored;
 *   2. if any specific (non-catch-all) type was voted, only those compete;
 *   3. the most frequent type wins, ties going to the one seen first;
 *   4. with no usable vote at all the catch-all type is returned.
 *
 * Replaces the old "longest string wins" heuristic, under which `other`(5) beat
 * `file`(4) and `organization` always beat `person`.
 */
function electEntityType(types) {
    const voted = types.filter(Boolean);
    const specific = voted.filter((t) => t !== ENTITY_CATCH_ALL);
    const pool = specific.length > 0 ? specific : voted;
    if (pool.length === 0) {
        return ENTITY_CATCH_ALL;
    }
    const tally = new Map();
    for (const t of pool) {
        tally.set(t, (tally.has(t) ? tally.get(t) : 0) + 1);
    }
    // Map iteration follows first-insertion order, so a strict `>` keeps the
    // earliest-seen type on a tie.
    let winner = pool[0];
    let winnerVotes = 0;
    for (const [candidate, votes] of tally) {
        if (votes > winnerVotes) {
            winnerVotes = votes;
            winner = candidate;
        }
    }
    return winner;
}
/**
 * Global merge across the per-file graphs. This is the sole
 * referential-integrity gate (KG-04): the within-file pass defers here, where
 * every entity across all files is visible.
 *
 * Entities
 *   - File-identity entities (file/document) merge only with an entity already
 *     claimed under the same `entityIdentityKey` (name + first file); otherwise
 *     they get a unique output name, disambiguated with ` [file]` / `#n` only
 *     when the bare name is already taken.
 *   - Conceptual entities merge by normalized name / similarity via
 *     `findSimilarEntity` (never with a file artifact); an unmatched one whose
 *     bare name is taken is disambiguated rather than overwriting.
 *   - On a merge: observations are combined and de-duplicated, the type is
 *     recorded as a vote, files are unioned, `chunk` keeps the minimum and
 *     `totalChunks` the maximum.
 *
 * Relations
 *   - Endpoints resolve through the owning graph's rename map first, then the
 *     global conceptual rename map, then fall back to the raw name (KG-13).
 *   - Self-loops are skipped; a relation with an endpoint that is not in the
 *     entity map is counted as dangling and dropped; duplicates collapse.
 *
 * Finalization: each entity's type is elected from its votes, `files` becomes
 * the union (without "unknown" unless that is all there is), and — when
 * `options.contradictionChecker` is set — supersession is applied (KG-10).
 *
 * @returns {Promise<{graph: {entities: Array, relations: Array},
 *                    stats: {crossFileEdges: number, droppedDanglingEdges: number}}>}
 */
function mergeGlobally(fileGraphs, options, embeddingService, logger) {
    return __awaiter(this, void 0, void 0, function* () {
        const entityMap = new Map();
        const relationSet = new Set();
        const relations = [];
        // Track which files each entity appears in
        const entityFileMap = new Map();
        // Every entityType each fused surface form carried → elected at end-of-merge (KG-13).
        const entityTypeVotes = new Map();
        const globalSimilarityThreshold = options.entitySimilarityThreshold || DefaultSimilarityThreshold;
        const enableSimilarity = options.enableSimilarityMerging !== false;
        // Relation re-keying is PER GRAPH (KG-13): a file artifact's bare name is
        // ambiguous across files, so each graph's relations resolve endpoints against
        // that graph's own surface-name → output-name map; conceptual names also fall
        // back to a global map for genuine cross-file references.
        const renamePerGraph = [];
        const globalConceptualRename = new Map();
        // For file-identity entities, identity key → the output name already assigned,
        // so the same artifact re-extracted (e.g. across chunks) merges into one entity.
        const idKeyToName = new Map();
        if (logger != null) {
            logger.debug(`Global similarity threshold: ${globalSimilarityThreshold}`);
        }
        // Assign a unique output name, disambiguating only when the bare name is
        // already taken (so the common single-project case keeps the clean
        // `package.json`, but two projects' don't collide → no data loss).
        const uniqueName = (name, file) => {
            if (!entityMap.has(name)) {
                return name;
            }
            const base = file ? `${name} [${file}]` : name;
            let candidate = base;
            let i = 2;
            while (entityMap.has(candidate)) {
                candidate = `${base}#${i++}`;
            }
            return candidate;
        };
        // Merge entities across files
        for (const graph of fileGraphs) {
            const localRename = new Map();
            renamePerGraph.push(localRename);
            for (const entity of graph.entities) {
                const fileIdentity = FILE_IDENTITY_TYPES.has(entity.entityType);
                // Resolve which existing entity (if any) this one merges into, as an
                // output name.
                let outName;
                let isNew;
                let match = null;
                if (fileIdentity) {
                    // Same key derivation as everywhere else: name + first file.
                    const idKey = entityIdentityKey(entity);
                    const claimed = idKeyToName.get(idKey);
                    // Presence test, so an empty-string output name still counts as claimed.
                    if (claimed !== undefined) {
                        outName = claimed;
                        isNew = false;
                    }
                    else {
                        outName = uniqueName(entity.name, entity.files[0]);
                        idKeyToName.set(idKey, outName);
                        isNew = true;
                    }
                }
                else {
                    match = findSimilarEntity(entity, entityMap, globalSimilarityThreshold, enableSimilarity, true);
                    if (match) {
                        outName = match.name;
                        isNew = false;
                    }
                    else {
                        // A conceptual entity that clashes with a file artifact holding the
                        // bare name gets disambiguated rather than overwriting it.
                        outName = uniqueName(entity.name);
                        isNew = true;
                    }
                }
                localRename.set(entity.name, outName);
                if (!fileIdentity) {
                    globalConceptualRename.set(entity.name, outName);
                }
                const entityFiles = entity.files.length ? entity.files : ["unknown"];
                if (isNew) {
                    entityMap.set(outName, Object.assign({}, entity, { name: outName }));
                    entityFileMap.set(outName, new Set(entityFiles));
                    entityTypeVotes.set(outName, [entity.entityType]);
                    continue;
                }
                const existing = entityMap.get(outName);
                // Only a genuinely different surface form fused is merge-log-worthy.
                if (match && existing.name !== entity.name) {
                    recordFusion(options, outName, entity.name, match);
                }
                if (logger != null) {
                    logger.debug(`[Global] Merging entity "${entity.name}" (${entity.files[0]}) into "${outName}" (${existing.files[0]})`);
                }
                const allObservations = [
                    ...(existing.observations || []),
                    ...(entity.observations || []),
                ];
                if (allObservations.length > 0) {
                    existing.observations = yield deduplicateObservations(allObservations, options.observationSimilarityThreshold || DefaultObservationThreshold, embeddingService, logger);
                }
                // Vote this surface form's type; the winner is elected at end-of-merge (KG-13).
                entityTypeVotes.get(outName).push(entity.entityType);
                for (const f of entityFiles) {
                    entityFileMap.get(outName).add(f);
                }
                if (entity.chunk !== undefined) {
                    existing.chunk =
                        existing.chunk !== undefined ? Math.min(existing.chunk, entity.chunk) : entity.chunk;
                }
                if (entity.totalChunks !== undefined) {
                    existing.totalChunks = Math.max(existing.totalChunks || 0, entity.totalChunks);
                }
            }
        }
        // Merge relations across files, re-keying endpoints through the rename maps.
        // An endpoint missing here resolved to no entity in ANY file, so it's a true
        // dangler (KG-04).
        let droppedDanglingEdges = 0;
        let crossFileEdges = 0;
        // First file of a node, or "" when it has none.
        const primaryFileOf = (node) => (node.files != null && node.files[0] != null ? node.files[0] : "");
        for (let gi = 0; gi < fileGraphs.length; gi++) {
            const localRename = renamePerGraph[gi];
            // THIS graph's name map first (so a file artifact resolves to the right
            // disambiguated entity), then the global conceptual fallback (KG-13).
            const resolveEndpoint = (name) => {
                const local = localRename.get(name);
                if (local != null) {
                    return local;
                }
                const global = globalConceptualRename.get(name);
                return global != null ? global : name;
            };
            for (const relation of fileGraphs[gi].relations) {
                const fromEntity = resolveEndpoint(relation.from);
                const toEntity = resolveEndpoint(relation.to);
                // Drop self-loops (X→X): an extraction artifact, and cross-file name
                // mapping can also collapse both endpoints onto the same entity.
                if (fromEntity === toEntity) {
                    continue;
                }
                const fromNode = entityMap.get(fromEntity);
                const toNode = entityMap.get(toEntity);
                if (!fromNode || !toNode) {
                    droppedDanglingEdges++;
                    continue;
                }
                const relationType = canonicalizeRelationType(relation.relationType);
                const relationKey = `${fromEntity}->${toEntity}:${relationType.join(",")}`;
                if (relationSet.has(relationKey)) {
                    continue;
                }
                relationSet.add(relationKey);
                // Count once per unique surviving edge whose endpoints were first
                // defined in different files.
                if (primaryFileOf(fromNode) !== primaryFileOf(toNode)) {
                    crossFileEdges++;
                }
                // Carry over only the optional fields the source relation actually has.
                const edge = { from: fromEntity, to: toEntity, relationType };
                if (relation.sourceSpan) {
                    edge.sourceSpan = relation.sourceSpan;
                }
                if (relation.validAt) {
                    edge.validAt = relation.validAt;
                }
                if (relation.source) {
                    edge.source = relation.source;
                }
                if (relation.resolved !== undefined) {
                    edge.resolved = relation.resolved;
                }
                if (relation.faithfulness) {
                    edge.faithfulness = relation.faithfulness;
                }
                if (relation.faithfulnessScore !== undefined) {
                    edge.faithfulnessScore = relation.faithfulnessScore;
                }
                if (relation.supportingSpan) {
                    edge.supportingSpan = relation.supportingSpan;
                }
                relations.push(edge);
            }
        }
        if (droppedDanglingEdges > 0 && logger != null) {
            logger.info(`Global merge dropped ${droppedDanglingEdges} relation(s) whose endpoints resolved to no entity (true danglers)`);
        }
        // Log cross-file entity statistics
        const crossFileEntities = Array.from(entityFileMap.entries()).filter(([_, files]) => files.size > 1);
        if (crossFileEntities.length > 0 && logger != null) {
            logger.info(`Found ${crossFileEntities.length} entities appearing across multiple files:`);
            for (const [entityName, files] of crossFileEntities) {
                logger.debug(` ${entityName}: ${Array.from(files).join(", ")}`);
            }
        }
        // Finalize each merged entity: elect its type from all votes, write back the
        // cross-file files[] union (KG-13), and run merge-time supersession over its
        // observations when enabled (KG-10).
        const supersessionNow = new Date().toISOString();
        for (const [key, entity] of entityMap) {
            const votes = entityTypeVotes.get(key);
            entity.entityType = electEntityType(votes != null ? votes : [entity.entityType]);
            const files = entityFileMap.get(key);
            if (files && files.size > 0) {
                const allFiles = Array.from(files);
                const knownFiles = allFiles.filter((f) => f !== "unknown");
                // Keep "unknown" only when it is the sole file on record.
                entity.files = knownFiles.length > 0 ? knownFiles : allFiles;
            }
            if (options.contradictionChecker) {
                yield applySupersession(entity.observations, options.contradictionChecker, supersessionNow);
            }
        }
        return {
            graph: {
                entities: Array.from(entityMap.values()),
                relations: relations,
            },
            stats: { crossFileEdges, droppedDanglingEdges },
        };
    });
}
624
+ //# sourceMappingURL=KnowledgeMerger.js.map