@wanshi-kg/wanshi 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (443):
  1. package/LICENSE +21 -0
  2. package/README.md +458 -0
  3. package/dist/__tests__/helpers.js +27 -0
  4. package/dist/__tests__/helpers.js.map +1 -0
  5. package/dist/cli/commands/export.command.js +99 -0
  6. package/dist/cli/commands/export.command.js.map +1 -0
  7. package/dist/cli/commands/index.js +22 -0
  8. package/dist/cli/commands/index.js.map +1 -0
  9. package/dist/cli/commands/inspectMerges.command.js +84 -0
  10. package/dist/cli/commands/inspectMerges.command.js.map +1 -0
  11. package/dist/cli/commands/metrics.command.js +196 -0
  12. package/dist/cli/commands/metrics.command.js.map +1 -0
  13. package/dist/cli/commands/process.command.js +82 -0
  14. package/dist/cli/commands/process.command.js.map +1 -0
  15. package/dist/cli/commands/watch.command.js +91 -0
  16. package/dist/cli/commands/watch.command.js.map +1 -0
  17. package/dist/cli/index.js +269 -0
  18. package/dist/cli/index.js.map +1 -0
  19. package/dist/cli/optionsToConfig.js +160 -0
  20. package/dist/cli/optionsToConfig.js.map +1 -0
  21. package/dist/config/index.js +59 -0
  22. package/dist/config/index.js.map +1 -0
  23. package/dist/config/legacyHints.js +113 -0
  24. package/dist/config/legacyHints.js.map +1 -0
  25. package/dist/config/schema.js +803 -0
  26. package/dist/config/schema.js.map +1 -0
  27. package/dist/config/ui.js +221 -0
  28. package/dist/config/ui.js.map +1 -0
  29. package/dist/core/DirectoryProcessor.js +725 -0
  30. package/dist/core/DirectoryProcessor.js.map +1 -0
  31. package/dist/core/adapters/IStructuredAdapter.js +3 -0
  32. package/dist/core/adapters/IStructuredAdapter.js.map +1 -0
  33. package/dist/core/adapters/SqliteAdapter.js +267 -0
  34. package/dist/core/adapters/SqliteAdapter.js.map +1 -0
  35. package/dist/core/adapters/StructuredAdapterRegistry.js +31 -0
  36. package/dist/core/adapters/StructuredAdapterRegistry.js.map +1 -0
  37. package/dist/core/adapters/index.js +20 -0
  38. package/dist/core/adapters/index.js.map +1 -0
  39. package/dist/core/checkpoint/CheckpointService.js +188 -0
  40. package/dist/core/checkpoint/CheckpointService.js.map +1 -0
  41. package/dist/core/checkpoint/index.js +18 -0
  42. package/dist/core/checkpoint/index.js.map +1 -0
  43. package/dist/core/corpus/CorpusAnalyzer.js +266 -0
  44. package/dist/core/corpus/CorpusAnalyzer.js.map +1 -0
  45. package/dist/core/corpus/CorpusProfileStore.js +92 -0
  46. package/dist/core/corpus/CorpusProfileStore.js.map +1 -0
  47. package/dist/core/corpus/index.js +21 -0
  48. package/dist/core/corpus/index.js.map +1 -0
  49. package/dist/core/corpus/normalizeGlossary.js +60 -0
  50. package/dist/core/corpus/normalizeGlossary.js.map +1 -0
  51. package/dist/core/corpus/relPath.js +52 -0
  52. package/dist/core/corpus/relPath.js.map +1 -0
  53. package/dist/core/corpus/termFrequency.js +86 -0
  54. package/dist/core/corpus/termFrequency.js.map +1 -0
  55. package/dist/core/cost/CostMeter.js +235 -0
  56. package/dist/core/cost/CostMeter.js.map +1 -0
  57. package/dist/core/cost/index.js +19 -0
  58. package/dist/core/cost/index.js.map +1 -0
  59. package/dist/core/cost/prices.js +38 -0
  60. package/dist/core/cost/prices.js.map +1 -0
  61. package/dist/core/cv/ObjectDetectionService.js +119 -0
  62. package/dist/core/cv/ObjectDetectionService.js.map +1 -0
  63. package/dist/core/di/ContainerFactory.js +670 -0
  64. package/dist/core/di/ContainerFactory.js.map +1 -0
  65. package/dist/core/di/DIContainer.js +103 -0
  66. package/dist/core/di/DIContainer.js.map +1 -0
  67. package/dist/core/di/index.js +19 -0
  68. package/dist/core/di/index.js.map +1 -0
  69. package/dist/core/errors/CustomErrors.js +342 -0
  70. package/dist/core/errors/CustomErrors.js.map +1 -0
  71. package/dist/core/errors/index.js +18 -0
  72. package/dist/core/errors/index.js.map +1 -0
  73. package/dist/core/export/KnowledgeGraphExportService.js +56 -0
  74. package/dist/core/export/KnowledgeGraphExportService.js.map +1 -0
  75. package/dist/core/export/index.js +19 -0
  76. package/dist/core/export/index.js.map +1 -0
  77. package/dist/core/export/strategies/GraphitiExportStrategy.js +115 -0
  78. package/dist/core/export/strategies/GraphitiExportStrategy.js.map +1 -0
  79. package/dist/core/export/strategies/GraphvizDotExportStrategy.js +331 -0
  80. package/dist/core/export/strategies/GraphvizDotExportStrategy.js.map +1 -0
  81. package/dist/core/export/strategies/IExportStrategy.js +3 -0
  82. package/dist/core/export/strategies/IExportStrategy.js.map +1 -0
  83. package/dist/core/export/strategies/JsonExportStrategy.js +19 -0
  84. package/dist/core/export/strategies/JsonExportStrategy.js.map +1 -0
  85. package/dist/core/export/strategies/JsonlExportStrategy.js +69 -0
  86. package/dist/core/export/strategies/JsonlExportStrategy.js.map +1 -0
  87. package/dist/core/export/strategies/KblamExportStrategy.js +36 -0
  88. package/dist/core/export/strategies/KblamExportStrategy.js.map +1 -0
  89. package/dist/core/export/strategies/LoraExportStrategy.js +46 -0
  90. package/dist/core/export/strategies/LoraExportStrategy.js.map +1 -0
  91. package/dist/core/export/strategies/McpExportStrategy.js +67 -0
  92. package/dist/core/export/strategies/McpExportStrategy.js.map +1 -0
  93. package/dist/core/export/strategies/index.js +25 -0
  94. package/dist/core/export/strategies/index.js.map +1 -0
  95. package/dist/core/export/strategies/kbTriples.js +60 -0
  96. package/dist/core/export/strategies/kbTriples.js.map +1 -0
  97. package/dist/core/index.js +22 -0
  98. package/dist/core/index.js.map +1 -0
  99. package/dist/core/knowledge/KnowledgeGraphBuilder.js +627 -0
  100. package/dist/core/knowledge/KnowledgeGraphBuilder.js.map +1 -0
  101. package/dist/core/knowledge/MergeRecord.js +3 -0
  102. package/dist/core/knowledge/MergeRecord.js.map +1 -0
  103. package/dist/core/knowledge/canon/Canonicalizer.js +414 -0
  104. package/dist/core/knowledge/canon/Canonicalizer.js.map +1 -0
  105. package/dist/core/knowledge/canon/index.js +18 -0
  106. package/dist/core/knowledge/canon/index.js.map +1 -0
  107. package/dist/core/knowledge/contradiction/HeuristicContradictionChecker.js +92 -0
  108. package/dist/core/knowledge/contradiction/HeuristicContradictionChecker.js.map +1 -0
  109. package/dist/core/knowledge/contradiction/LlmContradictionChecker.js +52 -0
  110. package/dist/core/knowledge/contradiction/LlmContradictionChecker.js.map +1 -0
  111. package/dist/core/knowledge/contradiction/index.js +19 -0
  112. package/dist/core/knowledge/contradiction/index.js.map +1 -0
  113. package/dist/core/knowledge/grounding/KeywordGroundingChecker.js +33 -0
  114. package/dist/core/knowledge/grounding/KeywordGroundingChecker.js.map +1 -0
  115. package/dist/core/knowledge/grounding/MiniCheckGroundingChecker.js +82 -0
  116. package/dist/core/knowledge/grounding/MiniCheckGroundingChecker.js.map +1 -0
  117. package/dist/core/knowledge/grounding/index.js +20 -0
  118. package/dist/core/knowledge/grounding/index.js.map +1 -0
  119. package/dist/core/knowledge/grounding/verbalize.js +38 -0
  120. package/dist/core/knowledge/grounding/verbalize.js.map +1 -0
  121. package/dist/core/knowledge/images/imageMetaGraph.js +136 -0
  122. package/dist/core/knowledge/images/imageMetaGraph.js.map +1 -0
  123. package/dist/core/knowledge/index.js +20 -0
  124. package/dist/core/knowledge/index.js.map +1 -0
  125. package/dist/core/knowledge/merging/KnowledgeMerger.js +624 -0
  126. package/dist/core/knowledge/merging/KnowledgeMerger.js.map +1 -0
  127. package/dist/core/knowledge/references/ReferenceResolver.js +184 -0
  128. package/dist/core/knowledge/references/ReferenceResolver.js.map +1 -0
  129. package/dist/core/knowledge/references/citations/CitationEvidenceProcessor.js +401 -0
  130. package/dist/core/knowledge/references/citations/CitationEvidenceProcessor.js.map +1 -0
  131. package/dist/core/knowledge/references/citations/CitationResolver.js +95 -0
  132. package/dist/core/knowledge/references/citations/CitationResolver.js.map +1 -0
  133. package/dist/core/knowledge/references/citations/GrobidClient.js +143 -0
  134. package/dist/core/knowledge/references/citations/GrobidClient.js.map +1 -0
  135. package/dist/core/knowledge/references/citations/TitleIdResolver.js +101 -0
  136. package/dist/core/knowledge/references/citations/TitleIdResolver.js.map +1 -0
  137. package/dist/core/knowledge/references/web/FetchCacheService.js +114 -0
  138. package/dist/core/knowledge/references/web/FetchCacheService.js.map +1 -0
  139. package/dist/core/knowledge/references/web/GatedFetcher.js +228 -0
  140. package/dist/core/knowledge/references/web/GatedFetcher.js.map +1 -0
  141. package/dist/core/knowledge/references/web/WebReferenceProcessor.js +164 -0
  142. package/dist/core/knowledge/references/web/WebReferenceProcessor.js.map +1 -0
  143. package/dist/core/knowledge/search/KnowledgeGraphSearch.js +261 -0
  144. package/dist/core/knowledge/search/KnowledgeGraphSearch.js.map +1 -0
  145. package/dist/core/knowledge/vocabulary.js +162 -0
  146. package/dist/core/knowledge/vocabulary.js.map +1 -0
  147. package/dist/core/llm/EmbeddingService.js +113 -0
  148. package/dist/core/llm/EmbeddingService.js.map +1 -0
  149. package/dist/core/llm/OllamaService.js +146 -0
  150. package/dist/core/llm/OllamaService.js.map +1 -0
  151. package/dist/core/llm/OpenAICompatibleService.js +190 -0
  152. package/dist/core/llm/OpenAICompatibleService.js.map +1 -0
  153. package/dist/core/llm/OpenAIEmbeddingService.js +129 -0
  154. package/dist/core/llm/OpenAIEmbeddingService.js.map +1 -0
  155. package/dist/core/llm/embeddingUtils.js +25 -0
  156. package/dist/core/llm/embeddingUtils.js.map +1 -0
  157. package/dist/core/llm/index.js +23 -0
  158. package/dist/core/llm/index.js.map +1 -0
  159. package/dist/core/llm/prompts/PromptManager.js +388 -0
  160. package/dist/core/llm/prompts/PromptManager.js.map +1 -0
  161. package/dist/core/llm/prompts/PromptTemplateEngine.js +257 -0
  162. package/dist/core/llm/prompts/PromptTemplateEngine.js.map +1 -0
  163. package/dist/core/llm/prompts/templates/partials/examples/EXAMPLE_STYLE_GUIDE.md +84 -0
  164. package/dist/core/llm/prompts/templates/partials/examples/article.md +187 -0
  165. package/dist/core/llm/prompts/templates/partials/examples/code.md +229 -0
  166. package/dist/core/llm/prompts/templates/partials/examples/communication.md +205 -0
  167. package/dist/core/llm/prompts/templates/partials/examples/documentation.md +262 -0
  168. package/dist/core/llm/prompts/templates/partials/examples/financial.md +157 -0
  169. package/dist/core/llm/prompts/templates/partials/examples/legal.md +153 -0
  170. package/dist/core/llm/prompts/templates/partials/examples/logs.md +127 -0
  171. package/dist/core/llm/prompts/templates/partials/examples/medical.md +218 -0
  172. package/dist/core/llm/prompts/templates/partials/examples/notes.md +201 -0
  173. package/dist/core/llm/prompts/templates/partials/examples/research.md +208 -0
  174. package/dist/core/llm/prompts/templates/partials/examples/tabular.md +178 -0
  175. package/dist/core/llm/prompts/templates/partials/examples/transcript.md +204 -0
  176. package/dist/core/llm/prompts/templates/partials/retrieved-context.hbs +18 -0
  177. package/dist/core/llm/prompts/templates/v1/system.hbs +371 -0
  178. package/dist/core/llm/prompts/templates/v1/user.hbs +20 -0
  179. package/dist/core/llm/prompts/templates/v2/system.hbs +573 -0
  180. package/dist/core/llm/prompts/templates/v2/user.hbs +20 -0
  181. package/dist/core/llm/prompts/templates/v3/system.hbs +861 -0
  182. package/dist/core/llm/prompts/templates/v3/user.hbs +16 -0
  183. package/dist/core/llm/prompts/templates/v4/system.hbs +800 -0
  184. package/dist/core/llm/prompts/templates/v4/user.hbs +40 -0
  185. package/dist/core/llm/prompts/templates/v4.5/system.hbs +71 -0
  186. package/dist/core/llm/prompts/templates/v4.5/user.hbs +46 -0
  187. package/dist/core/llm/prompts/templates/v5/glossary/system.hbs +40 -0
  188. package/dist/core/llm/prompts/templates/v5/glossary/user.hbs +11 -0
  189. package/dist/core/llm/prompts/templates/v5/system.hbs +163 -0
  190. package/dist/core/llm/prompts/templates/v5/user.hbs +55 -0
  191. package/dist/core/pipeline/GroundingTransform.js +52 -0
  192. package/dist/core/pipeline/GroundingTransform.js.map +1 -0
  193. package/dist/core/pipeline/PipelineRunner.js +51 -0
  194. package/dist/core/pipeline/PipelineRunner.js.map +1 -0
  195. package/dist/core/pipeline/RelationFilterTransform.js +72 -0
  196. package/dist/core/pipeline/RelationFilterTransform.js.map +1 -0
  197. package/dist/core/pipeline/index.js +20 -0
  198. package/dist/core/pipeline/index.js.map +1 -0
  199. package/dist/core/processor/FileProcessor.js +184 -0
  200. package/dist/core/processor/FileProcessor.js.map +1 -0
  201. package/dist/core/processor/ProcessedRegistry.js +38 -0
  202. package/dist/core/processor/ProcessedRegistry.js.map +1 -0
  203. package/dist/core/processor/ast/AstSeedService.js +0 -0
  204. package/dist/core/processor/ast/AstSeedService.js.map +1 -0
  205. package/dist/core/processor/ast/AstSymbolStore.js +110 -0
  206. package/dist/core/processor/ast/AstSymbolStore.js.map +1 -0
  207. package/dist/core/processor/ast/index.js +19 -0
  208. package/dist/core/processor/ast/index.js.map +1 -0
  209. package/dist/core/processor/chunking/TextChunker.js +98 -0
  210. package/dist/core/processor/chunking/TextChunker.js.map +1 -0
  211. package/dist/core/processor/chunking/index.js +18 -0
  212. package/dist/core/processor/chunking/index.js.map +1 -0
  213. package/dist/core/processor/classifier/CONTENT_CLASSES.js +294 -0
  214. package/dist/core/processor/classifier/CONTENT_CLASSES.js.map +1 -0
  215. package/dist/core/processor/classifier/CascadeContentClassifier.js +107 -0
  216. package/dist/core/processor/classifier/CascadeContentClassifier.js.map +1 -0
  217. package/dist/core/processor/classifier/HeuristicContentClassifier.js +113 -0
  218. package/dist/core/processor/classifier/HeuristicContentClassifier.js.map +1 -0
  219. package/dist/core/processor/classifier/IContentTypeClassifier.js +3 -0
  220. package/dist/core/processor/classifier/IContentTypeClassifier.js.map +1 -0
  221. package/dist/core/processor/classifier/LlmContentClassifier.js +107 -0
  222. package/dist/core/processor/classifier/LlmContentClassifier.js.map +1 -0
  223. package/dist/core/processor/classifier/NER_DOMAIN_EXAMPLES.js +498 -0
  224. package/dist/core/processor/classifier/NER_DOMAIN_EXAMPLES.js.map +1 -0
  225. package/dist/core/processor/classifier/index.js +21 -0
  226. package/dist/core/processor/classifier/index.js.map +1 -0
  227. package/dist/core/processor/classifier/mergeClassifications.js +32 -0
  228. package/dist/core/processor/classifier/mergeClassifications.js.map +1 -0
  229. package/dist/core/processor/index.js +20 -0
  230. package/dist/core/processor/index.js.map +1 -0
  231. package/dist/core/processor/readers/AudioReader.js +462 -0
  232. package/dist/core/processor/readers/AudioReader.js.map +1 -0
  233. package/dist/core/processor/readers/BinaryReader.js +90 -0
  234. package/dist/core/processor/readers/BinaryReader.js.map +1 -0
  235. package/dist/core/processor/readers/ChandraPdfReader.js +187 -0
  236. package/dist/core/processor/readers/ChandraPdfReader.js.map +1 -0
  237. package/dist/core/processor/readers/ChatExportReader.js +365 -0
  238. package/dist/core/processor/readers/ChatExportReader.js.map +1 -0
  239. package/dist/core/processor/readers/DoclingReader.js +445 -0
  240. package/dist/core/processor/readers/DoclingReader.js.map +1 -0
  241. package/dist/core/processor/readers/EmailReader.js +259 -0
  242. package/dist/core/processor/readers/EmailReader.js.map +1 -0
  243. package/dist/core/processor/readers/EpubReader.js +175 -0
  244. package/dist/core/processor/readers/EpubReader.js.map +1 -0
  245. package/dist/core/processor/readers/FileReader.js +90 -0
  246. package/dist/core/processor/readers/FileReader.js.map +1 -0
  247. package/dist/core/processor/readers/FileReaderFactory.js +49 -0
  248. package/dist/core/processor/readers/FileReaderFactory.js.map +1 -0
  249. package/dist/core/processor/readers/HtmlReader.js +371 -0
  250. package/dist/core/processor/readers/HtmlReader.js.map +1 -0
  251. package/dist/core/processor/readers/ImageReader.js +162 -0
  252. package/dist/core/processor/readers/ImageReader.js.map +1 -0
  253. package/dist/core/processor/readers/JsonFileReader.js +232 -0
  254. package/dist/core/processor/readers/JsonFileReader.js.map +1 -0
  255. package/dist/core/processor/readers/JupyterReader.js +178 -0
  256. package/dist/core/processor/readers/JupyterReader.js.map +1 -0
  257. package/dist/core/processor/readers/LatexReader.js +176 -0
  258. package/dist/core/processor/readers/LatexReader.js.map +1 -0
  259. package/dist/core/processor/readers/MarkdownReader.js +289 -0
  260. package/dist/core/processor/readers/MarkdownReader.js.map +1 -0
  261. package/dist/core/processor/readers/MarkerPdfReader.js +193 -0
  262. package/dist/core/processor/readers/MarkerPdfReader.js.map +1 -0
  263. package/dist/core/processor/readers/MistralOcrReader.js +198 -0
  264. package/dist/core/processor/readers/MistralOcrReader.js.map +1 -0
  265. package/dist/core/processor/readers/OfficeReader.js +174 -0
  266. package/dist/core/processor/readers/OfficeReader.js.map +1 -0
  267. package/dist/core/processor/readers/PdfReader.js +116 -0
  268. package/dist/core/processor/readers/PdfReader.js.map +1 -0
  269. package/dist/core/processor/readers/RtfReader.js +107 -0
  270. package/dist/core/processor/readers/RtfReader.js.map +1 -0
  271. package/dist/core/processor/readers/SubtitleReader.js +145 -0
  272. package/dist/core/processor/readers/SubtitleReader.js.map +1 -0
  273. package/dist/core/processor/readers/TesseractPdfReader.js +183 -0
  274. package/dist/core/processor/readers/TesseractPdfReader.js.map +1 -0
  275. package/dist/core/processor/readers/TextReader.js +129 -0
  276. package/dist/core/processor/readers/TextReader.js.map +1 -0
  277. package/dist/core/processor/readers/TranscriptReader.js +234 -0
  278. package/dist/core/processor/readers/TranscriptReader.js.map +1 -0
  279. package/dist/core/processor/readers/image/imageMetadata.js +155 -0
  280. package/dist/core/processor/readers/image/imageMetadata.js.map +1 -0
  281. package/dist/core/processor/readers/index.js +41 -0
  282. package/dist/core/processor/readers/index.js.map +1 -0
  283. package/dist/core/processor/readers/referenceExtraction.js +198 -0
  284. package/dist/core/processor/readers/referenceExtraction.js.map +1 -0
  285. package/dist/core/processor/readers/stripReferences.js +59 -0
  286. package/dist/core/processor/readers/stripReferences.js.map +1 -0
  287. package/dist/core/processor/readers/transcript/turnPacking.js +81 -0
  288. package/dist/core/processor/readers/transcript/turnPacking.js.map +1 -0
  289. package/dist/core/progress/NdjsonProgressEmitter.js +30 -0
  290. package/dist/core/progress/NdjsonProgressEmitter.js.map +1 -0
  291. package/dist/core/progress/NoopProgressEmitter.js +15 -0
  292. package/dist/core/progress/NoopProgressEmitter.js.map +1 -0
  293. package/dist/core/progress/index.js +19 -0
  294. package/dist/core/progress/index.js.map +1 -0
  295. package/dist/core/trace/TraceWriter.js +100 -0
  296. package/dist/core/trace/TraceWriter.js.map +1 -0
  297. package/dist/core/trace/events.js +13 -0
  298. package/dist/core/trace/events.js.map +1 -0
  299. package/dist/core/trace/index.js +20 -0
  300. package/dist/core/trace/index.js.map +1 -0
  301. package/dist/core/trace/lineage.js +97 -0
  302. package/dist/core/trace/lineage.js.map +1 -0
  303. package/dist/evaluation/BenchmarkRunner.js +171 -0
  304. package/dist/evaluation/BenchmarkRunner.js.map +1 -0
  305. package/dist/evaluation/classifier/ClassifierAccuracy.js +185 -0
  306. package/dist/evaluation/classifier/ClassifierAccuracy.js.map +1 -0
  307. package/dist/evaluation/classifier/labeledSamples.js +379 -0
  308. package/dist/evaluation/classifier/labeledSamples.js.map +1 -0
  309. package/dist/evaluation/compare/goldCompare.js +126 -0
  310. package/dist/evaluation/compare/goldCompare.js.map +1 -0
  311. package/dist/evaluation/crossre/compareScoring.js +30 -0
  312. package/dist/evaluation/crossre/compareScoring.js.map +1 -0
  313. package/dist/evaluation/datasets/CrossREDataset.js +170 -0
  314. package/dist/evaluation/datasets/CrossREDataset.js.map +1 -0
  315. package/dist/evaluation/datasets/IDataset.js +3 -0
  316. package/dist/evaluation/datasets/IDataset.js.map +1 -0
  317. package/dist/evaluation/datasets/RebelDataset.js +117 -0
  318. package/dist/evaluation/datasets/RebelDataset.js.map +1 -0
  319. package/dist/evaluation/datasets/RedocredDataset.js +218 -0
  320. package/dist/evaluation/datasets/RedocredDataset.js.map +1 -0
  321. package/dist/evaluation/datasets/SemEval2010Dataset.js +150 -0
  322. package/dist/evaluation/datasets/SemEval2010Dataset.js.map +1 -0
  323. package/dist/evaluation/index.js +33 -0
  324. package/dist/evaluation/index.js.map +1 -0
  325. package/dist/evaluation/matching/ExactMatcher.js +75 -0
  326. package/dist/evaluation/matching/ExactMatcher.js.map +1 -0
  327. package/dist/evaluation/matching/SemanticMatcher.js +143 -0
  328. package/dist/evaluation/matching/SemanticMatcher.js.map +1 -0
  329. package/dist/evaluation/metrics/TripleMetrics.js +64 -0
  330. package/dist/evaluation/metrics/TripleMetrics.js.map +1 -0
  331. package/dist/evaluation/mine/MineCheckpoint.js +114 -0
  332. package/dist/evaluation/mine/MineCheckpoint.js.map +1 -0
  333. package/dist/evaluation/mine/MineDataset.js +208 -0
  334. package/dist/evaluation/mine/MineDataset.js.map +1 -0
  335. package/dist/evaluation/mine/MineReporter.js +98 -0
  336. package/dist/evaluation/mine/MineReporter.js.map +1 -0
  337. package/dist/evaluation/mine/MineRunner.js +148 -0
  338. package/dist/evaluation/mine/MineRunner.js.map +1 -0
  339. package/dist/evaluation/mine/MineScorer.js +127 -0
  340. package/dist/evaluation/mine/MineScorer.js.map +1 -0
  341. package/dist/evaluation/mine/types.js +12 -0
  342. package/dist/evaluation/mine/types.js.map +1 -0
  343. package/dist/evaluation/reporters/ConsoleReporter.js +55 -0
  344. package/dist/evaluation/reporters/ConsoleReporter.js.map +1 -0
  345. package/dist/evaluation/reporters/JsonReporter.js +50 -0
  346. package/dist/evaluation/reporters/JsonReporter.js.map +1 -0
  347. package/dist/index.js +28 -0
  348. package/dist/index.js.map +1 -0
  349. package/dist/quality/CompositeScore.js +61 -0
  350. package/dist/quality/CompositeScore.js.map +1 -0
  351. package/dist/quality/ConsistencyMetrics.js +70 -0
  352. package/dist/quality/ConsistencyMetrics.js.map +1 -0
  353. package/dist/quality/FactualMetrics.js +76 -0
  354. package/dist/quality/FactualMetrics.js.map +1 -0
  355. package/dist/quality/GraphHealthMetrics.js +68 -0
  356. package/dist/quality/GraphHealthMetrics.js.map +1 -0
  357. package/dist/quality/SemanticMetrics.js +102 -0
  358. package/dist/quality/SemanticMetrics.js.map +1 -0
  359. package/dist/quality/StructuralMetrics.js +60 -0
  360. package/dist/quality/StructuralMetrics.js.map +1 -0
  361. package/dist/quality/index.js +23 -0
  362. package/dist/quality/index.js.map +1 -0
  363. package/dist/shared/index.js +20 -0
  364. package/dist/shared/index.js.map +1 -0
  365. package/dist/shared/logger/Logger.js +3 -0
  366. package/dist/shared/logger/Logger.js.map +1 -0
  367. package/dist/shared/logger/LoggerFactory.js +75 -0
  368. package/dist/shared/logger/LoggerFactory.js.map +1 -0
  369. package/dist/shared/logger/index.js +19 -0
  370. package/dist/shared/logger/index.js.map +1 -0
  371. package/dist/shared/shutdown.js +30 -0
  372. package/dist/shared/shutdown.js.map +1 -0
  373. package/dist/shared/utils/agglomerativeCluster.js +269 -0
  374. package/dist/shared/utils/agglomerativeCluster.js.map +1 -0
  375. package/dist/shared/utils/astSymbols.js +69 -0
  376. package/dist/shared/utils/astSymbols.js.map +1 -0
  377. package/dist/shared/utils/cosineSimilarity.js +18 -0
  378. package/dist/shared/utils/cosineSimilarity.js.map +1 -0
  379. package/dist/shared/utils/directoryTree.js +184 -0
  380. package/dist/shared/utils/directoryTree.js.map +1 -0
  381. package/dist/shared/utils/documentOutline.js +74 -0
  382. package/dist/shared/utils/documentOutline.js.map +1 -0
  383. package/dist/shared/utils/index.js +24 -0
  384. package/dist/shared/utils/index.js.map +1 -0
  385. package/dist/shared/utils/jaroWinklerSimilarity.js +60 -0
  386. package/dist/shared/utils/jaroWinklerSimilarity.js.map +1 -0
  387. package/dist/shared/utils/parseJsonLenient.js +27 -0
  388. package/dist/shared/utils/parseJsonLenient.js.map +1 -0
  389. package/dist/shared/utils/readConfig.js +42 -0
  390. package/dist/shared/utils/readConfig.js.map +1 -0
  391. package/dist/shared/utils/readRtf.js +216 -0
  392. package/dist/shared/utils/readRtf.js.map +1 -0
  393. package/dist/shared/utils/softmax.js +26 -0
  394. package/dist/shared/utils/softmax.js.map +1 -0
  395. package/dist/types/ContentClass.js +3 -0
  396. package/dist/types/ContentClass.js.map +1 -0
  397. package/dist/types/CorpusProfile.js +3 -0
  398. package/dist/types/CorpusProfile.js.map +1 -0
  399. package/dist/types/IContradictionChecker.js +3 -0
  400. package/dist/types/IContradictionChecker.js.map +1 -0
  401. package/dist/types/ICorpusAnalyzer.js +3 -0
  402. package/dist/types/ICorpusAnalyzer.js.map +1 -0
  403. package/dist/types/IDirectoryProcessor.js +3 -0
  404. package/dist/types/IDirectoryProcessor.js.map +1 -0
  405. package/dist/types/IEmbeddingProvider.js +3 -0
  406. package/dist/types/IEmbeddingProvider.js.map +1 -0
  407. package/dist/types/IEmbeddingService.js +6 -0
  408. package/dist/types/IEmbeddingService.js.map +1 -0
  409. package/dist/types/IFileProcessor.js +3 -0
  410. package/dist/types/IFileProcessor.js.map +1 -0
  411. package/dist/types/IGroundingChecker.js +3 -0
  412. package/dist/types/IGroundingChecker.js.map +1 -0
  413. package/dist/types/IKnowledgeGraphBuilder.js +3 -0
  414. package/dist/types/IKnowledgeGraphBuilder.js.map +1 -0
  415. package/dist/types/IKnowledgeGraphExporter.js +3 -0
  416. package/dist/types/IKnowledgeGraphExporter.js.map +1 -0
  417. package/dist/types/IKnowledgeGraphMerger.js +3 -0
  418. package/dist/types/IKnowledgeGraphMerger.js.map +1 -0
  419. package/dist/types/IKnowledgeGraphSearch.js +3 -0
  420. package/dist/types/IKnowledgeGraphSearch.js.map +1 -0
  421. package/dist/types/ILLMProvider.js +3 -0
  422. package/dist/types/ILLMProvider.js.map +1 -0
  423. package/dist/types/ILLMService.js +3 -0
  424. package/dist/types/ILLMService.js.map +1 -0
  425. package/dist/types/IObjectDetector.js +3 -0
  426. package/dist/types/IObjectDetector.js.map +1 -0
  427. package/dist/types/IProcessingService.js +3 -0
  428. package/dist/types/IProcessingService.js.map +1 -0
  429. package/dist/types/IProgressEmitter.js +3 -0
  430. package/dist/types/IProgressEmitter.js.map +1 -0
  431. package/dist/types/IPromptManager.js +3 -0
  432. package/dist/types/IPromptManager.js.map +1 -0
  433. package/dist/types/KnowledgeGraph.js +3 -0
  434. package/dist/types/KnowledgeGraph.js.map +1 -0
  435. package/dist/types/MCPKnowledgeGraph.js +3 -0
  436. package/dist/types/MCPKnowledgeGraph.js.map +1 -0
  437. package/dist/types/Observation.js +21 -0
  438. package/dist/types/Observation.js.map +1 -0
  439. package/dist/types/ProcessingOptions.js +3 -0
  440. package/dist/types/ProcessingOptions.js.map +1 -0
  441. package/dist/types/index.js +40 -0
  442. package/dist/types/index.js.map +1 -0
  443. package/package.json +122 -0
@@ -0,0 +1,627 @@
1
+ "use strict";
2
/**
 * Re-export helper: exposes `m[k]` on `o` under the name `k2` (defaults to `k`).
 *
 * Reuses a helper already present on `this` when there is one. When
 * `Object.create` exists the binding is installed with `Object.defineProperty`;
 * the source property's own descriptor is reused unless it is missing, is an
 * accessor on a non-`__esModule` object, or is a writable/configurable data
 * property — in those cases an enumerable getter that reads `m[k]` on every
 * access is installed instead. Without `Object.create` the value is copied once.
 */
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // Legacy path: plain one-time copy.
        return function (o, m, k, k2) {
            if (k2 === undefined) k2 = k;
            o[k2] = m[k];
        };
    }
    return function (o, m, k, k2) {
        const exportedAs = k2 === undefined ? k : k2;
        let descriptor = Object.getOwnPropertyDescriptor(m, k);
        const needsLiveGetter = !descriptor ||
            ("get" in descriptor ? !m.__esModule : descriptor.writable || descriptor.configurable);
        if (needsLiveGetter) {
            descriptor = { enumerable: true, get: function () { return m[k]; } };
        }
        Object.defineProperty(o, exportedAs, descriptor);
    };
})();
13
/**
 * Interop helper: attaches `v` to `o` as its `default` export.
 *
 * Reuses a helper already present on `this` when there is one. When
 * `Object.create` exists the property is defined as enumerable (and, because
 * no other flags are given, non-writable and non-configurable); otherwise it
 * is a plain assignment.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (o, v) {
            Object.defineProperty(o, "default", { enumerable: true, value: v });
        };
    }
    return function (o, v) {
        o["default"] = v;
    };
})();
18
/**
 * Interop helper behind `import * as ns from "mod"`.
 *
 * Objects flagged `__esModule` are returned untouched. Anything else is wrapped
 * in a fresh namespace object: every own key except `"default"` is re-exported
 * through `__createBinding`, and the original value becomes the namespace's
 * `default` via `__setModuleDefault`. `null`/`undefined` produce a namespace
 * holding only `default`.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Picks the key lister on first use, then replaces itself with it.
    let listOwnKeys = function (target) {
        listOwnKeys = Object.getOwnPropertyNames || function (obj) {
            const names = [];
            for (const name in obj) {
                if (Object.prototype.hasOwnProperty.call(obj, name)) {
                    names.push(name);
                }
            }
            return names;
        };
        return listOwnKeys(target);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        const namespace = {};
        if (mod != null) {
            for (const name of listOwnKeys(mod)) {
                if (name !== "default") {
                    __createBinding(namespace, mod, name);
                }
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
35
/**
 * Async-function runtime for down-levelled code: drives a generator so that
 * every `yield` behaves like `await`.
 *
 * Reuses a helper already present on `this` when there is one.
 *
 * @param thisArg     `this` value for the generator function.
 * @param _arguments  Arguments for the generator function (optional).
 * @param P           Promise constructor to use; falls back to the global `Promise`.
 * @param generator   Generator function holding the async body.
 * @returns A promise settled with the generator's return value, or rejected
 *          with whatever it (or starting/stepping it) throws.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    if (!P) {
        P = Promise;
    }
    // Wrap non-promise values so `.then` is always available.
    const toPromise = function (value) {
        if (value instanceof P) {
            return value;
        }
        return new P(function (resolve) { resolve(value); });
    };
    return new P(function (resolve, reject) {
        // Resume the generator through `next` or `throw`; anything thrown while
        // stepping rejects the outer promise.
        const resumeWith = function (method) {
            return function (value) {
                try {
                    advance(generator[method](value));
                }
                catch (e) {
                    reject(e);
                }
            };
        };
        const onFulfilled = resumeWith("next");
        const onRejected = resumeWith("throw");
        const advance = function (result) {
            if (result.done) {
                resolve(result.value);
            }
            else {
                toPromise(result.value).then(onFulfilled, onRejected);
            }
        };
        generator = generator.apply(thisArg, _arguments || []);
        advance(generator.next());
    });
};
44
+ Object.defineProperty(exports, "__esModule", { value: true });
45
+ exports.KnowledgeGraphBuilder = void 0;
46
+ exports.buildGraphSchema = buildGraphSchema;
47
+ const path = __importStar(require("path"));
48
+ const crypto = __importStar(require("crypto"));
49
+ const zod_1 = require("zod");
50
+ const types_1 = require("../../types");
51
+ const progress_1 = require("../progress");
52
+ const vocabulary_1 = require("./vocabulary");
53
+ const grounding_1 = require("./grounding");
54
+ const shared_1 = require("../../shared");
55
+ const trace_1 = require("../trace");
56
/**
 * Build the Zod schema used for structured graph extraction.
 *
 * Vocabulary handling: a non-empty `allowedTypes` / `allowedRelationTypes`
 * turns the corresponding field into a closed Zod enum; a missing or empty
 * list leaves that field a free string (legacy behavior).
 *
 * Lenient coercion (recall guard): each enum is wrapped in `.catch(escape)`,
 * so an out-of-vocabulary value is coerced per field onto the escape value
 * instead of failing validation for the whole chunk. The escape is `other`
 * (entities) or `related_to` (relations) when the list contains it, otherwise
 * the first entry of the list.
 *
 * `relationType` is always an array; a scalar emitted by the model is wrapped
 * into a one-element array (and `null`/`undefined` becomes `[]`) before
 * validation.
 *
 * @param allowedTypes          Allowed entity types (optional).
 * @param allowedRelationTypes  Allowed relation predicates (optional).
 * @returns Zod object schema `{ entities: [...], relations: [...] }`.
 */
function buildGraphSchema(allowedTypes, allowedRelationTypes) {
    const z = zod_1.z;
    const isClosed = (vocabulary) => Boolean(vocabulary) && vocabulary.length > 0;
    // Catch-all value an invalid enum member is coerced onto.
    const escapeFor = (vocabulary, preferred) => (vocabulary.includes(preferred) ? preferred : vocabulary[0]);
    let entityType;
    if (isClosed(allowedTypes)) {
        entityType = z
            .enum(allowedTypes)
            .catch(escapeFor(allowedTypes, "other"))
            .describe("Entity type — pick the closest; use 'other' if none fit");
    }
    else {
        entityType = z.string().describe("Entity description");
    }
    // Prompts asking for "one canonical predicate" make some models emit a
    // scalar string; accept it by wrapping it, leaving real arrays untouched.
    const asRelationArray = (value) => {
        if (Array.isArray(value)) {
            return value;
        }
        return value == null ? [] : [value];
    };
    let relationType;
    if (isClosed(allowedRelationTypes)) {
        const predicate = z.enum(allowedRelationTypes).catch(escapeFor(allowedRelationTypes, "related_to"));
        relationType = z
            .preprocess(asRelationArray, z.array(predicate))
            .describe("One canonical predicate; use 'related_to' if none fit");
    }
    else {
        relationType = z.preprocess(asRelationArray, z.array(z.string())).describe("List of relation types");
    }
    const entitySchema = z.object({
        name: z.string().describe("Unique entity name"),
        entityType,
        // Defaulting to [] keeps an entity that arrives without any
        // observations field from rejecting the whole chunk.
        observations: z
            .array(z.string())
            .default([])
            .describe("List of facts and observations about entity"),
    });
    const relationSchema = z.object({
        from: z.string().describe("Relation source entity"),
        to: z.string().describe("Relation target entity"),
        relationType,
    });
    return z.object({
        entities: z.array(entitySchema),
        relations: z.array(relationSchema),
    });
}
116
// Schema built with no allowed vocabularies, i.e. free-string entityType and
// relationType. Not referenced again in this file.
// NOTE(review): presumably retained from the TypeScript source, where it may
// back a derived type — confirm before removing.
+ const DEFAULT_GRAPH_SCHEMA = buildGraphSchema();
117
/**
 * Builds knowledge graphs from processed files using an LLM.
 *
 * Each chunk of a processed file is run through structured extraction (or
 * restored from the checkpoint when resuming), stamped with provenance,
 * optionally passed through the grounding gate, and returned as one graph per
 * chunk. Failed chunks and grounding rejections are collected on the instance
 * for the caller to report.
 */
class KnowledgeGraphBuilder {
    /**
     * @param options  Collaborators and settings. Read here: `llmService`,
     *   `promptManager`, `checkpoint`, `resume` (default `false`), `model`,
     *   `promptVersion` (default `'default'`), `inputRoot` (default `''`),
     *   `progress` (default: a no-op emitter), `grounding` (default
     *   `'disabled'`), `groundingMinScore` (default `0.5`), `groundingChecker`
     *   (default: keyword checker built with `groundingMinScore`),
     *   `groundingSignature` (default `''`), `attachSourceSpans`,
     *   `openPredicate`, `strictVocabulary` (all default `false`).
     * @param logger   Logger; `info`, `warn`, `error` and `debug` are called.
     */
    constructor(options, logger) {
        // Only null/undefined fall back, so explicit false/0/'' are respected.
        const orDefault = (value, fallback) => (value != null ? value : fallback);
        /** Chunks whose extraction threw this run — left uncheckpointed (KG-02). */
        this.failedChunks = [];
        /** Claims the grounding gate rejected this run (WI3 manifest trace). */
        this.groundingRejections = [];
        this.llmService = options.llmService;
        this.promptManager = options.promptManager;
        this.checkpoint = options.checkpoint;
        this.resume = orDefault(options.resume, false);
        this.model = options.model;
        this.promptVersion = orDefault(options.promptVersion, 'default');
        this.inputRoot = orDefault(options.inputRoot, '');
        this.logger = logger;
        // Default collaborators are constructed only when none was supplied.
        this.progress = options.progress != null ? options.progress : new progress_1.NoopProgressEmitter();
        this.grounding = orDefault(options.grounding, 'disabled');
        this.groundingMinScore = orDefault(options.groundingMinScore, 0.5);
        this.groundingChecker = options.groundingChecker != null
            ? options.groundingChecker
            : new grounding_1.KeywordGroundingChecker(this.groundingMinScore);
        this.groundingSignature = orDefault(options.groundingSignature, '');
        this.attachSourceSpans = orDefault(options.attachSourceSpans, false);
        this.openPredicate = orDefault(options.openPredicate, false);
        this.strictVocabulary = orDefault(options.strictVocabulary, false);
    }
    /** Chunks whose extraction failed this run (empty when all succeeded). */
    getFailedChunks() {
        return this.failedChunks;
    }
    /** Claims the inline grounding gate rejected this run (empty when none/off). */
    getGroundingRejections() {
        return this.groundingRejections;
    }
    /**
     * Stable identity for a file in the checkpoint key: the path relative to the
     * discovery root (`inputRoot`), normalized to posix separators, so resume
     * survives relocating the whole input tree or changing the `input` prefix.
     *
     * Falls back to the raw path when there is no root, when the file IS the
     * root, or when it resolves outside the root. "Outside" means the relative
     * path has a leading `..` *segment* (or is absolute); a file or directory
     * whose name merely begins with two dots (e.g. `..hidden/a.md`) is inside
     * the root and is keyed relatively.
     *
     * @param filePath  Path of the file being processed.
     * @returns Root-relative posix path, or `filePath` unchanged.
     */
    stablePathId(filePath) {
        if (!this.inputRoot) {
            return filePath;
        }
        const rel = path.relative(this.inputRoot, filePath);
        const escapesRoot = rel === '..' || rel.startsWith(`..${path.sep}`);
        if (!rel || escapesRoot || path.isAbsolute(rel)) {
            return filePath;
        }
        return rel.split(path.sep).join('/');
    }
    /**
     * Hash of the *deterministic* extraction inputs other than the chunk's own
     * text (KG-07), folded into the checkpoint key so that changing any of them
     * between `--resume` runs re-extracts instead of reusing a graph built under
     * different settings: the grounding signature, the rendered system prompt,
     * the corpus glossary and the classifier classes.
     *
     * The chunk's retrieved context is deliberately NOT hashed: it is derived
     * from earlier, non-deterministic extractions, so including it would make
     * the key differ on every run and defeat resume.
     *
     * @returns SHA-1 hex digest; each part is followed by a NUL separator.
     */
    extractionExtra(systemPrompt, glossary, contentClasses) {
        const hash = crypto.createHash('sha1');
        const parts = [
            this.groundingSignature,
            systemPrompt,
            glossary ? JSON.stringify(glossary) : '',
            contentClasses ? JSON.stringify(contentClasses) : '',
        ];
        for (const part of parts) {
            hash.update(part);
            hash.update('\x00');
        }
        return hash.digest('hex');
    }
    /**
     * Build knowledge graphs for a processed file: one graph per chunk, plus a
     * pinned document-identity graph when the reader supplied a title/arXiv id.
     *
     * @param processedFile  File with `path`, `chunks`, optional `content` and `metadata`.
     * @param systemPrompt   Rendered system prompt.
     * @param retrieve       Optional async `(chunkText) => context` retriever, called per chunk.
     * @param glossary       Optional corpus glossary.
     * @returns Promise of the list of graphs.
     */
    build(processedFile, systemPrompt, retrieve, glossary) {
        return __awaiter(this, void 0, void 0, function* () {
            this.logger.info(`Building knowledge graph for: ${processedFile.path}`);
            const graphs = [];
            const contentClasses = processedFile.metadata != null ? processedFile.metadata.classes : undefined;
            if (processedFile.chunks.length > 1) {
                for (const chunk of processedFile.chunks) {
                    // Cooperative interrupt: finish the in-flight chunk, then stop
                    // before starting the next one so a partial graph can be flushed.
                    if (shared_1.shutdown.isRequested()) {
                        this.logger.warn(`Interrupted — stopping at chunk ${chunk.index}/${chunk.totalChunks} of ${processedFile.path}`);
                        break;
                    }
                    // Retrieval is per chunk, keyed on this chunk's own text.
                    const retrievedContext = retrieve ? yield retrieve(chunk.content) : undefined;
                    const generate = () => this.buildFromChunk(processedFile.path, chunk.content, 
                    // Full file text, handed to the prompt as `fileContent`.
                    processedFile.content != null ? processedFile.content : '', systemPrompt, chunk.index, chunk.totalChunks, retrievedContext, chunk.images, contentClasses, glossary);
                    const attachMetadata = (entity) => {
                        entity.files = [processedFile.path];
                        entity.chunk = chunk.index;
                        entity.totalChunks = chunk.totalChunks;
                    };
                    const kg = yield this.buildChunk(processedFile.path, chunk.index, chunk.totalChunks, chunk.content, this.chunkProvenance(processedFile, chunk), generate, attachMetadata, this.extractionExtra(systemPrompt, glossary, contentClasses));
                    graphs.push(kg);
                }
            }
            else if (processedFile.chunks.length === 1) {
                // Single chunk: the chunk text is the whole file.
                const chunk = processedFile.chunks[0];
                const content = chunk.content;
                const images = chunk.images;
                const retrievedContext = retrieve ? yield retrieve(content) : undefined;
                const generate = () => this.buildFromContent(processedFile.path, content, systemPrompt, retrievedContext, images, contentClasses, glossary);
                const attachMetadata = (entity) => {
                    entity.files = [processedFile.path];
                };
                const kg = yield this.buildChunk(processedFile.path, chunk.index != null ? chunk.index : 1, chunk.totalChunks != null ? chunk.totalChunks : 1, content, this.chunkProvenance(processedFile, chunk), generate, attachMetadata, this.extractionExtra(systemPrompt, glossary, contentClasses));
                graphs.push(kg);
            }
            // Pin ingest-time document identity (reader metadata) as its own entity.
            // It is never left to extraction: body text is full of OTHER papers'
            // IDs, and a cited paper's arXiv ID binding onto the host document is
            // the worst-case provenance failure.
            const identity = this.documentIdentityGraph(processedFile);
            if (identity) {
                graphs.push(identity);
            }
            return graphs;
        });
    }
    /**
     * Build the pinned `document` entity from reader-supplied identity metadata.
     *
     * The entity is named after the title; when the title is missing OR empty
     * the file's basename is used, so an arXiv-only document never produces an
     * entity with an empty name.
     *
     * @returns A one-entity graph, or `null` when neither title nor arXiv id is set.
     */
    documentIdentityGraph(processedFile) {
        const metadata = processedFile.metadata != null ? processedFile.metadata : {};
        const arxivId = metadata.arxivId;
        const title = metadata.title;
        if (!arxivId && !title) {
            return null;
        }
        const createdAt = new Date().toISOString();
        const observations = [];
        if (title) {
            observations.push({ text: `Title: ${title}`, source: processedFile.path, createdAt });
        }
        if (arxivId) {
            observations.push({ text: `arXiv:${arxivId}`, source: processedFile.path, createdAt });
        }
        // Same truthiness test as the title observation above.
        const name = title || path.basename(processedFile.path);
        this.logger.info(`Pinned document identity for ${processedFile.path}: ${name}`);
        return {
            entities: [
                {
                    name,
                    entityType: "document",
                    files: [processedFile.path],
                    observations,
                },
            ],
            relations: [],
        };
    }
    /**
     * Run one chunk through the LLM, or restore it from the checkpoint when
     * resuming. Stored graphs already carry their entity metadata, so a hit
     * skips the LLM call entirely.
     *
     * A thrown extraction is recorded in `failedChunks`, reported through
     * progress/trace, and answered with an empty graph that is NOT checkpointed,
     * so the next `--resume` retries the chunk.
     *
     * @param filePath         File the chunk belongs to.
     * @param chunkIndex       Index of this chunk.
     * @param totalChunks      Number of chunks in the file.
     * @param content          Chunk text (hashed into the key, used for grounding).
     * @param provenance       Provenance stamped on each observation.
     * @param generate         Async thunk performing the LLM extraction.
     * @param attachMetadata   Callback applied to every entity of a fresh graph.
     * @param extractionExtra  Extra key material from {@link extractionExtra}.
     * @returns Promise of the chunk's graph.
     */
    buildChunk(filePath, chunkIndex, totalChunks, content, provenance, generate, attachMetadata, extractionExtra) {
        return __awaiter(this, void 0, void 0, function* () {
            this.progress.emit({
                type: "chunk_start",
                path: filePath,
                chunk: chunkIndex,
                totalChunks,
            });
            const relPath = this.stablePathId(filePath);
            const chunkId = `${relPath}#${chunkIndex}`;
            const extractionId = `${chunkId}@0`;
            // A key exists only when resuming with a checkpoint; without one the
            // chunk is neither looked up nor stored.
            const key = this.resume && this.checkpoint
                ? this.checkpoint.computeKey(relPath, chunkIndex, content, this.model, this.promptVersion, extractionExtra)
                : undefined;
            if (key && this.checkpoint.has(key)) {
                this.logger.info(`Skipping cached chunk ${chunkIndex}/${totalChunks} of ${filePath} (checkpoint hit)`);
                const cached = this.normalizeGraph(this.checkpoint.get(key));
                this.progress.emit({
                    type: "chunk_complete",
                    path: filePath,
                    chunk: chunkIndex,
                    totalChunks,
                    entities: cached.entities.length,
                    relations: cached.relations.length,
                    cached: true,
                });
                // Register the cached chunk's mentions too so lineage works on resume.
                this.traceExtraction(cached, { extractionId, chunkId, filePath, chunkIndex, checkpointHit: true });
                return cached;
            }
            let raw;
            try {
                raw = yield generate();
            }
            catch (error) {
                // Record the failure and return an empty graph WITHOUT
                // checkpointing, so the chunk is retried on the next --resume
                // rather than cached as done-and-empty.
                const message = error instanceof Error ? error.message : String(error);
                this.logger.error(`Extraction failed for chunk ${chunkIndex}/${totalChunks} of ${filePath} ` +
                    `— left uncheckpointed so --resume retries it: ${message}`);
                this.failedChunks.push({ filePath, chunkIndex, totalChunks, error: message });
                this.progress.emit({
                    type: "chunk_failed",
                    path: filePath,
                    chunk: chunkIndex,
                    totalChunks,
                    error: message,
                });
                this.traceExtraction({ entities: [], relations: [] }, { extractionId, chunkId, filePath, chunkIndex, checkpointHit: false, failed: true, error: message });
                return { entities: [], relations: [] };
            }
            // `getLastUsage` is optional on the LLM service.
            const usage = this.llmService.getLastUsage != null ? this.llmService.getLastUsage() : undefined;
            const extracted = this.toGraph(raw, provenance, content);
            // Trace before grounding: mention IDs are derived from content, so
            // the grounding trace can reference them without anything being
            // stored on the graph objects.
            this.traceExtraction(extracted, { extractionId, chunkId, filePath, chunkIndex, checkpointHit: false, usage });
            const kg = yield this.applyGroundingGate(extracted, content, filePath, chunkIndex, extractionId);
            kg.entities.forEach(attachMetadata);
            this.progress.emit({
                type: "chunk_complete",
                path: filePath,
                chunk: chunkIndex,
                totalChunks,
                entities: kg.entities.length,
                relations: kg.relations.length,
                cached: false,
            });
            if (key) {
                yield this.checkpoint.append({
                    key,
                    filePath,
                    relPath,
                    chunkIndex,
                    totalChunks,
                    model: this.model,
                    promptVersion: this.promptVersion,
                    kg,
                });
            }
            return kg;
        });
    }
    /**
     * Resolve the entity-type list handed to the extraction schema.
     *
     * - open-predicate mode: `undefined` (the schema field stays a free string);
     * - strict mode with a non-empty glossary: exactly the glossary's entity
     *   types plus the escape type, de-duplicated;
     * - otherwise: whatever the shared `allowedEntityTypes` helper derives from
     *   the content classes and the glossary's entity types.
     */
    resolveAllowedTypes(contentClasses, glossary) {
        if (this.openPredicate) {
            return undefined;
        }
        const glossaryTypes = glossary != null ? glossary.entityTypes : undefined;
        // Strict: a supplied glossary REPLACES the base/domain sets.
        if (this.strictVocabulary && glossaryTypes != null && glossaryTypes.length) {
            return Array.from(new Set([...glossaryTypes, vocabulary_1.ENTITY_TYPE_ESCAPE]));
        }
        return (0, vocabulary_1.allowedEntityTypes)(contentClasses, glossaryTypes != null ? glossaryTypes : []);
    }
    /**
     * Resolve the relation-predicate list handed to the extraction schema.
     *
     * - open-predicate mode: `undefined` (the schema field stays a free string);
     * - strict mode with a non-empty glossary: exactly the glossary's relation
     *   types plus the escape predicate, de-duplicated;
     * - otherwise: whatever the shared `allowedRelationTypes` helper derives
     *   from the content classes and the glossary's relation types.
     */
    resolveAllowedRelationTypes(contentClasses, glossary) {
        if (this.openPredicate) {
            return undefined;
        }
        const glossaryRelations = glossary != null ? glossary.relationTypes : undefined;
        // Strict: a supplied glossary REPLACES the base/domain sets.
        if (this.strictVocabulary && glossaryRelations != null && glossaryRelations.length) {
            return Array.from(new Set([...glossaryRelations, vocabulary_1.RELATION_TYPE_ESCAPE]));
        }
        return (0, vocabulary_1.allowedRelationTypes)(contentClasses, glossaryRelations != null ? glossaryRelations : []);
    }
    /**
     * Provenance to stamp on a chunk's observations: the chunk's own provenance
     * fields when present, with `source` falling back to the file path.
     */
    chunkProvenance(processedFile, chunk) {
        const given = chunk.provenance != null ? chunk.provenance : {};
        return {
            speaker: given.speaker,
            source: given.source != null ? given.source : processedFile.path,
            occurredAt: given.occurredAt,
            sourceAdapter: given.sourceAdapter,
            locator: given.locator,
        };
    }
    /**
     * Convert the LLM's raw graph (bare-string observations) into the domain
     * graph, stamping each observation with the chunk's provenance and the
     * current time. Provenance is attached from what is already known rather
     * than asked of the model.
     *
     * @param raw         Validated LLM output `{ entities, relations }`.
     * @param provenance  Fields to copy onto observations (only truthy ones).
     * @param content     Chunk text; attached to relations as `sourceSpan` when
     *                    `attachSourceSpans` is on.
     */
    toGraph(raw, provenance, content) {
        const createdAt = new Date().toISOString();
        const stampObservation = (text) => {
            const observation = { text };
            if (provenance.speaker) {
                observation.speaker = provenance.speaker;
            }
            if (provenance.source) {
                observation.source = provenance.source;
            }
            if (provenance.occurredAt) {
                observation.validAt = provenance.occurredAt;
            }
            if (provenance.sourceAdapter) {
                observation.sourceAdapter = provenance.sourceAdapter;
            }
            if (provenance.locator) {
                observation.locator = provenance.locator;
            }
            observation.createdAt = createdAt;
            return observation;
        };
        const toRelation = (r) => {
            const relation = { from: r.from, to: r.to, relationType: r.relationType };
            if (this.attachSourceSpans) {
                relation.sourceSpan = content;
                if (provenance.occurredAt) {
                    relation.validAt = provenance.occurredAt;
                }
            }
            return relation;
        };
        return {
            entities: raw.entities.map((e) => ({
                name: e.name,
                entityType: e.entityType,
                // Filled in later by the caller's attachMetadata callback.
                files: [],
                observations: e.observations.map(stampObservation),
            })),
            relations: raw.relations.map(toRelation),
        };
    }
    /**
     * Inline grounding gate (Phase 5): check each observation fact AND each
     * relation triple against its source chunk via the injected checker, then
     * either flag (annotate, keep) or drop the unsupported ones.
     *
     * No-op when grounding is `'disabled'` or the source text is empty. In flag
     * mode every claim is annotated with `groundingScore` / `grounded`; in drop
     * mode unsupported claims are removed and kept claims are left
     * un-annotated. Every rejection is recorded in `groundingRejections`.
     * The graph is modified in place and returned.
     */
    applyGroundingGate(kg, source, filePath, chunkIndex, extractionId) {
        return __awaiter(this, void 0, void 0, function* () {
            if (this.grounding === 'disabled' || !source) {
                return kg;
            }
            const drop = this.grounding === 'drop';
            const tracing = () => trace_1.trace.enabled && extractionId;
            const decide = (verdict) => (verdict.supported ? 'accept' : drop ? 'drop' : 'flag');
            let droppedObs = 0;
            let droppedRel = 0;
            // Observations — the claim is the fact text.
            for (const e of kg.entities) {
                const kept = [];
                for (const o of e.observations) {
                    const v = yield this.groundingChecker.check(o.text, source);
                    if (tracing()) {
                        this.traceGrounding(extractionId, 'observation', e.name, o.text, v.score, decide(v), trace_1.LineageRegistry.observationId(extractionId, e.name, o.text));
                    }
                    if (!v.supported) {
                        this.recordRejection(filePath, chunkIndex, 'observation', e.name, o.text, v.score, drop);
                        if (drop) {
                            droppedObs++;
                            continue;
                        }
                    }
                    if (!drop) {
                        o.groundingScore = v.score;
                        o.grounded = Boolean(v.supported);
                    }
                    kept.push(o);
                }
                e.observations = kept;
            }
            // Relation triples — verbalize `{from} {predicate} {to}` and check it.
            const keptRel = [];
            for (const r of kg.relations) {
                const claim = (0, grounding_1.verbalizeRelation)(r.from, r.relationType, r.to);
                const v = yield this.groundingChecker.check(claim, source);
                const subject = `${r.from}→${r.to}`;
                if (tracing()) {
                    this.traceGrounding(extractionId, 'relation', subject, claim, v.score, decide(v), trace_1.LineageRegistry.relationMentionId(extractionId, r.from, r.to));
                }
                if (!v.supported) {
                    this.recordRejection(filePath, chunkIndex, 'relation', subject, claim, v.score, drop);
                    if (drop) {
                        droppedRel++;
                        continue;
                    }
                }
                if (!drop) {
                    r.groundingScore = v.score;
                    r.grounded = Boolean(v.supported);
                }
                keptRel.push(r);
            }
            kg.relations = keptRel;
            if (droppedObs > 0 || droppedRel > 0) {
                this.logger.debug(`Grounding gate dropped ${droppedObs} observation(s) and ${droppedRel} relation(s) ` +
                    `in ${filePath} [chunk ${chunkIndex}]`);
            }
            return kg;
        });
    }
    /** Record one grounding rejection for the run manifest (WI3). */
    recordRejection(filePath, chunkIndex, kind, subject, claim, score, dropped) {
        this.groundingRejections.push({ filePath, chunkIndex, kind, subject, claim, score, dropped });
    }
    /**
     * Debug trace: register each entity's mention ID (with its observation IDs)
     * in the run lineage and emit one `extraction` event. Does nothing when
     * tracing is disabled; nothing is written onto the graph.
     *
     * @param kg   Graph to describe.
     * @param ctx  `{ extractionId, chunkId, filePath, chunkIndex, checkpointHit,
     *             usage?, failed?, error? }`.
     */
    traceExtraction(kg, ctx) {
        if (!trace_1.trace.enabled) {
            return;
        }
        const entityMentions = kg.entities.map((e) => {
            const observationIds = e.observations.map((o) => trace_1.LineageRegistry.observationId(ctx.extractionId, e.name, (0, types_1.obsText)(o)));
            const mentionId = trace_1.LineageRegistry.entityMentionId(ctx.extractionId, e.name);
            trace_1.trace.lineage.registerEntity({
                mentionId,
                name: e.name,
                entityType: e.entityType,
                chunkId: ctx.chunkId,
                extractionId: ctx.extractionId,
                observationIds,
            });
            return { mentionId, name: e.name, entityType: e.entityType, observationIds };
        });
        const relationMentions = kg.relations.map((r) => ({
            mentionId: trace_1.LineageRegistry.relationMentionId(ctx.extractionId, r.from, r.to),
            from: r.from,
            to: r.to,
            relationType: r.relationType,
        }));
        const event = {
            stage: 'extract',
            type: 'extraction',
            extractionId: ctx.extractionId,
            chunkId: ctx.chunkId,
            file: ctx.filePath,
            chunkIndex: ctx.chunkIndex,
            model: this.model,
            promptVersion: this.promptVersion,
            attempt: 0,
            checkpointHit: ctx.checkpointHit,
            entityMentions,
            relationMentions,
        };
        // Optional fields appear only when set.
        if (ctx.usage) {
            event.usage = ctx.usage;
        }
        if (ctx.failed) {
            event.failed = true;
        }
        if (ctx.error) {
            event.error = ctx.error;
        }
        trace_1.trace.emit(event);
    }
    /**
     * Debug trace: emit one grounding decision (accept/flag/drop) for a claim.
     *
     * The event's `chunkId` is the extraction id with its trailing `@attempt`
     * suffix removed. Only the LAST `@` is treated as the separator, so paths
     * that themselves contain `@` (e.g. `pkg/@scope/readme.md#2@0`) keep their
     * full chunk id.
     */
    traceGrounding(extractionId, kind, subject, claim, score, decision, mentionId) {
        const attemptSeparator = extractionId.lastIndexOf('@');
        const chunkId = attemptSeparator === -1 ? extractionId : extractionId.slice(0, attemptSeparator);
        const checker = this.groundingChecker;
        const checkerName = checker != null && checker.constructor != null && checker.constructor.name != null
            ? checker.constructor.name
            : 'grounding';
        trace_1.trace.emit({
            stage: 'ground',
            type: 'grounding',
            extractionId,
            chunkId,
            mentionId,
            kind,
            subject,
            claim,
            score,
            checker: checkerName,
            decision,
        });
    }
    /** Normalize a (possibly legacy string-observation) graph from the checkpoint. */
    normalizeGraph(kg) {
        const entities = kg.entities.map((e) => Object.assign({}, e, {
            observations: (0, types_1.normalizeObservations)(e.observations),
        }));
        return Object.assign({}, kg, { entities });
    }
    /**
     * Extract a graph from one chunk of a multi-chunk file. The full file text
     * is passed to the prompt as `fileContent`, the chunk as `chunkContent`.
     */
    buildFromChunk(filePath, content, fullContent, systemPrompt, chunkIndex, totalChunks, retrievedContext, images, contentClasses, glossary) {
        return __awaiter(this, void 0, void 0, function* () {
            this.logger.debug(`Building KG for chunk ${chunkIndex}/${totalChunks} of ${filePath}`);
            const userPrompt = yield this.promptManager.getUserPrompt({
                input: '',
                filter: '',
                fileName: filePath,
                fileContent: fullContent,
                chunkContent: content,
                chunkIndex,
                totalChunks,
                retrievedContext,
                contentClasses,
                corpusGlossary: glossary
            });
            return this.generateKnowledgeGraph(systemPrompt, userPrompt, images, this.resolveAllowedTypes(contentClasses, glossary), this.resolveAllowedRelationTypes(contentClasses, glossary));
        });
    }
    /**
     * Extract a graph from a single-chunk file; the same text is passed to the
     * prompt as both `fileContent` and `chunkContent`.
     */
    buildFromContent(filePath, content, systemPrompt, retrievedContext, images, contentClasses, glossary) {
        return __awaiter(this, void 0, void 0, function* () {
            this.logger.debug(`Building KG for entire file: ${filePath}`);
            const userPrompt = yield this.promptManager.getUserPrompt({
                input: '',
                filter: '',
                fileName: filePath,
                fileContent: content,
                chunkContent: content,
                retrievedContext,
                contentClasses,
                corpusGlossary: glossary
            });
            return this.generateKnowledgeGraph(systemPrompt, userPrompt, images, this.resolveAllowedTypes(contentClasses, glossary), this.resolveAllowedRelationTypes(contentClasses, glossary));
        });
    }
    /**
     * Call the LLM for a structured graph validated against the schema built
     * from the given vocabularies.
     *
     * Failures are deliberately NOT caught here: `buildChunk` records the failed
     * chunk and skips its checkpoint so `--resume` retries it (KG-02).
     *
     * @param images  Optional image list; each entry's `base64` is sent (an
     *                entry without one is sent as an empty string).
     * @returns Promise of the raw graph, with `entities`/`relations` guaranteed
     *          to be present.
     */
    generateKnowledgeGraph(systemPrompt, userPrompt, images, allowedTypes, allowedRelationTypes) {
        return __awaiter(this, void 0, void 0, function* () {
            const encodedImages = images == null
                ? undefined
                : images.map((img) => (img.base64 != null ? img.base64 : ''));
            const messages = [
                {
                    role: 'system',
                    content: systemPrompt
                },
                {
                    role: 'user',
                    content: userPrompt,
                    images: encodedImages
                }
            ];
            const result = yield this.llmService.generateStructured(messages, buildGraphSchema(allowedTypes, allowedRelationTypes));
            // Ensure arrays exist
            if (result.entities == null) {
                result.entities = [];
            }
            if (result.relations == null) {
                result.relations = [];
            }
            this.logger.debug(`Generated KG with ${result.entities.length} entities and ${result.relations.length} relations`);
            return result;
        });
    }
}
626
+ exports.KnowledgeGraphBuilder = KnowledgeGraphBuilder;
627
+ //# sourceMappingURL=KnowledgeGraphBuilder.js.map