@wanshi-kg/wanshi 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (443)
  1. package/LICENSE +21 -0
  2. package/README.md +458 -0
  3. package/dist/__tests__/helpers.js +27 -0
  4. package/dist/__tests__/helpers.js.map +1 -0
  5. package/dist/cli/commands/export.command.js +99 -0
  6. package/dist/cli/commands/export.command.js.map +1 -0
  7. package/dist/cli/commands/index.js +22 -0
  8. package/dist/cli/commands/index.js.map +1 -0
  9. package/dist/cli/commands/inspectMerges.command.js +84 -0
  10. package/dist/cli/commands/inspectMerges.command.js.map +1 -0
  11. package/dist/cli/commands/metrics.command.js +196 -0
  12. package/dist/cli/commands/metrics.command.js.map +1 -0
  13. package/dist/cli/commands/process.command.js +82 -0
  14. package/dist/cli/commands/process.command.js.map +1 -0
  15. package/dist/cli/commands/watch.command.js +91 -0
  16. package/dist/cli/commands/watch.command.js.map +1 -0
  17. package/dist/cli/index.js +269 -0
  18. package/dist/cli/index.js.map +1 -0
  19. package/dist/cli/optionsToConfig.js +160 -0
  20. package/dist/cli/optionsToConfig.js.map +1 -0
  21. package/dist/config/index.js +59 -0
  22. package/dist/config/index.js.map +1 -0
  23. package/dist/config/legacyHints.js +113 -0
  24. package/dist/config/legacyHints.js.map +1 -0
  25. package/dist/config/schema.js +803 -0
  26. package/dist/config/schema.js.map +1 -0
  27. package/dist/config/ui.js +221 -0
  28. package/dist/config/ui.js.map +1 -0
  29. package/dist/core/DirectoryProcessor.js +725 -0
  30. package/dist/core/DirectoryProcessor.js.map +1 -0
  31. package/dist/core/adapters/IStructuredAdapter.js +3 -0
  32. package/dist/core/adapters/IStructuredAdapter.js.map +1 -0
  33. package/dist/core/adapters/SqliteAdapter.js +267 -0
  34. package/dist/core/adapters/SqliteAdapter.js.map +1 -0
  35. package/dist/core/adapters/StructuredAdapterRegistry.js +31 -0
  36. package/dist/core/adapters/StructuredAdapterRegistry.js.map +1 -0
  37. package/dist/core/adapters/index.js +20 -0
  38. package/dist/core/adapters/index.js.map +1 -0
  39. package/dist/core/checkpoint/CheckpointService.js +188 -0
  40. package/dist/core/checkpoint/CheckpointService.js.map +1 -0
  41. package/dist/core/checkpoint/index.js +18 -0
  42. package/dist/core/checkpoint/index.js.map +1 -0
  43. package/dist/core/corpus/CorpusAnalyzer.js +266 -0
  44. package/dist/core/corpus/CorpusAnalyzer.js.map +1 -0
  45. package/dist/core/corpus/CorpusProfileStore.js +92 -0
  46. package/dist/core/corpus/CorpusProfileStore.js.map +1 -0
  47. package/dist/core/corpus/index.js +21 -0
  48. package/dist/core/corpus/index.js.map +1 -0
  49. package/dist/core/corpus/normalizeGlossary.js +60 -0
  50. package/dist/core/corpus/normalizeGlossary.js.map +1 -0
  51. package/dist/core/corpus/relPath.js +52 -0
  52. package/dist/core/corpus/relPath.js.map +1 -0
  53. package/dist/core/corpus/termFrequency.js +86 -0
  54. package/dist/core/corpus/termFrequency.js.map +1 -0
  55. package/dist/core/cost/CostMeter.js +235 -0
  56. package/dist/core/cost/CostMeter.js.map +1 -0
  57. package/dist/core/cost/index.js +19 -0
  58. package/dist/core/cost/index.js.map +1 -0
  59. package/dist/core/cost/prices.js +38 -0
  60. package/dist/core/cost/prices.js.map +1 -0
  61. package/dist/core/cv/ObjectDetectionService.js +119 -0
  62. package/dist/core/cv/ObjectDetectionService.js.map +1 -0
  63. package/dist/core/di/ContainerFactory.js +670 -0
  64. package/dist/core/di/ContainerFactory.js.map +1 -0
  65. package/dist/core/di/DIContainer.js +103 -0
  66. package/dist/core/di/DIContainer.js.map +1 -0
  67. package/dist/core/di/index.js +19 -0
  68. package/dist/core/di/index.js.map +1 -0
  69. package/dist/core/errors/CustomErrors.js +342 -0
  70. package/dist/core/errors/CustomErrors.js.map +1 -0
  71. package/dist/core/errors/index.js +18 -0
  72. package/dist/core/errors/index.js.map +1 -0
  73. package/dist/core/export/KnowledgeGraphExportService.js +56 -0
  74. package/dist/core/export/KnowledgeGraphExportService.js.map +1 -0
  75. package/dist/core/export/index.js +19 -0
  76. package/dist/core/export/index.js.map +1 -0
  77. package/dist/core/export/strategies/GraphitiExportStrategy.js +115 -0
  78. package/dist/core/export/strategies/GraphitiExportStrategy.js.map +1 -0
  79. package/dist/core/export/strategies/GraphvizDotExportStrategy.js +331 -0
  80. package/dist/core/export/strategies/GraphvizDotExportStrategy.js.map +1 -0
  81. package/dist/core/export/strategies/IExportStrategy.js +3 -0
  82. package/dist/core/export/strategies/IExportStrategy.js.map +1 -0
  83. package/dist/core/export/strategies/JsonExportStrategy.js +19 -0
  84. package/dist/core/export/strategies/JsonExportStrategy.js.map +1 -0
  85. package/dist/core/export/strategies/JsonlExportStrategy.js +69 -0
  86. package/dist/core/export/strategies/JsonlExportStrategy.js.map +1 -0
  87. package/dist/core/export/strategies/KblamExportStrategy.js +36 -0
  88. package/dist/core/export/strategies/KblamExportStrategy.js.map +1 -0
  89. package/dist/core/export/strategies/LoraExportStrategy.js +46 -0
  90. package/dist/core/export/strategies/LoraExportStrategy.js.map +1 -0
  91. package/dist/core/export/strategies/McpExportStrategy.js +67 -0
  92. package/dist/core/export/strategies/McpExportStrategy.js.map +1 -0
  93. package/dist/core/export/strategies/index.js +25 -0
  94. package/dist/core/export/strategies/index.js.map +1 -0
  95. package/dist/core/export/strategies/kbTriples.js +60 -0
  96. package/dist/core/export/strategies/kbTriples.js.map +1 -0
  97. package/dist/core/index.js +22 -0
  98. package/dist/core/index.js.map +1 -0
  99. package/dist/core/knowledge/KnowledgeGraphBuilder.js +627 -0
  100. package/dist/core/knowledge/KnowledgeGraphBuilder.js.map +1 -0
  101. package/dist/core/knowledge/MergeRecord.js +3 -0
  102. package/dist/core/knowledge/MergeRecord.js.map +1 -0
  103. package/dist/core/knowledge/canon/Canonicalizer.js +414 -0
  104. package/dist/core/knowledge/canon/Canonicalizer.js.map +1 -0
  105. package/dist/core/knowledge/canon/index.js +18 -0
  106. package/dist/core/knowledge/canon/index.js.map +1 -0
  107. package/dist/core/knowledge/contradiction/HeuristicContradictionChecker.js +92 -0
  108. package/dist/core/knowledge/contradiction/HeuristicContradictionChecker.js.map +1 -0
  109. package/dist/core/knowledge/contradiction/LlmContradictionChecker.js +52 -0
  110. package/dist/core/knowledge/contradiction/LlmContradictionChecker.js.map +1 -0
  111. package/dist/core/knowledge/contradiction/index.js +19 -0
  112. package/dist/core/knowledge/contradiction/index.js.map +1 -0
  113. package/dist/core/knowledge/grounding/KeywordGroundingChecker.js +33 -0
  114. package/dist/core/knowledge/grounding/KeywordGroundingChecker.js.map +1 -0
  115. package/dist/core/knowledge/grounding/MiniCheckGroundingChecker.js +82 -0
  116. package/dist/core/knowledge/grounding/MiniCheckGroundingChecker.js.map +1 -0
  117. package/dist/core/knowledge/grounding/index.js +20 -0
  118. package/dist/core/knowledge/grounding/index.js.map +1 -0
  119. package/dist/core/knowledge/grounding/verbalize.js +38 -0
  120. package/dist/core/knowledge/grounding/verbalize.js.map +1 -0
  121. package/dist/core/knowledge/images/imageMetaGraph.js +136 -0
  122. package/dist/core/knowledge/images/imageMetaGraph.js.map +1 -0
  123. package/dist/core/knowledge/index.js +20 -0
  124. package/dist/core/knowledge/index.js.map +1 -0
  125. package/dist/core/knowledge/merging/KnowledgeMerger.js +624 -0
  126. package/dist/core/knowledge/merging/KnowledgeMerger.js.map +1 -0
  127. package/dist/core/knowledge/references/ReferenceResolver.js +184 -0
  128. package/dist/core/knowledge/references/ReferenceResolver.js.map +1 -0
  129. package/dist/core/knowledge/references/citations/CitationEvidenceProcessor.js +401 -0
  130. package/dist/core/knowledge/references/citations/CitationEvidenceProcessor.js.map +1 -0
  131. package/dist/core/knowledge/references/citations/CitationResolver.js +95 -0
  132. package/dist/core/knowledge/references/citations/CitationResolver.js.map +1 -0
  133. package/dist/core/knowledge/references/citations/GrobidClient.js +143 -0
  134. package/dist/core/knowledge/references/citations/GrobidClient.js.map +1 -0
  135. package/dist/core/knowledge/references/citations/TitleIdResolver.js +101 -0
  136. package/dist/core/knowledge/references/citations/TitleIdResolver.js.map +1 -0
  137. package/dist/core/knowledge/references/web/FetchCacheService.js +114 -0
  138. package/dist/core/knowledge/references/web/FetchCacheService.js.map +1 -0
  139. package/dist/core/knowledge/references/web/GatedFetcher.js +228 -0
  140. package/dist/core/knowledge/references/web/GatedFetcher.js.map +1 -0
  141. package/dist/core/knowledge/references/web/WebReferenceProcessor.js +164 -0
  142. package/dist/core/knowledge/references/web/WebReferenceProcessor.js.map +1 -0
  143. package/dist/core/knowledge/search/KnowledgeGraphSearch.js +261 -0
  144. package/dist/core/knowledge/search/KnowledgeGraphSearch.js.map +1 -0
  145. package/dist/core/knowledge/vocabulary.js +162 -0
  146. package/dist/core/knowledge/vocabulary.js.map +1 -0
  147. package/dist/core/llm/EmbeddingService.js +113 -0
  148. package/dist/core/llm/EmbeddingService.js.map +1 -0
  149. package/dist/core/llm/OllamaService.js +146 -0
  150. package/dist/core/llm/OllamaService.js.map +1 -0
  151. package/dist/core/llm/OpenAICompatibleService.js +190 -0
  152. package/dist/core/llm/OpenAICompatibleService.js.map +1 -0
  153. package/dist/core/llm/OpenAIEmbeddingService.js +129 -0
  154. package/dist/core/llm/OpenAIEmbeddingService.js.map +1 -0
  155. package/dist/core/llm/embeddingUtils.js +25 -0
  156. package/dist/core/llm/embeddingUtils.js.map +1 -0
  157. package/dist/core/llm/index.js +23 -0
  158. package/dist/core/llm/index.js.map +1 -0
  159. package/dist/core/llm/prompts/PromptManager.js +388 -0
  160. package/dist/core/llm/prompts/PromptManager.js.map +1 -0
  161. package/dist/core/llm/prompts/PromptTemplateEngine.js +257 -0
  162. package/dist/core/llm/prompts/PromptTemplateEngine.js.map +1 -0
  163. package/dist/core/llm/prompts/templates/partials/examples/EXAMPLE_STYLE_GUIDE.md +84 -0
  164. package/dist/core/llm/prompts/templates/partials/examples/article.md +187 -0
  165. package/dist/core/llm/prompts/templates/partials/examples/code.md +229 -0
  166. package/dist/core/llm/prompts/templates/partials/examples/communication.md +205 -0
  167. package/dist/core/llm/prompts/templates/partials/examples/documentation.md +262 -0
  168. package/dist/core/llm/prompts/templates/partials/examples/financial.md +157 -0
  169. package/dist/core/llm/prompts/templates/partials/examples/legal.md +153 -0
  170. package/dist/core/llm/prompts/templates/partials/examples/logs.md +127 -0
  171. package/dist/core/llm/prompts/templates/partials/examples/medical.md +218 -0
  172. package/dist/core/llm/prompts/templates/partials/examples/notes.md +201 -0
  173. package/dist/core/llm/prompts/templates/partials/examples/research.md +208 -0
  174. package/dist/core/llm/prompts/templates/partials/examples/tabular.md +178 -0
  175. package/dist/core/llm/prompts/templates/partials/examples/transcript.md +204 -0
  176. package/dist/core/llm/prompts/templates/partials/retrieved-context.hbs +18 -0
  177. package/dist/core/llm/prompts/templates/v1/system.hbs +371 -0
  178. package/dist/core/llm/prompts/templates/v1/user.hbs +20 -0
  179. package/dist/core/llm/prompts/templates/v2/system.hbs +573 -0
  180. package/dist/core/llm/prompts/templates/v2/user.hbs +20 -0
  181. package/dist/core/llm/prompts/templates/v3/system.hbs +861 -0
  182. package/dist/core/llm/prompts/templates/v3/user.hbs +16 -0
  183. package/dist/core/llm/prompts/templates/v4/system.hbs +800 -0
  184. package/dist/core/llm/prompts/templates/v4/user.hbs +40 -0
  185. package/dist/core/llm/prompts/templates/v4.5/system.hbs +71 -0
  186. package/dist/core/llm/prompts/templates/v4.5/user.hbs +46 -0
  187. package/dist/core/llm/prompts/templates/v5/glossary/system.hbs +40 -0
  188. package/dist/core/llm/prompts/templates/v5/glossary/user.hbs +11 -0
  189. package/dist/core/llm/prompts/templates/v5/system.hbs +163 -0
  190. package/dist/core/llm/prompts/templates/v5/user.hbs +55 -0
  191. package/dist/core/pipeline/GroundingTransform.js +52 -0
  192. package/dist/core/pipeline/GroundingTransform.js.map +1 -0
  193. package/dist/core/pipeline/PipelineRunner.js +51 -0
  194. package/dist/core/pipeline/PipelineRunner.js.map +1 -0
  195. package/dist/core/pipeline/RelationFilterTransform.js +72 -0
  196. package/dist/core/pipeline/RelationFilterTransform.js.map +1 -0
  197. package/dist/core/pipeline/index.js +20 -0
  198. package/dist/core/pipeline/index.js.map +1 -0
  199. package/dist/core/processor/FileProcessor.js +184 -0
  200. package/dist/core/processor/FileProcessor.js.map +1 -0
  201. package/dist/core/processor/ProcessedRegistry.js +38 -0
  202. package/dist/core/processor/ProcessedRegistry.js.map +1 -0
  203. package/dist/core/processor/ast/AstSeedService.js +0 -0
  204. package/dist/core/processor/ast/AstSeedService.js.map +1 -0
  205. package/dist/core/processor/ast/AstSymbolStore.js +110 -0
  206. package/dist/core/processor/ast/AstSymbolStore.js.map +1 -0
  207. package/dist/core/processor/ast/index.js +19 -0
  208. package/dist/core/processor/ast/index.js.map +1 -0
  209. package/dist/core/processor/chunking/TextChunker.js +98 -0
  210. package/dist/core/processor/chunking/TextChunker.js.map +1 -0
  211. package/dist/core/processor/chunking/index.js +18 -0
  212. package/dist/core/processor/chunking/index.js.map +1 -0
  213. package/dist/core/processor/classifier/CONTENT_CLASSES.js +294 -0
  214. package/dist/core/processor/classifier/CONTENT_CLASSES.js.map +1 -0
  215. package/dist/core/processor/classifier/CascadeContentClassifier.js +107 -0
  216. package/dist/core/processor/classifier/CascadeContentClassifier.js.map +1 -0
  217. package/dist/core/processor/classifier/HeuristicContentClassifier.js +113 -0
  218. package/dist/core/processor/classifier/HeuristicContentClassifier.js.map +1 -0
  219. package/dist/core/processor/classifier/IContentTypeClassifier.js +3 -0
  220. package/dist/core/processor/classifier/IContentTypeClassifier.js.map +1 -0
  221. package/dist/core/processor/classifier/LlmContentClassifier.js +107 -0
  222. package/dist/core/processor/classifier/LlmContentClassifier.js.map +1 -0
  223. package/dist/core/processor/classifier/NER_DOMAIN_EXAMPLES.js +498 -0
  224. package/dist/core/processor/classifier/NER_DOMAIN_EXAMPLES.js.map +1 -0
  225. package/dist/core/processor/classifier/index.js +21 -0
  226. package/dist/core/processor/classifier/index.js.map +1 -0
  227. package/dist/core/processor/classifier/mergeClassifications.js +32 -0
  228. package/dist/core/processor/classifier/mergeClassifications.js.map +1 -0
  229. package/dist/core/processor/index.js +20 -0
  230. package/dist/core/processor/index.js.map +1 -0
  231. package/dist/core/processor/readers/AudioReader.js +462 -0
  232. package/dist/core/processor/readers/AudioReader.js.map +1 -0
  233. package/dist/core/processor/readers/BinaryReader.js +90 -0
  234. package/dist/core/processor/readers/BinaryReader.js.map +1 -0
  235. package/dist/core/processor/readers/ChandraPdfReader.js +187 -0
  236. package/dist/core/processor/readers/ChandraPdfReader.js.map +1 -0
  237. package/dist/core/processor/readers/ChatExportReader.js +365 -0
  238. package/dist/core/processor/readers/ChatExportReader.js.map +1 -0
  239. package/dist/core/processor/readers/DoclingReader.js +445 -0
  240. package/dist/core/processor/readers/DoclingReader.js.map +1 -0
  241. package/dist/core/processor/readers/EmailReader.js +259 -0
  242. package/dist/core/processor/readers/EmailReader.js.map +1 -0
  243. package/dist/core/processor/readers/EpubReader.js +175 -0
  244. package/dist/core/processor/readers/EpubReader.js.map +1 -0
  245. package/dist/core/processor/readers/FileReader.js +90 -0
  246. package/dist/core/processor/readers/FileReader.js.map +1 -0
  247. package/dist/core/processor/readers/FileReaderFactory.js +49 -0
  248. package/dist/core/processor/readers/FileReaderFactory.js.map +1 -0
  249. package/dist/core/processor/readers/HtmlReader.js +371 -0
  250. package/dist/core/processor/readers/HtmlReader.js.map +1 -0
  251. package/dist/core/processor/readers/ImageReader.js +162 -0
  252. package/dist/core/processor/readers/ImageReader.js.map +1 -0
  253. package/dist/core/processor/readers/JsonFileReader.js +232 -0
  254. package/dist/core/processor/readers/JsonFileReader.js.map +1 -0
  255. package/dist/core/processor/readers/JupyterReader.js +178 -0
  256. package/dist/core/processor/readers/JupyterReader.js.map +1 -0
  257. package/dist/core/processor/readers/LatexReader.js +176 -0
  258. package/dist/core/processor/readers/LatexReader.js.map +1 -0
  259. package/dist/core/processor/readers/MarkdownReader.js +289 -0
  260. package/dist/core/processor/readers/MarkdownReader.js.map +1 -0
  261. package/dist/core/processor/readers/MarkerPdfReader.js +193 -0
  262. package/dist/core/processor/readers/MarkerPdfReader.js.map +1 -0
  263. package/dist/core/processor/readers/MistralOcrReader.js +198 -0
  264. package/dist/core/processor/readers/MistralOcrReader.js.map +1 -0
  265. package/dist/core/processor/readers/OfficeReader.js +174 -0
  266. package/dist/core/processor/readers/OfficeReader.js.map +1 -0
  267. package/dist/core/processor/readers/PdfReader.js +116 -0
  268. package/dist/core/processor/readers/PdfReader.js.map +1 -0
  269. package/dist/core/processor/readers/RtfReader.js +107 -0
  270. package/dist/core/processor/readers/RtfReader.js.map +1 -0
  271. package/dist/core/processor/readers/SubtitleReader.js +145 -0
  272. package/dist/core/processor/readers/SubtitleReader.js.map +1 -0
  273. package/dist/core/processor/readers/TesseractPdfReader.js +183 -0
  274. package/dist/core/processor/readers/TesseractPdfReader.js.map +1 -0
  275. package/dist/core/processor/readers/TextReader.js +129 -0
  276. package/dist/core/processor/readers/TextReader.js.map +1 -0
  277. package/dist/core/processor/readers/TranscriptReader.js +234 -0
  278. package/dist/core/processor/readers/TranscriptReader.js.map +1 -0
  279. package/dist/core/processor/readers/image/imageMetadata.js +155 -0
  280. package/dist/core/processor/readers/image/imageMetadata.js.map +1 -0
  281. package/dist/core/processor/readers/index.js +41 -0
  282. package/dist/core/processor/readers/index.js.map +1 -0
  283. package/dist/core/processor/readers/referenceExtraction.js +198 -0
  284. package/dist/core/processor/readers/referenceExtraction.js.map +1 -0
  285. package/dist/core/processor/readers/stripReferences.js +59 -0
  286. package/dist/core/processor/readers/stripReferences.js.map +1 -0
  287. package/dist/core/processor/readers/transcript/turnPacking.js +81 -0
  288. package/dist/core/processor/readers/transcript/turnPacking.js.map +1 -0
  289. package/dist/core/progress/NdjsonProgressEmitter.js +30 -0
  290. package/dist/core/progress/NdjsonProgressEmitter.js.map +1 -0
  291. package/dist/core/progress/NoopProgressEmitter.js +15 -0
  292. package/dist/core/progress/NoopProgressEmitter.js.map +1 -0
  293. package/dist/core/progress/index.js +19 -0
  294. package/dist/core/progress/index.js.map +1 -0
  295. package/dist/core/trace/TraceWriter.js +100 -0
  296. package/dist/core/trace/TraceWriter.js.map +1 -0
  297. package/dist/core/trace/events.js +13 -0
  298. package/dist/core/trace/events.js.map +1 -0
  299. package/dist/core/trace/index.js +20 -0
  300. package/dist/core/trace/index.js.map +1 -0
  301. package/dist/core/trace/lineage.js +97 -0
  302. package/dist/core/trace/lineage.js.map +1 -0
  303. package/dist/evaluation/BenchmarkRunner.js +171 -0
  304. package/dist/evaluation/BenchmarkRunner.js.map +1 -0
  305. package/dist/evaluation/classifier/ClassifierAccuracy.js +185 -0
  306. package/dist/evaluation/classifier/ClassifierAccuracy.js.map +1 -0
  307. package/dist/evaluation/classifier/labeledSamples.js +379 -0
  308. package/dist/evaluation/classifier/labeledSamples.js.map +1 -0
  309. package/dist/evaluation/compare/goldCompare.js +126 -0
  310. package/dist/evaluation/compare/goldCompare.js.map +1 -0
  311. package/dist/evaluation/crossre/compareScoring.js +30 -0
  312. package/dist/evaluation/crossre/compareScoring.js.map +1 -0
  313. package/dist/evaluation/datasets/CrossREDataset.js +170 -0
  314. package/dist/evaluation/datasets/CrossREDataset.js.map +1 -0
  315. package/dist/evaluation/datasets/IDataset.js +3 -0
  316. package/dist/evaluation/datasets/IDataset.js.map +1 -0
  317. package/dist/evaluation/datasets/RebelDataset.js +117 -0
  318. package/dist/evaluation/datasets/RebelDataset.js.map +1 -0
  319. package/dist/evaluation/datasets/RedocredDataset.js +218 -0
  320. package/dist/evaluation/datasets/RedocredDataset.js.map +1 -0
  321. package/dist/evaluation/datasets/SemEval2010Dataset.js +150 -0
  322. package/dist/evaluation/datasets/SemEval2010Dataset.js.map +1 -0
  323. package/dist/evaluation/index.js +33 -0
  324. package/dist/evaluation/index.js.map +1 -0
  325. package/dist/evaluation/matching/ExactMatcher.js +75 -0
  326. package/dist/evaluation/matching/ExactMatcher.js.map +1 -0
  327. package/dist/evaluation/matching/SemanticMatcher.js +143 -0
  328. package/dist/evaluation/matching/SemanticMatcher.js.map +1 -0
  329. package/dist/evaluation/metrics/TripleMetrics.js +64 -0
  330. package/dist/evaluation/metrics/TripleMetrics.js.map +1 -0
  331. package/dist/evaluation/mine/MineCheckpoint.js +114 -0
  332. package/dist/evaluation/mine/MineCheckpoint.js.map +1 -0
  333. package/dist/evaluation/mine/MineDataset.js +208 -0
  334. package/dist/evaluation/mine/MineDataset.js.map +1 -0
  335. package/dist/evaluation/mine/MineReporter.js +98 -0
  336. package/dist/evaluation/mine/MineReporter.js.map +1 -0
  337. package/dist/evaluation/mine/MineRunner.js +148 -0
  338. package/dist/evaluation/mine/MineRunner.js.map +1 -0
  339. package/dist/evaluation/mine/MineScorer.js +127 -0
  340. package/dist/evaluation/mine/MineScorer.js.map +1 -0
  341. package/dist/evaluation/mine/types.js +12 -0
  342. package/dist/evaluation/mine/types.js.map +1 -0
  343. package/dist/evaluation/reporters/ConsoleReporter.js +55 -0
  344. package/dist/evaluation/reporters/ConsoleReporter.js.map +1 -0
  345. package/dist/evaluation/reporters/JsonReporter.js +50 -0
  346. package/dist/evaluation/reporters/JsonReporter.js.map +1 -0
  347. package/dist/index.js +28 -0
  348. package/dist/index.js.map +1 -0
  349. package/dist/quality/CompositeScore.js +61 -0
  350. package/dist/quality/CompositeScore.js.map +1 -0
  351. package/dist/quality/ConsistencyMetrics.js +70 -0
  352. package/dist/quality/ConsistencyMetrics.js.map +1 -0
  353. package/dist/quality/FactualMetrics.js +76 -0
  354. package/dist/quality/FactualMetrics.js.map +1 -0
  355. package/dist/quality/GraphHealthMetrics.js +68 -0
  356. package/dist/quality/GraphHealthMetrics.js.map +1 -0
  357. package/dist/quality/SemanticMetrics.js +102 -0
  358. package/dist/quality/SemanticMetrics.js.map +1 -0
  359. package/dist/quality/StructuralMetrics.js +60 -0
  360. package/dist/quality/StructuralMetrics.js.map +1 -0
  361. package/dist/quality/index.js +23 -0
  362. package/dist/quality/index.js.map +1 -0
  363. package/dist/shared/index.js +20 -0
  364. package/dist/shared/index.js.map +1 -0
  365. package/dist/shared/logger/Logger.js +3 -0
  366. package/dist/shared/logger/Logger.js.map +1 -0
  367. package/dist/shared/logger/LoggerFactory.js +75 -0
  368. package/dist/shared/logger/LoggerFactory.js.map +1 -0
  369. package/dist/shared/logger/index.js +19 -0
  370. package/dist/shared/logger/index.js.map +1 -0
  371. package/dist/shared/shutdown.js +30 -0
  372. package/dist/shared/shutdown.js.map +1 -0
  373. package/dist/shared/utils/agglomerativeCluster.js +269 -0
  374. package/dist/shared/utils/agglomerativeCluster.js.map +1 -0
  375. package/dist/shared/utils/astSymbols.js +69 -0
  376. package/dist/shared/utils/astSymbols.js.map +1 -0
  377. package/dist/shared/utils/cosineSimilarity.js +18 -0
  378. package/dist/shared/utils/cosineSimilarity.js.map +1 -0
  379. package/dist/shared/utils/directoryTree.js +184 -0
  380. package/dist/shared/utils/directoryTree.js.map +1 -0
  381. package/dist/shared/utils/documentOutline.js +74 -0
  382. package/dist/shared/utils/documentOutline.js.map +1 -0
  383. package/dist/shared/utils/index.js +24 -0
  384. package/dist/shared/utils/index.js.map +1 -0
  385. package/dist/shared/utils/jaroWinklerSimilarity.js +60 -0
  386. package/dist/shared/utils/jaroWinklerSimilarity.js.map +1 -0
  387. package/dist/shared/utils/parseJsonLenient.js +27 -0
  388. package/dist/shared/utils/parseJsonLenient.js.map +1 -0
  389. package/dist/shared/utils/readConfig.js +42 -0
  390. package/dist/shared/utils/readConfig.js.map +1 -0
  391. package/dist/shared/utils/readRtf.js +216 -0
  392. package/dist/shared/utils/readRtf.js.map +1 -0
  393. package/dist/shared/utils/softmax.js +26 -0
  394. package/dist/shared/utils/softmax.js.map +1 -0
  395. package/dist/types/ContentClass.js +3 -0
  396. package/dist/types/ContentClass.js.map +1 -0
  397. package/dist/types/CorpusProfile.js +3 -0
  398. package/dist/types/CorpusProfile.js.map +1 -0
  399. package/dist/types/IContradictionChecker.js +3 -0
  400. package/dist/types/IContradictionChecker.js.map +1 -0
  401. package/dist/types/ICorpusAnalyzer.js +3 -0
  402. package/dist/types/ICorpusAnalyzer.js.map +1 -0
  403. package/dist/types/IDirectoryProcessor.js +3 -0
  404. package/dist/types/IDirectoryProcessor.js.map +1 -0
  405. package/dist/types/IEmbeddingProvider.js +3 -0
  406. package/dist/types/IEmbeddingProvider.js.map +1 -0
  407. package/dist/types/IEmbeddingService.js +6 -0
  408. package/dist/types/IEmbeddingService.js.map +1 -0
  409. package/dist/types/IFileProcessor.js +3 -0
  410. package/dist/types/IFileProcessor.js.map +1 -0
  411. package/dist/types/IGroundingChecker.js +3 -0
  412. package/dist/types/IGroundingChecker.js.map +1 -0
  413. package/dist/types/IKnowledgeGraphBuilder.js +3 -0
  414. package/dist/types/IKnowledgeGraphBuilder.js.map +1 -0
  415. package/dist/types/IKnowledgeGraphExporter.js +3 -0
  416. package/dist/types/IKnowledgeGraphExporter.js.map +1 -0
  417. package/dist/types/IKnowledgeGraphMerger.js +3 -0
  418. package/dist/types/IKnowledgeGraphMerger.js.map +1 -0
  419. package/dist/types/IKnowledgeGraphSearch.js +3 -0
  420. package/dist/types/IKnowledgeGraphSearch.js.map +1 -0
  421. package/dist/types/ILLMProvider.js +3 -0
  422. package/dist/types/ILLMProvider.js.map +1 -0
  423. package/dist/types/ILLMService.js +3 -0
  424. package/dist/types/ILLMService.js.map +1 -0
  425. package/dist/types/IObjectDetector.js +3 -0
  426. package/dist/types/IObjectDetector.js.map +1 -0
  427. package/dist/types/IProcessingService.js +3 -0
  428. package/dist/types/IProcessingService.js.map +1 -0
  429. package/dist/types/IProgressEmitter.js +3 -0
  430. package/dist/types/IProgressEmitter.js.map +1 -0
  431. package/dist/types/IPromptManager.js +3 -0
  432. package/dist/types/IPromptManager.js.map +1 -0
  433. package/dist/types/KnowledgeGraph.js +3 -0
  434. package/dist/types/KnowledgeGraph.js.map +1 -0
  435. package/dist/types/MCPKnowledgeGraph.js +3 -0
  436. package/dist/types/MCPKnowledgeGraph.js.map +1 -0
  437. package/dist/types/Observation.js +21 -0
  438. package/dist/types/Observation.js.map +1 -0
  439. package/dist/types/ProcessingOptions.js +3 -0
  440. package/dist/types/ProcessingOptions.js.map +1 -0
  441. package/dist/types/index.js +40 -0
  442. package/dist/types/index.js.map +1 -0
  443. package/package.json +122 -0
@@ -0,0 +1,725 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
36
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
37
+ return new (P || (P = Promise))(function (resolve, reject) {
38
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
39
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
40
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
41
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
42
+ });
43
+ };
44
+ Object.defineProperty(exports, "__esModule", { value: true });
45
+ exports.DirectoryProcessor = exports.FileDiscoveryService = void 0;
46
+ const glob_1 = require("glob");
47
+ const fs = __importStar(require("fs"));
48
+ const path = __importStar(require("path"));
49
+ const di_1 = require("./di");
50
+ const corpus_1 = require("./corpus");
51
+ const ReferenceResolver_1 = require("./knowledge/references/ReferenceResolver");
52
+ const imageMetaGraph_1 = require("./knowledge/images/imageMetaGraph");
53
+ const referenceExtraction_1 = require("./processor/readers/referenceExtraction");
54
+ const ProcessedRegistry_1 = require("./processor/ProcessedRegistry");
55
+ const WebReferenceProcessor_1 = require("./knowledge/references/web/WebReferenceProcessor");
56
+ const CitationEvidenceProcessor_1 = require("./knowledge/references/citations/CitationEvidenceProcessor");
57
+ const pipeline_1 = require("./pipeline");
58
+ const canon_1 = require("./knowledge/canon");
59
+ const shared_1 = require("../shared");
60
+ const trace_1 = require("./trace");
61
+ const cost_1 = require("./cost");
62
+ const adapters_1 = require("./adapters");
63
/**
 * Expands the configured glob filters, rooted at the input directory, into the
 * concrete list of files a run should process.
 */
class FileDiscoveryService {
    /**
     * @param options Reads `input` (root directory), `filter` (array of glob
     *   patterns relative to `input`) and `exclude` (passed to glob as `ignore`).
     * @param logger Logger exposing `info` and `warn`.
     */
    constructor(options, logger) {
        this.logger = logger;
        this.dir = options.input;
        this.filter = options.filter;
        this.exclude = options.exclude;
    }
    /**
     * Resolve every filter pattern under the input directory.
     *
     * @returns Promise of the matched file paths (directories excluded).
     * @throws Error when no file matches any pattern (also logged as a warning).
     */
    discover() {
        return __awaiter(this, void 0, void 0, function* () {
            // glob treats "\" as an escape character, so patterns must use "/"
            // separators. path.join emits "\" on Windows; normalize it back so
            // discovery works there too. On POSIX this is a no-op.
            const patterns = this.filter.map((f) => {
                const joined = path.join(this.dir, f);
                return path.sep === "\\" ? joined.split(path.sep).join("/") : joined;
            });
            const files = yield (0, glob_1.glob)(patterns, { nodir: true, ignore: this.exclude });
            if (files.length === 0) {
                const message = `No files found matching pattern: ${this.filter}`;
                this.logger.warn(message);
                throw new Error(message);
            }
            this.logger.info(`Found ${files.length} files to process`);
            return files;
        });
    }
}
84
+ exports.FileDiscoveryService = FileDiscoveryService;
85
+ /**
86
+ * Refactored DirectoryProcessor using dependency injection
87
+ * Focuses on orchestration while delegating business logic to services
88
+ */
89
+ class DirectoryProcessor {
90
+ constructor(container) { // container: DI container; methods resolve their services from it on demand via container.resolve(...)
91
+ this.container = container;
92
+ }
93
+ /**
94
+ * Process a directory and generate knowledge graphs
95
+ */
96
+ processDirectory(options) {
97
+ return __awaiter(this, void 0, void 0, function* () {
98
+ var _a, _b;
99
+ const logger = yield this.container.resolve(di_1.TYPES.Logger);
100
+ const progress = yield this.container.resolve(di_1.TYPES.ProgressEmitter);
101
+ const fileDiscoveryService = yield this.container.resolve(di_1.TYPES.FileDiscoveryService);
102
+ logger.info(`Starting knowledge graph generation`);
103
+ logger.info(`Input: ${options.input}, Filter: ${options.filter}, Output: ${options.output}, Model: ${options.llm.model}`);
104
+ // Debug trace: open the run. A resumed run skips checkpointed chunks, so its
105
+ // trace is partial — flagged here.
106
+ trace_1.trace.emit({
107
+ stage: "run", type: "run_start",
108
+ output: options.output,
109
+ resumed: !!((_a = options.resume) === null || _a === void 0 ? void 0 : _a.enabled),
110
+ config: { model: options.llm.model, promptVersion: options.llm.promptVersion, grounding: (_b = options.grounding) === null || _b === void 0 ? void 0 : _b.mode },
111
+ });
112
+ // Cost meter: attach the resolved logger (configured in ContainerFactory without one).
113
+ if (cost_1.meter.enabled)
114
+ cost_1.meter.attachLogger(logger);
115
+ try {
116
+ // Orchestrate the workflow
117
+ const files = yield fileDiscoveryService.discover();
118
+ progress.emit({ type: "discovery", totalFiles: files.length });
119
+ // Rough pre-run cost estimate (bill-shock heads-up; the end tally is exact).
120
+ if (cost_1.meter.enabled)
121
+ yield this.logCostEstimate(files, options, logger);
122
+ const knowledgeGraphs = yield this.processFiles(files, options);
123
+ if (shared_1.shutdown.isRequested()) {
124
+ logger.warn("Run interrupted — merging and exporting the partial graph collected so far. Re-run with --resume to continue.");
125
+ }
126
+ progress.emit({ type: "merge", graphCount: knowledgeGraphs.length });
127
+ const mergedKG = yield this.mergeGraphs(knowledgeGraphs, logger);
128
+ const finalKG = yield this.applyGraphTransforms(mergedKG, options, logger);
129
+ const outputPath = yield this.exportKnowledgeGraph(finalKG, options);
130
+ progress.emit({
131
+ type: "export",
132
+ format: options.export.format,
133
+ entities: finalKG.entities.length,
134
+ relations: finalKG.relations.length,
135
+ output: outputPath,
136
+ });
137
+ trace_1.trace.emit({
138
+ stage: "export", type: "export",
139
+ format: options.export.format,
140
+ entities: finalKG.entities.length,
141
+ relations: finalKG.relations.length,
142
+ });
143
+ this.logSuccess(finalKG, outputPath, logger);
144
+ // Cost meter: exact end-of-run tally + persist the resume-safe cumulative ledger.
145
+ if (cost_1.meter.enabled) {
146
+ logger.info(cost_1.meter.summary());
147
+ cost_1.meter.persistLedger();
148
+ }
149
+ progress.emit({
150
+ type: "done",
151
+ entities: finalKG.entities.length,
152
+ relations: finalKG.relations.length,
153
+ output: outputPath,
154
+ interrupted: shared_1.shutdown.isRequested(),
155
+ });
156
+ }
157
+ catch (error) {
158
+ this.handleError(error, options.logging.debug, logger);
159
+ progress.emit({
160
+ type: "error",
161
+ message: error instanceof Error ? error.message : String(error),
162
+ });
163
+ throw error;
164
+ }
165
+ });
166
+ }
167
+ /** Rough pre-run cost projection from discovered file sizes (bytes≈chars; no double read pass). */
168
+ logCostEstimate(files, options, logger) {
169
+ return __awaiter(this, void 0, void 0, function* () {
170
+ let totalChars = 0;
171
+ for (const f of files) {
172
+ try {
173
+ totalChars += (yield fs.promises.stat(f)).size;
174
+ }
175
+ catch (_a) {
176
+ /* unreadable/removed — skip */
177
+ }
178
+ }
179
+ const est = cost_1.meter.estimate(totalChars, options.chunking.size, options.llm.model);
180
+ const tokens = est.estPromptTokens + est.estCompletionTokens;
181
+ const money = est.priced
182
+ ? `~${options.cost.currency} ${est.estCost.toFixed(est.estCost < 1 ? 4 : 2)}`
183
+ : `no price set (shown as ${options.cost.currency} 0)`;
184
+ logger.info(`Cost estimate (rough): ~${est.estChunks} chunk(s), ~${tokens.toLocaleString()} tokens for ` +
185
+ `model '${options.llm.model}' — ${money}. Resume-cached chunks reduce actual spend; the ` +
186
+ `end-of-run tally is exact.`);
187
+ });
188
+ }
189
+ /**
190
+ * Process multiple files and generate knowledge graphs
191
+ */
192
+ processFiles(files, options) {
193
+ return __awaiter(this, void 0, void 0, function* () {
194
+ const knowledgeGraphs = [];
195
+ const logger = yield this.container.resolve(di_1.TYPES.Logger);
196
+ const progress = yield this.container.resolve(di_1.TYPES.ProgressEmitter);
197
+ // Load a prior output graph (if any) to seed retrieval CONTEXT only. It must
198
+ // NOT enter the merge set: re-merging already-merged output into a fresh run
199
+ // double-counts entities/observations on a plain (no --resume) re-run.
200
+ const priorGraphs = yield this.loadPriorGraphs(options.output, logger);
201
+ // Optional corpus analysis pre-pass: build/load a corpus-specific glossary
202
+ // (and cached per-file classification) once, before extraction.
203
+ const corpusProfile = yield this.buildCorpusProfile(files, options, logger);
204
+ const fileProcessor = yield this.container.resolve(di_1.TYPES.FileProcessor);
205
+ const kgBuilder = yield this.container.resolve(di_1.TYPES.KnowledgeGraphBuilder);
206
+ // Deterministic AST symbol seed (Phase 8): seed code definitions + exported
207
+ // members (and calls/imports edges) per file so the LLM augments the symbol
208
+ // set rather than originating it. Content-hash cached across the run.
209
+ const astSeed = options.ast.mode === "enabled"
210
+ ? yield this.container.resolve(di_1.TYPES.AstSeedService)
211
+ : undefined;
212
+ yield (astSeed === null || astSeed === void 0 ? void 0 : astSeed.loadCache());
213
+ // Structured-emit adapters (data-sink track): a graph-native source (e.g. a
214
+ // .db) maps directly to graph fragments, bypassing the LLM. Empty registry =
215
+ // every file takes the normal read→build path (default).
216
+ const structuredAdapters = yield this.container
217
+ .resolve(di_1.TYPES.StructuredAdapterRegistry)
218
+ .catch(() => new adapters_1.StructuredAdapterRegistry()); // empty = off-path default
219
+ // Reference & link resolution: the corpus-relative path set drives link
220
+ // resolution (resolved-flag + follow targets). In follow mode it spans the
221
+ // WHOLE input tree (links can point outside the glob); otherwise the glob set.
222
+ const follow = options.references.follow.enabled;
223
+ const internalLinksOn = options.references.internalLinks.enabled || follow;
224
+ let corpusRelPaths;
225
+ if (follow) {
226
+ const allInput = yield new FileDiscoveryService({ input: options.input, filter: ["**/*"], exclude: options.exclude }, logger)
227
+ .discover()
228
+ .catch(() => []);
229
+ corpusRelPaths = new Set(allInput.map((f) => (0, corpus_1.toRelPathId)(options.input, f)));
230
+ }
231
+ else {
232
+ corpusRelPaths = internalLinksOn
233
+ ? new Set(files.map((f) => (0, corpus_1.toRelPathId)(options.input, f)))
234
+ : new Set();
235
+ }
236
+ // Phase 1 — class-3 external web fetcher (opt-in; constructed only when
237
+ // references.web.enabled, so a default run never builds the network layer).
238
+ const webProc = options.references.web.enabled
239
+ ? yield this.buildWebProcessor(options, fileProcessor, kgBuilder, corpusProfile, logger)
240
+ : null;
241
+ // Phase 2 — citation span-fetch (opt-in; constructed only when
242
+ // references.citations.fetch.enabled). Resolves id-bearing cites to OA full
243
+ // text, span-selects the citing claim's evidence, and labels the edge.
244
+ const citeProc = options.references.citations.fetch.enabled
245
+ ? yield this.buildCitationProcessor(options, fileProcessor, kgBuilder, corpusProfile, logger)
246
+ : null;
247
+ // Worklist with a processed-file registry: the same file is read/extracted at
248
+ // most once however it's reached (overlapping globs, reference-following). The
249
+ // queue is seeded from follow.seeds (a crawl) or the discovered glob set, and
250
+ // (in follow mode) grows as internal links are resolved to existing files.
251
+ const registry = new ProcessedRegistry_1.ProcessedRegistry();
252
+ const queued = new Set();
253
+ const queue = [];
254
+ const enqueue = (file, depth) => {
255
+ const id = (0, corpus_1.toRelPathId)(options.input, file);
256
+ if (registry.has(id) || queued.has(id))
257
+ return;
258
+ queued.add(id);
259
+ queue.push({ file, depth });
260
+ };
261
+ const seeds = options.references.follow.seeds;
262
+ if (follow && seeds.length) {
263
+ for (const s of seeds) {
264
+ const abs = path.resolve(options.input, s);
265
+ if (fs.existsSync(abs))
266
+ enqueue(abs, 0);
267
+ else
268
+ logger.warn(`reference-follow seed not found, skipping: ${s}`);
269
+ }
270
+ }
271
+ else {
272
+ for (const f of files)
273
+ enqueue(f, 0);
274
+ }
275
+ const { maxFiles, maxDepth } = options.references.follow;
276
+ let index = 0;
277
+ while (queue.length > 0) {
278
+ // Cooperative interrupt: stop before starting the next file so the
279
+ // partial graph accumulated so far can still be merged and exported.
280
+ if (shared_1.shutdown.isRequested()) {
281
+ logger.warn(`Interrupted — flushing partial graph (${registry.size} files processed)`);
282
+ break;
283
+ }
284
+ if (follow && registry.size >= maxFiles) {
285
+ logger.warn(`reference-follow reached maxFiles=${maxFiles}; stopping discovery`);
286
+ break;
287
+ }
288
+ const { file, depth } = queue.shift();
289
+ const id = (0, corpus_1.toRelPathId)(options.input, file);
290
+ if (registry.has(id))
291
+ continue; // already processed via another path
292
+ index += 1;
293
+ const total = registry.size + queue.length + 1;
294
+ progress.emit({ type: "file_start", index, total, path: file });
295
+ try {
296
+ // Retrieval sees prior output + graphs built so far this run; merge sees
297
+ // only what's built this run (knowledgeGraphs).
298
+ const retrievalContext = [...priorGraphs, ...knowledgeGraphs];
299
+ const { graphs: fileGraphs, links: fileLinks, citations: fileCitations } = yield this.processFile(file, options, fileProcessor, kgBuilder, retrievalContext, logger, corpusProfile, astSeed, corpusRelPaths, structuredAdapters);
300
+ registry.mark(id);
301
+ knowledgeGraphs.push(...fileGraphs);
302
+ // Reference-driven ingestion: enqueue resolved internal-link targets that
303
+ // exist in the corpus and haven't been processed/queued. Network-free —
304
+ // external targets are skipped (that's the web fetcher below).
305
+ if (follow && (maxDepth === 0 || depth < maxDepth)) {
306
+ for (const link of fileLinks) {
307
+ if ((0, referenceExtraction_1.isExternalTarget)(link.target))
308
+ continue;
309
+ const rel = (0, ReferenceResolver_1.resolveInternalTarget)(link, id, corpusRelPaths);
310
+ if (rel && !registry.has(rel) && !queued.has(rel)) {
311
+ enqueue(path.resolve(options.input, rel), depth + 1);
312
+ }
313
+ }
314
+ }
315
+ // Phase 1 — class-3 external web: fetch this file's allowlisted external
316
+ // links (gated), extract, emit `references` edges. Depth-1 (fetched pages
317
+ // are not re-crawled). Offline unless references.web is enabled.
318
+ if (webProc) {
319
+ const webGraph = yield webProc.process(id, fileLinks, options.description);
320
+ if (webGraph)
321
+ knowledgeGraphs.push(webGraph);
322
+ }
323
+ // Phase 2 — citation span-fetch: resolve this file's id-bearing cites to OA
324
+ // full text, fold content + label faithfulness. Offline unless enabled.
325
+ if (citeProc) {
326
+ const citeGraph = yield citeProc.process(id, file, fileCitations);
327
+ if (citeGraph)
328
+ knowledgeGraphs.push(citeGraph);
329
+ }
330
+ const entities = fileGraphs.reduce((n, g) => n + g.entities.length, 0);
331
+ const relations = fileGraphs.reduce((n, g) => n + g.relations.length, 0);
332
+ progress.emit({ type: "file_complete", index, total, path: file, entities, relations });
333
+ if (options.logging.debug) {
334
+ yield this.writeIntermediateResults(knowledgeGraphs, options.output);
335
+ }
336
+ }
337
+ catch (error) {
338
+ registry.mark(id); // don't retry a hard-failing file in this run
339
+ this.handleFileError(file, error, options.logging.debug, logger);
340
+ progress.emit({ type: "file_complete", index, total: registry.size + queue.length, path: file, entities: 0, relations: 0 });
341
+ }
342
+ }
343
+ // Persist the AST symbol cache so an unchanged file is a no-op next run.
344
+ yield (astSeed === null || astSeed === void 0 ? void 0 : astSeed.saveCache());
345
+ // Surface chunks whose extraction failed: they were left uncheckpointed (so
346
+ // --resume retries them) and must not pass silently as "done-and-empty". The
347
+ // partial graph still merges/exports; the run exits non-zero (KG-02).
348
+ const failedChunks = kgBuilder.getFailedChunks();
349
+ if (failedChunks.length > 0) {
350
+ logger.warn(`${failedChunks.length} chunk(s) failed extraction and were left uncheckpointed — ` +
351
+ `re-run with --resume to retry them:`);
352
+ for (const f of failedChunks) {
353
+ logger.warn(` - ${f.filePath} [chunk ${f.chunkIndex}/${f.totalChunks}]: ${f.error}`);
354
+ }
355
+ process.exitCode = 1;
356
+ }
357
+ // Surface claims the inline grounding gate rejected (WI3 manifest trace):
358
+ // in `drop` mode they were removed from the graph, in `flag` mode annotated
359
+ // and kept — either way they must leave a visible trace, not vanish.
360
+ const rejections = kgBuilder.getGroundingRejections();
361
+ if (rejections.length > 0) {
362
+ const dropped = rejections.filter((r) => r.dropped).length;
363
+ logger.warn(`Grounding gate flagged ${rejections.length} ungrounded claim(s)` +
364
+ (dropped > 0 ? ` (${dropped} dropped, ${rejections.length - dropped} flagged)` : ` (all flagged)`) +
365
+ `:`);
366
+ for (const r of rejections) {
367
+ logger.debug(` - [${r.kind}] ${r.subject} (score ${r.score.toFixed(2)}, ` +
368
+ `${r.dropped ? "dropped" : "flagged"}) in ${r.filePath} [chunk ${r.chunkIndex}]: ${r.claim}`);
369
+ }
370
+ }
371
+ return knowledgeGraphs;
372
+ });
373
+ }
374
+ /**
375
+ * Load a previously-written output graph for retrieval seeding. Tolerates both
376
+ * the current single-graph object (`{entities, relations}`) and a legacy array
377
+ * of per-file graphs. Returns [] (and warns) when missing/unparseable — the
378
+ * prior graph is a retrieval nicety, never required.
379
+ */
380
+ loadPriorGraphs(outputPath, logger) {
381
+ return __awaiter(this, void 0, void 0, function* () {
382
+ if (!outputPath || !fs.existsSync(outputPath))
383
+ return [];
384
+ const raw = fs.readFileSync(outputPath, "utf-8");
385
+ // JSONL / mcp-jsonl outputs aren't valid JSON documents — parse them
386
+ // line-by-line (KG-11) instead of warning every run. Route by extension, and
387
+ // also fall back to the JSONL reader if a `.json` somehow fails to parse.
388
+ const isJsonl = /\.(jsonl|mcp-jsonl)$/i.test(outputPath);
389
+ if (isJsonl) {
390
+ const { JsonlExportStrategy } = yield Promise.resolve().then(() => __importStar(require("./export/strategies/JsonlExportStrategy")));
391
+ const graph = JsonlExportStrategy.fromJSONL(raw);
392
+ return graph.entities.length || graph.relations.length ? [graph] : [];
393
+ }
394
+ try {
395
+ const parsed = JSON.parse(raw);
396
+ if (Array.isArray(parsed))
397
+ return parsed;
398
+ if (parsed && typeof parsed === "object" && Array.isArray(parsed.entities)) {
399
+ return [parsed];
400
+ }
401
+ return [];
402
+ }
403
+ catch (_a) {
404
+ // Not a JSON document — try JSONL before giving up (covers a mislabeled file).
405
+ const { JsonlExportStrategy } = yield Promise.resolve().then(() => __importStar(require("./export/strategies/JsonlExportStrategy")));
406
+ const graph = JsonlExportStrategy.fromJSONL(raw);
407
+ if (graph.entities.length || graph.relations.length)
408
+ return [graph];
409
+ logger.warn(`Could not load prior graph at ${outputPath} for retrieval context (ignored)`);
410
+ return [];
411
+ }
412
+ });
413
+ }
414
+ /**
415
+ * Process a single file
416
+ */
417
+ processFile(file_1, options_1, fileProcessor_1, kgBuilder_1, existingGraphs_1, logger_1, corpusProfile_1, astSeed_1) {
418
+ return __awaiter(this, arguments, void 0, function* (file, options, fileProcessor, kgBuilder, existingGraphs, logger, corpusProfile, astSeed, corpusRelPaths = new Set(), structuredAdapters) {
419
+ var _a, _b, _c, _d, _e;
420
+ logger.info(`Processing: ${file}`);
421
+ // Structured-emit path (data-sink track): if an adapter claims this file, it
422
+ // maps the source DIRECTLY to graph fragments (bypassing read→chunk→LLM). The
423
+ // fragment still enters the per-file graphs[] union → merge/canon.
424
+ const adapter = structuredAdapters === null || structuredAdapters === void 0 ? void 0 : structuredAdapters.match(file);
425
+ if (adapter) {
426
+ logger.info(`Structured adapter '${adapter.id}' handling ${file} (graph-native, no LLM)`);
427
+ const graph = yield adapter.extract(file);
428
+ if (trace_1.trace.enabled) {
429
+ trace_1.trace.emit({
430
+ stage: "ingest", type: "chunk", chunkId: `${file}#0`, file,
431
+ chunkIndex: 0, totalChunks: 1, reader: `adapter:${adapter.id}`, contentLength: 0,
432
+ });
433
+ }
434
+ return { graphs: graph ? [graph] : [], links: [], citations: [] };
435
+ }
436
+ // Reuse the pre-pass's cached classification for this file when available.
437
+ const cachedClasses = corpusProfile === null || corpusProfile === void 0 ? void 0 : corpusProfile.perFileClasses[(0, corpus_1.toRelPathId)(options.input, file)];
438
+ const processedFile = yield fileProcessor.processFile(file, cachedClasses);
439
+ // A reader can signal a graceful skip (BinaryReader for binary/unknown
440
+ // files) — honor it before the "no content extracted" guard turns an empty
441
+ // read into a per-file error.
442
+ if ((_a = processedFile.metadata) === null || _a === void 0 ? void 0 : _a.skip) {
443
+ logger.info(`Skipped ${file} (binary / no extractable text)`);
444
+ return { graphs: [], links: [], citations: [] };
445
+ }
446
+ this.validateProcessedFile(processedFile, file, logger);
447
+ const retrieve = yield this.buildRetriever(processedFile, file, existingGraphs, options);
448
+ const promptManager = (yield this.container.resolve(di_1.TYPES.PromptManager));
449
+ const systemPrompt = yield promptManager.getSystemPrompt(options.input, options.filter.join(', '), options.description, (_b = processedFile.metadata) === null || _b === void 0 ? void 0 : _b.classes, corpusProfile === null || corpusProfile === void 0 ? void 0 : corpusProfile.glossary, options.pipeline.extraction.openPredicate);
450
+ const graphs = yield kgBuilder.build(processedFile, systemPrompt, retrieve, corpusProfile === null || corpusProfile === void 0 ? void 0 : corpusProfile.glossary);
451
+ // Append the deterministic AST symbol seed (Phase 8) so it merges with the
452
+ // LLM's per-chunk graphs — the model augments the symbol set, not originates it.
453
+ const seed = astSeed ? yield astSeed.seedGraph(processedFile) : null;
454
+ if (seed)
455
+ graphs.push(seed);
456
+ // Deterministic image metadata (EXIF/C2PA): graph facts that AUGMENT the VLM's
457
+ // read of an image rather than replacing it (sourceAdapter exif/c2pa, confidence).
458
+ // No-op (returns null) unless a reader stashed metadata.exif/metadata.c2pa.
459
+ const imageGraph = (0, imageMetaGraph_1.buildImageMetaGraph)(processedFile, options.input);
460
+ if (imageGraph)
461
+ graphs.push(imageGraph);
462
+ // Deterministic reference edges (Phase 0, network-free): internal links +
463
+ // citations the document already contains, resolved against the corpus.
464
+ // Merges with the LLM graphs like the AST seed above. Following auto-implies
465
+ // internal-link resolution (you can't follow links you didn't extract).
466
+ const internalLinksOn = options.references.internalLinks.enabled || options.references.follow.enabled;
467
+ // When citation-fetch is on (Phase 2), the CitationEvidenceProcessor OWNS the
468
+ // `cites` edges (resolved + faithfulness) — so the network-free resolver stands
469
+ // down on citations to avoid emitting a competing resolved:false edge.
470
+ const fetchOwnsCites = options.references.citations.fetch.enabled;
471
+ const citationsForResolver = options.references.citations.enabled && !fetchOwnsCites;
472
+ if (internalLinksOn || citationsForResolver) {
473
+ const refGraph = (0, ReferenceResolver_1.buildReferenceGraph)(processedFile, corpusRelPaths, options.input, {
474
+ internalLinks: internalLinksOn,
475
+ citations: citationsForResolver,
476
+ });
477
+ if (refGraph)
478
+ graphs.push(refGraph);
479
+ }
480
+ const refs = (_c = processedFile.metadata) === null || _c === void 0 ? void 0 : _c.references;
481
+ return { graphs, links: (_d = refs === null || refs === void 0 ? void 0 : refs.links) !== null && _d !== void 0 ? _d : [], citations: (_e = refs === null || refs === void 0 ? void 0 : refs.citations) !== null && _e !== void 0 ? _e : [] };
482
+ });
483
+ }
484
+ /**
485
+ * Build the Phase-1 web reference processor: the DI-managed gated fetcher +
486
+ * fetch cache, plus an extract closure that runs a fetched page through the
487
+ * normal reader + builder (content only — no reference-resolver/follow on
488
+ * fetched pages = depth-1).
489
+ */
490
+ buildWebProcessor(options, fileProcessor, kgBuilder, corpusProfile, logger) {
491
+ return __awaiter(this, void 0, void 0, function* () {
492
+ const fetcher = yield this.container.resolve(di_1.TYPES.GatedFetcher);
493
+ const cache = yield this.container.resolve(di_1.TYPES.FetchCacheService);
494
+ const promptManager = (yield this.container.resolve(di_1.TYPES.PromptManager));
495
+ const extract = (tempPath) => __awaiter(this, void 0, void 0, function* () {
496
+ var _a, _b;
497
+ const pf = yield fileProcessor.processFile(tempPath);
498
+ if (((_a = pf.metadata) === null || _a === void 0 ? void 0 : _a.skip) || !pf.chunks.length)
499
+ return [];
500
+ const systemPrompt = yield promptManager.getSystemPrompt(options.input, options.filter.join(", "), options.description, (_b = pf.metadata) === null || _b === void 0 ? void 0 : _b.classes, corpusProfile === null || corpusProfile === void 0 ? void 0 : corpusProfile.glossary, options.pipeline.extraction.openPredicate);
501
+ return kgBuilder.build(pf, systemPrompt, undefined, corpusProfile === null || corpusProfile === void 0 ? void 0 : corpusProfile.glossary);
502
+ });
503
+ return new WebReferenceProcessor_1.WebReferenceProcessor(fetcher, cache, extract, logger);
504
+ });
505
+ }
506
+ /**
507
+ * Build the Phase-2 citation evidence processor: a PDF-capable gated fetcher +
508
+ * its own fetch cache + the id→OA resolver, an extract closure that runs a
509
+ * fetched cited PDF through the normal reader (chunks for span-select) + builder
510
+ * (content folded onto the cited-work node), the embedding provider for
511
+ * span-select, and (optionally) GROBID for marker→claim linking + MiniCheck for
512
+ * the faithfulness label.
513
+ */
514
+ buildCitationProcessor(options, fileProcessor, kgBuilder, corpusProfile, logger) {
515
+ return __awaiter(this, void 0, void 0, function* () {
516
+ const { CitationResolver } = yield Promise.resolve().then(() => __importStar(require("./knowledge/references/citations/CitationResolver")));
517
+ const fetcher = yield this.container.resolve(di_1.TYPES.CitationFetcher);
518
+ const cache = yield this.container.resolve(di_1.TYPES.CitationFetchCache);
519
+ const resolver = yield this.container.resolve(di_1.TYPES.CitationResolver);
520
+ const embeddings = yield this.container.resolve(di_1.TYPES.EmbeddingService);
521
+ const promptManager = (yield this.container.resolve(di_1.TYPES.PromptManager));
522
+ const cfg = options.references.citations;
523
+ const grobid = cfg.grobid.enabled
524
+ ? yield this.container.resolve(di_1.TYPES.GrobidClient)
525
+ : null;
526
+ if (grobid && !(yield grobid.isAlive())) {
527
+ logger.warn(`GROBID not reachable at ${cfg.grobid.url} — citation span-select/faithfulness disabled (id-bearing fetch still runs). Start it with: docker run -p 8070:8070 lfoppiano/grobid`);
528
+ }
529
+ let faithfulness = null;
530
+ if (cfg.fetch.minicheck) {
531
+ const { MiniCheckGroundingChecker } = yield Promise.resolve().then(() => __importStar(require("./knowledge/grounding")));
532
+ faithfulness = new MiniCheckGroundingChecker({ model: cfg.fetch.minicheckModel, host: cfg.fetch.minicheckHost, min: 0.5, escalateAbove: 1.1 }, logger);
533
+ }
534
+ const extract = (tempPath) => __awaiter(this, void 0, void 0, function* () {
535
+ var _a, _b;
536
+ const pf = yield fileProcessor.processFile(tempPath);
537
+ if (((_a = pf.metadata) === null || _a === void 0 ? void 0 : _a.skip) || !pf.chunks.length)
538
+ return { chunks: [], graphs: [] };
539
+ const chunks = pf.chunks.map((ch) => ch.content);
540
+ const systemPrompt = yield promptManager.getSystemPrompt(options.input, options.filter.join(", "), options.description, (_b = pf.metadata) === null || _b === void 0 ? void 0 : _b.classes, corpusProfile === null || corpusProfile === void 0 ? void 0 : corpusProfile.glossary, options.pipeline.extraction.openPredicate);
541
+ const graphs = yield kgBuilder.build(pf, systemPrompt, undefined, corpusProfile === null || corpusProfile === void 0 ? void 0 : corpusProfile.glossary);
542
+ return { chunks, graphs };
543
+ });
544
+ return new CitationEvidenceProcessor_1.CitationEvidenceProcessor(fetcher, cache, resolver, extract, embeddings, logger, {
545
+ grobid,
546
+ faithfulness,
547
+ uncertainBand: cfg.fetch.uncertainBand,
548
+ });
549
+ });
550
+ }
551
+ /**
552
+ * Run the optional corpus analysis pre-pass (term frequency + cached
553
+ * classification + LLM glossary). Returns undefined when disabled or on
554
+ * failure — profiling is an enhancement, never required.
555
+ */
556
+ buildCorpusProfile(files, options, logger) {
557
+ return __awaiter(this, void 0, void 0, function* () {
558
+ if (options.corpus.profiling !== "enabled")
559
+ return undefined;
560
+ try {
561
+ logger.info("Corpus analysis pre-pass enabled — profiling corpus before extraction");
562
+ const analyzer = yield this.container.resolve(di_1.TYPES.CorpusAnalyzer);
563
+ return yield analyzer.analyzeOrLoad(files, options);
564
+ }
565
+ catch (error) {
566
+ logger.warn(`Corpus pre-pass failed (continuing without a glossary): ${error}`);
567
+ return undefined;
568
+ }
569
+ });
570
+ }
571
+ /**
572
+ * Validate processed file content
573
+ */
574
+ validateProcessedFile(processedFile, filePath, logger) {
575
+ var _a;
576
+ if (!((_a = processedFile.chunks) === null || _a === void 0 ? void 0 : _a.length)) {
577
+ logger.warn(`No content extracted from: ${filePath}`);
578
+ throw new Error(`No content extracted from file: ${filePath}`);
579
+ }
580
+ }
581
+ /**
582
+ * Build a retrieval function for a file, or undefined when retrieval is
583
+ * disabled / there's no existing graph to search.
584
+ *
585
+ * - `retrievalScope: "chunk"` (default) returns a function that retrieves
586
+ * context per chunk using that chunk's own content.
587
+ * - `retrievalScope: "file"` retrieves once from the first chunk and reuses
588
+ * it for every chunk (legacy behavior).
589
+ */
590
+ buildRetriever(processedFile, filePath, existingGraphs, options) {
591
+ return __awaiter(this, void 0, void 0, function* () {
592
+ if (!this.shouldUseRetrieval(options) || existingGraphs.length === 0) {
593
+ return undefined;
594
+ }
595
+ const searchService = yield this.container.resolve(di_1.TYPES.KnowledgeGraphSearch);
596
+ const searchOptions = {
597
+ limit: options.retrieval.limit,
598
+ includeObservations: true,
599
+ };
600
+ const search = (content) => searchService.searchByFileContent(content, filePath, existingGraphs, searchOptions);
601
+ if (options.retrieval.scope === "file") {
602
+ // Retrieve once from the first chunk, reuse for all chunks.
603
+ const context = yield search(processedFile.chunks[0].content);
604
+ return () => __awaiter(this, void 0, void 0, function* () { return context; });
605
+ }
606
+ // Default: per-chunk retrieval.
607
+ return (chunkContent) => search(chunkContent);
608
+ });
609
+ }
610
+ /**
611
+ * Determine if retrieval should be used
612
+ */
613
+ shouldUseRetrieval(options) {
614
+ // Fix the conflicting boolean pairs issue
615
+ if (options.retrieval.mode === "disabled")
616
+ return false;
617
+ if (options.retrieval.mode === "enabled")
618
+ return true;
619
+ return true; // Auto to true
620
+ }
621
+ /**
622
+ * Merge multiple knowledge graphs
623
+ */
624
+ mergeGraphs(graphs, logger) {
625
+ return __awaiter(this, void 0, void 0, function* () {
626
+ logger.info(`Merging ${graphs.length} knowledge graphs`);
627
+ const merger = yield this.container.resolve(di_1.TYPES.KnowledgeGraphMerger);
628
+ return yield merger.merge(graphs);
629
+ });
630
+ }
631
+ /**
632
+ * Run the post-extraction graph→graph transform pipeline (grounding gate,
633
+ * canonicalization) over the merged graph, in the order from `pipeline.stages`.
634
+ * A no-op when no transform is enabled — the providers resolved here are the
635
+ * same singletons extraction/merge already built, so the baseline path returns
636
+ * the merged graph unchanged.
637
+ */
638
+ applyGraphTransforms(graph, options, logger) {
639
+ return __awaiter(this, void 0, void 0, function* () {
640
+ const transforms = [
641
+ new pipeline_1.GroundingTransform(),
642
+ new canon_1.Canonicalizer(),
643
+ new pipeline_1.RelationFilterTransform(), // after canon: endpoints are canonical before pairing
644
+ ];
645
+ const ctx = {
646
+ options,
647
+ embeddings: yield this.container.resolve(di_1.TYPES.EmbeddingService),
648
+ llm: yield this.container.resolve(di_1.TYPES.LLMService),
649
+ logger,
650
+ };
651
+ const runner = new pipeline_1.PipelineRunner(transforms, ctx);
652
+ if (!runner.hasWork())
653
+ return graph;
654
+ return runner.run(graph);
655
+ });
656
+ }
657
+ /**
658
+ * Export knowledge graph in the requested format
659
+ */
660
+ exportKnowledgeGraph(knowledgeGraph, options) {
661
+ return __awaiter(this, void 0, void 0, function* () {
662
+ yield this.ensureOutputDirectory(options.output);
663
+ const exporter = yield this.container.resolve(di_1.TYPES.KnowledgeGraphExportService);
664
+ const exportFormat = options.export.format;
665
+ if (!exporter.isFormatSupported(exportFormat)) {
666
+ throw new Error(`Unsupported export format: ${exportFormat}. Supported: ${exporter
667
+ .getSupportedFormats()
668
+ .join(", ")}`);
669
+ }
670
+ const outputContent = exporter.export(knowledgeGraph, exportFormat, options);
671
+ const outputPath = this.getOutputPath(options.output, exportFormat);
672
+ yield fs.promises.writeFile(outputPath, outputContent);
673
+ return outputPath;
674
+ });
675
+ }
676
+ /**
677
+ * Ensure output directory exists
678
+ */
679
+ ensureOutputDirectory(outputPath) {
680
+ return __awaiter(this, void 0, void 0, function* () {
681
+ const outputDir = path.dirname(outputPath);
682
+ if (!fs.existsSync(outputDir)) {
683
+ fs.mkdirSync(outputDir, { recursive: true });
684
+ }
685
+ });
686
+ }
687
+ /**
688
+ * Get the final output path with correct extension
689
+ */
690
+ getOutputPath(originalPath, format) {
691
+ return originalPath.endsWith(`.${format}`)
692
+ ? originalPath
693
+ : originalPath.replace(/\.[^.]+$/, `.${format}`);
694
+ }
695
+ /**
696
+ * Write intermediate results for debugging
697
+ */
698
+ writeIntermediateResults(graphs, outputPath) {
699
+ return __awaiter(this, void 0, void 0, function* () {
700
+ const tmpPath = outputPath + ".tmp";
701
+ yield fs.promises.writeFile(tmpPath, JSON.stringify(graphs, null, 2));
702
+ });
703
+ }
704
+ /**
705
+ * Handle file processing errors
706
+ */
707
+ handleFileError(file, error, debug, logger) {
708
+ logger.error(`Failed to process file ${file}: ${error.message || error}`);
709
+ }
710
+ /**
711
+ * Handle general processing errors
712
+ */
713
+ handleError(error, debug, logger) {
714
+ logger.error(`Failed to process directory: ${error.message || error}`);
715
+ }
716
+ /**
717
+ * Log successful completion
718
+ */
719
+ logSuccess(knowledgeGraph, outputPath, logger) {
720
+ logger.info(`Knowledge graph saved to: ${outputPath}`);
721
+ logger.info(`Final graph: ${knowledgeGraph.entities.length} entities, ${knowledgeGraph.relations.length} relations`);
722
+ }
723
+ }
724
+ exports.DirectoryProcessor = DirectoryProcessor;
725
+ //# sourceMappingURL=DirectoryProcessor.js.map