@wanshi-kg/wanshi 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (443)
  1. package/LICENSE +21 -0
  2. package/README.md +458 -0
  3. package/dist/__tests__/helpers.js +27 -0
  4. package/dist/__tests__/helpers.js.map +1 -0
  5. package/dist/cli/commands/export.command.js +99 -0
  6. package/dist/cli/commands/export.command.js.map +1 -0
  7. package/dist/cli/commands/index.js +22 -0
  8. package/dist/cli/commands/index.js.map +1 -0
  9. package/dist/cli/commands/inspectMerges.command.js +84 -0
  10. package/dist/cli/commands/inspectMerges.command.js.map +1 -0
  11. package/dist/cli/commands/metrics.command.js +196 -0
  12. package/dist/cli/commands/metrics.command.js.map +1 -0
  13. package/dist/cli/commands/process.command.js +82 -0
  14. package/dist/cli/commands/process.command.js.map +1 -0
  15. package/dist/cli/commands/watch.command.js +91 -0
  16. package/dist/cli/commands/watch.command.js.map +1 -0
  17. package/dist/cli/index.js +269 -0
  18. package/dist/cli/index.js.map +1 -0
  19. package/dist/cli/optionsToConfig.js +160 -0
  20. package/dist/cli/optionsToConfig.js.map +1 -0
  21. package/dist/config/index.js +59 -0
  22. package/dist/config/index.js.map +1 -0
  23. package/dist/config/legacyHints.js +113 -0
  24. package/dist/config/legacyHints.js.map +1 -0
  25. package/dist/config/schema.js +803 -0
  26. package/dist/config/schema.js.map +1 -0
  27. package/dist/config/ui.js +221 -0
  28. package/dist/config/ui.js.map +1 -0
  29. package/dist/core/DirectoryProcessor.js +725 -0
  30. package/dist/core/DirectoryProcessor.js.map +1 -0
  31. package/dist/core/adapters/IStructuredAdapter.js +3 -0
  32. package/dist/core/adapters/IStructuredAdapter.js.map +1 -0
  33. package/dist/core/adapters/SqliteAdapter.js +267 -0
  34. package/dist/core/adapters/SqliteAdapter.js.map +1 -0
  35. package/dist/core/adapters/StructuredAdapterRegistry.js +31 -0
  36. package/dist/core/adapters/StructuredAdapterRegistry.js.map +1 -0
  37. package/dist/core/adapters/index.js +20 -0
  38. package/dist/core/adapters/index.js.map +1 -0
  39. package/dist/core/checkpoint/CheckpointService.js +188 -0
  40. package/dist/core/checkpoint/CheckpointService.js.map +1 -0
  41. package/dist/core/checkpoint/index.js +18 -0
  42. package/dist/core/checkpoint/index.js.map +1 -0
  43. package/dist/core/corpus/CorpusAnalyzer.js +266 -0
  44. package/dist/core/corpus/CorpusAnalyzer.js.map +1 -0
  45. package/dist/core/corpus/CorpusProfileStore.js +92 -0
  46. package/dist/core/corpus/CorpusProfileStore.js.map +1 -0
  47. package/dist/core/corpus/index.js +21 -0
  48. package/dist/core/corpus/index.js.map +1 -0
  49. package/dist/core/corpus/normalizeGlossary.js +60 -0
  50. package/dist/core/corpus/normalizeGlossary.js.map +1 -0
  51. package/dist/core/corpus/relPath.js +52 -0
  52. package/dist/core/corpus/relPath.js.map +1 -0
  53. package/dist/core/corpus/termFrequency.js +86 -0
  54. package/dist/core/corpus/termFrequency.js.map +1 -0
  55. package/dist/core/cost/CostMeter.js +235 -0
  56. package/dist/core/cost/CostMeter.js.map +1 -0
  57. package/dist/core/cost/index.js +19 -0
  58. package/dist/core/cost/index.js.map +1 -0
  59. package/dist/core/cost/prices.js +38 -0
  60. package/dist/core/cost/prices.js.map +1 -0
  61. package/dist/core/cv/ObjectDetectionService.js +119 -0
  62. package/dist/core/cv/ObjectDetectionService.js.map +1 -0
  63. package/dist/core/di/ContainerFactory.js +670 -0
  64. package/dist/core/di/ContainerFactory.js.map +1 -0
  65. package/dist/core/di/DIContainer.js +103 -0
  66. package/dist/core/di/DIContainer.js.map +1 -0
  67. package/dist/core/di/index.js +19 -0
  68. package/dist/core/di/index.js.map +1 -0
  69. package/dist/core/errors/CustomErrors.js +342 -0
  70. package/dist/core/errors/CustomErrors.js.map +1 -0
  71. package/dist/core/errors/index.js +18 -0
  72. package/dist/core/errors/index.js.map +1 -0
  73. package/dist/core/export/KnowledgeGraphExportService.js +56 -0
  74. package/dist/core/export/KnowledgeGraphExportService.js.map +1 -0
  75. package/dist/core/export/index.js +19 -0
  76. package/dist/core/export/index.js.map +1 -0
  77. package/dist/core/export/strategies/GraphitiExportStrategy.js +115 -0
  78. package/dist/core/export/strategies/GraphitiExportStrategy.js.map +1 -0
  79. package/dist/core/export/strategies/GraphvizDotExportStrategy.js +331 -0
  80. package/dist/core/export/strategies/GraphvizDotExportStrategy.js.map +1 -0
  81. package/dist/core/export/strategies/IExportStrategy.js +3 -0
  82. package/dist/core/export/strategies/IExportStrategy.js.map +1 -0
  83. package/dist/core/export/strategies/JsonExportStrategy.js +19 -0
  84. package/dist/core/export/strategies/JsonExportStrategy.js.map +1 -0
  85. package/dist/core/export/strategies/JsonlExportStrategy.js +69 -0
  86. package/dist/core/export/strategies/JsonlExportStrategy.js.map +1 -0
  87. package/dist/core/export/strategies/KblamExportStrategy.js +36 -0
  88. package/dist/core/export/strategies/KblamExportStrategy.js.map +1 -0
  89. package/dist/core/export/strategies/LoraExportStrategy.js +46 -0
  90. package/dist/core/export/strategies/LoraExportStrategy.js.map +1 -0
  91. package/dist/core/export/strategies/McpExportStrategy.js +67 -0
  92. package/dist/core/export/strategies/McpExportStrategy.js.map +1 -0
  93. package/dist/core/export/strategies/index.js +25 -0
  94. package/dist/core/export/strategies/index.js.map +1 -0
  95. package/dist/core/export/strategies/kbTriples.js +60 -0
  96. package/dist/core/export/strategies/kbTriples.js.map +1 -0
  97. package/dist/core/index.js +22 -0
  98. package/dist/core/index.js.map +1 -0
  99. package/dist/core/knowledge/KnowledgeGraphBuilder.js +627 -0
  100. package/dist/core/knowledge/KnowledgeGraphBuilder.js.map +1 -0
  101. package/dist/core/knowledge/MergeRecord.js +3 -0
  102. package/dist/core/knowledge/MergeRecord.js.map +1 -0
  103. package/dist/core/knowledge/canon/Canonicalizer.js +414 -0
  104. package/dist/core/knowledge/canon/Canonicalizer.js.map +1 -0
  105. package/dist/core/knowledge/canon/index.js +18 -0
  106. package/dist/core/knowledge/canon/index.js.map +1 -0
  107. package/dist/core/knowledge/contradiction/HeuristicContradictionChecker.js +92 -0
  108. package/dist/core/knowledge/contradiction/HeuristicContradictionChecker.js.map +1 -0
  109. package/dist/core/knowledge/contradiction/LlmContradictionChecker.js +52 -0
  110. package/dist/core/knowledge/contradiction/LlmContradictionChecker.js.map +1 -0
  111. package/dist/core/knowledge/contradiction/index.js +19 -0
  112. package/dist/core/knowledge/contradiction/index.js.map +1 -0
  113. package/dist/core/knowledge/grounding/KeywordGroundingChecker.js +33 -0
  114. package/dist/core/knowledge/grounding/KeywordGroundingChecker.js.map +1 -0
  115. package/dist/core/knowledge/grounding/MiniCheckGroundingChecker.js +82 -0
  116. package/dist/core/knowledge/grounding/MiniCheckGroundingChecker.js.map +1 -0
  117. package/dist/core/knowledge/grounding/index.js +20 -0
  118. package/dist/core/knowledge/grounding/index.js.map +1 -0
  119. package/dist/core/knowledge/grounding/verbalize.js +38 -0
  120. package/dist/core/knowledge/grounding/verbalize.js.map +1 -0
  121. package/dist/core/knowledge/images/imageMetaGraph.js +136 -0
  122. package/dist/core/knowledge/images/imageMetaGraph.js.map +1 -0
  123. package/dist/core/knowledge/index.js +20 -0
  124. package/dist/core/knowledge/index.js.map +1 -0
  125. package/dist/core/knowledge/merging/KnowledgeMerger.js +624 -0
  126. package/dist/core/knowledge/merging/KnowledgeMerger.js.map +1 -0
  127. package/dist/core/knowledge/references/ReferenceResolver.js +184 -0
  128. package/dist/core/knowledge/references/ReferenceResolver.js.map +1 -0
  129. package/dist/core/knowledge/references/citations/CitationEvidenceProcessor.js +401 -0
  130. package/dist/core/knowledge/references/citations/CitationEvidenceProcessor.js.map +1 -0
  131. package/dist/core/knowledge/references/citations/CitationResolver.js +95 -0
  132. package/dist/core/knowledge/references/citations/CitationResolver.js.map +1 -0
  133. package/dist/core/knowledge/references/citations/GrobidClient.js +143 -0
  134. package/dist/core/knowledge/references/citations/GrobidClient.js.map +1 -0
  135. package/dist/core/knowledge/references/citations/TitleIdResolver.js +101 -0
  136. package/dist/core/knowledge/references/citations/TitleIdResolver.js.map +1 -0
  137. package/dist/core/knowledge/references/web/FetchCacheService.js +114 -0
  138. package/dist/core/knowledge/references/web/FetchCacheService.js.map +1 -0
  139. package/dist/core/knowledge/references/web/GatedFetcher.js +228 -0
  140. package/dist/core/knowledge/references/web/GatedFetcher.js.map +1 -0
  141. package/dist/core/knowledge/references/web/WebReferenceProcessor.js +164 -0
  142. package/dist/core/knowledge/references/web/WebReferenceProcessor.js.map +1 -0
  143. package/dist/core/knowledge/search/KnowledgeGraphSearch.js +261 -0
  144. package/dist/core/knowledge/search/KnowledgeGraphSearch.js.map +1 -0
  145. package/dist/core/knowledge/vocabulary.js +162 -0
  146. package/dist/core/knowledge/vocabulary.js.map +1 -0
  147. package/dist/core/llm/EmbeddingService.js +113 -0
  148. package/dist/core/llm/EmbeddingService.js.map +1 -0
  149. package/dist/core/llm/OllamaService.js +146 -0
  150. package/dist/core/llm/OllamaService.js.map +1 -0
  151. package/dist/core/llm/OpenAICompatibleService.js +190 -0
  152. package/dist/core/llm/OpenAICompatibleService.js.map +1 -0
  153. package/dist/core/llm/OpenAIEmbeddingService.js +129 -0
  154. package/dist/core/llm/OpenAIEmbeddingService.js.map +1 -0
  155. package/dist/core/llm/embeddingUtils.js +25 -0
  156. package/dist/core/llm/embeddingUtils.js.map +1 -0
  157. package/dist/core/llm/index.js +23 -0
  158. package/dist/core/llm/index.js.map +1 -0
  159. package/dist/core/llm/prompts/PromptManager.js +388 -0
  160. package/dist/core/llm/prompts/PromptManager.js.map +1 -0
  161. package/dist/core/llm/prompts/PromptTemplateEngine.js +257 -0
  162. package/dist/core/llm/prompts/PromptTemplateEngine.js.map +1 -0
  163. package/dist/core/llm/prompts/templates/partials/examples/EXAMPLE_STYLE_GUIDE.md +84 -0
  164. package/dist/core/llm/prompts/templates/partials/examples/article.md +187 -0
  165. package/dist/core/llm/prompts/templates/partials/examples/code.md +229 -0
  166. package/dist/core/llm/prompts/templates/partials/examples/communication.md +205 -0
  167. package/dist/core/llm/prompts/templates/partials/examples/documentation.md +262 -0
  168. package/dist/core/llm/prompts/templates/partials/examples/financial.md +157 -0
  169. package/dist/core/llm/prompts/templates/partials/examples/legal.md +153 -0
  170. package/dist/core/llm/prompts/templates/partials/examples/logs.md +127 -0
  171. package/dist/core/llm/prompts/templates/partials/examples/medical.md +218 -0
  172. package/dist/core/llm/prompts/templates/partials/examples/notes.md +201 -0
  173. package/dist/core/llm/prompts/templates/partials/examples/research.md +208 -0
  174. package/dist/core/llm/prompts/templates/partials/examples/tabular.md +178 -0
  175. package/dist/core/llm/prompts/templates/partials/examples/transcript.md +204 -0
  176. package/dist/core/llm/prompts/templates/partials/retrieved-context.hbs +18 -0
  177. package/dist/core/llm/prompts/templates/v1/system.hbs +371 -0
  178. package/dist/core/llm/prompts/templates/v1/user.hbs +20 -0
  179. package/dist/core/llm/prompts/templates/v2/system.hbs +573 -0
  180. package/dist/core/llm/prompts/templates/v2/user.hbs +20 -0
  181. package/dist/core/llm/prompts/templates/v3/system.hbs +861 -0
  182. package/dist/core/llm/prompts/templates/v3/user.hbs +16 -0
  183. package/dist/core/llm/prompts/templates/v4/system.hbs +800 -0
  184. package/dist/core/llm/prompts/templates/v4/user.hbs +40 -0
  185. package/dist/core/llm/prompts/templates/v4.5/system.hbs +71 -0
  186. package/dist/core/llm/prompts/templates/v4.5/user.hbs +46 -0
  187. package/dist/core/llm/prompts/templates/v5/glossary/system.hbs +40 -0
  188. package/dist/core/llm/prompts/templates/v5/glossary/user.hbs +11 -0
  189. package/dist/core/llm/prompts/templates/v5/system.hbs +163 -0
  190. package/dist/core/llm/prompts/templates/v5/user.hbs +55 -0
  191. package/dist/core/pipeline/GroundingTransform.js +52 -0
  192. package/dist/core/pipeline/GroundingTransform.js.map +1 -0
  193. package/dist/core/pipeline/PipelineRunner.js +51 -0
  194. package/dist/core/pipeline/PipelineRunner.js.map +1 -0
  195. package/dist/core/pipeline/RelationFilterTransform.js +72 -0
  196. package/dist/core/pipeline/RelationFilterTransform.js.map +1 -0
  197. package/dist/core/pipeline/index.js +20 -0
  198. package/dist/core/pipeline/index.js.map +1 -0
  199. package/dist/core/processor/FileProcessor.js +184 -0
  200. package/dist/core/processor/FileProcessor.js.map +1 -0
  201. package/dist/core/processor/ProcessedRegistry.js +38 -0
  202. package/dist/core/processor/ProcessedRegistry.js.map +1 -0
  203. package/dist/core/processor/ast/AstSeedService.js +0 -0
  204. package/dist/core/processor/ast/AstSeedService.js.map +1 -0
  205. package/dist/core/processor/ast/AstSymbolStore.js +110 -0
  206. package/dist/core/processor/ast/AstSymbolStore.js.map +1 -0
  207. package/dist/core/processor/ast/index.js +19 -0
  208. package/dist/core/processor/ast/index.js.map +1 -0
  209. package/dist/core/processor/chunking/TextChunker.js +98 -0
  210. package/dist/core/processor/chunking/TextChunker.js.map +1 -0
  211. package/dist/core/processor/chunking/index.js +18 -0
  212. package/dist/core/processor/chunking/index.js.map +1 -0
  213. package/dist/core/processor/classifier/CONTENT_CLASSES.js +294 -0
  214. package/dist/core/processor/classifier/CONTENT_CLASSES.js.map +1 -0
  215. package/dist/core/processor/classifier/CascadeContentClassifier.js +107 -0
  216. package/dist/core/processor/classifier/CascadeContentClassifier.js.map +1 -0
  217. package/dist/core/processor/classifier/HeuristicContentClassifier.js +113 -0
  218. package/dist/core/processor/classifier/HeuristicContentClassifier.js.map +1 -0
  219. package/dist/core/processor/classifier/IContentTypeClassifier.js +3 -0
  220. package/dist/core/processor/classifier/IContentTypeClassifier.js.map +1 -0
  221. package/dist/core/processor/classifier/LlmContentClassifier.js +107 -0
  222. package/dist/core/processor/classifier/LlmContentClassifier.js.map +1 -0
  223. package/dist/core/processor/classifier/NER_DOMAIN_EXAMPLES.js +498 -0
  224. package/dist/core/processor/classifier/NER_DOMAIN_EXAMPLES.js.map +1 -0
  225. package/dist/core/processor/classifier/index.js +21 -0
  226. package/dist/core/processor/classifier/index.js.map +1 -0
  227. package/dist/core/processor/classifier/mergeClassifications.js +32 -0
  228. package/dist/core/processor/classifier/mergeClassifications.js.map +1 -0
  229. package/dist/core/processor/index.js +20 -0
  230. package/dist/core/processor/index.js.map +1 -0
  231. package/dist/core/processor/readers/AudioReader.js +462 -0
  232. package/dist/core/processor/readers/AudioReader.js.map +1 -0
  233. package/dist/core/processor/readers/BinaryReader.js +90 -0
  234. package/dist/core/processor/readers/BinaryReader.js.map +1 -0
  235. package/dist/core/processor/readers/ChandraPdfReader.js +187 -0
  236. package/dist/core/processor/readers/ChandraPdfReader.js.map +1 -0
  237. package/dist/core/processor/readers/ChatExportReader.js +365 -0
  238. package/dist/core/processor/readers/ChatExportReader.js.map +1 -0
  239. package/dist/core/processor/readers/DoclingReader.js +445 -0
  240. package/dist/core/processor/readers/DoclingReader.js.map +1 -0
  241. package/dist/core/processor/readers/EmailReader.js +259 -0
  242. package/dist/core/processor/readers/EmailReader.js.map +1 -0
  243. package/dist/core/processor/readers/EpubReader.js +175 -0
  244. package/dist/core/processor/readers/EpubReader.js.map +1 -0
  245. package/dist/core/processor/readers/FileReader.js +90 -0
  246. package/dist/core/processor/readers/FileReader.js.map +1 -0
  247. package/dist/core/processor/readers/FileReaderFactory.js +49 -0
  248. package/dist/core/processor/readers/FileReaderFactory.js.map +1 -0
  249. package/dist/core/processor/readers/HtmlReader.js +371 -0
  250. package/dist/core/processor/readers/HtmlReader.js.map +1 -0
  251. package/dist/core/processor/readers/ImageReader.js +162 -0
  252. package/dist/core/processor/readers/ImageReader.js.map +1 -0
  253. package/dist/core/processor/readers/JsonFileReader.js +232 -0
  254. package/dist/core/processor/readers/JsonFileReader.js.map +1 -0
  255. package/dist/core/processor/readers/JupyterReader.js +178 -0
  256. package/dist/core/processor/readers/JupyterReader.js.map +1 -0
  257. package/dist/core/processor/readers/LatexReader.js +176 -0
  258. package/dist/core/processor/readers/LatexReader.js.map +1 -0
  259. package/dist/core/processor/readers/MarkdownReader.js +289 -0
  260. package/dist/core/processor/readers/MarkdownReader.js.map +1 -0
  261. package/dist/core/processor/readers/MarkerPdfReader.js +193 -0
  262. package/dist/core/processor/readers/MarkerPdfReader.js.map +1 -0
  263. package/dist/core/processor/readers/MistralOcrReader.js +198 -0
  264. package/dist/core/processor/readers/MistralOcrReader.js.map +1 -0
  265. package/dist/core/processor/readers/OfficeReader.js +174 -0
  266. package/dist/core/processor/readers/OfficeReader.js.map +1 -0
  267. package/dist/core/processor/readers/PdfReader.js +116 -0
  268. package/dist/core/processor/readers/PdfReader.js.map +1 -0
  269. package/dist/core/processor/readers/RtfReader.js +107 -0
  270. package/dist/core/processor/readers/RtfReader.js.map +1 -0
  271. package/dist/core/processor/readers/SubtitleReader.js +145 -0
  272. package/dist/core/processor/readers/SubtitleReader.js.map +1 -0
  273. package/dist/core/processor/readers/TesseractPdfReader.js +183 -0
  274. package/dist/core/processor/readers/TesseractPdfReader.js.map +1 -0
  275. package/dist/core/processor/readers/TextReader.js +129 -0
  276. package/dist/core/processor/readers/TextReader.js.map +1 -0
  277. package/dist/core/processor/readers/TranscriptReader.js +234 -0
  278. package/dist/core/processor/readers/TranscriptReader.js.map +1 -0
  279. package/dist/core/processor/readers/image/imageMetadata.js +155 -0
  280. package/dist/core/processor/readers/image/imageMetadata.js.map +1 -0
  281. package/dist/core/processor/readers/index.js +41 -0
  282. package/dist/core/processor/readers/index.js.map +1 -0
  283. package/dist/core/processor/readers/referenceExtraction.js +198 -0
  284. package/dist/core/processor/readers/referenceExtraction.js.map +1 -0
  285. package/dist/core/processor/readers/stripReferences.js +59 -0
  286. package/dist/core/processor/readers/stripReferences.js.map +1 -0
  287. package/dist/core/processor/readers/transcript/turnPacking.js +81 -0
  288. package/dist/core/processor/readers/transcript/turnPacking.js.map +1 -0
  289. package/dist/core/progress/NdjsonProgressEmitter.js +30 -0
  290. package/dist/core/progress/NdjsonProgressEmitter.js.map +1 -0
  291. package/dist/core/progress/NoopProgressEmitter.js +15 -0
  292. package/dist/core/progress/NoopProgressEmitter.js.map +1 -0
  293. package/dist/core/progress/index.js +19 -0
  294. package/dist/core/progress/index.js.map +1 -0
  295. package/dist/core/trace/TraceWriter.js +100 -0
  296. package/dist/core/trace/TraceWriter.js.map +1 -0
  297. package/dist/core/trace/events.js +13 -0
  298. package/dist/core/trace/events.js.map +1 -0
  299. package/dist/core/trace/index.js +20 -0
  300. package/dist/core/trace/index.js.map +1 -0
  301. package/dist/core/trace/lineage.js +97 -0
  302. package/dist/core/trace/lineage.js.map +1 -0
  303. package/dist/evaluation/BenchmarkRunner.js +171 -0
  304. package/dist/evaluation/BenchmarkRunner.js.map +1 -0
  305. package/dist/evaluation/classifier/ClassifierAccuracy.js +185 -0
  306. package/dist/evaluation/classifier/ClassifierAccuracy.js.map +1 -0
  307. package/dist/evaluation/classifier/labeledSamples.js +379 -0
  308. package/dist/evaluation/classifier/labeledSamples.js.map +1 -0
  309. package/dist/evaluation/compare/goldCompare.js +126 -0
  310. package/dist/evaluation/compare/goldCompare.js.map +1 -0
  311. package/dist/evaluation/crossre/compareScoring.js +30 -0
  312. package/dist/evaluation/crossre/compareScoring.js.map +1 -0
  313. package/dist/evaluation/datasets/CrossREDataset.js +170 -0
  314. package/dist/evaluation/datasets/CrossREDataset.js.map +1 -0
  315. package/dist/evaluation/datasets/IDataset.js +3 -0
  316. package/dist/evaluation/datasets/IDataset.js.map +1 -0
  317. package/dist/evaluation/datasets/RebelDataset.js +117 -0
  318. package/dist/evaluation/datasets/RebelDataset.js.map +1 -0
  319. package/dist/evaluation/datasets/RedocredDataset.js +218 -0
  320. package/dist/evaluation/datasets/RedocredDataset.js.map +1 -0
  321. package/dist/evaluation/datasets/SemEval2010Dataset.js +150 -0
  322. package/dist/evaluation/datasets/SemEval2010Dataset.js.map +1 -0
  323. package/dist/evaluation/index.js +33 -0
  324. package/dist/evaluation/index.js.map +1 -0
  325. package/dist/evaluation/matching/ExactMatcher.js +75 -0
  326. package/dist/evaluation/matching/ExactMatcher.js.map +1 -0
  327. package/dist/evaluation/matching/SemanticMatcher.js +143 -0
  328. package/dist/evaluation/matching/SemanticMatcher.js.map +1 -0
  329. package/dist/evaluation/metrics/TripleMetrics.js +64 -0
  330. package/dist/evaluation/metrics/TripleMetrics.js.map +1 -0
  331. package/dist/evaluation/mine/MineCheckpoint.js +114 -0
  332. package/dist/evaluation/mine/MineCheckpoint.js.map +1 -0
  333. package/dist/evaluation/mine/MineDataset.js +208 -0
  334. package/dist/evaluation/mine/MineDataset.js.map +1 -0
  335. package/dist/evaluation/mine/MineReporter.js +98 -0
  336. package/dist/evaluation/mine/MineReporter.js.map +1 -0
  337. package/dist/evaluation/mine/MineRunner.js +148 -0
  338. package/dist/evaluation/mine/MineRunner.js.map +1 -0
  339. package/dist/evaluation/mine/MineScorer.js +127 -0
  340. package/dist/evaluation/mine/MineScorer.js.map +1 -0
  341. package/dist/evaluation/mine/types.js +12 -0
  342. package/dist/evaluation/mine/types.js.map +1 -0
  343. package/dist/evaluation/reporters/ConsoleReporter.js +55 -0
  344. package/dist/evaluation/reporters/ConsoleReporter.js.map +1 -0
  345. package/dist/evaluation/reporters/JsonReporter.js +50 -0
  346. package/dist/evaluation/reporters/JsonReporter.js.map +1 -0
  347. package/dist/index.js +28 -0
  348. package/dist/index.js.map +1 -0
  349. package/dist/quality/CompositeScore.js +61 -0
  350. package/dist/quality/CompositeScore.js.map +1 -0
  351. package/dist/quality/ConsistencyMetrics.js +70 -0
  352. package/dist/quality/ConsistencyMetrics.js.map +1 -0
  353. package/dist/quality/FactualMetrics.js +76 -0
  354. package/dist/quality/FactualMetrics.js.map +1 -0
  355. package/dist/quality/GraphHealthMetrics.js +68 -0
  356. package/dist/quality/GraphHealthMetrics.js.map +1 -0
  357. package/dist/quality/SemanticMetrics.js +102 -0
  358. package/dist/quality/SemanticMetrics.js.map +1 -0
  359. package/dist/quality/StructuralMetrics.js +60 -0
  360. package/dist/quality/StructuralMetrics.js.map +1 -0
  361. package/dist/quality/index.js +23 -0
  362. package/dist/quality/index.js.map +1 -0
  363. package/dist/shared/index.js +20 -0
  364. package/dist/shared/index.js.map +1 -0
  365. package/dist/shared/logger/Logger.js +3 -0
  366. package/dist/shared/logger/Logger.js.map +1 -0
  367. package/dist/shared/logger/LoggerFactory.js +75 -0
  368. package/dist/shared/logger/LoggerFactory.js.map +1 -0
  369. package/dist/shared/logger/index.js +19 -0
  370. package/dist/shared/logger/index.js.map +1 -0
  371. package/dist/shared/shutdown.js +30 -0
  372. package/dist/shared/shutdown.js.map +1 -0
  373. package/dist/shared/utils/agglomerativeCluster.js +269 -0
  374. package/dist/shared/utils/agglomerativeCluster.js.map +1 -0
  375. package/dist/shared/utils/astSymbols.js +69 -0
  376. package/dist/shared/utils/astSymbols.js.map +1 -0
  377. package/dist/shared/utils/cosineSimilarity.js +18 -0
  378. package/dist/shared/utils/cosineSimilarity.js.map +1 -0
  379. package/dist/shared/utils/directoryTree.js +184 -0
  380. package/dist/shared/utils/directoryTree.js.map +1 -0
  381. package/dist/shared/utils/documentOutline.js +74 -0
  382. package/dist/shared/utils/documentOutline.js.map +1 -0
  383. package/dist/shared/utils/index.js +24 -0
  384. package/dist/shared/utils/index.js.map +1 -0
  385. package/dist/shared/utils/jaroWinklerSimilarity.js +60 -0
  386. package/dist/shared/utils/jaroWinklerSimilarity.js.map +1 -0
  387. package/dist/shared/utils/parseJsonLenient.js +27 -0
  388. package/dist/shared/utils/parseJsonLenient.js.map +1 -0
  389. package/dist/shared/utils/readConfig.js +42 -0
  390. package/dist/shared/utils/readConfig.js.map +1 -0
  391. package/dist/shared/utils/readRtf.js +216 -0
  392. package/dist/shared/utils/readRtf.js.map +1 -0
  393. package/dist/shared/utils/softmax.js +26 -0
  394. package/dist/shared/utils/softmax.js.map +1 -0
  395. package/dist/types/ContentClass.js +3 -0
  396. package/dist/types/ContentClass.js.map +1 -0
  397. package/dist/types/CorpusProfile.js +3 -0
  398. package/dist/types/CorpusProfile.js.map +1 -0
  399. package/dist/types/IContradictionChecker.js +3 -0
  400. package/dist/types/IContradictionChecker.js.map +1 -0
  401. package/dist/types/ICorpusAnalyzer.js +3 -0
  402. package/dist/types/ICorpusAnalyzer.js.map +1 -0
  403. package/dist/types/IDirectoryProcessor.js +3 -0
  404. package/dist/types/IDirectoryProcessor.js.map +1 -0
  405. package/dist/types/IEmbeddingProvider.js +3 -0
  406. package/dist/types/IEmbeddingProvider.js.map +1 -0
  407. package/dist/types/IEmbeddingService.js +6 -0
  408. package/dist/types/IEmbeddingService.js.map +1 -0
  409. package/dist/types/IFileProcessor.js +3 -0
  410. package/dist/types/IFileProcessor.js.map +1 -0
  411. package/dist/types/IGroundingChecker.js +3 -0
  412. package/dist/types/IGroundingChecker.js.map +1 -0
  413. package/dist/types/IKnowledgeGraphBuilder.js +3 -0
  414. package/dist/types/IKnowledgeGraphBuilder.js.map +1 -0
  415. package/dist/types/IKnowledgeGraphExporter.js +3 -0
  416. package/dist/types/IKnowledgeGraphExporter.js.map +1 -0
  417. package/dist/types/IKnowledgeGraphMerger.js +3 -0
  418. package/dist/types/IKnowledgeGraphMerger.js.map +1 -0
  419. package/dist/types/IKnowledgeGraphSearch.js +3 -0
  420. package/dist/types/IKnowledgeGraphSearch.js.map +1 -0
  421. package/dist/types/ILLMProvider.js +3 -0
  422. package/dist/types/ILLMProvider.js.map +1 -0
  423. package/dist/types/ILLMService.js +3 -0
  424. package/dist/types/ILLMService.js.map +1 -0
  425. package/dist/types/IObjectDetector.js +3 -0
  426. package/dist/types/IObjectDetector.js.map +1 -0
  427. package/dist/types/IProcessingService.js +3 -0
  428. package/dist/types/IProcessingService.js.map +1 -0
  429. package/dist/types/IProgressEmitter.js +3 -0
  430. package/dist/types/IProgressEmitter.js.map +1 -0
  431. package/dist/types/IPromptManager.js +3 -0
  432. package/dist/types/IPromptManager.js.map +1 -0
  433. package/dist/types/KnowledgeGraph.js +3 -0
  434. package/dist/types/KnowledgeGraph.js.map +1 -0
  435. package/dist/types/MCPKnowledgeGraph.js +3 -0
  436. package/dist/types/MCPKnowledgeGraph.js.map +1 -0
  437. package/dist/types/Observation.js +21 -0
  438. package/dist/types/Observation.js.map +1 -0
  439. package/dist/types/ProcessingOptions.js +3 -0
  440. package/dist/types/ProcessingOptions.js.map +1 -0
  441. package/dist/types/index.js +40 -0
  442. package/dist/types/index.js.map +1 -0
  443. package/package.json +122 -0
@@ -0,0 +1,803 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.ConfigSchema = exports.CanonTargetEnum = exports.CanonicalSelectionEnum = exports.ClusterAlgoEnum = exports.CanonMethodEnum = exports.TfAnalysisSourceEnum = exports.LogLevelEnum = exports.AsrModelsEnum = exports.AsrEngineEnum = exports.PdfEngineEnum = exports.JsonStrategyEnum = exports.ExportFormatEnum = exports.AstModeEnum = exports.CorpusProfilingModeEnum = exports.SupersessionModeEnum = exports.GroundingCheckerEnum = exports.GroundingModeEnum = exports.ContentClassifierModeEnum = exports.ImageProcessingModeEnum = exports.SpeechRecognitionModeEnum = exports.RetrievalScopeEnum = exports.RetrievalModeEnum = exports.ChunkingModeEnum = exports.ProviderModeEnum = void 0;
4
+ const zod_1 = require("zod");
5
+ /**
6
+ * The single source of truth for wanshi's configuration.
7
+ *
8
+ * Everything else is derived from this schema: the `ProcessingOptions` TS type
9
+ * (`z.infer`), runtime validation + defaults (`parseConfig`), and the JSON
10
+ * Schema served to the frontend (`configJsonSchema`). Defaults live here and
11
+ * nowhere else — CLI flags carry no defaults, and services no longer apply
12
+ * scattered `?? fallback`s.
13
+ *
14
+ * Objects are `.strict()` so an unknown/legacy flat key (e.g. `chunkSize`) is a
15
+ * hard error with a migration hint, not a silent miscast (clean break from the
16
+ * old flat shape — see docs/MIGRATION.md).
17
+ *
18
+ * Numeric fields use `z.coerce.number()` so CLI string values ("2000") and YAML
19
+ * numbers both validate. `.default()` short-circuits `undefined` before
20
+ * coercion, so an unset flag falls through to the default rather than NaN.
21
+ */
22
+ // ── small helpers ──────────────────────────────────────────────────────────
/**
 * Build a numeric config field that carries its own default.
 *
 * The field is a coerced number, so CLI string values ("2000") and YAML
 * numbers both validate.
 *
 * @param {number} def - Value used when the field is left unset.
 * @returns A zod schema: a coerced number wrapped in a default.
 */
const num = (def) => {
    const coercedNumber = zod_1.z.coerce.number();
    return coercedNumber.default(def);
};
/**
 * Build a field that accepts either one string or an array of strings and
 * always normalizes the parsed value to an array.
 *
 * @param {string[]} def - Default list used when the field is left unset.
 * @returns A zod schema: union(string, string[]) → array transform → default.
 */
const stringList = (def) => {
    const single = zod_1.z.string();
    const many = zod_1.z.array(zod_1.z.string());
    // Wrap a lone string so downstream code only ever sees arrays.
    const toArray = (value) => (Array.isArray(value) ? value : [value]);
    return zod_1.z.union([single, many]).transform(toArray).default(def);
};
30
+ // ── enums (reused as exported subtypes) ────────────────────────────────────
31
+ exports.ProviderModeEnum = zod_1.z.enum(["ollama", "openai"]);
32
+ exports.ChunkingModeEnum = zod_1.z.enum(["enabled", "disabled", "auto"]);
33
+ exports.RetrievalModeEnum = zod_1.z.enum(["enabled", "disabled", "auto"]);
34
+ exports.RetrievalScopeEnum = zod_1.z.enum(["chunk", "file"]);
35
+ exports.SpeechRecognitionModeEnum = zod_1.z.enum(["enabled", "disabled", "auto"]);
36
+ exports.ImageProcessingModeEnum = zod_1.z.enum(["enabled", "disabled", "auto"]);
37
+ exports.ContentClassifierModeEnum = zod_1.z.enum([
38
+ "disabled",
39
+ "heuristic",
40
+ "llm",
41
+ "cascade",
42
+ ]);
43
+ exports.GroundingModeEnum = zod_1.z.enum(["disabled", "flag", "drop"]);
44
+ exports.GroundingCheckerEnum = zod_1.z.enum(["keyword", "minicheck"]);
45
+ exports.SupersessionModeEnum = zod_1.z.enum(["disabled", "heuristic", "llm"]);
46
+ exports.CorpusProfilingModeEnum = zod_1.z.enum(["disabled", "enabled"]);
47
+ exports.AstModeEnum = zod_1.z.enum(["enabled", "disabled"]);
48
+ exports.ExportFormatEnum = zod_1.z.enum([
49
+ "json",
50
+ "jsonl",
51
+ "mcp-jsonl",
52
+ "dot",
53
+ "kblam",
54
+ "lora",
55
+ "graphiti",
56
+ ]);
57
+ exports.JsonStrategyEnum = zod_1.z.enum(["structural", "raw"]);
58
+ // PDF reading engine: `pdf2json` = built-in text extraction (default, no OCR,
59
+ // portable); `tesseract` = pure-JS/WASM OCR (light-local floor, no system binary);
60
+ // `docling`/`marker`/`chandra` = local Python tools (subprocess; chandra = slow
61
+ // SOTA/handwriting 4B VLM); `mistral` = Mistral OCR HTTP API. Any non-default
62
+ // engine degrades to pdf2json on failure. Hardware-aware ladder:
63
+ // tesseract (light/CPU) → pdf2json → docling → marker → chandra → mistral (cloud).
64
+ exports.PdfEngineEnum = zod_1.z.enum(["pdf2json", "docling", "marker", "mistral", "tesseract", "chandra"]);
65
+ // AudioReader transcription engine: `whisper` = built-in single-model nodejs-whisper
66
+ // (default, portable, network-free); `dual` = vendored Python audio-pipeline
67
+ // (Silero VAD + Parakeet/Whisper dual-STT + diarization, Apple-Silicon only, opt-in).
68
+ exports.AsrEngineEnum = zod_1.z.enum(["whisper", "dual"]);
69
+ exports.AsrModelsEnum = zod_1.z.enum(["both", "parakeet", "whisper"]);
70
+ exports.LogLevelEnum = zod_1.z.enum(["debug", "info", "warning", "error"]);
71
+ // ── grouped sub-schemas ────────────────────────────────────────────────────
/** Generation LLM settings: provider, model, endpoint and sampling knobs. */
const LlmSchema = zod_1.z
    .object({
        provider: exports.ProviderModeEnum
            .default("ollama")
            .describe("Generation provider. 'openai' targets any OpenAI-compatible endpoint via host."),
        model: zod_1.z
            .string()
            .default("llama3.2")
            .describe("LLM used for generation"),
        host: zod_1.z.string().default("http://localhost:11434").describe("Ollama host URL, or OpenAI-compatible base URL when provider=openai"),
        apiKey: zod_1.z.string().optional().describe("API key for OpenAI-compatible provider (falls back to $OPENAI_API_KEY / $WANSHI_API_KEY)"),
        temperature: num(0.1).describe("Model temperature"),
        repeatPenalty: num(1.1).describe("Repeat penalty (Ollama: >1.0 discourages repetition, <1.0 promotes it; 1.0 = off)"),
        contextLength: num(8192).describe("Model context length (system prompt + chunk + response)"),
        // Optional numerics are coerced but carry no default: unset stays undefined.
        maxTokens: zod_1.z.coerce.number().optional().describe("Max output tokens per generation; raise it if KG JSON truncates mid-output"),
        seed: zod_1.z.coerce
            .number()
            .optional()
            .describe("Model seed"),
        system: zod_1.z
            .string()
            .optional()
            .describe("System prompt text or path to a handlebars template"),
        promptVersion: zod_1.z.string().default("v5").describe("Prompt template version under templates/ (v5 default; v4.5 = legacy)"),
    })
    .strict();
/** Embeddings settings, configured independently from the generation LLM. */
const EmbeddingsSchema = zod_1.z
    .object({
        provider: exports.ProviderModeEnum
            .default("ollama")
            .describe("Embeddings provider, independent from generation; defaults to local Ollama"),
        model: zod_1.z
            .string()
            .default("nomic-embed-text")
            .describe("Embeddings model"),
        host: zod_1.z
            .string()
            .default("http://localhost:11434")
            .describe("Embeddings host / OpenAI-compatible base URL"),
        apiKey: zod_1.z
            .string()
            .optional()
            .describe("API key for OpenAI-compatible embeddings"),
        maxInputChars: num(1024)
            .describe("Truncate embedding inputs to at most N characters"),
    })
    .strict();
/** Chunking settings: mode plus chunk size and overlap, both in characters. */
const ChunkingSchema = zod_1.z
    .object({
        mode: exports.ChunkingModeEnum
            .default("enabled")
            .describe("Chunking mode"),
        size: num(2000)
            .describe("Maximum chunk size in characters"),
        overlap: num(100)
            .describe("Overlap size between chunks in characters"),
    })
    .strict();
/** Context-retrieval settings: mode, result limit and granularity. */
const RetrievalSchema = zod_1.z
    .object({
        mode: exports.RetrievalModeEnum
            .default("enabled")
            .describe("Context retrieval mode"),
        limit: num(3)
            .describe("Context retrieval limit"),
        scope: exports.RetrievalScopeEnum
            .default("chunk")
            .describe("Retrieval granularity: per-chunk (default) or once per file"),
    })
    .strict();
/** Entity/observation merge thresholds and the merge-time supersession mode. */
const MergingSchema = zod_1.z
    .object({
        entitySimilarityThreshold: num(0.9)
            .describe("Jaro-Winkler threshold for entity-name merging, applied uniformly within-file and globally; fuzzy merging never crosses a digit mismatch (Table 1 ≠ Table 2) and cross-type matches need near-exact similarity"),
        observationSimilarityThreshold: num(0.9)
            .describe("Embedding cosine threshold for observation merging"),
        enableSimilarityMerging: zod_1.z
            .boolean()
            .default(true)
            .describe("Allow fuzzy (Jaro-Winkler) entity-name merging; false ⇒ only normalized-exact name matches merge"),
        supersession: exports.SupersessionModeEnum
            .default("disabled")
            .describe("Merge-time supersession (KG-10): a newer fact contradicting an older one invalidates the older (sets invalidAt/expiredAt) instead of deleting it. disabled | heuristic (antonyms+negation) | llm"),
    })
    .strict();
/** Inline grounding gate: mode, score floor, and the checker configuration. */
const GroundingSchema = zod_1.z
    .object({
        mode: exports.GroundingModeEnum
            .default("disabled")
            .describe("Inline grounding gate: disabled | flag (annotate) | drop (remove ungrounded)"),
        minScore: num(0.5)
            .describe("Minimum keyword-overlap grounding score (0..1)"),
        checker: exports.GroundingCheckerEnum
            .default("keyword")
            .describe("Grounding checker: keyword (overlap heuristic) | minicheck (local NLI fact-checker, with keyword pre-filter)"),
        model: zod_1.z.string().default("bespoke-minicheck:7b").describe("Ollama model for the minicheck checker (a (document, claim)→Yes/No NLI model)"),
        host: zod_1.z.string().optional().describe("Ollama host for the minicheck checker; defaults to the generation/embeddings host"),
        escalateAbove: num(0.8)
            .describe("Keyword score at/above which minicheck accepts without an NLI call (cheap pre-filter)"),
    })
    .strict();
/** AST symbol-seed settings: mode and the optional cache sidecar location. */
const AstSchema = zod_1.z
    .object({
        mode: exports.AstModeEnum
            .default("enabled")
            .describe("AST symbol seed (Phase 8): seed code definitions + exported members as entities (+ calls/imports edges) before the LLM, so the model augments rather than originates the symbol set"),
        cachePath: zod_1.z
            .string()
            .optional()
            .describe("AST symbol cache sidecar path (default <output>.ast-cache.json)"),
    })
    .strict();
/** Corpus analysis pre-pass settings. */
const CorpusSchema = zod_1.z
    .object({
        profiling: exports.CorpusProfilingModeEnum
            .default("disabled")
            .describe("Corpus analysis pre-pass: term frequency + cached classification + LLM glossary"),
        topTerms: num(100)
            .describe("Number of most-frequent terms fed to the glossary call"),
        profilePath: zod_1.z
            .string()
            .optional()
            .describe("Corpus profile sidecar path (default <output>.corpus-profile.json)"),
        clustering: zod_1.z
            .boolean()
            .default(false)
            .describe("Embedding clustering of terms (v2 stub, deferred)"),
    })
    .strict();
/**
 * Content classifier settings (experimental): mode plus the heuristic,
 * cascade and domain-gate tuning values, each range-checked by its schema.
 */
const ClassifierSchema = zod_1.z
    .object({
        mode: exports.ContentClassifierModeEnum
            .default("disabled")
            .describe("Content classifier mode (experimental)"),
        temperature: zod_1.z.coerce.number().positive().default(2.0).describe("Heuristic softmax temperature: lower = sharper/more decisive, higher = flatter/more ties"),
        crossValidationFactor: zod_1.z.coerce.number().min(0).default(0.15).describe("Heuristic cross-validation negative-pattern weight factor"),
        maxEscalations: zod_1.z.coerce.number().int().min(0).default(50).describe("Cascade mode: max LLM tie-break escalations per run (cost guard)"),
        lowConfidenceThreshold: zod_1.z.coerce.number().min(0).max(1).default(0.25).describe("Domain-gate floor: min top-1 confidence to route any domain"),
        mixedDomainThreshold: zod_1.z.coerce.number().min(0).max(1).default(0.15).describe("Domain-gate margin: max top1−top2 gap to also route the second domain"),
    })
    .strict();
/** JSON reader settings: splitting strategy and an optional chunk-size cap. */
const JsonReaderSchema = zod_1.z
    .object({
        strategy: exports.JsonStrategyEnum
            .default("structural")
            .describe("JSON reader: structural (split on JSON structure) or raw (text split)"),
        maxChunkSize: zod_1.z.coerce
            .number()
            .optional()
            .describe("Max JSON chunk size (inherits chunking.size when unset)"),
    })
    .strict();
/**
 * Dual-STT engine knobs, only consulted when `engine: dual`.
 *
 * The Python audio-pipeline subproject is invoked per audio file; any failure
 * (missing interpreter, model, or service) degrades gracefully back to the
 * whisper engine.
 */
const AsrDualSchema = zod_1.z
    .object({
        projectDir: zod_1.z.string().default("./audio-pipeline").describe("Path to the vendored Python audio-pipeline subproject"),
        pythonPath: zod_1.z.string().optional().describe("Python/launcher executable (default: `uv` runner inside projectDir)"),
        asr: exports.AsrModelsEnum
            .default("both")
            .describe("Which ASR backends to run (both keeps parakeet+whisper as provenance)"),
        diarize: zod_1.z
            .boolean()
            .default(true)
            .describe("Run pyannote speaker diarization (needs an HF token)"),
        numSpeakers: zod_1.z.coerce
            .number()
            .int()
            .positive()
            .optional()
            .describe("Hint the diarizer's speaker count when known"),
        device: zod_1.z
            .string()
            .optional()
            .describe("Torch/MLX device override (e.g. mps, cpu, cuda)"),
        timeoutMs: zod_1.z.coerce
            .number()
            .int()
            .positive()
            .default(1800000)
            .describe("Per-file transcription subprocess timeout (ms)"),
    })
    .strict();
/** Speech-recognition settings; `dual` nests the dual-STT engine knobs. */
const AsrSchema = zod_1.z
    .object({
        mode: exports.SpeechRecognitionModeEnum
            .default("enabled")
            .describe("Automatic speech recognition mode"),
        engine: exports.AsrEngineEnum
            .default("whisper")
            .describe("Transcription engine: whisper (built-in) or dual (vendored Python VAD+dual-STT+diarization)"),
        whisperModel: zod_1.z
            .string()
            .default("medium")
            .describe("Whisper model (whisper engine)"),
        language: zod_1.z
            .string()
            .default("auto")
            .describe("Speech recognition language"),
        translate: zod_1.z
            .boolean()
            .default(false)
            .describe("Translate transcript to English (whisper engine)"),
        // An empty default lets the nested schema fill in its own field defaults.
        dual: AsrDualSchema.default({}),
    })
    .strict();
/**
 * Email reader knobs (`.eml` / `.mbox`).
 *
 * The body still flows through LLM extraction; these settings only govern how
 * an email or thread is turned into turns.
 */
const EmailReaderSchema = zod_1.z
    .object({
        maxMessages: zod_1.z.coerce.number().int().positive().default(1000).describe("Max messages parsed from one .mbox (warns + truncates beyond this)"),
        stripQuotes: zod_1.z.boolean().default(true).describe("Strip quoted reply chains (`> …` / `On … wrote:`) so each message contributes only its new content"),
    })
    .strict();
/**
 * Chat-export reader knobs (WhatsApp .txt, Telegram/Discord/Slack .json).
 *
 * The message text still flows through LLM extraction; these settings only
 * govern parsing.
 */
const ChatReaderSchema = zod_1.z
    .object({
        maxMessages: zod_1.z.coerce.number().int().positive().default(50000).describe("Max messages parsed from one chat export (warns + truncates beyond this)"),
        skipSystem: zod_1.z.boolean().default(true).describe("Drop system/service noise (joins, encryption notices, <Media omitted>, …)"),
    })
    .strict();
/**
 * Jupyter notebook reader knobs (.ipynb).
 *
 * Markdown narrative and fenced code are always rendered; outputs and images
 * are opt-in because they often carry noise.
 */
const JupyterReaderSchema = zod_1.z
    .object({
        includeOutputs: zod_1.z.boolean().default(false).describe("Append code-cell text outputs (stream / text-plain results; error tracebacks always skipped)"),
        includeImages: zod_1.z.boolean().default(false).describe("Attach base64 image outputs as chunk images (for the vision path)"),
    })
    .strict();
/** Per-file structural outline settings. */
const OutlineSchema = zod_1.z
    .object({
        enabled: zod_1.z
            .boolean()
            .default(true)
            .describe("Generate a per-file structural outline and inject it into the prompt"),
        maxDepth: zod_1.z.coerce
            .number()
            .optional()
            .describe("Limit outline nesting depth"),
        includeLineNumbers: zod_1.z
            .boolean()
            .default(false)
            .describe("Include line numbers in the outline"),
        includePrivate: zod_1.z
            .boolean()
            .default(false)
            .describe("Include private/internal members"),
        includeComments: zod_1.z
            .boolean()
            .default(false)
            .describe("Include comments"),
        compact: zod_1.z
            .boolean()
            .default(false)
            .describe("Token-lean ascii-tree outline: drop line numbers + metadata annotations"),
    })
    .strict();
/**
 * marker-pdf engine (Python `marker_single` CLI subprocess; ~1GB models, slow
 * on CPU). Only consulted when `pdfEngine: marker`; failure degrades to
 * pdf2json.
 */
const MarkerSchema = zod_1.z
    .object({
        command: zod_1.z
            .string()
            .default("marker_single")
            .describe("marker CLI executable (on PATH)"),
        useLlm: zod_1.z
            .boolean()
            .default(false)
            .describe("Marker --use_llm hybrid mode (reuses the openai-compatible llm config; higher table accuracy, costs LLM calls)"),
        forceOcr: zod_1.z
            .boolean()
            .default(false)
            .describe("Force OCR on every page (scanned PDFs)"),
        timeoutMs: zod_1.z.coerce
            .number()
            .int()
            .positive()
            .default(900000)
            .describe("Per-file marker subprocess timeout (ms)"),
    })
    .strict();
/**
 * Mistral OCR engine (HTTP API; ~$1-2/1k pages). Only consulted when
 * `pdfEngine: mistral`; a missing key or HTTP error degrades to pdf2json.
 */
const MistralSchema = zod_1.z
    .object({
        apiKey: zod_1.z
            .string()
            .optional()
            .describe("Mistral API key (falls back to $MISTRAL_API_KEY)"),
        host: zod_1.z
            .string()
            .default("https://api.mistral.ai")
            .describe("Mistral API base URL"),
        model: zod_1.z
            .string()
            .default("mistral-ocr-latest")
            .describe("Mistral OCR model"),
        timeoutMs: zod_1.z.coerce
            .number()
            .int()
            .positive()
            .default(300000)
            .describe("Per-file OCR request timeout (ms)"),
    })
    .strict();
/**
 * Tesseract OCR engine (pure-JS/WASM): pdf-to-png-converter rasterizes each
 * page and tesseract.js OCRs it, with zero system binaries — the light-local
 * floor for hardware with no GPU/VLM.
 *
 * Only consulted when `pdfEngine: tesseract`; any failure degrades to
 * pdf2json. Language traineddata is fetched from the tesseract.js CDN on first
 * use and cached — set `langPath` for a fully offline mirror.
 */
const TesseractSchema = zod_1.z
    .object({
        lang: zod_1.z
            .string()
            .default("eng")
            .describe('Tesseract language code(s), e.g. "eng" or "eng+deu"'),
        scale: zod_1.z.coerce
            .number()
            .positive()
            .default(2)
            .describe("PDF→PNG render scale before OCR (higher = sharper input, slower)"),
        oem: zod_1.z.coerce
            .number()
            .int()
            .optional()
            .describe("OCR engine mode (tesseract.js OEM; default LSTM)"),
        psm: zod_1.z.coerce
            .number()
            .int()
            .optional()
            .describe("Page segmentation mode (tessedit_pageseg_mode)"),
        langPath: zod_1.z
            .string()
            .optional()
            .describe("Offline traineddata dir/URL (no trailing slash); omit to use the CDN + cache"),
    })
    .strict();
/**
 * Chandra OCR engine (datalab `chandra-ocr` CLI subprocess; 4B VLM, slow on
 * CPU/MPS — the SOTA/handwriting rung). Only consulted when
 * `pdfEngine: chandra`; any failure degrades to pdf2json.
 *
 * License note: Chandra's weights are modified OpenRAIL-M (free for
 * personal/research and orgs under $2M revenue; commercial self-hosting needs
 * a license) — unlike Tesseract's clean Apache. Provide a license-aware
 * default so a downstream commercial user isn't surprised.
 */
const ChandraSchema = zod_1.z
    .object({
        command: zod_1.z
            .string()
            .default("chandra")
            .describe("chandra CLI executable (on PATH; `pip install chandra-ocr`)"),
        method: zod_1.z
            .enum(["hf", "vllm"])
            .default("hf")
            .describe("Backend: hf (HuggingFace+torch, M4-runnable but slow) | vllm (GPU server)"),
        timeoutMs: zod_1.z.coerce
            .number()
            .int()
            .positive()
            .default(900000)
            .describe("Per-file chandra subprocess timeout (ms)"),
    })
    .strict();
/**
 * Image metadata enrichment (deterministic, default OFF → byte-identical run).
 *
 * EXIF is graph-native structured data the image already carries. It is mapped
 * to facts (GPS→location, capture time→bitemporal validAt,
 * camera/author/software) that AUGMENT the VLM read of the image, stamped
 * sourceAdapter:"exif".
 */
const ExifSchema = zod_1.z
    .object({
        enabled: zod_1.z
            .boolean()
            .default(false)
            .describe("Extract image EXIF (GPS→location, capture time→validAt, camera/author/software) into deterministic graph facts"),
    })
    .strict();
/**
 * C2PA content-credential read (deterministic validity signal). Default OFF.
 *
 * Shells the official Adobe/CAI `c2patool` (reference-grade cryptographic
 * validation), degrade-if-absent like marker. Records a trust observation
 * (present/valid/signer/AI-claim) stamped sourceAdapter:"c2pa" — a fact, never
 * a verdict.
 */
const C2paSchema = zod_1.z
    .object({
        enabled: zod_1.z
            .boolean()
            .default(false)
            .describe("Read C2PA content credentials (via the c2patool CLI) into a trust observation on the image"),
        command: zod_1.z
            .string()
            .default("c2patool")
            .describe("c2patool executable (on PATH; degrade to no-credential if absent)"),
    })
    .strict();
/**
 * CV pre-pass (Phase 2, opt-in, signal-not-verdict). Default OFF →
 * byte-identical run.
 *
 * Object detection runs a transformers.js detector (already a dep; bundles
 * onnxruntime-node + sharp — no new dep) over images. Detections feed the VLM
 * prompt as context AND a deterministic cv-detection graph fragment
 * (confidence = detector score). Forensic/manipulation signals
 * (`cv.forensics`) are the gated 2b sub-phase, not yet built.
 */
const CvDetectionSchema = zod_1.z
    .object({
        enabled: zod_1.z
            .boolean()
            .default(false)
            .describe("Detect objects in images (people/vehicles/objects/animals) → VLM context + cv-detection graph facts"),
        mode: zod_1.z
            .enum(["closed", "zero-shot"])
            .default("closed")
            .describe("closed = fixed COCO classes (DETR/YOLOS); zero-shot = open-vocab via `labels` (OWL-ViT)"),
        model: zod_1.z
            .string()
            .default("")
            .describe("HF model id; empty ⇒ per-mode default (closed: Xenova/detr-resnet-50, zero-shot: Xenova/owlvit-base-patch32)"),
        threshold: zod_1.z.coerce
            .number()
            .min(0)
            .max(1)
            .default(0.5)
            .describe("Minimum detection score to keep"),
        labels: zod_1.z
            .array(zod_1.z.string())
            .default([])
            .describe("Zero-shot candidate labels (required for mode=zero-shot; ignored for closed)"),
        maxObjects: zod_1.z.coerce
            .number()
            .int()
            .positive()
            .default(20)
            .describe("Cap detected objects per image"),
        cacheDir: zod_1.z
            .string()
            .optional()
            .describe("transformers.js model cache dir (env.cacheDir)"),
        allowRemote: zod_1.z
            .boolean()
            .default(true)
            .describe("Allow downloading the model from the HF Hub (false ⇒ offline; needs a local cache/mirror)"),
    })
    .strict();
/** Computer-vision settings; currently a wrapper around object detection. */
const CvSchema = zod_1.z.object({ detection: CvDetectionSchema.default({}) }).strict();
/**
 * Reader settings: PDF engine selection, per-engine options, and the
 * per-format reader groups. Each nested group defaults to `{}` so its own
 * field defaults apply when the group is omitted.
 */
const ReadersSchema = zod_1.z
    .object({
        pdfEngine: exports.PdfEngineEnum
            .default("pdf2json")
            .describe("PDF reading engine: pdf2json (built-in) | tesseract (pure-JS/WASM OCR) | docling | marker (Python subprocess) | chandra (Python subprocess, SOTA) | mistral (HTTP OCR API)"),
        // Per-engine option groups.
        marker: MarkerSchema.default({}),
        mistral: MistralSchema.default({}),
        tesseract: TesseractSchema.default({}),
        chandra: ChandraSchema.default({}),
        stripReferences: zod_1.z
            .boolean()
            .default(false)
            .describe("Quarantine trailing references/bibliography sections before extraction (PDF + markdown)"),
        // Image handling and image-derived signals.
        images: exports.ImageProcessingModeEnum
            .default("auto")
            .describe("Image processing mode"),
        exif: ExifSchema.default({}),
        c2pa: C2paSchema.default({}),
        cv: CvSchema.default({}),
        // Per-format readers.
        json: JsonReaderSchema.default({}),
        email: EmailReaderSchema.default({}),
        chat: ChatReaderSchema.default({}),
        jupyter: JupyterReaderSchema.default({}),
        asr: AsrSchema.default({}),
        outline: OutlineSchema.default({}),
    })
    .strict();
/**
 * Reference & link resolution (Phase 0, network-free).
 *
 * Turns the references a document already contains into graph edges. Both
 * axes default OFF — a default run's output shape is unchanged until opted in.
 * Network classes (external web, citation span-fetch) are later phases and
 * live behind their own opt-in.
 *
 * Each nested group is built as a named local and then assembled, keeping the
 * same key order as the flat declaration.
 */
const ReferencesSchema = (() => {
    const internalLinks = zod_1.z
        .object({
            enabled: zod_1.z.boolean().default(false).describe("Resolve internal links ([x](./other.md), [[wikilinks]], HTML href) to corpus files as links_to edges"),
        })
        .strict()
        .default({});

    // Phase 2 — citation span-fetch + faithfulness. Opt-in NETWORK; auto-enables
    // citation extraction. Resolves a cited work's id → OA full text, folds it
    // onto the cited-work node, and (with GROBID + MiniCheck) labels the edge.
    const citationFetch = zod_1.z
        .object({
            enabled: zod_1.z.boolean().default(false).describe("Phase 2 — resolve id-bearing cites to OPEN-ACCESS full text and fetch it. Opt-in NETWORK; auto-enables citation extraction"),
            allowlist: stringList(["arxiv.org", "ncbi.nlm.nih.gov"]).describe("OA hosts eligible to fetch (empty = no fetch). Broaden to raise DOI/Unpaywall reach"),
            rejectlist: stringList([]).describe("Hosts / URL-prefixes to always skip"),
            maxFetches: num(50).describe("Per-run citation fetch budget (hard cap)"),
            timeoutMs: num(15000).describe("Per-fetch timeout (ms)"),
            maxBytes: num(20000000).describe("Reject fetched PDFs larger than this"),
            unpaywallEmail: zod_1.z.string().optional().describe("Unpaywall polite-pool email (or $UNPAYWALL_EMAIL) — required to resolve DOI citations"),
            minicheck: zod_1.z.boolean().default(true).describe("Phase 2c — label cites supported/unsupported/uncertain via MiniCheck (needs a citing claim from GROBID)"),
            minicheckModel: zod_1.z.string().default("bespoke-minicheck:7b").describe("Ollama model for the citation faithfulness checker"),
            minicheckHost: zod_1.z.string().optional().describe("Ollama host for the faithfulness checker; defaults to the local daemon"),
            uncertainBand: zod_1.z
                .tuple([zod_1.z.coerce.number(), zod_1.z.coerce.number()])
                .default([0.34, 0.67])
                .describe("[lo, hi]: support score ≤lo ⇒ unsupported, ≥hi ⇒ supported, between ⇒ uncertain"),
            cachePath: zod_1.z.string().optional().describe("Citation fetch-cache sidecar path (default: <output>.citation-cache.jsonl)"),
        })
        .strict()
        .default({});

    const grobid = zod_1.z
        .object({
            enabled: zod_1.z.boolean().default(false).describe("Phase 2b — use a local GROBID service to link in-text citation markers to references (enables span-select + faithfulness). Run via Docker: lfoppiano/grobid"),
            url: zod_1.z.string().default("http://localhost:8070").describe("GROBID service base URL"),
        })
        .strict()
        .default({});

    const titleResolver = zod_1.z
        .object({
            enabled: zod_1.z.boolean().default(false).describe("Phase 2d — resolve id-LESS references to a DOI/arXiv id via Crossref → Semantic Scholar → OpenAlex (widens fetch reach beyond id-bearing cites)"),
            mailto: zod_1.z.string().optional().describe("Polite-pool email for Crossref/OpenAlex"),
            openAlexKey: zod_1.z.string().optional().describe("OpenAlex API key (required from Feb 2026)"),
            semanticScholarKey: zod_1.z.string().optional().describe("Semantic Scholar API key (raises rate limit)"),
            minTitleSimilarity: num(0.85).describe("Min jaroWinkler title similarity to accept a title→id match"),
        })
        .strict()
        .default({});

    const citations = zod_1.z
        .object({
            enabled: zod_1.z.boolean().default(false).describe("Parse the bibliography + inline arXiv/DOI/PMID into cites edges (network-free; fetch/resolution is a later phase)"),
            fetch: citationFetch,
            grobid,
            titleResolver,
        })
        .strict()
        .default({});

    const follow = zod_1.z
        .object({
            enabled: zod_1.z.boolean().default(false).describe("Reference-driven ingestion: follow internal links to discover & process files (each once). Network-free, confined to input; auto-enables internalLinks"),
            seeds: stringList([]).describe("Entry docs (relative to input) to crawl from; empty = crawl from the discovered glob set"),
            maxDepth: num(0).describe("Link-follow depth from a seed (0 = unlimited, within maxFiles)"),
            maxFiles: num(5000).describe("Hard cap on files processed per run (cycle/runaway guard)"),
        })
        .strict()
        .default({});

    const web = zod_1.z
        .object({
            enabled: zod_1.z.boolean().default(false).describe("Phase 1 — fetch allowlisted EXTERNAL web links, extract, emit `references` edges. Opt-in NETWORK; auto-enables internalLinks extraction"),
            allowlist: stringList([]).describe("Domains / URL-prefixes eligible to fetch (e.g. ['letta.com','https://x.io/docs']). Empty = no fetch (master switch)"),
            rejectlist: stringList([]).describe("Domains / URL-prefixes to always skip"),
            maxFetches: num(50).describe("Per-run fetch budget (hard cap)"),
            timeoutMs: num(10000).describe("Per-fetch timeout (ms)"),
            maxBytes: num(5000000).describe("Reject response bodies larger than this"),
            relevanceCheck: zod_1.z.boolean().default(true).describe("LLM relevance pre-check on title/meta before the extraction pass"),
            robots: zod_1.z.boolean().default(true).describe("Respect robots.txt Disallow rules"),
            cachePath: zod_1.z.string().optional().describe("Fetch-cache sidecar path (default: <output>.fetch-cache.jsonl)"),
        })
        .strict()
        .default({});

    return zod_1.z
        .object({
            internalLinks,
            citations,
            follow,
            web,
        })
        .strict();
})();
/** DOT (Graphviz) export options; every field carries a default. */
const DotSchema = zod_1.z
    .object({
        layout: zod_1.z
            .enum(["dot", "neato", "fdp", "sfdp", "circo", "twopi"])
            .default("dot"),
        rankdir: zod_1.z
            .enum(["TB", "BT", "LR", "RL"])
            .default("TB"),
        nodeShape: zod_1.z
            .string()
            .default("box"),
        edgeStyle: zod_1.z
            .string()
            .default("solid"),
        colorScheme: zod_1.z
            .enum(["default", "scientific", "code", "minimal"])
            .default("default"),
        includeObservations: zod_1.z
            .boolean()
            .default(true),
        maxObservationsPerNode: num(3),
        clusterByEntityType: zod_1.z
            .boolean()
            .default(false),
        clusterByFile: zod_1.z
            .boolean()
            .default(false),
        showLegend: zod_1.z
            .boolean()
            .default(true),
    })
    .strict();
/** Export settings: output format plus the DOT-specific option group. */
const ExportSchema = zod_1.z
    .object({
        format: exports.ExportFormatEnum
            .default("json")
            .describe("Export format"),
        dot: DotSchema
            .default({})
            .describe("DOT export options (used when format=dot)"),
    })
    .strict();
/** Resume/checkpoint settings. */
const ResumeSchema = zod_1.z
    .object({
        enabled: zod_1.z
            .boolean()
            .default(false)
            .describe("Checkpoint each chunk and skip already-done chunks on re-run"),
        checkpointPath: zod_1.z
            .string()
            .optional()
            .describe("Checkpoint sidecar file path (default <output>.checkpoint.jsonl)"),
    })
    .strict();
/**
 * Debug/observability run-trace (off by default).
 *
 * Emits a versioned append-only JSONL sidecar of decision events
 * (ingest→classify→extract→ground→merge→export) with mention-instance lineage
 * IDs. Observe-only: the graph is byte-identical on/off.
 */
const TraceSchema = zod_1.z
    .object({
        enabled: zod_1.z
            .boolean()
            .default(false)
            .describe("Emit a structured decision run-trace to a JSONL sidecar"),
        path: zod_1.z
            .string()
            .optional()
            .describe("Trace sidecar file path (default <output>.trace.jsonl)"),
    })
    .strict();
/**
 * Cost / token metering (off by default; zero overhead, byte-identical default
 * run).
 *
 * Records per-model token spend through ILLMProvider, prints a rough pre-run
 * estimate plus an exact end-of-run tally, enforces a hard `maxCost` cap
 * (graceful stop), and keeps a resume-safe cumulative ledger sidecar. Setting
 * `maxCost` auto-enables metering (ContainerFactory).
 */
const CostSchema = zod_1.z
    .object({
        enabled: zod_1.z
            .boolean()
            .default(false)
            .describe("Meter LLM token usage + cost; print estimate + tally"),
        maxCost: zod_1.z.coerce.number().optional().describe("Hard spend cap (currency units) for THIS run; stops gracefully when exceeded (implies enabled)"),
        currency: zod_1.z
            .string()
            .default("USD")
            .describe("Currency label for cost output"),
        // Record keyed by model name; each entry is a strict {in, out} price pair.
        prices: zod_1.z
            .record(zod_1.z
            .object({
                in: zod_1.z.coerce.number(),
                out: zod_1.z.coerce.number(),
            })
            .strict())
            .default({})
            .describe("Per-model price overrides (USD per 1M tokens {in,out}); merged over the built-in map"),
        ledgerPath: zod_1.z
            .string()
            .optional()
            .describe("Cumulative cost ledger sidecar (default <output>.cost.json)"),
    })
    .strict();
573
/**
 * Structured-emit adapters (data-sink track): graph-native sources mapped DIRECTLY
 * to graph fragments (no LLM), still flowing through merge/canon. Each adapter is
 * off by default; the registry is empty until one is enabled (default run unchanged).
 *
 * Currently a single adapter group, `sqlite`. Both the outer object and the
 * `sqlite` group are strict, and an omitted `sqlite` group defaults to `{}`.
 */
const AdaptersSchema = zod_1.z
    .object({
        sqlite: zod_1.z
            .object({
                enabled: zod_1.z
                    .boolean()
                    .default(false)
                    .describe("Map .db/.sqlite files directly to a graph (tables→types, rows→entities, FK→edges)"),
                extensions: zod_1.z
                    .array(zod_1.z.string())
                    .default([".db", ".sqlite", ".sqlite3"])
                    .describe("File extensions claimed by the SQLite adapter (a non-sqlite file still falls through)"),
                // A row cap is a count: reject negative values at validation time
                // instead of handing them to the adapter. (If the adapter forwards
                // this to a SQL LIMIT, a negative value would mean "no limit" in
                // SQLite — NOTE(review): confirm against SqliteAdapter.)
                maxRowsPerTable: zod_1.z.coerce
                    .number()
                    .int()
                    .nonnegative()
                    .default(5000)
                    .describe("Cap rows emitted per table (warns + truncates beyond)"),
                excludeTables: zod_1.z
                    .array(zod_1.z.string())
                    .default([])
                    .describe("Table names to skip entirely"),
            })
            .strict()
            .default({}),
    })
    .strict();
592
/**
 * Logging settings: level, optional log file, and the debug / silent /
 * NDJSON-progress switches (all three default to false). Strict object.
 */
const LoggingSchema = (() => {
    const { z } = zod_1;
    // Every boolean switch here is off unless explicitly set.
    const offByDefault = () => z.boolean().default(false);
    return z
        .object({
            level: exports.LogLevelEnum.default("info").describe("Log level"),
            file: z.string().optional().describe("Log file path"),
            debug: offByDefault().describe("Debug mode"),
            silent: offByDefault().describe("Silent mode"),
            progressNdjson: offByDefault().describe("Emit structured NDJSON progress events on stdout (suppresses pretty logging)"),
        })
        .strict();
})();
604
/**
 * Run-mode switches. Both flags default to false; the object is strict.
 */
const RuntimeSchema = (() => {
    const { z } = zod_1;
    const watch = z.boolean().default(false).describe("Watch for changes and rebuild the graph");
    const exportOnly = z
        .boolean()
        .default(false)
        .describe("Convert an existing graph JSON (input) to export.format");
    return z.object({ watch, exportOnly }).strict();
})();
610
+ // ── pipeline stages (canonicalization experiment) ──────────────────────────
611
+ //
612
+ // Explicit, reorderable, enable/disable stages (canon brief §3/§4). For
613
+ // Experiment 1 the producer stages (tf_analysis / schema_induction /
614
+ // extraction) run in fixed relative order and gate existing behavior; the
615
+ // genuinely reorderable part is the post-extraction graph→graph transforms
616
+ // (grounding, canonicalization), which is the seam Experiment 2 needs.
617
+ exports.TfAnalysisSourceEnum = zod_1.z.enum(["corpus", "graph"]);
618
+ exports.CanonMethodEnum = zod_1.z.enum(["embeddings", "llm", "hybrid"]);
619
+ exports.ClusterAlgoEnum = zod_1.z.enum(["agglomerative", "hdbscan", "kmeans"]);
620
+ exports.CanonicalSelectionEnum = zod_1.z.enum(["frequency", "degree"]);
621
+ exports.CanonTargetEnum = zod_1.z.enum(["entities", "relations"]);
622
// Stage identifiers in their default order; used as the default value of the
// pipeline's ordered `stages` list.
const DEFAULT_STAGES = ["tf_analysis", "schema_induction", "extraction", "grounding", "canonicalization"];
629
/**
 * Bare on/off toggle for a pipeline stage: a strict object whose only key,
 * `enabled`, defaults to true.
 */
const StageToggleSchema = (() => {
    const enabled = zod_1.z.boolean().default(true);
    return zod_1.z.object({ enabled }).strict();
})();
632
/**
 * The extraction stage: the bare toggle plus the vocabulary mode.
 *
 * `openPredicate` drops the closed entity/relation enum (and the
 * `related_to`/`other` coercion) for free `z.string()` types — the measurement of
 * the canonicalization tax: it lifts information recall (MINE) at the cost of
 * merge/graph hygiene. Default closed.
 */
const ExtractionStageSchema = (() => {
    const { z } = zod_1;
    const enabled = z.boolean().default(true);
    const openPredicate = z
        .boolean()
        .default(false)
        .describe("Free-vocabulary extraction: emit any predicate/entity type (no closed enum, no related_to/other coercion). Lifts recall, costs merge hygiene. Default closed.");
    const strictVocabulary = z
        .boolean()
        .default(false)
        .describe("Strict closed vocabulary: when a glossary supplies entity/relation types, the enum is EXACTLY those (+escape) — the base/domain sets are NOT unioned in. For feeding a known ontology as the authoritative schema. Ignored under openPredicate. Default off (augment base).");
    return z.object({ enabled, openPredicate, strictVocabulary }).strict();
})();
649
/**
 * Term-frequency analysis stage: an `enabled` toggle (default true) plus the
 * statistics `source`, which defaults to "corpus". Strict object.
 */
const TfAnalysisStageSchema = (() => {
    const enabled = zod_1.z.boolean().default(true);
    const source = exports.TfAnalysisSourceEnum
        .default("corpus")
        .describe("Term-frequency source: 'corpus' (lexical, Exp 1) | 'graph' (structural salience, Exp 2 — stat collection only)");
    return zod_1.z.object({ enabled, source }).strict();
})();
655
/**
 * Pipeline-level grounding stage.
 *
 * NOTE: distinct from the top-level `grounding` group (the inline *observation*
 * grounding gate). This is the *edge* co-occurrence gate — OFF for Experiment 1,
 * the precision gate Experiment 2 runs before canonicalization.
 */
const PipelineGroundingSchema = (() => {
    const { z } = zod_1;
    const enabled = z
        .boolean()
        .default(false)
        .describe("Edge co-occurrence grounding gate (OFF for Exp 1)");
    const requireCooccurrence = z
        .boolean()
        .default(true)
        .describe("Drop edges whose endpoints don't co-occur in their source span");
    return z.object({ enabled, requireCooccurrence }).strict();
})();
667
/**
 * Builds the clustering-options schema for one canonicalization target.
 *
 * @param threshold Value forwarded to the `num` helper to build the `threshold`
 *   field (presumably its default — confirm against `num`).
 * @returns A strict object schema with `cluster`, `threshold`, `linkage` and `k`.
 */
const CanonClusterSchema = (threshold) => {
    const { z } = zod_1;
    const cluster = exports.ClusterAlgoEnum
        .default("agglomerative")
        .describe("Clustering algorithm (only 'agglomerative' is implemented)");
    const thresholdField = num(threshold).describe("Cosine-similarity merge threshold");
    const linkage = z
        .enum(["single", "complete"])
        .default("complete")
        .describe("Linkage: 'complete' (every in-cluster pair ≥ threshold; stops sibling chaining) | 'single' (legacy connectivity)");
    // Coerced number or null; null is the default.
    const k = z.coerce.number().nullable().default(null).describe("Cluster count (only for kmeans)");
    return z.object({ cluster, threshold: thresholdField, linkage, k }).strict();
};
678
/**
 * Builds a fresh two-element tuple schema `[low, high]`, each element coerced to
 * a number. Callers attach their own default.
 */
const simBand = () => {
    const { z } = zod_1;
    const low = z.coerce.number();
    const high = z.coerce.number();
    return z.tuple([low, high]).describe("Similarity band [low, high] considered borderline");
};
681
/**
 * Canonicalization stage settings (disabled by default).
 *
 * Top-level knobs select what is canonicalized (`target`), how (`method`), and
 * how the representative is chosen (`canonicalSelection`). The nested
 * `embeddings`, `llm` and `hybrid` groups hold per-method options. Each group is
 * strict and defaults to `{}` when omitted.
 *
 * `blockTopN` and `maxAdjudications` are counts, so they are validated as
 * non-negative integers; a negative value has no defined meaning for either
 * ("0 = off" is the documented way to disable blocking).
 */
const CanonicalizationSchema = zod_1.z
    .object({
        enabled: zod_1.z
            .boolean()
            .default(false)
            .describe("Global embedding-clustering canonicalization pass (after merge)"),
        target: zod_1.z
            .array(exports.CanonTargetEnum)
            .default(["entities", "relations"])
            .describe("Canonicalize entity names/types, edge labels, or both"),
        method: exports.CanonMethodEnum
            .default("embeddings")
            .describe("embeddings (cluster) | llm (adjudicate) | hybrid (cluster + escalate borderline)"),
        canonicalSelection: exports.CanonicalSelectionEnum
            .default("frequency")
            .describe("Pick the cluster's canonical representative by frequency or graph degree"),
        // Neighbour count: integer >= 0, where 0 disables blocking.
        blockTopN: zod_1.z.coerce
            .number()
            .int()
            .nonnegative()
            .default(0)
            .describe("Blocking: only an item's N nearest neighbours are merge-eligible (complete-linkage). 0 = off"),
        // Call budget: integer >= 0.
        maxAdjudications: zod_1.z.coerce
            .number()
            .int()
            .nonnegative()
            .default(2000)
            .describe("Safety cap on LLM adjudication calls per canon pass; further escalations reject (the 26K guard)"),
        // Separate clustering options for entities and relations, each built
        // from its own threshold argument.
        embeddings: zod_1.z
            .object({
                entity: CanonClusterSchema(0.82).default({}),
                relation: CanonClusterSchema(0.85).default({}),
            })
            .strict()
            .default({}),
        llm: zod_1.z
            .object({
                model: zod_1.z.string().optional().describe("Adjudication model (defaults to llm.model)"),
                adjudicate: zod_1.z.enum(["borderline_only"]).default("borderline_only"),
                band: simBand().default([0.72, 0.88]),
            })
            .strict()
            .default({}),
        hybrid: zod_1.z
            .object({
                escalateBand: simBand().default([0.72, 0.88]),
            })
            .strict()
            .default({}),
    })
    .strict();
723
/**
 * Post-canonicalization `related_to` pruning gate.
 *
 * `related_to` is the relation layer's catch-all (NR-4): on the telegram-sink
 * corpus ~30% of edges. This gate prunes the low-value subset:
 *  - `redundant` drops a `related_to` edge only when the same unordered endpoint
 *    pair already carries a typed edge (safe — no information lost);
 *  - `all` drops every `related_to` edge (for consumers wanting only typed
 *    relations);
 *  - `off` (default) leaves edges alone.
 * Re-typing (LLM pass) is a future option.
 */
const PipelineRelationFilterSchema = (() => {
    const mode = zod_1.z
        .enum(["off", "redundant", "all"])
        .default("off")
        .describe("related_to pruning: off | redundant (drop when a typed twin exists) | all");
    return zod_1.z.object({ mode }).strict();
})();
736
/**
 * Pipeline configuration: the ordered stage list plus one option group per
 * stage. Every group defaults to `{}` when omitted; the object is strict.
 */
const PipelineSchema = (() => {
    const { z } = zod_1;
    // Free-form stage names; defaults to DEFAULT_STAGES.
    const stages = z
        .array(z.string())
        .default(DEFAULT_STAGES)
        .describe("Ordered stage list; reorder for Experiment 2 (typeless-first)");
    // Per-stage option groups, in the order they appear in the config object.
    const stageGroups = {
        tfAnalysis: TfAnalysisStageSchema.default({}),
        schemaInduction: StageToggleSchema.default({}),
        extraction: ExtractionStageSchema.default({}),
        grounding: PipelineGroundingSchema.default({}),
        canonicalization: CanonicalizationSchema.default({}),
        relationFilter: PipelineRelationFilterSchema.default({}),
    };
    return z.object({ stages, ...stageGroups }).strict();
})();
750
/**
 * Inspection settings for the canonicalization merge log: an off-by-default
 * switch and an optional output path. Strict object.
 */
const InspectionSchema = (() => {
    const { z } = zod_1;
    const emitMergeLog = z
        .boolean()
        .default(false)
        .describe("Write the per-cluster canonicalization merge log");
    const mergeLogPath = z.string().optional().describe("Merge-log path (default runs/<run_id>/merges.jsonl)");
    return z.object({ emitMergeLog, mergeLogPath }).strict();
})();
759
/**
 * Evaluation / experiment settings: optional seed (coerced to a number),
 * optional ground-truth path, and `pinVersions` (default true). Strict object.
 */
const EvalSchema = (() => {
    const { z } = zod_1;
    const seed = z.coerce.number().optional().describe("Experiment seed (recorded in the run manifest)");
    const groundTruth = z.string().optional().describe("Ground-truth facts JSONL for scoring");
    const pinVersions = z.boolean().default(true).describe("Pin model/embedding/seed versions in the run manifest");
    return z.object({ seed, groundTruth, pinVersions }).strict();
})();
769
+ // ── root schema ────────────────────────────────────────────────────────────
770
+ exports.ConfigSchema = zod_1.z
771
+ .object({
772
+ // Core run essentials stay top-level.
773
+ input: zod_1.z.string().default(".").describe("Input directory (or existing graph file in export-only mode)"),
774
+ filter: stringList(["**/*"]).describe("Include files by glob (string or list)"),
775
+ exclude: stringList([]).describe("Exclude files by glob (string or list)"),
776
+ output: zod_1.z.string().default("knowledge-graph.json").describe("Output knowledge graph file"),
777
+ description: zod_1.z.string().default("").describe("Short description of the corpus for the LLM"),
778
+ // Grouped by concern.
779
+ llm: LlmSchema.default({}),
780
+ embeddings: EmbeddingsSchema.default({}),
781
+ chunking: ChunkingSchema.default({}),
782
+ retrieval: RetrievalSchema.default({}),
783
+ merging: MergingSchema.default({}),
784
+ grounding: GroundingSchema.default({}),
785
+ corpus: CorpusSchema.default({}),
786
+ ast: AstSchema.default({}),
787
+ classifier: ClassifierSchema.default({}),
788
+ readers: ReadersSchema.default({}),
789
+ references: ReferencesSchema.default({}),
790
+ adapters: AdaptersSchema.default({}),
791
+ export: ExportSchema.default({}),
792
+ resume: ResumeSchema.default({}),
793
+ trace: TraceSchema.default({}),
794
+ cost: CostSchema.default({}),
795
+ logging: LoggingSchema.default({}),
796
+ runtime: RuntimeSchema.default({}),
797
+ // Canonicalization experiment (canon brief). Config-only (no CLI flags).
798
+ pipeline: PipelineSchema.default({}),
799
+ inspection: InspectionSchema.default({}),
800
+ eval: EvalSchema.default({}),
801
+ })
802
+ .strict();
803
+ //# sourceMappingURL=schema.js.map