@wanshi-kg/wanshi 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (443) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +458 -0
  3. package/dist/__tests__/helpers.js +27 -0
  4. package/dist/__tests__/helpers.js.map +1 -0
  5. package/dist/cli/commands/export.command.js +99 -0
  6. package/dist/cli/commands/export.command.js.map +1 -0
  7. package/dist/cli/commands/index.js +22 -0
  8. package/dist/cli/commands/index.js.map +1 -0
  9. package/dist/cli/commands/inspectMerges.command.js +84 -0
  10. package/dist/cli/commands/inspectMerges.command.js.map +1 -0
  11. package/dist/cli/commands/metrics.command.js +196 -0
  12. package/dist/cli/commands/metrics.command.js.map +1 -0
  13. package/dist/cli/commands/process.command.js +82 -0
  14. package/dist/cli/commands/process.command.js.map +1 -0
  15. package/dist/cli/commands/watch.command.js +91 -0
  16. package/dist/cli/commands/watch.command.js.map +1 -0
  17. package/dist/cli/index.js +269 -0
  18. package/dist/cli/index.js.map +1 -0
  19. package/dist/cli/optionsToConfig.js +160 -0
  20. package/dist/cli/optionsToConfig.js.map +1 -0
  21. package/dist/config/index.js +59 -0
  22. package/dist/config/index.js.map +1 -0
  23. package/dist/config/legacyHints.js +113 -0
  24. package/dist/config/legacyHints.js.map +1 -0
  25. package/dist/config/schema.js +803 -0
  26. package/dist/config/schema.js.map +1 -0
  27. package/dist/config/ui.js +221 -0
  28. package/dist/config/ui.js.map +1 -0
  29. package/dist/core/DirectoryProcessor.js +725 -0
  30. package/dist/core/DirectoryProcessor.js.map +1 -0
  31. package/dist/core/adapters/IStructuredAdapter.js +3 -0
  32. package/dist/core/adapters/IStructuredAdapter.js.map +1 -0
  33. package/dist/core/adapters/SqliteAdapter.js +267 -0
  34. package/dist/core/adapters/SqliteAdapter.js.map +1 -0
  35. package/dist/core/adapters/StructuredAdapterRegistry.js +31 -0
  36. package/dist/core/adapters/StructuredAdapterRegistry.js.map +1 -0
  37. package/dist/core/adapters/index.js +20 -0
  38. package/dist/core/adapters/index.js.map +1 -0
  39. package/dist/core/checkpoint/CheckpointService.js +188 -0
  40. package/dist/core/checkpoint/CheckpointService.js.map +1 -0
  41. package/dist/core/checkpoint/index.js +18 -0
  42. package/dist/core/checkpoint/index.js.map +1 -0
  43. package/dist/core/corpus/CorpusAnalyzer.js +266 -0
  44. package/dist/core/corpus/CorpusAnalyzer.js.map +1 -0
  45. package/dist/core/corpus/CorpusProfileStore.js +92 -0
  46. package/dist/core/corpus/CorpusProfileStore.js.map +1 -0
  47. package/dist/core/corpus/index.js +21 -0
  48. package/dist/core/corpus/index.js.map +1 -0
  49. package/dist/core/corpus/normalizeGlossary.js +60 -0
  50. package/dist/core/corpus/normalizeGlossary.js.map +1 -0
  51. package/dist/core/corpus/relPath.js +52 -0
  52. package/dist/core/corpus/relPath.js.map +1 -0
  53. package/dist/core/corpus/termFrequency.js +86 -0
  54. package/dist/core/corpus/termFrequency.js.map +1 -0
  55. package/dist/core/cost/CostMeter.js +235 -0
  56. package/dist/core/cost/CostMeter.js.map +1 -0
  57. package/dist/core/cost/index.js +19 -0
  58. package/dist/core/cost/index.js.map +1 -0
  59. package/dist/core/cost/prices.js +38 -0
  60. package/dist/core/cost/prices.js.map +1 -0
  61. package/dist/core/cv/ObjectDetectionService.js +119 -0
  62. package/dist/core/cv/ObjectDetectionService.js.map +1 -0
  63. package/dist/core/di/ContainerFactory.js +670 -0
  64. package/dist/core/di/ContainerFactory.js.map +1 -0
  65. package/dist/core/di/DIContainer.js +103 -0
  66. package/dist/core/di/DIContainer.js.map +1 -0
  67. package/dist/core/di/index.js +19 -0
  68. package/dist/core/di/index.js.map +1 -0
  69. package/dist/core/errors/CustomErrors.js +342 -0
  70. package/dist/core/errors/CustomErrors.js.map +1 -0
  71. package/dist/core/errors/index.js +18 -0
  72. package/dist/core/errors/index.js.map +1 -0
  73. package/dist/core/export/KnowledgeGraphExportService.js +56 -0
  74. package/dist/core/export/KnowledgeGraphExportService.js.map +1 -0
  75. package/dist/core/export/index.js +19 -0
  76. package/dist/core/export/index.js.map +1 -0
  77. package/dist/core/export/strategies/GraphitiExportStrategy.js +115 -0
  78. package/dist/core/export/strategies/GraphitiExportStrategy.js.map +1 -0
  79. package/dist/core/export/strategies/GraphvizDotExportStrategy.js +331 -0
  80. package/dist/core/export/strategies/GraphvizDotExportStrategy.js.map +1 -0
  81. package/dist/core/export/strategies/IExportStrategy.js +3 -0
  82. package/dist/core/export/strategies/IExportStrategy.js.map +1 -0
  83. package/dist/core/export/strategies/JsonExportStrategy.js +19 -0
  84. package/dist/core/export/strategies/JsonExportStrategy.js.map +1 -0
  85. package/dist/core/export/strategies/JsonlExportStrategy.js +69 -0
  86. package/dist/core/export/strategies/JsonlExportStrategy.js.map +1 -0
  87. package/dist/core/export/strategies/KblamExportStrategy.js +36 -0
  88. package/dist/core/export/strategies/KblamExportStrategy.js.map +1 -0
  89. package/dist/core/export/strategies/LoraExportStrategy.js +46 -0
  90. package/dist/core/export/strategies/LoraExportStrategy.js.map +1 -0
  91. package/dist/core/export/strategies/McpExportStrategy.js +67 -0
  92. package/dist/core/export/strategies/McpExportStrategy.js.map +1 -0
  93. package/dist/core/export/strategies/index.js +25 -0
  94. package/dist/core/export/strategies/index.js.map +1 -0
  95. package/dist/core/export/strategies/kbTriples.js +60 -0
  96. package/dist/core/export/strategies/kbTriples.js.map +1 -0
  97. package/dist/core/index.js +22 -0
  98. package/dist/core/index.js.map +1 -0
  99. package/dist/core/knowledge/KnowledgeGraphBuilder.js +627 -0
  100. package/dist/core/knowledge/KnowledgeGraphBuilder.js.map +1 -0
  101. package/dist/core/knowledge/MergeRecord.js +3 -0
  102. package/dist/core/knowledge/MergeRecord.js.map +1 -0
  103. package/dist/core/knowledge/canon/Canonicalizer.js +414 -0
  104. package/dist/core/knowledge/canon/Canonicalizer.js.map +1 -0
  105. package/dist/core/knowledge/canon/index.js +18 -0
  106. package/dist/core/knowledge/canon/index.js.map +1 -0
  107. package/dist/core/knowledge/contradiction/HeuristicContradictionChecker.js +92 -0
  108. package/dist/core/knowledge/contradiction/HeuristicContradictionChecker.js.map +1 -0
  109. package/dist/core/knowledge/contradiction/LlmContradictionChecker.js +52 -0
  110. package/dist/core/knowledge/contradiction/LlmContradictionChecker.js.map +1 -0
  111. package/dist/core/knowledge/contradiction/index.js +19 -0
  112. package/dist/core/knowledge/contradiction/index.js.map +1 -0
  113. package/dist/core/knowledge/grounding/KeywordGroundingChecker.js +33 -0
  114. package/dist/core/knowledge/grounding/KeywordGroundingChecker.js.map +1 -0
  115. package/dist/core/knowledge/grounding/MiniCheckGroundingChecker.js +82 -0
  116. package/dist/core/knowledge/grounding/MiniCheckGroundingChecker.js.map +1 -0
  117. package/dist/core/knowledge/grounding/index.js +20 -0
  118. package/dist/core/knowledge/grounding/index.js.map +1 -0
  119. package/dist/core/knowledge/grounding/verbalize.js +38 -0
  120. package/dist/core/knowledge/grounding/verbalize.js.map +1 -0
  121. package/dist/core/knowledge/images/imageMetaGraph.js +136 -0
  122. package/dist/core/knowledge/images/imageMetaGraph.js.map +1 -0
  123. package/dist/core/knowledge/index.js +20 -0
  124. package/dist/core/knowledge/index.js.map +1 -0
  125. package/dist/core/knowledge/merging/KnowledgeMerger.js +624 -0
  126. package/dist/core/knowledge/merging/KnowledgeMerger.js.map +1 -0
  127. package/dist/core/knowledge/references/ReferenceResolver.js +184 -0
  128. package/dist/core/knowledge/references/ReferenceResolver.js.map +1 -0
  129. package/dist/core/knowledge/references/citations/CitationEvidenceProcessor.js +401 -0
  130. package/dist/core/knowledge/references/citations/CitationEvidenceProcessor.js.map +1 -0
  131. package/dist/core/knowledge/references/citations/CitationResolver.js +95 -0
  132. package/dist/core/knowledge/references/citations/CitationResolver.js.map +1 -0
  133. package/dist/core/knowledge/references/citations/GrobidClient.js +143 -0
  134. package/dist/core/knowledge/references/citations/GrobidClient.js.map +1 -0
  135. package/dist/core/knowledge/references/citations/TitleIdResolver.js +101 -0
  136. package/dist/core/knowledge/references/citations/TitleIdResolver.js.map +1 -0
  137. package/dist/core/knowledge/references/web/FetchCacheService.js +114 -0
  138. package/dist/core/knowledge/references/web/FetchCacheService.js.map +1 -0
  139. package/dist/core/knowledge/references/web/GatedFetcher.js +228 -0
  140. package/dist/core/knowledge/references/web/GatedFetcher.js.map +1 -0
  141. package/dist/core/knowledge/references/web/WebReferenceProcessor.js +164 -0
  142. package/dist/core/knowledge/references/web/WebReferenceProcessor.js.map +1 -0
  143. package/dist/core/knowledge/search/KnowledgeGraphSearch.js +261 -0
  144. package/dist/core/knowledge/search/KnowledgeGraphSearch.js.map +1 -0
  145. package/dist/core/knowledge/vocabulary.js +162 -0
  146. package/dist/core/knowledge/vocabulary.js.map +1 -0
  147. package/dist/core/llm/EmbeddingService.js +113 -0
  148. package/dist/core/llm/EmbeddingService.js.map +1 -0
  149. package/dist/core/llm/OllamaService.js +146 -0
  150. package/dist/core/llm/OllamaService.js.map +1 -0
  151. package/dist/core/llm/OpenAICompatibleService.js +190 -0
  152. package/dist/core/llm/OpenAICompatibleService.js.map +1 -0
  153. package/dist/core/llm/OpenAIEmbeddingService.js +129 -0
  154. package/dist/core/llm/OpenAIEmbeddingService.js.map +1 -0
  155. package/dist/core/llm/embeddingUtils.js +25 -0
  156. package/dist/core/llm/embeddingUtils.js.map +1 -0
  157. package/dist/core/llm/index.js +23 -0
  158. package/dist/core/llm/index.js.map +1 -0
  159. package/dist/core/llm/prompts/PromptManager.js +388 -0
  160. package/dist/core/llm/prompts/PromptManager.js.map +1 -0
  161. package/dist/core/llm/prompts/PromptTemplateEngine.js +257 -0
  162. package/dist/core/llm/prompts/PromptTemplateEngine.js.map +1 -0
  163. package/dist/core/llm/prompts/templates/partials/examples/EXAMPLE_STYLE_GUIDE.md +84 -0
  164. package/dist/core/llm/prompts/templates/partials/examples/article.md +187 -0
  165. package/dist/core/llm/prompts/templates/partials/examples/code.md +229 -0
  166. package/dist/core/llm/prompts/templates/partials/examples/communication.md +205 -0
  167. package/dist/core/llm/prompts/templates/partials/examples/documentation.md +262 -0
  168. package/dist/core/llm/prompts/templates/partials/examples/financial.md +157 -0
  169. package/dist/core/llm/prompts/templates/partials/examples/legal.md +153 -0
  170. package/dist/core/llm/prompts/templates/partials/examples/logs.md +127 -0
  171. package/dist/core/llm/prompts/templates/partials/examples/medical.md +218 -0
  172. package/dist/core/llm/prompts/templates/partials/examples/notes.md +201 -0
  173. package/dist/core/llm/prompts/templates/partials/examples/research.md +208 -0
  174. package/dist/core/llm/prompts/templates/partials/examples/tabular.md +178 -0
  175. package/dist/core/llm/prompts/templates/partials/examples/transcript.md +204 -0
  176. package/dist/core/llm/prompts/templates/partials/retrieved-context.hbs +18 -0
  177. package/dist/core/llm/prompts/templates/v1/system.hbs +371 -0
  178. package/dist/core/llm/prompts/templates/v1/user.hbs +20 -0
  179. package/dist/core/llm/prompts/templates/v2/system.hbs +573 -0
  180. package/dist/core/llm/prompts/templates/v2/user.hbs +20 -0
  181. package/dist/core/llm/prompts/templates/v3/system.hbs +861 -0
  182. package/dist/core/llm/prompts/templates/v3/user.hbs +16 -0
  183. package/dist/core/llm/prompts/templates/v4/system.hbs +800 -0
  184. package/dist/core/llm/prompts/templates/v4/user.hbs +40 -0
  185. package/dist/core/llm/prompts/templates/v4.5/system.hbs +71 -0
  186. package/dist/core/llm/prompts/templates/v4.5/user.hbs +46 -0
  187. package/dist/core/llm/prompts/templates/v5/glossary/system.hbs +40 -0
  188. package/dist/core/llm/prompts/templates/v5/glossary/user.hbs +11 -0
  189. package/dist/core/llm/prompts/templates/v5/system.hbs +163 -0
  190. package/dist/core/llm/prompts/templates/v5/user.hbs +55 -0
  191. package/dist/core/pipeline/GroundingTransform.js +52 -0
  192. package/dist/core/pipeline/GroundingTransform.js.map +1 -0
  193. package/dist/core/pipeline/PipelineRunner.js +51 -0
  194. package/dist/core/pipeline/PipelineRunner.js.map +1 -0
  195. package/dist/core/pipeline/RelationFilterTransform.js +72 -0
  196. package/dist/core/pipeline/RelationFilterTransform.js.map +1 -0
  197. package/dist/core/pipeline/index.js +20 -0
  198. package/dist/core/pipeline/index.js.map +1 -0
  199. package/dist/core/processor/FileProcessor.js +184 -0
  200. package/dist/core/processor/FileProcessor.js.map +1 -0
  201. package/dist/core/processor/ProcessedRegistry.js +38 -0
  202. package/dist/core/processor/ProcessedRegistry.js.map +1 -0
  203. package/dist/core/processor/ast/AstSeedService.js +0 -0
  204. package/dist/core/processor/ast/AstSeedService.js.map +1 -0
  205. package/dist/core/processor/ast/AstSymbolStore.js +110 -0
  206. package/dist/core/processor/ast/AstSymbolStore.js.map +1 -0
  207. package/dist/core/processor/ast/index.js +19 -0
  208. package/dist/core/processor/ast/index.js.map +1 -0
  209. package/dist/core/processor/chunking/TextChunker.js +98 -0
  210. package/dist/core/processor/chunking/TextChunker.js.map +1 -0
  211. package/dist/core/processor/chunking/index.js +18 -0
  212. package/dist/core/processor/chunking/index.js.map +1 -0
  213. package/dist/core/processor/classifier/CONTENT_CLASSES.js +294 -0
  214. package/dist/core/processor/classifier/CONTENT_CLASSES.js.map +1 -0
  215. package/dist/core/processor/classifier/CascadeContentClassifier.js +107 -0
  216. package/dist/core/processor/classifier/CascadeContentClassifier.js.map +1 -0
  217. package/dist/core/processor/classifier/HeuristicContentClassifier.js +113 -0
  218. package/dist/core/processor/classifier/HeuristicContentClassifier.js.map +1 -0
  219. package/dist/core/processor/classifier/IContentTypeClassifier.js +3 -0
  220. package/dist/core/processor/classifier/IContentTypeClassifier.js.map +1 -0
  221. package/dist/core/processor/classifier/LlmContentClassifier.js +107 -0
  222. package/dist/core/processor/classifier/LlmContentClassifier.js.map +1 -0
  223. package/dist/core/processor/classifier/NER_DOMAIN_EXAMPLES.js +498 -0
  224. package/dist/core/processor/classifier/NER_DOMAIN_EXAMPLES.js.map +1 -0
  225. package/dist/core/processor/classifier/index.js +21 -0
  226. package/dist/core/processor/classifier/index.js.map +1 -0
  227. package/dist/core/processor/classifier/mergeClassifications.js +32 -0
  228. package/dist/core/processor/classifier/mergeClassifications.js.map +1 -0
  229. package/dist/core/processor/index.js +20 -0
  230. package/dist/core/processor/index.js.map +1 -0
  231. package/dist/core/processor/readers/AudioReader.js +462 -0
  232. package/dist/core/processor/readers/AudioReader.js.map +1 -0
  233. package/dist/core/processor/readers/BinaryReader.js +90 -0
  234. package/dist/core/processor/readers/BinaryReader.js.map +1 -0
  235. package/dist/core/processor/readers/ChandraPdfReader.js +187 -0
  236. package/dist/core/processor/readers/ChandraPdfReader.js.map +1 -0
  237. package/dist/core/processor/readers/ChatExportReader.js +365 -0
  238. package/dist/core/processor/readers/ChatExportReader.js.map +1 -0
  239. package/dist/core/processor/readers/DoclingReader.js +445 -0
  240. package/dist/core/processor/readers/DoclingReader.js.map +1 -0
  241. package/dist/core/processor/readers/EmailReader.js +259 -0
  242. package/dist/core/processor/readers/EmailReader.js.map +1 -0
  243. package/dist/core/processor/readers/EpubReader.js +175 -0
  244. package/dist/core/processor/readers/EpubReader.js.map +1 -0
  245. package/dist/core/processor/readers/FileReader.js +90 -0
  246. package/dist/core/processor/readers/FileReader.js.map +1 -0
  247. package/dist/core/processor/readers/FileReaderFactory.js +49 -0
  248. package/dist/core/processor/readers/FileReaderFactory.js.map +1 -0
  249. package/dist/core/processor/readers/HtmlReader.js +371 -0
  250. package/dist/core/processor/readers/HtmlReader.js.map +1 -0
  251. package/dist/core/processor/readers/ImageReader.js +162 -0
  252. package/dist/core/processor/readers/ImageReader.js.map +1 -0
  253. package/dist/core/processor/readers/JsonFileReader.js +232 -0
  254. package/dist/core/processor/readers/JsonFileReader.js.map +1 -0
  255. package/dist/core/processor/readers/JupyterReader.js +178 -0
  256. package/dist/core/processor/readers/JupyterReader.js.map +1 -0
  257. package/dist/core/processor/readers/LatexReader.js +176 -0
  258. package/dist/core/processor/readers/LatexReader.js.map +1 -0
  259. package/dist/core/processor/readers/MarkdownReader.js +289 -0
  260. package/dist/core/processor/readers/MarkdownReader.js.map +1 -0
  261. package/dist/core/processor/readers/MarkerPdfReader.js +193 -0
  262. package/dist/core/processor/readers/MarkerPdfReader.js.map +1 -0
  263. package/dist/core/processor/readers/MistralOcrReader.js +198 -0
  264. package/dist/core/processor/readers/MistralOcrReader.js.map +1 -0
  265. package/dist/core/processor/readers/OfficeReader.js +174 -0
  266. package/dist/core/processor/readers/OfficeReader.js.map +1 -0
  267. package/dist/core/processor/readers/PdfReader.js +116 -0
  268. package/dist/core/processor/readers/PdfReader.js.map +1 -0
  269. package/dist/core/processor/readers/RtfReader.js +107 -0
  270. package/dist/core/processor/readers/RtfReader.js.map +1 -0
  271. package/dist/core/processor/readers/SubtitleReader.js +145 -0
  272. package/dist/core/processor/readers/SubtitleReader.js.map +1 -0
  273. package/dist/core/processor/readers/TesseractPdfReader.js +183 -0
  274. package/dist/core/processor/readers/TesseractPdfReader.js.map +1 -0
  275. package/dist/core/processor/readers/TextReader.js +129 -0
  276. package/dist/core/processor/readers/TextReader.js.map +1 -0
  277. package/dist/core/processor/readers/TranscriptReader.js +234 -0
  278. package/dist/core/processor/readers/TranscriptReader.js.map +1 -0
  279. package/dist/core/processor/readers/image/imageMetadata.js +155 -0
  280. package/dist/core/processor/readers/image/imageMetadata.js.map +1 -0
  281. package/dist/core/processor/readers/index.js +41 -0
  282. package/dist/core/processor/readers/index.js.map +1 -0
  283. package/dist/core/processor/readers/referenceExtraction.js +198 -0
  284. package/dist/core/processor/readers/referenceExtraction.js.map +1 -0
  285. package/dist/core/processor/readers/stripReferences.js +59 -0
  286. package/dist/core/processor/readers/stripReferences.js.map +1 -0
  287. package/dist/core/processor/readers/transcript/turnPacking.js +81 -0
  288. package/dist/core/processor/readers/transcript/turnPacking.js.map +1 -0
  289. package/dist/core/progress/NdjsonProgressEmitter.js +30 -0
  290. package/dist/core/progress/NdjsonProgressEmitter.js.map +1 -0
  291. package/dist/core/progress/NoopProgressEmitter.js +15 -0
  292. package/dist/core/progress/NoopProgressEmitter.js.map +1 -0
  293. package/dist/core/progress/index.js +19 -0
  294. package/dist/core/progress/index.js.map +1 -0
  295. package/dist/core/trace/TraceWriter.js +100 -0
  296. package/dist/core/trace/TraceWriter.js.map +1 -0
  297. package/dist/core/trace/events.js +13 -0
  298. package/dist/core/trace/events.js.map +1 -0
  299. package/dist/core/trace/index.js +20 -0
  300. package/dist/core/trace/index.js.map +1 -0
  301. package/dist/core/trace/lineage.js +97 -0
  302. package/dist/core/trace/lineage.js.map +1 -0
  303. package/dist/evaluation/BenchmarkRunner.js +171 -0
  304. package/dist/evaluation/BenchmarkRunner.js.map +1 -0
  305. package/dist/evaluation/classifier/ClassifierAccuracy.js +185 -0
  306. package/dist/evaluation/classifier/ClassifierAccuracy.js.map +1 -0
  307. package/dist/evaluation/classifier/labeledSamples.js +379 -0
  308. package/dist/evaluation/classifier/labeledSamples.js.map +1 -0
  309. package/dist/evaluation/compare/goldCompare.js +126 -0
  310. package/dist/evaluation/compare/goldCompare.js.map +1 -0
  311. package/dist/evaluation/crossre/compareScoring.js +30 -0
  312. package/dist/evaluation/crossre/compareScoring.js.map +1 -0
  313. package/dist/evaluation/datasets/CrossREDataset.js +170 -0
  314. package/dist/evaluation/datasets/CrossREDataset.js.map +1 -0
  315. package/dist/evaluation/datasets/IDataset.js +3 -0
  316. package/dist/evaluation/datasets/IDataset.js.map +1 -0
  317. package/dist/evaluation/datasets/RebelDataset.js +117 -0
  318. package/dist/evaluation/datasets/RebelDataset.js.map +1 -0
  319. package/dist/evaluation/datasets/RedocredDataset.js +218 -0
  320. package/dist/evaluation/datasets/RedocredDataset.js.map +1 -0
  321. package/dist/evaluation/datasets/SemEval2010Dataset.js +150 -0
  322. package/dist/evaluation/datasets/SemEval2010Dataset.js.map +1 -0
  323. package/dist/evaluation/index.js +33 -0
  324. package/dist/evaluation/index.js.map +1 -0
  325. package/dist/evaluation/matching/ExactMatcher.js +75 -0
  326. package/dist/evaluation/matching/ExactMatcher.js.map +1 -0
  327. package/dist/evaluation/matching/SemanticMatcher.js +143 -0
  328. package/dist/evaluation/matching/SemanticMatcher.js.map +1 -0
  329. package/dist/evaluation/metrics/TripleMetrics.js +64 -0
  330. package/dist/evaluation/metrics/TripleMetrics.js.map +1 -0
  331. package/dist/evaluation/mine/MineCheckpoint.js +114 -0
  332. package/dist/evaluation/mine/MineCheckpoint.js.map +1 -0
  333. package/dist/evaluation/mine/MineDataset.js +208 -0
  334. package/dist/evaluation/mine/MineDataset.js.map +1 -0
  335. package/dist/evaluation/mine/MineReporter.js +98 -0
  336. package/dist/evaluation/mine/MineReporter.js.map +1 -0
  337. package/dist/evaluation/mine/MineRunner.js +148 -0
  338. package/dist/evaluation/mine/MineRunner.js.map +1 -0
  339. package/dist/evaluation/mine/MineScorer.js +127 -0
  340. package/dist/evaluation/mine/MineScorer.js.map +1 -0
  341. package/dist/evaluation/mine/types.js +12 -0
  342. package/dist/evaluation/mine/types.js.map +1 -0
  343. package/dist/evaluation/reporters/ConsoleReporter.js +55 -0
  344. package/dist/evaluation/reporters/ConsoleReporter.js.map +1 -0
  345. package/dist/evaluation/reporters/JsonReporter.js +50 -0
  346. package/dist/evaluation/reporters/JsonReporter.js.map +1 -0
  347. package/dist/index.js +28 -0
  348. package/dist/index.js.map +1 -0
  349. package/dist/quality/CompositeScore.js +61 -0
  350. package/dist/quality/CompositeScore.js.map +1 -0
  351. package/dist/quality/ConsistencyMetrics.js +70 -0
  352. package/dist/quality/ConsistencyMetrics.js.map +1 -0
  353. package/dist/quality/FactualMetrics.js +76 -0
  354. package/dist/quality/FactualMetrics.js.map +1 -0
  355. package/dist/quality/GraphHealthMetrics.js +68 -0
  356. package/dist/quality/GraphHealthMetrics.js.map +1 -0
  357. package/dist/quality/SemanticMetrics.js +102 -0
  358. package/dist/quality/SemanticMetrics.js.map +1 -0
  359. package/dist/quality/StructuralMetrics.js +60 -0
  360. package/dist/quality/StructuralMetrics.js.map +1 -0
  361. package/dist/quality/index.js +23 -0
  362. package/dist/quality/index.js.map +1 -0
  363. package/dist/shared/index.js +20 -0
  364. package/dist/shared/index.js.map +1 -0
  365. package/dist/shared/logger/Logger.js +3 -0
  366. package/dist/shared/logger/Logger.js.map +1 -0
  367. package/dist/shared/logger/LoggerFactory.js +75 -0
  368. package/dist/shared/logger/LoggerFactory.js.map +1 -0
  369. package/dist/shared/logger/index.js +19 -0
  370. package/dist/shared/logger/index.js.map +1 -0
  371. package/dist/shared/shutdown.js +30 -0
  372. package/dist/shared/shutdown.js.map +1 -0
  373. package/dist/shared/utils/agglomerativeCluster.js +269 -0
  374. package/dist/shared/utils/agglomerativeCluster.js.map +1 -0
  375. package/dist/shared/utils/astSymbols.js +69 -0
  376. package/dist/shared/utils/astSymbols.js.map +1 -0
  377. package/dist/shared/utils/cosineSimilarity.js +18 -0
  378. package/dist/shared/utils/cosineSimilarity.js.map +1 -0
  379. package/dist/shared/utils/directoryTree.js +184 -0
  380. package/dist/shared/utils/directoryTree.js.map +1 -0
  381. package/dist/shared/utils/documentOutline.js +74 -0
  382. package/dist/shared/utils/documentOutline.js.map +1 -0
  383. package/dist/shared/utils/index.js +24 -0
  384. package/dist/shared/utils/index.js.map +1 -0
  385. package/dist/shared/utils/jaroWinklerSimilarity.js +60 -0
  386. package/dist/shared/utils/jaroWinklerSimilarity.js.map +1 -0
  387. package/dist/shared/utils/parseJsonLenient.js +27 -0
  388. package/dist/shared/utils/parseJsonLenient.js.map +1 -0
  389. package/dist/shared/utils/readConfig.js +42 -0
  390. package/dist/shared/utils/readConfig.js.map +1 -0
  391. package/dist/shared/utils/readRtf.js +216 -0
  392. package/dist/shared/utils/readRtf.js.map +1 -0
  393. package/dist/shared/utils/softmax.js +26 -0
  394. package/dist/shared/utils/softmax.js.map +1 -0
  395. package/dist/types/ContentClass.js +3 -0
  396. package/dist/types/ContentClass.js.map +1 -0
  397. package/dist/types/CorpusProfile.js +3 -0
  398. package/dist/types/CorpusProfile.js.map +1 -0
  399. package/dist/types/IContradictionChecker.js +3 -0
  400. package/dist/types/IContradictionChecker.js.map +1 -0
  401. package/dist/types/ICorpusAnalyzer.js +3 -0
  402. package/dist/types/ICorpusAnalyzer.js.map +1 -0
  403. package/dist/types/IDirectoryProcessor.js +3 -0
  404. package/dist/types/IDirectoryProcessor.js.map +1 -0
  405. package/dist/types/IEmbeddingProvider.js +3 -0
  406. package/dist/types/IEmbeddingProvider.js.map +1 -0
  407. package/dist/types/IEmbeddingService.js +6 -0
  408. package/dist/types/IEmbeddingService.js.map +1 -0
  409. package/dist/types/IFileProcessor.js +3 -0
  410. package/dist/types/IFileProcessor.js.map +1 -0
  411. package/dist/types/IGroundingChecker.js +3 -0
  412. package/dist/types/IGroundingChecker.js.map +1 -0
  413. package/dist/types/IKnowledgeGraphBuilder.js +3 -0
  414. package/dist/types/IKnowledgeGraphBuilder.js.map +1 -0
  415. package/dist/types/IKnowledgeGraphExporter.js +3 -0
  416. package/dist/types/IKnowledgeGraphExporter.js.map +1 -0
  417. package/dist/types/IKnowledgeGraphMerger.js +3 -0
  418. package/dist/types/IKnowledgeGraphMerger.js.map +1 -0
  419. package/dist/types/IKnowledgeGraphSearch.js +3 -0
  420. package/dist/types/IKnowledgeGraphSearch.js.map +1 -0
  421. package/dist/types/ILLMProvider.js +3 -0
  422. package/dist/types/ILLMProvider.js.map +1 -0
  423. package/dist/types/ILLMService.js +3 -0
  424. package/dist/types/ILLMService.js.map +1 -0
  425. package/dist/types/IObjectDetector.js +3 -0
  426. package/dist/types/IObjectDetector.js.map +1 -0
  427. package/dist/types/IProcessingService.js +3 -0
  428. package/dist/types/IProcessingService.js.map +1 -0
  429. package/dist/types/IProgressEmitter.js +3 -0
  430. package/dist/types/IProgressEmitter.js.map +1 -0
  431. package/dist/types/IPromptManager.js +3 -0
  432. package/dist/types/IPromptManager.js.map +1 -0
  433. package/dist/types/KnowledgeGraph.js +3 -0
  434. package/dist/types/KnowledgeGraph.js.map +1 -0
  435. package/dist/types/MCPKnowledgeGraph.js +3 -0
  436. package/dist/types/MCPKnowledgeGraph.js.map +1 -0
  437. package/dist/types/Observation.js +21 -0
  438. package/dist/types/Observation.js.map +1 -0
  439. package/dist/types/ProcessingOptions.js +3 -0
  440. package/dist/types/ProcessingOptions.js.map +1 -0
  441. package/dist/types/index.js +40 -0
  442. package/dist/types/index.js.map +1 -0
  443. package/package.json +122 -0
@@ -0,0 +1,266 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
36
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
37
+ return new (P || (P = Promise))(function (resolve, reject) {
38
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
39
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
40
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
41
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
42
+ });
43
+ };
44
+ Object.defineProperty(exports, "__esModule", { value: true });
45
+ exports.CorpusAnalyzer = void 0;
46
+ const crypto = __importStar(require("crypto"));
47
+ const fs = __importStar(require("fs"));
48
+ const zod_1 = require("zod");
49
+ const shared_1 = require("../../shared");
50
+ const termFrequency_1 = require("./termFrequency");
51
+ const CorpusProfileStore_1 = require("./CorpusProfileStore");
52
+ const normalizeGlossary_1 = require("./normalizeGlossary");
53
+ const relPath_1 = require("./relPath");
54
+ /** Per-file text read for the pre-pass is capped to bound frequency + classifier cost. */
55
+ const PER_FILE_CHAR_CAP = 16000;
56
+ /** Inline glossary system prompt, used when no versioned template is available. */
57
+ const FALLBACK_GLOSSARY_SYSTEM = "You design a controlled vocabulary (glossary) for knowledge-graph extraction " +
58
+ "over a document corpus. Given the dominant content type and the most frequent " +
59
+ "terms, propose: (1) canonical ENTITY NAMES — the real recurring proper nouns / " +
60
+ "key concepts, each normalized to ONE canonical spelling so extraction stays " +
61
+ "consistent; (2) ENTITY TYPES appropriate to this corpus; (3) RELATION TYPES " +
62
+ "appropriate to this corpus. Prefer terms that actually appear. Be concise — a " +
63
+ "few dozen names at most. Return JSON only.";
64
+ const GlossarySchema = zod_1.z.object({
65
+ entityNames: zod_1.z.array(zod_1.z.string()).describe("Canonical entity names recurring in this corpus"),
66
+ entityTypes: zod_1.z.array(zod_1.z.string()).describe("Entity type categories appropriate to this corpus"),
67
+ relationTypes: zod_1.z.array(zod_1.z.string()).describe("Relation type names appropriate to this corpus"),
68
+ });
69
/**
 * Corpus analysis pre-pass: read + classify each file (char-capped), count term
 * frequency, then make ONE LLM call to propose a corpus-specific glossary
 * (canonical entity names / types / relation types). Cached to a sidecar and
 * reused on re-run when the corpus + model are unchanged.
 *
 * A profile is only persisted when it is complete: a failed glossary call or an
 * interrupted (partial) pre-pass is returned to the caller for this run but is
 * never written to the sidecar, so the next run rebuilds it.
 */
class CorpusAnalyzer {
    /**
     * @param llm Service used for the single glossary call (`generateStructured`).
     * @param classifier Optional; when truthy, `classify(text, file)` is called per file.
     * @param readerFactory Provides `getReader(file)`; the reader's `read(file)` result
     *   is consumed through its `chunks[].content`.
     * @param logger Receives `info` / `warn` messages.
     * @param promptManager Optional; when present, `getGlossaryPrompt(...)` supplies
     *   the versioned system/user prompt.
     */
    constructor(llm, classifier, readerFactory, logger, promptManager) {
        this.llm = llm;
        this.classifier = classifier;
        this.readerFactory = readerFactory;
        this.logger = logger;
        this.promptManager = promptManager;
    }
    /**
     * Return the cached corpus profile when its validity key still matches,
     * otherwise build a fresh one.
     *
     * @param files Paths of the files that make up the corpus.
     * @param options Reads `input`, `output`, `corpus.{topTerms,profilePath,clustering}`,
     *   `llm.model` and `classifier.mode`.
     * @returns Promise of the profile (`generatedAt`, `key`, `fileCount`,
     *   `corpusClasses`, `perFileClasses`, `topTerms`, `glossary`).
     */
    analyzeOrLoad(files, options) {
        return __awaiter(this, void 0, void 0, function* () {
            const inputRoot = options.input !== null && options.input !== void 0 ? options.input : "";
            const topN = options.corpus.topTerms;
            const profilePath = options.corpus.profilePath || `${options.output}.corpus-profile.json`;
            const store = new CorpusProfileStore_1.CorpusProfileStore(profilePath, this.logger);
            const key = this.computeKey(files, inputRoot, options, topN);
            const cached = yield store.load();
            if (cached && cached.key === key) {
                this.logger.info(`Reusing cached corpus profile (${cached.glossary.entityNames.length} names) from ${profilePath}`);
                return cached;
            }
            if (cached) {
                this.logger.info(`Corpus profile at ${profilePath} is stale (corpus/model changed); rebuilding`);
            }
            if (options.corpus.clustering) {
                this.logger.info("corpusClustering is not implemented yet (deferred to a follow-up); ignoring the flag");
            }
            // 1. Read (char-capped) + classify each file. The classifier call here is the
            //    expensive bit we cache; FileProcessor reuses perFileClasses downstream.
            const texts = [];
            const perFileClasses = {};
            let interrupted = false;
            for (const file of files) {
                if (shared_1.shutdown.isRequested()) {
                    this.logger.warn("Interrupted during corpus pre-pass; profiling partial corpus");
                    interrupted = true;
                    break;
                }
                const text = yield this.readCapped(file);
                if (!text)
                    continue;
                texts.push(text);
                if (this.classifier) {
                    try {
                        perFileClasses[(0, relPath_1.toRelPathId)(inputRoot, file)] =
                            yield this.classifier.classify(text, file);
                    }
                    catch (error) {
                        this.logger.warn(`Corpus pre-pass could not classify ${file}: ${error}`);
                    }
                }
            }
            // 2. Frequency + corpus-level class aggregate.
            const topTerms = (0, termFrequency_1.countTerms)(texts, { topN });
            const corpusClasses = aggregateClasses(Object.values(perFileClasses));
            // 3. One LLM call → glossary. A failure must NOT be cached as an empty
            //    glossary and reused forever — run this pass without one and skip
            //    persisting the sidecar so the next run retries (KG-02).
            let glossary;
            let glossaryOk = true;
            try {
                glossary = yield this.generateGlossary(corpusClasses, topTerms, texts);
            }
            catch (error) {
                this.logger.warn(`Corpus glossary generation failed; running without it and NOT caching ` +
                    `the profile (will rebuild next run): ${error}`);
                glossary = { entityNames: [], entityTypes: [], relationTypes: [] };
                glossaryOk = false;
            }
            const profile = {
                generatedAt: new Date().toISOString(),
                key,
                fileCount: files.length,
                corpusClasses,
                perFileClasses,
                topTerms,
                glossary,
            };
            if (glossaryOk && !interrupted) {
                yield store.save(profile);
                this.logger.info(`Corpus profile built: ${topTerms.length} top terms, glossary ` +
                    `${glossary.entityNames.length} names / ${glossary.entityTypes.length} types / ` +
                    `${glossary.relationTypes.length} relations → ${profilePath}`);
            }
            else if (interrupted) {
                // The key describes the FULL file list, so persisting a profile built
                // from only the files seen before the interrupt would make every later
                // run accept the partial profile as valid. Same rule as a failed glossary.
                this.logger.warn("Corpus pre-pass was interrupted; NOT caching the partial profile (will rebuild next run)");
            }
            return profile;
        });
    }
    /** Read a file via its reader and concatenate chunk text, capped. Non-fatal. */
    readCapped(file) {
        return __awaiter(this, void 0, void 0, function* () {
            try {
                const reader = this.readerFactory.getReader(file);
                if (!reader)
                    return "";
                const res = yield reader.read(file);
                const chunks = res.chunks !== null && res.chunks !== void 0 ? res.chunks : [];
                return chunks
                    .map((c) => c.content)
                    .join("\n")
                    .slice(0, PER_FILE_CHAR_CAP);
            }
            catch (error) {
                // Unreadable files are skipped by the caller (empty text), not fatal.
                this.logger.warn(`Corpus pre-pass could not read ${file}: ${error}`);
                return "";
            }
        });
    }
    /**
     * Validity key: model + topN + classifier mode + per-file (relpath, size,
     * mtime). Folding size+mtime makes the cache **content-sensitive** (KG-06):
     * editing a file invalidates its glossary, while a moved input tree / unchanged
     * content stays stable (relpath is input-relative). size+mtime is a cheap
     * `stat` proxy — deliberately *not* a byte hash, so expensive media readers
     * (PDF/audio) aren't re-invoked on every run just to compute the key.
     *
     * @returns Hex SHA-1 digest of the sorted per-file signatures plus settings.
     */
    computeKey(files, inputRoot, options, topN) {
        const entries = files
            .map((f) => {
            const rel = (0, relPath_1.toRelPathId)(inputRoot, f);
            let sig = "missing";
            try {
                const st = fs.statSync(f);
                sig = `${st.size}:${Math.round(st.mtimeMs)}`;
            }
            catch (_a) {
                // Non-existent / unreadable file: stable sentinel, never throws.
            }
            return `${rel}|${sig}`;
        })
            // Sorted so the key does not depend on discovery order.
            .sort();
        const hash = crypto.createHash("sha1");
        hash.update(`${options.llm.model} ${topN} ${options.classifier.mode} ${entries.length}\n${entries.join("\n")}`);
        return hash.digest("hex");
    }
    /**
     * Build the glossary prompt and make the single structured LLM call.
     *
     * @param corpusClasses Aggregated classes; only the top two go into the prompt.
     * @param topTerms `{ term, count }` entries listed in the prompt.
     * @param texts Per-file texts; the first three (600 chars each) become snippets.
     * @returns Promise of the normalized glossary. Rejects when the LLM call fails.
     */
    generateGlossary(corpusClasses, topTerms, texts) {
        return __awaiter(this, void 0, void 0, function* () {
            const classLine = corpusClasses.length > 0
                ? corpusClasses
                    .slice(0, 2)
                    .map((c) => `${c.class} (${c.confidence.toFixed(2)})`)
                    .join(", ")
                : "unknown";
            const termList = topTerms
                .map((t) => `${t.term} (${t.count})`)
                .join(", ");
            const snippets = texts
                .slice(0, 3)
                .map((t, i) => `--- sample ${i + 1} ---\n${t.slice(0, 600)}`)
                .join("\n\n");
            // Prefer the versioned glossary templates (v5); fall back to inline strings
            // when the current prompt version ships none (e.g. v4.5) or rendering fails.
            let rendered;
            if (this.promptManager != null) {
                try {
                    rendered = yield this.promptManager.getGlossaryPrompt({
                        classLine,
                        termList,
                        snippets,
                    });
                }
                catch (error) {
                    // A broken template must not cost us the whole glossary: use the
                    // inline prompt instead of propagating the render error.
                    this.logger.warn(`Glossary prompt template could not be rendered; using the inline fallback: ${error}`);
                }
            }
            const renderedSystem = rendered != null ? rendered.system : undefined;
            const renderedUser = rendered != null ? rendered.user : undefined;
            const system = renderedSystem != null ? renderedSystem : FALLBACK_GLOSSARY_SYSTEM;
            const user = renderedUser != null
                ? renderedUser
                : `Corpus content type: ${classLine}\n\n` +
                    `Most frequent terms (with counts):\n${termList}\n\n` +
                    `Representative snippets:\n${snippets}`;
            const messages = [
                { role: "system", content: system },
                { role: "user", content: user },
            ];
            // Let failures propagate: a failed glossary must NOT be cached as empty and
            // reused forever (KG-02). The caller (analyzeOrLoad) catches, runs this pass
            // without a glossary, and skips persisting the sidecar so the next run retries.
            const result = yield this.llm.generateStructured(messages, GlossarySchema);
            // Validate + normalize before it becomes the authoritative closed vocabulary
            // (KG-06): snake_case, dedupe, drop has_* predicates, cap to the prompt's
            // limits — so garbage glossary output can't get enforced as the Zod enum.
            return (0, normalizeGlossary_1.normalizeGlossary)(result);
        });
    }
}
249
+ exports.CorpusAnalyzer = CorpusAnalyzer;
250
/**
 * Average per-class confidence across files, sorted descending.
 *
 * @param perFile One entry per file, each a list of `{ class, confidence }`.
 *   Nullish entries and items without a finite numeric confidence are skipped,
 *   so a single bad classifier result cannot throw or turn an average into NaN.
 * @returns `{ class, confidence }` per distinct class, highest average first.
 */
function aggregateClasses(perFile) {
    const totals = new Map();
    for (const classes of perFile) {
        if (classes == null)
            continue;
        for (const c of classes) {
            if (c == null || !Number.isFinite(c.confidence))
                continue;
            const entry = totals.get(c.class);
            if (entry) {
                entry.sum += c.confidence;
                entry.n += 1;
            }
            else {
                totals.set(c.class, { sum: c.confidence, n: 1 });
            }
        }
    }
    // n >= 1 for every stored entry, so the division is always defined.
    return Array.from(totals.entries())
        .map(([cls, { sum, n }]) => ({ class: cls, confidence: sum / n }))
        .sort((a, b) => b.confidence - a.confidence);
}
266
+ //# sourceMappingURL=CorpusAnalyzer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"CorpusAnalyzer.js","sourceRoot":"","sources":["../../../src/core/corpus/CorpusAnalyzer.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,+CAAiC;AACjC,uCAAyB;AACzB,6BAAwB;AAYxB,yCAAgD;AAGhD,mDAA6C;AAC7C,6DAA0D;AAC1D,2DAAwD;AACxD,uCAAwC;AAExC,0FAA0F;AAC1F,MAAM,iBAAiB,GAAG,KAAM,CAAC;AAEjC,mFAAmF;AACnF,MAAM,wBAAwB,GAC5B,+EAA+E;IAC/E,gFAAgF;IAChF,iFAAiF;IACjF,8EAA8E;IAC9E,8EAA8E;IAC9E,gFAAgF;IAChF,4CAA4C,CAAC;AAE/C,MAAM,cAAc,GAAG,OAAC,CAAC,MAAM,CAAC;IAC9B,WAAW,EAAE,OAAC,CAAC,KAAK,CAAC,OAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,iDAAiD,CAAC;IAC5F,WAAW,EAAE,OAAC,CAAC,KAAK,CAAC,OAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,mDAAmD,CAAC;IAC9F,aAAa,EAAE,OAAC,CAAC,KAAK,CAAC,OAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,gDAAgD,CAAC;CAC9F,CAAC,CAAC;AAEH;;;;;GAKG;AACH,MAAa,cAAc;IACzB,YACmB,GAAiB,EACjB,UAA0C,EAC1C,aAAgC,EAChC,MAAc,EACd,aAA8B;QAJ9B,QAAG,GAAH,GAAG,CAAc;QACjB,eAAU,GAAV,UAAU,CAAgC;QAC1C,kBAAa,GAAb,aAAa,CAAmB;QAChC,WAAM,GAAN,MAAM,CAAQ;QACd,kBAAa,GAAb,aAAa,CAAiB;IAC9C,CAAC;IAEE,aAAa,CACjB,KAAe,EACf,OAA0B;;;YAE1B,MAAM,SAAS,GAAG,MAAA,OAAO,CAAC,KAAK,mCAAI,EAAE,CAAC;YACtC,MAAM,IAAI,GAAG,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC;YACrC,MAAM,WAAW,GACf,OAAO,CAAC,MAAM,CAAC,WAAW,IAAI,GAAG,OAAO,CAAC,MAAM,sBAAsB,CAAC;YACxE,MAAM,KAAK,GAAG,IAAI,uCAAkB,CAAC,WAAW,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;YAC/D,MAAM,GAAG,GAAG,IAAI,CAAC,UAAU,CAAC,KAAK,EAAE,SAAS,EAAE,OAAO,EAAE,IAAI,CAAC,CAAC;YAE7D,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,IAAI,EAAE,CAAC;YAClC,IAAI,MAAM,IAAI,MAAM,CAAC,GAAG,KAAK,GAAG,EAAE,CAAC;gBACjC,IAAI,CAAC,MAAM,CAAC,IAAI,CACd,kCAAkC,MAAM,CAAC,QAAQ,CAAC,WAAW,CAAC,MAAM,gBAAgB,WAAW,EAAE,CAClG,CAAC;gBACF,OAAO,MAAM,CAAC;YAChB,CAAC;YACD,IAAI,MAAM,EAAE,CAAC;gBACX,IAAI,CAAC,MAAM,CAAC,IAAI,CACd,qBAAqB,WAAW,8CAA8C,CAC/E,CAAC;YACJ,CAAC;YACD,IAAI,OAAO,CAAC,MAAM,CAAC,UAAU,EAAE,CAAC;gBAC9B,IAAI,CAAC,MAAM,CAAC,IAAI,CACd,sFAAsF,CACvF,CAAC;YACJ,CAAC;YAED,8EAA8E;YAC9E,6EAA6E;YAC7E,MAAM,KAAK,GAAa,EAAE,CAAC;YAC3B,MAAM,cAAc,GAA2C,EAAE,CAAC;YAClE,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;gBACzB,IAAI,
iBAAQ,CAAC,WAAW,EAAE,EAAE,CAAC;oBAC3B,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,8DAA8D,CAAC,CAAC;oBACjF,MAAM;gBACR,CAAC;gBACD,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;gBACzC,IAAI,CAAC,IAAI;oBAAE,SAAS;gBACpB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACjB,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;oBACpB,IAAI,CAAC;wBACH,cAAc,CAAC,IAAA,qBAAW,EAAC,SAAS,EAAE,IAAI,CAAC,CAAC;4BAC1C,MAAM,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;oBAC/C,CAAC;oBAAC,OAAO,KAAK,EAAE,CAAC;wBACf,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,sCAAsC,IAAI,KAAK,KAAK,EAAE,CAAC,CAAC;oBAC3E,CAAC;gBACH,CAAC;YACH,CAAC;YAED,+CAA+C;YAC/C,MAAM,QAAQ,GAAG,IAAA,0BAAU,EAAC,KAAK,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;YAC7C,MAAM,aAAa,GAAG,gBAAgB,CAAC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC;YAEtE,uEAAuE;YACvE,sEAAsE;YACtE,6DAA6D;YAC7D,IAAI,QAAwB,CAAC;YAC7B,IAAI,UAAU,GAAG,IAAI,CAAC;YACtB,IAAI,CAAC;gBACH,QAAQ,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,aAAa,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC;YACzE,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,IAAI,CAAC,MAAM,CAAC,IAAI,CACd,wEAAwE;oBACtE,wCAAwC,KAAK,EAAE,CAClD,CAAC;gBACF,QAAQ,GAAG,EAAE,WAAW,EAAE,EAAE,EAAE,WAAW,EAAE,EAAE,EAAE,aAAa,EAAE,EAAE,EAAE,CAAC;gBACnE,UAAU,GAAG,KAAK,CAAC;YACrB,CAAC;YAED,MAAM,OAAO,GAAkB;gBAC7B,WAAW,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;gBACrC,GAAG;gBACH,SAAS,EAAE,KAAK,CAAC,MAAM;gBACvB,aAAa;gBACb,cAAc;gBACd,QAAQ;gBACR,QAAQ;aACT,CAAC;YACF,IAAI,UAAU,EAAE,CAAC;gBACf,MAAM,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;gBAC1B,IAAI,CAAC,MAAM,CAAC,IAAI,CACd,yBAAyB,QAAQ,CAAC,MAAM,uBAAuB;oBAC7D,GAAG,QAAQ,CAAC,WAAW,CAAC,MAAM,YAAY,QAAQ,CAAC,WAAW,CAAC,MAAM,WAAW;oBAChF,GAAG,QAAQ,CAAC,aAAa,CAAC,MAAM,gBAAgB,WAAW,EAAE,CAChE,CAAC;YACJ,CAAC;YACD,OAAO,OAAO,CAAC;QACjB,CAAC;KAAA;IAED,gFAAgF;IAClE,UAAU,CAAC,IAAY;;;YACnC,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;gBAClD,IAAI,CAAC,MAAM;oBAAE,OAAO,EAAE,CAAC;gBACvB,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACpC,OAAO,CAAC,MAAA,GAAG,CAAC,MAAM,mCAAI,EAAE,CAAC;qBACtB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC;qBACrB,IAAI,CAAC,IAAI,CAAC;qBA
CV,KAAK,CAAC,CAAC,EAAE,iBAAiB,CAAC,CAAC;YACjC,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,kCAAkC,IAAI,KAAK,KAAK,EAAE,CAAC,CAAC;gBACrE,OAAO,EAAE,CAAC;YACZ,CAAC;QACH,CAAC;KAAA;IAED;;;;;;;OAOG;IACK,UAAU,CAChB,KAAe,EACf,SAAiB,EACjB,OAA0B,EAC1B,IAAY;QAEZ,MAAM,OAAO,GAAG,KAAK;aAClB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;YACT,MAAM,GAAG,GAAG,IAAA,qBAAW,EAAC,SAAS,EAAE,CAAC,CAAC,CAAC;YACtC,IAAI,GAAG,GAAG,SAAS,CAAC;YACpB,IAAI,CAAC;gBACH,MAAM,EAAE,GAAG,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;gBAC1B,GAAG,GAAG,GAAG,EAAE,CAAC,IAAI,IAAI,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,OAAO,CAAC,EAAE,CAAC;YAC/C,CAAC;YAAC,WAAM,CAAC;gBACP,iEAAiE;YACnE,CAAC;YACD,OAAO,GAAG,GAAG,IAAI,GAAG,EAAE,CAAC;QACzB,CAAC,CAAC;aACD,IAAI,EAAE,CAAC;QACV,MAAM,IAAI,GAAG,MAAM,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;QACvC,IAAI,CAAC,MAAM,CACT,GAAG,OAAO,CAAC,GAAG,CAAC,KAAK,IAAI,IAAI,IAAI,OAAO,CAAC,UAAU,CAAC,IAAI,IAAI,OAAO,CAAC,MAAM,KAAK,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CACnG,CAAC;QACF,OAAO,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IAC5B,CAAC;IAEa,gBAAgB,CAC5B,aAAqC,EACrC,QAAqB,EACrB,KAAe;;;YAEf,MAAM,SAAS,GACb,aAAa,CAAC,MAAM,GAAG,CAAC;gBACtB,CAAC,CAAC,aAAa;qBACV,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC;qBACX,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,KAAK,KAAK,CAAC,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;qBACrD,IAAI,CAAC,IAAI,CAAC;gBACf,CAAC,CAAC,SAAS,CAAC;YAChB,MAAM,QAAQ,GAAG,QAAQ;iBACtB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,KAAK,GAAG,CAAC;iBACpC,IAAI,CAAC,IAAI,CAAC,CAAC;YACd,MAAM,QAAQ,GAAG,KAAK;iBACnB,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC;iBACX,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC;iBAC5D,IAAI,CAAC,MAAM,CAAC,CAAC;YAEhB,4EAA4E;YAC5E,6EAA6E;YAC7E,MAAM,QAAQ,GAAG,MAAM,CAAA,MAAA,IAAI,CAAC,aAAa,0CAAE,iBAAiB,CAAC;gBAC3D,SAAS;gBACT,QAAQ;gBACR,QAAQ;aACT,CAAC,CAAA,CAAC;YACH,MAAM,MAAM,GAAG,MAAA,QAAQ,aAAR,QAAQ,uBAAR,QAAQ,CAAE,MAAM,mCAAI,wBAAwB,CAAC;YAC5D,MAAM,IAAI,GACR,MAAA,QAAQ,aAAR,QAAQ,uBAAR,QAAQ,CAAE,IAAI,mCACd,wBAAwB,SAAS,MAAM;gBACrC,u
CAAuC,QAAQ,MAAM;gBACrD,6BAA6B,QAAQ,EAAE,CAAC;YAE5C,MAAM,QAAQ,GAAiB;gBAC7B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,EAAE;gBACnC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,IAAI,EAAE;aAChC,CAAC;YAEF,4EAA4E;YAC5E,6EAA6E;YAC7E,gFAAgF;YAChF,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,kBAAkB,CAAC,QAAQ,EAAE,cAAc,CAAC,CAAC;YAC3E,6EAA6E;YAC7E,0EAA0E;YAC1E,0EAA0E;YAC1E,OAAO,IAAA,qCAAiB,EAAC,MAAM,CAAC,CAAC;QACnC,CAAC;KAAA;CACF;AAtMD,wCAsMC;AAED,oEAAoE;AACpE,SAAS,gBAAgB,CACvB,OAAiC;;IAEjC,MAAM,IAAI,GAAG,IAAI,GAAG,EAAsC,CAAC;IAC3D,KAAK,MAAM,OAAO,IAAI,OAAO,EAAE,CAAC;QAC9B,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;YACxB,MAAM,CAAC,GAAG,MAAA,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,mCAAI,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC;YAChD,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,UAAU,CAAC;YACtB,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YACT,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QACvB,CAAC;IACH,CAAC;IACD,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;SAC9B,GAAG,CAAC,CAAC,CAAC,GAAG,EAAE,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,UAAU,EAAE,GAAG,GAAG,CAAC,EAA2B,CAAA,CAAC;SACzF,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,GAAG,CAAC,CAAC,UAAU,CAAC,CAAC;AACjD,CAAC"}
@@ -0,0 +1,92 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
36
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
37
+ return new (P || (P = Promise))(function (resolve, reject) {
38
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
39
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
40
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
41
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
42
+ });
43
+ };
44
+ Object.defineProperty(exports, "__esModule", { value: true });
45
+ exports.CorpusProfileStore = void 0;
46
+ const fs = __importStar(require("fs"));
47
+ const path = __importStar(require("path"));
48
+ /**
49
+ * Load/save the corpus profile sidecar (`<output>.corpus-profile.json`). Like
50
+ * {@link CheckpointService}, a missing or unparseable file is non-fatal — the
51
+ * caller just rebuilds the profile.
52
+ */
53
+ class CorpusProfileStore {
54
+ constructor(path, logger) {
55
+ this.path = path;
56
+ this.logger = logger;
57
+ }
58
+ getPath() {
59
+ return this.path;
60
+ }
61
+ load() {
62
+ return __awaiter(this, void 0, void 0, function* () {
63
+ if (!fs.existsSync(this.path))
64
+ return undefined;
65
+ try {
66
+ const parsed = JSON.parse(yield fs.promises.readFile(this.path, "utf-8"));
67
+ if (parsed && typeof parsed === "object" && parsed.glossary && parsed.key) {
68
+ return parsed;
69
+ }
70
+ this.logger.warn(`Corpus profile at ${this.path} has an unexpected shape; ignoring`);
71
+ return undefined;
72
+ }
73
+ catch (error) {
74
+ this.logger.warn(`Could not read corpus profile at ${this.path} (ignored): ${error}`);
75
+ return undefined;
76
+ }
77
+ });
78
+ }
79
+ save(profile) {
80
+ return __awaiter(this, void 0, void 0, function* () {
81
+ // The pre-pass runs before the output directory is created (that happens at
82
+ // export time), so ensure the sidecar's parent dir exists first.
83
+ const dir = path.dirname(this.path);
84
+ if (dir && !fs.existsSync(dir)) {
85
+ yield fs.promises.mkdir(dir, { recursive: true });
86
+ }
87
+ yield fs.promises.writeFile(this.path, JSON.stringify(profile, null, 2));
88
+ });
89
+ }
90
+ }
91
+ exports.CorpusProfileStore = CorpusProfileStore;
92
+ //# sourceMappingURL=CorpusProfileStore.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"CorpusProfileStore.js","sourceRoot":"","sources":["../../../src/core/corpus/CorpusProfileStore.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,uCAAyB;AACzB,2CAA6B;AAI7B;;;;GAIG;AACH,MAAa,kBAAkB;IAC7B,YAA6B,IAAY,EAAmB,MAAc;QAA7C,SAAI,GAAJ,IAAI,CAAQ;QAAmB,WAAM,GAAN,MAAM,CAAQ;IAAG,CAAC;IAE9E,OAAO;QACL,OAAO,IAAI,CAAC,IAAI,CAAC;IACnB,CAAC;IAEK,IAAI;;YACR,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC;gBAAE,OAAO,SAAS,CAAC;YAChD,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,CAAC;gBAC1E,IAAI,MAAM,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,GAAG,EAAE,CAAC;oBAC1E,OAAO,MAAuB,CAAC;gBACjC,CAAC;gBACD,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,qBAAqB,IAAI,CAAC,IAAI,oCAAoC,CAAC,CAAC;gBACrF,OAAO,SAAS,CAAC;YACnB,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,oCAAoC,IAAI,CAAC,IAAI,eAAe,KAAK,EAAE,CAAC,CAAC;gBACtF,OAAO,SAAS,CAAC;YACnB,CAAC;QACH,CAAC;KAAA;IAEK,IAAI,CAAC,OAAsB;;YAC/B,4EAA4E;YAC5E,iEAAiE;YACjE,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACpC,IAAI,GAAG,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC/B,MAAM,EAAE,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YACpD,CAAC;YACD,MAAM,EAAE,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;QAC3E,CAAC;KAAA;CACF;AA/BD,gDA+BC"}
@@ -0,0 +1,21 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __exportStar = (this && this.__exportStar) || function(m, exports) {
14
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
15
+ };
16
+ Object.defineProperty(exports, "__esModule", { value: true });
17
+ __exportStar(require("./termFrequency"), exports);
18
+ __exportStar(require("./CorpusProfileStore"), exports);
19
+ __exportStar(require("./CorpusAnalyzer"), exports);
20
+ __exportStar(require("./relPath"), exports);
21
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/core/corpus/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;AAAA,kDAAgC;AAChC,uDAAqC;AACrC,mDAAiC;AACjC,4CAA0B"}
@@ -0,0 +1,60 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.normalizeGlossary = normalizeGlossary;
4
+ const DEFAULT_ENTITY_CAP = 20;
5
+ const DEFAULT_RELATION_CAP = 15;
6
/**
 * Lowercase + `snake_case` a type/predicate token; '' when nothing survives.
 *
 * Accented letters are folded to their base letter: NFKD splits them into
 * base + combining mark, and the mark is removed before the separator pass —
 * otherwise it would count as a separator and split the word (`naïve` →
 * `nai_ve`).
 *
 * @param raw Token as produced by the model (string).
 * @returns The token reduced to `[a-z0-9_]`, without leading/trailing `_`.
 */
function toSnakeType(raw) {
    return raw
        .normalize("NFKD")
        .replace(/[\u0300-\u036f]/g, "") // drop combining diacritical marks left by NFKD
        .toLowerCase()
        .replace(/[^a-z0-9]+/g, "_") // spaces, hyphens, slashes, unicode dashes → _
        .replace(/^_+|_+$/g, ""); // trim leading/trailing underscores
}
14
/**
 * Normalize a type/predicate list: snake_case, dedupe, reject has_*, cap.
 *
 * @param raw Candidate tokens; `null`/`undefined` is treated as an empty list.
 * @param cap Maximum number of entries to return; `cap <= 0` yields `[]`.
 * @param rejectHasPrefix When true, tokens starting with `has_` are dropped.
 * @returns Normalized tokens in first-seen order, at most `cap` of them.
 */
function normalizeTypeList(raw, cap, rejectHasPrefix) {
    const seen = new Set();
    const out = [];
    for (const item of raw !== null && raw !== void 0 ? raw : []) {
        // Check the cap BEFORE accepting another token, so a zero or negative
        // cap returns nothing instead of letting the first token through.
        if (out.length >= cap)
            break;
        const s = toSnakeType(item);
        if (!s)
            continue;
        // The banned attribute family (`has_format`, `has_length`, …). The legit
        // base predicate `has_attribute` is supplied by the base set downstream, so
        // dropping it from the glossary loses nothing.
        if (rejectHasPrefix && s.startsWith("has_"))
            continue;
        if (seen.has(s))
            continue;
        seen.add(s);
        out.push(s);
    }
    return out;
}
36
/**
 * Canonicalize + case-insensitively dedupe entity names, preserving first-seen
 * casing.
 *
 * Each name is NFC-normalized, has whitespace runs collapsed to one space, and
 * is trimmed before comparison, so spellings that differ only in spacing or in
 * Unicode composition count as the same name. Non-string entries are skipped.
 *
 * @param raw Candidate names; `null`/`undefined` is treated as an empty list.
 * @returns Unique cleaned names in first-seen order.
 */
function normalizeNames(raw) {
    const seen = new Set();
    const out = [];
    for (const name of raw !== null && raw !== void 0 ? raw : []) {
        if (typeof name !== "string")
            continue;
        const cleaned = name.normalize("NFC").replace(/\s+/g, " ").trim();
        if (!cleaned)
            continue;
        const key = cleaned.toLowerCase();
        if (seen.has(key))
            continue;
        seen.add(key);
        out.push(cleaned);
    }
    return out;
}
52
/**
 * Clean a raw glossary before it is used as the extraction vocabulary.
 *
 * Entity names go through `normalizeNames`; entity and relation types go
 * through `normalizeTypeList`, with `has_*` tokens rejected for relation types
 * only.
 *
 * @param raw Object with `entityNames`, `entityTypes` and `relationTypes` lists.
 * @param caps Optional `{ entityCap, relationCap }`; a missing (null/undefined)
 *   value falls back to DEFAULT_ENTITY_CAP / DEFAULT_RELATION_CAP.
 * @returns `{ entityNames, entityTypes, relationTypes }` after normalization.
 */
function normalizeGlossary(raw, caps = {}) {
    const entityNames = normalizeNames(raw.entityNames);
    const rawEntityTypes = raw.entityTypes;
    const requestedEntityCap = caps.entityCap;
    const entityCap = requestedEntityCap !== null && requestedEntityCap !== void 0
        ? requestedEntityCap
        : DEFAULT_ENTITY_CAP;
    const entityTypes = normalizeTypeList(rawEntityTypes, entityCap, false);
    const rawRelationTypes = raw.relationTypes;
    const requestedRelationCap = caps.relationCap;
    const relationCap = requestedRelationCap !== null && requestedRelationCap !== void 0
        ? requestedRelationCap
        : DEFAULT_RELATION_CAP;
    const relationTypes = normalizeTypeList(rawRelationTypes, relationCap, true);
    return { entityNames, entityTypes, relationTypes };
}
+ //# sourceMappingURL=normalizeGlossary.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"normalizeGlossary.js","sourceRoot":"","sources":["../../../src/core/corpus/normalizeGlossary.ts"],"names":[],"mappings":";;AA0EA,8CAiBC;AAnED,MAAM,kBAAkB,GAAG,EAAE,CAAC;AAC9B,MAAM,oBAAoB,GAAG,EAAE,CAAC;AAEhC,iFAAiF;AACjF,SAAS,WAAW,CAAC,GAAW;IAC9B,OAAO,GAAG;SACP,SAAS,CAAC,MAAM,CAAC;SACjB,WAAW,EAAE;SACb,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC,CAAC,+CAA+C;SAC3E,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC,CAAC,oCAAoC;AAClE,CAAC;AAED,8EAA8E;AAC9E,SAAS,iBAAiB,CACxB,GAAyB,EACzB,GAAW,EACX,eAAwB;IAExB,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAC/B,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB,KAAK,MAAM,IAAI,IAAI,GAAG,aAAH,GAAG,cAAH,GAAG,GAAI,EAAE,EAAE,CAAC;QAC7B,MAAM,CAAC,GAAG,WAAW,CAAC,IAAI,CAAC,CAAC;QAC5B,IAAI,CAAC,CAAC;YAAE,SAAS;QACjB,yEAAyE;QACzE,4EAA4E;QAC5E,+CAA+C;QAC/C,IAAI,eAAe,IAAI,CAAC,CAAC,UAAU,CAAC,MAAM,CAAC;YAAE,SAAS;QACtD,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;YAAE,SAAS;QAC1B,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QACZ,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACZ,IAAI,GAAG,CAAC,MAAM,IAAI,GAAG;YAAE,MAAM;IAC/B,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,iFAAiF;AACjF,SAAS,cAAc,CAAC,GAAyB;IAC/C,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAC/B,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB,KAAK,MAAM,IAAI,IAAI,GAAG,aAAH,GAAG,cAAH,GAAG,GAAI,EAAE,EAAE,CAAC;QAC7B,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;QACtB,IAAI,CAAC,CAAC;YAAE,SAAS;QACjB,MAAM,GAAG,GAAG,CAAC,CAAC,WAAW,EAAE,CAAC;QAC5B,IAAI,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC;YAAE,SAAS;QAC5B,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QACd,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACd,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAgB,iBAAiB,CAC/B,GAAmB,EACnB,OAAqB,EAAE;;IAEvB,OAAO;QACL,WAAW,EAAE,cAAc,CAAC,GAAG,CAAC,WAAW,CAAC;QAC5C,WAAW,EAAE,iBAAiB,CAC5B,GAAG,CAAC,WAAW,EACf,MAAA,IAAI,CAAC,SAAS,mCAAI,kBAAkB,EACpC,KAAK,CACN;QACD,aAAa,EAAE,iBAAiB,CAC9B,GAAG,CAAC,aAAa,EACjB,MAAA,IAAI,CAAC,WAAW,mCAAI,oBAAoB,EACxC,IAAI,CACL;KACF,CAAC;AACJ,CAAC"}
@@ -0,0 +1,52 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.toRelPathId = toRelPathId;
37
+ const path = __importStar(require("path"));
38
+ /**
39
+ * Stable identity for a file: its path relative to the discovery root
40
+ * (`options.input`), posix-normalized — so relocating the input tree doesn't
41
+ * invalidate per-file caching. Mirrors `KnowledgeGraphBuilder.stablePathId`.
42
+ * Falls back to the raw path when the file lies outside the root.
43
+ */
44
+ function toRelPathId(inputRoot, filePath) {
45
+ if (!inputRoot)
46
+ return filePath;
47
+ const rel = path.relative(inputRoot, filePath);
48
+ if (!rel || rel.startsWith("..") || path.isAbsolute(rel))
49
+ return filePath;
50
+ return rel.split(path.sep).join("/");
51
+ }
52
+ //# sourceMappingURL=relPath.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"relPath.js","sourceRoot":"","sources":["../../../src/core/corpus/relPath.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAQA,kCAKC;AAbD,2CAA6B;AAE7B;;;;;GAKG;AACH,SAAgB,WAAW,CAAC,SAAiB,EAAE,QAAgB;IAC7D,IAAI,CAAC,SAAS;QAAE,OAAO,QAAQ,CAAC;IAChC,MAAM,GAAG,GAAG,IAAI,CAAC,QAAQ,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;IAC/C,IAAI,CAAC,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;QAAE,OAAO,QAAQ,CAAC;IAC1E,OAAO,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AACvC,CAAC"}
@@ -0,0 +1,86 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.DEFAULT_STOPWORDS = void 0;
4
+ exports.countTerms = countTerms;
5
/**
 * Built-in English stopwords, in the order they are inserted into the exported
 * set. Deliberately short rather than exhaustive: the aim is only to keep the
 * top-N terms dominated by content words instead of glue. Inlined so that no
 * extra dependency is needed.
 */
const BUILTIN_STOPWORDS = [
    "the", "and", "for", "are", "but", "not", "you", "all", "any", "can",
    "had", "her", "was", "one", "our", "out", "his", "has", "him", "how",
    "its", "may", "new", "now", "old", "see", "two", "way", "who", "did",
    "get", "let", "put", "say", "she", "too", "use", "that", "this", "with",
    "have", "from", "they", "will", "would", "there", "their", "what", "about", "which",
    "when", "make", "like", "time", "just", "know", "into", "your", "some", "could",
    "them", "than", "then", "look", "only", "come", "over", "also", "back", "after",
    "work", "first", "well", "even", "want", "because", "these", "give", "most", "been",
    "were", "such", "very", "more", "much", "many", "here", "does", "each", "where",
    "while", "those", "being", "every", "should", "shall", "might", "must", "ever", "thing",
    "things", "really", "actually", "going", "kind", "okay", "yeah", "right", "lot", "got",
    "etc",
];
/** Default stopword set used when the caller supplies none. */
exports.DEFAULT_STOPWORDS = new Set(BUILTIN_STOPWORDS);
24
+ /** Capitalized multiword runs (2–4 words) → likely proper-noun entity names. */
25
+ const PROPER_NOUN_RE = /\b[A-Z][A-Za-z0-9'’]+(?:\s+[A-Z][A-Za-z0-9'’]+){1,3}\b/g;
26
+ /** Word-ish single tokens: lowercase letters/digits, with apostrophes, '-' or '_' allowed only in the interior. */
27
+ const WORD_RE = /[a-z0-9](?:[a-z0-9'’_-]*[a-z0-9])?/g;
28
/**
 * Lowercased opener words (determiners, pronouns, conjunctions, prepositions)
 * that are capitalized only because they begin a sentence and therefore get
 * swept into a proper-noun run by mistake (e.g. "The Naive Bayes Classifier").
 * Intentionally small: stopwords that can genuinely begin a name (e.g. "New"
 * in "New York") are left out.
 */
const LEADING_TRIM = new Set([
    "the", "a", "an", "this", "that", "these", "those", "we", "you", "they",
    "it", "he", "she", "but", "and", "or", "so", "then", "if", "when", "while",
    "as", "in", "on", "at", "for", "to", "of",
]);
/**
 * Strip leading opener words from a capitalized run.
 *
 * @param {string} run A whitespace-separated run of words.
 * @returns {string|undefined} The remaining words joined by single spaces, or
 *   `undefined` when fewer than two words are left.
 */
function normalizeProperNoun(run) {
    const tokens = run.trim().split(/\s+/);
    // Advance past every opener at the front instead of mutating the array.
    let start = 0;
    while (start < tokens.length && LEADING_TRIM.has(tokens[start].toLowerCase())) {
        start += 1;
    }
    const kept = tokens.slice(start);
    if (kept.length < 2) {
        return undefined;
    }
    return kept.join(" ");
}
47
/**
 * Count term frequency across a set of texts and return the top-N ranked terms.
 *
 * Two signals are merged into one tally: lowercased single content words
 * (stopwords, pure numbers and words shorter than `minLength` are dropped) and
 * original-cased capitalized multiword runs (proper-noun candidates).
 *
 * Ranking is by descending count. Ties break alphabetically under a fixed
 * ("en") collation, so the order does not depend on the host's default locale.
 *
 * @param {Iterable<string>} texts Texts to scan; falsy entries are skipped.
 * @param {object} [options]
 * @param {number} [options.topN=100] Maximum number of terms returned; values
 *   below 0 are treated as 0.
 * @param {number} [options.minLength=3] Minimum length of a single-word term.
 * @param {{has: function(string): boolean}} [options.stopwords] Words to
 *   ignore; defaults to `DEFAULT_STOPWORDS`.
 * @returns {Array<{term: string, count: number}>} Ranked terms, at most `topN`.
 */
function countTerms(texts, options = {}) {
    const topN = options.topN != null ? options.topN : 100;
    const minLength = options.minLength != null ? options.minLength : 3;
    const stop = options.stopwords != null ? options.stopwords : exports.DEFAULT_STOPWORDS;
    // A negative limit would make `slice` count from the end and hand back
    // "all but the last |topN|" terms; clamp it so it means "no terms".
    const limit = Math.max(0, topN);
    const counts = new Map();
    const bump = (term) => counts.set(term, (counts.get(term) || 0) + 1);
    for (const text of texts) {
        if (!text) {
            continue;
        }
        // Proper-noun candidates keep their original casing (canonical names).
        for (const run of text.match(PROPER_NOUN_RE) || []) {
            const proper = normalizeProperNoun(run);
            if (proper) {
                bump(proper);
            }
        }
        // Single content words, lowercased.
        for (const word of text.toLowerCase().match(WORD_RE) || []) {
            if (word.length < minLength) {
                continue;
            }
            if (/^\d+$/.test(word)) {
                continue; // pure numbers carry no naming signal
            }
            if (stop.has(word)) {
                continue;
            }
            bump(word);
        }
    }
    const byRank = (a, b) => {
        if (a.count !== b.count) {
            return b.count - a.count;
        }
        // Pinned collation keeps ties stable across machines; the code-unit
        // comparison settles distinct terms the collator reports as equal.
        const collated = a.term.localeCompare(b.term, "en");
        if (collated !== 0) {
            return collated;
        }
        return a.term < b.term ? -1 : a.term > b.term ? 1 : 0;
    };
    return Array.from(counts, ([term, count]) => ({ term, count }))
        .sort(byRank)
        .slice(0, limit);
}
86
+ //# sourceMappingURL=termFrequency.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"termFrequency.js","sourceRoot":"","sources":["../../../src/core/corpus/termFrequency.ts"],"names":[],"mappings":";;;AA6DA,gCAiCC;AA5FD;;;;GAIG;AACU,QAAA,iBAAiB,GAAG,IAAI,GAAG,CAAS;IAC/C,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK;IAC3E,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK;IAC3E,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK;IAC3E,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM;IAC1E,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM;IAC3E,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM;IACvE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO;IACvE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,SAAS,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM;IAC3E,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM;IAC9E,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO;IACvE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,UAAU,EAAE,OAAO,EAAE,MAAM;IACxE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK;CAC7C,CAAC,CAAC;AAQH,gFAAgF;AAChF,MAAM,cAAc,GAAG,yDAAyD,CAAC;AACjF,8DAA8D;AAC9D,MAAM,OAAO,GAAG,qCAAqC,CAAC;AAEtD;;;;;GAKG;AACH,MAAM,YAAY,GAAG,IAAI,GAAG,CAAC;IAC3B,KAAK,EAAE,GAAG,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM;IACvE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO;IAC1E,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI;CAC1C,CAAC,CAAC;AAEH,mFAAmF;AACnF,SAAS,mBAAmB,CAAC,GAAW;IACtC,MAAM,KAAK,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IACtC,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,YAAY,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,EAAE,CA
AC;QACpE,KAAK,CAAC,KAAK,EAAE,CAAC;IAChB,CAAC;IACD,OAAO,KAAK,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;AACzD,CAAC;AAED;;;;;;GAMG;AACH,SAAgB,UAAU,CACxB,KAAe,EACf,UAA6B,EAAE;;IAE/B,MAAM,IAAI,GAAG,MAAA,OAAO,CAAC,IAAI,mCAAI,GAAG,CAAC;IACjC,MAAM,SAAS,GAAG,MAAA,OAAO,CAAC,SAAS,mCAAI,CAAC,CAAC;IACzC,MAAM,IAAI,GAAG,MAAA,OAAO,CAAC,SAAS,mCAAI,yBAAiB,CAAC;IAEpD,MAAM,MAAM,GAAG,IAAI,GAAG,EAAkB,CAAC;IACzC,MAAM,IAAI,GAAG,CAAC,IAAY,EAAE,EAAE,WAAC,OAAA,MAAM,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,MAAA,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,mCAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA,EAAA,CAAC;IAE7E,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,CAAC,IAAI;YAAE,SAAS;QAEpB,uEAAuE;QACvE,KAAK,MAAM,CAAC,IAAI,MAAA,IAAI,CAAC,KAAK,CAAC,cAAc,CAAC,mCAAI,EAAE,EAAE,CAAC;YACjD,MAAM,MAAM,GAAG,mBAAmB,CAAC,CAAC,CAAC,CAAC;YACtC,IAAI,MAAM;gBAAE,IAAI,CAAC,MAAM,CAAC,CAAC;QAC3B,CAAC;QAED,oCAAoC;QACpC,KAAK,MAAM,CAAC,IAAI,MAAA,IAAI,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,OAAO,CAAC,mCAAI,EAAE,EAAE,CAAC;YACxD,IAAI,CAAC,CAAC,MAAM,GAAG,SAAS;gBAAE,SAAS;YACnC,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC;gBAAE,SAAS,CAAC,sCAAsC;YACrE,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;gBAAE,SAAS;YAC1B,IAAI,CAAC,CAAC,CAAC,CAAC;QACV,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;SAChC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC;SACzC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;SACjE,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;AACpB,CAAC"}