kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,169 @@
1
+ //! Main extraction configuration struct.
2
+ //!
3
+ //! This module contains the main `ExtractionConfig` struct that aggregates all
4
+ //! configuration options for the extraction process.
5
+
6
+ use serde::{Deserialize, Serialize};
7
+
8
+ use super::super::formats::OutputFormat;
9
+ use super::super::ocr::OcrConfig;
10
+ use super::super::page::PageConfig;
11
+ use super::super::processing::{ChunkingConfig, PostProcessorConfig};
12
+ use super::types::{ImageExtractionConfig, LanguageDetectionConfig, TokenReductionConfig};
13
+
14
+ /// Main extraction configuration.
15
+ ///
16
+ /// This struct contains all configuration options for the extraction process.
17
+ /// It can be loaded from TOML, YAML, or JSON files, or created programmatically.
18
+ ///
19
+ /// # Example
20
+ ///
21
+ /// ```rust
22
+ /// use kreuzberg::core::config::ExtractionConfig;
23
+ ///
24
+ /// // Create with defaults
25
+ /// let config = ExtractionConfig::default();
26
+ ///
27
+ /// // Load from TOML file
28
+ /// // let config = ExtractionConfig::from_toml_file("kreuzberg.toml")?;
29
+ /// ```
30
+ #[derive(Debug, Clone, Serialize, Deserialize)]
31
+ pub struct ExtractionConfig {
32
+ /// Enable caching of extraction results
33
+ #[serde(default = "default_true")]
34
+ pub use_cache: bool,
35
+
36
+ /// Enable quality post-processing
37
+ #[serde(default = "default_true")]
38
+ pub enable_quality_processing: bool,
39
+
40
+ /// OCR configuration (None = OCR disabled)
41
+ #[serde(default)]
42
+ pub ocr: Option<OcrConfig>,
43
+
44
+ /// Force OCR even for searchable PDFs
45
+ #[serde(default)]
46
+ pub force_ocr: bool,
47
+
48
+ /// Text chunking configuration (None = chunking disabled)
49
+ #[serde(default)]
50
+ pub chunking: Option<ChunkingConfig>,
51
+
52
+ /// Image extraction configuration (None = no image extraction)
53
+ #[serde(default)]
54
+ pub images: Option<ImageExtractionConfig>,
55
+
56
+ /// PDF-specific options (None = use defaults)
57
+ #[cfg(feature = "pdf")]
58
+ #[serde(default)]
59
+ pub pdf_options: Option<super::super::pdf::PdfConfig>,
60
+
61
+ /// Token reduction configuration (None = no token reduction)
62
+ #[serde(default)]
63
+ pub token_reduction: Option<TokenReductionConfig>,
64
+
65
+ /// Language detection configuration (None = no language detection)
66
+ #[serde(default)]
67
+ pub language_detection: Option<LanguageDetectionConfig>,
68
+
69
+ /// Page extraction configuration (None = no page tracking)
70
+ #[serde(default)]
71
+ pub pages: Option<PageConfig>,
72
+
73
+ /// Keyword extraction configuration (None = no keyword extraction)
74
+ #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
75
+ #[serde(default)]
76
+ pub keywords: Option<crate::keywords::KeywordConfig>,
77
+
78
+ /// Post-processor configuration (None = use defaults)
79
+ #[serde(default)]
80
+ pub postprocessor: Option<PostProcessorConfig>,
81
+
82
+ /// HTML to Markdown conversion options (None = use defaults)
83
+ ///
84
+ /// Configure how HTML documents are converted to Markdown, including heading styles,
85
+ /// list formatting, code block styles, and preprocessing options.
86
+ #[cfg(feature = "html")]
87
+ #[serde(default)]
88
+ pub html_options: Option<html_to_markdown_rs::ConversionOptions>,
89
+
90
+ /// Maximum concurrent extractions in batch operations (None = num_cpus * 2).
91
+ ///
92
+ /// Limits parallelism to prevent resource exhaustion when processing
93
+ /// large batches. Defaults to twice the number of CPU cores.
94
+ #[serde(default)]
95
+ pub max_concurrent_extractions: Option<usize>,
96
+
97
+ /// Result structure format
98
+ ///
99
+ /// Controls whether results are returned in unified format (default) with all
100
+ /// content in the `content` field, or element-based format with semantic
101
+ /// elements (for Unstructured-compatible output).
102
+ #[serde(default)]
103
+ pub result_format: crate::types::OutputFormat,
104
+
105
+ /// Content text format (default: Plain).
106
+ ///
107
+ /// Controls the format of the extracted content:
108
+ /// - `Plain`: Raw extracted text (default)
109
+ /// - `Markdown`: Markdown formatted output
110
+ /// - `Djot`: Djot markup format (requires djot feature)
111
+ /// - `Html`: HTML formatted output
112
+ ///
113
+ /// When set to a structured format, extraction results will include
114
+ /// formatted output. The `formatted_content` field may be populated
115
+ /// when format conversion is applied.
116
+ #[serde(default)]
117
+ pub output_format: OutputFormat,
118
+ }
119
+
120
+ impl Default for ExtractionConfig {
121
+ fn default() -> Self {
122
+ Self {
123
+ use_cache: true,
124
+ enable_quality_processing: true,
125
+ ocr: None,
126
+ force_ocr: false,
127
+ chunking: None,
128
+ images: None,
129
+ #[cfg(feature = "pdf")]
130
+ pdf_options: None,
131
+ token_reduction: None,
132
+ language_detection: None,
133
+ pages: None,
134
+ #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
135
+ keywords: None,
136
+ postprocessor: None,
137
+ #[cfg(feature = "html")]
138
+ html_options: None,
139
+ max_concurrent_extractions: None,
140
+ result_format: crate::types::OutputFormat::Unified,
141
+ output_format: OutputFormat::Plain,
142
+ }
143
+ }
144
+ }
145
+
146
+ impl ExtractionConfig {
147
+ /// Check if image processing is needed by examining OCR and image extraction settings.
148
+ ///
149
+ /// Returns `true` if either OCR is enabled or image extraction is configured,
150
+ /// indicating that image decompression and processing should occur.
151
+ /// Returns `false` if both are disabled, allowing optimization to skip unnecessary
152
+ /// image decompression for text-only extraction workflows.
153
+ ///
154
+ /// # Optimization Impact
155
+ /// For text-only extractions (no OCR, no image extraction), skipping image
156
+ /// decompression can improve CPU utilization by 5-10% by avoiding wasteful
157
+ /// image I/O and processing when results won't be used.
158
+ pub fn needs_image_processing(&self) -> bool {
159
+ let ocr_enabled = self.ocr.is_some();
160
+
161
+ let image_extraction_enabled = self.images.as_ref().map(|i| i.extract_images).unwrap_or(false);
162
+
163
+ ocr_enabled || image_extraction_enabled
164
+ }
165
+ }
166
+
167
/// Serde default provider that yields `true`.
///
/// Referenced by name from `#[serde(default = "default_true")]` on boolean
/// fields that should be enabled when absent from the input.
fn default_true() -> bool {
    true
}
@@ -0,0 +1,179 @@
1
+ //! Environment variable override support for extraction configuration.
2
+ //!
3
+ //! This module provides functionality to apply environment variable overrides
4
+ //! to extraction configuration, allowing runtime configuration changes.
5
+
6
+ use crate::{KreuzbergError, Result};
7
+
8
+ use super::super::ocr::OcrConfig;
9
+ use super::super::processing::ChunkingConfig;
10
+ use super::core::ExtractionConfig;
11
+ use super::types::TokenReductionConfig;
12
+
13
+ impl ExtractionConfig {
14
+ /// Apply environment variable overrides to configuration.
15
+ ///
16
+ /// Environment variables have the highest precedence and will override any values
17
+ /// loaded from configuration files. This method supports the following environment variables:
18
+ ///
19
+ /// - `KREUZBERG_OCR_LANGUAGE`: OCR language (ISO 639-1 or 639-3 code, e.g., "eng", "fra", "deu")
20
+ /// - `KREUZBERG_OCR_BACKEND`: OCR backend ("tesseract", "easyocr", or "paddleocr")
21
+ /// - `KREUZBERG_CHUNKING_MAX_CHARS`: Maximum characters per chunk (positive integer)
22
+ /// - `KREUZBERG_CHUNKING_MAX_OVERLAP`: Maximum overlap between chunks (non-negative integer)
23
+ /// - `KREUZBERG_CACHE_ENABLED`: Cache enabled flag ("true" or "false")
24
+ /// - `KREUZBERG_TOKEN_REDUCTION_MODE`: Token reduction mode ("off", "light", "moderate", "aggressive", or "maximum")
25
+ ///
26
+ /// # Behavior
27
+ ///
28
+ /// - If an environment variable is set and valid, it overrides the current configuration value
29
+ /// - If a required parent config is `None` (e.g., `self.ocr` is None), it's created with defaults before applying the override
30
+ /// - Invalid values return a `KreuzbergError::Validation` with helpful error messages
31
+ /// - Missing or unset environment variables are silently ignored
32
+ ///
33
+ /// # Example
34
+ ///
35
+ /// ```rust
36
+ /// # use kreuzberg::core::config::ExtractionConfig;
37
+ /// # fn example() -> kreuzberg::Result<()> {
38
+ /// let mut config = ExtractionConfig::from_file("config.toml")?;
39
+ /// // Set KREUZBERG_OCR_LANGUAGE=fra before calling
40
+ /// config.apply_env_overrides()?; // OCR language is now "fra"
41
+ /// # Ok(())
42
+ /// # }
43
+ /// ```
44
+ ///
45
+ /// # Errors
46
+ ///
47
+ /// Returns `KreuzbergError::Validation` if:
48
+ /// - An environment variable contains an invalid value
49
+ /// - A number cannot be parsed as the expected type
50
+ /// - A boolean is not "true" or "false"
51
+ pub fn apply_env_overrides(&mut self) -> Result<()> {
52
+ use crate::core::config_validation::{
53
+ validate_chunking_params, validate_language_code, validate_ocr_backend, validate_token_reduction_level,
54
+ };
55
+
56
+ // KREUZBERG_OCR_LANGUAGE override
57
+ if let Ok(lang) = std::env::var("KREUZBERG_OCR_LANGUAGE") {
58
+ validate_language_code(&lang)?;
59
+ if self.ocr.is_none() {
60
+ self.ocr = Some(OcrConfig::default());
61
+ }
62
+ if let Some(ref mut ocr) = self.ocr {
63
+ ocr.language = lang;
64
+ }
65
+ }
66
+
67
+ // KREUZBERG_OCR_BACKEND override
68
+ if let Ok(backend) = std::env::var("KREUZBERG_OCR_BACKEND") {
69
+ validate_ocr_backend(&backend)?;
70
+ if self.ocr.is_none() {
71
+ self.ocr = Some(OcrConfig::default());
72
+ }
73
+ if let Some(ref mut ocr) = self.ocr {
74
+ ocr.backend = backend;
75
+ }
76
+ }
77
+
78
+ // KREUZBERG_CHUNKING_MAX_CHARS override
79
+ if let Ok(max_chars_str) = std::env::var("KREUZBERG_CHUNKING_MAX_CHARS") {
80
+ let max_chars: usize = max_chars_str.parse().map_err(|_| KreuzbergError::Validation {
81
+ message: format!(
82
+ "Invalid value for KREUZBERG_CHUNKING_MAX_CHARS: '{}'. Must be a positive integer.",
83
+ max_chars_str
84
+ ),
85
+ source: None,
86
+ })?;
87
+
88
+ if max_chars == 0 {
89
+ return Err(KreuzbergError::Validation {
90
+ message: "KREUZBERG_CHUNKING_MAX_CHARS must be greater than 0".to_string(),
91
+ source: None,
92
+ });
93
+ }
94
+
95
+ if self.chunking.is_none() {
96
+ self.chunking = Some(ChunkingConfig {
97
+ max_chars: 1000,
98
+ max_overlap: 200,
99
+ embedding: None,
100
+ preset: None,
101
+ });
102
+ }
103
+
104
+ if let Some(ref mut chunking) = self.chunking {
105
+ // Validate against current overlap before updating
106
+ validate_chunking_params(max_chars, chunking.max_overlap)?;
107
+ chunking.max_chars = max_chars;
108
+ }
109
+ }
110
+
111
+ // KREUZBERG_CHUNKING_MAX_OVERLAP override
112
+ if let Ok(max_overlap_str) = std::env::var("KREUZBERG_CHUNKING_MAX_OVERLAP") {
113
+ let max_overlap: usize = max_overlap_str.parse().map_err(|_| KreuzbergError::Validation {
114
+ message: format!(
115
+ "Invalid value for KREUZBERG_CHUNKING_MAX_OVERLAP: '{}'. Must be a non-negative integer.",
116
+ max_overlap_str
117
+ ),
118
+ source: None,
119
+ })?;
120
+
121
+ if self.chunking.is_none() {
122
+ self.chunking = Some(ChunkingConfig {
123
+ max_chars: 1000,
124
+ max_overlap: 200,
125
+ embedding: None,
126
+ preset: None,
127
+ });
128
+ }
129
+
130
+ if let Some(ref mut chunking) = self.chunking {
131
+ // Validate against current max_chars before updating
132
+ validate_chunking_params(chunking.max_chars, max_overlap)?;
133
+ chunking.max_overlap = max_overlap;
134
+ }
135
+ }
136
+
137
+ // KREUZBERG_CACHE_ENABLED override
138
+ if let Ok(cache_str) = std::env::var("KREUZBERG_CACHE_ENABLED") {
139
+ let cache_enabled = match cache_str.to_lowercase().as_str() {
140
+ "true" => true,
141
+ "false" => false,
142
+ _ => {
143
+ return Err(KreuzbergError::Validation {
144
+ message: format!(
145
+ "Invalid value for KREUZBERG_CACHE_ENABLED: '{}'. Must be 'true' or 'false'.",
146
+ cache_str
147
+ ),
148
+ source: None,
149
+ });
150
+ }
151
+ };
152
+ self.use_cache = cache_enabled;
153
+ }
154
+
155
+ // KREUZBERG_TOKEN_REDUCTION_MODE override
156
+ if let Ok(mode) = std::env::var("KREUZBERG_TOKEN_REDUCTION_MODE") {
157
+ validate_token_reduction_level(&mode)?;
158
+ if self.token_reduction.is_none() {
159
+ self.token_reduction = Some(TokenReductionConfig {
160
+ mode: "off".to_string(),
161
+ preserve_important_words: true,
162
+ });
163
+ }
164
+ if let Some(ref mut token_reduction) = self.token_reduction {
165
+ token_reduction.mode = mode;
166
+ }
167
+ }
168
+
169
+ // KREUZBERG_OUTPUT_FORMAT override
170
+ if let Ok(val) = std::env::var("KREUZBERG_OUTPUT_FORMAT") {
171
+ self.output_format = val.parse().map_err(|e: String| KreuzbergError::Validation {
172
+ message: format!("Invalid value for KREUZBERG_OUTPUT_FORMAT: {}", e),
173
+ source: None,
174
+ })?;
175
+ }
176
+
177
+ Ok(())
178
+ }
179
+ }
@@ -0,0 +1,204 @@
1
+ //! Configuration file loading with caching support.
2
+ //!
3
+ //! This module provides methods for loading extraction configuration from various
4
+ //! file formats (TOML, YAML, JSON) with automatic caching based on file modification times.
5
+
6
+ use crate::{KreuzbergError, Result};
7
+ use dashmap::DashMap;
8
+ use std::path::{Path, PathBuf};
9
+ use std::sync::{Arc, LazyLock};
10
+ use std::time::SystemTime;
11
+
12
+ use super::core::ExtractionConfig;
13
+
14
+ static CONFIG_CACHE: LazyLock<DashMap<PathBuf, (SystemTime, Arc<ExtractionConfig>)>> = LazyLock::new(DashMap::new);
15
+
16
+ impl ExtractionConfig {
17
+ /// Load configuration from a TOML file.
18
+ ///
19
+ /// # Arguments
20
+ ///
21
+ /// * `path` - Path to the TOML file
22
+ ///
23
+ /// # Errors
24
+ ///
25
+ /// Returns `KreuzbergError::Validation` if file doesn't exist or is invalid TOML.
26
+ pub fn from_toml_file(path: impl AsRef<Path>) -> Result<Self> {
27
+ let path = path.as_ref();
28
+
29
+ let metadata = std::fs::metadata(path)
30
+ .map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
31
+ let mtime = metadata.modified().map_err(|e| {
32
+ KreuzbergError::validation(format!("Failed to get modification time for {}: {}", path.display(), e))
33
+ })?;
34
+
35
+ if let Some(entry) = CONFIG_CACHE.get(path)
36
+ && entry.0 == mtime
37
+ {
38
+ return Ok((*entry.1).clone());
39
+ }
40
+
41
+ let content = std::fs::read_to_string(path)
42
+ .map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
43
+
44
+ let config: Self = toml::from_str(&content)
45
+ .map_err(|e| KreuzbergError::validation(format!("Invalid TOML in {}: {}", path.display(), e)))?;
46
+
47
+ let config_arc = Arc::new(config.clone());
48
+ CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
49
+
50
+ Ok(config)
51
+ }
52
+
53
+ /// Load configuration from a YAML file.
54
+ pub fn from_yaml_file(path: impl AsRef<Path>) -> Result<Self> {
55
+ let path = path.as_ref();
56
+
57
+ let metadata = std::fs::metadata(path)
58
+ .map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
59
+ let mtime = metadata.modified().map_err(|e| {
60
+ KreuzbergError::validation(format!("Failed to get modification time for {}: {}", path.display(), e))
61
+ })?;
62
+
63
+ if let Some(entry) = CONFIG_CACHE.get(path)
64
+ && entry.0 == mtime
65
+ {
66
+ return Ok((*entry.1).clone());
67
+ }
68
+
69
+ let content = std::fs::read_to_string(path)
70
+ .map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
71
+
72
+ let config: Self = serde_yaml_ng::from_str(&content)
73
+ .map_err(|e| KreuzbergError::validation(format!("Invalid YAML in {}: {}", path.display(), e)))?;
74
+
75
+ let config_arc = Arc::new(config.clone());
76
+ CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
77
+
78
+ Ok(config)
79
+ }
80
+
81
+ /// Load configuration from a JSON file.
82
+ pub fn from_json_file(path: impl AsRef<Path>) -> Result<Self> {
83
+ let path = path.as_ref();
84
+
85
+ let metadata = std::fs::metadata(path)
86
+ .map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
87
+ let mtime = metadata.modified().map_err(|e| {
88
+ KreuzbergError::validation(format!("Failed to get modification time for {}: {}", path.display(), e))
89
+ })?;
90
+
91
+ if let Some(entry) = CONFIG_CACHE.get(path)
92
+ && entry.0 == mtime
93
+ {
94
+ return Ok((*entry.1).clone());
95
+ }
96
+
97
+ let content = std::fs::read_to_string(path)
98
+ .map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
99
+
100
+ let config: Self = serde_json::from_str(&content)
101
+ .map_err(|e| KreuzbergError::validation(format!("Invalid JSON in {}: {}", path.display(), e)))?;
102
+
103
+ let config_arc = Arc::new(config.clone());
104
+ CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
105
+
106
+ Ok(config)
107
+ }
108
+
109
+ /// Load configuration from a file, auto-detecting format by extension.
110
+ ///
111
+ /// Supported formats:
112
+ /// - `.toml` - TOML format
113
+ /// - `.yaml` - YAML format
114
+ /// - `.json` - JSON format
115
+ ///
116
+ /// # Arguments
117
+ ///
118
+ /// * `path` - Path to the configuration file
119
+ ///
120
+ /// # Errors
121
+ ///
122
+ /// Returns `KreuzbergError::Validation` if:
123
+ /// - File doesn't exist
124
+ /// - File extension is not supported
125
+ /// - File content is invalid for the detected format
126
+ ///
127
+ /// # Example
128
+ ///
129
+ /// ```rust
130
+ /// use kreuzberg::core::config::ExtractionConfig;
131
+ ///
132
+ /// // Auto-detects TOML format
133
+ /// // let config = ExtractionConfig::from_file("kreuzberg.toml")?;
134
+ ///
135
+ /// // Auto-detects YAML format
136
+ /// // let config = ExtractionConfig::from_file("kreuzberg.yaml")?;
137
+ /// ```
138
+ pub fn from_file(path: impl AsRef<Path>) -> Result<Self> {
139
+ let path = path.as_ref();
140
+
141
+ let metadata = std::fs::metadata(path)
142
+ .map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
143
+ let mtime = metadata.modified().map_err(|e| {
144
+ KreuzbergError::validation(format!("Failed to get modification time for {}: {}", path.display(), e))
145
+ })?;
146
+
147
+ if let Some(entry) = CONFIG_CACHE.get(path)
148
+ && entry.0 == mtime
149
+ {
150
+ return Ok((*entry.1).clone());
151
+ }
152
+
153
+ let extension = path.extension().and_then(|ext| ext.to_str()).ok_or_else(|| {
154
+ KreuzbergError::validation(format!(
155
+ "Cannot determine file format: no extension found in {}",
156
+ path.display()
157
+ ))
158
+ })?;
159
+
160
+ let config = match extension.to_lowercase().as_str() {
161
+ "toml" => Self::from_toml_file(path)?,
162
+ "yaml" | "yml" => Self::from_yaml_file(path)?,
163
+ "json" => Self::from_json_file(path)?,
164
+ _ => {
165
+ return Err(KreuzbergError::validation(format!(
166
+ "Unsupported config file format: .{}. Supported formats: .toml, .yaml, .json",
167
+ extension
168
+ )));
169
+ }
170
+ };
171
+
172
+ let config_arc = Arc::new(config.clone());
173
+ CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
174
+
175
+ Ok(config)
176
+ }
177
+
178
+ /// Discover configuration file in parent directories.
179
+ ///
180
+ /// Searches for `kreuzberg.toml` in current directory and parent directories.
181
+ ///
182
+ /// # Returns
183
+ ///
184
+ /// - `Some(config)` if found
185
+ /// - `None` if no config file found
186
+ pub fn discover() -> Result<Option<Self>> {
187
+ let mut current = std::env::current_dir().map_err(KreuzbergError::Io)?;
188
+
189
+ loop {
190
+ let kreuzberg_toml = current.join("kreuzberg.toml");
191
+ if kreuzberg_toml.exists() {
192
+ return Ok(Some(Self::from_toml_file(kreuzberg_toml)?));
193
+ }
194
+
195
+ if let Some(parent) = current.parent() {
196
+ current = parent.to_path_buf();
197
+ } else {
198
+ break;
199
+ }
200
+ }
201
+
202
+ Ok(None)
203
+ }
204
+ }
@@ -0,0 +1,42 @@
1
+ //! Main extraction configuration and environment variable handling.
2
+ //!
3
+ //! This module contains the main `ExtractionConfig` struct and related utilities
4
+ //! for loading configuration from files and applying environment variable overrides.
5
+ //!
6
+ //! The module is organized into focused submodules:
7
+ //! - `types`: Feature-specific configuration types (image, token reduction, language detection)
8
+ //! - `core`: Main ExtractionConfig struct and implementation
9
+ //! - `env`: Environment variable override support
10
+ //! - `loaders`: Configuration file loading with caching
11
+
12
+ mod core;
13
+ mod env;
14
+ mod loaders;
15
+ mod types;
16
+
17
+ // Re-export all public types for backward compatibility
18
+ pub use self::core::ExtractionConfig;
19
+ pub use self::types::{ImageExtractionConfig, LanguageDetectionConfig, TokenReductionConfig};
20
+
21
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::config::ocr::OcrConfig;

    /// A default-constructed config enables caching and quality processing and carries
    /// no OCR settings.
    #[test]
    fn test_default_config() {
        let defaults = ExtractionConfig::default();

        assert!(defaults.ocr.is_none());
        assert!(defaults.use_cache);
        assert!(defaults.enable_quality_processing);
    }

    /// `needs_image_processing` is false for the default config and becomes true once an
    /// OCR configuration is attached.
    #[test]
    fn test_needs_image_processing() {
        let mut cfg = ExtractionConfig::default();
        let before = cfg.needs_image_processing();
        assert!(!before);

        cfg.ocr = Some(OcrConfig::default());
        let after = cfg.needs_image_processing();
        assert!(after);
    }
}
@@ -0,0 +1,93 @@
1
+ //! Feature-specific configuration types for extraction.
2
+ //!
3
+ //! This module contains configuration structs for specific extraction features:
4
+ //! - Image extraction and processing
5
+ //! - Token reduction
6
+ //! - Language detection
7
+
8
+ use serde::{Deserialize, Serialize};
9
+
10
+ /// Image extraction configuration.
11
+ #[derive(Debug, Clone, Serialize, Deserialize)]
12
+ pub struct ImageExtractionConfig {
13
+ /// Extract images from documents
14
+ #[serde(default = "default_true")]
15
+ pub extract_images: bool,
16
+
17
+ /// Target DPI for image normalization
18
+ #[serde(default = "default_target_dpi")]
19
+ pub target_dpi: i32,
20
+
21
+ /// Maximum dimension for images (width or height)
22
+ #[serde(default = "default_max_dimension")]
23
+ pub max_image_dimension: i32,
24
+
25
+ /// Automatically adjust DPI based on image content
26
+ #[serde(default = "default_true")]
27
+ pub auto_adjust_dpi: bool,
28
+
29
+ /// Minimum DPI threshold
30
+ #[serde(default = "default_min_dpi")]
31
+ pub min_dpi: i32,
32
+
33
+ /// Maximum DPI threshold
34
+ #[serde(default = "default_max_dpi")]
35
+ pub max_dpi: i32,
36
+ }
37
+
38
+ /// Token reduction configuration.
39
+ #[derive(Debug, Clone, Serialize, Deserialize)]
40
+ pub struct TokenReductionConfig {
41
+ /// Reduction mode: "off", "light", "moderate", "aggressive", "maximum"
42
+ #[serde(default = "default_reduction_mode")]
43
+ pub mode: String,
44
+
45
+ /// Preserve important words (capitalized, technical terms)
46
+ #[serde(default = "default_true")]
47
+ pub preserve_important_words: bool,
48
+ }
49
+
50
+ /// Language detection configuration.
51
+ #[derive(Debug, Clone, Serialize, Deserialize)]
52
+ pub struct LanguageDetectionConfig {
53
+ /// Enable language detection
54
+ #[serde(default = "default_true")]
55
+ pub enabled: bool,
56
+
57
+ /// Minimum confidence threshold (0.0-1.0)
58
+ #[serde(default = "default_confidence")]
59
+ pub min_confidence: f64,
60
+
61
+ /// Detect multiple languages in the document
62
+ #[serde(default)]
63
+ pub detect_multiple: bool,
64
+ }
65
+
66
// Serde default providers, referenced from `#[serde(default = "...")]` attributes above.

/// Default for boolean options that are on unless configured otherwise.
fn default_true() -> bool {
    true
}

/// Default for `ImageExtractionConfig::target_dpi`.
fn default_target_dpi() -> i32 {
    300
}

/// Default for `ImageExtractionConfig::max_image_dimension`.
fn default_max_dimension() -> i32 {
    4096
}

/// Default for `ImageExtractionConfig::min_dpi`.
fn default_min_dpi() -> i32 {
    72
}

/// Default for `ImageExtractionConfig::max_dpi`.
fn default_max_dpi() -> i32 {
    600
}

/// Default for `TokenReductionConfig::mode`.
fn default_reduction_mode() -> String {
    String::from("off")
}

/// Default for `LanguageDetectionConfig::min_confidence`.
fn default_confidence() -> f64 {
    0.8
}