kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -9,10 +9,7 @@ use crate::{batch_extract_bytes, cache, extract_bytes};
9
9
 
10
10
  use super::{
11
11
  error::ApiError,
12
- types::{
13
- ApiState, CacheClearResponse, CacheStatsResponse, EmbedRequest, EmbedResponse, ExtractResponse, HealthResponse,
14
- InfoResponse,
15
- },
12
+ types::{ApiState, CacheClearResponse, CacheStatsResponse, ExtractResponse, HealthResponse, InfoResponse},
16
13
  };
17
14
 
18
15
  /// Extract endpoint handler.
@@ -27,24 +24,15 @@ use super::{
27
24
  ///
28
25
  /// # Size Limits
29
26
  ///
30
- /// Request body size limits are enforced at the router layer via `DefaultBodyLimit` and `RequestBodyLimitLayer`.
27
+ /// Request body size limits are enforced at the router layer via `RequestBodyLimitLayer`.
31
28
  /// Default limits:
32
29
  /// - Total request body: 100 MB (all files + form data combined)
33
- /// - Individual multipart fields: 100 MB (controlled by Axum's `DefaultBodyLimit`)
30
+ /// - Individual multipart fields: Controlled by Axum's default multipart limits
34
31
  ///
35
- /// Limits can be configured via environment variables or programmatically when creating the router.
36
32
  /// If a request exceeds the size limit, it will be rejected with HTTP 413 (Payload Too Large).
37
33
  ///
38
34
  /// The server's default config (loaded from kreuzberg.toml/yaml/json via discovery)
39
35
  /// is used as the base, and any per-request config overrides those defaults.
40
- #[cfg_attr(
41
- feature = "otel",
42
- tracing::instrument(
43
- name = "api.extract",
44
- skip(state, multipart),
45
- fields(files_count = tracing::field::Empty)
46
- )
47
- )]
48
36
  pub async fn extract_handler(
49
37
  State(state): State<ApiState>,
50
38
  mut multipart: Multipart,
@@ -95,9 +83,6 @@ pub async fn extract_handler(
95
83
  )));
96
84
  }
97
85
 
98
- #[cfg(feature = "otel")]
99
- tracing::Span::current().record("files_count", files.len());
100
-
101
86
  if files.len() == 1 {
102
87
  let (data, mime_type, _file_name) = files
103
88
  .into_iter()
@@ -109,14 +94,18 @@ pub async fn extract_handler(
109
94
 
110
95
  let files_data: Vec<(Vec<u8>, String)> = files.into_iter().map(|(data, mime, _name)| (data, mime)).collect();
111
96
 
112
- let results = batch_extract_bytes(files_data, &config).await?;
97
+ let file_refs: Vec<(&[u8], &str)> = files_data
98
+ .iter()
99
+ .map(|(data, mime)| (data.as_slice(), mime.as_str()))
100
+ .collect();
101
+
102
+ let results = batch_extract_bytes(file_refs, &config).await?;
113
103
  Ok(Json(results))
114
104
  }
115
105
 
116
106
  /// Health check endpoint handler.
117
107
  ///
118
108
  /// GET /health
119
- #[cfg_attr(feature = "otel", tracing::instrument(name = "api.health"))]
120
109
  pub async fn health_handler() -> Json<HealthResponse> {
121
110
  Json(HealthResponse {
122
111
  status: "healthy".to_string(),
@@ -127,7 +116,6 @@ pub async fn health_handler() -> Json<HealthResponse> {
127
116
  /// Server info endpoint handler.
128
117
  ///
129
118
  /// GET /info
130
- #[cfg_attr(feature = "otel", tracing::instrument(name = "api.info"))]
131
119
  pub async fn info_handler() -> Json<InfoResponse> {
132
120
  Json(InfoResponse {
133
121
  version: env!("CARGO_PKG_VERSION").to_string(),
@@ -145,7 +133,6 @@ pub async fn info_handler() -> Json<InfoResponse> {
145
133
  /// - Current directory cannot be determined
146
134
  /// - Cache directory path contains non-UTF8 characters
147
135
  /// - Cache metadata retrieval fails
148
- #[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_stats"))]
149
136
  pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError> {
150
137
  let cache_dir = std::env::current_dir()
151
138
  .map_err(|e| {
@@ -185,7 +172,6 @@ pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError>
185
172
  /// - Current directory cannot be determined
186
173
  /// - Cache directory path contains non-UTF8 characters
187
174
  /// - Cache clearing operation fails
188
- #[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_clear"))]
189
175
  pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError> {
190
176
  let cache_dir = std::env::current_dir()
191
177
  .map_err(|e| {
@@ -211,110 +197,3 @@ pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError>
211
197
  freed_mb,
212
198
  }))
213
199
  }
214
-
215
- /// Embedding endpoint handler.
216
- ///
217
- /// POST /embed
218
- ///
219
- /// Accepts JSON body with:
220
- /// - `texts`: Array of strings to generate embeddings for
221
- /// - `config` (optional): Embedding configuration (model, batch size, cache_dir)
222
- ///
223
- /// Returns embeddings for each input text.
224
- ///
225
- /// # Errors
226
- ///
227
- /// Returns `ApiError::Internal` if:
228
- /// - Embeddings feature is not enabled
229
- /// - ONNX Runtime is not available
230
- /// - Model initialization fails
231
- /// - Embedding generation fails
232
- #[cfg(feature = "embeddings")]
233
- #[cfg_attr(
234
- feature = "otel",
235
- tracing::instrument(
236
- name = "api.embed",
237
- skip(request),
238
- fields(
239
- texts_count = request.texts.len(),
240
- model = tracing::field::Empty
241
- )
242
- )
243
- )]
244
- pub async fn embed_handler(Json(request): Json<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
245
- use crate::types::{Chunk, ChunkMetadata};
246
-
247
- if request.texts.is_empty() {
248
- return Err(ApiError::validation(crate::error::KreuzbergError::validation(
249
- "No texts provided for embedding generation",
250
- )));
251
- }
252
-
253
- // Use default config if none provided
254
- let config = request.config.unwrap_or_default();
255
-
256
- // Create chunks from input texts
257
- let mut chunks: Vec<Chunk> = request
258
- .texts
259
- .iter()
260
- .enumerate()
261
- .map(|(idx, text)| Chunk {
262
- content: text.clone(),
263
- embedding: None,
264
- metadata: ChunkMetadata {
265
- byte_start: 0,
266
- byte_end: text.len(),
267
- token_count: None,
268
- chunk_index: idx,
269
- total_chunks: request.texts.len(),
270
- first_page: None,
271
- last_page: None,
272
- },
273
- })
274
- .collect();
275
-
276
- // Generate embeddings
277
- crate::embeddings::generate_embeddings_for_chunks(&mut chunks, &config).map_err(ApiError::internal)?;
278
-
279
- // Extract embeddings from chunks
280
- let embeddings: Vec<Vec<f32>> = chunks
281
- .into_iter()
282
- .map(|chunk| {
283
- chunk.embedding.ok_or_else(|| {
284
- ApiError::internal(crate::error::KreuzbergError::Other(
285
- "Failed to generate embedding for text".to_string(),
286
- ))
287
- })
288
- })
289
- .collect::<Result<Vec<_>, _>>()?;
290
-
291
- let dimensions = embeddings.first().map(|e| e.len()).unwrap_or(0);
292
-
293
- // Get model name from config
294
- let model_name = match &config.model {
295
- crate::core::config::EmbeddingModelType::Preset { name } => name.clone(),
296
- #[cfg(feature = "embeddings")]
297
- crate::core::config::EmbeddingModelType::FastEmbed { model, .. } => model.clone(),
298
- crate::core::config::EmbeddingModelType::Custom { .. } => "custom".to_string(),
299
- };
300
-
301
- #[cfg(feature = "otel")]
302
- tracing::Span::current().record("model", &model_name);
303
-
304
- Ok(Json(EmbedResponse {
305
- embeddings,
306
- model: model_name,
307
- dimensions,
308
- count: request.texts.len(),
309
- }))
310
- }
311
-
312
- /// Embedding endpoint handler (when embeddings feature is disabled).
313
- ///
314
- /// Returns an error indicating embeddings feature is not enabled.
315
- #[cfg(not(feature = "embeddings"))]
316
- pub async fn embed_handler(Json(_request): Json<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
317
- Err(ApiError::internal(crate::error::KreuzbergError::MissingDependency(
318
- "Embeddings feature is not enabled. Rebuild with --features embeddings".to_string(),
319
- )))
320
- }
@@ -6,11 +6,8 @@
6
6
  //! # Endpoints
7
7
  //!
8
8
  //! - `POST /extract` - Extract text from uploaded files (multipart form data)
9
- //! - `POST /embed` - Generate embeddings for text (JSON body with texts array)
10
9
  //! - `GET /health` - Health check endpoint
11
10
  //! - `GET /info` - Server information
12
- //! - `GET /cache/stats` - Get cache statistics
13
- //! - `DELETE /cache/clear` - Clear all cached files
14
11
  //!
15
12
  //! # Examples
16
13
  //!
@@ -65,17 +62,6 @@
65
62
  //!
66
63
  //! # Server info
67
64
  //! curl http://localhost:8000/info
68
- //!
69
- //! # Cache statistics
70
- //! curl http://localhost:8000/cache/stats
71
- //!
72
- //! # Clear cache
73
- //! curl -X DELETE http://localhost:8000/cache/clear
74
- //!
75
- //! # Generate embeddings
76
- //! curl -X POST http://localhost:8000/embed \
77
- //! -H "Content-Type: application/json" \
78
- //! -d '{"texts":["Hello world","Second text"]}'
79
65
  //! ```
80
66
 
81
67
  mod error;
@@ -85,10 +71,9 @@ mod types;
85
71
 
86
72
  pub use error::ApiError;
87
73
  pub use server::{
88
- create_router, create_router_with_limits, create_router_with_limits_and_server_config, load_server_config, serve,
89
- serve_default, serve_with_config, serve_with_config_and_limits, serve_with_server_config,
74
+ create_router, create_router_with_limits, serve, serve_default, serve_with_config, serve_with_config_and_limits,
90
75
  };
91
76
  pub use types::{
92
- ApiSizeLimits, ApiState, CacheClearResponse, CacheStatsResponse, EmbedRequest, EmbedResponse, ErrorResponse,
93
- ExtractResponse, HealthResponse, InfoResponse,
77
+ ApiSizeLimits, ApiState, CacheClearResponse, CacheStatsResponse, ErrorResponse, ExtractResponse, HealthResponse,
78
+ InfoResponse,
94
79
  };
@@ -1,10 +1,12 @@
1
1
  //! API server setup and configuration.
2
2
 
3
- use std::{net::SocketAddr, sync::Arc};
3
+ use std::{
4
+ net::{IpAddr, SocketAddr},
5
+ sync::Arc,
6
+ };
4
7
 
5
8
  use axum::{
6
9
  Router,
7
- extract::DefaultBodyLimit,
8
10
  routing::{delete, get, post},
9
11
  };
10
12
  use tower_http::{
@@ -13,79 +15,60 @@ use tower_http::{
13
15
  trace::TraceLayer,
14
16
  };
15
17
 
16
- use crate::{ExtractionConfig, Result, core::ServerConfig};
18
+ use crate::{ExtractionConfig, Result};
17
19
 
18
20
  use super::{
19
- handlers::{
20
- cache_clear_handler, cache_stats_handler, embed_handler, extract_handler, health_handler, info_handler,
21
- },
21
+ handlers::{cache_clear_handler, cache_stats_handler, extract_handler, health_handler, info_handler},
22
22
  types::{ApiSizeLimits, ApiState},
23
23
  };
24
24
 
25
- /// Load ServerConfig with proper precedence order.
26
- ///
27
- /// This function implements the configuration hierarchy:
28
- /// 1. File (if provided)
29
- /// 2. Environment variables (via apply_env_overrides)
30
- /// 3. Defaults
31
- ///
32
- /// The config file can be in flat format (server settings at root) or nested format
33
- /// (server settings under [server] section alongside other configs like [ocr]).
34
- ///
35
- /// # Arguments
36
- ///
37
- /// * `config_path` - Optional path to a ServerConfig file (TOML, YAML, or JSON)
38
- ///
39
- /// # Returns
40
- ///
41
- /// A configured ServerConfig with proper precedence applied.
42
- ///
43
- /// # Errors
44
- ///
45
- /// Returns an error if:
46
- /// - The config file path is provided but cannot be read
47
- /// - The config file contains invalid server configuration
48
- /// - Environment variable overrides contain invalid values
49
- ///
50
- /// # Examples
51
- ///
52
- /// ```no_run
53
- /// use kreuzberg::api::load_server_config;
54
- /// use std::path::Path;
55
- ///
56
- /// # fn example() -> kreuzberg::Result<()> {
57
- /// // Load from file with env overrides
58
- /// let config = load_server_config(Some(Path::new("server.toml")))?;
59
- ///
60
- /// // Or use defaults with env overrides
61
- /// let config = load_server_config(None)?;
62
- /// # Ok(())
63
- /// # }
64
- /// ```
65
- pub fn load_server_config(config_path: Option<&std::path::Path>) -> Result<ServerConfig> {
66
- let mut config = if let Some(path) = config_path {
67
- ServerConfig::from_file(path)?
68
- } else {
69
- ServerConfig::default()
70
- };
71
-
72
- // Apply environment variable overrides with proper logging
73
- config.apply_env_overrides()?;
74
-
75
- tracing::info!(
76
- "Server configuration loaded: host={}, port={}, request_body_limit={} MB, multipart_field_limit={} MB, CORS={}",
77
- config.host,
78
- config.port,
79
- config.max_request_body_mb(),
80
- config.max_multipart_field_mb(),
81
- if config.cors_allows_all() {
82
- "allow all origins".to_string()
83
- } else {
84
- format!("{} specific origins", config.cors_origins.len())
25
+ /// Parse size limits from environment variables.
26
+ ///
27
+ /// Reads `KREUZBERG_MAX_UPLOAD_SIZE_MB` to configure upload size limits.
28
+ /// Falls back to default (100 MB) if not set or invalid.
29
+ fn parse_size_limits_from_env() -> ApiSizeLimits {
30
+ match std::env::var("KREUZBERG_MAX_UPLOAD_SIZE_MB") {
31
+ Ok(value) => match value.parse::<usize>() {
32
+ Ok(mb) if mb > 0 => {
33
+ tracing::info!(
34
+ "Upload size limit configured from environment: {} MB ({} bytes)",
35
+ mb,
36
+ mb * 1024 * 1024
37
+ );
38
+ ApiSizeLimits::from_mb(mb, mb)
39
+ }
40
+ Ok(_) => {
41
+ tracing::warn!("Invalid KREUZBERG_MAX_UPLOAD_SIZE_MB value (must be > 0), using default 100 MB");
42
+ let limits = ApiSizeLimits::default();
43
+ tracing::info!(
44
+ "Upload size limit: 100 MB (default, {} bytes)",
45
+ limits.max_request_body_bytes
46
+ );
47
+ limits
48
+ }
49
+ Err(e) => {
50
+ tracing::warn!(
51
+ "Failed to parse KREUZBERG_MAX_UPLOAD_SIZE_MB='{}': {}, using default 100 MB",
52
+ value,
53
+ e
54
+ );
55
+ let limits = ApiSizeLimits::default();
56
+ tracing::info!(
57
+ "Upload size limit: 100 MB (default, {} bytes)",
58
+ limits.max_request_body_bytes
59
+ );
60
+ limits
61
+ }
62
+ },
63
+ Err(_) => {
64
+ let limits = ApiSizeLimits::default();
65
+ tracing::info!(
66
+ "Upload size limit: 100 MB (default, {} bytes)",
67
+ limits.max_request_body_bytes
68
+ );
69
+ limits
85
70
  }
86
- );
87
-
88
- Ok(config)
71
+ }
89
72
  }
90
73
 
91
74
  /// Create the API router with all routes configured.
@@ -148,58 +131,15 @@ pub fn create_router(config: ExtractionConfig) -> Router {
148
131
  /// # }
149
132
  /// ```
150
133
  pub fn create_router_with_limits(config: ExtractionConfig, limits: ApiSizeLimits) -> Router {
151
- create_router_with_limits_and_server_config(config, limits, ServerConfig::default())
152
- }
153
-
154
- /// Create the API router with custom size limits and server configuration.
155
- ///
156
- /// This function provides full control over request limits, CORS, and server settings via ServerConfig.
157
- ///
158
- /// # Arguments
159
- ///
160
- /// * `config` - Default extraction configuration. Per-request configs override these defaults.
161
- /// * `limits` - Size limits for request bodies and multipart uploads.
162
- /// * `server_config` - Server configuration including host, port, and CORS settings.
163
- ///
164
- /// # Examples
165
- ///
166
- /// ```no_run
167
- /// use kreuzberg::{ExtractionConfig, api::create_router_with_limits, core::ServerConfig};
168
- ///
169
- /// # #[tokio::main]
170
- /// # async fn main() -> kreuzberg::Result<()> {
171
- /// let extraction_config = ExtractionConfig::default();
172
- /// let mut server_config = ServerConfig::default();
173
- /// server_config.cors_origins = vec!["https://example.com".to_string()];
174
- /// let router = create_router_with_limits_and_server_config(
175
- /// extraction_config,
176
- /// Default::default(),
177
- /// server_config
178
- /// );
179
- /// # Ok(())
180
- /// # }
181
- /// ```
182
- pub fn create_router_with_limits_and_server_config(
183
- config: ExtractionConfig,
184
- limits: ApiSizeLimits,
185
- server_config: ServerConfig,
186
- ) -> Router {
187
134
  let state = ApiState {
188
135
  default_config: Arc::new(config),
189
136
  };
190
137
 
191
- // CORS configuration based on ServerConfig
192
- let cors_layer = if server_config.cors_allows_all() {
193
- tracing::warn!(
194
- "CORS configured to allow all origins (default). This permits CSRF attacks. \
195
- For production, set KREUZBERG_CORS_ORIGINS environment variable to comma-separated \
196
- list of allowed origins (e.g., 'https://app.example.com,https://api.example.com')"
197
- );
198
- CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
199
- } else {
200
- let origins: Vec<_> = server_config
201
- .cors_origins
202
- .iter()
138
+ // SECURITY WARNING: The default allows all origins for development convenience,
139
+ let cors_layer = if let Ok(origins_str) = std::env::var("KREUZBERG_CORS_ORIGINS") {
140
+ let origins: Vec<_> = origins_str
141
+ .split(',')
142
+ .filter(|s| !s.trim().is_empty())
203
143
  .filter_map(|s| s.trim().parse::<axum::http::HeaderValue>().ok())
204
144
  .collect();
205
145
 
@@ -211,21 +151,26 @@ pub fn create_router_with_limits_and_server_config(
211
151
  .allow_headers(Any)
212
152
  } else {
213
153
  tracing::warn!(
214
- "CORS origins configured but empty/invalid - falling back to permissive CORS. \
154
+ "KREUZBERG_CORS_ORIGINS set but empty/invalid - falling back to permissive CORS. \
215
155
  This allows CSRF attacks. Set explicit origins for production."
216
156
  );
217
157
  CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
218
158
  }
159
+ } else {
160
+ tracing::warn!(
161
+ "CORS configured to allow all origins (default). This permits CSRF attacks. \
162
+ For production, set KREUZBERG_CORS_ORIGINS environment variable to comma-separated \
163
+ list of allowed origins (e.g., 'https://app.example.com,https://api.example.com')"
164
+ );
165
+ CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
219
166
  };
220
167
 
221
168
  Router::new()
222
169
  .route("/extract", post(extract_handler))
223
- .route("/embed", post(embed_handler))
224
170
  .route("/health", get(health_handler))
225
171
  .route("/info", get(info_handler))
226
172
  .route("/cache/stats", get(cache_stats_handler))
227
173
  .route("/cache/clear", delete(cache_clear_handler))
228
- .layer(DefaultBodyLimit::max(limits.max_request_body_bytes))
229
174
  .layer(RequestBodyLimitLayer::new(limits.max_request_body_bytes))
230
175
  .layer(cors_layer)
231
176
  .layer(TraceLayer::new_for_http())
@@ -278,18 +223,13 @@ pub fn create_router_with_limits_and_server_config(
278
223
  /// # Production: set to comma-separated list of allowed origins
279
224
  /// export KREUZBERG_CORS_ORIGINS="https://app.example.com,https://api.example.com"
280
225
  ///
281
- /// # Upload size limits (default: 100 MB)
282
- /// # Modern approach (in bytes):
283
- /// export KREUZBERG_MAX_REQUEST_BODY_BYTES=104857600 # 100 MB
284
- /// export KREUZBERG_MAX_MULTIPART_FIELD_BYTES=104857600 # 100 MB per file
285
- ///
286
- /// # Legacy approach (in MB, applies to both limits):
287
- /// export KREUZBERG_MAX_UPLOAD_SIZE_MB=100 # 100 MB
226
+ /// # Upload size limit (default: 100 MB)
227
+ /// export KREUZBERG_MAX_UPLOAD_SIZE_MB=200
288
228
  ///
289
229
  /// python -m kreuzberg.api
290
230
  /// ```
291
231
  pub async fn serve(host: impl AsRef<str>, port: u16) -> Result<()> {
292
- let extraction_config = match ExtractionConfig::discover()? {
232
+ let config = match ExtractionConfig::discover()? {
293
233
  Some(config) => {
294
234
  tracing::info!("Loaded extraction config from discovered file");
295
235
  config
@@ -300,13 +240,9 @@ pub async fn serve(host: impl AsRef<str>, port: u16) -> Result<()> {
300
240
  }
301
241
  };
302
242
 
303
- let server_config = load_server_config(None)?;
304
- let limits = ApiSizeLimits::new(
305
- server_config.max_request_body_bytes,
306
- server_config.max_multipart_field_bytes,
307
- );
243
+ let limits = parse_size_limits_from_env();
308
244
 
309
- serve_with_config_and_limits(host, port, extraction_config, limits).await
245
+ serve_with_config_and_limits(host, port, config, limits).await
310
246
  }
311
247
 
312
248
  /// Start the API server with explicit config.
@@ -368,23 +304,13 @@ pub async fn serve_with_config_and_limits(
368
304
  config: ExtractionConfig,
369
305
  limits: ApiSizeLimits,
370
306
  ) -> Result<()> {
371
- use std::net::IpAddr;
372
-
373
307
  let ip: IpAddr = host
374
308
  .as_ref()
375
309
  .parse()
376
310
  .map_err(|e| crate::error::KreuzbergError::validation(format!("Invalid host address: {}", e)))?;
377
311
 
378
- let server_config = ServerConfig {
379
- host: host.as_ref().to_string(),
380
- port,
381
- max_request_body_bytes: limits.max_request_body_bytes,
382
- max_multipart_field_bytes: limits.max_multipart_field_bytes,
383
- ..Default::default()
384
- };
385
-
386
312
  let addr = SocketAddr::new(ip, port);
387
- let app = create_router_with_limits_and_server_config(config, limits, server_config);
313
+ let app = create_router_with_limits(config, limits);
388
314
 
389
315
  tracing::info!("Starting Kreuzberg API server on http://{}:{}", ip, port);
390
316
 
@@ -399,70 +325,6 @@ pub async fn serve_with_config_and_limits(
399
325
  Ok(())
400
326
  }
401
327
 
402
- /// Start the API server with explicit extraction config and server config.
403
- ///
404
- /// This function accepts a fully-configured ServerConfig, including CORS origins,
405
- /// size limits, host, and port. It respects all ServerConfig fields without
406
- /// re-parsing environment variables, making it ideal for CLI usage where
407
- /// configuration precedence has already been applied.
408
- ///
409
- /// # Arguments
410
- ///
411
- /// * `extraction_config` - Default extraction configuration for all requests
412
- /// * `server_config` - Server configuration including host, port, CORS, and size limits
413
- ///
414
- /// # Examples
415
- ///
416
- /// ```no_run
417
- /// use kreuzberg::{ExtractionConfig, api::serve_with_server_config, core::ServerConfig};
418
- ///
419
- /// #[tokio::main]
420
- /// async fn main() -> kreuzberg::Result<()> {
421
- /// let extraction_config = ExtractionConfig::default();
422
- /// let mut server_config = ServerConfig::default();
423
- /// server_config.host = "0.0.0.0".to_string();
424
- /// server_config.port = 3000;
425
- /// server_config.cors_origins = vec!["https://example.com".to_string()];
426
- ///
427
- /// serve_with_server_config(extraction_config, server_config).await?;
428
- /// Ok(())
429
- /// }
430
- /// ```
431
- pub async fn serve_with_server_config(extraction_config: ExtractionConfig, server_config: ServerConfig) -> Result<()> {
432
- use std::net::IpAddr;
433
-
434
- let ip: IpAddr = server_config
435
- .host
436
- .parse()
437
- .map_err(|e| crate::error::KreuzbergError::validation(format!("Invalid host address: {}", e)))?;
438
-
439
- let limits = ApiSizeLimits::new(
440
- server_config.max_request_body_bytes,
441
- server_config.max_multipart_field_bytes,
442
- );
443
-
444
- let addr = SocketAddr::new(ip, server_config.port);
445
- let app = create_router_with_limits_and_server_config(extraction_config, limits, server_config.clone());
446
-
447
- tracing::info!(
448
- "Starting Kreuzberg API server on http://{}:{} (request_body_limit={} MB, multipart_field_limit={} MB)",
449
- ip,
450
- server_config.port,
451
- server_config.max_request_body_mb(),
452
- server_config.max_multipart_field_mb()
453
- );
454
-
455
- let listener = tokio::net::TcpListener::bind(addr)
456
- .await
457
- .map_err(crate::error::KreuzbergError::Io)?;
458
-
459
- axum::serve(listener, app)
460
- .await
461
- .map_err(|e| crate::error::KreuzbergError::Other(e.to_string()))?;
462
-
463
- Ok(())
464
- }
465
-
466
328
  /// Start the API server with default host and port.
467
329
  ///
468
330
  /// Defaults: host = "127.0.0.1", port = 8000
@@ -473,7 +335,6 @@ pub async fn serve_default() -> Result<()> {
473
335
  }
474
336
 
475
337
  #[cfg(test)]
476
- #[allow(unsafe_code)]
477
338
  mod tests {
478
339
  use super::*;
479
340
 
@@ -489,30 +350,4 @@ mod tests {
489
350
  let router = create_router(config);
490
351
  assert!(size_of_val(&router) > 0);
491
352
  }
492
-
493
- #[test]
494
- fn test_create_router_with_limits() {
495
- let config = ExtractionConfig::default();
496
- let limits = ApiSizeLimits::from_mb(50, 50);
497
- let _router = create_router_with_limits(config, limits);
498
- }
499
-
500
- #[test]
501
- fn test_create_router_with_server_config() {
502
- let extraction_config = ExtractionConfig::default();
503
- let limits = ApiSizeLimits::from_mb(100, 100);
504
- let server_config = ServerConfig::default();
505
- let _router = create_router_with_limits_and_server_config(extraction_config, limits, server_config);
506
- }
507
-
508
- #[test]
509
- fn test_server_config_cors_handling() {
510
- let extraction_config = ExtractionConfig::default();
511
- let limits = ApiSizeLimits::default();
512
- let server_config = ServerConfig {
513
- cors_origins: vec!["https://example.com".to_string()],
514
- ..Default::default()
515
- };
516
- let _router = create_router_with_limits_and_server_config(extraction_config, limits, server_config);
517
- }
518
353
  }