kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,199 @@
1
+ //! API request handlers.
2
+
3
+ use axum::{
4
+ Json,
5
+ extract::{Multipart, State},
6
+ };
7
+
8
+ use crate::{batch_extract_bytes, cache, extract_bytes};
9
+
10
+ use super::{
11
+ error::ApiError,
12
+ types::{ApiState, CacheClearResponse, CacheStatsResponse, ExtractResponse, HealthResponse, InfoResponse},
13
+ };
14
+
15
+ /// Extract endpoint handler.
16
+ ///
17
+ /// POST /extract
18
+ ///
19
+ /// Accepts multipart form data with:
20
+ /// - `files`: One or more files to extract
21
+ /// - `config` (optional): JSON extraction configuration (overrides server defaults)
22
+ ///
23
+ /// Returns a list of extraction results, one per file.
24
+ ///
25
+ /// # Size Limits
26
+ ///
27
+ /// Request body size limits are enforced at the router layer via `RequestBodyLimitLayer`.
28
+ /// Default limits:
29
+ /// - Total request body: 100 MB (all files + form data combined)
30
+ /// - Individual multipart fields: Controlled by Axum's default multipart limits
31
+ ///
32
+ /// If a request exceeds the size limit, it will be rejected with HTTP 413 (Payload Too Large).
33
+ ///
34
+ /// The server's default config (loaded from kreuzberg.toml/yaml/json via discovery)
35
+ /// is used as the base, and any per-request config overrides those defaults.
36
+ pub async fn extract_handler(
37
+ State(state): State<ApiState>,
38
+ mut multipart: Multipart,
39
+ ) -> Result<Json<ExtractResponse>, ApiError> {
40
+ let mut files = Vec::new();
41
+ let mut config = (*state.default_config).clone();
42
+
43
+ while let Some(field) = multipart
44
+ .next_field()
45
+ .await
46
+ .map_err(|e| ApiError::validation(crate::error::KreuzbergError::validation(e.to_string())))?
47
+ {
48
+ let field_name = field.name().unwrap_or("").to_string();
49
+
50
+ match field_name.as_str() {
51
+ "files" => {
52
+ let file_name = field.file_name().map(|s| s.to_string());
53
+ let content_type = field.content_type().map(|s| s.to_string());
54
+ let data = field
55
+ .bytes()
56
+ .await
57
+ .map_err(|e| ApiError::validation(crate::error::KreuzbergError::validation(e.to_string())))?;
58
+
59
+ let mime_type = content_type.unwrap_or_else(|| "application/octet-stream".to_string());
60
+
61
+ files.push((data.to_vec(), mime_type, file_name));
62
+ }
63
+ "config" => {
64
+ let config_str = field
65
+ .text()
66
+ .await
67
+ .map_err(|e| ApiError::validation(crate::error::KreuzbergError::validation(e.to_string())))?;
68
+
69
+ config = serde_json::from_str(&config_str).map_err(|e| {
70
+ ApiError::validation(crate::error::KreuzbergError::validation(format!(
71
+ "Invalid extraction configuration: {}",
72
+ e
73
+ )))
74
+ })?;
75
+ }
76
+ _ => {}
77
+ }
78
+ }
79
+
80
+ if files.is_empty() {
81
+ return Err(ApiError::validation(crate::error::KreuzbergError::validation(
82
+ "No files provided for extraction",
83
+ )));
84
+ }
85
+
86
+ if files.len() == 1 {
87
+ let (data, mime_type, _file_name) = files
88
+ .into_iter()
89
+ .next()
90
+ .expect("files.len() == 1 guarantees one element exists");
91
+ let result = extract_bytes(&data, mime_type.as_str(), &config).await?;
92
+ return Ok(Json(vec![result]));
93
+ }
94
+
95
+ let files_data: Vec<(Vec<u8>, String)> = files.into_iter().map(|(data, mime, _name)| (data, mime)).collect();
96
+
97
+ let file_refs: Vec<(&[u8], &str)> = files_data
98
+ .iter()
99
+ .map(|(data, mime)| (data.as_slice(), mime.as_str()))
100
+ .collect();
101
+
102
+ let results = batch_extract_bytes(file_refs, &config).await?;
103
+ Ok(Json(results))
104
+ }
105
+
106
+ /// Health check endpoint handler.
107
+ ///
108
+ /// GET /health
109
+ pub async fn health_handler() -> Json<HealthResponse> {
110
+ Json(HealthResponse {
111
+ status: "healthy".to_string(),
112
+ version: env!("CARGO_PKG_VERSION").to_string(),
113
+ })
114
+ }
115
+
116
+ /// Server info endpoint handler.
117
+ ///
118
+ /// GET /info
119
+ pub async fn info_handler() -> Json<InfoResponse> {
120
+ Json(InfoResponse {
121
+ version: env!("CARGO_PKG_VERSION").to_string(),
122
+ rust_backend: true,
123
+ })
124
+ }
125
+
126
+ /// Cache stats endpoint handler.
127
+ ///
128
+ /// GET /cache/stats
129
+ ///
130
+ /// # Errors
131
+ ///
132
+ /// Returns `ApiError::Internal` if:
133
+ /// - Current directory cannot be determined
134
+ /// - Cache directory path contains non-UTF8 characters
135
+ /// - Cache metadata retrieval fails
136
+ pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError> {
137
+ let cache_dir = std::env::current_dir()
138
+ .map_err(|e| {
139
+ ApiError::internal(crate::error::KreuzbergError::Other(format!(
140
+ "Failed to get current directory: {}",
141
+ e
142
+ )))
143
+ })?
144
+ .join(".kreuzberg");
145
+
146
+ let cache_dir_str = cache_dir.to_str().ok_or_else(|| {
147
+ ApiError::internal(crate::error::KreuzbergError::Other(format!(
148
+ "Cache directory path contains non-UTF8 characters: {}",
149
+ cache_dir.display()
150
+ )))
151
+ })?;
152
+
153
+ let stats = cache::get_cache_metadata(cache_dir_str).map_err(ApiError::internal)?;
154
+
155
+ Ok(Json(CacheStatsResponse {
156
+ directory: cache_dir.to_string_lossy().to_string(),
157
+ total_files: stats.total_files,
158
+ total_size_mb: stats.total_size_mb,
159
+ available_space_mb: stats.available_space_mb,
160
+ oldest_file_age_days: stats.oldest_file_age_days,
161
+ newest_file_age_days: stats.newest_file_age_days,
162
+ }))
163
+ }
164
+
165
+ /// Cache clear endpoint handler.
166
+ ///
167
+ /// DELETE /cache/clear
168
+ ///
169
+ /// # Errors
170
+ ///
171
+ /// Returns `ApiError::Internal` if:
172
+ /// - Current directory cannot be determined
173
+ /// - Cache directory path contains non-UTF8 characters
174
+ /// - Cache clearing operation fails
175
+ pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError> {
176
+ let cache_dir = std::env::current_dir()
177
+ .map_err(|e| {
178
+ ApiError::internal(crate::error::KreuzbergError::Other(format!(
179
+ "Failed to get current directory: {}",
180
+ e
181
+ )))
182
+ })?
183
+ .join(".kreuzberg");
184
+
185
+ let cache_dir_str = cache_dir.to_str().ok_or_else(|| {
186
+ ApiError::internal(crate::error::KreuzbergError::Other(format!(
187
+ "Cache directory path contains non-UTF8 characters: {}",
188
+ cache_dir.display()
189
+ )))
190
+ })?;
191
+
192
+ let (removed_files, freed_mb) = cache::clear_cache_directory(cache_dir_str).map_err(ApiError::internal)?;
193
+
194
+ Ok(Json(CacheClearResponse {
195
+ directory: cache_dir.to_string_lossy().to_string(),
196
+ removed_files,
197
+ freed_mb,
198
+ }))
199
+ }
@@ -0,0 +1,79 @@
1
+ //! REST API server for Kreuzberg document extraction.
2
+ //!
3
+ //! This module provides an Axum-based HTTP server for document extraction
4
+ //! with endpoints for single and batch extraction operations.
5
+ //!
6
+ //! # Endpoints
7
+ //!
8
+ //! - `POST /extract` - Extract text from uploaded files (multipart form data)
9
+ //! - `GET /health` - Health check endpoint
10
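+ //! - `GET /info` - Server information
+ //! - `GET /cache/stats` - Cache statistics
+ //! - `DELETE /cache/clear` - Clear the cache directory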
11
+ //!
12
+ //! # Examples
13
+ //!
14
+ //! ## Starting the server
15
+ //!
16
+ //! ```no_run
17
+ //! use kreuzberg::api::serve;
18
+ //!
19
+ //! #[tokio::main]
20
+ //! async fn main() -> kreuzberg::Result<()> {
21
+ //! // Local development
22
+ //! serve("127.0.0.1", 8000).await?;
23
+ //! Ok(())
24
+ //! }
25
+ //! ```
26
+ //!
27
+ //! ## Embedding the router in your app
28
+ //!
29
+ //! ```no_run
30
+ //! use kreuzberg::{ExtractionConfig, api::create_router};
31
+ //! use axum::Router;
32
+ //!
33
+ //! #[tokio::main]
34
+ //! async fn main() -> kreuzberg::Result<()> {
35
+ //! // Load config (from file or use default)
36
+ //! let config = ExtractionConfig::default();
37
+ //! let kreuzberg_router = create_router(config);
38
+ //!
39
+ //! // Nest under /api prefix
40
+ //! let app = Router::new().nest("/api", kreuzberg_router);
41
+ //!
42
+ //! // Add your own routes
43
+ //! // ...
44
+ //!
45
+ //! Ok(())
46
+ //! }
47
+ //! ```
48
+ //!
49
+ //! # cURL Examples
50
+ //!
51
+ //! ```bash
52
+ //! # Single file extraction
53
+ //! curl -F "files=@document.pdf" http://localhost:8000/extract
54
+ //!
55
+ //! # Multiple files with OCR config
56
+ //! curl -F "files=@doc1.pdf" -F "files=@doc2.jpg" \
57
+ //! -F 'config={"ocr":{"language":"eng"}}' \
58
+ //! http://localhost:8000/extract
59
+ //!
60
+ //! # Health check
61
+ //! curl http://localhost:8000/health
62
+ //!
63
+ //! # Server info
64
+ //! curl http://localhost:8000/info
65
+ //! ```
66
+
67
+ mod error;
68
+ mod handlers;
69
+ mod server;
70
+ mod types;
71
+
72
+ pub use error::ApiError;
73
+ pub use server::{
74
+ create_router, create_router_with_limits, serve, serve_default, serve_with_config, serve_with_config_and_limits,
75
+ };
76
+ pub use types::{
77
+ ApiSizeLimits, ApiState, CacheClearResponse, CacheStatsResponse, ErrorResponse, ExtractResponse, HealthResponse,
78
+ InfoResponse,
79
+ };
@@ -0,0 +1,353 @@
1
+ //! API server setup and configuration.
2
+
3
+ use std::{
4
+ net::{IpAddr, SocketAddr},
5
+ sync::Arc,
6
+ };
7
+
8
+ use axum::{
9
+ Router,
10
+ routing::{delete, get, post},
11
+ };
12
+ use tower_http::{
13
+ cors::{AllowOrigin, Any, CorsLayer},
14
+ limit::RequestBodyLimitLayer,
15
+ trace::TraceLayer,
16
+ };
17
+
18
+ use crate::{ExtractionConfig, Result};
19
+
20
+ use super::{
21
+ handlers::{cache_clear_handler, cache_stats_handler, extract_handler, health_handler, info_handler},
22
+ types::{ApiSizeLimits, ApiState},
23
+ };
24
+
25
+ /// Parse size limits from environment variables.
26
+ ///
27
+ /// Reads `KREUZBERG_MAX_UPLOAD_SIZE_MB` to configure upload size limits.
28
+ /// Falls back to default (100 MB) if not set or invalid.
29
+ fn parse_size_limits_from_env() -> ApiSizeLimits {
30
+ match std::env::var("KREUZBERG_MAX_UPLOAD_SIZE_MB") {
31
+ Ok(value) => match value.parse::<usize>() {
32
+ Ok(mb) if mb > 0 => {
33
+ tracing::info!(
34
+ "Upload size limit configured from environment: {} MB ({} bytes)",
35
+ mb,
36
+ mb * 1024 * 1024
37
+ );
38
+ ApiSizeLimits::from_mb(mb, mb)
39
+ }
40
+ Ok(_) => {
41
+ tracing::warn!("Invalid KREUZBERG_MAX_UPLOAD_SIZE_MB value (must be > 0), using default 100 MB");
42
+ let limits = ApiSizeLimits::default();
43
+ tracing::info!(
44
+ "Upload size limit: 100 MB (default, {} bytes)",
45
+ limits.max_request_body_bytes
46
+ );
47
+ limits
48
+ }
49
+ Err(e) => {
50
+ tracing::warn!(
51
+ "Failed to parse KREUZBERG_MAX_UPLOAD_SIZE_MB='{}': {}, using default 100 MB",
52
+ value,
53
+ e
54
+ );
55
+ let limits = ApiSizeLimits::default();
56
+ tracing::info!(
57
+ "Upload size limit: 100 MB (default, {} bytes)",
58
+ limits.max_request_body_bytes
59
+ );
60
+ limits
61
+ }
62
+ },
63
+ Err(_) => {
64
+ let limits = ApiSizeLimits::default();
65
+ tracing::info!(
66
+ "Upload size limit: 100 MB (default, {} bytes)",
67
+ limits.max_request_body_bytes
68
+ );
69
+ limits
70
+ }
71
+ }
72
+ }
73
+
74
+ /// Create the API router with all routes configured.
75
+ ///
76
+ /// This is public to allow users to embed the router in their own applications.
77
+ ///
78
+ /// # Arguments
79
+ ///
80
+ /// * `config` - Default extraction configuration. Per-request configs override these defaults.
81
+ ///
82
+ /// # Examples
83
+ ///
84
+ /// ```no_run
85
+ /// use kreuzberg::{ExtractionConfig, api::create_router};
86
+ ///
87
+ /// # #[tokio::main]
88
+ /// # async fn main() {
89
+ /// // Create router with default config and size limits
90
+ /// let config = ExtractionConfig::default();
91
+ /// let router = create_router(config);
92
+ /// # }
93
+ /// ```
94
+ pub fn create_router(config: ExtractionConfig) -> Router {
95
+ create_router_with_limits(config, ApiSizeLimits::default())
96
+ }
97
+
98
+ /// Create the API router with custom size limits.
99
+ ///
100
+ /// This allows fine-grained control over request body and multipart field size limits.
101
+ ///
102
+ /// # Arguments
103
+ ///
104
+ /// * `config` - Default extraction configuration. Per-request configs override these defaults.
105
+ /// * `limits` - Size limits for request bodies and multipart uploads.
106
+ ///
107
+ /// # Examples
108
+ ///
109
+ /// ```no_run
110
+ /// use kreuzberg::{ExtractionConfig, api::{create_router_with_limits, ApiSizeLimits}};
111
+ ///
112
+ /// # #[tokio::main]
113
+ /// # async fn main() {
114
+ /// // Create router with 50 MB limits
115
+ /// let config = ExtractionConfig::default();
116
+ /// let limits = ApiSizeLimits::from_mb(50, 50);
117
+ /// let router = create_router_with_limits(config, limits);
118
+ /// # }
119
+ /// ```
120
+ ///
121
+ /// ```no_run
122
+ /// use kreuzberg::{ExtractionConfig, api::{create_router_with_limits, ApiSizeLimits}};
123
+ /// use tower_http::limit::RequestBodyLimitLayer;
124
+ ///
125
+ /// # #[tokio::main]
126
+ /// # async fn main() {
127
+ /// // Custom limits for very large documents (500 MB)
128
+ /// let config = ExtractionConfig::default();
129
+ /// let limits = ApiSizeLimits::from_mb(500, 500);
130
+ /// let router = create_router_with_limits(config, limits);
131
+ /// # }
132
+ /// ```
133
+ pub fn create_router_with_limits(config: ExtractionConfig, limits: ApiSizeLimits) -> Router {
134
+ let state = ApiState {
135
+ default_config: Arc::new(config),
136
+ };
137
+
138
+ // SECURITY WARNING: The default allows all origins for development convenience,
139
+ let cors_layer = if let Ok(origins_str) = std::env::var("KREUZBERG_CORS_ORIGINS") {
140
+ let origins: Vec<_> = origins_str
141
+ .split(',')
142
+ .filter(|s| !s.trim().is_empty())
143
+ .filter_map(|s| s.trim().parse::<axum::http::HeaderValue>().ok())
144
+ .collect();
145
+
146
+ if !origins.is_empty() {
147
+ tracing::info!("CORS configured with {} explicit allowed origin(s)", origins.len());
148
+ CorsLayer::new()
149
+ .allow_origin(AllowOrigin::list(origins))
150
+ .allow_methods(Any)
151
+ .allow_headers(Any)
152
+ } else {
153
+ tracing::warn!(
154
+ "KREUZBERG_CORS_ORIGINS set but empty/invalid - falling back to permissive CORS. \
155
+ This allows CSRF attacks. Set explicit origins for production."
156
+ );
157
+ CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
158
+ }
159
+ } else {
160
+ tracing::warn!(
161
+ "CORS configured to allow all origins (default). This permits CSRF attacks. \
162
+ For production, set KREUZBERG_CORS_ORIGINS environment variable to comma-separated \
163
+ list of allowed origins (e.g., 'https://app.example.com,https://api.example.com')"
164
+ );
165
+ CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
166
+ };
167
+
168
+ Router::new()
169
+ .route("/extract", post(extract_handler))
170
+ .route("/health", get(health_handler))
171
+ .route("/info", get(info_handler))
172
+ .route("/cache/stats", get(cache_stats_handler))
173
+ .route("/cache/clear", delete(cache_clear_handler))
174
+ .layer(RequestBodyLimitLayer::new(limits.max_request_body_bytes))
175
+ .layer(cors_layer)
176
+ .layer(TraceLayer::new_for_http())
177
+ .with_state(state)
178
+ }
179
+
180
+ /// Start the API server with config file discovery.
181
+ ///
182
+ /// Searches for kreuzberg.toml/yaml/json in current and parent directories.
183
+ /// If no config file is found, uses default configuration.
184
+ ///
185
+ /// # Arguments
186
+ ///
187
+ /// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0")
188
+ /// * `port` - Port number to bind to (e.g., 8000)
189
+ ///
190
+ /// # Examples
191
+ ///
192
+ /// ```no_run
193
+ /// use kreuzberg::api::serve;
194
+ ///
195
+ /// #[tokio::main]
196
+ /// async fn main() -> kreuzberg::Result<()> {
197
+ /// // Local development
198
+ /// serve("127.0.0.1", 8000).await?;
199
+ /// Ok(())
200
+ /// }
201
+ /// ```
202
+ ///
203
+ /// ```no_run
204
+ /// use kreuzberg::api::serve;
205
+ ///
206
+ /// #[tokio::main]
207
+ /// async fn main() -> kreuzberg::Result<()> {
208
+ /// // Docker/production (listen on all interfaces)
209
+ /// serve("0.0.0.0", 8000).await?;
210
+ /// Ok(())
211
+ /// }
212
+ /// ```
213
+ ///
214
+ /// # Environment Variables
215
+ ///
216
+ /// ```bash
217
+ /// # Python/Docker usage
218
+ /// export KREUZBERG_HOST=0.0.0.0
219
+ /// export KREUZBERG_PORT=8000
220
+ ///
221
+ /// # CORS configuration (IMPORTANT for production security)
222
+ /// # Default: allows all origins (permits CSRF attacks)
223
+ /// # Production: set to comma-separated list of allowed origins
224
+ /// export KREUZBERG_CORS_ORIGINS="https://app.example.com,https://api.example.com"
225
+ ///
226
+ /// # Upload size limit (default: 100 MB)
227
+ /// export KREUZBERG_MAX_UPLOAD_SIZE_MB=200
228
+ ///
229
+ /// python -m kreuzberg.api
230
+ /// ```
231
+ pub async fn serve(host: impl AsRef<str>, port: u16) -> Result<()> {
232
+ let config = match ExtractionConfig::discover()? {
233
+ Some(config) => {
234
+ tracing::info!("Loaded extraction config from discovered file");
235
+ config
236
+ }
237
+ None => {
238
+ tracing::info!("No config file found, using default configuration");
239
+ ExtractionConfig::default()
240
+ }
241
+ };
242
+
243
+ let limits = parse_size_limits_from_env();
244
+
245
+ serve_with_config_and_limits(host, port, config, limits).await
246
+ }
247
+
248
+ /// Start the API server with explicit config.
249
+ ///
250
+ /// Uses default size limits (100 MB). For custom limits, use `serve_with_config_and_limits`.
251
+ ///
252
+ /// # Arguments
253
+ ///
254
+ /// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0")
255
+ /// * `port` - Port number to bind to (e.g., 8000)
256
+ /// * `config` - Default extraction configuration for all requests
257
+ ///
258
+ /// # Examples
259
+ ///
260
+ /// ```no_run
261
+ /// use kreuzberg::{ExtractionConfig, api::serve_with_config};
262
+ ///
263
+ /// #[tokio::main]
264
+ /// async fn main() -> kreuzberg::Result<()> {
265
+ /// let config = ExtractionConfig::from_toml_file("config/kreuzberg.toml")?;
266
+ /// serve_with_config("127.0.0.1", 8000, config).await?;
267
+ /// Ok(())
268
+ /// }
269
+ /// ```
270
+ pub async fn serve_with_config(host: impl AsRef<str>, port: u16, config: ExtractionConfig) -> Result<()> {
271
+ let limits = ApiSizeLimits::default();
272
+ tracing::info!(
273
+ "Upload size limit: 100 MB (default, {} bytes)",
274
+ limits.max_request_body_bytes
275
+ );
276
+ serve_with_config_and_limits(host, port, config, limits).await
277
+ }
278
+
279
+ /// Start the API server with explicit config and size limits.
280
+ ///
281
+ /// # Arguments
282
+ ///
283
+ /// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0")
284
+ /// * `port` - Port number to bind to (e.g., 8000)
285
+ /// * `config` - Default extraction configuration for all requests
286
+ /// * `limits` - Size limits for request bodies and multipart uploads
287
+ ///
288
+ /// # Examples
289
+ ///
290
+ /// ```no_run
291
+ /// use kreuzberg::{ExtractionConfig, api::{serve_with_config_and_limits, ApiSizeLimits}};
292
+ ///
293
+ /// #[tokio::main]
294
+ /// async fn main() -> kreuzberg::Result<()> {
295
+ /// let config = ExtractionConfig::from_toml_file("config/kreuzberg.toml")?;
296
+ /// let limits = ApiSizeLimits::from_mb(200, 200);
297
+ /// serve_with_config_and_limits("127.0.0.1", 8000, config, limits).await?;
298
+ /// Ok(())
299
+ /// }
300
+ /// ```
301
+ pub async fn serve_with_config_and_limits(
302
+ host: impl AsRef<str>,
303
+ port: u16,
304
+ config: ExtractionConfig,
305
+ limits: ApiSizeLimits,
306
+ ) -> Result<()> {
307
+ let ip: IpAddr = host
308
+ .as_ref()
309
+ .parse()
310
+ .map_err(|e| crate::error::KreuzbergError::validation(format!("Invalid host address: {}", e)))?;
311
+
312
+ let addr = SocketAddr::new(ip, port);
313
+ let app = create_router_with_limits(config, limits);
314
+
315
+ tracing::info!("Starting Kreuzberg API server on http://{}:{}", ip, port);
316
+
317
+ let listener = tokio::net::TcpListener::bind(addr)
318
+ .await
319
+ .map_err(crate::error::KreuzbergError::Io)?;
320
+
321
+ axum::serve(listener, app)
322
+ .await
323
+ .map_err(|e| crate::error::KreuzbergError::Other(e.to_string()))?;
324
+
325
+ Ok(())
326
+ }
327
+
328
+ /// Start the API server with default host and port.
329
+ ///
330
+ /// Defaults: host = "127.0.0.1", port = 8000
331
+ ///
332
+ /// Uses config file discovery (searches current/parent directories for kreuzberg.toml/yaml/json).
333
+ pub async fn serve_default() -> Result<()> {
334
+ serve("127.0.0.1", 8000).await
335
+ }
336
+
337
#[cfg(test)]
mod tests {
    use super::*;

    /// Building a router from the default config must complete without panicking.
    #[test]
    fn test_create_router() {
        let _router = create_router(ExtractionConfig::default());
    }

    /// Smoke test: constructs a router and checks that the value has a non-zero size.
    /// Note that this does not inspect the registered routes themselves.
    #[test]
    fn test_router_has_routes() {
        let router = create_router(ExtractionConfig::default());
        let router_size = std::mem::size_of_val(&router);
        assert!(router_size > 0);
    }
}