kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
module Kreuzberg
  # Exception hierarchy for the Kreuzberg bindings.
  #
  # Every class in this module descends from {Error}, so
  # `rescue Kreuzberg::Errors::Error` catches anything defined here.
  module Errors
    # Base error class for all Kreuzberg errors
    class Error < StandardError; end

    # Raised when validation fails
    class ValidationError < Error; end

    # Raised when document parsing fails
    class ParsingError < Error
      # @return [Object, nil] extra detail supplied by whoever raised the error
      attr_reader :context

      # @param message [String, nil] error message; optional so that a bare
      #   `raise ParsingError` works (Ruby then uses the class name as the message)
      # @param context [Object, nil] optional extra detail, stored as-is
      def initialize(message = nil, context: nil)
        super(message)
        @context = context
      end
    end

    # Raised when OCR processing fails
    class OCRError < Error
      # @return [Object, nil] extra detail supplied by whoever raised the error
      attr_reader :context

      # @param message [String, nil] error message; optional so that a bare
      #   `raise OCRError` works (Ruby then uses the class name as the message)
      # @param context [Object, nil] optional extra detail, stored as-is
      def initialize(message = nil, context: nil)
        super(message)
        @context = context
      end
    end

    # Raised when a required dependency is missing
    class MissingDependencyError < Error
      # @return [Object, nil] identifier of the missing dependency, if given
      attr_reader :dependency

      # @param message [String, nil] error message; optional so that a bare
      #   `raise MissingDependencyError` works
      # @param dependency [Object, nil] identifier of the missing dependency, stored as-is
      def initialize(message = nil, dependency: nil)
        super(message)
        @dependency = dependency
      end
    end

    # Raised when an I/O operation fails
    #
    # NOTE: inside the Kreuzberg::Errors namespace this constant shadows Ruby's
    # built-in ::IOError. It does not inherit from it, so `rescue ::IOError`
    # will not catch this class.
    class IOError < Error; end

    # Raised when plugin operations fail
    class PluginError < Error; end

    # Raised when an unsupported file format or MIME type is encountered
    class UnsupportedFormatError < Error; end
  end
end
@@ -0,0 +1,84 @@
1
+ # frozen_string_literal: true
2
+
3
module Kreuzberg
  # Ruby-facing extraction entry points.
  #
  # Every public extraction method follows the same steps:
  #   1. normalize +config+ into a Hash ({#normalize_config}),
  #   2. forward it as keyword arguments to the matching +native_*+ method,
  #   3. wrap the returned hash(es) in {Result},
  #   4. pass the wrapped value and the options to +record_cache_entry!+.
  #
  # The +native_*+ methods, +Result+ and +record_cache_entry!+ are defined outside
  # this module and must be available on whatever object mixes it in. How each
  # +_sync+ method differs from its unsuffixed counterpart is decided by those
  # native methods and is not visible from this file.
  module ExtractionAPI
    # Extract a single file via +native_extract_file_sync+.
    #
    # @param path [#to_s] path of the file to extract
    # @param mime_type [#to_s, nil] MIME type to forward; omitted from the native call when nil
    # @param config [Hash, #to_h, nil] extraction options
    # @return [Result]
    def extract_file_sync(path, mime_type: nil, config: nil)
      opts = normalize_config(config)
      wrap_extraction(native_extract_file_sync(*native_file_args(path, mime_type), **opts), opts)
    end

    # Extract in-memory data via +native_extract_bytes_sync+.
    #
    # @param data [#to_s] raw document data
    # @param mime_type [#to_s] MIME type of +data+
    # @param config [Hash, #to_h, nil] extraction options
    # @return [Result]
    def extract_bytes_sync(data, mime_type, config: nil)
      opts = normalize_config(config)
      wrap_extraction(native_extract_bytes_sync(data.to_s, mime_type.to_s, **opts), opts)
    end

    # Extract several files via +native_batch_extract_files_sync+.
    #
    # @param paths [Array<#to_s>] paths of the files to extract
    # @param config [Hash, #to_h, nil] extraction options
    # @return [Array<Result>]
    def batch_extract_files_sync(paths, config: nil)
      opts = normalize_config(config)
      wrap_extractions(native_batch_extract_files_sync(paths.map(&:to_s), **opts), opts)
    end

    # Extract a single file via +native_extract_file+.
    #
    # @param path [#to_s] path of the file to extract
    # @param mime_type [#to_s, nil] MIME type to forward; omitted from the native call when nil
    # @param config [Hash, #to_h, nil] extraction options
    # @return [Result]
    def extract_file(path, mime_type: nil, config: nil)
      opts = normalize_config(config)
      wrap_extraction(native_extract_file(*native_file_args(path, mime_type), **opts), opts)
    end

    # Extract in-memory data via +native_extract_bytes+.
    #
    # @param data [#to_s] raw document data
    # @param mime_type [#to_s] MIME type of +data+
    # @param config [Hash, #to_h, nil] extraction options
    # @return [Result]
    def extract_bytes(data, mime_type, config: nil)
      opts = normalize_config(config)
      wrap_extraction(native_extract_bytes(data.to_s, mime_type.to_s, **opts), opts)
    end

    # Extract several files via +native_batch_extract_files+.
    #
    # @param paths [Array<#to_s>] paths of the files to extract
    # @param config [Hash, #to_h, nil] extraction options
    # @return [Array<Result>]
    def batch_extract_files(paths, config: nil)
      opts = normalize_config(config)
      wrap_extractions(native_batch_extract_files(paths.map(&:to_s), **opts), opts)
    end

    # Extract several in-memory documents via +native_batch_extract_bytes_sync+.
    #
    # @param data_array [Array<#to_s>] raw document data, one entry per document
    # @param mime_types [Array<#to_s>] MIME types; presumably one per entry of
    #   +data_array+ (this method does not check the lengths itself)
    # @param config [Hash, #to_h, nil] extraction options
    # @return [Array<Result>]
    def batch_extract_bytes_sync(data_array, mime_types, config: nil)
      opts = normalize_config(config)
      hashes = native_batch_extract_bytes_sync(data_array.map(&:to_s), mime_types.map(&:to_s), **opts)
      wrap_extractions(hashes, opts)
    end

    # Extract several in-memory documents via +native_batch_extract_bytes+.
    #
    # @param data_array [Array<#to_s>] raw document data, one entry per document
    # @param mime_types [Array<#to_s>] MIME types; presumably one per entry of
    #   +data_array+ (this method does not check the lengths itself)
    # @param config [Hash, #to_h, nil] extraction options
    # @return [Array<Result>]
    def batch_extract_bytes(data_array, mime_types, config: nil)
      opts = normalize_config(config)
      hashes = native_batch_extract_bytes(data_array.map(&:to_s), mime_types.map(&:to_s), **opts)
      wrap_extractions(hashes, opts)
    end

    # Turn a user-supplied config into the Hash that is splatted into the native call.
    #
    # @param config [Hash, #to_h, nil]
    # @return [Hash] +{}+ for nil, the same Hash instance when one is given,
    #   otherwise the result of +config.to_h+
    def normalize_config(config)
      return {} if config.nil?
      return config if config.is_a?(Hash)

      config.to_h
    end

    private

    # Positional arguments for the single-file native calls: the path, plus the
    # MIME type only when the caller supplied one.
    def native_file_args(path, mime_type)
      args = [path.to_s]
      args << mime_type.to_s if mime_type
      args
    end

    # Wrap one native result hash in a Result and report it to record_cache_entry!.
    def wrap_extraction(hash, opts)
      result = Result.new(hash)
      record_cache_entry!(result, opts)
      result
    end

    # Wrap each native result hash in a Result and report the whole list to
    # record_cache_entry! in a single call.
    def wrap_extractions(hashes, opts)
      results = hashes.map { |hash| Result.new(hash) }
      record_cache_entry!(results, opts)
      results
    end
  end
end
@@ -0,0 +1,186 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'open3'
4
+ require 'pathname'
5
+ require 'json'
6
+
7
module Kreuzberg
  # MCP (Model Context Protocol) server proxy
  #
  # Starts and manages the Kreuzberg MCP server for Claude Desktop integration.
  # The server is the `kreuzberg` CLI binary run with the `mcp` subcommand.
  #
  # @example Start MCP server
  #   server = Kreuzberg::MCPProxy::Server.new
  #   server.start
  #
  module MCPProxy
    Error = Class.new(Kreuzberg::Errors::Error)
    MissingBinaryError = Class.new(Error)
    ServerError = Class.new(Error)

    # MCP server instance
    #
    # Wraps one child process. In stdio mode the child's stdin/stdout/stderr are
    # pipes owned by this object; in SSE mode the child writes to this process's
    # $stdout/$stderr and only its pid is tracked.
    class Server
      attr_reader :pid, :transport

      # Initialize MCP server
      #
      # @param transport [String] Transport method ("stdio" or "sse")
      #
      def initialize(transport: 'stdio')
        @transport = transport
        @pid = nil
        @stdin = nil
        @stdout = nil
        @stderr = nil
      end

      # Start the MCP server
      #
      # @return [Integer, nil] Process ID (for SSE) or nil (for stdio)
      # @raise [ServerError] If a child is still running or the transport is unknown
      # @raise [MissingBinaryError] If the kreuzberg binary cannot be found
      #
      def start
        # Starting again would overwrite @pid and the pipes, orphaning the first child.
        raise ServerError, 'Server already running' if running?

        # Release pipes left behind by a previous child that exited on its own.
        close_pipes
        binary = MCPProxy.find_mcp_binary

        case @transport
        when 'stdio'
          start_stdio(binary)
        when 'sse'
          start_sse(binary)
        else
          raise ServerError, "Unknown transport: #{@transport}"
        end
      end

      # Stop the server
      #
      # Sends TERM to the child, waits for it, then forgets the pid and closes any
      # pipes. Safe to call when the server was never started or has already exited.
      #
      # @return [void]
      #
      def stop
        return unless @pid

        Process.kill('TERM', @pid)
        Process.wait(@pid)
      rescue Errno::ESRCH, Errno::ECHILD
        # Process already dead, or already reaped by the thread that watches it
        # (Open3's wait thread in stdio mode, Process.detach in SSE mode).
      ensure
        @pid = nil
        close_pipes
      end

      # Send a message to the server (stdio only)
      #
      # Writes the message as a single line of JSON and flushes immediately.
      #
      # @param message [Hash] JSON-RPC message
      # @return [void]
      # @raise [ServerError] If not in stdio mode or the server was not started
      #
      def send_message(message)
        raise ServerError, 'Can only send messages in stdio mode' unless @transport == 'stdio'
        raise ServerError, 'Server not started' unless @stdin

        @stdin.puts(JSON.generate(message))
        @stdin.flush
      end

      # Read a message from the server (stdio only)
      #
      # Blocks until the server writes one line, then parses it as JSON.
      #
      # @return [Hash, nil] JSON-RPC message, or nil when the server closed its stdout
      # @raise [ServerError] If not in stdio mode or the server was not started
      #
      def read_message
        raise ServerError, 'Can only read messages in stdio mode' unless @transport == 'stdio'
        raise ServerError, 'Server not started' unless @stdout

        line = @stdout.gets
        JSON.parse(line) if line
      end

      # Check if server is running
      #
      # Probes the tracked pid with signal 0. A pid that no longer exists (ESRCH)
      # or that this process may not signal (EPERM) is reported as not running.
      #
      # @return [Boolean]
      #
      def running?
        return false unless @pid

        Process.kill(0, @pid)
        true
      rescue Errno::ESRCH, Errno::EPERM
        false
      end

      private

      # Spawn the binary with piped stdin/stdout/stderr and remember its pid.
      #
      # NOTE(review): nothing in this class reads @stderr, so a child that writes
      # a lot to stderr could fill the pipe and block — consider draining it or
      # inheriting stderr.
      def start_stdio(binary)
        @stdin, @stdout, @stderr, wait_thr = Open3.popen3(binary.to_s, 'mcp', '--transport', 'stdio')
        @pid = wait_thr.pid
        nil
      end

      # Spawn the binary in SSE mode, sharing this process's stdout/stderr, and
      # detach it so it is reaped automatically when it exits.
      def start_sse(binary)
        @pid = spawn(
          binary.to_s,
          'mcp',
          '--transport', 'sse',
          out: $stdout,
          err: $stderr
        )
        Process.detach(@pid)
        sleep 1 # Give server time to start
        @pid
      end

      # Close every pipe that is open and forget all three. Each stream is closed
      # independently so that one failing close (e.g. a flush hitting a dead child)
      # cannot leave the remaining pipes open or the ivars stale.
      def close_pipes
        [@stdin, @stdout, @stderr].each do |io|
          begin
            io&.close
          rescue IOError, SystemCallError
            # Best-effort cleanup: the stream is unusable either way.
          end
        end
        @stdin = @stdout = @stderr = nil
      end
    end

    module_function

    # Run MCP server with a block
    #
    # Starts a server, yields it, and always stops it afterwards, even when the
    # block (or the start itself) raises.
    #
    # @param transport [String] Transport method
    # @yield [Server] Yields server instance
    # @return [Object] Block result
    #
    # @example
    #   Kreuzberg::MCPProxy.run(transport: 'stdio') do |server|
    #     server.send_message({ method: 'tools/list' })
    #     response = server.read_message
    #   end
    #
    def run(transport: 'stdio')
      server = Server.new(transport: transport)
      server.start
      yield server
    ensure
      server&.stop
    end

    # Find the MCP binary
    #
    # Picks the first candidate from CLIProxy.search_paths that is an existing file.
    #
    # @return [Pathname] Path to binary
    # @raise [MissingBinaryError] If not found
    #
    def find_mcp_binary
      # MCP is served by kreuzberg CLI
      binary_name = Gem.win_platform? ? 'kreuzberg.exe' : 'kreuzberg'
      found = CLIProxy.search_paths(binary_name).find(&:file?)
      return found if found

      raise MissingBinaryError, missing_binary_message
    end

    # Error message for missing binary
    #
    # @return [String]
    #
    def missing_binary_message
      <<~MSG.strip
        kreuzberg binary not found for MCP server. Build it with:
        `cargo build --release --package kreuzberg-cli`

        Or ensure kreuzberg is installed with MCP support.
      MSG
    end
  end
end
@@ -0,0 +1,113 @@
1
+ # frozen_string_literal: true
2
+
3
module Kreuzberg
  # Contract for OCR backends written in Ruby.
  #
  # Include this module in a class and override both methods. An instance can then
  # be registered under its name so that the Rust core reaches it through the FFI
  # bridge. Backends perform optical character recognition on images and scanned
  # documents and are invoked when OCR is enabled in the extraction configuration.
  # The implementations in this module only raise NotImplementedError.
  #
  # @example A minimal custom backend
  #   class CustomOcrBackend
  #     include Kreuzberg::OcrBackendProtocol
  #
  #     def name
  #       "custom-ocr"
  #     end
  #
  #     def process_image(image_bytes, config)
  #       # Placeholder - plug in a real OCR engine here
  #       my_ocr_engine.recognize(image_bytes, language: config["language"])
  #     end
  #   end
  #
  #   backend = CustomOcrBackend.new
  #   Kreuzberg.register_ocr_backend(backend.name, backend)
  #
  #   # Select the backend during extraction
  #   result = Kreuzberg.extract_file_sync(
  #     "scanned.pdf",
  #     config: { ocr: { backend: "custom-ocr", language: "eng" } }
  #   )
  #
  # @example A backend that loads its model lazily
  #   class ModelBasedOcr
  #     include Kreuzberg::OcrBackendProtocol
  #
  #     def initialize
  #       @model = nil
  #     end
  #
  #     def name
  #       "model-ocr"
  #     end
  #
  #     def process_image(image_bytes, config)
  #       # The model is loaded the first time an image is processed
  #       @model ||= load_model
  #       @model.recognize(image_bytes, config)
  #     end
  #
  #     private
  #
  #     def load_model
  #       MyOcrModel.load("path/to/model")
  #     end
  #   end
  #
  #   Kreuzberg.register_ocr_backend("model-ocr", ModelBasedOcr.new)
  #
  module OcrBackendProtocol
    # Unique identifier of this backend.
    #
    # The extraction config refers to the backend by this name:
    #
    #   config = { ocr: { backend: "custom-ocr", language: "eng" } }
    #
    # By convention the name is lowercase with hyphens (e.g. "custom-ocr", "tesseract").
    #
    # @return [String] Unique backend identifier
    # @raise [NotImplementedError] Unless the including class overrides it
    #
    # @example
    #   def name
    #     "custom-ocr"
    #   end
    def name
      complaint = "#{self.class} must implement #name"
      raise NotImplementedError, complaint
    end

    # Run OCR on raw image data and return the recognized text.
    #
    # The image arrives as binary data (PNG, JPEG, TIFF, etc.) together with a
    # configuration hash. That hash carries OCR settings such as:
    #   - "language" [String] - Language code (e.g., "eng", "deu", "fra")
    #   - "backend" [String] - Backend name (same as #name)
    #   - Additional backend-specific settings
    #
    # @param image_bytes [String] Binary image data (PNG, JPEG, TIFF, etc.)
    # @param config [Hash] OCR configuration ("language", "backend", ...)
    #
    # @return [String] Extracted text content
    # @raise [NotImplementedError] Unless the including class overrides it
    #
    # @example
    #   def process_image(image_bytes, config)
    #     language = config["language"] || "eng"
    #     my_ocr_engine.recognize(image_bytes, language: language)
    #   end
    def process_image(image_bytes, config)
      complaint = "#{self.class} must implement #process_image(image_bytes, config)"
      raise NotImplementedError, complaint
    end
  end
end
@@ -0,0 +1,86 @@
1
+ # frozen_string_literal: true
2
+
3
module Kreuzberg
  # Contract for post-processors written in Ruby.
  #
  # Include this module in a class and override {#call}. An instance can then be
  # registered so that the Rust core reaches it through the FFI bridge. A
  # post-processor runs after extraction has finished and may add metadata,
  # transform the content, or perform further analysis. The implementation in
  # this module only raises NotImplementedError.
  #
  # @example A processor that upcases the content
  #   class UpcaseProcessor
  #     include Kreuzberg::PostProcessorProtocol
  #
  #     def call(result)
  #       result["content"] = result["content"].upcase
  #       result
  #     end
  #   end
  #
  #   Kreuzberg.register_post_processor("upcase", UpcaseProcessor.new)
  #
  # @example A processor that adds metadata
  #   class EntityExtractor
  #     include Kreuzberg::PostProcessorProtocol
  #
  #     def call(result)
  #       result["metadata"]["entities"] = extract_entities(result["content"])
  #       result
  #     end
  #
  #     private
  #
  #     def extract_entities(text)
  #       # Placeholder - use a real NER library in production
  #       text.scan(/[A-Z][a-z]+(?:\s[A-Z][a-z]+)*/)
  #     end
  #   end
  #
  #   Kreuzberg.register_post_processor("entities", EntityExtractor.new)
  #
  # @example Registering a lambda instead of an object
  #   Kreuzberg.register_post_processor("word_count", ->(result) {
  #     result["metadata"]["word_count"] = result["content"].split.length
  #     result
  #   })
  #
  module PostProcessorProtocol
    # Enrich an extraction result and hand it back.
    #
    # Invoked once extraction has completed. The result arrives as a hash and the
    # (possibly modified) hash must be returned. An implementation may:
    #   - add new keys to result["metadata"]
    #   - transform result["content"]
    #   - add entries to result["tables"]
    #   - modify any other result field
    #
    # The FFI bridge is documented as not overwriting existing metadata keys, so
    # adding new keys should not cause conflicts.
    #
    # @param result [Hash] Extraction result with these keys:
    #   - "content" [String] - Extracted text content
    #   - "mime_type" [String] - MIME type of the source document
    #   - "metadata" [Hash] - Document metadata (title, author, etc.)
    #   - "tables" [Array<Hash>] - Extracted tables
    #   - "detected_languages" [Array<String>, nil] - Detected language codes
    #   - "chunks" [Array<String>, nil] - Content chunks (if chunking enabled)
    #
    # @return [Hash] Modified extraction result with enriched metadata
    # @raise [NotImplementedError] Unless the including class overrides it
    #
    # @example
    #   def call(result)
    #     result["metadata"]["entities"] = extract_entities(result["content"])
    #     result
    #   end
    def call(result)
      complaint = "#{self.class} must implement #call(result)"
      raise NotImplementedError, complaint
    end
  end
end