kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
data/extconf.rb ADDED
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'mkmf'
4
+ require 'rb_sys/mkmf'
5
+ require 'rbconfig'
6
+
7
# Windows-only bindgen setup: when the host OS is mswin/mingw and RI_DEVKIT is
# set, append a GNU target triple and a sysroot (RI_DEVKIT + MSYSTEM_PREFIX,
# defaulting to /ucrt64, with backslashes turned into forward slashes) to
# BINDGEN_EXTRA_CLANG_ARGS, keeping arguments that are already present.
devkit = ENV.fetch('RI_DEVKIT', nil) if /mswin|mingw/.match?(RbConfig::CONFIG['host_os'])

if devkit
  msys_prefix = ENV['MSYSTEM_PREFIX'] || '/ucrt64'
  sysroot = "#{devkit}#{msys_prefix}".tr('\\\\', '/')

  clang_args = ENV['BINDGEN_EXTRA_CLANG_ARGS'].to_s.split(/\s+/)
  clang_args << '--target=x86_64-pc-windows-gnu'
  clang_args << "--sysroot=#{sysroot}"
  ENV['BINDGEN_EXTRA_CLANG_ARGS'] = clang_args.uniq.join(' ')
end

# Cargo profile used for the build; overridable through CARGO_PROFILE.
default_profile = ENV.fetch('CARGO_PROFILE', 'release')

# Generate the Makefile for the native crate that lives under ext/kreuzberg_rb/native.
create_rust_makefile('kreuzberg_rb') do |config|
  config.profile = default_profile.to_sym
  config.ext_dir = File.expand_path('ext/kreuzberg_rb/native', __dir__)
end
data/kreuzberg.gemspec ADDED
@@ -0,0 +1,105 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/kreuzberg/version'
4
+
5
repo_root = File.expand_path('../..', __dir__)

# Lists the git-tracked paths under +prefix+, relative to repo_root.
#
# git is started with an argument vector (no shell), so unusual characters in
# the repository path cannot break or alter the command, and its stderr is
# discarded. Returns an empty array when git cannot be started or prints nothing.
tracked_under = lambda do |prefix|
  listing = IO.popen(['git', '-C', repo_root, 'ls-files', '-z', prefix], err: File::NULL, &:read)
  listing.to_s.split("\x0").select { |path| path.start_with?(prefix) }
rescue SystemCallError
  []
end

# Include files from packages/ruby
ruby_prefix = 'packages/ruby/'
ruby_files = tracked_under.call(ruby_prefix).map { |path| path.delete_prefix(ruby_prefix) }

# Include the kreuzberg core crate (needed for path patch in Cargo.toml);
# crates/kreuzberg/... is listed as vendor/kreuzberg/...
core_prefix = 'crates/kreuzberg/'
core_files = tracked_under.call(core_prefix).map { |path| "vendor/#{path.delete_prefix('crates/')}" }

# Glob-based file list, used only when git yields nothing. Kept in a lambda so
# the directory walk is skipped entirely when the git listing is available.
glob_fallback = lambda do
  ruby_fallback = Dir.chdir(__dir__) do
    Dir.glob(
      %w[
        README.md
        LICENSE
        ext/**/*.rs
        ext/**/*.rb
        ext/**/*.toml
        ext/**/*.lock
        ext/**/*.md
        ext/**/build.rs
        ext/**/Cargo.*
        exe/*
        lib/**/*.rb
        sig/**/*.rbs
        spec/**/*.rb
      ],
      File::FNM_DOTMATCH
    )
  end

  # Fallback for core crate - globbed from the repo root
  core_fallback = Dir.chdir(repo_root) do
    Dir.glob('crates/kreuzberg/**/*', File::FNM_DOTMATCH)
       .reject { |f| File.directory?(f) }
       .map { |path| "vendor/#{path.delete_prefix('crates/')}" }
  end

  ruby_fallback + core_fallback
end

tracked_files = ruby_files + core_files
files = tracked_files.empty? ? glob_fallback.call : tracked_files

Gem::Specification.new do |spec|
  spec.name = 'kreuzberg'
  spec.version = Kreuzberg::VERSION
  spec.authors = ['Na\'aman Hirschfeld']
  spec.email = ['nhirschfeld@gmail.com']

  spec.summary = 'High-performance document intelligence framework'
  spec.description = <<~DESC
    Kreuzberg is a multi-language document intelligence framework with a high-performance
    Rust core. Supports extraction, OCR, chunking, and language detection for 30+ file formats
    including PDF, DOCX, PPTX, XLSX, images, and more.
  DESC
  spec.homepage = 'https://github.com/Goldziher/kreuzberg'
  spec.license = 'MIT'
  spec.required_ruby_version = '>= 3.2.0'

  spec.metadata = {
    'homepage_uri' => spec.homepage,
    'source_code_uri' => 'https://github.com/Goldziher/kreuzberg',
    'changelog_uri' => 'https://github.com/Goldziher/kreuzberg/blob/main/CHANGELOG.md',
    'documentation_uri' => 'https://docs.kreuzberg.dev',
    'bug_tracker_uri' => 'https://github.com/Goldziher/kreuzberg/issues',
    'rubygems_mfa_required' => 'true',
    'keywords' => 'document-intelligence,document-extraction,ocr,rust,bindings'
  }

  spec.files = files
  spec.bindir = 'exe'
  # Executables are the basenames of every packaged file under exe/.
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']
  spec.extensions = ['ext/kreuzberg_rb/extconf.rb']

  # Runtime dependencies
  # None - the gem is self-contained with the Rust extension

  # Development dependencies
  spec.add_development_dependency 'bundler', '~> 2.0'
  spec.add_development_dependency 'rake', '~> 13.0'
  spec.add_development_dependency 'rake-compiler', '~> 1.2'
  spec.add_development_dependency 'rb_sys', '~> 0.9'
  spec.add_development_dependency 'rspec', '~> 3.12'
  # Linting and type-checking tools are only declared on non-Windows platforms.
  unless Gem.win_platform?
    spec.add_development_dependency 'rbs', '~> 3.0'
    spec.add_development_dependency 'rubocop', '~> 1.66'
    spec.add_development_dependency 'rubocop-performance', '~> 1.21'
    spec.add_development_dependency 'rubocop-rspec', '~> 3.0'
    spec.add_development_dependency 'steep', '~> 1.8'
  end
  spec.add_development_dependency 'yard', '~> 0.9'
end
@@ -0,0 +1,142 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'open3'
4
+ require 'pathname'
5
+
6
+ module Kreuzberg
7
+ # API server proxy
8
+ #
9
+ # Starts and manages the Kreuzberg API server (Litestar/Python-based or Rust-based).
10
+ #
11
+ # @example Start the server
12
+ # server = Kreuzberg::APIProxy::Server.new(port: 8000)
13
+ # server.start
14
+ # # Server runs in background
15
+ # server.stop
16
+ #
17
+ # @example With block
18
+ # Kreuzberg::APIProxy.run(port: 8000) do |server|
19
+ # # Server runs while block executes
20
+ # response = Net::HTTP.get(URI('http://localhost:8000/health'))
21
+ # end
22
+ #
23
+ module APIProxy
24
+ Error = Class.new(Kreuzberg::Errors::Error)
25
+ MissingBinaryError = Class.new(Error)
26
+ ServerError = Class.new(Error)
27
+
28
# Handle for one API server child process.
#
# The server is launched by running the kreuzberg binary with the +api+
# subcommand. The thread returned by Process.detach is kept so that liveness
# checks and shutdown refer to the spawned child itself, not to whichever
# process happens to hold its PID after the child has been reaped.
class Server
  attr_reader :port, :host, :pid

  # Initialize server
  #
  # @param port [Integer] Port to run on (default: 8000)
  # @param host [String] Host to bind to (default: "0.0.0.0")
  #
  def initialize(port: 8000, host: '0.0.0.0')
    @port = port
    @host = host
    @pid = nil
    @process = nil # Process.detach thread of the running child, if any
  end

  # Start the server in the background
  #
  # Spawns the binary with stdout/stderr wired to this process, detaches it,
  # and waits one second before checking that the child is still alive.
  #
  # @return [Integer] Process ID
  # @raise [MissingBinaryError] If the kreuzberg binary cannot be located
  # @raise [ServerError] If the child has already exited after the startup wait
  #
  def start
    binary = APIProxy.find_api_binary
    @pid = spawn(
      binary.to_s,
      'api',
      '--host', @host,
      '--port', @port.to_s,
      out: $stdout,
      err: $stderr
    )
    @process = Process.detach(@pid)
    sleep 1 # Give server time to start

    unless @process.alive?
      exited_pid = @pid
      @pid = nil
      @process = nil
      raise ServerError, "kreuzberg API server exited during startup (pid #{exited_pid})"
    end

    @pid
  end

  # Stop the server
  #
  # Sends TERM and waits for the child to exit. When the detach thread shows
  # the child was already reaped, no signal is sent, because the PID may have
  # been reused by an unrelated process.
  #
  # @return [void]
  #
  def stop
    return unless @pid

    if @process
      if @process.alive?
        Process.kill('TERM', @pid)
        @process.join
      end
    else
      Process.kill('TERM', @pid)
      Process.wait(@pid)
    end
    nil
  rescue Errno::ESRCH, Errno::ECHILD
    # Process already dead
    nil
  ensure
    @pid = nil
    @process = nil
  end

  # Check if server is running
  #
  # Uses the detach thread when one exists; otherwise probes the PID with
  # signal 0.
  #
  # @return [Boolean]
  #
  def running?
    return false unless @pid
    return @process.alive? if @process

    Process.kill(0, @pid)
    true
  rescue Errno::ESRCH, Errno::EPERM
    false
  end
end
92
+
93
+ module_function
94
+
95
# Start a server, hand it to the block, and stop it when the block finishes
#
# The server is stopped whether the block returns normally or raises.
#
# @param port [Integer] Port to run on
# @param host [String] Host to bind to
# @yield [Server] Yields the started server instance
# @return [Object] Block result
#
# @example
#   Kreuzberg::APIProxy.run(port: 8000) do |server|
#     # Make API requests
#   end
#
def run(port: 8000, host: '0.0.0.0')
  server = nil
  begin
    server = Server.new(port: port, host: host)
    server.start
    yield(server)
  ensure
    server&.stop
  end
end
114
+
115
# Locate the binary used to serve the API
#
# Asks CLIProxy.search_paths for candidate locations of the kreuzberg
# executable (kreuzberg.exe on Windows) and returns the first candidate that
# is an existing file.
#
# @return [Pathname] Path to binary
# @raise [MissingBinaryError] If not found
#
def find_api_binary
  # API might be served by kreuzberg CLI or a separate binary
  executable = Gem.win_platform? ? 'kreuzberg.exe' : 'kreuzberg'
  located = CLIProxy.search_paths(executable).find { |candidate| candidate.file? }
  raise MissingBinaryError, missing_binary_message unless located

  located
end
128
+
129
# Text of the error raised when no binary for the API server can be found
#
# @return [String]
#
def missing_binary_message
  [
    'kreuzberg binary not found for API server. Build it with:',
    '`cargo build --release --package kreuzberg-cli`',
    '',
    'Or ensure kreuzberg is installed with API support.'
  ].join("\n")
end
141
+ end
142
+ end
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
module Kreuzberg
  # Cache maintenance helpers layered over the native cache functions.
  #
  # The object this module is mixed into is expected to provide
  # +native_clear_cache+ and +native_cache_stats+ (not defined here). On top of
  # the native numbers the module keeps a small tracker in +@__cache_tracker+
  # holding +:entries+ and +:bytes+ counters, which is created on first use if
  # the host has not set it up already.
  module CacheAPI
    # Clear the native cache and zero the tracker counters.
    #
    # @return [nil]
    def clear_cache
      native_clear_cache
      reset_cache_tracker!
    end

    # Native cache statistics with the tracker counters added in.
    #
    # The totals are read from either the string or the symbol key of the
    # native hash (treated as 0 when absent) and written back under both.
    #
    # @return [Hash] the hash returned by +native_cache_stats+, updated in place
    def cache_stats
      stats = native_cache_stats
      tracker = __cache_tracker
      total_entries = (stats['total_entries'] || stats[:total_entries] || 0) + tracker[:entries]
      total_size = (stats['total_size_bytes'] || stats[:total_size_bytes] || 0) + tracker[:bytes]

      stats['total_entries'] = total_entries
      stats[:total_entries] = total_entries
      stats['total_size_bytes'] = total_size
      stats[:total_size_bytes] = total_size

      stats
    end

    private

    # Count one or more results in the tracker.
    #
    # Nothing is recorded when +opts[:use_cache]+ is present and falsy. Items
    # that do not respond to +content+ are skipped; for the rest, the byte size
    # of +content.to_s+ is added to the byte counter.
    #
    # @param results [Object, Array<Object>] a single result or an array of results
    # @param opts [Hash] options; only +:use_cache+ is read (defaults to true)
    # @return [void]
    def record_cache_entry!(results, opts)
      return unless opts.fetch(:use_cache, true)

      tracker = __cache_tracker
      results_array = results.is_a?(Array) ? results : [results]
      results_array.each do |result|
        # @type var result: Result
        next unless result.respond_to?(:content)

        tracker[:entries] += 1
        tracker[:bytes] += result.content.to_s.bytesize
      end
    end

    # Zero both tracker counters.
    #
    # @return [nil]
    def reset_cache_tracker!
      tracker = __cache_tracker
      tracker[:entries] = 0
      tracker[:bytes] = 0
      nil
    end

    # Tracker hash, created lazily so the module does not depend on the host
    # having initialized +@__cache_tracker+ beforehand.
    #
    # @return [Hash{Symbol => Integer}]
    def __cache_tracker
      @__cache_tracker ||= { entries: 0, bytes: 0 }
    end
  end
end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
module Kreuzberg
  # Ruby-level convenience wrapper around the Kreuzberg command-line tool
  #
  # Each method assembles an argument list and delegates to CLIProxy.call.
  #
  # @example Extract a file
  #   Kreuzberg::CLI.extract('document.pdf', output: 'text')
  #
  # @example Detect file type
  #   mime_type = Kreuzberg::CLI.detect('document.pdf')
  #
  module CLI
    module_function

    # Run the CLI `extract` command on a file
    #
    # @param path [String] Path to the file
    # @param output [String] Output format ("text", "json", "markdown")
    # @param ocr [Boolean] Enable OCR (appends `--ocr`)
    # @return [String] Extracted content, exactly as returned by CLIProxy.call
    #
    def extract(path, output: 'text', ocr: false)
      base_args = ['extract', path, '--output', output]
      CLIProxy.call(ocr ? base_args + ['--ocr'] : base_args)
    end

    # Run the CLI `detect` command on a file
    #
    # @param path [String] Path to the file
    # @return [String] MIME type (CLI output with surrounding whitespace removed)
    #
    def detect(path)
      raw = CLIProxy.call(['detect', path])
      raw.strip
    end

    # Report the CLI version (`--version`)
    #
    # @return [String] Version string with surrounding whitespace removed
    #
    def version
      raw = CLIProxy.call(['--version'])
      raw.strip
    end

    # Fetch the CLI help text (`--help`)
    #
    # @return [String] Help text, unmodified
    #
    def help
      CLIProxy.call(['--help'])
    end
  end
end
@@ -0,0 +1,127 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'open3'
4
+ require 'pathname'
5
+
6
+ module Kreuzberg
7
+ # CLI binary proxy
8
+ #
9
+ # Provides access to the Kreuzberg CLI binary built from crates/kreuzberg-cli.
10
+ #
11
+ # @example
12
+ # output = Kreuzberg::CLIProxy.call(['extract', 'document.pdf'])
13
+ # puts output
14
+ #
15
+ module CLIProxy
16
+ Error = Class.new(Kreuzberg::Errors::Error)
17
+ MissingBinaryError = Class.new(Error)
18
+
19
# Raised when the CLI run fails; carries what the process wrote to standard
# error together with the status value supplied by the raiser.
class CLIExecutionError < Error
  # @return [String] captured standard error
  attr_reader :stderr
  # @return [Integer, nil] status value given at construction
  attr_reader :status

  # @param message [String] error message
  # @param stderr [String] captured standard error
  # @param status [Integer, nil] status value to expose
  def initialize(message, stderr:, status:)
    @stderr = stderr
    @status = status
    super(message)
  end
end
29
+
30
+ module_function
31
+
32
# Execute the Kreuzberg CLI with given arguments
#
# Every argument is converted with +to_s+ and passed to the binary as a
# separate argv entry (no shell is involved).
#
# @param argv [Array<String>] Command-line arguments
# @return [String] Standard output from the CLI
# @raise [CLIExecutionError] If the CLI exits with non-zero status or is
#   terminated by a signal (in which case +status+ is nil and the message
#   names the signal)
# @raise [MissingBinaryError] If the CLI binary is not found
#
# @example Extract a file
#   output = Kreuzberg::CLIProxy.call(['extract', 'document.pdf'])
#
# @example Detect file type
#   output = Kreuzberg::CLIProxy.call(['detect', 'document.pdf'])
#
def call(argv)
  binary = find_cli_binary
  args = Array(argv).map(&:to_s)
  stdout, stderr, status = Open3.capture3(binary.to_s, *args)
  return stdout if status.success?

  # exitstatus is nil when the process was killed by a signal; say so instead
  # of printing an empty status.
  outcome =
    if status.exitstatus
      "exited with status #{status.exitstatus}"
    else
      "was terminated by signal #{status.termsig}"
    end

  raise CLIExecutionError.new(
    "kreuzberg CLI #{outcome}",
    stderr: stderr,
    status: status.exitstatus
  )
end
57
+
58
# Locate the kreuzberg CLI binary
#
# Walks the candidates produced by search_paths for the platform-specific
# executable name (kreuzberg.exe on Windows, kreuzberg elsewhere) and returns
# the first one that is an existing file.
#
# @return [Pathname] Path to the CLI binary
# @raise [MissingBinaryError] If binary not found
#
def find_cli_binary
  executable = Gem.win_platform? ? 'kreuzberg.exe' : 'kreuzberg'
  located = search_paths(executable).find { |candidate| candidate.file? }
  raise MissingBinaryError, missing_binary_message unless located

  located
end
75
+
76
# Root of the Ruby package: two directory levels above this file's directory
# (memoized after the first call)
#
# @return [Pathname] Root path
#
def root_path
  return @root_path if @root_path

  here = Pathname(__dir__ || '.')
  @root_path = here.join('../..').expand_path
end
83
+
84
# The lib directory: one level above this file's directory (memoized after the
# first call)
#
# @return [Pathname] Lib path
#
def lib_path
  return @lib_path if @lib_path

  here = Pathname(__dir__ || '.')
  @lib_path = here.join('..').expand_path
end
91
+
92
# Candidate locations for the CLI binary, in lookup order
#
# 1. lib/bin and lib itself (for packaged gems)
# 2. crates/kreuzberg-cli/target/release under the workspace root
# 3. target/release under the workspace root
#
# The workspace root is taken to be two levels above root_path. Pathname#parent
# always returns a Pathname, so no nil guard is needed, and the workspace
# target/release directory is listed once.
#
# @param binary_name [String] Name of the binary
# @return [Array<Pathname>] List of paths to search
#
def search_paths(binary_name)
  workspace_root = root_path.parent.parent

  [
    # In lib/bin (for packaged gems)
    lib_path.join('bin', binary_name),
    lib_path.join(binary_name),
    # In local development (packages/ruby)
    workspace_root.join('crates/kreuzberg-cli/target/release', binary_name),
    workspace_root.join('target/release', binary_name)
  ]
end
113
+
114
# Text of the error raised when the CLI binary cannot be found
#
# @return [String] Error message
#
def missing_binary_message
  [
    'kreuzberg CLI binary not found. Build it with:',
    '`cargo build --release --package kreuzberg-cli`',
    '',
    'Or install the gem with pre-built binaries.'
  ].join("\n")
end
126
+ end
127
+ end