kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,283 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Configuration validation tests
4
+
5
RSpec.describe 'Configuration Validation' do
  # Extraction is the top-level config: scalar flags plus nested config objects,
  # which may be supplied either as config instances or as plain hashes.
  describe Kreuzberg::Config::Extraction do
    it 'accepts all valid parameters' do
      nested = {
        ocr: Kreuzberg::Config::OCR.new,
        chunking: Kreuzberg::Config::Chunking.new,
        language_detection: Kreuzberg::Config::LanguageDetection.new,
        pdf_options: Kreuzberg::Config::PDF.new
      }
      extraction = described_class.new(
        use_cache: true,
        enable_quality_processing: false,
        force_ocr: false,
        **nested
      )

      expect(extraction.use_cache).to be(true)
      expect(extraction.enable_quality_processing).to be(false)
      expect(extraction.force_ocr).to be(false)
      # Each nested attribute must come back as an instance of the class that was passed in.
      nested.each do |attribute, given|
        expect(extraction.public_send(attribute)).to be_a_kind_of(given.class)
      end
    end

    it 'accepts hashes for nested configs' do
      extraction = described_class.new(
        ocr: { backend: 'tesseract', language: 'eng' },
        chunking: { max_chars: 500 }
      )

      # Plain hashes are expected to be wrapped into their config classes.
      expect(extraction.ocr).to be_a_kind_of(Kreuzberg::Config::OCR)
      expect(extraction.ocr.backend).to eq('tesseract')
      expect(extraction.chunking).to be_a_kind_of(Kreuzberg::Config::Chunking)
      expect(extraction.chunking.max_chars).to eq(500)
    end

    it 'validates ocr config type' do
      expect { described_class.new(ocr: 'invalid') }
        .to raise_error(ArgumentError, /Expected.*OCR/)
    end

    it 'validates chunking config type' do
      expect { described_class.new(chunking: 'invalid') }
        .to raise_error(ArgumentError, /Expected.*Chunking/)
    end

    it 'converts to hash correctly' do
      serialized = described_class.new(use_cache: false, force_ocr: true).to_h

      expect(serialized).to be_a_kind_of(Hash)
      expect(serialized[:use_cache]).to be(false)
      expect(serialized[:force_ocr]).to be(true)
    end

    it 'omits nil values from hash' do
      serialized = described_class.new.to_h

      # NOTE(review): a nil lookup does not distinguish "key omitted" from
      # "key present with nil value"; kept as-is to preserve the original check.
      expect(serialized[:ocr]).to be_nil
      expect(serialized[:chunking]).to be_nil
    end

    it 'accepts html options hashes' do
      extraction = described_class.new(html_options: { heading_style: :atx, wrap: true })

      expect(extraction.html_options).to be_a_kind_of(Kreuzberg::Config::HtmlOptions)
      expect(extraction.html_options.to_h[:heading_style]).to eq(:atx)
    end

    it 'accepts keyword configurations' do
      keyword_config = Kreuzberg::Config::Keywords.new(algorithm: :yake, max_keywords: 5)
      extraction = described_class.new(keywords: keyword_config, max_concurrent_extractions: 4)

      expect(extraction.keywords).to be_a_kind_of(Kreuzberg::Config::Keywords)
      expect(extraction.max_concurrent_extractions).to eq(4)
    end
  end

  # OCR backend selection, language, and optional Tesseract-specific settings.
  describe Kreuzberg::Config::OCR do
    it 'has sensible defaults' do
      ocr = described_class.new

      expect(ocr.backend).to eq('tesseract')
      expect(ocr.language).to eq('eng')
      expect(ocr.tesseract_config).to be_nil
    end

    it 'accepts custom values' do
      ocr = described_class.new(backend: 'easyocr', language: 'deu')

      expect(ocr.backend).to eq('easyocr')
      expect(ocr.language).to eq('deu')
    end

    it 'coerces types correctly' do
      # A Symbol backend and an Integer language are both expected to become Strings.
      ocr = described_class.new(backend: :tesseract, language: 123)

      expect(ocr.backend).to eq('tesseract')
      expect(ocr.language).to eq('123')
    end

    it 'accepts tesseract config hashes' do
      tesseract_options = { psm: 6, enable_table_detection: true }
      ocr = described_class.new(tesseract_config: tesseract_options)

      expect(ocr.tesseract_config).to be_a_kind_of(Kreuzberg::Config::Tesseract)
      expect(ocr.tesseract_config.to_h[:psm]).to eq(6)
    end
  end

  # Chunk sizing, presets, and embedding settings.
  describe Kreuzberg::Config::Chunking do
    it 'has sensible defaults' do
      chunking = described_class.new

      expect(chunking.max_chars).to eq(1000)
      expect(chunking.max_overlap).to eq(200)
      expect(chunking.preset).to be_nil
    end

    it 'accepts custom chunk sizes' do
      chunking = described_class.new(max_chars: 500, max_overlap: 100)

      expect(chunking.max_chars).to eq(500)
      expect(chunking.max_overlap).to eq(100)
    end

    it 'supports different strategies' do
      chunking = described_class.new(preset: 'fast')

      expect(chunking.preset).to eq('fast')
    end

    it 'accepts embedding configs' do
      embedding_options = { model: { type: :preset, name: 'quality' }, normalize: false }
      chunking = described_class.new(embedding: embedding_options)

      expect(chunking.embedding).to be_a_kind_of(Kreuzberg::Config::Embedding)
      expect(chunking.embedding.to_h[:model]).to include(type: :preset, name: 'quality')
    end
  end

  # Language detection toggle, confidence threshold, and multi-language flag.
  describe Kreuzberg::Config::LanguageDetection do
    it 'has sensible defaults' do
      detection = described_class.new

      expect(detection.enabled).to be(false)
      expect(detection.min_confidence).to eq(0.5)
    end

    it 'accepts custom confidence thresholds' do
      detection = described_class.new(enabled: true, min_confidence: 0.9)

      expect(detection.enabled).to be(true)
      expect(detection.min_confidence).to eq(0.9)
    end

    it 'coerces confidence to float' do
      # A numeric String is expected to be converted to a Float.
      detection = described_class.new(min_confidence: '0.75')

      expect(detection.min_confidence).to eq(0.75)
    end

    it 'supports detect_multiple flag' do
      detection = described_class.new(detect_multiple: true)

      expect(detection.detect_multiple).to be(true)
      expect(detection.to_h[:detect_multiple]).to be(true)
    end
  end

  # PDF-specific options: image extraction, passwords, metadata extraction.
  describe Kreuzberg::Config::PDF do
    it 'has sensible defaults' do
      pdf = described_class.new

      expect(pdf.extract_images).to be(false)
      expect(pdf.passwords).to be_nil
      expect(pdf.extract_metadata).to be(true)
    end

    it 'accepts custom values' do
      pdf = described_class.new(extract_images: true, passwords: ['secret123'])

      expect(pdf.extract_images).to be(true)
      expect(pdf.passwords).to eq(['secret123'])
    end

    it 'converts password to string' do
      # A single non-String password is expected to become a one-element String array.
      pdf = described_class.new(passwords: 12_345)

      expect(pdf.passwords).to eq(['12345'])
    end
  end

  # HTML conversion options, including nested preprocessing settings.
  describe Kreuzberg::Config::HtmlOptions do
    it 'normalizes preprocessing settings' do
      html = described_class.new(
        heading_style: :atx_closed,
        preprocessing: { enabled: true, preset: :standard }
      )
      serialized = html.to_h

      expect(serialized[:heading_style]).to eq(:atx_closed)
      expect(serialized[:preprocessing]).to include(preset: :standard)
    end
  end

  # Keyword extraction settings.
  describe Kreuzberg::Config::Keywords do
    it 'accepts hash arguments' do
      keyword_config = described_class.new(
        algorithm: :yake,
        max_keywords: 10,
        ngram_range: [1, 3],
        yake_params: { window_size: 4 }
      )

      # The Symbol algorithm is expected to be serialized as a String.
      expect(keyword_config.to_h[:algorithm]).to eq('yake')
      expect(keyword_config.to_h[:yake_params]).to eq(window_size: 4)
    end
  end

  # End-to-end smoke checks: each config shape must be accepted by a real extraction call.
  describe 'config usage in extraction' do
    it 'works with OCR config' do
      fixture_path = create_test_file('OCR config test')
      extraction_config = Kreuzberg::Config::Extraction.new(
        ocr: Kreuzberg::Config::OCR.new(backend: 'tesseract', language: 'eng')
      )

      extracted = Kreuzberg.extract_file_sync(fixture_path, config: extraction_config)

      expect(extracted).to be_a_kind_of(Kreuzberg::Result)
    end

    it 'works with chunking config' do
      fixture_path = create_test_file('Chunking config test' * 50)
      extraction_config = Kreuzberg::Config::Extraction.new(
        chunking: Kreuzberg::Config::Chunking.new(max_chars: 50)
      )

      extracted = Kreuzberg.extract_file_sync(fixture_path, config: extraction_config)

      expect(extracted).to be_a_kind_of(Kreuzberg::Result)
    end

    it 'works with language detection config' do
      fixture_path = create_test_file('Language detection test')
      extraction_config = Kreuzberg::Config::Extraction.new(
        language_detection: Kreuzberg::Config::LanguageDetection.new(enabled: true)
      )

      extracted = Kreuzberg.extract_file_sync(fixture_path, config: extraction_config)

      expect(extracted).to be_a_kind_of(Kreuzberg::Result)
    end

    it 'works with combined configs' do
      fixture_path = create_test_file('Combined config test')
      extraction_config = Kreuzberg::Config::Extraction.new(
        use_cache: false,
        force_ocr: false,
        ocr: { backend: 'tesseract', language: 'eng' },
        language_detection: { enabled: false }
      )

      extracted = Kreuzberg.extract_file_sync(fixture_path, config: extraction_config)

      expect(extracted).to be_a_kind_of(Kreuzberg::Result)
    end
  end
end
@@ -0,0 +1,213 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Error handling and exception mapping tests
4
+
5
RSpec.describe 'Error Handling' do
  # Nested OCR payload (string keys) attached to the image fixture below.
  let(:nested_ocr_result) do
    {
      'content' => 'ocr text',
      'mime_type' => 'text/plain',
      'metadata_json' => '{}',
      'tables' => []
    }
  end

  # Result payload carrying one extracted image; the image data deliberately
  # contains a NUL byte so the binary-encoding expectation is meaningful.
  let(:image_result_payload) do
    {
      content: 'Test',
      mime_type: 'text/plain',
      images: [
        {
          'data' => "binary\0data",
          'format' => 'png',
          'image_index' => 0,
          'page_number' => 1,
          'width' => 100,
          'height' => 200,
          'colorspace' => 'RGB',
          'bits_per_component' => 8,
          'is_mask' => false,
          'description' => 'inline image',
          'ocr_result' => nested_ocr_result
        }
      ]
    }
  end

  # Runs the block and returns either its value or the StandardError it raised.
  # Used by examples where both outcomes are acceptable, so that the accepted
  # outcomes can be stated in a single expectation.
  def value_or_error
    yield
  rescue StandardError => e
    e
  end

  describe 'file not found errors' do
    it 'raises error for non-existent file' do
      expect do
        Kreuzberg.extract_file_sync('/nonexistent/path/file.txt')
      end.to raise_error(StandardError)
    end

    it 'raises error for empty path' do
      expect do
        Kreuzberg.extract_file_sync('')
      end.to raise_error(StandardError)
    end

    it 'raises error for nil path' do
      expect do
        Kreuzberg.extract_file_sync(nil)
      end.to raise_error(StandardError)
    end
  end

  describe 'invalid MIME type handling' do
    it 'handles unknown MIME types' do
      path = create_test_file('Unknown MIME')

      # Implementation may either handle gracefully or raise error for unknown MIME types
      outcome = value_or_error do
        Kreuzberg.extract_file_sync(path, mime_type: 'application/x-unknown-type')
      end

      expect(outcome).to be_a(Kreuzberg::Result).or be_a(StandardError)
    end
  end

  describe 'invalid configuration' do
    it 'raises error for invalid ocr config' do
      expect do
        Kreuzberg::Config::Extraction.new(ocr: 'invalid')
      end.to raise_error(ArgumentError)
    end

    it 'raises error for invalid chunking config' do
      expect do
        Kreuzberg::Config::Extraction.new(chunking: 123)
      end.to raise_error(ArgumentError)
    end

    it 'raises error for invalid language_detection config' do
      expect do
        Kreuzberg::Config::Extraction.new(language_detection: [])
      end.to raise_error(ArgumentError)
    end

    it 'raises error for invalid pdf_options config' do
      expect do
        Kreuzberg::Config::Extraction.new(pdf_options: 'invalid')
      end.to raise_error(ArgumentError)
    end
  end

  describe 'error context' do
    it 'provides meaningful error messages' do
      # Asserted via raise_error so that a call which unexpectedly succeeds fails
      # the example. A method-level `rescue StandardError` around a sentinel
      # `raise` would rescue the sentinel too and let the example pass.
      expect do
        Kreuzberg.extract_file_sync('/nonexistent/file.pdf')
      end.to raise_error(StandardError) { |error| expect(error.message).not_to be_empty }
    end
  end

  describe 'batch extraction errors' do
    it 'handles mixed valid and invalid files' do
      files = [
        create_test_file('Valid'),
        '/definitely/nonexistent/file.txt'
      ]

      # Implementation may either raise error or handle gracefully
      outcome = value_or_error { Kreuzberg.batch_extract_files_sync(files) }

      expect(outcome).to be_an(Array).or be_a(StandardError)
    end

    it 'handles all invalid files' do
      files = [
        '/nonexistent1.txt',
        '/nonexistent2.txt',
        '/nonexistent3.txt'
      ]

      # Batch operations may either fail fast (StandardError) or return an
      # array (possibly empty or with errors).
      outcome = value_or_error { Kreuzberg.batch_extract_files_sync(files) }

      expect(outcome).to be_an(Array).or be_a(StandardError)
    end
  end

  describe 'async error handling' do
    it 'propagates errors in async extraction' do
      expect do
        Kreuzberg.extract_file('/nonexistent/async/file.txt')
      end.to raise_error(StandardError)
    end

    it 'propagates errors in async bytes extraction' do
      # Implementation may either handle invalid MIME types or raise error
      outcome = value_or_error do
        Kreuzberg.extract_bytes('data', 'invalid/mime/type/that/causes/error')
      end

      expect(outcome).to be_a(Kreuzberg::Result).or be_a(StandardError)
    end
  end

  describe 'result parsing errors' do
    it 'handles malformed result gracefully' do
      # This tests the Result class constructor with edge cases
      result = Kreuzberg::Result.new({})

      expect(result.content).to eq('')
      expect(result.mime_type).to eq('')
      expect(result.metadata).to eq({})
      expect(result.tables).to eq([])
      expect(result.detected_languages).to be_nil
      expect(result.chunks).to be_nil
      expect(result.images).to be_nil
    end

    it 'handles partial result data' do
      result = Kreuzberg::Result.new(
        content: 'Test',
        mime_type: 'text/plain'
      )

      expect(result.content).to eq('Test')
      expect(result.mime_type).to eq('text/plain')
      expect(result.tables).to eq([])
    end

    it 'parses invalid metadata JSON' do
      result = Kreuzberg::Result.new(
        content: 'Test',
        mime_type: 'text/plain',
        metadata_json: 'invalid json{'
      )

      # Unparseable metadata is expected to fall back to an empty hash.
      expect(result.metadata).to eq({})
    end

    it 'parses extracted images' do
      result = Kreuzberg::Result.new(image_result_payload)
      image = result.images&.first

      expect(image&.format).to eq('png')
      expect(image&.data&.encoding).to eq(Encoding::BINARY)
      expect(image&.ocr_result).to be_a(Kreuzberg::Result)
    end
  end

  describe 'type conversion errors' do
    it 'handles non-string content gracefully' do
      # Test that the wrapper handles type coercion
      path = create_test_file('Type test')
      result = Kreuzberg.extract_file_sync(path)

      expect(result.content).to be_a(String)
      expect(result.mime_type).to be_a(String)
    end
  end
end
@@ -0,0 +1,66 @@
1
+ # frozen_string_literal: true
2
+
3
RSpec.describe Kreuzberg::Errors do
  # Base error class; every other error spec below checks inheritance from it.
  describe Kreuzberg::Errors::Error do
    it 'is a StandardError subclass' do
      expect(described_class < StandardError).to be(true)
    end

    it 'can be raised with a message' do
      expect { raise described_class, 'Test error' }
        .to raise_error(described_class, 'Test error')
    end
  end

  describe Kreuzberg::Errors::ValidationError do
    it 'is an Error subclass' do
      expect(described_class < Kreuzberg::Errors::Error).to be(true)
    end
  end

  describe Kreuzberg::Errors::ParsingError do
    it 'is an Error subclass' do
      expect(described_class < Kreuzberg::Errors::Error).to be(true)
    end

    it 'stores context' do
      # The context hash passed at construction must be readable back unchanged.
      parsing_error = described_class.new('Parsing failed', context: { file: 'test.pdf' })

      expect(parsing_error.context).to eq(file: 'test.pdf')
    end
  end

  describe Kreuzberg::Errors::OCRError do
    it 'is an Error subclass' do
      expect(described_class < Kreuzberg::Errors::Error).to be(true)
    end

    it 'stores context' do
      ocr_error = described_class.new('OCR failed', context: { page: 1 })

      expect(ocr_error.context).to eq(page: 1)
    end
  end

  describe Kreuzberg::Errors::MissingDependencyError do
    it 'is an Error subclass' do
      expect(described_class < Kreuzberg::Errors::Error).to be(true)
    end

    it 'stores dependency name' do
      missing = described_class.new('Tesseract not found', dependency: 'tesseract')

      expect(missing.dependency).to eq('tesseract')
    end
  end

  describe Kreuzberg::Errors::IOError do
    it 'is an Error subclass' do
      expect(described_class < Kreuzberg::Errors::Error).to be(true)
    end
  end

  describe Kreuzberg::Errors::PluginError do
    it 'is an Error subclass' do
      expect(described_class < Kreuzberg::Errors::Error).to be(true)
    end
  end
end