kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Kreuzberg::CLIProxy do
4
+ describe '.find_cli_binary' do
5
+ context 'when binary exists' do
6
+ it 'finds the binary in search paths', :skip do
7
+ # Skip in CI/test environments where binary might not be built
8
+ binary = described_class.find_cli_binary
9
+ expect(binary).to be_a(Pathname)
10
+ expect(binary.file?).to be true
11
+ end
12
+ end
13
+
14
+ context 'when binary does not exist' do
15
+ before do
16
+ allow(described_class).to receive(:search_paths).and_return([])
17
+ end
18
+
19
+ it 'raises MissingBinaryError' do
20
+ expect do
21
+ described_class.find_cli_binary
22
+ end.to raise_error(Kreuzberg::CLIProxy::MissingBinaryError, /not found/)
23
+ end
24
+ end
25
+ end
26
+
27
+ describe '.call' do
28
+ context 'when binary is available', :skip do
29
+ it 'executes CLI command successfully' do
30
+ # Skip in environments without built binary
31
+ output = described_class.call(['--version'])
32
+ expect(output).to be_a(String)
33
+ expect(output).not_to be_empty
34
+ end
35
+
36
+ it 'raises CLIExecutionError on failure' do
37
+ expect do
38
+ described_class.call(['invalid-command'])
39
+ end.to raise_error(Kreuzberg::CLIProxy::CLIExecutionError)
40
+ end
41
+ end
42
+ end
43
+
44
+ describe '.search_paths' do
45
+ it 'returns an array of Pathname objects' do
46
+ paths = described_class.search_paths('kreuzberg')
47
+ expect(paths).to be_an(Array)
48
+ expect(paths).to all(be_a(Pathname))
49
+ end
50
+
51
+ it 'includes expected search locations' do
52
+ paths = described_class.search_paths('kreuzberg')
53
+ path_strings = paths.map(&:to_s)
54
+
55
+ expect(path_strings.any? { |p| p.include?('lib/bin') }).to be true
56
+ expect(path_strings.any? { |p| p.include?('target/release') }).to be true
57
+ end
58
+ end
59
+
60
+ describe '.root_path' do
61
+ it 'returns a Pathname' do
62
+ expect(described_class.root_path).to be_a(Pathname)
63
+ end
64
+
65
+ it 'points to an existing directory' do
66
+ expect(described_class.root_path.directory?).to be true
67
+ end
68
+ end
69
+
70
+ describe '.lib_path' do
71
+ it 'returns a Pathname' do
72
+ expect(described_class.lib_path).to be_a(Pathname)
73
+ end
74
+
75
+ it 'points to an existing directory' do
76
+ expect(described_class.lib_path.directory?).to be true
77
+ end
78
+ end
79
+
80
+ describe '.missing_binary_message' do
81
+ it 'returns helpful error message' do
82
+ message = described_class.missing_binary_message
83
+ expect(message).to include('cargo build')
84
+ expect(message).to include('kreuzberg-cli')
85
+ end
86
+ end
87
+ end
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Kreuzberg::CLI do
4
+ describe '.extract', :skip do
5
+ it 'extracts content from a file' do
6
+ # Skip in environments without CLI binary
7
+ path = create_test_file('CLI test content')
8
+ output = described_class.extract(path)
9
+
10
+ expect(output).to be_a(String)
11
+ expect(output).to include('CLI test content')
12
+ end
13
+
14
+ it 'accepts output format option' do
15
+ path = create_test_file('JSON output test')
16
+ output = described_class.extract(path, output: 'json')
17
+
18
+ expect(output).to be_a(String)
19
+ end
20
+
21
+ it 'accepts OCR option' do
22
+ path = create_test_file('OCR test')
23
+ output = described_class.extract(path, ocr: true)
24
+
25
+ expect(output).to be_a(String)
26
+ end
27
+ end
28
+
29
+ describe '.detect', :skip do
30
+ it 'detects MIME type' do
31
+ path = create_test_file('MIME detection test')
32
+ mime_type = described_class.detect(path)
33
+
34
+ expect(mime_type).to be_a(String)
35
+ expect(mime_type).not_to be_empty
36
+ end
37
+ end
38
+
39
+ describe '.version', :skip do
40
+ it 'returns version string' do
41
+ version = described_class.version
42
+ expect(version).to be_a(String)
43
+ expect(version).to match(/\d+\.\d+/)
44
+ end
45
+ end
46
+
47
+ describe '.help', :skip do
48
+ it 'returns help text' do
49
+ help_text = described_class.help
50
+ expect(help_text).to be_a(String)
51
+ expect(help_text).to include('kreuzberg')
52
+ end
53
+ end
54
+ end
@@ -0,0 +1,345 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Kreuzberg::Config do
4
+ describe Kreuzberg::Config::OCR do
5
+ it 'creates with default values' do
6
+ ocr = described_class.new
7
+
8
+ expect(ocr.backend).to eq('tesseract')
9
+ expect(ocr.language).to eq('eng')
10
+ expect(ocr.tesseract_config).to be_nil
11
+ end
12
+
13
+ it 'creates with custom values' do
14
+ ocr = described_class.new(
15
+ backend: 'easyocr',
16
+ language: 'deu'
17
+ )
18
+
19
+ expect(ocr.backend).to eq('easyocr')
20
+ expect(ocr.language).to eq('deu')
21
+ end
22
+
23
+ it 'converts to hash' do
24
+ ocr = described_class.new(backend: 'tesseract', language: 'fra')
25
+ hash = ocr.to_h
26
+
27
+ expect(hash).to be_a(Hash)
28
+ expect(hash[:backend]).to eq('tesseract')
29
+ expect(hash[:language]).to eq('fra')
30
+ end
31
+ end
32
+
33
+ describe Kreuzberg::Config::Chunking do
34
+ it 'creates with default values' do
35
+ chunking = described_class.new
36
+
37
+ expect(chunking.max_chars).to eq(1000)
38
+ expect(chunking.max_overlap).to eq(200)
39
+ expect(chunking.preset).to be_nil
40
+ expect(chunking.embedding).to be_nil
41
+ end
42
+
43
+ it 'creates with custom values' do
44
+ chunking = described_class.new(
45
+ max_chars: 500,
46
+ max_overlap: 100,
47
+ preset: 'fast'
48
+ )
49
+
50
+ expect(chunking.max_chars).to eq(500)
51
+ expect(chunking.max_overlap).to eq(100)
52
+ expect(chunking.preset).to eq('fast')
53
+ end
54
+
55
+ it 'converts to hash' do
56
+ chunking = described_class.new(max_chars: 750)
57
+ hash = chunking.to_h
58
+
59
+ expect(hash).to be_a(Hash)
60
+ expect(hash[:max_chars]).to eq(750)
61
+ end
62
+ end
63
+
64
+ describe Kreuzberg::Config::LanguageDetection do
65
+ it 'creates with default values' do
66
+ lang = described_class.new
67
+
68
+ expect(lang.enabled).to be false
69
+ expect(lang.min_confidence).to eq(0.5)
70
+ end
71
+
72
+ it 'creates with custom values' do
73
+ lang = described_class.new(enabled: true, min_confidence: 0.9)
74
+
75
+ expect(lang.enabled).to be true
76
+ expect(lang.min_confidence).to eq(0.9)
77
+ end
78
+
79
+ it 'converts to hash' do
80
+ lang = described_class.new(enabled: true, min_confidence: 0.75)
81
+ hash = lang.to_h
82
+
83
+ expect(hash).to be_a(Hash)
84
+ expect(hash[:enabled]).to be true
85
+ expect(hash[:min_confidence]).to eq(0.75)
86
+ end
87
+ end
88
+
89
+ describe Kreuzberg::Config::PDF do
90
+ it 'creates with default values' do
91
+ pdf = described_class.new
92
+
93
+ expect(pdf.extract_images).to be false
94
+ expect(pdf.passwords).to be_nil
95
+ expect(pdf.extract_metadata).to be true
96
+ end
97
+
98
+ it 'creates with custom values' do
99
+ pdf = described_class.new(
100
+ extract_images: true,
101
+ passwords: %w[secret backup]
102
+ )
103
+
104
+ expect(pdf.extract_images).to be true
105
+ expect(pdf.passwords).to eq(%w[secret backup])
106
+ end
107
+
108
+ it 'converts to hash' do
109
+ pdf = described_class.new(extract_images: true, passwords: ['test'])
110
+ hash = pdf.to_h
111
+
112
+ expect(hash).to be_a(Hash)
113
+ expect(hash[:extract_images]).to be true
114
+ expect(hash[:passwords]).to eq(['test'])
115
+ end
116
+ end
117
+
118
+ describe Kreuzberg::Config::Extraction do
119
+ describe '.from_file' do
120
+ it 'loads configuration from TOML file' do
121
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.toml')
122
+ config = described_class.from_file(config_path)
123
+
124
+ expect(config.use_cache).to be false
125
+ expect(config.enable_quality_processing).to be true
126
+ expect(config.force_ocr).to be true
127
+ end
128
+
129
+ it 'loads OCR config from TOML file' do
130
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.toml')
131
+ config = described_class.from_file(config_path)
132
+
133
+ expect(config.ocr).to be_a(Kreuzberg::Config::OCR)
134
+ expect(config.ocr.backend).to eq('tesseract')
135
+ expect(config.ocr.language).to eq('deu')
136
+ end
137
+
138
+ it 'loads chunking config from TOML file' do
139
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.toml')
140
+ config = described_class.from_file(config_path)
141
+
142
+ expect(config.chunking).to be_a(Kreuzberg::Config::Chunking)
143
+ expect(config.chunking.max_chars).to eq(500)
144
+ expect(config.chunking.max_overlap).to eq(100)
145
+ expect(config.chunking.preset).to eq('fast')
146
+ end
147
+
148
+ it 'loads language detection config from TOML file' do
149
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.toml')
150
+ config = described_class.from_file(config_path)
151
+
152
+ expect(config.language_detection).to be_a(Kreuzberg::Config::LanguageDetection)
153
+ expect(config.language_detection.enabled).to be true
154
+ expect(config.language_detection.min_confidence).to eq(0.9)
155
+ end
156
+
157
+ it 'loads PDF options from TOML file' do
158
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.toml')
159
+ config = described_class.from_file(config_path)
160
+
161
+ expect(config.pdf_options).to be_a(Kreuzberg::Config::PDF)
162
+ expect(config.pdf_options.extract_images).to be true
163
+ expect(config.pdf_options.passwords).to eq(%w[secret backup])
164
+ expect(config.pdf_options.extract_metadata).to be true
165
+ end
166
+
167
+ it 'loads configuration from YAML file' do
168
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.yaml')
169
+ config = described_class.from_file(config_path)
170
+
171
+ expect(config.use_cache).to be false
172
+ expect(config.enable_quality_processing).to be true
173
+ expect(config.force_ocr).to be true
174
+ end
175
+
176
+ it 'loads OCR config from YAML file' do
177
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.yaml')
178
+ config = described_class.from_file(config_path)
179
+
180
+ expect(config.ocr).to be_a(Kreuzberg::Config::OCR)
181
+ expect(config.ocr.backend).to eq('tesseract')
182
+ expect(config.ocr.language).to eq('fra')
183
+ end
184
+
185
+ it 'loads chunking config from YAML file' do
186
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.yaml')
187
+ config = described_class.from_file(config_path)
188
+
189
+ expect(config.chunking).to be_a(Kreuzberg::Config::Chunking)
190
+ expect(config.chunking.max_chars).to eq(750)
191
+ expect(config.chunking.max_overlap).to eq(150)
192
+ expect(config.chunking.preset).to eq('balanced')
193
+ end
194
+
195
+ it 'works with absolute paths' do
196
+ config_path = File.expand_path('../fixtures/config.toml', __dir__)
197
+ config = described_class.from_file(config_path)
198
+
199
+ expect(config.use_cache).to be false
200
+ end
201
+
202
+ it 'works with relative paths' do
203
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.yaml')
204
+ config = described_class.from_file(config_path)
205
+
206
+ expect(config.use_cache).to be false
207
+ end
208
+
209
+ it 'raises error for non-existent file' do
210
+ expect do
211
+ described_class.from_file('/path/to/nonexistent/config.toml')
212
+ end.to raise_error(Kreuzberg::Errors::ValidationError, /Failed to read config file/)
213
+ end
214
+
215
+ it 'raises error for invalid TOML file' do
216
+ config_path = File.join(__dir__, '..', 'fixtures', 'invalid_config.toml')
217
+ expect do
218
+ described_class.from_file(config_path)
219
+ end.to raise_error(Kreuzberg::Errors::ValidationError, /Invalid TOML/)
220
+ end
221
+
222
+ it 'detects file format from extension' do
223
+ toml_path = File.join(__dir__, '..', 'fixtures', 'config.toml')
224
+ yaml_path = File.join(__dir__, '..', 'fixtures', 'config.yaml')
225
+
226
+ toml_config = described_class.from_file(toml_path)
227
+ yaml_config = described_class.from_file(yaml_path)
228
+
229
+ expect(toml_config.ocr.language).to eq('deu')
230
+ expect(yaml_config.ocr.language).to eq('fra')
231
+ end
232
+ end
233
+
234
+ it 'creates with default values' do
235
+ config = described_class.new
236
+
237
+ expect(config.use_cache).to be true
238
+ expect(config.enable_quality_processing).to be false
239
+ expect(config.force_ocr).to be false
240
+ expect(config.ocr).to be_nil
241
+ expect(config.chunking).to be_nil
242
+ expect(config.language_detection).to be_nil
243
+ expect(config.pdf_options).to be_nil
244
+ end
245
+
246
+ it 'creates with custom values' do
247
+ ocr = Kreuzberg::Config::OCR.new(backend: 'easyocr')
248
+ chunking = Kreuzberg::Config::Chunking.new(max_chars: 500)
249
+ lang = Kreuzberg::Config::LanguageDetection.new(enabled: true)
250
+ pdf = Kreuzberg::Config::PDF.new(extract_images: true)
251
+
252
+ config = described_class.new(
253
+ use_cache: false,
254
+ enable_quality_processing: true,
255
+ force_ocr: true,
256
+ ocr: ocr,
257
+ chunking: chunking,
258
+ language_detection: lang,
259
+ pdf_options: pdf
260
+ )
261
+
262
+ expect(config.use_cache).to be false
263
+ expect(config.enable_quality_processing).to be true
264
+ expect(config.force_ocr).to be true
265
+ expect(config.ocr).to eq(ocr)
266
+ expect(config.chunking).to eq(chunking)
267
+ expect(config.language_detection).to eq(lang)
268
+ expect(config.pdf_options).to eq(pdf)
269
+ end
270
+
271
+ it 'accepts hash for nested configs' do
272
+ config = described_class.new(
273
+ ocr: { backend: 'tesseract', language: 'eng' },
274
+ chunking: { max_chars: 500 }
275
+ )
276
+
277
+ expect(config.ocr).to be_a(Kreuzberg::Config::OCR)
278
+ expect(config.ocr.backend).to eq('tesseract')
279
+ expect(config.chunking).to be_a(Kreuzberg::Config::Chunking)
280
+ expect(config.chunking.max_chars).to eq(500)
281
+ end
282
+
283
+ it 'converts to hash' do
284
+ config = described_class.new(
285
+ use_cache: false,
286
+ ocr: { backend: 'tesseract' }
287
+ )
288
+ hash = config.to_h
289
+
290
+ expect(hash).to be_a(Hash)
291
+ expect(hash[:use_cache]).to be false
292
+ expect(hash[:ocr]).to be_a(Hash)
293
+ expect(hash[:ocr][:backend]).to eq('tesseract')
294
+ end
295
+
296
+ it 'raises error for invalid config type' do
297
+ expect do
298
+ described_class.new(ocr: 'invalid')
299
+ end.to raise_error(ArgumentError, /Expected.*OCR/)
300
+ end
301
+ end
302
+
303
+ describe 'ExtractionConfig alias' do
304
+ it 'exists at module level' do
305
+ expect(Kreuzberg.const_defined?(:ExtractionConfig)).to be true
306
+ end
307
+
308
+ it 'is the same class as Config::Extraction' do
309
+ expect(Kreuzberg::ExtractionConfig).to eq(Kreuzberg::Config::Extraction)
310
+ end
311
+
312
+ it 'can be instantiated using the alias' do
313
+ config = Kreuzberg::ExtractionConfig.new(use_cache: false)
314
+
315
+ expect(config).to be_a(Kreuzberg::Config::Extraction)
316
+ expect(config.use_cache).to be false
317
+ end
318
+
319
+ it 'supports all methods through the alias' do
320
+ config = Kreuzberg::ExtractionConfig.new(
321
+ use_cache: false,
322
+ force_ocr: true,
323
+ ocr: { backend: 'tesseract', language: 'eng' }
324
+ )
325
+
326
+ expect(config.use_cache).to be false
327
+ expect(config.force_ocr).to be true
328
+ expect(config.ocr).to be_a(Kreuzberg::Config::OCR)
329
+ expect(config.ocr.backend).to eq('tesseract')
330
+
331
+ hash = config.to_h
332
+ expect(hash[:use_cache]).to be false
333
+ expect(hash[:force_ocr]).to be true
334
+ end
335
+
336
+ it 'supports from_file through the alias' do
337
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.toml')
338
+ config = Kreuzberg::ExtractionConfig.from_file(config_path)
339
+
340
+ expect(config).to be_a(Kreuzberg::Config::Extraction)
341
+ expect(config.use_cache).to be false
342
+ expect(config.enable_quality_processing).to be true
343
+ end
344
+ end
345
+ end