kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,684 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kreuzberg
4
+ module Config
5
# OCR settings: backend name, language and optional Tesseract options.
#
# @example
#   ocr = OCR.new(backend: "tesseract", language: "eng")
#
class OCR
  attr_reader :backend, :language, :tesseract_config

  # @param backend [#to_s] backend name; stored as a String
  # @param language [#to_s] language identifier; stored as a String
  # @param tesseract_config [Tesseract, Hash, nil] a Hash is wrapped in a
  #   Tesseract instance (its keys are symbolized first)
  # @raise [ArgumentError] if tesseract_config is none of the accepted types
  def initialize(backend: 'tesseract', language: 'eng', tesseract_config: nil)
    @backend = backend.to_s
    @language = language.to_s
    @tesseract_config = normalize_tesseract_config(tesseract_config)
  end

  # @return [Hash] the settings as a Symbol-keyed Hash; nil entries are omitted
  def to_h
    nested = @tesseract_config&.to_h
    { backend: @backend, language: @language, tesseract_config: nested }.compact
  end

  private

  # Accepts nil, a ready-made Tesseract, or a Hash of Tesseract options.
  def normalize_tesseract_config(value)
    case value
    when nil then nil
    when Tesseract then value
    when Hash then Tesseract.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{Tesseract}, Hash, or nil, got #{value.class}"
    end
  end
end
41
+
42
# Free-form Tesseract options held as a Symbol-keyed Hash.
#
# A +:preprocessing+ entry given as a Hash is wrapped in an
# ImagePreprocessing instance on construction; #to_h converts it back so
# the serialized form contains only plain Ruby data.
class Tesseract
  attr_reader :options

  # @param options [Hash] arbitrary options; keys are converted to Symbols
  # @raise [ArgumentError] if +:preprocessing+ is not nil, a Hash, or an
  #   ImagePreprocessing instance
  def initialize(**options)
    @options = options.transform_keys(&:to_sym)
    normalize_nested_preprocessing!
  end

  # @return [Hash] a copy of the options in which a nested preprocessing
  #   config has been serialized through its own to_h
  def to_h
    serialized = @options.dup
    nested = serialized[:preprocessing]
    # Without this step the copy would still hold the config object itself
    # rather than its Hash form.
    serialized[:preprocessing] = nested.to_h unless nested.nil?
    serialized
  end

  private

  # Replaces a Hash stored under :preprocessing with an ImagePreprocessing
  # built from it; nil and existing instances are left untouched.
  def normalize_nested_preprocessing!
    preprocessing = @options[:preprocessing]
    case preprocessing
    when nil, ImagePreprocessing
      nil
    when Hash
      @options[:preprocessing] = ImagePreprocessing.new(**preprocessing.transform_keys(&:to_sym))
    else
      raise ArgumentError, "preprocessing must be #{ImagePreprocessing} or Hash"
    end
  end
end
66
+
67
# Text chunking settings.
#
# +chunk_size+ / +chunk_overlap+ are alternative spellings of +max_chars+ /
# +max_overlap+ and take precedence when both are supplied.
#
# @example
#   chunking = Chunking.new(max_chars: 1000, max_overlap: 200)
#
class Chunking
  attr_reader :max_chars, :max_overlap, :preset, :embedding, :enabled

  # @param max_chars [#to_i, nil] chunk size; falls back to 1000
  # @param max_overlap [#to_i, nil] chunk overlap; falls back to 200
  # @param preset [#to_s, nil] stored as a String when given
  # @param embedding [Embedding, Hash, nil] a Hash is wrapped in an Embedding
  # @param chunk_size [#to_i, nil] overrides max_chars when truthy
  # @param chunk_overlap [#to_i, nil] overrides max_overlap when truthy
  # @param enabled [Object, nil] nil stays nil, anything else becomes true/false
  # @raise [ArgumentError] if embedding is none of the accepted types
  def initialize(max_chars: nil, max_overlap: nil, preset: nil, embedding: nil,
                 chunk_size: nil, chunk_overlap: nil, enabled: true)
    @max_chars = (chunk_size || max_chars || 1000).to_i
    @max_overlap = (chunk_overlap || max_overlap || 200).to_i
    @preset = preset&.to_s
    @embedding = normalize_embedding(embedding)
    @enabled = boolean_or_nil(enabled)
  end

  # @return [Hash] Symbol-keyed settings; nil entries (including a nil
  #   +enabled+) are left out
  def to_h
    serialized = {
      max_chars: @max_chars,
      max_overlap: @max_overlap,
      preset: @preset,
      embedding: @embedding&.to_h
    }.compact
    @enabled.nil? ? serialized : serialized.merge(enabled: @enabled)
  end

  private

  # Accepts nil, a ready-made Embedding, or a Hash of Embedding options.
  def normalize_embedding(value)
    case value
    when nil then nil
    when Embedding then value
    when Hash then Embedding.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{Embedding}, Hash, or nil, got #{value.class}"
    end
  end

  # Maps nil to nil, false to false and every other value to true.
  def boolean_or_nil(value)
    case value
    when nil then nil
    when false then false
    else true
    end
  end
end
122
+
123
# Embedding generation settings used by chunking.
class Embedding
  attr_reader :model, :normalize, :batch_size, :show_download_progress, :cache_dir

  # @param model [Hash, #to_h] model description; keys are symbolized
  # @param normalize [Object, nil] nil stays nil, anything else becomes true/false
  # @param batch_size [#to_i, nil] stored as an Integer when given
  # @param show_download_progress [Object, nil] coerced like +normalize+
  # @param cache_dir [#to_s, nil] stored as a String when given
  # @raise [ArgumentError] if model cannot be turned into a Hash
  def initialize(model: { type: :preset, name: 'balanced' }, normalize: true, batch_size: 32,
                 show_download_progress: false, cache_dir: nil)
    @model = normalize_model(model)
    @normalize = boolean_or_nil(normalize)
    @batch_size = batch_size&.to_i
    @show_download_progress = boolean_or_nil(show_download_progress)
    @cache_dir = cache_dir&.to_s
  end

  # @return [Hash] Symbol-keyed settings with nil entries removed
  def to_h
    serialized = { model: @model, normalize: @normalize, batch_size: @batch_size }
    serialized[:show_download_progress] = @show_download_progress
    serialized[:cache_dir] = @cache_dir
    serialized.compact
  end

  private

  # Converts the model description to a Symbol-keyed Hash.
  def normalize_model(model)
    candidate = model.respond_to?(:to_h) ? model.to_h : model
    unless candidate.is_a?(Hash)
      raise ArgumentError, 'model must be a Hash describing the embedding model'
    end

    candidate.transform_keys(&:to_sym)
  end

  # Maps nil to nil, false to false and every other value to true.
  def boolean_or_nil(value)
    case value
    when nil then nil
    when false then false
    else true
    end
  end
end
169
+
170
# Language detection settings.
#
# @example
#   lang = LanguageDetection.new(enabled: true, min_confidence: 0.8)
#
class LanguageDetection
  attr_reader :enabled, :min_confidence, :detect_multiple

  # @param enabled [Object] any truthy value is stored as true, otherwise false
  # @param min_confidence [#to_f] stored as a Float
  # @param detect_multiple [Object] coerced to true/false like +enabled+
  def initialize(enabled: false, min_confidence: 0.5, detect_multiple: false)
    @enabled = truthy?(enabled)
    @min_confidence = min_confidence.to_f
    @detect_multiple = truthy?(detect_multiple)
  end

  # @return [Hash] all three settings, Symbol-keyed
  def to_h
    { enabled: @enabled, min_confidence: @min_confidence, detect_multiple: @detect_multiple }
  end

  private

  # Collapses any value to a strict true/false.
  def truthy?(value)
    value ? true : false
  end
end
192
+
193
# PDF-specific extraction options.
#
# @example
#   pdf = PDF.new(extract_images: true, passwords: ["secret", "backup"])
#
class PDF
  attr_reader :extract_images, :passwords, :extract_metadata

  # @param extract_images [Object] coerced to true/false
  # @param passwords [Array, #to_s, nil] an Array is mapped to Strings, a
  #   single truthy value becomes a one-element Array, nil/false becomes nil
  # @param extract_metadata [Object] coerced to true/false
  def initialize(extract_images: false, passwords: nil, extract_metadata: true)
    @extract_images = truthy?(extract_images)
    @passwords =
      case passwords
      when Array then passwords.map(&:to_s)
      when nil, false then nil
      else [passwords.to_s]
      end
    @extract_metadata = truthy?(extract_metadata)
  end

  # @return [Hash] Symbol-keyed settings; +passwords+ is omitted when nil
  def to_h
    serialized = { extract_images: @extract_images, passwords: @passwords }
    serialized[:extract_metadata] = @extract_metadata
    serialized.compact
  end

  private

  # Collapses any value to a strict true/false.
  def truthy?(value)
    value ? true : false
  end
end
223
+
224
# Image extraction settings.
#
# @example
#   image = ImageExtraction.new(extract_images: true, target_dpi: 300)
#
# @example With auto-adjust DPI
#   image = ImageExtraction.new(
#     extract_images: true,
#     auto_adjust_dpi: true,
#     min_dpi: 150,
#     max_dpi: 600
#   )
#
class ImageExtraction
  attr_reader :extract_images, :target_dpi, :max_image_dimension,
              :auto_adjust_dpi, :min_dpi, :max_dpi

  # Boolean options are coerced to true/false; numeric options go through to_i.
  #
  # @param extract_images [Object]
  # @param target_dpi [#to_i]
  # @param max_image_dimension [#to_i]
  # @param auto_adjust_dpi [Object]
  # @param min_dpi [#to_i]
  # @param max_dpi [#to_i]
  def initialize(extract_images: true, target_dpi: 300, max_image_dimension: 2000,
                 auto_adjust_dpi: true, min_dpi: 150, max_dpi: 600)
    @extract_images = truthy?(extract_images)
    @auto_adjust_dpi = truthy?(auto_adjust_dpi)
    @target_dpi = target_dpi.to_i
    @max_image_dimension = max_image_dimension.to_i
    @min_dpi = min_dpi.to_i
    @max_dpi = max_dpi.to_i
  end

  # @return [Hash] all six settings, Symbol-keyed
  def to_h
    serialized = { extract_images: @extract_images, target_dpi: @target_dpi }
    serialized[:max_image_dimension] = @max_image_dimension
    serialized[:auto_adjust_dpi] = @auto_adjust_dpi
    serialized[:min_dpi] = @min_dpi
    serialized[:max_dpi] = @max_dpi
    serialized
  end

  private

  # Collapses any value to a strict true/false.
  def truthy?(value)
    value ? true : false
  end
end
268
+
269
# Image preprocessing settings applied before OCR.
#
# @example Basic preprocessing
#   preprocessing = ImagePreprocessing.new(
#     binarization_method: "otsu",
#     denoise: true
#   )
#
# @example Advanced preprocessing
#   preprocessing = ImagePreprocessing.new(
#     target_dpi: 600,
#     auto_rotate: true,
#     deskew: true,
#     denoise: true,
#     contrast_enhance: true,
#     binarization_method: "sauvola",
#     invert_colors: false
#   )
#
class ImagePreprocessing
  attr_reader :target_dpi, :auto_rotate, :deskew, :denoise,
              :contrast_enhance, :binarization_method, :invert_colors

  # Boolean options are coerced to true/false, +target_dpi+ goes through
  # to_i and +binarization_method+ through to_s.
  #
  # @raise [ArgumentError] if binarization_method is not "otsu", "sauvola"
  #   or "adaptive"
  def initialize(target_dpi: 300, auto_rotate: true, deskew: true, denoise: false,
                 contrast_enhance: true, binarization_method: 'otsu', invert_colors: false)
    @target_dpi = target_dpi.to_i
    @auto_rotate = truthy?(auto_rotate)
    @deskew = truthy?(deskew)
    @denoise = truthy?(denoise)
    @contrast_enhance = truthy?(contrast_enhance)
    @binarization_method = binarization_method.to_s
    @invert_colors = truthy?(invert_colors)

    allowed = %w[otsu sauvola adaptive]
    unless allowed.include?(@binarization_method)
      raise ArgumentError, "binarization_method must be one of: #{allowed.join(', ')}"
    end
  end

  # @return [Hash] all seven settings, Symbol-keyed
  def to_h
    serialized = { target_dpi: @target_dpi, auto_rotate: @auto_rotate, deskew: @deskew }
    serialized[:denoise] = @denoise
    serialized[:contrast_enhance] = @contrast_enhance
    serialized[:binarization_method] = @binarization_method
    serialized[:invert_colors] = @invert_colors
    serialized
  end

  private

  # Collapses any value to a strict true/false.
  def truthy?(value)
    value ? true : false
  end
end
327
+
328
# Token reduction settings.
#
# @example Disable token reduction
#   token = TokenReduction.new(mode: "off")
#
# @example Light reduction
#   token = TokenReduction.new(mode: "light", preserve_important_words: true)
#
# @example Aggressive reduction
#   token = TokenReduction.new(mode: "aggressive", preserve_important_words: false)
#
class TokenReduction
  attr_reader :mode, :preserve_important_words

  # @param mode [#to_s] one of "off", "light", "moderate", "aggressive", "maximum"
  # @param preserve_important_words [Object] coerced to true/false
  # @raise [ArgumentError] if mode is not one of the listed values
  def initialize(mode: 'off', preserve_important_words: true)
    @mode = mode.to_s
    @preserve_important_words = preserve_important_words ? true : false

    allowed = %w[off light moderate aggressive maximum]
    raise ArgumentError, "mode must be one of: #{allowed.join(', ')}" unless allowed.include?(@mode)
  end

  # @return [Hash] both settings, Symbol-keyed
  def to_h
    { mode: @mode, preserve_important_words: @preserve_important_words }
  end
end
359
+
360
# HTML preprocessing settings; every option is optional and defaults to nil.
class HtmlPreprocessing
  attr_reader :enabled, :preset, :remove_navigation, :remove_forms

  # @param enabled [Object, nil] nil stays nil, anything else becomes true/false
  # @param preset [#to_sym, nil] stored as a Symbol when given
  # @param remove_navigation [Object, nil] coerced like +enabled+
  # @param remove_forms [Object, nil] coerced like +enabled+
  def initialize(enabled: nil, preset: nil, remove_navigation: nil, remove_forms: nil)
    @enabled = boolean_or_nil(enabled)
    @preset = preset&.to_sym
    @remove_navigation = boolean_or_nil(remove_navigation)
    @remove_forms = boolean_or_nil(remove_forms)
  end

  # @return [Hash] Symbol-keyed settings; options left at nil are omitted
  def to_h
    serialized = { enabled: @enabled, preset: @preset }
    serialized[:remove_navigation] = @remove_navigation
    serialized[:remove_forms] = @remove_forms
    serialized.compact
  end

  private

  # Maps nil to nil, false to false and every other value to true.
  def boolean_or_nil(value)
    case value
    when nil then nil
    when false then false
    else true
    end
  end
end
387
+
388
# Free-form HTML conversion options held as a Symbol-keyed Hash.
#
# A handful of style options are coerced to Symbols, and a +:preprocessing+
# Hash is wrapped in an HtmlPreprocessing instance.
class HtmlOptions
  attr_reader :options

  # @param options [Hash] arbitrary options; keys are converted to Symbols
  def initialize(**options)
    normalized = options.transform_keys(&:to_sym)
    symbol_keys = %i[
      heading_style
      code_block_style
      highlight_style
      list_indent_type
      newline_style
      whitespace_mode
    ]
    symbol_keys.each do |key|
      normalized[key] = normalized[key]&.to_sym if normalized.key?(key)
    end

    nested = normalized[:preprocessing]
    if nested.is_a?(Hash)
      # Symbolize nested keys so string-keyed hashes are accepted too,
      # matching the other nested-config normalizers in this file.
      normalized[:preprocessing] = HtmlPreprocessing.new(**nested.transform_keys(&:to_sym))
    end
    @options = normalized
  end

  # @return [Hash] the options with nested config objects serialized through
  #   their own to_h; nil, Array and Hash values are passed through unchanged
  def to_h
    @options.transform_values { |value| serialize_value(value) }
  end

  private

  # nil and Array both respond to to_h, but nil.to_h yields {} and
  # Array#to_h raises TypeError unless every element is a pair, so they are
  # returned as-is instead of being converted.
  def serialize_value(value)
    case value
    when nil, Array, Hash then value
    else value.respond_to?(:to_h) ? value.to_h : value
    end
  end
end
414
+
415
# Parameters for the YAKE keyword algorithm.
class KeywordYakeParams
  attr_reader :window_size

  # @param window_size [#to_i] stored as an Integer
  def initialize(window_size: 2)
    @window_size = window_size.to_i
  end

  # @return [Hash] the single setting under +:window_size+
  def to_h
    Hash[:window_size, @window_size]
  end
end
426
+
427
# Parameters for the RAKE keyword algorithm.
class KeywordRakeParams
  attr_reader :min_word_length, :max_words_per_phrase

  # @param min_word_length [#to_i] stored as an Integer
  # @param max_words_per_phrase [#to_i] stored as an Integer
  def initialize(min_word_length: 1, max_words_per_phrase: 3)
    @min_word_length, @max_words_per_phrase = min_word_length.to_i, max_words_per_phrase.to_i
  end

  # @return [Hash] both settings, Symbol-keyed
  def to_h
    { min_word_length: @min_word_length, max_words_per_phrase: @max_words_per_phrase }
  end
end
442
+
443
# Keyword extraction settings; every option is optional and defaults to nil.
class Keywords
  attr_reader :algorithm, :max_keywords, :min_score, :ngram_range,
              :language, :yake_params, :rake_params

  # @param algorithm [#to_s, nil] stored as a String when given
  # @param max_keywords [#to_i, nil] stored as an Integer when given
  # @param min_score [#to_f, nil] stored as a Float when given
  # @param ngram_range [#map, nil] each element is converted with to_i
  # @param language [#to_s, nil] stored as a String when given
  # @param yake_params [KeywordYakeParams, Hash, nil] a Hash is wrapped
  # @param rake_params [KeywordRakeParams, Hash, nil] a Hash is wrapped
  # @raise [ArgumentError] if yake_params/rake_params has an unsupported type
  def initialize(algorithm: nil, max_keywords: nil, min_score: nil, ngram_range: nil,
                 language: nil, yake_params: nil, rake_params: nil)
    @algorithm = algorithm&.to_s
    @max_keywords = max_keywords&.to_i
    @min_score = min_score&.to_f
    @ngram_range = ngram_range&.map(&:to_i)
    @language = language&.to_s
    @yake_params = normalize_nested(yake_params, KeywordYakeParams)
    @rake_params = normalize_nested(rake_params, KeywordRakeParams)
  end

  # @return [Hash] Symbol-keyed settings; options left at nil are omitted
  def to_h
    scalars = {
      algorithm: @algorithm,
      max_keywords: @max_keywords,
      min_score: @min_score,
      ngram_range: @ngram_range,
      language: @language
    }
    scalars.merge(yake_params: @yake_params&.to_h).merge(rake_params: @rake_params&.to_h).compact
  end

  private

  # Accepts nil, an instance of klass, or a Hash of options for klass.
  def normalize_nested(value, klass)
    case value
    when nil, klass then value
    when Hash then klass.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
    end
  end
end
487
+
488
# Post-processor settings.
#
# @example Enable all post-processors
#   postprocessor = PostProcessor.new(enabled: true)
#
# @example Enable specific processors
#   postprocessor = PostProcessor.new(
#     enabled: true,
#     enabled_processors: ["quality", "formatting"]
#   )
#
# @example Disable specific processors
#   postprocessor = PostProcessor.new(
#     enabled: true,
#     disabled_processors: ["token_reduction"]
#   )
#
class PostProcessor
  attr_reader :enabled, :enabled_processors, :disabled_processors

  # @param enabled [Object] coerced to true/false
  # @param enabled_processors [#map, nil] each element is converted with to_s
  # @param disabled_processors [#map, nil] each element is converted with to_s
  def initialize(enabled: true, enabled_processors: nil, disabled_processors: nil)
    @enabled = enabled ? true : false
    @enabled_processors = stringify(enabled_processors)
    @disabled_processors = stringify(disabled_processors)
  end

  # @return [Hash] Symbol-keyed settings; processor lists left at nil are omitted
  def to_h
    serialized = { enabled: @enabled, enabled_processors: @enabled_processors }
    serialized[:disabled_processors] = @disabled_processors
    serialized.compact
  end

  private

  # Converts every element of a processor list to a String; nil stays nil.
  def stringify(names)
    names&.map(&:to_s)
  end
end
526
+
527
# Main extraction configuration, combining three top-level flags with the
# optional per-feature configuration objects of this module.
#
# @example Basic usage
#   config = Extraction.new(use_cache: true, force_ocr: true)
#
# @example With OCR
#   ocr = Config::OCR.new(backend: "tesseract", language: "eng")
#   config = Extraction.new(ocr: ocr)
#
# @example With image extraction
#   image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
#   config = Extraction.new(image_extraction: image)
#
# @example With preprocessing
#   preprocessing = Config::ImagePreprocessing.new(
#     binarization_method: "sauvola",
#     denoise: true
#   )
#   config = Extraction.new(image_preprocessing: preprocessing)
#
# @example With post-processing
#   postprocessor = Config::PostProcessor.new(
#     enabled: true,
#     enabled_processors: ["quality"]
#   )
#   config = Extraction.new(postprocessor: postprocessor)
#
# @example With all options
#   config = Extraction.new(
#     use_cache: true,
#     enable_quality_processing: true,
#     force_ocr: false,
#     ocr: Config::OCR.new(language: "deu"),
#     chunking: Config::Chunking.new(max_chars: 500),
#     language_detection: Config::LanguageDetection.new(enabled: true),
#     pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
#     image_extraction: Config::ImageExtraction.new(target_dpi: 600),
#     image_preprocessing: Config::ImagePreprocessing.new(denoise: true),
#     postprocessor: Config::PostProcessor.new(enabled: true)
#   )
#
class Extraction
  attr_reader :use_cache, :enable_quality_processing, :force_ocr,
              :ocr, :chunking, :language_detection, :pdf_options,
              :image_extraction, :image_preprocessing, :postprocessor,
              :token_reduction, :keywords, :html_options,
              :max_concurrent_extractions

  # Build a configuration from a file on disk.
  #
  # The file format is detected from the extension (.toml, .yaml, .json).
  # The Hash returned by the native loader has its top-level keys symbolized
  # and is passed to the constructor as keyword arguments.
  #
  # @param path [String] Path to the configuration file
  # @return [Kreuzberg::Config::Extraction] Loaded configuration object
  #
  # @example Load from TOML
  #   config = Kreuzberg::Config::Extraction.from_file("config.toml")
  #
  # @example Load from YAML
  #   config = Kreuzberg::Config::Extraction.from_file("config.yaml")
  #
  def self.from_file(path)
    loaded = Kreuzberg._config_from_file_native(path)
    new(**loaded.transform_keys(&:to_sym))
  end

  # Look for a configuration file in the current or parent directories.
  #
  # Searches for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json in the
  # current directory and its parents.
  #
  # @return [Kreuzberg::Config::Extraction, nil] Loaded configuration object,
  #   or nil when the native lookup returns nil
  #
  # @example
  #   config = Kreuzberg::Config::Extraction.discover
  #   if config
  #     # Use discovered config
  #   end
  #
  def self.discover
    found = Kreuzberg._config_discover_native
    found.nil? ? nil : new(**found.transform_keys(&:to_sym))
  end

  # The three flags are coerced to true/false. Each nested section accepts
  # nil, an instance of its config class, or a Hash of options for that
  # class. +max_concurrent_extractions+ is stored as an Integer when given.
  #
  # @raise [ArgumentError] if a nested section has an unsupported type
  def initialize(use_cache: true, enable_quality_processing: false, force_ocr: false,
                 ocr: nil, chunking: nil, language_detection: nil, pdf_options: nil,
                 image_extraction: nil, image_preprocessing: nil, postprocessor: nil,
                 token_reduction: nil, keywords: nil, html_options: nil,
                 max_concurrent_extractions: nil)
    @use_cache = truthy?(use_cache)
    @enable_quality_processing = truthy?(enable_quality_processing)
    @force_ocr = truthy?(force_ocr)
    @ocr = normalize_config(ocr, OCR)
    @chunking = normalize_config(chunking, Chunking)
    @language_detection = normalize_config(language_detection, LanguageDetection)
    @pdf_options = normalize_config(pdf_options, PDF)
    @image_extraction = normalize_config(image_extraction, ImageExtraction)
    @image_preprocessing = normalize_config(image_preprocessing, ImagePreprocessing)
    @postprocessor = normalize_config(postprocessor, PostProcessor)
    @token_reduction = normalize_config(token_reduction, TokenReduction)
    @keywords = normalize_config(keywords, Keywords)
    @html_options = normalize_config(html_options, HtmlOptions)
    @max_concurrent_extractions = max_concurrent_extractions&.to_i
  end

  # @return [Hash] Symbol-keyed configuration with every nested section
  #   serialized through its own to_h; nil entries are omitted
  def to_h
    flags = {
      use_cache: @use_cache,
      enable_quality_processing: @enable_quality_processing,
      force_ocr: @force_ocr
    }
    sections = {
      ocr: @ocr,
      chunking: @chunking,
      language_detection: @language_detection,
      pdf_options: @pdf_options,
      image_extraction: @image_extraction,
      image_preprocessing: @image_preprocessing,
      postprocessor: @postprocessor,
      token_reduction: @token_reduction,
      keywords: @keywords,
      html_options: @html_options
    }.transform_values { |section| section&.to_h }

    flags.merge(sections)
         .merge(max_concurrent_extractions: @max_concurrent_extractions)
         .compact
  end

  private

  # Collapses any value to a strict true/false.
  def truthy?(value)
    value ? true : false
  end

  # Accepts nil, an instance of klass, or a Hash of options for klass
  # (string keys are symbolized before being passed as keyword arguments).
  def normalize_config(value, klass)
    case value
    when nil, klass then value
    when Hash then klass.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
    end
  end
end
680
+
681
+ # Backwards-compatibility alias: the constant Ocr refers to the same class object as OCR.
682
+ Ocr = OCR
683
+ end
684
+ end