kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265)
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
data/sig/kreuzberg.rbs ADDED
@@ -0,0 +1,468 @@
1
+ # Type signatures for Kreuzberg document intelligence framework
2
+
3
+ module Kreuzberg
4
+ VERSION: String
5
+
6
+ # Config namespace (defined in lib/kreuzberg/config.rb)
7
+ module Config
8
+ class OCR
9
+ attr_reader backend: String
10
+ attr_reader language: String
11
+ attr_reader tesseract_config: Tesseract?
12
+
13
+ def initialize: (?backend: String, ?language: String, ?tesseract_config: (Tesseract | Hash[Symbol, untyped])?) -> void
14
+ def to_h: () -> Hash[Symbol, untyped]
15
+ end
16
+
17
+ class Tesseract
18
+ def initialize: (**untyped options) -> void
19
+ def to_h: () -> Hash[Symbol, untyped]
20
+ end
21
+
22
+ class Chunking
23
+ attr_reader max_chars: Integer
24
+ attr_reader max_overlap: Integer
25
+ attr_reader preset: String?
26
+ attr_reader embedding: Embedding?
27
+ attr_reader enabled: bool?
28
+
29
+ def initialize: (
30
+ ?max_chars: Integer?,
31
+ ?max_overlap: Integer?,
32
+ ?preset: String?,
33
+ ?embedding: (Embedding | Hash[Symbol, untyped])?,
34
+ ?chunk_size: Integer?,
35
+ ?chunk_overlap: Integer?,
36
+ ?enabled: bool
37
+ ) -> void
38
+ def to_h: () -> Hash[Symbol, untyped]
39
+ end
40
+
41
+ class Embedding
42
+ attr_reader model: Hash[Symbol, untyped]
43
+ attr_reader normalize: bool?
44
+ attr_reader batch_size: Integer?
45
+ attr_reader show_download_progress: bool?
46
+ attr_reader cache_dir: String?
47
+
48
+ def initialize: (
49
+ ?model: Hash[Symbol, untyped],
50
+ ?normalize: bool?,
51
+ ?batch_size: Integer?,
52
+ ?show_download_progress: bool?,
53
+ ?cache_dir: String?
54
+ ) -> void
55
+ def to_h: () -> Hash[Symbol, untyped]
56
+ end
57
+
58
+ class LanguageDetection
59
+ attr_reader enabled: bool
60
+ attr_reader min_confidence: Float
61
+ attr_reader detect_multiple: bool
62
+
63
+ def initialize: (?enabled: bool, ?min_confidence: Float, ?detect_multiple: bool) -> void
64
+ def to_h: () -> Hash[Symbol, untyped]
65
+ end
66
+
67
+ class PDF
68
+ attr_reader extract_images: bool
69
+ attr_reader passwords: Array[String]?
70
+ attr_reader extract_metadata: bool
71
+
72
+ def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool) -> void
73
+ def to_h: () -> Hash[Symbol, untyped]
74
+ end
75
+
76
+ class ImageExtraction
77
+ attr_reader extract_images: bool
78
+ attr_reader target_dpi: Integer
79
+ attr_reader max_image_dimension: Integer
80
+ attr_reader auto_adjust_dpi: bool
81
+ attr_reader min_dpi: Integer
82
+ attr_reader max_dpi: Integer
83
+
84
+ def initialize: (
85
+ ?extract_images: bool,
86
+ ?target_dpi: Integer,
87
+ ?max_image_dimension: Integer,
88
+ ?auto_adjust_dpi: bool,
89
+ ?min_dpi: Integer,
90
+ ?max_dpi: Integer
91
+ ) -> void
92
+ def to_h: () -> Hash[Symbol, untyped]
93
+ end
94
+
95
+ class ImagePreprocessing
96
+ attr_reader target_dpi: Integer
97
+ attr_reader auto_rotate: bool
98
+ attr_reader deskew: bool
99
+ attr_reader denoise: bool
100
+ attr_reader contrast_enhance: bool
101
+ attr_reader binarization_method: String
102
+ attr_reader invert_colors: bool
103
+
104
+ def initialize: (
105
+ ?target_dpi: Integer,
106
+ ?auto_rotate: bool,
107
+ ?deskew: bool,
108
+ ?denoise: bool,
109
+ ?contrast_enhance: bool,
110
+ ?binarization_method: String,
111
+ ?invert_colors: bool
112
+ ) -> void
113
+ def to_h: () -> Hash[Symbol, untyped]
114
+ end
115
+
116
+ class TokenReduction
117
+ attr_reader mode: String
118
+ attr_reader preserve_important_words: bool
119
+
120
+ def initialize: (?mode: String, ?preserve_important_words: bool) -> void
121
+ def to_h: () -> Hash[Symbol, untyped]
122
+ end
123
+
124
+ class PostProcessor
125
+ attr_reader enabled: bool
126
+ attr_reader enabled_processors: Array[String]?
127
+ attr_reader disabled_processors: Array[String]?
128
+
129
+ def initialize: (?enabled: bool, ?enabled_processors: Array[String]?, ?disabled_processors: Array[String]?) -> void
130
+ def to_h: () -> Hash[Symbol, untyped]
131
+ end
132
+
133
+ class HtmlPreprocessing
134
+ attr_reader enabled: bool?
135
+ attr_reader preset: Symbol?
136
+ attr_reader remove_navigation: bool?
137
+ attr_reader remove_forms: bool?
138
+
139
+ def initialize: (?enabled: bool?, ?preset: Symbol?, ?remove_navigation: bool?, ?remove_forms: bool?) -> void
140
+ def to_h: () -> Hash[Symbol, untyped]
141
+ end
142
+
143
+ class HtmlOptions
144
+ def initialize: (**untyped options) -> void
145
+ def to_h: () -> Hash[Symbol, untyped]
146
+ end
147
+
148
+ class Keywords
149
+ def initialize: (
150
+ ?algorithm: Symbol?,
151
+ ?max_keywords: Integer?,
152
+ ?min_score: Float?,
153
+ ?ngram_range: Array[Integer]?,
154
+ ?language: Symbol?,
155
+ ?yake_params: Hash[Symbol, untyped]?,
156
+ ?rake_params: Hash[Symbol, untyped]?
157
+ ) -> void
158
+ def to_h: () -> Hash[Symbol, untyped]
159
+ end
160
+
161
+ class Extraction
162
+ attr_reader use_cache: bool
163
+ attr_reader enable_quality_processing: bool
164
+ attr_reader force_ocr: bool
165
+ attr_reader ocr: OCR?
166
+ attr_reader chunking: Chunking?
167
+ attr_reader language_detection: LanguageDetection?
168
+ attr_reader pdf_options: PDF?
169
+ attr_reader image_extraction: ImageExtraction?
170
+ attr_reader image_preprocessing: ImagePreprocessing?
171
+ attr_reader postprocessor: PostProcessor?
172
+ attr_reader token_reduction: TokenReduction?
173
+ attr_reader keywords: Keywords?
174
+ attr_reader html_options: HtmlOptions?
175
+ attr_reader max_concurrent_extractions: Integer?
176
+
177
+ def self.from_file: (String path) -> Extraction
178
+ def initialize: (
179
+ ?use_cache: bool,
180
+ ?enable_quality_processing: bool,
181
+ ?force_ocr: bool,
182
+ ?ocr: (OCR | Hash[Symbol, untyped])?,
183
+ ?chunking: (Chunking | Hash[Symbol, untyped])?,
184
+ ?language_detection: (LanguageDetection | Hash[Symbol, untyped])?,
185
+ ?pdf_options: (PDF | Hash[Symbol, untyped])?,
186
+ ?image_extraction: (ImageExtraction | Hash[Symbol, untyped])?,
187
+ ?image_preprocessing: (ImagePreprocessing | Hash[Symbol, untyped])?,
188
+ ?postprocessor: (PostProcessor | Hash[Symbol, untyped])?,
189
+ ?token_reduction: (TokenReduction | Hash[Symbol, untyped])?,
190
+ ?keywords: (Keywords | Hash[Symbol, untyped])?,
191
+ ?html_options: (HtmlOptions | Hash[Symbol, untyped])?,
192
+ ?max_concurrent_extractions: Integer?
193
+ ) -> void
194
+ def to_h: () -> Hash[Symbol, untyped]
195
+
196
+ private
197
+
198
+ def normalize_config: [T] (T | Hash[Symbol, untyped] | nil value, Class klass) -> T?
199
+ end
200
+
201
+ # Backwards compatibility alias
202
+ Ocr: singleton(OCR)
203
+ end
204
+
205
+ # Alias for Config::Extraction (for API consistency with other language bindings)
206
+ ExtractionConfig: singleton(Config::Extraction)
207
+
208
+ # Extraction result type
209
+ type extraction_result_hash = {
210
+ content: String,
211
+ mime_type: String,
212
+ metadata_json: String,
213
+ tables: Array[table_hash]?,
214
+ detected_languages: Array[String]?,
215
+ chunks: Array[chunk_hash]?,
216
+ images: Array[image_hash]?
217
+ }
218
+
219
+ type table_hash = {
220
+ cells: Array[Array[String]],
221
+ markdown: String,
222
+ page_number: Integer
223
+ }
224
+
225
+ type chunk_hash = {
226
+ content: String,
227
+ char_start: Integer,
228
+ char_end: Integer,
229
+ token_count: Integer?,
230
+ chunk_index: Integer?,
231
+ total_chunks: Integer?,
232
+ embedding: Array[Float]?
233
+ }
234
+
235
+ type image_hash = {
236
+ data: String,
237
+ format: String,
238
+ image_index: Integer,
239
+ page_number: Integer?,
240
+ width: Integer?,
241
+ height: Integer?,
242
+ colorspace: String?,
243
+ bits_per_component: Integer?,
244
+ is_mask: bool?,
245
+ description: String?,
246
+ ocr_result: extraction_result_hash?
247
+ }
248
+
249
+ type config_hash = Hash[Symbol, untyped]
250
+ type config_input = config_hash | _ToH
251
+
252
+ interface _ToH
253
+ def to_h: () -> config_hash
254
+ end
255
+
256
+ # Extraction result wrapper
257
+ class Result
258
+ # Table structure
259
+ class Table
260
+ attr_reader cells: Array[Array[String]]
261
+ attr_reader markdown: String
262
+ attr_reader page_number: Integer
263
+
264
+ def initialize: (cells: Array[Array[String]], markdown: String, page_number: Integer) -> void
265
+ def to_h: () -> table_hash
266
+ end
267
+
268
+ # Text chunk
269
+ class Chunk
270
+ attr_reader content: String
271
+ attr_reader char_start: Integer
272
+ attr_reader char_end: Integer
273
+ attr_reader token_count: Integer?
274
+ attr_reader chunk_index: Integer?
275
+ attr_reader total_chunks: Integer?
276
+ attr_reader embedding: Array[Float]?
277
+
278
+ def initialize: (
279
+ content: String,
280
+ char_start: Integer,
281
+ char_end: Integer,
282
+ token_count: Integer?,
283
+ chunk_index: Integer?,
284
+ total_chunks: Integer?,
285
+ embedding: Array[Float]?
286
+ ) -> void
287
+ def to_h: () -> chunk_hash
288
+ end
289
+
290
+ class Image
291
+ attr_reader data: String
292
+ attr_reader format: String
293
+ attr_reader image_index: Integer
294
+ attr_reader page_number: Integer?
295
+ attr_reader width: Integer?
296
+ attr_reader height: Integer?
297
+ attr_reader colorspace: String?
298
+ attr_reader bits_per_component: Integer?
299
+ attr_reader is_mask: bool?
300
+ attr_reader description: String?
301
+ attr_reader ocr_result: Result?
302
+
303
+ def initialize: (
304
+ data: String,
305
+ format: String,
306
+ image_index: Integer,
307
+ page_number: Integer?,
308
+ width: Integer?,
309
+ height: Integer?,
310
+ colorspace: String?,
311
+ bits_per_component: Integer?,
312
+ is_mask: bool?,
313
+ description: String?,
314
+ ocr_result: Result?
315
+ ) -> void
316
+ def to_h: () -> image_hash
317
+ end
318
+
319
+ attr_reader content: String
320
+ attr_reader mime_type: String
321
+ attr_reader metadata: Hash[untyped, untyped]
322
+ attr_reader metadata_json: String
323
+ attr_reader tables: Array[Table]
324
+ attr_reader detected_languages: Array[String]?
325
+ attr_reader chunks: Array[Chunk]?
326
+ attr_reader images: Array[Image]?
327
+
328
+ def initialize: (extraction_result_hash hash) -> void
329
+ def to_h: () -> Hash[Symbol, untyped]
330
+ def to_json: (*untyped) -> String
331
+
332
+ private
333
+
334
+ def parse_metadata: (String metadata_json) -> Hash[untyped, untyped]
335
+ def parse_tables: (Array[table_hash]? tables_data) -> Array[Table]
336
+ def parse_detected_languages: (Array[String]? langs_data) -> Array[String]?
337
+ def parse_chunks: (Array[chunk_hash]? chunks_data) -> Array[Chunk]?
338
+ end
339
+
340
+ # Module methods (extraction API)
341
+ def self.extract_file_sync: (
342
+ String | Pathname path,
343
+ ?mime_type: String?,
344
+ ?config: config_input?
345
+ ) -> Result
346
+
347
+ def self.extract_bytes_sync: (
348
+ String data,
349
+ String mime_type,
350
+ ?config: config_input?
351
+ ) -> Result
352
+
353
+ def self.batch_extract_files_sync: (
354
+ Array[String | Pathname] paths,
355
+ ?config: config_input?
356
+ ) -> Array[Result]
357
+
358
+ def self.batch_extract_bytes_sync: (
359
+ Array[String] data_array,
360
+ Array[String] mime_types,
361
+ ?config: config_input?
362
+ ) -> Array[Result]
363
+
364
+ def self.extract_file: (
365
+ String | Pathname path,
366
+ ?mime_type: String?,
367
+ ?config: config_input?
368
+ ) -> Result
369
+
370
+ def self.extract_bytes: (
371
+ String data,
372
+ String mime_type,
373
+ ?config: config_input?
374
+ ) -> Result
375
+
376
+ def self.batch_extract_files: (
377
+ Array[String | Pathname] paths,
378
+ ?config: config_input?
379
+ ) -> Array[Result]
380
+
381
+ def self.batch_extract_bytes: (
382
+ Array[String] data_array,
383
+ Array[String] mime_types,
384
+ ?config: config_input?
385
+ ) -> Array[Result]
386
+
387
+ # Cache API
388
+ def self.clear_cache: () -> void
389
+ def self.cache_stats: () -> Hash[Symbol | String, Integer]
390
+
391
+ # Config loading (native method)
392
+ def self._config_from_file_native: (String path) -> Hash[Symbol, untyped]
393
+
394
+ # Plugin registration
395
+ def self.register_post_processor: (String name, _PostProcessor processor, ?stage: Symbol?) -> void
396
+ def self.unregister_post_processor: (String name) -> void
397
+ def self.clear_post_processors: () -> void
398
+ def self.register_validator: (String name, _Validator validator, ?priority: Integer?) -> void
399
+ def self.unregister_validator: (String name) -> void
400
+ def self.clear_validators: () -> void
401
+ def self.register_ocr_backend: (_OcrBackend backend) -> void
402
+
403
+ interface _PostProcessor
404
+ def call: (extraction_result_hash result) -> extraction_result_hash
405
+ end
406
+
407
+ interface _Validator
408
+ def call: (extraction_result_hash result) -> void
409
+ end
410
+
411
+ interface _OcrBackend
412
+ def name: () -> String
413
+ def extract_text: (String file_path_or_bytes, Hash[Symbol, untyped] config) -> String
414
+ end
415
+
416
+ module Errors
417
+ class Error < StandardError
418
+ end
419
+
420
+ class ValidationError < Error
421
+ end
422
+
423
+ class ParsingError < Error
424
+ def initialize: (String message, ?context: Hash[untyped, untyped]?) -> void
425
+ end
426
+
427
+ class OCRError < Error
428
+ def initialize: (String message, ?context: Hash[untyped, untyped]?) -> void
429
+ end
430
+
431
+ class MissingDependencyError < Error
432
+ attr_reader dependency: String?
433
+
434
+ def initialize: (String message, ?dependency: String?) -> void
435
+ end
436
+
437
+ class IOError < Error
438
+ end
439
+
440
+ class PluginError < Error
441
+ end
442
+
443
+ class UnsupportedFormatError < Error
444
+ end
445
+ end
446
+
447
+ # Internal modules (prepended to Kreuzberg singleton)
448
+ # These are not checked by steep - see Steepfile
449
+ module CacheAPI : Object
450
+ end
451
+
452
+ module ExtractionAPI : Object
453
+ end
454
+
455
+ module PostProcessorProtocol
456
+ def call: (extraction_result_hash result) -> extraction_result_hash
457
+ end
458
+
459
+ module ValidatorProtocol
460
+ def call: (extraction_result_hash result) -> void
461
+ end
462
+
463
+ module OcrBackendProtocol
464
+ def name: () -> String
465
+ def extract_text: (String file_path_or_bytes, Hash[Symbol, untyped] config) -> String
466
+ def process_image: (String file_path_or_bytes, Hash[Symbol, untyped] config) -> String
467
+ end
468
+ end
@@ -0,0 +1,227 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ RSpec.describe 'Cache Management' do
6
+ let(:test_pdf) do
7
+ test_document_path('pdfs/5_level_paging_and_5_level_ept_intel_revision_1_1_may_2017.pdf')
8
+ end
9
+ let(:test_text) { test_document_path('text/contract_test.txt') }
10
+ let(:test_docx) { test_document_path('documents/contract.docx') }
11
+
12
+ before do
13
+ Kreuzberg.clear_cache
14
+ end
15
+
16
+ after do
17
+ Kreuzberg.clear_cache
18
+ end
19
+
20
+ describe 'clear_cache' do
21
+ it 'removes all cached results' do
22
+ Kreuzberg.extract_file_sync(test_pdf)
23
+ Kreuzberg.extract_file_sync(test_text)
24
+
25
+ stats_before = Kreuzberg.cache_stats
26
+ expect(stats_before['total_entries']).to be_positive
27
+
28
+ Kreuzberg.clear_cache
29
+
30
+ stats_after = Kreuzberg.cache_stats
31
+ expect(stats_after['total_entries']).to eq(0)
32
+ expect(stats_after['total_size_bytes']).to eq(0)
33
+ end
34
+
35
+ it 'returns nil (void return)' do
36
+ result = Kreuzberg.clear_cache
37
+ expect(result).to be_nil
38
+ end
39
+
40
+ it 'can be called multiple times safely' do
41
+ Kreuzberg.clear_cache
42
+ Kreuzberg.clear_cache
43
+ Kreuzberg.clear_cache
44
+
45
+ stats = Kreuzberg.cache_stats
46
+ expect(stats['total_entries']).to eq(0)
47
+ end
48
+
49
+ it 'does not affect future extractions' do
50
+ Kreuzberg.extract_file_sync(test_pdf)
51
+ Kreuzberg.clear_cache
52
+
53
+ result = Kreuzberg.extract_file_sync(test_pdf)
54
+
55
+ expect(result).to be_a(Kreuzberg::Result)
56
+ expect(result.content).not_to be_empty
57
+ end
58
+ end
59
+
60
+ describe 'cache_stats' do
61
+ it 'returns hash with correct structure' do
62
+ stats = Kreuzberg.cache_stats
63
+
64
+ expect(stats).to be_a(Hash)
65
+ expect(stats).to have_key('total_entries')
66
+ expect(stats).to have_key('total_size_bytes')
67
+ end
68
+
69
+ it 'returns zero stats when cache is empty' do
70
+ Kreuzberg.clear_cache
71
+ stats = Kreuzberg.cache_stats
72
+
73
+ expect(stats['total_entries']).to eq(0)
74
+ expect(stats['total_size_bytes']).to eq(0)
75
+ end
76
+
77
+ it 'shows entries after extractions' do
78
+ Kreuzberg.clear_cache
79
+
80
+ Kreuzberg.extract_file_sync(test_pdf)
81
+ stats = Kreuzberg.cache_stats
82
+
83
+ expect(stats['total_entries']).to be_positive
84
+ end
85
+
86
+ it 'shows total size in bytes' do
87
+ Kreuzberg.clear_cache
88
+
89
+ Kreuzberg.extract_file_sync(test_pdf)
90
+ stats = Kreuzberg.cache_stats
91
+
92
+ expect(stats['total_size_bytes']).to be_positive
93
+ end
94
+
95
+ it 'increases stats with multiple extractions' do
96
+ Kreuzberg.clear_cache
97
+
98
+ Kreuzberg.extract_file_sync(test_pdf)
99
+ stats_after_one = Kreuzberg.cache_stats
100
+
101
+ Kreuzberg.extract_file_sync(test_text)
102
+ stats_after_two = Kreuzberg.cache_stats
103
+
104
+ expect(stats_after_two['total_entries']).to be >= stats_after_one['total_entries']
105
+ end
106
+ end
107
+
108
+ describe 'cache behavior across extractions' do
109
+ it 'caches extraction results' do
110
+ Kreuzberg.clear_cache
111
+ stats_initial = Kreuzberg.cache_stats
112
+ expect(stats_initial['total_entries']).to eq(0)
113
+
114
+ result1 = Kreuzberg.extract_file_sync(test_pdf)
115
+ stats_after_first = Kreuzberg.cache_stats
116
+ expect(stats_after_first['total_entries']).to be_positive
117
+
118
+ result2 = Kreuzberg.extract_file_sync(test_pdf)
119
+ stats_after_second = Kreuzberg.cache_stats
120
+
121
+ expect(result1.content).to eq(result2.content)
122
+ expect(stats_after_second['total_entries']).to eq(stats_after_first['total_entries'] + 1)
123
+ end
124
+
125
+ it 'tracks different files separately' do
126
+ Kreuzberg.clear_cache
127
+
128
+ Kreuzberg.extract_file_sync(test_pdf)
129
+ stats_after_pdf = Kreuzberg.cache_stats
130
+
131
+ Kreuzberg.extract_file_sync(test_text)
132
+ stats_after_text = Kreuzberg.cache_stats
133
+
134
+ expect(stats_after_text['total_entries']).to be >= stats_after_pdf['total_entries']
135
+ end
136
+
137
+ it 'second extraction of same file may use cache' do # Checks that two extractions of the same PDF return equal content and MIME type.
138
+ Kreuzberg.clear_cache
139
+
140
+ Time.now # NOTE(review): return value is discarded; looks like a leftover from removed timing code - confirm and delete
141
+ result1 = Kreuzberg.extract_file_sync(test_pdf)
142
+ Time.now # NOTE(review): discarded as above
143
+
144
+ Time.now # NOTE(review): discarded as above
145
+ result2 = Kreuzberg.extract_file_sync(test_pdf)
146
+ Time.now # NOTE(review): discarded as above
147
+
148
+ expect(result1.content).to eq(result2.content)
149
+ expect(result1.mime_type).to eq(result2.mime_type)
150
+ end
151
+
152
+ it 'clears cache between extractions when requested' do
153
+ result1 = Kreuzberg.extract_file_sync(test_pdf)
154
+
155
+ Kreuzberg.clear_cache
156
+
157
+ result2 = Kreuzberg.extract_file_sync(test_pdf)
158
+
159
+ expect(result1.content).to eq(result2.content)
160
+ end
161
+ end
162
+
163
+ describe 'cache with different configurations' do
164
+ it 'respects use_cache flag in configs' do # Extracts the same PDF with use_cache: true, then with use_cache: false, and compares entry counts.
165
+ Kreuzberg.clear_cache
166
+
167
+ config1 = Kreuzberg::Config::Extraction.new(use_cache: true)
168
+ config2 = Kreuzberg::Config::Extraction.new(use_cache: false)
169
+
170
+ Kreuzberg.extract_file_sync(test_pdf, config: config1)
171
+ stats_after_first = Kreuzberg.cache_stats
172
+
173
+ Kreuzberg.extract_file_sync(test_pdf, config: config2)
174
+ stats_after_second = Kreuzberg.cache_stats
175
+
176
+ expect(stats_after_second['total_entries']).to eq(stats_after_first['total_entries']) # the use_cache: false run must leave the entry count unchanged
177
+ end
178
+ end
179
+
180
+ describe 'cache stats consistency' do
181
+ it 'stats remain consistent after clear' do
182
+ Kreuzberg.extract_file_sync(test_pdf)
183
+ Kreuzberg.extract_file_sync(test_text)
184
+
185
+ Kreuzberg.clear_cache
186
+ stats = Kreuzberg.cache_stats
187
+
188
+ expect(stats['total_entries']).to eq(0)
189
+ expect(stats['total_size_bytes']).to eq(0)
190
+ end
191
+
192
+ it 'stats update correctly after new extractions' do # After a clear, a fresh extraction must show up in cache_stats again.
193
+ Kreuzberg.clear_cache
194
+
195
+ Kreuzberg.extract_file_sync(test_pdf)
196
+ Kreuzberg.cache_stats # NOTE(review): result is discarded; presumably a leftover from a removed local - confirm whether an assertion was intended
197
+
198
+ Kreuzberg.clear_cache
199
+
200
+ Kreuzberg.extract_file_sync(test_text)
201
+ stats2 = Kreuzberg.cache_stats
202
+
203
+ expect(stats2['total_entries']).to be_positive
204
+ end
205
+ end
206
+
207
+ describe 'integration with batch operations' do
208
+ it 'caches batch extraction results' do
209
+ Kreuzberg.clear_cache
210
+
211
+ results = Kreuzberg.batch_extract_files_sync([test_pdf, test_text])
212
+ stats = Kreuzberg.cache_stats
213
+
214
+ expect(results.length).to eq(2)
215
+ expect(stats['total_entries']).to be_positive
216
+ end
217
+
218
+ it 'clear_cache affects batch extractions' do
219
+ Kreuzberg.batch_extract_files_sync([test_pdf, test_text])
220
+
221
+ Kreuzberg.clear_cache
222
+
223
+ stats = Kreuzberg.cache_stats
224
+ expect(stats['total_entries']).to eq(0)
225
+ end
226
+ end
227
+ end