kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
metadata ADDED
@@ -0,0 +1,292 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: kreuzberg
3
+ version: !ruby/object:Gem::Version
4
+ version: 4.3.5
5
+ platform: aarch64-linux
6
+ authors:
7
+ - Na'aman Hirschfeld
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2026-02-17 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '4.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '4.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '13.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '13.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.2'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.2'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.12'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.12'
69
+ - !ruby/object:Gem::Dependency
70
+ name: sorbet-runtime
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.5'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.5'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rbs
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '3.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '3.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rubocop
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '1.66'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '1.66'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rubocop-performance
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '1.21'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '1.21'
125
+ - !ruby/object:Gem::Dependency
126
+ name: rubocop-rspec
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '3.0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '3.0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: steep
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: '1.8'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '1.8'
153
+ - !ruby/object:Gem::Dependency
154
+ name: yard
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '0.9'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - "~>"
165
+ - !ruby/object:Gem::Version
166
+ version: '0.9'
167
+ description: |
168
+ Kreuzberg is a high-performance document intelligence library with a Rust core and native
169
+ Ruby bindings via Magnus. Extract text, metadata, and structured data from 75+ file formats
170
+ including PDF, DOCX, PPTX, XLSX, HTML, RTF, images (with OCR), email, archives, and more.
171
+ Features async/sync APIs, text chunking, language detection, and keyword extraction.
172
+ email:
173
+ - nhirschfeld@gmail.com
174
+ executables: []
175
+ extensions: []
176
+ extra_rdoc_files: []
177
+ files:
178
+ - ".gitignore"
179
+ - ".rspec"
180
+ - ".rubocop.yaml"
181
+ - ".rubocop.yml"
182
+ - Gemfile
183
+ - Gemfile.lock
184
+ - README.md
185
+ - Rakefile
186
+ - Steepfile
187
+ - examples/async_patterns.rb
188
+ - extconf.rb
189
+ - kreuzberg.gemspec
190
+ - lib/kreuzberg.rb
191
+ - lib/kreuzberg/api_proxy.rb
192
+ - lib/kreuzberg/cache_api.rb
193
+ - lib/kreuzberg/cli.rb
194
+ - lib/kreuzberg/cli_proxy.rb
195
+ - lib/kreuzberg/config.rb
196
+ - lib/kreuzberg/djot_content.rb
197
+ - lib/kreuzberg/document_structure.rb
198
+ - lib/kreuzberg/error_context.rb
199
+ - lib/kreuzberg/errors.rb
200
+ - lib/kreuzberg/extraction_api.rb
201
+ - lib/kreuzberg/mcp_proxy.rb
202
+ - lib/kreuzberg/ocr_backend_protocol.rb
203
+ - lib/kreuzberg/post_processor_protocol.rb
204
+ - lib/kreuzberg/result.rb
205
+ - lib/kreuzberg/setup_lib_path.rb
206
+ - lib/kreuzberg/types.rb
207
+ - lib/kreuzberg/validator_protocol.rb
208
+ - lib/kreuzberg/version.rb
209
+ - lib/kreuzberg_rb.so
210
+ - lib/libpdfium.so
211
+ - sig/kreuzberg.rbs
212
+ - sig/kreuzberg/internal.rbs
213
+ - spec/binding/async_operations_spec.rb
214
+ - spec/binding/batch_operations_spec.rb
215
+ - spec/binding/batch_spec.rb
216
+ - spec/binding/cache_spec.rb
217
+ - spec/binding/cli_proxy_spec.rb
218
+ - spec/binding/cli_spec.rb
219
+ - spec/binding/config_result_spec.rb
220
+ - spec/binding/config_spec.rb
221
+ - spec/binding/config_validation_spec.rb
222
+ - spec/binding/embeddings_spec.rb
223
+ - spec/binding/error_handling_spec.rb
224
+ - spec/binding/error_recovery_spec.rb
225
+ - spec/binding/errors_spec.rb
226
+ - spec/binding/font_config_spec.rb
227
+ - spec/binding/images_spec.rb
228
+ - spec/binding/keywords_extraction_spec.rb
229
+ - spec/binding/metadata_types_spec.rb
230
+ - spec/binding/pages_extraction_spec.rb
231
+ - spec/binding/plugins/ocr_backend_spec.rb
232
+ - spec/binding/plugins/postprocessor_spec.rb
233
+ - spec/binding/plugins/validator_spec.rb
234
+ - spec/binding/tables_spec.rb
235
+ - spec/fixtures/config.toml
236
+ - spec/fixtures/config.yaml
237
+ - spec/fixtures/invalid_config.toml
238
+ - spec/serialization_spec.rb
239
+ - spec/smoke/package_spec.rb
240
+ - spec/spec_helper.rb
241
+ - spec/unit/config/chunking_config_spec.rb
242
+ - spec/unit/config/embedding_config_spec.rb
243
+ - spec/unit/config/extraction_config_spec.rb
244
+ - spec/unit/config/font_config_spec.rb
245
+ - spec/unit/config/hierarchy_config_spec.rb
246
+ - spec/unit/config/image_extraction_config_spec.rb
247
+ - spec/unit/config/image_preprocessing_config_spec.rb
248
+ - spec/unit/config/keyword_config_spec.rb
249
+ - spec/unit/config/language_detection_config_spec.rb
250
+ - spec/unit/config/ocr_config_spec.rb
251
+ - spec/unit/config/output_format_spec.rb
252
+ - spec/unit/config/page_config_spec.rb
253
+ - spec/unit/config/pdf_config_spec.rb
254
+ - spec/unit/config/postprocessor_config_spec.rb
255
+ - spec/unit/config/tesseract_config_spec.rb
256
+ - spec/unit/config/token_reduction_config_spec.rb
257
+ - test/metadata_types_test.rb
258
+ homepage: https://github.com/kreuzberg-dev/kreuzberg
259
+ licenses:
260
+ - MIT
261
+ metadata:
262
+ homepage_uri: https://github.com/kreuzberg-dev/kreuzberg
263
+ source_code_uri: https://github.com/kreuzberg-dev/kreuzberg
264
+ changelog_uri: https://github.com/kreuzberg-dev/kreuzberg/blob/main/CHANGELOG.md
265
+ documentation_uri: https://docs.kreuzberg.dev
266
+ bug_tracker_uri: https://github.com/kreuzberg-dev/kreuzberg/issues
267
+ rubygems_mfa_required: 'true'
268
+ keywords: document-intelligence,document-extraction,text-extraction,ocr,pdf,rust,native-extension,nlp,rag
269
+ post_install_message:
270
+ rdoc_options: []
271
+ require_paths:
272
+ - lib
273
+ required_ruby_version: !ruby/object:Gem::Requirement
274
+ requirements:
275
+ - - ">="
276
+ - !ruby/object:Gem::Version
277
+ version: 3.2.0
278
+ - - "<"
279
+ - !ruby/object:Gem::Version
280
+ version: '5.0'
281
+ required_rubygems_version: !ruby/object:Gem::Requirement
282
+ requirements:
283
+ - - ">="
284
+ - !ruby/object:Gem::Version
285
+ version: '0'
286
+ requirements: []
287
+ rubygems_version: 3.5.22
288
+ signing_key:
289
+ specification_version: 4
290
+ summary: Document intelligence library — extract text from PDFs, Office docs, images,
291
+ and 75+ formats
292
+ test_files: []