kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
data/Steepfile ADDED
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Steep configuration for Kreuzberg Ruby package
4
+
5
+ target :lib do
6
+ signature 'sig'
7
+
8
+ check 'lib'
9
+
10
+ # Core library
11
+ library 'pathname'
12
+ library 'json'
13
+ library 'fileutils'
14
+ library 'open3'
15
+
16
+ # Strategic ignores for steep limitations (not fixable, safe to ignore):
17
+
18
+ # 1. Sorbet type annotations - Steep doesn't recognize Sorbet's T::Struct and T::Sig
19
+ # This file uses Sorbet exclusively for type definitions
20
+ ignore 'lib/kreuzberg/types.rb'
21
+
22
+ # 2. Struct.new with keyword_init - steep cannot understand implicit attr_readers
23
+ # defined by Struct.new in blocks (Table and Chunk classes)
24
+ ignore 'lib/kreuzberg/result.rb'
25
+
26
+ # 3. Generic type parameters in normalize_config - steep has difficulty with
27
+ # methods that take Class as parameter and return instances
28
+ ignore 'lib/kreuzberg/config.rb'
29
+
30
+ # 4. Interface types - steep doesn't recognize that all Ruby objects have nil? and is_a?
31
+ # even for interface types like _ToH
32
+ ignore 'lib/kreuzberg/extraction_api.rb'
33
+
34
+ # 5. Open3 methods - steep's built-in Open3 RBS signatures are incomplete
35
+ # (capture2, capture3, popen3 are standard library methods)
36
+ ignore 'lib/kreuzberg/setup_lib_path.rb'
37
+ ignore 'lib/kreuzberg/cli_proxy.rb'
38
+ ignore 'lib/kreuzberg/mcp_proxy.rb'
39
+
40
+ # Ignore Rust extension methods (defined in native code)
41
+ ignore 'ext'
42
+
43
+ # Ignore vendored code
44
+ ignore 'vendor'
45
+
46
+ # Ignore test files
47
+ ignore 'spec'
48
+
49
+ # Ignore examples
50
+ ignore 'examples'
51
+ end
@@ -0,0 +1,283 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'kreuzberg'
4
+
5
+ # NOTE: Ruby bindings use Tokio runtime with block_on() internally.
6
+ # The "async" functions block the Ruby GVL during execution, so there's
7
+ # no performance benefit over the _sync variants from Ruby's perspective.
8
+
9
+ # ============================================================================
10
+ # Pattern 1: Synchronous Extraction (Recommended)
11
+ # ============================================================================
12
+
13
+ def basic_sync_extraction
14
+ result = Kreuzberg.extract_file_sync('document.pdf')
15
+ puts "Content: #{result[:content]}"
16
+ puts "MIME type: #{result[:mime_type]}"
17
+ end
18
+
19
+ # ============================================================================
20
+ # Pattern 2: "Async" Extraction (Same Performance as Sync)
21
+ # ============================================================================
22
+
23
+ def basic_async_extraction
24
+ result = Kreuzberg.extract_file('document.pdf')
25
+ puts "Content: #{result[:content]}"
26
+ end
27
+
28
+ # ============================================================================
29
+ # Pattern 3: Concurrent Processing with Ruby Threads
30
+ # ============================================================================
31
+
32
+ def concurrent_with_threads
33
+ files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf']
34
+
35
+ threads = files.map do |file|
36
+ Thread.new do
37
+ Kreuzberg.extract_file_sync(file)
38
+ end
39
+ end
40
+
41
+ results = threads.map(&:value)
42
+ results.each_with_index do |result, index|
43
+ puts "File #{index + 1}: #{result[:content][0..100]}"
44
+ end
45
+ end
46
+
47
+ # ============================================================================
48
+ # Pattern 4: Batch Processing (Preferred for Multiple Files)
49
+ # ============================================================================
50
+
51
+ def batch_processing
52
+ files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf']
53
+
54
+ results = Kreuzberg.batch_extract_files_sync(files)
55
+
56
+ puts "Processed #{results.length} files"
57
+ results.each do |result|
58
+ puts "Content preview: #{result[:content][0..50]}"
59
+ end
60
+ end
61
+
62
+ # ============================================================================
63
+ # Pattern 5: Extraction with Configuration
64
+ # ============================================================================
65
+
66
+ def extraction_with_config
67
+ config = {
68
+ ocr: {
69
+ backend: 'tesseract',
70
+ language: 'eng'
71
+ },
72
+ force_ocr: false
73
+ }
74
+
75
+ result = Kreuzberg.extract_file_sync('scanned.pdf', **config)
76
+ puts "Extracted with OCR: #{result[:content]}"
77
+ end
78
+
79
+ # ============================================================================
80
+ # Pattern 6: Extract from Bytes
81
+ # ============================================================================
82
+
83
+ def extract_from_bytes
84
+ data = File.binread('document.pdf')
85
+ result = Kreuzberg.extract_bytes_sync(data, 'application/pdf')
86
+ puts "Extracted from memory: #{result[:content]}"
87
+ end
88
+
89
+ # ============================================================================
90
+ # Pattern 7: Batch Extract from Bytes
91
+ # ============================================================================
92
+
93
+ def batch_extract_from_bytes
94
+ files = ['doc1.pdf', 'doc2.pdf']
95
+ bytes_array = files.map { |f| File.binread(f) }
96
+ mime_types = ['application/pdf', 'application/pdf']
97
+
98
+ results = Kreuzberg.batch_extract_bytes_sync(bytes_array, mime_types)
99
+ puts "Processed #{results.length} files from memory"
100
+ end
101
+
102
+ # ============================================================================
103
+ # Pattern 8: Error Handling
104
+ # ============================================================================
105
+
106
+ def error_handling
107
+ Kreuzberg.extract_file_sync('nonexistent.pdf')
108
+ rescue StandardError => e
109
+ puts "Extraction failed: #{e.message}"
110
+ end
111
+
112
+ # ============================================================================
113
+ # Pattern 9: Sequential Processing
114
+ # ============================================================================
115
+
116
+ def sequential_processing
117
+ files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf']
118
+
119
+ files.each do |file|
120
+ result = Kreuzberg.extract_file_sync(file)
121
+ puts "Processed #{file}: #{result[:content][0..50]}"
122
+ end
123
+ end
124
+
125
+ # ============================================================================
126
+ # Pattern 10: Background Processing with ActiveJob (Rails)
127
+ # ============================================================================
128
+
129
+ # Example ActiveJob for background processing in Rails.
130
+ # In a real Rails app, declare it as `class DocumentExtractionJob < ApplicationJob`.
131
+ class DocumentExtractionJob
132
+ def perform(file_path)
133
+ result = Kreuzberg.extract_file_sync(file_path)
134
+ puts "Background extraction complete: #{result[:content][0..100]}"
135
+ end
136
+ end
137
+
138
+ # Usage in Rails controller:
139
+ # DocumentExtractionJob.perform_later('document.pdf')
140
+
141
+ # ============================================================================
142
+ # Pattern 11: Concurrent Processing with Parallel Gem
143
+ # ============================================================================
144
+
145
+ def concurrent_with_parallel_gem
146
+ require 'parallel'
147
+
148
+ files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf', 'doc4.pdf']
149
+
150
+ results = Parallel.map(files, in_processes: 4) do |file|
151
+ Kreuzberg.extract_file_sync(file)
152
+ end
153
+
154
+ results.each do |result|
155
+ puts "Content: #{result[:content][0..50]}"
156
+ end
157
+ end
158
+
159
+ # ============================================================================
160
+ # Pattern 12: Timeout Wrapper
161
+ # ============================================================================
162
+
163
+ def extraction_with_timeout(file_path, timeout_seconds = 30)
164
+ require 'timeout'
165
+
166
+ Timeout.timeout(timeout_seconds) do
167
+ Kreuzberg.extract_file_sync(file_path)
168
+ end
169
+ rescue Timeout::Error
170
+ puts "Extraction timed out after #{timeout_seconds} seconds"
171
+ nil
172
+ end
173
+
174
+ # ============================================================================
175
+ # Pattern 13: Custom Ruby PostProcessor Plugin
176
+ # ============================================================================
177
+
178
+ def register_postprocessor
179
+ uppercase_processor = lambda do |result|
180
+ result[:content] = result[:content].upcase
181
+ result
182
+ end
183
+
184
+ Kreuzberg.register_post_processor('uppercase', uppercase_processor, 100)
185
+
186
+ result = Kreuzberg.extract_file_sync('document.pdf')
187
+ puts "Uppercase content: #{result[:content]}"
188
+
189
+ Kreuzberg.unregister_post_processor('uppercase')
190
+ end
191
+
192
+ # ============================================================================
193
+ # Pattern 14: Custom Ruby Validator Plugin
194
+ # ============================================================================
195
+
196
+ def register_validator
197
+ min_length_validator = lambda do |result|
198
+ raise 'Content too short' if result[:content].length < 100
199
+ end
200
+
201
+ Kreuzberg.register_validator('min_length', min_length_validator, 100)
202
+
203
+ begin
204
+ result = Kreuzberg.extract_file_sync('short_document.pdf')
205
+ puts "Validation passed: #{result[:content]}"
206
+ rescue StandardError => e
207
+ puts "Validation failed: #{e.message}"
208
+ end
209
+
210
+ Kreuzberg.unregister_validator('min_length')
211
+ end
212
+
213
+ # ============================================================================
214
+ # Pattern 15: Custom Ruby OCR Backend Plugin
215
+ # ============================================================================
216
+
217
+ # Example OCR backend implementation for custom processing.
218
+ class CustomOcrBackend
219
+ def process_image(image_bytes, language)
220
+ "Extracted text from #{image_bytes.length} bytes using #{language}"
221
+ end
222
+
223
+ def supports_language?(lang)
224
+ %w[eng deu fra].include?(lang)
225
+ end
226
+ end
227
+
228
+ def register_ocr_backend
229
+ backend = CustomOcrBackend.new
230
+ Kreuzberg.register_ocr_backend('custom', backend)
231
+
232
+ config = {
233
+ ocr: {
234
+ backend: 'custom',
235
+ language: 'eng'
236
+ },
237
+ force_ocr: true
238
+ }
239
+
240
+ result = Kreuzberg.extract_file_sync('scanned.pdf', **config)
241
+ puts "Custom OCR result: #{result[:content]}"
242
+ end
243
+
244
+ # ============================================================================
245
+ # Main Demonstration
246
+ # ============================================================================
247
+
248
+ def main
249
+ puts '=== Basic Sync Extraction ==='
250
+ basic_sync_extraction
251
+
252
+ puts '\n=== Basic Async Extraction (Blocks GVL) ==='
253
+ basic_async_extraction
254
+
255
+ puts '\n=== Concurrent with Ruby Threads ==='
256
+ concurrent_with_threads
257
+
258
+ puts '\n=== Batch Processing (Preferred) ==='
259
+ batch_processing
260
+
261
+ puts '\n=== Extraction with Config ==='
262
+ extraction_with_config
263
+
264
+ puts '\n=== Extract from Bytes ==='
265
+ extract_from_bytes
266
+
267
+ puts '\n=== Error Handling ==='
268
+ error_handling
269
+
270
+ puts '\n=== Sequential Processing ==='
271
+ sequential_processing
272
+
273
+ puts '\n=== Extraction with Timeout ==='
274
+ extraction_with_timeout('large_document.pdf', 30)
275
+
276
+ puts '\n=== Custom PostProcessor ==='
277
+ register_postprocessor
278
+
279
+ puts '\n=== Custom Validator ==='
280
+ register_validator
281
+ end
282
+
283
+ main if __FILE__ == $PROGRAM_NAME
data/extconf.rb ADDED
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'mkmf'
4
+ require 'rb_sys/mkmf'
5
+ require 'rbconfig'
6
+ require 'fileutils'
7
+
8
+ if Gem.win_platform?
9
+ # Use CI-provided CARGO_TARGET_DIR if available, otherwise use a short path
10
+ # GitHub Actions sets CARGO_TARGET_DIR=C:\t for MAX_PATH mitigation
11
+ if ENV['CARGO_TARGET_DIR']
12
+ puts "Windows detected: Using existing CARGO_TARGET_DIR=#{ENV['CARGO_TARGET_DIR']}"
13
+ else
14
+ short_target_dir = Dir.exist?('C:/t') ? 'C:/t' : 'C:/kz-build'
15
+ begin
16
+ FileUtils.mkdir_p(short_target_dir)
17
+ ENV['CARGO_TARGET_DIR'] = short_target_dir
18
+ ENV['OUT_DIR'] = short_target_dir
19
+ puts "Windows detected: Using short build path #{short_target_dir}"
20
+ rescue StandardError => e
21
+ puts "Warning: Could not create short path #{short_target_dir}: #{e.message}"
22
+ end
23
+ end
24
+ end
25
+
26
+ if /mswin|mingw/.match?(RbConfig::CONFIG['host_os'])
27
+ devkit = ENV.fetch('RI_DEVKIT', nil)
28
+ prefix = ENV['MSYSTEM_PREFIX'] || '/ucrt64'
29
+
30
+ # Set up include paths for MSVC compatibility headers
31
+ native_include = File.expand_path('ext/kreuzberg_rb/native/include', __dir__).tr('\\', '/')
32
+ compat_include = File.expand_path('ext/kreuzberg_rb/native/include/msvc_compat', __dir__).tr('\\', '/')
33
+
34
+ extra_args = []
35
+ extra_args << "-I#{native_include}"
36
+ extra_args << "-I#{compat_include}"
37
+ extra_args << '-fms-extensions'
38
+ extra_args << '-fno-omit-frame-pointer'
39
+
40
+ if devkit
41
+ sysroot = "#{devkit}#{prefix}".tr('\\', '/')
42
+ extra_args.push('--target=x86_64-pc-windows-gnu', "--sysroot=#{sysroot}")
43
+ end
44
+
45
+ unless extra_args.empty?
46
+ existing = ENV['BINDGEN_EXTRA_CLANG_ARGS'].to_s.split(/\s+/).reject(&:empty?)
47
+ ENV['BINDGEN_EXTRA_CLANG_ARGS'] = (existing + extra_args).uniq.join(' ')
48
+ puts "BINDGEN_EXTRA_CLANG_ARGS set to: #{ENV.fetch('BINDGEN_EXTRA_CLANG_ARGS', nil)}"
49
+ end
50
+
51
+ # Set target for Windows GNU toolchain if not already set
52
+ ENV['CARGO_BUILD_TARGET'] ||= 'x86_64-pc-windows-gnu' if devkit || ENV['MSYSTEM']
53
+ end
54
+
55
+ default_profile = ENV.fetch('CARGO_PROFILE', 'release')
56
+
57
+ create_rust_makefile('kreuzberg_rb') do |config|
58
+ config.profile = default_profile.to_sym
59
+ config.ext_dir = File.expand_path('ext/kreuzberg_rb/native', __dir__)
60
+ end
data/kreuzberg.gemspec ADDED
@@ -0,0 +1,253 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/kreuzberg/version'
4
+
5
+ repo_root = File.expand_path('../..', __dir__)
6
+
7
+ ruby_prefix = 'packages/ruby/'
8
+ ruby_cmd = %(git -C "#{repo_root}" ls-files -z #{ruby_prefix})
9
+ ruby_files =
10
+ `#{ruby_cmd}`.split("\x0")
11
+ .select { |path| path.start_with?(ruby_prefix) }
12
+ .map { |path| path.delete_prefix(ruby_prefix) }
13
+
14
+ core_prefix = 'crates/kreuzberg/'
15
+ core_cmd = %(git -C "#{repo_root}" ls-files -z #{core_prefix})
16
+ core_files =
17
+ `#{core_cmd}`.split("\x0")
18
+ .select { |path| path.start_with?(core_prefix) }
19
+ .map { |path| path.delete_prefix('crates/') }
20
+ .map { |path| "vendor/#{path}" }
21
+
22
+ ffi_prefix = 'crates/kreuzberg-ffi/'
23
+ ffi_cmd = %(git -C "#{repo_root}" ls-files -z #{ffi_prefix})
24
+ ffi_files =
25
+ `#{ffi_cmd}`.split("\x0")
26
+ .select { |path| path.start_with?(ffi_prefix) }
27
+ .map { |path| path.delete_prefix('crates/') }
28
+ .map { |path| "vendor/#{path}" }
29
+
30
+ fallback_files = Dir.chdir(__dir__) do
31
+ ruby_fallback = Dir.glob(
32
+ %w[
33
+ README.md
34
+ LICENSE
35
+ ext/**/*.rs
36
+ ext/**/*.rb
37
+ ext/**/*.toml
38
+ ext/**/*.lock
39
+ ext/**/*.md
40
+ ext/**/build.rs
41
+ ext/**/Cargo.*
42
+ exe/*
43
+ lib/**/*.rb
44
+ sig/**/*.rbs
45
+ spec/**/*.rb
46
+ ],
47
+ File::FNM_DOTMATCH
48
+ )
49
+
50
+ core_fallback = Dir.chdir(repo_root) do
51
+ Dir.glob('crates/kreuzberg/**/*', File::FNM_DOTMATCH)
52
+ .reject { |f| File.directory?(f) }
53
+ .reject { |f| f.include?('/.fastembed_cache/') }
54
+ .reject { |f| f.include?('/target/') }
55
+ .grep_v(/\.(swp|bak|tmp)$/)
56
+ .grep_v(/~$/)
57
+ .map { |path| "vendor/#{path.delete_prefix('crates/')}" }
58
+ end
59
+
60
+ ffi_fallback = Dir.chdir(repo_root) do
61
+ Dir.glob('crates/kreuzberg-ffi/**/*', File::FNM_DOTMATCH)
62
+ .reject { |f| File.directory?(f) }
63
+ .reject { |f| f.include?('/target/') }
64
+ .grep_v(/\.(swp|bak|tmp)$/)
65
+ .grep_v(/~$/)
66
+ .map { |path| "vendor/#{path.delete_prefix('crates/')}" }
67
+ end
68
+
69
+ tesseract_fallback = Dir.chdir(repo_root) do
70
+ Dir.glob('crates/kreuzberg-tesseract/**/*', File::FNM_DOTMATCH)
71
+ .reject { |f| File.directory?(f) }
72
+ .reject { |f| f.include?('/target/') }
73
+ .grep_v(/\.(swp|bak|tmp)$/)
74
+ .grep_v(/~$/)
75
+ .map { |path| "vendor/#{path.delete_prefix('crates/')}" }
76
+ end
77
+
78
+ paddle_ocr_fallback = Dir.chdir(repo_root) do
79
+ Dir.glob('crates/kreuzberg-paddle-ocr/**/*', File::FNM_DOTMATCH)
80
+ .reject { |f| File.directory?(f) }
81
+ .reject { |f| f.include?('/target/') }
82
+ .grep_v(/\.(swp|bak|tmp)$/)
83
+ .grep_v(/~$/)
84
+ .map { |path| "vendor/#{path.delete_prefix('crates/')}" }
85
+ end
86
+
87
+ pdfium_render_fallback = Dir.chdir(repo_root) do
88
+ Dir.glob('crates/kreuzberg-pdfium-render/**/*', File::FNM_DOTMATCH)
89
+ .reject { |f| File.directory?(f) }
90
+ .reject { |f| f.include?('/target/') }
91
+ .grep_v(/\.(swp|bak|tmp)$/)
92
+ .grep_v(/~$/)
93
+ .map { |path| "vendor/#{path.delete_prefix('crates/')}" }
94
+ end
95
+
96
+ ruby_fallback + core_fallback + ffi_fallback + tesseract_fallback + paddle_ocr_fallback + pdfium_render_fallback
97
+ end
98
+
99
+ vendor_files = Dir.chdir(__dir__) do
100
+ kreuzberg_files = if Dir.exist?('vendor/kreuzberg')
101
+ Dir.glob('vendor/kreuzberg/**/*', File::FNM_DOTMATCH)
102
+ .reject { |f| File.directory?(f) }
103
+ .reject { |f| f.include?('/.fastembed_cache/') }
104
+ .reject { |f| f.include?('/.kreuzberg/') }
105
+ .reject { |f| f.include?('/target/') }
106
+ .grep_v(/\.(swp|bak|tmp)$/)
107
+ .grep_v(/~$/)
108
+ else
109
+ []
110
+ end
111
+
112
+ kreuzberg_ffi_files = if Dir.exist?('vendor/kreuzberg-ffi')
113
+ Dir.glob('vendor/kreuzberg-ffi/**/*', File::FNM_DOTMATCH)
114
+ .reject { |f| File.directory?(f) }
115
+ .reject { |f| f.include?('/target/') }
116
+ .grep_v(/\.(swp|bak|tmp)$/)
117
+ .grep_v(/~$/)
118
+ else
119
+ []
120
+ end
121
+
122
+ kreuzberg_tesseract_files = if Dir.exist?('vendor/kreuzberg-tesseract')
123
+ Dir.glob('vendor/kreuzberg-tesseract/**/*', File::FNM_DOTMATCH)
124
+ .reject { |f| File.directory?(f) }
125
+ .reject { |f| f.include?('/target/') }
126
+ .grep_v(/\.(swp|bak|tmp)$/)
127
+ .grep_v(/~$/)
128
+ else
129
+ []
130
+ end
131
+
132
+ kreuzberg_paddle_ocr_files = if Dir.exist?('vendor/kreuzberg-paddle-ocr')
133
+ Dir.glob('vendor/kreuzberg-paddle-ocr/**/*', File::FNM_DOTMATCH)
134
+ .reject { |f| File.directory?(f) }
135
+ .reject { |f| f.include?('/target/') }
136
+ .grep_v(/\.(swp|bak|tmp)$/)
137
+ .grep_v(/~$/)
138
+ else
139
+ []
140
+ end
141
+
142
+ kreuzberg_pdfium_render_files = if Dir.exist?('vendor/kreuzberg-pdfium-render')
143
+ Dir.glob('vendor/kreuzberg-pdfium-render/**/*', File::FNM_DOTMATCH)
144
+ .reject { |f| File.directory?(f) }
145
+ .reject { |f| f.include?('/target/') }
146
+ .grep_v(/\.(swp|bak|tmp)$/)
147
+ .grep_v(/~$/)
148
+ else
149
+ []
150
+ end
151
+
152
+ rb_sys_files = if Dir.exist?('vendor/rb-sys')
153
+ Dir.glob('vendor/rb-sys/**/*', File::FNM_DOTMATCH)
154
+ .reject { |f| File.directory?(f) }
155
+ .reject { |f| f.include?('/target/') }
156
+ .grep_v(/\.(swp|bak|tmp)$/)
157
+ .grep_v(/~$/)
158
+ else
159
+ []
160
+ end
161
+
162
+ workspace_toml = if File.exist?('vendor/Cargo.toml')
163
+ ['vendor/Cargo.toml']
164
+ else
165
+ []
166
+ end
167
+
168
+ kreuzberg_files + kreuzberg_ffi_files + kreuzberg_tesseract_files +
169
+ kreuzberg_paddle_ocr_files + kreuzberg_pdfium_render_files + rb_sys_files + workspace_toml
170
+ end
171
+
172
+ # When vendor files exist, get ext/ files from filesystem (to include modified Cargo.toml
173
+ # with vendor paths) instead of from git (which has original 5-level crate paths)
174
+ ext_files_from_fs = Dir.chdir(__dir__) do
175
+ Dir.glob('ext/**/*', File::FNM_DOTMATCH)
176
+ .reject { |f| File.directory?(f) }
177
+ .reject { |f| f.include?('/target/') }
178
+ .grep_v(/\.(swp|bak|tmp)$/)
179
+ .grep_v(/~$/)
180
+ end
181
+
182
+ files = if (ruby_files + core_files + ffi_files).empty?
183
+ fallback_files
184
+ elsif vendor_files.any?
185
+ # Use ext/ files from filesystem (modified by vendor script) + non-ext ruby files from git
186
+ non_ext_ruby_files = ruby_files.reject { |f| f.start_with?('ext/') }
187
+ non_ext_ruby_files + ext_files_from_fs + vendor_files
188
+ else
189
+ ruby_files + core_files + ffi_files
190
+ end
191
+
192
+ native_artifacts = Dir.chdir(__dir__) do
193
+ Dir.glob(%w[
194
+ lib/**/*.bundle
195
+ lib/**/*.so
196
+ lib/**/*.dll
197
+ lib/**/*.dylib
198
+ ])
199
+ end
200
+ files.concat(native_artifacts)
201
+
202
+ files = files.select { |f| File.exist?(f) }
203
+ files = files.uniq
204
+
205
+ Gem::Specification.new do |spec|
206
+ spec.name = 'kreuzberg'
207
+ spec.version = Kreuzberg::VERSION
208
+ spec.authors = ['Na\'aman Hirschfeld']
209
+ spec.email = ['nhirschfeld@gmail.com']
210
+
211
+ spec.summary = 'Document intelligence library — extract text from PDFs, Office docs, images, and 75+ formats'
212
+ spec.description = <<~DESC
213
+ Kreuzberg is a high-performance document intelligence library with a Rust core and native
214
+ Ruby bindings via Magnus. Extract text, metadata, and structured data from 75+ file formats
215
+ including PDF, DOCX, PPTX, XLSX, HTML, RTF, images (with OCR), email, archives, and more.
216
+ Features async/sync APIs, text chunking, language detection, and keyword extraction.
217
+ DESC
218
+ spec.homepage = 'https://github.com/kreuzberg-dev/kreuzberg'
219
+ spec.license = 'MIT'
220
+ spec.required_ruby_version = '>= 3.2.0', '< 5.0'
221
+
222
+ spec.metadata = {
223
+ 'homepage_uri' => spec.homepage,
224
+ 'source_code_uri' => 'https://github.com/kreuzberg-dev/kreuzberg',
225
+ 'changelog_uri' => 'https://github.com/kreuzberg-dev/kreuzberg/blob/main/CHANGELOG.md',
226
+ 'documentation_uri' => 'https://docs.kreuzberg.dev',
227
+ 'bug_tracker_uri' => 'https://github.com/kreuzberg-dev/kreuzberg/issues',
228
+ 'rubygems_mfa_required' => 'true',
229
+ 'keywords' => 'document-intelligence,document-extraction,text-extraction,ocr,pdf,rust,native-extension,nlp,rag'
230
+ }
231
+
232
+ spec.files = files
233
+ spec.bindir = 'exe'
234
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
235
+ spec.require_paths = ['lib']
236
+ spec.extensions = ['ext/kreuzberg_rb/extconf.rb']
237
+
238
+ spec.add_dependency 'rb_sys', '~> 0.9.119'
239
+
240
+ spec.add_development_dependency 'bundler', '~> 4.0'
241
+ spec.add_development_dependency 'rake', '~> 13.0'
242
+ spec.add_development_dependency 'rake-compiler', '~> 1.2'
243
+ spec.add_development_dependency 'rspec', '~> 3.12'
244
+ spec.add_development_dependency 'sorbet-runtime', '~> 0.5'
245
+ unless Gem.win_platform?
246
+ spec.add_development_dependency 'rbs', '~> 3.0'
247
+ spec.add_development_dependency 'rubocop', '~> 1.66'
248
+ spec.add_development_dependency 'rubocop-performance', '~> 1.21'
249
+ spec.add_development_dependency 'rubocop-rspec', '~> 3.0'
250
+ spec.add_development_dependency 'steep', '~> 1.8'
251
+ end
252
+ spec.add_development_dependency 'yard', '~> 0.9'
253
+ end