kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
data/Steepfile
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Steep configuration for Kreuzberg Ruby package
|
|
4
|
+
|
|
5
|
+
# Type-checking target for the library code: signatures come from sig/,
# implementation files from lib/.
target :lib do
  signature 'sig'

  check 'lib'

  # Standard-library signatures the library code relies on.
  %w[pathname json fileutils open3].each { |stdlib_name| library stdlib_name }

  # Deliberate ignores for Steep limitations (not fixable, safe to skip):

  # 1. Sorbet annotations: Steep doesn't recognize Sorbet's T::Struct and
  #    T::Sig, and this file uses Sorbet exclusively for type definitions.
  ignore 'lib/kreuzberg/types.rb'

  # 2. Struct.new with keyword_init: Steep cannot understand the implicit
  #    attr_readers defined by Struct.new in blocks (Table and Chunk classes).
  ignore 'lib/kreuzberg/result.rb'

  # 3. Generic type parameters in normalize_config: Steep has difficulty with
  #    methods that take a Class as parameter and return instances of it.
  ignore 'lib/kreuzberg/config.rb'

  # 4. Interface types: Steep doesn't recognize that all Ruby objects have
  #    nil? and is_a?, even for interface types like _ToH.
  ignore 'lib/kreuzberg/extraction_api.rb'

  # 5. Open3: Steep's built-in Open3 RBS signatures are incomplete
  #    (capture2, capture3 and popen3 are standard library methods).
  ignore 'lib/kreuzberg/setup_lib_path.rb'
  ignore 'lib/kreuzberg/cli_proxy.rb'
  ignore 'lib/kreuzberg/mcp_proxy.rb'

  # Rust extension methods are defined in native code.
  ignore 'ext'

  # Vendored code.
  ignore 'vendor'

  # Test files.
  ignore 'spec'

  # Example scripts.
  ignore 'examples'
end
|
data/examples/async_patterns.rb
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'kreuzberg'
|
|
4
|
+
|
|
5
|
+
# NOTE: Ruby bindings use Tokio runtime with block_on() internally.
|
|
6
|
+
# The "async" functions block the Ruby GVL during execution, so there's
|
|
7
|
+
# no performance benefit over the _sync variants from Ruby's perspective.
|
|
8
|
+
|
|
9
|
+
# ============================================================================
|
|
10
|
+
# Pattern 1: Synchronous Extraction (Recommended)
|
|
11
|
+
# ============================================================================
|
|
12
|
+
|
|
13
|
+
# Extracts 'document.pdf' with the synchronous API and prints the extracted
# content followed by the reported MIME type.
def basic_sync_extraction
  extraction = Kreuzberg.extract_file_sync('document.pdf')

  { 'Content' => :content, 'MIME type' => :mime_type }.each do |label, key|
    puts "#{label}: #{extraction[key]}"
  end
end
|
|
18
|
+
|
|
19
|
+
# ============================================================================
|
|
20
|
+
# Pattern 2: "Async" Extraction (Same Performance as Sync)
|
|
21
|
+
# ============================================================================
|
|
22
|
+
|
|
23
|
+
# Extracts 'document.pdf' through the "async" entry point and prints its
# content. Per the note at the top of this file, this call offers no
# performance benefit over the _sync variant from Ruby's perspective.
def basic_async_extraction
  puts "Content: #{Kreuzberg.extract_file('document.pdf')[:content]}"
end
|
|
27
|
+
|
|
28
|
+
# ============================================================================
|
|
29
|
+
# Pattern 3: Concurrent Processing with Ruby Threads
|
|
30
|
+
# ============================================================================
|
|
31
|
+
|
|
32
|
+
# Starts one Ruby thread per document, waits for every thread to finish, and
# then prints the first 101 characters of each document's content in input
# order.
def concurrent_with_threads
  workers = %w[doc1.pdf doc2.pdf doc3.pdf].map do |document|
    Thread.new { Kreuzberg.extract_file_sync(document) }
  end

  # Thread#value joins the thread and re-raises anything it raised, so all
  # extractions are collected before the first line is printed.
  extractions = workers.map(&:value)

  extractions.each.with_index(1) do |extraction, position|
    puts "File #{position}: #{extraction[:content][0..100]}"
  end
end
|
|
46
|
+
|
|
47
|
+
# ============================================================================
|
|
48
|
+
# Pattern 4: Batch Processing (Preferred for Multiple Files)
|
|
49
|
+
# ============================================================================
|
|
50
|
+
|
|
51
|
+
# Hands three documents to the batch API in a single call, then prints how
# many results came back and a 51-character preview of each one.
def batch_processing
  extractions = Kreuzberg.batch_extract_files_sync(%w[doc1.pdf doc2.pdf doc3.pdf])

  puts "Processed #{extractions.length} files"
  extractions.each { |extraction| puts "Content preview: #{extraction[:content][0..50]}" }
end
|
|
61
|
+
|
|
62
|
+
# ============================================================================
|
|
63
|
+
# Pattern 5: Extraction with Configuration
|
|
64
|
+
# ============================================================================
|
|
65
|
+
|
|
66
|
+
# Extracts 'scanned.pdf' while passing OCR settings (tesseract backend,
# English) and force_ocr: false as keyword arguments, then prints the content.
def extraction_with_config
  extraction = Kreuzberg.extract_file_sync(
    'scanned.pdf',
    ocr: { backend: 'tesseract', language: 'eng' },
    force_ocr: false
  )

  puts "Extracted with OCR: #{extraction[:content]}"
end
|
|
78
|
+
|
|
79
|
+
# ============================================================================
|
|
80
|
+
# Pattern 6: Extract from Bytes
|
|
81
|
+
# ============================================================================
|
|
82
|
+
|
|
83
|
+
# Reads 'document.pdf' into memory as binary data and extracts from those
# bytes, declaring the payload as application/pdf.
def extract_from_bytes
  payload = File.binread('document.pdf')
  extraction = Kreuzberg.extract_bytes_sync(payload, 'application/pdf')

  puts "Extracted from memory: #{extraction[:content]}"
end
|
|
88
|
+
|
|
89
|
+
# ============================================================================
|
|
90
|
+
# Pattern 7: Batch Extract from Bytes
|
|
91
|
+
# ============================================================================
|
|
92
|
+
|
|
93
|
+
# Loads two PDFs into memory and submits them to the bytes-based batch API
# together with one MIME type per payload, then prints the result count.
def batch_extract_from_bytes
  payloads = %w[doc1.pdf doc2.pdf].map { |document| File.binread(document) }
  # One 'application/pdf' entry per payload, in the same order.
  declared_types = Array.new(payloads.length, 'application/pdf')

  extractions = Kreuzberg.batch_extract_bytes_sync(payloads, declared_types)
  puts "Processed #{extractions.length} files from memory"
end
|
|
101
|
+
|
|
102
|
+
# ============================================================================
|
|
103
|
+
# Pattern 8: Error Handling
|
|
104
|
+
# ============================================================================
|
|
105
|
+
|
|
106
|
+
# Attempts to extract a file that is expected to be missing and reports any
# StandardError raised by the call instead of letting it propagate.
def error_handling
  begin
    Kreuzberg.extract_file_sync('nonexistent.pdf')
  rescue StandardError => failure
    puts "Extraction failed: #{failure.message}"
  end
end
|
|
111
|
+
|
|
112
|
+
# ============================================================================
|
|
113
|
+
# Pattern 9: Sequential Processing
|
|
114
|
+
# ============================================================================
|
|
115
|
+
|
|
116
|
+
# Extracts three documents one after another, printing a 51-character
# preview of each as soon as it has been processed.
def sequential_processing
  %w[doc1.pdf doc2.pdf doc3.pdf].each do |document|
    preview = Kreuzberg.extract_file_sync(document)[:content][0..50]
    puts "Processed #{document}: #{preview}"
  end
end
|
|
124
|
+
|
|
125
|
+
# ============================================================================
|
|
126
|
+
# Pattern 10: Background Processing with ActiveJob (Rails)
|
|
127
|
+
# ============================================================================
|
|
128
|
+
|
|
129
|
+
# Example ActiveJob for async processing in Rails
|
|
130
|
+
# < ApplicationJob
|
|
131
|
+
# Sketch of a background job wrapping a synchronous extraction. It is a plain
# class here; the comment above it indicates that a real Rails application
# would derive it from ApplicationJob.
class DocumentExtractionJob
  # Extracts the document at +file_path+ and prints the first 101 characters
  # of its content.
  def perform(file_path)
    preview = Kreuzberg.extract_file_sync(file_path)[:content][0..100]
    puts "Background extraction complete: #{preview}"
  end
end
|
|
137
|
+
|
|
138
|
+
# Usage in Rails controller:
|
|
139
|
+
# DocumentExtractionJob.perform_later('document.pdf')
|
|
140
|
+
|
|
141
|
+
# ============================================================================
|
|
142
|
+
# Pattern 11: Concurrent Processing with Parallel Gem
|
|
143
|
+
# ============================================================================
|
|
144
|
+
|
|
145
|
+
# Distributes four documents over four worker processes using the `parallel`
# gem (loaded lazily, so the gem is only needed when this pattern is run) and
# prints a 51-character preview of every result.
def concurrent_with_parallel_gem
  require 'parallel'

  documents = %w[doc1.pdf doc2.pdf doc3.pdf doc4.pdf]
  extractions = Parallel.map(documents, in_processes: 4) do |document|
    Kreuzberg.extract_file_sync(document)
  end

  extractions.each { |extraction| puts "Content: #{extraction[:content][0..50]}" }
end
|
|
158
|
+
|
|
159
|
+
# ============================================================================
|
|
160
|
+
# Pattern 12: Timeout Wrapper
|
|
161
|
+
# ============================================================================
|
|
162
|
+
|
|
163
|
+
# Runs a synchronous extraction inside Timeout.timeout.
#
# @param file_path [String] document to extract
# @param timeout_seconds [Numeric] limit handed to Timeout.timeout (default 30)
# @return the extraction result, or nil after printing a message when
#   Timeout::Error is raised
#
# NOTE(review): the header of this file says the native calls block the GVL;
# whether Timeout can actually interrupt such a call is not shown here — confirm.
def extraction_with_timeout(file_path, timeout_seconds = 30)
  require 'timeout'

  begin
    Timeout.timeout(timeout_seconds) { Kreuzberg.extract_file_sync(file_path) }
  rescue Timeout::Error
    puts "Extraction timed out after #{timeout_seconds} seconds"
    nil
  end
end
|
|
173
|
+
|
|
174
|
+
# ============================================================================
|
|
175
|
+
# Pattern 13: Custom Ruby PostProcessor Plugin
|
|
176
|
+
# ============================================================================
|
|
177
|
+
|
|
178
|
+
# Registers a post-processor that upper-cases the extracted content, runs one
# extraction with it in place, and always removes the processor again.
#
# The processor is registered under the name 'uppercase' with the value 100
# as third argument. Unregistration happens in an `ensure` clause so that a
# failing extraction cannot leave the processor registered for later
# extractions.
def register_postprocessor
  uppercase_processor = lambda do |result|
    result[:content] = result[:content].upcase
    result
  end

  Kreuzberg.register_post_processor('uppercase', uppercase_processor, 100)

  begin
    result = Kreuzberg.extract_file_sync('document.pdf')
    puts "Uppercase content: #{result[:content]}"
  ensure
    # Runs on success and on failure alike.
    Kreuzberg.unregister_post_processor('uppercase')
  end
end
|
|
191
|
+
|
|
192
|
+
# ============================================================================
|
|
193
|
+
# Pattern 14: Custom Ruby Validator Plugin
|
|
194
|
+
# ============================================================================
|
|
195
|
+
|
|
196
|
+
# Registers a validator that raises when the extracted content is shorter
# than 100 characters, runs one extraction with it in place, reports the
# outcome, and always removes the validator again.
#
# The validator is registered under the name 'min_length' with the value 100
# as third argument. StandardError failures are reported rather than
# propagated. Unregistration happens in `ensure`, so it also runs when an
# exception outside StandardError escapes.
def register_validator
  min_length_validator = lambda do |result|
    raise 'Content too short' if result[:content].length < 100
  end

  Kreuzberg.register_validator('min_length', min_length_validator, 100)

  begin
    result = Kreuzberg.extract_file_sync('short_document.pdf')
    puts "Validation passed: #{result[:content]}"
  rescue StandardError => e
    puts "Validation failed: #{e.message}"
  ensure
    Kreuzberg.unregister_validator('min_length')
  end
end
|
|
212
|
+
|
|
213
|
+
# ============================================================================
|
|
214
|
+
# Pattern 15: Custom Ruby OCR Backend Plugin
|
|
215
|
+
# ============================================================================
|
|
216
|
+
|
|
217
|
+
# Example OCR backend implementation for custom processing.
|
|
218
|
+
# Toy OCR backend used by the registration example. It performs no real
# recognition and only describes the request it was given.
class CustomOcrBackend
  # Returns a placeholder string naming the payload size (via #length) and
  # the requested language.
  def process_image(image_bytes, language)
    "Extracted text from #{image_bytes.length} bytes using #{language}"
  end

  # True only for the language codes 'eng', 'deu' and 'fra'.
  def supports_language?(lang)
    case lang
    when 'eng', 'deu', 'fra' then true
    else false
    end
  end
end
|
|
227
|
+
|
|
228
|
+
# Registers a CustomOcrBackend instance under the name 'custom' and then
# extracts 'scanned.pdf' with OCR forced through that backend (language
# 'eng'), printing the resulting content.
def register_ocr_backend
  Kreuzberg.register_ocr_backend('custom', CustomOcrBackend.new)

  extraction = Kreuzberg.extract_file_sync(
    'scanned.pdf',
    ocr: { backend: 'custom', language: 'eng' },
    force_ocr: true
  )

  puts "Custom OCR result: #{extraction[:content]}"
end
|
|
243
|
+
|
|
244
|
+
# ============================================================================
|
|
245
|
+
# Main Demonstration
|
|
246
|
+
# ============================================================================
|
|
247
|
+
|
|
248
|
+
# Runs the demonstration patterns in order, printing a banner before each.
#
# Every banner after the first is preceded by a blank line. The banners are
# double-quoted on purpose: in a single-quoted Ruby string "\n" is not an
# escape and would be printed as a literal backslash followed by "n".
def main
  puts '=== Basic Sync Extraction ==='
  basic_sync_extraction

  puts "\n=== Basic Async Extraction (Blocks GVL) ==="
  basic_async_extraction

  puts "\n=== Concurrent with Ruby Threads ==="
  concurrent_with_threads

  puts "\n=== Batch Processing (Preferred) ==="
  batch_processing

  puts "\n=== Extraction with Config ==="
  extraction_with_config

  puts "\n=== Extract from Bytes ==="
  extract_from_bytes

  puts "\n=== Error Handling ==="
  error_handling

  puts "\n=== Sequential Processing ==="
  sequential_processing

  puts "\n=== Extraction with Timeout ==="
  extraction_with_timeout('large_document.pdf', 30)

  puts "\n=== Custom PostProcessor ==="
  register_postprocessor

  puts "\n=== Custom Validator ==="
  register_validator
end
|
|
282
|
+
|
|
283
|
+
main if __FILE__ == $PROGRAM_NAME
|
data/extconf.rb
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'mkmf'
|
|
4
|
+
require 'rb_sys/mkmf'
|
|
5
|
+
require 'rbconfig'
|
|
6
|
+
require 'fileutils'
|
|
7
|
+
|
|
8
|
+
# Windows only: keep Cargo's build directory short.
# An existing CARGO_TARGET_DIR is respected (GitHub Actions sets
# CARGO_TARGET_DIR=C:\t for MAX_PATH mitigation); otherwise a short directory
# is created and exported as both CARGO_TARGET_DIR and OUT_DIR.
if Gem.win_platform?
  preset_target_dir = ENV['CARGO_TARGET_DIR']

  if preset_target_dir
    puts "Windows detected: Using existing CARGO_TARGET_DIR=#{preset_target_dir}"
  else
    short_dir = Dir.exist?('C:/t') ? 'C:/t' : 'C:/kz-build'

    begin
      FileUtils.mkdir_p(short_dir)
      %w[CARGO_TARGET_DIR OUT_DIR].each { |variable| ENV[variable] = short_dir }
      puts "Windows detected: Using short build path #{short_dir}"
    rescue StandardError => failure
      # Best effort: the build continues with Cargo's default location.
      puts "Warning: Could not create short path #{short_dir}: #{failure.message}"
    end
  end
end
|
|
25
|
+
|
|
26
|
+
# mswin/mingw hosts only: extend BINDGEN_EXTRA_CLANG_ARGS with the include
# paths for the MSVC compatibility headers plus a few clang flags, and default
# the Cargo target to the Windows GNU toolchain.
if /mswin|mingw/.match?(RbConfig::CONFIG['host_os'])
  devkit_root = ENV.fetch('RI_DEVKIT', nil)
  msys_prefix = ENV['MSYSTEM_PREFIX'] || '/ucrt64'

  # Include directories, normalized to forward slashes.
  include_dirs = %w[
    ext/kreuzberg_rb/native/include
    ext/kreuzberg_rb/native/include/msvc_compat
  ].map { |relative| File.expand_path(relative, __dir__).tr('\\', '/') }

  clang_args = include_dirs.map { |dir| "-I#{dir}" }
  clang_args += %w[-fms-extensions -fno-omit-frame-pointer]

  if devkit_root
    devkit_sysroot = "#{devkit_root}#{msys_prefix}".tr('\\', '/')
    clang_args << '--target=x86_64-pc-windows-gnu' << "--sysroot=#{devkit_sysroot}"
  end

  # Arguments already present in the environment come first; duplicates are dropped.
  inherited_args = ENV['BINDGEN_EXTRA_CLANG_ARGS'].to_s.split(/\s+/).reject(&:empty?)
  ENV['BINDGEN_EXTRA_CLANG_ARGS'] = (inherited_args + clang_args).uniq.join(' ')
  puts "BINDGEN_EXTRA_CLANG_ARGS set to: #{ENV.fetch('BINDGEN_EXTRA_CLANG_ARGS', nil)}"

  # Set target for Windows GNU toolchain if not already set
  if devkit_root || ENV['MSYSTEM']
    ENV['CARGO_BUILD_TARGET'] ||= 'x86_64-pc-windows-gnu'
  end
end
|
|
54
|
+
|
|
55
|
+
# Cargo profile for the native build; CARGO_PROFILE overrides the 'release' default.
cargo_profile = ENV.fetch('CARGO_PROFILE', 'release').to_sym

# Generate the Makefile for the kreuzberg_rb extension from the Rust crate
# located in ext/kreuzberg_rb/native.
create_rust_makefile('kreuzberg_rb') do |makefile|
  makefile.profile = cargo_profile
  makefile.ext_dir = File.expand_path('ext/kreuzberg_rb/native', __dir__)
end
|
data/kreuzberg.gemspec
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'lib/kreuzberg/version'
|
|
4
|
+
|
|
5
|
+
# Two directory levels above this gemspec.
repo_root = File.expand_path('../..', __dir__)

# Lists the git-tracked paths below +prefix+ (NUL-separated output of
# `git ls-files -z`), keeping only entries that really start with the prefix.
tracked_under = lambda do |prefix|
  `git -C "#{repo_root}" ls-files -z #{prefix}`
    .split("\x0")
    .select { |path| path.start_with?(prefix) }
end

# Rewrites "crates/<name>/..." paths to "vendor/<name>/...".
as_vendored = ->(paths) { paths.map { |path| "vendor/#{path.delete_prefix('crates/')}" } }

ruby_prefix = 'packages/ruby/'
ruby_files = tracked_under.call(ruby_prefix).map { |path| path.delete_prefix(ruby_prefix) }

core_files = as_vendored.call(tracked_under.call('crates/kreuzberg/'))

ffi_files = as_vendored.call(tracked_under.call('crates/kreuzberg-ffi/'))
|
|
29
|
+
|
|
30
|
+
# File list built from the filesystem: the package's own sources matched by a
# fixed set of globs, followed by the files of five crates under repo_root,
# each re-rooted from "crates/" to "vendor/".
fallback_files = Dir.chdir(__dir__) do
  package_sources = Dir.glob(
    %w[
      README.md
      LICENSE
      ext/**/*.rs
      ext/**/*.rb
      ext/**/*.toml
      ext/**/*.lock
      ext/**/*.md
      ext/**/build.rs
      ext/**/Cargo.*
      exe/*
      lib/**/*.rb
      sig/**/*.rbs
      spec/**/*.rb
    ],
    File::FNM_DOTMATCH
  )

  crate_names = %w[
    kreuzberg
    kreuzberg-ffi
    kreuzberg-tesseract
    kreuzberg-paddle-ocr
    kreuzberg-pdfium-render
  ]

  crate_sources = Dir.chdir(repo_root) do
    crate_names.flat_map do |crate|
      Dir.glob("crates/#{crate}/**/*", File::FNM_DOTMATCH).filter_map do |path|
        next if File.directory?(path)
        # Only the core crate additionally drops its embedding cache.
        next if crate == 'kreuzberg' && path.include?('/.fastembed_cache/')
        next if path.include?('/target/')
        # Editor leftovers: swap/backup/temp files and names ending in "~".
        next if path.match?(/\.(swp|bak|tmp)$/) || path.match?(/~$/)

        "vendor/#{path.delete_prefix('crates/')}"
      end
    end
  end

  package_sources + crate_sources
end
|
|
98
|
+
|
|
99
|
+
# Files below vendor/ next to this gemspec: the contents of each vendored
# crate directory that exists (minus build output and editor leftovers),
# followed by vendor/Cargo.toml when that file is present.
vendor_files = Dir.chdir(__dir__) do
  # Crate directory => extra path fragments excluded for that crate only.
  crate_excludes = {
    'kreuzberg' => %w[/.fastembed_cache/ /.kreuzberg/],
    'kreuzberg-ffi' => [],
    'kreuzberg-tesseract' => [],
    'kreuzberg-paddle-ocr' => [],
    'kreuzberg-pdfium-render' => [],
    'rb-sys' => []
  }

  crate_files = crate_excludes.flat_map do |crate, extra_fragments|
    crate_root = "vendor/#{crate}"
    next [] unless Dir.exist?(crate_root)

    Dir.glob("#{crate_root}/**/*", File::FNM_DOTMATCH).reject do |path|
      File.directory?(path) ||
        extra_fragments.any? { |fragment| path.include?(fragment) } ||
        path.include?('/target/') ||
        path.match?(/\.(swp|bak|tmp)$/) ||
        path.match?(/~$/)
    end
  end

  workspace_manifest = File.exist?('vendor/Cargo.toml') ? ['vendor/Cargo.toml'] : []

  crate_files + workspace_manifest
end
|
|
171
|
+
|
|
172
|
+
# Every file currently on disk below ext/ (build output and editor leftovers
# excluded). When vendor files exist this list is used instead of the git
# listing, so that the Cargo.toml modified with vendor paths is picked up
# rather than the original one with 5-level crate paths.
ext_files_from_fs = Dir.chdir(__dir__) do
  Dir.glob('ext/**/*', File::FNM_DOTMATCH).reject do |path|
    File.directory?(path) ||
      path.include?('/target/') ||
      path.match?(/\.(swp|bak|tmp)$/) ||
      path.match?(/~$/)
  end
end
|
|
181
|
+
|
|
182
|
+
# Choose the file list for the gem:
#   * git reported nothing      -> filesystem fallback
#   * vendored crates are there -> git-tracked non-ext files, plus ext/ as it
#                                  is on disk (modified by vendor script),
#                                  plus the vendored files
#   * otherwise                 -> everything git reported
git_listing = ruby_files + core_files + ffi_files

files =
  if git_listing.empty?
    fallback_files
  elsif vendor_files.any?
    ruby_files.reject { |path| path.start_with?('ext/') } + ext_files_from_fs + vendor_files
  else
    git_listing
  end
|
|
191
|
+
|
|
192
|
+
# Prebuilt native libraries found below lib/ are shipped alongside the
# selected sources.
native_artifacts = Dir.chdir(__dir__) do
  Dir.glob(%w[
             lib/**/*.bundle
             lib/**/*.so
             lib/**/*.dll
             lib/**/*.dylib
           ])
end
files.concat(native_artifacts)

# Keep only entries that exist and drop duplicates. Every path in +files+ is
# relative to this gemspec's directory, so the existence check runs from
# there rather than from whatever directory the gemspec was loaded in.
files = Dir.chdir(__dir__) { files.select { |f| File.exist?(f) } }.uniq
|
|
204
|
+
|
|
205
|
+
# Gem metadata, packaged files and dependencies.
Gem::Specification.new do |spec|
  spec.name = 'kreuzberg'
  spec.version = Kreuzberg::VERSION
  spec.authors = ["Na'aman Hirschfeld"]
  spec.email = ['nhirschfeld@gmail.com']

  spec.summary = 'Document intelligence library — extract text from PDFs, Office docs, images, and 75+ formats'
  spec.description = <<~DESC
    Kreuzberg is a high-performance document intelligence library with a Rust core and native
    Ruby bindings via Magnus. Extract text, metadata, and structured data from 75+ file formats
    including PDF, DOCX, PPTX, XLSX, HTML, RTF, images (with OCR), email, archives, and more.
    Features async/sync APIs, text chunking, language detection, and keyword extraction.
  DESC
  spec.homepage = 'https://github.com/kreuzberg-dev/kreuzberg'
  spec.license = 'MIT'
  spec.required_ruby_version = ['>= 3.2.0', '< 5.0']

  spec.metadata = {
    'homepage_uri' => spec.homepage,
    'source_code_uri' => 'https://github.com/kreuzberg-dev/kreuzberg',
    'changelog_uri' => 'https://github.com/kreuzberg-dev/kreuzberg/blob/main/CHANGELOG.md',
    'documentation_uri' => 'https://docs.kreuzberg.dev',
    'bug_tracker_uri' => 'https://github.com/kreuzberg-dev/kreuzberg/issues',
    'rubygems_mfa_required' => 'true',
    'keywords' => 'document-intelligence,document-extraction,text-extraction,ocr,pdf,rust,native-extension,nlp,rag'
  }

  spec.files = files
  spec.bindir = 'exe'
  # Anything packaged under exe/ is exposed as an executable by its basename.
  spec.executables = spec.files.grep(%r{^exe/}) { |path| File.basename(path) }
  spec.require_paths = ['lib']
  spec.extensions = ['ext/kreuzberg_rb/extconf.rb']

  spec.add_dependency 'rb_sys', '~> 0.9.119'

  # Development tooling, declared in a fixed order. The type-checking and
  # lint tools are left out on Windows.
  portable_tools = {
    'bundler' => '~> 4.0',
    'rake' => '~> 13.0',
    'rake-compiler' => '~> 1.2',
    'rspec' => '~> 3.12',
    'sorbet-runtime' => '~> 0.5'
  }
  non_windows_tools =
    if Gem.win_platform?
      {}
    else
      {
        'rbs' => '~> 3.0',
        'rubocop' => '~> 1.66',
        'rubocop-performance' => '~> 1.21',
        'rubocop-rspec' => '~> 3.0',
        'steep' => '~> 1.8'
      }
    end

  portable_tools.merge(non_windows_tools, 'yard' => '~> 0.9').each do |gem_name, requirement|
    spec.add_development_dependency gem_name, requirement
  end
end
|