kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,329 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kreuzberg
4
+ module ExtractionAPI
5
+ # @param path [String, Pathname] Path to the document file to extract
6
+ # @param mime_type [String, nil] Optional MIME type for the file (e.g., 'application/pdf').
7
+ # @param config [Config::Extraction, Hash, nil] Extraction configuration controlling
8
+ # @return [Result] Extraction result containing content, metadata, tables, and images
9
+ # @raise [Errors::IOError] If the file cannot be read or access is denied
10
+ # @raise [Errors::ParsingError] If document parsing fails
11
+ # @raise [Errors::UnsupportedFormatError] If the file format is not supported
12
+ # @raise [Errors::OCRError] If OCR is enabled and fails
13
+ # @raise [Errors::MissingDependencyError] If a required dependency is missing
14
+ # @example Extract a PDF file
15
+ # @example Extract with explicit MIME type
16
+ # @example Extract with OCR enabled
17
# Synchronously extract a single document from disk and wrap the native
# payload in a Result.
def extract_file_sync(path:, mime_type: nil, config: nil)
  file_path = path.to_s
  # Fail fast with the library's own IOError before calling into native code.
  raise Errors::IOError, "File not found: #{file_path}" unless File.exist?(file_path)

  options = normalize_config(config)
  # The MIME type is forwarded positionally only when the caller supplied one.
  native_args = mime_type ? [file_path, mime_type.to_s] : [file_path]
  extraction = Result.new(native_extract_file_sync(*native_args, **options))
  # record_cache_entry! is defined outside this file section.
  record_cache_entry!(extraction, options)
  extraction
end
32
+
33
+ # Synchronously extract content from byte data.
34
+ #
35
+ # Performs document extraction directly from binary data in memory. Useful for
36
+ # extracting content from files already loaded into memory or from network streams.
37
+ #
38
+ # @param data [String] Binary document data (can contain any byte values)
39
+ # @param mime_type [String] MIME type of the data (required, e.g., 'application/pdf').
40
+ # This parameter is mandatory to guide the extraction engine.
41
+ # @param config [Config::Extraction, Hash, nil] Extraction configuration. Accepts
42
+ # either a {Config::Extraction} object or a configuration hash.
43
+ #
44
+ # @return [Result] Extraction result containing content, metadata, tables, and images
45
+ #
46
+ # @raise [Errors::ParsingError] If document parsing fails
47
+ # @raise [Errors::UnsupportedFormatError] If the MIME type is not supported
48
+ # @raise [Errors::OCRError] If OCR is enabled and fails
49
+ # @raise [Errors::MissingDependencyError] If a required dependency is missing
50
+ #
51
+ # @example Extract PDF from memory
52
+ # pdf_data = File.read("document.pdf", binmode: true)
53
+ # result = Kreuzberg.extract_bytes_sync(data: pdf_data, mime_type: "application/pdf")
54
+ # puts result.content
55
+ #
56
+ # @example Extract from a network stream
57
+ # response = HTTParty.get("https://example.com/document.docx")
58
+ # result = Kreuzberg.extract_bytes_sync(data: response.body, mime_type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
59
# Synchronously extract content from an in-memory document and wrap the
# native payload in a Result.
def extract_bytes_sync(data:, mime_type:, config: nil)
  # A nil MIME type would otherwise be coerced to "" by #to_s below.
  if mime_type.nil?
    raise TypeError, "mime_type must be a String, got #{mime_type.inspect}"
  end

  options = normalize_config(config)
  extraction = Result.new(native_extract_bytes_sync(data.to_s, mime_type.to_s, **options))
  record_cache_entry!(extraction, options)
  extraction
end
68
+
69
+ # Synchronously extract content from multiple files.
70
+ #
71
+ # Processes multiple files in a single batch operation. Files are extracted sequentially,
72
+ # and results maintain the same order as the input paths. This is useful for bulk
73
+ # processing multiple documents with consistent configuration.
74
+ #
75
+ # @param paths [Array<String, Pathname>] Array of file paths to extract. Each path
76
+ # is converted to a string and MIME type is auto-detected from extension.
77
+ # @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all files.
78
+ # Accepts either a {Config::Extraction} object or a configuration hash.
79
+ #
80
+ # @return [Array<Result>] Array of extraction results in the same order as input paths.
81
+ # Array length matches the input paths length.
82
+ #
83
+ # @raise [Errors::IOError] If any file cannot be read
84
+ # @raise [Errors::ParsingError] If any document parsing fails
85
+ # @raise [Errors::UnsupportedFormatError] If any file format is not supported
86
+ # @raise [Errors::OCRError] If OCR is enabled and fails on any document
87
+ # @raise [Errors::MissingDependencyError] If a required dependency is missing
88
+ #
89
+ # @example Batch extract multiple PDFs
90
+ # paths = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
91
+ # results = Kreuzberg.batch_extract_files_sync(paths: paths)
92
+ # results.each_with_index do |result, idx|
93
+ # puts "File #{idx}: #{result.content.length} characters"
94
+ # end
95
+ #
96
+ # @example Batch extract with consistent configuration
97
+ # paths = Dir.glob("documents/*.pdf")
98
+ # config = Kreuzberg::Config::Extraction.new(force_ocr: true)
99
+ # results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
100
# Synchronously extract several files in one native batch call, returning
# one Result per native payload.
def batch_extract_files_sync(paths:, config: nil)
  file_paths = paths.map(&:to_s)
  # Report the first missing path before any native work starts.
  absent = file_paths.find { |candidate| !File.exist?(candidate) }
  raise Errors::IOError, "File not found: #{absent}" if absent

  options = normalize_config(config)
  payloads = native_batch_extract_files_sync(file_paths, **options)
  extractions = payloads.map { |payload| Result.new(payload) }
  record_cache_entry!(extractions, options)
  extractions
end
113
+
114
+ # Asynchronously extract content from a file.
115
+ #
116
+ # Extraction through the asynchronous native entry point; the method returns a {Result} directly (not a promise). Extraction is performed
117
+ # in the background using native threads or the Tokio runtime. This method is
118
+ # preferred for I/O-bound operations and integrating with async workflows.
119
+ #
120
+ # @param path [String, Pathname] Path to the document file to extract
121
+ # @param mime_type [String, nil] Optional MIME type for the file (e.g., 'application/pdf').
122
+ # If omitted, type is detected from file extension.
123
+ # @param config [Config::Extraction, Hash, nil] Extraction configuration. Accepts
124
+ # either a {Config::Extraction} object or a configuration hash.
125
+ #
126
+ # @return [Result] Extraction result containing content, metadata, tables, and images.
127
+ # In async contexts, this result is available upon method return.
128
+ #
129
+ # @raise [Errors::IOError] If the file cannot be read or access is denied
130
+ # @raise [Errors::ParsingError] If document parsing fails
131
+ # @raise [Errors::UnsupportedFormatError] If the file format is not supported
132
+ # @raise [Errors::OCRError] If OCR is enabled and fails
133
+ # @raise [Errors::MissingDependencyError] If a required dependency is missing
134
+ #
135
+ # @example Extract a PDF file asynchronously
136
+ # result = Kreuzberg.extract_file(path: "large_document.pdf")
137
+ # puts result.content
138
+ #
139
+ # @example Extract with custom OCR configuration
140
+ # config = Kreuzberg::Config::Extraction.new(
141
+ # ocr: Kreuzberg::Config::OCR.new(language: "deu")
142
+ # )
143
+ # result = Kreuzberg.extract_file(path: "document.pdf", config: config)
144
# Extract a single document from disk through the asynchronous native entry
# point and wrap the payload in a Result, which is returned directly.
def extract_file(path:, mime_type: nil, config: nil)
  file_path = path.to_s
  # Fail fast with the library's own IOError before calling into native code.
  raise Errors::IOError, "File not found: #{file_path}" unless File.exist?(file_path)

  options = normalize_config(config)
  # The MIME type is forwarded positionally only when the caller supplied one.
  native_args = mime_type ? [file_path, mime_type.to_s] : [file_path]
  extraction = Result.new(native_extract_file(*native_args, **options))
  record_cache_entry!(extraction, options)
  extraction
end
159
+
160
+ # Asynchronously extract content from byte data.
161
+ #
162
+ # Non-blocking extraction from in-memory binary data. Like {#extract_file},
163
+ # this performs extraction in the background, making it suitable for handling
164
+ # high-volume extraction workloads without blocking the main thread.
165
+ #
166
+ # @param data [String] Binary document data (can contain any byte values)
167
+ # @param mime_type [String] MIME type of the data (required, e.g., 'application/pdf').
168
+ # This parameter is mandatory to guide the extraction engine.
169
+ # @param config [Config::Extraction, Hash, nil] Extraction configuration. Accepts
170
+ # either a {Config::Extraction} object or a configuration hash.
171
+ #
172
+ # @return [Result] Extraction result containing content, metadata, tables, and images
173
+ #
174
+ # @raise [Errors::ParsingError] If document parsing fails
175
+ # @raise [Errors::UnsupportedFormatError] If the MIME type is not supported
176
+ # @raise [Errors::OCRError] If OCR is enabled and fails
177
+ # @raise [Errors::MissingDependencyError] If a required dependency is missing
178
+ #
179
+ # @example Extract PDF from memory asynchronously
180
+ # pdf_data = File.read("document.pdf", binmode: true)
181
+ # result = Kreuzberg.extract_bytes(data: pdf_data, mime_type: "application/pdf")
182
+ # puts result.content
183
+ #
184
+ # @example Extract with image extraction
185
+ # data = File.read("file.docx", binmode: true)
186
+ # config = Kreuzberg::Config::Extraction.new(
187
+ # image_extraction: Kreuzberg::Config::ImageExtraction.new(extract_images: true)
188
+ # )
189
+ # result = Kreuzberg.extract_bytes(data: data, mime_type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document", config: config)
190
# Extract content from in-memory bytes through the asynchronous native entry
# point and wrap the payload in a Result.
#
# @param data [String] Binary document data; coerced with #to_s
# @param mime_type [String] MIME type of the data (required, must not be nil)
# @param config [Config::Extraction, Hash, nil] Extraction configuration
# @return [Result] Extraction result built from the native payload
# @raise [TypeError] If mime_type is nil
def extract_bytes(data:, mime_type:, config: nil)
  # Same guard as extract_bytes_sync: without it nil would be silently coerced
  # to "" by #to_s and handed to the native layer as an empty MIME type.
  raise TypeError, "mime_type must be a String, got #{mime_type.inspect}" if mime_type.nil?

  opts = normalize_config(config)
  hash = native_extract_bytes(data.to_s, mime_type.to_s, **opts)
  result = Result.new(hash)
  record_cache_entry!(result, opts)
  result
end
197
+
198
+ # Asynchronously extract content from multiple files.
199
+ #
200
+ # Non-blocking batch extraction from multiple files. Results maintain the same order
201
+ # as input paths. This is the preferred method for bulk processing when non-blocking
202
+ # I/O is required (e.g., in web servers or async applications).
203
+ #
204
+ # @param paths [Array<String, Pathname>] Array of file paths to extract. Each path
205
+ # is converted to a string and MIME type is auto-detected from extension.
206
+ # @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all files.
207
+ # Accepts either a {Config::Extraction} object or a configuration hash.
208
+ #
209
+ # @return [Array<Result>] Array of extraction results in the same order as input paths.
210
+ # Array length matches the input paths length.
211
+ #
212
+ # @raise [Errors::IOError] If any file cannot be read
213
+ # @raise [Errors::ParsingError] If any document parsing fails
214
+ # @raise [Errors::UnsupportedFormatError] If any file format is not supported
215
+ # @raise [Errors::OCRError] If OCR is enabled and fails on any document
216
+ # @raise [Errors::MissingDependencyError] If a required dependency is missing
217
+ #
218
+ # @example Batch extract multiple files asynchronously
219
+ # paths = ["invoice_1.pdf", "invoice_2.pdf", "invoice_3.pdf"]
220
+ # results = Kreuzberg.batch_extract_files(paths: paths)
221
+ # results.each_with_index do |result, idx|
222
+ # puts "Invoice #{idx}: #{result.detected_languages}"
223
+ # end
224
+ #
225
+ # @example Batch extract with chunking
226
+ # paths = Dir.glob("reports/*.docx")
227
+ # config = Kreuzberg::Config::Extraction.new(
228
+ # chunking: Kreuzberg::Config::Chunking.new(max_chars: 1000, max_overlap: 200)
229
+ # )
230
+ # results = Kreuzberg.batch_extract_files(paths: paths, config: config)
231
# Extract several files through the asynchronous native batch entry point,
# returning one Result per native payload.
#
# NOTE(review): unlike batch_extract_files_sync, this method does not check
# File.exist? up front; missing paths are left to the native layer. Confirm
# whether that difference is intentional.
def batch_extract_files(paths:, config: nil)
  options = normalize_config(config)
  file_paths = paths.map(&:to_s)
  extractions = native_batch_extract_files(file_paths, **options).map { |payload| Result.new(payload) }
  record_cache_entry!(extractions, options)
  extractions
end
238
+
239
+ # Synchronously extract content from multiple byte data sources.
240
+ #
241
+ # Processes multiple in-memory binary documents in a single batch operation. Results
242
+ # maintain the same order as the input data array. The mime_types array must have
243
+ # the same length as the data_array.
244
+ #
245
+ # @param data_array [Array<String>] Array of binary document data. Each element can
246
+ # contain any byte values (e.g., PDF binary data).
247
+ # @param mime_types [Array<String>] Array of MIME types corresponding to each data item.
248
+ # Must be the same length as data_array (e.g., ["application/pdf", "application/msword"]).
249
+ # @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all items.
250
+ # Accepts either a {Config::Extraction} object or a configuration hash.
251
+ #
252
+ # @return [Array<Result>] Array of extraction results in the same order as input data.
253
+ # Array length matches the data_array length.
254
+ #
255
+ # @raise [ArgumentError] If data_array and mime_types have different lengths
256
+ # @raise [Errors::ParsingError] If any document parsing fails
257
+ # @raise [Errors::UnsupportedFormatError] If any MIME type is not supported
258
+ # @raise [Errors::OCRError] If OCR is enabled and fails on any document
259
+ # @raise [Errors::MissingDependencyError] If a required dependency is missing
260
+ #
261
+ # @example Batch extract binary documents
262
+ # pdf_data_1 = File.read("doc1.pdf", binmode: true)
263
+ # pdf_data_2 = File.read("doc2.pdf", binmode: true)
264
+ # docx_data = File.read("report.docx", binmode: true)
265
+ #
266
+ # data = [pdf_data_1, pdf_data_2, docx_data]
267
+ # types = ["application/pdf", "application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
268
+ # results = Kreuzberg.batch_extract_bytes_sync(data_array: data, mime_types: types)
269
+ # results.each { |r| puts r.content }
270
# Synchronously extract several in-memory documents in one native batch call.
#
# @param data_array [Array<String>] Binary document data; each item is coerced with #to_s
# @param mime_types [Array<String>] MIME type for each item, in the same order
# @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all items
# @return [Array<Result>] One Result per native payload
# @raise [ArgumentError] If data_array and mime_types have different lengths
def batch_extract_bytes_sync(data_array:, mime_types:, config: nil)
  payloads = data_array.map(&:to_s)
  types = mime_types.map(&:to_s)
  # Enforce the documented contract here so a mismatch is reported as
  # ArgumentError before anything is handed to the native layer.
  unless payloads.length == types.length
    raise ArgumentError,
          "data_array and mime_types must have the same length (got #{payloads.length} and #{types.length})"
  end

  opts = normalize_config(config)
  hashes = native_batch_extract_bytes_sync(payloads, types, **opts)
  results = hashes.map { |hash| Result.new(hash) }
  record_cache_entry!(results, opts)
  results
end
277
+
278
+ # Asynchronously extract content from multiple byte data sources.
279
+ #
280
+ # Non-blocking batch extraction from multiple in-memory binary documents. Results
281
+ # maintain the same order as the input data array. This method is preferred when
282
+ # processing multiple documents without blocking (e.g., handling multiple uploads).
283
+ #
284
+ # @param data_array [Array<String>] Array of binary document data. Each element can
285
+ # contain any byte values (e.g., PDF binary data).
286
+ # @param mime_types [Array<String>] Array of MIME types corresponding to each data item.
287
+ # Must be the same length as data_array (e.g., ["application/pdf", "application/msword"]).
288
+ # @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all items.
289
+ # Accepts either a {Config::Extraction} object or a configuration hash.
290
+ #
291
+ # @return [Array<Result>] Array of extraction results in the same order as input data.
292
+ # Array length matches the data_array length.
293
+ #
294
+ # @raise [ArgumentError] If data_array and mime_types have different lengths
295
+ # @raise [Errors::ParsingError] If any document parsing fails
296
+ # @raise [Errors::UnsupportedFormatError] If any MIME type is not supported
297
+ # @raise [Errors::OCRError] If OCR is enabled and fails on any document
298
+ # @raise [Errors::MissingDependencyError] If a required dependency is missing
299
+ #
300
+ # @example Batch extract uploaded documents asynchronously
301
+ # # From a web request with multiple file uploads
302
+ # uploaded_files = params[:files] # Array of uploaded file objects
303
+ # data = uploaded_files.map(&:read)
304
+ # types = uploaded_files.map(&:content_type)
305
+ #
306
+ # results = Kreuzberg.batch_extract_bytes(data_array: data, mime_types: types)
307
+ # results.each { |r| puts r.content }
308
+ #
309
+ # @example Batch extract with OCR
310
+ # data = [scan_1_bytes, scan_2_bytes, scan_3_bytes]
311
+ # types = ["image/png", "image/png", "image/png"]
312
+ # config = Kreuzberg::Config::Extraction.new(force_ocr: true)
313
+ # results = Kreuzberg.batch_extract_bytes(data_array: data, mime_types: types, config: config)
314
# Extract several in-memory documents through the asynchronous native batch
# entry point.
#
# @param data_array [Array<String>] Binary document data; each item is coerced with #to_s
# @param mime_types [Array<String>] MIME type for each item, in the same order
# @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all items
# @return [Array<Result>] One Result per native payload
# @raise [ArgumentError] If data_array and mime_types have different lengths
def batch_extract_bytes(data_array:, mime_types:, config: nil)
  payloads = data_array.map(&:to_s)
  types = mime_types.map(&:to_s)
  # Enforce the documented contract here so a mismatch is reported as
  # ArgumentError before anything is handed to the native layer.
  unless payloads.length == types.length
    raise ArgumentError,
          "data_array and mime_types must have the same length (got #{payloads.length} and #{types.length})"
  end

  opts = normalize_config(config)
  hashes = native_batch_extract_bytes(payloads, types, **opts)
  results = hashes.map { |hash| Result.new(hash) }
  record_cache_entry!(results, opts)
  results
end
321
+
322
# Turn the caller-supplied configuration into a Hash that can be splatted
# as keyword arguments: nil becomes {}, a Hash is returned untouched, and
# anything else is converted with #to_h.
def normalize_config(config)
  case config
  when nil then {}
  when Hash then config
  else config.to_h
  end
end
328
+ end
329
+ end
@@ -0,0 +1,176 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'open3'
4
+ require 'json'
5
+
6
+ module Kreuzberg
7
+ # @example Start MCP server
8
+ module MCPProxy
9
# Base class for every failure raised by the MCP proxy.
class Error < Kreuzberg::Errors::Error
end

# Raised by find_mcp_binary when no kreuzberg executable is found.
class MissingBinaryError < Error
end

# Raised by Server for an unknown transport, for message I/O outside stdio
# mode, and for message I/O before the server has been started.
class ServerError < Error
end
12
+
13
# Handle on a `kreuzberg mcp` child process. Tracks the child's PID and, for
# the stdio transport, the three pipes used to exchange JSON messages.
class Server
  attr_reader :pid, :transport

  # Remember the requested transport; nothing is spawned until #start.
  #
  # @param transport [String] Transport method ("stdio" or "sse")
  def initialize(transport: 'stdio')
    @transport = transport
    @pid = nil
    @stdin, @stdout, @stderr = nil, nil, nil
  end

  # Locate the binary and launch it with the configured transport.
  #
  # @return [Integer, nil] Process ID (for SSE) or nil (for stdio)
  # @raise [ServerError] If the transport is neither "stdio" nor "sse"
  def start
    # The binary lookup happens first, so a missing binary is reported before
    # an unknown transport.
    binary = MCPProxy.find_mcp_binary
    return start_stdio(binary) if @transport == 'stdio'
    return start_sse(binary) if @transport == 'sse'

    raise ServerError, "Unknown transport: #{@transport}"
  end

  # Send TERM to the child, wait for it, then forget the PID and close pipes.
  # Safe to call when the server was never started.
  #
  # @return [void]
  def stop
    if @pid
      Process.kill('TERM', @pid)
      Process.wait(@pid)
    end
  rescue Errno::ESRCH, Errno::ECHILD
    # The child is already gone or was already reaped; nothing left to do.
  ensure
    @pid = nil
    close_pipes
  end

  # Write one JSON-encoded message, followed by a newline, to the child's stdin.
  #
  # @param message [Hash] JSON-RPC message
  # @return [void]
  # @raise [ServerError] If not in stdio mode or the server is not started
  def send_message(message)
    raise ServerError, 'Can only send messages in stdio mode' if @transport != 'stdio'
    raise ServerError, 'Server not started' if @stdin.nil?

    payload = JSON.generate(message)
    @stdin.puts(payload)
    @stdin.flush
  end

  # Read one line from the child's stdout and parse it as JSON.
  #
  # @return [Hash, nil] Parsed message, or nil when the stream is at EOF
  # @raise [ServerError] If not in stdio mode or the server is not started
  def read_message
    raise ServerError, 'Can only read messages in stdio mode' if @transport != 'stdio'
    raise ServerError, 'Server not started' if @stdout.nil?

    raw = @stdout.gets
    raw && JSON.parse(raw)
  end

  # Probe the child with signal 0.
  #
  # @return [Boolean] false when no PID is tracked or the probe raises
  #   ESRCH/EPERM, true otherwise
  def running?
    return false if @pid.nil?

    begin
      Process.kill(0, @pid)
      true
    rescue Errno::ESRCH, Errno::EPERM
      false
    end
  end

  private

  # Spawn the binary with all three standard streams piped back to this object.
  def start_stdio(binary)
    command = [binary.to_s, 'mcp', '--transport', 'stdio']
    @stdin, @stdout, @stderr, waiter = Open3.popen3(*command)
    @pid = waiter.pid
    nil
  end

  # Spawn the binary detached, sharing this process's stdout and stderr.
  def start_sse(binary)
    @pid = spawn(binary.to_s, 'mcp', '--transport', 'sse', out: $stdout, err: $stderr)
    Process.detach(@pid)
    # Fixed one-second pause; presumably gives the server time to come up.
    sleep 1
    @pid
  end

  # Close whichever pipes are open and drop the references.
  def close_pipes
    [@stdin, @stdout, @stderr].each { |io| io&.close }
    @stdin = nil
    @stdout = nil
    @stderr = nil
  end
end
127
+
128
+ module_function
129
+
130
+ # Run MCP server with a block
131
+ #
132
+ # @param transport [String] Transport method
133
+ # @yield [Server] Yields server instance
134
+ # @return [Object] Block result
135
+ #
136
+ # @example
137
+ # Kreuzberg::MCPProxy.run(transport: 'stdio') do |server|
138
+ # server.send_message({ method: 'tools/list' })
139
+ # response = server.read_message
140
+ # end
141
+ #
142
# Start a Server for the given transport, yield it to the block, and stop it
# afterwards whether or not the block raised. Returns the block's value.
def run(transport: 'stdio')
  server = nil
  begin
    server = Server.new(transport: transport)
    server.start
    yield server
  ensure
    # server is still nil if construction itself failed.
    server&.stop
  end
end
149
+
150
+ # Find the MCP binary
151
+ #
152
+ # @return [Pathname] Path to binary
153
+ # @raise [MissingBinaryError] If not found
154
+ #
155
# Return the first candidate from CLIProxy.search_paths that is a regular
# file, using the platform-specific binary name; raise MissingBinaryError
# when none qualifies.
def find_mcp_binary
  executable = Gem.win_platform? ? 'kreuzberg.exe' : 'kreuzberg'
  candidates = CLIProxy.search_paths(executable)
  candidates.find(&:file?) || raise(MissingBinaryError, missing_binary_message)
end
162
+
163
+ # Error message for missing binary
164
+ #
165
+ # @return [String]
166
+ #
167
# Build the help text shown when the kreuzberg binary cannot be located,
# with surrounding whitespace stripped.
def missing_binary_message
  text = <<~MSG
    kreuzberg binary not found for MCP server. Build it with:
    `cargo build --release --package kreuzberg-cli`

    Or ensure kreuzberg is installed with MCP support.
  MSG
  text.strip
end
175
+ end
176
+ end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kreuzberg
4
+ # @example Implementing a custom OCR backend
5
+ # @example Implementing an OCR backend with initialization
6
# Interface for custom OCR backends. Both methods raise NotImplementedError
# until the including class overrides them.
module OcrBackendProtocol
  # @return [String] Unique backend identifier
  # @raise [NotImplementedError] Unless overridden by the including class
  def name
    implementer = self.class
    raise NotImplementedError, "#{implementer} must implement #name"
  end

  # Run OCR over raw image bytes and return the recognized text.
  #
  # The config hash carries the OCR settings, such as:
  # - "language" [String] - Language code (e.g., "eng", "deu", "fra")
  # - "backend" [String] - Backend name (same as #name)
  # - Additional backend-specific settings
  #
  # @param image_bytes [String] Binary image data (PNG, JPEG, TIFF, etc.)
  # @param config [Hash] OCR configuration as described above
  #
  # @return [String] Extracted text content
  # @raise [NotImplementedError] Unless overridden by the including class
  #
  # @example
  #   def process_image(image_bytes, config)
  #     language = config["language"] || "eng"
  #     my_ocr_engine.recognize(image_bytes, language: language)
  #   end
  def process_image(image_bytes, config)
    implementer = self.class
    raise NotImplementedError, "#{implementer} must implement #process_image(image_bytes, config)"
  end
end
40
+ end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kreuzberg
4
+ # @example Implementing a simple post-processor
5
+ # @example Implementing a post-processor that adds metadata
6
+ # @example Using a Proc as a post-processor
7
# Interface for custom post-processors. #call raises NotImplementedError
# until the including class overrides it.
module PostProcessorProtocol
  # @param result [Hash] Extraction result to post-process
  # @return [Hash] Modified extraction result
  # @raise [NotImplementedError] Unless overridden by the including class
  def call(result)
    implementer = self.class
    raise NotImplementedError, "#{implementer} must implement #call(result)"
  end
end
14
+ end
15
+ end