kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,125 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'open3'
4
+
5
+ module Kreuzberg
6
+ # @example Start the server
7
+ # @example With block
8
+ module APIProxy
9
# Base class for every error raised from the APIProxy namespace.
class Error < Kreuzberg::Errors::Error
end

# Raised by find_api_binary when none of the candidate paths is a file.
class MissingBinaryError < Error
end

# Error type for failures of the API server process
# (Server#start is documented as raising it).
class ServerError < Error
end
12
+
13
# Handle for a kreuzberg API server running as a child process.
#
# @example Start and stop manually
#   server = Kreuzberg::APIProxy::Server.new(port: 8000)
#   server.start
#   server.running? # => true
#   server.stop
class Server
  # Seconds to wait after spawning before checking that the child survived.
  STARTUP_WAIT_SECONDS = 1

  attr_reader :port, :host, :pid

  # Initialize server (nothing is spawned until #start is called)
  #
  # @param port [Integer] Port to run on (default: 8000)
  # @param host [String] Host to bind to (default: "0.0.0.0")
  #
  def initialize(port: 8000, host: '0.0.0.0')
    @port = port
    @host = host
    @pid = nil
    @process = nil
  end

  # Start the server in the background
  #
  # Spawns `<binary> api --host HOST --port PORT` with the child's output
  # wired to this process's $stdout/$stderr, then waits
  # STARTUP_WAIT_SECONDS and verifies the child is still alive.
  #
  # @return [Integer] Process ID
  # @raise [MissingBinaryError] If the kreuzberg binary cannot be located
  # @raise [ServerError] If the child exited during the startup wait
  #
  def start
    binary = APIProxy.find_api_binary
    @pid = spawn(
      binary.to_s, 'api',
      '--host', @host.to_s,
      '--port', @port.to_s,
      out: $stdout, err: $stderr
    )
    # Keep the waiter thread: it reaps the child and is the source of truth
    # for liveness, which avoids racing it with Process.wait later on.
    @process = Process.detach(@pid)
    sleep STARTUP_WAIT_SECONDS
    return @pid if running?

    dead_pid = @pid
    @pid = nil
    @process = nil
    raise ServerError, "kreuzberg API server (pid #{dead_pid}) exited during startup"
  end

  # Stop the server
  #
  # Sends TERM and waits for the child to be reaped. Safe to call when the
  # server was never started or has already exited.
  #
  # @return [void]
  #
  def stop
    return unless @pid

    # Only signal while the waiter thread is alive; once the child has been
    # reaped its PID may belong to an unrelated process.
    if running?
      Process.kill('TERM', @pid)
      @process.join
    end
    nil
  rescue Errno::ESRCH, Errno::ECHILD
    # The child vanished between the liveness check and the signal.
    nil
  ensure
    @pid = nil
    @process = nil
  end

  # Check if server is running
  #
  # @return [Boolean] true while the spawned child has not yet exited
  #
  def running?
    return false unless @pid && @process

    @process.alive?
  end
end
76
+
77
+ module_function
78
+
79
# Start a server, hand it to the block, and always stop it afterwards.
#
# @param port [Integer] Port to run on
# @param host [String] Host to bind to
# @yield [Server] The started server instance
# @return [Object] Whatever the block returns
#
# @example
#   Kreuzberg::APIProxy.run(port: 8000) do |server|
#     # Make API requests
#   end
#
def run(port: 8000, host: '0.0.0.0')
  server = nil
  begin
    server = Server.new(port: port, host: host)
    server.start
    yield(server)
  ensure
    # server is still nil when construction itself failed.
    server.stop unless server.nil?
  end
end
98
+
99
# Locate the kreuzberg binary used to run the API server.
#
# Candidates come from CLIProxy.search_paths; the first one that is a
# regular file wins.
#
# @return [Pathname] Path to binary
# @raise [MissingBinaryError] If no candidate is a file
#
def find_api_binary
  executable = Gem.win_platform? ? 'kreuzberg.exe' : 'kreuzberg'
  CLIProxy.search_paths(executable).each do |candidate|
    return candidate if candidate.file?
  end

  raise MissingBinaryError, missing_binary_message
end
111
+
112
# Text used for MissingBinaryError when the API binary is absent.
#
# @return [String]
#
def missing_binary_message
  [
    'kreuzberg binary not found for API server. Build it with:',
    '`cargo build --release --package kreuzberg-cli`',
    '',
    'Or ensure kreuzberg is installed with API support.'
  ].join("\n")
end
124
+ end
125
+ end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kreuzberg
4
# Cache helpers meant to be mixed into an object that provides
# `native_clear_cache`, `native_cache_stats` and an `@__cache_tracker`
# hash with :entries and :bytes counters.
module CacheAPI
  # Clear the native cache and zero the local tracker.
  #
  # @return [void] No meaningful return value
  # @example Clear cache
  #   Kreuzberg.clear_cache
  def clear_cache
    native_clear_cache
    reset_cache_tracker!
  end

  # Retrieve cache statistics.
  #
  # Adds the local tracker counters to the totals reported by
  # `native_cache_stats` and writes the sums back into that same hash under
  # both String and Symbol keys. A total missing from the native hash
  # counts as 0.
  #
  # @return [Hash{Symbol | String => Integer}] Cache statistics hash containing:
  #   - :total_entries [Integer] Total number of cached extraction results
  #   - :total_size_bytes [Integer] Total memory used by cached results in bytes
  #
  # @example Get cache statistics
  #   stats = Kreuzberg.cache_stats
  #   puts "Cached entries: #{stats[:total_entries]}"
  #   puts "Cache size: #{stats[:total_size_bytes]} bytes"
  #
  # @example Check if cache is full
  #   stats = Kreuzberg.cache_stats
  #   if stats[:total_size_bytes] > 1_000_000_000 # 1GB
  #     Kreuzberg.clear_cache
  #   end
  def cache_stats
    stats = native_cache_stats
    counters = { 'total_entries' => :entries, 'total_size_bytes' => :bytes }

    # Compute every total before touching the hash, then publish each one
    # under its String key followed by its Symbol key.
    totals = counters.map do |name, counter|
      native_value = stats[name] || stats[name.to_sym] || 0
      [name, native_value + @__cache_tracker[counter]]
    end
    totals.each do |name, total|
      stats[name] = total
      stats[name.to_sym] = total
    end

    stats
  end

  private

  # Count results in the local tracker unless caching was switched off.
  #
  # @param results [Object, Array<Object>] One result or a list of results
  # @param opts [Hash] Options; `use_cache: false` (or nil) disables tracking
  def record_cache_entry!(results, opts)
    return if opts.key?(:use_cache) && !opts[:use_cache]

    batch = case results
            when Array then results
            else [results]
            end
    batch.each do |item|
      # @type var item: Result
      if item.respond_to?(:content)
        @__cache_tracker[:entries] += 1
        @__cache_tracker[:bytes] += item.content.to_s.bytesize
      end
    end
  end

  # Zero both tracker counters.
  #
  # @return [nil]
  def reset_cache_tracker!
    %i[entries bytes].each { |counter| @__cache_tracker[counter] = 0 }
    nil
  end
end
67
+ end
@@ -0,0 +1,57 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kreuzberg
4
# Convenience wrappers that run kreuzberg CLI commands through CLIProxy.
#
# @example Extract a file
#   Kreuzberg::CLI.extract('document.pdf', output: 'json')
#
# @example Detect file type
#   Kreuzberg::CLI.detect('document.pdf')
module CLI
  module_function

  # Run `extract <path> --format <output> --ocr <true|false>`.
  #
  # The path may be given positionally or as `path:`; the positional value
  # takes precedence when both are supplied.
  #
  # @param path_or_nil [String, nil] Path to the file (positional, for backward compatibility)
  # @param path [String] Path to the file (keyword argument)
  # @param output [String] Output format ("text", "json", "markdown")
  # @param ocr [Boolean] Enable OCR
  # @return [String] Extracted content
  # @raise [ArgumentError] If no path is given
  #
  def extract(path_or_nil = nil, path: nil, output: 'text', ocr: false)
    target = path_or_nil || path
    raise ArgumentError, 'path is required' if target.nil?

    ocr_flag = ocr ? 'true' : 'false'
    CLIProxy.call(['extract', target, '--format', output, '--ocr', ocr_flag])
  end

  # Run `detect <path>` and return the output with surrounding whitespace removed.
  #
  # @param path_or_nil [String, nil] Path to the file (positional, for backward compatibility)
  # @param path [String] Path to the file (keyword argument)
  # @return [String] MIME type
  # @raise [ArgumentError] If no path is given
  #
  def detect(path_or_nil = nil, path: nil)
    target = path_or_nil || path
    raise ArgumentError, 'path is required' if target.nil?

    raw = CLIProxy.call(['detect', target])
    raw.strip
  end

  # Run `--version` and return the output with surrounding whitespace removed.
  #
  # @return [String] Version string
  #
  def version
    raw = CLIProxy.call(['--version'])
    raw.strip
  end

  # Run `--help` and return the output untouched.
  #
  # @return [String] Help text
  #
  def help
    CLIProxy.call(['--help'])
  end
end
57
+ end
@@ -0,0 +1,118 @@
1
+ # frozen_string_literal: true
2
+
3
require 'open3'
require 'pathname'
4
+
5
+ module Kreuzberg
6
+ # @example
7
+ module CLIProxy
8
# Base class for every error raised from the CLIProxy namespace.
class Error < Kreuzberg::Errors::Error
end

# Raised by find_cli_binary when none of the candidate paths is a file.
class MissingBinaryError < Error
end
10
+
11
# Raised when the CLI process does not finish successfully.
#
# Carries the captured standard error and the exit status alongside the
# message.
class CLIExecutionError < Error
  # @return [String] Captured standard error of the CLI
  attr_reader :stderr
  # @return [Integer, nil] Exit status reported for the CLI process
  attr_reader :status

  # @param message [String] Human-readable description
  # @param stderr [String] Captured standard error
  # @param status [Integer, nil] Exit status
  def initialize(message, stderr:, status:)
    @stderr = stderr
    @status = status
    super(message)
  end
end
21
+
22
+ module_function
23
+
24
# Execute the Kreuzberg CLI with given arguments
#
# Every argument is converted with `to_s` before being passed to the
# binary. Standard output is returned only when the process succeeds.
#
# @param argv [Array<String>] Command-line arguments
# @return [String] Standard output from the CLI
# @raise [CLIExecutionError] If the CLI exits with non-zero status or is
#   terminated by a signal (in which case `status` is nil)
# @raise [MissingBinaryError] If the CLI binary is not found
#
# @example Extract a file
#   output = Kreuzberg::CLIProxy.call(['extract', 'document.pdf'])
#
# @example Detect file type
#   output = Kreuzberg::CLIProxy.call(['detect', 'document.pdf'])
#
def call(argv)
  binary = find_cli_binary
  args = Array(argv).map(&:to_s)
  stdout, stderr, status = Open3.capture3(binary.to_s, *args)
  return stdout if status.success?

  # A signal-terminated process has no exit status; report the signal
  # instead of an empty "exited with status " message.
  message = if status.signaled?
              "kreuzberg CLI terminated by signal #{status.termsig}"
            else
              "kreuzberg CLI exited with status #{status.exitstatus}"
            end

  raise CLIExecutionError.new(message, stderr: stderr, status: status.exitstatus)
end
49
+
50
# Locate the kreuzberg CLI binary.
#
# Walks the candidates produced by search_paths (lib/bin, lib, and the
# cargo target directories of the surrounding workspace) and returns the
# first one that is a regular file.
#
# @return [Pathname] Path to the CLI binary
# @raise [MissingBinaryError] If no candidate is a file
#
def find_cli_binary
  executable = Gem.win_platform? ? 'kreuzberg.exe' : 'kreuzberg'
  search_paths(executable).each do |candidate|
    return candidate if candidate.file?
  end

  raise MissingBinaryError, missing_binary_message
end
67
+
68
# Get the root path of the Ruby package
#
# Resolved as two directories above this file's directory (falling back to
# the current directory when `__dir__` is unavailable) and memoized.
# Requires the `pathname` standard library to be loaded.
#
# @return [Pathname] Absolute root path
#
def root_path
  @root_path ||= Pathname(__dir__ || '.').join('../..').expand_path
end
75
+
76
# Get the lib path
#
# Resolved as the parent of this file's directory (falling back to the
# current directory when `__dir__` is unavailable) and memoized.
# Requires the `pathname` standard library to be loaded.
#
# @return [Pathname] Absolute lib path
#
def lib_path
  @lib_path ||= Pathname(__dir__ || '.').join('..').expand_path
end
83
+
84
# Search paths for the CLI binary
#
# Candidates are listed in lookup order: the gem's own `lib/bin` and `lib`
# directories first, then cargo build outputs two levels above the package
# root.
#
# @param binary_name [String] Name of the binary
# @return [Array<Pathname>] List of paths to search
#
def search_paths(binary_name)
  # Pathname#parent always returns a Pathname, so no nil guard is needed.
  workspace_root = root_path.parent.parent

  [
    lib_path.join('bin', binary_name),
    lib_path.join(binary_name),
    root_path.join('../../crates/kreuzberg-cli/target/release', binary_name),
    root_path.join('../../target/release', binary_name),
    root_path.join('../../target/debug', binary_name),
    # Kept so the returned list stays the same length and order as before.
    workspace_root.join('target', 'release', binary_name),
    workspace_root.join('target', 'debug', binary_name)
  ]
end
104
+
105
# Text used for MissingBinaryError when the CLI binary is absent.
#
# @return [String] Error message
#
def missing_binary_message
  [
    'kreuzberg CLI binary not found. Build it with:',
    '`cargo build --release --package kreuzberg-cli`',
    '',
    'Or install the gem with pre-built binaries.'
  ].join("\n")
end
117
+ end
118
+ end