kreuzberg 4.6.1-aarch64-linux → 4.6.3-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/kreuzberg/config.rb +46 -5
- data/lib/kreuzberg/extraction_api.rb +37 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg_rb.so +0 -0
- data/sig/kreuzberg.rbs +21 -0
- data/spec/binding/render_spec.rb +91 -0
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: adae55dc7f30e68a211cc0493985d0d1687b3988e76509fbffd87f91fee45207
|
|
4
|
+
data.tar.gz: 6071b7d76b01dc15b47a11fc5eaeb4292fbb07630d20c3ac113751bbded3de0f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1e90683694a29205d479b3cda7fb367658bf311520884f7c1faa1b4ec1d6be69dad491b620e33d521b8dea893e062c620738b0974157a096670e825a0fd1a434
|
|
7
|
+
data.tar.gz: ff29c19cb5b0085b84ba1a3ad9f97602ed83211e934d5eff1b8a630868a2ff7e919bba5937d7c49cb7a424fbc2239b726750cdca5c8893fdeb6cb4d98540f5fa
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.1" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.3" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -837,6 +837,25 @@ module Kreuzberg
|
|
|
837
837
|
end
|
|
838
838
|
end
|
|
839
839
|
|
|
840
|
+
# Email extraction configuration
|
|
841
|
+
#
|
|
842
|
+
# @example With fallback codepage
|
|
843
|
+
# email = Email.new(msg_fallback_codepage: 1251)
|
|
844
|
+
#
|
|
845
|
+
class Email
|
|
846
|
+
attr_reader :msg_fallback_codepage
|
|
847
|
+
|
|
848
|
+
def initialize(msg_fallback_codepage: nil)
|
|
849
|
+
@msg_fallback_codepage = msg_fallback_codepage&.to_i
|
|
850
|
+
end
|
|
851
|
+
|
|
852
|
+
def to_h
|
|
853
|
+
h = {}
|
|
854
|
+
h[:msg_fallback_codepage] = @msg_fallback_codepage unless @msg_fallback_codepage.nil?
|
|
855
|
+
h
|
|
856
|
+
end
|
|
857
|
+
end
|
|
858
|
+
|
|
840
859
|
# Layout detection configuration
|
|
841
860
|
#
|
|
842
861
|
# @example Basic usage with fast preset
|
|
@@ -933,7 +952,8 @@ module Kreuzberg
|
|
|
933
952
|
:token_reduction, :keywords, :html_options, :pages,
|
|
934
953
|
:max_concurrent_extractions, :output_format, :result_format,
|
|
935
954
|
:security_limits, :layout, :concurrency,
|
|
936
|
-
:cache_namespace, :cache_ttl_secs, :extraction_timeout_secs
|
|
955
|
+
:cache_namespace, :cache_ttl_secs, :extraction_timeout_secs,
|
|
956
|
+
:max_archive_depth, :acceleration, :email
|
|
937
957
|
|
|
938
958
|
# Alias for backward compatibility - image_extraction is the canonical name
|
|
939
959
|
alias image_extraction images
|
|
@@ -959,6 +979,7 @@ module Kreuzberg
|
|
|
959
979
|
postprocessor token_reduction keywords html_options pages
|
|
960
980
|
max_concurrent_extractions output_format result_format
|
|
961
981
|
security_limits layout concurrency cache_namespace cache_ttl_secs extraction_timeout_secs
|
|
982
|
+
max_archive_depth acceleration email
|
|
962
983
|
].freeze
|
|
963
984
|
|
|
964
985
|
# Aliases for backward compatibility
|
|
@@ -1015,7 +1036,7 @@ module Kreuzberg
|
|
|
1015
1036
|
new(**normalize_hash_keys(hash))
|
|
1016
1037
|
end
|
|
1017
1038
|
|
|
1018
|
-
def initialize(hash = nil,
|
|
1039
|
+
def initialize(hash = nil, # rubocop:disable Metrics/MethodLength
|
|
1019
1040
|
use_cache: true,
|
|
1020
1041
|
enable_quality_processing: true,
|
|
1021
1042
|
force_ocr: false,
|
|
@@ -1039,7 +1060,10 @@ module Kreuzberg
|
|
|
1039
1060
|
concurrency: nil,
|
|
1040
1061
|
cache_namespace: nil,
|
|
1041
1062
|
cache_ttl_secs: nil,
|
|
1042
|
-
extraction_timeout_secs: nil
|
|
1063
|
+
extraction_timeout_secs: nil,
|
|
1064
|
+
max_archive_depth: 3,
|
|
1065
|
+
acceleration: nil,
|
|
1066
|
+
email: nil)
|
|
1043
1067
|
kwargs = {
|
|
1044
1068
|
use_cache: use_cache, enable_quality_processing: enable_quality_processing,
|
|
1045
1069
|
force_ocr: force_ocr, force_ocr_pages: force_ocr_pages,
|
|
@@ -1054,7 +1078,10 @@ module Kreuzberg
|
|
|
1054
1078
|
concurrency: concurrency,
|
|
1055
1079
|
cache_namespace: cache_namespace,
|
|
1056
1080
|
cache_ttl_secs: cache_ttl_secs,
|
|
1057
|
-
extraction_timeout_secs: extraction_timeout_secs
|
|
1081
|
+
extraction_timeout_secs: extraction_timeout_secs,
|
|
1082
|
+
max_archive_depth: max_archive_depth,
|
|
1083
|
+
acceleration: acceleration,
|
|
1084
|
+
email: email
|
|
1058
1085
|
}
|
|
1059
1086
|
extracted = extract_from_hash(hash, kwargs)
|
|
1060
1087
|
|
|
@@ -1086,7 +1113,10 @@ module Kreuzberg
|
|
|
1086
1113
|
@pages = normalize_config(params[:pages], PageConfig)
|
|
1087
1114
|
@layout = normalize_config(params[:layout], LayoutDetection)
|
|
1088
1115
|
@concurrency = normalize_config(params[:concurrency], Concurrency)
|
|
1116
|
+
@acceleration = normalize_config(params[:acceleration], Acceleration)
|
|
1117
|
+
@email = normalize_config(params[:email], Email)
|
|
1089
1118
|
@max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
|
|
1119
|
+
@max_archive_depth = params[:max_archive_depth]&.to_i || 3
|
|
1090
1120
|
@output_format = validate_output_format(params[:output_format])
|
|
1091
1121
|
@result_format = validate_result_format(params[:result_format])
|
|
1092
1122
|
@cache_namespace = params[:cache_namespace]
|
|
@@ -1127,6 +1157,7 @@ module Kreuzberg
|
|
|
1127
1157
|
force_ocr_pages: @force_ocr_pages,
|
|
1128
1158
|
include_document_structure: @include_document_structure,
|
|
1129
1159
|
max_concurrent_extractions: @max_concurrent_extractions,
|
|
1160
|
+
max_archive_depth: @max_archive_depth,
|
|
1130
1161
|
output_format: @output_format,
|
|
1131
1162
|
result_format: @result_format,
|
|
1132
1163
|
cache_namespace: @cache_namespace,
|
|
@@ -1142,7 +1173,8 @@ module Kreuzberg
|
|
|
1142
1173
|
image_extraction: @images&.to_h, postprocessor: @postprocessor&.to_h,
|
|
1143
1174
|
token_reduction: @token_reduction&.to_h, keywords: @keywords&.to_h,
|
|
1144
1175
|
html_options: @html_options&.to_h, pages: @pages&.to_h,
|
|
1145
|
-
layout: @layout&.to_h, concurrency: @concurrency&.to_h
|
|
1176
|
+
layout: @layout&.to_h, concurrency: @concurrency&.to_h,
|
|
1177
|
+
acceleration: @acceleration&.to_h, email: @email&.to_h
|
|
1146
1178
|
}
|
|
1147
1179
|
end
|
|
1148
1180
|
|
|
@@ -1286,6 +1318,12 @@ module Kreuzberg
|
|
|
1286
1318
|
@layout = normalize_config(value, LayoutDetection)
|
|
1287
1319
|
when :concurrency
|
|
1288
1320
|
@concurrency = normalize_config(value, Concurrency)
|
|
1321
|
+
when :acceleration
|
|
1322
|
+
@acceleration = normalize_config(value, Acceleration)
|
|
1323
|
+
when :email
|
|
1324
|
+
@email = normalize_config(value, Email)
|
|
1325
|
+
when :max_archive_depth
|
|
1326
|
+
@max_archive_depth = value&.to_i || 3
|
|
1289
1327
|
when :max_concurrent_extractions
|
|
1290
1328
|
@max_concurrent_extractions = value&.to_i
|
|
1291
1329
|
when :output_format
|
|
@@ -1373,6 +1411,9 @@ module Kreuzberg
|
|
|
1373
1411
|
@html_options = merged.html_options
|
|
1374
1412
|
@pages = merged.pages
|
|
1375
1413
|
@layout = merged.layout
|
|
1414
|
+
@acceleration = merged.acceleration
|
|
1415
|
+
@email = merged.email
|
|
1416
|
+
@max_archive_depth = merged.max_archive_depth
|
|
1376
1417
|
end
|
|
1377
1418
|
|
|
1378
1419
|
def update_output_options(merged)
|
|
data/lib/kreuzberg/extraction_api.rb
CHANGED
|
@@ -319,6 +319,43 @@ module Kreuzberg
|
|
|
319
319
|
results
|
|
320
320
|
end
|
|
321
321
|
|
|
322
|
+
# Render a single PDF page as a PNG image.
|
|
323
|
+
#
|
|
324
|
+
# @param path [String, Pathname] Path to the PDF file
|
|
325
|
+
# @param page_index [Integer] Zero-based page index
|
|
326
|
+
# @param dpi [Integer] Rendering resolution (default 150)
|
|
327
|
+
# @return [String] PNG-encoded binary string
|
|
328
|
+
# @raise [Errors::IOError] If the file cannot be read
|
|
329
|
+
# @raise [Errors::ParsingError] If rendering fails
|
|
330
|
+
def render_pdf_page(path, page_index, dpi: 150)
|
|
331
|
+
path_str = path.to_s
|
|
332
|
+
raise ArgumentError, 'page_index must be non-negative' if page_index.negative?
|
|
333
|
+
raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
|
|
334
|
+
|
|
335
|
+
native_render_pdf_page(path_str, page_index, dpi)
|
|
336
|
+
end
|
|
337
|
+
|
|
338
|
+
# Iterate over pages of a PDF lazily, yielding each page as it is rendered.
|
|
339
|
+
#
|
|
340
|
+
# Each page is rendered via the native FFI iterator, so only one page is in
|
|
341
|
+
# memory at a time.
|
|
342
|
+
#
|
|
343
|
+
# @param path [String, Pathname] Path to the PDF file
|
|
344
|
+
# @param dpi [Integer] Rendering resolution (default 150)
|
|
345
|
+
# @yieldparam page_index [Integer] Zero-based page index
|
|
346
|
+
# @yieldparam png_bytes [String] PNG-encoded binary string for the page
|
|
347
|
+
# @return [Enumerator] if no block is given
|
|
348
|
+
# @raise [Errors::IOError] If the file cannot be read
|
|
349
|
+
# @raise [Errors::ParsingError] If rendering fails
|
|
350
|
+
def render_pdf_pages_iter(path, dpi: 150, &block)
|
|
351
|
+
path_str = path.to_s
|
|
352
|
+
raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
|
|
353
|
+
|
|
354
|
+
return enum_for(:render_pdf_pages_iter, path, dpi: dpi) unless block
|
|
355
|
+
|
|
356
|
+
native_render_pdf_pages_iter(path_str, dpi, &block)
|
|
357
|
+
end
|
|
358
|
+
|
|
322
359
|
def normalize_config(config)
|
|
323
360
|
return {} if config.nil?
|
|
324
361
|
return config if config.is_a?(Hash)
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg_rb.so
CHANGED
|
Binary file
|
data/sig/kreuzberg.rbs
CHANGED
|
@@ -459,6 +459,21 @@ module Kreuzberg
|
|
|
459
459
|
def to_h: () -> Hash[Symbol, untyped]
|
|
460
460
|
end
|
|
461
461
|
|
|
462
|
+
class Acceleration
|
|
463
|
+
attr_reader provider: String
|
|
464
|
+
attr_reader device_id: Integer
|
|
465
|
+
|
|
466
|
+
def initialize: (?provider: String, ?device_id: Integer) -> void
|
|
467
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
468
|
+
end
|
|
469
|
+
|
|
470
|
+
class Email
|
|
471
|
+
attr_reader msg_fallback_codepage: Integer?
|
|
472
|
+
|
|
473
|
+
def initialize: (?msg_fallback_codepage: Integer?) -> void
|
|
474
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
475
|
+
end
|
|
476
|
+
|
|
462
477
|
class LayoutDetection
|
|
463
478
|
attr_reader preset: String
|
|
464
479
|
attr_reader confidence_threshold: Float?
|
|
@@ -497,7 +512,10 @@ module Kreuzberg
|
|
|
497
512
|
attr_reader pages: PageConfig?
|
|
498
513
|
attr_reader layout: LayoutDetection?
|
|
499
514
|
attr_reader concurrency: Concurrency?
|
|
515
|
+
attr_reader acceleration: Acceleration?
|
|
516
|
+
attr_reader email: Email?
|
|
500
517
|
attr_reader max_concurrent_extractions: Integer?
|
|
518
|
+
attr_reader max_archive_depth: Integer
|
|
501
519
|
attr_reader output_format: String?
|
|
502
520
|
attr_reader result_format: String?
|
|
503
521
|
attr_reader security_limits: Hash[String, Integer]?
|
|
@@ -524,7 +542,10 @@ module Kreuzberg
|
|
|
524
542
|
?pages: (PageConfig | Hash[Symbol, untyped])?,
|
|
525
543
|
?layout: (LayoutDetection | Hash[Symbol, untyped])?,
|
|
526
544
|
?concurrency: (Concurrency | Hash[Symbol, untyped])?,
|
|
545
|
+
?acceleration: (Acceleration | Hash[Symbol, untyped])?,
|
|
546
|
+
?email: (Email | Hash[Symbol, untyped])?,
|
|
527
547
|
?max_concurrent_extractions: Integer?,
|
|
548
|
+
?max_archive_depth: Integer,
|
|
528
549
|
?output_format: String?,
|
|
529
550
|
?result_format: String?,
|
|
530
551
|
?cache_namespace: String?,
|
|
data/spec/binding/render_spec.rb
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# Hand-written binding-specific edge case tests for PDF rendering.
|
|
2
|
+
# Happy-path render tests are auto-generated from fixtures in e2e/.
|
|
3
|
+
# These tests cover error handling, validation, and lifecycle patterns
|
|
4
|
+
# that vary per language and can't be generated uniformly.
|
|
5
|
+
|
|
6
|
+
# frozen_string_literal: true
|
|
7
|
+
|
|
8
|
+
require 'spec_helper'
|
|
9
|
+
|
|
10
|
+
RSpec.describe 'PDF Rendering' do
|
|
11
|
+
it 'exposes rendering methods' do
|
|
12
|
+
expect(Kreuzberg).to respond_to(:render_pdf_page)
|
|
13
|
+
expect(Kreuzberg).to respond_to(:render_pdf_pages_iter)
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
describe '.render_pdf_page' do
|
|
17
|
+
it 'raises an error for a nonexistent file' do
|
|
18
|
+
expect do
|
|
19
|
+
Kreuzberg.render_pdf_page('/nonexistent/path/to/document.pdf', 0)
|
|
20
|
+
end.to raise_error(Kreuzberg::Errors::IOError)
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
it 'raises an error for an out-of-bounds page index' do
|
|
24
|
+
pdf_path = test_document_path('pdf/tiny.pdf')
|
|
25
|
+
skip 'Test PDF not available' unless File.exist?(pdf_path)
|
|
26
|
+
|
|
27
|
+
expect do
|
|
28
|
+
Kreuzberg.render_pdf_page(pdf_path, 9999)
|
|
29
|
+
end.to raise_error(StandardError)
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
describe '.render_pdf_page with negative index' do
|
|
34
|
+
it 'raises ArgumentError for a negative page index' do
|
|
35
|
+
pdf_path = test_document_path('pdf/tiny.pdf')
|
|
36
|
+
skip 'Test PDF not available' unless File.exist?(pdf_path)
|
|
37
|
+
|
|
38
|
+
expect do
|
|
39
|
+
Kreuzberg.render_pdf_page(pdf_path, -1)
|
|
40
|
+
end.to raise_error(ArgumentError)
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
describe '.render_pdf_pages_iter' do
|
|
45
|
+
it 'raises an error for a nonexistent file' do
|
|
46
|
+
expect do
|
|
47
|
+
Kreuzberg.render_pdf_pages_iter('/nonexistent/path/to/document.pdf') { |_, _| nil }
|
|
48
|
+
end.to raise_error(Kreuzberg::Errors::IOError)
|
|
49
|
+
end
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
describe '.render_pdf_page with empty path' do
|
|
53
|
+
it 'raises an error for an empty path' do
|
|
54
|
+
expect do
|
|
55
|
+
Kreuzberg.render_pdf_page('', 0)
|
|
56
|
+
end.to raise_error(StandardError)
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
describe '.render_pdf_pages_iter cleanup' do
|
|
61
|
+
it 'handles iterator cleanup without fully consuming' do
|
|
62
|
+
pdf_path = test_document_path('pdf/tiny.pdf')
|
|
63
|
+
skip 'Test PDF not available' unless File.exist?(pdf_path)
|
|
64
|
+
|
|
65
|
+
# Iterate but stop immediately — no crash
|
|
66
|
+
Kreuzberg.render_pdf_pages_iter(pdf_path) do |_page_index, _png_data|
|
|
67
|
+
break
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
describe '.render_pdf_pages_iter early termination' do
|
|
73
|
+
it 'returns valid PNG for the first page then stops' do
|
|
74
|
+
pdf_path = test_document_path('pdf/tiny.pdf')
|
|
75
|
+
skip 'Test PDF not available' unless File.exist?(pdf_path)
|
|
76
|
+
|
|
77
|
+
first_png = nil
|
|
78
|
+
Kreuzberg.render_pdf_pages_iter(pdf_path) do |page_index, png_data|
|
|
79
|
+
expect(page_index).to eq(0)
|
|
80
|
+
expect(png_data).to be_a(String)
|
|
81
|
+
expect(png_data.bytesize).to be > 8
|
|
82
|
+
# PNG magic bytes
|
|
83
|
+
expect(png_data.bytes[0..3]).to eq([0x89, 0x50, 0x4E, 0x47])
|
|
84
|
+
first_png = png_data
|
|
85
|
+
break
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
expect(first_png).not_to be_nil
|
|
89
|
+
end
|
|
90
|
+
end
|
|
91
|
+
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.6.1
|
|
4
|
+
version: 4.6.3
|
|
5
5
|
platform: aarch64-linux
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-03-
|
|
11
|
+
date: 2026-03-27 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -220,6 +220,7 @@ files:
|
|
|
220
220
|
- spec/binding/plugins/ocr_backend_spec.rb
|
|
221
221
|
- spec/binding/plugins/postprocessor_spec.rb
|
|
222
222
|
- spec/binding/plugins/validator_spec.rb
|
|
223
|
+
- spec/binding/render_spec.rb
|
|
223
224
|
- spec/binding/tables_spec.rb
|
|
224
225
|
- spec/serialization_spec.rb
|
|
225
226
|
- spec/smoke/package_spec.rb
|