kreuzberg 4.3.5-aarch64-linux → 4.3.7-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -5
- data/README.md +1 -1
- data/kreuzberg.gemspec +1 -1
- data/lib/kreuzberg/config.rb +13 -3
- data/lib/kreuzberg/result.rb +32 -2
- data/lib/kreuzberg/types.rb +20 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg_rb.so +0 -0
- data/sig/kreuzberg.rbs +28 -2
- data/spec/binding/embeddings_spec.rb +0 -682
- data/spec/binding/images_spec.rb +0 -577
- data/spec/binding/keywords_extraction_spec.rb +0 -548
- data/spec/binding/pages_extraction_spec.rb +0 -449
- data/spec/binding/tables_spec.rb +0 -467
- data/spec/smoke/package_spec.rb +22 -0
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 8519525216a517ce9d2078552b1492c4209c2d1a8bff7b58eae88a8137a603d4
|
|
4
|
+
data.tar.gz: ab098d31da39a6965c57f65d4a6a282570333ed6a0a8d00886c22982b1bce76c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 209c8c32c54f5b73e90954ae9db136aa87d13e9738c48de6e89282396986cc705b1f7c64841a7358f6357d34980f023890c8031a6e10a32eb20237564fad3d30
|
|
7
|
+
data.tar.gz: c92af45298595d9bd433dd9b01223b45870063149042bd3d36b26d5201a794b78fffa2f79cf6d9621a80e5d65751ea7fec52756fce44a28e53de1d6a7cf79e2f
|
data/Gemfile.lock
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.3.5)
|
|
4
|
+
kreuzberg (4.3.7)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
|
+
sorbet-runtime (~> 0.5)
|
|
6
7
|
|
|
7
8
|
GEM
|
|
8
9
|
remote: https://rubygems.org/
|
|
@@ -123,7 +124,7 @@ GEM
|
|
|
123
124
|
rubocop (~> 1.81)
|
|
124
125
|
ruby-progressbar (1.13.0)
|
|
125
126
|
securerandom (0.4.1)
|
|
126
|
-
sorbet-runtime (0.6.
|
|
127
|
+
sorbet-runtime (0.6.12956)
|
|
127
128
|
steep (1.10.0)
|
|
128
129
|
activesupport (>= 5.1)
|
|
129
130
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -179,7 +180,6 @@ DEPENDENCIES
|
|
|
179
180
|
rubocop (~> 1.66)
|
|
180
181
|
rubocop-performance (~> 1.21)
|
|
181
182
|
rubocop-rspec (~> 3.0)
|
|
182
|
-
sorbet-runtime (~> 0.5)
|
|
183
183
|
steep (~> 1.8)
|
|
184
184
|
yard (~> 0.9)
|
|
185
185
|
|
|
@@ -210,7 +210,7 @@ CHECKSUMS
|
|
|
210
210
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
211
211
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
212
212
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
213
|
-
kreuzberg (4.3.5)
|
|
213
|
+
kreuzberg (4.3.7)
|
|
214
214
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
215
215
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
216
216
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
@@ -245,7 +245,7 @@ CHECKSUMS
|
|
|
245
245
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
246
246
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
247
247
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
248
|
-
sorbet-runtime (0.6.
|
|
248
|
+
sorbet-runtime (0.6.12956) sha256=fee716a62d0b1d94ebc8e6ba23e76a7654eeac66c1f5cc1e1bef78b8e9ff87c7
|
|
249
249
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
250
250
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
251
251
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.5" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.7" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
data/kreuzberg.gemspec
CHANGED
|
@@ -241,7 +241,7 @@ Gem::Specification.new do |spec|
|
|
|
241
241
|
spec.add_development_dependency 'rake', '~> 13.0'
|
|
242
242
|
spec.add_development_dependency 'rake-compiler', '~> 1.2'
|
|
243
243
|
spec.add_development_dependency 'rspec', '~> 3.12'
|
|
244
|
-
spec.add_development_dependency 'sorbet-runtime', '~> 0.5'
|
|
244
|
+
spec.add_dependency 'sorbet-runtime', '~> 0.5'
|
|
245
245
|
unless Gem.win_platform?
|
|
246
246
|
spec.add_development_dependency 'rbs', '~> 3.0'
|
|
247
247
|
spec.add_development_dependency 'rubocop', '~> 1.66'
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -391,14 +391,18 @@ module Kreuzberg
|
|
|
391
391
|
# pdf = PDF.new(extract_images: true, hierarchy: hierarchy)
|
|
392
392
|
#
|
|
393
393
|
class PDF
|
|
394
|
-
attr_reader :extract_images, :passwords, :extract_metadata, :font_config, :hierarchy
|
|
394
|
+
attr_reader :extract_images, :passwords, :extract_metadata, :font_config, :hierarchy,
|
|
395
|
+
:extract_annotations, :top_margin_fraction, :bottom_margin_fraction
|
|
395
396
|
|
|
396
397
|
def initialize(
|
|
397
398
|
extract_images: false,
|
|
398
399
|
passwords: nil,
|
|
399
400
|
extract_metadata: true,
|
|
400
401
|
font_config: nil,
|
|
401
|
-
hierarchy: nil
|
|
402
|
+
hierarchy: nil,
|
|
403
|
+
extract_annotations: false,
|
|
404
|
+
top_margin_fraction: nil,
|
|
405
|
+
bottom_margin_fraction: nil
|
|
402
406
|
)
|
|
403
407
|
@extract_images = extract_images ? true : false
|
|
404
408
|
@passwords = if passwords.is_a?(Array)
|
|
@@ -409,6 +413,9 @@ module Kreuzberg
|
|
|
409
413
|
@extract_metadata = extract_metadata ? true : false
|
|
410
414
|
@font_config = normalize_font_config(font_config)
|
|
411
415
|
@hierarchy = normalize_hierarchy(hierarchy)
|
|
416
|
+
@extract_annotations = extract_annotations ? true : false
|
|
417
|
+
@top_margin_fraction = top_margin_fraction&.to_f
|
|
418
|
+
@bottom_margin_fraction = bottom_margin_fraction&.to_f
|
|
412
419
|
end
|
|
413
420
|
|
|
414
421
|
def to_h
|
|
@@ -417,7 +424,10 @@ module Kreuzberg
|
|
|
417
424
|
passwords: @passwords,
|
|
418
425
|
extract_metadata: @extract_metadata,
|
|
419
426
|
font_config: @font_config&.to_h,
|
|
420
|
-
hierarchy: @hierarchy&.to_h
|
|
427
|
+
hierarchy: @hierarchy&.to_h,
|
|
428
|
+
extract_annotations: @extract_annotations,
|
|
429
|
+
top_margin_fraction: @top_margin_fraction,
|
|
430
|
+
bottom_margin_fraction: @bottom_margin_fraction
|
|
421
431
|
}.compact
|
|
422
432
|
end
|
|
423
433
|
|
data/lib/kreuzberg/result.rb
CHANGED
|
@@ -14,7 +14,7 @@ module Kreuzberg
|
|
|
14
14
|
class Result
|
|
15
15
|
attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
|
|
16
16
|
:detected_languages, :chunks, :images, :pages, :elements, :ocr_elements, :djot_content,
|
|
17
|
-
:document, :extracted_keywords, :quality_score, :processing_warnings
|
|
17
|
+
:document, :extracted_keywords, :quality_score, :processing_warnings, :annotations
|
|
18
18
|
|
|
19
19
|
# @!attribute [r] cells
|
|
20
20
|
# @return [Array<Array<String>>] Table cells (2D array)
|
|
@@ -339,6 +339,7 @@ module Kreuzberg
|
|
|
339
339
|
@extracted_keywords = parse_extracted_keywords(get_value(hash, 'extracted_keywords'))
|
|
340
340
|
@quality_score = get_value(hash, 'quality_score')
|
|
341
341
|
@processing_warnings = parse_processing_warnings(get_value(hash, 'processing_warnings'))
|
|
342
|
+
@annotations = parse_annotations(get_value(hash, 'annotations'))
|
|
342
343
|
end
|
|
343
344
|
# rubocop:enable Metrics/AbcSize
|
|
344
345
|
|
|
@@ -346,6 +347,7 @@ module Kreuzberg
|
|
|
346
347
|
#
|
|
347
348
|
# @return [Hash] Hash representation
|
|
348
349
|
#
|
|
350
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
349
351
|
def to_h
|
|
350
352
|
{
|
|
351
353
|
content: @content,
|
|
@@ -362,9 +364,11 @@ module Kreuzberg
|
|
|
362
364
|
document: @document&.to_h,
|
|
363
365
|
extracted_keywords: @extracted_keywords&.map(&:to_h),
|
|
364
366
|
quality_score: @quality_score,
|
|
365
|
-
processing_warnings: @processing_warnings.map(&:to_h)
|
|
367
|
+
processing_warnings: @processing_warnings.map(&:to_h),
|
|
368
|
+
annotations: @annotations&.map(&:to_h)
|
|
366
369
|
}
|
|
367
370
|
end
|
|
371
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
368
372
|
|
|
369
373
|
# Convert to JSON
|
|
370
374
|
#
|
|
@@ -707,6 +711,32 @@ module Kreuzberg
|
|
|
707
711
|
)
|
|
708
712
|
end
|
|
709
713
|
end
|
|
714
|
+
|
|
715
|
+
def parse_annotations(annotations_data)
|
|
716
|
+
return nil if annotations_data.nil?
|
|
717
|
+
|
|
718
|
+
annotations_data.map { |a_hash| build_annotation(a_hash) }
|
|
719
|
+
end
|
|
720
|
+
|
|
721
|
+
def build_annotation(a_hash)
|
|
722
|
+
PdfAnnotation.new(
|
|
723
|
+
annotation_type: a_hash['annotation_type'] || '',
|
|
724
|
+
content: a_hash['content'],
|
|
725
|
+
page_number: a_hash['page_number']&.to_i,
|
|
726
|
+
bounding_box: build_annotation_bbox(a_hash['bounding_box'])
|
|
727
|
+
)
|
|
728
|
+
end
|
|
729
|
+
|
|
730
|
+
def build_annotation_bbox(bbox_data)
|
|
731
|
+
return nil if bbox_data.nil?
|
|
732
|
+
|
|
733
|
+
PdfAnnotationBoundingBox.new(
|
|
734
|
+
left: bbox_data['left']&.to_f,
|
|
735
|
+
top: bbox_data['top']&.to_f,
|
|
736
|
+
right: bbox_data['right']&.to_f,
|
|
737
|
+
bottom: bbox_data['bottom']&.to_f
|
|
738
|
+
)
|
|
739
|
+
end
|
|
710
740
|
end
|
|
711
741
|
# rubocop:enable Metrics/ClassLength
|
|
712
742
|
end
|
data/lib/kreuzberg/types.rb
CHANGED
|
@@ -411,4 +411,24 @@ module Kreuzberg
|
|
|
411
411
|
|
|
412
412
|
const :nodes, T::Array[DocumentNode]
|
|
413
413
|
end
|
|
414
|
+
|
|
415
|
+
# Bounding box for a PDF annotation.
|
|
416
|
+
class PdfAnnotationBoundingBox < T::Struct
|
|
417
|
+
extend T::Sig
|
|
418
|
+
|
|
419
|
+
const :left, T.nilable(Float)
|
|
420
|
+
const :top, T.nilable(Float)
|
|
421
|
+
const :right, T.nilable(Float)
|
|
422
|
+
const :bottom, T.nilable(Float)
|
|
423
|
+
end
|
|
424
|
+
|
|
425
|
+
# A PDF annotation extracted from a document page.
|
|
426
|
+
class PdfAnnotation < T::Struct
|
|
427
|
+
extend T::Sig
|
|
428
|
+
|
|
429
|
+
const :annotation_type, String
|
|
430
|
+
const :content, T.nilable(String)
|
|
431
|
+
const :page_number, T.nilable(Integer)
|
|
432
|
+
const :bounding_box, T.nilable(PdfAnnotationBoundingBox)
|
|
433
|
+
end
|
|
414
434
|
end
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg_rb.so
CHANGED
|
Binary file
|
data/sig/kreuzberg.rbs
CHANGED
|
@@ -323,8 +323,11 @@ module Kreuzberg
|
|
|
323
323
|
attr_reader extract_metadata: bool
|
|
324
324
|
attr_reader font_config: FontConfig?
|
|
325
325
|
attr_reader hierarchy: Hierarchy?
|
|
326
|
+
attr_reader extract_annotations: bool
|
|
327
|
+
attr_reader top_margin_fraction: Float?
|
|
328
|
+
attr_reader bottom_margin_fraction: Float?
|
|
326
329
|
|
|
327
|
-
def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?, ?hierarchy: (Hierarchy | Hash[Symbol, untyped])?) -> void
|
|
330
|
+
def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?, ?hierarchy: (Hierarchy | Hash[Symbol, untyped])?, ?extract_annotations: bool, ?top_margin_fraction: Float?, ?bottom_margin_fraction: Float?) -> void
|
|
328
331
|
def to_h: () -> Hash[Symbol, untyped]
|
|
329
332
|
end
|
|
330
333
|
|
|
@@ -525,6 +528,15 @@ module Kreuzberg
|
|
|
525
528
|
end
|
|
526
529
|
|
|
527
530
|
# Extraction result type
|
|
531
|
+
type pdf_annotation_type = 'text' | 'highlight' | 'link' | 'stamp' | 'underline' | 'strike_out' | 'other'
|
|
532
|
+
|
|
533
|
+
type pdf_annotation_hash = {
|
|
534
|
+
annotation_type: String,
|
|
535
|
+
content: String?,
|
|
536
|
+
page_number: Integer,
|
|
537
|
+
bounding_box: { x0: Float, y0: Float, x1: Float, y1: Float }?
|
|
538
|
+
}
|
|
539
|
+
|
|
528
540
|
type extraction_result_hash = {
|
|
529
541
|
content: String,
|
|
530
542
|
mime_type: String,
|
|
@@ -541,7 +553,8 @@ module Kreuzberg
|
|
|
541
553
|
document: document_structure_hash?,
|
|
542
554
|
extracted_keywords: Array[extracted_keyword_hash]?,
|
|
543
555
|
quality_score: Float?,
|
|
544
|
-
processing_warnings: Array[processing_warning_hash]
|
|
556
|
+
processing_warnings: Array[processing_warning_hash]?,
|
|
557
|
+
annotations: Array[pdf_annotation_hash]?
|
|
545
558
|
}
|
|
546
559
|
|
|
547
560
|
type extracted_keyword_hash = {
|
|
@@ -1076,6 +1089,18 @@ module Kreuzberg
|
|
|
1076
1089
|
attr_reader extracted_keywords: Array[ExtractedKeyword]?
|
|
1077
1090
|
attr_reader quality_score: Float?
|
|
1078
1091
|
attr_reader processing_warnings: Array[ProcessingWarning]?
|
|
1092
|
+
attr_reader annotations: Array[PdfAnnotation]?
|
|
1093
|
+
|
|
1094
|
+
# PDF annotation extracted from a document page (T::Struct defined in types.rb)
|
|
1095
|
+
class PdfAnnotation
|
|
1096
|
+
attr_reader annotation_type: String
|
|
1097
|
+
attr_reader content: String?
|
|
1098
|
+
attr_reader page_number: Integer
|
|
1099
|
+
attr_reader bounding_box: BoundingBox?
|
|
1100
|
+
|
|
1101
|
+
def initialize: (annotation_type: String, content: String?, page_number: Integer, bounding_box: BoundingBox?) -> void
|
|
1102
|
+
def to_h: () -> pdf_annotation_hash
|
|
1103
|
+
end
|
|
1079
1104
|
|
|
1080
1105
|
def initialize: (extraction_result_hash hash) -> void
|
|
1081
1106
|
def to_h: () -> Hash[Symbol, untyped]
|
|
@@ -1113,6 +1138,7 @@ module Kreuzberg
|
|
|
1113
1138
|
def parse_ocr_geometry: (Hash[String, untyped]? data) -> OcrBoundingGeometry?
|
|
1114
1139
|
def parse_ocr_confidence: (Hash[String, untyped]? data) -> OcrConfidence?
|
|
1115
1140
|
def parse_ocr_rotation: (Hash[String, untyped]? data) -> OcrRotation?
|
|
1141
|
+
def parse_annotations: (Array[pdf_annotation_hash]? annotations_data) -> Array[PdfAnnotation]?
|
|
1116
1142
|
end
|
|
1117
1143
|
|
|
1118
1144
|
# Module methods (extraction API)
|