kreuzberg 4.3.6-aarch64-linux → 4.3.7-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4894dca10aca9bd1e839e9bdf6326c5ba0b3be8deac8d818be144f7e48bb3e1b
4
- data.tar.gz: 81a3915363813d52f5d84f3935ef3fcc148f364c892ee06d213f98ef40b00c0c
3
+ metadata.gz: 8519525216a517ce9d2078552b1492c4209c2d1a8bff7b58eae88a8137a603d4
4
+ data.tar.gz: ab098d31da39a6965c57f65d4a6a282570333ed6a0a8d00886c22982b1bce76c
5
5
  SHA512:
6
- metadata.gz: 29b0c2e1a978722b2d9bc61c45ffc736b9666cb384db3d3dc898d451ecf115e6d40090271ac96914379f9ac0af2fa4d108e3cd87466f7428333a2695caf16125
7
- data.tar.gz: 57646b12967677c3668fdb6859003fa5455f49e6ab9efa3ff05abc61f703d74b872157e7d20503120696be2110ef1484797d66b4819d3f7e570e3d3a3fb65b21
6
+ metadata.gz: 209c8c32c54f5b73e90954ae9db136aa87d13e9738c48de6e89282396986cc705b1f7c64841a7358f6357d34980f023890c8031a6e10a32eb20237564fad3d30
7
+ data.tar.gz: c92af45298595d9bd433dd9b01223b45870063149042bd3d36b26d5201a794b78fffa2f79cf6d9621a80e5d65751ea7fec52756fce44a28e53de1d6a7cf79e2f
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.3.6)
4
+ kreuzberg (4.3.7)
5
5
  rb_sys (~> 0.9.119)
6
6
  sorbet-runtime (~> 0.5)
7
7
 
@@ -124,7 +124,7 @@ GEM
124
124
  rubocop (~> 1.81)
125
125
  ruby-progressbar (1.13.0)
126
126
  securerandom (0.4.1)
127
- sorbet-runtime (0.6.12942)
127
+ sorbet-runtime (0.6.12956)
128
128
  steep (1.10.0)
129
129
  activesupport (>= 5.1)
130
130
  concurrent-ruby (>= 1.1.10)
@@ -210,7 +210,7 @@ CHECKSUMS
210
210
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
211
211
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
212
212
  json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
213
- kreuzberg (4.3.6)
213
+ kreuzberg (4.3.7)
214
214
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
215
215
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
216
216
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -245,7 +245,7 @@ CHECKSUMS
245
245
  rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
246
246
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
247
247
  securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
248
- sorbet-runtime (0.6.12942) sha256=967bda04814d234e4239c4f883c1d0ee6de3e47bf8bafd2c0cc30d18df2ddd3a
248
+ sorbet-runtime (0.6.12956) sha256=fee716a62d0b1d94ebc8e6ba23e76a7654eeac66c1f5cc1e1bef78b8e9ff87c7
249
249
  steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
250
250
  strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
251
251
  terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.6" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.7" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -391,14 +391,18 @@ module Kreuzberg
391
391
  # pdf = PDF.new(extract_images: true, hierarchy: hierarchy)
392
392
  #
393
393
  class PDF
394
- attr_reader :extract_images, :passwords, :extract_metadata, :font_config, :hierarchy
394
+ attr_reader :extract_images, :passwords, :extract_metadata, :font_config, :hierarchy,
395
+ :extract_annotations, :top_margin_fraction, :bottom_margin_fraction
395
396
 
396
397
  def initialize(
397
398
  extract_images: false,
398
399
  passwords: nil,
399
400
  extract_metadata: true,
400
401
  font_config: nil,
401
- hierarchy: nil
402
+ hierarchy: nil,
403
+ extract_annotations: false,
404
+ top_margin_fraction: nil,
405
+ bottom_margin_fraction: nil
402
406
  )
403
407
  @extract_images = extract_images ? true : false
404
408
  @passwords = if passwords.is_a?(Array)
@@ -409,6 +413,9 @@ module Kreuzberg
409
413
  @extract_metadata = extract_metadata ? true : false
410
414
  @font_config = normalize_font_config(font_config)
411
415
  @hierarchy = normalize_hierarchy(hierarchy)
416
+ @extract_annotations = extract_annotations ? true : false
417
+ @top_margin_fraction = top_margin_fraction&.to_f
418
+ @bottom_margin_fraction = bottom_margin_fraction&.to_f
412
419
  end
413
420
 
414
421
  def to_h
@@ -417,7 +424,10 @@ module Kreuzberg
417
424
  passwords: @passwords,
418
425
  extract_metadata: @extract_metadata,
419
426
  font_config: @font_config&.to_h,
420
- hierarchy: @hierarchy&.to_h
427
+ hierarchy: @hierarchy&.to_h,
428
+ extract_annotations: @extract_annotations,
429
+ top_margin_fraction: @top_margin_fraction,
430
+ bottom_margin_fraction: @bottom_margin_fraction
421
431
  }.compact
422
432
  end
423
433
 
@@ -14,7 +14,7 @@ module Kreuzberg
14
14
  class Result
15
15
  attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
16
16
  :detected_languages, :chunks, :images, :pages, :elements, :ocr_elements, :djot_content,
17
- :document, :extracted_keywords, :quality_score, :processing_warnings
17
+ :document, :extracted_keywords, :quality_score, :processing_warnings, :annotations
18
18
 
19
19
  # @!attribute [r] cells
20
20
  # @return [Array<Array<String>>] Table cells (2D array)
@@ -339,6 +339,7 @@ module Kreuzberg
339
339
  @extracted_keywords = parse_extracted_keywords(get_value(hash, 'extracted_keywords'))
340
340
  @quality_score = get_value(hash, 'quality_score')
341
341
  @processing_warnings = parse_processing_warnings(get_value(hash, 'processing_warnings'))
342
+ @annotations = parse_annotations(get_value(hash, 'annotations'))
342
343
  end
343
344
  # rubocop:enable Metrics/AbcSize
344
345
 
@@ -346,6 +347,7 @@ module Kreuzberg
346
347
  #
347
348
  # @return [Hash] Hash representation
348
349
  #
350
+ # rubocop:disable Metrics/CyclomaticComplexity
349
351
  def to_h
350
352
  {
351
353
  content: @content,
@@ -362,9 +364,11 @@ module Kreuzberg
362
364
  document: @document&.to_h,
363
365
  extracted_keywords: @extracted_keywords&.map(&:to_h),
364
366
  quality_score: @quality_score,
365
- processing_warnings: @processing_warnings.map(&:to_h)
367
+ processing_warnings: @processing_warnings.map(&:to_h),
368
+ annotations: @annotations&.map(&:to_h)
366
369
  }
367
370
  end
371
+ # rubocop:enable Metrics/CyclomaticComplexity
368
372
 
369
373
  # Convert to JSON
370
374
  #
@@ -707,6 +711,32 @@ module Kreuzberg
707
711
  )
708
712
  end
709
713
  end
714
+
715
+ def parse_annotations(annotations_data)
716
+ return nil if annotations_data.nil?
717
+
718
+ annotations_data.map { |a_hash| build_annotation(a_hash) }
719
+ end
720
+
721
+ def build_annotation(a_hash)
722
+ PdfAnnotation.new(
723
+ annotation_type: a_hash['annotation_type'] || '',
724
+ content: a_hash['content'],
725
+ page_number: a_hash['page_number']&.to_i,
726
+ bounding_box: build_annotation_bbox(a_hash['bounding_box'])
727
+ )
728
+ end
729
+
730
+ def build_annotation_bbox(bbox_data)
731
+ return nil if bbox_data.nil?
732
+
733
+ PdfAnnotationBoundingBox.new(
734
+ left: bbox_data['left']&.to_f,
735
+ top: bbox_data['top']&.to_f,
736
+ right: bbox_data['right']&.to_f,
737
+ bottom: bbox_data['bottom']&.to_f
738
+ )
739
+ end
710
740
  end
711
741
  # rubocop:enable Metrics/ClassLength
712
742
  end
@@ -411,4 +411,24 @@ module Kreuzberg
411
411
 
412
412
  const :nodes, T::Array[DocumentNode]
413
413
  end
414
+
415
+ # Bounding box for a PDF annotation.
416
+ class PdfAnnotationBoundingBox < T::Struct
417
+ extend T::Sig
418
+
419
+ const :left, T.nilable(Float)
420
+ const :top, T.nilable(Float)
421
+ const :right, T.nilable(Float)
422
+ const :bottom, T.nilable(Float)
423
+ end
424
+
425
+ # A PDF annotation extracted from a document page.
426
+ class PdfAnnotation < T::Struct
427
+ extend T::Sig
428
+
429
+ const :annotation_type, String
430
+ const :content, T.nilable(String)
431
+ const :page_number, T.nilable(Integer)
432
+ const :bounding_box, T.nilable(PdfAnnotationBoundingBox)
433
+ end
414
434
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.3.6'
4
+ VERSION = '4.3.7'
5
5
  end
data/lib/kreuzberg_rb.so CHANGED
Binary file
data/sig/kreuzberg.rbs CHANGED
@@ -323,8 +323,11 @@ module Kreuzberg
323
323
  attr_reader extract_metadata: bool
324
324
  attr_reader font_config: FontConfig?
325
325
  attr_reader hierarchy: Hierarchy?
326
+ attr_reader extract_annotations: bool
327
+ attr_reader top_margin_fraction: Float?
328
+ attr_reader bottom_margin_fraction: Float?
326
329
 
327
- def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?, ?hierarchy: (Hierarchy | Hash[Symbol, untyped])?) -> void
330
+ def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?, ?hierarchy: (Hierarchy | Hash[Symbol, untyped])?, ?extract_annotations: bool, ?top_margin_fraction: Float?, ?bottom_margin_fraction: Float?) -> void
328
331
  def to_h: () -> Hash[Symbol, untyped]
329
332
  end
330
333
 
@@ -525,6 +528,15 @@ module Kreuzberg
525
528
  end
526
529
 
527
530
  # Extraction result type
531
+ type pdf_annotation_type = 'text' | 'highlight' | 'link' | 'stamp' | 'underline' | 'strike_out' | 'other'
532
+
533
+ type pdf_annotation_hash = {
534
+ annotation_type: String,
535
+ content: String?,
536
+ page_number: Integer,
537
+ bounding_box: { x0: Float, y0: Float, x1: Float, y1: Float }?
538
+ }
539
+
528
540
  type extraction_result_hash = {
529
541
  content: String,
530
542
  mime_type: String,
@@ -541,7 +553,8 @@ module Kreuzberg
541
553
  document: document_structure_hash?,
542
554
  extracted_keywords: Array[extracted_keyword_hash]?,
543
555
  quality_score: Float?,
544
- processing_warnings: Array[processing_warning_hash]?
556
+ processing_warnings: Array[processing_warning_hash]?,
557
+ annotations: Array[pdf_annotation_hash]?
545
558
  }
546
559
 
547
560
  type extracted_keyword_hash = {
@@ -1076,6 +1089,18 @@ module Kreuzberg
1076
1089
  attr_reader extracted_keywords: Array[ExtractedKeyword]?
1077
1090
  attr_reader quality_score: Float?
1078
1091
  attr_reader processing_warnings: Array[ProcessingWarning]?
1092
+ attr_reader annotations: Array[PdfAnnotation]?
1093
+
1094
+ # PDF annotation extracted from a document page (Struct from result.rb)
1095
+ class PdfAnnotation
1096
+ attr_reader annotation_type: String
1097
+ attr_reader content: String?
1098
+ attr_reader page_number: Integer
1099
+ attr_reader bounding_box: BoundingBox?
1100
+
1101
+ def initialize: (annotation_type: String, content: String?, page_number: Integer, bounding_box: BoundingBox?) -> void
1102
+ def to_h: () -> pdf_annotation_hash
1103
+ end
1079
1104
 
1080
1105
  def initialize: (extraction_result_hash hash) -> void
1081
1106
  def to_h: () -> Hash[Symbol, untyped]
@@ -1113,6 +1138,7 @@ module Kreuzberg
1113
1138
  def parse_ocr_geometry: (Hash[String, untyped]? data) -> OcrBoundingGeometry?
1114
1139
  def parse_ocr_confidence: (Hash[String, untyped]? data) -> OcrConfidence?
1115
1140
  def parse_ocr_rotation: (Hash[String, untyped]? data) -> OcrRotation?
1141
+ def parse_annotations: (Array[pdf_annotation_hash]? annotations_data) -> Array[PdfAnnotation]?
1116
1142
  end
1117
1143
 
1118
1144
  # Module methods (extraction API)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.3.6
4
+ version: 4.3.7
5
5
  platform: aarch64-linux
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-02-19 00:00:00.000000000 Z
11
+ date: 2026-02-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler