kreuzberg 4.8.5-aarch64-linux → 4.9.0-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3a8ebc29c703cfb07252f68d86a5dd37f17eb06d79bccce78f82eada2d732e9c
4
- data.tar.gz: e711b683c3bdfa37ea1b43bc9e50bc7f21a4e2fca9e4dee4e8c2a2c2664e1bc5
3
+ metadata.gz: c7a0c15c0ef0421fe0b2fe83a5feba779f2aa18a1be817b5c7b2f52323601531
4
+ data.tar.gz: 8553c7bcee1466b95ead2688b219d8953ddd941590e55a589f9c7b98317a45a5
5
5
  SHA512:
6
- metadata.gz: a50806a6cee6edfcf55db3a62d748c293c7b01f20f2744540d520489ca6bbbfe89d65fb3d0132e5e3f723121b835fdc99ac3856559910f227f708e43931070d9
7
- data.tar.gz: 26c2f577491b2be1f89e7f49832e308bdc7aabb7453b761a7a72692316a8fce84268d346a3adc545e1308a1b7f3b471d4b671b3f9f4a9ac1ef65dd07139ddb62
6
+ metadata.gz: 5b3e598e8cdecc91dbe6d6e68772f9fc3ceac7688d87d50b70e5120fb8957dafd4abe47dec6961b3d2046741b3a1bf0d8d618e0f8a36c255e687bbb290d2bd20
7
+ data.tar.gz: 8e82431165e64e89141d3077d2068a8435a08daf9d4708499121936493995dd874ba561a6c4b791f14be746bb91bd4d15ef444d6163a754ab5b555bc87c1f2eb
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.5" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.0" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -116,5 +116,8 @@ module Kreuzberg
116
116
 
117
117
  # Raised when embedding fails
118
118
  class EmbeddingError < Error; end
119
+
120
+ # Raised when an extraction is cancelled via a cancellation token
121
+ class CancelledError < Error; end
119
122
  end
120
123
  end
@@ -15,7 +15,7 @@ module Kreuzberg
15
15
  attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
16
16
  :detected_languages, :chunks, :images, :pages, :elements, :ocr_elements, :djot_content,
17
17
  :document, :extracted_keywords, :quality_score, :processing_warnings, :annotations,
18
- :uris, :children
18
+ :uris, :children, :structured_output
19
19
 
20
20
  # @!attribute [r] cells
21
21
  # @return [Array<Array<String>>] Table cells (2D array)
@@ -145,7 +145,7 @@ module Kreuzberg
145
145
  # @return [Array<Image>] Images on this page
146
146
  # @!attribute [r] hierarchy
147
147
  # @return [PageHierarchy, nil] Hierarchy information for the page
148
- PageContent = Struct.new(:page_number, :content, :tables, :images, :hierarchy, :is_blank) do
148
+ PageContent = Struct.new(:page_number, :content, :tables, :images, :hierarchy, :is_blank, :layout_regions) do
149
149
  def to_h
150
150
  {
151
151
  page_number: page_number,
@@ -153,7 +153,27 @@ module Kreuzberg
153
153
  tables: tables.map(&:to_h),
154
154
  images: images.map(&:to_h),
155
155
  hierarchy: hierarchy&.to_h,
156
- is_blank: is_blank
156
+ is_blank: is_blank,
157
+ layout_regions: layout_regions&.map(&:to_h)
158
+ }
159
+ end
160
+ end
161
+
162
+ # @!attribute [r] class_name
163
+ # @return [String] Layout class name (e.g. "picture", "table", "text")
164
+ # @!attribute [r] confidence
165
+ # @return [Float] Detection confidence score (0.0 to 1.0)
166
+ # @!attribute [r] bounding_box
167
+ # @return [ElementBoundingBox] Bounding box in document coordinate space
168
+ # @!attribute [r] area_fraction
169
+ # @return [Float] Fraction of page area covered (0.0 to 1.0)
170
+ LayoutRegion = Struct.new(:class_name, :confidence, :bounding_box, :area_fraction) do
171
+ def to_h
172
+ {
173
+ class: class_name,
174
+ confidence: confidence,
175
+ bounding_box: bounding_box&.to_h,
176
+ area_fraction: area_fraction
157
177
  }
158
178
  end
159
179
  end
@@ -342,6 +362,7 @@ module Kreuzberg
342
362
  @annotations = parse_annotations(get_value(hash, 'annotations'))
343
363
  @uris = parse_uris(get_value(hash, 'uris'))
344
364
  @children = parse_children(get_value(hash, 'children'))
365
+ @structured_output = get_value(hash, 'structured_output')
345
366
  end
346
367
  # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
347
368
 
@@ -369,7 +390,8 @@ module Kreuzberg
369
390
  processing_warnings: @processing_warnings.map(&:to_h),
370
391
  annotations: @annotations&.map(&:to_h),
371
392
  uris: @uris&.map(&:to_h),
372
- children: @children&.map(&:to_h)
393
+ children: @children&.map(&:to_h),
394
+ structured_output: @structured_output
373
395
  }
374
396
  end
375
397
  # rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength
@@ -568,11 +590,36 @@ module Kreuzberg
568
590
  tables: parse_tables(page_hash['tables']),
569
591
  images: parse_images(page_hash['images']),
570
592
  hierarchy: parse_page_hierarchy(page_hash['hierarchy']),
571
- is_blank: page_hash['is_blank']
593
+ is_blank: page_hash['is_blank'],
594
+ layout_regions: parse_layout_regions(page_hash['layout_regions'])
572
595
  )
573
596
  end
574
597
  end
575
598
 
599
+ def parse_layout_regions(regions_data)
600
+ return nil if regions_data.nil?
601
+
602
+ regions_data.map do |region_hash|
603
+ LayoutRegion.new(
604
+ class_name: region_hash['class'],
605
+ confidence: region_hash['confidence']&.to_f,
606
+ bounding_box: parse_element_bounding_box(region_hash['bounding_box']),
607
+ area_fraction: region_hash['area_fraction']&.to_f
608
+ )
609
+ end
610
+ end
611
+
612
+ def parse_element_bounding_box(bounding_box_data)
613
+ return nil if bounding_box_data.nil?
614
+
615
+ ElementBoundingBox.new(
616
+ x0: bounding_box_data['x0'].to_f,
617
+ y0: bounding_box_data['y0'].to_f,
618
+ x1: bounding_box_data['x1'].to_f,
619
+ y1: bounding_box_data['y1'].to_f
620
+ )
621
+ end
622
+
576
623
  def parse_page_hierarchy(hierarchy_data)
577
624
  return nil if hierarchy_data.nil?
578
625
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.8.5'
4
+ VERSION = '4.9.0'
5
5
  end
data/lib/kreuzberg_rb.so CHANGED
Binary file
data/sig/kreuzberg.rbs CHANGED
@@ -596,13 +596,15 @@ module Kreuzberg
596
596
  attr_reader batch_size: Integer?
597
597
  attr_reader show_download_progress: bool?
598
598
  attr_reader cache_dir: String?
599
+ attr_reader acceleration: Acceleration?
599
600
 
600
601
  def initialize: (
601
602
  ?model: Hash[Symbol, untyped],
602
603
  ?normalize: bool?,
603
604
  ?batch_size: Integer?,
604
605
  ?show_download_progress: bool?,
605
- ?cache_dir: String?
606
+ ?cache_dir: String?,
607
+ ?acceleration: (Acceleration | Hash[Symbol, untyped])?
606
608
  ) -> void
607
609
  def to_h: () -> Hash[Symbol, untyped]
608
610
  end
@@ -799,8 +801,9 @@ module Kreuzberg
799
801
  attr_reader confidence_threshold: Float?
800
802
  attr_reader apply_heuristics: bool
801
803
  attr_reader table_model: String?
804
+ attr_reader acceleration: Acceleration?
802
805
 
803
- def initialize: (?preset: String, ?confidence_threshold: Float?, ?apply_heuristics: bool, ?table_model: String?) -> void
806
+ def initialize: (?preset: String, ?confidence_threshold: Float?, ?apply_heuristics: bool, ?table_model: String?, ?acceleration: (Acceleration | Hash[Symbol, untyped])?) -> void
804
807
  def to_h: () -> Hash[Symbol, untyped]
805
808
  end
806
809
 
@@ -1009,7 +1012,8 @@ module Kreuzberg
1009
1012
  content: String,
1010
1013
  tables: Array[table_hash],
1011
1014
  images: Array[image_hash],
1012
- is_blank: bool?
1015
+ is_blank: bool?,
1016
+ layout_regions: Array[untyped]?
1013
1017
  }
1014
1018
 
1015
1019
  type djot_content_hash = {
@@ -1280,6 +1284,17 @@ module Kreuzberg
1280
1284
  def to_h: () -> image_hash
1281
1285
  end
1282
1286
 
1287
+ # Layout detection region on a page (Struct from result.rb)
1288
+ class LayoutRegion
1289
+ attr_reader class_name: String
1290
+ attr_reader confidence: Float
1291
+ attr_reader bounding_box: ElementBoundingBox?
1292
+ attr_reader area_fraction: Float
1293
+
1294
+ def initialize: (class_name: String, confidence: Float, bounding_box: ElementBoundingBox?, area_fraction: Float) -> void
1295
+ def to_h: () -> Hash[Symbol, untyped]
1296
+ end
1297
+
1283
1298
  # Page content with text and extracted elements (Struct from result.rb)
1284
1299
  class PageContent
1285
1300
  attr_reader page_number: Integer
@@ -1288,8 +1303,9 @@ module Kreuzberg
1288
1303
  attr_reader images: Array[Image]?
1289
1304
  attr_reader hierarchy: PageHierarchy?
1290
1305
  attr_reader is_blank: bool?
1306
+ attr_reader layout_regions: Array[LayoutRegion]?
1291
1307
 
1292
- def initialize: (page_number: Integer, content: String, tables: Array[Table], images: Array[Image]?, hierarchy: PageHierarchy?, is_blank: bool?) -> void
1308
+ def initialize: (page_number: Integer, content: String, tables: Array[Table], images: Array[Image]?, hierarchy: PageHierarchy?, is_blank: bool?, layout_regions: Array[LayoutRegion]?) -> void
1293
1309
  def to_h: () -> Hash[Symbol, untyped]
1294
1310
  end
1295
1311
 
@@ -1548,6 +1564,7 @@ module Kreuzberg
1548
1564
  attr_reader uris: Array[uri_hash]?
1549
1565
  attr_reader children: Array[archive_entry_hash]?
1550
1566
  attr_reader llm_usage: Array[LlmUsage]?
1567
+ attr_reader structured_output: (Hash[String, untyped] | Array[untyped] | Integer | Float | String | bool | nil)
1551
1568
 
1552
1569
  # PDF annotation extracted from a document page (Struct from result.rb)
1553
1570
  class PdfAnnotation
@@ -1581,6 +1598,8 @@ module Kreuzberg
1581
1598
  def parse_element: (Hash[String, untyped] element_hash) -> ElementStruct
1582
1599
  def parse_element_coordinates: (Hash[String, untyped]? coordinates_data) -> ElementBoundingBox?
1583
1600
  def parse_ocr_elements: (Array[ocr_element_hash]? ocr_elements_data) -> Array[OcrElement]?
1601
+ def parse_layout_regions: (Array[untyped]? regions_data) -> Array[LayoutRegion]?
1602
+ def parse_element_bounding_box: (Hash[String, untyped]? bounding_box_data) -> ElementBoundingBox?
1584
1603
  def parse_page_hierarchy: (Hash[String, untyped]? hierarchy_data) -> PageHierarchy?
1585
1604
  def parse_djot_content: (Hash[String, untyped]? djot_data) -> DjotContent?
1586
1605
  def parse_document_structure: (Hash[String, untyped]? document_data) -> DocumentStructure?
@@ -1812,6 +1831,9 @@ module Kreuzberg
1812
1831
 
1813
1832
  class EmbeddingError < Error
1814
1833
  end
1834
+
1835
+ class CancelledError < Error
1836
+ end
1815
1837
  end
1816
1838
 
1817
1839
  # Internal modules (prepended to Kreuzberg singleton)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.8.5
4
+ version: 4.9.0
5
5
  platform: aarch64-linux
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-04-14 00:00:00.000000000 Z
11
+ date: 2026-04-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -86,14 +86,14 @@ dependencies:
86
86
  requirements:
87
87
  - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: '3.0'
89
+ version: '4.0'
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: '3.0'
96
+ version: '4.0'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: rubocop
99
99
  requirement: !ruby/object:Gem::Requirement
@@ -142,14 +142,14 @@ dependencies:
142
142
  requirements:
143
143
  - - "~>"
144
144
  - !ruby/object:Gem::Version
145
- version: '1.8'
145
+ version: '2.0'
146
146
  type: :development
147
147
  prerelease: false
148
148
  version_requirements: !ruby/object:Gem::Requirement
149
149
  requirements:
150
150
  - - "~>"
151
151
  - !ruby/object:Gem::Version
152
- version: '1.8'
152
+ version: '2.0'
153
153
  - !ruby/object:Gem::Dependency
154
154
  name: yard
155
155
  requirement: !ruby/object:Gem::Requirement