kreuzberg 4.8.5-aarch64-linux → 4.9.0-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/kreuzberg/errors.rb +3 -0
- data/lib/kreuzberg/result.rb +52 -5
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg_rb.so +0 -0
- data/sig/kreuzberg.rbs +26 -4
- metadata +6 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c7a0c15c0ef0421fe0b2fe83a5feba779f2aa18a1be817b5c7b2f52323601531
|
|
4
|
+
data.tar.gz: 8553c7bcee1466b95ead2688b219d8953ddd941590e55a589f9c7b98317a45a5
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5b3e598e8cdecc91dbe6d6e68772f9fc3ceac7688d87d50b70e5120fb8957dafd4abe47dec6961b3d2046741b3a1bf0d8d618e0f8a36c255e687bbb290d2bd20
|
|
7
|
+
data.tar.gz: 8e82431165e64e89141d3077d2068a8435a08daf9d4708499121936493995dd874ba561a6c4b791f14be746bb91bd4d15ef444d6163a754ab5b555bc87c1f2eb
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.5" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.0" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
data/lib/kreuzberg/errors.rb
CHANGED
data/lib/kreuzberg/result.rb
CHANGED
|
@@ -15,7 +15,7 @@ module Kreuzberg
|
|
|
15
15
|
attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
|
|
16
16
|
:detected_languages, :chunks, :images, :pages, :elements, :ocr_elements, :djot_content,
|
|
17
17
|
:document, :extracted_keywords, :quality_score, :processing_warnings, :annotations,
|
|
18
|
-
:uris, :children
|
|
18
|
+
:uris, :children, :structured_output
|
|
19
19
|
|
|
20
20
|
# @!attribute [r] cells
|
|
21
21
|
# @return [Array<Array<String>>] Table cells (2D array)
|
|
@@ -145,7 +145,7 @@ module Kreuzberg
|
|
|
145
145
|
# @return [Array<Image>] Images on this page
|
|
146
146
|
# @!attribute [r] hierarchy
|
|
147
147
|
# @return [PageHierarchy, nil] Hierarchy information for the page
|
|
148
|
-
PageContent = Struct.new(:page_number, :content, :tables, :images, :hierarchy, :is_blank) do
|
|
148
|
+
PageContent = Struct.new(:page_number, :content, :tables, :images, :hierarchy, :is_blank, :layout_regions) do
|
|
149
149
|
def to_h
|
|
150
150
|
{
|
|
151
151
|
page_number: page_number,
|
|
@@ -153,7 +153,27 @@ module Kreuzberg
|
|
|
153
153
|
tables: tables.map(&:to_h),
|
|
154
154
|
images: images.map(&:to_h),
|
|
155
155
|
hierarchy: hierarchy&.to_h,
|
|
156
|
-
is_blank: is_blank
|
|
156
|
+
is_blank: is_blank,
|
|
157
|
+
layout_regions: layout_regions&.map(&:to_h)
|
|
158
|
+
}
|
|
159
|
+
end
|
|
160
|
+
end
|
|
161
|
+
|
|
162
|
+
# @!attribute [r] class_name
|
|
163
|
+
# @return [String] Layout class name (e.g. "picture", "table", "text")
|
|
164
|
+
# @!attribute [r] confidence
|
|
165
|
+
# @return [Float] Detection confidence score (0.0 to 1.0)
|
|
166
|
+
# @!attribute [r] bounding_box
|
|
167
|
+
# @return [ElementBoundingBox] Bounding box in document coordinate space
|
|
168
|
+
# @!attribute [r] area_fraction
|
|
169
|
+
# @return [Float] Fraction of page area covered (0.0 to 1.0)
|
|
170
|
+
LayoutRegion = Struct.new(:class_name, :confidence, :bounding_box, :area_fraction) do
|
|
171
|
+
def to_h
|
|
172
|
+
{
|
|
173
|
+
class: class_name,
|
|
174
|
+
confidence: confidence,
|
|
175
|
+
bounding_box: bounding_box&.to_h,
|
|
176
|
+
area_fraction: area_fraction
|
|
157
177
|
}
|
|
158
178
|
end
|
|
159
179
|
end
|
|
@@ -342,6 +362,7 @@ module Kreuzberg
|
|
|
342
362
|
@annotations = parse_annotations(get_value(hash, 'annotations'))
|
|
343
363
|
@uris = parse_uris(get_value(hash, 'uris'))
|
|
344
364
|
@children = parse_children(get_value(hash, 'children'))
|
|
365
|
+
@structured_output = get_value(hash, 'structured_output')
|
|
345
366
|
end
|
|
346
367
|
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
|
347
368
|
|
|
@@ -369,7 +390,8 @@ module Kreuzberg
|
|
|
369
390
|
processing_warnings: @processing_warnings.map(&:to_h),
|
|
370
391
|
annotations: @annotations&.map(&:to_h),
|
|
371
392
|
uris: @uris&.map(&:to_h),
|
|
372
|
-
children: @children&.map(&:to_h)
|
|
393
|
+
children: @children&.map(&:to_h),
|
|
394
|
+
structured_output: @structured_output
|
|
373
395
|
}
|
|
374
396
|
end
|
|
375
397
|
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength
|
|
@@ -568,11 +590,36 @@ module Kreuzberg
|
|
|
568
590
|
tables: parse_tables(page_hash['tables']),
|
|
569
591
|
images: parse_images(page_hash['images']),
|
|
570
592
|
hierarchy: parse_page_hierarchy(page_hash['hierarchy']),
|
|
571
|
-
is_blank: page_hash['is_blank']
|
|
593
|
+
is_blank: page_hash['is_blank'],
|
|
594
|
+
layout_regions: parse_layout_regions(page_hash['layout_regions'])
|
|
572
595
|
)
|
|
573
596
|
end
|
|
574
597
|
end
|
|
575
598
|
|
|
599
|
+
def parse_layout_regions(regions_data)
|
|
600
|
+
return nil if regions_data.nil?
|
|
601
|
+
|
|
602
|
+
regions_data.map do |region_hash|
|
|
603
|
+
LayoutRegion.new(
|
|
604
|
+
class_name: region_hash['class'],
|
|
605
|
+
confidence: region_hash['confidence']&.to_f,
|
|
606
|
+
bounding_box: parse_element_bounding_box(region_hash['bounding_box']),
|
|
607
|
+
area_fraction: region_hash['area_fraction']&.to_f
|
|
608
|
+
)
|
|
609
|
+
end
|
|
610
|
+
end
|
|
611
|
+
|
|
612
|
+
def parse_element_bounding_box(bounding_box_data)
|
|
613
|
+
return nil if bounding_box_data.nil?
|
|
614
|
+
|
|
615
|
+
ElementBoundingBox.new(
|
|
616
|
+
x0: bounding_box_data['x0'].to_f,
|
|
617
|
+
y0: bounding_box_data['y0'].to_f,
|
|
618
|
+
x1: bounding_box_data['x1'].to_f,
|
|
619
|
+
y1: bounding_box_data['y1'].to_f
|
|
620
|
+
)
|
|
621
|
+
end
|
|
622
|
+
|
|
576
623
|
def parse_page_hierarchy(hierarchy_data)
|
|
577
624
|
return nil if hierarchy_data.nil?
|
|
578
625
|
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg_rb.so
CHANGED
|
Binary file
|
data/sig/kreuzberg.rbs
CHANGED
|
@@ -596,13 +596,15 @@ module Kreuzberg
|
|
|
596
596
|
attr_reader batch_size: Integer?
|
|
597
597
|
attr_reader show_download_progress: bool?
|
|
598
598
|
attr_reader cache_dir: String?
|
|
599
|
+
attr_reader acceleration: Acceleration?
|
|
599
600
|
|
|
600
601
|
def initialize: (
|
|
601
602
|
?model: Hash[Symbol, untyped],
|
|
602
603
|
?normalize: bool?,
|
|
603
604
|
?batch_size: Integer?,
|
|
604
605
|
?show_download_progress: bool?,
|
|
605
|
-
?cache_dir: String
|
|
606
|
+
?cache_dir: String?,
|
|
607
|
+
?acceleration: (Acceleration | Hash[Symbol, untyped])?
|
|
606
608
|
) -> void
|
|
607
609
|
def to_h: () -> Hash[Symbol, untyped]
|
|
608
610
|
end
|
|
@@ -799,8 +801,9 @@ module Kreuzberg
|
|
|
799
801
|
attr_reader confidence_threshold: Float?
|
|
800
802
|
attr_reader apply_heuristics: bool
|
|
801
803
|
attr_reader table_model: String?
|
|
804
|
+
attr_reader acceleration: Acceleration?
|
|
802
805
|
|
|
803
|
-
def initialize: (?preset: String, ?confidence_threshold: Float?, ?apply_heuristics: bool, ?table_model: String?) -> void
|
|
806
|
+
def initialize: (?preset: String, ?confidence_threshold: Float?, ?apply_heuristics: bool, ?table_model: String?, ?acceleration: (Acceleration | Hash[Symbol, untyped])?) -> void
|
|
804
807
|
def to_h: () -> Hash[Symbol, untyped]
|
|
805
808
|
end
|
|
806
809
|
|
|
@@ -1009,7 +1012,8 @@ module Kreuzberg
|
|
|
1009
1012
|
content: String,
|
|
1010
1013
|
tables: Array[table_hash],
|
|
1011
1014
|
images: Array[image_hash],
|
|
1012
|
-
is_blank: bool
|
|
1015
|
+
is_blank: bool?,
|
|
1016
|
+
layout_regions: Array[untyped]?
|
|
1013
1017
|
}
|
|
1014
1018
|
|
|
1015
1019
|
type djot_content_hash = {
|
|
@@ -1280,6 +1284,17 @@ module Kreuzberg
|
|
|
1280
1284
|
def to_h: () -> image_hash
|
|
1281
1285
|
end
|
|
1282
1286
|
|
|
1287
|
+
# Layout detection region on a page (Struct from result.rb)
|
|
1288
|
+
class LayoutRegion
|
|
1289
|
+
attr_reader class_name: String
|
|
1290
|
+
attr_reader confidence: Float
|
|
1291
|
+
attr_reader bounding_box: ElementBoundingBox?
|
|
1292
|
+
attr_reader area_fraction: Float
|
|
1293
|
+
|
|
1294
|
+
def initialize: (class_name: String, confidence: Float, bounding_box: ElementBoundingBox?, area_fraction: Float) -> void
|
|
1295
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
1296
|
+
end
|
|
1297
|
+
|
|
1283
1298
|
# Page content with text and extracted elements (Struct from result.rb)
|
|
1284
1299
|
class PageContent
|
|
1285
1300
|
attr_reader page_number: Integer
|
|
@@ -1288,8 +1303,9 @@ module Kreuzberg
|
|
|
1288
1303
|
attr_reader images: Array[Image]?
|
|
1289
1304
|
attr_reader hierarchy: PageHierarchy?
|
|
1290
1305
|
attr_reader is_blank: bool?
|
|
1306
|
+
attr_reader layout_regions: Array[LayoutRegion]?
|
|
1291
1307
|
|
|
1292
|
-
def initialize: (page_number: Integer, content: String, tables: Array[Table], images: Array[Image]?, hierarchy: PageHierarchy?, is_blank: bool?) -> void
|
|
1308
|
+
def initialize: (page_number: Integer, content: String, tables: Array[Table], images: Array[Image]?, hierarchy: PageHierarchy?, is_blank: bool?, layout_regions: Array[LayoutRegion]?) -> void
|
|
1293
1309
|
def to_h: () -> Hash[Symbol, untyped]
|
|
1294
1310
|
end
|
|
1295
1311
|
|
|
@@ -1548,6 +1564,7 @@ module Kreuzberg
|
|
|
1548
1564
|
attr_reader uris: Array[uri_hash]?
|
|
1549
1565
|
attr_reader children: Array[archive_entry_hash]?
|
|
1550
1566
|
attr_reader llm_usage: Array[LlmUsage]?
|
|
1567
|
+
attr_reader structured_output: (Hash[String, untyped] | Array[untyped] | Integer | Float | String | bool | nil)
|
|
1551
1568
|
|
|
1552
1569
|
# PDF annotation extracted from a document page (Struct from result.rb)
|
|
1553
1570
|
class PdfAnnotation
|
|
@@ -1581,6 +1598,8 @@ module Kreuzberg
|
|
|
1581
1598
|
def parse_element: (Hash[String, untyped] element_hash) -> ElementStruct
|
|
1582
1599
|
def parse_element_coordinates: (Hash[String, untyped]? coordinates_data) -> ElementBoundingBox?
|
|
1583
1600
|
def parse_ocr_elements: (Array[ocr_element_hash]? ocr_elements_data) -> Array[OcrElement]?
|
|
1601
|
+
def parse_layout_regions: (Array[untyped]? regions_data) -> Array[LayoutRegion]?
|
|
1602
|
+
def parse_element_bounding_box: (Hash[String, untyped]? bounding_box_data) -> ElementBoundingBox?
|
|
1584
1603
|
def parse_page_hierarchy: (Hash[String, untyped]? hierarchy_data) -> PageHierarchy?
|
|
1585
1604
|
def parse_djot_content: (Hash[String, untyped]? djot_data) -> DjotContent?
|
|
1586
1605
|
def parse_document_structure: (Hash[String, untyped]? document_data) -> DocumentStructure?
|
|
@@ -1812,6 +1831,9 @@ module Kreuzberg
|
|
|
1812
1831
|
|
|
1813
1832
|
class EmbeddingError < Error
|
|
1814
1833
|
end
|
|
1834
|
+
|
|
1835
|
+
class CancelledError < Error
|
|
1836
|
+
end
|
|
1815
1837
|
end
|
|
1816
1838
|
|
|
1817
1839
|
# Internal modules (prepended to Kreuzberg singleton)
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.8.5
|
|
4
|
+
version: 4.9.0
|
|
5
5
|
platform: aarch64-linux
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-
|
|
11
|
+
date: 2026-04-18 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -86,14 +86,14 @@ dependencies:
|
|
|
86
86
|
requirements:
|
|
87
87
|
- - "~>"
|
|
88
88
|
- !ruby/object:Gem::Version
|
|
89
|
-
version: '
|
|
89
|
+
version: '4.0'
|
|
90
90
|
type: :development
|
|
91
91
|
prerelease: false
|
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
|
93
93
|
requirements:
|
|
94
94
|
- - "~>"
|
|
95
95
|
- !ruby/object:Gem::Version
|
|
96
|
-
version: '
|
|
96
|
+
version: '4.0'
|
|
97
97
|
- !ruby/object:Gem::Dependency
|
|
98
98
|
name: rubocop
|
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -142,14 +142,14 @@ dependencies:
|
|
|
142
142
|
requirements:
|
|
143
143
|
- - "~>"
|
|
144
144
|
- !ruby/object:Gem::Version
|
|
145
|
-
version: '
|
|
145
|
+
version: '2.0'
|
|
146
146
|
type: :development
|
|
147
147
|
prerelease: false
|
|
148
148
|
version_requirements: !ruby/object:Gem::Requirement
|
|
149
149
|
requirements:
|
|
150
150
|
- - "~>"
|
|
151
151
|
- !ruby/object:Gem::Version
|
|
152
|
-
version: '
|
|
152
|
+
version: '2.0'
|
|
153
153
|
- !ruby/object:Gem::Dependency
|
|
154
154
|
name: yard
|
|
155
155
|
requirement: !ruby/object:Gem::Requirement
|