kreuzberg 4.2.4 → 4.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 037c280a2c425e3e2f9f0664d63ec8760ec5751714c7cb3178b1d8d8dd004999
4
- data.tar.gz: 518f80241e0c7b4515276b37261fc20114f3928740e2809e3a5519a6a3ae11a1
3
+ metadata.gz: 7d0b60c586fdfacb40e14d02a0f21f0d8844e347095a7d566a310ca5d3600e82
4
+ data.tar.gz: 5319a67e46fa8422b2f37b82b06329dfb4322c4afa0182a5ca434f3068cde531
5
5
  SHA512:
6
- metadata.gz: 2f6cdde03849c18c54c587e3a500c250458b2960da207fd351b76c202842189b5eb24bd5eab5d4a5355f0faff8aeebf81fb0c5b8416aed2f75778c820f3ae000
7
- data.tar.gz: a19437bcb6cf06382456718d1c53b06d065f905c55dd1e75ae5503203e1984417bb6f7468dde31af5b2c695ddc5d119d59204238086067c96cd15f659eaed566
6
+ metadata.gz: 4670ecee61b6ba3f0e13979587a746f3a57e7910408984f73e504258bd1a8ef053421a3e1c6ea99d86cd79f8dae899dc64c13b0ff5e997a2eda2fff2a08ba179
7
+ data.tar.gz: dd750e605c25e6db09093ea50c9ab44aa1b7c7fd8141ad9bb6c691edabbba553e455a358809bfafbab1a450f6c22127733eb63c5dff4737385dd3f6d200848fb
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.2.4)
4
+ kreuzberg (4.2.6)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -121,7 +121,7 @@ GEM
121
121
  rubocop (~> 1.81)
122
122
  ruby-progressbar (1.13.0)
123
123
  securerandom (0.4.1)
124
- sorbet-runtime (0.6.12904)
124
+ sorbet-runtime (0.6.12908)
125
125
  steep (1.10.0)
126
126
  activesupport (>= 5.1)
127
127
  concurrent-ruby (>= 1.1.10)
@@ -207,7 +207,7 @@ CHECKSUMS
207
207
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
208
208
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
209
209
  json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
210
- kreuzberg (4.2.4)
210
+ kreuzberg (4.2.6)
211
211
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
212
212
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
213
213
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -242,7 +242,7 @@ CHECKSUMS
242
242
  rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
243
243
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
244
244
  securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
245
- sorbet-runtime (0.6.12904) sha256=0bf2ea0d70de7f3896ec1db3fbdbe89be970ddc1b92406630fd7411a3a8b1bd0
245
+ sorbet-runtime (0.6.12908) sha256=229f43e76527b01c5291c00c43cc29ccebe437a87b34925c3ee250ebf23d328e
246
246
  steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
247
247
  strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
248
248
  terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.4" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.6" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -877,7 +877,7 @@ module Kreuzberg
877
877
  chunking: @chunking&.to_h,
878
878
  language_detection: @language_detection&.to_h,
879
879
  pdf_options: @pdf_options&.to_h,
880
- images: @images&.to_h,
880
+ image_extraction: @images&.to_h,
881
881
  postprocessor: @postprocessor&.to_h,
882
882
  token_reduction: @token_reduction&.to_h,
883
883
  keywords: @keywords&.to_h,
@@ -103,13 +103,48 @@ module Kreuzberg
103
103
  # @return [Array<Table>] Tables on this page
104
104
  # @!attribute [r] images
105
105
  # @return [Array<Image>] Images on this page
106
- PageContent = Struct.new(:page_number, :content, :tables, :images, keyword_init: true) do
106
+ # @!attribute [r] text
107
+ # @return [String] The text content of this block
108
+ # @!attribute [r] font_size
109
+ # @return [Float] The font size of the text
110
+ # @!attribute [r] level
111
+ # @return [String] The hierarchy level (h1-h6 or body)
112
+ # @!attribute [r] bbox
113
+ # @return [Array<Float>, nil] Bounding box (left, top, right, bottom)
114
+ HierarchicalBlock = Struct.new(:text, :font_size, :level, :bbox, keyword_init: true) do
115
+ def to_h
116
+ { text: text, font_size: font_size, level: level, bbox: bbox }
117
+ end
118
+ end
119
+
120
+ # @!attribute [r] block_count
121
+ # @return [Integer] Number of hierarchy blocks
122
+ # @!attribute [r] blocks
123
+ # @return [Array<HierarchicalBlock>] Hierarchical blocks
124
+ PageHierarchy = Struct.new(:block_count, :blocks, keyword_init: true) do
125
+ def to_h
126
+ { block_count: block_count, blocks: blocks.map(&:to_h) }
127
+ end
128
+ end
129
+
130
+ # @!attribute [r] page_number
131
+ # @return [Integer] Page number (1-indexed)
132
+ # @!attribute [r] content
133
+ # @return [String] Text content for this page
134
+ # @!attribute [r] tables
135
+ # @return [Array<Table>] Tables on this page
136
+ # @!attribute [r] images
137
+ # @return [Array<Image>] Images on this page
138
+ # @!attribute [r] hierarchy
139
+ # @return [PageHierarchy, nil] Hierarchy information for the page
140
+ PageContent = Struct.new(:page_number, :content, :tables, :images, :hierarchy, keyword_init: true) do
107
141
  def to_h
108
142
  {
109
143
  page_number: page_number,
110
144
  content: content,
111
145
  tables: tables.map(&:to_h),
112
- images: images.map(&:to_h)
146
+ images: images.map(&:to_h),
147
+ hierarchy: hierarchy&.to_h
113
148
  }
114
149
  end
115
150
  end
@@ -397,9 +432,28 @@ module Kreuzberg
397
432
  page_number: page_hash['page_number'],
398
433
  content: page_hash['content'],
399
434
  tables: parse_tables(page_hash['tables']),
400
- images: parse_images(page_hash['images'])
435
+ images: parse_images(page_hash['images']),
436
+ hierarchy: parse_page_hierarchy(page_hash['hierarchy'])
437
+ )
438
+ end
439
+ end
440
+
441
+ def parse_page_hierarchy(hierarchy_data)
442
+ return nil if hierarchy_data.nil?
443
+
444
+ blocks = (hierarchy_data['blocks'] || []).map do |block_hash|
445
+ HierarchicalBlock.new(
446
+ text: block_hash['text'],
447
+ font_size: block_hash['font_size']&.to_f,
448
+ level: block_hash['level'],
449
+ bbox: block_hash['bbox']
401
450
  )
402
451
  end
452
+
453
+ PageHierarchy.new(
454
+ block_count: hierarchy_data['block_count'] || 0,
455
+ blocks: blocks
456
+ )
403
457
  end
404
458
 
405
459
  def parse_elements(elements_data)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.2.4'
4
+ VERSION = '4.2.6'
5
5
  end
data/sig/kreuzberg.rbs CHANGED
@@ -195,7 +195,6 @@ module Kreuzberg
195
195
  attr_reader language_detection: LanguageDetection?
196
196
  attr_reader pdf_options: PDF?
197
197
  attr_reader image_extraction: ImageExtraction?
198
- attr_reader image_preprocessing: ImagePreprocessing?
199
198
  attr_reader postprocessor: PostProcessor?
200
199
  attr_reader token_reduction: TokenReduction?
201
200
  attr_reader keywords: Keywords?
@@ -215,7 +214,6 @@ module Kreuzberg
215
214
  ?language_detection: (LanguageDetection | Hash[Symbol, untyped])?,
216
215
  ?pdf_options: (PDF | Hash[Symbol, untyped])?,
217
216
  ?image_extraction: (ImageExtraction | Hash[Symbol, untyped])?,
218
- ?image_preprocessing: (ImagePreprocessing | Hash[Symbol, untyped])?,
219
217
  ?postprocessor: (PostProcessor | Hash[Symbol, untyped])?,
220
218
  ?token_reduction: (TokenReduction | Hash[Symbol, untyped])?,
221
219
  ?keywords: (Keywords | Hash[Symbol, untyped])?,
@@ -13,7 +13,6 @@ RSpec.describe Kreuzberg::Config::Extraction do
13
13
  expect(config.language_detection).to be_nil
14
14
  expect(config.pdf_options).to be_nil
15
15
  expect(config.image_extraction).to be_nil
16
- expect(config.image_preprocessing).to be_nil
17
16
  expect(config.postprocessor).to be_nil
18
17
  expect(config.token_reduction).to be_nil
19
18
  expect(config.keywords).to be_nil
@@ -258,7 +257,6 @@ RSpec.describe Kreuzberg::Config::Extraction do
258
257
  language_detection: { enabled: true },
259
258
  pdf_options: { extract_images: true },
260
259
  image_extraction: { target_dpi: 600 },
261
- image_preprocessing: { denoise: true },
262
260
  postprocessor: { enabled: true },
263
261
  token_reduction: { mode: 'light' },
264
262
  keywords: { algorithm: 'yake' },
@@ -401,7 +399,6 @@ RSpec.describe Kreuzberg::Config::Extraction do
401
399
  language_detection: { enabled: true, min_confidence: 0.9 },
402
400
  pdf_options: { extract_images: true, passwords: ['secret'] },
403
401
  image_extraction: { target_dpi: 600 },
404
- image_preprocessing: { denoise: true, binarization_method: 'sauvola' },
405
402
  postprocessor: { enabled: true, enabled_processors: %w[quality] },
406
403
  token_reduction: { mode: 'light' },
407
404
  keywords: { algorithm: 'yake', max_keywords: 10 },
@@ -415,7 +412,6 @@ RSpec.describe Kreuzberg::Config::Extraction do
415
412
  expect(config.language_detection.enabled).to be true
416
413
  expect(config.pdf_options.extract_images).to be true
417
414
  expect(config.image_extraction.target_dpi).to eq 600
418
- expect(config.image_preprocessing.denoise).to be true
419
415
  expect(config.postprocessor.enabled).to be true
420
416
  expect(config.token_reduction.mode).to eq 'light'
421
417
  expect(config.keywords.max_keywords).to eq 10
@@ -157,25 +157,6 @@ RSpec.describe Kreuzberg::Config::ImagePreprocessing do
157
157
  end
158
158
 
159
159
  describe 'nested config integration' do
160
- it 'can be nested in Extraction config' do
161
- preprocessing = described_class.new(target_dpi: 600, denoise: true)
162
- extraction = Kreuzberg::Config::Extraction.new(image_preprocessing: preprocessing)
163
-
164
- expect(extraction.image_preprocessing).to be_a described_class
165
- expect(extraction.image_preprocessing.target_dpi).to eq 600
166
- expect(extraction.image_preprocessing.denoise).to be true
167
- end
168
-
169
- it 'accepts hash in Extraction config' do
170
- extraction = Kreuzberg::Config::Extraction.new(
171
- image_preprocessing: { target_dpi: 600, binarization_method: 'sauvola' }
172
- )
173
-
174
- expect(extraction.image_preprocessing).to be_a described_class
175
- expect(extraction.image_preprocessing.target_dpi).to eq 600
176
- expect(extraction.image_preprocessing.binarization_method).to eq 'sauvola'
177
- end
178
-
179
160
  it 'can be nested in Tesseract config' do
180
161
  preprocessing = described_class.new(denoise: true)
181
162
  tesseract = Kreuzberg::Config::Tesseract.new(preprocessing: preprocessing)
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "4.2.4"
6
+ version = "4.2.6"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -47,7 +47,7 @@ hex = "0.4.3"
47
47
  toml = "0.9.11"
48
48
  num_cpus = "1.17.0"
49
49
  once_cell = "1.21.3"
50
- html-to-markdown-rs = { version = "2.23.4", default-features = false }
50
+ html-to-markdown-rs = { version = "2.24.1", default-features = false }
51
51
  reqwest = { version = "0.13.1", default-features = false, features = ["json", "rustls"] }
52
52
  image = { version = "0.25.9", default-features = false }
53
53
  lzma-rust2 = { version = "0.15.7" }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.2.4"
3
+ version = "4.2.6"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.2.4 Release**
20
+ > **🚀 Version 4.2.6 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -28,7 +28,7 @@ serde_json = { workspace = true }
28
28
  serde = { workspace = true }
29
29
  async-trait = { workspace = true }
30
30
  tokio = { workspace = true }
31
- html-to-markdown-rs = { version = "2.23.4", default-features = false }
31
+ html-to-markdown-rs = { version = "2.24.1", default-features = false }
32
32
  rayon = { version = "1.11", optional = true }
33
33
  log = "0.4"
34
34
 
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.2.4"
3
+ version = "4.2.6"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -24,7 +24,7 @@ thiserror = { workspace = true }
24
24
  image = { workspace = true, features = ["png"] }
25
25
 
26
26
  [build-dependencies]
27
- cc = { version = "^1.2.54", optional = true }
27
+ cc = { version = "^1.2.55", optional = true }
28
28
  cmake = { version = "0.1.57", optional = true }
29
29
  zip = { version = "7.2.0", optional = true }
30
30
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.2.4
4
+ version: 4.2.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-01-29 00:00:00.000000000 Z
11
+ date: 2026-01-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler