kreuzberg 4.2.3 → 4.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2c6fc44b151014f7e56c82bd191f55244a4294a259b24b95fc494dba6f8eaba6
4
- data.tar.gz: 6e40a732814ff3e2a164e718cdb1c7a6ae838b2b2210a66b232f8675c7f79a80
3
+ metadata.gz: 800ea3cd581a353a00fd4c67025b004ad40a15cc5dc656d04e05c60544387b53
4
+ data.tar.gz: b9939af20686f24d25b983946c6c024da2def226c3a19ae46f60182981930ddd
5
5
  SHA512:
6
- metadata.gz: f9c3a45f31c3ad9e3857872d8705b397b40c4317844ef421f4da4c2918e57411f5a626df4f6706d7db4916f33b8644c736e7b41508b398fd0197f1a87170fa3c
7
- data.tar.gz: 8b05a75be261dbe583c4873d9d21079efff97d6c9c0340bbd8a73a43c9d15955431f4de20cd8b4a8b7956872f52e4467c253f5da03177a1e7d3b6a10d202b59d
6
+ metadata.gz: ad5fc359c85e886364cca94497635b8659d363ad2732e7eb5f67d1bc55951d413a6cec201bd16d76c1b826d982881a9f8b028c5d5608be63c30528d83ccafb13
7
+ data.tar.gz: 23d95fb583c36dd9c1bb060a4f5f43bec540165853ac2007903941463d12fa83d10e805f2cd9b4f5859eeda4465a2ffe3e289e7d884f313867b3611e445fb7f9
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.2.3)
4
+ kreuzberg (4.2.5)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -97,7 +97,7 @@ GEM
97
97
  rspec-mocks (3.13.7)
98
98
  diff-lcs (>= 1.2.0, < 2.0)
99
99
  rspec-support (~> 3.13.0)
100
- rspec-support (3.13.6)
100
+ rspec-support (3.13.7)
101
101
  rubocop (1.84.0)
102
102
  json (~> 2.3)
103
103
  language_server-protocol (~> 3.17.0.2)
@@ -121,7 +121,7 @@ GEM
121
121
  rubocop (~> 1.81)
122
122
  ruby-progressbar (1.13.0)
123
123
  securerandom (0.4.1)
124
- sorbet-runtime (0.6.12903)
124
+ sorbet-runtime (0.6.12908)
125
125
  steep (1.10.0)
126
126
  activesupport (>= 5.1)
127
127
  concurrent-ruby (>= 1.1.10)
@@ -207,7 +207,7 @@ CHECKSUMS
207
207
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
208
208
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
209
209
  json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
210
- kreuzberg (4.2.3)
210
+ kreuzberg (4.2.5)
211
211
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
212
212
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
213
213
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -235,14 +235,14 @@ CHECKSUMS
235
235
  rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
236
236
  rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
237
237
  rspec-mocks (3.13.7) sha256=0979034e64b1d7a838aaaddf12bf065ea4dc40ef3d4c39f01f93ae2c66c62b1c
238
- rspec-support (3.13.6) sha256=2e8de3702427eab064c9352fe74488cc12a1bfae887ad8b91cba480ec9f8afb2
238
+ rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
239
239
  rubocop (1.84.0) sha256=88dec310153bb685a879f5a7cdb601f6287b8f0ee675d9dc63a17c7204c4190a
240
240
  rubocop-ast (1.49.0) sha256=49c3676d3123a0923d333e20c6c2dbaaae2d2287b475273fddee0c61da9f71fd
241
241
  rubocop-performance (1.26.1) sha256=cd19b936ff196df85829d264b522fd4f98b6c89ad271fa52744a8c11b8f71834
242
242
  rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
243
243
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
244
244
  securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
245
- sorbet-runtime (0.6.12903) sha256=c23968c0dcf5a5db57f32c003fe3db7fb588c168cdd57d92ea4dceaba063118a
245
+ sorbet-runtime (0.6.12908) sha256=229f43e76527b01c5291c00c43cc29ccebe437a87b34925c3ee250ebf23d328e
246
246
  steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
247
247
  strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
248
248
  terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.3" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.5" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -877,7 +877,7 @@ module Kreuzberg
877
877
  chunking: @chunking&.to_h,
878
878
  language_detection: @language_detection&.to_h,
879
879
  pdf_options: @pdf_options&.to_h,
880
- images: @images&.to_h,
880
+ image_extraction: @images&.to_h,
881
881
  postprocessor: @postprocessor&.to_h,
882
882
  token_reduction: @token_reduction&.to_h,
883
883
  keywords: @keywords&.to_h,
@@ -103,13 +103,48 @@ module Kreuzberg
103
103
  # @return [Array<Table>] Tables on this page
104
104
  # @!attribute [r] images
105
105
  # @return [Array<Image>] Images on this page
106
- PageContent = Struct.new(:page_number, :content, :tables, :images, keyword_init: true) do
106
+ # @!attribute [r] text
107
+ # @return [String] The text content of this block
108
+ # @!attribute [r] font_size
109
+ # @return [Float] The font size of the text
110
+ # @!attribute [r] level
111
+ # @return [String] The hierarchy level (h1-h6 or body)
112
+ # @!attribute [r] bbox
113
+ # @return [Array<Float>, nil] Bounding box (left, top, right, bottom)
114
+ HierarchicalBlock = Struct.new(:text, :font_size, :level, :bbox, keyword_init: true) do
115
+ def to_h
116
+ { text: text, font_size: font_size, level: level, bbox: bbox }
117
+ end
118
+ end
119
+
120
+ # @!attribute [r] block_count
121
+ # @return [Integer] Number of hierarchy blocks
122
+ # @!attribute [r] blocks
123
+ # @return [Array<HierarchicalBlock>] Hierarchical blocks
124
+ PageHierarchy = Struct.new(:block_count, :blocks, keyword_init: true) do
125
+ def to_h
126
+ { block_count: block_count, blocks: blocks.map(&:to_h) }
127
+ end
128
+ end
129
+
130
+ # @!attribute [r] page_number
131
+ # @return [Integer] Page number (1-indexed)
132
+ # @!attribute [r] content
133
+ # @return [String] Text content for this page
134
+ # @!attribute [r] tables
135
+ # @return [Array<Table>] Tables on this page
136
+ # @!attribute [r] images
137
+ # @return [Array<Image>] Images on this page
138
+ # @!attribute [r] hierarchy
139
+ # @return [PageHierarchy, nil] Hierarchy information for the page
140
+ PageContent = Struct.new(:page_number, :content, :tables, :images, :hierarchy, keyword_init: true) do
107
141
  def to_h
108
142
  {
109
143
  page_number: page_number,
110
144
  content: content,
111
145
  tables: tables.map(&:to_h),
112
- images: images.map(&:to_h)
146
+ images: images.map(&:to_h),
147
+ hierarchy: hierarchy&.to_h
113
148
  }
114
149
  end
115
150
  end
@@ -397,9 +432,28 @@ module Kreuzberg
397
432
  page_number: page_hash['page_number'],
398
433
  content: page_hash['content'],
399
434
  tables: parse_tables(page_hash['tables']),
400
- images: parse_images(page_hash['images'])
435
+ images: parse_images(page_hash['images']),
436
+ hierarchy: parse_page_hierarchy(page_hash['hierarchy'])
437
+ )
438
+ end
439
+ end
440
+
441
+ def parse_page_hierarchy(hierarchy_data)
442
+ return nil if hierarchy_data.nil?
443
+
444
+ blocks = (hierarchy_data['blocks'] || []).map do |block_hash|
445
+ HierarchicalBlock.new(
446
+ text: block_hash['text'],
447
+ font_size: block_hash['font_size']&.to_f,
448
+ level: block_hash['level'],
449
+ bbox: block_hash['bbox']
401
450
  )
402
451
  end
452
+
453
+ PageHierarchy.new(
454
+ block_count: hierarchy_data['block_count'] || 0,
455
+ blocks: blocks
456
+ )
403
457
  end
404
458
 
405
459
  def parse_elements(elements_data)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.2.3'
4
+ VERSION = '4.2.5'
5
5
  end
data/sig/kreuzberg.rbs CHANGED
@@ -195,7 +195,6 @@ module Kreuzberg
195
195
  attr_reader language_detection: LanguageDetection?
196
196
  attr_reader pdf_options: PDF?
197
197
  attr_reader image_extraction: ImageExtraction?
198
- attr_reader image_preprocessing: ImagePreprocessing?
199
198
  attr_reader postprocessor: PostProcessor?
200
199
  attr_reader token_reduction: TokenReduction?
201
200
  attr_reader keywords: Keywords?
@@ -215,7 +214,6 @@ module Kreuzberg
215
214
  ?language_detection: (LanguageDetection | Hash[Symbol, untyped])?,
216
215
  ?pdf_options: (PDF | Hash[Symbol, untyped])?,
217
216
  ?image_extraction: (ImageExtraction | Hash[Symbol, untyped])?,
218
- ?image_preprocessing: (ImagePreprocessing | Hash[Symbol, untyped])?,
219
217
  ?postprocessor: (PostProcessor | Hash[Symbol, untyped])?,
220
218
  ?token_reduction: (TokenReduction | Hash[Symbol, untyped])?,
221
219
  ?keywords: (Keywords | Hash[Symbol, untyped])?,
@@ -13,7 +13,6 @@ RSpec.describe Kreuzberg::Config::Extraction do
13
13
  expect(config.language_detection).to be_nil
14
14
  expect(config.pdf_options).to be_nil
15
15
  expect(config.image_extraction).to be_nil
16
- expect(config.image_preprocessing).to be_nil
17
16
  expect(config.postprocessor).to be_nil
18
17
  expect(config.token_reduction).to be_nil
19
18
  expect(config.keywords).to be_nil
@@ -258,7 +257,6 @@ RSpec.describe Kreuzberg::Config::Extraction do
258
257
  language_detection: { enabled: true },
259
258
  pdf_options: { extract_images: true },
260
259
  image_extraction: { target_dpi: 600 },
261
- image_preprocessing: { denoise: true },
262
260
  postprocessor: { enabled: true },
263
261
  token_reduction: { mode: 'light' },
264
262
  keywords: { algorithm: 'yake' },
@@ -401,7 +399,6 @@ RSpec.describe Kreuzberg::Config::Extraction do
401
399
  language_detection: { enabled: true, min_confidence: 0.9 },
402
400
  pdf_options: { extract_images: true, passwords: ['secret'] },
403
401
  image_extraction: { target_dpi: 600 },
404
- image_preprocessing: { denoise: true, binarization_method: 'sauvola' },
405
402
  postprocessor: { enabled: true, enabled_processors: %w[quality] },
406
403
  token_reduction: { mode: 'light' },
407
404
  keywords: { algorithm: 'yake', max_keywords: 10 },
@@ -415,7 +412,6 @@ RSpec.describe Kreuzberg::Config::Extraction do
415
412
  expect(config.language_detection.enabled).to be true
416
413
  expect(config.pdf_options.extract_images).to be true
417
414
  expect(config.image_extraction.target_dpi).to eq 600
418
- expect(config.image_preprocessing.denoise).to be true
419
415
  expect(config.postprocessor.enabled).to be true
420
416
  expect(config.token_reduction.mode).to eq 'light'
421
417
  expect(config.keywords.max_keywords).to eq 10
@@ -157,25 +157,6 @@ RSpec.describe Kreuzberg::Config::ImagePreprocessing do
157
157
  end
158
158
 
159
159
  describe 'nested config integration' do
160
- it 'can be nested in Extraction config' do
161
- preprocessing = described_class.new(target_dpi: 600, denoise: true)
162
- extraction = Kreuzberg::Config::Extraction.new(image_preprocessing: preprocessing)
163
-
164
- expect(extraction.image_preprocessing).to be_a described_class
165
- expect(extraction.image_preprocessing.target_dpi).to eq 600
166
- expect(extraction.image_preprocessing.denoise).to be true
167
- end
168
-
169
- it 'accepts hash in Extraction config' do
170
- extraction = Kreuzberg::Config::Extraction.new(
171
- image_preprocessing: { target_dpi: 600, binarization_method: 'sauvola' }
172
- )
173
-
174
- expect(extraction.image_preprocessing).to be_a described_class
175
- expect(extraction.image_preprocessing.target_dpi).to eq 600
176
- expect(extraction.image_preprocessing.binarization_method).to eq 'sauvola'
177
- end
178
-
179
160
  it 'can be nested in Tesseract config' do
180
161
  preprocessing = described_class.new(denoise: true)
181
162
  tesseract = Kreuzberg::Config::Tesseract.new(preprocessing: preprocessing)
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "4.2.3"
6
+ version = "4.2.5"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -47,7 +47,7 @@ hex = "0.4.3"
47
47
  toml = "0.9.11"
48
48
  num_cpus = "1.17.0"
49
49
  once_cell = "1.21.3"
50
- html-to-markdown-rs = { version = "2.23.4", default-features = false }
50
+ html-to-markdown-rs = { version = "2.24.1", default-features = false }
51
51
  reqwest = { version = "0.13.1", default-features = false, features = ["json", "rustls"] }
52
52
  image = { version = "0.25.9", default-features = false }
53
53
  lzma-rust2 = { version = "0.15.7" }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.2.3"
3
+ version = "4.2.5"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.2.3 Release**
20
+ > **🚀 Version 4.2.5 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -3,6 +3,14 @@
3
3
  use super::types::KeywordAlgorithm;
4
4
  use serde::{Deserialize, Serialize};
5
5
 
6
+ fn default_max_keywords() -> usize {
7
+ 10
8
+ }
9
+
10
+ fn default_ngram_range() -> (usize, usize) {
11
+ (1, 3)
12
+ }
13
+
6
14
  /// YAKE-specific parameters.
7
15
  #[cfg(feature = "keywords-yake")]
8
16
  #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -45,15 +53,18 @@ impl Default for RakeParams {
45
53
  #[derive(Debug, Clone, Serialize, Deserialize)]
46
54
  pub struct KeywordConfig {
47
55
  /// Algorithm to use for extraction.
56
+ #[serde(default)]
48
57
  pub algorithm: KeywordAlgorithm,
49
58
 
50
59
  /// Maximum number of keywords to extract (default: 10).
60
+ #[serde(default = "default_max_keywords")]
51
61
  pub max_keywords: usize,
52
62
 
53
63
  /// Minimum score threshold (0.0-1.0, default: 0.0).
54
64
  ///
55
65
  /// Keywords with scores below this threshold are filtered out.
56
66
  /// Note: Score ranges differ between algorithms.
67
+ #[serde(default)]
57
68
  pub min_score: f32,
58
69
 
59
70
  /// N-gram range for keyword extraction (min, max).
@@ -61,6 +72,7 @@ pub struct KeywordConfig {
61
72
  /// (1, 1) = unigrams only
62
73
  /// (1, 2) = unigrams and bigrams
63
74
  /// (1, 3) = unigrams, bigrams, and trigrams (default)
75
+ #[serde(default = "default_ngram_range")]
64
76
  pub ngram_range: (usize, usize),
65
77
 
66
78
  /// Language code for stopword filtering (e.g., "en", "de", "fr").
@@ -28,7 +28,7 @@ serde_json = { workspace = true }
28
28
  serde = { workspace = true }
29
29
  async-trait = { workspace = true }
30
30
  tokio = { workspace = true }
31
- html-to-markdown-rs = { version = "2.23.4", default-features = false }
31
+ html-to-markdown-rs = { version = "2.24.1", default-features = false }
32
32
  rayon = { version = "1.11", optional = true }
33
33
  log = "0.4"
34
34
 
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.2.3"
3
+ version = "4.2.5"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -24,7 +24,7 @@ thiserror = { workspace = true }
24
24
  image = { workspace = true, features = ["png"] }
25
25
 
26
26
  [build-dependencies]
27
- cc = { version = "^1.2.54", optional = true }
27
+ cc = { version = "^1.2.55", optional = true }
28
28
  cmake = { version = "0.1.57", optional = true }
29
29
  zip = { version = "7.2.0", optional = true }
30
30
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.2.3
4
+ version: 4.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-01-28 00:00:00.000000000 Z
11
+ date: 2026-01-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler