kreuzberg 4.7.4-aarch64-linux → 4.8.0-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 00cdc7fc0ab8f3b23ba177771f8fd4782e0b241a1dbce1cbde3daab2713f3932
4
- data.tar.gz: f4aec1401be34830b59426141d31df2e40a1a26f22af5a0b4100e8bf153ee21f
3
+ metadata.gz: 7b8c670c1b98342788190baf143a5ae2ae605fcfec4974a162f931d36052a87f
4
+ data.tar.gz: e6a35b64afbb09e7c228d6cf8e148be71cd70215c8be46ba9d291f265597a942
5
5
  SHA512:
6
- metadata.gz: 1be6b7d0ed3aee0ad35639e285fb35fd23a2df6df870aae88ad7a038ab0a9ee078b94f5fa4e3ee035ce93bda5e7c053c365fbdfe836a65dac527484a7b04f03d
7
- data.tar.gz: 284e3fa2c4d038f8345fe5b73ca2497890c5c6bdf23106c2f714e19e1573cd720ef830af4d06458a3a8658cb36fc616b1a8f61fefec1da4acd85d4db801ac066
6
+ metadata.gz: 4981b0051cc6a8db5de244957c455eaced589b0404da51dd09278d742a9e0d79cb459b79788950d2608c6c3728abfe5dc38d294eb324178cfa31c284080224d9
7
+ data.tar.gz: 4e3848edfb9d39632727ae5018ccb745ae16d80acc638d43ecc2e7119489c1a02f986819879f5bb7762d25c21692f56da2520c282516b6b43d91640bb0423558
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.7.4" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.0" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -42,7 +42,7 @@
42
42
 
43
43
  <!-- Project Info -->
44
44
  <a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
45
- <img src="https://img.shields.io/badge/License-MIT-007ec6" alt="License">
45
+ <img src="https://img.shields.io/badge/License-Elastic--2.0-blue.svg" alt="License">
46
46
  </a>
47
47
  <a href="https://docs.kreuzberg.dev">
48
48
  <img src="https://img.shields.io/badge/docs-kreuzberg.dev-007ec6" alt="Documentation">
@@ -419,7 +419,7 @@ Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg
419
419
 
420
420
  ## License
421
421
 
422
- MIT License - see LICENSE file for details.
422
+ Elastic License 2.0 (ELv2) - see [LICENSE](../../LICENSE) for details.
423
423
 
424
424
  ## Support
425
425
 
@@ -856,6 +856,40 @@ module Kreuzberg
856
856
  end
857
857
  end
858
858
 
859
+ # Content filter configuration for controlling extraction of headers, footers,
860
+ # watermarks, and repeating text across document formats.
861
+ #
862
+ # @example Include headers and footers
863
+ # filter = ContentFilter.new(include_headers: true, include_footers: true)
864
+ #
865
+ # @example Disable repeating text removal
866
+ # filter = ContentFilter.new(strip_repeating_text: false)
867
+ #
868
+ class ContentFilter
869
+ attr_reader :include_headers, :include_footers, :strip_repeating_text, :include_watermarks
870
+
871
+ def initialize(
872
+ include_headers: false,
873
+ include_footers: false,
874
+ strip_repeating_text: true,
875
+ include_watermarks: false
876
+ )
877
+ @include_headers = include_headers ? true : false
878
+ @include_footers = include_footers ? true : false
879
+ @strip_repeating_text = strip_repeating_text ? true : false
880
+ @include_watermarks = include_watermarks ? true : false
881
+ end
882
+
883
+ def to_h
884
+ {
885
+ include_headers: @include_headers,
886
+ include_footers: @include_footers,
887
+ strip_repeating_text: @strip_repeating_text,
888
+ include_watermarks: @include_watermarks
889
+ }
890
+ end
891
+ end
892
+
859
893
  # Layout detection configuration
860
894
  #
861
895
  # @example Basic usage
@@ -951,7 +985,7 @@ module Kreuzberg
951
985
  :max_concurrent_extractions, :output_format, :result_format,
952
986
  :security_limits, :layout, :concurrency,
953
987
  :cache_namespace, :cache_ttl_secs, :extraction_timeout_secs,
954
- :max_archive_depth, :acceleration, :email
988
+ :max_archive_depth, :acceleration, :email, :content_filter
955
989
 
956
990
  # Alias for backward compatibility - image_extraction is the canonical name
957
991
  alias image_extraction images
@@ -977,7 +1011,7 @@ module Kreuzberg
977
1011
  postprocessor token_reduction keywords html_options pages
978
1012
  max_concurrent_extractions output_format result_format
979
1013
  security_limits layout concurrency cache_namespace cache_ttl_secs extraction_timeout_secs
980
- max_archive_depth acceleration email
1014
+ max_archive_depth acceleration email content_filter
981
1015
  ].freeze
982
1016
 
983
1017
  # Aliases for backward compatibility
@@ -1062,7 +1096,8 @@ module Kreuzberg
1062
1096
  extraction_timeout_secs: nil,
1063
1097
  max_archive_depth: 3,
1064
1098
  acceleration: nil,
1065
- email: nil)
1099
+ email: nil,
1100
+ content_filter: nil)
1066
1101
  kwargs = {
1067
1102
  use_cache: use_cache, enable_quality_processing: enable_quality_processing,
1068
1103
  force_ocr: force_ocr, disable_ocr: disable_ocr, force_ocr_pages: force_ocr_pages,
@@ -1080,7 +1115,8 @@ module Kreuzberg
1080
1115
  extraction_timeout_secs: extraction_timeout_secs,
1081
1116
  max_archive_depth: max_archive_depth,
1082
1117
  acceleration: acceleration,
1083
- email: email
1118
+ email: email,
1119
+ content_filter: content_filter
1084
1120
  }
1085
1121
  extracted = extract_from_hash(hash, kwargs)
1086
1122
 
@@ -1115,6 +1151,7 @@ module Kreuzberg
1115
1151
  @concurrency = normalize_config(params[:concurrency], Concurrency)
1116
1152
  @acceleration = normalize_config(params[:acceleration], Acceleration)
1117
1153
  @email = normalize_config(params[:email], Email)
1154
+ @content_filter = normalize_config(params[:content_filter], ContentFilter)
1118
1155
  @max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
1119
1156
  @max_archive_depth = params[:max_archive_depth]&.to_i || 3
1120
1157
  @output_format = validate_output_format(params[:output_format])
@@ -1175,7 +1212,8 @@ module Kreuzberg
1175
1212
  token_reduction: @token_reduction&.to_h, keywords: @keywords&.to_h,
1176
1213
  html_options: @html_options&.to_h, pages: @pages&.to_h,
1177
1214
  layout: @layout&.to_h, concurrency: @concurrency&.to_h,
1178
- acceleration: @acceleration&.to_h, email: @email&.to_h
1215
+ acceleration: @acceleration&.to_h, email: @email&.to_h,
1216
+ content_filter: @content_filter&.to_h
1179
1217
  }
1180
1218
  end
1181
1219
 
@@ -11,6 +11,7 @@ module Kreuzberg
11
11
  ERROR_CODE_PARSING = 5
12
12
  ERROR_CODE_OCR = 6
13
13
  ERROR_CODE_MISSING_DEPENDENCY = 7
14
+ ERROR_CODE_EMBEDDING = 8
14
15
 
15
16
  module Errors
16
17
  class PanicContext
@@ -112,5 +113,8 @@ module Kreuzberg
112
113
 
113
114
  # Raised when an unsupported file format or MIME type is encountered
114
115
  class UnsupportedFormatError < Error; end
116
+
117
+ # Raised when embedding fails
118
+ class EmbeddingError < Error; end
115
119
  end
116
120
  end
@@ -236,6 +236,41 @@ module Kreuzberg
236
236
  results
237
237
  end
238
238
 
239
+ # Asynchronously generate embeddings for multiple texts.
240
+ #
241
+ # Non-blocking embedding generation from a list of strings.
242
+ #
243
+ # @param texts [Array<String>] List of strings to embed.
244
+ # @param config [Config::Embedding, Hash, nil] Embedding configuration.
245
+ #
246
+ # @return [Array<Array<Float>>] Array of embedding vectors.
247
+ #
248
+ # @raise [Errors::EmbeddingError] If embedding generation fails.
249
+ #
250
+ # @example Generate embeddings asynchronously
251
+ # texts = ["Hello, world!", "Kreuzberg is awesome."]
252
+ # embeddings = Kreuzberg.embed(texts: texts)
253
+ # puts embeddings.first.length # 384
254
+ def embed(texts:, config: nil)
255
+ opts = normalize_config(config)
256
+ native_embed(texts: texts.map(&:to_s), config: opts)
257
+ end
258
+
259
+ # Synchronously generate embeddings for multiple texts.
260
+ #
261
+ # Blocking embedding generation from a list of strings.
262
+ #
263
+ # @param texts [Array<String>] List of strings to embed.
264
+ # @param config [Config::Embedding, Hash, nil] Embedding configuration.
265
+ #
266
+ # @return [Array<Array<Float>>] Array of embedding vectors.
267
+ #
268
+ # @raise [Errors::EmbeddingError] If embedding generation fails.
269
+ def embed_sync(texts:, config: nil)
270
+ opts = normalize_config(config)
271
+ native_embed_sync(texts: texts.map(&:to_s), config: opts)
272
+ end
273
+
239
274
  # Synchronously extract content from multiple byte data sources.
240
275
  #
241
276
  # Processes multiple in-memory binary documents in a single batch operation. Results
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.7.4'
4
+ VERSION = '4.8.0'
5
5
  end
data/lib/kreuzberg.rb CHANGED
@@ -59,10 +59,13 @@ module Kreuzberg
59
59
  alias native_batch_extract_bytes batch_extract_bytes
60
60
  alias native_clear_cache clear_cache
61
61
  alias native_cache_stats cache_stats
62
+ alias native_embed_sync embed_sync
63
+ alias native_embed embed
62
64
 
63
65
  private :native_extract_file_sync, :native_extract_bytes_sync, :native_batch_extract_files_sync
64
66
  private :native_extract_file, :native_extract_bytes, :native_batch_extract_files
65
67
  private :native_batch_extract_bytes_sync, :native_batch_extract_bytes
68
+ private :native_embed_sync, :native_embed
66
69
  end
67
70
 
68
71
  module_function :register_post_processor
@@ -94,6 +97,10 @@ module Kreuzberg
94
97
  module_function :validate_mime_type
95
98
 
96
99
  module_function :get_extensions_for_mime
100
+
101
+ module_function :embed_sync
102
+
103
+ module_function :embed
97
104
  end
98
105
 
99
106
  require_relative 'kreuzberg/cache_api'
data/lib/kreuzberg_rb.so CHANGED
Binary file
data/sig/kreuzberg.rbs CHANGED
@@ -12,6 +12,7 @@ module Kreuzberg
12
12
  ERROR_CODE_PARSING: Integer
13
13
  ERROR_CODE_OCR: Integer
14
14
  ERROR_CODE_MISSING_DEPENDENCY: Integer
15
+ ERROR_CODE_EMBEDDING: Integer
15
16
 
16
17
  # Semantic element type classification (T.type_alias)
17
18
  type element_type = 'title' | 'narrative_text' | 'heading' | 'list_item' | 'table' | 'image' | 'page_break' | 'code_block' | 'block_quote' | 'footer' | 'header'
@@ -747,6 +748,16 @@ module Kreuzberg
747
748
  def to_h: () -> Hash[Symbol, untyped]
748
749
  end
749
750
 
751
+ class ContentFilter
752
+ attr_reader include_headers: bool
753
+ attr_reader include_footers: bool
754
+ attr_reader strip_repeating_text: bool
755
+ attr_reader include_watermarks: bool
756
+
757
+ def initialize: (?include_headers: bool, ?include_footers: bool, ?strip_repeating_text: bool, ?include_watermarks: bool) -> void
758
+ def to_h: () -> Hash[Symbol, untyped]
759
+ end
760
+
750
761
  class LayoutDetection
751
762
  attr_reader preset: String
752
763
  attr_reader confidence_threshold: Float?
@@ -813,6 +824,7 @@ module Kreuzberg
813
824
  attr_reader concurrency: Concurrency?
814
825
  attr_reader acceleration: Acceleration?
815
826
  attr_reader email: Email?
827
+ attr_reader content_filter: ContentFilter?
816
828
  attr_reader tree_sitter: TreeSitterConfig?
817
829
  attr_reader max_concurrent_extractions: Integer?
818
830
  attr_reader max_archive_depth: Integer
@@ -844,6 +856,7 @@ module Kreuzberg
844
856
  ?concurrency: (Concurrency | Hash[Symbol, untyped])?,
845
857
  ?acceleration: (Acceleration | Hash[Symbol, untyped])?,
846
858
  ?email: (Email | Hash[Symbol, untyped])?,
859
+ ?content_filter: (ContentFilter | Hash[Symbol, untyped])?,
847
860
  ?tree_sitter: (TreeSitterConfig | Hash[Symbol, untyped])?,
848
861
  ?max_concurrent_extractions: Integer?,
849
862
  ?max_archive_depth: Integer,
@@ -1572,6 +1585,10 @@ module Kreuzberg
1572
1585
  ?file_configs: Array[file_config_input]?
1573
1586
  ) -> Array[Result]
1574
1587
 
1588
+ # Standalone embedding
1589
+ def self.embed_sync: (texts: Array[String], ?config: Hash[Symbol, untyped]?) -> Array[Array[Float]]
1590
+ def self.embed: (texts: Array[String], ?config: Hash[Symbol, untyped]?) -> Array[Array[Float]]
1591
+
1575
1592
  # Cache API
1576
1593
  def self.clear_cache: () -> void
1577
1594
  def self.cache_stats: () -> Hash[Symbol | String, Integer]
@@ -1722,6 +1739,9 @@ module Kreuzberg
1722
1739
 
1723
1740
  class UnsupportedFormatError < Error
1724
1741
  end
1742
+
1743
+ class EmbeddingError < Error
1744
+ end
1725
1745
  end
1726
1746
 
1727
1747
  # Internal modules (prepended to Kreuzberg singleton)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.7.4
4
+ version: 4.8.0
5
5
  platform: aarch64-linux
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-04-06 00:00:00.000000000 Z
11
+ date: 2026-04-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -223,7 +223,7 @@ files:
223
223
  - spec/spec_helper.rb
224
224
  homepage: https://github.com/kreuzberg-dev/kreuzberg
225
225
  licenses:
226
- - MIT
226
+ - Elastic-2.0
227
227
  metadata:
228
228
  homepage_uri: https://github.com/kreuzberg-dev/kreuzberg
229
229
  source_code_uri: https://github.com/kreuzberg-dev/kreuzberg