kreuzberg 4.7.4-aarch64-linux → 4.8.0-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -3
- data/lib/kreuzberg/config.rb +43 -5
- data/lib/kreuzberg/errors.rb +4 -0
- data/lib/kreuzberg/extraction_api.rb +35 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +7 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/sig/kreuzberg.rbs +20 -0
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 7b8c670c1b98342788190baf143a5ae2ae605fcfec4974a162f931d36052a87f
|
|
4
|
+
data.tar.gz: e6a35b64afbb09e7c228d6cf8e148be71cd70215c8be46ba9d291f265597a942
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4981b0051cc6a8db5de244957c455eaced589b0404da51dd09278d742a9e0d79cb459b79788950d2608c6c3728abfe5dc38d294eb324178cfa31c284080224d9
|
|
7
|
+
data.tar.gz: 4e3848edfb9d39632727ae5018ccb745ae16d80acc638d43ecc2e7119489c1a02f986819879f5bb7762d25c21692f56da2520c282516b6b43d91640bb0423558
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.0" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -42,7 +42,7 @@
|
|
|
42
42
|
|
|
43
43
|
<!-- Project Info -->
|
|
44
44
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
|
45
|
-
<img src="https://img.shields.io/badge/License-
|
|
45
|
+
<img src="https://img.shields.io/badge/License-Elastic--2.0-blue.svg" alt="License">
|
|
46
46
|
</a>
|
|
47
47
|
<a href="https://docs.kreuzberg.dev">
|
|
48
48
|
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-007ec6" alt="Documentation">
|
|
@@ -419,7 +419,7 @@ Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg
|
|
|
419
419
|
|
|
420
420
|
## License
|
|
421
421
|
|
|
422
|
-
|
|
422
|
+
Elastic License 2.0 (ELv2) - see [LICENSE](../../LICENSE) for details.
|
|
423
423
|
|
|
424
424
|
## Support
|
|
425
425
|
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -856,6 +856,40 @@ module Kreuzberg
|
|
|
856
856
|
end
|
|
857
857
|
end
|
|
858
858
|
|
|
859
|
+
# Content filter configuration for controlling extraction of headers, footers,
|
|
860
|
+
# watermarks, and repeating text across document formats.
|
|
861
|
+
#
|
|
862
|
+
# @example Include headers and footers
|
|
863
|
+
# filter = ContentFilter.new(include_headers: true, include_footers: true)
|
|
864
|
+
#
|
|
865
|
+
# @example Disable repeating text removal
|
|
866
|
+
# filter = ContentFilter.new(strip_repeating_text: false)
|
|
867
|
+
#
|
|
868
|
+
class ContentFilter
|
|
869
|
+
attr_reader :include_headers, :include_footers, :strip_repeating_text, :include_watermarks
|
|
870
|
+
|
|
871
|
+
def initialize(
|
|
872
|
+
include_headers: false,
|
|
873
|
+
include_footers: false,
|
|
874
|
+
strip_repeating_text: true,
|
|
875
|
+
include_watermarks: false
|
|
876
|
+
)
|
|
877
|
+
@include_headers = include_headers ? true : false
|
|
878
|
+
@include_footers = include_footers ? true : false
|
|
879
|
+
@strip_repeating_text = strip_repeating_text ? true : false
|
|
880
|
+
@include_watermarks = include_watermarks ? true : false
|
|
881
|
+
end
|
|
882
|
+
|
|
883
|
+
def to_h
|
|
884
|
+
{
|
|
885
|
+
include_headers: @include_headers,
|
|
886
|
+
include_footers: @include_footers,
|
|
887
|
+
strip_repeating_text: @strip_repeating_text,
|
|
888
|
+
include_watermarks: @include_watermarks
|
|
889
|
+
}
|
|
890
|
+
end
|
|
891
|
+
end
|
|
892
|
+
|
|
859
893
|
# Layout detection configuration
|
|
860
894
|
#
|
|
861
895
|
# @example Basic usage
|
|
@@ -951,7 +985,7 @@ module Kreuzberg
|
|
|
951
985
|
:max_concurrent_extractions, :output_format, :result_format,
|
|
952
986
|
:security_limits, :layout, :concurrency,
|
|
953
987
|
:cache_namespace, :cache_ttl_secs, :extraction_timeout_secs,
|
|
954
|
-
:max_archive_depth, :acceleration, :email
|
|
988
|
+
:max_archive_depth, :acceleration, :email, :content_filter
|
|
955
989
|
|
|
956
990
|
# Alias for backward compatibility - image_extraction is the canonical name
|
|
957
991
|
alias image_extraction images
|
|
@@ -977,7 +1011,7 @@ module Kreuzberg
|
|
|
977
1011
|
postprocessor token_reduction keywords html_options pages
|
|
978
1012
|
max_concurrent_extractions output_format result_format
|
|
979
1013
|
security_limits layout concurrency cache_namespace cache_ttl_secs extraction_timeout_secs
|
|
980
|
-
max_archive_depth acceleration email
|
|
1014
|
+
max_archive_depth acceleration email content_filter
|
|
981
1015
|
].freeze
|
|
982
1016
|
|
|
983
1017
|
# Aliases for backward compatibility
|
|
@@ -1062,7 +1096,8 @@ module Kreuzberg
|
|
|
1062
1096
|
extraction_timeout_secs: nil,
|
|
1063
1097
|
max_archive_depth: 3,
|
|
1064
1098
|
acceleration: nil,
|
|
1065
|
-
email: nil
|
|
1099
|
+
email: nil,
|
|
1100
|
+
content_filter: nil)
|
|
1066
1101
|
kwargs = {
|
|
1067
1102
|
use_cache: use_cache, enable_quality_processing: enable_quality_processing,
|
|
1068
1103
|
force_ocr: force_ocr, disable_ocr: disable_ocr, force_ocr_pages: force_ocr_pages,
|
|
@@ -1080,7 +1115,8 @@ module Kreuzberg
|
|
|
1080
1115
|
extraction_timeout_secs: extraction_timeout_secs,
|
|
1081
1116
|
max_archive_depth: max_archive_depth,
|
|
1082
1117
|
acceleration: acceleration,
|
|
1083
|
-
email: email
|
|
1118
|
+
email: email,
|
|
1119
|
+
content_filter: content_filter
|
|
1084
1120
|
}
|
|
1085
1121
|
extracted = extract_from_hash(hash, kwargs)
|
|
1086
1122
|
|
|
@@ -1115,6 +1151,7 @@ module Kreuzberg
|
|
|
1115
1151
|
@concurrency = normalize_config(params[:concurrency], Concurrency)
|
|
1116
1152
|
@acceleration = normalize_config(params[:acceleration], Acceleration)
|
|
1117
1153
|
@email = normalize_config(params[:email], Email)
|
|
1154
|
+
@content_filter = normalize_config(params[:content_filter], ContentFilter)
|
|
1118
1155
|
@max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
|
|
1119
1156
|
@max_archive_depth = params[:max_archive_depth]&.to_i || 3
|
|
1120
1157
|
@output_format = validate_output_format(params[:output_format])
|
|
@@ -1175,7 +1212,8 @@ module Kreuzberg
|
|
|
1175
1212
|
token_reduction: @token_reduction&.to_h, keywords: @keywords&.to_h,
|
|
1176
1213
|
html_options: @html_options&.to_h, pages: @pages&.to_h,
|
|
1177
1214
|
layout: @layout&.to_h, concurrency: @concurrency&.to_h,
|
|
1178
|
-
acceleration: @acceleration&.to_h, email: @email&.to_h
|
|
1215
|
+
acceleration: @acceleration&.to_h, email: @email&.to_h,
|
|
1216
|
+
content_filter: @content_filter&.to_h
|
|
1179
1217
|
}
|
|
1180
1218
|
end
|
|
1181
1219
|
|
data/lib/kreuzberg/errors.rb
CHANGED
|
@@ -11,6 +11,7 @@ module Kreuzberg
|
|
|
11
11
|
ERROR_CODE_PARSING = 5
|
|
12
12
|
ERROR_CODE_OCR = 6
|
|
13
13
|
ERROR_CODE_MISSING_DEPENDENCY = 7
|
|
14
|
+
ERROR_CODE_EMBEDDING = 8
|
|
14
15
|
|
|
15
16
|
module Errors
|
|
16
17
|
class PanicContext
|
|
@@ -112,5 +113,8 @@ module Kreuzberg
|
|
|
112
113
|
|
|
113
114
|
# Raised when an unsupported file format or MIME type is encountered
|
|
114
115
|
class UnsupportedFormatError < Error; end
|
|
116
|
+
|
|
117
|
+
# Raised when embedding fails
|
|
118
|
+
class EmbeddingError < Error; end
|
|
115
119
|
end
|
|
116
120
|
end
|
|
@@ -236,6 +236,41 @@ module Kreuzberg
|
|
|
236
236
|
results
|
|
237
237
|
end
|
|
238
238
|
|
|
239
|
+
# Asynchronously generate embeddings for multiple texts.
|
|
240
|
+
#
|
|
241
|
+
# Non-blocking embedding generation from a list of strings.
|
|
242
|
+
#
|
|
243
|
+
# @param texts [Array<String>] List of strings to embed.
|
|
244
|
+
# @param config [Config::Embedding, Hash, nil] Embedding configuration.
|
|
245
|
+
#
|
|
246
|
+
# @return [Array<Array<Float>>] Array of embedding vectors.
|
|
247
|
+
#
|
|
248
|
+
# @raise [Errors::EmbeddingError] If embedding generation fails.
|
|
249
|
+
#
|
|
250
|
+
# @example Generate embeddings asynchronously
|
|
251
|
+
# texts = ["Hello, world!", "Kreuzberg is awesome."]
|
|
252
|
+
# embeddings = Kreuzberg.embed(texts: texts)
|
|
253
|
+
# puts embeddings.first.length # 384
|
|
254
|
+
def embed(texts:, config: nil)
|
|
255
|
+
opts = normalize_config(config)
|
|
256
|
+
native_embed(texts: texts.map(&:to_s), config: opts)
|
|
257
|
+
end
|
|
258
|
+
|
|
259
|
+
# Synchronously generate embeddings for multiple texts.
|
|
260
|
+
#
|
|
261
|
+
# Blocking embedding generation from a list of strings.
|
|
262
|
+
#
|
|
263
|
+
# @param texts [Array<String>] List of strings to embed.
|
|
264
|
+
# @param config [Config::Embedding, Hash, nil] Embedding configuration.
|
|
265
|
+
#
|
|
266
|
+
# @return [Array<Array<Float>>] Array of embedding vectors.
|
|
267
|
+
#
|
|
268
|
+
# @raise [Errors::EmbeddingError] If embedding generation fails.
|
|
269
|
+
def embed_sync(texts:, config: nil)
|
|
270
|
+
opts = normalize_config(config)
|
|
271
|
+
native_embed_sync(texts: texts.map(&:to_s), config: opts)
|
|
272
|
+
end
|
|
273
|
+
|
|
239
274
|
# Synchronously extract content from multiple byte data sources.
|
|
240
275
|
#
|
|
241
276
|
# Processes multiple in-memory binary documents in a single batch operation. Results
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg.rb
CHANGED
|
@@ -59,10 +59,13 @@ module Kreuzberg
|
|
|
59
59
|
alias native_batch_extract_bytes batch_extract_bytes
|
|
60
60
|
alias native_clear_cache clear_cache
|
|
61
61
|
alias native_cache_stats cache_stats
|
|
62
|
+
alias native_embed_sync embed_sync
|
|
63
|
+
alias native_embed embed
|
|
62
64
|
|
|
63
65
|
private :native_extract_file_sync, :native_extract_bytes_sync, :native_batch_extract_files_sync
|
|
64
66
|
private :native_extract_file, :native_extract_bytes, :native_batch_extract_files
|
|
65
67
|
private :native_batch_extract_bytes_sync, :native_batch_extract_bytes
|
|
68
|
+
private :native_embed_sync, :native_embed
|
|
66
69
|
end
|
|
67
70
|
|
|
68
71
|
module_function :register_post_processor
|
|
@@ -94,6 +97,10 @@ module Kreuzberg
|
|
|
94
97
|
module_function :validate_mime_type
|
|
95
98
|
|
|
96
99
|
module_function :get_extensions_for_mime
|
|
100
|
+
|
|
101
|
+
module_function :embed_sync
|
|
102
|
+
|
|
103
|
+
module_function :embed
|
|
97
104
|
end
|
|
98
105
|
|
|
99
106
|
require_relative 'kreuzberg/cache_api'
|
data/lib/kreuzberg_rb.so
CHANGED
|
Binary file
|
data/sig/kreuzberg.rbs
CHANGED
|
@@ -12,6 +12,7 @@ module Kreuzberg
|
|
|
12
12
|
ERROR_CODE_PARSING: Integer
|
|
13
13
|
ERROR_CODE_OCR: Integer
|
|
14
14
|
ERROR_CODE_MISSING_DEPENDENCY: Integer
|
|
15
|
+
ERROR_CODE_EMBEDDING: Integer
|
|
15
16
|
|
|
16
17
|
# Semantic element type classification (T.type_alias)
|
|
17
18
|
type element_type = 'title' | 'narrative_text' | 'heading' | 'list_item' | 'table' | 'image' | 'page_break' | 'code_block' | 'block_quote' | 'footer' | 'header'
|
|
@@ -747,6 +748,16 @@ module Kreuzberg
|
|
|
747
748
|
def to_h: () -> Hash[Symbol, untyped]
|
|
748
749
|
end
|
|
749
750
|
|
|
751
|
+
class ContentFilter
|
|
752
|
+
attr_reader include_headers: bool
|
|
753
|
+
attr_reader include_footers: bool
|
|
754
|
+
attr_reader strip_repeating_text: bool
|
|
755
|
+
attr_reader include_watermarks: bool
|
|
756
|
+
|
|
757
|
+
def initialize: (?include_headers: bool, ?include_footers: bool, ?strip_repeating_text: bool, ?include_watermarks: bool) -> void
|
|
758
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
759
|
+
end
|
|
760
|
+
|
|
750
761
|
class LayoutDetection
|
|
751
762
|
attr_reader preset: String
|
|
752
763
|
attr_reader confidence_threshold: Float?
|
|
@@ -813,6 +824,7 @@ module Kreuzberg
|
|
|
813
824
|
attr_reader concurrency: Concurrency?
|
|
814
825
|
attr_reader acceleration: Acceleration?
|
|
815
826
|
attr_reader email: Email?
|
|
827
|
+
attr_reader content_filter: ContentFilter?
|
|
816
828
|
attr_reader tree_sitter: TreeSitterConfig?
|
|
817
829
|
attr_reader max_concurrent_extractions: Integer?
|
|
818
830
|
attr_reader max_archive_depth: Integer
|
|
@@ -844,6 +856,7 @@ module Kreuzberg
|
|
|
844
856
|
?concurrency: (Concurrency | Hash[Symbol, untyped])?,
|
|
845
857
|
?acceleration: (Acceleration | Hash[Symbol, untyped])?,
|
|
846
858
|
?email: (Email | Hash[Symbol, untyped])?,
|
|
859
|
+
?content_filter: (ContentFilter | Hash[Symbol, untyped])?,
|
|
847
860
|
?tree_sitter: (TreeSitterConfig | Hash[Symbol, untyped])?,
|
|
848
861
|
?max_concurrent_extractions: Integer?,
|
|
849
862
|
?max_archive_depth: Integer,
|
|
@@ -1572,6 +1585,10 @@ module Kreuzberg
|
|
|
1572
1585
|
?file_configs: Array[file_config_input]?
|
|
1573
1586
|
) -> Array[Result]
|
|
1574
1587
|
|
|
1588
|
+
# Standalone embedding
|
|
1589
|
+
def self.embed_sync: (texts: Array[String], ?config: Hash[Symbol, untyped]?) -> Array[Array[Float]]
|
|
1590
|
+
def self.embed: (texts: Array[String], ?config: Hash[Symbol, untyped]?) -> Array[Array[Float]]
|
|
1591
|
+
|
|
1575
1592
|
# Cache API
|
|
1576
1593
|
def self.clear_cache: () -> void
|
|
1577
1594
|
def self.cache_stats: () -> Hash[Symbol | String, Integer]
|
|
@@ -1722,6 +1739,9 @@ module Kreuzberg
|
|
|
1722
1739
|
|
|
1723
1740
|
class UnsupportedFormatError < Error
|
|
1724
1741
|
end
|
|
1742
|
+
|
|
1743
|
+
class EmbeddingError < Error
|
|
1744
|
+
end
|
|
1725
1745
|
end
|
|
1726
1746
|
|
|
1727
1747
|
# Internal modules (prepended to Kreuzberg singleton)
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.7.4
|
|
4
|
+
version: 4.8.0
|
|
5
5
|
platform: aarch64-linux
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-
|
|
11
|
+
date: 2026-04-08 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -223,7 +223,7 @@ files:
|
|
|
223
223
|
- spec/spec_helper.rb
|
|
224
224
|
homepage: https://github.com/kreuzberg-dev/kreuzberg
|
|
225
225
|
licenses:
|
|
226
|
-
-
|
|
226
|
+
- Elastic-2.0
|
|
227
227
|
metadata:
|
|
228
228
|
homepage_uri: https://github.com/kreuzberg-dev/kreuzberg
|
|
229
229
|
source_code_uri: https://github.com/kreuzberg-dev/kreuzberg
|