kreuzberg 4.6.2-aarch64-linux → 4.7.0-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +26 -5
- data/lib/kreuzberg/config.rb +63 -18
- data/lib/kreuzberg/result.rb +43 -6
- data/lib/kreuzberg/types.rb +205 -15
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg_rb.so +0 -0
- data/sig/kreuzberg.rbs +324 -0
- metadata +2 -22
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/metadata_types_spec.rb +0 -1253
- data/spec/serialization_spec.rb +0 -134
- data/spec/smoke/package_spec.rb +0 -199
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -434
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -230
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/output_format_spec.rb +0 -380
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1777d29275333b413764e5417f805de33ad0f9378dbb2a6372d9d573a23ae0e9
|
|
4
|
+
data.tar.gz: 34fad03e39480a52e6a2f91ea4a7a17335eacaf97f8b17d32d440d617842b068
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 93cc0429e0d310125071f7091c3029936683b69c665b499a411fa9cd8df1e22077fb6ff3c009040b3395f6cabc92fcac490db3a2bf013680a9c1393f15245799
|
|
7
|
+
data.tar.gz: e62adbc1ed01632d96f397d89b18050e403d6363d8a2658f4b48da99dc2d284a48cc0891d77cc0afb7483ce9383f3eab6a1367bd2207083b5c1fec48b8543c8e
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.0.0" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -42,13 +42,16 @@
|
|
|
42
42
|
|
|
43
43
|
<!-- Project Info -->
|
|
44
44
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
|
45
|
-
<img src="https://img.shields.io/badge/License-MIT-
|
|
45
|
+
<img src="https://img.shields.io/badge/License-MIT-007ec6" alt="License">
|
|
46
46
|
</a>
|
|
47
47
|
<a href="https://docs.kreuzberg.dev">
|
|
48
|
-
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-
|
|
48
|
+
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-007ec6" alt="Documentation">
|
|
49
|
+
</a>
|
|
50
|
+
<a href="https://docs.kreuzberg.dev/demo.html">
|
|
51
|
+
<img src="https://img.shields.io/badge/%E2%96%B6%EF%B8%8F_Live_Demo-007ec6" alt="Live Demo">
|
|
49
52
|
</a>
|
|
50
53
|
<a href="https://huggingface.co/Kreuzberg">
|
|
51
|
-
<img src="https://img.shields.io/badge/%F0%9F%A4%
|
|
54
|
+
<img src="https://img.shields.io/badge/%F0%9F%A4%97_Hugging_Face-007ec6" alt="Hugging Face">
|
|
52
55
|
</a>
|
|
53
56
|
</div>
|
|
54
57
|
|
|
@@ -61,7 +64,7 @@
|
|
|
61
64
|
</div>
|
|
62
65
|
|
|
63
66
|
|
|
64
|
-
Extract text, tables, images, and metadata from 91+ file formats including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
|
|
67
|
+
Extract text, tables, images, and metadata from 91+ file formats and 248 programming languages including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
|
|
65
68
|
|
|
66
69
|
|
|
67
70
|
## Installation
|
|
@@ -74,6 +77,7 @@ Install via one of the supported package managers:
|
|
|
74
77
|
|
|
75
78
|
|
|
76
79
|
**gem:**
|
|
80
|
+
|
|
77
81
|
```bash
|
|
78
82
|
gem install kreuzberg
|
|
79
83
|
```
|
|
@@ -82,6 +86,7 @@ gem install kreuzberg
|
|
|
82
86
|
|
|
83
87
|
|
|
84
88
|
**Bundler:**
|
|
89
|
+
|
|
85
90
|
```ruby
|
|
86
91
|
gem 'kreuzberg'
|
|
87
92
|
```
|
|
@@ -258,6 +263,19 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
|
|
258
263
|
| **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
|
|
259
264
|
| **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
|
|
260
265
|
|
|
266
|
+
#### Code Intelligence (248 Languages)
|
|
267
|
+
|
|
268
|
+
| Feature | Description |
|
|
269
|
+
|---------|-------------|
|
|
270
|
+
| **Structure Extraction** | Functions, classes, methods, structs, interfaces, enums |
|
|
271
|
+
| **Import/Export Analysis** | Module dependencies, re-exports, wildcard imports |
|
|
272
|
+
| **Symbol Extraction** | Variables, constants, type aliases, properties |
|
|
273
|
+
| **Docstring Parsing** | Google, NumPy, Sphinx, JSDoc, RustDoc, and 10+ formats |
|
|
274
|
+
| **Diagnostics** | Parse errors with line/column positions |
|
|
275
|
+
| **Syntax-Aware Chunking** | Split code by semantic boundaries, not arbitrary byte offsets |
|
|
276
|
+
|
|
277
|
+
Powered by [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — [documentation](https://docs.tree-sitter-language-pack.kreuzberg.dev).
|
|
278
|
+
|
|
261
279
|
**[Complete Format Reference](https://kreuzberg.dev/reference/formats/)**
|
|
262
280
|
|
|
263
281
|
### Key Capabilities
|
|
@@ -279,6 +297,9 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
|
|
279
297
|
- **Batch Processing** - Efficiently process multiple documents in parallel
|
|
280
298
|
- **Memory Efficient** - Stream large files without loading entirely into memory
|
|
281
299
|
- **Language Detection** - Detect and support multiple languages in documents
|
|
300
|
+
|
|
301
|
+
- **Code Intelligence** - Extract structure, imports, exports, symbols, and docstrings from [248 programming languages](https://docs.tree-sitter-language-pack.kreuzberg.dev) via tree-sitter
|
|
302
|
+
|
|
282
303
|
- **Configuration** - Fine-grained control over extraction behavior
|
|
283
304
|
|
|
284
305
|
### Performance Characteristics
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -837,23 +837,41 @@ module Kreuzberg
|
|
|
837
837
|
end
|
|
838
838
|
end
|
|
839
839
|
|
|
840
|
+
# Email extraction configuration
|
|
841
|
+
#
|
|
842
|
+
# @example With fallback codepage
|
|
843
|
+
# email = Email.new(msg_fallback_codepage: 1251)
|
|
844
|
+
#
|
|
845
|
+
class Email
|
|
846
|
+
attr_reader :msg_fallback_codepage
|
|
847
|
+
|
|
848
|
+
def initialize(msg_fallback_codepage: nil)
|
|
849
|
+
@msg_fallback_codepage = msg_fallback_codepage&.to_i
|
|
850
|
+
end
|
|
851
|
+
|
|
852
|
+
def to_h
|
|
853
|
+
h = {}
|
|
854
|
+
h[:msg_fallback_codepage] = @msg_fallback_codepage unless @msg_fallback_codepage.nil?
|
|
855
|
+
h
|
|
856
|
+
end
|
|
857
|
+
end
|
|
858
|
+
|
|
840
859
|
# Layout detection configuration
|
|
841
860
|
#
|
|
842
|
-
# @example Basic usage
|
|
843
|
-
# layout = LayoutDetection.new
|
|
861
|
+
# @example Basic usage
|
|
862
|
+
# layout = LayoutDetection.new
|
|
844
863
|
#
|
|
845
|
-
# @example
|
|
864
|
+
# @example With custom threshold and table model
|
|
846
865
|
# layout = LayoutDetection.new(
|
|
847
|
-
# preset: "accurate",
|
|
848
866
|
# confidence_threshold: 0.5,
|
|
849
|
-
# apply_heuristics: true
|
|
867
|
+
# apply_heuristics: true,
|
|
868
|
+
# table_model: "tatr"
|
|
850
869
|
# )
|
|
851
870
|
#
|
|
852
871
|
class LayoutDetection
|
|
853
|
-
attr_reader :
|
|
872
|
+
attr_reader :confidence_threshold, :apply_heuristics, :table_model
|
|
854
873
|
|
|
855
|
-
def initialize(
|
|
856
|
-
@preset = preset.to_s
|
|
874
|
+
def initialize(confidence_threshold: nil, apply_heuristics: true, table_model: nil)
|
|
857
875
|
@confidence_threshold = confidence_threshold&.to_f
|
|
858
876
|
@apply_heuristics = apply_heuristics ? true : false
|
|
859
877
|
@table_model = table_model&.to_s
|
|
@@ -861,7 +879,6 @@ module Kreuzberg
|
|
|
861
879
|
|
|
862
880
|
def to_h
|
|
863
881
|
{
|
|
864
|
-
preset: @preset,
|
|
865
882
|
confidence_threshold: @confidence_threshold,
|
|
866
883
|
apply_heuristics: @apply_heuristics,
|
|
867
884
|
table_model: @table_model
|
|
@@ -926,14 +943,15 @@ module Kreuzberg
|
|
|
926
943
|
# )
|
|
927
944
|
#
|
|
928
945
|
class Extraction
|
|
929
|
-
attr_reader :use_cache, :enable_quality_processing, :force_ocr, :force_ocr_pages,
|
|
946
|
+
attr_reader :use_cache, :enable_quality_processing, :force_ocr, :disable_ocr, :force_ocr_pages,
|
|
930
947
|
:include_document_structure,
|
|
931
948
|
:ocr, :chunking, :language_detection, :pdf_options,
|
|
932
949
|
:images, :postprocessor,
|
|
933
950
|
:token_reduction, :keywords, :html_options, :pages,
|
|
934
951
|
:max_concurrent_extractions, :output_format, :result_format,
|
|
935
952
|
:security_limits, :layout, :concurrency,
|
|
936
|
-
:cache_namespace, :cache_ttl_secs, :extraction_timeout_secs
|
|
953
|
+
:cache_namespace, :cache_ttl_secs, :extraction_timeout_secs,
|
|
954
|
+
:max_archive_depth, :acceleration, :email
|
|
937
955
|
|
|
938
956
|
# Alias for backward compatibility - image_extraction is the canonical name
|
|
939
957
|
alias image_extraction images
|
|
@@ -954,11 +972,12 @@ module Kreuzberg
|
|
|
954
972
|
#
|
|
955
973
|
# Keys that are allowed in the Extraction config
|
|
956
974
|
ALLOWED_KEYS = %i[
|
|
957
|
-
use_cache enable_quality_processing force_ocr force_ocr_pages
|
|
958
|
-
language_detection pdf_options image_extraction
|
|
975
|
+
use_cache enable_quality_processing force_ocr disable_ocr force_ocr_pages
|
|
976
|
+
include_document_structure ocr chunking language_detection pdf_options image_extraction
|
|
959
977
|
postprocessor token_reduction keywords html_options pages
|
|
960
978
|
max_concurrent_extractions output_format result_format
|
|
961
979
|
security_limits layout concurrency cache_namespace cache_ttl_secs extraction_timeout_secs
|
|
980
|
+
max_archive_depth acceleration email
|
|
962
981
|
].freeze
|
|
963
982
|
|
|
964
983
|
# Aliases for backward compatibility
|
|
@@ -1015,10 +1034,11 @@ module Kreuzberg
|
|
|
1015
1034
|
new(**normalize_hash_keys(hash))
|
|
1016
1035
|
end
|
|
1017
1036
|
|
|
1018
|
-
def initialize(hash = nil,
|
|
1037
|
+
def initialize(hash = nil, # rubocop:disable Metrics/MethodLength
|
|
1019
1038
|
use_cache: true,
|
|
1020
1039
|
enable_quality_processing: true,
|
|
1021
1040
|
force_ocr: false,
|
|
1041
|
+
disable_ocr: false,
|
|
1022
1042
|
force_ocr_pages: nil,
|
|
1023
1043
|
include_document_structure: false,
|
|
1024
1044
|
ocr: nil,
|
|
@@ -1039,10 +1059,13 @@ module Kreuzberg
|
|
|
1039
1059
|
concurrency: nil,
|
|
1040
1060
|
cache_namespace: nil,
|
|
1041
1061
|
cache_ttl_secs: nil,
|
|
1042
|
-
extraction_timeout_secs: nil
|
|
1062
|
+
extraction_timeout_secs: nil,
|
|
1063
|
+
max_archive_depth: 3,
|
|
1064
|
+
acceleration: nil,
|
|
1065
|
+
email: nil)
|
|
1043
1066
|
kwargs = {
|
|
1044
1067
|
use_cache: use_cache, enable_quality_processing: enable_quality_processing,
|
|
1045
|
-
force_ocr: force_ocr, force_ocr_pages: force_ocr_pages,
|
|
1068
|
+
force_ocr: force_ocr, disable_ocr: disable_ocr, force_ocr_pages: force_ocr_pages,
|
|
1046
1069
|
include_document_structure: include_document_structure,
|
|
1047
1070
|
ocr: ocr, chunking: chunking, language_detection: language_detection,
|
|
1048
1071
|
pdf_options: pdf_options, image_extraction: image_extraction,
|
|
@@ -1054,7 +1077,10 @@ module Kreuzberg
|
|
|
1054
1077
|
concurrency: concurrency,
|
|
1055
1078
|
cache_namespace: cache_namespace,
|
|
1056
1079
|
cache_ttl_secs: cache_ttl_secs,
|
|
1057
|
-
extraction_timeout_secs: extraction_timeout_secs
|
|
1080
|
+
extraction_timeout_secs: extraction_timeout_secs,
|
|
1081
|
+
max_archive_depth: max_archive_depth,
|
|
1082
|
+
acceleration: acceleration,
|
|
1083
|
+
email: email
|
|
1058
1084
|
}
|
|
1059
1085
|
extracted = extract_from_hash(hash, kwargs)
|
|
1060
1086
|
|
|
@@ -1072,6 +1098,7 @@ module Kreuzberg
|
|
|
1072
1098
|
@use_cache = params[:use_cache] ? true : false
|
|
1073
1099
|
@enable_quality_processing = params[:enable_quality_processing] ? true : false
|
|
1074
1100
|
@force_ocr = params[:force_ocr] ? true : false
|
|
1101
|
+
@disable_ocr = params[:disable_ocr] ? true : false
|
|
1075
1102
|
@force_ocr_pages = params[:force_ocr_pages]
|
|
1076
1103
|
@include_document_structure = params[:include_document_structure] ? true : false
|
|
1077
1104
|
@ocr = normalize_config(params[:ocr], OCR)
|
|
@@ -1086,7 +1113,10 @@ module Kreuzberg
|
|
|
1086
1113
|
@pages = normalize_config(params[:pages], PageConfig)
|
|
1087
1114
|
@layout = normalize_config(params[:layout], LayoutDetection)
|
|
1088
1115
|
@concurrency = normalize_config(params[:concurrency], Concurrency)
|
|
1116
|
+
@acceleration = normalize_config(params[:acceleration], Acceleration)
|
|
1117
|
+
@email = normalize_config(params[:email], Email)
|
|
1089
1118
|
@max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
|
|
1119
|
+
@max_archive_depth = params[:max_archive_depth]&.to_i || 3
|
|
1090
1120
|
@output_format = validate_output_format(params[:output_format])
|
|
1091
1121
|
@result_format = validate_result_format(params[:result_format])
|
|
1092
1122
|
@cache_namespace = params[:cache_namespace]
|
|
@@ -1124,9 +1154,11 @@ module Kreuzberg
|
|
|
1124
1154
|
use_cache: @use_cache,
|
|
1125
1155
|
enable_quality_processing: @enable_quality_processing,
|
|
1126
1156
|
force_ocr: @force_ocr,
|
|
1157
|
+
disable_ocr: @disable_ocr,
|
|
1127
1158
|
force_ocr_pages: @force_ocr_pages,
|
|
1128
1159
|
include_document_structure: @include_document_structure,
|
|
1129
1160
|
max_concurrent_extractions: @max_concurrent_extractions,
|
|
1161
|
+
max_archive_depth: @max_archive_depth,
|
|
1130
1162
|
output_format: @output_format,
|
|
1131
1163
|
result_format: @result_format,
|
|
1132
1164
|
cache_namespace: @cache_namespace,
|
|
@@ -1142,7 +1174,8 @@ module Kreuzberg
|
|
|
1142
1174
|
image_extraction: @images&.to_h, postprocessor: @postprocessor&.to_h,
|
|
1143
1175
|
token_reduction: @token_reduction&.to_h, keywords: @keywords&.to_h,
|
|
1144
1176
|
html_options: @html_options&.to_h, pages: @pages&.to_h,
|
|
1145
|
-
layout: @layout&.to_h, concurrency: @concurrency&.to_h
|
|
1177
|
+
layout: @layout&.to_h, concurrency: @concurrency&.to_h,
|
|
1178
|
+
acceleration: @acceleration&.to_h, email: @email&.to_h
|
|
1146
1179
|
}
|
|
1147
1180
|
end
|
|
1148
1181
|
|
|
@@ -1258,6 +1291,8 @@ module Kreuzberg
|
|
|
1258
1291
|
@enable_quality_processing = value ? true : false
|
|
1259
1292
|
when :force_ocr
|
|
1260
1293
|
@force_ocr = value ? true : false
|
|
1294
|
+
when :disable_ocr
|
|
1295
|
+
@disable_ocr = value ? true : false
|
|
1261
1296
|
when :force_ocr_pages
|
|
1262
1297
|
@force_ocr_pages = value
|
|
1263
1298
|
when :include_document_structure
|
|
@@ -1286,6 +1321,12 @@ module Kreuzberg
|
|
|
1286
1321
|
@layout = normalize_config(value, LayoutDetection)
|
|
1287
1322
|
when :concurrency
|
|
1288
1323
|
@concurrency = normalize_config(value, Concurrency)
|
|
1324
|
+
when :acceleration
|
|
1325
|
+
@acceleration = normalize_config(value, Acceleration)
|
|
1326
|
+
when :email
|
|
1327
|
+
@email = normalize_config(value, Email)
|
|
1328
|
+
when :max_archive_depth
|
|
1329
|
+
@max_archive_depth = value&.to_i || 3
|
|
1289
1330
|
when :max_concurrent_extractions
|
|
1290
1331
|
@max_concurrent_extractions = value&.to_i
|
|
1291
1332
|
when :output_format
|
|
@@ -1357,6 +1398,7 @@ module Kreuzberg
|
|
|
1357
1398
|
@use_cache = merged.use_cache
|
|
1358
1399
|
@enable_quality_processing = merged.enable_quality_processing
|
|
1359
1400
|
@force_ocr = merged.force_ocr
|
|
1401
|
+
@disable_ocr = merged.disable_ocr
|
|
1360
1402
|
@force_ocr_pages = merged.force_ocr_pages
|
|
1361
1403
|
@include_document_structure = merged.include_document_structure
|
|
1362
1404
|
@ocr = merged.ocr
|
|
@@ -1373,6 +1415,9 @@ module Kreuzberg
|
|
|
1373
1415
|
@html_options = merged.html_options
|
|
1374
1416
|
@pages = merged.pages
|
|
1375
1417
|
@layout = merged.layout
|
|
1418
|
+
@acceleration = merged.acceleration
|
|
1419
|
+
@email = merged.email
|
|
1420
|
+
@max_archive_depth = merged.max_archive_depth
|
|
1376
1421
|
end
|
|
1377
1422
|
|
|
1378
1423
|
def update_output_options(merged)
|
data/lib/kreuzberg/result.rb
CHANGED
|
@@ -14,7 +14,8 @@ module Kreuzberg
|
|
|
14
14
|
class Result
|
|
15
15
|
attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
|
|
16
16
|
:detected_languages, :chunks, :images, :pages, :elements, :ocr_elements, :djot_content,
|
|
17
|
-
:document, :extracted_keywords, :quality_score, :processing_warnings, :annotations
|
|
17
|
+
:document, :extracted_keywords, :quality_score, :processing_warnings, :annotations,
|
|
18
|
+
:uris, :children
|
|
18
19
|
|
|
19
20
|
# @!attribute [r] cells
|
|
20
21
|
# @return [Array<Array<String>>] Table cells (2D array)
|
|
@@ -51,6 +52,7 @@ module Kreuzberg
|
|
|
51
52
|
:total_chunks,
|
|
52
53
|
:first_page,
|
|
53
54
|
:last_page,
|
|
55
|
+
:chunk_type,
|
|
54
56
|
:embedding
|
|
55
57
|
) do
|
|
56
58
|
def to_h
|
|
@@ -63,6 +65,7 @@ module Kreuzberg
|
|
|
63
65
|
total_chunks: total_chunks,
|
|
64
66
|
first_page: first_page,
|
|
65
67
|
last_page: last_page,
|
|
68
|
+
chunk_type: chunk_type,
|
|
66
69
|
embedding: embedding
|
|
67
70
|
}
|
|
68
71
|
end
|
|
@@ -318,7 +321,7 @@ module Kreuzberg
|
|
|
318
321
|
#
|
|
319
322
|
# @param hash [Hash] Hash returned from native extension
|
|
320
323
|
#
|
|
321
|
-
# rubocop:disable Metrics/AbcSize
|
|
324
|
+
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
|
322
325
|
def initialize(hash)
|
|
323
326
|
@content = get_value(hash, 'content', '')
|
|
324
327
|
@mime_type = get_value(hash, 'mime_type', '')
|
|
@@ -337,14 +340,16 @@ module Kreuzberg
|
|
|
337
340
|
@quality_score = get_value(hash, 'quality_score')
|
|
338
341
|
@processing_warnings = parse_processing_warnings(get_value(hash, 'processing_warnings'))
|
|
339
342
|
@annotations = parse_annotations(get_value(hash, 'annotations'))
|
|
343
|
+
@uris = parse_uris(get_value(hash, 'uris'))
|
|
344
|
+
@children = parse_children(get_value(hash, 'children'))
|
|
340
345
|
end
|
|
341
|
-
# rubocop:enable Metrics/AbcSize
|
|
346
|
+
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
|
342
347
|
|
|
343
348
|
# Convert to hash
|
|
344
349
|
#
|
|
345
350
|
# @return [Hash] Hash representation
|
|
346
351
|
#
|
|
347
|
-
# rubocop:disable Metrics/CyclomaticComplexity
|
|
352
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength
|
|
348
353
|
def to_h
|
|
349
354
|
{
|
|
350
355
|
content: @content,
|
|
@@ -362,10 +367,12 @@ module Kreuzberg
|
|
|
362
367
|
extracted_keywords: @extracted_keywords&.map(&:to_h),
|
|
363
368
|
quality_score: @quality_score,
|
|
364
369
|
processing_warnings: @processing_warnings.map(&:to_h),
|
|
365
|
-
annotations: @annotations&.map(&:to_h)
|
|
370
|
+
annotations: @annotations&.map(&:to_h),
|
|
371
|
+
uris: @uris&.map(&:to_h),
|
|
372
|
+
children: @children&.map(&:to_h)
|
|
366
373
|
}
|
|
367
374
|
end
|
|
368
|
-
# rubocop:enable Metrics/CyclomaticComplexity
|
|
375
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength
|
|
369
376
|
|
|
370
377
|
# Convert to JSON
|
|
371
378
|
#
|
|
@@ -520,6 +527,7 @@ module Kreuzberg
|
|
|
520
527
|
total_chunks: chunk_hash['total_chunks'],
|
|
521
528
|
first_page: chunk_hash['first_page'],
|
|
522
529
|
last_page: chunk_hash['last_page'],
|
|
530
|
+
chunk_type: chunk_hash['chunk_type'],
|
|
523
531
|
embedding: chunk_hash['embedding']
|
|
524
532
|
)
|
|
525
533
|
end
|
|
@@ -738,6 +746,35 @@ module Kreuzberg
|
|
|
738
746
|
def bbox_field(bbox_data, primary_key, fallback_key)
|
|
739
747
|
(bbox_data[primary_key] || bbox_data[fallback_key])&.to_f
|
|
740
748
|
end
|
|
749
|
+
|
|
750
|
+
def parse_uris(uris_data)
|
|
751
|
+
return nil if uris_data.nil?
|
|
752
|
+
|
|
753
|
+
uris_data.map { |u| build_uri(u) }
|
|
754
|
+
end
|
|
755
|
+
|
|
756
|
+
def build_uri(u_hash)
|
|
757
|
+
Struct.new(:url, :label, :page, :kind).new(
|
|
758
|
+
url: u_hash['url'] || '',
|
|
759
|
+
label: u_hash['label'],
|
|
760
|
+
page: u_hash['page']&.to_i,
|
|
761
|
+
kind: u_hash['kind'] || 'hyperlink'
|
|
762
|
+
)
|
|
763
|
+
end
|
|
764
|
+
|
|
765
|
+
def parse_children(children_data)
|
|
766
|
+
return nil if children_data.nil?
|
|
767
|
+
|
|
768
|
+
children_data.map { |c| build_archive_entry(c) }
|
|
769
|
+
end
|
|
770
|
+
|
|
771
|
+
def build_archive_entry(c_hash)
|
|
772
|
+
Struct.new(:path, :mime_type, :result).new(
|
|
773
|
+
path: c_hash['path'] || '',
|
|
774
|
+
mime_type: c_hash['mime_type'] || '',
|
|
775
|
+
result: c_hash['result'] ? self.class.new(c_hash['result']) : nil
|
|
776
|
+
)
|
|
777
|
+
end
|
|
741
778
|
end
|
|
742
779
|
# rubocop:enable Metrics/ClassLength
|
|
743
780
|
end
|
data/lib/kreuzberg/types.rb
CHANGED
|
@@ -10,21 +10,24 @@ module Kreuzberg
|
|
|
10
10
|
#
|
|
11
11
|
# @example
|
|
12
12
|
# type = Kreuzberg::ElementType::TITLE
|
|
13
|
-
#
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
13
|
+
# Kreuzberg::ElementType.values # => ["title", "narrative_text", ...]
|
|
14
|
+
#
|
|
15
|
+
module ElementType
|
|
16
|
+
TITLE = 'title'
|
|
17
|
+
NARRATIVE_TEXT = 'narrative_text'
|
|
18
|
+
HEADING = 'heading'
|
|
19
|
+
LIST_ITEM = 'list_item'
|
|
20
|
+
TABLE = 'table'
|
|
21
|
+
IMAGE = 'image'
|
|
22
|
+
PAGE_BREAK = 'page_break'
|
|
23
|
+
CODE_BLOCK = 'code_block'
|
|
24
|
+
BLOCK_QUOTE = 'block_quote'
|
|
25
|
+
FOOTER = 'footer'
|
|
26
|
+
HEADER = 'header'
|
|
27
|
+
|
|
28
|
+
def self.values
|
|
29
|
+
[TITLE, NARRATIVE_TEXT, HEADING, LIST_ITEM, TABLE, IMAGE, PAGE_BREAK, CODE_BLOCK, BLOCK_QUOTE, FOOTER, HEADER]
|
|
30
|
+
end
|
|
28
31
|
end
|
|
29
32
|
|
|
30
33
|
# Bounding box coordinates for element positioning.
|
|
@@ -431,4 +434,191 @@ module Kreuzberg
|
|
|
431
434
|
const :page_number, T.nilable(Integer)
|
|
432
435
|
const :bounding_box, T.nilable(PdfAnnotationBoundingBox)
|
|
433
436
|
end
|
|
437
|
+
|
|
438
|
+
# An entry within an archive (zip, tar, etc.) extraction result.
|
|
439
|
+
#
|
|
440
|
+
# @example
|
|
441
|
+
# entry = Kreuzberg::ArchiveEntry.new(
|
|
442
|
+
# path: "readme.txt",
|
|
443
|
+
# mime_type: "text/plain",
|
|
444
|
+
# result: extraction_result
|
|
445
|
+
# )
|
|
446
|
+
#
|
|
447
|
+
class ArchiveEntry < T::Struct
|
|
448
|
+
extend T::Sig
|
|
449
|
+
|
|
450
|
+
const :path, String
|
|
451
|
+
const :mime_type, String
|
|
452
|
+
const :result, T.untyped
|
|
453
|
+
end
|
|
454
|
+
|
|
455
|
+
# Extracted keyword with relevance metadata.
|
|
456
|
+
#
|
|
457
|
+
# @example
|
|
458
|
+
# kw = Kreuzberg::Keyword.new(
|
|
459
|
+
# text: "machine learning",
|
|
460
|
+
# score: 0.95,
|
|
461
|
+
# algorithm: "yake",
|
|
462
|
+
# positions: [42, 128]
|
|
463
|
+
# )
|
|
464
|
+
#
|
|
465
|
+
class Keyword < T::Struct
|
|
466
|
+
extend T::Sig
|
|
467
|
+
|
|
468
|
+
const :text, String
|
|
469
|
+
const :score, Float
|
|
470
|
+
const :algorithm, String
|
|
471
|
+
const :positions, T.nilable(T::Array[Integer])
|
|
472
|
+
end
|
|
473
|
+
|
|
474
|
+
# A table extracted from a document.
|
|
475
|
+
#
|
|
476
|
+
# @example
|
|
477
|
+
# table = Kreuzberg::Table.new(
|
|
478
|
+
# cells: [["A", "B"], ["1", "2"]],
|
|
479
|
+
# markdown: "| A | B |\n|---|---|\n| 1 | 2 |",
|
|
480
|
+
# page_number: 1,
|
|
481
|
+
# bounding_box: bbox
|
|
482
|
+
# )
|
|
483
|
+
#
|
|
484
|
+
class Table < T::Struct
|
|
485
|
+
extend T::Sig
|
|
486
|
+
|
|
487
|
+
const :cells, T::Array[T::Array[String]]
|
|
488
|
+
const :markdown, String
|
|
489
|
+
const :page_number, Integer
|
|
490
|
+
const :bounding_box, T.nilable(BoundingBox)
|
|
491
|
+
end
|
|
492
|
+
|
|
493
|
+
# A URI extracted from a document.
|
|
494
|
+
#
|
|
495
|
+
# @example
|
|
496
|
+
# uri = Kreuzberg::Uri.new(
|
|
497
|
+
# url: "https://example.com",
|
|
498
|
+
# kind: "hyperlink",
|
|
499
|
+
# label: "Example",
|
|
500
|
+
# page: 1
|
|
501
|
+
# )
|
|
502
|
+
#
|
|
503
|
+
class Uri < T::Struct
|
|
504
|
+
extend T::Sig
|
|
505
|
+
|
|
506
|
+
const :url, String
|
|
507
|
+
const :kind, String
|
|
508
|
+
const :label, T.nilable(String)
|
|
509
|
+
const :page, T.nilable(Integer)
|
|
510
|
+
end
|
|
511
|
+
|
|
512
|
+
# Content layer classification for document nodes.
|
|
513
|
+
module ContentLayer
|
|
514
|
+
BODY = 'body'
|
|
515
|
+
HEADER = 'header'
|
|
516
|
+
FOOTER = 'footer'
|
|
517
|
+
FOOTNOTE = 'footnote'
|
|
518
|
+
|
|
519
|
+
def self.values
|
|
520
|
+
[BODY, HEADER, FOOTER, FOOTNOTE]
|
|
521
|
+
end
|
|
522
|
+
end
|
|
523
|
+
|
|
524
|
+
# Algorithm used for keyword extraction.
|
|
525
|
+
module KeywordAlgorithm
|
|
526
|
+
YAKE = 'yake'
|
|
527
|
+
RAKE = 'rake'
|
|
528
|
+
|
|
529
|
+
def self.values
|
|
530
|
+
[YAKE, RAKE]
|
|
531
|
+
end
|
|
532
|
+
end
|
|
533
|
+
|
|
534
|
+
# OCR element granularity level.
|
|
535
|
+
module OcrElementLevel
|
|
536
|
+
WORD = 'word'
|
|
537
|
+
LINE = 'line'
|
|
538
|
+
BLOCK = 'block'
|
|
539
|
+
PAGE = 'page'
|
|
540
|
+
|
|
541
|
+
def self.values
|
|
542
|
+
[WORD, LINE, BLOCK, PAGE]
|
|
543
|
+
end
|
|
544
|
+
end
|
|
545
|
+
|
|
546
|
+
# Output format for extraction results.
|
|
547
|
+
module OutputFormat
|
|
548
|
+
PLAIN = 'plain'
|
|
549
|
+
MARKDOWN = 'markdown'
|
|
550
|
+
DJOT = 'djot'
|
|
551
|
+
HTML = 'html'
|
|
552
|
+
JSON = 'json'
|
|
553
|
+
STRUCTURED = 'structured'
|
|
554
|
+
|
|
555
|
+
def self.values
|
|
556
|
+
[PLAIN, MARKDOWN, DJOT, HTML, JSON, STRUCTURED]
|
|
557
|
+
end
|
|
558
|
+
end
|
|
559
|
+
|
|
560
|
+
# Page unit type classification.
|
|
561
|
+
module PageUnitType
|
|
562
|
+
PAGE = 'page'
|
|
563
|
+
SLIDE = 'slide'
|
|
564
|
+
SHEET = 'sheet'
|
|
565
|
+
|
|
566
|
+
def self.values
|
|
567
|
+
[PAGE, SLIDE, SHEET]
|
|
568
|
+
end
|
|
569
|
+
end
|
|
570
|
+
|
|
571
|
+
# PDF annotation type classification.
|
|
572
|
+
module PdfAnnotationType
|
|
573
|
+
TEXT = 'text'
|
|
574
|
+
HIGHLIGHT = 'highlight'
|
|
575
|
+
LINK = 'link'
|
|
576
|
+
STAMP = 'stamp'
|
|
577
|
+
UNDERLINE = 'underline'
|
|
578
|
+
STRIKE_OUT = 'strike_out'
|
|
579
|
+
OTHER = 'other'
|
|
580
|
+
|
|
581
|
+
def self.values
|
|
582
|
+
[TEXT, HIGHLIGHT, LINK, STAMP, UNDERLINE, STRIKE_OUT, OTHER]
|
|
583
|
+
end
|
|
584
|
+
end
|
|
585
|
+
|
|
586
|
+
# Relationship kind between document elements.
|
|
587
|
+
module RelationshipKind
|
|
588
|
+
FOOTNOTE_REFERENCE = 'footnote_reference'
|
|
589
|
+
CITATION_REFERENCE = 'citation_reference'
|
|
590
|
+
INTERNAL_LINK = 'internal_link'
|
|
591
|
+
CAPTION = 'caption'
|
|
592
|
+
LABEL = 'label'
|
|
593
|
+
TOC_ENTRY = 'toc_entry'
|
|
594
|
+
CROSS_REFERENCE = 'cross_reference'
|
|
595
|
+
|
|
596
|
+
def self.values
|
|
597
|
+
[FOOTNOTE_REFERENCE, CITATION_REFERENCE, INTERNAL_LINK, CAPTION, LABEL, TOC_ENTRY, CROSS_REFERENCE]
|
|
598
|
+
end
|
|
599
|
+
end
|
|
600
|
+
|
|
601
|
+
# Result format classification.
|
|
602
|
+
module ResultFormat
|
|
603
|
+
UNIFIED = 'unified'
|
|
604
|
+
ELEMENT_BASED = 'element_based'
|
|
605
|
+
|
|
606
|
+
def self.values
|
|
607
|
+
[UNIFIED, ELEMENT_BASED]
|
|
608
|
+
end
|
|
609
|
+
end
|
|
610
|
+
|
|
611
|
+
# URI kind classification.
|
|
612
|
+
module UriKind
|
|
613
|
+
HYPERLINK = 'hyperlink'
|
|
614
|
+
IMAGE = 'image'
|
|
615
|
+
ANCHOR = 'anchor'
|
|
616
|
+
CITATION = 'citation'
|
|
617
|
+
REFERENCE = 'reference'
|
|
618
|
+
EMAIL = 'email'
|
|
619
|
+
|
|
620
|
+
def self.values
|
|
621
|
+
[HYPERLINK, IMAGE, ANCHOR, CITATION, REFERENCE, EMAIL]
|
|
622
|
+
end
|
|
623
|
+
end
|
|
434
624
|
end
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg_rb.so
CHANGED
|
Binary file
|