kreuzberg 4.6.3-aarch64-linux → 4.7.0-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +26 -5
- data/lib/kreuzberg/config.rb +17 -13
- data/lib/kreuzberg/result.rb +43 -6
- data/lib/kreuzberg/types.rb +205 -15
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg_rb.so +0 -0
- data/sig/kreuzberg.rbs +303 -0
- metadata +2 -22
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/metadata_types_spec.rb +0 -1253
- data/spec/serialization_spec.rb +0 -134
- data/spec/smoke/package_spec.rb +0 -199
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -434
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -230
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/output_format_spec.rb +0 -380
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1777d29275333b413764e5417f805de33ad0f9378dbb2a6372d9d573a23ae0e9
|
|
4
|
+
data.tar.gz: 34fad03e39480a52e6a2f91ea4a7a17335eacaf97f8b17d32d440d617842b068
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 93cc0429e0d310125071f7091c3029936683b69c665b499a411fa9cd8df1e22077fb6ff3c009040b3395f6cabc92fcac490db3a2bf013680a9c1393f15245799
|
|
7
|
+
data.tar.gz: e62adbc1ed01632d96f397d89b18050e403d6363d8a2658f4b48da99dc2d284a48cc0891d77cc0afb7483ce9383f3eab6a1367bd2207083b5c1fec48b8543c8e
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.0.0" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -42,13 +42,16 @@
|
|
|
42
42
|
|
|
43
43
|
<!-- Project Info -->
|
|
44
44
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
|
45
|
-
<img src="https://img.shields.io/badge/License-MIT-
|
|
45
|
+
<img src="https://img.shields.io/badge/License-MIT-007ec6" alt="License">
|
|
46
46
|
</a>
|
|
47
47
|
<a href="https://docs.kreuzberg.dev">
|
|
48
|
-
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-
|
|
48
|
+
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-007ec6" alt="Documentation">
|
|
49
|
+
</a>
|
|
50
|
+
<a href="https://docs.kreuzberg.dev/demo.html">
|
|
51
|
+
<img src="https://img.shields.io/badge/%E2%96%B6%EF%B8%8F_Live_Demo-007ec6" alt="Live Demo">
|
|
49
52
|
</a>
|
|
50
53
|
<a href="https://huggingface.co/Kreuzberg">
|
|
51
|
-
<img src="https://img.shields.io/badge/%F0%9F%A4%
|
|
54
|
+
<img src="https://img.shields.io/badge/%F0%9F%A4%97_Hugging_Face-007ec6" alt="Hugging Face">
|
|
52
55
|
</a>
|
|
53
56
|
</div>
|
|
54
57
|
|
|
@@ -61,7 +64,7 @@
|
|
|
61
64
|
</div>
|
|
62
65
|
|
|
63
66
|
|
|
64
|
-
Extract text, tables, images, and metadata from 91+ file formats including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
|
|
67
|
+
Extract text, tables, images, and metadata from 91+ file formats and 248 programming languages including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
|
|
65
68
|
|
|
66
69
|
|
|
67
70
|
## Installation
|
|
@@ -74,6 +77,7 @@ Install via one of the supported package managers:
|
|
|
74
77
|
|
|
75
78
|
|
|
76
79
|
**gem:**
|
|
80
|
+
|
|
77
81
|
```bash
|
|
78
82
|
gem install kreuzberg
|
|
79
83
|
```
|
|
@@ -82,6 +86,7 @@ gem install kreuzberg
|
|
|
82
86
|
|
|
83
87
|
|
|
84
88
|
**Bundler:**
|
|
89
|
+
|
|
85
90
|
```ruby
|
|
86
91
|
gem 'kreuzberg'
|
|
87
92
|
```
|
|
@@ -258,6 +263,19 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
|
|
258
263
|
| **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
|
|
259
264
|
| **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
|
|
260
265
|
|
|
266
|
+
#### Code Intelligence (248 Languages)
|
|
267
|
+
|
|
268
|
+
| Feature | Description |
|
|
269
|
+
|---------|-------------|
|
|
270
|
+
| **Structure Extraction** | Functions, classes, methods, structs, interfaces, enums |
|
|
271
|
+
| **Import/Export Analysis** | Module dependencies, re-exports, wildcard imports |
|
|
272
|
+
| **Symbol Extraction** | Variables, constants, type aliases, properties |
|
|
273
|
+
| **Docstring Parsing** | Google, NumPy, Sphinx, JSDoc, RustDoc, and 10+ formats |
|
|
274
|
+
| **Diagnostics** | Parse errors with line/column positions |
|
|
275
|
+
| **Syntax-Aware Chunking** | Split code by semantic boundaries, not arbitrary byte offsets |
|
|
276
|
+
|
|
277
|
+
Powered by [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — [documentation](https://docs.tree-sitter-language-pack.kreuzberg.dev).
|
|
278
|
+
|
|
261
279
|
**[Complete Format Reference](https://kreuzberg.dev/reference/formats/)**
|
|
262
280
|
|
|
263
281
|
### Key Capabilities
|
|
@@ -279,6 +297,9 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
|
|
279
297
|
- **Batch Processing** - Efficiently process multiple documents in parallel
|
|
280
298
|
- **Memory Efficient** - Stream large files without loading entirely into memory
|
|
281
299
|
- **Language Detection** - Detect and support multiple languages in documents
|
|
300
|
+
|
|
301
|
+
- **Code Intelligence** - Extract structure, imports, exports, symbols, and docstrings from [248 programming languages](https://docs.tree-sitter-language-pack.kreuzberg.dev) via tree-sitter
|
|
302
|
+
|
|
282
303
|
- **Configuration** - Fine-grained control over extraction behavior
|
|
283
304
|
|
|
284
305
|
### Performance Characteristics
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -858,21 +858,20 @@ module Kreuzberg
|
|
|
858
858
|
|
|
859
859
|
# Layout detection configuration
|
|
860
860
|
#
|
|
861
|
-
# @example Basic usage
|
|
862
|
-
# layout = LayoutDetection.new
|
|
861
|
+
# @example Basic usage
|
|
862
|
+
# layout = LayoutDetection.new
|
|
863
863
|
#
|
|
864
|
-
# @example
|
|
864
|
+
# @example With custom threshold and table model
|
|
865
865
|
# layout = LayoutDetection.new(
|
|
866
|
-
# preset: "accurate",
|
|
867
866
|
# confidence_threshold: 0.5,
|
|
868
|
-
# apply_heuristics: true
|
|
867
|
+
# apply_heuristics: true,
|
|
868
|
+
# table_model: "tatr"
|
|
869
869
|
# )
|
|
870
870
|
#
|
|
871
871
|
class LayoutDetection
|
|
872
|
-
attr_reader :
|
|
872
|
+
attr_reader :confidence_threshold, :apply_heuristics, :table_model
|
|
873
873
|
|
|
874
|
-
def initialize(
|
|
875
|
-
@preset = preset.to_s
|
|
874
|
+
def initialize(confidence_threshold: nil, apply_heuristics: true, table_model: nil)
|
|
876
875
|
@confidence_threshold = confidence_threshold&.to_f
|
|
877
876
|
@apply_heuristics = apply_heuristics ? true : false
|
|
878
877
|
@table_model = table_model&.to_s
|
|
@@ -880,7 +879,6 @@ module Kreuzberg
|
|
|
880
879
|
|
|
881
880
|
def to_h
|
|
882
881
|
{
|
|
883
|
-
preset: @preset,
|
|
884
882
|
confidence_threshold: @confidence_threshold,
|
|
885
883
|
apply_heuristics: @apply_heuristics,
|
|
886
884
|
table_model: @table_model
|
|
@@ -945,7 +943,7 @@ module Kreuzberg
|
|
|
945
943
|
# )
|
|
946
944
|
#
|
|
947
945
|
class Extraction
|
|
948
|
-
attr_reader :use_cache, :enable_quality_processing, :force_ocr, :force_ocr_pages,
|
|
946
|
+
attr_reader :use_cache, :enable_quality_processing, :force_ocr, :disable_ocr, :force_ocr_pages,
|
|
949
947
|
:include_document_structure,
|
|
950
948
|
:ocr, :chunking, :language_detection, :pdf_options,
|
|
951
949
|
:images, :postprocessor,
|
|
@@ -974,8 +972,8 @@ module Kreuzberg
|
|
|
974
972
|
#
|
|
975
973
|
# Keys that are allowed in the Extraction config
|
|
976
974
|
ALLOWED_KEYS = %i[
|
|
977
|
-
use_cache enable_quality_processing force_ocr force_ocr_pages
|
|
978
|
-
language_detection pdf_options image_extraction
|
|
975
|
+
use_cache enable_quality_processing force_ocr disable_ocr force_ocr_pages
|
|
976
|
+
include_document_structure ocr chunking language_detection pdf_options image_extraction
|
|
979
977
|
postprocessor token_reduction keywords html_options pages
|
|
980
978
|
max_concurrent_extractions output_format result_format
|
|
981
979
|
security_limits layout concurrency cache_namespace cache_ttl_secs extraction_timeout_secs
|
|
@@ -1040,6 +1038,7 @@ module Kreuzberg
|
|
|
1040
1038
|
use_cache: true,
|
|
1041
1039
|
enable_quality_processing: true,
|
|
1042
1040
|
force_ocr: false,
|
|
1041
|
+
disable_ocr: false,
|
|
1043
1042
|
force_ocr_pages: nil,
|
|
1044
1043
|
include_document_structure: false,
|
|
1045
1044
|
ocr: nil,
|
|
@@ -1066,7 +1065,7 @@ module Kreuzberg
|
|
|
1066
1065
|
email: nil)
|
|
1067
1066
|
kwargs = {
|
|
1068
1067
|
use_cache: use_cache, enable_quality_processing: enable_quality_processing,
|
|
1069
|
-
force_ocr: force_ocr, force_ocr_pages: force_ocr_pages,
|
|
1068
|
+
force_ocr: force_ocr, disable_ocr: disable_ocr, force_ocr_pages: force_ocr_pages,
|
|
1070
1069
|
include_document_structure: include_document_structure,
|
|
1071
1070
|
ocr: ocr, chunking: chunking, language_detection: language_detection,
|
|
1072
1071
|
pdf_options: pdf_options, image_extraction: image_extraction,
|
|
@@ -1099,6 +1098,7 @@ module Kreuzberg
|
|
|
1099
1098
|
@use_cache = params[:use_cache] ? true : false
|
|
1100
1099
|
@enable_quality_processing = params[:enable_quality_processing] ? true : false
|
|
1101
1100
|
@force_ocr = params[:force_ocr] ? true : false
|
|
1101
|
+
@disable_ocr = params[:disable_ocr] ? true : false
|
|
1102
1102
|
@force_ocr_pages = params[:force_ocr_pages]
|
|
1103
1103
|
@include_document_structure = params[:include_document_structure] ? true : false
|
|
1104
1104
|
@ocr = normalize_config(params[:ocr], OCR)
|
|
@@ -1154,6 +1154,7 @@ module Kreuzberg
|
|
|
1154
1154
|
use_cache: @use_cache,
|
|
1155
1155
|
enable_quality_processing: @enable_quality_processing,
|
|
1156
1156
|
force_ocr: @force_ocr,
|
|
1157
|
+
disable_ocr: @disable_ocr,
|
|
1157
1158
|
force_ocr_pages: @force_ocr_pages,
|
|
1158
1159
|
include_document_structure: @include_document_structure,
|
|
1159
1160
|
max_concurrent_extractions: @max_concurrent_extractions,
|
|
@@ -1290,6 +1291,8 @@ module Kreuzberg
|
|
|
1290
1291
|
@enable_quality_processing = value ? true : false
|
|
1291
1292
|
when :force_ocr
|
|
1292
1293
|
@force_ocr = value ? true : false
|
|
1294
|
+
when :disable_ocr
|
|
1295
|
+
@disable_ocr = value ? true : false
|
|
1293
1296
|
when :force_ocr_pages
|
|
1294
1297
|
@force_ocr_pages = value
|
|
1295
1298
|
when :include_document_structure
|
|
@@ -1395,6 +1398,7 @@ module Kreuzberg
|
|
|
1395
1398
|
@use_cache = merged.use_cache
|
|
1396
1399
|
@enable_quality_processing = merged.enable_quality_processing
|
|
1397
1400
|
@force_ocr = merged.force_ocr
|
|
1401
|
+
@disable_ocr = merged.disable_ocr
|
|
1398
1402
|
@force_ocr_pages = merged.force_ocr_pages
|
|
1399
1403
|
@include_document_structure = merged.include_document_structure
|
|
1400
1404
|
@ocr = merged.ocr
|
data/lib/kreuzberg/result.rb
CHANGED
|
@@ -14,7 +14,8 @@ module Kreuzberg
|
|
|
14
14
|
class Result
|
|
15
15
|
attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
|
|
16
16
|
:detected_languages, :chunks, :images, :pages, :elements, :ocr_elements, :djot_content,
|
|
17
|
-
:document, :extracted_keywords, :quality_score, :processing_warnings, :annotations
|
|
17
|
+
:document, :extracted_keywords, :quality_score, :processing_warnings, :annotations,
|
|
18
|
+
:uris, :children
|
|
18
19
|
|
|
19
20
|
# @!attribute [r] cells
|
|
20
21
|
# @return [Array<Array<String>>] Table cells (2D array)
|
|
@@ -51,6 +52,7 @@ module Kreuzberg
|
|
|
51
52
|
:total_chunks,
|
|
52
53
|
:first_page,
|
|
53
54
|
:last_page,
|
|
55
|
+
:chunk_type,
|
|
54
56
|
:embedding
|
|
55
57
|
) do
|
|
56
58
|
def to_h
|
|
@@ -63,6 +65,7 @@ module Kreuzberg
|
|
|
63
65
|
total_chunks: total_chunks,
|
|
64
66
|
first_page: first_page,
|
|
65
67
|
last_page: last_page,
|
|
68
|
+
chunk_type: chunk_type,
|
|
66
69
|
embedding: embedding
|
|
67
70
|
}
|
|
68
71
|
end
|
|
@@ -318,7 +321,7 @@ module Kreuzberg
|
|
|
318
321
|
#
|
|
319
322
|
# @param hash [Hash] Hash returned from native extension
|
|
320
323
|
#
|
|
321
|
-
# rubocop:disable Metrics/AbcSize
|
|
324
|
+
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
|
322
325
|
def initialize(hash)
|
|
323
326
|
@content = get_value(hash, 'content', '')
|
|
324
327
|
@mime_type = get_value(hash, 'mime_type', '')
|
|
@@ -337,14 +340,16 @@ module Kreuzberg
|
|
|
337
340
|
@quality_score = get_value(hash, 'quality_score')
|
|
338
341
|
@processing_warnings = parse_processing_warnings(get_value(hash, 'processing_warnings'))
|
|
339
342
|
@annotations = parse_annotations(get_value(hash, 'annotations'))
|
|
343
|
+
@uris = parse_uris(get_value(hash, 'uris'))
|
|
344
|
+
@children = parse_children(get_value(hash, 'children'))
|
|
340
345
|
end
|
|
341
|
-
# rubocop:enable Metrics/AbcSize
|
|
346
|
+
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
|
342
347
|
|
|
343
348
|
# Convert to hash
|
|
344
349
|
#
|
|
345
350
|
# @return [Hash] Hash representation
|
|
346
351
|
#
|
|
347
|
-
# rubocop:disable Metrics/CyclomaticComplexity
|
|
352
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength
|
|
348
353
|
def to_h
|
|
349
354
|
{
|
|
350
355
|
content: @content,
|
|
@@ -362,10 +367,12 @@ module Kreuzberg
|
|
|
362
367
|
extracted_keywords: @extracted_keywords&.map(&:to_h),
|
|
363
368
|
quality_score: @quality_score,
|
|
364
369
|
processing_warnings: @processing_warnings.map(&:to_h),
|
|
365
|
-
annotations: @annotations&.map(&:to_h)
|
|
370
|
+
annotations: @annotations&.map(&:to_h),
|
|
371
|
+
uris: @uris&.map(&:to_h),
|
|
372
|
+
children: @children&.map(&:to_h)
|
|
366
373
|
}
|
|
367
374
|
end
|
|
368
|
-
# rubocop:enable Metrics/CyclomaticComplexity
|
|
375
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength
|
|
369
376
|
|
|
370
377
|
# Convert to JSON
|
|
371
378
|
#
|
|
@@ -520,6 +527,7 @@ module Kreuzberg
|
|
|
520
527
|
total_chunks: chunk_hash['total_chunks'],
|
|
521
528
|
first_page: chunk_hash['first_page'],
|
|
522
529
|
last_page: chunk_hash['last_page'],
|
|
530
|
+
chunk_type: chunk_hash['chunk_type'],
|
|
523
531
|
embedding: chunk_hash['embedding']
|
|
524
532
|
)
|
|
525
533
|
end
|
|
@@ -738,6 +746,35 @@ module Kreuzberg
|
|
|
738
746
|
def bbox_field(bbox_data, primary_key, fallback_key)
|
|
739
747
|
(bbox_data[primary_key] || bbox_data[fallback_key])&.to_f
|
|
740
748
|
end
|
|
749
|
+
|
|
750
|
+
def parse_uris(uris_data)
|
|
751
|
+
return nil if uris_data.nil?
|
|
752
|
+
|
|
753
|
+
uris_data.map { |u| build_uri(u) }
|
|
754
|
+
end
|
|
755
|
+
|
|
756
|
+
def build_uri(u_hash)
|
|
757
|
+
Struct.new(:url, :label, :page, :kind).new(
|
|
758
|
+
url: u_hash['url'] || '',
|
|
759
|
+
label: u_hash['label'],
|
|
760
|
+
page: u_hash['page']&.to_i,
|
|
761
|
+
kind: u_hash['kind'] || 'hyperlink'
|
|
762
|
+
)
|
|
763
|
+
end
|
|
764
|
+
|
|
765
|
+
def parse_children(children_data)
|
|
766
|
+
return nil if children_data.nil?
|
|
767
|
+
|
|
768
|
+
children_data.map { |c| build_archive_entry(c) }
|
|
769
|
+
end
|
|
770
|
+
|
|
771
|
+
def build_archive_entry(c_hash)
|
|
772
|
+
Struct.new(:path, :mime_type, :result).new(
|
|
773
|
+
path: c_hash['path'] || '',
|
|
774
|
+
mime_type: c_hash['mime_type'] || '',
|
|
775
|
+
result: c_hash['result'] ? self.class.new(c_hash['result']) : nil
|
|
776
|
+
)
|
|
777
|
+
end
|
|
741
778
|
end
|
|
742
779
|
# rubocop:enable Metrics/ClassLength
|
|
743
780
|
end
|
data/lib/kreuzberg/types.rb
CHANGED
|
@@ -10,21 +10,24 @@ module Kreuzberg
|
|
|
10
10
|
#
|
|
11
11
|
# @example
|
|
12
12
|
# type = Kreuzberg::ElementType::TITLE
|
|
13
|
-
#
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
13
|
+
# Kreuzberg::ElementType.values # => ["title", "narrative_text", ...]
|
|
14
|
+
#
|
|
15
|
+
module ElementType
|
|
16
|
+
TITLE = 'title'
|
|
17
|
+
NARRATIVE_TEXT = 'narrative_text'
|
|
18
|
+
HEADING = 'heading'
|
|
19
|
+
LIST_ITEM = 'list_item'
|
|
20
|
+
TABLE = 'table'
|
|
21
|
+
IMAGE = 'image'
|
|
22
|
+
PAGE_BREAK = 'page_break'
|
|
23
|
+
CODE_BLOCK = 'code_block'
|
|
24
|
+
BLOCK_QUOTE = 'block_quote'
|
|
25
|
+
FOOTER = 'footer'
|
|
26
|
+
HEADER = 'header'
|
|
27
|
+
|
|
28
|
+
def self.values
|
|
29
|
+
[TITLE, NARRATIVE_TEXT, HEADING, LIST_ITEM, TABLE, IMAGE, PAGE_BREAK, CODE_BLOCK, BLOCK_QUOTE, FOOTER, HEADER]
|
|
30
|
+
end
|
|
28
31
|
end
|
|
29
32
|
|
|
30
33
|
# Bounding box coordinates for element positioning.
|
|
@@ -431,4 +434,191 @@ module Kreuzberg
|
|
|
431
434
|
const :page_number, T.nilable(Integer)
|
|
432
435
|
const :bounding_box, T.nilable(PdfAnnotationBoundingBox)
|
|
433
436
|
end
|
|
437
|
+
|
|
438
|
+
# An entry within an archive (zip, tar, etc.) extraction result.
|
|
439
|
+
#
|
|
440
|
+
# @example
|
|
441
|
+
# entry = Kreuzberg::ArchiveEntry.new(
|
|
442
|
+
# path: "readme.txt",
|
|
443
|
+
# mime_type: "text/plain",
|
|
444
|
+
# result: extraction_result
|
|
445
|
+
# )
|
|
446
|
+
#
|
|
447
|
+
class ArchiveEntry < T::Struct
|
|
448
|
+
extend T::Sig
|
|
449
|
+
|
|
450
|
+
const :path, String
|
|
451
|
+
const :mime_type, String
|
|
452
|
+
const :result, T.untyped
|
|
453
|
+
end
|
|
454
|
+
|
|
455
|
+
# Extracted keyword with relevance metadata.
|
|
456
|
+
#
|
|
457
|
+
# @example
|
|
458
|
+
# kw = Kreuzberg::Keyword.new(
|
|
459
|
+
# text: "machine learning",
|
|
460
|
+
# score: 0.95,
|
|
461
|
+
# algorithm: "yake",
|
|
462
|
+
# positions: [42, 128]
|
|
463
|
+
# )
|
|
464
|
+
#
|
|
465
|
+
class Keyword < T::Struct
|
|
466
|
+
extend T::Sig
|
|
467
|
+
|
|
468
|
+
const :text, String
|
|
469
|
+
const :score, Float
|
|
470
|
+
const :algorithm, String
|
|
471
|
+
const :positions, T.nilable(T::Array[Integer])
|
|
472
|
+
end
|
|
473
|
+
|
|
474
|
+
# A table extracted from a document.
|
|
475
|
+
#
|
|
476
|
+
# @example
|
|
477
|
+
# table = Kreuzberg::Table.new(
|
|
478
|
+
# cells: [["A", "B"], ["1", "2"]],
|
|
479
|
+
# markdown: "| A | B |\n|---|---|\n| 1 | 2 |",
|
|
480
|
+
# page_number: 1,
|
|
481
|
+
# bounding_box: bbox
|
|
482
|
+
# )
|
|
483
|
+
#
|
|
484
|
+
class Table < T::Struct
|
|
485
|
+
extend T::Sig
|
|
486
|
+
|
|
487
|
+
const :cells, T::Array[T::Array[String]]
|
|
488
|
+
const :markdown, String
|
|
489
|
+
const :page_number, Integer
|
|
490
|
+
const :bounding_box, T.nilable(BoundingBox)
|
|
491
|
+
end
|
|
492
|
+
|
|
493
|
+
# A URI extracted from a document.
|
|
494
|
+
#
|
|
495
|
+
# @example
|
|
496
|
+
# uri = Kreuzberg::Uri.new(
|
|
497
|
+
# url: "https://example.com",
|
|
498
|
+
# kind: "hyperlink",
|
|
499
|
+
# label: "Example",
|
|
500
|
+
# page: 1
|
|
501
|
+
# )
|
|
502
|
+
#
|
|
503
|
+
class Uri < T::Struct
|
|
504
|
+
extend T::Sig
|
|
505
|
+
|
|
506
|
+
const :url, String
|
|
507
|
+
const :kind, String
|
|
508
|
+
const :label, T.nilable(String)
|
|
509
|
+
const :page, T.nilable(Integer)
|
|
510
|
+
end
|
|
511
|
+
|
|
512
|
+
# Content layer classification for document nodes.
|
|
513
|
+
module ContentLayer
|
|
514
|
+
BODY = 'body'
|
|
515
|
+
HEADER = 'header'
|
|
516
|
+
FOOTER = 'footer'
|
|
517
|
+
FOOTNOTE = 'footnote'
|
|
518
|
+
|
|
519
|
+
def self.values
|
|
520
|
+
[BODY, HEADER, FOOTER, FOOTNOTE]
|
|
521
|
+
end
|
|
522
|
+
end
|
|
523
|
+
|
|
524
|
+
# Algorithm used for keyword extraction.
|
|
525
|
+
module KeywordAlgorithm
|
|
526
|
+
YAKE = 'yake'
|
|
527
|
+
RAKE = 'rake'
|
|
528
|
+
|
|
529
|
+
def self.values
|
|
530
|
+
[YAKE, RAKE]
|
|
531
|
+
end
|
|
532
|
+
end
|
|
533
|
+
|
|
534
|
+
# OCR element granularity level.
|
|
535
|
+
module OcrElementLevel
|
|
536
|
+
WORD = 'word'
|
|
537
|
+
LINE = 'line'
|
|
538
|
+
BLOCK = 'block'
|
|
539
|
+
PAGE = 'page'
|
|
540
|
+
|
|
541
|
+
def self.values
|
|
542
|
+
[WORD, LINE, BLOCK, PAGE]
|
|
543
|
+
end
|
|
544
|
+
end
|
|
545
|
+
|
|
546
|
+
# Output format for extraction results.
|
|
547
|
+
module OutputFormat
|
|
548
|
+
PLAIN = 'plain'
|
|
549
|
+
MARKDOWN = 'markdown'
|
|
550
|
+
DJOT = 'djot'
|
|
551
|
+
HTML = 'html'
|
|
552
|
+
JSON = 'json'
|
|
553
|
+
STRUCTURED = 'structured'
|
|
554
|
+
|
|
555
|
+
def self.values
|
|
556
|
+
[PLAIN, MARKDOWN, DJOT, HTML, JSON, STRUCTURED]
|
|
557
|
+
end
|
|
558
|
+
end
|
|
559
|
+
|
|
560
|
+
# Page unit type classification.
|
|
561
|
+
module PageUnitType
|
|
562
|
+
PAGE = 'page'
|
|
563
|
+
SLIDE = 'slide'
|
|
564
|
+
SHEET = 'sheet'
|
|
565
|
+
|
|
566
|
+
def self.values
|
|
567
|
+
[PAGE, SLIDE, SHEET]
|
|
568
|
+
end
|
|
569
|
+
end
|
|
570
|
+
|
|
571
|
+
# PDF annotation type classification.
|
|
572
|
+
module PdfAnnotationType
|
|
573
|
+
TEXT = 'text'
|
|
574
|
+
HIGHLIGHT = 'highlight'
|
|
575
|
+
LINK = 'link'
|
|
576
|
+
STAMP = 'stamp'
|
|
577
|
+
UNDERLINE = 'underline'
|
|
578
|
+
STRIKE_OUT = 'strike_out'
|
|
579
|
+
OTHER = 'other'
|
|
580
|
+
|
|
581
|
+
def self.values
|
|
582
|
+
[TEXT, HIGHLIGHT, LINK, STAMP, UNDERLINE, STRIKE_OUT, OTHER]
|
|
583
|
+
end
|
|
584
|
+
end
|
|
585
|
+
|
|
586
|
+
# Relationship kind between document elements.
|
|
587
|
+
module RelationshipKind
|
|
588
|
+
FOOTNOTE_REFERENCE = 'footnote_reference'
|
|
589
|
+
CITATION_REFERENCE = 'citation_reference'
|
|
590
|
+
INTERNAL_LINK = 'internal_link'
|
|
591
|
+
CAPTION = 'caption'
|
|
592
|
+
LABEL = 'label'
|
|
593
|
+
TOC_ENTRY = 'toc_entry'
|
|
594
|
+
CROSS_REFERENCE = 'cross_reference'
|
|
595
|
+
|
|
596
|
+
def self.values
|
|
597
|
+
[FOOTNOTE_REFERENCE, CITATION_REFERENCE, INTERNAL_LINK, CAPTION, LABEL, TOC_ENTRY, CROSS_REFERENCE]
|
|
598
|
+
end
|
|
599
|
+
end
|
|
600
|
+
|
|
601
|
+
# Result format classification.
|
|
602
|
+
module ResultFormat
|
|
603
|
+
UNIFIED = 'unified'
|
|
604
|
+
ELEMENT_BASED = 'element_based'
|
|
605
|
+
|
|
606
|
+
def self.values
|
|
607
|
+
[UNIFIED, ELEMENT_BASED]
|
|
608
|
+
end
|
|
609
|
+
end
|
|
610
|
+
|
|
611
|
+
# URI kind classification.
|
|
612
|
+
module UriKind
|
|
613
|
+
HYPERLINK = 'hyperlink'
|
|
614
|
+
IMAGE = 'image'
|
|
615
|
+
ANCHOR = 'anchor'
|
|
616
|
+
CITATION = 'citation'
|
|
617
|
+
REFERENCE = 'reference'
|
|
618
|
+
EMAIL = 'email'
|
|
619
|
+
|
|
620
|
+
def self.values
|
|
621
|
+
[HYPERLINK, IMAGE, ANCHOR, CITATION, REFERENCE, EMAIL]
|
|
622
|
+
end
|
|
623
|
+
end
|
|
434
624
|
end
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg_rb.so
CHANGED
|
Binary file
|