kreuzberg 4.6.2-aarch64-linux → 4.7.0-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 406c791db1a8cb29e3ff2e89a60e0a29ed73c0ff0548338b60c3574bd5944f6f
4
- data.tar.gz: 7565bbe0708afceadc13f43b57103a03529992f705caf21f1ddaa00e68ad27d6
3
+ metadata.gz: 1777d29275333b413764e5417f805de33ad0f9378dbb2a6372d9d573a23ae0e9
4
+ data.tar.gz: 34fad03e39480a52e6a2f91ea4a7a17335eacaf97f8b17d32d440d617842b068
5
5
  SHA512:
6
- metadata.gz: a680fd2406f8dac338a53ab303ef29d117462cbf376d48be4af48846f7d4f82d4e7999e2aeac17a404e5da7feb79f1cccfb8f240153c0c36a4274e16eeb50f6f
7
- data.tar.gz: dc1c1dc215020a65560490e159e38d6780b703bae597279cb0e3d2c369afce7a7bee06984972987d1c3a69fe2040da1cb1e2812df5c542590300dfcb09e5e7d7
6
+ metadata.gz: 93cc0429e0d310125071f7091c3029936683b69c665b499a411fa9cd8df1e22077fb6ff3c009040b3395f6cabc92fcac490db3a2bf013680a9c1393f15245799
7
+ data.tar.gz: e62adbc1ed01632d96f397d89b18050e403d6363d8a2658f4b48da99dc2d284a48cc0891d77cc0afb7483ce9383f3eab6a1367bd2207083b5c1fec48b8543c8e
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.2" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.0.0" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -42,13 +42,16 @@
42
42
 
43
43
  <!-- Project Info -->
44
44
  <a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
45
- <img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License">
45
+ <img src="https://img.shields.io/badge/License-MIT-007ec6" alt="License">
46
46
  </a>
47
47
  <a href="https://docs.kreuzberg.dev">
48
- <img src="https://img.shields.io/badge/docs-kreuzberg.dev-blue" alt="Documentation">
48
+ <img src="https://img.shields.io/badge/docs-kreuzberg.dev-007ec6" alt="Documentation">
49
+ </a>
50
+ <a href="https://docs.kreuzberg.dev/demo.html">
51
+ <img src="https://img.shields.io/badge/%E2%96%B6%EF%B8%8F_Live_Demo-007ec6" alt="Live Demo">
49
52
  </a>
50
53
  <a href="https://huggingface.co/Kreuzberg">
51
- <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-yellow" alt="Hugging Face">
54
+ <img src="https://img.shields.io/badge/%F0%9F%A4%97_Hugging_Face-007ec6" alt="Hugging Face">
52
55
  </a>
53
56
  </div>
54
57
 
@@ -61,7 +64,7 @@
61
64
  </div>
62
65
 
63
66
 
64
- Extract text, tables, images, and metadata from 91+ file formats including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
67
+ Extract text, tables, images, and metadata from 91+ file formats and 248 programming languages including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
65
68
 
66
69
 
67
70
  ## Installation
@@ -74,6 +77,7 @@ Install via one of the supported package managers:
74
77
 
75
78
 
76
79
  **gem:**
80
+
77
81
  ```bash
78
82
  gem install kreuzberg
79
83
  ```
@@ -82,6 +86,7 @@ gem install kreuzberg
82
86
 
83
87
 
84
88
  **Bundler:**
89
+
85
90
  ```ruby
86
91
  gem 'kreuzberg'
87
92
  ```
@@ -258,6 +263,19 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
258
263
  | **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
259
264
  | **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
260
265
 
266
+ #### Code Intelligence (248 Languages)
267
+
268
+ | Feature | Description |
269
+ |---------|-------------|
270
+ | **Structure Extraction** | Functions, classes, methods, structs, interfaces, enums |
271
+ | **Import/Export Analysis** | Module dependencies, re-exports, wildcard imports |
272
+ | **Symbol Extraction** | Variables, constants, type aliases, properties |
273
+ | **Docstring Parsing** | Google, NumPy, Sphinx, JSDoc, RustDoc, and 10+ formats |
274
+ | **Diagnostics** | Parse errors with line/column positions |
275
+ | **Syntax-Aware Chunking** | Split code by semantic boundaries, not arbitrary byte offsets |
276
+
277
+ Powered by [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — [documentation](https://docs.tree-sitter-language-pack.kreuzberg.dev).
278
+
261
279
  **[Complete Format Reference](https://kreuzberg.dev/reference/formats/)**
262
280
 
263
281
  ### Key Capabilities
@@ -279,6 +297,9 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
279
297
  - **Batch Processing** - Efficiently process multiple documents in parallel
280
298
  - **Memory Efficient** - Stream large files without loading entirely into memory
281
299
  - **Language Detection** - Detect and support multiple languages in documents
300
+
301
+ - **Code Intelligence** - Extract structure, imports, exports, symbols, and docstrings from [248 programming languages](https://docs.tree-sitter-language-pack.kreuzberg.dev) via tree-sitter
302
+
282
303
  - **Configuration** - Fine-grained control over extraction behavior
283
304
 
284
305
  ### Performance Characteristics
@@ -837,23 +837,41 @@ module Kreuzberg
837
837
  end
838
838
  end
839
839
 
840
+ # Email extraction configuration
841
+ #
842
+ # @example With fallback codepage
843
+ # email = Email.new(msg_fallback_codepage: 1251)
844
+ #
845
+ class Email
846
+ attr_reader :msg_fallback_codepage
847
+
848
+ def initialize(msg_fallback_codepage: nil)
849
+ @msg_fallback_codepage = msg_fallback_codepage&.to_i
850
+ end
851
+
852
+ def to_h
853
+ h = {}
854
+ h[:msg_fallback_codepage] = @msg_fallback_codepage unless @msg_fallback_codepage.nil?
855
+ h
856
+ end
857
+ end
858
+
840
859
  # Layout detection configuration
841
860
  #
842
- # @example Basic usage with fast preset
843
- # layout = LayoutDetection.new(preset: "fast")
861
+ # @example Basic usage
862
+ # layout = LayoutDetection.new
844
863
  #
845
- # @example Accurate preset with custom threshold
864
+ # @example With custom threshold and table model
846
865
  # layout = LayoutDetection.new(
847
- # preset: "accurate",
848
866
  # confidence_threshold: 0.5,
849
- # apply_heuristics: true
867
+ # apply_heuristics: true,
868
+ # table_model: "tatr"
850
869
  # )
851
870
  #
852
871
  class LayoutDetection
853
- attr_reader :preset, :confidence_threshold, :apply_heuristics, :table_model
872
+ attr_reader :confidence_threshold, :apply_heuristics, :table_model
854
873
 
855
- def initialize(preset: 'fast', confidence_threshold: nil, apply_heuristics: true, table_model: nil)
856
- @preset = preset.to_s
874
+ def initialize(confidence_threshold: nil, apply_heuristics: true, table_model: nil)
857
875
  @confidence_threshold = confidence_threshold&.to_f
858
876
  @apply_heuristics = apply_heuristics ? true : false
859
877
  @table_model = table_model&.to_s
@@ -861,7 +879,6 @@ module Kreuzberg
861
879
 
862
880
  def to_h
863
881
  {
864
- preset: @preset,
865
882
  confidence_threshold: @confidence_threshold,
866
883
  apply_heuristics: @apply_heuristics,
867
884
  table_model: @table_model
@@ -926,14 +943,15 @@ module Kreuzberg
926
943
  # )
927
944
  #
928
945
  class Extraction
929
- attr_reader :use_cache, :enable_quality_processing, :force_ocr, :force_ocr_pages,
946
+ attr_reader :use_cache, :enable_quality_processing, :force_ocr, :disable_ocr, :force_ocr_pages,
930
947
  :include_document_structure,
931
948
  :ocr, :chunking, :language_detection, :pdf_options,
932
949
  :images, :postprocessor,
933
950
  :token_reduction, :keywords, :html_options, :pages,
934
951
  :max_concurrent_extractions, :output_format, :result_format,
935
952
  :security_limits, :layout, :concurrency,
936
- :cache_namespace, :cache_ttl_secs, :extraction_timeout_secs
953
+ :cache_namespace, :cache_ttl_secs, :extraction_timeout_secs,
954
+ :max_archive_depth, :acceleration, :email
937
955
 
938
956
  # Alias for backward compatibility - image_extraction is the canonical name
939
957
  alias image_extraction images
@@ -954,11 +972,12 @@ module Kreuzberg
954
972
  #
955
973
  # Keys that are allowed in the Extraction config
956
974
  ALLOWED_KEYS = %i[
957
- use_cache enable_quality_processing force_ocr force_ocr_pages include_document_structure ocr chunking
958
- language_detection pdf_options image_extraction
975
+ use_cache enable_quality_processing force_ocr disable_ocr force_ocr_pages
976
+ include_document_structure ocr chunking language_detection pdf_options image_extraction
959
977
  postprocessor token_reduction keywords html_options pages
960
978
  max_concurrent_extractions output_format result_format
961
979
  security_limits layout concurrency cache_namespace cache_ttl_secs extraction_timeout_secs
980
+ max_archive_depth acceleration email
962
981
  ].freeze
963
982
 
964
983
  # Aliases for backward compatibility
@@ -1015,10 +1034,11 @@ module Kreuzberg
1015
1034
  new(**normalize_hash_keys(hash))
1016
1035
  end
1017
1036
 
1018
- def initialize(hash = nil,
1037
+ def initialize(hash = nil, # rubocop:disable Metrics/MethodLength
1019
1038
  use_cache: true,
1020
1039
  enable_quality_processing: true,
1021
1040
  force_ocr: false,
1041
+ disable_ocr: false,
1022
1042
  force_ocr_pages: nil,
1023
1043
  include_document_structure: false,
1024
1044
  ocr: nil,
@@ -1039,10 +1059,13 @@ module Kreuzberg
1039
1059
  concurrency: nil,
1040
1060
  cache_namespace: nil,
1041
1061
  cache_ttl_secs: nil,
1042
- extraction_timeout_secs: nil)
1062
+ extraction_timeout_secs: nil,
1063
+ max_archive_depth: 3,
1064
+ acceleration: nil,
1065
+ email: nil)
1043
1066
  kwargs = {
1044
1067
  use_cache: use_cache, enable_quality_processing: enable_quality_processing,
1045
- force_ocr: force_ocr, force_ocr_pages: force_ocr_pages,
1068
+ force_ocr: force_ocr, disable_ocr: disable_ocr, force_ocr_pages: force_ocr_pages,
1046
1069
  include_document_structure: include_document_structure,
1047
1070
  ocr: ocr, chunking: chunking, language_detection: language_detection,
1048
1071
  pdf_options: pdf_options, image_extraction: image_extraction,
@@ -1054,7 +1077,10 @@ module Kreuzberg
1054
1077
  concurrency: concurrency,
1055
1078
  cache_namespace: cache_namespace,
1056
1079
  cache_ttl_secs: cache_ttl_secs,
1057
- extraction_timeout_secs: extraction_timeout_secs
1080
+ extraction_timeout_secs: extraction_timeout_secs,
1081
+ max_archive_depth: max_archive_depth,
1082
+ acceleration: acceleration,
1083
+ email: email
1058
1084
  }
1059
1085
  extracted = extract_from_hash(hash, kwargs)
1060
1086
 
@@ -1072,6 +1098,7 @@ module Kreuzberg
1072
1098
  @use_cache = params[:use_cache] ? true : false
1073
1099
  @enable_quality_processing = params[:enable_quality_processing] ? true : false
1074
1100
  @force_ocr = params[:force_ocr] ? true : false
1101
+ @disable_ocr = params[:disable_ocr] ? true : false
1075
1102
  @force_ocr_pages = params[:force_ocr_pages]
1076
1103
  @include_document_structure = params[:include_document_structure] ? true : false
1077
1104
  @ocr = normalize_config(params[:ocr], OCR)
@@ -1086,7 +1113,10 @@ module Kreuzberg
1086
1113
  @pages = normalize_config(params[:pages], PageConfig)
1087
1114
  @layout = normalize_config(params[:layout], LayoutDetection)
1088
1115
  @concurrency = normalize_config(params[:concurrency], Concurrency)
1116
+ @acceleration = normalize_config(params[:acceleration], Acceleration)
1117
+ @email = normalize_config(params[:email], Email)
1089
1118
  @max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
1119
+ @max_archive_depth = params[:max_archive_depth]&.to_i || 3
1090
1120
  @output_format = validate_output_format(params[:output_format])
1091
1121
  @result_format = validate_result_format(params[:result_format])
1092
1122
  @cache_namespace = params[:cache_namespace]
@@ -1124,9 +1154,11 @@ module Kreuzberg
1124
1154
  use_cache: @use_cache,
1125
1155
  enable_quality_processing: @enable_quality_processing,
1126
1156
  force_ocr: @force_ocr,
1157
+ disable_ocr: @disable_ocr,
1127
1158
  force_ocr_pages: @force_ocr_pages,
1128
1159
  include_document_structure: @include_document_structure,
1129
1160
  max_concurrent_extractions: @max_concurrent_extractions,
1161
+ max_archive_depth: @max_archive_depth,
1130
1162
  output_format: @output_format,
1131
1163
  result_format: @result_format,
1132
1164
  cache_namespace: @cache_namespace,
@@ -1142,7 +1174,8 @@ module Kreuzberg
1142
1174
  image_extraction: @images&.to_h, postprocessor: @postprocessor&.to_h,
1143
1175
  token_reduction: @token_reduction&.to_h, keywords: @keywords&.to_h,
1144
1176
  html_options: @html_options&.to_h, pages: @pages&.to_h,
1145
- layout: @layout&.to_h, concurrency: @concurrency&.to_h
1177
+ layout: @layout&.to_h, concurrency: @concurrency&.to_h,
1178
+ acceleration: @acceleration&.to_h, email: @email&.to_h
1146
1179
  }
1147
1180
  end
1148
1181
 
@@ -1258,6 +1291,8 @@ module Kreuzberg
1258
1291
  @enable_quality_processing = value ? true : false
1259
1292
  when :force_ocr
1260
1293
  @force_ocr = value ? true : false
1294
+ when :disable_ocr
1295
+ @disable_ocr = value ? true : false
1261
1296
  when :force_ocr_pages
1262
1297
  @force_ocr_pages = value
1263
1298
  when :include_document_structure
@@ -1286,6 +1321,12 @@ module Kreuzberg
1286
1321
  @layout = normalize_config(value, LayoutDetection)
1287
1322
  when :concurrency
1288
1323
  @concurrency = normalize_config(value, Concurrency)
1324
+ when :acceleration
1325
+ @acceleration = normalize_config(value, Acceleration)
1326
+ when :email
1327
+ @email = normalize_config(value, Email)
1328
+ when :max_archive_depth
1329
+ @max_archive_depth = value&.to_i || 3
1289
1330
  when :max_concurrent_extractions
1290
1331
  @max_concurrent_extractions = value&.to_i
1291
1332
  when :output_format
@@ -1357,6 +1398,7 @@ module Kreuzberg
1357
1398
  @use_cache = merged.use_cache
1358
1399
  @enable_quality_processing = merged.enable_quality_processing
1359
1400
  @force_ocr = merged.force_ocr
1401
+ @disable_ocr = merged.disable_ocr
1360
1402
  @force_ocr_pages = merged.force_ocr_pages
1361
1403
  @include_document_structure = merged.include_document_structure
1362
1404
  @ocr = merged.ocr
@@ -1373,6 +1415,9 @@ module Kreuzberg
1373
1415
  @html_options = merged.html_options
1374
1416
  @pages = merged.pages
1375
1417
  @layout = merged.layout
1418
+ @acceleration = merged.acceleration
1419
+ @email = merged.email
1420
+ @max_archive_depth = merged.max_archive_depth
1376
1421
  end
1377
1422
 
1378
1423
  def update_output_options(merged)
@@ -14,7 +14,8 @@ module Kreuzberg
14
14
  class Result
15
15
  attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
16
16
  :detected_languages, :chunks, :images, :pages, :elements, :ocr_elements, :djot_content,
17
- :document, :extracted_keywords, :quality_score, :processing_warnings, :annotations
17
+ :document, :extracted_keywords, :quality_score, :processing_warnings, :annotations,
18
+ :uris, :children
18
19
 
19
20
  # @!attribute [r] cells
20
21
  # @return [Array<Array<String>>] Table cells (2D array)
@@ -51,6 +52,7 @@ module Kreuzberg
51
52
  :total_chunks,
52
53
  :first_page,
53
54
  :last_page,
55
+ :chunk_type,
54
56
  :embedding
55
57
  ) do
56
58
  def to_h
@@ -63,6 +65,7 @@ module Kreuzberg
63
65
  total_chunks: total_chunks,
64
66
  first_page: first_page,
65
67
  last_page: last_page,
68
+ chunk_type: chunk_type,
66
69
  embedding: embedding
67
70
  }
68
71
  end
@@ -318,7 +321,7 @@ module Kreuzberg
318
321
  #
319
322
  # @param hash [Hash] Hash returned from native extension
320
323
  #
321
- # rubocop:disable Metrics/AbcSize
324
+ # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
322
325
  def initialize(hash)
323
326
  @content = get_value(hash, 'content', '')
324
327
  @mime_type = get_value(hash, 'mime_type', '')
@@ -337,14 +340,16 @@ module Kreuzberg
337
340
  @quality_score = get_value(hash, 'quality_score')
338
341
  @processing_warnings = parse_processing_warnings(get_value(hash, 'processing_warnings'))
339
342
  @annotations = parse_annotations(get_value(hash, 'annotations'))
343
+ @uris = parse_uris(get_value(hash, 'uris'))
344
+ @children = parse_children(get_value(hash, 'children'))
340
345
  end
341
- # rubocop:enable Metrics/AbcSize
346
+ # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
342
347
 
343
348
  # Convert to hash
344
349
  #
345
350
  # @return [Hash] Hash representation
346
351
  #
347
- # rubocop:disable Metrics/CyclomaticComplexity
352
+ # rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength
348
353
  def to_h
349
354
  {
350
355
  content: @content,
@@ -362,10 +367,12 @@ module Kreuzberg
362
367
  extracted_keywords: @extracted_keywords&.map(&:to_h),
363
368
  quality_score: @quality_score,
364
369
  processing_warnings: @processing_warnings.map(&:to_h),
365
- annotations: @annotations&.map(&:to_h)
370
+ annotations: @annotations&.map(&:to_h),
371
+ uris: @uris&.map(&:to_h),
372
+ children: @children&.map(&:to_h)
366
373
  }
367
374
  end
368
- # rubocop:enable Metrics/CyclomaticComplexity
375
+ # rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength
369
376
 
370
377
  # Convert to JSON
371
378
  #
@@ -520,6 +527,7 @@ module Kreuzberg
520
527
  total_chunks: chunk_hash['total_chunks'],
521
528
  first_page: chunk_hash['first_page'],
522
529
  last_page: chunk_hash['last_page'],
530
+ chunk_type: chunk_hash['chunk_type'],
523
531
  embedding: chunk_hash['embedding']
524
532
  )
525
533
  end
@@ -738,6 +746,35 @@ module Kreuzberg
738
746
  def bbox_field(bbox_data, primary_key, fallback_key)
739
747
  (bbox_data[primary_key] || bbox_data[fallback_key])&.to_f
740
748
  end
749
+
750
+ def parse_uris(uris_data)
751
+ return nil if uris_data.nil?
752
+
753
+ uris_data.map { |u| build_uri(u) }
754
+ end
755
+
756
+ def build_uri(u_hash)
757
+ Struct.new(:url, :label, :page, :kind).new(
758
+ url: u_hash['url'] || '',
759
+ label: u_hash['label'],
760
+ page: u_hash['page']&.to_i,
761
+ kind: u_hash['kind'] || 'hyperlink'
762
+ )
763
+ end
764
+
765
+ def parse_children(children_data)
766
+ return nil if children_data.nil?
767
+
768
+ children_data.map { |c| build_archive_entry(c) }
769
+ end
770
+
771
+ def build_archive_entry(c_hash)
772
+ Struct.new(:path, :mime_type, :result).new(
773
+ path: c_hash['path'] || '',
774
+ mime_type: c_hash['mime_type'] || '',
775
+ result: c_hash['result'] ? self.class.new(c_hash['result']) : nil
776
+ )
777
+ end
741
778
  end
742
779
  # rubocop:enable Metrics/ClassLength
743
780
  end
@@ -10,21 +10,24 @@ module Kreuzberg
10
10
  #
11
11
  # @example
12
12
  # type = Kreuzberg::ElementType::TITLE
13
- #
14
- ElementType = T.type_alias do
15
- T.any(
16
- 'title',
17
- 'narrative_text',
18
- 'heading',
19
- 'list_item',
20
- 'table',
21
- 'image',
22
- 'page_break',
23
- 'code_block',
24
- 'block_quote',
25
- 'footer',
26
- 'header'
27
- )
13
+ # Kreuzberg::ElementType.values # => ["title", "narrative_text", ...]
14
+ #
15
+ module ElementType
16
+ TITLE = 'title'
17
+ NARRATIVE_TEXT = 'narrative_text'
18
+ HEADING = 'heading'
19
+ LIST_ITEM = 'list_item'
20
+ TABLE = 'table'
21
+ IMAGE = 'image'
22
+ PAGE_BREAK = 'page_break'
23
+ CODE_BLOCK = 'code_block'
24
+ BLOCK_QUOTE = 'block_quote'
25
+ FOOTER = 'footer'
26
+ HEADER = 'header'
27
+
28
+ def self.values
29
+ [TITLE, NARRATIVE_TEXT, HEADING, LIST_ITEM, TABLE, IMAGE, PAGE_BREAK, CODE_BLOCK, BLOCK_QUOTE, FOOTER, HEADER]
30
+ end
28
31
  end
29
32
 
30
33
  # Bounding box coordinates for element positioning.
@@ -431,4 +434,191 @@ module Kreuzberg
431
434
  const :page_number, T.nilable(Integer)
432
435
  const :bounding_box, T.nilable(PdfAnnotationBoundingBox)
433
436
  end
437
+
438
+ # An entry within an archive (zip, tar, etc.) extraction result.
439
+ #
440
+ # @example
441
+ # entry = Kreuzberg::ArchiveEntry.new(
442
+ # path: "readme.txt",
443
+ # mime_type: "text/plain",
444
+ # result: extraction_result
445
+ # )
446
+ #
447
+ class ArchiveEntry < T::Struct
448
+ extend T::Sig
449
+
450
+ const :path, String
451
+ const :mime_type, String
452
+ const :result, T.untyped
453
+ end
454
+
455
+ # Extracted keyword with relevance metadata.
456
+ #
457
+ # @example
458
+ # kw = Kreuzberg::Keyword.new(
459
+ # text: "machine learning",
460
+ # score: 0.95,
461
+ # algorithm: "yake",
462
+ # positions: [42, 128]
463
+ # )
464
+ #
465
+ class Keyword < T::Struct
466
+ extend T::Sig
467
+
468
+ const :text, String
469
+ const :score, Float
470
+ const :algorithm, String
471
+ const :positions, T.nilable(T::Array[Integer])
472
+ end
473
+
474
+ # A table extracted from a document.
475
+ #
476
+ # @example
477
+ # table = Kreuzberg::Table.new(
478
+ # cells: [["A", "B"], ["1", "2"]],
479
+ # markdown: "| A | B |\n|---|---|\n| 1 | 2 |",
480
+ # page_number: 1,
481
+ # bounding_box: bbox
482
+ # )
483
+ #
484
+ class Table < T::Struct
485
+ extend T::Sig
486
+
487
+ const :cells, T::Array[T::Array[String]]
488
+ const :markdown, String
489
+ const :page_number, Integer
490
+ const :bounding_box, T.nilable(BoundingBox)
491
+ end
492
+
493
+ # A URI extracted from a document.
494
+ #
495
+ # @example
496
+ # uri = Kreuzberg::Uri.new(
497
+ # url: "https://example.com",
498
+ # kind: "hyperlink",
499
+ # label: "Example",
500
+ # page: 1
501
+ # )
502
+ #
503
+ class Uri < T::Struct
504
+ extend T::Sig
505
+
506
+ const :url, String
507
+ const :kind, String
508
+ const :label, T.nilable(String)
509
+ const :page, T.nilable(Integer)
510
+ end
511
+
512
+ # Content layer classification for document nodes.
513
+ module ContentLayer
514
+ BODY = 'body'
515
+ HEADER = 'header'
516
+ FOOTER = 'footer'
517
+ FOOTNOTE = 'footnote'
518
+
519
+ def self.values
520
+ [BODY, HEADER, FOOTER, FOOTNOTE]
521
+ end
522
+ end
523
+
524
+ # Algorithm used for keyword extraction.
525
+ module KeywordAlgorithm
526
+ YAKE = 'yake'
527
+ RAKE = 'rake'
528
+
529
+ def self.values
530
+ [YAKE, RAKE]
531
+ end
532
+ end
533
+
534
+ # OCR element granularity level.
535
+ module OcrElementLevel
536
+ WORD = 'word'
537
+ LINE = 'line'
538
+ BLOCK = 'block'
539
+ PAGE = 'page'
540
+
541
+ def self.values
542
+ [WORD, LINE, BLOCK, PAGE]
543
+ end
544
+ end
545
+
546
+ # Output format for extraction results.
547
+ module OutputFormat
548
+ PLAIN = 'plain'
549
+ MARKDOWN = 'markdown'
550
+ DJOT = 'djot'
551
+ HTML = 'html'
552
+ JSON = 'json'
553
+ STRUCTURED = 'structured'
554
+
555
+ def self.values
556
+ [PLAIN, MARKDOWN, DJOT, HTML, JSON, STRUCTURED]
557
+ end
558
+ end
559
+
560
+ # Page unit type classification.
561
+ module PageUnitType
562
+ PAGE = 'page'
563
+ SLIDE = 'slide'
564
+ SHEET = 'sheet'
565
+
566
+ def self.values
567
+ [PAGE, SLIDE, SHEET]
568
+ end
569
+ end
570
+
571
+ # PDF annotation type classification.
572
+ module PdfAnnotationType
573
+ TEXT = 'text'
574
+ HIGHLIGHT = 'highlight'
575
+ LINK = 'link'
576
+ STAMP = 'stamp'
577
+ UNDERLINE = 'underline'
578
+ STRIKE_OUT = 'strike_out'
579
+ OTHER = 'other'
580
+
581
+ def self.values
582
+ [TEXT, HIGHLIGHT, LINK, STAMP, UNDERLINE, STRIKE_OUT, OTHER]
583
+ end
584
+ end
585
+
586
+ # Relationship kind between document elements.
587
+ module RelationshipKind
588
+ FOOTNOTE_REFERENCE = 'footnote_reference'
589
+ CITATION_REFERENCE = 'citation_reference'
590
+ INTERNAL_LINK = 'internal_link'
591
+ CAPTION = 'caption'
592
+ LABEL = 'label'
593
+ TOC_ENTRY = 'toc_entry'
594
+ CROSS_REFERENCE = 'cross_reference'
595
+
596
+ def self.values
597
+ [FOOTNOTE_REFERENCE, CITATION_REFERENCE, INTERNAL_LINK, CAPTION, LABEL, TOC_ENTRY, CROSS_REFERENCE]
598
+ end
599
+ end
600
+
601
+ # Result format classification.
602
+ module ResultFormat
603
+ UNIFIED = 'unified'
604
+ ELEMENT_BASED = 'element_based'
605
+
606
+ def self.values
607
+ [UNIFIED, ELEMENT_BASED]
608
+ end
609
+ end
610
+
611
+ # URI kind classification.
612
+ module UriKind
613
+ HYPERLINK = 'hyperlink'
614
+ IMAGE = 'image'
615
+ ANCHOR = 'anchor'
616
+ CITATION = 'citation'
617
+ REFERENCE = 'reference'
618
+ EMAIL = 'email'
619
+
620
+ def self.values
621
+ [HYPERLINK, IMAGE, ANCHOR, CITATION, REFERENCE, EMAIL]
622
+ end
623
+ end
434
624
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.6.2'
4
+ VERSION = '4.7.0'
5
5
  end
data/lib/kreuzberg_rb.so CHANGED
Binary file