kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +5 -5
  3. data/README.md +15 -9
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
  5. data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
  6. data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
  8. data/kreuzberg.gemspec +38 -4
  9. data/lib/kreuzberg/config.rb +34 -1
  10. data/lib/kreuzberg/result.rb +77 -14
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +23 -6
  13. data/vendor/kreuzberg/Cargo.toml +32 -11
  14. data/vendor/kreuzberg/README.md +54 -8
  15. data/vendor/kreuzberg/build.rs +549 -132
  16. data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
  17. data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
  18. data/vendor/kreuzberg/src/core/config.rs +49 -1
  19. data/vendor/kreuzberg/src/core/extractor.rs +134 -2
  20. data/vendor/kreuzberg/src/core/mod.rs +4 -2
  21. data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
  22. data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
  23. data/vendor/kreuzberg/src/extraction/html.rs +24 -8
  24. data/vendor/kreuzberg/src/extraction/image.rs +124 -1
  25. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
  26. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
  27. data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
  28. data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
  29. data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
  30. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  31. data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
  32. data/vendor/kreuzberg/src/extractors/email.rs +29 -15
  33. data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
  34. data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
  35. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
  36. data/vendor/kreuzberg/src/extractors/html.rs +29 -15
  37. data/vendor/kreuzberg/src/extractors/image.rs +25 -4
  38. data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
  39. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  40. data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
  41. data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
  42. data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
  43. data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
  44. data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
  45. data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
  46. data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
  47. data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
  48. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  49. data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
  50. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  51. data/vendor/kreuzberg/src/extractors/text.rs +7 -2
  52. data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
  53. data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
  54. data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
  55. data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
  56. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
  57. data/vendor/kreuzberg/src/lib.rs +10 -2
  58. data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
  59. data/vendor/kreuzberg/src/mcp/server.rs +120 -12
  60. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  61. data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
  62. data/vendor/kreuzberg/src/pdf/error.rs +8 -0
  63. data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
  64. data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
  65. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
  66. data/vendor/kreuzberg/src/pdf/table.rs +26 -2
  67. data/vendor/kreuzberg/src/pdf/text.rs +89 -7
  68. data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
  69. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  70. data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
  71. data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
  72. data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
  73. data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
  74. data/vendor/kreuzberg/src/text/mod.rs +6 -0
  75. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
  76. data/vendor/kreuzberg/src/types.rs +173 -21
  77. data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
  78. data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
  79. data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
  80. data/vendor/kreuzberg/tests/config_features.rs +15 -1
  81. data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
  82. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
  83. data/vendor/kreuzberg/tests/email_integration.rs +2 -0
  84. data/vendor/kreuzberg/tests/error_handling.rs +43 -34
  85. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  86. data/vendor/kreuzberg/tests/image_integration.rs +2 -0
  87. data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
  88. data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
  89. data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
  90. data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
  91. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
  92. data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
  93. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
  95. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
  96. data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
  97. data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
  98. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
  99. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
  100. data/vendor/kreuzberg/tests/security_validation.rs +1 -0
  101. data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
  102. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
  103. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
  104. data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
  105. data/vendor/rb-sys/Cargo.lock +15 -15
  106. data/vendor/rb-sys/Cargo.toml +4 -4
  107. data/vendor/rb-sys/Cargo.toml.orig +4 -4
  108. data/vendor/rb-sys/build/features.rs +5 -2
  109. data/vendor/rb-sys/build/main.rs +55 -15
  110. data/vendor/rb-sys/build/stable_api_config.rs +4 -2
  111. data/vendor/rb-sys/build/version.rs +3 -1
  112. data/vendor/rb-sys/src/lib.rs +1 -0
  113. data/vendor/rb-sys/src/macros.rs +2 -2
  114. data/vendor/rb-sys/src/special_consts.rs +1 -1
  115. data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
  116. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
  117. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
  118. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
  119. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
  120. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
  121. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
  122. data/vendor/rb-sys/src/stable_api.rs +0 -1
  123. data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
  124. metadata +13 -10
  125. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  126. data/vendor/rb-sys/.cargo-ok +0 -1
  127. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
@@ -7,7 +7,7 @@ rb-sys = { path = "../../../vendor/rb-sys" }
7
7
 
8
8
  [package]
9
9
  name = "kreuzberg-rb"
10
- version = "4.0.0-rc.6"
10
+ version = "4.0.0-rc.8"
11
11
  edition = "2024"
12
12
  rust-version = "1.91"
13
13
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -30,6 +30,7 @@ default = []
30
30
  [dependencies]
31
31
  async-trait = "0.1.89"
32
32
  kreuzberg = { path = "../../../vendor/kreuzberg", features = ["full", "embeddings"] }
33
+ kreuzberg-ffi = { path = "../../../vendor/kreuzberg-ffi", features = ["embeddings"] }
33
34
  magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
34
35
  "rb-sys",
35
36
  ] }
@@ -37,8 +38,17 @@ rb-sys = { version = "0.9.119", default-features = false, features = [
37
38
  "stable-api-compiled-fallback",
38
39
  ] }
39
40
  serde_json = "1.0.145"
40
- tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
41
- html-to-markdown-rs = { version = "2.11.1", default-features = false }
41
+ tokio = { version = "1.48.0", features = [
42
+ "rt",
43
+ "rt-multi-thread",
44
+ "macros",
45
+ "sync",
46
+ "process",
47
+ "fs",
48
+ "time",
49
+ "io-util",
50
+ ] }
51
+ html-to-markdown-rs = { version = "2.14.1", default-features = false }
42
52
 
43
53
  [dev-dependencies]
44
54
  pretty_assertions = "1.4"
@@ -1,3 +1,5 @@
1
+ #![allow(unpredictable_function_pointer_comparisons)]
2
+
1
3
  //! Kreuzberg Ruby Bindings (Magnus)
2
4
  //!
3
5
  //! High-performance document intelligence framework bindings for Ruby.
@@ -7,6 +9,7 @@ use html_to_markdown_rs::options::{
7
9
  CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, PreprocessingPreset,
8
10
  WhitespaceMode,
9
11
  };
12
+ use kreuzberg::core::config::PageConfig;
10
13
  use kreuzberg::keywords::{
11
14
  KeywordAlgorithm as RustKeywordAlgorithm, KeywordConfig as RustKeywordConfig, RakeParams as RustRakeParams,
12
15
  YakeParams as RustYakeParams,
@@ -1050,6 +1053,36 @@ fn html_options_to_ruby_hash(ruby: &Ruby, options: &ConversionOptions) -> Result
1050
1053
 
1051
1054
  Ok(hash)
1052
1055
  }
1056
+
1057
+ /// Parse PageConfig from Ruby Hash
1058
+ fn parse_page_config(ruby: &Ruby, hash: RHash) -> Result<PageConfig, Error> {
1059
+ let extract_pages = if let Some(val) = get_kw(ruby, hash, "extract_pages") {
1060
+ bool::try_convert(val)?
1061
+ } else {
1062
+ false
1063
+ };
1064
+
1065
+ let insert_page_markers = if let Some(val) = get_kw(ruby, hash, "insert_page_markers") {
1066
+ bool::try_convert(val)?
1067
+ } else {
1068
+ false
1069
+ };
1070
+
1071
+ let marker_format = if let Some(val) = get_kw(ruby, hash, "marker_format") {
1072
+ String::try_convert(val)?
1073
+ } else {
1074
+ "\n\n<!-- PAGE {page_num} -->\n\n".to_string()
1075
+ };
1076
+
1077
+ let config = PageConfig {
1078
+ extract_pages,
1079
+ insert_page_markers,
1080
+ marker_format,
1081
+ };
1082
+
1083
+ Ok(config)
1084
+ }
1085
+
1053
1086
  /// Parse ExtractionConfig from Ruby Hash
1054
1087
  fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<ExtractionConfig, Error> {
1055
1088
  let mut config = ExtractionConfig::default();
@@ -1130,6 +1163,13 @@ fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extractio
1130
1163
  config.html_options = Some(parse_html_options(ruby, html_hash)?);
1131
1164
  }
1132
1165
 
1166
+ if let Some(val) = get_kw(ruby, hash, "pages")
1167
+ && !val.is_nil()
1168
+ {
1169
+ let pages_hash = RHash::try_convert(val)?;
1170
+ config.pages = Some(parse_page_config(ruby, pages_hash)?);
1171
+ }
1172
+
1133
1173
  if let Some(val) = get_kw(ruby, hash, "max_concurrent_extractions") {
1134
1174
  let value = usize::try_convert(val)?;
1135
1175
  config.max_concurrent_extractions = Some(value);
@@ -1532,8 +1572,8 @@ fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> Resul
1532
1572
  for chunk in chunks {
1533
1573
  let chunk_hash = ruby.hash_new();
1534
1574
  chunk_hash.aset("content", chunk.content)?;
1535
- chunk_hash.aset("char_start", chunk.metadata.char_start)?;
1536
- chunk_hash.aset("char_end", chunk.metadata.char_end)?;
1575
+ chunk_hash.aset("byte_start", chunk.metadata.byte_start)?;
1576
+ chunk_hash.aset("byte_end", chunk.metadata.byte_end)?;
1537
1577
  if let Some(token_count) = chunk.metadata.token_count {
1538
1578
  chunk_hash.aset("token_count", token_count)?;
1539
1579
  } else {
@@ -1541,6 +1581,16 @@ fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> Resul
1541
1581
  }
1542
1582
  chunk_hash.aset("chunk_index", chunk.metadata.chunk_index)?;
1543
1583
  chunk_hash.aset("total_chunks", chunk.metadata.total_chunks)?;
1584
+ if let Some(first_page) = chunk.metadata.first_page {
1585
+ chunk_hash.aset("first_page", first_page as i64)?;
1586
+ } else {
1587
+ chunk_hash.aset("first_page", ruby.qnil().as_value())?;
1588
+ }
1589
+ if let Some(last_page) = chunk.metadata.last_page {
1590
+ chunk_hash.aset("last_page", last_page as i64)?;
1591
+ } else {
1592
+ chunk_hash.aset("last_page", ruby.qnil().as_value())?;
1593
+ }
1544
1594
  if let Some(embedding) = chunk.embedding {
1545
1595
  let embedding_array = ruby.ary_new();
1546
1596
  for value in embedding {
@@ -1617,6 +1667,92 @@ fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> Resul
1617
1667
  set_hash_entry(ruby, &hash, "images", ruby.qnil().as_value())?;
1618
1668
  }
1619
1669
 
1670
+ if let Some(page_content_list) = result.pages {
1671
+ let pages_array = ruby.ary_new();
1672
+ for page_content in page_content_list {
1673
+ let page_hash = ruby.hash_new();
1674
+ page_hash.aset("page_number", page_content.page_number as i64)?;
1675
+ page_hash.aset("content", page_content.content)?;
1676
+
1677
+ let tables_array = ruby.ary_new();
1678
+ for table in page_content.tables {
1679
+ let table_hash = ruby.hash_new();
1680
+
1681
+ let cells_array = ruby.ary_new();
1682
+ for row in table.cells {
1683
+ let row_array = ruby.ary_from_vec(row);
1684
+ cells_array.push(row_array)?;
1685
+ }
1686
+ table_hash.aset("cells", cells_array)?;
1687
+ table_hash.aset("markdown", table.markdown)?;
1688
+ table_hash.aset("page_number", table.page_number as i64)?;
1689
+
1690
+ tables_array.push(table_hash)?;
1691
+ }
1692
+ page_hash.aset("tables", tables_array)?;
1693
+
1694
+ let images_array = ruby.ary_new();
1695
+ for image in page_content.images {
1696
+ let image_hash = ruby.hash_new();
1697
+ let data_value = ruby.str_from_slice(&image.data).into_value_with(ruby);
1698
+ image_hash.aset("data", data_value)?;
1699
+ image_hash.aset("format", image.format)?;
1700
+ image_hash.aset("image_index", image.image_index as i64)?;
1701
+ if let Some(page) = image.page_number {
1702
+ image_hash.aset("page_number", page as i64)?;
1703
+ } else {
1704
+ image_hash.aset("page_number", ruby.qnil().as_value())?;
1705
+ }
1706
+ if let Some(width) = image.width {
1707
+ image_hash.aset("width", width as i64)?;
1708
+ } else {
1709
+ image_hash.aset("width", ruby.qnil().as_value())?;
1710
+ }
1711
+ if let Some(height) = image.height {
1712
+ image_hash.aset("height", height as i64)?;
1713
+ } else {
1714
+ image_hash.aset("height", ruby.qnil().as_value())?;
1715
+ }
1716
+ if let Some(colorspace) = image.colorspace {
1717
+ image_hash.aset("colorspace", colorspace)?;
1718
+ } else {
1719
+ image_hash.aset("colorspace", ruby.qnil().as_value())?;
1720
+ }
1721
+ if let Some(bits) = image.bits_per_component {
1722
+ image_hash.aset("bits_per_component", bits as i64)?;
1723
+ } else {
1724
+ image_hash.aset("bits_per_component", ruby.qnil().as_value())?;
1725
+ }
1726
+ image_hash.aset(
1727
+ "is_mask",
1728
+ if image.is_mask {
1729
+ ruby.qtrue().as_value()
1730
+ } else {
1731
+ ruby.qfalse().as_value()
1732
+ },
1733
+ )?;
1734
+ if let Some(description) = image.description {
1735
+ image_hash.aset("description", description)?;
1736
+ } else {
1737
+ image_hash.aset("description", ruby.qnil().as_value())?;
1738
+ }
1739
+ if let Some(ocr_result) = image.ocr_result {
1740
+ let nested = extraction_result_to_ruby(ruby, *ocr_result)?;
1741
+ image_hash.aset("ocr_result", nested.into_value_with(ruby))?;
1742
+ } else {
1743
+ image_hash.aset("ocr_result", ruby.qnil().as_value())?;
1744
+ }
1745
+ images_array.push(image_hash)?;
1746
+ }
1747
+ page_hash.aset("images", images_array)?;
1748
+
1749
+ pages_array.push(page_hash)?;
1750
+ }
1751
+ set_hash_entry(ruby, &hash, "pages", pages_array.into_value_with(ruby))?;
1752
+ } else {
1753
+ set_hash_entry(ruby, &hash, "pages", ruby.qnil().as_value())?;
1754
+ }
1755
+
1620
1756
  Ok(hash)
1621
1757
  }
1622
1758
 
@@ -2366,6 +2502,7 @@ fn register_ocr_backend(name: String, backend: Value) -> Result<(), Error> {
2366
2502
  detected_languages: None,
2367
2503
  chunks: None,
2368
2504
  images: None,
2505
+ pages: None,
2369
2506
  })
2370
2507
  }
2371
2508
 
data/kreuzberg.gemspec CHANGED
@@ -21,6 +21,15 @@ core_files =
21
21
  .map { |path| path.delete_prefix('crates/') }
22
22
  .map { |path| "vendor/#{path}" }
23
23
 
24
+ # Include the kreuzberg-ffi crate
25
+ ffi_prefix = 'crates/kreuzberg-ffi/'
26
+ ffi_cmd = %(git -C "#{repo_root}" ls-files -z #{ffi_prefix})
27
+ ffi_files =
28
+ `#{ffi_cmd}`.split("\x0")
29
+ .select { |path| path.start_with?(ffi_prefix) }
30
+ .map { |path| path.delete_prefix('crates/') }
31
+ .map { |path| "vendor/#{path}" }
32
+
24
33
  fallback_files = Dir.chdir(__dir__) do
25
34
  ruby_fallback = Dir.glob(
26
35
  %w[
@@ -45,10 +54,24 @@ fallback_files = Dir.chdir(__dir__) do
45
54
  core_fallback = Dir.chdir(repo_root) do
46
55
  Dir.glob('crates/kreuzberg/**/*', File::FNM_DOTMATCH)
47
56
  .reject { |f| File.directory?(f) }
57
+ .reject { |f| f.include?('/.fastembed_cache/') }
58
+ .reject { |f| f.include?('/target/') }
59
+ .grep_v(/\.(swp|bak|tmp)$/)
60
+ .grep_v(/~$/)
61
+ .map { |path| "vendor/#{path.delete_prefix('crates/')}" }
62
+ end
63
+
64
+ # Fallback for FFI crate - copy from repo root
65
+ ffi_fallback = Dir.chdir(repo_root) do
66
+ Dir.glob('crates/kreuzberg-ffi/**/*', File::FNM_DOTMATCH)
67
+ .reject { |f| File.directory?(f) }
68
+ .reject { |f| f.include?('/target/') }
69
+ .grep_v(/\.(swp|bak|tmp)$/)
70
+ .grep_v(/~$/)
48
71
  .map { |path| "vendor/#{path.delete_prefix('crates/')}" }
49
72
  end
50
73
 
51
- ruby_fallback + core_fallback
74
+ ruby_fallback + core_fallback + ffi_fallback
52
75
  end
53
76
 
54
77
  # Check for vendored crates (copied during CI/packaging)
@@ -57,6 +80,7 @@ vendor_files = Dir.chdir(__dir__) do
57
80
  Dir.glob('vendor/kreuzberg/**/*', File::FNM_DOTMATCH)
58
81
  .reject { |f| File.directory?(f) }
59
82
  .reject { |f| f.include?('/.fastembed_cache/') }
83
+ .reject { |f| f.include?('/.kreuzberg/') }
60
84
  .reject { |f| f.include?('/target/') }
61
85
  .grep_v(/\.(swp|bak|tmp)$/)
62
86
  .grep_v(/~$/)
@@ -64,6 +88,16 @@ vendor_files = Dir.chdir(__dir__) do
64
88
  []
65
89
  end
66
90
 
91
+ kreuzberg_ffi_files = if Dir.exist?('vendor/kreuzberg-ffi')
92
+ Dir.glob('vendor/kreuzberg-ffi/**/*', File::FNM_DOTMATCH)
93
+ .reject { |f| File.directory?(f) }
94
+ .reject { |f| f.include?('/target/') }
95
+ .grep_v(/\.(swp|bak|tmp)$/)
96
+ .grep_v(/~$/)
97
+ else
98
+ []
99
+ end
100
+
67
101
  rb_sys_files = if Dir.exist?('vendor/rb-sys')
68
102
  Dir.glob('vendor/rb-sys/**/*', File::FNM_DOTMATCH)
69
103
  .reject { |f| File.directory?(f) }
@@ -80,17 +114,17 @@ vendor_files = Dir.chdir(__dir__) do
80
114
  []
81
115
  end
82
116
 
83
- kreuzberg_files + rb_sys_files + workspace_toml
117
+ kreuzberg_files + kreuzberg_ffi_files + rb_sys_files + workspace_toml
84
118
  end
85
119
 
86
120
  # Use git-tracked files if available, otherwise fallback to glob
87
121
  # Always include vendored files if they exist on disk (for CI packaging)
88
- files = if (ruby_files + core_files).empty?
122
+ files = if (ruby_files + core_files + ffi_files).empty?
89
123
  fallback_files
90
124
  elsif vendor_files.any?
91
125
  ruby_files + vendor_files
92
126
  else
93
- ruby_files + core_files
127
+ ruby_files + core_files + ffi_files
94
128
  end
95
129
 
96
130
  # Filter to only include files that actually exist
@@ -492,6 +492,36 @@ module Kreuzberg
492
492
  end
493
493
  end
494
494
 
495
+ # Page tracking configuration for multi-page documents
496
+ #
497
+ # @example Enable page extraction
498
+ # pages = PageConfig.new(extract_pages: true)
499
+ #
500
+ # @example Enable page markers in content
501
+ # pages = PageConfig.new(insert_page_markers: true, marker_format: "--- PAGE {page_num} ---")
502
+ #
503
+ class PageConfig
504
+ attr_reader :extract_pages, :insert_page_markers, :marker_format
505
+
506
+ def initialize(
507
+ extract_pages: false,
508
+ insert_page_markers: false,
509
+ marker_format: "\n\n<!-- PAGE {page_num} -->\n\n"
510
+ )
511
+ @extract_pages = extract_pages ? true : false
512
+ @insert_page_markers = insert_page_markers ? true : false
513
+ @marker_format = marker_format.to_s
514
+ end
515
+
516
+ def to_h
517
+ {
518
+ extract_pages: @extract_pages,
519
+ insert_page_markers: @insert_page_markers,
520
+ marker_format: @marker_format
521
+ }
522
+ end
523
+ end
524
+
495
525
  # Post-processor configuration
496
526
  #
497
527
  # @example Enable all post-processors
@@ -576,7 +606,7 @@ module Kreuzberg
576
606
  attr_reader :use_cache, :enable_quality_processing, :force_ocr,
577
607
  :ocr, :chunking, :language_detection, :pdf_options,
578
608
  :image_extraction, :image_preprocessing, :postprocessor,
579
- :token_reduction, :keywords, :html_options,
609
+ :token_reduction, :keywords, :html_options, :pages,
580
610
  :max_concurrent_extractions
581
611
 
582
612
  # Load configuration from a file.
@@ -634,6 +664,7 @@ module Kreuzberg
634
664
  token_reduction: nil,
635
665
  keywords: nil,
636
666
  html_options: nil,
667
+ pages: nil,
637
668
  max_concurrent_extractions: nil
638
669
  )
639
670
  @use_cache = use_cache ? true : false
@@ -649,6 +680,7 @@ module Kreuzberg
649
680
  @token_reduction = normalize_config(token_reduction, TokenReduction)
650
681
  @keywords = normalize_config(keywords, Keywords)
651
682
  @html_options = normalize_config(html_options, HtmlOptions)
683
+ @pages = normalize_config(pages, PageConfig)
652
684
  @max_concurrent_extractions = max_concurrent_extractions&.to_i
653
685
  end
654
686
 
@@ -668,6 +700,7 @@ module Kreuzberg
668
700
  token_reduction: @token_reduction&.to_h,
669
701
  keywords: @keywords&.to_h,
670
702
  html_options: @html_options&.to_h,
703
+ pages: @pages&.to_h,
671
704
  max_concurrent_extractions: @max_concurrent_extractions
672
705
  }.compact
673
706
  end
@@ -21,7 +21,7 @@ module Kreuzberg
21
21
  # rubocop:disable Metrics/ClassLength
22
22
  class Result
23
23
  attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
24
- :detected_languages, :chunks, :images
24
+ :detected_languages, :chunks, :images, :pages
25
25
 
26
26
  # Table structure
27
27
  #
@@ -42,31 +42,39 @@ module Kreuzberg
42
42
  #
43
43
  # @!attribute [r] content
44
44
  # @return [String] Chunk content
45
- # @!attribute [r] char_start
46
- # @return [Integer] Starting character index
47
- # @!attribute [r] char_end
48
- # @return [Integer] Ending character index
45
+ # @!attribute [r] byte_start
46
+ # @return [Integer] Starting byte offset (UTF-8)
47
+ # @!attribute [r] byte_end
48
+ # @return [Integer] Ending byte offset (UTF-8)
49
49
  # @!attribute [r] token_count
50
50
  # @return [Integer, nil] Approximate token count (may be nil)
51
+ # @!attribute [r] first_page
52
+ # @return [Integer, nil] First page number (1-indexed)
53
+ # @!attribute [r] last_page
54
+ # @return [Integer, nil] Last page number (1-indexed)
51
55
  #
52
56
  Chunk = Struct.new(
53
57
  :content,
54
- :char_start,
55
- :char_end,
58
+ :byte_start,
59
+ :byte_end,
56
60
  :token_count,
57
61
  :chunk_index,
58
62
  :total_chunks,
63
+ :first_page,
64
+ :last_page,
59
65
  :embedding,
60
66
  keyword_init: true
61
67
  ) do
62
68
  def to_h
63
69
  {
64
70
  content: content,
65
- char_start: char_start,
66
- char_end: char_end,
71
+ byte_start: byte_start,
72
+ byte_end: byte_end,
67
73
  token_count: token_count,
68
74
  chunk_index: chunk_index,
69
75
  total_chunks: total_chunks,
76
+ first_page: first_page,
77
+ last_page: last_page,
70
78
  embedding: embedding
71
79
  }
72
80
  end
@@ -103,6 +111,28 @@ module Kreuzberg
103
111
  end
104
112
  end
105
113
 
114
+ # Per-page content
115
+ #
116
+ # @!attribute [r] page_number
117
+ # @return [Integer] Page number (1-indexed)
118
+ # @!attribute [r] content
119
+ # @return [String] Text content for this page
120
+ # @!attribute [r] tables
121
+ # @return [Array<Table>] Tables on this page
122
+ # @!attribute [r] images
123
+ # @return [Array<Image>] Images on this page
124
+ #
125
+ PageContent = Struct.new(:page_number, :content, :tables, :images, keyword_init: true) do
126
+ def to_h
127
+ {
128
+ page_number: page_number,
129
+ content: content,
130
+ tables: tables.map(&:to_h),
131
+ images: images.map(&:to_h)
132
+ }
133
+ end
134
+ end
135
+
106
136
  # Initialize from native hash result
107
137
  #
108
138
  # @param hash [Hash] Hash returned from native extension
@@ -117,6 +147,7 @@ module Kreuzberg
117
147
  @detected_languages = parse_detected_languages(get_value(hash, 'detected_languages'))
118
148
  @chunks = parse_chunks(get_value(hash, 'chunks'))
119
149
  @images = parse_images(get_value(hash, 'images'))
150
+ @pages = parse_pages(get_value(hash, 'pages'))
120
151
  end
121
152
 
122
153
  # Convert to hash
@@ -128,10 +159,11 @@ module Kreuzberg
128
159
  content: @content,
129
160
  mime_type: @mime_type,
130
161
  metadata: @metadata,
131
- tables: @tables.map(&:to_h),
162
+ tables: serialize_tables,
132
163
  detected_languages: @detected_languages,
133
- chunks: @chunks&.map(&:to_h),
134
- images: @images&.map(&:to_h)
164
+ chunks: serialize_chunks,
165
+ images: serialize_images,
166
+ pages: serialize_pages
135
167
  }
136
168
  end
137
169
 
@@ -145,6 +177,22 @@ module Kreuzberg
145
177
 
146
178
  private
147
179
 
180
+ def serialize_tables
181
+ @tables.map(&:to_h)
182
+ end
183
+
184
+ def serialize_chunks
185
+ @chunks&.map(&:to_h)
186
+ end
187
+
188
+ def serialize_images
189
+ @images&.map(&:to_h)
190
+ end
191
+
192
+ def serialize_pages
193
+ @pages&.map(&:to_h)
194
+ end
195
+
148
196
  def get_value(hash, key, default = nil)
149
197
  hash[key] || hash[key.to_sym] || default
150
198
  end
@@ -180,11 +228,13 @@ module Kreuzberg
180
228
  chunks_data.map do |chunk_hash|
181
229
  Chunk.new(
182
230
  content: chunk_hash['content'],
183
- char_start: chunk_hash['char_start'],
184
- char_end: chunk_hash['char_end'],
231
+ byte_start: chunk_hash['byte_start'],
232
+ byte_end: chunk_hash['byte_end'],
185
233
  token_count: chunk_hash['token_count'],
186
234
  chunk_index: chunk_hash['chunk_index'],
187
235
  total_chunks: chunk_hash['total_chunks'],
236
+ first_page: chunk_hash['first_page'],
237
+ last_page: chunk_hash['last_page'],
188
238
  embedding: chunk_hash['embedding']
189
239
  )
190
240
  end
@@ -211,6 +261,19 @@ module Kreuzberg
211
261
  )
212
262
  end
213
263
  end
264
+
265
+ def parse_pages(pages_data)
266
+ return nil if pages_data.nil?
267
+
268
+ pages_data.map do |page_hash|
269
+ PageContent.new(
270
+ page_number: page_hash['page_number'],
271
+ content: page_hash['content'],
272
+ tables: parse_tables(page_hash['tables']),
273
+ images: parse_images(page_hash['images'])
274
+ )
275
+ end
276
+ end
214
277
  end
215
278
  # rubocop:enable Metrics/ClassLength
216
279
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.0.0-rc.6'
4
+ VERSION = '4.0.0-rc.8'
5
5
  end
data/sig/kreuzberg.rbs CHANGED
@@ -168,6 +168,15 @@ module Kreuzberg
168
168
  def to_h: () -> Hash[Symbol, untyped]
169
169
  end
170
170
 
171
+ class PageConfig
172
+ attr_reader extract_pages: bool
173
+ attr_reader insert_page_markers: bool
174
+ attr_reader marker_format: String
175
+
176
+ def initialize: (?extract_pages: bool, ?insert_page_markers: bool, ?marker_format: String) -> void
177
+ def to_h: () -> Hash[Symbol, untyped]
178
+ end
179
+
171
180
  class Extraction
172
181
  attr_reader use_cache: bool
173
182
  attr_reader enable_quality_processing: bool
@@ -182,6 +191,7 @@ module Kreuzberg
182
191
  attr_reader token_reduction: TokenReduction?
183
192
  attr_reader keywords: Keywords?
184
193
  attr_reader html_options: HtmlOptions?
194
+ attr_reader pages: PageConfig?
185
195
  attr_reader max_concurrent_extractions: Integer?
186
196
 
187
197
  def self.from_file: (String path) -> Extraction
@@ -199,6 +209,7 @@ module Kreuzberg
199
209
  ?token_reduction: (TokenReduction | Hash[Symbol, untyped])?,
200
210
  ?keywords: (Keywords | Hash[Symbol, untyped])?,
201
211
  ?html_options: (HtmlOptions | Hash[Symbol, untyped])?,
212
+ ?pages: (PageConfig | Hash[Symbol, untyped])?,
202
213
  ?max_concurrent_extractions: Integer?
203
214
  ) -> void
204
215
  def to_h: () -> Hash[Symbol, untyped]
@@ -234,11 +245,13 @@ module Kreuzberg
234
245
 
235
246
  type chunk_hash = {
236
247
  content: String,
237
- char_start: Integer,
238
- char_end: Integer,
248
+ byte_start: Integer,
249
+ byte_end: Integer,
239
250
  token_count: Integer?,
240
251
  chunk_index: Integer?,
241
252
  total_chunks: Integer?,
253
+ first_page: Integer?,
254
+ last_page: Integer?,
242
255
  embedding: Array[Float]?
243
256
  }
244
257
 
@@ -278,20 +291,24 @@ module Kreuzberg
278
291
  # Text chunk
279
292
  class Chunk
280
293
  attr_reader content: String
281
- attr_reader char_start: Integer
282
- attr_reader char_end: Integer
294
+ attr_reader byte_start: Integer
295
+ attr_reader byte_end: Integer
283
296
  attr_reader token_count: Integer?
284
297
  attr_reader chunk_index: Integer?
285
298
  attr_reader total_chunks: Integer?
299
+ attr_reader first_page: Integer?
300
+ attr_reader last_page: Integer?
286
301
  attr_reader embedding: Array[Float]?
287
302
 
288
303
  def initialize: (
289
304
  content: String,
290
- char_start: Integer,
291
- char_end: Integer,
305
+ byte_start: Integer,
306
+ byte_end: Integer,
292
307
  token_count: Integer?,
293
308
  chunk_index: Integer?,
294
309
  total_chunks: Integer?,
310
+ first_page: Integer?,
311
+ last_page: Integer?,
295
312
  embedding: Array[Float]?
296
313
  ) -> void
297
314
  def to_h: () -> chunk_hash