kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -5
- data/README.md +15 -9
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
- data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
- data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
- data/kreuzberg.gemspec +38 -4
- data/lib/kreuzberg/config.rb +34 -1
- data/lib/kreuzberg/result.rb +77 -14
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +23 -6
- data/vendor/kreuzberg/Cargo.toml +32 -11
- data/vendor/kreuzberg/README.md +54 -8
- data/vendor/kreuzberg/build.rs +549 -132
- data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
- data/vendor/kreuzberg/src/core/config.rs +49 -1
- data/vendor/kreuzberg/src/core/extractor.rs +134 -2
- data/vendor/kreuzberg/src/core/mod.rs +4 -2
- data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
- data/vendor/kreuzberg/src/extraction/html.rs +24 -8
- data/vendor/kreuzberg/src/extraction/image.rs +124 -1
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
- data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
- data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
- data/vendor/kreuzberg/src/extractors/email.rs +29 -15
- data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
- data/vendor/kreuzberg/src/extractors/html.rs +29 -15
- data/vendor/kreuzberg/src/extractors/image.rs +25 -4
- data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
- data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
- data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
- data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
- data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +7 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
- data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
- data/vendor/kreuzberg/src/lib.rs +10 -2
- data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
- data/vendor/kreuzberg/src/mcp/server.rs +120 -12
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
- data/vendor/kreuzberg/src/pdf/error.rs +8 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
- data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
- data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
- data/vendor/kreuzberg/src/pdf/table.rs +26 -2
- data/vendor/kreuzberg/src/pdf/text.rs +89 -7
- data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
- data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
- data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
- data/vendor/kreuzberg/src/text/mod.rs +6 -0
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
- data/vendor/kreuzberg/src/types.rs +173 -21
- data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
- data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
- data/vendor/kreuzberg/tests/config_features.rs +15 -1
- data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/email_integration.rs +2 -0
- data/vendor/kreuzberg/tests/error_handling.rs +43 -34
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/image_integration.rs +2 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
- data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
- data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
- data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
- data/vendor/rb-sys/Cargo.lock +15 -15
- data/vendor/rb-sys/Cargo.toml +4 -4
- data/vendor/rb-sys/Cargo.toml.orig +4 -4
- data/vendor/rb-sys/build/features.rs +5 -2
- data/vendor/rb-sys/build/main.rs +55 -15
- data/vendor/rb-sys/build/stable_api_config.rs +4 -2
- data/vendor/rb-sys/build/version.rs +3 -1
- data/vendor/rb-sys/src/lib.rs +1 -0
- data/vendor/rb-sys/src/macros.rs +2 -2
- data/vendor/rb-sys/src/special_consts.rs +1 -1
- data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
- data/vendor/rb-sys/src/stable_api.rs +0 -1
- data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
- metadata +13 -10
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
|
@@ -7,7 +7,7 @@ rb-sys = { path = "../../../vendor/rb-sys" }
|
|
|
7
7
|
|
|
8
8
|
[package]
|
|
9
9
|
name = "kreuzberg-rb"
|
|
10
|
-
version = "4.0.0-rc.
|
|
10
|
+
version = "4.0.0-rc.8"
|
|
11
11
|
edition = "2024"
|
|
12
12
|
rust-version = "1.91"
|
|
13
13
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -30,6 +30,7 @@ default = []
|
|
|
30
30
|
[dependencies]
|
|
31
31
|
async-trait = "0.1.89"
|
|
32
32
|
kreuzberg = { path = "../../../vendor/kreuzberg", features = ["full", "embeddings"] }
|
|
33
|
+
kreuzberg-ffi = { path = "../../../vendor/kreuzberg-ffi", features = ["embeddings"] }
|
|
33
34
|
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
|
|
34
35
|
"rb-sys",
|
|
35
36
|
] }
|
|
@@ -37,8 +38,17 @@ rb-sys = { version = "0.9.119", default-features = false, features = [
|
|
|
37
38
|
"stable-api-compiled-fallback",
|
|
38
39
|
] }
|
|
39
40
|
serde_json = "1.0.145"
|
|
40
|
-
tokio = { version = "1.48.0", features = [
|
|
41
|
-
|
|
41
|
+
tokio = { version = "1.48.0", features = [
|
|
42
|
+
"rt",
|
|
43
|
+
"rt-multi-thread",
|
|
44
|
+
"macros",
|
|
45
|
+
"sync",
|
|
46
|
+
"process",
|
|
47
|
+
"fs",
|
|
48
|
+
"time",
|
|
49
|
+
"io-util",
|
|
50
|
+
] }
|
|
51
|
+
html-to-markdown-rs = { version = "2.14.1", default-features = false }
|
|
42
52
|
|
|
43
53
|
[dev-dependencies]
|
|
44
54
|
pretty_assertions = "1.4"
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
#![allow(unpredictable_function_pointer_comparisons)]
|
|
2
|
+
|
|
1
3
|
//! Kreuzberg Ruby Bindings (Magnus)
|
|
2
4
|
//!
|
|
3
5
|
//! High-performance document intelligence framework bindings for Ruby.
|
|
@@ -7,6 +9,7 @@ use html_to_markdown_rs::options::{
|
|
|
7
9
|
CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, PreprocessingPreset,
|
|
8
10
|
WhitespaceMode,
|
|
9
11
|
};
|
|
12
|
+
use kreuzberg::core::config::PageConfig;
|
|
10
13
|
use kreuzberg::keywords::{
|
|
11
14
|
KeywordAlgorithm as RustKeywordAlgorithm, KeywordConfig as RustKeywordConfig, RakeParams as RustRakeParams,
|
|
12
15
|
YakeParams as RustYakeParams,
|
|
@@ -1050,6 +1053,36 @@ fn html_options_to_ruby_hash(ruby: &Ruby, options: &ConversionOptions) -> Result
|
|
|
1050
1053
|
|
|
1051
1054
|
Ok(hash)
|
|
1052
1055
|
}
|
|
1056
|
+
|
|
1057
|
+
/// Parse PageConfig from Ruby Hash
|
|
1058
|
+
fn parse_page_config(ruby: &Ruby, hash: RHash) -> Result<PageConfig, Error> {
|
|
1059
|
+
let extract_pages = if let Some(val) = get_kw(ruby, hash, "extract_pages") {
|
|
1060
|
+
bool::try_convert(val)?
|
|
1061
|
+
} else {
|
|
1062
|
+
false
|
|
1063
|
+
};
|
|
1064
|
+
|
|
1065
|
+
let insert_page_markers = if let Some(val) = get_kw(ruby, hash, "insert_page_markers") {
|
|
1066
|
+
bool::try_convert(val)?
|
|
1067
|
+
} else {
|
|
1068
|
+
false
|
|
1069
|
+
};
|
|
1070
|
+
|
|
1071
|
+
let marker_format = if let Some(val) = get_kw(ruby, hash, "marker_format") {
|
|
1072
|
+
String::try_convert(val)?
|
|
1073
|
+
} else {
|
|
1074
|
+
"\n\n<!-- PAGE {page_num} -->\n\n".to_string()
|
|
1075
|
+
};
|
|
1076
|
+
|
|
1077
|
+
let config = PageConfig {
|
|
1078
|
+
extract_pages,
|
|
1079
|
+
insert_page_markers,
|
|
1080
|
+
marker_format,
|
|
1081
|
+
};
|
|
1082
|
+
|
|
1083
|
+
Ok(config)
|
|
1084
|
+
}
|
|
1085
|
+
|
|
1053
1086
|
/// Parse ExtractionConfig from Ruby Hash
|
|
1054
1087
|
fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<ExtractionConfig, Error> {
|
|
1055
1088
|
let mut config = ExtractionConfig::default();
|
|
@@ -1130,6 +1163,13 @@ fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extractio
|
|
|
1130
1163
|
config.html_options = Some(parse_html_options(ruby, html_hash)?);
|
|
1131
1164
|
}
|
|
1132
1165
|
|
|
1166
|
+
if let Some(val) = get_kw(ruby, hash, "pages")
|
|
1167
|
+
&& !val.is_nil()
|
|
1168
|
+
{
|
|
1169
|
+
let pages_hash = RHash::try_convert(val)?;
|
|
1170
|
+
config.pages = Some(parse_page_config(ruby, pages_hash)?);
|
|
1171
|
+
}
|
|
1172
|
+
|
|
1133
1173
|
if let Some(val) = get_kw(ruby, hash, "max_concurrent_extractions") {
|
|
1134
1174
|
let value = usize::try_convert(val)?;
|
|
1135
1175
|
config.max_concurrent_extractions = Some(value);
|
|
@@ -1532,8 +1572,8 @@ fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> Resul
|
|
|
1532
1572
|
for chunk in chunks {
|
|
1533
1573
|
let chunk_hash = ruby.hash_new();
|
|
1534
1574
|
chunk_hash.aset("content", chunk.content)?;
|
|
1535
|
-
chunk_hash.aset("
|
|
1536
|
-
chunk_hash.aset("
|
|
1575
|
+
chunk_hash.aset("byte_start", chunk.metadata.byte_start)?;
|
|
1576
|
+
chunk_hash.aset("byte_end", chunk.metadata.byte_end)?;
|
|
1537
1577
|
if let Some(token_count) = chunk.metadata.token_count {
|
|
1538
1578
|
chunk_hash.aset("token_count", token_count)?;
|
|
1539
1579
|
} else {
|
|
@@ -1541,6 +1581,16 @@ fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> Resul
|
|
|
1541
1581
|
}
|
|
1542
1582
|
chunk_hash.aset("chunk_index", chunk.metadata.chunk_index)?;
|
|
1543
1583
|
chunk_hash.aset("total_chunks", chunk.metadata.total_chunks)?;
|
|
1584
|
+
if let Some(first_page) = chunk.metadata.first_page {
|
|
1585
|
+
chunk_hash.aset("first_page", first_page as i64)?;
|
|
1586
|
+
} else {
|
|
1587
|
+
chunk_hash.aset("first_page", ruby.qnil().as_value())?;
|
|
1588
|
+
}
|
|
1589
|
+
if let Some(last_page) = chunk.metadata.last_page {
|
|
1590
|
+
chunk_hash.aset("last_page", last_page as i64)?;
|
|
1591
|
+
} else {
|
|
1592
|
+
chunk_hash.aset("last_page", ruby.qnil().as_value())?;
|
|
1593
|
+
}
|
|
1544
1594
|
if let Some(embedding) = chunk.embedding {
|
|
1545
1595
|
let embedding_array = ruby.ary_new();
|
|
1546
1596
|
for value in embedding {
|
|
@@ -1617,6 +1667,92 @@ fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> Resul
|
|
|
1617
1667
|
set_hash_entry(ruby, &hash, "images", ruby.qnil().as_value())?;
|
|
1618
1668
|
}
|
|
1619
1669
|
|
|
1670
|
+
if let Some(page_content_list) = result.pages {
|
|
1671
|
+
let pages_array = ruby.ary_new();
|
|
1672
|
+
for page_content in page_content_list {
|
|
1673
|
+
let page_hash = ruby.hash_new();
|
|
1674
|
+
page_hash.aset("page_number", page_content.page_number as i64)?;
|
|
1675
|
+
page_hash.aset("content", page_content.content)?;
|
|
1676
|
+
|
|
1677
|
+
let tables_array = ruby.ary_new();
|
|
1678
|
+
for table in page_content.tables {
|
|
1679
|
+
let table_hash = ruby.hash_new();
|
|
1680
|
+
|
|
1681
|
+
let cells_array = ruby.ary_new();
|
|
1682
|
+
for row in table.cells {
|
|
1683
|
+
let row_array = ruby.ary_from_vec(row);
|
|
1684
|
+
cells_array.push(row_array)?;
|
|
1685
|
+
}
|
|
1686
|
+
table_hash.aset("cells", cells_array)?;
|
|
1687
|
+
table_hash.aset("markdown", table.markdown)?;
|
|
1688
|
+
table_hash.aset("page_number", table.page_number as i64)?;
|
|
1689
|
+
|
|
1690
|
+
tables_array.push(table_hash)?;
|
|
1691
|
+
}
|
|
1692
|
+
page_hash.aset("tables", tables_array)?;
|
|
1693
|
+
|
|
1694
|
+
let images_array = ruby.ary_new();
|
|
1695
|
+
for image in page_content.images {
|
|
1696
|
+
let image_hash = ruby.hash_new();
|
|
1697
|
+
let data_value = ruby.str_from_slice(&image.data).into_value_with(ruby);
|
|
1698
|
+
image_hash.aset("data", data_value)?;
|
|
1699
|
+
image_hash.aset("format", image.format)?;
|
|
1700
|
+
image_hash.aset("image_index", image.image_index as i64)?;
|
|
1701
|
+
if let Some(page) = image.page_number {
|
|
1702
|
+
image_hash.aset("page_number", page as i64)?;
|
|
1703
|
+
} else {
|
|
1704
|
+
image_hash.aset("page_number", ruby.qnil().as_value())?;
|
|
1705
|
+
}
|
|
1706
|
+
if let Some(width) = image.width {
|
|
1707
|
+
image_hash.aset("width", width as i64)?;
|
|
1708
|
+
} else {
|
|
1709
|
+
image_hash.aset("width", ruby.qnil().as_value())?;
|
|
1710
|
+
}
|
|
1711
|
+
if let Some(height) = image.height {
|
|
1712
|
+
image_hash.aset("height", height as i64)?;
|
|
1713
|
+
} else {
|
|
1714
|
+
image_hash.aset("height", ruby.qnil().as_value())?;
|
|
1715
|
+
}
|
|
1716
|
+
if let Some(colorspace) = image.colorspace {
|
|
1717
|
+
image_hash.aset("colorspace", colorspace)?;
|
|
1718
|
+
} else {
|
|
1719
|
+
image_hash.aset("colorspace", ruby.qnil().as_value())?;
|
|
1720
|
+
}
|
|
1721
|
+
if let Some(bits) = image.bits_per_component {
|
|
1722
|
+
image_hash.aset("bits_per_component", bits as i64)?;
|
|
1723
|
+
} else {
|
|
1724
|
+
image_hash.aset("bits_per_component", ruby.qnil().as_value())?;
|
|
1725
|
+
}
|
|
1726
|
+
image_hash.aset(
|
|
1727
|
+
"is_mask",
|
|
1728
|
+
if image.is_mask {
|
|
1729
|
+
ruby.qtrue().as_value()
|
|
1730
|
+
} else {
|
|
1731
|
+
ruby.qfalse().as_value()
|
|
1732
|
+
},
|
|
1733
|
+
)?;
|
|
1734
|
+
if let Some(description) = image.description {
|
|
1735
|
+
image_hash.aset("description", description)?;
|
|
1736
|
+
} else {
|
|
1737
|
+
image_hash.aset("description", ruby.qnil().as_value())?;
|
|
1738
|
+
}
|
|
1739
|
+
if let Some(ocr_result) = image.ocr_result {
|
|
1740
|
+
let nested = extraction_result_to_ruby(ruby, *ocr_result)?;
|
|
1741
|
+
image_hash.aset("ocr_result", nested.into_value_with(ruby))?;
|
|
1742
|
+
} else {
|
|
1743
|
+
image_hash.aset("ocr_result", ruby.qnil().as_value())?;
|
|
1744
|
+
}
|
|
1745
|
+
images_array.push(image_hash)?;
|
|
1746
|
+
}
|
|
1747
|
+
page_hash.aset("images", images_array)?;
|
|
1748
|
+
|
|
1749
|
+
pages_array.push(page_hash)?;
|
|
1750
|
+
}
|
|
1751
|
+
set_hash_entry(ruby, &hash, "pages", pages_array.into_value_with(ruby))?;
|
|
1752
|
+
} else {
|
|
1753
|
+
set_hash_entry(ruby, &hash, "pages", ruby.qnil().as_value())?;
|
|
1754
|
+
}
|
|
1755
|
+
|
|
1620
1756
|
Ok(hash)
|
|
1621
1757
|
}
|
|
1622
1758
|
|
|
@@ -2366,6 +2502,7 @@ fn register_ocr_backend(name: String, backend: Value) -> Result<(), Error> {
|
|
|
2366
2502
|
detected_languages: None,
|
|
2367
2503
|
chunks: None,
|
|
2368
2504
|
images: None,
|
|
2505
|
+
pages: None,
|
|
2369
2506
|
})
|
|
2370
2507
|
}
|
|
2371
2508
|
|
data/kreuzberg.gemspec
CHANGED
|
@@ -21,6 +21,15 @@ core_files =
|
|
|
21
21
|
.map { |path| path.delete_prefix('crates/') }
|
|
22
22
|
.map { |path| "vendor/#{path}" }
|
|
23
23
|
|
|
24
|
+
# Include the kreuzberg-ffi crate
|
|
25
|
+
ffi_prefix = 'crates/kreuzberg-ffi/'
|
|
26
|
+
ffi_cmd = %(git -C "#{repo_root}" ls-files -z #{ffi_prefix})
|
|
27
|
+
ffi_files =
|
|
28
|
+
`#{ffi_cmd}`.split("\x0")
|
|
29
|
+
.select { |path| path.start_with?(ffi_prefix) }
|
|
30
|
+
.map { |path| path.delete_prefix('crates/') }
|
|
31
|
+
.map { |path| "vendor/#{path}" }
|
|
32
|
+
|
|
24
33
|
fallback_files = Dir.chdir(__dir__) do
|
|
25
34
|
ruby_fallback = Dir.glob(
|
|
26
35
|
%w[
|
|
@@ -45,10 +54,24 @@ fallback_files = Dir.chdir(__dir__) do
|
|
|
45
54
|
core_fallback = Dir.chdir(repo_root) do
|
|
46
55
|
Dir.glob('crates/kreuzberg/**/*', File::FNM_DOTMATCH)
|
|
47
56
|
.reject { |f| File.directory?(f) }
|
|
57
|
+
.reject { |f| f.include?('/.fastembed_cache/') }
|
|
58
|
+
.reject { |f| f.include?('/target/') }
|
|
59
|
+
.grep_v(/\.(swp|bak|tmp)$/)
|
|
60
|
+
.grep_v(/~$/)
|
|
61
|
+
.map { |path| "vendor/#{path.delete_prefix('crates/')}" }
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
# Fallback for FFI crate - copy from repo root
|
|
65
|
+
ffi_fallback = Dir.chdir(repo_root) do
|
|
66
|
+
Dir.glob('crates/kreuzberg-ffi/**/*', File::FNM_DOTMATCH)
|
|
67
|
+
.reject { |f| File.directory?(f) }
|
|
68
|
+
.reject { |f| f.include?('/target/') }
|
|
69
|
+
.grep_v(/\.(swp|bak|tmp)$/)
|
|
70
|
+
.grep_v(/~$/)
|
|
48
71
|
.map { |path| "vendor/#{path.delete_prefix('crates/')}" }
|
|
49
72
|
end
|
|
50
73
|
|
|
51
|
-
ruby_fallback + core_fallback
|
|
74
|
+
ruby_fallback + core_fallback + ffi_fallback
|
|
52
75
|
end
|
|
53
76
|
|
|
54
77
|
# Check for vendored crates (copied during CI/packaging)
|
|
@@ -57,6 +80,7 @@ vendor_files = Dir.chdir(__dir__) do
|
|
|
57
80
|
Dir.glob('vendor/kreuzberg/**/*', File::FNM_DOTMATCH)
|
|
58
81
|
.reject { |f| File.directory?(f) }
|
|
59
82
|
.reject { |f| f.include?('/.fastembed_cache/') }
|
|
83
|
+
.reject { |f| f.include?('/.kreuzberg/') }
|
|
60
84
|
.reject { |f| f.include?('/target/') }
|
|
61
85
|
.grep_v(/\.(swp|bak|tmp)$/)
|
|
62
86
|
.grep_v(/~$/)
|
|
@@ -64,6 +88,16 @@ vendor_files = Dir.chdir(__dir__) do
|
|
|
64
88
|
[]
|
|
65
89
|
end
|
|
66
90
|
|
|
91
|
+
kreuzberg_ffi_files = if Dir.exist?('vendor/kreuzberg-ffi')
|
|
92
|
+
Dir.glob('vendor/kreuzberg-ffi/**/*', File::FNM_DOTMATCH)
|
|
93
|
+
.reject { |f| File.directory?(f) }
|
|
94
|
+
.reject { |f| f.include?('/target/') }
|
|
95
|
+
.grep_v(/\.(swp|bak|tmp)$/)
|
|
96
|
+
.grep_v(/~$/)
|
|
97
|
+
else
|
|
98
|
+
[]
|
|
99
|
+
end
|
|
100
|
+
|
|
67
101
|
rb_sys_files = if Dir.exist?('vendor/rb-sys')
|
|
68
102
|
Dir.glob('vendor/rb-sys/**/*', File::FNM_DOTMATCH)
|
|
69
103
|
.reject { |f| File.directory?(f) }
|
|
@@ -80,17 +114,17 @@ vendor_files = Dir.chdir(__dir__) do
|
|
|
80
114
|
[]
|
|
81
115
|
end
|
|
82
116
|
|
|
83
|
-
kreuzberg_files + rb_sys_files + workspace_toml
|
|
117
|
+
kreuzberg_files + kreuzberg_ffi_files + rb_sys_files + workspace_toml
|
|
84
118
|
end
|
|
85
119
|
|
|
86
120
|
# Use git-tracked files if available, otherwise fallback to glob
|
|
87
121
|
# Always include vendored files if they exist on disk (for CI packaging)
|
|
88
|
-
files = if (ruby_files + core_files).empty?
|
|
122
|
+
files = if (ruby_files + core_files + ffi_files).empty?
|
|
89
123
|
fallback_files
|
|
90
124
|
elsif vendor_files.any?
|
|
91
125
|
ruby_files + vendor_files
|
|
92
126
|
else
|
|
93
|
-
ruby_files + core_files
|
|
127
|
+
ruby_files + core_files + ffi_files
|
|
94
128
|
end
|
|
95
129
|
|
|
96
130
|
# Filter to only include files that actually exist
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -492,6 +492,36 @@ module Kreuzberg
|
|
|
492
492
|
end
|
|
493
493
|
end
|
|
494
494
|
|
|
495
|
+
# Page tracking configuration for multi-page documents
|
|
496
|
+
#
|
|
497
|
+
# @example Enable page extraction
|
|
498
|
+
# pages = PageConfig.new(extract_pages: true)
|
|
499
|
+
#
|
|
500
|
+
# @example Enable page markers in content
|
|
501
|
+
# pages = PageConfig.new(insert_page_markers: true, marker_format: "--- PAGE {page_num} ---")
|
|
502
|
+
#
|
|
503
|
+
class PageConfig
|
|
504
|
+
attr_reader :extract_pages, :insert_page_markers, :marker_format
|
|
505
|
+
|
|
506
|
+
def initialize(
|
|
507
|
+
extract_pages: false,
|
|
508
|
+
insert_page_markers: false,
|
|
509
|
+
marker_format: "\n\n<!-- PAGE {page_num} -->\n\n"
|
|
510
|
+
)
|
|
511
|
+
@extract_pages = extract_pages ? true : false
|
|
512
|
+
@insert_page_markers = insert_page_markers ? true : false
|
|
513
|
+
@marker_format = marker_format.to_s
|
|
514
|
+
end
|
|
515
|
+
|
|
516
|
+
def to_h
|
|
517
|
+
{
|
|
518
|
+
extract_pages: @extract_pages,
|
|
519
|
+
insert_page_markers: @insert_page_markers,
|
|
520
|
+
marker_format: @marker_format
|
|
521
|
+
}
|
|
522
|
+
end
|
|
523
|
+
end
|
|
524
|
+
|
|
495
525
|
# Post-processor configuration
|
|
496
526
|
#
|
|
497
527
|
# @example Enable all post-processors
|
|
@@ -576,7 +606,7 @@ module Kreuzberg
|
|
|
576
606
|
attr_reader :use_cache, :enable_quality_processing, :force_ocr,
|
|
577
607
|
:ocr, :chunking, :language_detection, :pdf_options,
|
|
578
608
|
:image_extraction, :image_preprocessing, :postprocessor,
|
|
579
|
-
:token_reduction, :keywords, :html_options,
|
|
609
|
+
:token_reduction, :keywords, :html_options, :pages,
|
|
580
610
|
:max_concurrent_extractions
|
|
581
611
|
|
|
582
612
|
# Load configuration from a file.
|
|
@@ -634,6 +664,7 @@ module Kreuzberg
|
|
|
634
664
|
token_reduction: nil,
|
|
635
665
|
keywords: nil,
|
|
636
666
|
html_options: nil,
|
|
667
|
+
pages: nil,
|
|
637
668
|
max_concurrent_extractions: nil
|
|
638
669
|
)
|
|
639
670
|
@use_cache = use_cache ? true : false
|
|
@@ -649,6 +680,7 @@ module Kreuzberg
|
|
|
649
680
|
@token_reduction = normalize_config(token_reduction, TokenReduction)
|
|
650
681
|
@keywords = normalize_config(keywords, Keywords)
|
|
651
682
|
@html_options = normalize_config(html_options, HtmlOptions)
|
|
683
|
+
@pages = normalize_config(pages, PageConfig)
|
|
652
684
|
@max_concurrent_extractions = max_concurrent_extractions&.to_i
|
|
653
685
|
end
|
|
654
686
|
|
|
@@ -668,6 +700,7 @@ module Kreuzberg
|
|
|
668
700
|
token_reduction: @token_reduction&.to_h,
|
|
669
701
|
keywords: @keywords&.to_h,
|
|
670
702
|
html_options: @html_options&.to_h,
|
|
703
|
+
pages: @pages&.to_h,
|
|
671
704
|
max_concurrent_extractions: @max_concurrent_extractions
|
|
672
705
|
}.compact
|
|
673
706
|
end
|
data/lib/kreuzberg/result.rb
CHANGED
|
@@ -21,7 +21,7 @@ module Kreuzberg
|
|
|
21
21
|
# rubocop:disable Metrics/ClassLength
|
|
22
22
|
class Result
|
|
23
23
|
attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
|
|
24
|
-
:detected_languages, :chunks, :images
|
|
24
|
+
:detected_languages, :chunks, :images, :pages
|
|
25
25
|
|
|
26
26
|
# Table structure
|
|
27
27
|
#
|
|
@@ -42,31 +42,39 @@ module Kreuzberg
|
|
|
42
42
|
#
|
|
43
43
|
# @!attribute [r] content
|
|
44
44
|
# @return [String] Chunk content
|
|
45
|
-
# @!attribute [r]
|
|
46
|
-
# @return [Integer] Starting
|
|
47
|
-
# @!attribute [r]
|
|
48
|
-
# @return [Integer] Ending
|
|
45
|
+
# @!attribute [r] byte_start
|
|
46
|
+
# @return [Integer] Starting byte offset (UTF-8)
|
|
47
|
+
# @!attribute [r] byte_end
|
|
48
|
+
# @return [Integer] Ending byte offset (UTF-8)
|
|
49
49
|
# @!attribute [r] token_count
|
|
50
50
|
# @return [Integer, nil] Approximate token count (may be nil)
|
|
51
|
+
# @!attribute [r] first_page
|
|
52
|
+
# @return [Integer, nil] First page number (1-indexed)
|
|
53
|
+
# @!attribute [r] last_page
|
|
54
|
+
# @return [Integer, nil] Last page number (1-indexed)
|
|
51
55
|
#
|
|
52
56
|
Chunk = Struct.new(
|
|
53
57
|
:content,
|
|
54
|
-
:
|
|
55
|
-
:
|
|
58
|
+
:byte_start,
|
|
59
|
+
:byte_end,
|
|
56
60
|
:token_count,
|
|
57
61
|
:chunk_index,
|
|
58
62
|
:total_chunks,
|
|
63
|
+
:first_page,
|
|
64
|
+
:last_page,
|
|
59
65
|
:embedding,
|
|
60
66
|
keyword_init: true
|
|
61
67
|
) do
|
|
62
68
|
def to_h
|
|
63
69
|
{
|
|
64
70
|
content: content,
|
|
65
|
-
|
|
66
|
-
|
|
71
|
+
byte_start: byte_start,
|
|
72
|
+
byte_end: byte_end,
|
|
67
73
|
token_count: token_count,
|
|
68
74
|
chunk_index: chunk_index,
|
|
69
75
|
total_chunks: total_chunks,
|
|
76
|
+
first_page: first_page,
|
|
77
|
+
last_page: last_page,
|
|
70
78
|
embedding: embedding
|
|
71
79
|
}
|
|
72
80
|
end
|
|
@@ -103,6 +111,28 @@ module Kreuzberg
|
|
|
103
111
|
end
|
|
104
112
|
end
|
|
105
113
|
|
|
114
|
+
# Per-page content
|
|
115
|
+
#
|
|
116
|
+
# @!attribute [r] page_number
|
|
117
|
+
# @return [Integer] Page number (1-indexed)
|
|
118
|
+
# @!attribute [r] content
|
|
119
|
+
# @return [String] Text content for this page
|
|
120
|
+
# @!attribute [r] tables
|
|
121
|
+
# @return [Array<Table>] Tables on this page
|
|
122
|
+
# @!attribute [r] images
|
|
123
|
+
# @return [Array<Image>] Images on this page
|
|
124
|
+
#
|
|
125
|
+
PageContent = Struct.new(:page_number, :content, :tables, :images, keyword_init: true) do
|
|
126
|
+
def to_h
|
|
127
|
+
{
|
|
128
|
+
page_number: page_number,
|
|
129
|
+
content: content,
|
|
130
|
+
tables: tables.map(&:to_h),
|
|
131
|
+
images: images.map(&:to_h)
|
|
132
|
+
}
|
|
133
|
+
end
|
|
134
|
+
end
|
|
135
|
+
|
|
106
136
|
# Initialize from native hash result
|
|
107
137
|
#
|
|
108
138
|
# @param hash [Hash] Hash returned from native extension
|
|
@@ -117,6 +147,7 @@ module Kreuzberg
|
|
|
117
147
|
@detected_languages = parse_detected_languages(get_value(hash, 'detected_languages'))
|
|
118
148
|
@chunks = parse_chunks(get_value(hash, 'chunks'))
|
|
119
149
|
@images = parse_images(get_value(hash, 'images'))
|
|
150
|
+
@pages = parse_pages(get_value(hash, 'pages'))
|
|
120
151
|
end
|
|
121
152
|
|
|
122
153
|
# Convert to hash
|
|
@@ -128,10 +159,11 @@ module Kreuzberg
|
|
|
128
159
|
content: @content,
|
|
129
160
|
mime_type: @mime_type,
|
|
130
161
|
metadata: @metadata,
|
|
131
|
-
tables:
|
|
162
|
+
tables: serialize_tables,
|
|
132
163
|
detected_languages: @detected_languages,
|
|
133
|
-
chunks:
|
|
134
|
-
images:
|
|
164
|
+
chunks: serialize_chunks,
|
|
165
|
+
images: serialize_images,
|
|
166
|
+
pages: serialize_pages
|
|
135
167
|
}
|
|
136
168
|
end
|
|
137
169
|
|
|
@@ -145,6 +177,22 @@ module Kreuzberg
|
|
|
145
177
|
|
|
146
178
|
private
|
|
147
179
|
|
|
180
|
+
def serialize_tables
|
|
181
|
+
@tables.map(&:to_h)
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
def serialize_chunks
|
|
185
|
+
@chunks&.map(&:to_h)
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
def serialize_images
|
|
189
|
+
@images&.map(&:to_h)
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
def serialize_pages
|
|
193
|
+
@pages&.map(&:to_h)
|
|
194
|
+
end
|
|
195
|
+
|
|
148
196
|
def get_value(hash, key, default = nil)
|
|
149
197
|
hash[key] || hash[key.to_sym] || default
|
|
150
198
|
end
|
|
@@ -180,11 +228,13 @@ module Kreuzberg
|
|
|
180
228
|
chunks_data.map do |chunk_hash|
|
|
181
229
|
Chunk.new(
|
|
182
230
|
content: chunk_hash['content'],
|
|
183
|
-
|
|
184
|
-
|
|
231
|
+
byte_start: chunk_hash['byte_start'],
|
|
232
|
+
byte_end: chunk_hash['byte_end'],
|
|
185
233
|
token_count: chunk_hash['token_count'],
|
|
186
234
|
chunk_index: chunk_hash['chunk_index'],
|
|
187
235
|
total_chunks: chunk_hash['total_chunks'],
|
|
236
|
+
first_page: chunk_hash['first_page'],
|
|
237
|
+
last_page: chunk_hash['last_page'],
|
|
188
238
|
embedding: chunk_hash['embedding']
|
|
189
239
|
)
|
|
190
240
|
end
|
|
@@ -211,6 +261,19 @@ module Kreuzberg
|
|
|
211
261
|
)
|
|
212
262
|
end
|
|
213
263
|
end
|
|
264
|
+
|
|
265
|
+
def parse_pages(pages_data)
|
|
266
|
+
return nil if pages_data.nil?
|
|
267
|
+
|
|
268
|
+
pages_data.map do |page_hash|
|
|
269
|
+
PageContent.new(
|
|
270
|
+
page_number: page_hash['page_number'],
|
|
271
|
+
content: page_hash['content'],
|
|
272
|
+
tables: parse_tables(page_hash['tables']),
|
|
273
|
+
images: parse_images(page_hash['images'])
|
|
274
|
+
)
|
|
275
|
+
end
|
|
276
|
+
end
|
|
214
277
|
end
|
|
215
278
|
# rubocop:enable Metrics/ClassLength
|
|
216
279
|
end
|
data/lib/kreuzberg/version.rb
CHANGED
data/sig/kreuzberg.rbs
CHANGED
|
@@ -168,6 +168,15 @@ module Kreuzberg
|
|
|
168
168
|
def to_h: () -> Hash[Symbol, untyped]
|
|
169
169
|
end
|
|
170
170
|
|
|
171
|
+
class PageConfig
|
|
172
|
+
attr_reader extract_pages: bool
|
|
173
|
+
attr_reader insert_page_markers: bool
|
|
174
|
+
attr_reader marker_format: String
|
|
175
|
+
|
|
176
|
+
def initialize: (?extract_pages: bool, ?insert_page_markers: bool, ?marker_format: String) -> void
|
|
177
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
178
|
+
end
|
|
179
|
+
|
|
171
180
|
class Extraction
|
|
172
181
|
attr_reader use_cache: bool
|
|
173
182
|
attr_reader enable_quality_processing: bool
|
|
@@ -182,6 +191,7 @@ module Kreuzberg
|
|
|
182
191
|
attr_reader token_reduction: TokenReduction?
|
|
183
192
|
attr_reader keywords: Keywords?
|
|
184
193
|
attr_reader html_options: HtmlOptions?
|
|
194
|
+
attr_reader pages: PageConfig?
|
|
185
195
|
attr_reader max_concurrent_extractions: Integer?
|
|
186
196
|
|
|
187
197
|
def self.from_file: (String path) -> Extraction
|
|
@@ -199,6 +209,7 @@ module Kreuzberg
|
|
|
199
209
|
?token_reduction: (TokenReduction | Hash[Symbol, untyped])?,
|
|
200
210
|
?keywords: (Keywords | Hash[Symbol, untyped])?,
|
|
201
211
|
?html_options: (HtmlOptions | Hash[Symbol, untyped])?,
|
|
212
|
+
?pages: (PageConfig | Hash[Symbol, untyped])?,
|
|
202
213
|
?max_concurrent_extractions: Integer?
|
|
203
214
|
) -> void
|
|
204
215
|
def to_h: () -> Hash[Symbol, untyped]
|
|
@@ -234,11 +245,13 @@ module Kreuzberg
|
|
|
234
245
|
|
|
235
246
|
type chunk_hash = {
|
|
236
247
|
content: String,
|
|
237
|
-
|
|
238
|
-
|
|
248
|
+
byte_start: Integer,
|
|
249
|
+
byte_end: Integer,
|
|
239
250
|
token_count: Integer?,
|
|
240
251
|
chunk_index: Integer?,
|
|
241
252
|
total_chunks: Integer?,
|
|
253
|
+
first_page: Integer?,
|
|
254
|
+
last_page: Integer?,
|
|
242
255
|
embedding: Array[Float]?
|
|
243
256
|
}
|
|
244
257
|
|
|
@@ -278,20 +291,24 @@ module Kreuzberg
|
|
|
278
291
|
# Text chunk
|
|
279
292
|
class Chunk
|
|
280
293
|
attr_reader content: String
|
|
281
|
-
attr_reader
|
|
282
|
-
attr_reader
|
|
294
|
+
attr_reader byte_start: Integer
|
|
295
|
+
attr_reader byte_end: Integer
|
|
283
296
|
attr_reader token_count: Integer?
|
|
284
297
|
attr_reader chunk_index: Integer?
|
|
285
298
|
attr_reader total_chunks: Integer?
|
|
299
|
+
attr_reader first_page: Integer?
|
|
300
|
+
attr_reader last_page: Integer?
|
|
286
301
|
attr_reader embedding: Array[Float]?
|
|
287
302
|
|
|
288
303
|
def initialize: (
|
|
289
304
|
content: String,
|
|
290
|
-
|
|
291
|
-
|
|
305
|
+
byte_start: Integer,
|
|
306
|
+
byte_end: Integer,
|
|
292
307
|
token_count: Integer?,
|
|
293
308
|
chunk_index: Integer?,
|
|
294
309
|
total_chunks: Integer?,
|
|
310
|
+
first_page: Integer?,
|
|
311
|
+
last_page: Integer?,
|
|
295
312
|
embedding: Array[Float]?
|
|
296
313
|
) -> void
|
|
297
314
|
def to_h: () -> chunk_hash
|