kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +5 -3
  3. data/README.md +15 -9
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
  5. data/ext/kreuzberg_rb/native/Cargo.lock +516 -324
  6. data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
  8. data/kreuzberg.gemspec +38 -4
  9. data/lib/kreuzberg/config.rb +34 -1
  10. data/lib/kreuzberg/result.rb +77 -14
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +23 -6
  13. data/vendor/kreuzberg/Cargo.toml +25 -11
  14. data/vendor/kreuzberg/README.md +13 -8
  15. data/vendor/kreuzberg/build.rs +17 -6
  16. data/vendor/kreuzberg/src/api/mod.rs +2 -0
  17. data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
  18. data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
  19. data/vendor/kreuzberg/src/core/config.rs +49 -1
  20. data/vendor/kreuzberg/src/core/extractor.rs +134 -2
  21. data/vendor/kreuzberg/src/core/mod.rs +4 -2
  22. data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
  23. data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
  24. data/vendor/kreuzberg/src/extraction/html.rs +24 -8
  25. data/vendor/kreuzberg/src/extraction/image.rs +124 -1
  26. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
  27. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
  28. data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
  29. data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
  30. data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
  31. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  32. data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
  33. data/vendor/kreuzberg/src/extractors/email.rs +29 -15
  34. data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
  35. data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
  36. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
  37. data/vendor/kreuzberg/src/extractors/html.rs +29 -15
  38. data/vendor/kreuzberg/src/extractors/image.rs +25 -4
  39. data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
  40. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  41. data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
  42. data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
  43. data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
  44. data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
  45. data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
  46. data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
  47. data/vendor/kreuzberg/src/extractors/pdf.rs +194 -17
  48. data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
  49. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  50. data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
  51. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  52. data/vendor/kreuzberg/src/extractors/text.rs +7 -2
  53. data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
  54. data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
  55. data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
  56. data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
  57. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
  58. data/vendor/kreuzberg/src/lib.rs +10 -2
  59. data/vendor/kreuzberg/src/mcp/mod.rs +2 -0
  60. data/vendor/kreuzberg/src/mcp/server.rs +14 -12
  61. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  62. data/vendor/kreuzberg/src/pdf/error.rs +8 -0
  63. data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
  64. data/vendor/kreuzberg/src/pdf/mod.rs +14 -2
  65. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
  66. data/vendor/kreuzberg/src/pdf/table.rs +26 -2
  67. data/vendor/kreuzberg/src/pdf/text.rs +89 -7
  68. data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
  69. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  70. data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
  71. data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
  72. data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
  73. data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
  74. data/vendor/kreuzberg/src/text/mod.rs +6 -0
  75. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
  76. data/vendor/kreuzberg/src/types.rs +173 -21
  77. data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
  78. data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
  79. data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
  80. data/vendor/kreuzberg/tests/config_features.rs +15 -1
  81. data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
  82. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
  83. data/vendor/kreuzberg/tests/email_integration.rs +2 -0
  84. data/vendor/kreuzberg/tests/error_handling.rs +43 -34
  85. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  86. data/vendor/kreuzberg/tests/image_integration.rs +2 -0
  87. data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
  88. data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
  89. data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
  90. data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
  91. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
  92. data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
  93. data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
  94. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
  95. data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
  97. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
  98. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
  99. data/vendor/kreuzberg/tests/security_validation.rs +1 -0
  100. data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
  101. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
  102. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
  103. data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
  104. data/vendor/rb-sys/Cargo.lock +15 -15
  105. data/vendor/rb-sys/Cargo.toml +4 -4
  106. data/vendor/rb-sys/Cargo.toml.orig +4 -4
  107. data/vendor/rb-sys/bin/release.sh +9 -8
  108. data/vendor/rb-sys/build/features.rs +5 -2
  109. data/vendor/rb-sys/build/main.rs +55 -15
  110. data/vendor/rb-sys/build/stable_api_config.rs +4 -2
  111. data/vendor/rb-sys/build/version.rs +3 -1
  112. data/vendor/rb-sys/src/macros.rs +2 -2
  113. data/vendor/rb-sys/src/special_consts.rs +1 -1
  114. data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
  115. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
  116. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
  117. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
  118. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
  119. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
  120. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
  121. data/vendor/rb-sys/src/stable_api.rs +0 -1
  122. data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
  123. metadata +11 -10
  124. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  125. data/vendor/rb-sys/.cargo-ok +0 -1
  126. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
@@ -1,46 +1,91 @@
1
1
  use super::error::{PdfError, Result};
2
+ use crate::types::{PageBoundary, PageInfo, PageStructure, PageUnitType};
2
3
  use pdfium_render::prelude::*;
3
4
  use serde::{Deserialize, Serialize};
4
5
 
6
+ /// PDF-specific metadata.
7
+ ///
8
+ /// Contains metadata fields specific to PDF documents that are not in the common
9
+ /// `Metadata` structure. Common fields like title, authors, keywords, and dates
10
+ /// are now at the `Metadata` level.
5
11
  #[derive(Debug, Clone, Serialize, Deserialize, Default)]
6
12
  pub struct PdfMetadata {
13
+ /// PDF version (e.g., "1.7", "2.0")
14
+ #[serde(skip_serializing_if = "Option::is_none")]
15
+ pub pdf_version: Option<String>,
16
+
17
+ /// PDF producer (application that created the PDF)
18
+ #[serde(skip_serializing_if = "Option::is_none")]
19
+ pub producer: Option<String>,
20
+
21
+ /// Whether the PDF is encrypted/password-protected
22
+ #[serde(skip_serializing_if = "Option::is_none")]
23
+ pub is_encrypted: Option<bool>,
24
+
25
+ /// First page width in points (1/72 inch)
26
+ #[serde(skip_serializing_if = "Option::is_none")]
27
+ pub width: Option<i64>,
28
+
29
+ /// First page height in points (1/72 inch)
30
+ #[serde(skip_serializing_if = "Option::is_none")]
31
+ pub height: Option<i64>,
32
+ }
33
+
34
+ /// Complete PDF extraction metadata including common and PDF-specific fields.
35
+ ///
36
+ /// This struct combines common document fields (title, authors, dates) with
37
+ /// PDF-specific metadata and optional page structure information. It is returned
38
+ /// by `extract_metadata_from_document()` when page boundaries are provided.
39
+ #[derive(Debug, Clone, Serialize, Deserialize)]
40
+ pub struct PdfExtractionMetadata {
41
+ /// Document title
7
42
  #[serde(skip_serializing_if = "Option::is_none")]
8
43
  pub title: Option<String>,
44
+
45
+ /// Document subject or description
9
46
  #[serde(skip_serializing_if = "Option::is_none")]
10
47
  pub subject: Option<String>,
48
+
49
+ /// Document authors (parsed from PDF Author field)
11
50
  #[serde(skip_serializing_if = "Option::is_none")]
12
51
  pub authors: Option<Vec<String>>,
52
+
53
+ /// Document keywords (parsed from PDF Keywords field)
13
54
  #[serde(skip_serializing_if = "Option::is_none")]
14
55
  pub keywords: Option<Vec<String>>,
56
+
57
+ /// Creation timestamp (ISO 8601 format)
15
58
  #[serde(skip_serializing_if = "Option::is_none")]
16
59
  pub created_at: Option<String>,
60
+
61
+ /// Last modification timestamp (ISO 8601 format)
17
62
  #[serde(skip_serializing_if = "Option::is_none")]
18
63
  pub modified_at: Option<String>,
64
+
65
+ /// Application or user that created the document
19
66
  #[serde(skip_serializing_if = "Option::is_none")]
20
67
  pub created_by: Option<String>,
68
+
69
+ /// PDF-specific metadata
70
+ pub pdf_specific: PdfMetadata,
71
+
72
+ /// Page structure with boundaries and optional per-page metadata
21
73
  #[serde(skip_serializing_if = "Option::is_none")]
22
- pub producer: Option<String>,
23
- #[serde(skip_serializing_if = "Option::is_none")]
24
- pub page_count: Option<usize>,
25
- #[serde(skip_serializing_if = "Option::is_none")]
26
- pub pdf_version: Option<String>,
27
- #[serde(skip_serializing_if = "Option::is_none")]
28
- pub is_encrypted: Option<bool>,
29
- #[serde(skip_serializing_if = "Option::is_none")]
30
- pub width: Option<i64>,
31
- #[serde(skip_serializing_if = "Option::is_none")]
32
- pub height: Option<i64>,
33
- #[serde(skip_serializing_if = "Option::is_none")]
34
- pub summary: Option<String>,
74
+ pub page_structure: Option<PageStructure>,
35
75
  }
36
76
 
77
+ /// Extract PDF-specific metadata from raw bytes.
78
+ ///
79
+ /// Returns only PDF-specific metadata (version, producer, encryption status, dimensions).
37
80
  pub fn extract_metadata(pdf_bytes: &[u8]) -> Result<PdfMetadata> {
38
81
  extract_metadata_with_password(pdf_bytes, None)
39
82
  }
40
83
 
84
+ /// Extract PDF-specific metadata from raw bytes with optional password.
85
+ ///
86
+ /// Returns only PDF-specific metadata (version, producer, encryption status, dimensions).
41
87
  pub fn extract_metadata_with_password(pdf_bytes: &[u8], password: Option<&str>) -> Result<PdfMetadata> {
42
- let bindings = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
43
- .or_else(|_| Pdfium::bind_to_system_library())
88
+ let bindings = Pdfium::bind_to_system_library()
44
89
  .map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
45
90
 
46
91
  let pdfium = Pdfium::new(bindings);
@@ -56,7 +101,7 @@ pub fn extract_metadata_with_password(pdf_bytes: &[u8], password: Option<&str>)
56
101
  }
57
102
  })?;
58
103
 
59
- extract_metadata_from_document(&document)
104
+ extract_pdf_specific_metadata(&document)
60
105
  }
61
106
 
62
107
  pub fn extract_metadata_with_passwords(pdf_bytes: &[u8], passwords: &[&str]) -> Result<PdfMetadata> {
@@ -79,70 +124,197 @@ pub fn extract_metadata_with_passwords(pdf_bytes: &[u8], passwords: &[&str]) ->
79
124
  extract_metadata(pdf_bytes)
80
125
  }
81
126
 
82
- pub(crate) fn extract_metadata_from_document(document: &PdfDocument<'_>) -> Result<PdfMetadata> {
127
+ /// Extract complete PDF metadata from a document.
128
+ ///
129
+ /// Extracts common fields (title, subject, authors, keywords, dates, creator),
130
+ /// PDF-specific metadata, and optionally builds a PageStructure with boundaries.
131
+ ///
132
+ /// # Arguments
133
+ ///
134
+ /// * `document` - The PDF document to extract metadata from
135
+ /// * `page_boundaries` - Optional vector of PageBoundary entries for building PageStructure.
136
+ /// If provided, a PageStructure will be built with these boundaries.
137
+ ///
138
+ /// # Returns
139
+ ///
140
+ /// Returns a `PdfExtractionMetadata` struct containing all extracted metadata,
141
+ /// including page structure if boundaries were provided.
142
+ pub fn extract_metadata_from_document(
143
+ document: &PdfDocument<'_>,
144
+ page_boundaries: Option<&[PageBoundary]>,
145
+ ) -> Result<PdfExtractionMetadata> {
146
+ let pdf_specific = extract_pdf_specific_metadata(document)?;
147
+
148
+ let common = extract_common_metadata_from_document(document)?;
149
+
150
+ let page_structure = if let Some(boundaries) = page_boundaries {
151
+ Some(build_page_structure(document, boundaries)?)
152
+ } else {
153
+ None
154
+ };
155
+
156
+ Ok(PdfExtractionMetadata {
157
+ title: common.title,
158
+ subject: common.subject,
159
+ authors: common.authors,
160
+ keywords: common.keywords,
161
+ created_at: common.created_at,
162
+ modified_at: common.modified_at,
163
+ created_by: common.created_by,
164
+ pdf_specific,
165
+ page_structure,
166
+ })
167
+ }
168
+
169
+ /// Extract PDF-specific metadata from a document.
170
+ ///
171
+ /// Returns only PDF-specific metadata (version, producer, encryption status, dimensions).
172
+ fn extract_pdf_specific_metadata(document: &PdfDocument<'_>) -> Result<PdfMetadata> {
83
173
  let pdf_metadata = document.metadata();
84
174
 
85
175
  let mut metadata = PdfMetadata {
86
176
  pdf_version: format_pdf_version(document.version()),
87
177
  ..Default::default()
88
178
  };
89
- metadata.page_count = Some(document.pages().len() as usize);
179
+
90
180
  metadata.is_encrypted = document
91
181
  .permissions()
92
182
  .security_handler_revision()
93
183
  .ok()
94
184
  .map(|revision| revision != PdfSecurityHandlerRevision::Unprotected);
95
185
 
96
- metadata.title = pdf_metadata
97
- .get(PdfDocumentMetadataTagType::Title)
98
- .map(|tag| tag.value().to_string());
99
-
100
- metadata.subject = pdf_metadata
101
- .get(PdfDocumentMetadataTagType::Subject)
186
+ metadata.producer = pdf_metadata
187
+ .get(PdfDocumentMetadataTagType::Producer)
102
188
  .map(|tag| tag.value().to_string());
103
189
 
104
- if let Some(author_tag) = pdf_metadata.get(PdfDocumentMetadataTagType::Author) {
105
- let authors = parse_authors(author_tag.value());
106
- if !authors.is_empty() {
107
- metadata.authors = Some(authors);
108
- }
190
+ if !document.pages().is_empty()
191
+ && let Ok(page_rect) = document.pages().page_size(0)
192
+ {
193
+ metadata.width = Some(page_rect.width().value.round() as i64);
194
+ metadata.height = Some(page_rect.height().value.round() as i64);
109
195
  }
110
196
 
111
- if let Some(keywords_tag) = pdf_metadata.get(PdfDocumentMetadataTagType::Keywords) {
112
- let keywords = parse_keywords(keywords_tag.value());
113
- if !keywords.is_empty() {
114
- metadata.keywords = Some(keywords);
115
- }
197
+ Ok(metadata)
198
+ }
199
+
200
+ /// Build a PageStructure from a document and page boundaries.
201
+ ///
202
+ /// Constructs a complete PageStructure including:
203
+ /// - Total page count
204
+ /// - Unit type (Page)
205
+ /// - Character offset boundaries for each page
206
+ /// - Optional per-page metadata with dimensions
207
+ ///
208
+ /// # Validation
209
+ ///
210
+ /// - Boundaries must not be empty
211
+ /// - Boundary count must match the document's page count
212
+ fn build_page_structure(document: &PdfDocument<'_>, boundaries: &[PageBoundary]) -> Result<PageStructure> {
213
+ let total_count = document.pages().len() as usize;
214
+
215
+ if boundaries.is_empty() {
216
+ return Err(PdfError::MetadataExtractionFailed(
217
+ "No page boundaries provided for PageStructure".to_string(),
218
+ ));
116
219
  }
117
220
 
118
- if let Some(created_tag) = pdf_metadata.get(PdfDocumentMetadataTagType::CreationDate) {
119
- metadata.created_at = Some(parse_pdf_date(created_tag.value()));
221
+ if boundaries.len() != total_count {
222
+ return Err(PdfError::MetadataExtractionFailed(format!(
223
+ "Boundary count {} doesn't match page count {}",
224
+ boundaries.len(),
225
+ total_count
226
+ )));
120
227
  }
121
228
 
122
- if let Some(modified_tag) = pdf_metadata.get(PdfDocumentMetadataTagType::ModificationDate) {
123
- metadata.modified_at = Some(parse_pdf_date(modified_tag.value()));
229
+ let mut pages = Vec::new();
230
+ for (index, boundary) in boundaries.iter().enumerate() {
231
+ let page_number = boundary.page_number;
232
+
233
+ let dimensions = if let Ok(page_rect) = document.pages().page_size(index as u16) {
234
+ Some((page_rect.width().value as f64, page_rect.height().value as f64))
235
+ } else {
236
+ None
237
+ };
238
+
239
+ pages.push(PageInfo {
240
+ number: page_number,
241
+ title: None,
242
+ dimensions,
243
+ image_count: None,
244
+ table_count: None,
245
+ hidden: None,
246
+ });
124
247
  }
125
248
 
126
- metadata.created_by = pdf_metadata
127
- .get(PdfDocumentMetadataTagType::Creator)
249
+ Ok(PageStructure {
250
+ total_count,
251
+ unit_type: PageUnitType::Page,
252
+ boundaries: Some(boundaries.to_vec()),
253
+ pages: if pages.is_empty() { None } else { Some(pages) },
254
+ })
255
+ }
256
+
257
+ /// Extract common metadata from a PDF document.
258
+ ///
259
+ /// Returns common fields (title, authors, keywords, dates) that are now stored
260
+ /// in the base `Metadata` struct instead of format-specific metadata.
261
+ pub fn extract_common_metadata_from_document(document: &PdfDocument<'_>) -> Result<CommonPdfMetadata> {
262
+ let pdf_metadata = document.metadata();
263
+
264
+ let title = pdf_metadata
265
+ .get(PdfDocumentMetadataTagType::Title)
128
266
  .map(|tag| tag.value().to_string());
129
267
 
130
- metadata.producer = pdf_metadata
131
- .get(PdfDocumentMetadataTagType::Producer)
268
+ let subject = pdf_metadata
269
+ .get(PdfDocumentMetadataTagType::Subject)
132
270
  .map(|tag| tag.value().to_string());
133
271
 
134
- if !document.pages().is_empty()
135
- && let Ok(page_rect) = document.pages().page_size(0)
136
- {
137
- metadata.width = Some(page_rect.width().value.round() as i64);
138
- metadata.height = Some(page_rect.height().value.round() as i64);
139
- }
272
+ let authors = if let Some(author_tag) = pdf_metadata.get(PdfDocumentMetadataTagType::Author) {
273
+ let parsed = parse_authors(author_tag.value());
274
+ if !parsed.is_empty() { Some(parsed) } else { None }
275
+ } else {
276
+ None
277
+ };
140
278
 
141
- if metadata.summary.is_none() {
142
- metadata.summary = Some(generate_summary(&metadata));
143
- }
279
+ let keywords = if let Some(keywords_tag) = pdf_metadata.get(PdfDocumentMetadataTagType::Keywords) {
280
+ let parsed = parse_keywords(keywords_tag.value());
281
+ if !parsed.is_empty() { Some(parsed) } else { None }
282
+ } else {
283
+ None
284
+ };
144
285
 
145
- Ok(metadata)
286
+ let created_at = pdf_metadata
287
+ .get(PdfDocumentMetadataTagType::CreationDate)
288
+ .map(|tag| parse_pdf_date(tag.value()));
289
+
290
+ let modified_at = pdf_metadata
291
+ .get(PdfDocumentMetadataTagType::ModificationDate)
292
+ .map(|tag| parse_pdf_date(tag.value()));
293
+
294
+ let created_by = pdf_metadata
295
+ .get(PdfDocumentMetadataTagType::Creator)
296
+ .map(|tag| tag.value().to_string());
297
+
298
+ Ok(CommonPdfMetadata {
299
+ title,
300
+ subject,
301
+ authors,
302
+ keywords,
303
+ created_at,
304
+ modified_at,
305
+ created_by,
306
+ })
307
+ }
308
+
309
+ /// Common metadata fields extracted from a PDF.
310
+ pub struct CommonPdfMetadata {
311
+ pub title: Option<String>,
312
+ pub subject: Option<String>,
313
+ pub authors: Option<Vec<String>>,
314
+ pub keywords: Option<Vec<String>>,
315
+ pub created_at: Option<String>,
316
+ pub modified_at: Option<String>,
317
+ pub created_by: Option<String>,
146
318
  }
147
319
 
148
320
  fn parse_authors(author_str: &str) -> Vec<String> {
@@ -206,25 +378,6 @@ fn parse_pdf_date(date_str: &str) -> String {
206
378
  }
207
379
  }
208
380
 
209
- fn generate_summary(metadata: &PdfMetadata) -> String {
210
- let mut parts = Vec::new();
211
-
212
- if let Some(page_count) = metadata.page_count {
213
- let plural = if page_count != 1 { "s" } else { "" };
214
- parts.push(format!("PDF document with {} page{}.", page_count, plural));
215
- }
216
-
217
- if let Some(ref version) = metadata.pdf_version {
218
- parts.push(format!("PDF version {}.", version));
219
- }
220
-
221
- if metadata.is_encrypted == Some(true) {
222
- parts.push("Document is encrypted.".to_string());
223
- }
224
-
225
- parts.join(" ")
226
- }
227
-
228
381
  fn format_pdf_version(version: PdfDocumentVersion) -> Option<String> {
229
382
  match version {
230
383
  PdfDocumentVersion::Unset => None,
@@ -312,35 +465,25 @@ mod tests {
312
465
  }
313
466
 
314
467
  #[test]
315
- fn test_generate_summary() {
316
- let metadata = PdfMetadata {
317
- page_count: Some(10),
318
- pdf_version: Some("1.7".to_string()),
319
- is_encrypted: Some(false),
320
- ..Default::default()
321
- };
322
-
323
- let summary = generate_summary(&metadata);
324
- assert!(summary.contains("10 pages"));
325
- assert!(summary.contains("1.7"));
326
- assert!(!summary.contains("encrypted"));
468
+ fn test_extract_metadata_invalid_pdf() {
469
+ let result = extract_metadata(b"not a pdf");
470
+ assert!(result.is_err());
327
471
  }
328
472
 
329
473
  #[test]
330
- fn test_generate_summary_single_page() {
331
- let metadata = PdfMetadata {
332
- page_count: Some(1),
333
- ..Default::default()
334
- };
335
-
336
- let summary = generate_summary(&metadata);
337
- assert!(summary.contains("1 page."));
338
- assert!(!summary.contains("pages"));
474
+ fn test_build_page_structure_empty_boundaries() {
475
+ let result_msg = "No page boundaries provided for PageStructure".to_string();
476
+ assert!(!result_msg.is_empty());
339
477
  }
340
478
 
341
479
  #[test]
342
- fn test_extract_metadata_invalid_pdf() {
343
- let result = extract_metadata(b"not a pdf");
344
- assert!(result.is_err());
480
+ fn test_build_page_structure_boundary_mismatch_message() {
481
+ let boundaries_count = 3;
482
+ let page_count = 5;
483
+ let error_msg = format!(
484
+ "Boundary count {} doesn't match page count {}",
485
+ boundaries_count, page_count
486
+ );
487
+ assert_eq!(error_msg, "Boundary count 3 doesn't match page count 5");
345
488
  }
346
489
  }
@@ -26,25 +26,37 @@
26
26
  //!
27
27
  //! // Extract metadata
28
28
  //! let metadata = extract_metadata(&pdf_bytes)?;
29
- //! println!("Page count: {:?}", metadata.page_count);
29
+ //! println!("PDF version: {:?}", metadata.pdf_version);
30
30
  //! # Ok(())
31
31
  //! # }
32
32
  //! ```
33
33
  //!
34
34
  //! # Note
35
35
  //!
36
- //! This module is always available. The `ocr` feature enables additional
36
+ //! This module requires the `pdf` feature. The `ocr` feature enables additional
37
37
  //! functionality in the PDF extractor for rendering pages to images.
38
+ #[cfg(feature = "pdf")]
38
39
  pub mod error;
40
+ #[cfg(feature = "pdf")]
39
41
  pub mod images;
42
+ #[cfg(feature = "pdf")]
40
43
  pub mod metadata;
44
+ #[cfg(feature = "pdf")]
41
45
  pub mod rendering;
46
+ #[cfg(feature = "pdf")]
42
47
  pub mod table;
48
+ #[cfg(feature = "pdf")]
43
49
  pub mod text;
44
50
 
51
+ #[cfg(feature = "pdf")]
45
52
  pub use error::PdfError;
53
+ #[cfg(feature = "pdf")]
46
54
  pub use images::{PdfImage, PdfImageExtractor, extract_images_from_pdf};
55
+ #[cfg(feature = "pdf")]
47
56
  pub use metadata::extract_metadata;
57
+ #[cfg(feature = "pdf")]
48
58
  pub use rendering::{PageRenderOptions, render_page_to_image};
59
+ #[cfg(feature = "pdf")]
49
60
  pub use table::extract_words_from_page;
61
+ #[cfg(feature = "pdf")]
50
62
  pub use text::extract_text_from_pdf;
@@ -32,8 +32,7 @@ pub struct PdfRenderer {
32
32
 
33
33
  impl PdfRenderer {
34
34
  pub fn new() -> Result<Self> {
35
- let binding = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
36
- .or_else(|_| Pdfium::bind_to_system_library())
35
+ let binding = Pdfium::bind_to_system_library()
37
36
  .map_err(|e| PdfError::RenderingFailed(format!("Failed to initialize Pdfium: {}", e)))?;
38
37
 
39
38
  let pdfium = Pdfium::new(binding);
@@ -2,9 +2,12 @@
2
2
  //!
3
3
  //! This module converts pdfium character data to HocrWord format,
4
4
  //! allowing us to reuse the existing table reconstruction logic.
5
+ //!
6
+ //! Note: Table extraction requires the "ocr" feature and is not available in WASM builds.
5
7
 
6
8
  use super::error::{PdfError, Result};
7
- use html_to_markdown_rs::hocr::HocrWord;
9
+ #[cfg(feature = "ocr")]
10
+ use crate::ocr::table::HocrWord;
8
11
  use pdfium_render::prelude::*;
9
12
 
10
13
  /// Spacing threshold for word boundary detection (in PDF units).
@@ -29,9 +32,14 @@ const MIN_WORD_LENGTH: usize = 1;
29
32
  ///
30
33
  /// Vector of HocrWord objects with text and bounding box information.
31
34
  ///
35
+ /// # Note
36
+ /// This function requires the "ocr" feature to be enabled. Without it, returns an error.
37
+ ///
32
38
  /// # Example
33
39
  ///
34
40
  /// ```rust,no_run
41
+ /// # #[cfg(feature = "ocr")]
42
+ /// # {
35
43
  /// use kreuzberg::pdf::table::extract_words_from_page;
36
44
  /// use pdfium_render::prelude::*;
37
45
  ///
@@ -42,7 +50,9 @@ const MIN_WORD_LENGTH: usize = 1;
42
50
  /// let words = extract_words_from_page(&page, 90.0)?;
43
51
  /// # Ok(())
44
52
  /// # }
53
+ /// # }
45
54
  /// ```
55
+ #[cfg(feature = "ocr")]
46
56
  pub fn extract_words_from_page(page: &PdfPage, min_confidence: f64) -> Result<Vec<HocrWord>> {
47
57
  let page_width = page.width().value as i32;
48
58
  let page_height = page.height().value as i32;
@@ -58,6 +68,17 @@ pub fn extract_words_from_page(page: &PdfPage, min_confidence: f64) -> Result<Ve
58
68
  Ok(words)
59
69
  }
60
70
 
71
+ /// Fallback implementation when OCR feature is disabled.
72
+ ///
73
+ /// # Errors
74
+ /// Always returns an error indicating that the OCR feature is required.
75
+ #[cfg(not(feature = "ocr"))]
76
+ pub fn extract_words_from_page(_page: &PdfPage, _min_confidence: f64) -> Result<Vec<()>> {
77
+ Err(PdfError::TextExtractionFailed(
78
+ "PDF table extraction requires the 'ocr' feature to be enabled".to_string(),
79
+ ))
80
+ }
81
+
61
82
  /// Character with position information extracted from PDF.
62
83
  #[derive(Debug, Clone)]
63
84
  struct CharInfo {
@@ -80,6 +101,7 @@ struct CharInfo {
80
101
  /// * `page_width` - Page width in PDF units
81
102
  /// * `page_height` - Page height in PDF units
82
103
  /// * `min_confidence` - Minimum confidence threshold (PDF text uses 95.0)
104
+ #[cfg(feature = "ocr")]
83
105
  fn group_chars_into_words(
84
106
  chars: PdfPageTextChars,
85
107
  _page_width: i32,
@@ -139,6 +161,7 @@ fn group_chars_into_words(
139
161
  ///
140
162
  /// Returns true if the character is far from the previous character
141
163
  /// (indicating a word boundary) or on a different line.
164
+ #[cfg(feature = "ocr")]
142
165
  fn should_start_new_word(current_word_chars: &[CharInfo], new_char: &CharInfo) -> bool {
143
166
  if current_word_chars.is_empty() {
144
167
  return false;
@@ -159,6 +182,7 @@ fn should_start_new_word(current_word_chars: &[CharInfo], new_char: &CharInfo) -
159
182
  ///
160
183
  /// Calculates bounding box and confidence for the word.
161
184
  /// Returns None if the word doesn't meet minimum criteria.
185
+ #[cfg(feature = "ocr")]
162
186
  fn finalize_word(chars: &[CharInfo], page_height: i32, min_confidence: f64) -> Option<HocrWord> {
163
187
  if chars.is_empty() {
164
188
  return None;
@@ -212,7 +236,7 @@ fn finalize_word(chars: &[CharInfo], page_height: i32, min_confidence: f64) -> O
212
236
  })
213
237
  }
214
238
 
215
- #[cfg(test)]
239
+ #[cfg(all(test, feature = "ocr"))]
216
240
  mod tests {
217
241
  use super::*;
218
242