kreuzberg 4.0.0.pre.rc.14 → 4.0.0.pre.rc.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +25 -215
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -2
- data/ext/kreuzberg_rb/native/build.rs +38 -1
- data/lib/kreuzberg/result.rb +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/{libpdfium.dylib → libpdfium.so} +0 -0
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/build.rs +54 -10
- data/vendor/kreuzberg/src/api/mod.rs +8 -0
- data/vendor/kreuzberg/src/extraction/html.rs +40 -7
- data/vendor/kreuzberg/src/pdf/bundled.rs +115 -9
- data/vendor/kreuzberg/tests/format_integration.rs +1 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +11 -21
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -63
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/build.rs +0 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
|
@@ -1,3616 +0,0 @@
|
|
|
1
|
-
//! C FFI bindings for Kreuzberg document intelligence library.
|
|
2
|
-
//!
|
|
3
|
-
//! Provides a C-compatible API that can be consumed by Java (Panama FFI),
|
|
4
|
-
//! Go (cgo), C# (P/Invoke), Zig, and other languages with C FFI support.
|
|
5
|
-
|
|
6
|
-
mod panic_shield;
|
|
7
|
-
|
|
8
|
-
pub use panic_shield::{
|
|
9
|
-
ErrorCode, StructuredError, clear_structured_error, get_last_error_code, get_last_error_message,
|
|
10
|
-
get_last_panic_context, set_structured_error,
|
|
11
|
-
};
|
|
12
|
-
|
|
13
|
-
use std::cell::RefCell;
|
|
14
|
-
use std::ffi::{CStr, CString};
|
|
15
|
-
use std::os::raw::c_char;
|
|
16
|
-
use std::path::Path;
|
|
17
|
-
use std::ptr;
|
|
18
|
-
use std::sync::Arc;
|
|
19
|
-
|
|
20
|
-
use async_trait::async_trait;
|
|
21
|
-
use kreuzberg::core::config::{ExtractionConfig, OcrConfig};
|
|
22
|
-
use kreuzberg::plugins::registry::get_ocr_backend_registry;
|
|
23
|
-
use kreuzberg::plugins::{OcrBackend, Plugin, ProcessingStage};
|
|
24
|
-
use kreuzberg::types::ExtractionResult;
|
|
25
|
-
use kreuzberg::{KreuzbergError, Result};
|
|
26
|
-
#[cfg(not(all(windows, target_env = "gnu")))]
|
|
27
|
-
use serde::Serialize;
|
|
28
|
-
|
|
29
|
-
thread_local! {
    /// Per-thread copy of the most recent error message as an owned C string.
    ///
    /// Written by `set_last_error` and reset to `None` by `clear_last_error`.
    /// NOTE(review): presumably kept alive here so a stable pointer can be handed
    /// to C callers by the last-error accessor; confirm against that accessor.
    static LAST_ERROR_C_STRING: RefCell<Option<CString>> = const { RefCell::new(None) };
}
|
|
32
|
-
|
|
33
|
-
/// Set the last error message (convenience wrapper for backward compatibility)
|
|
34
|
-
fn set_last_error(err: String) {
|
|
35
|
-
if let Ok(c_str) = CString::new(err.clone()) {
|
|
36
|
-
LAST_ERROR_C_STRING.with(|last| *last.borrow_mut() = Some(c_str));
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
let structured_err = StructuredError::from_message(err, ErrorCode::GenericError);
|
|
40
|
-
set_structured_error(structured_err);
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
/// Clear the last error message
|
|
44
|
-
fn clear_last_error() {
|
|
45
|
-
LAST_ERROR_C_STRING.with(|last| *last.borrow_mut() = None);
|
|
46
|
-
clear_structured_error();
|
|
47
|
-
}
|
|
48
|
-
|
|
49
|
-
/// Convert an owned Rust string into a heap-allocated, NUL-terminated C string.
///
/// On success the returned raw pointer owns the allocation; it has to be turned
/// back into a `CString` (via `CString::from_raw`) to be released.
///
/// # Errors
///
/// Returns a `"Failed to create C string: ..."` message when `value` contains an
/// interior NUL byte.
fn string_to_c_string(value: String) -> std::result::Result<*mut c_char, String> {
    match CString::new(value) {
        Ok(owned) => Ok(owned.into_raw()),
        Err(nul_error) => Err(format!("Failed to create C string: {}", nul_error)),
    }
}
|
|
54
|
-
|
|
55
|
-
type FfiResult<T> = std::result::Result<T, String>;
|
|
56
|
-
|
|
57
|
-
fn parse_extraction_config_from_json(config_str: &str) -> FfiResult<ExtractionConfig> {
|
|
58
|
-
use html_to_markdown_rs::options::{
|
|
59
|
-
CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle,
|
|
60
|
-
PreprocessingPreset, WhitespaceMode,
|
|
61
|
-
};
|
|
62
|
-
|
|
63
|
-
fn parse_enum<T, F>(value: Option<&serde_json::Value>, parse_fn: F) -> FfiResult<Option<T>>
|
|
64
|
-
where
|
|
65
|
-
F: Fn(&str) -> std::result::Result<T, String>,
|
|
66
|
-
{
|
|
67
|
-
if let Some(raw) = value {
|
|
68
|
-
let text = raw
|
|
69
|
-
.as_str()
|
|
70
|
-
.ok_or_else(|| "Expected string for html_options enum field".to_string())?;
|
|
71
|
-
return parse_fn(text).map(Some);
|
|
72
|
-
}
|
|
73
|
-
Ok(None)
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
fn parse_heading_style(value: &str) -> FfiResult<HeadingStyle> {
|
|
77
|
-
match value.to_lowercase().as_str() {
|
|
78
|
-
"atx" => Ok(HeadingStyle::Atx),
|
|
79
|
-
"underlined" => Ok(HeadingStyle::Underlined),
|
|
80
|
-
"atx_closed" => Ok(HeadingStyle::AtxClosed),
|
|
81
|
-
other => Err(format!(
|
|
82
|
-
"Invalid heading_style '{}'. Expected one of: atx, underlined, atx_closed",
|
|
83
|
-
other
|
|
84
|
-
)),
|
|
85
|
-
}
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
fn parse_list_indent_type(value: &str) -> FfiResult<ListIndentType> {
|
|
89
|
-
match value.to_lowercase().as_str() {
|
|
90
|
-
"spaces" => Ok(ListIndentType::Spaces),
|
|
91
|
-
"tabs" => Ok(ListIndentType::Tabs),
|
|
92
|
-
other => Err(format!(
|
|
93
|
-
"Invalid list_indent_type '{}'. Expected 'spaces' or 'tabs'",
|
|
94
|
-
other
|
|
95
|
-
)),
|
|
96
|
-
}
|
|
97
|
-
}
|
|
98
|
-
|
|
99
|
-
fn parse_highlight_style(value: &str) -> FfiResult<HighlightStyle> {
|
|
100
|
-
match value.to_lowercase().as_str() {
|
|
101
|
-
"double_equal" | "==" | "highlight" => Ok(HighlightStyle::DoubleEqual),
|
|
102
|
-
"html" => Ok(HighlightStyle::Html),
|
|
103
|
-
"bold" => Ok(HighlightStyle::Bold),
|
|
104
|
-
"none" => Ok(HighlightStyle::None),
|
|
105
|
-
other => Err(format!(
|
|
106
|
-
"Invalid highlight_style '{}'. Expected one of: double_equal, html, bold, none",
|
|
107
|
-
other
|
|
108
|
-
)),
|
|
109
|
-
}
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
fn parse_whitespace_mode(value: &str) -> FfiResult<WhitespaceMode> {
|
|
113
|
-
match value.to_lowercase().as_str() {
|
|
114
|
-
"normalized" => Ok(WhitespaceMode::Normalized),
|
|
115
|
-
"strict" => Ok(WhitespaceMode::Strict),
|
|
116
|
-
other => Err(format!(
|
|
117
|
-
"Invalid whitespace_mode '{}'. Expected 'normalized' or 'strict'",
|
|
118
|
-
other
|
|
119
|
-
)),
|
|
120
|
-
}
|
|
121
|
-
}
|
|
122
|
-
|
|
123
|
-
fn parse_newline_style(value: &str) -> FfiResult<NewlineStyle> {
|
|
124
|
-
match value.to_lowercase().as_str() {
|
|
125
|
-
"spaces" => Ok(NewlineStyle::Spaces),
|
|
126
|
-
"backslash" => Ok(NewlineStyle::Backslash),
|
|
127
|
-
other => Err(format!(
|
|
128
|
-
"Invalid newline_style '{}'. Expected 'spaces' or 'backslash'",
|
|
129
|
-
other
|
|
130
|
-
)),
|
|
131
|
-
}
|
|
132
|
-
}
|
|
133
|
-
|
|
134
|
-
fn parse_code_block_style(value: &str) -> FfiResult<CodeBlockStyle> {
|
|
135
|
-
match value.to_lowercase().as_str() {
|
|
136
|
-
"indented" => Ok(CodeBlockStyle::Indented),
|
|
137
|
-
"backticks" => Ok(CodeBlockStyle::Backticks),
|
|
138
|
-
"tildes" => Ok(CodeBlockStyle::Tildes),
|
|
139
|
-
other => Err(format!(
|
|
140
|
-
"Invalid code_block_style '{}'. Expected 'indented', 'backticks', or 'tildes'",
|
|
141
|
-
other
|
|
142
|
-
)),
|
|
143
|
-
}
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
fn parse_preprocessing_preset(value: &str) -> FfiResult<PreprocessingPreset> {
|
|
147
|
-
match value.to_lowercase().as_str() {
|
|
148
|
-
"minimal" => Ok(PreprocessingPreset::Minimal),
|
|
149
|
-
"standard" => Ok(PreprocessingPreset::Standard),
|
|
150
|
-
"aggressive" => Ok(PreprocessingPreset::Aggressive),
|
|
151
|
-
other => Err(format!(
|
|
152
|
-
"Invalid preprocessing.preset '{}'. Expected one of: minimal, standard, aggressive",
|
|
153
|
-
other
|
|
154
|
-
)),
|
|
155
|
-
}
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
fn parse_html_options(value: &serde_json::Value) -> FfiResult<ConversionOptions> {
|
|
159
|
-
let mut opts = ConversionOptions::default();
|
|
160
|
-
let obj = value
|
|
161
|
-
.as_object()
|
|
162
|
-
.ok_or_else(|| "html_options must be an object".to_string())?;
|
|
163
|
-
|
|
164
|
-
if let Some(val) = obj.get("heading_style") {
|
|
165
|
-
opts.heading_style = parse_enum(Some(val), parse_heading_style)?.unwrap_or(opts.heading_style);
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
if let Some(val) = obj.get("list_indent_type") {
|
|
169
|
-
opts.list_indent_type = parse_enum(Some(val), parse_list_indent_type)?.unwrap_or(opts.list_indent_type);
|
|
170
|
-
}
|
|
171
|
-
|
|
172
|
-
if let Some(val) = obj.get("list_indent_width") {
|
|
173
|
-
opts.list_indent_width = val
|
|
174
|
-
.as_u64()
|
|
175
|
-
.map(|v| v as usize)
|
|
176
|
-
.ok_or_else(|| "list_indent_width must be an integer".to_string())?;
|
|
177
|
-
}
|
|
178
|
-
|
|
179
|
-
if let Some(val) = obj.get("bullets") {
|
|
180
|
-
opts.bullets = val
|
|
181
|
-
.as_str()
|
|
182
|
-
.map(str::to_string)
|
|
183
|
-
.ok_or_else(|| "bullets must be a string".to_string())?;
|
|
184
|
-
}
|
|
185
|
-
|
|
186
|
-
if let Some(val) = obj.get("strong_em_symbol") {
|
|
187
|
-
let symbol = val
|
|
188
|
-
.as_str()
|
|
189
|
-
.ok_or_else(|| "strong_em_symbol must be a string".to_string())?;
|
|
190
|
-
let mut chars = symbol.chars();
|
|
191
|
-
opts.strong_em_symbol = chars
|
|
192
|
-
.next()
|
|
193
|
-
.ok_or_else(|| "strong_em_symbol must not be empty".to_string())?;
|
|
194
|
-
}
|
|
195
|
-
|
|
196
|
-
if let Some(val) = obj.get("escape_asterisks") {
|
|
197
|
-
opts.escape_asterisks = val
|
|
198
|
-
.as_bool()
|
|
199
|
-
.ok_or_else(|| "escape_asterisks must be a boolean".to_string())?;
|
|
200
|
-
}
|
|
201
|
-
if let Some(val) = obj.get("escape_underscores") {
|
|
202
|
-
opts.escape_underscores = val
|
|
203
|
-
.as_bool()
|
|
204
|
-
.ok_or_else(|| "escape_underscores must be a boolean".to_string())?;
|
|
205
|
-
}
|
|
206
|
-
if let Some(val) = obj.get("escape_misc") {
|
|
207
|
-
opts.escape_misc = val
|
|
208
|
-
.as_bool()
|
|
209
|
-
.ok_or_else(|| "escape_misc must be a boolean".to_string())?;
|
|
210
|
-
}
|
|
211
|
-
if let Some(val) = obj.get("escape_ascii") {
|
|
212
|
-
opts.escape_ascii = val
|
|
213
|
-
.as_bool()
|
|
214
|
-
.ok_or_else(|| "escape_ascii must be a boolean".to_string())?;
|
|
215
|
-
}
|
|
216
|
-
|
|
217
|
-
if let Some(val) = obj.get("code_language") {
|
|
218
|
-
opts.code_language = val
|
|
219
|
-
.as_str()
|
|
220
|
-
.map(str::to_string)
|
|
221
|
-
.ok_or_else(|| "code_language must be a string".to_string())?;
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
if let Some(val) = obj.get("autolinks") {
|
|
225
|
-
opts.autolinks = val.as_bool().ok_or_else(|| "autolinks must be a boolean".to_string())?;
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
if let Some(val) = obj.get("default_title") {
|
|
229
|
-
opts.default_title = val
|
|
230
|
-
.as_bool()
|
|
231
|
-
.ok_or_else(|| "default_title must be a boolean".to_string())?;
|
|
232
|
-
}
|
|
233
|
-
|
|
234
|
-
if let Some(val) = obj.get("br_in_tables") {
|
|
235
|
-
opts.br_in_tables = val
|
|
236
|
-
.as_bool()
|
|
237
|
-
.ok_or_else(|| "br_in_tables must be a boolean".to_string())?;
|
|
238
|
-
}
|
|
239
|
-
|
|
240
|
-
if let Some(val) = obj.get("hocr_spatial_tables") {
|
|
241
|
-
opts.hocr_spatial_tables = val
|
|
242
|
-
.as_bool()
|
|
243
|
-
.ok_or_else(|| "hocr_spatial_tables must be a boolean".to_string())?;
|
|
244
|
-
}
|
|
245
|
-
|
|
246
|
-
if let Some(val) = obj.get("highlight_style") {
|
|
247
|
-
opts.highlight_style = parse_enum(Some(val), parse_highlight_style)?.unwrap_or(opts.highlight_style);
|
|
248
|
-
}
|
|
249
|
-
|
|
250
|
-
if let Some(val) = obj.get("extract_metadata") {
|
|
251
|
-
opts.extract_metadata = val
|
|
252
|
-
.as_bool()
|
|
253
|
-
.ok_or_else(|| "extract_metadata must be a boolean".to_string())?;
|
|
254
|
-
}
|
|
255
|
-
|
|
256
|
-
if let Some(val) = obj.get("whitespace_mode") {
|
|
257
|
-
opts.whitespace_mode = parse_enum(Some(val), parse_whitespace_mode)?.unwrap_or(opts.whitespace_mode);
|
|
258
|
-
}
|
|
259
|
-
|
|
260
|
-
if let Some(val) = obj.get("strip_newlines") {
|
|
261
|
-
opts.strip_newlines = val
|
|
262
|
-
.as_bool()
|
|
263
|
-
.ok_or_else(|| "strip_newlines must be a boolean".to_string())?;
|
|
264
|
-
}
|
|
265
|
-
|
|
266
|
-
if let Some(val) = obj.get("wrap") {
|
|
267
|
-
opts.wrap = val.as_bool().ok_or_else(|| "wrap must be a boolean".to_string())?;
|
|
268
|
-
}
|
|
269
|
-
|
|
270
|
-
if let Some(val) = obj.get("wrap_width") {
|
|
271
|
-
opts.wrap_width = val
|
|
272
|
-
.as_u64()
|
|
273
|
-
.map(|v| v as usize)
|
|
274
|
-
.ok_or_else(|| "wrap_width must be an integer".to_string())?;
|
|
275
|
-
}
|
|
276
|
-
|
|
277
|
-
if let Some(val) = obj.get("convert_as_inline") {
|
|
278
|
-
opts.convert_as_inline = val
|
|
279
|
-
.as_bool()
|
|
280
|
-
.ok_or_else(|| "convert_as_inline must be a boolean".to_string())?;
|
|
281
|
-
}
|
|
282
|
-
|
|
283
|
-
if let Some(val) = obj.get("sub_symbol") {
|
|
284
|
-
opts.sub_symbol = val
|
|
285
|
-
.as_str()
|
|
286
|
-
.map(str::to_string)
|
|
287
|
-
.ok_or_else(|| "sub_symbol must be a string".to_string())?;
|
|
288
|
-
}
|
|
289
|
-
|
|
290
|
-
if let Some(val) = obj.get("sup_symbol") {
|
|
291
|
-
opts.sup_symbol = val
|
|
292
|
-
.as_str()
|
|
293
|
-
.map(str::to_string)
|
|
294
|
-
.ok_or_else(|| "sup_symbol must be a string".to_string())?;
|
|
295
|
-
}
|
|
296
|
-
|
|
297
|
-
if let Some(val) = obj.get("newline_style") {
|
|
298
|
-
opts.newline_style = parse_enum(Some(val), parse_newline_style)?.unwrap_or(opts.newline_style);
|
|
299
|
-
}
|
|
300
|
-
|
|
301
|
-
if let Some(val) = obj.get("code_block_style") {
|
|
302
|
-
opts.code_block_style = parse_enum(Some(val), parse_code_block_style)?.unwrap_or(opts.code_block_style);
|
|
303
|
-
}
|
|
304
|
-
|
|
305
|
-
if let Some(val) = obj.get("keep_inline_images_in") {
|
|
306
|
-
opts.keep_inline_images_in = val
|
|
307
|
-
.as_array()
|
|
308
|
-
.ok_or_else(|| "keep_inline_images_in must be an array".to_string())?
|
|
309
|
-
.iter()
|
|
310
|
-
.map(|v| {
|
|
311
|
-
v.as_str()
|
|
312
|
-
.map(str::to_string)
|
|
313
|
-
.ok_or_else(|| "keep_inline_images_in entries must be strings".to_string())
|
|
314
|
-
})
|
|
315
|
-
.collect::<std::result::Result<Vec<_>, _>>()?;
|
|
316
|
-
}
|
|
317
|
-
|
|
318
|
-
if let Some(val) = obj.get("encoding") {
|
|
319
|
-
opts.encoding = val
|
|
320
|
-
.as_str()
|
|
321
|
-
.map(str::to_string)
|
|
322
|
-
.ok_or_else(|| "encoding must be a string".to_string())?;
|
|
323
|
-
}
|
|
324
|
-
|
|
325
|
-
if let Some(val) = obj.get("debug") {
|
|
326
|
-
opts.debug = val.as_bool().ok_or_else(|| "debug must be a boolean".to_string())?;
|
|
327
|
-
}
|
|
328
|
-
|
|
329
|
-
if let Some(val) = obj.get("strip_tags") {
|
|
330
|
-
opts.strip_tags = val
|
|
331
|
-
.as_array()
|
|
332
|
-
.ok_or_else(|| "strip_tags must be an array".to_string())?
|
|
333
|
-
.iter()
|
|
334
|
-
.map(|v| {
|
|
335
|
-
v.as_str()
|
|
336
|
-
.map(str::to_string)
|
|
337
|
-
.ok_or_else(|| "strip_tags entries must be strings".to_string())
|
|
338
|
-
})
|
|
339
|
-
.collect::<std::result::Result<Vec<_>, _>>()?;
|
|
340
|
-
}
|
|
341
|
-
|
|
342
|
-
if let Some(val) = obj.get("preserve_tags") {
|
|
343
|
-
opts.preserve_tags = val
|
|
344
|
-
.as_array()
|
|
345
|
-
.ok_or_else(|| "preserve_tags must be an array".to_string())?
|
|
346
|
-
.iter()
|
|
347
|
-
.map(|v| {
|
|
348
|
-
v.as_str()
|
|
349
|
-
.map(str::to_string)
|
|
350
|
-
.ok_or_else(|| "preserve_tags entries must be strings".to_string())
|
|
351
|
-
})
|
|
352
|
-
.collect::<std::result::Result<Vec<_>, _>>()?;
|
|
353
|
-
}
|
|
354
|
-
|
|
355
|
-
if let Some(val) = obj.get("preprocessing") {
|
|
356
|
-
let pre = val
|
|
357
|
-
.as_object()
|
|
358
|
-
.ok_or_else(|| "preprocessing must be an object".to_string())?;
|
|
359
|
-
let mut preprocessing = opts.preprocessing.clone();
|
|
360
|
-
|
|
361
|
-
if let Some(v) = pre.get("enabled") {
|
|
362
|
-
preprocessing.enabled = v
|
|
363
|
-
.as_bool()
|
|
364
|
-
.ok_or_else(|| "preprocessing.enabled must be a boolean".to_string())?;
|
|
365
|
-
}
|
|
366
|
-
|
|
367
|
-
if let Some(v) = pre.get("preset") {
|
|
368
|
-
let preset = v
|
|
369
|
-
.as_str()
|
|
370
|
-
.ok_or_else(|| "preprocessing.preset must be a string".to_string())?;
|
|
371
|
-
preprocessing.preset = parse_preprocessing_preset(preset)?;
|
|
372
|
-
}
|
|
373
|
-
|
|
374
|
-
if let Some(v) = pre.get("remove_navigation") {
|
|
375
|
-
preprocessing.remove_navigation = v
|
|
376
|
-
.as_bool()
|
|
377
|
-
.ok_or_else(|| "preprocessing.remove_navigation must be a boolean".to_string())?;
|
|
378
|
-
}
|
|
379
|
-
|
|
380
|
-
if let Some(v) = pre.get("remove_forms") {
|
|
381
|
-
preprocessing.remove_forms = v
|
|
382
|
-
.as_bool()
|
|
383
|
-
.ok_or_else(|| "preprocessing.remove_forms must be a boolean".to_string())?;
|
|
384
|
-
}
|
|
385
|
-
|
|
386
|
-
opts.preprocessing = preprocessing;
|
|
387
|
-
}
|
|
388
|
-
|
|
389
|
-
Ok(opts)
|
|
390
|
-
}
|
|
391
|
-
|
|
392
|
-
let value: serde_json::Value =
|
|
393
|
-
serde_json::from_str(config_str).map_err(|e| format!("Failed to parse config JSON: {}", e))?;
|
|
394
|
-
|
|
395
|
-
let html_options = value.get("html_options").map(parse_html_options).transpose()?;
|
|
396
|
-
|
|
397
|
-
let mut config: ExtractionConfig =
|
|
398
|
-
serde_json::from_value(value).map_err(|e| format!("Failed to parse config JSON: {}", e))?;
|
|
399
|
-
|
|
400
|
-
if let Some(options) = html_options {
|
|
401
|
-
config.html_options = Some(options);
|
|
402
|
-
}
|
|
403
|
-
|
|
404
|
-
Ok(config)
|
|
405
|
-
}
|
|
406
|
-
|
|
407
|
-
/// Owning wrapper around a raw C string pointer.
///
/// While a guard is alive it is responsible for the allocation behind `ptr`.
/// Dropping the guard releases the string, so early returns during the
/// construction of a `CExtractionResult` do not leak strings that were already
/// allocated. Ownership is handed to the caller with [`CStringGuard::into_raw`].
struct CStringGuard {
    ptr: *mut c_char,
}

impl CStringGuard {
    /// Take ownership of `s`, keeping only its raw pointer.
    fn new(s: CString) -> Self {
        let ptr = s.into_raw();
        Self { ptr }
    }

    /// Give up ownership and return the raw pointer.
    ///
    /// The guard is left holding a null pointer, so its destructor (which runs
    /// when this method returns) frees nothing.
    fn into_raw(mut self) -> *mut c_char {
        std::mem::replace(&mut self.ptr, ptr::null_mut())
    }
}

impl Drop for CStringGuard {
    fn drop(&mut self) {
        if self.ptr.is_null() {
            // Ownership was already transferred by `into_raw`.
            return;
        }
        // SAFETY: a non-null `ptr` always originates from `CString::into_raw` in
        // `new` and has not been released or handed out since.
        unsafe { drop(CString::from_raw(self.ptr)) };
    }
}
|
|
437
|
-
|
|
438
|
-
/// Extraction result as seen by C callers.
///
/// The layout is part of the ABI: it must stay in sync with the Java side's
/// `MemoryLayout` definition in `KreuzbergFFI.java`. With 8-byte pointers the
/// struct is 11 pointers (88 bytes) + 1 `bool` + 7 bytes of explicit padding,
/// 96 bytes in total. Do not reorder, add, or remove fields without updating
/// every binding.
///
/// Every string field is a null-terminated string that must be freed with
/// `kreuzberg_free_string`. Fields documented as optional are NULL when the
/// value is not available.
#[repr(C)]
pub struct CExtractionResult {
    /// Extracted text content (UTF-8).
    pub content: *mut c_char,
    /// Detected MIME type.
    pub mime_type: *mut c_char,
    /// Document language; optional.
    pub language: *mut c_char,
    /// Document date; optional.
    pub date: *mut c_char,
    /// Document subject; optional.
    pub subject: *mut c_char,
    /// Tables as a JSON array; NULL if there are no tables.
    pub tables_json: *mut c_char,
    /// Detected languages as a JSON array; optional.
    pub detected_languages_json: *mut c_char,
    /// Metadata as a JSON object; NULL if there is no metadata.
    pub metadata_json: *mut c_char,
    /// Text chunks as a JSON array; optional.
    pub chunks_json: *mut c_char,
    /// Extracted images as a JSON array; optional.
    pub images_json: *mut c_char,
    /// Page structure as a JSON object; optional.
    pub page_structure_json: *mut c_char,
    /// Whether extraction was successful.
    pub success: bool,
    /// Explicit padding (7 bytes) so the struct size matches the Java `MemoryLayout`.
    _padding1: [u8; 7],
}
|
|
471
|
-
|
|
472
|
-
/// Helper function to convert ExtractionResult to CExtractionResult
|
|
473
|
-
///
|
|
474
|
-
/// Uses RAII guards to prevent memory leaks if any string allocation fails.
|
|
475
|
-
/// All allocated C strings are automatically freed if an error occurs before
|
|
476
|
-
/// the final result is constructed.
|
|
477
|
-
fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*mut CExtractionResult, String> {
|
|
478
|
-
let ExtractionResult {
|
|
479
|
-
content,
|
|
480
|
-
mime_type,
|
|
481
|
-
metadata,
|
|
482
|
-
tables,
|
|
483
|
-
detected_languages,
|
|
484
|
-
chunks,
|
|
485
|
-
images,
|
|
486
|
-
pages,
|
|
487
|
-
} = result;
|
|
488
|
-
|
|
489
|
-
let content_guard =
|
|
490
|
-
CStringGuard::new(CString::new(content).map_err(|e| format!("Failed to convert content to C string: {}", e))?);
|
|
491
|
-
|
|
492
|
-
let mime_type_guard = CStringGuard::new(
|
|
493
|
-
CString::new(mime_type).map_err(|e| format!("Failed to convert MIME type to C string: {}", e))?,
|
|
494
|
-
);
|
|
495
|
-
|
|
496
|
-
let language_guard = match &metadata.language {
|
|
497
|
-
Some(lang) => Some(CStringGuard::new(
|
|
498
|
-
CString::new(lang.as_str()).map_err(|e| format!("Failed to convert language to C string: {}", e))?,
|
|
499
|
-
)),
|
|
500
|
-
None => None,
|
|
501
|
-
};
|
|
502
|
-
|
|
503
|
-
let date_guard = match &metadata.date {
|
|
504
|
-
Some(d) => Some(CStringGuard::new(
|
|
505
|
-
CString::new(d.as_str()).map_err(|e| format!("Failed to convert date to C string: {}", e))?,
|
|
506
|
-
)),
|
|
507
|
-
None => None,
|
|
508
|
-
};
|
|
509
|
-
|
|
510
|
-
let subject_guard = match &metadata.subject {
|
|
511
|
-
Some(subj) => Some(CStringGuard::new(
|
|
512
|
-
CString::new(subj.as_str()).map_err(|e| format!("Failed to convert subject to C string: {}", e))?,
|
|
513
|
-
)),
|
|
514
|
-
None => None,
|
|
515
|
-
};
|
|
516
|
-
|
|
517
|
-
let tables_json_guard = if !tables.is_empty() {
|
|
518
|
-
let json = serde_json::to_string(&tables).map_err(|e| format!("Failed to serialize tables to JSON: {}", e))?;
|
|
519
|
-
Some(CStringGuard::new(CString::new(json).map_err(|e| {
|
|
520
|
-
format!("Failed to convert tables JSON to C string: {}", e)
|
|
521
|
-
})?))
|
|
522
|
-
} else {
|
|
523
|
-
None
|
|
524
|
-
};
|
|
525
|
-
|
|
526
|
-
let detected_languages_json_guard = match detected_languages {
|
|
527
|
-
Some(langs) if !langs.is_empty() => {
|
|
528
|
-
let json = serde_json::to_string(&langs)
|
|
529
|
-
.map_err(|e| format!("Failed to serialize detected languages to JSON: {}", e))?;
|
|
530
|
-
Some(CStringGuard::new(CString::new(json).map_err(|e| {
|
|
531
|
-
format!("Failed to convert detected languages JSON to C string: {}", e)
|
|
532
|
-
})?))
|
|
533
|
-
}
|
|
534
|
-
_ => None,
|
|
535
|
-
};
|
|
536
|
-
|
|
537
|
-
let metadata_json_guard = {
|
|
538
|
-
let json =
|
|
539
|
-
serde_json::to_string(&metadata).map_err(|e| format!("Failed to serialize metadata to JSON: {}", e))?;
|
|
540
|
-
Some(CStringGuard::new(CString::new(json).map_err(|e| {
|
|
541
|
-
format!("Failed to convert metadata JSON to C string: {}", e)
|
|
542
|
-
})?))
|
|
543
|
-
};
|
|
544
|
-
|
|
545
|
-
let chunks_json_guard = match chunks {
|
|
546
|
-
Some(chunks) if !chunks.is_empty() => {
|
|
547
|
-
let json =
|
|
548
|
-
serde_json::to_string(&chunks).map_err(|e| format!("Failed to serialize chunks to JSON: {}", e))?;
|
|
549
|
-
Some(CStringGuard::new(CString::new(json).map_err(|e| {
|
|
550
|
-
format!("Failed to convert chunks JSON to C string: {}", e)
|
|
551
|
-
})?))
|
|
552
|
-
}
|
|
553
|
-
_ => None,
|
|
554
|
-
};
|
|
555
|
-
|
|
556
|
-
let images_json_guard = match images {
|
|
557
|
-
Some(images) if !images.is_empty() => {
|
|
558
|
-
let json =
|
|
559
|
-
serde_json::to_string(&images).map_err(|e| format!("Failed to serialize images to JSON: {}", e))?;
|
|
560
|
-
Some(CStringGuard::new(CString::new(json).map_err(|e| {
|
|
561
|
-
format!("Failed to convert images JSON to C string: {}", e)
|
|
562
|
-
})?))
|
|
563
|
-
}
|
|
564
|
-
_ => None,
|
|
565
|
-
};
|
|
566
|
-
|
|
567
|
-
let page_structure_json_guard = match &metadata.pages {
|
|
568
|
-
Some(page_structure) => {
|
|
569
|
-
let json = serde_json::to_string(&page_structure)
|
|
570
|
-
.map_err(|e| format!("Failed to serialize page structure to JSON: {}", e))?;
|
|
571
|
-
Some(CStringGuard::new(CString::new(json).map_err(|e| {
|
|
572
|
-
format!("Failed to convert page structure JSON to C string: {}", e)
|
|
573
|
-
})?))
|
|
574
|
-
}
|
|
575
|
-
_ => None,
|
|
576
|
-
};
|
|
577
|
-
|
|
578
|
-
let _pages_json_guard = match pages {
|
|
579
|
-
Some(pages) if !pages.is_empty() => {
|
|
580
|
-
let json =
|
|
581
|
-
serde_json::to_string(&pages).map_err(|e| format!("Failed to serialize pages to JSON: {}", e))?;
|
|
582
|
-
Some(CStringGuard::new(CString::new(json).map_err(|e| {
|
|
583
|
-
format!("Failed to convert pages JSON to C string: {}", e)
|
|
584
|
-
})?))
|
|
585
|
-
}
|
|
586
|
-
_ => None,
|
|
587
|
-
};
|
|
588
|
-
|
|
589
|
-
Ok(Box::into_raw(Box::new(CExtractionResult {
|
|
590
|
-
content: content_guard.into_raw(),
|
|
591
|
-
mime_type: mime_type_guard.into_raw(),
|
|
592
|
-
language: language_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
593
|
-
date: date_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
594
|
-
subject: subject_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
595
|
-
tables_json: tables_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
596
|
-
detected_languages_json: detected_languages_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
597
|
-
metadata_json: metadata_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
598
|
-
chunks_json: chunks_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
599
|
-
images_json: images_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
600
|
-
page_structure_json: page_structure_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
601
|
-
success: true,
|
|
602
|
-
_padding1: [0u8; 7],
|
|
603
|
-
})))
|
|
604
|
-
}
|
|
605
|
-
|
|
606
|
-
/// Extract text and metadata from a file (synchronous).
|
|
607
|
-
///
|
|
608
|
-
/// # Safety
|
|
609
|
-
///
|
|
610
|
-
/// - `file_path` must be a valid null-terminated C string
|
|
611
|
-
/// - The returned pointer must be freed with `kreuzberg_free_result`
|
|
612
|
-
/// - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
613
|
-
///
|
|
614
|
-
/// # Example (C)
|
|
615
|
-
///
|
|
616
|
-
/// ```c
|
|
617
|
-
/// const char* path = "/path/to/document.pdf";
|
|
618
|
-
/// CExtractionResult* result = kreuzberg_extract_file_sync(path);
|
|
619
|
-
/// if (result != NULL && result->success) {
|
|
620
|
-
/// printf("Content: %s\n", result->content);
|
|
621
|
-
/// printf("MIME: %s\n", result->mime_type);
|
|
622
|
-
/// kreuzberg_free_result(result);
|
|
623
|
-
/// } else {
|
|
624
|
-
/// const char* error = kreuzberg_last_error();
|
|
625
|
-
/// printf("Error: %s\n", error);
|
|
626
|
-
/// }
|
|
627
|
-
/// ```
|
|
628
|
-
#[unsafe(no_mangle)]
|
|
629
|
-
pub unsafe extern "C" fn kreuzberg_extract_file_sync(file_path: *const c_char) -> *mut CExtractionResult {
|
|
630
|
-
ffi_panic_guard!("kreuzberg_extract_file_sync", {
|
|
631
|
-
clear_last_error();
|
|
632
|
-
|
|
633
|
-
if file_path.is_null() {
|
|
634
|
-
set_last_error("file_path cannot be NULL".to_string());
|
|
635
|
-
return ptr::null_mut();
|
|
636
|
-
}
|
|
637
|
-
|
|
638
|
-
let path_str = match unsafe { CStr::from_ptr(file_path) }.to_str() {
|
|
639
|
-
Ok(s) => s,
|
|
640
|
-
Err(e) => {
|
|
641
|
-
set_last_error(format!("Invalid UTF-8 in file path: {}", e));
|
|
642
|
-
return ptr::null_mut();
|
|
643
|
-
}
|
|
644
|
-
};
|
|
645
|
-
|
|
646
|
-
let path = Path::new(path_str);
|
|
647
|
-
let config = ExtractionConfig::default();
|
|
648
|
-
|
|
649
|
-
match kreuzberg::extract_file_sync(path, None, &config) {
|
|
650
|
-
Ok(result) => match to_c_extraction_result(result) {
|
|
651
|
-
Ok(ptr) => ptr,
|
|
652
|
-
Err(e) => {
|
|
653
|
-
set_last_error(e);
|
|
654
|
-
ptr::null_mut()
|
|
655
|
-
}
|
|
656
|
-
},
|
|
657
|
-
Err(e) => {
|
|
658
|
-
set_last_error(e.to_string());
|
|
659
|
-
ptr::null_mut()
|
|
660
|
-
}
|
|
661
|
-
}
|
|
662
|
-
})
|
|
663
|
-
}
|
|
664
|
-
|
|
665
|
-
/// Detect MIME type from a file path.
|
|
666
|
-
///
|
|
667
|
-
/// # Safety
|
|
668
|
-
///
|
|
669
|
-
/// - `file_path` must be a valid null-terminated C string
|
|
670
|
-
/// - The returned string must be freed with `kreuzberg_free_string`
|
|
671
|
-
/// - Returns NULL on error (check `kreuzberg_last_error`)
|
|
672
|
-
#[unsafe(no_mangle)]
|
|
673
|
-
pub unsafe extern "C" fn kreuzberg_detect_mime_type(file_path: *const c_char, check_exists: bool) -> *mut c_char {
|
|
674
|
-
ffi_panic_guard!("kreuzberg_detect_mime_type", {
|
|
675
|
-
clear_last_error();
|
|
676
|
-
|
|
677
|
-
if file_path.is_null() {
|
|
678
|
-
set_last_error("file_path cannot be NULL".to_string());
|
|
679
|
-
return ptr::null_mut();
|
|
680
|
-
}
|
|
681
|
-
|
|
682
|
-
let path_str = match unsafe { CStr::from_ptr(file_path) }.to_str() {
|
|
683
|
-
Ok(s) => s,
|
|
684
|
-
Err(e) => {
|
|
685
|
-
set_last_error(format!("Invalid UTF-8 in file path: {}", e));
|
|
686
|
-
return ptr::null_mut();
|
|
687
|
-
}
|
|
688
|
-
};
|
|
689
|
-
|
|
690
|
-
match kreuzberg::core::mime::detect_mime_type(path_str, check_exists) {
|
|
691
|
-
Ok(mime) => match string_to_c_string(mime) {
|
|
692
|
-
Ok(ptr) => ptr,
|
|
693
|
-
Err(e) => {
|
|
694
|
-
set_last_error(e);
|
|
695
|
-
ptr::null_mut()
|
|
696
|
-
}
|
|
697
|
-
},
|
|
698
|
-
Err(e) => {
|
|
699
|
-
set_last_error(e.to_string());
|
|
700
|
-
ptr::null_mut()
|
|
701
|
-
}
|
|
702
|
-
}
|
|
703
|
-
})
|
|
704
|
-
}
|
|
705
|
-
|
|
706
|
-
/// Validate that a MIME type is supported by Kreuzberg.
|
|
707
|
-
///
|
|
708
|
-
/// # Safety
|
|
709
|
-
///
|
|
710
|
-
/// - `mime_type` must be a valid null-terminated C string
|
|
711
|
-
/// - The returned string must be freed with `kreuzberg_free_string`
|
|
712
|
-
/// - Returns NULL on error (check `kreuzberg_last_error`)
|
|
713
|
-
#[unsafe(no_mangle)]
|
|
714
|
-
pub unsafe extern "C" fn kreuzberg_validate_mime_type(mime_type: *const c_char) -> *mut c_char {
|
|
715
|
-
ffi_panic_guard!("kreuzberg_validate_mime_type", {
|
|
716
|
-
clear_last_error();
|
|
717
|
-
|
|
718
|
-
if mime_type.is_null() {
|
|
719
|
-
set_last_error("mime_type cannot be NULL".to_string());
|
|
720
|
-
return ptr::null_mut();
|
|
721
|
-
}
|
|
722
|
-
|
|
723
|
-
let mime_type_str = match unsafe { CStr::from_ptr(mime_type) }.to_str() {
|
|
724
|
-
Ok(s) => s,
|
|
725
|
-
Err(e) => {
|
|
726
|
-
set_last_error(format!("Invalid UTF-8 in mime_type: {}", e));
|
|
727
|
-
return ptr::null_mut();
|
|
728
|
-
}
|
|
729
|
-
};
|
|
730
|
-
|
|
731
|
-
match kreuzberg::validate_mime_type(mime_type_str) {
|
|
732
|
-
Ok(validated) => match string_to_c_string(validated) {
|
|
733
|
-
Ok(ptr) => ptr,
|
|
734
|
-
Err(e) => {
|
|
735
|
-
set_last_error(e);
|
|
736
|
-
ptr::null_mut()
|
|
737
|
-
}
|
|
738
|
-
},
|
|
739
|
-
Err(e) => {
|
|
740
|
-
set_last_error(e.to_string());
|
|
741
|
-
ptr::null_mut()
|
|
742
|
-
}
|
|
743
|
-
}
|
|
744
|
-
})
|
|
745
|
-
}
|
|
746
|
-
|
|
747
|
-
/// JSON-serializable view of a single embedding preset.
///
/// Built by `kreuzberg_get_embedding_preset` from the preset returned by
/// `kreuzberg::embeddings::get_preset` and then serialized with `serde_json`.
/// Excluded on the Windows GNU target, matching the functions that use it.
#[derive(Serialize)]
#[cfg(not(all(windows, target_env = "gnu")))]
struct SerializableEmbeddingPreset<'a> {
    // Preset name, borrowed from the source preset.
    name: &'a str,
    // Chunk size copied from the preset (unit not visible here — TODO confirm).
    chunk_size: usize,
    // Chunk overlap copied from the preset (same unit as `chunk_size`, presumably).
    overlap: usize,
    // `Debug` rendering (`{:?}`) of the preset's `model` field.
    model_name: String,
    // Dimension count copied from the preset.
    dimensions: usize,
    // Description text, borrowed from the source preset.
    description: &'a str,
}
|
|
757
|
-
|
|
758
|
-
/// List available embedding preset names.
|
|
759
|
-
///
|
|
760
|
-
/// # Safety
|
|
761
|
-
///
|
|
762
|
-
/// - Returned string is a JSON array and must be freed with `kreuzberg_free_string`
|
|
763
|
-
/// - Returns NULL on error (check `kreuzberg_last_error`)
|
|
764
|
-
#[cfg(not(all(windows, target_env = "gnu")))]
|
|
765
|
-
#[unsafe(no_mangle)]
|
|
766
|
-
pub unsafe extern "C" fn kreuzberg_list_embedding_presets() -> *mut c_char {
|
|
767
|
-
ffi_panic_guard!("kreuzberg_list_embedding_presets", {
|
|
768
|
-
clear_last_error();
|
|
769
|
-
|
|
770
|
-
let presets = kreuzberg::embeddings::list_presets();
|
|
771
|
-
match serde_json::to_string(&presets) {
|
|
772
|
-
Ok(json) => match string_to_c_string(json) {
|
|
773
|
-
Ok(ptr) => ptr,
|
|
774
|
-
Err(e) => {
|
|
775
|
-
set_last_error(e);
|
|
776
|
-
ptr::null_mut()
|
|
777
|
-
}
|
|
778
|
-
},
|
|
779
|
-
Err(e) => {
|
|
780
|
-
set_last_error(format!("Failed to serialize presets: {}", e));
|
|
781
|
-
ptr::null_mut()
|
|
782
|
-
}
|
|
783
|
-
}
|
|
784
|
-
})
|
|
785
|
-
}
|
|
786
|
-
|
|
787
|
-
/// Get a specific embedding preset by name.
|
|
788
|
-
///
|
|
789
|
-
/// # Safety
|
|
790
|
-
///
|
|
791
|
-
/// - `name` must be a valid null-terminated C string
|
|
792
|
-
/// - Returned string is JSON object and must be freed with `kreuzberg_free_string`
|
|
793
|
-
/// - Returns NULL on error (check `kreuzberg_last_error`)
|
|
794
|
-
#[cfg(not(all(windows, target_env = "gnu")))]
|
|
795
|
-
#[unsafe(no_mangle)]
|
|
796
|
-
pub unsafe extern "C" fn kreuzberg_get_embedding_preset(name: *const c_char) -> *mut c_char {
|
|
797
|
-
ffi_panic_guard!("kreuzberg_get_embedding_preset", {
|
|
798
|
-
clear_last_error();
|
|
799
|
-
|
|
800
|
-
if name.is_null() {
|
|
801
|
-
set_last_error("preset name cannot be NULL".to_string());
|
|
802
|
-
return ptr::null_mut();
|
|
803
|
-
}
|
|
804
|
-
|
|
805
|
-
let preset_name = match unsafe { CStr::from_ptr(name) }.to_str() {
|
|
806
|
-
Ok(s) => s,
|
|
807
|
-
Err(e) => {
|
|
808
|
-
set_last_error(format!("Invalid UTF-8 in preset name: {}", e));
|
|
809
|
-
return ptr::null_mut();
|
|
810
|
-
}
|
|
811
|
-
};
|
|
812
|
-
|
|
813
|
-
let preset = match kreuzberg::embeddings::get_preset(preset_name) {
|
|
814
|
-
Some(preset) => preset,
|
|
815
|
-
None => {
|
|
816
|
-
set_last_error(format!("Unknown embedding preset: {}", preset_name));
|
|
817
|
-
return ptr::null_mut();
|
|
818
|
-
}
|
|
819
|
-
};
|
|
820
|
-
|
|
821
|
-
let model_name = format!("{:?}", preset.model);
|
|
822
|
-
let serializable = SerializableEmbeddingPreset {
|
|
823
|
-
name: preset.name,
|
|
824
|
-
chunk_size: preset.chunk_size,
|
|
825
|
-
overlap: preset.overlap,
|
|
826
|
-
model_name,
|
|
827
|
-
dimensions: preset.dimensions,
|
|
828
|
-
description: preset.description,
|
|
829
|
-
};
|
|
830
|
-
|
|
831
|
-
match serde_json::to_string(&serializable) {
|
|
832
|
-
Ok(json) => match string_to_c_string(json) {
|
|
833
|
-
Ok(ptr) => ptr,
|
|
834
|
-
Err(e) => {
|
|
835
|
-
set_last_error(e);
|
|
836
|
-
ptr::null_mut()
|
|
837
|
-
}
|
|
838
|
-
},
|
|
839
|
-
Err(e) => {
|
|
840
|
-
set_last_error(format!("Failed to serialize embedding preset: {}", e));
|
|
841
|
-
ptr::null_mut()
|
|
842
|
-
}
|
|
843
|
-
}
|
|
844
|
-
})
|
|
845
|
-
}
|
|
846
|
-
|
|
847
|
-
/// Extract text and metadata from a file with custom configuration (synchronous).
|
|
848
|
-
///
|
|
849
|
-
/// # Safety
|
|
850
|
-
///
|
|
851
|
-
/// - `file_path` must be a valid null-terminated C string
|
|
852
|
-
/// - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
|
|
853
|
-
/// - The returned pointer must be freed with `kreuzberg_free_result`
|
|
854
|
-
/// - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
855
|
-
///
|
|
856
|
-
/// # Example (C)
|
|
857
|
-
///
|
|
858
|
-
/// ```c
|
|
859
|
-
/// const char* path = "/path/to/document.pdf";
|
|
860
|
-
/// const char* config = "{\"force_ocr\": true, \"ocr\": {\"language\": \"deu\"}}";
|
|
861
|
-
/// CExtractionResult* result = kreuzberg_extract_file_sync_with_config(path, config);
|
|
862
|
-
/// if (result != NULL && result->success) {
|
|
863
|
-
/// printf("Content: %s\n", result->content);
|
|
864
|
-
/// kreuzberg_free_result(result);
|
|
865
|
-
/// }
|
|
866
|
-
/// ```
|
|
867
|
-
#[unsafe(no_mangle)]
|
|
868
|
-
pub unsafe extern "C" fn kreuzberg_extract_file_sync_with_config(
|
|
869
|
-
file_path: *const c_char,
|
|
870
|
-
config_json: *const c_char,
|
|
871
|
-
) -> *mut CExtractionResult {
|
|
872
|
-
ffi_panic_guard!("kreuzberg_extract_file_sync_with_config", {
|
|
873
|
-
clear_last_error();
|
|
874
|
-
|
|
875
|
-
if file_path.is_null() {
|
|
876
|
-
set_last_error("file_path cannot be NULL".to_string());
|
|
877
|
-
return ptr::null_mut();
|
|
878
|
-
}
|
|
879
|
-
|
|
880
|
-
let path_str = match unsafe { CStr::from_ptr(file_path) }.to_str() {
|
|
881
|
-
Ok(s) => s,
|
|
882
|
-
Err(e) => {
|
|
883
|
-
set_last_error(format!("Invalid UTF-8 in file path: {}", e));
|
|
884
|
-
return ptr::null_mut();
|
|
885
|
-
}
|
|
886
|
-
};
|
|
887
|
-
|
|
888
|
-
let path = Path::new(path_str);
|
|
889
|
-
|
|
890
|
-
let config = if config_json.is_null() {
|
|
891
|
-
ExtractionConfig::default()
|
|
892
|
-
} else {
|
|
893
|
-
let config_str = match unsafe { CStr::from_ptr(config_json) }.to_str() {
|
|
894
|
-
Ok(s) => s,
|
|
895
|
-
Err(e) => {
|
|
896
|
-
set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
|
|
897
|
-
return ptr::null_mut();
|
|
898
|
-
}
|
|
899
|
-
};
|
|
900
|
-
|
|
901
|
-
match parse_extraction_config_from_json(config_str) {
|
|
902
|
-
Ok(cfg) => cfg,
|
|
903
|
-
Err(e) => {
|
|
904
|
-
set_last_error(e);
|
|
905
|
-
return ptr::null_mut();
|
|
906
|
-
}
|
|
907
|
-
}
|
|
908
|
-
};
|
|
909
|
-
|
|
910
|
-
match kreuzberg::extract_file_sync(path, None, &config) {
|
|
911
|
-
Ok(result) => match to_c_extraction_result(result) {
|
|
912
|
-
Ok(ptr) => ptr,
|
|
913
|
-
Err(e) => {
|
|
914
|
-
set_last_error(e);
|
|
915
|
-
ptr::null_mut()
|
|
916
|
-
}
|
|
917
|
-
},
|
|
918
|
-
Err(e) => {
|
|
919
|
-
set_last_error(e.to_string());
|
|
920
|
-
ptr::null_mut()
|
|
921
|
-
}
|
|
922
|
-
}
|
|
923
|
-
})
|
|
924
|
-
}
|
|
925
|
-
|
|
926
|
-
/// Extract text and metadata from byte array (synchronous).
|
|
927
|
-
///
|
|
928
|
-
/// # Safety
|
|
929
|
-
///
|
|
930
|
-
/// - `data` must be a valid pointer to a byte array of length `data_len`
|
|
931
|
-
/// - `mime_type` must be a valid null-terminated C string
|
|
932
|
-
/// - The returned pointer must be freed with `kreuzberg_free_result`
|
|
933
|
-
/// - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
934
|
-
///
|
|
935
|
-
/// # Example (C)
|
|
936
|
-
///
|
|
937
|
-
/// ```c
|
|
938
|
-
/// const uint8_t* data = ...; // Document bytes
|
|
939
|
-
/// size_t len = ...; // Length of data
|
|
940
|
-
/// const char* mime = "application/pdf";
|
|
941
|
-
/// CExtractionResult* result = kreuzberg_extract_bytes_sync(data, len, mime);
|
|
942
|
-
/// if (result != NULL && result->success) {
|
|
943
|
-
/// printf("Content: %s\n", result->content);
|
|
944
|
-
/// kreuzberg_free_result(result);
|
|
945
|
-
/// } else {
|
|
946
|
-
/// const char* error = kreuzberg_last_error();
|
|
947
|
-
/// printf("Error: %s\n", error);
|
|
948
|
-
/// }
|
|
949
|
-
/// ```
|
|
950
|
-
#[unsafe(no_mangle)]
|
|
951
|
-
pub unsafe extern "C" fn kreuzberg_extract_bytes_sync(
|
|
952
|
-
data: *const u8,
|
|
953
|
-
data_len: usize,
|
|
954
|
-
mime_type: *const c_char,
|
|
955
|
-
) -> *mut CExtractionResult {
|
|
956
|
-
ffi_panic_guard!("kreuzberg_extract_bytes_sync", {
|
|
957
|
-
clear_last_error();
|
|
958
|
-
|
|
959
|
-
if data.is_null() {
|
|
960
|
-
set_last_error("data cannot be NULL".to_string());
|
|
961
|
-
return ptr::null_mut();
|
|
962
|
-
}
|
|
963
|
-
|
|
964
|
-
if mime_type.is_null() {
|
|
965
|
-
set_last_error("mime_type cannot be NULL".to_string());
|
|
966
|
-
return ptr::null_mut();
|
|
967
|
-
}
|
|
968
|
-
|
|
969
|
-
let bytes = unsafe { std::slice::from_raw_parts(data, data_len) };
|
|
970
|
-
|
|
971
|
-
let mime_str = match unsafe { CStr::from_ptr(mime_type) }.to_str() {
|
|
972
|
-
Ok(s) => s,
|
|
973
|
-
Err(e) => {
|
|
974
|
-
set_last_error(format!("Invalid UTF-8 in MIME type: {}", e));
|
|
975
|
-
return ptr::null_mut();
|
|
976
|
-
}
|
|
977
|
-
};
|
|
978
|
-
|
|
979
|
-
let config = ExtractionConfig::default();
|
|
980
|
-
|
|
981
|
-
match kreuzberg::extract_bytes_sync(bytes, mime_str, &config) {
|
|
982
|
-
Ok(result) => match to_c_extraction_result(result) {
|
|
983
|
-
Ok(ptr) => ptr,
|
|
984
|
-
Err(e) => {
|
|
985
|
-
set_last_error(e);
|
|
986
|
-
ptr::null_mut()
|
|
987
|
-
}
|
|
988
|
-
},
|
|
989
|
-
Err(e) => {
|
|
990
|
-
set_last_error(e.to_string());
|
|
991
|
-
ptr::null_mut()
|
|
992
|
-
}
|
|
993
|
-
}
|
|
994
|
-
})
|
|
995
|
-
}
|
|
996
|
-
|
|
997
|
-
/// Extract text and metadata from byte array with custom configuration (synchronous).
|
|
998
|
-
///
|
|
999
|
-
/// # Safety
|
|
1000
|
-
///
|
|
1001
|
-
/// - `data` must be a valid pointer to a byte array of length `data_len`
|
|
1002
|
-
/// - `mime_type` must be a valid null-terminated C string
|
|
1003
|
-
/// - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
|
|
1004
|
-
/// - The returned pointer must be freed with `kreuzberg_free_result`
|
|
1005
|
-
/// - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
1006
|
-
///
|
|
1007
|
-
/// # Example (C)
|
|
1008
|
-
///
|
|
1009
|
-
/// ```c
|
|
1010
|
-
/// const uint8_t* data = ...; // Document bytes
|
|
1011
|
-
/// size_t len = ...; // Length of data
|
|
1012
|
-
/// const char* mime = "application/pdf";
|
|
1013
|
-
/// const char* config = "{\"force_ocr\": true, \"ocr\": {\"language\": \"deu\"}}";
|
|
1014
|
-
/// CExtractionResult* result = kreuzberg_extract_bytes_sync_with_config(data, len, mime, config);
|
|
1015
|
-
/// if (result != NULL && result->success) {
|
|
1016
|
-
/// printf("Content: %s\n", result->content);
|
|
1017
|
-
/// kreuzberg_free_result(result);
|
|
1018
|
-
/// }
|
|
1019
|
-
/// ```
|
|
1020
|
-
#[unsafe(no_mangle)]
|
|
1021
|
-
pub unsafe extern "C" fn kreuzberg_extract_bytes_sync_with_config(
|
|
1022
|
-
data: *const u8,
|
|
1023
|
-
data_len: usize,
|
|
1024
|
-
mime_type: *const c_char,
|
|
1025
|
-
config_json: *const c_char,
|
|
1026
|
-
) -> *mut CExtractionResult {
|
|
1027
|
-
ffi_panic_guard!("kreuzberg_extract_bytes_sync_with_config", {
|
|
1028
|
-
clear_last_error();
|
|
1029
|
-
|
|
1030
|
-
if data.is_null() {
|
|
1031
|
-
set_last_error("data cannot be NULL".to_string());
|
|
1032
|
-
return ptr::null_mut();
|
|
1033
|
-
}
|
|
1034
|
-
|
|
1035
|
-
if mime_type.is_null() {
|
|
1036
|
-
set_last_error("mime_type cannot be NULL".to_string());
|
|
1037
|
-
return ptr::null_mut();
|
|
1038
|
-
}
|
|
1039
|
-
|
|
1040
|
-
let bytes = unsafe { std::slice::from_raw_parts(data, data_len) };
|
|
1041
|
-
|
|
1042
|
-
let mime_str = match unsafe { CStr::from_ptr(mime_type) }.to_str() {
|
|
1043
|
-
Ok(s) => s,
|
|
1044
|
-
Err(e) => {
|
|
1045
|
-
set_last_error(format!("Invalid UTF-8 in MIME type: {}", e));
|
|
1046
|
-
return ptr::null_mut();
|
|
1047
|
-
}
|
|
1048
|
-
};
|
|
1049
|
-
|
|
1050
|
-
let config = if config_json.is_null() {
|
|
1051
|
-
ExtractionConfig::default()
|
|
1052
|
-
} else {
|
|
1053
|
-
let config_str = match unsafe { CStr::from_ptr(config_json) }.to_str() {
|
|
1054
|
-
Ok(s) => s,
|
|
1055
|
-
Err(e) => {
|
|
1056
|
-
set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
|
|
1057
|
-
return ptr::null_mut();
|
|
1058
|
-
}
|
|
1059
|
-
};
|
|
1060
|
-
|
|
1061
|
-
match parse_extraction_config_from_json(config_str) {
|
|
1062
|
-
Ok(cfg) => cfg,
|
|
1063
|
-
Err(e) => {
|
|
1064
|
-
set_last_error(e);
|
|
1065
|
-
return ptr::null_mut();
|
|
1066
|
-
}
|
|
1067
|
-
}
|
|
1068
|
-
};
|
|
1069
|
-
|
|
1070
|
-
match kreuzberg::extract_bytes_sync(bytes, mime_str, &config) {
|
|
1071
|
-
Ok(result) => match to_c_extraction_result(result) {
|
|
1072
|
-
Ok(ptr) => ptr,
|
|
1073
|
-
Err(e) => {
|
|
1074
|
-
set_last_error(e);
|
|
1075
|
-
ptr::null_mut()
|
|
1076
|
-
}
|
|
1077
|
-
},
|
|
1078
|
-
Err(e) => {
|
|
1079
|
-
set_last_error(e.to_string());
|
|
1080
|
-
ptr::null_mut()
|
|
1081
|
-
}
|
|
1082
|
-
}
|
|
1083
|
-
})
|
|
1084
|
-
}
|
|
1085
|
-
|
|
1086
|
-
/// C-compatible structure for passing byte array with MIME type in batch operations
///
/// One element of the `items` array consumed by
/// `kreuzberg_batch_extract_bytes_sync`, which rejects entries whose `data`
/// or `mime_type` pointer is NULL.
///
/// Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
/// Field order: 1 pointer (8 bytes) + 1 usize (8 bytes) + 1 pointer (8 bytes) = 24 bytes total
/// (these sizes assume a 64-bit target; pointer and `usize` widths differ elsewhere).
#[repr(C)]
pub struct CBytesWithMime {
    /// Pointer to byte data
    pub data: *const u8,
    /// Length of byte data
    pub data_len: usize,
    /// MIME type as null-terminated C string
    pub mime_type: *const c_char,
}
|
|
1099
|
-
|
|
1100
|
-
/// C-compatible structure for batch extraction results
///
/// Returned by the batch extraction functions and released with
/// `kreuzberg_free_batch_result`, which uses `count` to reconstruct the
/// `results` allocation — `count` must therefore always equal the number of
/// entries in `results`.
///
/// Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
/// Field order: 1 pointer (8 bytes) + 1 usize (8 bytes) + 1 bool + 7 bytes padding = 24 bytes total
/// (these sizes assume a 64-bit target; pointer and `usize` widths differ elsewhere).
#[repr(C)]
pub struct CBatchResult {
    /// Array of extraction results
    pub results: *mut *mut CExtractionResult,
    /// Number of results
    pub count: usize,
    /// Whether batch operation was successful
    pub success: bool,
    /// Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
    _padding2: [u8; 7],
}
|
|
1115
|
-
|
|
1116
|
-
/// Batch extract text and metadata from multiple files (synchronous).
|
|
1117
|
-
///
|
|
1118
|
-
/// # Safety
|
|
1119
|
-
///
|
|
1120
|
-
/// - `file_paths` must be a valid pointer to an array of null-terminated C strings
|
|
1121
|
-
/// - `count` must be the number of file paths in the array
|
|
1122
|
-
/// - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
|
|
1123
|
-
/// - The returned pointer must be freed with `kreuzberg_free_batch_result`
|
|
1124
|
-
/// - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
1125
|
-
#[unsafe(no_mangle)]
|
|
1126
|
-
pub unsafe extern "C" fn kreuzberg_batch_extract_files_sync(
|
|
1127
|
-
file_paths: *const *const c_char,
|
|
1128
|
-
count: usize,
|
|
1129
|
-
config_json: *const c_char,
|
|
1130
|
-
) -> *mut CBatchResult {
|
|
1131
|
-
ffi_panic_guard!("kreuzberg_batch_extract_files_sync", {
|
|
1132
|
-
clear_last_error();
|
|
1133
|
-
|
|
1134
|
-
if file_paths.is_null() {
|
|
1135
|
-
set_last_error("file_paths cannot be NULL".to_string());
|
|
1136
|
-
return ptr::null_mut();
|
|
1137
|
-
}
|
|
1138
|
-
|
|
1139
|
-
let config = if config_json.is_null() {
|
|
1140
|
-
ExtractionConfig::default()
|
|
1141
|
-
} else {
|
|
1142
|
-
let config_str = match unsafe { CStr::from_ptr(config_json) }.to_str() {
|
|
1143
|
-
Ok(s) => s,
|
|
1144
|
-
Err(e) => {
|
|
1145
|
-
set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
|
|
1146
|
-
return ptr::null_mut();
|
|
1147
|
-
}
|
|
1148
|
-
};
|
|
1149
|
-
|
|
1150
|
-
match parse_extraction_config_from_json(config_str) {
|
|
1151
|
-
Ok(cfg) => cfg,
|
|
1152
|
-
Err(e) => {
|
|
1153
|
-
set_last_error(e);
|
|
1154
|
-
return ptr::null_mut();
|
|
1155
|
-
}
|
|
1156
|
-
}
|
|
1157
|
-
};
|
|
1158
|
-
|
|
1159
|
-
let mut paths = Vec::with_capacity(count);
|
|
1160
|
-
for i in 0..count {
|
|
1161
|
-
let path_ptr = unsafe { *file_paths.add(i) };
|
|
1162
|
-
if path_ptr.is_null() {
|
|
1163
|
-
set_last_error(format!("File path at index {} is NULL", i));
|
|
1164
|
-
return ptr::null_mut();
|
|
1165
|
-
}
|
|
1166
|
-
|
|
1167
|
-
let path_str = match unsafe { CStr::from_ptr(path_ptr) }.to_str() {
|
|
1168
|
-
Ok(s) => s,
|
|
1169
|
-
Err(e) => {
|
|
1170
|
-
set_last_error(format!("Invalid UTF-8 in file path at index {}: {}", i, e));
|
|
1171
|
-
return ptr::null_mut();
|
|
1172
|
-
}
|
|
1173
|
-
};
|
|
1174
|
-
|
|
1175
|
-
paths.push(Path::new(path_str));
|
|
1176
|
-
}
|
|
1177
|
-
|
|
1178
|
-
match kreuzberg::batch_extract_file_sync(paths, &config) {
|
|
1179
|
-
Ok(results) => {
|
|
1180
|
-
let mut c_results = Vec::with_capacity(results.len());
|
|
1181
|
-
for result in results {
|
|
1182
|
-
match to_c_extraction_result(result) {
|
|
1183
|
-
Ok(ptr) => c_results.push(ptr),
|
|
1184
|
-
Err(e) => {
|
|
1185
|
-
for c_res in c_results {
|
|
1186
|
-
unsafe { kreuzberg_free_result(c_res) };
|
|
1187
|
-
}
|
|
1188
|
-
set_last_error(e);
|
|
1189
|
-
return ptr::null_mut();
|
|
1190
|
-
}
|
|
1191
|
-
}
|
|
1192
|
-
}
|
|
1193
|
-
|
|
1194
|
-
let results_array = c_results.into_boxed_slice();
|
|
1195
|
-
let results_ptr = Box::into_raw(results_array) as *mut *mut CExtractionResult;
|
|
1196
|
-
|
|
1197
|
-
Box::into_raw(Box::new(CBatchResult {
|
|
1198
|
-
results: results_ptr,
|
|
1199
|
-
count,
|
|
1200
|
-
success: true,
|
|
1201
|
-
_padding2: [0u8; 7],
|
|
1202
|
-
}))
|
|
1203
|
-
}
|
|
1204
|
-
Err(e) => {
|
|
1205
|
-
set_last_error(e.to_string());
|
|
1206
|
-
ptr::null_mut()
|
|
1207
|
-
}
|
|
1208
|
-
}
|
|
1209
|
-
})
|
|
1210
|
-
}
|
|
1211
|
-
|
|
1212
|
-
/// Batch extract text and metadata from multiple byte arrays (synchronous).
|
|
1213
|
-
///
|
|
1214
|
-
/// # Safety
|
|
1215
|
-
///
|
|
1216
|
-
/// - `items` must be a valid pointer to an array of CBytesWithMime structures
|
|
1217
|
-
/// - `count` must be the number of items in the array
|
|
1218
|
-
/// - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
|
|
1219
|
-
/// - The returned pointer must be freed with `kreuzberg_free_batch_result`
|
|
1220
|
-
/// - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
1221
|
-
#[unsafe(no_mangle)]
|
|
1222
|
-
pub unsafe extern "C" fn kreuzberg_batch_extract_bytes_sync(
|
|
1223
|
-
items: *const CBytesWithMime,
|
|
1224
|
-
count: usize,
|
|
1225
|
-
config_json: *const c_char,
|
|
1226
|
-
) -> *mut CBatchResult {
|
|
1227
|
-
ffi_panic_guard!("kreuzberg_batch_extract_bytes_sync", {
|
|
1228
|
-
clear_last_error();
|
|
1229
|
-
|
|
1230
|
-
if items.is_null() {
|
|
1231
|
-
set_last_error("items cannot be NULL".to_string());
|
|
1232
|
-
return ptr::null_mut();
|
|
1233
|
-
}
|
|
1234
|
-
|
|
1235
|
-
let config = if config_json.is_null() {
|
|
1236
|
-
ExtractionConfig::default()
|
|
1237
|
-
} else {
|
|
1238
|
-
let config_str = match unsafe { CStr::from_ptr(config_json) }.to_str() {
|
|
1239
|
-
Ok(s) => s,
|
|
1240
|
-
Err(e) => {
|
|
1241
|
-
set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
|
|
1242
|
-
return ptr::null_mut();
|
|
1243
|
-
}
|
|
1244
|
-
};
|
|
1245
|
-
|
|
1246
|
-
match parse_extraction_config_from_json(config_str) {
|
|
1247
|
-
Ok(cfg) => cfg,
|
|
1248
|
-
Err(e) => {
|
|
1249
|
-
set_last_error(e);
|
|
1250
|
-
return ptr::null_mut();
|
|
1251
|
-
}
|
|
1252
|
-
}
|
|
1253
|
-
};
|
|
1254
|
-
|
|
1255
|
-
let mut contents = Vec::with_capacity(count);
|
|
1256
|
-
for i in 0..count {
|
|
1257
|
-
let item = unsafe { &*items.add(i) };
|
|
1258
|
-
|
|
1259
|
-
if item.data.is_null() {
|
|
1260
|
-
set_last_error(format!("Data at index {} is NULL", i));
|
|
1261
|
-
return ptr::null_mut();
|
|
1262
|
-
}
|
|
1263
|
-
|
|
1264
|
-
if item.mime_type.is_null() {
|
|
1265
|
-
set_last_error(format!("MIME type at index {} is NULL", i));
|
|
1266
|
-
return ptr::null_mut();
|
|
1267
|
-
}
|
|
1268
|
-
|
|
1269
|
-
let bytes = unsafe { std::slice::from_raw_parts(item.data, item.data_len) };
|
|
1270
|
-
|
|
1271
|
-
let mime_str = match unsafe { CStr::from_ptr(item.mime_type) }.to_str() {
|
|
1272
|
-
Ok(s) => s,
|
|
1273
|
-
Err(e) => {
|
|
1274
|
-
set_last_error(format!("Invalid UTF-8 in MIME type at index {}: {}", i, e));
|
|
1275
|
-
return ptr::null_mut();
|
|
1276
|
-
}
|
|
1277
|
-
};
|
|
1278
|
-
|
|
1279
|
-
contents.push((bytes, mime_str));
|
|
1280
|
-
}
|
|
1281
|
-
|
|
1282
|
-
match kreuzberg::batch_extract_bytes_sync(contents, &config) {
|
|
1283
|
-
Ok(results) => {
|
|
1284
|
-
let mut c_results = Vec::with_capacity(results.len());
|
|
1285
|
-
for result in results {
|
|
1286
|
-
match to_c_extraction_result(result) {
|
|
1287
|
-
Ok(ptr) => c_results.push(ptr),
|
|
1288
|
-
Err(e) => {
|
|
1289
|
-
for c_res in c_results {
|
|
1290
|
-
unsafe { kreuzberg_free_result(c_res) };
|
|
1291
|
-
}
|
|
1292
|
-
set_last_error(e);
|
|
1293
|
-
return ptr::null_mut();
|
|
1294
|
-
}
|
|
1295
|
-
}
|
|
1296
|
-
}
|
|
1297
|
-
|
|
1298
|
-
let results_array = c_results.into_boxed_slice();
|
|
1299
|
-
let results_ptr = Box::into_raw(results_array) as *mut *mut CExtractionResult;
|
|
1300
|
-
|
|
1301
|
-
Box::into_raw(Box::new(CBatchResult {
|
|
1302
|
-
results: results_ptr,
|
|
1303
|
-
count,
|
|
1304
|
-
success: true,
|
|
1305
|
-
_padding2: [0u8; 7],
|
|
1306
|
-
}))
|
|
1307
|
-
}
|
|
1308
|
-
Err(e) => {
|
|
1309
|
-
set_last_error(e.to_string());
|
|
1310
|
-
ptr::null_mut()
|
|
1311
|
-
}
|
|
1312
|
-
}
|
|
1313
|
-
})
|
|
1314
|
-
}
|
|
1315
|
-
|
|
1316
|
-
/// Load an extraction configuration from a TOML/YAML/JSON file.
|
|
1317
|
-
///
|
|
1318
|
-
/// # Safety
|
|
1319
|
-
///
|
|
1320
|
-
/// - `file_path` must be a valid null-terminated C string
|
|
1321
|
-
/// - The returned string must be freed with `kreuzberg_free_string`
|
|
1322
|
-
/// - Returns NULL on error (check `kreuzberg_last_error`)
|
|
1323
|
-
#[unsafe(no_mangle)]
|
|
1324
|
-
pub unsafe extern "C" fn kreuzberg_load_extraction_config_from_file(file_path: *const c_char) -> *mut c_char {
|
|
1325
|
-
ffi_panic_guard!("kreuzberg_load_extraction_config_from_file", {
|
|
1326
|
-
clear_last_error();
|
|
1327
|
-
|
|
1328
|
-
if file_path.is_null() {
|
|
1329
|
-
set_last_error("file_path cannot be NULL".to_string());
|
|
1330
|
-
return ptr::null_mut();
|
|
1331
|
-
}
|
|
1332
|
-
|
|
1333
|
-
let path_str = match unsafe { CStr::from_ptr(file_path) }.to_str() {
|
|
1334
|
-
Ok(s) => s,
|
|
1335
|
-
Err(e) => {
|
|
1336
|
-
set_last_error(format!("Invalid UTF-8 in file path: {}", e));
|
|
1337
|
-
return ptr::null_mut();
|
|
1338
|
-
}
|
|
1339
|
-
};
|
|
1340
|
-
|
|
1341
|
-
match ExtractionConfig::from_file(path_str) {
|
|
1342
|
-
Ok(config) => match serde_json::to_string(&config) {
|
|
1343
|
-
Ok(json) => match CString::new(json) {
|
|
1344
|
-
Ok(cstr) => cstr.into_raw(),
|
|
1345
|
-
Err(e) => {
|
|
1346
|
-
set_last_error(format!("Failed to create C string: {}", e));
|
|
1347
|
-
ptr::null_mut()
|
|
1348
|
-
}
|
|
1349
|
-
},
|
|
1350
|
-
Err(e) => {
|
|
1351
|
-
set_last_error(format!("Failed to serialize config to JSON: {}", e));
|
|
1352
|
-
ptr::null_mut()
|
|
1353
|
-
}
|
|
1354
|
-
},
|
|
1355
|
-
Err(e) => {
|
|
1356
|
-
set_last_error(e.to_string());
|
|
1357
|
-
ptr::null_mut()
|
|
1358
|
-
}
|
|
1359
|
-
}
|
|
1360
|
-
})
|
|
1361
|
-
}
|
|
1362
|
-
|
|
1363
|
-
/// Free a batch result returned by batch extraction functions.
///
/// Releases the `CBatchResult` struct itself and the pointer array stored in
/// its `results` field. The individual `CExtractionResult` entries that the
/// array points to are NOT freed by this function.
///
/// # Safety
///
/// - `batch_result` must be a pointer previously returned by a batch extraction function
/// - `batch_result` can be NULL (no-op)
/// - `batch_result` must not be used after this call
/// - The `results`/`count` fields must be unmodified, since they are used to
///   reconstruct the original allocation
/// - Each entry of `results` remains the caller's responsibility; read (and
///   release, e.g. with `kreuzberg_free_result`) the entries before calling
///   this function, because the array holding them is deallocated here
#[unsafe(no_mangle)]
pub unsafe extern "C" fn kreuzberg_free_batch_result(batch_result: *mut CBatchResult) {
    if !batch_result.is_null() {
        // Retake ownership of the struct so it is dropped at the end of this scope.
        let batch = unsafe { Box::from_raw(batch_result) };

        // NOTE: Do not free individual results here - calling code is responsible for that.

        if !batch.results.is_null() {
            unsafe {
                // Rebuild the boxed slice of result pointers (length `count`)
                // so that dropping it deallocates the array itself.
                let _results_array = Box::from_raw(std::ptr::slice_from_raw_parts_mut(batch.results, batch.count));
            };
        }
    }
}
|
|
1385
|
-
|
|
1386
|
-
/// Release a C string that was allocated by a Kreuzberg function.
///
/// # Safety
///
/// - `s` must be a string previously returned by a Kreuzberg function
/// - `s` can be NULL (no-op)
/// - `s` must not be used after this call
///
/// # Example (C)
///
/// ```c
/// char* mime = kreuzberg_detect_mime_type("/path/to/document.pdf", true);
/// if (mime != NULL) {
///     printf("MIME type: %s\n", mime);
///     kreuzberg_free_string(mime);
///     // mime is now invalid
/// }
/// ```
#[unsafe(no_mangle)]
pub unsafe extern "C" fn kreuzberg_free_string(s: *mut c_char) {
    if s.is_null() {
        return;
    }

    // SAFETY: `s` is non-null and, per the contract above, was produced by
    // `CString::into_raw` inside this library and has not been freed yet.
    let owned = unsafe { CString::from_raw(s) };
    drop(owned);
}
|
|
1407
|
-
|
|
1408
|
-
/// Clone a null-terminated string using Rust's allocator.
|
|
1409
|
-
///
|
|
1410
|
-
/// # Safety
|
|
1411
|
-
///
|
|
1412
|
-
/// - `s` must be a valid null-terminated UTF-8 string
|
|
1413
|
-
/// - Returned pointer must be freed with `kreuzberg_free_string`
|
|
1414
|
-
/// - Returns NULL on error (check `kreuzberg_last_error`)
|
|
1415
|
-
#[unsafe(no_mangle)]
|
|
1416
|
-
pub unsafe extern "C" fn kreuzberg_clone_string(s: *const c_char) -> *mut c_char {
|
|
1417
|
-
ffi_panic_guard!("kreuzberg_clone_string", {
|
|
1418
|
-
clear_last_error();
|
|
1419
|
-
|
|
1420
|
-
if s.is_null() {
|
|
1421
|
-
set_last_error("Input string cannot be NULL".to_string());
|
|
1422
|
-
return ptr::null_mut();
|
|
1423
|
-
}
|
|
1424
|
-
|
|
1425
|
-
let raw = match unsafe { CStr::from_ptr(s) }.to_str() {
|
|
1426
|
-
Ok(val) => val,
|
|
1427
|
-
Err(e) => {
|
|
1428
|
-
set_last_error(format!("Invalid UTF-8 in string: {}", e));
|
|
1429
|
-
return ptr::null_mut();
|
|
1430
|
-
}
|
|
1431
|
-
};
|
|
1432
|
-
|
|
1433
|
-
match CString::new(raw) {
|
|
1434
|
-
Ok(cstr) => cstr.into_raw(),
|
|
1435
|
-
Err(e) => {
|
|
1436
|
-
set_last_error(format!("Failed to clone string: {}", e));
|
|
1437
|
-
ptr::null_mut()
|
|
1438
|
-
}
|
|
1439
|
-
}
|
|
1440
|
-
})
|
|
1441
|
-
}
|
|
1442
|
-
|
|
1443
|
-
/// Free an extraction result returned by `kreuzberg_extract_file_sync`.
|
|
1444
|
-
///
|
|
1445
|
-
/// # Safety
|
|
1446
|
-
///
|
|
1447
|
-
/// - `result` must be a pointer previously returned by `kreuzberg_extract_file_sync`
|
|
1448
|
-
/// - `result` can be NULL (no-op)
|
|
1449
|
-
/// - `result` must not be used after this call
|
|
1450
|
-
/// - All string fields within the result will be freed automatically
|
|
1451
|
-
///
|
|
1452
|
-
/// # Example (C)
|
|
1453
|
-
///
|
|
1454
|
-
/// ```c
|
|
1455
|
-
/// CExtractionResult* result = kreuzberg_extract_file_sync(path);
|
|
1456
|
-
/// // Use result...
|
|
1457
|
-
/// kreuzberg_free_result(result);
|
|
1458
|
-
/// // result is now invalid
|
|
1459
|
-
/// ```
|
|
1460
|
-
#[unsafe(no_mangle)]
|
|
1461
|
-
pub unsafe extern "C" fn kreuzberg_free_result(result: *mut CExtractionResult) {
|
|
1462
|
-
if !result.is_null() {
|
|
1463
|
-
let result_box = unsafe { Box::from_raw(result) };
|
|
1464
|
-
|
|
1465
|
-
if !result_box.content.is_null() {
|
|
1466
|
-
unsafe { drop(CString::from_raw(result_box.content)) };
|
|
1467
|
-
}
|
|
1468
|
-
if !result_box.mime_type.is_null() {
|
|
1469
|
-
unsafe { drop(CString::from_raw(result_box.mime_type)) };
|
|
1470
|
-
}
|
|
1471
|
-
if !result_box.language.is_null() {
|
|
1472
|
-
unsafe { drop(CString::from_raw(result_box.language)) };
|
|
1473
|
-
}
|
|
1474
|
-
if !result_box.date.is_null() {
|
|
1475
|
-
unsafe { drop(CString::from_raw(result_box.date)) };
|
|
1476
|
-
}
|
|
1477
|
-
if !result_box.subject.is_null() {
|
|
1478
|
-
unsafe { drop(CString::from_raw(result_box.subject)) };
|
|
1479
|
-
}
|
|
1480
|
-
if !result_box.tables_json.is_null() {
|
|
1481
|
-
unsafe { drop(CString::from_raw(result_box.tables_json)) };
|
|
1482
|
-
}
|
|
1483
|
-
if !result_box.detected_languages_json.is_null() {
|
|
1484
|
-
unsafe { drop(CString::from_raw(result_box.detected_languages_json)) };
|
|
1485
|
-
}
|
|
1486
|
-
if !result_box.metadata_json.is_null() {
|
|
1487
|
-
unsafe { drop(CString::from_raw(result_box.metadata_json)) };
|
|
1488
|
-
}
|
|
1489
|
-
if !result_box.chunks_json.is_null() {
|
|
1490
|
-
unsafe { drop(CString::from_raw(result_box.chunks_json)) };
|
|
1491
|
-
}
|
|
1492
|
-
if !result_box.images_json.is_null() {
|
|
1493
|
-
unsafe { drop(CString::from_raw(result_box.images_json)) };
|
|
1494
|
-
}
|
|
1495
|
-
}
|
|
1496
|
-
}
|
|
1497
|
-
|
|
1498
|
-
/// Get the last error message from a failed operation.
|
|
1499
|
-
///
|
|
1500
|
-
/// # Safety
|
|
1501
|
-
///
|
|
1502
|
-
/// - Returns a static string that does not need to be freed
|
|
1503
|
-
/// - Returns NULL if no error has occurred
|
|
1504
|
-
/// - The returned string is valid until the next Kreuzberg function call on the same thread
|
|
1505
|
-
///
|
|
1506
|
-
/// # Example (C)
|
|
1507
|
-
///
|
|
1508
|
-
/// ```c
|
|
1509
|
-
/// CExtractionResult* result = kreuzberg_extract_file_sync(path);
|
|
1510
|
-
/// if (result == NULL) {
|
|
1511
|
-
/// const char* error = kreuzberg_last_error();
|
|
1512
|
-
/// if (error != NULL) {
|
|
1513
|
-
/// printf("Error: %s\n", error);
|
|
1514
|
-
/// }
|
|
1515
|
-
/// }
|
|
1516
|
-
/// ```
|
|
1517
|
-
#[unsafe(no_mangle)]
|
|
1518
|
-
pub unsafe extern "C" fn kreuzberg_last_error() -> *const c_char {
|
|
1519
|
-
LAST_ERROR_C_STRING.with(|last| match &*last.borrow() {
|
|
1520
|
-
Some(c_str) => c_str.as_ptr(),
|
|
1521
|
-
None => ptr::null(),
|
|
1522
|
-
})
|
|
1523
|
-
}
|
|
1524
|
-
|
|
1525
|
-
/// Get the error code for the last error.
|
|
1526
|
-
///
|
|
1527
|
-
/// Returns the error code as an i32. Error codes are defined in ErrorCode enum:
|
|
1528
|
-
/// - 0: Success (no error)
|
|
1529
|
-
/// - 1: GenericError
|
|
1530
|
-
/// - 2: Panic
|
|
1531
|
-
/// - 3: InvalidArgument
|
|
1532
|
-
/// - 4: IoError
|
|
1533
|
-
/// - 5: ParsingError
|
|
1534
|
-
/// - 6: OcrError
|
|
1535
|
-
/// - 7: MissingDependency
|
|
1536
|
-
///
|
|
1537
|
-
/// # Safety
|
|
1538
|
-
///
|
|
1539
|
-
/// This function is thread-safe and always safe to call.
|
|
1540
|
-
///
|
|
1541
|
-
/// # Example (C)
|
|
1542
|
-
///
|
|
1543
|
-
/// ```c
|
|
1544
|
-
/// CExtractionResult* result = kreuzberg_extract_file_sync(path);
|
|
1545
|
-
/// if (result == NULL) {
|
|
1546
|
-
/// int32_t code = kreuzberg_last_error_code();
|
|
1547
|
-
/// if (code == 2) {
|
|
1548
|
-
/// // A panic occurred
|
|
1549
|
-
/// }
|
|
1550
|
-
/// }
|
|
1551
|
-
/// ```
|
|
1552
|
-
#[unsafe(no_mangle)]
|
|
1553
|
-
pub unsafe extern "C" fn kreuzberg_last_error_code() -> i32 {
|
|
1554
|
-
get_last_error_code() as i32
|
|
1555
|
-
}
|
|
1556
|
-
|
|
1557
|
-
/// Get the panic context for the last error (if it was a panic).
|
|
1558
|
-
///
|
|
1559
|
-
/// Returns a JSON string containing panic context information, or NULL if
|
|
1560
|
-
/// the last error was not a panic.
|
|
1561
|
-
///
|
|
1562
|
-
/// The JSON structure contains:
|
|
1563
|
-
/// - file: Source file where panic occurred
|
|
1564
|
-
/// - line: Line number
|
|
1565
|
-
/// - function: Function name
|
|
1566
|
-
/// - message: Panic message
|
|
1567
|
-
/// - timestamp_secs: Unix timestamp (seconds since epoch)
|
|
1568
|
-
///
|
|
1569
|
-
/// # Safety
|
|
1570
|
-
///
|
|
1571
|
-
/// The returned string must be freed with kreuzberg_free_string().
|
|
1572
|
-
///
|
|
1573
|
-
/// # Example (C)
|
|
1574
|
-
///
|
|
1575
|
-
/// ```c
|
|
1576
|
-
/// CExtractionResult* result = kreuzberg_extract_file_sync(path);
|
|
1577
|
-
/// if (result == NULL && kreuzberg_last_error_code() == 2) {
|
|
1578
|
-
/// const char* context = kreuzberg_last_panic_context();
|
|
1579
|
-
/// if (context != NULL) {
|
|
1580
|
-
/// printf("Panic context: %s\n", context);
|
|
1581
|
-
/// kreuzberg_free_string((char*)context);
|
|
1582
|
-
/// }
|
|
1583
|
-
/// }
|
|
1584
|
-
/// ```
|
|
1585
|
-
#[unsafe(no_mangle)]
|
|
1586
|
-
pub unsafe extern "C" fn kreuzberg_last_panic_context() -> *mut c_char {
|
|
1587
|
-
ffi_panic_guard!("kreuzberg_last_panic_context", {
|
|
1588
|
-
match get_last_panic_context() {
|
|
1589
|
-
Some(ctx) => {
|
|
1590
|
-
use std::time::UNIX_EPOCH;
|
|
1591
|
-
|
|
1592
|
-
let timestamp_secs = ctx
|
|
1593
|
-
.timestamp
|
|
1594
|
-
.duration_since(UNIX_EPOCH)
|
|
1595
|
-
.map(|d| d.as_secs())
|
|
1596
|
-
.unwrap_or(0);
|
|
1597
|
-
|
|
1598
|
-
let json_value = serde_json::json!({
|
|
1599
|
-
"file": ctx.file,
|
|
1600
|
-
"line": ctx.line,
|
|
1601
|
-
"function": ctx.function,
|
|
1602
|
-
"message": ctx.message,
|
|
1603
|
-
"timestamp_secs": timestamp_secs
|
|
1604
|
-
});
|
|
1605
|
-
|
|
1606
|
-
match serde_json::to_string(&json_value) {
|
|
1607
|
-
Ok(json) => match CString::new(json) {
|
|
1608
|
-
Ok(c_str) => c_str.into_raw(),
|
|
1609
|
-
Err(_) => ptr::null_mut(),
|
|
1610
|
-
},
|
|
1611
|
-
Err(_) => ptr::null_mut(),
|
|
1612
|
-
}
|
|
1613
|
-
}
|
|
1614
|
-
None => ptr::null_mut(),
|
|
1615
|
-
}
|
|
1616
|
-
})
|
|
1617
|
-
}
|
|
1618
|
-
|
|
1619
|
-
/// Get the library version string.
|
|
1620
|
-
///
|
|
1621
|
-
/// # Safety
|
|
1622
|
-
///
|
|
1623
|
-
/// - Returns a static string that does not need to be freed
|
|
1624
|
-
/// - The returned string is always valid
|
|
1625
|
-
///
|
|
1626
|
-
/// # Example (C)
|
|
1627
|
-
///
|
|
1628
|
-
/// ```c
|
|
1629
|
-
/// const char* version = kreuzberg_version();
|
|
1630
|
-
/// printf("Kreuzberg version: %s\n", version);
|
|
1631
|
-
/// ```
|
|
1632
|
-
#[unsafe(no_mangle)]
|
|
1633
|
-
pub unsafe extern "C" fn kreuzberg_version() -> *const c_char {
|
|
1634
|
-
concat!(env!("CARGO_PKG_VERSION"), "\0").as_ptr() as *const c_char
|
|
1635
|
-
}
|
|
1636
|
-
|
|
1637
|
-
/// Signature of the C callback that implements a custom OCR backend.
///
/// # Parameters
///
/// - `image_bytes`: Pointer to the image data
/// - `image_length`: Length of the image data in bytes
/// - `config_json`: JSON-encoded OcrConfig (null-terminated string)
///
/// # Returns
///
/// A null-terminated string with the extracted text, or NULL on error.
/// The Rust side releases a non-NULL result through `kreuzberg_free_string`.
///
/// # Safety
///
/// The callback must:
/// - Not keep the `image_bytes` pointer (it is only valid for the duration of the call)
/// - Return valid null-terminated UTF-8 that `kreuzberg_free_string` is able to release
/// - Return NULL on error (the error message should be retrievable separately)
type OcrBackendCallback = unsafe extern "C" fn(
    image_bytes: *const u8,
    image_length: usize,
    config_json: *const c_char,
) -> *mut c_char;
|
|
1658
|
-
|
|
1659
|
-
fn parse_languages_from_json(languages_json: *const c_char) -> FfiResult<Option<Vec<String>>> {
|
|
1660
|
-
if languages_json.is_null() {
|
|
1661
|
-
return Ok(None);
|
|
1662
|
-
}
|
|
1663
|
-
|
|
1664
|
-
let raw = unsafe { CStr::from_ptr(languages_json) }
|
|
1665
|
-
.to_str()
|
|
1666
|
-
.map_err(|e| format!("Invalid UTF-8 in languages JSON: {}", e))?;
|
|
1667
|
-
|
|
1668
|
-
if raw.trim().is_empty() {
|
|
1669
|
-
return Ok(None);
|
|
1670
|
-
}
|
|
1671
|
-
|
|
1672
|
-
let langs: Vec<String> = serde_json::from_str(raw).map_err(|e| format!("Failed to parse languages JSON: {}", e))?;
|
|
1673
|
-
|
|
1674
|
-
if langs.is_empty() {
|
|
1675
|
-
return Ok(None);
|
|
1676
|
-
}
|
|
1677
|
-
|
|
1678
|
-
let normalized = langs
|
|
1679
|
-
.into_iter()
|
|
1680
|
-
.map(|l| l.trim().to_string())
|
|
1681
|
-
.filter(|l| !l.is_empty())
|
|
1682
|
-
.collect::<Vec<_>>();
|
|
1683
|
-
|
|
1684
|
-
if normalized.is_empty() {
|
|
1685
|
-
return Ok(None);
|
|
1686
|
-
}
|
|
1687
|
-
|
|
1688
|
-
Ok(Some(normalized))
|
|
1689
|
-
}
|
|
1690
|
-
|
|
1691
|
-
/// FFI wrapper for custom OCR backends registered from Java/C.
|
|
1692
|
-
///
|
|
1693
|
-
/// This struct wraps a C function pointer and implements the OcrBackend trait,
|
|
1694
|
-
/// allowing custom OCR implementations from FFI languages to be registered
|
|
1695
|
-
/// and used within the Rust extraction pipeline.
|
|
1696
|
-
struct FfiOcrBackend {
|
|
1697
|
-
name: String,
|
|
1698
|
-
callback: OcrBackendCallback,
|
|
1699
|
-
supported_languages: Option<Vec<String>>,
|
|
1700
|
-
}
|
|
1701
|
-
|
|
1702
|
-
impl FfiOcrBackend {
|
|
1703
|
-
fn new(name: String, callback: OcrBackendCallback, supported_languages: Option<Vec<String>>) -> Self {
|
|
1704
|
-
Self {
|
|
1705
|
-
name,
|
|
1706
|
-
callback,
|
|
1707
|
-
supported_languages,
|
|
1708
|
-
}
|
|
1709
|
-
}
|
|
1710
|
-
}
|
|
1711
|
-
|
|
1712
|
-
impl Plugin for FfiOcrBackend {
    /// Name supplied when the backend was constructed.
    fn name(&self) -> &str {
        self.name.as_str()
    }

    /// Fixed version label shared by all FFI-backed plugins.
    fn version(&self) -> String {
        String::from("ffi-1.0.0")
    }

    /// Nothing to set up on the Rust side; always succeeds.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to tear down on the Rust side; always succeeds.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
|
|
1729
|
-
|
|
1730
|
-
#[async_trait]
|
|
1731
|
-
impl OcrBackend for FfiOcrBackend {
|
|
1732
|
-
async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> Result<ExtractionResult> {
|
|
1733
|
-
let config_json = serde_json::to_string(config).map_err(|e| KreuzbergError::Validation {
|
|
1734
|
-
message: format!("Failed to serialize OCR config: {}", e),
|
|
1735
|
-
source: Some(Box::new(e)),
|
|
1736
|
-
})?;
|
|
1737
|
-
|
|
1738
|
-
let callback = self.callback;
|
|
1739
|
-
let image_data = image_bytes.to_vec();
|
|
1740
|
-
let config_json_owned = config_json.clone();
|
|
1741
|
-
|
|
1742
|
-
let result_text = tokio::task::spawn_blocking(move || {
|
|
1743
|
-
let config_cstring = CString::new(config_json_owned).map_err(|e| KreuzbergError::Validation {
|
|
1744
|
-
message: format!("Failed to create C string from config JSON: {}", e),
|
|
1745
|
-
source: Some(Box::new(e)),
|
|
1746
|
-
})?;
|
|
1747
|
-
|
|
1748
|
-
let result_ptr = unsafe { callback(image_data.as_ptr(), image_data.len(), config_cstring.as_ptr()) };
|
|
1749
|
-
|
|
1750
|
-
if result_ptr.is_null() {
|
|
1751
|
-
return Err(KreuzbergError::Ocr {
|
|
1752
|
-
message: "OCR backend returned NULL (operation failed)".to_string(),
|
|
1753
|
-
source: None,
|
|
1754
|
-
});
|
|
1755
|
-
}
|
|
1756
|
-
|
|
1757
|
-
let result_cstr = unsafe { CStr::from_ptr(result_ptr) };
|
|
1758
|
-
let text = result_cstr
|
|
1759
|
-
.to_str()
|
|
1760
|
-
.map_err(|e| KreuzbergError::Ocr {
|
|
1761
|
-
message: format!("OCR backend returned invalid UTF-8: {}", e),
|
|
1762
|
-
source: Some(Box::new(e)),
|
|
1763
|
-
})?
|
|
1764
|
-
.to_string();
|
|
1765
|
-
|
|
1766
|
-
unsafe { kreuzberg_free_string(result_ptr) };
|
|
1767
|
-
|
|
1768
|
-
Ok(text)
|
|
1769
|
-
})
|
|
1770
|
-
.await
|
|
1771
|
-
.map_err(|e| KreuzbergError::Ocr {
|
|
1772
|
-
message: format!("OCR backend task panicked: {}", e),
|
|
1773
|
-
source: Some(Box::new(e)),
|
|
1774
|
-
})??;
|
|
1775
|
-
|
|
1776
|
-
Ok(ExtractionResult {
|
|
1777
|
-
content: result_text,
|
|
1778
|
-
mime_type: "text/plain".to_string(),
|
|
1779
|
-
metadata: kreuzberg::types::Metadata::default(),
|
|
1780
|
-
tables: vec![],
|
|
1781
|
-
detected_languages: None,
|
|
1782
|
-
chunks: None,
|
|
1783
|
-
images: None,
|
|
1784
|
-
pages: None,
|
|
1785
|
-
})
|
|
1786
|
-
}
|
|
1787
|
-
|
|
1788
|
-
fn supports_language(&self, _lang: &str) -> bool {
|
|
1789
|
-
match &self.supported_languages {
|
|
1790
|
-
Some(langs) => langs.iter().any(|candidate| candidate.eq_ignore_ascii_case(_lang)),
|
|
1791
|
-
None => true,
|
|
1792
|
-
}
|
|
1793
|
-
}
|
|
1794
|
-
|
|
1795
|
-
fn backend_type(&self) -> kreuzberg::plugins::OcrBackendType {
|
|
1796
|
-
kreuzberg::plugins::OcrBackendType::Custom
|
|
1797
|
-
}
|
|
1798
|
-
}
|
|
1799
|
-
|
|
1800
|
-
/// Register a custom OCR backend via FFI callback.
|
|
1801
|
-
///
|
|
1802
|
-
/// # Safety
|
|
1803
|
-
///
|
|
1804
|
-
/// - `name` must be a valid null-terminated C string
|
|
1805
|
-
/// - `callback` must be a valid function pointer that:
|
|
1806
|
-
/// - Does not store the image_bytes pointer
|
|
1807
|
-
/// - Returns a null-terminated UTF-8 string or NULL on error
|
|
1808
|
-
/// - The returned string must be freeable by kreuzberg_free_string
|
|
1809
|
-
/// - Returns true on success, false on error (check kreuzberg_last_error)
|
|
1810
|
-
///
|
|
1811
|
-
/// # Example (C)
|
|
1812
|
-
///
|
|
1813
|
-
/// ```c
|
|
1814
|
-
/// char* my_ocr_backend(const uint8_t* image_bytes, size_t image_length, const char* config_json) {
|
|
1815
|
-
/// // Implement OCR logic here
|
|
1816
|
-
/// // Return allocated string with result, or NULL on error
|
|
1817
|
-
/// return strdup("Extracted text");
|
|
1818
|
-
/// }
|
|
1819
|
-
///
|
|
1820
|
-
/// bool success = kreuzberg_register_ocr_backend("my-ocr", my_ocr_backend);
|
|
1821
|
-
/// if (!success) {
|
|
1822
|
-
/// const char* error = kreuzberg_last_error();
|
|
1823
|
-
/// printf("Failed to register: %s\n", error);
|
|
1824
|
-
/// }
|
|
1825
|
-
/// ```
|
|
1826
|
-
#[unsafe(no_mangle)]
|
|
1827
|
-
pub unsafe extern "C" fn kreuzberg_register_ocr_backend(name: *const c_char, callback: OcrBackendCallback) -> bool {
|
|
1828
|
-
ffi_panic_guard_bool!("kreuzberg_register_ocr_backend", {
|
|
1829
|
-
clear_last_error();
|
|
1830
|
-
|
|
1831
|
-
if name.is_null() {
|
|
1832
|
-
set_last_error("Backend name cannot be NULL".to_string());
|
|
1833
|
-
return false;
|
|
1834
|
-
}
|
|
1835
|
-
|
|
1836
|
-
let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
|
|
1837
|
-
Ok(s) => s,
|
|
1838
|
-
Err(e) => {
|
|
1839
|
-
set_last_error(format!("Invalid UTF-8 in backend name: {}", e));
|
|
1840
|
-
return false;
|
|
1841
|
-
}
|
|
1842
|
-
};
|
|
1843
|
-
|
|
1844
|
-
if name_str.is_empty() {
|
|
1845
|
-
set_last_error("Plugin name cannot be empty".to_string());
|
|
1846
|
-
return false;
|
|
1847
|
-
}
|
|
1848
|
-
|
|
1849
|
-
if name_str.chars().any(|c| c.is_whitespace()) {
|
|
1850
|
-
set_last_error("Plugin name cannot contain whitespace".to_string());
|
|
1851
|
-
return false;
|
|
1852
|
-
}
|
|
1853
|
-
|
|
1854
|
-
let backend = Arc::new(FfiOcrBackend::new(name_str.to_string(), callback, None));
|
|
1855
|
-
|
|
1856
|
-
let registry = get_ocr_backend_registry();
|
|
1857
|
-
let mut registry_guard = match registry.write() {
|
|
1858
|
-
Ok(guard) => guard,
|
|
1859
|
-
Err(e) => {
|
|
1860
|
-
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
1861
|
-
set_last_error(format!("Failed to acquire registry write lock: {}", e));
|
|
1862
|
-
return false;
|
|
1863
|
-
}
|
|
1864
|
-
};
|
|
1865
|
-
|
|
1866
|
-
match registry_guard.register(backend) {
|
|
1867
|
-
Ok(()) => true,
|
|
1868
|
-
Err(e) => {
|
|
1869
|
-
set_last_error(format!("Failed to register OCR backend: {}", e));
|
|
1870
|
-
false
|
|
1871
|
-
}
|
|
1872
|
-
}
|
|
1873
|
-
})
|
|
1874
|
-
}
|
|
1875
|
-
|
|
1876
|
-
/// Register a custom OCR backend with explicit language support via FFI callback.
|
|
1877
|
-
///
|
|
1878
|
-
/// # Safety
|
|
1879
|
-
///
|
|
1880
|
-
/// - `languages_json` must be a null-terminated JSON array of language codes or NULL
|
|
1881
|
-
/// - See `kreuzberg_register_ocr_backend` for additional safety notes.
|
|
1882
|
-
#[unsafe(no_mangle)]
|
|
1883
|
-
pub unsafe extern "C" fn kreuzberg_register_ocr_backend_with_languages(
|
|
1884
|
-
name: *const c_char,
|
|
1885
|
-
callback: OcrBackendCallback,
|
|
1886
|
-
languages_json: *const c_char,
|
|
1887
|
-
) -> bool {
|
|
1888
|
-
ffi_panic_guard_bool!("kreuzberg_register_ocr_backend_with_languages", {
|
|
1889
|
-
clear_last_error();
|
|
1890
|
-
|
|
1891
|
-
if name.is_null() {
|
|
1892
|
-
set_last_error("Backend name cannot be NULL".to_string());
|
|
1893
|
-
return false;
|
|
1894
|
-
}
|
|
1895
|
-
|
|
1896
|
-
let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
|
|
1897
|
-
Ok(s) => s,
|
|
1898
|
-
Err(e) => {
|
|
1899
|
-
set_last_error(format!("Invalid UTF-8 in backend name: {}", e));
|
|
1900
|
-
return false;
|
|
1901
|
-
}
|
|
1902
|
-
};
|
|
1903
|
-
|
|
1904
|
-
if name_str.is_empty() {
|
|
1905
|
-
set_last_error("Plugin name cannot be empty".to_string());
|
|
1906
|
-
return false;
|
|
1907
|
-
}
|
|
1908
|
-
|
|
1909
|
-
if name_str.chars().any(|c| c.is_whitespace()) {
|
|
1910
|
-
set_last_error("Plugin name cannot contain whitespace".to_string());
|
|
1911
|
-
return false;
|
|
1912
|
-
}
|
|
1913
|
-
|
|
1914
|
-
let supported_languages = match parse_languages_from_json(languages_json) {
|
|
1915
|
-
Ok(langs) => langs,
|
|
1916
|
-
Err(e) => {
|
|
1917
|
-
set_last_error(e);
|
|
1918
|
-
return false;
|
|
1919
|
-
}
|
|
1920
|
-
};
|
|
1921
|
-
|
|
1922
|
-
let backend = Arc::new(FfiOcrBackend::new(name_str.to_string(), callback, supported_languages));
|
|
1923
|
-
|
|
1924
|
-
let registry = get_ocr_backend_registry();
|
|
1925
|
-
let mut registry_guard = match registry.write() {
|
|
1926
|
-
Ok(guard) => guard,
|
|
1927
|
-
Err(e) => {
|
|
1928
|
-
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
1929
|
-
set_last_error(format!("Failed to acquire registry write lock: {}", e));
|
|
1930
|
-
return false;
|
|
1931
|
-
}
|
|
1932
|
-
};
|
|
1933
|
-
|
|
1934
|
-
match registry_guard.register(backend) {
|
|
1935
|
-
Ok(()) => true,
|
|
1936
|
-
Err(e) => {
|
|
1937
|
-
set_last_error(format!("Failed to register OCR backend: {}", e));
|
|
1938
|
-
false
|
|
1939
|
-
}
|
|
1940
|
-
}
|
|
1941
|
-
})
|
|
1942
|
-
}
|
|
1943
|
-
|
|
1944
|
-
/// Signature of the C callback that implements a custom PostProcessor.
///
/// # Parameters
///
/// - `result_json`: JSON-encoded ExtractionResult (null-terminated string)
///
/// # Returns
///
/// A null-terminated JSON string holding the processed ExtractionResult, or
/// NULL on error. The Rust side releases a non-NULL result through
/// `kreuzberg_free_string`.
///
/// # Safety
///
/// The callback must:
/// - Not keep the `result_json` pointer (it is only valid for the duration of the call)
/// - Return valid null-terminated UTF-8 JSON that `kreuzberg_free_string` is able to release
/// - Return NULL on error (the error message should be retrievable separately)
type PostProcessorCallback = unsafe extern "C" fn(
    result_json: *const c_char,
) -> *mut c_char;
|
|
1962
|
-
|
|
1963
|
-
/// FFI wrapper for custom PostProcessors registered from Java/C.
|
|
1964
|
-
///
|
|
1965
|
-
/// This struct wraps a C function pointer and implements the PostProcessor trait,
|
|
1966
|
-
/// allowing custom post-processing implementations from FFI languages to be registered
|
|
1967
|
-
/// and used within the Rust extraction pipeline.
|
|
1968
|
-
struct FfiPostProcessor {
|
|
1969
|
-
name: String,
|
|
1970
|
-
callback: PostProcessorCallback,
|
|
1971
|
-
stage: ProcessingStage,
|
|
1972
|
-
}
|
|
1973
|
-
|
|
1974
|
-
impl FfiPostProcessor {
|
|
1975
|
-
fn new(name: String, callback: PostProcessorCallback, stage: ProcessingStage) -> Self {
|
|
1976
|
-
Self { name, callback, stage }
|
|
1977
|
-
}
|
|
1978
|
-
}
|
|
1979
|
-
|
|
1980
|
-
impl Plugin for FfiPostProcessor {
    /// Name supplied when the processor was constructed.
    fn name(&self) -> &str {
        self.name.as_str()
    }

    /// Fixed version label shared by all FFI-backed plugins.
    fn version(&self) -> String {
        String::from("ffi-1.0.0")
    }

    /// Nothing to set up on the Rust side; always succeeds.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to tear down on the Rust side; always succeeds.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
|
|
1997
|
-
|
|
1998
|
-
#[async_trait]
|
|
1999
|
-
impl kreuzberg::plugins::PostProcessor for FfiPostProcessor {
|
|
2000
|
-
async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
2001
|
-
let result_json = serde_json::to_string(&*result).map_err(|e| KreuzbergError::Validation {
|
|
2002
|
-
message: format!("Failed to serialize ExtractionResult: {}", e),
|
|
2003
|
-
source: Some(Box::new(e)),
|
|
2004
|
-
})?;
|
|
2005
|
-
|
|
2006
|
-
let callback = self.callback;
|
|
2007
|
-
let processor_name = self.name.clone();
|
|
2008
|
-
let result_json_owned = result_json.clone();
|
|
2009
|
-
|
|
2010
|
-
let processed_json = tokio::task::spawn_blocking(move || {
|
|
2011
|
-
let result_cstring = CString::new(result_json_owned).map_err(|e| KreuzbergError::Validation {
|
|
2012
|
-
message: format!("Failed to create C string from result JSON: {}", e),
|
|
2013
|
-
source: Some(Box::new(e)),
|
|
2014
|
-
})?;
|
|
2015
|
-
|
|
2016
|
-
let processed_ptr = unsafe { callback(result_cstring.as_ptr()) };
|
|
2017
|
-
|
|
2018
|
-
if processed_ptr.is_null() {
|
|
2019
|
-
return Err(KreuzbergError::Plugin {
|
|
2020
|
-
message: "PostProcessor returned NULL (operation failed)".to_string(),
|
|
2021
|
-
plugin_name: processor_name.clone(),
|
|
2022
|
-
});
|
|
2023
|
-
}
|
|
2024
|
-
|
|
2025
|
-
let processed_cstr = unsafe { CStr::from_ptr(processed_ptr) };
|
|
2026
|
-
let json = processed_cstr
|
|
2027
|
-
.to_str()
|
|
2028
|
-
.map_err(|e| KreuzbergError::Plugin {
|
|
2029
|
-
message: format!("PostProcessor returned invalid UTF-8: {}", e),
|
|
2030
|
-
plugin_name: processor_name.clone(),
|
|
2031
|
-
})?
|
|
2032
|
-
.to_string();
|
|
2033
|
-
|
|
2034
|
-
unsafe { kreuzberg_free_string(processed_ptr) };
|
|
2035
|
-
|
|
2036
|
-
Ok(json)
|
|
2037
|
-
})
|
|
2038
|
-
.await
|
|
2039
|
-
.map_err(|e| KreuzbergError::Plugin {
|
|
2040
|
-
message: format!("PostProcessor task panicked: {}", e),
|
|
2041
|
-
plugin_name: self.name.clone(),
|
|
2042
|
-
})??;
|
|
2043
|
-
|
|
2044
|
-
let processed_result: ExtractionResult =
|
|
2045
|
-
serde_json::from_str(&processed_json).map_err(|e| KreuzbergError::Plugin {
|
|
2046
|
-
message: format!("Failed to deserialize processed result: {}", e),
|
|
2047
|
-
plugin_name: self.name.clone(),
|
|
2048
|
-
})?;
|
|
2049
|
-
|
|
2050
|
-
*result = processed_result;
|
|
2051
|
-
|
|
2052
|
-
Ok(())
|
|
2053
|
-
}
|
|
2054
|
-
|
|
2055
|
-
fn processing_stage(&self) -> kreuzberg::plugins::ProcessingStage {
|
|
2056
|
-
self.stage
|
|
2057
|
-
}
|
|
2058
|
-
}
|
|
2059
|
-
|
|
2060
|
-
fn parse_processing_stage(stage: Option<&str>) -> FfiResult<ProcessingStage> {
|
|
2061
|
-
match stage {
|
|
2062
|
-
Some(value) => match value.to_lowercase().as_str() {
|
|
2063
|
-
"early" => Ok(ProcessingStage::Early),
|
|
2064
|
-
"middle" => Ok(ProcessingStage::Middle),
|
|
2065
|
-
"late" => Ok(ProcessingStage::Late),
|
|
2066
|
-
other => Err(format!(
|
|
2067
|
-
"Invalid processing stage '{}'. Expected one of: early, middle, late",
|
|
2068
|
-
other
|
|
2069
|
-
)),
|
|
2070
|
-
},
|
|
2071
|
-
None => Ok(ProcessingStage::Middle),
|
|
2072
|
-
}
|
|
2073
|
-
}
|
|
2074
|
-
|
|
2075
|
-
/// Register a custom PostProcessor via FFI callback.
|
|
2076
|
-
///
|
|
2077
|
-
/// # Safety
|
|
2078
|
-
///
|
|
2079
|
-
/// - `name` must be a valid null-terminated C string
|
|
2080
|
-
/// - `callback` must be a valid function pointer that:
|
|
2081
|
-
/// - Does not store the result_json pointer
|
|
2082
|
-
/// - Returns a null-terminated UTF-8 JSON string or NULL on error
|
|
2083
|
-
/// - The returned string must be freeable by kreuzberg_free_string
|
|
2084
|
-
/// - `priority` determines the order of execution (higher priority runs first)
|
|
2085
|
-
/// - Returns true on success, false on error (check kreuzberg_last_error)
|
|
2086
|
-
///
|
|
2087
|
-
/// # Example (C)
|
|
2088
|
-
///
|
|
2089
|
-
/// ```c
|
|
2090
|
-
/// char* my_post_processor(const char* result_json) {
|
|
2091
|
-
/// // Parse result_json, modify it, return JSON string
|
|
2092
|
-
/// return strdup("{\"content\":\"PROCESSED\"}");
|
|
2093
|
-
/// }
|
|
2094
|
-
///
|
|
2095
|
-
/// bool success = kreuzberg_register_post_processor("my-processor", my_post_processor, 100);
|
|
2096
|
-
/// if (!success) {
|
|
2097
|
-
/// const char* error = kreuzberg_last_error();
|
|
2098
|
-
/// printf("Failed to register: %s\n", error);
|
|
2099
|
-
/// }
|
|
2100
|
-
/// ```
|
|
2101
|
-
#[unsafe(no_mangle)]
|
|
2102
|
-
pub unsafe extern "C" fn kreuzberg_register_post_processor(
|
|
2103
|
-
name: *const c_char,
|
|
2104
|
-
callback: PostProcessorCallback,
|
|
2105
|
-
priority: i32,
|
|
2106
|
-
) -> bool {
|
|
2107
|
-
ffi_panic_guard_bool!("kreuzberg_register_post_processor", {
|
|
2108
|
-
clear_last_error();
|
|
2109
|
-
|
|
2110
|
-
if name.is_null() {
|
|
2111
|
-
set_last_error("PostProcessor name cannot be NULL".to_string());
|
|
2112
|
-
return false;
|
|
2113
|
-
}
|
|
2114
|
-
|
|
2115
|
-
let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
|
|
2116
|
-
Ok(s) => s,
|
|
2117
|
-
Err(e) => {
|
|
2118
|
-
set_last_error(format!("Invalid UTF-8 in PostProcessor name: {}", e));
|
|
2119
|
-
return false;
|
|
2120
|
-
}
|
|
2121
|
-
};
|
|
2122
|
-
|
|
2123
|
-
if name_str.is_empty() {
|
|
2124
|
-
set_last_error("Plugin name cannot be empty".to_string());
|
|
2125
|
-
return false;
|
|
2126
|
-
}
|
|
2127
|
-
|
|
2128
|
-
if name_str.chars().any(|c| c.is_whitespace()) {
|
|
2129
|
-
set_last_error("Plugin name cannot contain whitespace".to_string());
|
|
2130
|
-
return false;
|
|
2131
|
-
}
|
|
2132
|
-
|
|
2133
|
-
let processor = Arc::new(FfiPostProcessor::new(
|
|
2134
|
-
name_str.to_string(),
|
|
2135
|
-
callback,
|
|
2136
|
-
ProcessingStage::Middle,
|
|
2137
|
-
));
|
|
2138
|
-
|
|
2139
|
-
let registry = kreuzberg::plugins::registry::get_post_processor_registry();
|
|
2140
|
-
let mut registry_guard = match registry.write() {
|
|
2141
|
-
Ok(guard) => guard,
|
|
2142
|
-
Err(e) => {
|
|
2143
|
-
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
2144
|
-
set_last_error(format!("Failed to acquire registry write lock: {}", e));
|
|
2145
|
-
return false;
|
|
2146
|
-
}
|
|
2147
|
-
};
|
|
2148
|
-
|
|
2149
|
-
match registry_guard.register(processor, priority) {
|
|
2150
|
-
Ok(()) => true,
|
|
2151
|
-
Err(e) => {
|
|
2152
|
-
set_last_error(format!("Failed to register PostProcessor: {}", e));
|
|
2153
|
-
false
|
|
2154
|
-
}
|
|
2155
|
-
}
|
|
2156
|
-
})
|
|
2157
|
-
}
|
|
2158
|
-
|
|
2159
|
-
/// Register a custom PostProcessor with an explicit processing stage.
|
|
2160
|
-
///
|
|
2161
|
-
/// # Safety
|
|
2162
|
-
///
|
|
2163
|
-
/// - `name` must be a valid null-terminated C string
|
|
2164
|
-
/// - `stage` must be a valid null-terminated C string containing "early", "middle", or "late"
|
|
2165
|
-
/// - `callback` must be a valid function pointer that:
|
|
2166
|
-
/// - Does not store the result_json pointer
|
|
2167
|
-
/// - Returns a null-terminated UTF-8 JSON string or NULL on error
|
|
2168
|
-
/// - The returned string must be freeable by kreuzberg_free_string
|
|
2169
|
-
/// - `priority` determines the order of execution within the stage (higher priority runs first)
|
|
2170
|
-
/// - Returns true on success, false on error (check kreuzberg_last_error)
|
|
2171
|
-
#[unsafe(no_mangle)]
|
|
2172
|
-
pub unsafe extern "C" fn kreuzberg_register_post_processor_with_stage(
|
|
2173
|
-
name: *const c_char,
|
|
2174
|
-
callback: PostProcessorCallback,
|
|
2175
|
-
priority: i32,
|
|
2176
|
-
stage: *const c_char,
|
|
2177
|
-
) -> bool {
|
|
2178
|
-
ffi_panic_guard_bool!("kreuzberg_register_post_processor_with_stage", {
|
|
2179
|
-
clear_last_error();
|
|
2180
|
-
|
|
2181
|
-
if name.is_null() {
|
|
2182
|
-
set_last_error("PostProcessor name cannot be NULL".to_string());
|
|
2183
|
-
return false;
|
|
2184
|
-
}
|
|
2185
|
-
|
|
2186
|
-
let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
|
|
2187
|
-
Ok(s) => s,
|
|
2188
|
-
Err(e) => {
|
|
2189
|
-
set_last_error(format!("Invalid UTF-8 in PostProcessor name: {}", e));
|
|
2190
|
-
return false;
|
|
2191
|
-
}
|
|
2192
|
-
};
|
|
2193
|
-
|
|
2194
|
-
if name_str.is_empty() {
|
|
2195
|
-
set_last_error("Plugin name cannot be empty".to_string());
|
|
2196
|
-
return false;
|
|
2197
|
-
}
|
|
2198
|
-
|
|
2199
|
-
if name_str.chars().any(|c| c.is_whitespace()) {
|
|
2200
|
-
set_last_error("Plugin name cannot contain whitespace".to_string());
|
|
2201
|
-
return false;
|
|
2202
|
-
}
|
|
2203
|
-
|
|
2204
|
-
let stage_str = if stage.is_null() {
|
|
2205
|
-
None
|
|
2206
|
-
} else {
|
|
2207
|
-
match unsafe { CStr::from_ptr(stage) }.to_str() {
|
|
2208
|
-
Ok(s) => Some(s),
|
|
2209
|
-
Err(e) => {
|
|
2210
|
-
set_last_error(format!("Invalid UTF-8 in processing stage: {}", e));
|
|
2211
|
-
return false;
|
|
2212
|
-
}
|
|
2213
|
-
}
|
|
2214
|
-
};
|
|
2215
|
-
|
|
2216
|
-
let stage = match parse_processing_stage(stage_str) {
|
|
2217
|
-
Ok(stage) => stage,
|
|
2218
|
-
Err(e) => {
|
|
2219
|
-
set_last_error(e);
|
|
2220
|
-
return false;
|
|
2221
|
-
}
|
|
2222
|
-
};
|
|
2223
|
-
|
|
2224
|
-
let processor = Arc::new(FfiPostProcessor::new(name_str.to_string(), callback, stage));
|
|
2225
|
-
|
|
2226
|
-
let registry = kreuzberg::plugins::registry::get_post_processor_registry();
|
|
2227
|
-
let mut registry_guard = match registry.write() {
|
|
2228
|
-
Ok(guard) => guard,
|
|
2229
|
-
Err(e) => {
|
|
2230
|
-
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
2231
|
-
set_last_error(format!("Failed to acquire registry write lock: {}", e));
|
|
2232
|
-
return false;
|
|
2233
|
-
}
|
|
2234
|
-
};
|
|
2235
|
-
|
|
2236
|
-
match registry_guard.register(processor, priority) {
|
|
2237
|
-
Ok(()) => true,
|
|
2238
|
-
Err(e) => {
|
|
2239
|
-
set_last_error(format!("Failed to register PostProcessor: {}", e));
|
|
2240
|
-
false
|
|
2241
|
-
}
|
|
2242
|
-
}
|
|
2243
|
-
})
|
|
2244
|
-
}
|
|
2245
|
-
|
|
2246
|
-
/// Unregister a PostProcessor by name.
|
|
2247
|
-
///
|
|
2248
|
-
/// # Safety
|
|
2249
|
-
///
|
|
2250
|
-
/// - `name` must be a valid null-terminated C string
|
|
2251
|
-
/// - Returns true on success, false on error (check kreuzberg_last_error)
|
|
2252
|
-
///
|
|
2253
|
-
/// # Example (C)
|
|
2254
|
-
///
|
|
2255
|
-
/// ```c
|
|
2256
|
-
/// bool success = kreuzberg_unregister_post_processor("my-processor");
|
|
2257
|
-
/// if (!success) {
|
|
2258
|
-
/// const char* error = kreuzberg_last_error();
|
|
2259
|
-
/// printf("Failed to unregister: %s\n", error);
|
|
2260
|
-
/// }
|
|
2261
|
-
/// ```
|
|
2262
|
-
#[unsafe(no_mangle)]
|
|
2263
|
-
pub unsafe extern "C" fn kreuzberg_unregister_post_processor(name: *const c_char) -> bool {
|
|
2264
|
-
ffi_panic_guard_bool!("kreuzberg_unregister_post_processor", {
|
|
2265
|
-
clear_last_error();
|
|
2266
|
-
|
|
2267
|
-
if name.is_null() {
|
|
2268
|
-
set_last_error("PostProcessor name cannot be NULL".to_string());
|
|
2269
|
-
return false;
|
|
2270
|
-
}
|
|
2271
|
-
|
|
2272
|
-
let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
|
|
2273
|
-
Ok(s) => s,
|
|
2274
|
-
Err(e) => {
|
|
2275
|
-
set_last_error(format!("Invalid UTF-8 in PostProcessor name: {}", e));
|
|
2276
|
-
return false;
|
|
2277
|
-
}
|
|
2278
|
-
};
|
|
2279
|
-
|
|
2280
|
-
let registry = kreuzberg::plugins::registry::get_post_processor_registry();
|
|
2281
|
-
let mut registry_guard = match registry.write() {
|
|
2282
|
-
Ok(guard) => guard,
|
|
2283
|
-
Err(e) => {
|
|
2284
|
-
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
2285
|
-
set_last_error(format!("Failed to acquire registry write lock: {}", e));
|
|
2286
|
-
return false;
|
|
2287
|
-
}
|
|
2288
|
-
};
|
|
2289
|
-
|
|
2290
|
-
match registry_guard.remove(name_str) {
|
|
2291
|
-
Ok(()) => true,
|
|
2292
|
-
Err(e) => {
|
|
2293
|
-
set_last_error(format!("Failed to remove PostProcessor: {}", e));
|
|
2294
|
-
false
|
|
2295
|
-
}
|
|
2296
|
-
}
|
|
2297
|
-
})
|
|
2298
|
-
}
|
|
2299
|
-
|
|
2300
|
-
/// Clear all registered PostProcessors.
|
|
2301
|
-
///
|
|
2302
|
-
/// # Safety
|
|
2303
|
-
///
|
|
2304
|
-
/// - Removes all registered processors. Subsequent extractions will run without them.
|
|
2305
|
-
/// - Returns true on success, false on error.
|
|
2306
|
-
#[unsafe(no_mangle)]
|
|
2307
|
-
pub unsafe extern "C" fn kreuzberg_clear_post_processors() -> bool {
|
|
2308
|
-
ffi_panic_guard_bool!("kreuzberg_clear_post_processors", {
|
|
2309
|
-
clear_last_error();
|
|
2310
|
-
|
|
2311
|
-
let registry = kreuzberg::plugins::registry::get_post_processor_registry();
|
|
2312
|
-
let mut registry_guard = match registry.write() {
|
|
2313
|
-
Ok(guard) => guard,
|
|
2314
|
-
Err(e) => {
|
|
2315
|
-
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
2316
|
-
set_last_error(format!("Failed to acquire registry write lock: {}", e));
|
|
2317
|
-
return false;
|
|
2318
|
-
}
|
|
2319
|
-
};
|
|
2320
|
-
|
|
2321
|
-
*registry_guard = Default::default();
|
|
2322
|
-
true
|
|
2323
|
-
})
|
|
2324
|
-
}
|
|
2325
|
-
|
|
2326
|
-
/// List all registered PostProcessors as a JSON array of names.
|
|
2327
|
-
///
|
|
2328
|
-
/// # Safety
|
|
2329
|
-
///
|
|
2330
|
-
/// - Returned string must be freed with `kreuzberg_free_string`.
|
|
2331
|
-
/// - Returns NULL on error (check `kreuzberg_last_error`).
|
|
2332
|
-
#[unsafe(no_mangle)]
|
|
2333
|
-
pub unsafe extern "C" fn kreuzberg_list_post_processors() -> *mut c_char {
|
|
2334
|
-
ffi_panic_guard!("kreuzberg_list_post_processors", {
|
|
2335
|
-
clear_last_error();
|
|
2336
|
-
|
|
2337
|
-
let registry = kreuzberg::plugins::registry::get_post_processor_registry();
|
|
2338
|
-
let registry_guard = match registry.read() {
|
|
2339
|
-
Ok(guard) => guard,
|
|
2340
|
-
Err(e) => {
|
|
2341
|
-
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
2342
|
-
set_last_error(format!("Failed to acquire registry read lock: {}", e));
|
|
2343
|
-
return ptr::null_mut();
|
|
2344
|
-
}
|
|
2345
|
-
};
|
|
2346
|
-
|
|
2347
|
-
match serde_json::to_string(&registry_guard.list()) {
|
|
2348
|
-
Ok(json) => match CString::new(json) {
|
|
2349
|
-
Ok(cstr) => cstr.into_raw(),
|
|
2350
|
-
Err(e) => {
|
|
2351
|
-
set_last_error(format!("Failed to create C string: {}", e));
|
|
2352
|
-
ptr::null_mut()
|
|
2353
|
-
}
|
|
2354
|
-
},
|
|
2355
|
-
Err(e) => {
|
|
2356
|
-
set_last_error(format!("Failed to serialize PostProcessor list: {}", e));
|
|
2357
|
-
ptr::null_mut()
|
|
2358
|
-
}
|
|
2359
|
-
}
|
|
2360
|
-
})
|
|
2361
|
-
}
|
|
2362
|
-
|
|
2363
|
-
/// Type alias for the DocumentExtractor callback function.
|
|
2364
|
-
///
|
|
2365
|
-
/// # Parameters
|
|
2366
|
-
///
|
|
2367
|
-
/// - `content`: Raw document bytes
|
|
2368
|
-
/// - `content_len`: Length of the content array
|
|
2369
|
-
/// - `mime_type`: MIME type of the document (null-terminated string)
|
|
2370
|
-
/// - `config_json`: JSON-encoded ExtractionConfig (null-terminated string)
|
|
2371
|
-
///
|
|
2372
|
-
/// # Returns
|
|
2373
|
-
///
|
|
2374
|
-
/// Null-terminated JSON string containing the ExtractionResult, or NULL on error.
|
|
2375
|
-
/// The returned string must be freeable by kreuzberg_free_string.
|
|
2376
|
-
///
|
|
2377
|
-
/// # Safety
|
|
2378
|
-
///
|
|
2379
|
-
/// The callback must:
|
|
2380
|
-
/// - Not store the content, mime_type, or config_json pointers (only valid during the call)
|
|
2381
|
-
/// - Return a valid null-terminated UTF-8 JSON string or NULL on error
|
|
2382
|
-
/// - The returned string must be freeable by kreuzberg_free_string
|
|
2383
|
-
type DocumentExtractorCallback = unsafe extern "C" fn(
|
|
2384
|
-
content: *const u8,
|
|
2385
|
-
content_len: usize,
|
|
2386
|
-
mime_type: *const c_char,
|
|
2387
|
-
config_json: *const c_char,
|
|
2388
|
-
) -> *mut c_char;
|
|
2389
|
-
|
|
2390
|
-
/// FFI wrapper for custom DocumentExtractors registered from Java/C.
|
|
2391
|
-
///
|
|
2392
|
-
/// This struct wraps a C function pointer and implements the DocumentExtractor trait,
|
|
2393
|
-
/// allowing custom extraction implementations from FFI languages to be registered
|
|
2394
|
-
/// and used within the Rust extraction pipeline.
|
|
2395
|
-
struct FfiDocumentExtractor {
|
|
2396
|
-
name: String,
|
|
2397
|
-
callback: DocumentExtractorCallback,
|
|
2398
|
-
#[allow(dead_code)]
|
|
2399
|
-
supported_types: Vec<String>,
|
|
2400
|
-
supported_types_static: Vec<&'static str>,
|
|
2401
|
-
priority: i32,
|
|
2402
|
-
}
|
|
2403
|
-
|
|
2404
|
-
impl FfiDocumentExtractor {
|
|
2405
|
-
fn new(name: String, callback: DocumentExtractorCallback, supported_types: Vec<String>, priority: i32) -> Self {
|
|
2406
|
-
let supported_types_static: Vec<&'static str> = supported_types
|
|
2407
|
-
.iter()
|
|
2408
|
-
.map(|s| {
|
|
2409
|
-
let leaked: &'static str = Box::leak(s.clone().into_boxed_str());
|
|
2410
|
-
leaked
|
|
2411
|
-
})
|
|
2412
|
-
.collect();
|
|
2413
|
-
|
|
2414
|
-
Self {
|
|
2415
|
-
name,
|
|
2416
|
-
callback,
|
|
2417
|
-
supported_types,
|
|
2418
|
-
supported_types_static,
|
|
2419
|
-
priority,
|
|
2420
|
-
}
|
|
2421
|
-
}
|
|
2422
|
-
}
|
|
2423
|
-
|
|
2424
|
-
impl Plugin for FfiDocumentExtractor {
|
|
2425
|
-
fn name(&self) -> &str {
|
|
2426
|
-
&self.name
|
|
2427
|
-
}
|
|
2428
|
-
|
|
2429
|
-
fn version(&self) -> String {
|
|
2430
|
-
"ffi-1.0.0".to_string()
|
|
2431
|
-
}
|
|
2432
|
-
|
|
2433
|
-
fn initialize(&self) -> Result<()> {
|
|
2434
|
-
Ok(())
|
|
2435
|
-
}
|
|
2436
|
-
|
|
2437
|
-
fn shutdown(&self) -> Result<()> {
|
|
2438
|
-
Ok(())
|
|
2439
|
-
}
|
|
2440
|
-
}
|
|
2441
|
-
|
|
2442
|
-
#[async_trait]
|
|
2443
|
-
impl kreuzberg::plugins::DocumentExtractor for FfiDocumentExtractor {
|
|
2444
|
-
async fn extract_bytes(
|
|
2445
|
-
&self,
|
|
2446
|
-
content: &[u8],
|
|
2447
|
-
mime_type: &str,
|
|
2448
|
-
config: &ExtractionConfig,
|
|
2449
|
-
) -> Result<ExtractionResult> {
|
|
2450
|
-
let config_json = serde_json::to_string(config).map_err(|e| KreuzbergError::Validation {
|
|
2451
|
-
message: format!("Failed to serialize ExtractionConfig: {}", e),
|
|
2452
|
-
source: Some(Box::new(e)),
|
|
2453
|
-
})?;
|
|
2454
|
-
|
|
2455
|
-
let callback = self.callback;
|
|
2456
|
-
let extractor_name = self.name.clone();
|
|
2457
|
-
let extractor_name_error = self.name.clone();
|
|
2458
|
-
let extractor_name_parse = self.name.clone();
|
|
2459
|
-
let content_vec = content.to_vec();
|
|
2460
|
-
let mime_type_owned = mime_type.to_string();
|
|
2461
|
-
let config_json_owned = config_json.clone();
|
|
2462
|
-
|
|
2463
|
-
let result_json = tokio::task::spawn_blocking(move || {
|
|
2464
|
-
let mime_cstr = match CString::new(mime_type_owned.clone()) {
|
|
2465
|
-
Ok(s) => s,
|
|
2466
|
-
Err(e) => {
|
|
2467
|
-
return Err(KreuzbergError::Validation {
|
|
2468
|
-
message: format!("Invalid MIME type for extractor '{}': {}", extractor_name, e),
|
|
2469
|
-
source: Some(Box::new(e)),
|
|
2470
|
-
});
|
|
2471
|
-
}
|
|
2472
|
-
};
|
|
2473
|
-
|
|
2474
|
-
let config_cstr = match CString::new(config_json_owned.clone()) {
|
|
2475
|
-
Ok(s) => s,
|
|
2476
|
-
Err(e) => {
|
|
2477
|
-
return Err(KreuzbergError::Validation {
|
|
2478
|
-
message: format!("Invalid config JSON for extractor '{}': {}", extractor_name, e),
|
|
2479
|
-
source: Some(Box::new(e)),
|
|
2480
|
-
});
|
|
2481
|
-
}
|
|
2482
|
-
};
|
|
2483
|
-
|
|
2484
|
-
let result_ptr = unsafe {
|
|
2485
|
-
callback(
|
|
2486
|
-
content_vec.as_ptr(),
|
|
2487
|
-
content_vec.len(),
|
|
2488
|
-
mime_cstr.as_ptr(),
|
|
2489
|
-
config_cstr.as_ptr(),
|
|
2490
|
-
)
|
|
2491
|
-
};
|
|
2492
|
-
|
|
2493
|
-
if result_ptr.is_null() {
|
|
2494
|
-
return Err(KreuzbergError::Parsing {
|
|
2495
|
-
message: format!("DocumentExtractor '{}' returned NULL (callback failed)", extractor_name),
|
|
2496
|
-
source: None,
|
|
2497
|
-
});
|
|
2498
|
-
}
|
|
2499
|
-
|
|
2500
|
-
let result_cstr = unsafe { CString::from_raw(result_ptr) };
|
|
2501
|
-
let result_str = result_cstr.to_str().map_err(|e| KreuzbergError::Validation {
|
|
2502
|
-
message: format!("Invalid UTF-8 in result from extractor '{}': {}", extractor_name, e),
|
|
2503
|
-
source: Some(Box::new(e)),
|
|
2504
|
-
})?;
|
|
2505
|
-
|
|
2506
|
-
Ok(result_str.to_string())
|
|
2507
|
-
})
|
|
2508
|
-
.await
|
|
2509
|
-
.map_err(|e| {
|
|
2510
|
-
KreuzbergError::Other(format!(
|
|
2511
|
-
"Task join error in extractor '{}': {}",
|
|
2512
|
-
extractor_name_error, e
|
|
2513
|
-
))
|
|
2514
|
-
})??;
|
|
2515
|
-
|
|
2516
|
-
serde_json::from_str(&result_json).map_err(|e| KreuzbergError::Parsing {
|
|
2517
|
-
message: format!(
|
|
2518
|
-
"Failed to deserialize ExtractionResult from extractor '{}': {}",
|
|
2519
|
-
extractor_name_parse, e
|
|
2520
|
-
),
|
|
2521
|
-
source: Some(Box::new(e)),
|
|
2522
|
-
})
|
|
2523
|
-
}
|
|
2524
|
-
|
|
2525
|
-
async fn extract_file(
|
|
2526
|
-
&self,
|
|
2527
|
-
path: &std::path::Path,
|
|
2528
|
-
mime_type: &str,
|
|
2529
|
-
config: &ExtractionConfig,
|
|
2530
|
-
) -> Result<ExtractionResult> {
|
|
2531
|
-
let content = tokio::fs::read(path).await.map_err(KreuzbergError::Io)?;
|
|
2532
|
-
self.extract_bytes(&content, mime_type, config).await
|
|
2533
|
-
}
|
|
2534
|
-
|
|
2535
|
-
fn supported_mime_types(&self) -> &[&str] {
|
|
2536
|
-
&self.supported_types_static
|
|
2537
|
-
}
|
|
2538
|
-
|
|
2539
|
-
fn priority(&self) -> i32 {
|
|
2540
|
-
self.priority
|
|
2541
|
-
}
|
|
2542
|
-
}
|
|
2543
|
-
|
|
2544
|
-
/// Register a custom DocumentExtractor via FFI callback.
|
|
2545
|
-
///
|
|
2546
|
-
/// # Safety
|
|
2547
|
-
///
|
|
2548
|
-
/// - `name` must be a valid null-terminated C string
|
|
2549
|
-
/// - `callback` must be a valid function pointer that:
|
|
2550
|
-
/// - Does not store the content, mime_type, or config_json pointers
|
|
2551
|
-
/// - Returns a null-terminated UTF-8 JSON string or NULL on error
|
|
2552
|
-
/// - The returned string must be freeable by kreuzberg_free_string
|
|
2553
|
-
/// - `mime_types` must be a valid null-terminated C string containing comma-separated MIME types
|
|
2554
|
-
/// - `priority` determines the order of selection (higher priority preferred)
|
|
2555
|
-
/// - Returns true on success, false on error (check kreuzberg_last_error)
|
|
2556
|
-
///
|
|
2557
|
-
/// # Example (C)
|
|
2558
|
-
///
|
|
2559
|
-
/// ```c
|
|
2560
|
-
/// char* my_extractor(const uint8_t* content, size_t len, const char* mime_type, const char* config) {
|
|
2561
|
-
/// // Extract content from bytes, return JSON ExtractionResult
|
|
2562
|
-
/// return strdup("{\"content\":\"extracted text\",\"mime_type\":\"text/plain\",\"metadata\":{}}");
|
|
2563
|
-
/// }
|
|
2564
|
-
///
|
|
2565
|
-
/// bool success = kreuzberg_register_document_extractor(
|
|
2566
|
-
/// "my-extractor",
|
|
2567
|
-
/// my_extractor,
|
|
2568
|
-
/// "application/x-custom,text/x-custom",
|
|
2569
|
-
/// 100
|
|
2570
|
-
/// );
|
|
2571
|
-
/// if (!success) {
|
|
2572
|
-
/// const char* error = kreuzberg_last_error();
|
|
2573
|
-
/// printf("Failed to register: %s\n", error);
|
|
2574
|
-
/// }
|
|
2575
|
-
/// ```
|
|
2576
|
-
#[unsafe(no_mangle)]
|
|
2577
|
-
pub unsafe extern "C" fn kreuzberg_register_document_extractor(
|
|
2578
|
-
name: *const c_char,
|
|
2579
|
-
callback: DocumentExtractorCallback,
|
|
2580
|
-
mime_types: *const c_char,
|
|
2581
|
-
priority: i32,
|
|
2582
|
-
) -> bool {
|
|
2583
|
-
ffi_panic_guard_bool!("kreuzberg_register_document_extractor", {
|
|
2584
|
-
clear_last_error();
|
|
2585
|
-
|
|
2586
|
-
if name.is_null() {
|
|
2587
|
-
set_last_error("DocumentExtractor name cannot be NULL".to_string());
|
|
2588
|
-
return false;
|
|
2589
|
-
}
|
|
2590
|
-
|
|
2591
|
-
if mime_types.is_null() {
|
|
2592
|
-
set_last_error("MIME types cannot be NULL".to_string());
|
|
2593
|
-
return false;
|
|
2594
|
-
}
|
|
2595
|
-
|
|
2596
|
-
let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
|
|
2597
|
-
Ok(s) => s,
|
|
2598
|
-
Err(e) => {
|
|
2599
|
-
set_last_error(format!("Invalid UTF-8 in DocumentExtractor name: {}", e));
|
|
2600
|
-
return false;
|
|
2601
|
-
}
|
|
2602
|
-
};
|
|
2603
|
-
|
|
2604
|
-
if name_str.is_empty() {
|
|
2605
|
-
set_last_error("Plugin name cannot be empty".to_string());
|
|
2606
|
-
return false;
|
|
2607
|
-
}
|
|
2608
|
-
|
|
2609
|
-
if name_str.chars().any(|c| c.is_whitespace()) {
|
|
2610
|
-
set_last_error("Plugin name cannot contain whitespace".to_string());
|
|
2611
|
-
return false;
|
|
2612
|
-
}
|
|
2613
|
-
|
|
2614
|
-
let mime_types_str = match unsafe { CStr::from_ptr(mime_types) }.to_str() {
|
|
2615
|
-
Ok(s) => s,
|
|
2616
|
-
Err(e) => {
|
|
2617
|
-
set_last_error(format!("Invalid UTF-8 in MIME types: {}", e));
|
|
2618
|
-
return false;
|
|
2619
|
-
}
|
|
2620
|
-
};
|
|
2621
|
-
|
|
2622
|
-
let supported_types: Vec<String> = mime_types_str
|
|
2623
|
-
.split(',')
|
|
2624
|
-
.map(|s| s.trim().to_string())
|
|
2625
|
-
.filter(|s| !s.is_empty())
|
|
2626
|
-
.collect();
|
|
2627
|
-
|
|
2628
|
-
if supported_types.is_empty() {
|
|
2629
|
-
set_last_error("At least one MIME type must be specified".to_string());
|
|
2630
|
-
return false;
|
|
2631
|
-
}
|
|
2632
|
-
|
|
2633
|
-
let extractor = Arc::new(FfiDocumentExtractor::new(
|
|
2634
|
-
name_str.to_string(),
|
|
2635
|
-
callback,
|
|
2636
|
-
supported_types,
|
|
2637
|
-
priority,
|
|
2638
|
-
));
|
|
2639
|
-
|
|
2640
|
-
let registry = kreuzberg::plugins::registry::get_document_extractor_registry();
|
|
2641
|
-
let mut registry_guard = match registry.write() {
|
|
2642
|
-
Ok(guard) => guard,
|
|
2643
|
-
Err(e) => {
|
|
2644
|
-
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
2645
|
-
set_last_error(format!("Failed to acquire registry write lock: {}", e));
|
|
2646
|
-
return false;
|
|
2647
|
-
}
|
|
2648
|
-
};
|
|
2649
|
-
|
|
2650
|
-
match registry_guard.register(extractor) {
|
|
2651
|
-
Ok(()) => true,
|
|
2652
|
-
Err(e) => {
|
|
2653
|
-
set_last_error(format!("Failed to register DocumentExtractor: {}", e));
|
|
2654
|
-
false
|
|
2655
|
-
}
|
|
2656
|
-
}
|
|
2657
|
-
})
|
|
2658
|
-
}
|
|
2659
|
-
|
|
2660
|
-
/// Unregister a DocumentExtractor by name.
|
|
2661
|
-
///
|
|
2662
|
-
/// # Safety
|
|
2663
|
-
///
|
|
2664
|
-
/// - `name` must be a valid null-terminated C string
|
|
2665
|
-
/// - Returns true on success, false on error (check kreuzberg_last_error)
|
|
2666
|
-
///
|
|
2667
|
-
/// # Example (C)
|
|
2668
|
-
///
|
|
2669
|
-
/// ```c
|
|
2670
|
-
/// bool success = kreuzberg_unregister_document_extractor("my-extractor");
|
|
2671
|
-
/// if (!success) {
|
|
2672
|
-
/// const char* error = kreuzberg_last_error();
|
|
2673
|
-
/// printf("Failed to unregister: %s\n", error);
|
|
2674
|
-
/// }
|
|
2675
|
-
/// ```
|
|
2676
|
-
#[unsafe(no_mangle)]
|
|
2677
|
-
pub unsafe extern "C" fn kreuzberg_unregister_document_extractor(name: *const c_char) -> bool {
|
|
2678
|
-
ffi_panic_guard_bool!("kreuzberg_unregister_document_extractor", {
|
|
2679
|
-
clear_last_error();
|
|
2680
|
-
|
|
2681
|
-
if name.is_null() {
|
|
2682
|
-
set_last_error("DocumentExtractor name cannot be NULL".to_string());
|
|
2683
|
-
return false;
|
|
2684
|
-
}
|
|
2685
|
-
|
|
2686
|
-
let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
|
|
2687
|
-
Ok(s) => s,
|
|
2688
|
-
Err(e) => {
|
|
2689
|
-
set_last_error(format!("Invalid UTF-8 in DocumentExtractor name: {}", e));
|
|
2690
|
-
return false;
|
|
2691
|
-
}
|
|
2692
|
-
};
|
|
2693
|
-
|
|
2694
|
-
let registry = kreuzberg::plugins::registry::get_document_extractor_registry();
|
|
2695
|
-
let mut registry_guard = match registry.write() {
|
|
2696
|
-
Ok(guard) => guard,
|
|
2697
|
-
Err(e) => {
|
|
2698
|
-
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
2699
|
-
set_last_error(format!("Failed to acquire registry write lock: {}", e));
|
|
2700
|
-
return false;
|
|
2701
|
-
}
|
|
2702
|
-
};
|
|
2703
|
-
|
|
2704
|
-
match registry_guard.remove(name_str) {
|
|
2705
|
-
Ok(()) => true,
|
|
2706
|
-
Err(e) => {
|
|
2707
|
-
set_last_error(format!("Failed to remove DocumentExtractor: {}", e));
|
|
2708
|
-
false
|
|
2709
|
-
}
|
|
2710
|
-
}
|
|
2711
|
-
})
|
|
2712
|
-
}
|
|
2713
|
-
|
|
2714
|
-
/// List all registered DocumentExtractors as a JSON array of names.
|
|
2715
|
-
///
|
|
2716
|
-
/// # Safety
|
|
2717
|
-
///
|
|
2718
|
-
/// - Returned string must be freed with `kreuzberg_free_string`.
|
|
2719
|
-
/// - Returns NULL on error (check `kreuzberg_last_error`).
|
|
2720
|
-
#[unsafe(no_mangle)]
|
|
2721
|
-
pub unsafe extern "C" fn kreuzberg_list_document_extractors() -> *mut c_char {
|
|
2722
|
-
ffi_panic_guard!("kreuzberg_list_document_extractors", {
|
|
2723
|
-
clear_last_error();
|
|
2724
|
-
|
|
2725
|
-
let registry = kreuzberg::plugins::registry::get_document_extractor_registry();
|
|
2726
|
-
let registry_guard = match registry.read() {
|
|
2727
|
-
Ok(guard) => guard,
|
|
2728
|
-
Err(e) => {
|
|
2729
|
-
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
2730
|
-
set_last_error(format!("Failed to acquire registry read lock: {}", e));
|
|
2731
|
-
return ptr::null_mut();
|
|
2732
|
-
}
|
|
2733
|
-
};
|
|
2734
|
-
|
|
2735
|
-
match serde_json::to_string(&registry_guard.list()) {
|
|
2736
|
-
Ok(json) => match CString::new(json) {
|
|
2737
|
-
Ok(cstr) => cstr.into_raw(),
|
|
2738
|
-
Err(e) => {
|
|
2739
|
-
set_last_error(format!("Failed to create C string: {}", e));
|
|
2740
|
-
ptr::null_mut()
|
|
2741
|
-
}
|
|
2742
|
-
},
|
|
2743
|
-
Err(e) => {
|
|
2744
|
-
set_last_error(format!("Failed to serialize DocumentExtractor list: {}", e));
|
|
2745
|
-
ptr::null_mut()
|
|
2746
|
-
}
|
|
2747
|
-
}
|
|
2748
|
-
})
|
|
2749
|
-
}
|
|
2750
|
-
|
|
2751
|
-
/// Type alias for the Validator callback function.
|
|
2752
|
-
///
|
|
2753
|
-
/// # Parameters
|
|
2754
|
-
///
|
|
2755
|
-
/// - `result_json`: JSON-encoded ExtractionResult (null-terminated string)
|
|
2756
|
-
///
|
|
2757
|
-
/// # Returns
|
|
2758
|
-
///
|
|
2759
|
-
/// Null-terminated error message string if validation fails (must be freed by Rust
|
|
2760
|
-
/// via kreuzberg_free_string), or NULL if validation passes.
|
|
2761
|
-
///
|
|
2762
|
-
/// # Safety
|
|
2763
|
-
///
|
|
2764
|
-
/// The callback must:
|
|
2765
|
-
/// - Not store the result_json pointer (it's only valid for the duration of the call)
|
|
2766
|
-
/// - Return a valid null-terminated UTF-8 string (error message) if validation fails
|
|
2767
|
-
/// - Return NULL if validation passes
|
|
2768
|
-
/// - The returned string must be freeable by kreuzberg_free_string
|
|
2769
|
-
type ValidatorCallback = unsafe extern "C" fn(result_json: *const c_char) -> *mut c_char;
|
|
2770
|
-
|
|
2771
|
-
/// FFI wrapper for custom Validators registered from Java/C.
|
|
2772
|
-
///
|
|
2773
|
-
/// This struct wraps a C function pointer and implements the Validator trait,
|
|
2774
|
-
/// allowing custom validation implementations from FFI languages to be registered
|
|
2775
|
-
/// and used within the Rust extraction pipeline.
|
|
2776
|
-
struct FfiValidator {
|
|
2777
|
-
name: String,
|
|
2778
|
-
callback: ValidatorCallback,
|
|
2779
|
-
priority: i32,
|
|
2780
|
-
}
|
|
2781
|
-
|
|
2782
|
-
impl FfiValidator {
|
|
2783
|
-
fn new(name: String, callback: ValidatorCallback, priority: i32) -> Self {
|
|
2784
|
-
Self {
|
|
2785
|
-
name,
|
|
2786
|
-
callback,
|
|
2787
|
-
priority,
|
|
2788
|
-
}
|
|
2789
|
-
}
|
|
2790
|
-
}
|
|
2791
|
-
|
|
2792
|
-
impl Plugin for FfiValidator {
|
|
2793
|
-
fn name(&self) -> &str {
|
|
2794
|
-
&self.name
|
|
2795
|
-
}
|
|
2796
|
-
|
|
2797
|
-
fn version(&self) -> String {
|
|
2798
|
-
"ffi-1.0.0".to_string()
|
|
2799
|
-
}
|
|
2800
|
-
|
|
2801
|
-
fn initialize(&self) -> Result<()> {
|
|
2802
|
-
Ok(())
|
|
2803
|
-
}
|
|
2804
|
-
|
|
2805
|
-
fn shutdown(&self) -> Result<()> {
|
|
2806
|
-
Ok(())
|
|
2807
|
-
}
|
|
2808
|
-
}
|
|
2809
|
-
|
|
2810
|
-
#[async_trait]
|
|
2811
|
-
impl kreuzberg::plugins::Validator for FfiValidator {
|
|
2812
|
-
fn priority(&self) -> i32 {
|
|
2813
|
-
self.priority
|
|
2814
|
-
}
|
|
2815
|
-
|
|
2816
|
-
async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
2817
|
-
let result_json = serde_json::to_string(result).map_err(|e| KreuzbergError::Validation {
|
|
2818
|
-
message: format!("Failed to serialize ExtractionResult: {}", e),
|
|
2819
|
-
source: Some(Box::new(e)),
|
|
2820
|
-
})?;
|
|
2821
|
-
|
|
2822
|
-
let callback = self.callback;
|
|
2823
|
-
let validator_name = self.name.clone();
|
|
2824
|
-
let result_json_owned = result_json.clone();
|
|
2825
|
-
|
|
2826
|
-
let error_msg = tokio::task::spawn_blocking(move || {
|
|
2827
|
-
let result_cstring = CString::new(result_json_owned).map_err(|e| KreuzbergError::Validation {
|
|
2828
|
-
message: format!("Failed to create C string from result JSON: {}", e),
|
|
2829
|
-
source: Some(Box::new(e)),
|
|
2830
|
-
})?;
|
|
2831
|
-
|
|
2832
|
-
let error_ptr = unsafe { callback(result_cstring.as_ptr()) };
|
|
2833
|
-
|
|
2834
|
-
if error_ptr.is_null() {
|
|
2835
|
-
return Ok::<Option<String>, KreuzbergError>(None);
|
|
2836
|
-
}
|
|
2837
|
-
|
|
2838
|
-
let error_cstr = unsafe { CStr::from_ptr(error_ptr) };
|
|
2839
|
-
let error_msg = error_cstr
|
|
2840
|
-
.to_str()
|
|
2841
|
-
.map_err(|e| KreuzbergError::Plugin {
|
|
2842
|
-
message: format!("Validator returned invalid UTF-8: {}", e),
|
|
2843
|
-
plugin_name: validator_name.clone(),
|
|
2844
|
-
})?
|
|
2845
|
-
.to_string();
|
|
2846
|
-
|
|
2847
|
-
unsafe { kreuzberg_free_string(error_ptr) };
|
|
2848
|
-
|
|
2849
|
-
Ok(Some(error_msg))
|
|
2850
|
-
})
|
|
2851
|
-
.await
|
|
2852
|
-
.map_err(|e| KreuzbergError::Plugin {
|
|
2853
|
-
message: format!("Validator task panicked: {}", e),
|
|
2854
|
-
plugin_name: self.name.clone(),
|
|
2855
|
-
})??;
|
|
2856
|
-
|
|
2857
|
-
if let Some(msg) = error_msg {
|
|
2858
|
-
return Err(KreuzbergError::Validation {
|
|
2859
|
-
message: msg,
|
|
2860
|
-
source: None,
|
|
2861
|
-
});
|
|
2862
|
-
}
|
|
2863
|
-
|
|
2864
|
-
Ok(())
|
|
2865
|
-
}
|
|
2866
|
-
}
|
|
2867
|
-
|
|
2868
|
-
/// Register a custom Validator via FFI callback.
|
|
2869
|
-
///
|
|
2870
|
-
/// # Safety
|
|
2871
|
-
///
|
|
2872
|
-
/// - `name` must be a valid null-terminated C string
|
|
2873
|
-
/// - `callback` must be a valid function pointer that:
|
|
2874
|
-
/// - Does not store the result_json pointer
|
|
2875
|
-
/// - Returns a null-terminated UTF-8 string (error message) if validation fails
|
|
2876
|
-
/// - Returns NULL if validation passes
|
|
2877
|
-
/// - The returned string must be freeable by kreuzberg_free_string
|
|
2878
|
-
/// - `priority` determines the order of validation (higher priority runs first)
|
|
2879
|
-
/// - Returns true on success, false on error (check kreuzberg_last_error)
|
|
2880
|
-
///
|
|
2881
|
-
/// # Example (C)
|
|
2882
|
-
///
|
|
2883
|
-
/// ```c
|
|
2884
|
-
/// char* my_validator(const char* result_json) {
|
|
2885
|
-
/// // Parse result_json, validate it
|
|
2886
|
-
/// // Return error message if validation fails, NULL if passes
|
|
2887
|
-
/// if (invalid) {
|
|
2888
|
-
/// return strdup("Validation failed: content too short");
|
|
2889
|
-
/// }
|
|
2890
|
-
/// return NULL;
|
|
2891
|
-
/// }
|
|
2892
|
-
///
|
|
2893
|
-
/// bool success = kreuzberg_register_validator("my-validator", my_validator, 100);
|
|
2894
|
-
/// if (!success) {
|
|
2895
|
-
/// const char* error = kreuzberg_last_error();
|
|
2896
|
-
/// printf("Failed to register: %s\n", error);
|
|
2897
|
-
/// }
|
|
2898
|
-
/// ```
|
|
2899
|
-
#[unsafe(no_mangle)]
|
|
2900
|
-
pub unsafe extern "C" fn kreuzberg_register_validator(
|
|
2901
|
-
name: *const c_char,
|
|
2902
|
-
callback: ValidatorCallback,
|
|
2903
|
-
priority: i32,
|
|
2904
|
-
) -> bool {
|
|
2905
|
-
ffi_panic_guard_bool!("kreuzberg_register_validator", {
|
|
2906
|
-
clear_last_error();
|
|
2907
|
-
|
|
2908
|
-
if name.is_null() {
|
|
2909
|
-
set_last_error("Validator name cannot be NULL".to_string());
|
|
2910
|
-
return false;
|
|
2911
|
-
}
|
|
2912
|
-
|
|
2913
|
-
let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
|
|
2914
|
-
Ok(s) => s,
|
|
2915
|
-
Err(e) => {
|
|
2916
|
-
set_last_error(format!("Invalid UTF-8 in Validator name: {}", e));
|
|
2917
|
-
return false;
|
|
2918
|
-
}
|
|
2919
|
-
};
|
|
2920
|
-
|
|
2921
|
-
if name_str.is_empty() {
|
|
2922
|
-
set_last_error("Plugin name cannot be empty".to_string());
|
|
2923
|
-
return false;
|
|
2924
|
-
}
|
|
2925
|
-
|
|
2926
|
-
if name_str.chars().any(|c| c.is_whitespace()) {
|
|
2927
|
-
set_last_error("Plugin name cannot contain whitespace".to_string());
|
|
2928
|
-
return false;
|
|
2929
|
-
}
|
|
2930
|
-
|
|
2931
|
-
let validator = Arc::new(FfiValidator::new(name_str.to_string(), callback, priority));
|
|
2932
|
-
|
|
2933
|
-
let registry = kreuzberg::plugins::registry::get_validator_registry();
|
|
2934
|
-
let mut registry_guard = match registry.write() {
|
|
2935
|
-
Ok(guard) => guard,
|
|
2936
|
-
Err(e) => {
|
|
2937
|
-
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
2938
|
-
set_last_error(format!("Failed to acquire registry write lock: {}", e));
|
|
2939
|
-
return false;
|
|
2940
|
-
}
|
|
2941
|
-
};
|
|
2942
|
-
|
|
2943
|
-
match registry_guard.register(validator) {
|
|
2944
|
-
Ok(()) => true,
|
|
2945
|
-
Err(e) => {
|
|
2946
|
-
set_last_error(format!("Failed to register Validator: {}", e));
|
|
2947
|
-
false
|
|
2948
|
-
}
|
|
2949
|
-
}
|
|
2950
|
-
})
|
|
2951
|
-
}
|
|
2952
|
-
|
|
2953
|
-
/// Unregister a Validator by name.
|
|
2954
|
-
///
|
|
2955
|
-
/// # Safety
|
|
2956
|
-
///
|
|
2957
|
-
/// - `name` must be a valid null-terminated C string
|
|
2958
|
-
/// - Returns true on success, false on error (check kreuzberg_last_error)
|
|
2959
|
-
///
|
|
2960
|
-
/// # Example (C)
|
|
2961
|
-
///
|
|
2962
|
-
/// ```c
|
|
2963
|
-
/// bool success = kreuzberg_unregister_validator("my-validator");
|
|
2964
|
-
/// if (!success) {
|
|
2965
|
-
/// const char* error = kreuzberg_last_error();
|
|
2966
|
-
/// printf("Failed to unregister: %s\n", error);
|
|
2967
|
-
/// }
|
|
2968
|
-
/// ```
|
|
2969
|
-
#[unsafe(no_mangle)]
|
|
2970
|
-
pub unsafe extern "C" fn kreuzberg_unregister_validator(name: *const c_char) -> bool {
|
|
2971
|
-
ffi_panic_guard_bool!("kreuzberg_unregister_validator", {
|
|
2972
|
-
clear_last_error();
|
|
2973
|
-
|
|
2974
|
-
if name.is_null() {
|
|
2975
|
-
set_last_error("Validator name cannot be NULL".to_string());
|
|
2976
|
-
return false;
|
|
2977
|
-
}
|
|
2978
|
-
|
|
2979
|
-
let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
|
|
2980
|
-
Ok(s) => s,
|
|
2981
|
-
Err(e) => {
|
|
2982
|
-
set_last_error(format!("Invalid UTF-8 in Validator name: {}", e));
|
|
2983
|
-
return false;
|
|
2984
|
-
}
|
|
2985
|
-
};
|
|
2986
|
-
|
|
2987
|
-
let registry = kreuzberg::plugins::registry::get_validator_registry();
|
|
2988
|
-
let mut registry_guard = match registry.write() {
|
|
2989
|
-
Ok(guard) => guard,
|
|
2990
|
-
Err(e) => {
|
|
2991
|
-
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
2992
|
-
set_last_error(format!("Failed to acquire registry write lock: {}", e));
|
|
2993
|
-
return false;
|
|
2994
|
-
}
|
|
2995
|
-
};
|
|
2996
|
-
|
|
2997
|
-
match registry_guard.remove(name_str) {
|
|
2998
|
-
Ok(()) => true,
|
|
2999
|
-
Err(e) => {
|
|
3000
|
-
set_last_error(format!("Failed to remove Validator: {}", e));
|
|
3001
|
-
false
|
|
3002
|
-
}
|
|
3003
|
-
}
|
|
3004
|
-
})
|
|
3005
|
-
}
|
|
3006
|
-
|
|
3007
|
-
/// Clear all registered Validators.
|
|
3008
|
-
///
|
|
3009
|
-
/// # Safety
|
|
3010
|
-
///
|
|
3011
|
-
/// - Removes all validators. Subsequent extractions will skip custom validation.
|
|
3012
|
-
/// - Returns true on success, false on error.
|
|
3013
|
-
#[unsafe(no_mangle)]
|
|
3014
|
-
pub unsafe extern "C" fn kreuzberg_clear_validators() -> bool {
|
|
3015
|
-
ffi_panic_guard_bool!("kreuzberg_clear_validators", {
|
|
3016
|
-
clear_last_error();
|
|
3017
|
-
|
|
3018
|
-
let registry = kreuzberg::plugins::registry::get_validator_registry();
|
|
3019
|
-
let mut registry_guard = match registry.write() {
|
|
3020
|
-
Ok(guard) => guard,
|
|
3021
|
-
Err(e) => {
|
|
3022
|
-
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
3023
|
-
set_last_error(format!("Failed to acquire registry write lock: {}", e));
|
|
3024
|
-
return false;
|
|
3025
|
-
}
|
|
3026
|
-
};
|
|
3027
|
-
|
|
3028
|
-
*registry_guard = Default::default();
|
|
3029
|
-
true
|
|
3030
|
-
})
|
|
3031
|
-
}
|
|
3032
|
-
|
|
3033
|
-
/// List all registered Validators as a JSON array of names.
|
|
3034
|
-
///
|
|
3035
|
-
/// # Safety
|
|
3036
|
-
///
|
|
3037
|
-
/// - Returned string must be freed with `kreuzberg_free_string`.
|
|
3038
|
-
/// - Returns NULL on error (check `kreuzberg_last_error`).
|
|
3039
|
-
#[unsafe(no_mangle)]
|
|
3040
|
-
pub unsafe extern "C" fn kreuzberg_list_validators() -> *mut c_char {
|
|
3041
|
-
ffi_panic_guard!("kreuzberg_list_validators", {
|
|
3042
|
-
clear_last_error();
|
|
3043
|
-
|
|
3044
|
-
let registry = kreuzberg::plugins::registry::get_validator_registry();
|
|
3045
|
-
let registry_guard = match registry.read() {
|
|
3046
|
-
Ok(guard) => guard,
|
|
3047
|
-
Err(e) => {
|
|
3048
|
-
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
3049
|
-
set_last_error(format!("Failed to acquire registry read lock: {}", e));
|
|
3050
|
-
return ptr::null_mut();
|
|
3051
|
-
}
|
|
3052
|
-
};
|
|
3053
|
-
|
|
3054
|
-
match serde_json::to_string(&registry_guard.list()) {
|
|
3055
|
-
Ok(json) => match CString::new(json) {
|
|
3056
|
-
Ok(cstr) => cstr.into_raw(),
|
|
3057
|
-
Err(e) => {
|
|
3058
|
-
set_last_error(format!("Failed to create C string: {}", e));
|
|
3059
|
-
ptr::null_mut()
|
|
3060
|
-
}
|
|
3061
|
-
},
|
|
3062
|
-
Err(e) => {
|
|
3063
|
-
set_last_error(format!("Failed to serialize Validator list: {}", e));
|
|
3064
|
-
ptr::null_mut()
|
|
3065
|
-
}
|
|
3066
|
-
}
|
|
3067
|
-
})
|
|
3068
|
-
}
|
|
3069
|
-
|
|
3070
|
-
/// Unregister an OCR backend by name.
|
|
3071
|
-
///
|
|
3072
|
-
/// # Safety
|
|
3073
|
-
///
|
|
3074
|
-
/// - `name` must be a valid null-terminated C string
|
|
3075
|
-
/// - Returns true on success, false on error (check kreuzberg_last_error)
|
|
3076
|
-
///
|
|
3077
|
-
/// # Example (C)
|
|
3078
|
-
///
|
|
3079
|
-
/// ```c
|
|
3080
|
-
/// bool success = kreuzberg_unregister_ocr_backend("custom-ocr");
|
|
3081
|
-
/// if (!success) {
|
|
3082
|
-
/// const char* error = kreuzberg_last_error();
|
|
3083
|
-
/// printf("Failed to unregister: %s\n", error);
|
|
3084
|
-
/// }
|
|
3085
|
-
/// ```
|
|
3086
|
-
#[unsafe(no_mangle)]
|
|
3087
|
-
pub unsafe extern "C" fn kreuzberg_unregister_ocr_backend(name: *const c_char) -> bool {
|
|
3088
|
-
ffi_panic_guard_bool!("kreuzberg_unregister_ocr_backend", {
|
|
3089
|
-
clear_last_error();
|
|
3090
|
-
|
|
3091
|
-
if name.is_null() {
|
|
3092
|
-
set_last_error("OCR backend name cannot be NULL".to_string());
|
|
3093
|
-
return false;
|
|
3094
|
-
}
|
|
3095
|
-
|
|
3096
|
-
let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
|
|
3097
|
-
Ok(s) => s,
|
|
3098
|
-
Err(e) => {
|
|
3099
|
-
set_last_error(format!("Invalid UTF-8 in OCR backend name: {}", e));
|
|
3100
|
-
return false;
|
|
3101
|
-
}
|
|
3102
|
-
};
|
|
3103
|
-
|
|
3104
|
-
if name_str.is_empty() {
|
|
3105
|
-
set_last_error("OCR backend name cannot be empty".to_string());
|
|
3106
|
-
return false;
|
|
3107
|
-
}
|
|
3108
|
-
|
|
3109
|
-
if name_str.chars().any(|c| c.is_whitespace()) {
|
|
3110
|
-
set_last_error("OCR backend name cannot contain whitespace".to_string());
|
|
3111
|
-
return false;
|
|
3112
|
-
}
|
|
3113
|
-
|
|
3114
|
-
match kreuzberg::plugins::unregister_ocr_backend(name_str) {
|
|
3115
|
-
Ok(()) => true,
|
|
3116
|
-
Err(e) => {
|
|
3117
|
-
set_last_error(e.to_string());
|
|
3118
|
-
false
|
|
3119
|
-
}
|
|
3120
|
-
}
|
|
3121
|
-
})
|
|
3122
|
-
}
|
|
3123
|
-
|
|
3124
|
-
/// List all registered OCR backends as a JSON array of names.
|
|
3125
|
-
///
|
|
3126
|
-
/// # Safety
|
|
3127
|
-
///
|
|
3128
|
-
/// - Returned string must be freed with `kreuzberg_free_string`.
|
|
3129
|
-
/// - Returns NULL on error (check `kreuzberg_last_error`).
|
|
3130
|
-
///
|
|
3131
|
-
/// # Example (C)
|
|
3132
|
-
///
|
|
3133
|
-
/// ```c
|
|
3134
|
-
/// char* backends = kreuzberg_list_ocr_backends();
|
|
3135
|
-
/// if (backends == NULL) {
|
|
3136
|
-
/// const char* error = kreuzberg_last_error();
|
|
3137
|
-
/// printf("Failed to list backends: %s\n", error);
|
|
3138
|
-
/// } else {
|
|
3139
|
-
/// printf("OCR backends: %s\n", backends);
|
|
3140
|
-
/// kreuzberg_free_string(backends);
|
|
3141
|
-
/// }
|
|
3142
|
-
/// ```
|
|
3143
|
-
#[unsafe(no_mangle)]
|
|
3144
|
-
pub unsafe extern "C" fn kreuzberg_list_ocr_backends() -> *mut c_char {
|
|
3145
|
-
ffi_panic_guard!("kreuzberg_list_ocr_backends", {
|
|
3146
|
-
clear_last_error();
|
|
3147
|
-
|
|
3148
|
-
match kreuzberg::plugins::list_ocr_backends() {
|
|
3149
|
-
Ok(backends) => match serde_json::to_string(&backends) {
|
|
3150
|
-
Ok(json) => match CString::new(json) {
|
|
3151
|
-
Ok(cstr) => cstr.into_raw(),
|
|
3152
|
-
Err(e) => {
|
|
3153
|
-
set_last_error(format!("Failed to create C string: {}", e));
|
|
3154
|
-
ptr::null_mut()
|
|
3155
|
-
}
|
|
3156
|
-
},
|
|
3157
|
-
Err(e) => {
|
|
3158
|
-
set_last_error(format!("Failed to serialize OCR backend list: {}", e));
|
|
3159
|
-
ptr::null_mut()
|
|
3160
|
-
}
|
|
3161
|
-
},
|
|
3162
|
-
Err(e) => {
|
|
3163
|
-
set_last_error(e.to_string());
|
|
3164
|
-
ptr::null_mut()
|
|
3165
|
-
}
|
|
3166
|
-
}
|
|
3167
|
-
})
|
|
3168
|
-
}
|
|
3169
|
-
|
|
3170
|
-
/// Clear all registered OCR backends.
|
|
3171
|
-
///
|
|
3172
|
-
/// # Safety
|
|
3173
|
-
///
|
|
3174
|
-
/// - Removes all registered OCR backends. Subsequent extractions will use only built-in backends.
|
|
3175
|
-
/// - Returns true on success, false on error.
|
|
3176
|
-
///
|
|
3177
|
-
/// # Example (C)
|
|
3178
|
-
///
|
|
3179
|
-
/// ```c
|
|
3180
|
-
/// bool success = kreuzberg_clear_ocr_backends();
|
|
3181
|
-
/// if (!success) {
|
|
3182
|
-
/// const char* error = kreuzberg_last_error();
|
|
3183
|
-
/// printf("Failed to clear OCR backends: %s\n", error);
|
|
3184
|
-
/// }
|
|
3185
|
-
/// ```
|
|
3186
|
-
#[unsafe(no_mangle)]
|
|
3187
|
-
pub unsafe extern "C" fn kreuzberg_clear_ocr_backends() -> bool {
|
|
3188
|
-
ffi_panic_guard_bool!("kreuzberg_clear_ocr_backends", {
|
|
3189
|
-
clear_last_error();
|
|
3190
|
-
|
|
3191
|
-
match kreuzberg::plugins::clear_ocr_backends() {
|
|
3192
|
-
Ok(()) => true,
|
|
3193
|
-
Err(e) => {
|
|
3194
|
-
set_last_error(e.to_string());
|
|
3195
|
-
false
|
|
3196
|
-
}
|
|
3197
|
-
}
|
|
3198
|
-
})
|
|
3199
|
-
}
|
|
3200
|
-
|
|
3201
|
-
/// Clear all registered DocumentExtractors.
|
|
3202
|
-
///
|
|
3203
|
-
/// # Safety
|
|
3204
|
-
///
|
|
3205
|
-
/// - Removes all registered extractors. Subsequent extractions will use only built-in extractors.
|
|
3206
|
-
/// - Returns true on success, false on error.
|
|
3207
|
-
///
|
|
3208
|
-
/// # Example (C)
|
|
3209
|
-
///
|
|
3210
|
-
/// ```c
|
|
3211
|
-
/// bool success = kreuzberg_clear_document_extractors();
|
|
3212
|
-
/// if (!success) {
|
|
3213
|
-
/// const char* error = kreuzberg_last_error();
|
|
3214
|
-
/// printf("Failed to clear document extractors: %s\n", error);
|
|
3215
|
-
/// }
|
|
3216
|
-
/// ```
|
|
3217
|
-
#[unsafe(no_mangle)]
|
|
3218
|
-
pub unsafe extern "C" fn kreuzberg_clear_document_extractors() -> bool {
|
|
3219
|
-
ffi_panic_guard_bool!("kreuzberg_clear_document_extractors", {
|
|
3220
|
-
clear_last_error();
|
|
3221
|
-
|
|
3222
|
-
let registry = kreuzberg::plugins::registry::get_document_extractor_registry();
|
|
3223
|
-
let mut registry_guard = match registry.write() {
|
|
3224
|
-
Ok(guard) => guard,
|
|
3225
|
-
Err(e) => {
|
|
3226
|
-
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
3227
|
-
set_last_error(format!("Failed to acquire registry write lock: {}", e));
|
|
3228
|
-
return false;
|
|
3229
|
-
}
|
|
3230
|
-
};
|
|
3231
|
-
|
|
3232
|
-
*registry_guard = Default::default();
|
|
3233
|
-
true
|
|
3234
|
-
})
|
|
3235
|
-
}
|
|
3236
|
-
|
|
3237
|
-
/// Detect MIME type from raw bytes.
|
|
3238
|
-
///
|
|
3239
|
-
/// # Safety
|
|
3240
|
-
///
|
|
3241
|
-
/// - `bytes` must be a valid pointer to byte data
|
|
3242
|
-
/// - `len` must be the correct length of the byte array
|
|
3243
|
-
/// - The returned string must be freed with `kreuzberg_free_string`
|
|
3244
|
-
/// - Returns NULL on error (check `kreuzberg_last_error`)
|
|
3245
|
-
///
|
|
3246
|
-
/// # Example (C)
|
|
3247
|
-
///
|
|
3248
|
-
/// ```c
|
|
3249
|
-
/// const char* pdf_bytes = "%PDF-1.4\n";
|
|
3250
|
-
/// char* mime = kreuzberg_detect_mime_type_from_bytes((const uint8_t*)pdf_bytes, strlen(pdf_bytes));
|
|
3251
|
-
/// if (mime == NULL) {
|
|
3252
|
-
/// const char* error = kreuzberg_last_error();
|
|
3253
|
-
/// printf("Failed to detect MIME type: %s\n", error);
|
|
3254
|
-
/// } else {
|
|
3255
|
-
/// printf("MIME type: %s\n", mime);
|
|
3256
|
-
/// kreuzberg_free_string(mime);
|
|
3257
|
-
/// }
|
|
3258
|
-
/// ```
|
|
3259
|
-
#[unsafe(no_mangle)]
|
|
3260
|
-
pub unsafe extern "C" fn kreuzberg_detect_mime_type_from_bytes(bytes: *const u8, len: usize) -> *mut c_char {
|
|
3261
|
-
ffi_panic_guard!("kreuzberg_detect_mime_type_from_bytes", {
|
|
3262
|
-
clear_last_error();
|
|
3263
|
-
|
|
3264
|
-
if bytes.is_null() {
|
|
3265
|
-
set_last_error("bytes cannot be NULL".to_string());
|
|
3266
|
-
return ptr::null_mut();
|
|
3267
|
-
}
|
|
3268
|
-
|
|
3269
|
-
let slice = unsafe { std::slice::from_raw_parts(bytes, len) };
|
|
3270
|
-
|
|
3271
|
-
match kreuzberg::core::mime::detect_mime_type_from_bytes(slice) {
|
|
3272
|
-
Ok(mime) => match string_to_c_string(mime) {
|
|
3273
|
-
Ok(ptr) => ptr,
|
|
3274
|
-
Err(e) => {
|
|
3275
|
-
set_last_error(e);
|
|
3276
|
-
ptr::null_mut()
|
|
3277
|
-
}
|
|
3278
|
-
},
|
|
3279
|
-
Err(e) => {
|
|
3280
|
-
set_last_error(e.to_string());
|
|
3281
|
-
ptr::null_mut()
|
|
3282
|
-
}
|
|
3283
|
-
}
|
|
3284
|
-
})
|
|
3285
|
-
}
|
|
3286
|
-
|
|
3287
|
-
/// Detect MIME type from file path (checks extension and reads file content).
|
|
3288
|
-
///
|
|
3289
|
-
/// # Safety
|
|
3290
|
-
///
|
|
3291
|
-
/// - `file_path` must be a valid null-terminated C string
|
|
3292
|
-
/// - The returned string must be freed with `kreuzberg_free_string`
|
|
3293
|
-
/// - Returns NULL on error (check `kreuzberg_last_error`)
|
|
3294
|
-
///
|
|
3295
|
-
/// # Example (C)
|
|
3296
|
-
///
|
|
3297
|
-
/// ```c
|
|
3298
|
-
/// char* mime = kreuzberg_detect_mime_type_from_path("document.pdf");
|
|
3299
|
-
/// if (mime == NULL) {
|
|
3300
|
-
/// const char* error = kreuzberg_last_error();
|
|
3301
|
-
/// printf("Failed to detect MIME type: %s\n", error);
|
|
3302
|
-
/// } else {
|
|
3303
|
-
/// printf("MIME type: %s\n", mime);
|
|
3304
|
-
/// kreuzberg_free_string(mime);
|
|
3305
|
-
/// }
|
|
3306
|
-
/// ```
|
|
3307
|
-
#[unsafe(no_mangle)]
|
|
3308
|
-
pub unsafe extern "C" fn kreuzberg_detect_mime_type_from_path(file_path: *const c_char) -> *mut c_char {
|
|
3309
|
-
ffi_panic_guard!("kreuzberg_detect_mime_type_from_path", {
|
|
3310
|
-
clear_last_error();
|
|
3311
|
-
|
|
3312
|
-
if file_path.is_null() {
|
|
3313
|
-
set_last_error("file_path cannot be NULL".to_string());
|
|
3314
|
-
return ptr::null_mut();
|
|
3315
|
-
}
|
|
3316
|
-
|
|
3317
|
-
let path_str = match unsafe { CStr::from_ptr(file_path) }.to_str() {
|
|
3318
|
-
Ok(s) => s,
|
|
3319
|
-
Err(e) => {
|
|
3320
|
-
set_last_error(format!("Invalid UTF-8 in file path: {}", e));
|
|
3321
|
-
return ptr::null_mut();
|
|
3322
|
-
}
|
|
3323
|
-
};
|
|
3324
|
-
|
|
3325
|
-
match kreuzberg::core::mime::detect_mime_type(path_str, true) {
|
|
3326
|
-
Ok(mime) => match string_to_c_string(mime) {
|
|
3327
|
-
Ok(ptr) => ptr,
|
|
3328
|
-
Err(e) => {
|
|
3329
|
-
set_last_error(e);
|
|
3330
|
-
ptr::null_mut()
|
|
3331
|
-
}
|
|
3332
|
-
},
|
|
3333
|
-
Err(e) => {
|
|
3334
|
-
// ~keep: IO errors from file operations should bubble up as they indicate
|
|
3335
|
-
set_last_error(e.to_string());
|
|
3336
|
-
ptr::null_mut()
|
|
3337
|
-
}
|
|
3338
|
-
}
|
|
3339
|
-
})
|
|
3340
|
-
}
|
|
3341
|
-
|
|
3342
|
-
/// Get file extensions for a MIME type.
|
|
3343
|
-
///
|
|
3344
|
-
/// # Safety
|
|
3345
|
-
///
|
|
3346
|
-
/// - `mime_type` must be a valid null-terminated C string
|
|
3347
|
-
/// - The returned string is a JSON array of extensions (must be freed with `kreuzberg_free_string`)
|
|
3348
|
-
/// - Returns NULL on error (check `kreuzberg_last_error`)
|
|
3349
|
-
///
|
|
3350
|
-
/// # Example (C)
|
|
3351
|
-
///
|
|
3352
|
-
/// ```c
|
|
3353
|
-
/// char* extensions = kreuzberg_get_extensions_for_mime("application/pdf");
|
|
3354
|
-
/// if (extensions == NULL) {
|
|
3355
|
-
/// const char* error = kreuzberg_last_error();
|
|
3356
|
-
/// printf("Failed to get extensions: %s\n", error);
|
|
3357
|
-
/// } else {
|
|
3358
|
-
/// printf("Extensions: %s\n", extensions);
|
|
3359
|
-
/// kreuzberg_free_string(extensions);
|
|
3360
|
-
/// }
|
|
3361
|
-
/// ```
|
|
3362
|
-
#[unsafe(no_mangle)]
|
|
3363
|
-
pub unsafe extern "C" fn kreuzberg_get_extensions_for_mime(mime_type: *const c_char) -> *mut c_char {
|
|
3364
|
-
ffi_panic_guard!("kreuzberg_get_extensions_for_mime", {
|
|
3365
|
-
clear_last_error();
|
|
3366
|
-
|
|
3367
|
-
if mime_type.is_null() {
|
|
3368
|
-
set_last_error("mime_type cannot be NULL".to_string());
|
|
3369
|
-
return ptr::null_mut();
|
|
3370
|
-
}
|
|
3371
|
-
|
|
3372
|
-
let mime_str = match unsafe { CStr::from_ptr(mime_type) }.to_str() {
|
|
3373
|
-
Ok(s) => s,
|
|
3374
|
-
Err(e) => {
|
|
3375
|
-
set_last_error(format!("Invalid UTF-8 in MIME type: {}", e));
|
|
3376
|
-
return ptr::null_mut();
|
|
3377
|
-
}
|
|
3378
|
-
};
|
|
3379
|
-
|
|
3380
|
-
match kreuzberg::core::mime::get_extensions_for_mime(mime_str) {
|
|
3381
|
-
Ok(extensions) => match serde_json::to_string(&extensions) {
|
|
3382
|
-
Ok(json) => match string_to_c_string(json) {
|
|
3383
|
-
Ok(ptr) => ptr,
|
|
3384
|
-
Err(e) => {
|
|
3385
|
-
set_last_error(e);
|
|
3386
|
-
ptr::null_mut()
|
|
3387
|
-
}
|
|
3388
|
-
},
|
|
3389
|
-
Err(e) => {
|
|
3390
|
-
set_last_error(format!("Failed to serialize extensions: {}", e));
|
|
3391
|
-
ptr::null_mut()
|
|
3392
|
-
}
|
|
3393
|
-
},
|
|
3394
|
-
Err(e) => {
|
|
3395
|
-
set_last_error(e.to_string());
|
|
3396
|
-
ptr::null_mut()
|
|
3397
|
-
}
|
|
3398
|
-
}
|
|
3399
|
-
})
|
|
3400
|
-
}
|
|
3401
|
-
|
|
3402
|
-
/// Load an ExtractionConfig from a file.
|
|
3403
|
-
///
|
|
3404
|
-
/// Automatically detects the file format based on extension:
|
|
3405
|
-
/// - `.toml` - TOML format
|
|
3406
|
-
/// - `.yaml`, `.yml` - YAML format
|
|
3407
|
-
/// - `.json` - JSON format
|
|
3408
|
-
///
|
|
3409
|
-
/// # Safety
|
|
3410
|
-
///
|
|
3411
|
-
/// - `path` must be a valid null-terminated C string representing a file path
|
|
3412
|
-
/// - Returns a pointer to ExtractionConfig on success, NULL on error
|
|
3413
|
-
/// - The returned config must be freed with `kreuzberg_free_config`
|
|
3414
|
-
/// - Check `kreuzberg_last_error` on NULL return
|
|
3415
|
-
///
|
|
3416
|
-
/// # Example (C)
|
|
3417
|
-
///
|
|
3418
|
-
/// ```c
|
|
3419
|
-
/// ExtractionConfig* config = kreuzberg_config_from_file("kreuzberg.toml");
|
|
3420
|
-
/// if (config == NULL) {
|
|
3421
|
-
/// const char* error = kreuzberg_last_error();
|
|
3422
|
-
/// printf("Failed to load config: %s\n", error);
|
|
3423
|
-
/// return 1;
|
|
3424
|
-
/// }
|
|
3425
|
-
///
|
|
3426
|
-
/// // Use config...
|
|
3427
|
-
/// char* result = kreuzberg_extract_file_with_config_sync("document.pdf", config);
|
|
3428
|
-
///
|
|
3429
|
-
/// kreuzberg_free_config(config);
|
|
3430
|
-
/// ```
|
|
3431
|
-
#[unsafe(no_mangle)]
|
|
3432
|
-
pub unsafe extern "C" fn kreuzberg_config_from_file(path: *const c_char) -> *mut ExtractionConfig {
|
|
3433
|
-
ffi_panic_guard!("kreuzberg_config_from_file", {
|
|
3434
|
-
clear_last_error();
|
|
3435
|
-
|
|
3436
|
-
if path.is_null() {
|
|
3437
|
-
set_last_error("Config path cannot be NULL".to_string());
|
|
3438
|
-
return ptr::null_mut();
|
|
3439
|
-
}
|
|
3440
|
-
|
|
3441
|
-
let path_str = match unsafe { CStr::from_ptr(path) }.to_str() {
|
|
3442
|
-
Ok(s) => s,
|
|
3443
|
-
Err(e) => {
|
|
3444
|
-
set_last_error(format!("Invalid UTF-8 in config path: {}", e));
|
|
3445
|
-
return ptr::null_mut();
|
|
3446
|
-
}
|
|
3447
|
-
};
|
|
3448
|
-
|
|
3449
|
-
let path_buf = Path::new(path_str);
|
|
3450
|
-
|
|
3451
|
-
match ExtractionConfig::from_file(path_buf) {
|
|
3452
|
-
Ok(config) => Box::into_raw(Box::new(config)),
|
|
3453
|
-
Err(e) => {
|
|
3454
|
-
// ~keep: IO errors from file operations should bubble up as they indicate
|
|
3455
|
-
match &e {
|
|
3456
|
-
KreuzbergError::Io(io_err) => {
|
|
3457
|
-
set_last_error(format!("IO error loading config: {}", io_err));
|
|
3458
|
-
}
|
|
3459
|
-
_ => {
|
|
3460
|
-
set_last_error(format!("Failed to load config from file: {}", e));
|
|
3461
|
-
}
|
|
3462
|
-
}
|
|
3463
|
-
ptr::null_mut()
|
|
3464
|
-
}
|
|
3465
|
-
}
|
|
3466
|
-
})
|
|
3467
|
-
}
|
|
3468
|
-
|
|
3469
|
-
/// Discover and load an ExtractionConfig by searching parent directories.
|
|
3470
|
-
///
|
|
3471
|
-
/// Searches the current directory and all parent directories for:
|
|
3472
|
-
/// - `kreuzberg.toml`
|
|
3473
|
-
/// - `kreuzberg.yaml`
|
|
3474
|
-
/// - `kreuzberg.yml`
|
|
3475
|
-
/// - `kreuzberg.json`
|
|
3476
|
-
///
|
|
3477
|
-
/// Returns the first config file found as JSON, or NULL if none found.
|
|
3478
|
-
///
|
|
3479
|
-
/// # Safety
|
|
3480
|
-
///
|
|
3481
|
-
/// - The returned string must be freed with `kreuzberg_free_string`
|
|
3482
|
-
/// - Returns NULL if no config found or on error (check `kreuzberg_last_error`)
|
|
3483
|
-
///
|
|
3484
|
-
/// # Example (C)
|
|
3485
|
-
///
|
|
3486
|
-
/// ```c
|
|
3487
|
-
/// char* config_json = kreuzberg_config_discover();
|
|
3488
|
-
/// if (config_json == NULL) {
|
|
3489
|
-
/// const char* error = kreuzberg_last_error();
|
|
3490
|
-
/// if (error != NULL && strlen(error) > 0) {
|
|
3491
|
-
/// printf("Error discovering config: %s\n", error);
|
|
3492
|
-
/// return 1;
|
|
3493
|
-
/// }
|
|
3494
|
-
/// // No config found, use defaults
|
|
3495
|
-
/// printf("No config file found\n");
|
|
3496
|
-
/// } else {
|
|
3497
|
-
/// printf("Config: %s\n", config_json);
|
|
3498
|
-
/// kreuzberg_free_string(config_json);
|
|
3499
|
-
/// }
|
|
3500
|
-
/// ```
|
|
3501
|
-
#[unsafe(no_mangle)]
|
|
3502
|
-
pub unsafe extern "C" fn kreuzberg_config_discover() -> *mut c_char {
|
|
3503
|
-
ffi_panic_guard!("kreuzberg_config_discover", {
|
|
3504
|
-
clear_last_error();
|
|
3505
|
-
|
|
3506
|
-
match ExtractionConfig::discover() {
|
|
3507
|
-
Ok(Some(config)) => match serde_json::to_string(&config) {
|
|
3508
|
-
Ok(json) => match CString::new(json) {
|
|
3509
|
-
Ok(cstr) => cstr.into_raw(),
|
|
3510
|
-
Err(e) => {
|
|
3511
|
-
set_last_error(format!("Failed to serialize config: {}", e));
|
|
3512
|
-
ptr::null_mut()
|
|
3513
|
-
}
|
|
3514
|
-
},
|
|
3515
|
-
Err(e) => {
|
|
3516
|
-
set_last_error(format!("Failed to serialize config: {}", e));
|
|
3517
|
-
ptr::null_mut()
|
|
3518
|
-
}
|
|
3519
|
-
},
|
|
3520
|
-
Ok(None) => ptr::null_mut(),
|
|
3521
|
-
Err(e) => {
|
|
3522
|
-
// ~keep: IO errors from directory traversal should bubble up as they indicate
|
|
3523
|
-
match &e {
|
|
3524
|
-
KreuzbergError::Io(io_err) => {
|
|
3525
|
-
set_last_error(format!("IO error discovering config: {}", io_err));
|
|
3526
|
-
}
|
|
3527
|
-
_ => {
|
|
3528
|
-
set_last_error(format!("Failed to discover config: {}", e));
|
|
3529
|
-
}
|
|
3530
|
-
}
|
|
3531
|
-
ptr::null_mut()
|
|
3532
|
-
}
|
|
3533
|
-
}
|
|
3534
|
-
})
|
|
3535
|
-
}
|
|
3536
|
-
|
|
3537
|
-
#[allow(non_upper_case_globals)]
|
|
3538
|
-
const _: () = {
|
|
3539
|
-
const fn assert_c_extraction_result_size() {
|
|
3540
|
-
const SIZE: usize = std::mem::size_of::<CExtractionResult>();
|
|
3541
|
-
const _: () = assert!(SIZE == 96, "CExtractionResult size must be 96 bytes");
|
|
3542
|
-
}
|
|
3543
|
-
|
|
3544
|
-
const fn assert_c_extraction_result_alignment() {
|
|
3545
|
-
const ALIGN: usize = std::mem::align_of::<CExtractionResult>();
|
|
3546
|
-
const _: () = assert!(ALIGN == 8, "CExtractionResult alignment must be 8 bytes");
|
|
3547
|
-
}
|
|
3548
|
-
|
|
3549
|
-
const fn assert_c_batch_result_size() {
|
|
3550
|
-
const SIZE: usize = std::mem::size_of::<CBatchResult>();
|
|
3551
|
-
const _: () = assert!(SIZE == 24, "CBatchResult size must be 24 bytes");
|
|
3552
|
-
}
|
|
3553
|
-
|
|
3554
|
-
const fn assert_c_batch_result_alignment() {
|
|
3555
|
-
const ALIGN: usize = std::mem::align_of::<CBatchResult>();
|
|
3556
|
-
const _: () = assert!(ALIGN == 8, "CBatchResult alignment must be 8 bytes");
|
|
3557
|
-
}
|
|
3558
|
-
|
|
3559
|
-
const fn assert_c_bytes_with_mime_size() {
|
|
3560
|
-
const SIZE: usize = std::mem::size_of::<CBytesWithMime>();
|
|
3561
|
-
const _: () = assert!(SIZE == 24, "CBytesWithMime size must be 24 bytes");
|
|
3562
|
-
}
|
|
3563
|
-
|
|
3564
|
-
const fn assert_c_bytes_with_mime_alignment() {
|
|
3565
|
-
const ALIGN: usize = std::mem::align_of::<CBytesWithMime>();
|
|
3566
|
-
const _: () = assert!(ALIGN == 8, "CBytesWithMime alignment must be 8 bytes");
|
|
3567
|
-
}
|
|
3568
|
-
|
|
3569
|
-
let _ = assert_c_extraction_result_size;
|
|
3570
|
-
let _ = assert_c_extraction_result_alignment;
|
|
3571
|
-
let _ = assert_c_batch_result_size;
|
|
3572
|
-
let _ = assert_c_batch_result_alignment;
|
|
3573
|
-
let _ = assert_c_bytes_with_mime_size;
|
|
3574
|
-
let _ = assert_c_bytes_with_mime_alignment;
|
|
3575
|
-
};
|
|
3576
|
-
|
|
3577
|
-
#[cfg(test)]
mod tests {
    use super::*;
    use std::ffi::CString;

    /// The version getter must hand back a non-empty, UTF-8 C string.
    #[test]
    fn test_version() {
        // SAFETY: the pointer is checked for NULL before it is read as a C string.
        unsafe {
            let raw_version = kreuzberg_version();
            assert!(!raw_version.is_null());
            let text = CStr::from_ptr(raw_version).to_str().unwrap();
            assert!(!text.is_empty());
        }
    }

    /// A NULL path must fail and leave an error message mentioning "NULL".
    #[test]
    fn test_null_path() {
        // SAFETY: NULL is an input the function is expected to reject; the error
        // pointer is checked for NULL before it is read as a C string.
        unsafe {
            let extracted = kreuzberg_extract_file_sync(ptr::null());
            assert!(extracted.is_null());

            let raw_error = kreuzberg_last_error();
            assert!(!raw_error.is_null());
            let message = CStr::from_ptr(raw_error).to_str().unwrap();
            assert!(message.contains("NULL"));
        }
    }

    /// Extracting a file that does not exist must fail and record an error.
    #[test]
    fn test_nonexistent_file() {
        let missing = CString::new("/nonexistent/file.pdf").unwrap();

        // SAFETY: `missing` outlives the call, so the pointer passed in stays valid.
        unsafe {
            let extracted = kreuzberg_extract_file_sync(missing.as_ptr());
            assert!(extracted.is_null());

            let raw_error = kreuzberg_last_error();
            assert!(!raw_error.is_null());
        }
    }
}
|