kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +11 -11
  6. data/README.md +5 -10
  7. data/examples/async_patterns.rb +0 -1
  8. data/ext/kreuzberg_rb/extconf.rb +0 -10
  9. data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
  10. data/ext/kreuzberg_rb/native/build.rs +2 -0
  11. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  12. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  13. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  14. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  15. data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
  16. data/kreuzberg.gemspec +14 -57
  17. data/lib/kreuzberg/cache_api.rb +0 -1
  18. data/lib/kreuzberg/cli.rb +2 -2
  19. data/lib/kreuzberg/config.rb +2 -9
  20. data/lib/kreuzberg/errors.rb +7 -75
  21. data/lib/kreuzberg/extraction_api.rb +0 -1
  22. data/lib/kreuzberg/setup_lib_path.rb +0 -1
  23. data/lib/kreuzberg/version.rb +1 -1
  24. data/lib/kreuzberg.rb +0 -21
  25. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  26. data/sig/kreuzberg.rbs +3 -55
  27. data/spec/binding/cli_proxy_spec.rb +4 -2
  28. data/spec/binding/cli_spec.rb +11 -12
  29. data/spec/examples.txt +104 -0
  30. data/spec/fixtures/config.yaml +1 -0
  31. data/spec/spec_helper.rb +1 -1
  32. data/vendor/kreuzberg/Cargo.toml +42 -112
  33. data/vendor/kreuzberg/README.md +2 -2
  34. data/vendor/kreuzberg/build.rs +4 -18
  35. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  36. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  37. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  38. data/vendor/kreuzberg/src/core/extractor.rs +81 -202
  39. data/vendor/kreuzberg/src/core/io.rs +2 -4
  40. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  41. data/vendor/kreuzberg/src/core/mod.rs +1 -4
  42. data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
  43. data/vendor/kreuzberg/src/embeddings.rs +16 -125
  44. data/vendor/kreuzberg/src/error.rs +1 -1
  45. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/image.rs +13 -13
  47. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
  48. data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
  49. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  50. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  51. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  52. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  53. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  54. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  55. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  56. data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
  57. data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
  58. data/vendor/kreuzberg/src/extractors/email.rs +0 -14
  59. data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
  60. data/vendor/kreuzberg/src/extractors/html.rs +154 -137
  61. data/vendor/kreuzberg/src/extractors/image.rs +4 -7
  62. data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
  63. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  64. data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
  65. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
  66. data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
  67. data/vendor/kreuzberg/src/extractors/text.rs +5 -23
  68. data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
  69. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  70. data/vendor/kreuzberg/src/lib.rs +1 -4
  71. data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
  72. data/vendor/kreuzberg/src/mcp/server.rs +3 -5
  73. data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
  74. data/vendor/kreuzberg/src/pdf/error.rs +1 -1
  75. data/vendor/kreuzberg/src/pdf/table.rs +44 -17
  76. data/vendor/kreuzberg/src/pdf/text.rs +3 -0
  77. data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
  78. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
  79. data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
  80. data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
  81. data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
  82. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  83. data/vendor/kreuzberg/src/types.rs +12 -42
  84. data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
  85. data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
  86. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  87. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
  88. data/vendor/kreuzberg/tests/config_features.rs +0 -18
  89. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
  90. data/vendor/kreuzberg/tests/core_integration.rs +7 -24
  91. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  92. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  93. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
  95. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
  97. data/vendor/kreuzberg/tests/security_validation.rs +1 -12
  98. metadata +25 -90
  99. data/.rubocop.yml +0 -538
  100. data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
  101. data/lib/kreuzberg/error_context.rb +0 -32
  102. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  103. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
  104. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
  105. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
  106. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
  107. data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
  108. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
  109. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  110. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
  112. data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
  114. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  115. data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
  116. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
  117. data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
  119. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  121. data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
  122. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  123. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  124. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  125. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
  126. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  127. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  128. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  129. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  130. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  131. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  132. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  133. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  134. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  135. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
  136. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  137. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  138. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
  139. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
  140. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
  141. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
  142. data/vendor/rb-sys/.cargo-ok +0 -1
  143. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  144. data/vendor/rb-sys/Cargo.lock +0 -393
  145. data/vendor/rb-sys/Cargo.toml +0 -70
  146. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  147. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  148. data/vendor/rb-sys/LICENSE-MIT +0 -21
  149. data/vendor/rb-sys/bin/release.sh +0 -21
  150. data/vendor/rb-sys/build/features.rs +0 -108
  151. data/vendor/rb-sys/build/main.rs +0 -246
  152. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  153. data/vendor/rb-sys/build/version.rs +0 -48
  154. data/vendor/rb-sys/readme.md +0 -36
  155. data/vendor/rb-sys/src/bindings.rs +0 -21
  156. data/vendor/rb-sys/src/hidden.rs +0 -11
  157. data/vendor/rb-sys/src/lib.rs +0 -34
  158. data/vendor/rb-sys/src/macros.rs +0 -371
  159. data/vendor/rb-sys/src/memory.rs +0 -53
  160. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  161. data/vendor/rb-sys/src/special_consts.rs +0 -31
  162. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  163. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  164. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  165. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  166. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  167. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  168. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  169. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  170. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  171. data/vendor/rb-sys/src/stable_api.rs +0 -261
  172. data/vendor/rb-sys/src/symbol.rs +0 -31
  173. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  174. data/vendor/rb-sys/src/utils.rs +0 -89
  175. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -0,0 +1,455 @@
1
+ //! Profile memory usage for extracting documents.
2
+ //!
3
+ //! This utility can run against a single file or a batch of files (via `--input-list`).
4
+ //! For each input it prints or writes a JSON object containing duration, peak RSS,
5
+ //! optional flamegraph path, and the top hot functions when CPU profiling is enabled.
6
+
7
+ use std::env;
8
+ use std::fs::{File, create_dir_all};
9
+ use std::io::{BufRead, BufReader};
10
+ use std::path::{Path, PathBuf};
11
+ use std::time::Instant;
12
+
13
+ use kreuzberg::core::config::ExtractionConfig;
14
+ use kreuzberg::core::extractor::extract_file_sync;
15
+ use serde::Serialize;
16
+
17
+ #[cfg(feature = "profiling")]
18
+ use pprof::{ProfilerGuardBuilder, Report};
19
+
20
+ #[cfg(feature = "profiling")]
21
+ use std::collections::HashMap;
22
+
23
/// Converts a raw `ru_maxrss` reading into the unit used by `max_rss_kb`.
///
/// On macOS the raw value is divided by 1024; on every other target it is
/// returned unchanged.
fn normalize_rss(value: i64) -> i64 {
    // `cfg!` is resolved at compile time, so only one branch survives codegen.
    if cfg!(target_os = "macos") {
        value / 1024
    } else {
        value
    }
}
37
+
38
+ #[cfg(unix)]
39
+ fn max_rss_kb() -> Option<i64> {
40
+ use std::mem::MaybeUninit;
41
+
42
+ let mut usage = MaybeUninit::<libc::rusage>::uninit();
43
+ let rc = unsafe { libc::getrusage(libc::RUSAGE_SELF, usage.as_mut_ptr()) };
44
+ if rc != 0 {
45
+ return None;
46
+ }
47
+ let usage = unsafe { usage.assume_init() };
48
+ Some(normalize_rss(usage.ru_maxrss))
49
+ }
50
+
51
+ #[cfg(not(unix))]
52
+ fn max_rss_kb() -> Option<i64> {
53
+ None
54
+ }
55
+
56
/// Command-line options collected by `parse_options`.
#[derive(Debug)]
struct Options {
    /// Positional arguments: the files to profile.
    inputs: Vec<PathBuf>,
    /// Value of `--input-list`: a file naming additional inputs, one per line.
    input_list: Option<PathBuf>,
    /// Value of `--flamegraph`: SVG destination, honoured only for a single input.
    flamegraph: Option<PathBuf>,
    /// Value of `--flamegraph-dir`: directory receiving one SVG per input.
    flamegraph_dir: Option<PathBuf>,
    /// Value of `--output-json`: JSON destination, honoured only for a single input.
    output_json: Option<PathBuf>,
    /// Value of `--output-dir`: directory receiving one JSON file per input.
    output_dir: Option<PathBuf>,
}
65
+
66
/// One row of the "top functions" table produced from a CPU profile.
#[derive(Serialize, Clone)]
struct FunctionSample {
    /// Symbol name as reported by the profiler.
    function: String,
    /// Sample count accumulated for this symbol across all recorded stacks.
    samples: i64,
    /// `samples` as a percentage (0–100) of the summed counts of all symbols.
    percentage: f64,
}
72
+
73
/// JSON summary emitted for a single profiled input.
#[derive(Serialize, Clone)]
struct ProfileOutput {
    /// Display form of the input path.
    input: String,
    /// Wall-clock time spent in extraction, in seconds.
    duration_secs: f64,
    /// `max_rss_kb` reading taken after extraction, falling back to the reading
    /// taken before it; `None` when neither is available.
    peak_rss_kb: Option<i64>,
    /// Difference between the after and before `max_rss_kb` readings; `None`
    /// unless both are available.
    delta_rss_kb: Option<i64>,
    /// Path of the flamegraph SVG, when CPU profiling produced a report.
    flamegraph: Option<String>,
    /// Hottest functions from the CPU profile, when one was collected and non-empty.
    top_functions: Option<Vec<FunctionSample>>,
}
82
+
83
/// Writes the command-line help text to standard error.
fn print_usage() {
    const USAGE: &str = "Usage: profile_extract [options] <file ...>\n\nOptions:\n --flamegraph <path> Write flamegraph SVG (single input)\n --flamegraph-dir <dir> Write flamegraph SVGs for each input\n --output-json <path> Write JSON output (single input)\n --output-dir <dir> Write per-file JSON outputs to directory\n --input-list <path> File with newline-separated input paths\n -h, --help Show this help message";
    eprintln!("{}", USAGE);
}
88
+
89
+ fn parse_options() -> Options {
90
+ let mut args = env::args().skip(1);
91
+ let mut inputs = Vec::new();
92
+ let mut input_list = None;
93
+ let mut flamegraph = None;
94
+ let mut flamegraph_dir = None;
95
+ let mut output_json = None;
96
+ let mut output_dir = None;
97
+
98
+ while let Some(arg) = args.next() {
99
+ match arg.as_str() {
100
+ "--flamegraph" => {
101
+ let path = args.next().unwrap_or_else(|| {
102
+ print_usage();
103
+ std::process::exit(64);
104
+ });
105
+ flamegraph = Some(PathBuf::from(path));
106
+ }
107
+ "--flamegraph-dir" => {
108
+ let path = args.next().unwrap_or_else(|| {
109
+ print_usage();
110
+ std::process::exit(64);
111
+ });
112
+ flamegraph_dir = Some(PathBuf::from(path));
113
+ }
114
+ "--output-json" => {
115
+ let path = args.next().unwrap_or_else(|| {
116
+ print_usage();
117
+ std::process::exit(64);
118
+ });
119
+ output_json = Some(PathBuf::from(path));
120
+ }
121
+ "--output-dir" => {
122
+ let path = args.next().unwrap_or_else(|| {
123
+ print_usage();
124
+ std::process::exit(64);
125
+ });
126
+ output_dir = Some(PathBuf::from(path));
127
+ }
128
+ "--input-list" => {
129
+ let path = args.next().unwrap_or_else(|| {
130
+ print_usage();
131
+ std::process::exit(64);
132
+ });
133
+ input_list = Some(PathBuf::from(path));
134
+ }
135
+ "-h" | "--help" => {
136
+ print_usage();
137
+ std::process::exit(0);
138
+ }
139
+ _ if arg.starts_with("--") => {
140
+ eprintln!("Unknown option: {arg}");
141
+ print_usage();
142
+ std::process::exit(64);
143
+ }
144
+ _ => inputs.push(PathBuf::from(arg)),
145
+ }
146
+ }
147
+
148
+ Options {
149
+ inputs,
150
+ input_list,
151
+ flamegraph,
152
+ flamegraph_dir,
153
+ output_json,
154
+ output_dir,
155
+ }
156
+ }
157
+
158
/// Derives a file-system-friendly stem from `path` for naming per-input outputs.
///
/// Uses the final path component when it is valid UTF-8, otherwise the lossy rendering
/// of the whole path. Every character that is not ASCII alphanumeric becomes `_`.
/// An empty result is replaced by `"output"`.
fn sanitize_file_name(path: &Path) -> String {
    let raw: String = match path.file_name().and_then(|component| component.to_str()) {
        Some(component) => component.to_owned(),
        None => path.to_string_lossy().into_owned(),
    };

    let mut cleaned = String::with_capacity(raw.len());
    for ch in raw.chars() {
        cleaned.push(if ch.is_ascii_alphanumeric() { ch } else { '_' });
    }

    if cleaned.is_empty() {
        String::from("output")
    } else {
        cleaned
    }
}
177
+
178
/// Loads input paths from a newline-separated list file.
///
/// Each line is trimmed; blank lines and lines starting with `#` are skipped.
///
/// # Errors
///
/// Returns a descriptive message when the file cannot be opened or a line cannot be
/// read; reading stops at the first failing line.
fn read_inputs_from_file(list_path: &Path) -> Result<Vec<PathBuf>, String> {
    let handle = match File::open(list_path) {
        Ok(handle) => handle,
        Err(e) => return Err(format!("Failed to open input list {}: {e}", list_path.display())),
    };

    // Collecting into `Result` short-circuits on the first read error.
    BufReader::new(handle)
        .lines()
        .filter_map(|entry| match entry {
            Ok(raw) => {
                let candidate = raw.trim();
                if candidate.is_empty() || candidate.starts_with('#') {
                    None
                } else {
                    Some(Ok(PathBuf::from(candidate)))
                }
            }
            Err(e) => Some(Err(format!(
                "Failed to read line from {}: {e}",
                list_path.display()
            ))),
        })
        .collect()
}
192
+
193
+ fn main() {
194
+ let options = parse_options();
195
+
196
+ let mut targets = options.inputs.clone();
197
+ if let Some(list_path) = &options.input_list {
198
+ match read_inputs_from_file(list_path) {
199
+ Ok(mut list_inputs) => targets.append(&mut list_inputs),
200
+ Err(err) => {
201
+ eprintln!("{err}");
202
+ std::process::exit(66);
203
+ }
204
+ }
205
+ }
206
+
207
+ if targets.is_empty() {
208
+ eprintln!("No input files specified");
209
+ print_usage();
210
+ std::process::exit(64);
211
+ }
212
+
213
+ let multiple = targets.len() > 1;
214
+
215
+ if multiple && options.flamegraph.is_some() && options.flamegraph_dir.is_none() {
216
+ eprintln!("Note: --flamegraph applies to a single input. Use --flamegraph-dir for batch runs.");
217
+ }
218
+
219
+ if multiple && options.output_json.is_some() && options.output_dir.is_none() {
220
+ eprintln!("Note: --output-json applies to a single input. Use --output-dir for batch runs.");
221
+ }
222
+
223
+ if let Some(dir) = &options.output_dir
224
+ && let Err(err) = create_dir_all(dir)
225
+ {
226
+ eprintln!("Failed to create output directory {}: {err}", dir.display());
227
+ }
228
+
229
+ if let Some(dir) = &options.flamegraph_dir
230
+ && let Err(err) = create_dir_all(dir)
231
+ {
232
+ eprintln!("Failed to create flamegraph directory {}: {err}", dir.display());
233
+ }
234
+
235
+ let mut aggregated_results: Vec<ProfileOutput> = Vec::new();
236
+
237
+ for target in targets {
238
+ let flamegraph_path = if let Some(dir) = &options.flamegraph_dir {
239
+ Some(dir.join(format!("{}.svg", sanitize_file_name(&target))))
240
+ } else if !multiple {
241
+ options.flamegraph.clone()
242
+ } else {
243
+ None
244
+ };
245
+
246
+ let output_json_path = if let Some(dir) = &options.output_dir {
247
+ Some(dir.join(format!("{}.json", sanitize_file_name(&target))))
248
+ } else if !multiple {
249
+ options.output_json.clone()
250
+ } else {
251
+ None
252
+ };
253
+
254
+ match run_profile(&target, flamegraph_path.clone()) {
255
+ Ok(profile) => {
256
+ if let Some(json_path) = output_json_path {
257
+ if let Some(parent) = json_path.parent()
258
+ && let Err(err) = create_dir_all(parent)
259
+ {
260
+ eprintln!("Failed to create output directory {}: {err}", parent.display());
261
+ }
262
+
263
+ match File::create(&json_path) {
264
+ Ok(file) => {
265
+ if let Err(err) = serde_json::to_writer_pretty(file, &profile) {
266
+ eprintln!("Failed to write JSON output {}: {err}", json_path.display());
267
+ } else {
268
+ eprintln!("Profile summary written to {}", json_path.display());
269
+ }
270
+ }
271
+ Err(err) => eprintln!("Failed to create JSON output file {}: {err}", json_path.display()),
272
+ }
273
+ } else {
274
+ aggregated_results.push(profile);
275
+ }
276
+ }
277
+ Err(err) => {
278
+ eprintln!("{}: {err}", target.display());
279
+ }
280
+ }
281
+ }
282
+
283
+ if options.output_json.is_none() && options.output_dir.is_none() {
284
+ if aggregated_results.len() == 1 {
285
+ if let Ok(json) = serde_json::to_string_pretty(&aggregated_results[0]) {
286
+ println!("{json}");
287
+ }
288
+ } else if !aggregated_results.is_empty()
289
+ && let Ok(json) = serde_json::to_string_pretty(&aggregated_results)
290
+ {
291
+ println!("{json}");
292
+ }
293
+ }
294
+ }
295
+
296
+ fn run_profile(path: &Path, flamegraph_path: Option<PathBuf>) -> Result<ProfileOutput, String> {
297
+ if !path.exists() {
298
+ return Err("Input file does not exist".to_string());
299
+ }
300
+
301
+ #[cfg(feature = "profiling")]
302
+ let guard = if flamegraph_path.is_some() {
303
+ #[cfg_attr(not(target_os = "macos"), allow(unused_mut))]
304
+ let mut builder = ProfilerGuardBuilder::default().frequency(100);
305
+
306
+ #[cfg(target_os = "macos")]
307
+ {
308
+ builder = builder.blocklist(&[
309
+ "libsystem_kernel.dylib",
310
+ "libsystem_pthread.dylib",
311
+ "libsystem_platform.dylib",
312
+ "libdyld.dylib",
313
+ ]);
314
+ }
315
+
316
+ match builder.build() {
317
+ Ok(guard) => Some(guard),
318
+ Err(err) => {
319
+ eprintln!("Failed to start profiler: {err}");
320
+ None
321
+ }
322
+ }
323
+ } else {
324
+ None
325
+ };
326
+
327
+ #[cfg(not(feature = "profiling"))]
328
+ if flamegraph_path.is_some() {
329
+ eprintln!(
330
+ "--flamegraph requested but build missing 'profiling' feature; recompile with `--features profiling`."
331
+ );
332
+ }
333
+
334
+ let start_rss = max_rss_kb();
335
+ let start = Instant::now();
336
+
337
+ let config = ExtractionConfig::default();
338
+ let result = extract_file_sync(path, None, &config).map_err(|e| format!("Extraction failed: {e:?}"))?;
339
+ let _ = result;
340
+ let duration = start.elapsed();
341
+ let end_rss = max_rss_kb();
342
+
343
+ #[cfg(feature = "profiling")]
344
+ let (flamegraph_path_str, top_functions) = match (flamegraph_path.clone(), guard) {
345
+ (Some(path), Some(guard)) => match guard.report().build() {
346
+ Ok(report) => {
347
+ if let Some(parent) = path.parent()
348
+ && let Err(err) = create_dir_all(parent)
349
+ {
350
+ eprintln!("Failed to create flamegraph directory: {err}");
351
+ }
352
+
353
+ match File::create(&path) {
354
+ Ok(mut file) => {
355
+ if let Err(err) = report.flamegraph(&mut file) {
356
+ eprintln!("Failed to write flamegraph: {err}");
357
+ }
358
+ }
359
+ Err(err) => {
360
+ eprintln!("Failed to create flamegraph file {}: {err}", path.display());
361
+ }
362
+ }
363
+
364
+ let tops = summarize_top_functions(&report, 10);
365
+ let filtered = if tops.is_empty() { None } else { Some(tops) };
366
+ (Some(path.display().to_string()), filtered)
367
+ }
368
+ Err(err) => {
369
+ eprintln!("Failed to build profiling report: {err}");
370
+ (None, None)
371
+ }
372
+ },
373
+ (Some(path), None) => {
374
+ eprintln!("Profiler guard was not initialised; skipping flamegraph generation");
375
+ if let Some(parent) = path.parent() {
376
+ let _ = create_dir_all(parent);
377
+ }
378
+ (None, None)
379
+ }
380
+ _ => (None, None),
381
+ };
382
+
383
+ #[cfg(not(feature = "profiling"))]
384
+ let (flamegraph_path_str, top_functions): (Option<String>, Option<Vec<FunctionSample>>) = (None, None);
385
+
386
+ let peak_kb = end_rss.or(start_rss);
387
+ let delta_kb = match (start_rss, end_rss) {
388
+ (Some(before), Some(after)) => Some(after.saturating_sub(before)),
389
+ _ => None,
390
+ };
391
+
392
+ Ok(ProfileOutput {
393
+ input: path.display().to_string(),
394
+ duration_secs: duration.as_secs_f64(),
395
+ peak_rss_kb: peak_kb,
396
+ delta_rss_kb: delta_kb,
397
+ flamegraph: flamegraph_path_str,
398
+ top_functions,
399
+ })
400
+ }
401
+
402
/// Builds the "top functions" table from a profiler report.
///
/// Every symbol in every recorded stack is credited with that stack's sample count.
/// Entries are ordered by descending count, with ties broken by function name so the
/// output is stable between runs. Up to `limit` entries are returned, preferring
/// symbols that do not look like system or runtime frames (names starting with `__`
/// or containing `libsystem` / `dyld`); when that filter leaves nothing, the
/// unfiltered top `limit` entries are returned instead.
#[cfg(feature = "profiling")]
fn summarize_top_functions(report: &Report, limit: usize) -> Vec<FunctionSample> {
    let mut totals: HashMap<String, i64> = HashMap::new();

    for (frames, count) in &report.data {
        let count = *count as i64;
        if count <= 0 {
            continue;
        }

        for symbol in frames.frames.iter().flatten() {
            *totals.entry(symbol.name()).or_insert(0) += count;
        }
    }

    // NOTE(review): the denominator is the sum over all per-symbol totals (every frame
    // occurrence), not the number of samples, so a function present in every stack does
    // not reach 100% — confirm this is the intended meaning of `percentage`.
    let total_counts: i64 = totals.values().copied().sum();

    let mut summary: Vec<FunctionSample> = totals
        .into_iter()
        .map(|(function, samples)| {
            let percentage = if total_counts > 0 {
                (samples as f64 / total_counts as f64) * 100.0
            } else {
                0.0
            };
            FunctionSample {
                function,
                samples,
                percentage,
            }
        })
        .collect();

    // `HashMap` iteration order is arbitrary; without the name tie-break, equal counts
    // would be ordered (and cut off by `limit`) differently from run to run.
    summary.sort_by(|a, b| b.samples.cmp(&a.samples).then_with(|| a.function.cmp(&b.function)));

    let filtered: Vec<FunctionSample> = summary
        .iter()
        .filter(|entry| {
            let name = entry.function.as_str();
            !name.starts_with("__") && !name.contains("libsystem") && !name.contains("dyld")
        })
        .take(limit)
        .cloned()
        .collect();

    if filtered.is_empty() {
        summary.into_iter().take(limit).collect()
    } else {
        filtered
    }
}
@@ -201,13 +201,6 @@ impl GenericCache {
201
201
  }
202
202
  }
203
203
 
204
- #[cfg_attr(feature = "otel", tracing::instrument(
205
- skip(self),
206
- fields(
207
- cache.hit = tracing::field::Empty,
208
- cache.key = %cache_key,
209
- )
210
- ))]
211
204
  pub fn get(&self, cache_key: &str, source_file: Option<&str>) -> Result<Option<Vec<u8>>> {
212
205
  let cache_path = self.get_cache_path(cache_key);
213
206
 
@@ -217,24 +210,16 @@ impl GenericCache {
217
210
  .lock()
218
211
  .map_err(|e| KreuzbergError::LockPoisoned(format!("Deleting files mutex poisoned: {}", e)))?;
219
212
  if deleting.contains(&cache_path) {
220
- #[cfg(feature = "otel")]
221
- tracing::Span::current().record("cache.hit", false);
222
213
  return Ok(None);
223
214
  }
224
215
  }
225
216
 
226
217
  if !self.is_valid(&cache_path, source_file) {
227
- #[cfg(feature = "otel")]
228
- tracing::Span::current().record("cache.hit", false);
229
218
  return Ok(None);
230
219
  }
231
220
 
232
221
  match fs::read(&cache_path) {
233
- Ok(content) => {
234
- #[cfg(feature = "otel")]
235
- tracing::Span::current().record("cache.hit", true);
236
- Ok(Some(content))
237
- }
222
+ Ok(content) => Ok(Some(content)),
238
223
  Err(_) => {
239
224
  // Best-effort cleanup of corrupted cache files ~keep
240
225
  if let Err(e) = fs::remove_file(&cache_path) {
@@ -243,30 +228,21 @@ impl GenericCache {
243
228
  if let Err(e) = fs::remove_file(self.get_metadata_path(cache_key)) {
244
229
  tracing::debug!("Failed to remove corrupted metadata file: {}", e);
245
230
  }
246
- #[cfg(feature = "otel")]
247
- tracing::Span::current().record("cache.hit", false);
248
231
  Ok(None)
249
232
  }
250
233
  }
251
234
  }
252
235
 
253
- #[cfg_attr(feature = "otel", tracing::instrument(
254
- skip(self, data),
255
- fields(
256
- cache.key = %cache_key,
257
- cache.size_bytes = data.len(),
258
- )
259
- ))]
260
236
  pub fn set(&self, cache_key: &str, data: Vec<u8>, source_file: Option<&str>) -> Result<()> {
261
237
  let cache_path = self.get_cache_path(cache_key);
262
238
 
263
- fs::write(&cache_path, &data)
239
+ fs::write(&cache_path, data)
264
240
  .map_err(|e| KreuzbergError::cache(format!("Failed to write cache file: {}", e)))?;
265
241
 
266
242
  self.save_metadata(cache_key, source_file);
267
243
 
268
244
  let count = self.write_counter.fetch_add(1, Ordering::Relaxed);
269
- if count.is_multiple_of(100)
245
+ if count % 100 == 0
270
246
  && let Some(cache_path_str) = self.cache_dir.to_str()
271
247
  {
272
248
  // Cache cleanup failure - safe to ignore, cache is optional fallback ~keep
@@ -33,63 +33,3 @@ where
33
33
  {
34
34
  BATCH_MODE.scope(Cell::new(true), future).await
35
35
  }
36
-
37
- #[cfg(test)]
38
- mod tests {
39
- use super::*;
40
-
41
- #[tokio::test]
42
- async fn test_batch_mode_not_set_by_default() {
43
- let result = is_batch_mode();
44
- assert!(!result, "batch mode should be false by default");
45
- }
46
-
47
- #[tokio::test]
48
- async fn test_with_batch_mode_sets_flag() {
49
- let result = with_batch_mode(async { is_batch_mode() }).await;
50
-
51
- assert!(result, "batch mode should be true inside with_batch_mode");
52
- }
53
-
54
- #[tokio::test]
55
- async fn test_batch_mode_scoped_to_future() {
56
- assert!(!is_batch_mode(), "batch mode should be false before");
57
-
58
- with_batch_mode(async {
59
- assert!(is_batch_mode(), "batch mode should be true inside");
60
- })
61
- .await;
62
-
63
- assert!(!is_batch_mode(), "batch mode should be false after future completes");
64
- }
65
-
66
- #[tokio::test]
67
- async fn test_nested_batch_mode_calls() {
68
- let result = with_batch_mode(async {
69
- let outer = is_batch_mode();
70
- let inner = with_batch_mode(async { is_batch_mode() }).await;
71
- (outer, inner)
72
- })
73
- .await;
74
-
75
- assert!(result.0, "outer batch mode should be true");
76
- assert!(result.1, "inner batch mode should be true");
77
- }
78
-
79
- #[tokio::test]
80
- async fn test_batch_mode_unaffected_after_with_batch_mode() {
81
- with_batch_mode(async {
82
- assert!(is_batch_mode(), "first call should set batch mode");
83
- })
84
- .await;
85
-
86
- assert!(!is_batch_mode(), "batch mode should be false between calls");
87
-
88
- with_batch_mode(async {
89
- assert!(is_batch_mode(), "second call should set batch mode");
90
- })
91
- .await;
92
-
93
- assert!(!is_batch_mode(), "batch mode should be false after all calls");
94
- }
95
- }