kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +11 -11
  6. data/README.md +5 -10
  7. data/examples/async_patterns.rb +0 -1
  8. data/ext/kreuzberg_rb/extconf.rb +0 -10
  9. data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
  10. data/ext/kreuzberg_rb/native/build.rs +2 -0
  11. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  12. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  13. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  14. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  15. data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
  16. data/kreuzberg.gemspec +14 -57
  17. data/lib/kreuzberg/cache_api.rb +0 -1
  18. data/lib/kreuzberg/cli.rb +2 -2
  19. data/lib/kreuzberg/config.rb +2 -9
  20. data/lib/kreuzberg/errors.rb +7 -75
  21. data/lib/kreuzberg/extraction_api.rb +0 -1
  22. data/lib/kreuzberg/setup_lib_path.rb +0 -1
  23. data/lib/kreuzberg/version.rb +1 -1
  24. data/lib/kreuzberg.rb +0 -21
  25. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  26. data/sig/kreuzberg.rbs +3 -55
  27. data/spec/binding/cli_proxy_spec.rb +4 -2
  28. data/spec/binding/cli_spec.rb +11 -12
  29. data/spec/examples.txt +104 -0
  30. data/spec/fixtures/config.yaml +1 -0
  31. data/spec/spec_helper.rb +1 -1
  32. data/vendor/kreuzberg/Cargo.toml +42 -112
  33. data/vendor/kreuzberg/README.md +2 -2
  34. data/vendor/kreuzberg/build.rs +4 -18
  35. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  36. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  37. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  38. data/vendor/kreuzberg/src/core/extractor.rs +81 -202
  39. data/vendor/kreuzberg/src/core/io.rs +2 -4
  40. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  41. data/vendor/kreuzberg/src/core/mod.rs +1 -4
  42. data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
  43. data/vendor/kreuzberg/src/embeddings.rs +16 -125
  44. data/vendor/kreuzberg/src/error.rs +1 -1
  45. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/image.rs +13 -13
  47. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
  48. data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
  49. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  50. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  51. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  52. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  53. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  54. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  55. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  56. data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
  57. data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
  58. data/vendor/kreuzberg/src/extractors/email.rs +0 -14
  59. data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
  60. data/vendor/kreuzberg/src/extractors/html.rs +154 -137
  61. data/vendor/kreuzberg/src/extractors/image.rs +4 -7
  62. data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
  63. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  64. data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
  65. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
  66. data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
  67. data/vendor/kreuzberg/src/extractors/text.rs +5 -23
  68. data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
  69. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  70. data/vendor/kreuzberg/src/lib.rs +1 -4
  71. data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
  72. data/vendor/kreuzberg/src/mcp/server.rs +3 -5
  73. data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
  74. data/vendor/kreuzberg/src/pdf/error.rs +1 -1
  75. data/vendor/kreuzberg/src/pdf/table.rs +44 -17
  76. data/vendor/kreuzberg/src/pdf/text.rs +3 -0
  77. data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
  78. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
  79. data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
  80. data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
  81. data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
  82. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  83. data/vendor/kreuzberg/src/types.rs +12 -42
  84. data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
  85. data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
  86. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  87. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
  88. data/vendor/kreuzberg/tests/config_features.rs +0 -18
  89. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
  90. data/vendor/kreuzberg/tests/core_integration.rs +7 -24
  91. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  92. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  93. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
  95. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
  97. data/vendor/kreuzberg/tests/security_validation.rs +1 -12
  98. metadata +25 -90
  99. data/.rubocop.yml +0 -538
  100. data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
  101. data/lib/kreuzberg/error_context.rb +0 -32
  102. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  103. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
  104. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
  105. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
  106. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
  107. data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
  108. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
  109. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  110. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
  112. data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
  114. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  115. data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
  116. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
  117. data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
  119. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  121. data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
  122. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  123. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  124. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  125. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
  126. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  127. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  128. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  129. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  130. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  131. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  132. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  133. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  134. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  135. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
  136. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  137. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  138. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
  139. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
  140. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
  141. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
  142. data/vendor/rb-sys/.cargo-ok +0 -1
  143. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  144. data/vendor/rb-sys/Cargo.lock +0 -393
  145. data/vendor/rb-sys/Cargo.toml +0 -70
  146. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  147. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  148. data/vendor/rb-sys/LICENSE-MIT +0 -21
  149. data/vendor/rb-sys/bin/release.sh +0 -21
  150. data/vendor/rb-sys/build/features.rs +0 -108
  151. data/vendor/rb-sys/build/main.rs +0 -246
  152. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  153. data/vendor/rb-sys/build/version.rs +0 -48
  154. data/vendor/rb-sys/readme.md +0 -36
  155. data/vendor/rb-sys/src/bindings.rs +0 -21
  156. data/vendor/rb-sys/src/hidden.rs +0 -11
  157. data/vendor/rb-sys/src/lib.rs +0 -34
  158. data/vendor/rb-sys/src/macros.rs +0 -371
  159. data/vendor/rb-sys/src/memory.rs +0 -53
  160. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  161. data/vendor/rb-sys/src/special_consts.rs +0 -31
  162. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  163. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  164. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  165. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  166. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  167. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  168. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  169. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  170. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  171. data/vendor/rb-sys/src/stable_api.rs +0 -261
  172. data/vendor/rb-sys/src/symbol.rs +0 -31
  173. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  174. data/vendor/rb-sys/src/utils.rs +0 -89
  175. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -1,228 +0,0 @@
1
- #![cfg(feature = "office")]
2
-
3
- use kreuzberg::core::config::ExtractionConfig;
4
- use kreuzberg::plugins::DocumentExtractor;
5
- use std::path::PathBuf;
6
-
7
- /// Helper to get absolute path to test documents
8
- fn test_file_path(filename: &str) -> PathBuf {
9
- let manifest_dir = env!("CARGO_MANIFEST_DIR");
10
- PathBuf::from(manifest_dir)
11
- .parent()
12
- .unwrap()
13
- .parent()
14
- .unwrap()
15
- .join("test_documents")
16
- .join("fictionbook")
17
- .join(filename)
18
- }
19
-
20
- #[tokio::test]
21
- async fn test_fictionbook_extract_metadata_title() {
22
- let extractor = kreuzberg::extractors::FictionBookExtractor::new();
23
- let path = test_file_path("meta.fb2");
24
-
25
- let result = extractor
26
- .extract_file(&path, "application/x-fictionbook+xml", &ExtractionConfig::default())
27
- .await
28
- .expect("Failed to extract FB2 file");
29
-
30
- assert!(
31
- result.content.contains("Book title"),
32
- "Book title should be extracted from FB2 content"
33
- );
34
- }
35
-
36
- #[tokio::test]
37
- async fn test_fictionbook_extract_metadata_genre() {
38
- let extractor = kreuzberg::extractors::FictionBookExtractor::new();
39
- let path = test_file_path("meta.fb2");
40
-
41
- let result = extractor
42
- .extract_file(&path, "application/x-fictionbook+xml", &ExtractionConfig::default())
43
- .await
44
- .expect("Failed to extract FB2 file");
45
-
46
- assert!(result.metadata.subject.is_none());
47
- }
48
-
49
- #[tokio::test]
50
- async fn test_fictionbook_extract_content_sections() {
51
- let extractor = kreuzberg::extractors::FictionBookExtractor::new();
52
- let path = test_file_path("titles.fb2");
53
-
54
- let result = extractor
55
- .extract_file(&path, "application/x-fictionbook+xml", &ExtractionConfig::default())
56
- .await
57
- .expect("Failed to extract FB2 file");
58
-
59
- assert!(
60
- result.content.contains("Simple title"),
61
- "Section titles should be extracted"
62
- );
63
- assert!(
64
- result.content.contains("Emphasized"),
65
- "Section with emphasis should be extracted"
66
- );
67
- }
68
-
69
- #[tokio::test]
70
- async fn test_fictionbook_extract_section_hierarchy() {
71
- let extractor = kreuzberg::extractors::FictionBookExtractor::new();
72
- let path = test_file_path("basic.fb2");
73
-
74
- let result = extractor
75
- .extract_file(&path, "application/x-fictionbook+xml", &ExtractionConfig::default())
76
- .await
77
- .expect("Failed to extract FB2 file");
78
-
79
- assert!(
80
- result.content.contains("Top-level title"),
81
- "Top-level section should be extracted"
82
- );
83
- assert!(result.content.contains("Section"), "Nested section should be extracted");
84
- assert!(
85
- result.content.contains("Subsection"),
86
- "Nested subsection should be extracted"
87
- );
88
- }
89
-
90
- #[tokio::test]
91
- async fn test_fictionbook_extract_inline_markup() {
92
- let extractor = kreuzberg::extractors::FictionBookExtractor::new();
93
- let path = test_file_path("emphasis.fb2");
94
-
95
- let result = extractor
96
- .extract_file(&path, "application/x-fictionbook+xml", &ExtractionConfig::default())
97
- .await
98
- .expect("Failed to extract FB2 file");
99
-
100
- let content = result.content.to_lowercase();
101
- assert!(content.contains("plain"), "Plain text should be extracted");
102
- assert!(content.contains("strong"), "Strong emphasis should be extracted");
103
- assert!(content.contains("emphasis"), "Emphasis should be extracted");
104
- assert!(content.contains("strikethrough"), "Strikethrough should be extracted");
105
- }
106
-
107
- #[tokio::test]
108
- async fn test_fictionbook_extract_emphasis() {
109
- let extractor = kreuzberg::extractors::FictionBookExtractor::new();
110
- let path = test_file_path("basic.fb2");
111
-
112
- let result = extractor
113
- .extract_file(&path, "application/x-fictionbook+xml", &ExtractionConfig::default())
114
- .await
115
- .expect("Failed to extract FB2 file");
116
-
117
- assert!(
118
- result.content.contains("emphasized"),
119
- "Emphasized text should be extracted"
120
- );
121
- }
122
-
123
- #[tokio::test]
124
- async fn test_fictionbook_extract_strong() {
125
- let extractor = kreuzberg::extractors::FictionBookExtractor::new();
126
- let path = test_file_path("basic.fb2");
127
-
128
- let result = extractor
129
- .extract_file(&path, "application/x-fictionbook+xml", &ExtractionConfig::default())
130
- .await
131
- .expect("Failed to extract FB2 file");
132
-
133
- assert!(result.content.contains("strong"), "Strong text should be extracted");
134
- }
135
-
136
- #[tokio::test]
137
- async fn test_fictionbook_extract_code() {
138
- let extractor = kreuzberg::extractors::FictionBookExtractor::new();
139
- let path = test_file_path("basic.fb2");
140
-
141
- let result = extractor
142
- .extract_file(&path, "application/x-fictionbook+xml", &ExtractionConfig::default())
143
- .await
144
- .expect("Failed to extract FB2 file");
145
-
146
- assert!(result.content.contains("verbatim"), "Code content should be extracted");
147
- }
148
-
149
- #[tokio::test]
150
- async fn test_fictionbook_extract_blockquote() {
151
- let extractor = kreuzberg::extractors::FictionBookExtractor::new();
152
- let path = test_file_path("basic.fb2");
153
-
154
- let result = extractor
155
- .extract_file(&path, "application/x-fictionbook+xml", &ExtractionConfig::default())
156
- .await
157
- .expect("Failed to extract FB2 file");
158
-
159
- assert!(result.content.contains("Blockquote"), "Blockquote should be extracted");
160
- }
161
-
162
- #[tokio::test]
163
- async fn test_fictionbook_extract_tables() {
164
- let extractor = kreuzberg::extractors::FictionBookExtractor::new();
165
- let path = test_file_path("tables.fb2");
166
-
167
- let result = extractor
168
- .extract_file(&path, "application/x-fictionbook+xml", &ExtractionConfig::default())
169
- .await
170
- .expect("Failed to extract FB2 file");
171
-
172
- assert!(
173
- !result.content.is_empty(),
174
- "Content should be extracted from file with tables"
175
- );
176
- }
177
-
178
- #[tokio::test]
179
- async fn test_fictionbook_markdown_formatting_preservation() {
180
- let extractor = kreuzberg::extractors::FictionBookExtractor::new();
181
- let path = test_file_path("emphasis.fb2");
182
-
183
- let result = extractor
184
- .extract_file(&path, "application/x-fictionbook+xml", &ExtractionConfig::default())
185
- .await
186
- .expect("Failed to extract FB2 file");
187
-
188
- assert!(
189
- result.content.contains("**strong**"),
190
- "Strong text should be formatted as **bold** in markdown"
191
- );
192
- assert!(
193
- result.content.contains("*emphasis*"),
194
- "Emphasis text should be formatted as *italic* in markdown"
195
- );
196
- assert!(
197
- result.content.contains("~~deleted~~"),
198
- "Strikethrough text should be formatted as ~~strikethrough~~ in markdown"
199
- );
200
- assert!(
201
- result.content.contains("`code`"),
202
- "Code text should be wrapped in backticks in markdown"
203
- );
204
- }
205
-
206
- #[tokio::test]
207
- async fn test_fictionbook_formatting_in_body_paragraphs() {
208
- let extractor = kreuzberg::extractors::FictionBookExtractor::new();
209
- let path = test_file_path("basic.fb2");
210
-
211
- let result = extractor
212
- .extract_file(&path, "application/x-fictionbook+xml", &ExtractionConfig::default())
213
- .await
214
- .expect("Failed to extract FB2 file");
215
-
216
- assert!(
217
- result.content.contains("*emphasized*"),
218
- "Emphasis formatting should be preserved in body content"
219
- );
220
- assert!(
221
- result.content.contains("**strong**"),
222
- "Strong formatting should be preserved in body content"
223
- );
224
- assert!(
225
- result.content.contains("`verbatim`"),
226
- "Code formatting should be preserved in body content"
227
- );
228
- }