kreuzberg 4.1.2 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/kreuzberg.gemspec +13 -1
- data/lib/kreuzberg/config.rb +70 -35
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +5 -1
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +55 -53
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +10 -2
|
@@ -83,13 +83,13 @@ mod jats_extractor_tests {
|
|
|
83
83
|
.await;
|
|
84
84
|
|
|
85
85
|
assert!(result.is_ok());
|
|
86
|
-
let extraction = result.unwrap();
|
|
86
|
+
let extraction = result.expect("Operation failed");
|
|
87
87
|
|
|
88
88
|
assert!(extraction.content.contains("Effects of Caffeine"));
|
|
89
89
|
assert!(extraction.content.contains("Introduction"));
|
|
90
90
|
|
|
91
91
|
assert!(extraction.metadata.subject.is_some());
|
|
92
|
-
let subject = extraction.metadata.subject.unwrap();
|
|
92
|
+
let subject = extraction.metadata.subject.expect("Operation failed");
|
|
93
93
|
assert!(subject.contains("Effects of Caffeine"));
|
|
94
94
|
|
|
95
95
|
assert!(subject.contains("10.1371"));
|
|
@@ -144,9 +144,9 @@ mod jats_extractor_tests {
|
|
|
144
144
|
.await;
|
|
145
145
|
|
|
146
146
|
assert!(result.is_ok());
|
|
147
|
-
let extraction = result.unwrap();
|
|
147
|
+
let extraction = result.expect("Operation failed");
|
|
148
148
|
|
|
149
|
-
let subject = extraction.metadata.subject.unwrap();
|
|
149
|
+
let subject = extraction.metadata.subject.expect("Operation failed");
|
|
150
150
|
assert!(subject.contains("Alpha"));
|
|
151
151
|
assert!(subject.contains("Beta"));
|
|
152
152
|
assert!(subject.contains("Gamma"));
|
|
@@ -201,7 +201,7 @@ mod jats_extractor_tests {
|
|
|
201
201
|
.await;
|
|
202
202
|
|
|
203
203
|
assert!(result.is_ok());
|
|
204
|
-
let extraction = result.unwrap();
|
|
204
|
+
let extraction = result.expect("Operation failed");
|
|
205
205
|
|
|
206
206
|
assert!(extraction.content.contains("Introduction"));
|
|
207
207
|
assert!(extraction.content.contains("Methods"));
|
|
@@ -273,7 +273,7 @@ mod jats_extractor_tests {
|
|
|
273
273
|
.await;
|
|
274
274
|
|
|
275
275
|
assert!(result.is_ok());
|
|
276
|
-
let extraction = result.unwrap();
|
|
276
|
+
let extraction = result.expect("Operation failed");
|
|
277
277
|
|
|
278
278
|
assert_eq!(extraction.tables.len(), 1);
|
|
279
279
|
let table = &extraction.tables[0];
|
|
@@ -327,7 +327,7 @@ mod jats_extractor_tests {
|
|
|
327
327
|
.await;
|
|
328
328
|
|
|
329
329
|
assert!(result.is_ok());
|
|
330
|
-
let extraction = result.unwrap();
|
|
330
|
+
let extraction = result.expect("Operation failed");
|
|
331
331
|
|
|
332
332
|
assert_eq!(extraction.tables.len(), 2);
|
|
333
333
|
assert_eq!(extraction.tables[0].cells[0].len(), 2);
|
|
@@ -390,7 +390,7 @@ mod jats_extractor_tests {
|
|
|
390
390
|
.await;
|
|
391
391
|
|
|
392
392
|
assert!(result.is_ok());
|
|
393
|
-
let extraction = result.unwrap();
|
|
393
|
+
let extraction = result.expect("Operation failed");
|
|
394
394
|
|
|
395
395
|
assert!(extraction.content.contains("Previous research"));
|
|
396
396
|
assert!(extraction.content.contains("Other studies"));
|
|
@@ -429,9 +429,9 @@ mod jats_extractor_tests {
|
|
|
429
429
|
.await;
|
|
430
430
|
|
|
431
431
|
assert!(result.is_ok());
|
|
432
|
-
let extraction = result.unwrap();
|
|
432
|
+
let extraction = result.expect("Operation failed");
|
|
433
433
|
|
|
434
|
-
let subject = extraction.metadata.subject.unwrap();
|
|
434
|
+
let subject = extraction.metadata.subject.expect("Operation failed");
|
|
435
435
|
assert!(subject.contains("background") || subject.contains("Background") || subject.contains("Abstract"));
|
|
436
436
|
}
|
|
437
437
|
|
|
@@ -457,7 +457,7 @@ mod jats_extractor_tests {
|
|
|
457
457
|
.await;
|
|
458
458
|
|
|
459
459
|
assert!(result.is_ok());
|
|
460
|
-
let extraction = result.unwrap();
|
|
460
|
+
let extraction = result.expect("Operation failed");
|
|
461
461
|
|
|
462
462
|
assert!(extraction.metadata.subject.is_some());
|
|
463
463
|
}
|
|
@@ -486,7 +486,7 @@ mod jats_extractor_tests {
|
|
|
486
486
|
.await;
|
|
487
487
|
|
|
488
488
|
assert!(result.is_ok());
|
|
489
|
-
let extraction = result.unwrap();
|
|
489
|
+
let extraction = result.expect("Operation failed");
|
|
490
490
|
|
|
491
491
|
assert!(extraction.metadata.created_at.is_some());
|
|
492
492
|
}
|
|
@@ -511,7 +511,7 @@ mod jats_extractor_tests {
|
|
|
511
511
|
.await;
|
|
512
512
|
|
|
513
513
|
assert!(result.is_ok());
|
|
514
|
-
let extraction = result.unwrap();
|
|
514
|
+
let extraction = result.expect("Operation failed");
|
|
515
515
|
assert!(extraction.content.is_empty() || extraction.content.trim().is_empty());
|
|
516
516
|
}
|
|
517
517
|
|
|
@@ -578,7 +578,7 @@ mod jats_extractor_tests {
|
|
|
578
578
|
.await;
|
|
579
579
|
|
|
580
580
|
assert!(result.is_ok());
|
|
581
|
-
let extraction = result.unwrap();
|
|
581
|
+
let extraction = result.expect("Operation failed");
|
|
582
582
|
|
|
583
583
|
assert!(extraction.content.contains("First paragraph"));
|
|
584
584
|
assert!(extraction.content.contains("Second paragraph"));
|
|
@@ -611,9 +611,9 @@ mod jats_extractor_tests {
|
|
|
611
611
|
.await;
|
|
612
612
|
|
|
613
613
|
assert!(result.is_ok());
|
|
614
|
-
let extraction = result.unwrap();
|
|
614
|
+
let extraction = result.expect("Operation failed");
|
|
615
615
|
|
|
616
|
-
let subject = extraction.metadata.subject.unwrap();
|
|
616
|
+
let subject = extraction.metadata.subject.expect("Operation failed");
|
|
617
617
|
assert!(subject.contains("keyword") || subject.contains("Keyword"));
|
|
618
618
|
}
|
|
619
619
|
|
|
@@ -630,7 +630,7 @@ mod jats_extractor_tests {
|
|
|
630
630
|
.await;
|
|
631
631
|
|
|
632
632
|
assert!(result.is_ok());
|
|
633
|
-
let extraction = result.unwrap();
|
|
633
|
+
let extraction = result.expect("Operation failed");
|
|
634
634
|
|
|
635
635
|
assert!(!extraction.content.is_empty());
|
|
636
636
|
assert!(extraction.metadata.subject.is_some());
|
|
@@ -61,7 +61,7 @@ async fn test_jupyter_simple_notebook_extraction() {
|
|
|
61
61
|
return;
|
|
62
62
|
}
|
|
63
63
|
|
|
64
|
-
let extraction = result.unwrap();
|
|
64
|
+
let extraction = result.expect("Operation failed");
|
|
65
65
|
|
|
66
66
|
assert_eq!(
|
|
67
67
|
extraction.mime_type, "application/x-ipynb+json",
|
|
@@ -156,7 +156,7 @@ async fn test_jupyter_mime_notebook_extraction() {
|
|
|
156
156
|
return;
|
|
157
157
|
}
|
|
158
158
|
|
|
159
|
-
let extraction = result.unwrap();
|
|
159
|
+
let extraction = result.expect("Operation failed");
|
|
160
160
|
|
|
161
161
|
assert_eq!(
|
|
162
162
|
extraction.mime_type, "application/x-ipynb+json",
|
|
@@ -261,7 +261,7 @@ async fn test_jupyter_mime_out_notebook_extraction() {
|
|
|
261
261
|
return;
|
|
262
262
|
}
|
|
263
263
|
|
|
264
|
-
let extraction = result.unwrap();
|
|
264
|
+
let extraction = result.expect("Operation failed");
|
|
265
265
|
|
|
266
266
|
assert_eq!(
|
|
267
267
|
extraction.mime_type, "application/x-ipynb+json",
|
|
@@ -351,7 +351,7 @@ async fn test_jupyter_rank_notebook_extraction() {
|
|
|
351
351
|
return;
|
|
352
352
|
}
|
|
353
353
|
|
|
354
|
-
let extraction = result.unwrap();
|
|
354
|
+
let extraction = result.expect("Operation failed");
|
|
355
355
|
|
|
356
356
|
assert_eq!(
|
|
357
357
|
extraction.mime_type, "application/x-ipynb+json",
|
|
@@ -440,7 +440,7 @@ async fn test_jupyter_metadata_aggregation() {
|
|
|
440
440
|
continue;
|
|
441
441
|
}
|
|
442
442
|
|
|
443
|
-
let extraction = result.unwrap();
|
|
443
|
+
let extraction = result.expect("Operation failed");
|
|
444
444
|
|
|
445
445
|
assert!(
|
|
446
446
|
!extraction.content.is_empty(),
|
|
@@ -491,7 +491,7 @@ async fn test_jupyter_cell_content_aggregation() {
|
|
|
491
491
|
return;
|
|
492
492
|
}
|
|
493
493
|
|
|
494
|
-
let extraction = result.unwrap();
|
|
494
|
+
let extraction = result.expect("Operation failed");
|
|
495
495
|
|
|
496
496
|
let code_indicators = ["class", "def", "import", "from", "python"];
|
|
497
497
|
let code_count = code_indicators
|
|
@@ -563,7 +563,7 @@ async fn test_jupyter_mime_output_handling() {
|
|
|
563
563
|
return;
|
|
564
564
|
}
|
|
565
565
|
|
|
566
|
-
let extraction = result.unwrap();
|
|
566
|
+
let extraction = result.expect("Operation failed");
|
|
567
567
|
|
|
568
568
|
assert!(
|
|
569
569
|
extraction.content.contains("image")
|
|
@@ -620,7 +620,7 @@ async fn test_jupyter_notebook_structure_preservation() {
|
|
|
620
620
|
return;
|
|
621
621
|
}
|
|
622
622
|
|
|
623
|
-
let extraction = result.unwrap();
|
|
623
|
+
let extraction = result.expect("Operation failed");
|
|
624
624
|
|
|
625
625
|
let cell_id_patterns = ["uid1", "uid2", "uid3", "uid4", "uid6"];
|
|
626
626
|
let id_count = cell_id_patterns
|
|
@@ -672,7 +672,7 @@ async fn test_jupyter_pandoc_baseline_alignment() {
|
|
|
672
672
|
continue;
|
|
673
673
|
}
|
|
674
674
|
|
|
675
|
-
let extraction = result.unwrap();
|
|
675
|
+
let extraction = result.expect("Operation failed");
|
|
676
676
|
|
|
677
677
|
assert!(
|
|
678
678
|
extraction.content.contains("cell")
|
|
@@ -38,7 +38,7 @@ El procesamiento del lenguaje natural es un campo de la inteligencia artificial
|
|
|
38
38
|
#[test]
|
|
39
39
|
fn test_yake_basic_extraction() {
|
|
40
40
|
let config = KeywordConfig::yake();
|
|
41
|
-
let keywords = extract_keywords(ML_DOCUMENT, &config).unwrap();
|
|
41
|
+
let keywords = extract_keywords(ML_DOCUMENT, &config).expect("Operation failed");
|
|
42
42
|
|
|
43
43
|
assert!(!keywords.is_empty(), "Should extract keywords from document");
|
|
44
44
|
assert!(
|
|
@@ -80,7 +80,7 @@ fn test_yake_basic_extraction() {
|
|
|
80
80
|
#[test]
|
|
81
81
|
fn test_rake_basic_extraction() {
|
|
82
82
|
let config = KeywordConfig::rake();
|
|
83
|
-
let keywords = extract_keywords(ML_DOCUMENT, &config).unwrap();
|
|
83
|
+
let keywords = extract_keywords(ML_DOCUMENT, &config).expect("Operation failed");
|
|
84
84
|
|
|
85
85
|
assert!(!keywords.is_empty(), "Should extract keywords from document");
|
|
86
86
|
assert!(
|
|
@@ -122,8 +122,8 @@ fn test_yake_vs_rake_comparison() {
|
|
|
122
122
|
let yake_config = KeywordConfig::yake().with_max_keywords(5);
|
|
123
123
|
let rake_config = KeywordConfig::rake().with_max_keywords(5);
|
|
124
124
|
|
|
125
|
-
let yake_keywords = extract_keywords(ML_DOCUMENT, &yake_config).unwrap();
|
|
126
|
-
let rake_keywords = extract_keywords(ML_DOCUMENT, &rake_config).unwrap();
|
|
125
|
+
let yake_keywords = extract_keywords(ML_DOCUMENT, &yake_config).expect("Operation failed");
|
|
126
|
+
let rake_keywords = extract_keywords(ML_DOCUMENT, &rake_config).expect("Operation failed");
|
|
127
127
|
|
|
128
128
|
assert!(!yake_keywords.is_empty(), "YAKE should extract keywords");
|
|
129
129
|
assert!(!rake_keywords.is_empty(), "RAKE should extract keywords");
|
|
@@ -161,7 +161,7 @@ fn test_yake_vs_rake_comparison() {
|
|
|
161
161
|
#[test]
|
|
162
162
|
fn test_yake_with_max_keywords() {
|
|
163
163
|
let config = KeywordConfig::yake().with_max_keywords(3);
|
|
164
|
-
let keywords = extract_keywords(ML_DOCUMENT, &config).unwrap();
|
|
164
|
+
let keywords = extract_keywords(ML_DOCUMENT, &config).expect("Operation failed");
|
|
165
165
|
|
|
166
166
|
assert!(keywords.len() <= 3, "Should respect max_keywords=3 limit");
|
|
167
167
|
|
|
@@ -176,7 +176,7 @@ fn test_yake_with_max_keywords() {
|
|
|
176
176
|
#[test]
|
|
177
177
|
fn test_rake_with_max_keywords() {
|
|
178
178
|
let config = KeywordConfig::rake().with_max_keywords(3);
|
|
179
|
-
let keywords = extract_keywords(ML_DOCUMENT, &config).unwrap();
|
|
179
|
+
let keywords = extract_keywords(ML_DOCUMENT, &config).expect("Operation failed");
|
|
180
180
|
|
|
181
181
|
assert!(keywords.len() <= 3, "Should respect max_keywords=3 limit");
|
|
182
182
|
|
|
@@ -191,7 +191,7 @@ fn test_rake_with_max_keywords() {
|
|
|
191
191
|
#[test]
|
|
192
192
|
fn test_yake_with_min_score() {
|
|
193
193
|
let config = KeywordConfig::yake().with_min_score(0.5);
|
|
194
|
-
let keywords = extract_keywords(ML_DOCUMENT, &config).unwrap();
|
|
194
|
+
let keywords = extract_keywords(ML_DOCUMENT, &config).expect("Operation failed");
|
|
195
195
|
|
|
196
196
|
for keyword in &keywords {
|
|
197
197
|
assert!(
|
|
@@ -207,7 +207,7 @@ fn test_yake_with_min_score() {
|
|
|
207
207
|
#[test]
|
|
208
208
|
fn test_rake_with_min_score() {
|
|
209
209
|
let config = KeywordConfig::rake().with_min_score(0.2);
|
|
210
|
-
let keywords = extract_keywords(ML_DOCUMENT, &config).unwrap();
|
|
210
|
+
let keywords = extract_keywords(ML_DOCUMENT, &config).expect("Operation failed");
|
|
211
211
|
|
|
212
212
|
for keyword in &keywords {
|
|
213
213
|
assert!(
|
|
@@ -223,7 +223,7 @@ fn test_rake_with_min_score() {
|
|
|
223
223
|
#[test]
|
|
224
224
|
fn test_yake_with_ngram_range() {
|
|
225
225
|
let config = KeywordConfig::yake().with_ngram_range(1, 1);
|
|
226
|
-
let keywords = extract_keywords(ML_DOCUMENT, &config).unwrap();
|
|
226
|
+
let keywords = extract_keywords(ML_DOCUMENT, &config).expect("Operation failed");
|
|
227
227
|
|
|
228
228
|
for keyword in &keywords {
|
|
229
229
|
let word_count = keyword.text.split_whitespace().count();
|
|
@@ -231,7 +231,7 @@ fn test_yake_with_ngram_range() {
|
|
|
231
231
|
}
|
|
232
232
|
|
|
233
233
|
let config = KeywordConfig::yake().with_ngram_range(2, 3);
|
|
234
|
-
let keywords = extract_keywords(ML_DOCUMENT, &config).unwrap();
|
|
234
|
+
let keywords = extract_keywords(ML_DOCUMENT, &config).expect("Operation failed");
|
|
235
235
|
|
|
236
236
|
for keyword in &keywords {
|
|
237
237
|
let word_count = keyword.text.split_whitespace().count();
|
|
@@ -248,7 +248,7 @@ fn test_yake_with_ngram_range() {
|
|
|
248
248
|
#[test]
|
|
249
249
|
fn test_rake_with_ngram_range() {
|
|
250
250
|
let config = KeywordConfig::rake().with_ngram_range(1, 1);
|
|
251
|
-
let keywords = extract_keywords(ML_DOCUMENT, &config).unwrap();
|
|
251
|
+
let keywords = extract_keywords(ML_DOCUMENT, &config).expect("Operation failed");
|
|
252
252
|
|
|
253
253
|
for keyword in &keywords {
|
|
254
254
|
let word_count = keyword.text.split_whitespace().count();
|
|
@@ -256,7 +256,7 @@ fn test_rake_with_ngram_range() {
|
|
|
256
256
|
}
|
|
257
257
|
|
|
258
258
|
let config = KeywordConfig::rake().with_ngram_range(2, 2);
|
|
259
|
-
let keywords = extract_keywords(ML_DOCUMENT, &config).unwrap();
|
|
259
|
+
let keywords = extract_keywords(ML_DOCUMENT, &config).expect("Operation failed");
|
|
260
260
|
|
|
261
261
|
for keyword in &keywords {
|
|
262
262
|
let word_count = keyword.text.split_whitespace().count();
|
|
@@ -268,7 +268,7 @@ fn test_rake_with_ngram_range() {
|
|
|
268
268
|
#[test]
|
|
269
269
|
fn test_rake_with_spanish() {
|
|
270
270
|
let config = KeywordConfig::rake().with_language("es");
|
|
271
|
-
let keywords = extract_keywords(SPANISH_DOCUMENT, &config).unwrap();
|
|
271
|
+
let keywords = extract_keywords(SPANISH_DOCUMENT, &config).expect("Operation failed");
|
|
272
272
|
|
|
273
273
|
assert!(!keywords.is_empty(), "Should extract Spanish keywords");
|
|
274
274
|
|
|
@@ -294,7 +294,7 @@ fn test_rake_with_spanish() {
|
|
|
294
294
|
#[test]
|
|
295
295
|
fn test_yake_with_spanish() {
|
|
296
296
|
let config = KeywordConfig::yake().with_language("es");
|
|
297
|
-
let keywords = extract_keywords(SPANISH_DOCUMENT, &config).unwrap();
|
|
297
|
+
let keywords = extract_keywords(SPANISH_DOCUMENT, &config).expect("Operation failed");
|
|
298
298
|
|
|
299
299
|
assert!(!keywords.is_empty(), "Should extract Spanish keywords");
|
|
300
300
|
|
|
@@ -308,7 +308,7 @@ fn test_yake_with_spanish() {
|
|
|
308
308
|
#[test]
|
|
309
309
|
fn test_rake_empty_document() {
|
|
310
310
|
let config = KeywordConfig::rake();
|
|
311
|
-
let keywords = extract_keywords("", &config).unwrap();
|
|
311
|
+
let keywords = extract_keywords("", &config).expect("Operation failed");
|
|
312
312
|
|
|
313
313
|
assert!(keywords.is_empty(), "Empty document should yield no keywords");
|
|
314
314
|
}
|
|
@@ -317,7 +317,7 @@ fn test_rake_empty_document() {
|
|
|
317
317
|
#[test]
|
|
318
318
|
fn test_yake_empty_document() {
|
|
319
319
|
let config = KeywordConfig::yake();
|
|
320
|
-
let keywords = extract_keywords("", &config).unwrap();
|
|
320
|
+
let keywords = extract_keywords("", &config).expect("Operation failed");
|
|
321
321
|
|
|
322
322
|
assert!(keywords.is_empty(), "Empty document should yield no keywords");
|
|
323
323
|
}
|
|
@@ -327,7 +327,7 @@ fn test_yake_empty_document() {
|
|
|
327
327
|
fn test_rake_short_document() {
|
|
328
328
|
let short_text = "Machine learning algorithms.";
|
|
329
329
|
let config = KeywordConfig::rake();
|
|
330
|
-
let keywords = extract_keywords(short_text, &config).unwrap();
|
|
330
|
+
let keywords = extract_keywords(short_text, &config).expect("Operation failed");
|
|
331
331
|
|
|
332
332
|
println!(
|
|
333
333
|
"Keywords from short text: {:?}",
|
|
@@ -340,7 +340,7 @@ fn test_rake_short_document() {
|
|
|
340
340
|
fn test_yake_short_document() {
|
|
341
341
|
let short_text = "Machine learning algorithms.";
|
|
342
342
|
let config = KeywordConfig::yake();
|
|
343
|
-
let keywords = extract_keywords(short_text, &config).unwrap();
|
|
343
|
+
let keywords = extract_keywords(short_text, &config).expect("Operation failed");
|
|
344
344
|
|
|
345
345
|
println!(
|
|
346
346
|
"YAKE keywords from short text: {:?}",
|
|
@@ -353,13 +353,13 @@ fn test_yake_short_document() {
|
|
|
353
353
|
fn test_rake_different_domains() {
|
|
354
354
|
let config = KeywordConfig::rake().with_max_keywords(5);
|
|
355
355
|
|
|
356
|
-
let ml_keywords = extract_keywords(ML_DOCUMENT, &config).unwrap();
|
|
356
|
+
let ml_keywords = extract_keywords(ML_DOCUMENT, &config).expect("Operation failed");
|
|
357
357
|
println!("\nML domain keywords:");
|
|
358
358
|
for kw in &ml_keywords {
|
|
359
359
|
println!(" {} (score: {:.3})", kw.text, kw.score);
|
|
360
360
|
}
|
|
361
361
|
|
|
362
|
-
let climate_keywords = extract_keywords(CLIMATE_DOCUMENT, &config).unwrap();
|
|
362
|
+
let climate_keywords = extract_keywords(CLIMATE_DOCUMENT, &config).expect("Operation failed");
|
|
363
363
|
println!("\nClimate domain keywords:");
|
|
364
364
|
for kw in &climate_keywords {
|
|
365
365
|
println!(" {} (score: {:.3})", kw.text, kw.score);
|
|
@@ -395,13 +395,13 @@ fn test_rake_different_domains() {
|
|
|
395
395
|
fn test_yake_different_domains() {
|
|
396
396
|
let config = KeywordConfig::yake().with_max_keywords(5);
|
|
397
397
|
|
|
398
|
-
let ml_keywords = extract_keywords(ML_DOCUMENT, &config).unwrap();
|
|
398
|
+
let ml_keywords = extract_keywords(ML_DOCUMENT, &config).expect("Operation failed");
|
|
399
399
|
println!("\nYAKE ML domain keywords:");
|
|
400
400
|
for kw in &ml_keywords {
|
|
401
401
|
println!(" {} (score: {:.3})", kw.text, kw.score);
|
|
402
402
|
}
|
|
403
403
|
|
|
404
|
-
let climate_keywords = extract_keywords(CLIMATE_DOCUMENT, &config).unwrap();
|
|
404
|
+
let climate_keywords = extract_keywords(CLIMATE_DOCUMENT, &config).expect("Operation failed");
|
|
405
405
|
println!("\nYAKE Climate domain keywords:");
|
|
406
406
|
for kw in &climate_keywords {
|
|
407
407
|
println!(" {} (score: {:.3})", kw.text, kw.score);
|
|
@@ -415,7 +415,7 @@ fn test_yake_different_domains() {
|
|
|
415
415
|
#[test]
|
|
416
416
|
fn test_rake_score_distribution() {
|
|
417
417
|
let config = KeywordConfig::rake();
|
|
418
|
-
let keywords = extract_keywords(ML_DOCUMENT, &config).unwrap();
|
|
418
|
+
let keywords = extract_keywords(ML_DOCUMENT, &config).expect("Operation failed");
|
|
419
419
|
|
|
420
420
|
if keywords.is_empty() {
|
|
421
421
|
return;
|
|
@@ -439,7 +439,7 @@ fn test_rake_score_distribution() {
|
|
|
439
439
|
#[test]
|
|
440
440
|
fn test_yake_score_distribution() {
|
|
441
441
|
let config = KeywordConfig::yake();
|
|
442
|
-
let keywords = extract_keywords(ML_DOCUMENT, &config).unwrap();
|
|
442
|
+
let keywords = extract_keywords(ML_DOCUMENT, &config).expect("Operation failed");
|
|
443
443
|
|
|
444
444
|
if keywords.is_empty() {
|
|
445
445
|
return;
|
|
@@ -463,7 +463,7 @@ fn test_yake_score_distribution() {
|
|
|
463
463
|
#[test]
|
|
464
464
|
fn test_keyword_struct_properties() {
|
|
465
465
|
let config = KeywordConfig::default();
|
|
466
|
-
let keywords = extract_keywords(ML_DOCUMENT, &config).unwrap();
|
|
466
|
+
let keywords = extract_keywords(ML_DOCUMENT, &config).expect("Operation failed");
|
|
467
467
|
|
|
468
468
|
if keywords.is_empty() {
|
|
469
469
|
return;
|
|
@@ -189,7 +189,7 @@ Global warming is the long-term heating of Earth's climate system. Climate scien
|
|
|
189
189
|
#[test]
|
|
190
190
|
fn test_yake_quality_ml_document_default_config() {
|
|
191
191
|
let config = KeywordConfig::yake();
|
|
192
|
-
let keywords = extract_keywords(ML_DOC_SAMPLE, &config).unwrap();
|
|
192
|
+
let keywords = extract_keywords(ML_DOC_SAMPLE, &config).expect("Operation failed");
|
|
193
193
|
|
|
194
194
|
assert!(!keywords.is_empty(), "Should extract keywords with default config");
|
|
195
195
|
|
|
@@ -239,7 +239,7 @@ fn test_yake_quality_ml_document_default_config() {
|
|
|
239
239
|
#[test]
|
|
240
240
|
fn test_rake_quality_ml_document_default_config() {
|
|
241
241
|
let config = KeywordConfig::rake();
|
|
242
|
-
let keywords = extract_keywords(ML_DOC_SAMPLE, &config).unwrap();
|
|
242
|
+
let keywords = extract_keywords(ML_DOC_SAMPLE, &config).expect("Operation failed");
|
|
243
243
|
|
|
244
244
|
assert!(!keywords.is_empty(), "Should extract keywords with default config");
|
|
245
245
|
|
|
@@ -289,7 +289,7 @@ fn test_rake_quality_ml_document_default_config() {
|
|
|
289
289
|
#[test]
|
|
290
290
|
fn test_yake_quality_climate_document_default_config() {
|
|
291
291
|
let config = KeywordConfig::yake();
|
|
292
|
-
let keywords = extract_keywords(CLIMATE_DOC_SAMPLE, &config).unwrap();
|
|
292
|
+
let keywords = extract_keywords(CLIMATE_DOC_SAMPLE, &config).expect("Operation failed");
|
|
293
293
|
|
|
294
294
|
assert!(!keywords.is_empty(), "Should extract keywords with default config");
|
|
295
295
|
|
|
@@ -331,7 +331,7 @@ fn test_yake_quality_climate_document_default_config() {
|
|
|
331
331
|
#[test]
|
|
332
332
|
fn test_rake_quality_climate_document_default_config() {
|
|
333
333
|
let config = KeywordConfig::rake();
|
|
334
|
-
let keywords = extract_keywords(CLIMATE_DOC_SAMPLE, &config).unwrap();
|
|
334
|
+
let keywords = extract_keywords(CLIMATE_DOC_SAMPLE, &config).expect("Operation failed");
|
|
335
335
|
|
|
336
336
|
assert!(!keywords.is_empty(), "Should extract keywords with default config");
|
|
337
337
|
|
|
@@ -375,8 +375,8 @@ fn test_yake_vs_rake_quality_comparison() {
|
|
|
375
375
|
let yake_config = KeywordConfig::yake();
|
|
376
376
|
let rake_config = KeywordConfig::rake();
|
|
377
377
|
|
|
378
|
-
let yake_keywords = extract_keywords(ML_DOC_SAMPLE, &yake_config).unwrap();
|
|
379
|
-
let rake_keywords = extract_keywords(ML_DOC_SAMPLE, &rake_config).unwrap();
|
|
378
|
+
let yake_keywords = extract_keywords(ML_DOC_SAMPLE, &yake_config).expect("Operation failed");
|
|
379
|
+
let rake_keywords = extract_keywords(ML_DOC_SAMPLE, &rake_config).expect("Operation failed");
|
|
380
380
|
|
|
381
381
|
let yake_extracted: Vec<&str> = yake_keywords.iter().map(|k| k.text.as_str()).collect();
|
|
382
382
|
let rake_extracted: Vec<&str> = rake_keywords.iter().map(|k| k.text.as_str()).collect();
|
|
@@ -414,7 +414,7 @@ fn test_yake_quality_with_optimized_config() {
|
|
|
414
414
|
.with_ngram_range(1, 3)
|
|
415
415
|
.with_min_score(0.0);
|
|
416
416
|
|
|
417
|
-
let keywords = extract_keywords(ML_DOC_SAMPLE, &config).unwrap();
|
|
417
|
+
let keywords = extract_keywords(ML_DOC_SAMPLE, &config).expect("Operation failed");
|
|
418
418
|
|
|
419
419
|
let extracted: Vec<&str> = keywords.iter().map(|k| k.text.as_str()).collect();
|
|
420
420
|
let ground_truth = get_ml_ground_truth();
|
|
@@ -441,7 +441,7 @@ fn test_rake_quality_with_optimized_config() {
|
|
|
441
441
|
.with_ngram_range(1, 3)
|
|
442
442
|
.with_min_score(0.0);
|
|
443
443
|
|
|
444
|
-
let keywords = extract_keywords(ML_DOC_SAMPLE, &config).unwrap();
|
|
444
|
+
let keywords = extract_keywords(ML_DOC_SAMPLE, &config).expect("Operation failed");
|
|
445
445
|
|
|
446
446
|
let extracted: Vec<&str> = keywords.iter().map(|k| k.text.as_str()).collect();
|
|
447
447
|
let ground_truth = get_ml_ground_truth();
|
|
@@ -464,7 +464,7 @@ fn test_rake_quality_with_optimized_config() {
|
|
|
464
464
|
#[test]
|
|
465
465
|
fn test_extracted_keywords_are_domain_relevant() {
|
|
466
466
|
let config = KeywordConfig::default();
|
|
467
|
-
let keywords = extract_keywords(ML_DOC_SAMPLE, &config).unwrap();
|
|
467
|
+
let keywords = extract_keywords(ML_DOC_SAMPLE, &config).expect("Operation failed");
|
|
468
468
|
|
|
469
469
|
let ml_terms = [
|
|
470
470
|
"machine",
|
|
@@ -28,9 +28,9 @@ fn test_file_path(filename: &str) -> PathBuf {
|
|
|
28
28
|
let manifest_dir = env!("CARGO_MANIFEST_DIR");
|
|
29
29
|
PathBuf::from(manifest_dir)
|
|
30
30
|
.parent()
|
|
31
|
-
.unwrap()
|
|
31
|
+
.expect("Operation failed")
|
|
32
32
|
.parent()
|
|
33
|
-
.unwrap()
|
|
33
|
+
.expect("Operation failed")
|
|
34
34
|
.join("test_documents")
|
|
35
35
|
.join("latex")
|
|
36
36
|
.join(filename)
|
|
@@ -486,5 +486,5 @@ async fn test_special_characters_in_metadata() {
|
|
|
486
486
|
|
|
487
487
|
let title = result.metadata.additional.get("title").and_then(|v| v.as_str());
|
|
488
488
|
assert!(title.is_some());
|
|
489
|
-
assert!(title.unwrap().contains("&") || title.unwrap().contains("Part"));
|
|
489
|
+
assert!(title.expect("Operation failed").contains("&") || title.expect("Operation failed").contains("Part"));
|
|
490
490
|
}
|