kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +11 -11
- data/README.md +5 -10
- data/examples/async_patterns.rb +0 -1
- data/ext/kreuzberg_rb/extconf.rb +0 -10
- data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
- data/ext/kreuzberg_rb/native/build.rs +2 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
- data/kreuzberg.gemspec +14 -57
- data/lib/kreuzberg/cache_api.rb +0 -1
- data/lib/kreuzberg/cli.rb +2 -2
- data/lib/kreuzberg/config.rb +2 -9
- data/lib/kreuzberg/errors.rb +7 -75
- data/lib/kreuzberg/extraction_api.rb +0 -1
- data/lib/kreuzberg/setup_lib_path.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -21
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +3 -55
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/spec_helper.rb +1 -1
- data/vendor/kreuzberg/Cargo.toml +42 -112
- data/vendor/kreuzberg/README.md +2 -2
- data/vendor/kreuzberg/build.rs +4 -18
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/extractor.rs +81 -202
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +1 -4
- data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
- data/vendor/kreuzberg/src/embeddings.rs +16 -125
- data/vendor/kreuzberg/src/error.rs +1 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
- data/vendor/kreuzberg/src/extraction/image.rs +13 -13
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
- data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
- data/vendor/kreuzberg/src/extractors/email.rs +0 -14
- data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
- data/vendor/kreuzberg/src/extractors/html.rs +154 -137
- data/vendor/kreuzberg/src/extractors/image.rs +4 -7
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
- data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
- data/vendor/kreuzberg/src/extractors/text.rs +5 -23
- data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/lib.rs +1 -4
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
- data/vendor/kreuzberg/src/mcp/server.rs +3 -5
- data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
- data/vendor/kreuzberg/src/pdf/error.rs +1 -1
- data/vendor/kreuzberg/src/pdf/table.rs +44 -17
- data/vendor/kreuzberg/src/pdf/text.rs +3 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/types.rs +12 -42
- data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
- data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
- data/vendor/kreuzberg/tests/config_features.rs +0 -18
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
- data/vendor/kreuzberg/tests/core_integration.rs +7 -24
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -12
- metadata +25 -90
- data/.rubocop.yml +0 -538
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
- data/lib/kreuzberg/error_context.rb +0 -32
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/LICENSE-MIT +0 -21
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
data/sig/kreuzberg.rbs
CHANGED
|
@@ -3,16 +3,6 @@
|
|
|
3
3
|
module Kreuzberg
|
|
4
4
|
VERSION: String
|
|
5
5
|
|
|
6
|
-
# Error code constants
|
|
7
|
-
ERROR_CODE_SUCCESS: Integer
|
|
8
|
-
ERROR_CODE_GENERIC: Integer
|
|
9
|
-
ERROR_CODE_PANIC: Integer
|
|
10
|
-
ERROR_CODE_INVALID_ARGUMENT: Integer
|
|
11
|
-
ERROR_CODE_IO: Integer
|
|
12
|
-
ERROR_CODE_PARSING: Integer
|
|
13
|
-
ERROR_CODE_OCR: Integer
|
|
14
|
-
ERROR_CODE_MISSING_DEPENDENCY: Integer
|
|
15
|
-
|
|
16
6
|
# Config namespace (defined in lib/kreuzberg/config.rb)
|
|
17
7
|
module Config
|
|
18
8
|
class OCR
|
|
@@ -401,10 +391,6 @@ module Kreuzberg
|
|
|
401
391
|
# Config loading (native method)
|
|
402
392
|
def self._config_from_file_native: (String path) -> Hash[Symbol, untyped]
|
|
403
393
|
|
|
404
|
-
# Error introspection (native methods)
|
|
405
|
-
def self._last_error_code_native: () -> Integer
|
|
406
|
-
def self._last_panic_context_json_native: () -> String?
|
|
407
|
-
|
|
408
394
|
# Plugin registration
|
|
409
395
|
def self.register_post_processor: (String name, _PostProcessor processor, ?stage: Symbol?) -> void
|
|
410
396
|
def self.unregister_post_processor: (String name) -> void
|
|
@@ -427,63 +413,25 @@ module Kreuzberg
|
|
|
427
413
|
def extract_text: (String file_path_or_bytes, Hash[Symbol, untyped] config) -> String
|
|
428
414
|
end
|
|
429
415
|
|
|
430
|
-
module ErrorContext
|
|
431
|
-
def self.last_error_code: () -> Integer
|
|
432
|
-
def self.last_panic_context: () -> Errors::PanicContext?
|
|
433
|
-
def self.last_panic_context_json: () -> String?
|
|
434
|
-
end
|
|
435
|
-
|
|
436
416
|
module Errors
|
|
437
|
-
# Panic context information from FFI error introspection
|
|
438
|
-
class PanicContext
|
|
439
|
-
attr_reader file: String
|
|
440
|
-
attr_reader line: Integer
|
|
441
|
-
attr_reader function: String
|
|
442
|
-
attr_reader message: String
|
|
443
|
-
attr_reader timestamp_secs: Integer
|
|
444
|
-
|
|
445
|
-
def initialize: (
|
|
446
|
-
file: String,
|
|
447
|
-
line: Integer,
|
|
448
|
-
function: String,
|
|
449
|
-
message: String,
|
|
450
|
-
timestamp_secs: Integer
|
|
451
|
-
) -> void
|
|
452
|
-
def to_s: () -> String
|
|
453
|
-
def to_h: () -> Hash[Symbol, String | Integer]
|
|
454
|
-
def self.from_json: (String) -> PanicContext?
|
|
455
|
-
|
|
456
|
-
private
|
|
457
|
-
|
|
458
|
-
def self.with_defaults: (Hash[Symbol, untyped] sliced) -> {file: String, line: Integer, function: String, message: String, timestamp_secs: Integer}
|
|
459
|
-
end
|
|
460
|
-
|
|
461
417
|
class Error < StandardError
|
|
462
|
-
attr_reader panic_context: PanicContext?
|
|
463
|
-
attr_reader error_code: Integer?
|
|
464
|
-
|
|
465
|
-
def initialize: (String message, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
|
|
466
418
|
end
|
|
467
419
|
|
|
468
420
|
class ValidationError < Error
|
|
469
421
|
end
|
|
470
422
|
|
|
471
423
|
class ParsingError < Error
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
def initialize: (String message, ?context: Hash[untyped, untyped]?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
|
|
424
|
+
def initialize: (String message, ?context: Hash[untyped, untyped]?) -> void
|
|
475
425
|
end
|
|
476
426
|
|
|
477
427
|
class OCRError < Error
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
def initialize: (String message, ?context: Hash[untyped, untyped]?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
|
|
428
|
+
def initialize: (String message, ?context: Hash[untyped, untyped]?) -> void
|
|
481
429
|
end
|
|
482
430
|
|
|
483
431
|
class MissingDependencyError < Error
|
|
484
432
|
attr_reader dependency: String?
|
|
485
433
|
|
|
486
|
-
def initialize: (String message, ?dependency: String
|
|
434
|
+
def initialize: (String message, ?dependency: String?) -> void
|
|
487
435
|
end
|
|
488
436
|
|
|
489
437
|
class IOError < Error
|
|
@@ -3,7 +3,8 @@
|
|
|
3
3
|
RSpec.describe Kreuzberg::CLIProxy do
|
|
4
4
|
describe '.find_cli_binary' do
|
|
5
5
|
context 'when binary exists' do
|
|
6
|
-
it 'finds the binary in search paths' do
|
|
6
|
+
it 'finds the binary in search paths', :skip do
|
|
7
|
+
# Skip in CI/test environments where binary might not be built
|
|
7
8
|
binary = described_class.find_cli_binary
|
|
8
9
|
expect(binary).to be_a(Pathname)
|
|
9
10
|
expect(binary.file?).to be true
|
|
@@ -24,8 +25,9 @@ RSpec.describe Kreuzberg::CLIProxy do
|
|
|
24
25
|
end
|
|
25
26
|
|
|
26
27
|
describe '.call' do
|
|
27
|
-
context 'when binary is available' do
|
|
28
|
+
context 'when binary is available', :skip do
|
|
28
29
|
it 'executes CLI command successfully' do
|
|
30
|
+
# Skip in environments without built binary
|
|
29
31
|
output = described_class.call(['--version'])
|
|
30
32
|
expect(output).to be_a(String)
|
|
31
33
|
expect(output).not_to be_empty
|
data/spec/binding/cli_spec.rb
CHANGED
|
@@ -1,35 +1,34 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
RSpec.describe Kreuzberg::CLI do
|
|
4
|
-
describe '.extract' do
|
|
4
|
+
describe '.extract', :skip do
|
|
5
5
|
it 'extracts content from a file' do
|
|
6
|
-
|
|
6
|
+
# Skip in environments without CLI binary
|
|
7
|
+
path = create_test_file('CLI test content')
|
|
7
8
|
output = described_class.extract(path)
|
|
8
9
|
|
|
9
10
|
expect(output).to be_a(String)
|
|
10
|
-
expect(output).
|
|
11
|
+
expect(output).to include('CLI test content')
|
|
11
12
|
end
|
|
12
13
|
|
|
13
14
|
it 'accepts output format option' do
|
|
14
|
-
path =
|
|
15
|
+
path = create_test_file('JSON output test')
|
|
15
16
|
output = described_class.extract(path, output: 'json')
|
|
16
17
|
|
|
17
18
|
expect(output).to be_a(String)
|
|
18
|
-
expect(output).not_to be_empty
|
|
19
19
|
end
|
|
20
20
|
|
|
21
21
|
it 'accepts OCR option' do
|
|
22
|
-
path =
|
|
23
|
-
output = described_class.extract(path, ocr:
|
|
22
|
+
path = create_test_file('OCR test')
|
|
23
|
+
output = described_class.extract(path, ocr: true)
|
|
24
24
|
|
|
25
25
|
expect(output).to be_a(String)
|
|
26
|
-
expect(output).not_to be_empty
|
|
27
26
|
end
|
|
28
27
|
end
|
|
29
28
|
|
|
30
|
-
describe '.detect' do
|
|
29
|
+
describe '.detect', :skip do
|
|
31
30
|
it 'detects MIME type' do
|
|
32
|
-
path =
|
|
31
|
+
path = create_test_file('MIME detection test')
|
|
33
32
|
mime_type = described_class.detect(path)
|
|
34
33
|
|
|
35
34
|
expect(mime_type).to be_a(String)
|
|
@@ -37,7 +36,7 @@ RSpec.describe Kreuzberg::CLI do
|
|
|
37
36
|
end
|
|
38
37
|
end
|
|
39
38
|
|
|
40
|
-
describe '.version' do
|
|
39
|
+
describe '.version', :skip do
|
|
41
40
|
it 'returns version string' do
|
|
42
41
|
version = described_class.version
|
|
43
42
|
expect(version).to be_a(String)
|
|
@@ -45,7 +44,7 @@ RSpec.describe Kreuzberg::CLI do
|
|
|
45
44
|
end
|
|
46
45
|
end
|
|
47
46
|
|
|
48
|
-
describe '.help' do
|
|
47
|
+
describe '.help', :skip do
|
|
49
48
|
it 'returns help text' do
|
|
50
49
|
help_text = described_class.help
|
|
51
50
|
expect(help_text).to be_a(String)
|
data/spec/examples.txt
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
example_id | status | run_time |
|
|
2
|
+
---------------------------------------------------------------------------------- | ------ | --------------- |
|
|
3
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:1:1] | failed | 0.00173 seconds |
|
|
4
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:1:2] | failed | 0.0018 seconds |
|
|
5
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:2:1] | failed | 0.00192 seconds |
|
|
6
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:2:2] | failed | 0.00581 seconds |
|
|
7
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:3:1] | failed | 0.00184 seconds |
|
|
8
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:4:1] | passed | 0.00088 seconds |
|
|
9
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:4:2] | passed | 0.00045 seconds |
|
|
10
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:5:1] | passed | 0.00007 seconds |
|
|
11
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:5:2] | passed | 0.00052 seconds |
|
|
12
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:6:1:1] | passed | 0.00012 seconds |
|
|
13
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:6:1:2] | passed | 0.00079 seconds |
|
|
14
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:6:2:1] | passed | 0.00004 seconds |
|
|
15
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:6:2:2] | passed | 0.00029 seconds |
|
|
16
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:1:1] | failed | 0.00139 seconds |
|
|
17
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:1:2] | failed | 0.00153 seconds |
|
|
18
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:1:3] | failed | 0.0014 seconds |
|
|
19
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:2:1] | failed | 0.00182 seconds |
|
|
20
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:2:2] | failed | 0.00209 seconds |
|
|
21
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:3:1] | failed | 0.00165 seconds |
|
|
22
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:4:1] | failed | 0.00142 seconds |
|
|
23
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:4:2] | failed | 0.00148 seconds |
|
|
24
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:5:1] | failed | 0.00148 seconds |
|
|
25
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:6:1] | passed | 0.0001 seconds |
|
|
26
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:6:2] | passed | 0.00011 seconds |
|
|
27
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:7:1] | passed | 0.00003 seconds |
|
|
28
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:7:2] | passed | 0.00002 seconds |
|
|
29
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:7:3] | passed | 0.00003 seconds |
|
|
30
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:7:4] | passed | 0.00006 seconds |
|
|
31
|
+
./spec/binding/plugins/validator_spec.rb[1:1:1] | failed | 0.00154 seconds |
|
|
32
|
+
./spec/binding/plugins/validator_spec.rb[1:1:2] | failed | 0.00171 seconds |
|
|
33
|
+
./spec/binding/plugins/validator_spec.rb[1:1:3] | passed | 0.00099 seconds |
|
|
34
|
+
./spec/binding/plugins/validator_spec.rb[1:2:1] | failed | 0.00186 seconds |
|
|
35
|
+
./spec/binding/plugins/validator_spec.rb[1:2:2] | failed | 0.0016 seconds |
|
|
36
|
+
./spec/binding/plugins/validator_spec.rb[1:3:1] | failed | 0.00182 seconds |
|
|
37
|
+
./spec/binding/plugins/validator_spec.rb[1:3:2] | failed | 0.0128 seconds |
|
|
38
|
+
./spec/binding/plugins/validator_spec.rb[1:4:1] | failed | 0.00156 seconds |
|
|
39
|
+
./spec/binding/plugins/validator_spec.rb[1:4:2] | passed | 0.0001 seconds |
|
|
40
|
+
./spec/binding/plugins/validator_spec.rb[1:5:1] | failed | 0.00445 seconds |
|
|
41
|
+
./spec/binding/plugins/validator_spec.rb[1:5:2] | failed | 0.00198 seconds |
|
|
42
|
+
./spec/binding/plugins/validator_spec.rb[1:6:1] | failed | 0.00179 seconds |
|
|
43
|
+
./spec/binding/plugins/validator_spec.rb[1:7:1] | passed | 0.00068 seconds |
|
|
44
|
+
./spec/binding/plugins/validator_spec.rb[1:7:2] | passed | 0.00088 seconds |
|
|
45
|
+
./spec/binding/plugins/validator_spec.rb[1:7:3] | passed | 0.00045 seconds |
|
|
46
|
+
./spec/binding/plugins/validator_spec.rb[1:7:4] | passed | 0.00004 seconds |
|
|
47
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/email_spec.rb[1:1] | passed | 0.01048 seconds |
|
|
48
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/html_spec.rb[1:1] | passed | 1.95 seconds |
|
|
49
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/html_spec.rb[1:2] | passed | 0.00031 seconds |
|
|
50
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/image_spec.rb[1:1] | passed | 0.0027 seconds |
|
|
51
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/ocr_spec.rb[1:1] | passed | 0.04721 seconds |
|
|
52
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/ocr_spec.rb[1:2] | passed | 0.04402 seconds |
|
|
53
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/ocr_spec.rb[1:3] | passed | 3.41 seconds |
|
|
54
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/ocr_spec.rb[1:4] | passed | 0.34493 seconds |
|
|
55
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/ocr_spec.rb[1:5] | passed | 0.33223 seconds |
|
|
56
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:1] | passed | 2.74 seconds |
|
|
57
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:2] | passed | 0.00021 seconds |
|
|
58
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:3] | passed | 0.00035 seconds |
|
|
59
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:4] | passed | 0.00021 seconds |
|
|
60
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:5] | passed | 0.0003 seconds |
|
|
61
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:6] | passed | 0.00027 seconds |
|
|
62
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:7] | passed | 0.00023 seconds |
|
|
63
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:8] | passed | 0.00016 seconds |
|
|
64
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:9] | passed | 2.65 seconds |
|
|
65
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:10] | passed | 0.0003 seconds |
|
|
66
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:11] | passed | 0.0002 seconds |
|
|
67
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:12] | passed | 0.00984 seconds |
|
|
68
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:13] | passed | 0.00096 seconds |
|
|
69
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:14] | passed | 0.00115 seconds |
|
|
70
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:15] | passed | 0.00038 seconds |
|
|
71
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:16] | passed | 0.00448 seconds |
|
|
72
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:1] | passed | 0.99668 seconds |
|
|
73
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:2] | passed | 4.11 seconds |
|
|
74
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:3] | passed | 0.00451 seconds |
|
|
75
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:4] | passed | 0.07588 seconds |
|
|
76
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:5] | passed | 0.00339 seconds |
|
|
77
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:6] | passed | 0.00238 seconds |
|
|
78
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:7] | passed | 0.24683 seconds |
|
|
79
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:8] | passed | 0.07999 seconds |
|
|
80
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:9] | passed | 0.01214 seconds |
|
|
81
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:10] | passed | 0.00095 seconds |
|
|
82
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:11] | passed | 0.03728 seconds |
|
|
83
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:12] | passed | 0.01741 seconds |
|
|
84
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:13] | passed | 0.0036 seconds |
|
|
85
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:14] | passed | 0.89424 seconds |
|
|
86
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[1:1] | passed | 0.00228 seconds |
|
|
87
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[1:2] | passed | 0.0012 seconds |
|
|
88
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[2:1] | passed | 0.0008 seconds |
|
|
89
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[2:2] | passed | 0.00119 seconds |
|
|
90
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[2:3] | passed | 0.0013 seconds |
|
|
91
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[3:1] | passed | 0.00184 seconds |
|
|
92
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[3:2] | passed | 0.00053 seconds |
|
|
93
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[3:3] | passed | 0.00004 seconds |
|
|
94
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[4:1] | passed | 0.00049 seconds |
|
|
95
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[4:2] | passed | 0.00006 seconds |
|
|
96
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[4:3] | passed | 0.00005 seconds |
|
|
97
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[5:1] | passed | 0.00007 seconds |
|
|
98
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[5:2] | passed | 0.00011 seconds |
|
|
99
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[6:1] | passed | 0.00003 seconds |
|
|
100
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[6:2] | passed | 0.00002 seconds |
|
|
101
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/structured_spec.rb[1:1] | passed | 0.00101 seconds |
|
|
102
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/structured_spec.rb[1:2] | passed | 0.00041 seconds |
|
|
103
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/structured_spec.rb[1:3] | passed | 0.00035 seconds |
|
|
104
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/xml_spec.rb[1:1] | passed | 0.00078 seconds |
|
data/spec/fixtures/config.yaml
CHANGED
data/spec/spec_helper.rb
CHANGED
|
@@ -30,7 +30,7 @@ RSpec.configure do |config|
|
|
|
30
30
|
|
|
31
31
|
def test_document_path(relative_path)
|
|
32
32
|
# Go up from packages/ruby/spec to project root, then into test_documents
|
|
33
|
-
File.
|
|
33
|
+
File.join(__dir__, '..', '..', '..', 'test_documents', relative_path)
|
|
34
34
|
end
|
|
35
35
|
|
|
36
36
|
def create_test_file(content, filename: 'test.txt')
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,62 +1,41 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version
|
|
3
|
+
version = "4.0.0-rc.1"
|
|
4
4
|
edition.workspace = true
|
|
5
5
|
rust-version.workspace = true
|
|
6
|
-
authors
|
|
6
|
+
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
7
7
|
description = "High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 50+ formats with async/sync APIs."
|
|
8
|
-
license
|
|
9
|
-
repository = "https://github.com/
|
|
8
|
+
license = "MIT"
|
|
9
|
+
repository = "https://github.com/Goldziher/kreuzberg"
|
|
10
10
|
homepage = "https://kreuzberg.dev"
|
|
11
11
|
documentation = "https://docs.rs/kreuzberg"
|
|
12
|
-
keywords = ["document", "extraction", "pdf", "ocr", "
|
|
13
|
-
categories = ["
|
|
12
|
+
keywords = ["document", "extraction", "pdf", "ocr", "parsing"]
|
|
13
|
+
categories = ["parsing", "text-processing", "asynchronous", "data-structures"]
|
|
14
14
|
readme = "README.md"
|
|
15
15
|
|
|
16
16
|
[lib]
|
|
17
17
|
crate-type = ["rlib"]
|
|
18
18
|
|
|
19
19
|
[features]
|
|
20
|
-
default = [
|
|
20
|
+
default = []
|
|
21
21
|
|
|
22
|
-
tokio-runtime = []
|
|
23
22
|
profiling = ["dep:pprof"]
|
|
24
23
|
|
|
25
24
|
# Format extractors
|
|
26
25
|
pdf = ["dep:pdfium-render", "dep:lopdf", "dep:image"]
|
|
27
26
|
excel = ["dep:calamine", "dep:polars"]
|
|
28
|
-
office = [
|
|
29
|
-
"dep:roxmltree",
|
|
30
|
-
"dep:zip",
|
|
31
|
-
"dep:docx-lite",
|
|
32
|
-
"dep:quick-xml",
|
|
33
|
-
"dep:pulldown-cmark",
|
|
34
|
-
"dep:biblatex",
|
|
35
|
-
"dep:org",
|
|
36
|
-
"dep:rtf-parser",
|
|
37
|
-
"dep:rst_parser",
|
|
38
|
-
"dep:fb2",
|
|
39
|
-
"dep:typst-syntax",
|
|
40
|
-
"html", # EPUB needs HTML parsing (zip + roxmltree + html-to-markdown-rs)
|
|
41
|
-
]
|
|
27
|
+
office = ["dep:roxmltree", "dep:zip", "dep:docx-lite"]
|
|
42
28
|
email = ["dep:mail-parser", "dep:msg_parser"]
|
|
43
|
-
html = ["dep:html-to-markdown-rs"]
|
|
29
|
+
html = ["dep:html-to-markdown-rs", "dep:html-escape", "dep:scraper"]
|
|
44
30
|
xml = ["dep:quick-xml", "dep:roxmltree"]
|
|
45
31
|
archives = ["dep:zip", "dep:tar", "dep:sevenz-rust"]
|
|
46
32
|
|
|
47
33
|
# Processing features
|
|
48
|
-
ocr = [
|
|
49
|
-
"dep:kreuzberg-tesseract",
|
|
50
|
-
"dep:image",
|
|
51
|
-
"dep:fast_image_resize",
|
|
52
|
-
"dep:ndarray",
|
|
53
|
-
"dep:kamadak-exif",
|
|
54
|
-
"html",
|
|
55
|
-
]
|
|
34
|
+
ocr = ["dep:kreuzberg-tesseract", "dep:image", "dep:fast_image_resize", "dep:ndarray", "dep:kamadak-exif", "html"]
|
|
56
35
|
language-detection = ["dep:whatlang"]
|
|
57
36
|
chunking = ["dep:text-splitter"]
|
|
58
37
|
embeddings = ["dep:fastembed", "chunking"]
|
|
59
|
-
stopwords = []
|
|
38
|
+
stopwords = [] # Stopwords for keyword extraction and token reduction
|
|
60
39
|
quality = ["dep:unicode-normalization", "dep:chardetng", "dep:encoding_rs", "stopwords"]
|
|
61
40
|
|
|
62
41
|
# Keyword extraction (requires stopwords)
|
|
@@ -68,59 +47,48 @@ keywords = ["keywords-yake", "keywords-rake"]
|
|
|
68
47
|
api = ["dep:axum", "dep:tower", "dep:tower-http"]
|
|
69
48
|
mcp = ["dep:rmcp"]
|
|
70
49
|
|
|
71
|
-
# Observability features
|
|
72
|
-
otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:tracing-opentelemetry"]
|
|
73
|
-
|
|
74
50
|
# Convenience bundles
|
|
75
51
|
full = [
|
|
76
|
-
"pdf",
|
|
77
|
-
"excel",
|
|
78
|
-
"office",
|
|
79
|
-
"email",
|
|
80
|
-
"html",
|
|
81
|
-
"xml",
|
|
82
|
-
"archives",
|
|
83
|
-
"ocr",
|
|
84
|
-
"language-detection",
|
|
85
|
-
"chunking",
|
|
86
|
-
"quality",
|
|
87
|
-
"keywords",
|
|
52
|
+
"pdf", "excel", "office", "email", "html", "xml", "archives",
|
|
53
|
+
"ocr", "language-detection", "chunking", "quality", "keywords"
|
|
88
54
|
]
|
|
89
55
|
server = ["pdf", "excel", "html", "ocr", "api", "mcp"]
|
|
90
56
|
cli = ["pdf", "excel", "office", "html", "ocr", "language-detection", "chunking", "quality"]
|
|
91
57
|
|
|
92
58
|
[build-dependencies]
|
|
93
|
-
tracing =
|
|
59
|
+
tracing = "0.1"
|
|
94
60
|
|
|
95
61
|
[dependencies]
|
|
96
62
|
# Core dependencies (always included)
|
|
97
|
-
ahash =
|
|
98
|
-
async-trait =
|
|
99
|
-
base64 =
|
|
100
|
-
|
|
101
|
-
hex = { workspace = true }
|
|
63
|
+
ahash = "0.8.12"
|
|
64
|
+
async-trait = "0.1.89"
|
|
65
|
+
base64 = "0.22.1"
|
|
66
|
+
hex = "0.4.3"
|
|
102
67
|
lazy_static = "1.5.0"
|
|
103
|
-
libc =
|
|
68
|
+
libc = "0.2"
|
|
104
69
|
memchr = "2.7.6"
|
|
105
|
-
num_cpus =
|
|
70
|
+
num_cpus = "1.17.0"
|
|
106
71
|
once_cell = "1.21.3"
|
|
107
72
|
paste = "1.0"
|
|
108
73
|
rayon = "1.11.0"
|
|
109
74
|
regex = "1.12.2"
|
|
110
|
-
serde = {
|
|
111
|
-
serde_json =
|
|
75
|
+
serde = { version = "1.0.228", features = ["derive"] }
|
|
76
|
+
serde_json = "1.0.145"
|
|
112
77
|
serde_yaml_ng = "0.10.0"
|
|
113
78
|
toml = "0.9.8"
|
|
114
79
|
mime_guess = "2.0"
|
|
115
80
|
rmp-serde = "1.3"
|
|
116
|
-
thiserror =
|
|
117
|
-
tokio = {
|
|
118
|
-
uuid = { version = "1.
|
|
81
|
+
thiserror = "2.0.17"
|
|
82
|
+
tokio = { version = "1.48.0", features = ["process", "fs", "rt", "rt-multi-thread", "macros", "time", "sync", "io-util"] }
|
|
83
|
+
uuid = { version = "1.18.1", features = ["v4"] }
|
|
119
84
|
indexmap = "2.12.1"
|
|
120
|
-
tracing =
|
|
121
|
-
|
|
85
|
+
tracing = "0.1"
|
|
86
|
+
pprof = { version = "0.15.0", features = ["flamegraph"], optional = true }
|
|
87
|
+
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
|
|
88
|
+
base64-simd = "0.8"
|
|
89
|
+
|
|
122
90
|
# Format extractors (optional)
|
|
123
|
-
pdfium-render = { version = "0.8.
|
|
91
|
+
pdfium-render = { version = "0.8.36", features = ["thread_safe", "image"], optional = true }
|
|
124
92
|
lopdf = { version = "0.38.0", optional = true }
|
|
125
93
|
calamine = { version = "0.32.0", features = ["dates"], optional = true }
|
|
126
94
|
polars = { version = "0.52.0", default-features = false, features = ["ipc"], optional = true }
|
|
@@ -128,77 +96,39 @@ roxmltree = { version = "0.21.1", optional = true }
|
|
|
128
96
|
zip = { version = "6.0.0", optional = true }
|
|
129
97
|
mail-parser = { version = "0.11.1", optional = true }
|
|
130
98
|
msg_parser = { version = "0.1.1", optional = true }
|
|
131
|
-
html-to-markdown-rs = { version = "2.
|
|
132
|
-
|
|
99
|
+
html-to-markdown-rs = { version = "2.9.1", features = ["inline-images"], optional = true }
|
|
100
|
+
html-escape = { version = "0.2.13", optional = true }
|
|
101
|
+
scraper = { version = "0.24.0", optional = true }
|
|
102
|
+
quick-xml = { version = "0.38.4", optional = true }
|
|
133
103
|
tar = { version = "0.4.44", optional = true }
|
|
134
104
|
sevenz-rust = { version = "0.6.1", optional = true }
|
|
135
105
|
docx-lite = { version = "0.2.0", optional = true }
|
|
136
106
|
|
|
137
|
-
pulldown-cmark = { version = "0.13", optional = true }
|
|
138
|
-
biblatex = { version = "0.11", optional = true }
|
|
139
|
-
org = { version = "0.3", optional = true }
|
|
140
|
-
rtf-parser = { version = "0.4", optional = true }
|
|
141
|
-
rst_parser = { version = "0.4", optional = true }
|
|
142
|
-
fb2 = { version = "0.4", optional = true }
|
|
143
|
-
typst-syntax = { version = "0.14", optional = true }
|
|
144
|
-
|
|
145
107
|
# Processing features (optional)
|
|
146
|
-
kreuzberg-tesseract = { version = "
|
|
147
|
-
image = {
|
|
148
|
-
|
|
149
|
-
"jpeg",
|
|
150
|
-
"webp",
|
|
151
|
-
"bmp",
|
|
152
|
-
"tiff",
|
|
153
|
-
"gif",
|
|
154
|
-
"rayon",
|
|
155
|
-
], optional = true }
|
|
156
|
-
fast_image_resize = { version = "5.4.0", optional = true }
|
|
108
|
+
kreuzberg-tesseract = { version = "1.0.0-rc.1", optional = true }
|
|
109
|
+
image = { version = "0.25.9", default-features = false, features = ["png", "jpeg", "webp", "bmp", "tiff", "gif", "rayon"], optional = true }
|
|
110
|
+
fast_image_resize = { version = "5.1.4", optional = true }
|
|
157
111
|
ndarray = { version = "0.17.1", optional = true }
|
|
158
112
|
kamadak-exif = { version = "0.6.1", optional = true }
|
|
159
113
|
whatlang = { version = "0.18.0", optional = true }
|
|
160
114
|
text-splitter = { version = "0.28.0", features = ["markdown"], optional = true }
|
|
161
|
-
fastembed = { version = "
|
|
162
|
-
"hf-hub-rustls-tls",
|
|
163
|
-
"ort-download-binaries",
|
|
164
|
-
], optional = true }
|
|
115
|
+
fastembed = { version = "*", git = "https://github.com/kreuzberg-dev/fastembed-rs", default-features = false, features = ["hf-hub", "hf-hub-rustls-tls", "ort-download-binaries"], optional = true }
|
|
165
116
|
unicode-normalization = { version = "0.1.25", optional = true }
|
|
166
117
|
chardetng = { version = "0.1.17", optional = true }
|
|
167
118
|
encoding_rs = { version = "0.8.35", optional = true }
|
|
168
119
|
yake-rust = { version = "1.0.3", optional = true }
|
|
169
120
|
rake = { version = "0.3.6", optional = true }
|
|
121
|
+
|
|
170
122
|
# Server features (optional)
|
|
171
123
|
axum = { version = "0.8", features = ["macros", "json", "multipart"], optional = true }
|
|
172
124
|
tower = { version = "0.5", optional = true }
|
|
173
125
|
tower-http = { version = "0.6", features = ["cors", "trace", "limit"], optional = true }
|
|
174
|
-
rmcp = { version = "0.
|
|
175
|
-
"server",
|
|
176
|
-
"macros",
|
|
177
|
-
"base64",
|
|
178
|
-
"transport-io",
|
|
179
|
-
], optional = true }
|
|
180
|
-
# Observability features (optional)
|
|
181
|
-
opentelemetry = { version = "0.31", features = ["trace"], optional = true }
|
|
182
|
-
opentelemetry_sdk = { version = "0.31", features = ["rt-tokio"], optional = true }
|
|
183
|
-
tracing-opentelemetry = { version = "0.32", optional = true }
|
|
126
|
+
rmcp = { version = "0.9.0", features = ["server", "macros", "base64", "transport-io"], optional = true }
|
|
184
127
|
infer = "0.19.0"
|
|
185
128
|
|
|
186
129
|
[dev-dependencies]
|
|
187
|
-
tempfile =
|
|
130
|
+
tempfile = "3.23.0"
|
|
188
131
|
filetime = "0.2"
|
|
189
132
|
tar = "0.4.44"
|
|
190
133
|
zip = "6.0.0"
|
|
191
134
|
serial_test = "3.2.0"
|
|
192
|
-
anyhow = { workspace = true }
|
|
193
|
-
tokio-test = "0.4"
|
|
194
|
-
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
|
195
|
-
criterion = { workspace = true }
|
|
196
|
-
image = { workspace = true, default-features = false, features = ["png"] }
|
|
197
|
-
|
|
198
|
-
[[bench]]
|
|
199
|
-
name = "otel_overhead"
|
|
200
|
-
harness = false
|
|
201
|
-
|
|
202
|
-
# Only build profiling tooling on non-Windows targets (pprof depends on Unix APIs)
|
|
203
|
-
[target.'cfg(not(target_os = "windows"))'.dependencies]
|
|
204
|
-
pprof = { version = "0.15.0", features = ["flamegraph"], optional = true }
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
[](https://crates.io/crates/kreuzberg)
|
|
4
4
|
[](https://pypi.org/project/kreuzberg/)
|
|
5
|
-
[](https://www.npmjs.com/package/kreuzberg)
|
|
5
|
+
[](https://www.npmjs.com/package/@goldziher/kreuzberg)
|
|
6
6
|
[](https://rubygems.org/gems/kreuzberg)
|
|
7
7
|
[](https://docs.rs/kreuzberg)
|
|
8
8
|
[](https://opensource.org/licenses/MIT)
|
|
@@ -13,7 +13,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
13
13
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
14
14
|
|
|
15
15
|
> **🚀 Version 4.0.0 Release Candidate**
|
|
16
|
-
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/
|
|
16
|
+
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/Goldziher/kreuzberg/issues) you encounter.
|
|
17
17
|
>
|
|
18
18
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
19
19
|
|
data/vendor/kreuzberg/build.rs
CHANGED
|
@@ -290,6 +290,7 @@ fn copy_lib_to_package(pdfium_dir: &Path, target: &str) {
|
|
|
290
290
|
return;
|
|
291
291
|
}
|
|
292
292
|
|
|
293
|
+
// Fix install_name on macOS to use @rpath
|
|
293
294
|
if target.contains("darwin") {
|
|
294
295
|
fix_macos_install_name(&src_lib, &runtime_lib_name);
|
|
295
296
|
codesign_if_needed(target, &src_lib);
|
|
@@ -298,13 +299,9 @@ fn copy_lib_to_package(pdfium_dir: &Path, target: &str) {
|
|
|
298
299
|
let crate_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
|
|
299
300
|
let workspace_root = crate_dir.parent().unwrap().parent().unwrap();
|
|
300
301
|
|
|
302
|
+
// Copy to target directory for CLI binary
|
|
301
303
|
if let Ok(profile) = env::var("PROFILE") {
|
|
302
|
-
let target_dir =
|
|
303
|
-
workspace_root.join("target").join(cargo_target).join(&profile)
|
|
304
|
-
} else {
|
|
305
|
-
workspace_root.join("target").join(&profile)
|
|
306
|
-
};
|
|
307
|
-
|
|
304
|
+
let target_dir = workspace_root.join("target").join(profile);
|
|
308
305
|
if target_dir.exists() {
|
|
309
306
|
copy_lib_if_needed(
|
|
310
307
|
&src_lib,
|
|
@@ -313,18 +310,6 @@ fn copy_lib_to_package(pdfium_dir: &Path, target: &str) {
|
|
|
313
310
|
target,
|
|
314
311
|
);
|
|
315
312
|
}
|
|
316
|
-
|
|
317
|
-
// Also copy to target/{profile} for Java FFI (Maven expects it here)
|
|
318
|
-
let simple_target_dir = workspace_root.join("target").join(&profile);
|
|
319
|
-
if simple_target_dir != target_dir {
|
|
320
|
-
fs::create_dir_all(&simple_target_dir).ok();
|
|
321
|
-
copy_lib_if_needed(
|
|
322
|
-
&src_lib,
|
|
323
|
-
&simple_target_dir.join(&runtime_lib_name),
|
|
324
|
-
"Java FFI target directory",
|
|
325
|
-
target,
|
|
326
|
-
);
|
|
327
|
-
}
|
|
328
313
|
}
|
|
329
314
|
|
|
330
315
|
let python_dest_dir = workspace_root.join("packages").join("python").join("kreuzberg");
|
|
@@ -450,6 +435,7 @@ fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
|
|
|
450
435
|
fn fix_macos_install_name(lib_path: &Path, lib_name: &str) {
|
|
451
436
|
use std::process::Command;
|
|
452
437
|
|
|
438
|
+
// Change install_name from ./libpdfium.dylib to @rpath/libpdfium.dylib
|
|
453
439
|
let new_install_name = format!("@rpath/{}", lib_name);
|
|
454
440
|
|
|
455
441
|
tracing::debug!("Fixing install_name for {} to {}", lib_path.display(), new_install_name);
|