kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +11 -11
  6. data/README.md +5 -10
  7. data/examples/async_patterns.rb +0 -1
  8. data/ext/kreuzberg_rb/extconf.rb +0 -10
  9. data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
  10. data/ext/kreuzberg_rb/native/build.rs +2 -0
  11. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  12. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  13. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  14. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  15. data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
  16. data/kreuzberg.gemspec +14 -57
  17. data/lib/kreuzberg/cache_api.rb +0 -1
  18. data/lib/kreuzberg/cli.rb +2 -2
  19. data/lib/kreuzberg/config.rb +2 -9
  20. data/lib/kreuzberg/errors.rb +7 -75
  21. data/lib/kreuzberg/extraction_api.rb +0 -1
  22. data/lib/kreuzberg/setup_lib_path.rb +0 -1
  23. data/lib/kreuzberg/version.rb +1 -1
  24. data/lib/kreuzberg.rb +0 -21
  25. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  26. data/sig/kreuzberg.rbs +3 -55
  27. data/spec/binding/cli_proxy_spec.rb +4 -2
  28. data/spec/binding/cli_spec.rb +11 -12
  29. data/spec/examples.txt +104 -0
  30. data/spec/fixtures/config.yaml +1 -0
  31. data/spec/spec_helper.rb +1 -1
  32. data/vendor/kreuzberg/Cargo.toml +42 -112
  33. data/vendor/kreuzberg/README.md +2 -2
  34. data/vendor/kreuzberg/build.rs +4 -18
  35. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  36. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  37. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  38. data/vendor/kreuzberg/src/core/extractor.rs +81 -202
  39. data/vendor/kreuzberg/src/core/io.rs +2 -4
  40. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  41. data/vendor/kreuzberg/src/core/mod.rs +1 -4
  42. data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
  43. data/vendor/kreuzberg/src/embeddings.rs +16 -125
  44. data/vendor/kreuzberg/src/error.rs +1 -1
  45. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/image.rs +13 -13
  47. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
  48. data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
  49. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  50. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  51. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  52. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  53. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  54. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  55. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  56. data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
  57. data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
  58. data/vendor/kreuzberg/src/extractors/email.rs +0 -14
  59. data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
  60. data/vendor/kreuzberg/src/extractors/html.rs +154 -137
  61. data/vendor/kreuzberg/src/extractors/image.rs +4 -7
  62. data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
  63. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  64. data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
  65. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
  66. data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
  67. data/vendor/kreuzberg/src/extractors/text.rs +5 -23
  68. data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
  69. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  70. data/vendor/kreuzberg/src/lib.rs +1 -4
  71. data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
  72. data/vendor/kreuzberg/src/mcp/server.rs +3 -5
  73. data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
  74. data/vendor/kreuzberg/src/pdf/error.rs +1 -1
  75. data/vendor/kreuzberg/src/pdf/table.rs +44 -17
  76. data/vendor/kreuzberg/src/pdf/text.rs +3 -0
  77. data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
  78. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
  79. data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
  80. data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
  81. data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
  82. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  83. data/vendor/kreuzberg/src/types.rs +12 -42
  84. data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
  85. data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
  86. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  87. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
  88. data/vendor/kreuzberg/tests/config_features.rs +0 -18
  89. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
  90. data/vendor/kreuzberg/tests/core_integration.rs +7 -24
  91. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  92. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  93. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
  95. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
  97. data/vendor/kreuzberg/tests/security_validation.rs +1 -12
  98. metadata +25 -90
  99. data/.rubocop.yml +0 -538
  100. data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
  101. data/lib/kreuzberg/error_context.rb +0 -32
  102. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  103. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
  104. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
  105. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
  106. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
  107. data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
  108. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
  109. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  110. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
  112. data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
  114. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  115. data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
  116. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
  117. data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
  119. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  121. data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
  122. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  123. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  124. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  125. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
  126. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  127. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  128. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  129. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  130. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  131. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  132. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  133. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  134. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  135. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
  136. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  137. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  138. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
  139. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
  140. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
  141. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
  142. data/vendor/rb-sys/.cargo-ok +0 -1
  143. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  144. data/vendor/rb-sys/Cargo.lock +0 -393
  145. data/vendor/rb-sys/Cargo.toml +0 -70
  146. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  147. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  148. data/vendor/rb-sys/LICENSE-MIT +0 -21
  149. data/vendor/rb-sys/bin/release.sh +0 -21
  150. data/vendor/rb-sys/build/features.rs +0 -108
  151. data/vendor/rb-sys/build/main.rs +0 -246
  152. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  153. data/vendor/rb-sys/build/version.rs +0 -48
  154. data/vendor/rb-sys/readme.md +0 -36
  155. data/vendor/rb-sys/src/bindings.rs +0 -21
  156. data/vendor/rb-sys/src/hidden.rs +0 -11
  157. data/vendor/rb-sys/src/lib.rs +0 -34
  158. data/vendor/rb-sys/src/macros.rs +0 -371
  159. data/vendor/rb-sys/src/memory.rs +0 -53
  160. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  161. data/vendor/rb-sys/src/special_consts.rs +0 -31
  162. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  163. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  164. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  165. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  166. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  167. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  168. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  169. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  170. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  171. data/vendor/rb-sys/src/stable_api.rs +0 -261
  172. data/vendor/rb-sys/src/symbol.rs +0 -31
  173. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  174. data/vendor/rb-sys/src/utils.rs +0 -89
  175. data/vendor/rb-sys/src/value_type.rs +0 -7
data/sig/kreuzberg.rbs CHANGED
@@ -3,16 +3,6 @@
3
3
  module Kreuzberg
4
4
  VERSION: String
5
5
 
6
- # Error code constants
7
- ERROR_CODE_SUCCESS: Integer
8
- ERROR_CODE_GENERIC: Integer
9
- ERROR_CODE_PANIC: Integer
10
- ERROR_CODE_INVALID_ARGUMENT: Integer
11
- ERROR_CODE_IO: Integer
12
- ERROR_CODE_PARSING: Integer
13
- ERROR_CODE_OCR: Integer
14
- ERROR_CODE_MISSING_DEPENDENCY: Integer
15
-
16
6
  # Config namespace (defined in lib/kreuzberg/config.rb)
17
7
  module Config
18
8
  class OCR
@@ -401,10 +391,6 @@ module Kreuzberg
401
391
  # Config loading (native method)
402
392
  def self._config_from_file_native: (String path) -> Hash[Symbol, untyped]
403
393
 
404
- # Error introspection (native methods)
405
- def self._last_error_code_native: () -> Integer
406
- def self._last_panic_context_json_native: () -> String?
407
-
408
394
  # Plugin registration
409
395
  def self.register_post_processor: (String name, _PostProcessor processor, ?stage: Symbol?) -> void
410
396
  def self.unregister_post_processor: (String name) -> void
@@ -427,63 +413,25 @@ module Kreuzberg
427
413
  def extract_text: (String file_path_or_bytes, Hash[Symbol, untyped] config) -> String
428
414
  end
429
415
 
430
- module ErrorContext
431
- def self.last_error_code: () -> Integer
432
- def self.last_panic_context: () -> Errors::PanicContext?
433
- def self.last_panic_context_json: () -> String?
434
- end
435
-
436
416
  module Errors
437
- # Panic context information from FFI error introspection
438
- class PanicContext
439
- attr_reader file: String
440
- attr_reader line: Integer
441
- attr_reader function: String
442
- attr_reader message: String
443
- attr_reader timestamp_secs: Integer
444
-
445
- def initialize: (
446
- file: String,
447
- line: Integer,
448
- function: String,
449
- message: String,
450
- timestamp_secs: Integer
451
- ) -> void
452
- def to_s: () -> String
453
- def to_h: () -> Hash[Symbol, String | Integer]
454
- def self.from_json: (String) -> PanicContext?
455
-
456
- private
457
-
458
- def self.with_defaults: (Hash[Symbol, untyped] sliced) -> {file: String, line: Integer, function: String, message: String, timestamp_secs: Integer}
459
- end
460
-
461
417
  class Error < StandardError
462
- attr_reader panic_context: PanicContext?
463
- attr_reader error_code: Integer?
464
-
465
- def initialize: (String message, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
466
418
  end
467
419
 
468
420
  class ValidationError < Error
469
421
  end
470
422
 
471
423
  class ParsingError < Error
472
- attr_reader context: Hash[untyped, untyped]?
473
-
474
- def initialize: (String message, ?context: Hash[untyped, untyped]?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
424
+ def initialize: (String message, ?context: Hash[untyped, untyped]?) -> void
475
425
  end
476
426
 
477
427
  class OCRError < Error
478
- attr_reader context: Hash[untyped, untyped]?
479
-
480
- def initialize: (String message, ?context: Hash[untyped, untyped]?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
428
+ def initialize: (String message, ?context: Hash[untyped, untyped]?) -> void
481
429
  end
482
430
 
483
431
  class MissingDependencyError < Error
484
432
  attr_reader dependency: String?
485
433
 
486
- def initialize: (String message, ?dependency: String?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
434
+ def initialize: (String message, ?dependency: String?) -> void
487
435
  end
488
436
 
489
437
  class IOError < Error
@@ -3,7 +3,8 @@
3
3
  RSpec.describe Kreuzberg::CLIProxy do
4
4
  describe '.find_cli_binary' do
5
5
  context 'when binary exists' do
6
- it 'finds the binary in search paths' do
6
+ it 'finds the binary in search paths', :skip do
7
+ # Skip in CI/test environments where binary might not be built
7
8
  binary = described_class.find_cli_binary
8
9
  expect(binary).to be_a(Pathname)
9
10
  expect(binary.file?).to be true
@@ -24,8 +25,9 @@ RSpec.describe Kreuzberg::CLIProxy do
24
25
  end
25
26
 
26
27
  describe '.call' do
27
- context 'when binary is available' do
28
+ context 'when binary is available', :skip do
28
29
  it 'executes CLI command successfully' do
30
+ # Skip in environments without built binary
29
31
  output = described_class.call(['--version'])
30
32
  expect(output).to be_a(String)
31
33
  expect(output).not_to be_empty
@@ -1,35 +1,34 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  RSpec.describe Kreuzberg::CLI do
4
- describe '.extract' do
4
+ describe '.extract', :skip do
5
5
  it 'extracts content from a file' do
6
- path = test_document_path('documents/simple.odt')
6
+ # Skip in environments without CLI binary
7
+ path = create_test_file('CLI test content')
7
8
  output = described_class.extract(path)
8
9
 
9
10
  expect(output).to be_a(String)
10
- expect(output).not_to be_empty
11
+ expect(output).to include('CLI test content')
11
12
  end
12
13
 
13
14
  it 'accepts output format option' do
14
- path = test_document_path('documents/simple.odt')
15
+ path = create_test_file('JSON output test')
15
16
  output = described_class.extract(path, output: 'json')
16
17
 
17
18
  expect(output).to be_a(String)
18
- expect(output).not_to be_empty
19
19
  end
20
20
 
21
21
  it 'accepts OCR option' do
22
- path = test_document_path('pdfs/100_g_networking_technology_overview_slides_toronto_august_2016.pdf')
23
- output = described_class.extract(path, ocr: false)
22
+ path = create_test_file('OCR test')
23
+ output = described_class.extract(path, ocr: true)
24
24
 
25
25
  expect(output).to be_a(String)
26
- expect(output).not_to be_empty
27
26
  end
28
27
  end
29
28
 
30
- describe '.detect' do
29
+ describe '.detect', :skip do
31
30
  it 'detects MIME type' do
32
- path = test_document_path('documents/simple.odt')
31
+ path = create_test_file('MIME detection test')
33
32
  mime_type = described_class.detect(path)
34
33
 
35
34
  expect(mime_type).to be_a(String)
@@ -37,7 +36,7 @@ RSpec.describe Kreuzberg::CLI do
37
36
  end
38
37
  end
39
38
 
40
- describe '.version' do
39
+ describe '.version', :skip do
41
40
  it 'returns version string' do
42
41
  version = described_class.version
43
42
  expect(version).to be_a(String)
@@ -45,7 +44,7 @@ RSpec.describe Kreuzberg::CLI do
45
44
  end
46
45
  end
47
46
 
48
- describe '.help' do
47
+ describe '.help', :skip do
49
48
  it 'returns help text' do
50
49
  help_text = described_class.help
51
50
  expect(help_text).to be_a(String)
data/spec/examples.txt ADDED
@@ -0,0 +1,104 @@
1
+ example_id | status | run_time |
2
+ ---------------------------------------------------------------------------------- | ------ | --------------- |
3
+ ./spec/binding/plugins/ocr_backend_spec.rb[1:1:1] | failed | 0.00173 seconds |
4
+ ./spec/binding/plugins/ocr_backend_spec.rb[1:1:2] | failed | 0.0018 seconds |
5
+ ./spec/binding/plugins/ocr_backend_spec.rb[1:2:1] | failed | 0.00192 seconds |
6
+ ./spec/binding/plugins/ocr_backend_spec.rb[1:2:2] | failed | 0.00581 seconds |
7
+ ./spec/binding/plugins/ocr_backend_spec.rb[1:3:1] | failed | 0.00184 seconds |
8
+ ./spec/binding/plugins/ocr_backend_spec.rb[1:4:1] | passed | 0.00088 seconds |
9
+ ./spec/binding/plugins/ocr_backend_spec.rb[1:4:2] | passed | 0.00045 seconds |
10
+ ./spec/binding/plugins/ocr_backend_spec.rb[1:5:1] | passed | 0.00007 seconds |
11
+ ./spec/binding/plugins/ocr_backend_spec.rb[1:5:2] | passed | 0.00052 seconds |
12
+ ./spec/binding/plugins/ocr_backend_spec.rb[1:6:1:1] | passed | 0.00012 seconds |
13
+ ./spec/binding/plugins/ocr_backend_spec.rb[1:6:1:2] | passed | 0.00079 seconds |
14
+ ./spec/binding/plugins/ocr_backend_spec.rb[1:6:2:1] | passed | 0.00004 seconds |
15
+ ./spec/binding/plugins/ocr_backend_spec.rb[1:6:2:2] | passed | 0.00029 seconds |
16
+ ./spec/binding/plugins/postprocessor_spec.rb[1:1:1] | failed | 0.00139 seconds |
17
+ ./spec/binding/plugins/postprocessor_spec.rb[1:1:2] | failed | 0.00153 seconds |
18
+ ./spec/binding/plugins/postprocessor_spec.rb[1:1:3] | failed | 0.0014 seconds |
19
+ ./spec/binding/plugins/postprocessor_spec.rb[1:2:1] | failed | 0.00182 seconds |
20
+ ./spec/binding/plugins/postprocessor_spec.rb[1:2:2] | failed | 0.00209 seconds |
21
+ ./spec/binding/plugins/postprocessor_spec.rb[1:3:1] | failed | 0.00165 seconds |
22
+ ./spec/binding/plugins/postprocessor_spec.rb[1:4:1] | failed | 0.00142 seconds |
23
+ ./spec/binding/plugins/postprocessor_spec.rb[1:4:2] | failed | 0.00148 seconds |
24
+ ./spec/binding/plugins/postprocessor_spec.rb[1:5:1] | failed | 0.00148 seconds |
25
+ ./spec/binding/plugins/postprocessor_spec.rb[1:6:1] | passed | 0.0001 seconds |
26
+ ./spec/binding/plugins/postprocessor_spec.rb[1:6:2] | passed | 0.00011 seconds |
27
+ ./spec/binding/plugins/postprocessor_spec.rb[1:7:1] | passed | 0.00003 seconds |
28
+ ./spec/binding/plugins/postprocessor_spec.rb[1:7:2] | passed | 0.00002 seconds |
29
+ ./spec/binding/plugins/postprocessor_spec.rb[1:7:3] | passed | 0.00003 seconds |
30
+ ./spec/binding/plugins/postprocessor_spec.rb[1:7:4] | passed | 0.00006 seconds |
31
+ ./spec/binding/plugins/validator_spec.rb[1:1:1] | failed | 0.00154 seconds |
32
+ ./spec/binding/plugins/validator_spec.rb[1:1:2] | failed | 0.00171 seconds |
33
+ ./spec/binding/plugins/validator_spec.rb[1:1:3] | passed | 0.00099 seconds |
34
+ ./spec/binding/plugins/validator_spec.rb[1:2:1] | failed | 0.00186 seconds |
35
+ ./spec/binding/plugins/validator_spec.rb[1:2:2] | failed | 0.0016 seconds |
36
+ ./spec/binding/plugins/validator_spec.rb[1:3:1] | failed | 0.00182 seconds |
37
+ ./spec/binding/plugins/validator_spec.rb[1:3:2] | failed | 0.0128 seconds |
38
+ ./spec/binding/plugins/validator_spec.rb[1:4:1] | failed | 0.00156 seconds |
39
+ ./spec/binding/plugins/validator_spec.rb[1:4:2] | passed | 0.0001 seconds |
40
+ ./spec/binding/plugins/validator_spec.rb[1:5:1] | failed | 0.00445 seconds |
41
+ ./spec/binding/plugins/validator_spec.rb[1:5:2] | failed | 0.00198 seconds |
42
+ ./spec/binding/plugins/validator_spec.rb[1:6:1] | failed | 0.00179 seconds |
43
+ ./spec/binding/plugins/validator_spec.rb[1:7:1] | passed | 0.00068 seconds |
44
+ ./spec/binding/plugins/validator_spec.rb[1:7:2] | passed | 0.00088 seconds |
45
+ ./spec/binding/plugins/validator_spec.rb[1:7:3] | passed | 0.00045 seconds |
46
+ ./spec/binding/plugins/validator_spec.rb[1:7:4] | passed | 0.00004 seconds |
47
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/email_spec.rb[1:1] | passed | 0.01048 seconds |
48
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/html_spec.rb[1:1] | passed | 1.95 seconds |
49
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/html_spec.rb[1:2] | passed | 0.00031 seconds |
50
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/image_spec.rb[1:1] | passed | 0.0027 seconds |
51
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/ocr_spec.rb[1:1] | passed | 0.04721 seconds |
52
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/ocr_spec.rb[1:2] | passed | 0.04402 seconds |
53
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/ocr_spec.rb[1:3] | passed | 3.41 seconds |
54
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/ocr_spec.rb[1:4] | passed | 0.34493 seconds |
55
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/ocr_spec.rb[1:5] | passed | 0.33223 seconds |
56
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:1] | passed | 2.74 seconds |
57
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:2] | passed | 0.00021 seconds |
58
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:3] | passed | 0.00035 seconds |
59
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:4] | passed | 0.00021 seconds |
60
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:5] | passed | 0.0003 seconds |
61
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:6] | passed | 0.00027 seconds |
62
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:7] | passed | 0.00023 seconds |
63
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:8] | passed | 0.00016 seconds |
64
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:9] | passed | 2.65 seconds |
65
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:10] | passed | 0.0003 seconds |
66
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:11] | passed | 0.0002 seconds |
67
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:12] | passed | 0.00984 seconds |
68
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:13] | passed | 0.00096 seconds |
69
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:14] | passed | 0.00115 seconds |
70
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:15] | passed | 0.00038 seconds |
71
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:16] | passed | 0.00448 seconds |
72
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:1] | passed | 0.99668 seconds |
73
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:2] | passed | 4.11 seconds |
74
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:3] | passed | 0.00451 seconds |
75
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:4] | passed | 0.07588 seconds |
76
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:5] | passed | 0.00339 seconds |
77
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:6] | passed | 0.00238 seconds |
78
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:7] | passed | 0.24683 seconds |
79
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:8] | passed | 0.07999 seconds |
80
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:9] | passed | 0.01214 seconds |
81
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:10] | passed | 0.00095 seconds |
82
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:11] | passed | 0.03728 seconds |
83
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:12] | passed | 0.01741 seconds |
84
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:13] | passed | 0.0036 seconds |
85
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:14] | passed | 0.89424 seconds |
86
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[1:1] | passed | 0.00228 seconds |
87
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[1:2] | passed | 0.0012 seconds |
88
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[2:1] | passed | 0.0008 seconds |
89
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[2:2] | passed | 0.00119 seconds |
90
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[2:3] | passed | 0.0013 seconds |
91
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[3:1] | passed | 0.00184 seconds |
92
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[3:2] | passed | 0.00053 seconds |
93
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[3:3] | passed | 0.00004 seconds |
94
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[4:1] | passed | 0.00049 seconds |
95
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[4:2] | passed | 0.00006 seconds |
96
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[4:3] | passed | 0.00005 seconds |
97
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[5:1] | passed | 0.00007 seconds |
98
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[5:2] | passed | 0.00011 seconds |
99
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[6:1] | passed | 0.00003 seconds |
100
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[6:2] | passed | 0.00002 seconds |
101
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/structured_spec.rb[1:1] | passed | 0.00101 seconds |
102
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/structured_spec.rb[1:2] | passed | 0.00041 seconds |
103
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/structured_spec.rb[1:3] | passed | 0.00035 seconds |
104
+ /Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/xml_spec.rb[1:1] | passed | 0.00078 seconds |
@@ -1,3 +1,4 @@
1
+ # Test configuration file for Kreuzberg Ruby bindings
1
2
 
2
3
  use_cache: false
3
4
  enable_quality_processing: true
data/spec/spec_helper.rb CHANGED
@@ -30,7 +30,7 @@ RSpec.configure do |config|
30
30
 
31
31
  def test_document_path(relative_path)
32
32
  # Go up from packages/ruby/spec to project root, then into test_documents
33
- File.expand_path(File.join(__dir__, '..', '..', '..', 'test_documents', relative_path))
33
+ File.join(__dir__, '..', '..', '..', 'test_documents', relative_path)
34
34
  end
35
35
 
36
36
  def create_test_file(content, filename: 'test.txt')
@@ -1,62 +1,41 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version.workspace = true
3
+ version = "4.0.0-rc.1"
4
4
  edition.workspace = true
5
5
  rust-version.workspace = true
6
- authors.workspace = true
6
+ authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
7
7
  description = "High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 50+ formats with async/sync APIs."
8
- license.workspace = true
9
- repository = "https://github.com/kreuzberg-dev/kreuzberg"
8
+ license = "MIT"
9
+ repository = "https://github.com/Goldziher/kreuzberg"
10
10
  homepage = "https://kreuzberg.dev"
11
11
  documentation = "https://docs.rs/kreuzberg"
12
- keywords = ["document", "extraction", "pdf", "ocr", "parser"]
13
- categories = ["parser-implementations", "text-processing"]
12
+ keywords = ["document", "extraction", "pdf", "ocr", "parsing"]
13
+ categories = ["parsing", "text-processing", "asynchronous", "data-structures"]
14
14
  readme = "README.md"
15
15
 
16
16
  [lib]
17
17
  crate-type = ["rlib"]
18
18
 
19
19
  [features]
20
- default = ["tokio-runtime"]
20
+ default = []
21
21
 
22
- tokio-runtime = []
23
22
  profiling = ["dep:pprof"]
24
23
 
25
24
  # Format extractors
26
25
  pdf = ["dep:pdfium-render", "dep:lopdf", "dep:image"]
27
26
  excel = ["dep:calamine", "dep:polars"]
28
- office = [
29
- "dep:roxmltree",
30
- "dep:zip",
31
- "dep:docx-lite",
32
- "dep:quick-xml",
33
- "dep:pulldown-cmark",
34
- "dep:biblatex",
35
- "dep:org",
36
- "dep:rtf-parser",
37
- "dep:rst_parser",
38
- "dep:fb2",
39
- "dep:typst-syntax",
40
- "html", # EPUB needs HTML parsing (zip + roxmltree + html-to-markdown-rs)
41
- ]
27
+ office = ["dep:roxmltree", "dep:zip", "dep:docx-lite"]
42
28
  email = ["dep:mail-parser", "dep:msg_parser"]
43
- html = ["dep:html-to-markdown-rs"]
29
+ html = ["dep:html-to-markdown-rs", "dep:html-escape", "dep:scraper"]
44
30
  xml = ["dep:quick-xml", "dep:roxmltree"]
45
31
  archives = ["dep:zip", "dep:tar", "dep:sevenz-rust"]
46
32
 
47
33
  # Processing features
48
- ocr = [
49
- "dep:kreuzberg-tesseract",
50
- "dep:image",
51
- "dep:fast_image_resize",
52
- "dep:ndarray",
53
- "dep:kamadak-exif",
54
- "html",
55
- ]
34
+ ocr = ["dep:kreuzberg-tesseract", "dep:image", "dep:fast_image_resize", "dep:ndarray", "dep:kamadak-exif", "html"]
56
35
  language-detection = ["dep:whatlang"]
57
36
  chunking = ["dep:text-splitter"]
58
37
  embeddings = ["dep:fastembed", "chunking"]
59
- stopwords = [] # Stopwords for keyword extraction and token reduction
38
+ stopwords = [] # Stopwords for keyword extraction and token reduction
60
39
  quality = ["dep:unicode-normalization", "dep:chardetng", "dep:encoding_rs", "stopwords"]
61
40
 
62
41
  # Keyword extraction (requires stopwords)
@@ -68,59 +47,48 @@ keywords = ["keywords-yake", "keywords-rake"]
68
47
  api = ["dep:axum", "dep:tower", "dep:tower-http"]
69
48
  mcp = ["dep:rmcp"]
70
49
 
71
- # Observability features
72
- otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:tracing-opentelemetry"]
73
-
74
50
  # Convenience bundles
75
51
  full = [
76
- "pdf",
77
- "excel",
78
- "office",
79
- "email",
80
- "html",
81
- "xml",
82
- "archives",
83
- "ocr",
84
- "language-detection",
85
- "chunking",
86
- "quality",
87
- "keywords",
52
+ "pdf", "excel", "office", "email", "html", "xml", "archives",
53
+ "ocr", "language-detection", "chunking", "quality", "keywords"
88
54
  ]
89
55
  server = ["pdf", "excel", "html", "ocr", "api", "mcp"]
90
56
  cli = ["pdf", "excel", "office", "html", "ocr", "language-detection", "chunking", "quality"]
91
57
 
92
58
  [build-dependencies]
93
- tracing = { workspace = true }
59
+ tracing = "0.1"
94
60
 
95
61
  [dependencies]
96
62
  # Core dependencies (always included)
97
- ahash = { workspace = true }
98
- async-trait = { workspace = true }
99
- base64 = { workspace = true }
100
- base64-simd = "0.8"
101
- hex = { workspace = true }
63
+ ahash = "0.8.12"
64
+ async-trait = "0.1.89"
65
+ base64 = "0.22.1"
66
+ hex = "0.4.3"
102
67
  lazy_static = "1.5.0"
103
- libc = { workspace = true }
68
+ libc = "0.2"
104
69
  memchr = "2.7.6"
105
- num_cpus = { workspace = true }
70
+ num_cpus = "1.17.0"
106
71
  once_cell = "1.21.3"
107
72
  paste = "1.0"
108
73
  rayon = "1.11.0"
109
74
  regex = "1.12.2"
110
- serde = { workspace = true }
111
- serde_json = { workspace = true }
75
+ serde = { version = "1.0.228", features = ["derive"] }
76
+ serde_json = "1.0.145"
112
77
  serde_yaml_ng = "0.10.0"
113
78
  toml = "0.9.8"
114
79
  mime_guess = "2.0"
115
80
  rmp-serde = "1.3"
116
- thiserror = { workspace = true }
117
- tokio = { workspace = true }
118
- uuid = { version = "1.19.0", features = ["v4"] }
81
+ thiserror = "2.0.17"
82
+ tokio = { version = "1.48.0", features = ["process", "fs", "rt", "rt-multi-thread", "macros", "time", "sync", "io-util"] }
83
+ uuid = { version = "1.18.1", features = ["v4"] }
119
84
  indexmap = "2.12.1"
120
- tracing = { workspace = true }
121
- reqwest = { workspace = true, default-features = false, features = ["json", "rustls-tls"] }
85
+ tracing = "0.1"
86
+ pprof = { version = "0.15.0", features = ["flamegraph"], optional = true }
87
+ reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
88
+ base64-simd = "0.8"
89
+
122
90
  # Format extractors (optional)
123
- pdfium-render = { version = "0.8.37", features = ["thread_safe", "image"], optional = true }
91
+ pdfium-render = { version = "0.8.36", features = ["thread_safe", "image"], optional = true }
124
92
  lopdf = { version = "0.38.0", optional = true }
125
93
  calamine = { version = "0.32.0", features = ["dates"], optional = true }
126
94
  polars = { version = "0.52.0", default-features = false, features = ["ipc"], optional = true }
@@ -128,77 +96,39 @@ roxmltree = { version = "0.21.1", optional = true }
128
96
  zip = { version = "6.0.0", optional = true }
129
97
  mail-parser = { version = "0.11.1", optional = true }
130
98
  msg_parser = { version = "0.1.1", optional = true }
131
- html-to-markdown-rs = { version = "2.12.0", features = ["inline-images"], optional = true }
132
- quick-xml = { version = "0.38.4", features = ["serialize"], optional = true }
99
+ html-to-markdown-rs = { version = "2.9.1", features = ["inline-images"], optional = true }
100
+ html-escape = { version = "0.2.13", optional = true }
101
+ scraper = { version = "0.24.0", optional = true }
102
+ quick-xml = { version = "0.38.4", optional = true }
133
103
  tar = { version = "0.4.44", optional = true }
134
104
  sevenz-rust = { version = "0.6.1", optional = true }
135
105
  docx-lite = { version = "0.2.0", optional = true }
136
106
 
137
- pulldown-cmark = { version = "0.13", optional = true }
138
- biblatex = { version = "0.11", optional = true }
139
- org = { version = "0.3", optional = true }
140
- rtf-parser = { version = "0.4", optional = true }
141
- rst_parser = { version = "0.4", optional = true }
142
- fb2 = { version = "0.4", optional = true }
143
- typst-syntax = { version = "0.14", optional = true }
144
-
145
107
  # Processing features (optional)
146
- kreuzberg-tesseract = { version = "4.0.0-rc.6", optional = true }
147
- image = { workspace = true, default-features = false, features = [
148
- "png",
149
- "jpeg",
150
- "webp",
151
- "bmp",
152
- "tiff",
153
- "gif",
154
- "rayon",
155
- ], optional = true }
156
- fast_image_resize = { version = "5.4.0", optional = true }
108
+ kreuzberg-tesseract = { version = "1.0.0-rc.1", optional = true }
109
+ image = { version = "0.25.9", default-features = false, features = ["png", "jpeg", "webp", "bmp", "tiff", "gif", "rayon"], optional = true }
110
+ fast_image_resize = { version = "5.1.4", optional = true }
157
111
  ndarray = { version = "0.17.1", optional = true }
158
112
  kamadak-exif = { version = "0.6.1", optional = true }
159
113
  whatlang = { version = "0.18.0", optional = true }
160
114
  text-splitter = { version = "0.28.0", features = ["markdown"], optional = true }
161
- fastembed = { version = "5.4", default-features = false, features = [
162
- "hf-hub-rustls-tls",
163
- "ort-download-binaries",
164
- ], optional = true }
115
+ fastembed = { version = "*", git = "https://github.com/kreuzberg-dev/fastembed-rs", default-features = false, features = ["hf-hub", "hf-hub-rustls-tls", "ort-download-binaries"], optional = true }
165
116
  unicode-normalization = { version = "0.1.25", optional = true }
166
117
  chardetng = { version = "0.1.17", optional = true }
167
118
  encoding_rs = { version = "0.8.35", optional = true }
168
119
  yake-rust = { version = "1.0.3", optional = true }
169
120
  rake = { version = "0.3.6", optional = true }
121
+
170
122
  # Server features (optional)
171
123
  axum = { version = "0.8", features = ["macros", "json", "multipart"], optional = true }
172
124
  tower = { version = "0.5", optional = true }
173
125
  tower-http = { version = "0.6", features = ["cors", "trace", "limit"], optional = true }
174
- rmcp = { version = "0.11.0", features = [
175
- "server",
176
- "macros",
177
- "base64",
178
- "transport-io",
179
- ], optional = true }
180
- # Observability features (optional)
181
- opentelemetry = { version = "0.31", features = ["trace"], optional = true }
182
- opentelemetry_sdk = { version = "0.31", features = ["rt-tokio"], optional = true }
183
- tracing-opentelemetry = { version = "0.32", optional = true }
126
+ rmcp = { version = "0.9.0", features = ["server", "macros", "base64", "transport-io"], optional = true }
184
127
  infer = "0.19.0"
185
128
 
186
129
  [dev-dependencies]
187
- tempfile = { workspace = true }
130
+ tempfile = "3.23.0"
188
131
  filetime = "0.2"
189
132
  tar = "0.4.44"
190
133
  zip = "6.0.0"
191
134
  serial_test = "3.2.0"
192
- anyhow = { workspace = true }
193
- tokio-test = "0.4"
194
- tracing-subscriber = { version = "0.3", features = ["env-filter"] }
195
- criterion = { workspace = true }
196
- image = { workspace = true, default-features = false, features = ["png"] }
197
-
198
- [[bench]]
199
- name = "otel_overhead"
200
- harness = false
201
-
202
- # Only build profiling tooling on non-Windows targets (pprof depends on Unix APIs)
203
- [target.'cfg(not(target_os = "windows"))'.dependencies]
204
- pprof = { version = "0.15.0", features = ["flamegraph"], optional = true }
@@ -2,7 +2,7 @@
2
2
 
3
3
  [![Crates.io](https://img.shields.io/crates/v/kreuzberg)](https://crates.io/crates/kreuzberg)
4
4
  [![PyPI](https://img.shields.io/pypi/v/kreuzberg)](https://pypi.org/project/kreuzberg/)
5
- [![npm](https://img.shields.io/npm/v/kreuzberg)](https://www.npmjs.com/package/kreuzberg)
5
+ [![npm](https://img.shields.io/npm/v/@goldziher/kreuzberg)](https://www.npmjs.com/package/@goldziher/kreuzberg)
6
6
  [![RubyGems](https://img.shields.io/gem/v/kreuzberg)](https://rubygems.org/gems/kreuzberg)
7
7
  [![docs.rs](https://docs.rs/kreuzberg/badge.svg)](https://docs.rs/kreuzberg)
8
8
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
@@ -13,7 +13,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
13
13
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
14
14
 
15
15
  > **🚀 Version 4.0.0 Release Candidate**
16
- > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
16
+ > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/Goldziher/kreuzberg/issues) you encounter.
17
17
  >
18
18
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
19
19
 
@@ -290,6 +290,7 @@ fn copy_lib_to_package(pdfium_dir: &Path, target: &str) {
290
290
  return;
291
291
  }
292
292
 
293
+ // Fix install_name on macOS to use @rpath
293
294
  if target.contains("darwin") {
294
295
  fix_macos_install_name(&src_lib, &runtime_lib_name);
295
296
  codesign_if_needed(target, &src_lib);
@@ -298,13 +299,9 @@ fn copy_lib_to_package(pdfium_dir: &Path, target: &str) {
298
299
  let crate_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
299
300
  let workspace_root = crate_dir.parent().unwrap().parent().unwrap();
300
301
 
302
+ // Copy to target directory for CLI binary
301
303
  if let Ok(profile) = env::var("PROFILE") {
302
- let target_dir = if let Ok(cargo_target) = env::var("TARGET") {
303
- workspace_root.join("target").join(cargo_target).join(&profile)
304
- } else {
305
- workspace_root.join("target").join(&profile)
306
- };
307
-
304
+ let target_dir = workspace_root.join("target").join(profile);
308
305
  if target_dir.exists() {
309
306
  copy_lib_if_needed(
310
307
  &src_lib,
@@ -313,18 +310,6 @@ fn copy_lib_to_package(pdfium_dir: &Path, target: &str) {
313
310
  target,
314
311
  );
315
312
  }
316
-
317
- // Also copy to target/{profile} for Java FFI (Maven expects it here)
318
- let simple_target_dir = workspace_root.join("target").join(&profile);
319
- if simple_target_dir != target_dir {
320
- fs::create_dir_all(&simple_target_dir).ok();
321
- copy_lib_if_needed(
322
- &src_lib,
323
- &simple_target_dir.join(&runtime_lib_name),
324
- "Java FFI target directory",
325
- target,
326
- );
327
- }
328
313
  }
329
314
 
330
315
  let python_dest_dir = workspace_root.join("packages").join("python").join("kreuzberg");
@@ -450,6 +435,7 @@ fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
450
435
  fn fix_macos_install_name(lib_path: &Path, lib_name: &str) {
451
436
  use std::process::Command;
452
437
 
438
+ // Change install_name from ./libpdfium.dylib to @rpath/libpdfium.dylib
453
439
  let new_install_name = format!("@rpath/{}", lib_name);
454
440
 
455
441
  tracing::debug!("Fixing install_name for {} to {}", lib_path.display(), new_install_name);