kreuzberg 4.0.0.pre.rc.14 → 4.0.0.pre.rc.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +25 -215
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -2
- data/ext/kreuzberg_rb/native/build.rs +38 -1
- data/lib/kreuzberg/result.rb +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/{libpdfium.dylib → libpdfium.so} +0 -0
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/build.rs +54 -10
- data/vendor/kreuzberg/src/api/mod.rs +8 -0
- data/vendor/kreuzberg/src/extraction/html.rs +40 -7
- data/vendor/kreuzberg/src/pdf/bundled.rs +115 -9
- data/vendor/kreuzberg/tests/format_integration.rs +1 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +11 -21
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -63
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/build.rs +0 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1ac94696cb48598d98ae55f75c69c59e1d248577b965a3921e21998ee33d2352
|
|
4
|
+
data.tar.gz: 684e9f74a5f0d5c2c52677fec3cec493707b084dc77815396b237864dfeded90
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 6ed0b13217aad741e169850f155a28f921a37a41ffa95fb12a733798b49625f7a9db030eae90ddf00ee3e367b5a563a426fa301c8a16604c8ad5ca3ba78432fc
|
|
7
|
+
data.tar.gz: 6c3acf2fb24f573a65e81fdac91f3735a6e2335c340d79a453d73fb43b63b807a8b9e93bbbba38a8c55550be72f0f503513b142238a1c3965e279d8ed522b3ae
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.0.0.pre.rc.
|
|
4
|
+
kreuzberg (4.0.0.pre.rc.15)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
@@ -136,7 +136,6 @@ GEM
|
|
|
136
136
|
yard (0.9.38)
|
|
137
137
|
|
|
138
138
|
PLATFORMS
|
|
139
|
-
arm64-darwin-23
|
|
140
139
|
arm64-darwin-24
|
|
141
140
|
x86_64-linux
|
|
142
141
|
|
|
@@ -75,56 +75,6 @@ dependencies = [
|
|
|
75
75
|
"libc",
|
|
76
76
|
]
|
|
77
77
|
|
|
78
|
-
[[package]]
|
|
79
|
-
name = "anstream"
|
|
80
|
-
version = "0.6.21"
|
|
81
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
82
|
-
checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a"
|
|
83
|
-
dependencies = [
|
|
84
|
-
"anstyle",
|
|
85
|
-
"anstyle-parse",
|
|
86
|
-
"anstyle-query",
|
|
87
|
-
"anstyle-wincon",
|
|
88
|
-
"colorchoice",
|
|
89
|
-
"is_terminal_polyfill",
|
|
90
|
-
"utf8parse",
|
|
91
|
-
]
|
|
92
|
-
|
|
93
|
-
[[package]]
|
|
94
|
-
name = "anstyle"
|
|
95
|
-
version = "1.0.13"
|
|
96
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
97
|
-
checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78"
|
|
98
|
-
|
|
99
|
-
[[package]]
|
|
100
|
-
name = "anstyle-parse"
|
|
101
|
-
version = "0.2.7"
|
|
102
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
103
|
-
checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2"
|
|
104
|
-
dependencies = [
|
|
105
|
-
"utf8parse",
|
|
106
|
-
]
|
|
107
|
-
|
|
108
|
-
[[package]]
|
|
109
|
-
name = "anstyle-query"
|
|
110
|
-
version = "1.1.5"
|
|
111
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
112
|
-
checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
|
|
113
|
-
dependencies = [
|
|
114
|
-
"windows-sys 0.61.2",
|
|
115
|
-
]
|
|
116
|
-
|
|
117
|
-
[[package]]
|
|
118
|
-
name = "anstyle-wincon"
|
|
119
|
-
version = "3.0.11"
|
|
120
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
121
|
-
checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
|
|
122
|
-
dependencies = [
|
|
123
|
-
"anstyle",
|
|
124
|
-
"once_cell_polyfill",
|
|
125
|
-
"windows-sys 0.61.2",
|
|
126
|
-
]
|
|
127
|
-
|
|
128
78
|
[[package]]
|
|
129
79
|
name = "anyhow"
|
|
130
80
|
version = "1.0.100"
|
|
@@ -466,30 +416,15 @@ dependencies = [
|
|
|
466
416
|
"syn",
|
|
467
417
|
]
|
|
468
418
|
|
|
469
|
-
[[package]]
|
|
470
|
-
name = "bit-set"
|
|
471
|
-
version = "0.6.0"
|
|
472
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
473
|
-
checksum = "f0481a0e032742109b1133a095184ee93d88f3dc9e0d28a5d033dc77a073f44f"
|
|
474
|
-
dependencies = [
|
|
475
|
-
"bit-vec 0.7.0",
|
|
476
|
-
]
|
|
477
|
-
|
|
478
419
|
[[package]]
|
|
479
420
|
name = "bit-set"
|
|
480
421
|
version = "0.8.0"
|
|
481
422
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
482
423
|
checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
|
|
483
424
|
dependencies = [
|
|
484
|
-
"bit-vec
|
|
425
|
+
"bit-vec",
|
|
485
426
|
]
|
|
486
427
|
|
|
487
|
-
[[package]]
|
|
488
|
-
name = "bit-vec"
|
|
489
|
-
version = "0.7.0"
|
|
490
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
491
|
-
checksum = "d2c54ff287cfc0a34f38a6b832ea1bd8e448a330b3e40a50859e6488bee07f22"
|
|
492
|
-
|
|
493
428
|
[[package]]
|
|
494
429
|
name = "bit-vec"
|
|
495
430
|
version = "0.8.0"
|
|
@@ -661,25 +596,6 @@ dependencies = [
|
|
|
661
596
|
"cipher",
|
|
662
597
|
]
|
|
663
598
|
|
|
664
|
-
[[package]]
|
|
665
|
-
name = "cbindgen"
|
|
666
|
-
version = "0.29.2"
|
|
667
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
668
|
-
checksum = "befbfd072a8e81c02f8c507aefce431fe5e7d051f83d48a23ffc9b9fe5a11799"
|
|
669
|
-
dependencies = [
|
|
670
|
-
"clap",
|
|
671
|
-
"heck",
|
|
672
|
-
"indexmap",
|
|
673
|
-
"log",
|
|
674
|
-
"proc-macro2",
|
|
675
|
-
"quote",
|
|
676
|
-
"serde",
|
|
677
|
-
"serde_json",
|
|
678
|
-
"syn",
|
|
679
|
-
"tempfile",
|
|
680
|
-
"toml 0.9.10+spec-1.1.0",
|
|
681
|
-
]
|
|
682
|
-
|
|
683
599
|
[[package]]
|
|
684
600
|
name = "cc"
|
|
685
601
|
version = "1.2.50"
|
|
@@ -780,33 +696,6 @@ dependencies = [
|
|
|
780
696
|
"libloading 0.8.9",
|
|
781
697
|
]
|
|
782
698
|
|
|
783
|
-
[[package]]
|
|
784
|
-
name = "clap"
|
|
785
|
-
version = "4.5.53"
|
|
786
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
787
|
-
checksum = "c9e340e012a1bf4935f5282ed1436d1489548e8f72308207ea5df0e23d2d03f8"
|
|
788
|
-
dependencies = [
|
|
789
|
-
"clap_builder",
|
|
790
|
-
]
|
|
791
|
-
|
|
792
|
-
[[package]]
|
|
793
|
-
name = "clap_builder"
|
|
794
|
-
version = "4.5.53"
|
|
795
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
796
|
-
checksum = "d76b5d13eaa18c901fd2f7fca939fefe3a0727a953561fefdf3b2922b8569d00"
|
|
797
|
-
dependencies = [
|
|
798
|
-
"anstream",
|
|
799
|
-
"anstyle",
|
|
800
|
-
"clap_lex",
|
|
801
|
-
"strsim",
|
|
802
|
-
]
|
|
803
|
-
|
|
804
|
-
[[package]]
|
|
805
|
-
name = "clap_lex"
|
|
806
|
-
version = "0.7.6"
|
|
807
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
808
|
-
checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
|
|
809
|
-
|
|
810
699
|
[[package]]
|
|
811
700
|
name = "cmake"
|
|
812
701
|
version = "0.1.57"
|
|
@@ -831,12 +720,6 @@ version = "1.1.0"
|
|
|
831
720
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
832
721
|
checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b"
|
|
833
722
|
|
|
834
|
-
[[package]]
|
|
835
|
-
name = "colorchoice"
|
|
836
|
-
version = "1.0.4"
|
|
837
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
838
|
-
checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75"
|
|
839
|
-
|
|
840
723
|
[[package]]
|
|
841
724
|
name = "compact_str"
|
|
842
725
|
version = "0.9.0"
|
|
@@ -945,9 +828,9 @@ dependencies = [
|
|
|
945
828
|
|
|
946
829
|
[[package]]
|
|
947
830
|
name = "crc"
|
|
948
|
-
version = "3.
|
|
831
|
+
version = "3.3.0"
|
|
949
832
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
950
|
-
checksum = "
|
|
833
|
+
checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675"
|
|
951
834
|
dependencies = [
|
|
952
835
|
"crc-catalog",
|
|
953
836
|
]
|
|
@@ -1401,7 +1284,7 @@ version = "0.14.0"
|
|
|
1401
1284
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1402
1285
|
checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298"
|
|
1403
1286
|
dependencies = [
|
|
1404
|
-
"bit-set
|
|
1287
|
+
"bit-set",
|
|
1405
1288
|
"regex-automata",
|
|
1406
1289
|
"regex-syntax",
|
|
1407
1290
|
]
|
|
@@ -1497,17 +1380,6 @@ dependencies = [
|
|
|
1497
1380
|
"windows-sys 0.60.2",
|
|
1498
1381
|
]
|
|
1499
1382
|
|
|
1500
|
-
[[package]]
|
|
1501
|
-
name = "filetime_creation"
|
|
1502
|
-
version = "0.2.0"
|
|
1503
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1504
|
-
checksum = "c25b5d475550e559de5b0c0084761c65325444e3b6c9e298af9cefe7a9ef3a5f"
|
|
1505
|
-
dependencies = [
|
|
1506
|
-
"cfg-if",
|
|
1507
|
-
"filetime",
|
|
1508
|
-
"windows-sys 0.52.0",
|
|
1509
|
-
]
|
|
1510
|
-
|
|
1511
1383
|
[[package]]
|
|
1512
1384
|
name = "find-msvc-tools"
|
|
1513
1385
|
version = "0.1.5"
|
|
@@ -2312,12 +2184,6 @@ dependencies = [
|
|
|
2312
2184
|
"serde",
|
|
2313
2185
|
]
|
|
2314
2186
|
|
|
2315
|
-
[[package]]
|
|
2316
|
-
name = "is_terminal_polyfill"
|
|
2317
|
-
version = "1.70.2"
|
|
2318
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2319
|
-
checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
|
|
2320
|
-
|
|
2321
2187
|
[[package]]
|
|
2322
2188
|
name = "itertools"
|
|
2323
2189
|
version = "0.12.1"
|
|
@@ -2414,7 +2280,7 @@ dependencies = [
|
|
|
2414
2280
|
|
|
2415
2281
|
[[package]]
|
|
2416
2282
|
name = "kreuzberg"
|
|
2417
|
-
version = "4.0.0-rc.
|
|
2283
|
+
version = "4.0.0-rc.14"
|
|
2418
2284
|
dependencies = [
|
|
2419
2285
|
"ahash",
|
|
2420
2286
|
"async-trait",
|
|
@@ -2467,7 +2333,7 @@ dependencies = [
|
|
|
2467
2333
|
"serde",
|
|
2468
2334
|
"serde_json",
|
|
2469
2335
|
"serde_yaml_ng",
|
|
2470
|
-
"sevenz-
|
|
2336
|
+
"sevenz-rust2",
|
|
2471
2337
|
"tar",
|
|
2472
2338
|
"text-splitter",
|
|
2473
2339
|
"thiserror 2.0.17",
|
|
@@ -2483,30 +2349,16 @@ dependencies = [
|
|
|
2483
2349
|
"uuid",
|
|
2484
2350
|
"whatlang",
|
|
2485
2351
|
"yake-rust",
|
|
2486
|
-
"zip
|
|
2487
|
-
]
|
|
2488
|
-
|
|
2489
|
-
[[package]]
|
|
2490
|
-
name = "kreuzberg-ffi"
|
|
2491
|
-
version = "4.0.0-rc.13"
|
|
2492
|
-
dependencies = [
|
|
2493
|
-
"async-trait",
|
|
2494
|
-
"cbindgen",
|
|
2495
|
-
"html-to-markdown-rs",
|
|
2496
|
-
"kreuzberg",
|
|
2497
|
-
"serde",
|
|
2498
|
-
"serde_json",
|
|
2499
|
-
"tokio",
|
|
2352
|
+
"zip 7.0.0",
|
|
2500
2353
|
]
|
|
2501
2354
|
|
|
2502
2355
|
[[package]]
|
|
2503
2356
|
name = "kreuzberg-rb"
|
|
2504
|
-
version = "4.0.0-rc.
|
|
2357
|
+
version = "4.0.0-rc.15"
|
|
2505
2358
|
dependencies = [
|
|
2506
2359
|
"async-trait",
|
|
2507
2360
|
"html-to-markdown-rs",
|
|
2508
2361
|
"kreuzberg",
|
|
2509
|
-
"kreuzberg-ffi",
|
|
2510
2362
|
"magnus",
|
|
2511
2363
|
"pretty_assertions",
|
|
2512
2364
|
"rb-sys",
|
|
@@ -2516,14 +2368,14 @@ dependencies = [
|
|
|
2516
2368
|
|
|
2517
2369
|
[[package]]
|
|
2518
2370
|
name = "kreuzberg-tesseract"
|
|
2519
|
-
version = "4.0.0-rc.
|
|
2371
|
+
version = "4.0.0-rc.14"
|
|
2520
2372
|
dependencies = [
|
|
2521
2373
|
"cc",
|
|
2522
2374
|
"cmake",
|
|
2523
2375
|
"libc",
|
|
2524
2376
|
"reqwest",
|
|
2525
2377
|
"thiserror 2.0.17",
|
|
2526
|
-
"zip
|
|
2378
|
+
"zip 7.0.0",
|
|
2527
2379
|
]
|
|
2528
2380
|
|
|
2529
2381
|
[[package]]
|
|
@@ -2726,20 +2578,11 @@ dependencies = [
|
|
|
2726
2578
|
"libc",
|
|
2727
2579
|
]
|
|
2728
2580
|
|
|
2729
|
-
[[package]]
|
|
2730
|
-
name = "lzma-rust"
|
|
2731
|
-
version = "0.1.7"
|
|
2732
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2733
|
-
checksum = "5baab2bbbd7d75a144d671e9ff79270e903957d92fb7386fd39034c709bd2661"
|
|
2734
|
-
dependencies = [
|
|
2735
|
-
"byteorder",
|
|
2736
|
-
]
|
|
2737
|
-
|
|
2738
2581
|
[[package]]
|
|
2739
2582
|
name = "lzma-rust2"
|
|
2740
|
-
version = "0.
|
|
2583
|
+
version = "0.15.4"
|
|
2741
2584
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2742
|
-
checksum = "
|
|
2585
|
+
checksum = "48172246aa7c3ea28e423295dd1ca2589a24617cc4e588bb8cfe177cb2c54d95"
|
|
2743
2586
|
dependencies = [
|
|
2744
2587
|
"crc",
|
|
2745
2588
|
"sha2",
|
|
@@ -3076,16 +2919,6 @@ dependencies = [
|
|
|
3076
2919
|
"chrono",
|
|
3077
2920
|
]
|
|
3078
2921
|
|
|
3079
|
-
[[package]]
|
|
3080
|
-
name = "nt-time"
|
|
3081
|
-
version = "0.8.1"
|
|
3082
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3083
|
-
checksum = "2de419e64947cd8830e66beb584acc3fb42ed411d103e3c794dda355d1b374b5"
|
|
3084
|
-
dependencies = [
|
|
3085
|
-
"chrono",
|
|
3086
|
-
"time",
|
|
3087
|
-
]
|
|
3088
|
-
|
|
3089
2922
|
[[package]]
|
|
3090
2923
|
name = "num-bigint"
|
|
3091
2924
|
version = "0.4.6"
|
|
@@ -3218,12 +3051,6 @@ version = "1.21.3"
|
|
|
3218
3051
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3219
3052
|
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
|
|
3220
3053
|
|
|
3221
|
-
[[package]]
|
|
3222
|
-
name = "once_cell_polyfill"
|
|
3223
|
-
version = "1.70.2"
|
|
3224
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3225
|
-
checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
|
|
3226
|
-
|
|
3227
3054
|
[[package]]
|
|
3228
3055
|
name = "onig"
|
|
3229
3056
|
version = "6.5.1"
|
|
@@ -5042,18 +4869,19 @@ dependencies = [
|
|
|
5042
4869
|
]
|
|
5043
4870
|
|
|
5044
4871
|
[[package]]
|
|
5045
|
-
name = "sevenz-
|
|
5046
|
-
version = "0.
|
|
4872
|
+
name = "sevenz-rust2"
|
|
4873
|
+
version = "0.20.0"
|
|
5047
4874
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5048
|
-
checksum = "
|
|
4875
|
+
checksum = "611081ec4fc67633b979fc0c24385de90fa60acd18126d796c8758a24294a950"
|
|
5049
4876
|
dependencies = [
|
|
5050
|
-
"
|
|
5051
|
-
"
|
|
5052
|
-
"
|
|
5053
|
-
"
|
|
4877
|
+
"aes",
|
|
4878
|
+
"bzip2",
|
|
4879
|
+
"cbc",
|
|
4880
|
+
"crc32fast",
|
|
4881
|
+
"getrandom 0.3.4",
|
|
5054
4882
|
"js-sys",
|
|
5055
|
-
"lzma-
|
|
5056
|
-
"
|
|
4883
|
+
"lzma-rust2",
|
|
4884
|
+
"ppmd-rust",
|
|
5057
4885
|
"sha2",
|
|
5058
4886
|
"wasm-bindgen",
|
|
5059
4887
|
]
|
|
@@ -5401,19 +5229,6 @@ dependencies = [
|
|
|
5401
5229
|
"xattr",
|
|
5402
5230
|
]
|
|
5403
5231
|
|
|
5404
|
-
[[package]]
|
|
5405
|
-
name = "tempfile"
|
|
5406
|
-
version = "3.23.0"
|
|
5407
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5408
|
-
checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16"
|
|
5409
|
-
dependencies = [
|
|
5410
|
-
"fastrand",
|
|
5411
|
-
"getrandom 0.3.4",
|
|
5412
|
-
"once_cell",
|
|
5413
|
-
"rustix",
|
|
5414
|
-
"windows-sys 0.61.2",
|
|
5415
|
-
]
|
|
5416
|
-
|
|
5417
5232
|
[[package]]
|
|
5418
5233
|
name = "tendril"
|
|
5419
5234
|
version = "0.4.3"
|
|
@@ -6122,12 +5937,6 @@ version = "1.0.4"
|
|
|
6122
5937
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6123
5938
|
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
|
|
6124
5939
|
|
|
6125
|
-
[[package]]
|
|
6126
|
-
name = "utf8parse"
|
|
6127
|
-
version = "0.2.2"
|
|
6128
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6129
|
-
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
|
6130
|
-
|
|
6131
5940
|
[[package]]
|
|
6132
5941
|
name = "uuid"
|
|
6133
5942
|
version = "1.19.0"
|
|
@@ -6829,9 +6638,9 @@ dependencies = [
|
|
|
6829
6638
|
|
|
6830
6639
|
[[package]]
|
|
6831
6640
|
name = "zip"
|
|
6832
|
-
version = "
|
|
6641
|
+
version = "7.0.0"
|
|
6833
6642
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6834
|
-
checksum = "
|
|
6643
|
+
checksum = "bdd8a47718a4ee5fe78e07667cd36f3de80e7c2bfe727c7074245ffc7303c037"
|
|
6835
6644
|
dependencies = [
|
|
6836
6645
|
"aes",
|
|
6837
6646
|
"arbitrary",
|
|
@@ -6840,6 +6649,7 @@ dependencies = [
|
|
|
6840
6649
|
"crc32fast",
|
|
6841
6650
|
"deflate64",
|
|
6842
6651
|
"flate2",
|
|
6652
|
+
"generic-array",
|
|
6843
6653
|
"getrandom 0.3.4",
|
|
6844
6654
|
"hmac",
|
|
6845
6655
|
"indexmap",
|
|
@@ -7,7 +7,7 @@ rb-sys = { path = "../../../vendor/rb-sys" }
|
|
|
7
7
|
|
|
8
8
|
[package]
|
|
9
9
|
name = "kreuzberg-rb"
|
|
10
|
-
version = "4.0.0-rc.
|
|
10
|
+
version = "4.0.0-rc.15"
|
|
11
11
|
edition = "2024"
|
|
12
12
|
rust-version = "1.91"
|
|
13
13
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -30,7 +30,6 @@ default = []
|
|
|
30
30
|
[dependencies]
|
|
31
31
|
async-trait = "0.1.89"
|
|
32
32
|
kreuzberg = { path = "../../../vendor/kreuzberg", features = ["full"] }
|
|
33
|
-
kreuzberg-ffi = { path = "../../../vendor/kreuzberg-ffi", features = ["embeddings"] }
|
|
34
33
|
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
|
|
35
34
|
"rb-sys",
|
|
36
35
|
] }
|
|
@@ -1,5 +1,16 @@
|
|
|
1
1
|
#[cfg(target_os = "macos")]
|
|
2
2
|
fn main() {
|
|
3
|
+
if let Ok(cargo_manifest_dir) = std::env::var("CARGO_MANIFEST_DIR") {
|
|
4
|
+
let lib_path = std::path::Path::new(&cargo_manifest_dir)
|
|
5
|
+
.parent()
|
|
6
|
+
.and_then(|p| p.parent())
|
|
7
|
+
.and_then(|p| p.parent())
|
|
8
|
+
.and_then(|p| p.parent())
|
|
9
|
+
.and_then(|p| p.parent())
|
|
10
|
+
.map(|p| p.join("target/release"))
|
|
11
|
+
.expect("Failed to construct lib path");
|
|
12
|
+
println!("cargo:rustc-link-search={}", lib_path.display());
|
|
13
|
+
}
|
|
3
14
|
println!("cargo:rustc-link-arg=-Wl,-undefined,dynamic_lookup");
|
|
4
15
|
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
|
|
5
16
|
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.");
|
|
@@ -7,9 +18,35 @@ fn main() {
|
|
|
7
18
|
|
|
8
19
|
#[cfg(target_os = "linux")]
|
|
9
20
|
fn main() {
|
|
21
|
+
if let Ok(cargo_manifest_dir) = std::env::var("CARGO_MANIFEST_DIR") {
|
|
22
|
+
let lib_path = std::path::Path::new(&cargo_manifest_dir)
|
|
23
|
+
.parent()
|
|
24
|
+
.and_then(|p| p.parent())
|
|
25
|
+
.and_then(|p| p.parent())
|
|
26
|
+
.and_then(|p| p.parent())
|
|
27
|
+
.and_then(|p| p.parent())
|
|
28
|
+
.map(|p| p.join("target/release"))
|
|
29
|
+
.expect("Failed to construct lib path");
|
|
30
|
+
println!("cargo:rustc-link-search={}", lib_path.display());
|
|
31
|
+
}
|
|
10
32
|
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
|
|
11
33
|
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
|
|
12
34
|
}
|
|
13
35
|
|
|
14
|
-
#[cfg(
|
|
36
|
+
#[cfg(target_os = "windows")]
|
|
37
|
+
fn main() {
|
|
38
|
+
if let Ok(cargo_manifest_dir) = std::env::var("CARGO_MANIFEST_DIR") {
|
|
39
|
+
let lib_path = std::path::Path::new(&cargo_manifest_dir)
|
|
40
|
+
.parent()
|
|
41
|
+
.and_then(|p| p.parent())
|
|
42
|
+
.and_then(|p| p.parent())
|
|
43
|
+
.and_then(|p| p.parent())
|
|
44
|
+
.and_then(|p| p.parent())
|
|
45
|
+
.map(|p| p.join("target/release"))
|
|
46
|
+
.expect("Failed to construct lib path");
|
|
47
|
+
println!("cargo:rustc-link-search={}", lib_path.display());
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
#[cfg(not(any(target_os = "macos", target_os = "linux", target_os = "windows")))]
|
|
15
52
|
fn main() {}
|
data/lib/kreuzberg/result.rb
CHANGED
data/lib/kreuzberg/version.rb
CHANGED
|
Binary file
|
data/vendor/Cargo.toml
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
[workspace]
|
|
2
|
-
members = ["kreuzberg", "kreuzberg-
|
|
2
|
+
members = ["kreuzberg", "kreuzberg-tesseract"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.0.0-rc.
|
|
5
|
+
version = "4.0.0-rc.15"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
data/vendor/kreuzberg/build.rs
CHANGED
|
@@ -228,7 +228,10 @@ fn ensure_windows_import_library(pdfium_dir: &Path) {
|
|
|
228
228
|
/// Fetch the latest release version from a GitHub repository
|
|
229
229
|
///
|
|
230
230
|
/// Uses curl to query the GitHub API and extract the tag_name from the
|
|
231
|
-
/// latest release JSON response.
|
|
231
|
+
/// latest release JSON response. Uses improved JSON parsing with fallback logic.
|
|
232
|
+
///
|
|
233
|
+
/// For WASM (paulocoutinhox/pdfium-lib), falls back to known stable versions.
|
|
234
|
+
/// For non-WASM (bblanchon/pdfium-binaries), uses a different fallback strategy.
|
|
232
235
|
fn get_latest_version(repo: &str) -> String {
|
|
233
236
|
let api_url = format!("https://api.github.com/repos/{}/releases/latest", repo);
|
|
234
237
|
|
|
@@ -238,19 +241,60 @@ fn get_latest_version(repo: &str) -> String {
|
|
|
238
241
|
&& output.status.success()
|
|
239
242
|
{
|
|
240
243
|
let json = String::from_utf8_lossy(&output.stdout);
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
244
|
+
|
|
245
|
+
// Try to extract tag_name from JSON
|
|
246
|
+
if let Some(tag) = extract_tag_from_json(&json) {
|
|
247
|
+
return tag;
|
|
248
|
+
}
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
// Fallback versions based on repository
|
|
252
|
+
// These are stable versions known to have all required assets
|
|
253
|
+
if repo.contains("paulocoutinhox") {
|
|
254
|
+
eprintln!(
|
|
255
|
+
"cargo:warning=Failed to fetch latest PDFium WASM version from GitHub API, using fallback version 7442b"
|
|
256
|
+
);
|
|
257
|
+
"7442b".to_string()
|
|
258
|
+
} else if repo.contains("bblanchon") {
|
|
259
|
+
eprintln!(
|
|
260
|
+
"cargo:warning=Failed to fetch latest PDFium binaries version from GitHub API, using fallback version 7568"
|
|
261
|
+
);
|
|
262
|
+
"7568".to_string()
|
|
263
|
+
} else {
|
|
264
|
+
eprintln!(
|
|
265
|
+
"cargo:warning=Failed to fetch latest PDFium version from GitHub API (unknown repository: {})",
|
|
266
|
+
repo
|
|
267
|
+
);
|
|
268
|
+
String::new()
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
/// Extract tag_name from GitHub API JSON response
|
|
273
|
+
///
|
|
274
|
+
/// Parses JSON by finding the tag_name field and extracting the value between quotes.
|
|
275
|
+
/// Handles various JSON formatting variations.
|
|
276
|
+
fn extract_tag_from_json(json: &str) -> Option<String> {
|
|
277
|
+
// Look for "tag_name": "..." pattern
|
|
278
|
+
if let Some(start) = json.find("\"tag_name\"") {
|
|
279
|
+
let after_colon = &json[start + "\"tag_name\"".len()..];
|
|
280
|
+
|
|
281
|
+
// Skip whitespace and colon
|
|
282
|
+
let after_colon = after_colon.trim_start();
|
|
283
|
+
let after_colon = after_colon.strip_prefix(':')?;
|
|
284
|
+
let after_colon = after_colon.trim_start();
|
|
285
|
+
|
|
286
|
+
// Extract value between quotes
|
|
287
|
+
if let Some(opening_quote) = after_colon.find('"') {
|
|
288
|
+
let value_start = opening_quote + 1;
|
|
289
|
+
if let Some(closing_quote) = after_colon[value_start..].find('"') {
|
|
290
|
+
let tag = &after_colon[value_start..value_start + closing_quote];
|
|
291
|
+
// Handle releases with '/' in tag (e.g., "chromium/1234")
|
|
292
|
+
return Some(tag.split('/').next_back().unwrap_or(tag).to_string());
|
|
249
293
|
}
|
|
250
294
|
}
|
|
251
295
|
}
|
|
252
296
|
|
|
253
|
-
|
|
297
|
+
None
|
|
254
298
|
}
|
|
255
299
|
|
|
256
300
|
/// Get the download URL and library name for the target platform
|
|
@@ -8,6 +8,8 @@
|
|
|
8
8
|
//! - `POST /extract` - Extract text from uploaded files (multipart form data)
|
|
9
9
|
//! - `GET /health` - Health check endpoint
|
|
10
10
|
//! - `GET /info` - Server information
|
|
11
|
+
//! - `GET /cache/stats` - Get cache statistics
|
|
12
|
+
//! - `DELETE /cache/clear` - Clear all cached files
|
|
11
13
|
//!
|
|
12
14
|
//! # Examples
|
|
13
15
|
//!
|
|
@@ -62,6 +64,12 @@
|
|
|
62
64
|
//!
|
|
63
65
|
//! # Server info
|
|
64
66
|
//! curl http://localhost:8000/info
|
|
67
|
+
//!
|
|
68
|
+
//! # Cache statistics
|
|
69
|
+
//! curl http://localhost:8000/cache/stats
|
|
70
|
+
//!
|
|
71
|
+
//! # Clear cache
|
|
72
|
+
//! curl -X DELETE http://localhost:8000/cache/clear
|
|
65
73
|
//! ```
|
|
66
74
|
|
|
67
75
|
mod error;
|