kreuzcrawl 0.3.0.pre.rc.55 → 0.3.0.pre.rc.59
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/kreuzcrawl_rb/native/Cargo.lock +49 -25
- data/ext/kreuzcrawl_rb/native/Cargo.toml +2 -2
- data/ext/kreuzcrawl_rb/src/lib.rs +37 -15
- data/lib/kreuzcrawl/native.rb +1 -1
- data/lib/kreuzcrawl/version.rb +2 -2
- data/lib/kreuzcrawl.rb +1 -1
- data/lib/kreuzcrawl_rb.so +0 -0
- data/sig/types.rbs +6 -5
- metadata +5 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 119450f2df01a74e99d218b681740eb7367fccbfaea5754437df4f266b765a2e
|
|
4
|
+
data.tar.gz: ec3bd96a79e88e273c3c75a19e1e1630a46b7dde9f205455e7b64cf642a5e3ad
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz: '
|
|
6
|
+
metadata.gz: 473048d12a1cde342c72722cdd730efa94d95ab48583261f869cb68ab913805d44f821424d8ac3ede99f132400f6d79ba79d72ed9002faa454b1d17b3692d712
|
|
7
|
+
data.tar.gz: '0109b1c7a3f3430d1396127924b17937de44c7d9a699e3c6bd61acc2a27d4169510fe7fd027f6726eff3016a976c5d64dcd61b065d70915efae18cc55dd9f5e1'
|
|
@@ -280,9 +280,9 @@ checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
|
|
|
280
280
|
|
|
281
281
|
[[package]]
|
|
282
282
|
name = "cc"
|
|
283
|
-
version = "1.2.
|
|
283
|
+
version = "1.2.64"
|
|
284
284
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
285
|
-
checksum = "
|
|
285
|
+
checksum = "dad887fd958be91b5098c0248def011f4523ab786cd411be668777e55063501f"
|
|
286
286
|
dependencies = [
|
|
287
287
|
"find-msvc-tools",
|
|
288
288
|
"jobserver",
|
|
@@ -1013,19 +1013,21 @@ dependencies = [
|
|
|
1013
1013
|
|
|
1014
1014
|
[[package]]
|
|
1015
1015
|
name = "html-to-markdown-rs"
|
|
1016
|
-
version = "3.
|
|
1016
|
+
version = "3.6.1"
|
|
1017
1017
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1018
|
-
checksum = "
|
|
1018
|
+
checksum = "567f72be80982d7d1c9d46730506847e8782e62ff95bf8d3ede3489b3a8e963a"
|
|
1019
1019
|
dependencies = [
|
|
1020
1020
|
"ahash",
|
|
1021
1021
|
"astral-tl",
|
|
1022
1022
|
"base64",
|
|
1023
|
+
"bitflags",
|
|
1023
1024
|
"html-escape",
|
|
1024
1025
|
"html5ever",
|
|
1025
1026
|
"image",
|
|
1026
1027
|
"lru",
|
|
1027
1028
|
"memchr",
|
|
1028
1029
|
"once_cell",
|
|
1030
|
+
"phf",
|
|
1029
1031
|
"regex",
|
|
1030
1032
|
"serde",
|
|
1031
1033
|
"serde_json",
|
|
@@ -1426,9 +1428,9 @@ dependencies = [
|
|
|
1426
1428
|
|
|
1427
1429
|
[[package]]
|
|
1428
1430
|
name = "js-sys"
|
|
1429
|
-
version = "0.3.
|
|
1431
|
+
version = "0.3.102"
|
|
1430
1432
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1431
|
-
checksum = "
|
|
1433
|
+
checksum = "03d04c30968dffe80775bd4d7fb676131cd04a1fb46d2686dbffbaec2d9dfd31"
|
|
1432
1434
|
dependencies = [
|
|
1433
1435
|
"cfg-if",
|
|
1434
1436
|
"futures-util",
|
|
@@ -1457,9 +1459,9 @@ dependencies = [
|
|
|
1457
1459
|
|
|
1458
1460
|
[[package]]
|
|
1459
1461
|
name = "kreuzcrawl"
|
|
1460
|
-
version = "0.3.0-rc.55"
|
|
1462
|
+
version = "0.3.0-rc.59"
|
|
1461
1463
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1462
|
-
checksum = "
|
|
1464
|
+
checksum = "b7e01bcf1ea739b309c26f952a9ee4981d9080e9915863dc92ba3803df265c0e"
|
|
1463
1465
|
dependencies = [
|
|
1464
1466
|
"ahash",
|
|
1465
1467
|
"aho-corasick",
|
|
@@ -1480,6 +1482,7 @@ dependencies = [
|
|
|
1480
1482
|
"memchr",
|
|
1481
1483
|
"notify",
|
|
1482
1484
|
"opentelemetry",
|
|
1485
|
+
"opentelemetry-semantic-conventions",
|
|
1483
1486
|
"quick-xml",
|
|
1484
1487
|
"regex",
|
|
1485
1488
|
"reqwest",
|
|
@@ -1491,12 +1494,13 @@ dependencies = [
|
|
|
1491
1494
|
"tokio-stream",
|
|
1492
1495
|
"toml",
|
|
1493
1496
|
"tower",
|
|
1497
|
+
"tracing",
|
|
1494
1498
|
"url",
|
|
1495
1499
|
]
|
|
1496
1500
|
|
|
1497
1501
|
[[package]]
|
|
1498
1502
|
name = "kreuzcrawl-rb"
|
|
1499
|
-
version = "0.3.0-rc.55"
|
|
1503
|
+
version = "0.3.0-rc.59"
|
|
1500
1504
|
dependencies = [
|
|
1501
1505
|
"futures",
|
|
1502
1506
|
"kreuzcrawl",
|
|
@@ -1754,6 +1758,12 @@ dependencies = [
|
|
|
1754
1758
|
"tracing",
|
|
1755
1759
|
]
|
|
1756
1760
|
|
|
1761
|
+
[[package]]
|
|
1762
|
+
name = "opentelemetry-semantic-conventions"
|
|
1763
|
+
version = "0.32.0"
|
|
1764
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1765
|
+
checksum = "6ca2f98a0437b427b4b08f19f1caa3c44db885a202bc12cfea13d6c702243d68"
|
|
1766
|
+
|
|
1757
1767
|
[[package]]
|
|
1758
1768
|
name = "option-ext"
|
|
1759
1769
|
version = "0.2.0"
|
|
@@ -1795,6 +1805,7 @@ version = "0.13.1"
|
|
|
1795
1805
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1796
1806
|
checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf"
|
|
1797
1807
|
dependencies = [
|
|
1808
|
+
"phf_macros",
|
|
1798
1809
|
"phf_shared",
|
|
1799
1810
|
"serde",
|
|
1800
1811
|
]
|
|
@@ -1819,6 +1830,19 @@ dependencies = [
|
|
|
1819
1830
|
"phf_shared",
|
|
1820
1831
|
]
|
|
1821
1832
|
|
|
1833
|
+
[[package]]
|
|
1834
|
+
name = "phf_macros"
|
|
1835
|
+
version = "0.13.1"
|
|
1836
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1837
|
+
checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef"
|
|
1838
|
+
dependencies = [
|
|
1839
|
+
"phf_generator",
|
|
1840
|
+
"phf_shared",
|
|
1841
|
+
"proc-macro2",
|
|
1842
|
+
"quote",
|
|
1843
|
+
"syn",
|
|
1844
|
+
]
|
|
1845
|
+
|
|
1822
1846
|
[[package]]
|
|
1823
1847
|
name = "phf_shared"
|
|
1824
1848
|
version = "0.13.1"
|
|
@@ -2955,9 +2979,9 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
|
|
|
2955
2979
|
|
|
2956
2980
|
[[package]]
|
|
2957
2981
|
name = "wasip2"
|
|
2958
|
-
version = "1.0.
|
|
2982
|
+
version = "1.0.4+wasi-0.2.12"
|
|
2959
2983
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2960
|
-
checksum = "
|
|
2984
|
+
checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487"
|
|
2961
2985
|
dependencies = [
|
|
2962
2986
|
"wit-bindgen 0.57.1",
|
|
2963
2987
|
]
|
|
@@ -2973,9 +2997,9 @@ dependencies = [
|
|
|
2973
2997
|
|
|
2974
2998
|
[[package]]
|
|
2975
2999
|
name = "wasm-bindgen"
|
|
2976
|
-
version = "0.2.
|
|
3000
|
+
version = "0.2.125"
|
|
2977
3001
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2978
|
-
checksum = "
|
|
3002
|
+
checksum = "8ddb3f79143bced6de84270411622a2699cee572fc0875aeaf1e7867cf9fca1a"
|
|
2979
3003
|
dependencies = [
|
|
2980
3004
|
"cfg-if",
|
|
2981
3005
|
"once_cell",
|
|
@@ -2986,9 +3010,9 @@ dependencies = [
|
|
|
2986
3010
|
|
|
2987
3011
|
[[package]]
|
|
2988
3012
|
name = "wasm-bindgen-futures"
|
|
2989
|
-
version = "0.4.
|
|
3013
|
+
version = "0.4.75"
|
|
2990
3014
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2991
|
-
checksum = "
|
|
3015
|
+
checksum = "503b14d284f2c8dac03b819967e155ea753f573586193b2b2c95990cb5d69280"
|
|
2992
3016
|
dependencies = [
|
|
2993
3017
|
"js-sys",
|
|
2994
3018
|
"wasm-bindgen",
|
|
@@ -2996,9 +3020,9 @@ dependencies = [
|
|
|
2996
3020
|
|
|
2997
3021
|
[[package]]
|
|
2998
3022
|
name = "wasm-bindgen-macro"
|
|
2999
|
-
version = "0.2.
|
|
3023
|
+
version = "0.2.125"
|
|
3000
3024
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3001
|
-
checksum = "
|
|
3025
|
+
checksum = "4e21a184b13fb19e157296e2c46056aec9092264fab83e4ba59e68c61b323c3d"
|
|
3002
3026
|
dependencies = [
|
|
3003
3027
|
"quote",
|
|
3004
3028
|
"wasm-bindgen-macro-support",
|
|
@@ -3006,9 +3030,9 @@ dependencies = [
|
|
|
3006
3030
|
|
|
3007
3031
|
[[package]]
|
|
3008
3032
|
name = "wasm-bindgen-macro-support"
|
|
3009
|
-
version = "0.2.
|
|
3033
|
+
version = "0.2.125"
|
|
3010
3034
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3011
|
-
checksum = "
|
|
3035
|
+
checksum = "fecefd9c35bd935a20fc3fc344b5f29138961e4f47fb03297d88f2587afb5ebd"
|
|
3012
3036
|
dependencies = [
|
|
3013
3037
|
"bumpalo",
|
|
3014
3038
|
"proc-macro2",
|
|
@@ -3019,9 +3043,9 @@ dependencies = [
|
|
|
3019
3043
|
|
|
3020
3044
|
[[package]]
|
|
3021
3045
|
name = "wasm-bindgen-shared"
|
|
3022
|
-
version = "0.2.
|
|
3046
|
+
version = "0.2.125"
|
|
3023
3047
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3024
|
-
checksum = "
|
|
3048
|
+
checksum = "23939e44bb9a5d7576fa2b563dc2e136628f1224e88a8deed09e04858b77871f"
|
|
3025
3049
|
dependencies = [
|
|
3026
3050
|
"unicode-ident",
|
|
3027
3051
|
]
|
|
@@ -3062,9 +3086,9 @@ dependencies = [
|
|
|
3062
3086
|
|
|
3063
3087
|
[[package]]
|
|
3064
3088
|
name = "web-sys"
|
|
3065
|
-
version = "0.3.
|
|
3089
|
+
version = "0.3.102"
|
|
3066
3090
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3067
|
-
checksum = "
|
|
3091
|
+
checksum = "a6430a72df5eb332242960fe84b3002a241163998241eb596d4f739b9757061d"
|
|
3068
3092
|
dependencies = [
|
|
3069
3093
|
"js-sys",
|
|
3070
3094
|
"wasm-bindgen",
|
|
@@ -3523,9 +3547,9 @@ dependencies = [
|
|
|
3523
3547
|
|
|
3524
3548
|
[[package]]
|
|
3525
3549
|
name = "zeroize"
|
|
3526
|
-
version = "1.
|
|
3550
|
+
version = "1.9.0"
|
|
3527
3551
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3528
|
-
checksum = "
|
|
3552
|
+
checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e"
|
|
3529
3553
|
|
|
3530
3554
|
[[package]]
|
|
3531
3555
|
name = "zerotrie"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzcrawl-rb"
|
|
3
|
-
version = "0.3.0-rc.55"
|
|
3
|
+
version = "0.3.0-rc.59"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
license = "Elastic-2.0"
|
|
6
6
|
description = "High-performance web crawling engine"
|
|
@@ -18,7 +18,7 @@ crate-type = ["cdylib"]
|
|
|
18
18
|
|
|
19
19
|
[dependencies]
|
|
20
20
|
futures = "0.3"
|
|
21
|
-
kreuzcrawl = { version = "0.3.0-rc.
|
|
21
|
+
kreuzcrawl = { version = "0.3.0-rc.59", features = ["interact", "browser-chromiumoxide"] }
|
|
22
22
|
magnus = "0.8"
|
|
23
23
|
rb-sys = ">=0.9, <0.9.128"
|
|
24
24
|
serde = { version = "1", features = ["derive"] }
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
// This file is auto-generated by alef. DO NOT EDIT.
|
|
2
|
-
// alef:hash:
|
|
2
|
+
// alef:hash:4a1130eaa7f8e3312e90b66f3eb0a52e657e6378ee013ed7eeaada8817434913
|
|
3
3
|
// Re-generate with: alef generate
|
|
4
4
|
#![allow(dead_code, unused_imports, unused_variables)]
|
|
5
5
|
#![allow(
|
|
@@ -400,7 +400,6 @@ pub struct BrowserConfig {
|
|
|
400
400
|
wait: BrowserWait,
|
|
401
401
|
wait_selector: Option<String>,
|
|
402
402
|
extra_wait: Option<u64>,
|
|
403
|
-
stealth: bool,
|
|
404
403
|
proxy: Option<ProxyConfig>,
|
|
405
404
|
block_url_patterns: Vec<String>,
|
|
406
405
|
eval_script: Option<String>,
|
|
@@ -447,7 +446,6 @@ impl Default for BrowserConfig {
|
|
|
447
446
|
wait: Default::default(),
|
|
448
447
|
wait_selector: None,
|
|
449
448
|
extra_wait: None,
|
|
450
|
-
stealth: false,
|
|
451
449
|
proxy: None,
|
|
452
450
|
block_url_patterns: vec![],
|
|
453
451
|
eval_script: None,
|
|
@@ -490,10 +488,6 @@ impl BrowserConfig {
|
|
|
490
488
|
extra_wait: kwargs
|
|
491
489
|
.get(ruby.to_symbol("extra_wait"))
|
|
492
490
|
.and_then(|v| u64::try_convert(v).ok()),
|
|
493
|
-
stealth: kwargs
|
|
494
|
-
.get(ruby.to_symbol("stealth"))
|
|
495
|
-
.and_then(|v| bool::try_convert(v).ok())
|
|
496
|
-
.unwrap_or(false),
|
|
497
491
|
proxy: kwargs
|
|
498
492
|
.get(ruby.to_symbol("proxy"))
|
|
499
493
|
.and_then(|v| ProxyConfig::try_convert(v).ok()),
|
|
@@ -546,10 +540,6 @@ impl BrowserConfig {
|
|
|
546
540
|
self.extra_wait.clone()
|
|
547
541
|
}
|
|
548
542
|
|
|
549
|
-
fn stealth(&self) -> bool {
|
|
550
|
-
self.stealth
|
|
551
|
-
}
|
|
552
|
-
|
|
553
543
|
fn proxy(&self) -> Option<ProxyConfig> {
|
|
554
544
|
self.proxy.clone()
|
|
555
545
|
}
|
|
@@ -610,6 +600,8 @@ pub struct CrawlConfig {
|
|
|
610
600
|
proxy: Option<ProxyConfig>,
|
|
611
601
|
user_agents: Vec<String>,
|
|
612
602
|
capture_screenshot: bool,
|
|
603
|
+
follow_document_urls: bool,
|
|
604
|
+
document_url_depth: Option<u32>,
|
|
613
605
|
download_documents: bool,
|
|
614
606
|
document_max_size: Option<usize>,
|
|
615
607
|
document_mime_types: Vec<String>,
|
|
@@ -679,6 +671,8 @@ impl Default for CrawlConfig {
|
|
|
679
671
|
proxy: None,
|
|
680
672
|
user_agents: vec![],
|
|
681
673
|
capture_screenshot: false,
|
|
674
|
+
follow_document_urls: false,
|
|
675
|
+
document_url_depth: None,
|
|
682
676
|
download_documents: true,
|
|
683
677
|
document_max_size: None,
|
|
684
678
|
document_mime_types: vec![],
|
|
@@ -805,6 +799,13 @@ impl CrawlConfig {
|
|
|
805
799
|
.get(ruby.to_symbol("capture_screenshot"))
|
|
806
800
|
.and_then(|v| bool::try_convert(v).ok())
|
|
807
801
|
.unwrap_or(false),
|
|
802
|
+
follow_document_urls: kwargs
|
|
803
|
+
.get(ruby.to_symbol("follow_document_urls"))
|
|
804
|
+
.and_then(|v| bool::try_convert(v).ok())
|
|
805
|
+
.unwrap_or(false),
|
|
806
|
+
document_url_depth: kwargs
|
|
807
|
+
.get(ruby.to_symbol("document_url_depth"))
|
|
808
|
+
.and_then(|v| u32::try_convert(v).ok()),
|
|
808
809
|
download_documents: kwargs
|
|
809
810
|
.get(ruby.to_symbol("download_documents"))
|
|
810
811
|
.and_then(|v| bool::try_convert(v).ok())
|
|
@@ -949,6 +950,14 @@ impl CrawlConfig {
|
|
|
949
950
|
self.capture_screenshot
|
|
950
951
|
}
|
|
951
952
|
|
|
953
|
+
fn follow_document_urls(&self) -> bool {
|
|
954
|
+
self.follow_document_urls
|
|
955
|
+
}
|
|
956
|
+
|
|
957
|
+
fn document_url_depth(&self) -> Option<u32> {
|
|
958
|
+
self.document_url_depth
|
|
959
|
+
}
|
|
960
|
+
|
|
952
961
|
fn download_documents(&self) -> bool {
|
|
953
962
|
self.download_documents
|
|
954
963
|
}
|
|
@@ -1041,6 +1050,10 @@ impl CrawlConfig {
|
|
|
1041
1050
|
|
|
1042
1051
|
capture_screenshot: self.capture_screenshot,
|
|
1043
1052
|
|
|
1053
|
+
follow_document_urls: self.follow_document_urls,
|
|
1054
|
+
|
|
1055
|
+
document_url_depth: self.document_url_depth,
|
|
1056
|
+
|
|
1044
1057
|
download_documents: self.download_documents,
|
|
1045
1058
|
|
|
1046
1059
|
document_max_size: self.document_max_size,
|
|
@@ -4411,6 +4424,7 @@ pub enum BrowserMode {
|
|
|
4411
4424
|
Auto,
|
|
4412
4425
|
Always,
|
|
4413
4426
|
Never,
|
|
4427
|
+
Stealth,
|
|
4414
4428
|
}
|
|
4415
4429
|
|
|
4416
4430
|
impl Default for BrowserMode {
|
|
@@ -4425,6 +4439,7 @@ impl magnus::IntoValue for BrowserMode {
|
|
|
4425
4439
|
BrowserMode::Auto => "auto",
|
|
4426
4440
|
BrowserMode::Always => "always",
|
|
4427
4441
|
BrowserMode::Never => "never",
|
|
4442
|
+
BrowserMode::Stealth => "stealth",
|
|
4428
4443
|
};
|
|
4429
4444
|
handle.to_symbol(sym).into_value_with(handle)
|
|
4430
4445
|
}
|
|
@@ -4439,6 +4454,7 @@ impl magnus::TryConvert for BrowserMode {
|
|
|
4439
4454
|
"auto" | "Auto" => Ok(BrowserMode::Auto),
|
|
4440
4455
|
"always" | "Always" => Ok(BrowserMode::Always),
|
|
4441
4456
|
"never" | "Never" => Ok(BrowserMode::Never),
|
|
4457
|
+
"stealth" | "Stealth" => Ok(BrowserMode::Stealth),
|
|
4442
4458
|
other => Err(magnus::Error::new(
|
|
4443
4459
|
unsafe { Ruby::get_unchecked() }.exception_arg_error(),
|
|
4444
4460
|
format!("invalid BrowserMode value: {other}"),
|
|
@@ -5351,7 +5367,6 @@ impl From<BrowserConfig> for kreuzcrawl::BrowserConfig {
|
|
|
5351
5367
|
wait: val.wait.into(),
|
|
5352
5368
|
wait_selector: val.wait_selector,
|
|
5353
5369
|
extra_wait: val.extra_wait.map(std::time::Duration::from_millis),
|
|
5354
|
-
stealth: val.stealth,
|
|
5355
5370
|
proxy: val.proxy.map(Into::into),
|
|
5356
5371
|
block_url_patterns: val.block_url_patterns.into_iter().collect(),
|
|
5357
5372
|
eval_script: val.eval_script,
|
|
@@ -5373,7 +5388,6 @@ impl From<kreuzcrawl::BrowserConfig> for BrowserConfig {
|
|
|
5373
5388
|
wait: val.wait.into(),
|
|
5374
5389
|
wait_selector: val.wait_selector.map(|v| v.to_string()),
|
|
5375
5390
|
extra_wait: val.extra_wait.map(|d| d.as_millis() as u64),
|
|
5376
|
-
stealth: val.stealth,
|
|
5377
5391
|
proxy: val.proxy.map(Into::into),
|
|
5378
5392
|
block_url_patterns: val.block_url_patterns.into_iter().collect(),
|
|
5379
5393
|
eval_script: val.eval_script.map(|v| v.to_string()),
|
|
@@ -5423,6 +5437,8 @@ impl From<CrawlConfig> for kreuzcrawl::CrawlConfig {
|
|
|
5423
5437
|
proxy: val.proxy.map(Into::into),
|
|
5424
5438
|
user_agents: val.user_agents.into_iter().collect(),
|
|
5425
5439
|
capture_screenshot: val.capture_screenshot,
|
|
5440
|
+
follow_document_urls: val.follow_document_urls,
|
|
5441
|
+
document_url_depth: val.document_url_depth,
|
|
5426
5442
|
download_documents: val.download_documents,
|
|
5427
5443
|
document_max_size: val.document_max_size,
|
|
5428
5444
|
document_mime_types: val.document_mime_types.into_iter().collect(),
|
|
@@ -5473,6 +5489,8 @@ impl From<kreuzcrawl::CrawlConfig> for CrawlConfig {
|
|
|
5473
5489
|
proxy: val.proxy.map(Into::into),
|
|
5474
5490
|
user_agents: val.user_agents.into_iter().collect(),
|
|
5475
5491
|
capture_screenshot: val.capture_screenshot,
|
|
5492
|
+
follow_document_urls: val.follow_document_urls,
|
|
5493
|
+
document_url_depth: val.document_url_depth,
|
|
5476
5494
|
download_documents: val.download_documents,
|
|
5477
5495
|
document_max_size: val.document_max_size,
|
|
5478
5496
|
document_mime_types: val.document_mime_types.into_iter().collect(),
|
|
@@ -6368,6 +6386,7 @@ impl From<BrowserMode> for kreuzcrawl::BrowserMode {
|
|
|
6368
6386
|
BrowserMode::Auto => Self::Auto,
|
|
6369
6387
|
BrowserMode::Always => Self::Always,
|
|
6370
6388
|
BrowserMode::Never => Self::Never,
|
|
6389
|
+
BrowserMode::Stealth => Self::Stealth,
|
|
6371
6390
|
}
|
|
6372
6391
|
}
|
|
6373
6392
|
}
|
|
@@ -6378,6 +6397,7 @@ impl From<kreuzcrawl::BrowserMode> for BrowserMode {
|
|
|
6378
6397
|
kreuzcrawl::BrowserMode::Auto => Self::Auto,
|
|
6379
6398
|
kreuzcrawl::BrowserMode::Always => Self::Always,
|
|
6380
6399
|
kreuzcrawl::BrowserMode::Never => Self::Never,
|
|
6400
|
+
kreuzcrawl::BrowserMode::Stealth => Self::Stealth,
|
|
6381
6401
|
}
|
|
6382
6402
|
}
|
|
6383
6403
|
}
|
|
@@ -6892,8 +6912,6 @@ fn ruby_init(ruby: &Ruby) -> Result<(), Error> {
|
|
|
6892
6912
|
|
|
6893
6913
|
class.define_method("extra_wait", method!(BrowserConfig::extra_wait, 0))?;
|
|
6894
6914
|
|
|
6895
|
-
class.define_method("stealth", method!(BrowserConfig::stealth, 0))?;
|
|
6896
|
-
|
|
6897
6915
|
class.define_method("proxy", method!(BrowserConfig::proxy, 0))?;
|
|
6898
6916
|
|
|
6899
6917
|
class.define_method("block_url_patterns", method!(BrowserConfig::block_url_patterns, 0))?;
|
|
@@ -6973,6 +6991,10 @@ fn ruby_init(ruby: &Ruby) -> Result<(), Error> {
|
|
|
6973
6991
|
|
|
6974
6992
|
class.define_method("capture_screenshot", method!(CrawlConfig::capture_screenshot, 0))?;
|
|
6975
6993
|
|
|
6994
|
+
class.define_method("follow_document_urls", method!(CrawlConfig::follow_document_urls, 0))?;
|
|
6995
|
+
|
|
6996
|
+
class.define_method("document_url_depth", method!(CrawlConfig::document_url_depth, 0))?;
|
|
6997
|
+
|
|
6976
6998
|
class.define_method("download_documents", method!(CrawlConfig::download_documents, 0))?;
|
|
6977
6999
|
|
|
6978
7000
|
class.define_method("document_max_size", method!(CrawlConfig::document_max_size, 0))?;
|
data/lib/kreuzcrawl/native.rb
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:4a1130eaa7f8e3312e90b66f3eb0a52e657e6378ee013ed7eeaada8817434913
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# frozen_string_literal: true
|
data/lib/kreuzcrawl/version.rb
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:4a1130eaa7f8e3312e90b66f3eb0a52e657e6378ee013ed7eeaada8817434913
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# frozen_string_literal: true
|
|
6
6
|
|
|
7
7
|
module Kreuzcrawl
|
|
8
8
|
## The version string for this package.
|
|
9
|
-
VERSION = "0.3.0.pre.rc.55"
|
|
9
|
+
VERSION = "0.3.0.pre.rc.59"
|
|
10
10
|
end
|
data/lib/kreuzcrawl.rb
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:4a1130eaa7f8e3312e90b66f3eb0a52e657e6378ee013ed7eeaada8817434913
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# frozen_string_literal: true
|
data/lib/kreuzcrawl_rb.so
CHANGED
|
Binary file
|
data/sig/types.rbs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:4a1130eaa7f8e3312e90b66f3eb0a52e657e6378ee013ed7eeaada8817434913
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
|
|
@@ -53,7 +53,6 @@ module Kreuzcrawl
|
|
|
53
53
|
attr_accessor wait: BrowserWait?
|
|
54
54
|
attr_accessor wait_selector: String?
|
|
55
55
|
attr_accessor extra_wait: Integer?
|
|
56
|
-
attr_accessor stealth: bool?
|
|
57
56
|
attr_accessor proxy: ProxyConfig?
|
|
58
57
|
attr_accessor block_url_patterns: Array[String]?
|
|
59
58
|
attr_accessor eval_script: String?
|
|
@@ -61,7 +60,7 @@ module Kreuzcrawl
|
|
|
61
60
|
attr_accessor capture_network_events: bool?
|
|
62
61
|
attr_accessor session_affinity: bool?
|
|
63
62
|
|
|
64
|
-
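def initialize: (?mode: BrowserMode, ?backend: BrowserBackend, ?endpoint: String, ?timeout: Integer, ?wait: BrowserWait, ?wait_selector: String, ?extra_wait: Integer, ?stealth: bool, ?proxy: ProxyConfig, ?block_url_patterns: Array[String], ?eval_script: String, ?robots_user_agent: String, ?capture_network_events: bool, ?session_affinity: bool) -> void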
|
|
63
|
+
def initialize: (?mode: BrowserMode, ?backend: BrowserBackend, ?endpoint: String, ?timeout: Integer, ?wait: BrowserWait, ?wait_selector: String, ?extra_wait: Integer, ?proxy: ProxyConfig, ?block_url_patterns: Array[String], ?eval_script: String, ?robots_user_agent: String, ?capture_network_events: bool, ?session_affinity: bool) -> void
|
|
65
64
|
def self.default: () -> BrowserConfig
|
|
66
65
|
end
|
|
67
66
|
|
|
@@ -96,6 +95,8 @@ module Kreuzcrawl
|
|
|
96
95
|
attr_accessor proxy: ProxyConfig?
|
|
97
96
|
attr_accessor user_agents: Array[String]?
|
|
98
97
|
attr_accessor capture_screenshot: bool?
|
|
98
|
+
attr_accessor follow_document_urls: bool?
|
|
99
|
+
attr_accessor document_url_depth: Integer?
|
|
99
100
|
attr_accessor download_documents: bool?
|
|
100
101
|
attr_accessor document_max_size: Integer?
|
|
101
102
|
attr_accessor document_mime_types: Array[String]?
|
|
@@ -103,7 +104,7 @@ module Kreuzcrawl
|
|
|
103
104
|
attr_accessor browser_profile: String?
|
|
104
105
|
attr_accessor save_browser_profile: bool?
|
|
105
106
|
|
|
106
|
-
def initialize: (?max_depth: Integer, ?max_pages: Integer, ?max_concurrent: Integer, ?respect_robots_txt: bool, ?soft_http_errors: bool, ?user_agent: String, ?stay_on_domain: bool, ?allow_subdomains: bool, ?include_paths: Array[String], ?exclude_paths: Array[String], ?custom_headers: Hash[String, String], ?request_timeout: Integer, ?rate_limit_ms: Integer, ?max_redirects: Integer, ?retry_count: Integer, ?retry_codes: Array[Integer], ?cookies_enabled: bool, ?auth: AuthConfig, ?max_body_size: Integer, ?remove_tags: Array[String], ?content: ContentConfig, ?map_limit: Integer, ?map_search: String, ?download_assets: bool, ?asset_types: Array[AssetCategory], ?max_asset_size: Integer, ?browser: BrowserConfig, ?proxy: ProxyConfig, ?user_agents: Array[String], ?capture_screenshot: bool, ?download_documents: bool, ?document_max_size: Integer, ?document_mime_types: Array[String], ?warc_output: String, ?browser_profile: String, ?save_browser_profile: bool) -> void
|
|
107
|
+
def initialize: (?max_depth: Integer, ?max_pages: Integer, ?max_concurrent: Integer, ?respect_robots_txt: bool, ?soft_http_errors: bool, ?user_agent: String, ?stay_on_domain: bool, ?allow_subdomains: bool, ?include_paths: Array[String], ?exclude_paths: Array[String], ?custom_headers: Hash[String, String], ?request_timeout: Integer, ?rate_limit_ms: Integer, ?max_redirects: Integer, ?retry_count: Integer, ?retry_codes: Array[Integer], ?cookies_enabled: bool, ?auth: AuthConfig, ?max_body_size: Integer, ?remove_tags: Array[String], ?content: ContentConfig, ?map_limit: Integer, ?map_search: String, ?download_assets: bool, ?asset_types: Array[AssetCategory], ?max_asset_size: Integer, ?browser: BrowserConfig, ?proxy: ProxyConfig, ?user_agents: Array[String], ?capture_screenshot: bool, ?follow_document_urls: bool, ?document_url_depth: Integer, ?download_documents: bool, ?document_max_size: Integer, ?document_mime_types: Array[String], ?warc_output: String, ?browser_profile: String, ?save_browser_profile: bool) -> void
|
|
107
108
|
def validate: () -> void
|
|
108
109
|
def self.default: () -> CrawlConfig
|
|
109
110
|
end
|
|
@@ -460,7 +461,7 @@ module Kreuzcrawl
|
|
|
460
461
|
end
|
|
461
462
|
|
|
462
463
|
class BrowserMode
|
|
463
|
-
type value = :auto | :always | :never
|
|
464
|
+
type value = :auto | :always | :never | :stealth
|
|
464
465
|
end
|
|
465
466
|
|
|
466
467
|
class BrowserWait
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzcrawl
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.0.pre.rc.55
|
|
4
|
+
version: 0.3.0.pre.rc.59
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Kreuzberg Team
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-06-
|
|
11
|
+
date: 2026-06-13 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -78,6 +78,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
78
78
|
- - ">="
|
|
79
79
|
- !ruby/object:Gem::Version
|
|
80
80
|
version: 3.2.0
|
|
81
|
+
- - "<"
|
|
82
|
+
- !ruby/object:Gem::Version
|
|
83
|
+
version: '4.0'
|
|
81
84
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
82
85
|
requirements:
|
|
83
86
|
- - ">="
|