kreuzcrawl 0.3.0.pre.rc.71 → 0.3.0.pre.rc.76
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -3
- data/ext/kreuzcrawl_rb/native/Cargo.lock +3 -3
- data/ext/kreuzcrawl_rb/native/Cargo.toml +2 -2
- data/ext/kreuzcrawl_rb/src/lib.rs +132 -10
- data/lib/kreuzcrawl/native.rb +2 -5
- data/lib/kreuzcrawl/version.rb +2 -2
- data/lib/kreuzcrawl.rb +1 -1
- data/lib/kreuzcrawl_rb.so +0 -0
- data/sig/types.rbs +12 -2
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2e042443c6e9e74a3efa5090e5a9007bf95b3a29326d9d9c49a8b5310df1892d
|
|
4
|
+
data.tar.gz: 33d32333a455f6330dcec4266db79dafb758c0014c99d7d62af0f608befa811f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 946f2b86e1bfcf44cee9623c154dd8a0cc3fbec4860e5131fddc96d6aa8a7a4ff38933e2d8172d21ad7fe17dcc60382424100ea3404e314f800a59d740b2eab9
|
|
7
|
+
data.tar.gz: c860e9ab287b354203960a631d81b0bf5e1786b9bdb1263465598922d2ad9c76c25986747a28df0c37451f0f931666df444256b2d467c57c6f30d58c97d78df1
|
data/README.md
CHANGED
|
@@ -141,12 +141,13 @@ Contributions are welcome! Please see our [Contributing Guide](https://github.co
|
|
|
141
141
|
|
|
142
142
|
## Part of Kreuzberg.dev
|
|
143
143
|
|
|
144
|
-
- [Kreuzberg](https://github.com/kreuzberg-dev/kreuzberg) — document intelligence: text, tables, metadata from
|
|
144
|
+
- [Kreuzberg](https://github.com/kreuzberg-dev/kreuzberg) — document intelligence: text, tables, metadata from 91+ formats with optional OCR.
|
|
145
145
|
- [Kreuzberg Cloud](https://github.com/kreuzberg-dev/kreuzberg-cloud) — managed extraction API with SDKs, dashboards, and observability.
|
|
146
|
+
- [kreuzcrawl](https://github.com/kreuzberg-dev/kreuzcrawl) — web crawling and scraping with HTML→Markdown and headless-Chrome fallback.
|
|
146
147
|
- [html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown) — fast, lossless HTML→Markdown engine.
|
|
147
148
|
- [liter-llm](https://github.com/kreuzberg-dev/liter-llm) — universal LLM API client with native bindings for 14 languages and 143 providers.
|
|
148
149
|
- [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — tree-sitter grammars and code-intelligence primitives.
|
|
149
|
-
- [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces
|
|
150
|
+
- [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces every per-language binding across the 5 polyglot repos.
|
|
150
151
|
- [Discord](https://discord.gg/xt9WY3GnKR) — community, roadmap, announcements.
|
|
151
152
|
|
|
152
153
|
## License
|
|
@@ -158,4 +159,3 @@ This project is licensed under [Elastic License 2.0](https://github.com/kreuzber
|
|
|
158
159
|
- [Documentation](https://docs.kreuzcrawl.kreuzberg.dev)
|
|
159
160
|
- [GitHub Repository](https://github.com/kreuzberg-dev/kreuzcrawl)
|
|
160
161
|
- [Issue Tracker](https://github.com/kreuzberg-dev/kreuzcrawl/issues)
|
|
161
|
-
- [Issues](https://github.com/kreuzberg-dev/kreuzcrawl/issues)
|
data/ext/kreuzcrawl_rb/native/Cargo.lock
CHANGED
|
@@ -1460,9 +1460,9 @@ dependencies = [
|
|
|
1460
1460
|
|
|
1461
1461
|
[[package]]
|
|
1462
1462
|
name = "kreuzcrawl"
|
|
1463
|
-
version = "0.3.0-rc.
|
|
1463
|
+
version = "0.3.0-rc.76"
|
|
1464
1464
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1465
|
-
checksum = "
|
|
1465
|
+
checksum = "b620f0f12386898f22a5e8e2ab8ba1066d3a304c3b6e8847d5ee8e51cd068de1"
|
|
1466
1466
|
dependencies = [
|
|
1467
1467
|
"ahash",
|
|
1468
1468
|
"aho-corasick",
|
|
@@ -1502,7 +1502,7 @@ dependencies = [
|
|
|
1502
1502
|
|
|
1503
1503
|
[[package]]
|
|
1504
1504
|
name = "kreuzcrawl-rb"
|
|
1505
|
-
version = "0.3.0-rc.71"
|
|
1505
|
+
version = "0.3.0-rc.76"
|
|
1506
1506
|
dependencies = [
|
|
1507
1507
|
"futures",
|
|
1508
1508
|
"kreuzcrawl",
|
data/ext/kreuzcrawl_rb/native/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzcrawl-rb"
|
|
3
|
-
version = "0.3.0-rc.71"
|
|
3
|
+
version = "0.3.0-rc.76"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
license = "Elastic-2.0"
|
|
6
6
|
description = "High-performance web crawling engine"
|
|
@@ -18,7 +18,7 @@ crate-type = ["cdylib"]
|
|
|
18
18
|
|
|
19
19
|
[dependencies]
|
|
20
20
|
futures = "0.3"
|
|
21
|
-
kreuzcrawl = { version = "0.3.0-rc.
|
|
21
|
+
kreuzcrawl = { version = "0.3.0-rc.76", features = ["interact", "browser-chromiumoxide"] }
|
|
22
22
|
magnus = "0.8"
|
|
23
23
|
rb-sys = ">=0.9, <0.9.128"
|
|
24
24
|
serde = { version = "1", features = ["derive"] }
|
data/ext/kreuzcrawl_rb/src/lib.rs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
// This file is auto-generated by alef. DO NOT EDIT.
|
|
2
|
-
// alef:hash:
|
|
2
|
+
// alef:hash:8896b7e192e408f916c81acf04c97fc5ee10b269fe22c287e9eefab02fada615
|
|
3
3
|
// Re-generate with: alef generate
|
|
4
4
|
#![allow(dead_code, unused_imports, unused_variables)]
|
|
5
5
|
#![allow(
|
|
@@ -608,6 +608,7 @@ pub struct CrawlConfig {
|
|
|
608
608
|
warc_output: Option<String>,
|
|
609
609
|
browser_profile: Option<String>,
|
|
610
610
|
save_browser_profile: bool,
|
|
611
|
+
ssrf: SsrfPolicy,
|
|
611
612
|
}
|
|
612
613
|
|
|
613
614
|
unsafe impl IntoValueFromNative for CrawlConfig {}
|
|
@@ -679,6 +680,7 @@ impl Default for CrawlConfig {
|
|
|
679
680
|
warc_output: None,
|
|
680
681
|
browser_profile: None,
|
|
681
682
|
save_browser_profile: false,
|
|
683
|
+
ssrf: Default::default(),
|
|
682
684
|
}
|
|
683
685
|
}
|
|
684
686
|
}
|
|
@@ -827,6 +829,10 @@ impl CrawlConfig {
|
|
|
827
829
|
.get(ruby.to_symbol("save_browser_profile"))
|
|
828
830
|
.and_then(|v| bool::try_convert(v).ok())
|
|
829
831
|
.unwrap_or(false),
|
|
832
|
+
ssrf: kwargs
|
|
833
|
+
.get(ruby.to_symbol("ssrf"))
|
|
834
|
+
.and_then(|v| SsrfPolicy::try_convert(v).ok())
|
|
835
|
+
.unwrap_or_default(),
|
|
830
836
|
})
|
|
831
837
|
}
|
|
832
838
|
|
|
@@ -982,6 +988,10 @@ impl CrawlConfig {
|
|
|
982
988
|
self.save_browser_profile
|
|
983
989
|
}
|
|
984
990
|
|
|
991
|
+
fn ssrf(&self) -> SsrfPolicy {
|
|
992
|
+
self.ssrf.clone()
|
|
993
|
+
}
|
|
994
|
+
|
|
985
995
|
fn validate(&self) -> Result<(), Error> {
|
|
986
996
|
#[allow(clippy::needless_update)]
|
|
987
997
|
let core_self = kreuzcrawl::CrawlConfig {
|
|
@@ -1066,8 +1076,8 @@ impl CrawlConfig {
|
|
|
1066
1076
|
|
|
1067
1077
|
save_browser_profile: self.save_browser_profile,
|
|
1068
1078
|
|
|
1069
|
-
ssrf:
|
|
1070
|
-
|
|
1079
|
+
ssrf: self.ssrf.clone().into(),
|
|
1080
|
+
|
|
1071
1081
|
..Default::default()
|
|
1072
1082
|
};
|
|
1073
1083
|
let result = core_self.validate().map_err(|e| {
|
|
@@ -2101,7 +2111,7 @@ impl CrawlResult {
|
|
|
2101
2111
|
|
|
2102
2112
|
browser_used: self.browser_used,
|
|
2103
2113
|
|
|
2104
|
-
|
|
2114
|
+
..Default::default()
|
|
2105
2115
|
};
|
|
2106
2116
|
core_self.unique_normalized_urls()
|
|
2107
2117
|
}
|
|
@@ -4419,6 +4429,75 @@ impl BatchCrawlResults {
|
|
|
4419
4429
|
}
|
|
4420
4430
|
}
|
|
4421
4431
|
|
|
4432
|
+
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
|
|
4433
|
+
#[serde(default)]
|
|
4434
|
+
#[magnus::wrap(class = "Kreuzcrawl::SsrfPolicy")]
|
|
4435
|
+
pub struct SsrfPolicy {
|
|
4436
|
+
deny_private: bool,
|
|
4437
|
+
max_redirects: u8,
|
|
4438
|
+
}
|
|
4439
|
+
|
|
4440
|
+
unsafe impl IntoValueFromNative for SsrfPolicy {}
|
|
4441
|
+
|
|
4442
|
+
impl magnus::TryConvert for SsrfPolicy {
|
|
4443
|
+
fn try_convert(val: magnus::Value) -> Result<Self, magnus::Error> {
|
|
4444
|
+
if let Ok(r) = <&SsrfPolicy as magnus::TryConvert>::try_convert(val) {
|
|
4445
|
+
return Ok(r.clone());
|
|
4446
|
+
}
|
|
4447
|
+
let json_str: String = if let Ok(s) = <String as magnus::TryConvert>::try_convert(val) {
|
|
4448
|
+
s
|
|
4449
|
+
} else {
|
|
4450
|
+
val.funcall::<_, _, String>("to_json", ()).map_err(|e| {
|
|
4451
|
+
magnus::Error::new(
|
|
4452
|
+
unsafe { magnus::Ruby::get_unchecked() }.exception_type_error(),
|
|
4453
|
+
format!("no implicit conversion into SsrfPolicy: {}", e),
|
|
4454
|
+
)
|
|
4455
|
+
})?
|
|
4456
|
+
};
|
|
4457
|
+
serde_json::from_str::<SsrfPolicy>(&json_str).map_err(|e| {
|
|
4458
|
+
magnus::Error::new(
|
|
4459
|
+
unsafe { magnus::Ruby::get_unchecked() }.exception_type_error(),
|
|
4460
|
+
format!("failed to deserialize SsrfPolicy: {}", e),
|
|
4461
|
+
)
|
|
4462
|
+
})
|
|
4463
|
+
}
|
|
4464
|
+
}
|
|
4465
|
+
|
|
4466
|
+
unsafe impl TryConvertOwned for SsrfPolicy {}
|
|
4467
|
+
|
|
4468
|
+
impl Default for SsrfPolicy {
|
|
4469
|
+
fn default() -> Self {
|
|
4470
|
+
kreuzcrawl::SsrfPolicy::default().into()
|
|
4471
|
+
}
|
|
4472
|
+
}
|
|
4473
|
+
|
|
4474
|
+
impl SsrfPolicy {
|
|
4475
|
+
fn new(args: &[magnus::Value]) -> Result<Self, magnus::Error> {
|
|
4476
|
+
let ruby = unsafe { magnus::Ruby::get_unchecked() };
|
|
4477
|
+
let args = magnus::scan_args::scan_args::<(), (Option<magnus::RHash>,), (), (), (), ()>(args)?;
|
|
4478
|
+
let (kwargs_opt,) = args.optional;
|
|
4479
|
+
let kwargs = kwargs_opt.unwrap_or_else(|| ruby.hash_new());
|
|
4480
|
+
Ok(Self {
|
|
4481
|
+
deny_private: kwargs
|
|
4482
|
+
.get(ruby.to_symbol("deny_private"))
|
|
4483
|
+
.and_then(|v| bool::try_convert(v).ok())
|
|
4484
|
+
.unwrap_or(true),
|
|
4485
|
+
max_redirects: kwargs
|
|
4486
|
+
.get(ruby.to_symbol("max_redirects"))
|
|
4487
|
+
.and_then(|v| u8::try_convert(v).ok())
|
|
4488
|
+
.unwrap_or(5),
|
|
4489
|
+
})
|
|
4490
|
+
}
|
|
4491
|
+
|
|
4492
|
+
fn deny_private(&self) -> bool {
|
|
4493
|
+
self.deny_private
|
|
4494
|
+
}
|
|
4495
|
+
|
|
4496
|
+
fn max_redirects(&self) -> u8 {
|
|
4497
|
+
self.max_redirects
|
|
4498
|
+
}
|
|
4499
|
+
}
|
|
4500
|
+
|
|
4422
4501
|
#[derive(Clone, Copy, PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
|
|
4423
4502
|
#[serde(rename_all = "snake_case")]
|
|
4424
4503
|
pub enum BrowserMode {
|
|
@@ -5446,8 +5525,7 @@ impl From<CrawlConfig> for kreuzcrawl::CrawlConfig {
|
|
|
5446
5525
|
warc_output: val.warc_output.map(Into::into),
|
|
5447
5526
|
browser_profile: val.browser_profile,
|
|
5448
5527
|
save_browser_profile: val.save_browser_profile,
|
|
5449
|
-
ssrf:
|
|
5450
|
-
dispatch: Default::default(),
|
|
5528
|
+
ssrf: val.ssrf.into(),
|
|
5451
5529
|
..Default::default()
|
|
5452
5530
|
}
|
|
5453
5531
|
}
|
|
@@ -5499,6 +5577,7 @@ impl From<kreuzcrawl::CrawlConfig> for CrawlConfig {
|
|
|
5499
5577
|
warc_output: val.warc_output.map(|p| p.to_string_lossy().to_string()),
|
|
5500
5578
|
browser_profile: val.browser_profile.map(|v| v.to_string()),
|
|
5501
5579
|
save_browser_profile: val.save_browser_profile,
|
|
5580
|
+
ssrf: val.ssrf.into(),
|
|
5502
5581
|
}
|
|
5503
5582
|
}
|
|
5504
5583
|
}
|
|
@@ -5525,17 +5604,18 @@ impl From<kreuzcrawl::BrowserExtras> for BrowserExtras {
|
|
|
5525
5604
|
}
|
|
5526
5605
|
}
|
|
5527
5606
|
|
|
5607
|
+
#[allow(clippy::needless_update)]
|
|
5528
5608
|
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
|
|
5529
5609
|
impl From<DownloadedDocument> for kreuzcrawl::DownloadedDocument {
|
|
5530
5610
|
fn from(val: DownloadedDocument) -> Self {
|
|
5531
5611
|
Self {
|
|
5532
5612
|
url: val.url,
|
|
5533
5613
|
mime_type: val.mime_type.into(),
|
|
5534
|
-
content: Default::default(),
|
|
5535
5614
|
size: val.size,
|
|
5536
5615
|
filename: val.filename.map(Into::into),
|
|
5537
5616
|
content_hash: val.content_hash.into(),
|
|
5538
5617
|
headers: val.headers.into_iter().map(|(k, v)| (k.into(), v.into())).collect(),
|
|
5618
|
+
..Default::default()
|
|
5539
5619
|
}
|
|
5540
5620
|
}
|
|
5541
5621
|
}
|
|
@@ -5554,6 +5634,7 @@ impl From<kreuzcrawl::DownloadedDocument> for DownloadedDocument {
|
|
|
5554
5634
|
}
|
|
5555
5635
|
}
|
|
5556
5636
|
|
|
5637
|
+
#[allow(clippy::needless_update)]
|
|
5557
5638
|
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
|
|
5558
5639
|
impl From<InteractionResult> for kreuzcrawl::InteractionResult {
|
|
5559
5640
|
fn from(val: InteractionResult) -> Self {
|
|
@@ -5561,7 +5642,7 @@ impl From<InteractionResult> for kreuzcrawl::InteractionResult {
|
|
|
5561
5642
|
action_results: val.action_results.into_iter().map(Into::into).collect(),
|
|
5562
5643
|
final_html: val.final_html,
|
|
5563
5644
|
final_url: val.final_url,
|
|
5564
|
-
|
|
5645
|
+
..Default::default()
|
|
5565
5646
|
}
|
|
5566
5647
|
}
|
|
5567
5648
|
}
|
|
@@ -5603,6 +5684,7 @@ impl From<kreuzcrawl::ActionResult> for ActionResult {
|
|
|
5603
5684
|
}
|
|
5604
5685
|
}
|
|
5605
5686
|
|
|
5687
|
+
#[allow(clippy::needless_update)]
|
|
5606
5688
|
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
|
|
5607
5689
|
impl From<ScrapeResult> for kreuzcrawl::ScrapeResult {
|
|
5608
5690
|
fn from(val: ScrapeResult) -> Self {
|
|
@@ -5633,9 +5715,9 @@ impl From<ScrapeResult> for kreuzcrawl::ScrapeResult {
|
|
|
5633
5715
|
markdown: val.markdown.map(Into::into),
|
|
5634
5716
|
extracted_data: val.extracted_data.as_ref().and_then(|s| serde_json::from_str(s).ok()),
|
|
5635
5717
|
extraction_meta: val.extraction_meta.map(Into::into),
|
|
5636
|
-
screenshot: Default::default(),
|
|
5637
5718
|
downloaded_document: val.downloaded_document.map(Into::into),
|
|
5638
5719
|
browser: val.browser.map(Into::into),
|
|
5720
|
+
..Default::default()
|
|
5639
5721
|
}
|
|
5640
5722
|
}
|
|
5641
5723
|
}
|
|
@@ -5734,6 +5816,7 @@ impl From<kreuzcrawl::CrawlPageResult> for CrawlPageResult {
|
|
|
5734
5816
|
}
|
|
5735
5817
|
}
|
|
5736
5818
|
|
|
5819
|
+
#[allow(clippy::needless_update)]
|
|
5737
5820
|
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
|
|
5738
5821
|
impl From<CrawlResult> for kreuzcrawl::CrawlResult {
|
|
5739
5822
|
fn from(val: CrawlResult) -> Self {
|
|
@@ -5746,7 +5829,7 @@ impl From<CrawlResult> for kreuzcrawl::CrawlResult {
|
|
|
5746
5829
|
cookies: val.cookies.into_iter().map(Into::into).collect(),
|
|
5747
5830
|
stayed_on_domain: val.stayed_on_domain,
|
|
5748
5831
|
browser_used: val.browser_used,
|
|
5749
|
-
|
|
5832
|
+
..Default::default()
|
|
5750
5833
|
}
|
|
5751
5834
|
}
|
|
5752
5835
|
}
|
|
@@ -6382,6 +6465,28 @@ impl From<kreuzcrawl::BatchCrawlResults> for BatchCrawlResults {
|
|
|
6382
6465
|
}
|
|
6383
6466
|
}
|
|
6384
6467
|
|
|
6468
|
+
#[allow(clippy::needless_update)]
|
|
6469
|
+
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
|
|
6470
|
+
impl From<SsrfPolicy> for kreuzcrawl::SsrfPolicy {
|
|
6471
|
+
fn from(val: SsrfPolicy) -> Self {
|
|
6472
|
+
Self {
|
|
6473
|
+
deny_private: val.deny_private,
|
|
6474
|
+
max_redirects: val.max_redirects,
|
|
6475
|
+
..Default::default()
|
|
6476
|
+
}
|
|
6477
|
+
}
|
|
6478
|
+
}
|
|
6479
|
+
|
|
6480
|
+
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
|
|
6481
|
+
impl From<kreuzcrawl::SsrfPolicy> for SsrfPolicy {
|
|
6482
|
+
fn from(val: kreuzcrawl::SsrfPolicy) -> Self {
|
|
6483
|
+
Self {
|
|
6484
|
+
deny_private: val.deny_private,
|
|
6485
|
+
max_redirects: val.max_redirects,
|
|
6486
|
+
}
|
|
6487
|
+
}
|
|
6488
|
+
}
|
|
6489
|
+
|
|
6385
6490
|
impl From<BrowserMode> for kreuzcrawl::BrowserMode {
|
|
6386
6491
|
fn from(val: BrowserMode) -> Self {
|
|
6387
6492
|
match val {
|
|
@@ -6664,6 +6769,13 @@ fn crawl_error_to_magnus_err(e: kreuzcrawl::CrawlError) -> magnus::Error {
|
|
|
6664
6769
|
magnus::Error::new(unsafe { magnus::Ruby::get_unchecked() }.exception_runtime_error(), msg)
|
|
6665
6770
|
}
|
|
6666
6771
|
|
|
6772
|
+
/// Convert a `kreuzcrawl::SsrfError` error to a Magnus runtime error.
|
|
6773
|
+
#[allow(dead_code)]
|
|
6774
|
+
fn ssrf_error_to_magnus_err(e: kreuzcrawl::SsrfError) -> magnus::Error {
|
|
6775
|
+
let msg = e.to_string();
|
|
6776
|
+
magnus::Error::new(unsafe { magnus::Ruby::get_unchecked() }.exception_runtime_error(), msg)
|
|
6777
|
+
}
|
|
6778
|
+
|
|
6667
6779
|
#[derive(Clone)]
|
|
6668
6780
|
#[magnus::wrap(class = "Kreuzcrawl::CrawlStreamIterator")]
|
|
6669
6781
|
pub struct CrawlStreamIterator {
|
|
@@ -7009,6 +7121,8 @@ fn ruby_init(ruby: &Ruby) -> Result<(), Error> {
|
|
|
7009
7121
|
|
|
7010
7122
|
class.define_method("save_browser_profile", method!(CrawlConfig::save_browser_profile, 0))?;
|
|
7011
7123
|
|
|
7124
|
+
class.define_method("ssrf", method!(CrawlConfig::ssrf, 0))?;
|
|
7125
|
+
|
|
7012
7126
|
class.define_method("validate", method!(CrawlConfig::validate, 0))?;
|
|
7013
7127
|
|
|
7014
7128
|
let class = module.define_class("BrowserExtras", ruby.class_object())?;
|
|
@@ -7535,6 +7649,14 @@ fn ruby_init(ruby: &Ruby) -> Result<(), Error> {
|
|
|
7535
7649
|
|
|
7536
7650
|
class.define_method("failed_count", method!(BatchCrawlResults::failed_count, 0))?;
|
|
7537
7651
|
|
|
7652
|
+
let class = module.define_class("SsrfPolicy", ruby.class_object())?;
|
|
7653
|
+
|
|
7654
|
+
class.define_singleton_method("new", function!(SsrfPolicy::new, -1))?;
|
|
7655
|
+
|
|
7656
|
+
class.define_method("deny_private", method!(SsrfPolicy::deny_private, 0))?;
|
|
7657
|
+
|
|
7658
|
+
class.define_method("max_redirects", method!(SsrfPolicy::max_redirects, 0))?;
|
|
7659
|
+
|
|
7538
7660
|
let class = module.define_class("CrawlStreamIterator", ruby.class_object())?;
|
|
7539
7661
|
class.define_method("next_chunk", method!(CrawlStreamIterator::next_chunk, 0))?;
|
|
7540
7662
|
class.define_method("each", method!(CrawlStreamIterator::each, 0))?;
|
data/lib/kreuzcrawl/native.rb
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:8896b7e192e408f916c81acf04c97fc5ee10b269fe22c287e9eefab02fada615
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# frozen_string_literal: true
|
|
@@ -106,10 +106,7 @@ module Kreuzcrawl
|
|
|
106
106
|
# Not available on `wasm32` targets — streaming requires native concurrency
|
|
107
107
|
# primitives (tokio channels, `JoinSet`) that are not supported on wasm32.
|
|
108
108
|
#
|
|
109
|
-
# Delivered to bindings
|
|
110
|
-
# `crawl_stream` / `batch_crawl_stream` binding wrappers in `bindings.rs`
|
|
111
|
-
# expose this as the per-language streaming idiom (Python `AsyncIterator`,
|
|
112
|
-
# Ruby `Enumerator`, PHP `Generator`, Elixir `Stream.unfold`, etc.).
|
|
109
|
+
# Delivered to bindings through each target's native streaming idiom.
|
|
113
110
|
module CrawlEvent
|
|
114
111
|
extend T::Helpers
|
|
115
112
|
extend T::Sig
|
data/lib/kreuzcrawl/version.rb
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:8896b7e192e408f916c81acf04c97fc5ee10b269fe22c287e9eefab02fada615
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# frozen_string_literal: true
|
|
6
6
|
|
|
7
7
|
module Kreuzcrawl
|
|
8
8
|
## The version string for this package.
|
|
9
|
-
VERSION = "0.3.0.pre.rc.71"
|
|
9
|
+
VERSION = "0.3.0.pre.rc.76"
|
|
10
10
|
end
|
data/lib/kreuzcrawl.rb
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:8896b7e192e408f916c81acf04c97fc5ee10b269fe22c287e9eefab02fada615
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# frozen_string_literal: true
|
data/lib/kreuzcrawl_rb.so
CHANGED
|
Binary file
|
data/sig/types.rbs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:8896b7e192e408f916c81acf04c97fc5ee10b269fe22c287e9eefab02fada615
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
|
|
@@ -103,8 +103,9 @@ module Kreuzcrawl
|
|
|
103
103
|
attr_accessor warc_output: String?
|
|
104
104
|
attr_accessor browser_profile: String?
|
|
105
105
|
attr_accessor save_browser_profile: bool?
|
|
106
|
+
attr_accessor ssrf: SsrfPolicy?
|
|
106
107
|
|
|
107
|
-
def initialize: (?max_depth: Integer, ?max_pages: Integer, ?max_concurrent: Integer, ?respect_robots_txt: bool, ?soft_http_errors: bool, ?user_agent: String, ?stay_on_domain: bool, ?allow_subdomains: bool, ?include_paths: Array[String], ?exclude_paths: Array[String], ?custom_headers: Hash[String, String], ?request_timeout: Integer, ?rate_limit_ms: Integer, ?max_redirects: Integer, ?retry_count: Integer, ?retry_codes: Array[Integer], ?cookies_enabled: bool, ?auth: AuthConfig, ?max_body_size: Integer, ?remove_tags: Array[String], ?content: ContentConfig, ?map_limit: Integer, ?map_search: String, ?download_assets: bool, ?asset_types: Array[AssetCategory], ?max_asset_size: Integer, ?browser: BrowserConfig, ?proxy: ProxyConfig, ?user_agents: Array[String], ?capture_screenshot: bool, ?follow_document_urls: bool, ?document_url_depth: Integer, ?download_documents: bool, ?document_max_size: Integer, ?document_mime_types: Array[String], ?warc_output: String, ?browser_profile: String, ?save_browser_profile: bool) -> void
|
|
108
|
+
def initialize: (?max_depth: Integer, ?max_pages: Integer, ?max_concurrent: Integer, ?respect_robots_txt: bool, ?soft_http_errors: bool, ?user_agent: String, ?stay_on_domain: bool, ?allow_subdomains: bool, ?include_paths: Array[String], ?exclude_paths: Array[String], ?custom_headers: Hash[String, String], ?request_timeout: Integer, ?rate_limit_ms: Integer, ?max_redirects: Integer, ?retry_count: Integer, ?retry_codes: Array[Integer], ?cookies_enabled: bool, ?auth: AuthConfig, ?max_body_size: Integer, ?remove_tags: Array[String], ?content: ContentConfig, ?map_limit: Integer, ?map_search: String, ?download_assets: bool, ?asset_types: Array[AssetCategory], ?max_asset_size: Integer, ?browser: BrowserConfig, ?proxy: ProxyConfig, ?user_agents: Array[String], ?capture_screenshot: bool, ?follow_document_urls: bool, ?document_url_depth: Integer, ?download_documents: bool, ?document_max_size: Integer, ?document_mime_types: Array[String], ?warc_output: String, ?browser_profile: String, ?save_browser_profile: bool, ?ssrf: SsrfPolicy) -> void
|
|
108
109
|
def validate: () -> void
|
|
109
110
|
def self.default: () -> CrawlConfig
|
|
110
111
|
end
|
|
@@ -460,6 +461,15 @@ module Kreuzcrawl
|
|
|
460
461
|
def initialize: (?results: Array[BatchCrawlResult], ?total_count: Integer, ?completed_count: Integer, ?failed_count: Integer) -> void
|
|
461
462
|
end
|
|
462
463
|
|
|
464
|
+
class SsrfPolicy
|
|
465
|
+
attr_accessor deny_private: bool?
|
|
466
|
+
attr_accessor max_redirects: Integer?
|
|
467
|
+
|
|
468
|
+
def initialize: (?deny_private: bool, ?max_redirects: Integer) -> void
|
|
469
|
+
def self.default: () -> SsrfPolicy
|
|
470
|
+
def self.from_env: () -> SsrfPolicy
|
|
471
|
+
end
|
|
472
|
+
|
|
463
473
|
class BrowserMode
|
|
464
474
|
type value = :auto | :always | :never | :stealth
|
|
465
475
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzcrawl
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.0.pre.rc.71
|
|
4
|
+
version: 0.3.0.pre.rc.76
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Kreuzberg Team
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-06-
|
|
11
|
+
date: 2026-06-17 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|