kreuzcrawl 0.3.0.pre.rc.72 → 0.3.0.pre.rc.76

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8edea9a39a5cbe88bec892ae7fc8de37f0a4b4d5dc3430f5a6e4c0ccb475e443
4
- data.tar.gz: 3dd64e9b4d361cc4d3ec82178260aaa018b0ca1d1d2f76034cb20ed05c23cefd
3
+ metadata.gz: 2e042443c6e9e74a3efa5090e5a9007bf95b3a29326d9d9c49a8b5310df1892d
4
+ data.tar.gz: 33d32333a455f6330dcec4266db79dafb758c0014c99d7d62af0f608befa811f
5
5
  SHA512:
6
- metadata.gz: 56ac6e332c9e4cba97c1adb271a6e36dd1fb0c8eb1b9549950d7048c208e9ecf2f9bdb8529dd326d2b5bece9fa2d56f961e8199b2c5667167ee5965ffa3f7596
7
- data.tar.gz: b84490766d8ade6a489ba72ee7711c25b6e16e75861073f9b3c4346c8d0da4ecdca2321c9f590860070260ab9ae144429e4837ae9fced893b0983feee3718dd0
6
+ metadata.gz: 946f2b86e1bfcf44cee9623c154dd8a0cc3fbec4860e5131fddc96d6aa8a7a4ff38933e2d8172d21ad7fe17dcc60382424100ea3404e314f800a59d740b2eab9
7
+ data.tar.gz: c860e9ab287b354203960a631d81b0bf5e1786b9bdb1263465598922d2ad9c76c25986747a28df0c37451f0f931666df444256b2d467c57c6f30d58c97d78df1
@@ -1460,9 +1460,9 @@ dependencies = [
1460
1460
 
1461
1461
  [[package]]
1462
1462
  name = "kreuzcrawl"
1463
- version = "0.3.0-rc.72"
1463
+ version = "0.3.0-rc.76"
1464
1464
  source = "registry+https://github.com/rust-lang/crates.io-index"
1465
- checksum = "8a83f0115e5fce77f5f6a4c53d572b8c151038c0fcd16c641db2e8b499603582"
1465
+ checksum = "b620f0f12386898f22a5e8e2ab8ba1066d3a304c3b6e8847d5ee8e51cd068de1"
1466
1466
  dependencies = [
1467
1467
  "ahash",
1468
1468
  "aho-corasick",
@@ -1502,7 +1502,7 @@ dependencies = [
1502
1502
 
1503
1503
  [[package]]
1504
1504
  name = "kreuzcrawl-rb"
1505
- version = "0.3.0-rc.72"
1505
+ version = "0.3.0-rc.76"
1506
1506
  dependencies = [
1507
1507
  "futures",
1508
1508
  "kreuzcrawl",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzcrawl-rb"
3
- version = "0.3.0-rc.72"
3
+ version = "0.3.0-rc.76"
4
4
  edition = "2024"
5
5
  license = "Elastic-2.0"
6
6
  description = "High-performance web crawling engine"
@@ -18,7 +18,7 @@ crate-type = ["cdylib"]
18
18
 
19
19
  [dependencies]
20
20
  futures = "0.3"
21
- kreuzcrawl = { version = "0.3.0-rc.72", features = ["interact", "browser-chromiumoxide"] }
21
+ kreuzcrawl = { version = "0.3.0-rc.76", features = ["interact", "browser-chromiumoxide"] }
22
22
  magnus = "0.8"
23
23
  rb-sys = ">=0.9, <0.9.128"
24
24
  serde = { version = "1", features = ["derive"] }
@@ -1,5 +1,5 @@
1
1
  // This file is auto-generated by alef. DO NOT EDIT.
2
- // alef:hash:bcc3d5bd544a6e7593876ab5de51eb8c1f53d0d8d2c8c3d31c436d54ead6b3ec
2
+ // alef:hash:8896b7e192e408f916c81acf04c97fc5ee10b269fe22c287e9eefab02fada615
3
3
  // Re-generate with: alef generate
4
4
  #![allow(dead_code, unused_imports, unused_variables)]
5
5
  #![allow(
@@ -608,6 +608,7 @@ pub struct CrawlConfig {
608
608
  warc_output: Option<String>,
609
609
  browser_profile: Option<String>,
610
610
  save_browser_profile: bool,
611
+ ssrf: SsrfPolicy,
611
612
  }
612
613
 
613
614
  unsafe impl IntoValueFromNative for CrawlConfig {}
@@ -679,6 +680,7 @@ impl Default for CrawlConfig {
679
680
  warc_output: None,
680
681
  browser_profile: None,
681
682
  save_browser_profile: false,
683
+ ssrf: Default::default(),
682
684
  }
683
685
  }
684
686
  }
@@ -827,6 +829,10 @@ impl CrawlConfig {
827
829
  .get(ruby.to_symbol("save_browser_profile"))
828
830
  .and_then(|v| bool::try_convert(v).ok())
829
831
  .unwrap_or(false),
832
+ ssrf: kwargs
833
+ .get(ruby.to_symbol("ssrf"))
834
+ .and_then(|v| SsrfPolicy::try_convert(v).ok())
835
+ .unwrap_or_default(),
830
836
  })
831
837
  }
832
838
 
@@ -982,6 +988,10 @@ impl CrawlConfig {
982
988
  self.save_browser_profile
983
989
  }
984
990
 
991
+ fn ssrf(&self) -> SsrfPolicy {
992
+ self.ssrf.clone()
993
+ }
994
+
985
995
  fn validate(&self) -> Result<(), Error> {
986
996
  #[allow(clippy::needless_update)]
987
997
  let core_self = kreuzcrawl::CrawlConfig {
@@ -1066,8 +1076,8 @@ impl CrawlConfig {
1066
1076
 
1067
1077
  save_browser_profile: self.save_browser_profile,
1068
1078
 
1069
- ssrf: Default::default(),
1070
- dispatch: Default::default(),
1079
+ ssrf: self.ssrf.clone().into(),
1080
+
1071
1081
  ..Default::default()
1072
1082
  };
1073
1083
  let result = core_self.validate().map_err(|e| {
@@ -2101,7 +2111,7 @@ impl CrawlResult {
2101
2111
 
2102
2112
  browser_used: self.browser_used,
2103
2113
 
2104
- normalized_urls: Default::default(),
2114
+ ..Default::default()
2105
2115
  };
2106
2116
  core_self.unique_normalized_urls()
2107
2117
  }
@@ -4419,6 +4429,75 @@ impl BatchCrawlResults {
4419
4429
  }
4420
4430
  }
4421
4431
 
4432
+ #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
4433
+ #[serde(default)]
4434
+ #[magnus::wrap(class = "Kreuzcrawl::SsrfPolicy")]
4435
+ pub struct SsrfPolicy {
4436
+ deny_private: bool,
4437
+ max_redirects: u8,
4438
+ }
4439
+
4440
+ unsafe impl IntoValueFromNative for SsrfPolicy {}
4441
+
4442
+ impl magnus::TryConvert for SsrfPolicy {
4443
+ fn try_convert(val: magnus::Value) -> Result<Self, magnus::Error> {
4444
+ if let Ok(r) = <&SsrfPolicy as magnus::TryConvert>::try_convert(val) {
4445
+ return Ok(r.clone());
4446
+ }
4447
+ let json_str: String = if let Ok(s) = <String as magnus::TryConvert>::try_convert(val) {
4448
+ s
4449
+ } else {
4450
+ val.funcall::<_, _, String>("to_json", ()).map_err(|e| {
4451
+ magnus::Error::new(
4452
+ unsafe { magnus::Ruby::get_unchecked() }.exception_type_error(),
4453
+ format!("no implicit conversion into SsrfPolicy: {}", e),
4454
+ )
4455
+ })?
4456
+ };
4457
+ serde_json::from_str::<SsrfPolicy>(&json_str).map_err(|e| {
4458
+ magnus::Error::new(
4459
+ unsafe { magnus::Ruby::get_unchecked() }.exception_type_error(),
4460
+ format!("failed to deserialize SsrfPolicy: {}", e),
4461
+ )
4462
+ })
4463
+ }
4464
+ }
4465
+
4466
+ unsafe impl TryConvertOwned for SsrfPolicy {}
4467
+
4468
+ impl Default for SsrfPolicy {
4469
+ fn default() -> Self {
4470
+ kreuzcrawl::SsrfPolicy::default().into()
4471
+ }
4472
+ }
4473
+
4474
+ impl SsrfPolicy {
4475
+ fn new(args: &[magnus::Value]) -> Result<Self, magnus::Error> {
4476
+ let ruby = unsafe { magnus::Ruby::get_unchecked() };
4477
+ let args = magnus::scan_args::scan_args::<(), (Option<magnus::RHash>,), (), (), (), ()>(args)?;
4478
+ let (kwargs_opt,) = args.optional;
4479
+ let kwargs = kwargs_opt.unwrap_or_else(|| ruby.hash_new());
4480
+ Ok(Self {
4481
+ deny_private: kwargs
4482
+ .get(ruby.to_symbol("deny_private"))
4483
+ .and_then(|v| bool::try_convert(v).ok())
4484
+ .unwrap_or(true),
4485
+ max_redirects: kwargs
4486
+ .get(ruby.to_symbol("max_redirects"))
4487
+ .and_then(|v| u8::try_convert(v).ok())
4488
+ .unwrap_or(5),
4489
+ })
4490
+ }
4491
+
4492
+ fn deny_private(&self) -> bool {
4493
+ self.deny_private
4494
+ }
4495
+
4496
+ fn max_redirects(&self) -> u8 {
4497
+ self.max_redirects
4498
+ }
4499
+ }
4500
+
4422
4501
  #[derive(Clone, Copy, PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
4423
4502
  #[serde(rename_all = "snake_case")]
4424
4503
  pub enum BrowserMode {
@@ -5446,6 +5525,7 @@ impl From<CrawlConfig> for kreuzcrawl::CrawlConfig {
5446
5525
  warc_output: val.warc_output.map(Into::into),
5447
5526
  browser_profile: val.browser_profile,
5448
5527
  save_browser_profile: val.save_browser_profile,
5528
+ ssrf: val.ssrf.into(),
5449
5529
  ..Default::default()
5450
5530
  }
5451
5531
  }
@@ -5497,6 +5577,7 @@ impl From<kreuzcrawl::CrawlConfig> for CrawlConfig {
5497
5577
  warc_output: val.warc_output.map(|p| p.to_string_lossy().to_string()),
5498
5578
  browser_profile: val.browser_profile.map(|v| v.to_string()),
5499
5579
  save_browser_profile: val.save_browser_profile,
5580
+ ssrf: val.ssrf.into(),
5500
5581
  }
5501
5582
  }
5502
5583
  }
@@ -6384,6 +6465,28 @@ impl From<kreuzcrawl::BatchCrawlResults> for BatchCrawlResults {
6384
6465
  }
6385
6466
  }
6386
6467
 
6468
+ #[allow(clippy::needless_update)]
6469
+ #[allow(clippy::redundant_closure, clippy::useless_conversion)]
6470
+ impl From<SsrfPolicy> for kreuzcrawl::SsrfPolicy {
6471
+ fn from(val: SsrfPolicy) -> Self {
6472
+ Self {
6473
+ deny_private: val.deny_private,
6474
+ max_redirects: val.max_redirects,
6475
+ ..Default::default()
6476
+ }
6477
+ }
6478
+ }
6479
+
6480
+ #[allow(clippy::redundant_closure, clippy::useless_conversion)]
6481
+ impl From<kreuzcrawl::SsrfPolicy> for SsrfPolicy {
6482
+ fn from(val: kreuzcrawl::SsrfPolicy) -> Self {
6483
+ Self {
6484
+ deny_private: val.deny_private,
6485
+ max_redirects: val.max_redirects,
6486
+ }
6487
+ }
6488
+ }
6489
+
6387
6490
  impl From<BrowserMode> for kreuzcrawl::BrowserMode {
6388
6491
  fn from(val: BrowserMode) -> Self {
6389
6492
  match val {
@@ -6666,6 +6769,13 @@ fn crawl_error_to_magnus_err(e: kreuzcrawl::CrawlError) -> magnus::Error {
6666
6769
  magnus::Error::new(unsafe { magnus::Ruby::get_unchecked() }.exception_runtime_error(), msg)
6667
6770
  }
6668
6771
 
6772
+ /// Convert a `kreuzcrawl::SsrfError` error to a Magnus runtime error.
6773
+ #[allow(dead_code)]
6774
+ fn ssrf_error_to_magnus_err(e: kreuzcrawl::SsrfError) -> magnus::Error {
6775
+ let msg = e.to_string();
6776
+ magnus::Error::new(unsafe { magnus::Ruby::get_unchecked() }.exception_runtime_error(), msg)
6777
+ }
6778
+
6669
6779
  #[derive(Clone)]
6670
6780
  #[magnus::wrap(class = "Kreuzcrawl::CrawlStreamIterator")]
6671
6781
  pub struct CrawlStreamIterator {
@@ -7011,6 +7121,8 @@ fn ruby_init(ruby: &Ruby) -> Result<(), Error> {
7011
7121
 
7012
7122
  class.define_method("save_browser_profile", method!(CrawlConfig::save_browser_profile, 0))?;
7013
7123
 
7124
+ class.define_method("ssrf", method!(CrawlConfig::ssrf, 0))?;
7125
+
7014
7126
  class.define_method("validate", method!(CrawlConfig::validate, 0))?;
7015
7127
 
7016
7128
  let class = module.define_class("BrowserExtras", ruby.class_object())?;
@@ -7537,6 +7649,14 @@ fn ruby_init(ruby: &Ruby) -> Result<(), Error> {
7537
7649
 
7538
7650
  class.define_method("failed_count", method!(BatchCrawlResults::failed_count, 0))?;
7539
7651
 
7652
+ let class = module.define_class("SsrfPolicy", ruby.class_object())?;
7653
+
7654
+ class.define_singleton_method("new", function!(SsrfPolicy::new, -1))?;
7655
+
7656
+ class.define_method("deny_private", method!(SsrfPolicy::deny_private, 0))?;
7657
+
7658
+ class.define_method("max_redirects", method!(SsrfPolicy::max_redirects, 0))?;
7659
+
7540
7660
  let class = module.define_class("CrawlStreamIterator", ruby.class_object())?;
7541
7661
  class.define_method("next_chunk", method!(CrawlStreamIterator::next_chunk, 0))?;
7542
7662
  class.define_method("each", method!(CrawlStreamIterator::each, 0))?;
@@ -1,5 +1,5 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:bcc3d5bd544a6e7593876ab5de51eb8c1f53d0d8d2c8c3d31c436d54ead6b3ec
2
+ # alef:hash:8896b7e192e408f916c81acf04c97fc5ee10b269fe22c287e9eefab02fada615
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
  # frozen_string_literal: true
@@ -1,10 +1,10 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:bcc3d5bd544a6e7593876ab5de51eb8c1f53d0d8d2c8c3d31c436d54ead6b3ec
2
+ # alef:hash:8896b7e192e408f916c81acf04c97fc5ee10b269fe22c287e9eefab02fada615
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
  # frozen_string_literal: true
6
6
 
7
7
  module Kreuzcrawl
8
8
  ## The version string for this package.
9
- VERSION = "0.3.0.pre.rc.72"
9
+ VERSION = "0.3.0.pre.rc.76"
10
10
  end
data/lib/kreuzcrawl.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:bcc3d5bd544a6e7593876ab5de51eb8c1f53d0d8d2c8c3d31c436d54ead6b3ec
2
+ # alef:hash:8896b7e192e408f916c81acf04c97fc5ee10b269fe22c287e9eefab02fada615
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
  # frozen_string_literal: true
data/lib/kreuzcrawl_rb.so CHANGED
Binary file
data/sig/types.rbs CHANGED
@@ -1,5 +1,5 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:bcc3d5bd544a6e7593876ab5de51eb8c1f53d0d8d2c8c3d31c436d54ead6b3ec
2
+ # alef:hash:8896b7e192e408f916c81acf04c97fc5ee10b269fe22c287e9eefab02fada615
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
 
@@ -103,8 +103,9 @@ module Kreuzcrawl
103
103
  attr_accessor warc_output: String?
104
104
  attr_accessor browser_profile: String?
105
105
  attr_accessor save_browser_profile: bool?
106
+ attr_accessor ssrf: SsrfPolicy?
106
107
 
107
- def initialize: (?max_depth: Integer, ?max_pages: Integer, ?max_concurrent: Integer, ?respect_robots_txt: bool, ?soft_http_errors: bool, ?user_agent: String, ?stay_on_domain: bool, ?allow_subdomains: bool, ?include_paths: Array[String], ?exclude_paths: Array[String], ?custom_headers: Hash[String, String], ?request_timeout: Integer, ?rate_limit_ms: Integer, ?max_redirects: Integer, ?retry_count: Integer, ?retry_codes: Array[Integer], ?cookies_enabled: bool, ?auth: AuthConfig, ?max_body_size: Integer, ?remove_tags: Array[String], ?content: ContentConfig, ?map_limit: Integer, ?map_search: String, ?download_assets: bool, ?asset_types: Array[AssetCategory], ?max_asset_size: Integer, ?browser: BrowserConfig, ?proxy: ProxyConfig, ?user_agents: Array[String], ?capture_screenshot: bool, ?follow_document_urls: bool, ?document_url_depth: Integer, ?download_documents: bool, ?document_max_size: Integer, ?document_mime_types: Array[String], ?warc_output: String, ?browser_profile: String, ?save_browser_profile: bool) -> void
108
+ def initialize: (?max_depth: Integer, ?max_pages: Integer, ?max_concurrent: Integer, ?respect_robots_txt: bool, ?soft_http_errors: bool, ?user_agent: String, ?stay_on_domain: bool, ?allow_subdomains: bool, ?include_paths: Array[String], ?exclude_paths: Array[String], ?custom_headers: Hash[String, String], ?request_timeout: Integer, ?rate_limit_ms: Integer, ?max_redirects: Integer, ?retry_count: Integer, ?retry_codes: Array[Integer], ?cookies_enabled: bool, ?auth: AuthConfig, ?max_body_size: Integer, ?remove_tags: Array[String], ?content: ContentConfig, ?map_limit: Integer, ?map_search: String, ?download_assets: bool, ?asset_types: Array[AssetCategory], ?max_asset_size: Integer, ?browser: BrowserConfig, ?proxy: ProxyConfig, ?user_agents: Array[String], ?capture_screenshot: bool, ?follow_document_urls: bool, ?document_url_depth: Integer, ?download_documents: bool, ?document_max_size: Integer, ?document_mime_types: Array[String], ?warc_output: String, ?browser_profile: String, ?save_browser_profile: bool, ?ssrf: SsrfPolicy) -> void
108
109
  def validate: () -> void
109
110
  def self.default: () -> CrawlConfig
110
111
  end
@@ -460,6 +461,15 @@ module Kreuzcrawl
460
461
  def initialize: (?results: Array[BatchCrawlResult], ?total_count: Integer, ?completed_count: Integer, ?failed_count: Integer) -> void
461
462
  end
462
463
 
464
+ class SsrfPolicy
465
+ attr_accessor deny_private: bool?
466
+ attr_accessor max_redirects: Integer?
467
+
468
+ def initialize: (?deny_private: bool, ?max_redirects: Integer) -> void
469
+ def self.default: () -> SsrfPolicy
470
+ def self.from_env: () -> SsrfPolicy
471
+ end
472
+
463
473
  class BrowserMode
464
474
  type value = :auto | :always | :never | :stealth
465
475
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzcrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0.pre.rc.72
4
+ version: 0.3.0.pre.rc.76
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kreuzberg Team
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-06-16 00:00:00.000000000 Z
11
+ date: 2026-06-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys