kreuzcrawl 0.3.0.pre.rc.71 → 0.3.0.pre.rc.76

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f9dc9a2b925bd356635ef537594ccf72049f8bbfa47754f348fe0643443a4c9f
4
- data.tar.gz: 06b86af2e1ef230a61a1c9c81d84115e2346e558de94df0d17c08097e63c84e1
3
+ metadata.gz: 2e042443c6e9e74a3efa5090e5a9007bf95b3a29326d9d9c49a8b5310df1892d
4
+ data.tar.gz: 33d32333a455f6330dcec4266db79dafb758c0014c99d7d62af0f608befa811f
5
5
  SHA512:
6
- metadata.gz: 490013779c30677fd6f3618275bb0bd9f11f0da30c43b320e2fda3b814762d18fac68391c9b6e91c9b9f80a5f3cc4ee6a008d48e4897b6dd79c314af085e5d5b
7
- data.tar.gz: 6c8b027fc5db3dd04aa20b034a09e80a4041d527e688131a0bdfbc2f87a9390bef47adc14db5e096a195d69ddcc45ad17d68f43dbb91846ce09fba8919d6d5c0
6
+ metadata.gz: 946f2b86e1bfcf44cee9623c154dd8a0cc3fbec4860e5131fddc96d6aa8a7a4ff38933e2d8172d21ad7fe17dcc60382424100ea3404e314f800a59d740b2eab9
7
+ data.tar.gz: c860e9ab287b354203960a631d81b0bf5e1786b9bdb1263465598922d2ad9c76c25986747a28df0c37451f0f931666df444256b2d467c57c6f30d58c97d78df1
data/README.md CHANGED
@@ -141,12 +141,13 @@ Contributions are welcome! Please see our [Contributing Guide](https://github.co
141
141
 
142
142
  ## Part of Kreuzberg.dev
143
143
 
144
- - [Kreuzberg](https://github.com/kreuzberg-dev/kreuzberg) — document intelligence: text, tables, metadata from 90+ formats with optional OCR.
144
+ - [Kreuzberg](https://github.com/kreuzberg-dev/kreuzberg) — document intelligence: text, tables, metadata from 91+ formats with optional OCR.
145
145
  - [Kreuzberg Cloud](https://github.com/kreuzberg-dev/kreuzberg-cloud) — managed extraction API with SDKs, dashboards, and observability.
146
+ - [kreuzcrawl](https://github.com/kreuzberg-dev/kreuzcrawl) — web crawling and scraping with HTML→Markdown and headless-Chrome fallback.
146
147
  - [html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown) — fast, lossless HTML→Markdown engine.
147
148
  - [liter-llm](https://github.com/kreuzberg-dev/liter-llm) — universal LLM API client with native bindings for 14 languages and 143 providers.
148
149
  - [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — tree-sitter grammars and code-intelligence primitives.
149
- - [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces this README and all per-language bindings.
150
+ - [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces every per-language binding across the 5 polyglot repos.
150
151
  - [Discord](https://discord.gg/xt9WY3GnKR) — community, roadmap, announcements.
151
152
 
152
153
  ## License
@@ -158,4 +159,3 @@ This project is licensed under [Elastic License 2.0](https://github.com/kreuzber
158
159
  - [Documentation](https://docs.kreuzcrawl.kreuzberg.dev)
159
160
  - [GitHub Repository](https://github.com/kreuzberg-dev/kreuzcrawl)
160
161
  - [Issue Tracker](https://github.com/kreuzberg-dev/kreuzcrawl/issues)
161
- - [Issues](https://github.com/kreuzberg-dev/kreuzcrawl/issues)
@@ -1460,9 +1460,9 @@ dependencies = [
1460
1460
 
1461
1461
  [[package]]
1462
1462
  name = "kreuzcrawl"
1463
- version = "0.3.0-rc.71"
1463
+ version = "0.3.0-rc.76"
1464
1464
  source = "registry+https://github.com/rust-lang/crates.io-index"
1465
- checksum = "e8c85e79454c048ec41289dfb0168e87ff1c5a0a28997de8cc0dbd3ab331c8e2"
1465
+ checksum = "b620f0f12386898f22a5e8e2ab8ba1066d3a304c3b6e8847d5ee8e51cd068de1"
1466
1466
  dependencies = [
1467
1467
  "ahash",
1468
1468
  "aho-corasick",
@@ -1502,7 +1502,7 @@ dependencies = [
1502
1502
 
1503
1503
  [[package]]
1504
1504
  name = "kreuzcrawl-rb"
1505
- version = "0.3.0-rc.71"
1505
+ version = "0.3.0-rc.76"
1506
1506
  dependencies = [
1507
1507
  "futures",
1508
1508
  "kreuzcrawl",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzcrawl-rb"
3
- version = "0.3.0-rc.71"
3
+ version = "0.3.0-rc.76"
4
4
  edition = "2024"
5
5
  license = "Elastic-2.0"
6
6
  description = "High-performance web crawling engine"
@@ -18,7 +18,7 @@ crate-type = ["cdylib"]
18
18
 
19
19
  [dependencies]
20
20
  futures = "0.3"
21
- kreuzcrawl = { version = "0.3.0-rc.71", features = ["interact", "browser-chromiumoxide"] }
21
+ kreuzcrawl = { version = "0.3.0-rc.76", features = ["interact", "browser-chromiumoxide"] }
22
22
  magnus = "0.8"
23
23
  rb-sys = ">=0.9, <0.9.128"
24
24
  serde = { version = "1", features = ["derive"] }
@@ -1,5 +1,5 @@
1
1
  // This file is auto-generated by alef. DO NOT EDIT.
2
- // alef:hash:f2acf0fe68bedde7da1909fa6feedaffb995d5aec49220ec23bf64125a392d19
2
+ // alef:hash:8896b7e192e408f916c81acf04c97fc5ee10b269fe22c287e9eefab02fada615
3
3
  // Re-generate with: alef generate
4
4
  #![allow(dead_code, unused_imports, unused_variables)]
5
5
  #![allow(
@@ -608,6 +608,7 @@ pub struct CrawlConfig {
608
608
  warc_output: Option<String>,
609
609
  browser_profile: Option<String>,
610
610
  save_browser_profile: bool,
611
+ ssrf: SsrfPolicy,
611
612
  }
612
613
 
613
614
  unsafe impl IntoValueFromNative for CrawlConfig {}
@@ -679,6 +680,7 @@ impl Default for CrawlConfig {
679
680
  warc_output: None,
680
681
  browser_profile: None,
681
682
  save_browser_profile: false,
683
+ ssrf: Default::default(),
682
684
  }
683
685
  }
684
686
  }
@@ -827,6 +829,10 @@ impl CrawlConfig {
827
829
  .get(ruby.to_symbol("save_browser_profile"))
828
830
  .and_then(|v| bool::try_convert(v).ok())
829
831
  .unwrap_or(false),
832
+ ssrf: kwargs
833
+ .get(ruby.to_symbol("ssrf"))
834
+ .and_then(|v| SsrfPolicy::try_convert(v).ok())
835
+ .unwrap_or_default(),
830
836
  })
831
837
  }
832
838
 
@@ -982,6 +988,10 @@ impl CrawlConfig {
982
988
  self.save_browser_profile
983
989
  }
984
990
 
991
+ fn ssrf(&self) -> SsrfPolicy {
992
+ self.ssrf.clone()
993
+ }
994
+
985
995
  fn validate(&self) -> Result<(), Error> {
986
996
  #[allow(clippy::needless_update)]
987
997
  let core_self = kreuzcrawl::CrawlConfig {
@@ -1066,8 +1076,8 @@ impl CrawlConfig {
1066
1076
 
1067
1077
  save_browser_profile: self.save_browser_profile,
1068
1078
 
1069
- ssrf: Default::default(),
1070
- dispatch: Default::default(),
1079
+ ssrf: self.ssrf.clone().into(),
1080
+
1071
1081
  ..Default::default()
1072
1082
  };
1073
1083
  let result = core_self.validate().map_err(|e| {
@@ -2101,7 +2111,7 @@ impl CrawlResult {
2101
2111
 
2102
2112
  browser_used: self.browser_used,
2103
2113
 
2104
- normalized_urls: Default::default(),
2114
+ ..Default::default()
2105
2115
  };
2106
2116
  core_self.unique_normalized_urls()
2107
2117
  }
@@ -4419,6 +4429,75 @@ impl BatchCrawlResults {
4419
4429
  }
4420
4430
  }
4421
4431
 
4432
+ #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
4433
+ #[serde(default)]
4434
+ #[magnus::wrap(class = "Kreuzcrawl::SsrfPolicy")]
4435
+ pub struct SsrfPolicy {
4436
+ deny_private: bool,
4437
+ max_redirects: u8,
4438
+ }
4439
+
4440
+ unsafe impl IntoValueFromNative for SsrfPolicy {}
4441
+
4442
+ impl magnus::TryConvert for SsrfPolicy {
4443
+ fn try_convert(val: magnus::Value) -> Result<Self, magnus::Error> {
4444
+ if let Ok(r) = <&SsrfPolicy as magnus::TryConvert>::try_convert(val) {
4445
+ return Ok(r.clone());
4446
+ }
4447
+ let json_str: String = if let Ok(s) = <String as magnus::TryConvert>::try_convert(val) {
4448
+ s
4449
+ } else {
4450
+ val.funcall::<_, _, String>("to_json", ()).map_err(|e| {
4451
+ magnus::Error::new(
4452
+ unsafe { magnus::Ruby::get_unchecked() }.exception_type_error(),
4453
+ format!("no implicit conversion into SsrfPolicy: {}", e),
4454
+ )
4455
+ })?
4456
+ };
4457
+ serde_json::from_str::<SsrfPolicy>(&json_str).map_err(|e| {
4458
+ magnus::Error::new(
4459
+ unsafe { magnus::Ruby::get_unchecked() }.exception_type_error(),
4460
+ format!("failed to deserialize SsrfPolicy: {}", e),
4461
+ )
4462
+ })
4463
+ }
4464
+ }
4465
+
4466
+ unsafe impl TryConvertOwned for SsrfPolicy {}
4467
+
4468
+ impl Default for SsrfPolicy {
4469
+ fn default() -> Self {
4470
+ kreuzcrawl::SsrfPolicy::default().into()
4471
+ }
4472
+ }
4473
+
4474
+ impl SsrfPolicy {
4475
+ fn new(args: &[magnus::Value]) -> Result<Self, magnus::Error> {
4476
+ let ruby = unsafe { magnus::Ruby::get_unchecked() };
4477
+ let args = magnus::scan_args::scan_args::<(), (Option<magnus::RHash>,), (), (), (), ()>(args)?;
4478
+ let (kwargs_opt,) = args.optional;
4479
+ let kwargs = kwargs_opt.unwrap_or_else(|| ruby.hash_new());
4480
+ Ok(Self {
4481
+ deny_private: kwargs
4482
+ .get(ruby.to_symbol("deny_private"))
4483
+ .and_then(|v| bool::try_convert(v).ok())
4484
+ .unwrap_or(true),
4485
+ max_redirects: kwargs
4486
+ .get(ruby.to_symbol("max_redirects"))
4487
+ .and_then(|v| u8::try_convert(v).ok())
4488
+ .unwrap_or(5),
4489
+ })
4490
+ }
4491
+
4492
+ fn deny_private(&self) -> bool {
4493
+ self.deny_private
4494
+ }
4495
+
4496
+ fn max_redirects(&self) -> u8 {
4497
+ self.max_redirects
4498
+ }
4499
+ }
4500
+
4422
4501
  #[derive(Clone, Copy, PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
4423
4502
  #[serde(rename_all = "snake_case")]
4424
4503
  pub enum BrowserMode {
@@ -5446,8 +5525,7 @@ impl From<CrawlConfig> for kreuzcrawl::CrawlConfig {
5446
5525
  warc_output: val.warc_output.map(Into::into),
5447
5526
  browser_profile: val.browser_profile,
5448
5527
  save_browser_profile: val.save_browser_profile,
5449
- ssrf: Default::default(),
5450
- dispatch: Default::default(),
5528
+ ssrf: val.ssrf.into(),
5451
5529
  ..Default::default()
5452
5530
  }
5453
5531
  }
@@ -5499,6 +5577,7 @@ impl From<kreuzcrawl::CrawlConfig> for CrawlConfig {
5499
5577
  warc_output: val.warc_output.map(|p| p.to_string_lossy().to_string()),
5500
5578
  browser_profile: val.browser_profile.map(|v| v.to_string()),
5501
5579
  save_browser_profile: val.save_browser_profile,
5580
+ ssrf: val.ssrf.into(),
5502
5581
  }
5503
5582
  }
5504
5583
  }
@@ -5525,17 +5604,18 @@ impl From<kreuzcrawl::BrowserExtras> for BrowserExtras {
5525
5604
  }
5526
5605
  }
5527
5606
 
5607
+ #[allow(clippy::needless_update)]
5528
5608
  #[allow(clippy::redundant_closure, clippy::useless_conversion)]
5529
5609
  impl From<DownloadedDocument> for kreuzcrawl::DownloadedDocument {
5530
5610
  fn from(val: DownloadedDocument) -> Self {
5531
5611
  Self {
5532
5612
  url: val.url,
5533
5613
  mime_type: val.mime_type.into(),
5534
- content: Default::default(),
5535
5614
  size: val.size,
5536
5615
  filename: val.filename.map(Into::into),
5537
5616
  content_hash: val.content_hash.into(),
5538
5617
  headers: val.headers.into_iter().map(|(k, v)| (k.into(), v.into())).collect(),
5618
+ ..Default::default()
5539
5619
  }
5540
5620
  }
5541
5621
  }
@@ -5554,6 +5634,7 @@ impl From<kreuzcrawl::DownloadedDocument> for DownloadedDocument {
5554
5634
  }
5555
5635
  }
5556
5636
 
5637
+ #[allow(clippy::needless_update)]
5557
5638
  #[allow(clippy::redundant_closure, clippy::useless_conversion)]
5558
5639
  impl From<InteractionResult> for kreuzcrawl::InteractionResult {
5559
5640
  fn from(val: InteractionResult) -> Self {
@@ -5561,7 +5642,7 @@ impl From<InteractionResult> for kreuzcrawl::InteractionResult {
5561
5642
  action_results: val.action_results.into_iter().map(Into::into).collect(),
5562
5643
  final_html: val.final_html,
5563
5644
  final_url: val.final_url,
5564
- screenshot: Default::default(),
5645
+ ..Default::default()
5565
5646
  }
5566
5647
  }
5567
5648
  }
@@ -5603,6 +5684,7 @@ impl From<kreuzcrawl::ActionResult> for ActionResult {
5603
5684
  }
5604
5685
  }
5605
5686
 
5687
+ #[allow(clippy::needless_update)]
5606
5688
  #[allow(clippy::redundant_closure, clippy::useless_conversion)]
5607
5689
  impl From<ScrapeResult> for kreuzcrawl::ScrapeResult {
5608
5690
  fn from(val: ScrapeResult) -> Self {
@@ -5633,9 +5715,9 @@ impl From<ScrapeResult> for kreuzcrawl::ScrapeResult {
5633
5715
  markdown: val.markdown.map(Into::into),
5634
5716
  extracted_data: val.extracted_data.as_ref().and_then(|s| serde_json::from_str(s).ok()),
5635
5717
  extraction_meta: val.extraction_meta.map(Into::into),
5636
- screenshot: Default::default(),
5637
5718
  downloaded_document: val.downloaded_document.map(Into::into),
5638
5719
  browser: val.browser.map(Into::into),
5720
+ ..Default::default()
5639
5721
  }
5640
5722
  }
5641
5723
  }
@@ -5734,6 +5816,7 @@ impl From<kreuzcrawl::CrawlPageResult> for CrawlPageResult {
5734
5816
  }
5735
5817
  }
5736
5818
 
5819
+ #[allow(clippy::needless_update)]
5737
5820
  #[allow(clippy::redundant_closure, clippy::useless_conversion)]
5738
5821
  impl From<CrawlResult> for kreuzcrawl::CrawlResult {
5739
5822
  fn from(val: CrawlResult) -> Self {
@@ -5746,7 +5829,7 @@ impl From<CrawlResult> for kreuzcrawl::CrawlResult {
5746
5829
  cookies: val.cookies.into_iter().map(Into::into).collect(),
5747
5830
  stayed_on_domain: val.stayed_on_domain,
5748
5831
  browser_used: val.browser_used,
5749
- normalized_urls: Default::default(),
5832
+ ..Default::default()
5750
5833
  }
5751
5834
  }
5752
5835
  }
@@ -6382,6 +6465,28 @@ impl From<kreuzcrawl::BatchCrawlResults> for BatchCrawlResults {
6382
6465
  }
6383
6466
  }
6384
6467
 
6468
+ #[allow(clippy::needless_update)]
6469
+ #[allow(clippy::redundant_closure, clippy::useless_conversion)]
6470
+ impl From<SsrfPolicy> for kreuzcrawl::SsrfPolicy {
6471
+ fn from(val: SsrfPolicy) -> Self {
6472
+ Self {
6473
+ deny_private: val.deny_private,
6474
+ max_redirects: val.max_redirects,
6475
+ ..Default::default()
6476
+ }
6477
+ }
6478
+ }
6479
+
6480
+ #[allow(clippy::redundant_closure, clippy::useless_conversion)]
6481
+ impl From<kreuzcrawl::SsrfPolicy> for SsrfPolicy {
6482
+ fn from(val: kreuzcrawl::SsrfPolicy) -> Self {
6483
+ Self {
6484
+ deny_private: val.deny_private,
6485
+ max_redirects: val.max_redirects,
6486
+ }
6487
+ }
6488
+ }
6489
+
6385
6490
  impl From<BrowserMode> for kreuzcrawl::BrowserMode {
6386
6491
  fn from(val: BrowserMode) -> Self {
6387
6492
  match val {
@@ -6664,6 +6769,13 @@ fn crawl_error_to_magnus_err(e: kreuzcrawl::CrawlError) -> magnus::Error {
6664
6769
  magnus::Error::new(unsafe { magnus::Ruby::get_unchecked() }.exception_runtime_error(), msg)
6665
6770
  }
6666
6771
 
6772
+ /// Convert a `kreuzcrawl::SsrfError` error to a Magnus runtime error.
6773
+ #[allow(dead_code)]
6774
+ fn ssrf_error_to_magnus_err(e: kreuzcrawl::SsrfError) -> magnus::Error {
6775
+ let msg = e.to_string();
6776
+ magnus::Error::new(unsafe { magnus::Ruby::get_unchecked() }.exception_runtime_error(), msg)
6777
+ }
6778
+
6667
6779
  #[derive(Clone)]
6668
6780
  #[magnus::wrap(class = "Kreuzcrawl::CrawlStreamIterator")]
6669
6781
  pub struct CrawlStreamIterator {
@@ -7009,6 +7121,8 @@ fn ruby_init(ruby: &Ruby) -> Result<(), Error> {
7009
7121
 
7010
7122
  class.define_method("save_browser_profile", method!(CrawlConfig::save_browser_profile, 0))?;
7011
7123
 
7124
+ class.define_method("ssrf", method!(CrawlConfig::ssrf, 0))?;
7125
+
7012
7126
  class.define_method("validate", method!(CrawlConfig::validate, 0))?;
7013
7127
 
7014
7128
  let class = module.define_class("BrowserExtras", ruby.class_object())?;
@@ -7535,6 +7649,14 @@ fn ruby_init(ruby: &Ruby) -> Result<(), Error> {
7535
7649
 
7536
7650
  class.define_method("failed_count", method!(BatchCrawlResults::failed_count, 0))?;
7537
7651
 
7652
+ let class = module.define_class("SsrfPolicy", ruby.class_object())?;
7653
+
7654
+ class.define_singleton_method("new", function!(SsrfPolicy::new, -1))?;
7655
+
7656
+ class.define_method("deny_private", method!(SsrfPolicy::deny_private, 0))?;
7657
+
7658
+ class.define_method("max_redirects", method!(SsrfPolicy::max_redirects, 0))?;
7659
+
7538
7660
  let class = module.define_class("CrawlStreamIterator", ruby.class_object())?;
7539
7661
  class.define_method("next_chunk", method!(CrawlStreamIterator::next_chunk, 0))?;
7540
7662
  class.define_method("each", method!(CrawlStreamIterator::each, 0))?;
@@ -1,5 +1,5 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:f2acf0fe68bedde7da1909fa6feedaffb995d5aec49220ec23bf64125a392d19
2
+ # alef:hash:8896b7e192e408f916c81acf04c97fc5ee10b269fe22c287e9eefab02fada615
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
  # frozen_string_literal: true
@@ -106,10 +106,7 @@ module Kreuzcrawl
106
106
  # Not available on `wasm32` targets — streaming requires native concurrency
107
107
  # primitives (tokio channels, `JoinSet`) that are not supported on wasm32.
108
108
  #
109
- # Delivered to bindings via alef's streaming-adapter pattern. The
110
- # `crawl_stream` / `batch_crawl_stream` binding wrappers in `bindings.rs`
111
- # expose this as the per-language streaming idiom (Python `AsyncIterator`,
112
- # Ruby `Enumerator`, PHP `Generator`, Elixir `Stream.unfold`, etc.).
109
+ # Delivered to bindings through each target's native streaming idiom.
113
110
  module CrawlEvent
114
111
  extend T::Helpers
115
112
  extend T::Sig
@@ -1,10 +1,10 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:f2acf0fe68bedde7da1909fa6feedaffb995d5aec49220ec23bf64125a392d19
2
+ # alef:hash:8896b7e192e408f916c81acf04c97fc5ee10b269fe22c287e9eefab02fada615
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
  # frozen_string_literal: true
6
6
 
7
7
  module Kreuzcrawl
8
8
  ## The version string for this package.
9
- VERSION = "0.3.0.pre.rc.71"
9
+ VERSION = "0.3.0.pre.rc.76"
10
10
  end
data/lib/kreuzcrawl.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:f2acf0fe68bedde7da1909fa6feedaffb995d5aec49220ec23bf64125a392d19
2
+ # alef:hash:8896b7e192e408f916c81acf04c97fc5ee10b269fe22c287e9eefab02fada615
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
  # frozen_string_literal: true
data/lib/kreuzcrawl_rb.so CHANGED
Binary file
data/sig/types.rbs CHANGED
@@ -1,5 +1,5 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:f2acf0fe68bedde7da1909fa6feedaffb995d5aec49220ec23bf64125a392d19
2
+ # alef:hash:8896b7e192e408f916c81acf04c97fc5ee10b269fe22c287e9eefab02fada615
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
 
@@ -103,8 +103,9 @@ module Kreuzcrawl
103
103
  attr_accessor warc_output: String?
104
104
  attr_accessor browser_profile: String?
105
105
  attr_accessor save_browser_profile: bool?
106
+ attr_accessor ssrf: SsrfPolicy?
106
107
 
107
- def initialize: (?max_depth: Integer, ?max_pages: Integer, ?max_concurrent: Integer, ?respect_robots_txt: bool, ?soft_http_errors: bool, ?user_agent: String, ?stay_on_domain: bool, ?allow_subdomains: bool, ?include_paths: Array[String], ?exclude_paths: Array[String], ?custom_headers: Hash[String, String], ?request_timeout: Integer, ?rate_limit_ms: Integer, ?max_redirects: Integer, ?retry_count: Integer, ?retry_codes: Array[Integer], ?cookies_enabled: bool, ?auth: AuthConfig, ?max_body_size: Integer, ?remove_tags: Array[String], ?content: ContentConfig, ?map_limit: Integer, ?map_search: String, ?download_assets: bool, ?asset_types: Array[AssetCategory], ?max_asset_size: Integer, ?browser: BrowserConfig, ?proxy: ProxyConfig, ?user_agents: Array[String], ?capture_screenshot: bool, ?follow_document_urls: bool, ?document_url_depth: Integer, ?download_documents: bool, ?document_max_size: Integer, ?document_mime_types: Array[String], ?warc_output: String, ?browser_profile: String, ?save_browser_profile: bool) -> void
108
+ def initialize: (?max_depth: Integer, ?max_pages: Integer, ?max_concurrent: Integer, ?respect_robots_txt: bool, ?soft_http_errors: bool, ?user_agent: String, ?stay_on_domain: bool, ?allow_subdomains: bool, ?include_paths: Array[String], ?exclude_paths: Array[String], ?custom_headers: Hash[String, String], ?request_timeout: Integer, ?rate_limit_ms: Integer, ?max_redirects: Integer, ?retry_count: Integer, ?retry_codes: Array[Integer], ?cookies_enabled: bool, ?auth: AuthConfig, ?max_body_size: Integer, ?remove_tags: Array[String], ?content: ContentConfig, ?map_limit: Integer, ?map_search: String, ?download_assets: bool, ?asset_types: Array[AssetCategory], ?max_asset_size: Integer, ?browser: BrowserConfig, ?proxy: ProxyConfig, ?user_agents: Array[String], ?capture_screenshot: bool, ?follow_document_urls: bool, ?document_url_depth: Integer, ?download_documents: bool, ?document_max_size: Integer, ?document_mime_types: Array[String], ?warc_output: String, ?browser_profile: String, ?save_browser_profile: bool, ?ssrf: SsrfPolicy) -> void
108
109
  def validate: () -> void
109
110
  def self.default: () -> CrawlConfig
110
111
  end
@@ -460,6 +461,15 @@ module Kreuzcrawl
460
461
  def initialize: (?results: Array[BatchCrawlResult], ?total_count: Integer, ?completed_count: Integer, ?failed_count: Integer) -> void
461
462
  end
462
463
 
464
+ class SsrfPolicy
465
+ attr_accessor deny_private: bool?
466
+ attr_accessor max_redirects: Integer?
467
+
468
+ def initialize: (?deny_private: bool, ?max_redirects: Integer) -> void
469
+ def self.default: () -> SsrfPolicy
470
+ def self.from_env: () -> SsrfPolicy
471
+ end
472
+
463
473
  class BrowserMode
464
474
  type value = :auto | :always | :never | :stealth
465
475
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzcrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0.pre.rc.71
4
+ version: 0.3.0.pre.rc.76
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kreuzberg Team
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-06-16 00:00:00.000000000 Z
11
+ date: 2026-06-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys