kreuzcrawl 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 922ae77455c2e0e37df7ddb5a4084ad765187c947427092346d577e577d25be1
4
- data.tar.gz: e1e537da083dea650de05a86713fe0b93134751345b7bb9d47ee3760379774c4
3
+ metadata.gz: a2f4027aef59737e93add85d5da2398c4a95767fb41766b86822f50199331e32
4
+ data.tar.gz: 82f38cf86e988321ef9bc11f5a42a023df464e79d0d0ad646edeae564cabfec6
5
5
  SHA512:
6
- metadata.gz: 5499c3684018e4bb69febc2e3c939b9f388c2976636f6dce443866108d898759895e5a0567b8bf9fdfbadf1c35dda8f54cbf73ec2ddfb1f20c39dac7a5a0efd6
7
- data.tar.gz: c82c4450e90a1a47216f9240f181791a7fa37089c7d776709bcd0c031949a0043c9045c0919f04ac1f8672f46c9bcb811daed5f01b5e0bde07e8e2f87edcf020
6
+ metadata.gz: 1ebcebff06f45e809441c0cae1d9f14be48a79b6d1439b3421ebedd4808d9c71e6ffa86d51a6c1d64ba57a0bc2eeffa24a672a4af3db5659185ba5c8cca8443a
7
+ data.tar.gz: af29b39fdf985044e579d71064af9eb10dc348c81a82e7f27dc0650c4720ee1164fd8fe5529358cc9ba494b0fc296e9a425b17182794d001b3bcee526f737d35
@@ -1174,7 +1174,7 @@ dependencies = [
1174
1174
 
1175
1175
  [[package]]
1176
1176
  name = "kreuzcrawl"
1177
- version = "0.1.0-rc.5"
1177
+ version = "0.1.1"
1178
1178
  dependencies = [
1179
1179
  "ahash",
1180
1180
  "astral-tl",
@@ -1200,7 +1200,7 @@ dependencies = [
1200
1200
 
1201
1201
  [[package]]
1202
1202
  name = "kreuzcrawl-rb"
1203
- version = "0.1.0-rc.5"
1203
+ version = "0.1.1"
1204
1204
  dependencies = [
1205
1205
  "kreuzcrawl",
1206
1206
  "magnus",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzcrawl-rb"
3
- version = "0.1.1"
3
+ version = "0.1.2"
4
4
  edition = "2024"
5
5
  license = "Elastic-2.0"
6
6
 
@@ -2,5 +2,5 @@
2
2
  # frozen_string_literal: true
3
3
 
4
4
  module Kreuzcrawl
5
- VERSION = "0.1.1"
5
+ VERSION = "0.1.2"
6
6
  end
@@ -1,6 +1,19 @@
1
1
  // This file is auto-generated by alef. DO NOT EDIT.
2
2
  // Re-generate with: alef generate
3
3
  #![allow(dead_code)]
4
+ #![allow(
5
+ clippy::too_many_arguments,
6
+ clippy::let_unit_value,
7
+ clippy::needless_borrow,
8
+ clippy::map_identity,
9
+ clippy::just_underscores_and_digits,
10
+ clippy::unused_unit,
11
+ clippy::unnecessary_cast,
12
+ clippy::unwrap_or_default,
13
+ clippy::derivable_impls,
14
+ clippy::needless_borrows_for_generic_args,
15
+ clippy::unnecessary_fallible_conversions
16
+ )]
4
17
 
5
18
  use magnus::{Error, IntoValueFromNative, Ruby, function, method, prelude::*, try_convert::TryConvertOwned};
6
19
  use std::collections::HashMap;
@@ -243,6 +256,7 @@ pub struct CrawlConfig {
243
256
  pub exclude_paths: Vec<String>,
244
257
  pub custom_headers: HashMap<String, String>,
245
258
  pub request_timeout: u64,
259
+ pub rate_limit_ms: Option<u64>,
246
260
  pub max_redirects: usize,
247
261
  pub retry_count: usize,
248
262
  pub retry_codes: Vec<u16>,
@@ -322,6 +336,9 @@ impl CrawlConfig {
322
336
  .get(ruby.to_symbol("request_timeout"))
323
337
  .and_then(|v| u64::try_convert(v).ok())
324
338
  .unwrap_or(30000),
339
+ rate_limit_ms: kwargs
340
+ .get(ruby.to_symbol("rate_limit_ms"))
341
+ .and_then(|v| u64::try_convert(v).ok()),
325
342
  max_redirects: kwargs
326
343
  .get(ruby.to_symbol("max_redirects"))
327
344
  .and_then(|v| usize::try_convert(v).ok())
@@ -452,6 +469,10 @@ impl CrawlConfig {
452
469
  self.request_timeout.clone()
453
470
  }
454
471
 
472
+ fn rate_limit_ms(&self) -> Option<u64> {
473
+ self.rate_limit_ms
474
+ }
475
+
455
476
  fn max_redirects(&self) -> usize {
456
477
  self.max_redirects
457
478
  }
@@ -558,6 +579,7 @@ impl CrawlConfig {
558
579
  exclude_paths: self.exclude_paths.clone(),
559
580
  custom_headers: self.custom_headers.clone().into_iter().collect(),
560
581
  request_timeout: std::time::Duration::from_millis(self.request_timeout),
582
+ rate_limit_ms: self.rate_limit_ms,
561
583
  max_redirects: self.max_redirects,
562
584
  retry_count: self.retry_count,
563
585
  retry_codes: self.retry_codes.clone(),
@@ -680,140 +702,6 @@ impl DownloadedDocument {
680
702
  }
681
703
  }
682
704
 
683
- #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
684
- #[magnus::wrap(class = "Kreuzcrawl::InteractionResult")]
685
- #[serde(default)]
686
- pub struct InteractionResult {
687
- pub action_results: Vec<ActionResult>,
688
- pub final_html: String,
689
- pub final_url: String,
690
- pub screenshot: Option<Vec<u8>>,
691
- }
692
-
693
- unsafe impl IntoValueFromNative for InteractionResult {}
694
-
695
- impl magnus::TryConvert for InteractionResult {
696
- fn try_convert(val: magnus::Value) -> Result<Self, magnus::Error> {
697
- let r: &InteractionResult = magnus::TryConvert::try_convert(val)?;
698
- Ok(r.clone())
699
- }
700
- }
701
- unsafe impl TryConvertOwned for InteractionResult {}
702
-
703
- impl Default for InteractionResult {
704
- fn default() -> Self {
705
- Self {
706
- action_results: Default::default(),
707
- final_html: Default::default(),
708
- final_url: Default::default(),
709
- screenshot: Default::default(),
710
- }
711
- }
712
- }
713
-
714
- impl InteractionResult {
715
- fn new(
716
- action_results: Option<Vec<ActionResult>>,
717
- final_html: Option<String>,
718
- final_url: Option<String>,
719
- screenshot: Option<Vec<u8>>,
720
- ) -> Self {
721
- Self {
722
- action_results: action_results.unwrap_or_default(),
723
- final_html: final_html.unwrap_or_default(),
724
- final_url: final_url.unwrap_or_default(),
725
- screenshot,
726
- }
727
- }
728
-
729
- fn action_results(&self) -> Vec<ActionResult> {
730
- self.action_results.clone()
731
- }
732
-
733
- fn final_html(&self) -> String {
734
- self.final_html.clone()
735
- }
736
-
737
- fn final_url(&self) -> String {
738
- self.final_url.clone()
739
- }
740
-
741
- fn screenshot(&self) -> Option<Vec<u8>> {
742
- self.screenshot.clone()
743
- }
744
- }
745
-
746
- #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
747
- #[magnus::wrap(class = "Kreuzcrawl::ActionResult")]
748
- #[serde(default)]
749
- pub struct ActionResult {
750
- pub action_index: usize,
751
- pub action_type: String,
752
- pub success: bool,
753
- pub data: Option<String>,
754
- pub error: Option<String>,
755
- }
756
-
757
- unsafe impl IntoValueFromNative for ActionResult {}
758
-
759
- impl magnus::TryConvert for ActionResult {
760
- fn try_convert(val: magnus::Value) -> Result<Self, magnus::Error> {
761
- let r: &ActionResult = magnus::TryConvert::try_convert(val)?;
762
- Ok(r.clone())
763
- }
764
- }
765
- unsafe impl TryConvertOwned for ActionResult {}
766
-
767
- impl Default for ActionResult {
768
- fn default() -> Self {
769
- Self {
770
- action_index: Default::default(),
771
- action_type: Default::default(),
772
- success: Default::default(),
773
- data: Default::default(),
774
- error: Default::default(),
775
- }
776
- }
777
- }
778
-
779
- impl ActionResult {
780
- fn new(
781
- action_index: Option<usize>,
782
- action_type: Option<String>,
783
- success: Option<bool>,
784
- data: Option<String>,
785
- error: Option<String>,
786
- ) -> Self {
787
- Self {
788
- action_index: action_index.unwrap_or_default(),
789
- action_type: action_type.unwrap_or_default(),
790
- success: success.unwrap_or_default(),
791
- data,
792
- error,
793
- }
794
- }
795
-
796
- fn action_index(&self) -> usize {
797
- self.action_index
798
- }
799
-
800
- fn action_type(&self) -> String {
801
- self.action_type.clone()
802
- }
803
-
804
- fn success(&self) -> bool {
805
- self.success
806
- }
807
-
808
- fn data(&self) -> Option<String> {
809
- self.data.clone()
810
- }
811
-
812
- fn error(&self) -> Option<String> {
813
- self.error.clone()
814
- }
815
- }
816
-
817
705
  #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
818
706
  #[magnus::wrap(class = "Kreuzcrawl::ScrapeResult")]
819
707
  #[serde(default)]
@@ -1616,93 +1504,6 @@ impl MarkdownResult {
1616
1504
  }
1617
1505
  }
1618
1506
 
1619
- #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
1620
- #[magnus::wrap(class = "Kreuzcrawl::CachedPage")]
1621
- #[serde(default)]
1622
- pub struct CachedPage {
1623
- pub url: String,
1624
- pub status_code: u16,
1625
- pub content_type: String,
1626
- pub body: String,
1627
- pub etag: Option<String>,
1628
- pub last_modified: Option<String>,
1629
- pub cached_at: u64,
1630
- }
1631
-
1632
- unsafe impl IntoValueFromNative for CachedPage {}
1633
-
1634
- impl magnus::TryConvert for CachedPage {
1635
- fn try_convert(val: magnus::Value) -> Result<Self, magnus::Error> {
1636
- let r: &CachedPage = magnus::TryConvert::try_convert(val)?;
1637
- Ok(r.clone())
1638
- }
1639
- }
1640
- unsafe impl TryConvertOwned for CachedPage {}
1641
-
1642
- impl Default for CachedPage {
1643
- fn default() -> Self {
1644
- Self {
1645
- url: Default::default(),
1646
- status_code: Default::default(),
1647
- content_type: Default::default(),
1648
- body: Default::default(),
1649
- etag: Default::default(),
1650
- last_modified: Default::default(),
1651
- cached_at: Default::default(),
1652
- }
1653
- }
1654
- }
1655
-
1656
- impl CachedPage {
1657
- fn new(
1658
- url: Option<String>,
1659
- status_code: Option<u16>,
1660
- content_type: Option<String>,
1661
- body: Option<String>,
1662
- etag: Option<String>,
1663
- last_modified: Option<String>,
1664
- cached_at: Option<u64>,
1665
- ) -> Self {
1666
- Self {
1667
- url: url.unwrap_or_default(),
1668
- status_code: status_code.unwrap_or_default(),
1669
- content_type: content_type.unwrap_or_default(),
1670
- body: body.unwrap_or_default(),
1671
- etag,
1672
- last_modified,
1673
- cached_at: cached_at.unwrap_or_default(),
1674
- }
1675
- }
1676
-
1677
- fn url(&self) -> String {
1678
- self.url.clone()
1679
- }
1680
-
1681
- fn status_code(&self) -> u16 {
1682
- self.status_code
1683
- }
1684
-
1685
- fn content_type(&self) -> String {
1686
- self.content_type.clone()
1687
- }
1688
-
1689
- fn body(&self) -> String {
1690
- self.body.clone()
1691
- }
1692
-
1693
- fn etag(&self) -> Option<String> {
1694
- self.etag.clone()
1695
- }
1696
-
1697
- fn last_modified(&self) -> Option<String> {
1698
- self.last_modified.clone()
1699
- }
1700
-
1701
- fn cached_at(&self) -> u64 {
1702
- self.cached_at
1703
- }
1704
- }
1705
-
1706
1507
  #[derive(Clone, Debug, Default, serde::Serialize, serde::Deserialize)]
1707
1508
  #[magnus::wrap(class = "Kreuzcrawl::LinkInfo")]
1708
1509
  #[serde(default)]
@@ -3295,39 +3096,6 @@ impl magnus::TryConvert for AssetCategory {
3295
3096
  unsafe impl IntoValueFromNative for AssetCategory {}
3296
3097
  unsafe impl TryConvertOwned for AssetCategory {}
3297
3098
 
3298
- #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
3299
- pub enum CrawlEvent {
3300
- Page { _0: CrawlPageResult },
3301
- Error { url: String, error: String },
3302
- Complete { pages_crawled: usize },
3303
- }
3304
-
3305
- impl Default for CrawlEvent {
3306
- fn default() -> Self {
3307
- Self::Page { _0: Default::default() }
3308
- }
3309
- }
3310
-
3311
- impl magnus::IntoValue for CrawlEvent {
3312
- fn into_value_with(self, handle: &Ruby) -> magnus::Value {
3313
- match serde_json::to_value(&self) {
3314
- Ok(v) => json_to_ruby(handle, v),
3315
- Err(_) => handle.qnil().into_value_with(handle),
3316
- }
3317
- }
3318
- }
3319
-
3320
- impl magnus::TryConvert for CrawlEvent {
3321
- fn try_convert(val: magnus::Value) -> Result<Self, magnus::Error> {
3322
- let s: String = magnus::TryConvert::try_convert(val)?;
3323
- serde_json::from_str(&s)
3324
- .map_err(|e| magnus::Error::new(unsafe { Ruby::get_unchecked() }.exception_type_error(), e.to_string()))
3325
- }
3326
- }
3327
-
3328
- unsafe impl IntoValueFromNative for CrawlEvent {}
3329
- unsafe impl TryConvertOwned for CrawlEvent {}
3330
-
3331
3099
  fn create_engine(config: Option<String>) -> Result<CrawlEngineHandle, Error> {
3332
3100
  let config: Option<CrawlConfig> = config
3333
3101
  .as_deref()
@@ -3587,6 +3355,7 @@ impl From<CrawlConfig> for kreuzcrawl::CrawlConfig {
3587
3355
  exclude_paths: val.exclude_paths,
3588
3356
  custom_headers: val.custom_headers.into_iter().collect(),
3589
3357
  request_timeout: std::time::Duration::from_millis(val.request_timeout),
3358
+ rate_limit_ms: val.rate_limit_ms,
3590
3359
  max_redirects: val.max_redirects,
3591
3360
  retry_count: val.retry_count,
3592
3361
  retry_codes: val.retry_codes,
@@ -3629,6 +3398,7 @@ impl From<kreuzcrawl::CrawlConfig> for CrawlConfig {
3629
3398
  exclude_paths: val.exclude_paths,
3630
3399
  custom_headers: val.custom_headers.into_iter().collect(),
3631
3400
  request_timeout: val.request_timeout.as_millis() as u64,
3401
+ rate_limit_ms: val.rate_limit_ms,
3632
3402
  max_redirects: val.max_redirects,
3633
3403
  retry_count: val.retry_count,
3634
3404
  retry_codes: val.retry_codes,
@@ -3677,40 +3447,17 @@ impl From<kreuzcrawl::DownloadedDocument> for DownloadedDocument {
3677
3447
  mime_type: format!("{:?}", val.mime_type),
3678
3448
  content: val.content.to_vec(),
3679
3449
  size: val.size,
3680
- filename: val.filename.as_ref().map(|v| format!("{:?}", v)),
3450
+ filename: val.filename.as_ref().map(|v| format!("{v:?}")),
3681
3451
  content_hash: format!("{:?}", val.content_hash),
3682
3452
  headers: val
3683
3453
  .headers
3684
3454
  .into_iter()
3685
- .map(|(k, v)| (format!("{:?}", k), format!("{:?}", v)))
3455
+ .map(|(k, v)| (k.to_string(), v.to_string()))
3686
3456
  .collect(),
3687
3457
  }
3688
3458
  }
3689
3459
  }
3690
3460
 
3691
- impl From<kreuzcrawl::InteractionResult> for InteractionResult {
3692
- fn from(val: kreuzcrawl::InteractionResult) -> Self {
3693
- Self {
3694
- action_results: val.action_results.into_iter().map(Into::into).collect(),
3695
- final_html: val.final_html,
3696
- final_url: val.final_url,
3697
- screenshot: val.screenshot.map(|v| v.to_vec()),
3698
- }
3699
- }
3700
- }
3701
-
3702
- impl From<kreuzcrawl::ActionResult> for ActionResult {
3703
- fn from(val: kreuzcrawl::ActionResult) -> Self {
3704
- Self {
3705
- action_index: val.action_index,
3706
- action_type: format!("{:?}", val.action_type),
3707
- success: val.success,
3708
- data: val.data.as_ref().map(ToString::to_string),
3709
- error: val.error,
3710
- }
3711
- }
3712
- }
3713
-
3714
3461
  impl From<ScrapeResult> for kreuzcrawl::ScrapeResult {
3715
3462
  fn from(val: ScrapeResult) -> Self {
3716
3463
  Self {
@@ -3934,20 +3681,6 @@ impl From<kreuzcrawl::MarkdownResult> for MarkdownResult {
3934
3681
  }
3935
3682
  }
3936
3683
 
3937
- impl From<kreuzcrawl::CachedPage> for CachedPage {
3938
- fn from(val: kreuzcrawl::CachedPage) -> Self {
3939
- Self {
3940
- url: val.url,
3941
- status_code: val.status_code,
3942
- content_type: val.content_type,
3943
- body: val.body,
3944
- etag: val.etag,
3945
- last_modified: val.last_modified,
3946
- cached_at: val.cached_at,
3947
- }
3948
- }
3949
- }
3950
-
3951
3684
  impl From<LinkInfo> for kreuzcrawl::LinkInfo {
3952
3685
  fn from(val: LinkInfo) -> Self {
3953
3686
  Self {
@@ -4530,16 +4263,6 @@ impl From<kreuzcrawl::AssetCategory> for AssetCategory {
4530
4263
  }
4531
4264
  }
4532
4265
 
4533
- impl From<kreuzcrawl::CrawlEvent> for CrawlEvent {
4534
- fn from(val: kreuzcrawl::CrawlEvent) -> Self {
4535
- match val {
4536
- kreuzcrawl::CrawlEvent::Page(_0) => Self::Page { _0: (*_0).into() },
4537
- kreuzcrawl::CrawlEvent::Error { url, error } => Self::Error { url, error },
4538
- kreuzcrawl::CrawlEvent::Complete { pages_crawled } => Self::Complete { pages_crawled },
4539
- }
4540
- }
4541
- }
4542
-
4543
4266
  /// Convert a `kreuzcrawl::CrawlError` error to a Magnus runtime error.
4544
4267
  #[allow(dead_code)]
4545
4268
  fn crawl_error_to_magnus_err(e: kreuzcrawl::CrawlError) -> magnus::Error {
@@ -4587,6 +4310,7 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
4587
4310
  class.define_method("exclude_paths", method!(CrawlConfig::exclude_paths, 0))?;
4588
4311
  class.define_method("custom_headers", method!(CrawlConfig::custom_headers, 0))?;
4589
4312
  class.define_method("request_timeout", method!(CrawlConfig::request_timeout, 0))?;
4313
+ class.define_method("rate_limit_ms", method!(CrawlConfig::rate_limit_ms, 0))?;
4590
4314
  class.define_method("max_redirects", method!(CrawlConfig::max_redirects, 0))?;
4591
4315
  class.define_method("retry_count", method!(CrawlConfig::retry_count, 0))?;
4592
4316
  class.define_method("retry_codes", method!(CrawlConfig::retry_codes, 0))?;
@@ -4622,21 +4346,6 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
4622
4346
  class.define_method("content_hash", method!(DownloadedDocument::content_hash, 0))?;
4623
4347
  class.define_method("headers", method!(DownloadedDocument::headers, 0))?;
4624
4348
 
4625
- let class = module.define_class("InteractionResult", ruby.class_object())?;
4626
- class.define_singleton_method("new", function!(InteractionResult::new, 4))?;
4627
- class.define_method("action_results", method!(InteractionResult::action_results, 0))?;
4628
- class.define_method("final_html", method!(InteractionResult::final_html, 0))?;
4629
- class.define_method("final_url", method!(InteractionResult::final_url, 0))?;
4630
- class.define_method("screenshot", method!(InteractionResult::screenshot, 0))?;
4631
-
4632
- let class = module.define_class("ActionResult", ruby.class_object())?;
4633
- class.define_singleton_method("new", function!(ActionResult::new, 5))?;
4634
- class.define_method("action_index", method!(ActionResult::action_index, 0))?;
4635
- class.define_method("action_type", method!(ActionResult::action_type, 0))?;
4636
- class.define_method("success", method!(ActionResult::success, 0))?;
4637
- class.define_method("data", method!(ActionResult::data, 0))?;
4638
- class.define_method("error", method!(ActionResult::error, 0))?;
4639
-
4640
4349
  let class = module.define_class("ScrapeResult", ruby.class_object())?;
4641
4350
  class.define_singleton_method("new", function!(ScrapeResult::new, 1))?;
4642
4351
  class.define_method("status_code", method!(ScrapeResult::status_code, 0))?;
@@ -4725,16 +4434,6 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
4725
4434
  class.define_method("citations", method!(MarkdownResult::citations, 0))?;
4726
4435
  class.define_method("fit_content", method!(MarkdownResult::fit_content, 0))?;
4727
4436
 
4728
- let class = module.define_class("CachedPage", ruby.class_object())?;
4729
- class.define_singleton_method("new", function!(CachedPage::new, 7))?;
4730
- class.define_method("url", method!(CachedPage::url, 0))?;
4731
- class.define_method("status_code", method!(CachedPage::status_code, 0))?;
4732
- class.define_method("content_type", method!(CachedPage::content_type, 0))?;
4733
- class.define_method("body", method!(CachedPage::body, 0))?;
4734
- class.define_method("etag", method!(CachedPage::etag, 0))?;
4735
- class.define_method("last_modified", method!(CachedPage::last_modified, 0))?;
4736
- class.define_method("cached_at", method!(CachedPage::cached_at, 0))?;
4737
-
4738
4437
  let class = module.define_class("LinkInfo", ruby.class_object())?;
4739
4438
  class.define_singleton_method("new", function!(LinkInfo::new, 5))?;
4740
4439
  class.define_method("url", method!(LinkInfo::url, 0))?;
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzcrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kreuzberg Team