kreuzcrawl 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 922ae77455c2e0e37df7ddb5a4084ad765187c947427092346d577e577d25be1
4
- data.tar.gz: e1e537da083dea650de05a86713fe0b93134751345b7bb9d47ee3760379774c4
3
+ metadata.gz: 92e6cb8be4ef5d12847d6d632465cf41b18ce6564652049052b8793162204255
4
+ data.tar.gz: 9bd7b541b57c35b0ddb1cec5d6b119463ca1622db1f57455b340f38b4cf3abd3
5
5
  SHA512:
6
- metadata.gz: 5499c3684018e4bb69febc2e3c939b9f388c2976636f6dce443866108d898759895e5a0567b8bf9fdfbadf1c35dda8f54cbf73ec2ddfb1f20c39dac7a5a0efd6
7
- data.tar.gz: c82c4450e90a1a47216f9240f181791a7fa37089c7d776709bcd0c031949a0043c9045c0919f04ac1f8672f46c9bcb811daed5f01b5e0bde07e8e2f87edcf020
6
+ metadata.gz: 78ae3990a646e6f0a9ecfdb8355d4bfb11f476d3c0dc498651e5ebeb08f28dd43dd848178c43ee4f5788517e0014b3b20f8827990abe9e8e3377cb43c27be773
7
+ data.tar.gz: f43d72f320847d7bf872dc7d59d8cf816bf1c2faf05528998773dbd55ab414dd35380e190f7a0f64eb0e60eae4866c22263eceb0ba05da02a47ad5e98aecbeeb
@@ -1174,7 +1174,7 @@ dependencies = [
1174
1174
 
1175
1175
  [[package]]
1176
1176
  name = "kreuzcrawl"
1177
- version = "0.1.0-rc.5"
1177
+ version = "0.1.1"
1178
1178
  dependencies = [
1179
1179
  "ahash",
1180
1180
  "astral-tl",
@@ -1200,7 +1200,7 @@ dependencies = [
1200
1200
 
1201
1201
  [[package]]
1202
1202
  name = "kreuzcrawl-rb"
1203
- version = "0.1.0-rc.5"
1203
+ version = "0.1.1"
1204
1204
  dependencies = [
1205
1205
  "kreuzcrawl",
1206
1206
  "magnus",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzcrawl-rb"
3
- version = "0.1.1"
3
+ version = "0.2.0"
4
4
  edition = "2024"
5
5
  license = "Elastic-2.0"
6
6
 
@@ -2,5 +2,5 @@
2
2
  # frozen_string_literal: true
3
3
 
4
4
  module Kreuzcrawl
5
- VERSION = "0.1.1"
5
+ VERSION = "0.2.0"
6
6
  end
@@ -1,6 +1,19 @@
1
1
  // This file is auto-generated by alef. DO NOT EDIT.
2
2
  // Re-generate with: alef generate
3
3
  #![allow(dead_code)]
4
+ #![allow(
5
+ clippy::too_many_arguments,
6
+ clippy::let_unit_value,
7
+ clippy::needless_borrow,
8
+ clippy::map_identity,
9
+ clippy::just_underscores_and_digits,
10
+ clippy::unused_unit,
11
+ clippy::unnecessary_cast,
12
+ clippy::unwrap_or_default,
13
+ clippy::derivable_impls,
14
+ clippy::needless_borrows_for_generic_args,
15
+ clippy::unnecessary_fallible_conversions
16
+ )]
4
17
 
5
18
  use magnus::{Error, IntoValueFromNative, Ruby, function, method, prelude::*, try_convert::TryConvertOwned};
6
19
  use std::collections::HashMap;
@@ -162,6 +175,133 @@ impl ProxyConfig {
162
175
  }
163
176
  }
164
177
 
178
+ #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
179
+ #[magnus::wrap(class = "Kreuzcrawl::ContentConfig")]
180
+ #[serde(default)]
181
+ pub struct ContentConfig {
182
+ pub output_format: String,
183
+ pub preprocessing_preset: String,
184
+ pub remove_navigation: bool,
185
+ pub remove_forms: bool,
186
+ pub strip_tags: Vec<String>,
187
+ pub preserve_tags: Vec<String>,
188
+ pub exclude_selectors: Vec<String>,
189
+ pub skip_images: bool,
190
+ pub max_depth: Option<usize>,
191
+ pub wrap: bool,
192
+ pub wrap_width: usize,
193
+ pub include_document_structure: bool,
194
+ }
195
+
196
+ unsafe impl IntoValueFromNative for ContentConfig {}
197
+
198
+ impl magnus::TryConvert for ContentConfig {
199
+ fn try_convert(val: magnus::Value) -> Result<Self, magnus::Error> {
200
+ let r: &ContentConfig = magnus::TryConvert::try_convert(val)?;
201
+ Ok(r.clone())
202
+ }
203
+ }
204
+ unsafe impl TryConvertOwned for ContentConfig {}
205
+
206
+ impl Default for ContentConfig {
207
+ fn default() -> Self {
208
+ Self {
209
+ output_format: Default::default(),
210
+ preprocessing_preset: Default::default(),
211
+ remove_navigation: Default::default(),
212
+ remove_forms: Default::default(),
213
+ strip_tags: Default::default(),
214
+ preserve_tags: Default::default(),
215
+ exclude_selectors: Default::default(),
216
+ skip_images: Default::default(),
217
+ max_depth: Default::default(),
218
+ wrap: Default::default(),
219
+ wrap_width: Default::default(),
220
+ include_document_structure: Default::default(),
221
+ }
222
+ }
223
+ }
224
+
225
+ impl ContentConfig {
226
+ fn new(
227
+ output_format: Option<String>,
228
+ preprocessing_preset: Option<String>,
229
+ remove_navigation: Option<bool>,
230
+ remove_forms: Option<bool>,
231
+ strip_tags: Option<Vec<String>>,
232
+ preserve_tags: Option<Vec<String>>,
233
+ exclude_selectors: Option<Vec<String>>,
234
+ skip_images: Option<bool>,
235
+ max_depth: Option<usize>,
236
+ wrap: Option<bool>,
237
+ wrap_width: Option<usize>,
238
+ include_document_structure: Option<bool>,
239
+ ) -> Self {
240
+ Self {
241
+ output_format: output_format.unwrap_or("markdown".to_string()),
242
+ preprocessing_preset: preprocessing_preset.unwrap_or("standard".to_string()),
243
+ remove_navigation: remove_navigation.unwrap_or(true),
244
+ remove_forms: remove_forms.unwrap_or(true),
245
+ strip_tags: strip_tags.unwrap_or_default(),
246
+ preserve_tags: preserve_tags.unwrap_or_default(),
247
+ exclude_selectors: exclude_selectors.unwrap_or_default(),
248
+ skip_images: skip_images.unwrap_or(false),
249
+ max_depth,
250
+ wrap: wrap.unwrap_or(false),
251
+ wrap_width: wrap_width.unwrap_or(80),
252
+ include_document_structure: include_document_structure.unwrap_or(true),
253
+ }
254
+ }
255
+
256
+ fn output_format(&self) -> String {
257
+ self.output_format.clone()
258
+ }
259
+
260
+ fn preprocessing_preset(&self) -> String {
261
+ self.preprocessing_preset.clone()
262
+ }
263
+
264
+ fn remove_navigation(&self) -> bool {
265
+ self.remove_navigation
266
+ }
267
+
268
+ fn remove_forms(&self) -> bool {
269
+ self.remove_forms
270
+ }
271
+
272
+ fn strip_tags(&self) -> Vec<String> {
273
+ self.strip_tags.clone()
274
+ }
275
+
276
+ fn preserve_tags(&self) -> Vec<String> {
277
+ self.preserve_tags.clone()
278
+ }
279
+
280
+ fn exclude_selectors(&self) -> Vec<String> {
281
+ self.exclude_selectors.clone()
282
+ }
283
+
284
+ fn skip_images(&self) -> bool {
285
+ self.skip_images
286
+ }
287
+
288
+ fn max_depth(&self) -> Option<usize> {
289
+ self.max_depth
290
+ }
291
+
292
+ fn wrap(&self) -> bool {
293
+ self.wrap
294
+ }
295
+
296
+ fn wrap_width(&self) -> usize {
297
+ self.wrap_width
298
+ }
299
+
300
+ fn include_document_structure(&self) -> bool {
301
+ self.include_document_structure
302
+ }
303
+ }
304
+
165
305
  #[derive(Clone, Debug, Default, serde::Serialize, serde::Deserialize)]
166
306
  #[magnus::wrap(class = "Kreuzcrawl::BrowserConfig")]
167
307
  #[serde(default)]
@@ -243,14 +383,15 @@ pub struct CrawlConfig {
243
383
  pub exclude_paths: Vec<String>,
244
384
  pub custom_headers: HashMap<String, String>,
245
385
  pub request_timeout: u64,
386
+ pub rate_limit_ms: Option<u64>,
246
387
  pub max_redirects: usize,
247
388
  pub retry_count: usize,
248
389
  pub retry_codes: Vec<u16>,
249
390
  pub cookies_enabled: bool,
250
391
  pub auth: Option<AuthConfig>,
251
392
  pub max_body_size: Option<usize>,
252
- pub main_content_only: bool,
253
393
  pub remove_tags: Vec<String>,
394
+ pub content: ContentConfig,
254
395
  pub map_limit: Option<usize>,
255
396
  pub map_search: Option<String>,
256
397
  pub download_assets: bool,
@@ -322,6 +463,9 @@ impl CrawlConfig {
322
463
  .get(ruby.to_symbol("request_timeout"))
323
464
  .and_then(|v| u64::try_convert(v).ok())
324
465
  .unwrap_or(30000),
466
+ rate_limit_ms: kwargs
467
+ .get(ruby.to_symbol("rate_limit_ms"))
468
+ .and_then(|v| u64::try_convert(v).ok()),
325
469
  max_redirects: kwargs
326
470
  .get(ruby.to_symbol("max_redirects"))
327
471
  .and_then(|v| usize::try_convert(v).ok())
@@ -344,14 +488,14 @@ impl CrawlConfig {
344
488
  max_body_size: kwargs
345
489
  .get(ruby.to_symbol("max_body_size"))
346
490
  .and_then(|v| usize::try_convert(v).ok()),
347
- main_content_only: kwargs
348
- .get(ruby.to_symbol("main_content_only"))
349
- .and_then(|v| bool::try_convert(v).ok())
350
- .unwrap_or(false),
351
491
  remove_tags: kwargs
352
492
  .get(ruby.to_symbol("remove_tags"))
353
493
  .and_then(|v| <Vec<String>>::try_convert(v).ok())
354
494
  .unwrap_or_default(),
495
+ content: kwargs
496
+ .get(ruby.to_symbol("content"))
497
+ .and_then(|v| ContentConfig::try_convert(v).ok())
498
+ .unwrap_or_default(),
355
499
  map_limit: kwargs
356
500
  .get(ruby.to_symbol("map_limit"))
357
501
  .and_then(|v| usize::try_convert(v).ok()),
@@ -452,6 +596,10 @@ impl CrawlConfig {
452
596
  self.request_timeout.clone()
453
597
  }
454
598
 
599
+ fn rate_limit_ms(&self) -> Option<u64> {
600
+ self.rate_limit_ms
601
+ }
602
+
455
603
  fn max_redirects(&self) -> usize {
456
604
  self.max_redirects
457
605
  }
@@ -476,14 +624,14 @@ impl CrawlConfig {
476
624
  self.max_body_size
477
625
  }
478
626
 
479
- fn main_content_only(&self) -> bool {
480
- self.main_content_only
481
- }
482
-
483
627
  fn remove_tags(&self) -> Vec<String> {
484
628
  self.remove_tags.clone()
485
629
  }
486
630
 
631
+ fn content(&self) -> ContentConfig {
632
+ self.content.clone()
633
+ }
634
+
487
635
  fn map_limit(&self) -> Option<usize> {
488
636
  self.map_limit
489
637
  }
@@ -558,14 +706,15 @@ impl CrawlConfig {
558
706
  exclude_paths: self.exclude_paths.clone(),
559
707
  custom_headers: self.custom_headers.clone().into_iter().collect(),
560
708
  request_timeout: std::time::Duration::from_millis(self.request_timeout),
709
+ rate_limit_ms: self.rate_limit_ms,
561
710
  max_redirects: self.max_redirects,
562
711
  retry_count: self.retry_count,
563
712
  retry_codes: self.retry_codes.clone(),
564
713
  cookies_enabled: self.cookies_enabled,
565
714
  auth: self.auth.clone().map(Into::into),
566
715
  max_body_size: self.max_body_size,
567
- main_content_only: self.main_content_only,
568
716
  remove_tags: self.remove_tags.clone(),
717
+ content: self.content.clone().into(),
569
718
  map_limit: self.map_limit,
570
719
  map_search: self.map_search.clone(),
571
720
  download_assets: self.download_assets,
@@ -680,140 +829,6 @@ impl DownloadedDocument {
680
829
  }
681
830
  }
682
831
 
683
- #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
684
- #[magnus::wrap(class = "Kreuzcrawl::InteractionResult")]
685
- #[serde(default)]
686
- pub struct InteractionResult {
687
- pub action_results: Vec<ActionResult>,
688
- pub final_html: String,
689
- pub final_url: String,
690
- pub screenshot: Option<Vec<u8>>,
691
- }
692
-
693
- unsafe impl IntoValueFromNative for InteractionResult {}
694
-
695
- impl magnus::TryConvert for InteractionResult {
696
- fn try_convert(val: magnus::Value) -> Result<Self, magnus::Error> {
697
- let r: &InteractionResult = magnus::TryConvert::try_convert(val)?;
698
- Ok(r.clone())
699
- }
700
- }
701
- unsafe impl TryConvertOwned for InteractionResult {}
702
-
703
- impl Default for InteractionResult {
704
- fn default() -> Self {
705
- Self {
706
- action_results: Default::default(),
707
- final_html: Default::default(),
708
- final_url: Default::default(),
709
- screenshot: Default::default(),
710
- }
711
- }
712
- }
713
-
714
- impl InteractionResult {
715
- fn new(
716
- action_results: Option<Vec<ActionResult>>,
717
- final_html: Option<String>,
718
- final_url: Option<String>,
719
- screenshot: Option<Vec<u8>>,
720
- ) -> Self {
721
- Self {
722
- action_results: action_results.unwrap_or_default(),
723
- final_html: final_html.unwrap_or_default(),
724
- final_url: final_url.unwrap_or_default(),
725
- screenshot,
726
- }
727
- }
728
-
729
- fn action_results(&self) -> Vec<ActionResult> {
730
- self.action_results.clone()
731
- }
732
-
733
- fn final_html(&self) -> String {
734
- self.final_html.clone()
735
- }
736
-
737
- fn final_url(&self) -> String {
738
- self.final_url.clone()
739
- }
740
-
741
- fn screenshot(&self) -> Option<Vec<u8>> {
742
- self.screenshot.clone()
743
- }
744
- }
745
-
746
- #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
747
- #[magnus::wrap(class = "Kreuzcrawl::ActionResult")]
748
- #[serde(default)]
749
- pub struct ActionResult {
750
- pub action_index: usize,
751
- pub action_type: String,
752
- pub success: bool,
753
- pub data: Option<String>,
754
- pub error: Option<String>,
755
- }
756
-
757
- unsafe impl IntoValueFromNative for ActionResult {}
758
-
759
- impl magnus::TryConvert for ActionResult {
760
- fn try_convert(val: magnus::Value) -> Result<Self, magnus::Error> {
761
- let r: &ActionResult = magnus::TryConvert::try_convert(val)?;
762
- Ok(r.clone())
763
- }
764
- }
765
- unsafe impl TryConvertOwned for ActionResult {}
766
-
767
- impl Default for ActionResult {
768
- fn default() -> Self {
769
- Self {
770
- action_index: Default::default(),
771
- action_type: Default::default(),
772
- success: Default::default(),
773
- data: Default::default(),
774
- error: Default::default(),
775
- }
776
- }
777
- }
778
-
779
- impl ActionResult {
780
- fn new(
781
- action_index: Option<usize>,
782
- action_type: Option<String>,
783
- success: Option<bool>,
784
- data: Option<String>,
785
- error: Option<String>,
786
- ) -> Self {
787
- Self {
788
- action_index: action_index.unwrap_or_default(),
789
- action_type: action_type.unwrap_or_default(),
790
- success: success.unwrap_or_default(),
791
- data,
792
- error,
793
- }
794
- }
795
-
796
- fn action_index(&self) -> usize {
797
- self.action_index
798
- }
799
-
800
- fn action_type(&self) -> String {
801
- self.action_type.clone()
802
- }
803
-
804
- fn success(&self) -> bool {
805
- self.success
806
- }
807
-
808
- fn data(&self) -> Option<String> {
809
- self.data.clone()
810
- }
811
-
812
- fn error(&self) -> Option<String> {
813
- self.error.clone()
814
- }
815
- }
816
-
817
832
  #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
818
833
  #[magnus::wrap(class = "Kreuzcrawl::ScrapeResult")]
819
834
  #[serde(default)]
@@ -835,7 +850,6 @@ pub struct ScrapeResult {
835
850
  pub is_pdf: bool,
836
851
  pub was_skipped: bool,
837
852
  pub detected_charset: Option<String>,
838
- pub main_content_only: bool,
839
853
  pub auth_header_sent: bool,
840
854
  pub response_meta: Option<ResponseMeta>,
841
855
  pub assets: Vec<DownloadedAsset>,
@@ -878,7 +892,6 @@ impl Default for ScrapeResult {
878
892
  is_pdf: Default::default(),
879
893
  was_skipped: Default::default(),
880
894
  detected_charset: Default::default(),
881
- main_content_only: Default::default(),
882
895
  auth_header_sent: Default::default(),
883
896
  response_meta: Default::default(),
884
897
  assets: Default::default(),
@@ -962,10 +975,6 @@ impl ScrapeResult {
962
975
  detected_charset: kwargs
963
976
  .get(ruby.to_symbol("detected_charset"))
964
977
  .and_then(|v| String::try_convert(v).ok()),
965
- main_content_only: kwargs
966
- .get(ruby.to_symbol("main_content_only"))
967
- .and_then(|v| bool::try_convert(v).ok())
968
- .unwrap_or_default(),
969
978
  auth_header_sent: kwargs
970
979
  .get(ruby.to_symbol("auth_header_sent"))
971
980
  .and_then(|v| bool::try_convert(v).ok())
@@ -1071,10 +1080,6 @@ impl ScrapeResult {
1071
1080
  self.detected_charset.clone()
1072
1081
  }
1073
1082
 
1074
- fn main_content_only(&self) -> bool {
1075
- self.main_content_only
1076
- }
1077
-
1078
1083
  fn auth_header_sent(&self) -> bool {
1079
1084
  self.auth_header_sent
1080
1085
  }
@@ -1616,93 +1621,6 @@ impl MarkdownResult {
1616
1621
  }
1617
1622
  }
1618
1623
 
1619
- #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
1620
- #[magnus::wrap(class = "Kreuzcrawl::CachedPage")]
1621
- #[serde(default)]
1622
- pub struct CachedPage {
1623
- pub url: String,
1624
- pub status_code: u16,
1625
- pub content_type: String,
1626
- pub body: String,
1627
- pub etag: Option<String>,
1628
- pub last_modified: Option<String>,
1629
- pub cached_at: u64,
1630
- }
1631
-
1632
- unsafe impl IntoValueFromNative for CachedPage {}
1633
-
1634
- impl magnus::TryConvert for CachedPage {
1635
- fn try_convert(val: magnus::Value) -> Result<Self, magnus::Error> {
1636
- let r: &CachedPage = magnus::TryConvert::try_convert(val)?;
1637
- Ok(r.clone())
1638
- }
1639
- }
1640
- unsafe impl TryConvertOwned for CachedPage {}
1641
-
1642
- impl Default for CachedPage {
1643
- fn default() -> Self {
1644
- Self {
1645
- url: Default::default(),
1646
- status_code: Default::default(),
1647
- content_type: Default::default(),
1648
- body: Default::default(),
1649
- etag: Default::default(),
1650
- last_modified: Default::default(),
1651
- cached_at: Default::default(),
1652
- }
1653
- }
1654
- }
1655
-
1656
- impl CachedPage {
1657
- fn new(
1658
- url: Option<String>,
1659
- status_code: Option<u16>,
1660
- content_type: Option<String>,
1661
- body: Option<String>,
1662
- etag: Option<String>,
1663
- last_modified: Option<String>,
1664
- cached_at: Option<u64>,
1665
- ) -> Self {
1666
- Self {
1667
- url: url.unwrap_or_default(),
1668
- status_code: status_code.unwrap_or_default(),
1669
- content_type: content_type.unwrap_or_default(),
1670
- body: body.unwrap_or_default(),
1671
- etag,
1672
- last_modified,
1673
- cached_at: cached_at.unwrap_or_default(),
1674
- }
1675
- }
1676
-
1677
- fn url(&self) -> String {
1678
- self.url.clone()
1679
- }
1680
-
1681
- fn status_code(&self) -> u16 {
1682
- self.status_code
1683
- }
1684
-
1685
- fn content_type(&self) -> String {
1686
- self.content_type.clone()
1687
- }
1688
-
1689
- fn body(&self) -> String {
1690
- self.body.clone()
1691
- }
1692
-
1693
- fn etag(&self) -> Option<String> {
1694
- self.etag.clone()
1695
- }
1696
-
1697
- fn last_modified(&self) -> Option<String> {
1698
- self.last_modified.clone()
1699
- }
1700
-
1701
- fn cached_at(&self) -> u64 {
1702
- self.cached_at
1703
- }
1704
- }
1705
-
1706
1624
  #[derive(Clone, Debug, Default, serde::Serialize, serde::Deserialize)]
1707
1625
  #[magnus::wrap(class = "Kreuzcrawl::LinkInfo")]
1708
1626
  #[serde(default)]
@@ -3295,39 +3213,6 @@ impl magnus::TryConvert for AssetCategory {
3295
3213
  unsafe impl IntoValueFromNative for AssetCategory {}
3296
3214
  unsafe impl TryConvertOwned for AssetCategory {}
3297
3215
 
3298
- #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
3299
- pub enum CrawlEvent {
3300
- Page { _0: CrawlPageResult },
3301
- Error { url: String, error: String },
3302
- Complete { pages_crawled: usize },
3303
- }
3304
-
3305
- impl Default for CrawlEvent {
3306
- fn default() -> Self {
3307
- Self::Page { _0: Default::default() }
3308
- }
3309
- }
3310
-
3311
- impl magnus::IntoValue for CrawlEvent {
3312
- fn into_value_with(self, handle: &Ruby) -> magnus::Value {
3313
- match serde_json::to_value(&self) {
3314
- Ok(v) => json_to_ruby(handle, v),
3315
- Err(_) => handle.qnil().into_value_with(handle),
3316
- }
3317
- }
3318
- }
3319
-
3320
- impl magnus::TryConvert for CrawlEvent {
3321
- fn try_convert(val: magnus::Value) -> Result<Self, magnus::Error> {
3322
- let s: String = magnus::TryConvert::try_convert(val)?;
3323
- serde_json::from_str(&s)
3324
- .map_err(|e| magnus::Error::new(unsafe { Ruby::get_unchecked() }.exception_type_error(), e.to_string()))
3325
- }
3326
- }
3327
-
3328
- unsafe impl IntoValueFromNative for CrawlEvent {}
3329
- unsafe impl TryConvertOwned for CrawlEvent {}
3330
-
3331
3216
  fn create_engine(config: Option<String>) -> Result<CrawlEngineHandle, Error> {
3332
3217
  let config: Option<CrawlConfig> = config
3333
3218
  .as_deref()
@@ -3546,6 +3431,44 @@ impl From<kreuzcrawl::ProxyConfig> for ProxyConfig {
3546
3431
  }
3547
3432
  }
3548
3433
 
3434
+ impl From<ContentConfig> for kreuzcrawl::ContentConfig {
3435
+ fn from(val: ContentConfig) -> Self {
3436
+ Self {
3437
+ output_format: val.output_format,
3438
+ preprocessing_preset: val.preprocessing_preset,
3439
+ remove_navigation: val.remove_navigation,
3440
+ remove_forms: val.remove_forms,
3441
+ strip_tags: val.strip_tags,
3442
+ preserve_tags: val.preserve_tags,
3443
+ exclude_selectors: val.exclude_selectors,
3444
+ skip_images: val.skip_images,
3445
+ max_depth: val.max_depth,
3446
+ wrap: val.wrap,
3447
+ wrap_width: val.wrap_width,
3448
+ include_document_structure: val.include_document_structure,
3449
+ }
3450
+ }
3451
+ }
3452
+
3453
+ impl From<kreuzcrawl::ContentConfig> for ContentConfig {
3454
+ fn from(val: kreuzcrawl::ContentConfig) -> Self {
3455
+ Self {
3456
+ output_format: val.output_format,
3457
+ preprocessing_preset: val.preprocessing_preset,
3458
+ remove_navigation: val.remove_navigation,
3459
+ remove_forms: val.remove_forms,
3460
+ strip_tags: val.strip_tags,
3461
+ preserve_tags: val.preserve_tags,
3462
+ exclude_selectors: val.exclude_selectors,
3463
+ skip_images: val.skip_images,
3464
+ max_depth: val.max_depth,
3465
+ wrap: val.wrap,
3466
+ wrap_width: val.wrap_width,
3467
+ include_document_structure: val.include_document_structure,
3468
+ }
3469
+ }
3470
+ }
3471
+
3549
3472
  impl From<BrowserConfig> for kreuzcrawl::BrowserConfig {
3550
3473
  fn from(val: BrowserConfig) -> Self {
3551
3474
  Self {
@@ -3587,14 +3510,15 @@ impl From<CrawlConfig> for kreuzcrawl::CrawlConfig {
3587
3510
  exclude_paths: val.exclude_paths,
3588
3511
  custom_headers: val.custom_headers.into_iter().collect(),
3589
3512
  request_timeout: std::time::Duration::from_millis(val.request_timeout),
3513
+ rate_limit_ms: val.rate_limit_ms,
3590
3514
  max_redirects: val.max_redirects,
3591
3515
  retry_count: val.retry_count,
3592
3516
  retry_codes: val.retry_codes,
3593
3517
  cookies_enabled: val.cookies_enabled,
3594
3518
  auth: val.auth.map(Into::into),
3595
3519
  max_body_size: val.max_body_size,
3596
- main_content_only: val.main_content_only,
3597
3520
  remove_tags: val.remove_tags,
3521
+ content: val.content.into(),
3598
3522
  map_limit: val.map_limit,
3599
3523
  map_search: val.map_search,
3600
3524
  download_assets: val.download_assets,
@@ -3629,14 +3553,15 @@ impl From<kreuzcrawl::CrawlConfig> for CrawlConfig {
3629
3553
  exclude_paths: val.exclude_paths,
3630
3554
  custom_headers: val.custom_headers.into_iter().collect(),
3631
3555
  request_timeout: val.request_timeout.as_millis() as u64,
3556
+ rate_limit_ms: val.rate_limit_ms,
3632
3557
  max_redirects: val.max_redirects,
3633
3558
  retry_count: val.retry_count,
3634
3559
  retry_codes: val.retry_codes,
3635
3560
  cookies_enabled: val.cookies_enabled,
3636
3561
  auth: val.auth.map(Into::into),
3637
3562
  max_body_size: val.max_body_size,
3638
- main_content_only: val.main_content_only,
3639
3563
  remove_tags: val.remove_tags,
3564
+ content: val.content.into(),
3640
3565
  map_limit: val.map_limit,
3641
3566
  map_search: val.map_search,
3642
3567
  download_assets: val.download_assets,
@@ -3677,40 +3602,17 @@ impl From<kreuzcrawl::DownloadedDocument> for DownloadedDocument {
3677
3602
  mime_type: format!("{:?}", val.mime_type),
3678
3603
  content: val.content.to_vec(),
3679
3604
  size: val.size,
3680
- filename: val.filename.as_ref().map(|v| format!("{:?}", v)),
3605
+ filename: val.filename.as_ref().map(|v| format!("{v:?}")),
3681
3606
  content_hash: format!("{:?}", val.content_hash),
3682
3607
  headers: val
3683
3608
  .headers
3684
3609
  .into_iter()
3685
- .map(|(k, v)| (format!("{:?}", k), format!("{:?}", v)))
3610
+ .map(|(k, v)| (k.to_string(), v.to_string()))
3686
3611
  .collect(),
3687
3612
  }
3688
3613
  }
3689
3614
  }
3690
3615
 
3691
- impl From<kreuzcrawl::InteractionResult> for InteractionResult {
3692
- fn from(val: kreuzcrawl::InteractionResult) -> Self {
3693
- Self {
3694
- action_results: val.action_results.into_iter().map(Into::into).collect(),
3695
- final_html: val.final_html,
3696
- final_url: val.final_url,
3697
- screenshot: val.screenshot.map(|v| v.to_vec()),
3698
- }
3699
- }
3700
- }
3701
-
3702
- impl From<kreuzcrawl::ActionResult> for ActionResult {
3703
- fn from(val: kreuzcrawl::ActionResult) -> Self {
3704
- Self {
3705
- action_index: val.action_index,
3706
- action_type: format!("{:?}", val.action_type),
3707
- success: val.success,
3708
- data: val.data.as_ref().map(ToString::to_string),
3709
- error: val.error,
3710
- }
3711
- }
3712
- }
3713
-
3714
3616
  impl From<ScrapeResult> for kreuzcrawl::ScrapeResult {
3715
3617
  fn from(val: ScrapeResult) -> Self {
3716
3618
  Self {
@@ -3731,7 +3633,6 @@ impl From<ScrapeResult> for kreuzcrawl::ScrapeResult {
3731
3633
  is_pdf: val.is_pdf,
3732
3634
  was_skipped: val.was_skipped,
3733
3635
  detected_charset: val.detected_charset,
3734
- main_content_only: val.main_content_only,
3735
3636
  auth_header_sent: val.auth_header_sent,
3736
3637
  response_meta: val.response_meta.map(Into::into),
3737
3638
  assets: val.assets.into_iter().map(Into::into).collect(),
@@ -3766,7 +3667,6 @@ impl From<kreuzcrawl::ScrapeResult> for ScrapeResult {
3766
3667
  is_pdf: val.is_pdf,
3767
3668
  was_skipped: val.was_skipped,
3768
3669
  detected_charset: val.detected_charset,
3769
- main_content_only: val.main_content_only,
3770
3670
  auth_header_sent: val.auth_header_sent,
3771
3671
  response_meta: val.response_meta.map(Into::into),
3772
3672
  assets: val.assets.into_iter().map(Into::into).collect(),
@@ -3934,20 +3834,6 @@ impl From<kreuzcrawl::MarkdownResult> for MarkdownResult {
3934
3834
  }
3935
3835
  }
3936
3836
 
3937
- impl From<kreuzcrawl::CachedPage> for CachedPage {
3938
- fn from(val: kreuzcrawl::CachedPage) -> Self {
3939
- Self {
3940
- url: val.url,
3941
- status_code: val.status_code,
3942
- content_type: val.content_type,
3943
- body: val.body,
3944
- etag: val.etag,
3945
- last_modified: val.last_modified,
3946
- cached_at: val.cached_at,
3947
- }
3948
- }
3949
- }
3950
-
3951
3837
  impl From<LinkInfo> for kreuzcrawl::LinkInfo {
3952
3838
  fn from(val: LinkInfo) -> Self {
3953
3839
  Self {
@@ -4530,16 +4416,6 @@ impl From<kreuzcrawl::AssetCategory> for AssetCategory {
4530
4416
  }
4531
4417
  }
4532
4418
 
4533
- impl From<kreuzcrawl::CrawlEvent> for CrawlEvent {
4534
- fn from(val: kreuzcrawl::CrawlEvent) -> Self {
4535
- match val {
4536
- kreuzcrawl::CrawlEvent::Page(_0) => Self::Page { _0: (*_0).into() },
4537
- kreuzcrawl::CrawlEvent::Error { url, error } => Self::Error { url, error },
4538
- kreuzcrawl::CrawlEvent::Complete { pages_crawled } => Self::Complete { pages_crawled },
4539
- }
4540
- }
4541
- }
4542
-
4543
4419
  /// Convert a `kreuzcrawl::CrawlError` error to a Magnus runtime error.
4544
4420
  #[allow(dead_code)]
4545
4421
  fn crawl_error_to_magnus_err(e: kreuzcrawl::CrawlError) -> magnus::Error {
@@ -4565,6 +4441,24 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
4565
4441
  class.define_method("username", method!(ProxyConfig::username, 0))?;
4566
4442
  class.define_method("password", method!(ProxyConfig::password, 0))?;
4567
4443
 
4444
+ let class = module.define_class("ContentConfig", ruby.class_object())?;
4445
+ class.define_singleton_method("new", function!(ContentConfig::new, 12))?;
4446
+ class.define_method("output_format", method!(ContentConfig::output_format, 0))?;
4447
+ class.define_method("preprocessing_preset", method!(ContentConfig::preprocessing_preset, 0))?;
4448
+ class.define_method("remove_navigation", method!(ContentConfig::remove_navigation, 0))?;
4449
+ class.define_method("remove_forms", method!(ContentConfig::remove_forms, 0))?;
4450
+ class.define_method("strip_tags", method!(ContentConfig::strip_tags, 0))?;
4451
+ class.define_method("preserve_tags", method!(ContentConfig::preserve_tags, 0))?;
4452
+ class.define_method("exclude_selectors", method!(ContentConfig::exclude_selectors, 0))?;
4453
+ class.define_method("skip_images", method!(ContentConfig::skip_images, 0))?;
4454
+ class.define_method("max_depth", method!(ContentConfig::max_depth, 0))?;
4455
+ class.define_method("wrap", method!(ContentConfig::wrap, 0))?;
4456
+ class.define_method("wrap_width", method!(ContentConfig::wrap_width, 0))?;
4457
+ class.define_method(
4458
+ "include_document_structure",
4459
+ method!(ContentConfig::include_document_structure, 0),
4460
+ )?;
4461
+
4568
4462
  let class = module.define_class("BrowserConfig", ruby.class_object())?;
4569
4463
  class.define_singleton_method("new", function!(BrowserConfig::new, 6))?;
4570
4464
  class.define_method("mode", method!(BrowserConfig::mode, 0))?;
@@ -4587,14 +4481,15 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
4587
4481
  class.define_method("exclude_paths", method!(CrawlConfig::exclude_paths, 0))?;
4588
4482
  class.define_method("custom_headers", method!(CrawlConfig::custom_headers, 0))?;
4589
4483
  class.define_method("request_timeout", method!(CrawlConfig::request_timeout, 0))?;
4484
+ class.define_method("rate_limit_ms", method!(CrawlConfig::rate_limit_ms, 0))?;
4590
4485
  class.define_method("max_redirects", method!(CrawlConfig::max_redirects, 0))?;
4591
4486
  class.define_method("retry_count", method!(CrawlConfig::retry_count, 0))?;
4592
4487
  class.define_method("retry_codes", method!(CrawlConfig::retry_codes, 0))?;
4593
4488
  class.define_method("cookies_enabled", method!(CrawlConfig::cookies_enabled, 0))?;
4594
4489
  class.define_method("auth", method!(CrawlConfig::auth, 0))?;
4595
4490
  class.define_method("max_body_size", method!(CrawlConfig::max_body_size, 0))?;
4596
- class.define_method("main_content_only", method!(CrawlConfig::main_content_only, 0))?;
4597
4491
  class.define_method("remove_tags", method!(CrawlConfig::remove_tags, 0))?;
4492
+ class.define_method("content", method!(CrawlConfig::content, 0))?;
4598
4493
  class.define_method("map_limit", method!(CrawlConfig::map_limit, 0))?;
4599
4494
  class.define_method("map_search", method!(CrawlConfig::map_search, 0))?;
4600
4495
  class.define_method("download_assets", method!(CrawlConfig::download_assets, 0))?;
@@ -4622,21 +4517,6 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
4622
4517
  class.define_method("content_hash", method!(DownloadedDocument::content_hash, 0))?;
4623
4518
  class.define_method("headers", method!(DownloadedDocument::headers, 0))?;
4624
4519
 
4625
- let class = module.define_class("InteractionResult", ruby.class_object())?;
4626
- class.define_singleton_method("new", function!(InteractionResult::new, 4))?;
4627
- class.define_method("action_results", method!(InteractionResult::action_results, 0))?;
4628
- class.define_method("final_html", method!(InteractionResult::final_html, 0))?;
4629
- class.define_method("final_url", method!(InteractionResult::final_url, 0))?;
4630
- class.define_method("screenshot", method!(InteractionResult::screenshot, 0))?;
4631
-
4632
- let class = module.define_class("ActionResult", ruby.class_object())?;
4633
- class.define_singleton_method("new", function!(ActionResult::new, 5))?;
4634
- class.define_method("action_index", method!(ActionResult::action_index, 0))?;
4635
- class.define_method("action_type", method!(ActionResult::action_type, 0))?;
4636
- class.define_method("success", method!(ActionResult::success, 0))?;
4637
- class.define_method("data", method!(ActionResult::data, 0))?;
4638
- class.define_method("error", method!(ActionResult::error, 0))?;
4639
-
4640
4520
  let class = module.define_class("ScrapeResult", ruby.class_object())?;
4641
4521
  class.define_singleton_method("new", function!(ScrapeResult::new, 1))?;
4642
4522
  class.define_method("status_code", method!(ScrapeResult::status_code, 0))?;
@@ -4656,7 +4536,6 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
4656
4536
  class.define_method("is_pdf", method!(ScrapeResult::is_pdf, 0))?;
4657
4537
  class.define_method("was_skipped", method!(ScrapeResult::was_skipped, 0))?;
4658
4538
  class.define_method("detected_charset", method!(ScrapeResult::detected_charset, 0))?;
4659
- class.define_method("main_content_only", method!(ScrapeResult::main_content_only, 0))?;
4660
4539
  class.define_method("auth_header_sent", method!(ScrapeResult::auth_header_sent, 0))?;
4661
4540
  class.define_method("response_meta", method!(ScrapeResult::response_meta, 0))?;
4662
4541
  class.define_method("assets", method!(ScrapeResult::assets, 0))?;
@@ -4725,16 +4604,6 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
4725
4604
  class.define_method("citations", method!(MarkdownResult::citations, 0))?;
4726
4605
  class.define_method("fit_content", method!(MarkdownResult::fit_content, 0))?;
4727
4606
 
4728
- let class = module.define_class("CachedPage", ruby.class_object())?;
4729
- class.define_singleton_method("new", function!(CachedPage::new, 7))?;
4730
- class.define_method("url", method!(CachedPage::url, 0))?;
4731
- class.define_method("status_code", method!(CachedPage::status_code, 0))?;
4732
- class.define_method("content_type", method!(CachedPage::content_type, 0))?;
4733
- class.define_method("body", method!(CachedPage::body, 0))?;
4734
- class.define_method("etag", method!(CachedPage::etag, 0))?;
4735
- class.define_method("last_modified", method!(CachedPage::last_modified, 0))?;
4736
- class.define_method("cached_at", method!(CachedPage::cached_at, 0))?;
4737
-
4738
4607
  let class = module.define_class("LinkInfo", ruby.class_object())?;
4739
4608
  class.define_singleton_method("new", function!(LinkInfo::new, 5))?;
4740
4609
  class.define_method("url", method!(LinkInfo::url, 0))?;
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzcrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kreuzberg Team