kreuzcrawl 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a2f4027aef59737e93add85d5da2398c4a95767fb41766b86822f50199331e32
4
- data.tar.gz: 82f38cf86e988321ef9bc11f5a42a023df464e79d0d0ad646edeae564cabfec6
3
+ metadata.gz: 92e6cb8be4ef5d12847d6d632465cf41b18ce6564652049052b8793162204255
4
+ data.tar.gz: 9bd7b541b57c35b0ddb1cec5d6b119463ca1622db1f57455b340f38b4cf3abd3
5
5
  SHA512:
6
- metadata.gz: 1ebcebff06f45e809441c0cae1d9f14be48a79b6d1439b3421ebedd4808d9c71e6ffa86d51a6c1d64ba57a0bc2eeffa24a672a4af3db5659185ba5c8cca8443a
7
- data.tar.gz: af29b39fdf985044e579d71064af9eb10dc348c81a82e7f27dc0650c4720ee1164fd8fe5529358cc9ba494b0fc296e9a425b17182794d001b3bcee526f737d35
6
+ metadata.gz: 78ae3990a646e6f0a9ecfdb8355d4bfb11f476d3c0dc498651e5ebeb08f28dd43dd848178c43ee4f5788517e0014b3b20f8827990abe9e8e3377cb43c27be773
7
+ data.tar.gz: f43d72f320847d7bf872dc7d59d8cf816bf1c2faf05528998773dbd55ab414dd35380e190f7a0f64eb0e60eae4866c22263eceb0ba05da02a47ad5e98aecbeeb
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzcrawl-rb"
3
- version = "0.1.2"
3
+ version = "0.2.0"
4
4
  edition = "2024"
5
5
  license = "Elastic-2.0"
6
6
 
@@ -2,5 +2,5 @@
2
2
  # frozen_string_literal: true
3
3
 
4
4
  module Kreuzcrawl
5
- VERSION = "0.1.2"
5
+ VERSION = "0.2.0"
6
6
  end
@@ -175,6 +175,133 @@ impl ProxyConfig {
175
175
  }
176
176
  }
177
177
 
178
+ #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
179
+ #[magnus::wrap(class = "Kreuzcrawl::ContentConfig")]
180
+ #[serde(default)]
181
+ pub struct ContentConfig {
182
+ pub output_format: String,
183
+ pub preprocessing_preset: String,
184
+ pub remove_navigation: bool,
185
+ pub remove_forms: bool,
186
+ pub strip_tags: Vec<String>,
187
+ pub preserve_tags: Vec<String>,
188
+ pub exclude_selectors: Vec<String>,
189
+ pub skip_images: bool,
190
+ pub max_depth: Option<usize>,
191
+ pub wrap: bool,
192
+ pub wrap_width: usize,
193
+ pub include_document_structure: bool,
194
+ }
195
+
196
+ unsafe impl IntoValueFromNative for ContentConfig {}
197
+
198
+ impl magnus::TryConvert for ContentConfig {
199
+ fn try_convert(val: magnus::Value) -> Result<Self, magnus::Error> {
200
+ let r: &ContentConfig = magnus::TryConvert::try_convert(val)?;
201
+ Ok(r.clone())
202
+ }
203
+ }
204
+ unsafe impl TryConvertOwned for ContentConfig {}
205
+
206
+ impl Default for ContentConfig {
207
+ fn default() -> Self {
208
+ Self {
209
+ output_format: Default::default(),
210
+ preprocessing_preset: Default::default(),
211
+ remove_navigation: Default::default(),
212
+ remove_forms: Default::default(),
213
+ strip_tags: Default::default(),
214
+ preserve_tags: Default::default(),
215
+ exclude_selectors: Default::default(),
216
+ skip_images: Default::default(),
217
+ max_depth: Default::default(),
218
+ wrap: Default::default(),
219
+ wrap_width: Default::default(),
220
+ include_document_structure: Default::default(),
221
+ }
222
+ }
223
+ }
224
+
225
+ impl ContentConfig {
226
+ fn new(
227
+ output_format: Option<String>,
228
+ preprocessing_preset: Option<String>,
229
+ remove_navigation: Option<bool>,
230
+ remove_forms: Option<bool>,
231
+ strip_tags: Option<Vec<String>>,
232
+ preserve_tags: Option<Vec<String>>,
233
+ exclude_selectors: Option<Vec<String>>,
234
+ skip_images: Option<bool>,
235
+ max_depth: Option<usize>,
236
+ wrap: Option<bool>,
237
+ wrap_width: Option<usize>,
238
+ include_document_structure: Option<bool>,
239
+ ) -> Self {
240
+ Self {
241
+ output_format: output_format.unwrap_or("markdown".to_string()),
242
+ preprocessing_preset: preprocessing_preset.unwrap_or("standard".to_string()),
243
+ remove_navigation: remove_navigation.unwrap_or(true),
244
+ remove_forms: remove_forms.unwrap_or(true),
245
+ strip_tags: strip_tags.unwrap_or_default(),
246
+ preserve_tags: preserve_tags.unwrap_or_default(),
247
+ exclude_selectors: exclude_selectors.unwrap_or_default(),
248
+ skip_images: skip_images.unwrap_or(false),
249
+ max_depth,
250
+ wrap: wrap.unwrap_or(false),
251
+ wrap_width: wrap_width.unwrap_or(80),
252
+ include_document_structure: include_document_structure.unwrap_or(true),
253
+ }
254
+ }
255
+
256
+ fn output_format(&self) -> String {
257
+ self.output_format.clone()
258
+ }
259
+
260
+ fn preprocessing_preset(&self) -> String {
261
+ self.preprocessing_preset.clone()
262
+ }
263
+
264
+ fn remove_navigation(&self) -> bool {
265
+ self.remove_navigation
266
+ }
267
+
268
+ fn remove_forms(&self) -> bool {
269
+ self.remove_forms
270
+ }
271
+
272
+ fn strip_tags(&self) -> Vec<String> {
273
+ self.strip_tags.clone()
274
+ }
275
+
276
+ fn preserve_tags(&self) -> Vec<String> {
277
+ self.preserve_tags.clone()
278
+ }
279
+
280
+ fn exclude_selectors(&self) -> Vec<String> {
281
+ self.exclude_selectors.clone()
282
+ }
283
+
284
+ fn skip_images(&self) -> bool {
285
+ self.skip_images
286
+ }
287
+
288
+ fn max_depth(&self) -> Option<usize> {
289
+ self.max_depth
290
+ }
291
+
292
+ fn wrap(&self) -> bool {
293
+ self.wrap
294
+ }
295
+
296
+ fn wrap_width(&self) -> usize {
297
+ self.wrap_width
298
+ }
299
+
300
+ fn include_document_structure(&self) -> bool {
301
+ self.include_document_structure
302
+ }
303
+ }
304
+
178
305
  #[derive(Clone, Debug, Default, serde::Serialize, serde::Deserialize)]
179
306
  #[magnus::wrap(class = "Kreuzcrawl::BrowserConfig")]
180
307
  #[serde(default)]
@@ -263,8 +390,8 @@ pub struct CrawlConfig {
263
390
  pub cookies_enabled: bool,
264
391
  pub auth: Option<AuthConfig>,
265
392
  pub max_body_size: Option<usize>,
266
- pub main_content_only: bool,
267
393
  pub remove_tags: Vec<String>,
394
+ pub content: ContentConfig,
268
395
  pub map_limit: Option<usize>,
269
396
  pub map_search: Option<String>,
270
397
  pub download_assets: bool,
@@ -361,14 +488,14 @@ impl CrawlConfig {
361
488
  max_body_size: kwargs
362
489
  .get(ruby.to_symbol("max_body_size"))
363
490
  .and_then(|v| usize::try_convert(v).ok()),
364
- main_content_only: kwargs
365
- .get(ruby.to_symbol("main_content_only"))
366
- .and_then(|v| bool::try_convert(v).ok())
367
- .unwrap_or(false),
368
491
  remove_tags: kwargs
369
492
  .get(ruby.to_symbol("remove_tags"))
370
493
  .and_then(|v| <Vec<String>>::try_convert(v).ok())
371
494
  .unwrap_or_default(),
495
+ content: kwargs
496
+ .get(ruby.to_symbol("content"))
497
+ .and_then(|v| ContentConfig::try_convert(v).ok())
498
+ .unwrap_or_default(),
372
499
  map_limit: kwargs
373
500
  .get(ruby.to_symbol("map_limit"))
374
501
  .and_then(|v| usize::try_convert(v).ok()),
@@ -497,14 +624,14 @@ impl CrawlConfig {
497
624
  self.max_body_size
498
625
  }
499
626
 
500
- fn main_content_only(&self) -> bool {
501
- self.main_content_only
502
- }
503
-
504
627
  fn remove_tags(&self) -> Vec<String> {
505
628
  self.remove_tags.clone()
506
629
  }
507
630
 
631
+ fn content(&self) -> ContentConfig {
632
+ self.content.clone()
633
+ }
634
+
508
635
  fn map_limit(&self) -> Option<usize> {
509
636
  self.map_limit
510
637
  }
@@ -586,8 +713,8 @@ impl CrawlConfig {
586
713
  cookies_enabled: self.cookies_enabled,
587
714
  auth: self.auth.clone().map(Into::into),
588
715
  max_body_size: self.max_body_size,
589
- main_content_only: self.main_content_only,
590
716
  remove_tags: self.remove_tags.clone(),
717
+ content: self.content.clone().into(),
591
718
  map_limit: self.map_limit,
592
719
  map_search: self.map_search.clone(),
593
720
  download_assets: self.download_assets,
@@ -723,7 +850,6 @@ pub struct ScrapeResult {
723
850
  pub is_pdf: bool,
724
851
  pub was_skipped: bool,
725
852
  pub detected_charset: Option<String>,
726
- pub main_content_only: bool,
727
853
  pub auth_header_sent: bool,
728
854
  pub response_meta: Option<ResponseMeta>,
729
855
  pub assets: Vec<DownloadedAsset>,
@@ -766,7 +892,6 @@ impl Default for ScrapeResult {
766
892
  is_pdf: Default::default(),
767
893
  was_skipped: Default::default(),
768
894
  detected_charset: Default::default(),
769
- main_content_only: Default::default(),
770
895
  auth_header_sent: Default::default(),
771
896
  response_meta: Default::default(),
772
897
  assets: Default::default(),
@@ -850,10 +975,6 @@ impl ScrapeResult {
850
975
  detected_charset: kwargs
851
976
  .get(ruby.to_symbol("detected_charset"))
852
977
  .and_then(|v| String::try_convert(v).ok()),
853
- main_content_only: kwargs
854
- .get(ruby.to_symbol("main_content_only"))
855
- .and_then(|v| bool::try_convert(v).ok())
856
- .unwrap_or_default(),
857
978
  auth_header_sent: kwargs
858
979
  .get(ruby.to_symbol("auth_header_sent"))
859
980
  .and_then(|v| bool::try_convert(v).ok())
@@ -959,10 +1080,6 @@ impl ScrapeResult {
959
1080
  self.detected_charset.clone()
960
1081
  }
961
1082
 
962
- fn main_content_only(&self) -> bool {
963
- self.main_content_only
964
- }
965
-
966
1083
  fn auth_header_sent(&self) -> bool {
967
1084
  self.auth_header_sent
968
1085
  }
@@ -3314,6 +3431,44 @@ impl From<kreuzcrawl::ProxyConfig> for ProxyConfig {
3314
3431
  }
3315
3432
  }
3316
3433
 
3434
+ impl From<ContentConfig> for kreuzcrawl::ContentConfig {
3435
+ fn from(val: ContentConfig) -> Self {
3436
+ Self {
3437
+ output_format: val.output_format,
3438
+ preprocessing_preset: val.preprocessing_preset,
3439
+ remove_navigation: val.remove_navigation,
3440
+ remove_forms: val.remove_forms,
3441
+ strip_tags: val.strip_tags,
3442
+ preserve_tags: val.preserve_tags,
3443
+ exclude_selectors: val.exclude_selectors,
3444
+ skip_images: val.skip_images,
3445
+ max_depth: val.max_depth,
3446
+ wrap: val.wrap,
3447
+ wrap_width: val.wrap_width,
3448
+ include_document_structure: val.include_document_structure,
3449
+ }
3450
+ }
3451
+ }
3452
+
3453
+ impl From<kreuzcrawl::ContentConfig> for ContentConfig {
3454
+ fn from(val: kreuzcrawl::ContentConfig) -> Self {
3455
+ Self {
3456
+ output_format: val.output_format,
3457
+ preprocessing_preset: val.preprocessing_preset,
3458
+ remove_navigation: val.remove_navigation,
3459
+ remove_forms: val.remove_forms,
3460
+ strip_tags: val.strip_tags,
3461
+ preserve_tags: val.preserve_tags,
3462
+ exclude_selectors: val.exclude_selectors,
3463
+ skip_images: val.skip_images,
3464
+ max_depth: val.max_depth,
3465
+ wrap: val.wrap,
3466
+ wrap_width: val.wrap_width,
3467
+ include_document_structure: val.include_document_structure,
3468
+ }
3469
+ }
3470
+ }
3471
+
3317
3472
  impl From<BrowserConfig> for kreuzcrawl::BrowserConfig {
3318
3473
  fn from(val: BrowserConfig) -> Self {
3319
3474
  Self {
@@ -3362,8 +3517,8 @@ impl From<CrawlConfig> for kreuzcrawl::CrawlConfig {
3362
3517
  cookies_enabled: val.cookies_enabled,
3363
3518
  auth: val.auth.map(Into::into),
3364
3519
  max_body_size: val.max_body_size,
3365
- main_content_only: val.main_content_only,
3366
3520
  remove_tags: val.remove_tags,
3521
+ content: val.content.into(),
3367
3522
  map_limit: val.map_limit,
3368
3523
  map_search: val.map_search,
3369
3524
  download_assets: val.download_assets,
@@ -3405,8 +3560,8 @@ impl From<kreuzcrawl::CrawlConfig> for CrawlConfig {
3405
3560
  cookies_enabled: val.cookies_enabled,
3406
3561
  auth: val.auth.map(Into::into),
3407
3562
  max_body_size: val.max_body_size,
3408
- main_content_only: val.main_content_only,
3409
3563
  remove_tags: val.remove_tags,
3564
+ content: val.content.into(),
3410
3565
  map_limit: val.map_limit,
3411
3566
  map_search: val.map_search,
3412
3567
  download_assets: val.download_assets,
@@ -3478,7 +3633,6 @@ impl From<ScrapeResult> for kreuzcrawl::ScrapeResult {
3478
3633
  is_pdf: val.is_pdf,
3479
3634
  was_skipped: val.was_skipped,
3480
3635
  detected_charset: val.detected_charset,
3481
- main_content_only: val.main_content_only,
3482
3636
  auth_header_sent: val.auth_header_sent,
3483
3637
  response_meta: val.response_meta.map(Into::into),
3484
3638
  assets: val.assets.into_iter().map(Into::into).collect(),
@@ -3513,7 +3667,6 @@ impl From<kreuzcrawl::ScrapeResult> for ScrapeResult {
3513
3667
  is_pdf: val.is_pdf,
3514
3668
  was_skipped: val.was_skipped,
3515
3669
  detected_charset: val.detected_charset,
3516
- main_content_only: val.main_content_only,
3517
3670
  auth_header_sent: val.auth_header_sent,
3518
3671
  response_meta: val.response_meta.map(Into::into),
3519
3672
  assets: val.assets.into_iter().map(Into::into).collect(),
@@ -4288,6 +4441,24 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
4288
4441
  class.define_method("username", method!(ProxyConfig::username, 0))?;
4289
4442
  class.define_method("password", method!(ProxyConfig::password, 0))?;
4290
4443
 
4444
+ let class = module.define_class("ContentConfig", ruby.class_object())?;
4445
+ class.define_singleton_method("new", function!(ContentConfig::new, 12))?;
4446
+ class.define_method("output_format", method!(ContentConfig::output_format, 0))?;
4447
+ class.define_method("preprocessing_preset", method!(ContentConfig::preprocessing_preset, 0))?;
4448
+ class.define_method("remove_navigation", method!(ContentConfig::remove_navigation, 0))?;
4449
+ class.define_method("remove_forms", method!(ContentConfig::remove_forms, 0))?;
4450
+ class.define_method("strip_tags", method!(ContentConfig::strip_tags, 0))?;
4451
+ class.define_method("preserve_tags", method!(ContentConfig::preserve_tags, 0))?;
4452
+ class.define_method("exclude_selectors", method!(ContentConfig::exclude_selectors, 0))?;
4453
+ class.define_method("skip_images", method!(ContentConfig::skip_images, 0))?;
4454
+ class.define_method("max_depth", method!(ContentConfig::max_depth, 0))?;
4455
+ class.define_method("wrap", method!(ContentConfig::wrap, 0))?;
4456
+ class.define_method("wrap_width", method!(ContentConfig::wrap_width, 0))?;
4457
+ class.define_method(
4458
+ "include_document_structure",
4459
+ method!(ContentConfig::include_document_structure, 0),
4460
+ )?;
4461
+
4291
4462
  let class = module.define_class("BrowserConfig", ruby.class_object())?;
4292
4463
  class.define_singleton_method("new", function!(BrowserConfig::new, 6))?;
4293
4464
  class.define_method("mode", method!(BrowserConfig::mode, 0))?;
@@ -4317,8 +4488,8 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
4317
4488
  class.define_method("cookies_enabled", method!(CrawlConfig::cookies_enabled, 0))?;
4318
4489
  class.define_method("auth", method!(CrawlConfig::auth, 0))?;
4319
4490
  class.define_method("max_body_size", method!(CrawlConfig::max_body_size, 0))?;
4320
- class.define_method("main_content_only", method!(CrawlConfig::main_content_only, 0))?;
4321
4491
  class.define_method("remove_tags", method!(CrawlConfig::remove_tags, 0))?;
4492
+ class.define_method("content", method!(CrawlConfig::content, 0))?;
4322
4493
  class.define_method("map_limit", method!(CrawlConfig::map_limit, 0))?;
4323
4494
  class.define_method("map_search", method!(CrawlConfig::map_search, 0))?;
4324
4495
  class.define_method("download_assets", method!(CrawlConfig::download_assets, 0))?;
@@ -4365,7 +4536,6 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
4365
4536
  class.define_method("is_pdf", method!(ScrapeResult::is_pdf, 0))?;
4366
4537
  class.define_method("was_skipped", method!(ScrapeResult::was_skipped, 0))?;
4367
4538
  class.define_method("detected_charset", method!(ScrapeResult::detected_charset, 0))?;
4368
- class.define_method("main_content_only", method!(ScrapeResult::main_content_only, 0))?;
4369
4539
  class.define_method("auth_header_sent", method!(ScrapeResult::auth_header_sent, 0))?;
4370
4540
  class.define_method("response_meta", method!(ScrapeResult::response_meta, 0))?;
4371
4541
  class.define_method("assets", method!(ScrapeResult::assets, 0))?;
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzcrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kreuzberg Team