kreuzcrawl 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/kreuzcrawl_rb/Cargo.toml +1 -1
- data/ext/kreuzcrawl_rb/src/kreuzcrawl/version.rb +1 -1
- data/ext/kreuzcrawl_rb/src/lib.rs +196 -26
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 92e6cb8be4ef5d12847d6d632465cf41b18ce6564652049052b8793162204255
|
|
4
|
+
data.tar.gz: 9bd7b541b57c35b0ddb1cec5d6b119463ca1622db1f57455b340f38b4cf3abd3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 78ae3990a646e6f0a9ecfdb8355d4bfb11f476d3c0dc498651e5ebeb08f28dd43dd848178c43ee4f5788517e0014b3b20f8827990abe9e8e3377cb43c27be773
|
|
7
|
+
data.tar.gz: f43d72f320847d7bf872dc7d59d8cf816bf1c2faf05528998773dbd55ab414dd35380e190f7a0f64eb0e60eae4866c22263eceb0ba05da02a47ad5e98aecbeeb
|
|
@@ -175,6 +175,133 @@ impl ProxyConfig {
|
|
|
175
175
|
}
|
|
176
176
|
}
|
|
177
177
|
|
|
178
|
+
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
|
|
179
|
+
#[magnus::wrap(class = "Kreuzcrawl::ContentConfig")]
|
|
180
|
+
#[serde(default)]
|
|
181
|
+
pub struct ContentConfig {
|
|
182
|
+
pub output_format: String,
|
|
183
|
+
pub preprocessing_preset: String,
|
|
184
|
+
pub remove_navigation: bool,
|
|
185
|
+
pub remove_forms: bool,
|
|
186
|
+
pub strip_tags: Vec<String>,
|
|
187
|
+
pub preserve_tags: Vec<String>,
|
|
188
|
+
pub exclude_selectors: Vec<String>,
|
|
189
|
+
pub skip_images: bool,
|
|
190
|
+
pub max_depth: Option<usize>,
|
|
191
|
+
pub wrap: bool,
|
|
192
|
+
pub wrap_width: usize,
|
|
193
|
+
pub include_document_structure: bool,
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
unsafe impl IntoValueFromNative for ContentConfig {}
|
|
197
|
+
|
|
198
|
+
impl magnus::TryConvert for ContentConfig {
|
|
199
|
+
fn try_convert(val: magnus::Value) -> Result<Self, magnus::Error> {
|
|
200
|
+
let r: &ContentConfig = magnus::TryConvert::try_convert(val)?;
|
|
201
|
+
Ok(r.clone())
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
unsafe impl TryConvertOwned for ContentConfig {}
|
|
205
|
+
|
|
206
|
+
impl Default for ContentConfig {
|
|
207
|
+
fn default() -> Self {
|
|
208
|
+
Self {
|
|
209
|
+
output_format: Default::default(),
|
|
210
|
+
preprocessing_preset: Default::default(),
|
|
211
|
+
remove_navigation: Default::default(),
|
|
212
|
+
remove_forms: Default::default(),
|
|
213
|
+
strip_tags: Default::default(),
|
|
214
|
+
preserve_tags: Default::default(),
|
|
215
|
+
exclude_selectors: Default::default(),
|
|
216
|
+
skip_images: Default::default(),
|
|
217
|
+
max_depth: Default::default(),
|
|
218
|
+
wrap: Default::default(),
|
|
219
|
+
wrap_width: Default::default(),
|
|
220
|
+
include_document_structure: Default::default(),
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
impl ContentConfig {
|
|
226
|
+
fn new(
|
|
227
|
+
output_format: Option<String>,
|
|
228
|
+
preprocessing_preset: Option<String>,
|
|
229
|
+
remove_navigation: Option<bool>,
|
|
230
|
+
remove_forms: Option<bool>,
|
|
231
|
+
strip_tags: Option<Vec<String>>,
|
|
232
|
+
preserve_tags: Option<Vec<String>>,
|
|
233
|
+
exclude_selectors: Option<Vec<String>>,
|
|
234
|
+
skip_images: Option<bool>,
|
|
235
|
+
max_depth: Option<usize>,
|
|
236
|
+
wrap: Option<bool>,
|
|
237
|
+
wrap_width: Option<usize>,
|
|
238
|
+
include_document_structure: Option<bool>,
|
|
239
|
+
) -> Self {
|
|
240
|
+
Self {
|
|
241
|
+
output_format: output_format.unwrap_or("markdown".to_string()),
|
|
242
|
+
preprocessing_preset: preprocessing_preset.unwrap_or("standard".to_string()),
|
|
243
|
+
remove_navigation: remove_navigation.unwrap_or(true),
|
|
244
|
+
remove_forms: remove_forms.unwrap_or(true),
|
|
245
|
+
strip_tags: strip_tags.unwrap_or_default(),
|
|
246
|
+
preserve_tags: preserve_tags.unwrap_or_default(),
|
|
247
|
+
exclude_selectors: exclude_selectors.unwrap_or_default(),
|
|
248
|
+
skip_images: skip_images.unwrap_or(false),
|
|
249
|
+
max_depth,
|
|
250
|
+
wrap: wrap.unwrap_or(false),
|
|
251
|
+
wrap_width: wrap_width.unwrap_or(80),
|
|
252
|
+
include_document_structure: include_document_structure.unwrap_or(true),
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
fn output_format(&self) -> String {
|
|
257
|
+
self.output_format.clone()
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
fn preprocessing_preset(&self) -> String {
|
|
261
|
+
self.preprocessing_preset.clone()
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
fn remove_navigation(&self) -> bool {
|
|
265
|
+
self.remove_navigation
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
fn remove_forms(&self) -> bool {
|
|
269
|
+
self.remove_forms
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
fn strip_tags(&self) -> Vec<String> {
|
|
273
|
+
self.strip_tags.clone()
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
fn preserve_tags(&self) -> Vec<String> {
|
|
277
|
+
self.preserve_tags.clone()
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
fn exclude_selectors(&self) -> Vec<String> {
|
|
281
|
+
self.exclude_selectors.clone()
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
fn skip_images(&self) -> bool {
|
|
285
|
+
self.skip_images
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
fn max_depth(&self) -> Option<usize> {
|
|
289
|
+
self.max_depth
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
fn wrap(&self) -> bool {
|
|
293
|
+
self.wrap
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
fn wrap_width(&self) -> usize {
|
|
297
|
+
self.wrap_width
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
fn include_document_structure(&self) -> bool {
|
|
301
|
+
self.include_document_structure
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
|
|
178
305
|
#[derive(Clone, Debug, Default, serde::Serialize, serde::Deserialize)]
|
|
179
306
|
#[magnus::wrap(class = "Kreuzcrawl::BrowserConfig")]
|
|
180
307
|
#[serde(default)]
|
|
@@ -263,8 +390,8 @@ pub struct CrawlConfig {
|
|
|
263
390
|
pub cookies_enabled: bool,
|
|
264
391
|
pub auth: Option<AuthConfig>,
|
|
265
392
|
pub max_body_size: Option<usize>,
|
|
266
|
-
pub main_content_only: bool,
|
|
267
393
|
pub remove_tags: Vec<String>,
|
|
394
|
+
pub content: ContentConfig,
|
|
268
395
|
pub map_limit: Option<usize>,
|
|
269
396
|
pub map_search: Option<String>,
|
|
270
397
|
pub download_assets: bool,
|
|
@@ -361,14 +488,14 @@ impl CrawlConfig {
|
|
|
361
488
|
max_body_size: kwargs
|
|
362
489
|
.get(ruby.to_symbol("max_body_size"))
|
|
363
490
|
.and_then(|v| usize::try_convert(v).ok()),
|
|
364
|
-
main_content_only: kwargs
|
|
365
|
-
.get(ruby.to_symbol("main_content_only"))
|
|
366
|
-
.and_then(|v| bool::try_convert(v).ok())
|
|
367
|
-
.unwrap_or(false),
|
|
368
491
|
remove_tags: kwargs
|
|
369
492
|
.get(ruby.to_symbol("remove_tags"))
|
|
370
493
|
.and_then(|v| <Vec<String>>::try_convert(v).ok())
|
|
371
494
|
.unwrap_or_default(),
|
|
495
|
+
content: kwargs
|
|
496
|
+
.get(ruby.to_symbol("content"))
|
|
497
|
+
.and_then(|v| ContentConfig::try_convert(v).ok())
|
|
498
|
+
.unwrap_or_default(),
|
|
372
499
|
map_limit: kwargs
|
|
373
500
|
.get(ruby.to_symbol("map_limit"))
|
|
374
501
|
.and_then(|v| usize::try_convert(v).ok()),
|
|
@@ -497,14 +624,14 @@ impl CrawlConfig {
|
|
|
497
624
|
self.max_body_size
|
|
498
625
|
}
|
|
499
626
|
|
|
500
|
-
fn main_content_only(&self) -> bool {
|
|
501
|
-
self.main_content_only
|
|
502
|
-
}
|
|
503
|
-
|
|
504
627
|
fn remove_tags(&self) -> Vec<String> {
|
|
505
628
|
self.remove_tags.clone()
|
|
506
629
|
}
|
|
507
630
|
|
|
631
|
+
fn content(&self) -> ContentConfig {
|
|
632
|
+
self.content.clone()
|
|
633
|
+
}
|
|
634
|
+
|
|
508
635
|
fn map_limit(&self) -> Option<usize> {
|
|
509
636
|
self.map_limit
|
|
510
637
|
}
|
|
@@ -586,8 +713,8 @@ impl CrawlConfig {
|
|
|
586
713
|
cookies_enabled: self.cookies_enabled,
|
|
587
714
|
auth: self.auth.clone().map(Into::into),
|
|
588
715
|
max_body_size: self.max_body_size,
|
|
589
|
-
main_content_only: self.main_content_only,
|
|
590
716
|
remove_tags: self.remove_tags.clone(),
|
|
717
|
+
content: self.content.clone().into(),
|
|
591
718
|
map_limit: self.map_limit,
|
|
592
719
|
map_search: self.map_search.clone(),
|
|
593
720
|
download_assets: self.download_assets,
|
|
@@ -723,7 +850,6 @@ pub struct ScrapeResult {
|
|
|
723
850
|
pub is_pdf: bool,
|
|
724
851
|
pub was_skipped: bool,
|
|
725
852
|
pub detected_charset: Option<String>,
|
|
726
|
-
pub main_content_only: bool,
|
|
727
853
|
pub auth_header_sent: bool,
|
|
728
854
|
pub response_meta: Option<ResponseMeta>,
|
|
729
855
|
pub assets: Vec<DownloadedAsset>,
|
|
@@ -766,7 +892,6 @@ impl Default for ScrapeResult {
|
|
|
766
892
|
is_pdf: Default::default(),
|
|
767
893
|
was_skipped: Default::default(),
|
|
768
894
|
detected_charset: Default::default(),
|
|
769
|
-
main_content_only: Default::default(),
|
|
770
895
|
auth_header_sent: Default::default(),
|
|
771
896
|
response_meta: Default::default(),
|
|
772
897
|
assets: Default::default(),
|
|
@@ -850,10 +975,6 @@ impl ScrapeResult {
|
|
|
850
975
|
detected_charset: kwargs
|
|
851
976
|
.get(ruby.to_symbol("detected_charset"))
|
|
852
977
|
.and_then(|v| String::try_convert(v).ok()),
|
|
853
|
-
main_content_only: kwargs
|
|
854
|
-
.get(ruby.to_symbol("main_content_only"))
|
|
855
|
-
.and_then(|v| bool::try_convert(v).ok())
|
|
856
|
-
.unwrap_or_default(),
|
|
857
978
|
auth_header_sent: kwargs
|
|
858
979
|
.get(ruby.to_symbol("auth_header_sent"))
|
|
859
980
|
.and_then(|v| bool::try_convert(v).ok())
|
|
@@ -959,10 +1080,6 @@ impl ScrapeResult {
|
|
|
959
1080
|
self.detected_charset.clone()
|
|
960
1081
|
}
|
|
961
1082
|
|
|
962
|
-
fn main_content_only(&self) -> bool {
|
|
963
|
-
self.main_content_only
|
|
964
|
-
}
|
|
965
|
-
|
|
966
1083
|
fn auth_header_sent(&self) -> bool {
|
|
967
1084
|
self.auth_header_sent
|
|
968
1085
|
}
|
|
@@ -3314,6 +3431,44 @@ impl From<kreuzcrawl::ProxyConfig> for ProxyConfig {
|
|
|
3314
3431
|
}
|
|
3315
3432
|
}
|
|
3316
3433
|
|
|
3434
|
+
impl From<ContentConfig> for kreuzcrawl::ContentConfig {
|
|
3435
|
+
fn from(val: ContentConfig) -> Self {
|
|
3436
|
+
Self {
|
|
3437
|
+
output_format: val.output_format,
|
|
3438
|
+
preprocessing_preset: val.preprocessing_preset,
|
|
3439
|
+
remove_navigation: val.remove_navigation,
|
|
3440
|
+
remove_forms: val.remove_forms,
|
|
3441
|
+
strip_tags: val.strip_tags,
|
|
3442
|
+
preserve_tags: val.preserve_tags,
|
|
3443
|
+
exclude_selectors: val.exclude_selectors,
|
|
3444
|
+
skip_images: val.skip_images,
|
|
3445
|
+
max_depth: val.max_depth,
|
|
3446
|
+
wrap: val.wrap,
|
|
3447
|
+
wrap_width: val.wrap_width,
|
|
3448
|
+
include_document_structure: val.include_document_structure,
|
|
3449
|
+
}
|
|
3450
|
+
}
|
|
3451
|
+
}
|
|
3452
|
+
|
|
3453
|
+
impl From<kreuzcrawl::ContentConfig> for ContentConfig {
|
|
3454
|
+
fn from(val: kreuzcrawl::ContentConfig) -> Self {
|
|
3455
|
+
Self {
|
|
3456
|
+
output_format: val.output_format,
|
|
3457
|
+
preprocessing_preset: val.preprocessing_preset,
|
|
3458
|
+
remove_navigation: val.remove_navigation,
|
|
3459
|
+
remove_forms: val.remove_forms,
|
|
3460
|
+
strip_tags: val.strip_tags,
|
|
3461
|
+
preserve_tags: val.preserve_tags,
|
|
3462
|
+
exclude_selectors: val.exclude_selectors,
|
|
3463
|
+
skip_images: val.skip_images,
|
|
3464
|
+
max_depth: val.max_depth,
|
|
3465
|
+
wrap: val.wrap,
|
|
3466
|
+
wrap_width: val.wrap_width,
|
|
3467
|
+
include_document_structure: val.include_document_structure,
|
|
3468
|
+
}
|
|
3469
|
+
}
|
|
3470
|
+
}
|
|
3471
|
+
|
|
3317
3472
|
impl From<BrowserConfig> for kreuzcrawl::BrowserConfig {
|
|
3318
3473
|
fn from(val: BrowserConfig) -> Self {
|
|
3319
3474
|
Self {
|
|
@@ -3362,8 +3517,8 @@ impl From<CrawlConfig> for kreuzcrawl::CrawlConfig {
|
|
|
3362
3517
|
cookies_enabled: val.cookies_enabled,
|
|
3363
3518
|
auth: val.auth.map(Into::into),
|
|
3364
3519
|
max_body_size: val.max_body_size,
|
|
3365
|
-
main_content_only: val.main_content_only,
|
|
3366
3520
|
remove_tags: val.remove_tags,
|
|
3521
|
+
content: val.content.into(),
|
|
3367
3522
|
map_limit: val.map_limit,
|
|
3368
3523
|
map_search: val.map_search,
|
|
3369
3524
|
download_assets: val.download_assets,
|
|
@@ -3405,8 +3560,8 @@ impl From<kreuzcrawl::CrawlConfig> for CrawlConfig {
|
|
|
3405
3560
|
cookies_enabled: val.cookies_enabled,
|
|
3406
3561
|
auth: val.auth.map(Into::into),
|
|
3407
3562
|
max_body_size: val.max_body_size,
|
|
3408
|
-
main_content_only: val.main_content_only,
|
|
3409
3563
|
remove_tags: val.remove_tags,
|
|
3564
|
+
content: val.content.into(),
|
|
3410
3565
|
map_limit: val.map_limit,
|
|
3411
3566
|
map_search: val.map_search,
|
|
3412
3567
|
download_assets: val.download_assets,
|
|
@@ -3478,7 +3633,6 @@ impl From<ScrapeResult> for kreuzcrawl::ScrapeResult {
|
|
|
3478
3633
|
is_pdf: val.is_pdf,
|
|
3479
3634
|
was_skipped: val.was_skipped,
|
|
3480
3635
|
detected_charset: val.detected_charset,
|
|
3481
|
-
main_content_only: val.main_content_only,
|
|
3482
3636
|
auth_header_sent: val.auth_header_sent,
|
|
3483
3637
|
response_meta: val.response_meta.map(Into::into),
|
|
3484
3638
|
assets: val.assets.into_iter().map(Into::into).collect(),
|
|
@@ -3513,7 +3667,6 @@ impl From<kreuzcrawl::ScrapeResult> for ScrapeResult {
|
|
|
3513
3667
|
is_pdf: val.is_pdf,
|
|
3514
3668
|
was_skipped: val.was_skipped,
|
|
3515
3669
|
detected_charset: val.detected_charset,
|
|
3516
|
-
main_content_only: val.main_content_only,
|
|
3517
3670
|
auth_header_sent: val.auth_header_sent,
|
|
3518
3671
|
response_meta: val.response_meta.map(Into::into),
|
|
3519
3672
|
assets: val.assets.into_iter().map(Into::into).collect(),
|
|
@@ -4288,6 +4441,24 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
|
4288
4441
|
class.define_method("username", method!(ProxyConfig::username, 0))?;
|
|
4289
4442
|
class.define_method("password", method!(ProxyConfig::password, 0))?;
|
|
4290
4443
|
|
|
4444
|
+
let class = module.define_class("ContentConfig", ruby.class_object())?;
|
|
4445
|
+
class.define_singleton_method("new", function!(ContentConfig::new, 12))?;
|
|
4446
|
+
class.define_method("output_format", method!(ContentConfig::output_format, 0))?;
|
|
4447
|
+
class.define_method("preprocessing_preset", method!(ContentConfig::preprocessing_preset, 0))?;
|
|
4448
|
+
class.define_method("remove_navigation", method!(ContentConfig::remove_navigation, 0))?;
|
|
4449
|
+
class.define_method("remove_forms", method!(ContentConfig::remove_forms, 0))?;
|
|
4450
|
+
class.define_method("strip_tags", method!(ContentConfig::strip_tags, 0))?;
|
|
4451
|
+
class.define_method("preserve_tags", method!(ContentConfig::preserve_tags, 0))?;
|
|
4452
|
+
class.define_method("exclude_selectors", method!(ContentConfig::exclude_selectors, 0))?;
|
|
4453
|
+
class.define_method("skip_images", method!(ContentConfig::skip_images, 0))?;
|
|
4454
|
+
class.define_method("max_depth", method!(ContentConfig::max_depth, 0))?;
|
|
4455
|
+
class.define_method("wrap", method!(ContentConfig::wrap, 0))?;
|
|
4456
|
+
class.define_method("wrap_width", method!(ContentConfig::wrap_width, 0))?;
|
|
4457
|
+
class.define_method(
|
|
4458
|
+
"include_document_structure",
|
|
4459
|
+
method!(ContentConfig::include_document_structure, 0),
|
|
4460
|
+
)?;
|
|
4461
|
+
|
|
4291
4462
|
let class = module.define_class("BrowserConfig", ruby.class_object())?;
|
|
4292
4463
|
class.define_singleton_method("new", function!(BrowserConfig::new, 6))?;
|
|
4293
4464
|
class.define_method("mode", method!(BrowserConfig::mode, 0))?;
|
|
@@ -4317,8 +4488,8 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
|
4317
4488
|
class.define_method("cookies_enabled", method!(CrawlConfig::cookies_enabled, 0))?;
|
|
4318
4489
|
class.define_method("auth", method!(CrawlConfig::auth, 0))?;
|
|
4319
4490
|
class.define_method("max_body_size", method!(CrawlConfig::max_body_size, 0))?;
|
|
4320
|
-
class.define_method("main_content_only", method!(CrawlConfig::main_content_only, 0))?;
|
|
4321
4491
|
class.define_method("remove_tags", method!(CrawlConfig::remove_tags, 0))?;
|
|
4492
|
+
class.define_method("content", method!(CrawlConfig::content, 0))?;
|
|
4322
4493
|
class.define_method("map_limit", method!(CrawlConfig::map_limit, 0))?;
|
|
4323
4494
|
class.define_method("map_search", method!(CrawlConfig::map_search, 0))?;
|
|
4324
4495
|
class.define_method("download_assets", method!(CrawlConfig::download_assets, 0))?;
|
|
@@ -4365,7 +4536,6 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
|
4365
4536
|
class.define_method("is_pdf", method!(ScrapeResult::is_pdf, 0))?;
|
|
4366
4537
|
class.define_method("was_skipped", method!(ScrapeResult::was_skipped, 0))?;
|
|
4367
4538
|
class.define_method("detected_charset", method!(ScrapeResult::detected_charset, 0))?;
|
|
4368
|
-
class.define_method("main_content_only", method!(ScrapeResult::main_content_only, 0))?;
|
|
4369
4539
|
class.define_method("auth_header_sent", method!(ScrapeResult::auth_header_sent, 0))?;
|
|
4370
4540
|
class.define_method("response_meta", method!(ScrapeResult::response_meta, 0))?;
|
|
4371
4541
|
class.define_method("assets", method!(ScrapeResult::assets, 0))?;
|