html2rss 0.15.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +112 -44
  3. data/html2rss.gemspec +3 -2
  4. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +8 -1
  5. data/lib/html2rss/auto_source/article.rb +37 -5
  6. data/lib/html2rss/auto_source/channel.rb +21 -28
  7. data/lib/html2rss/auto_source/cleanup.rb +0 -16
  8. data/lib/html2rss/auto_source/rss_builder.rb +1 -1
  9. data/lib/html2rss/auto_source/scraper/html.rb +21 -12
  10. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +34 -0
  11. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +25 -0
  12. data/lib/html2rss/auto_source/scraper/schema/thing.rb +104 -0
  13. data/lib/html2rss/auto_source/scraper/schema.rb +22 -34
  14. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +41 -41
  15. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +6 -6
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +3 -2
  17. data/lib/html2rss/auto_source.rb +0 -7
  18. data/lib/html2rss/cli.rb +11 -4
  19. data/lib/html2rss/config/channel.rb +7 -1
  20. data/lib/html2rss/config/selectors.rb +2 -1
  21. data/lib/html2rss/config.rb +1 -0
  22. data/lib/html2rss/item.rb +7 -2
  23. data/lib/html2rss/request_service/browserless_strategy.rb +53 -0
  24. data/lib/html2rss/request_service/context.rb +46 -0
  25. data/lib/html2rss/request_service/faraday_strategy.rb +24 -0
  26. data/lib/html2rss/request_service/puppet_commander.rb +61 -0
  27. data/lib/html2rss/request_service/response.rb +27 -0
  28. data/lib/html2rss/request_service/strategy.rb +28 -0
  29. data/lib/html2rss/request_service.rb +97 -0
  30. data/lib/html2rss/rss_builder/stylesheet.rb +7 -0
  31. data/lib/html2rss/utils.rb +23 -26
  32. data/lib/html2rss/version.rb +1 -1
  33. data/lib/html2rss.rb +5 -5
  34. metadata +31 -11
  35. data/lib/html2rss/auto_source/scraper/schema/base.rb +0 -61
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d89191b35f643372cc18b880dab7535d18a10d9fd123897460ee16c5e990a5d9
4
- data.tar.gz: 71cb356f5261b2e6a3d2152afcb68f658e78d5fec5ff15bc67ed0d5bd153fc00
3
+ metadata.gz: bb6b3eb69655bdbb4511f74db9e1bcc766a98aa55d7afc2561a176c6973bda4f
4
+ data.tar.gz: 45193122489ba965b489c981696f71508030dc80f156e5b0f077932fc55caec3
5
5
  SHA512:
6
- metadata.gz: 46f048feae342844df1af51c741d681677192c1dc84452fae1002f5cca5b406c0698a426ec6e532572c4fb4f6fb896a966862d8d2599b8dd742a174707289aed
7
- data.tar.gz: 98d0316c64bb5a160d26d5efa59b25901b3a64e572795bbd840539fe69d84a4ea3c797bb16721edb73277d1b9bfb9238f9d40ea2b9bb4ebeffc81e8790a02062
6
+ metadata.gz: 294529cd8cb1d289e969f94c32757656d12c864a92c438fceb73ba9dbddd85cca822b146898cdeff928aeefcba75652b91c0a56ded66241bcf23014fea299196
7
+ data.tar.gz: a2b50e52a7f6ad7768a7092fcdf04c7000dd43d190ee12461f6773fe4b324e43be7b411c7aff69b3af1266aeca3fbb4678e5c43e3bac2b0c2a49d86589869b38
data/README.md CHANGED
@@ -10,21 +10,11 @@ With the _feed config_, you provide a URL to scrape and CSS selectors for extrac
10
10
 
11
11
  Support the development by sponsoring this project on GitHub. Thank you! 💓
12
12
 
13
- ## Installation
14
-
15
- | Install | `gem install html2rss` |
16
- | ------- | ---------------------- |
17
- | Usage | `html2rss help` |
13
+ ## Generating a feed on the CLI
18
14
 
19
- You can also install it as a dependency in your Ruby project:
15
+ [Install Ruby](https://www.ruby-lang.org/en/documentation/installation/) (latest version is recommended) on your machine and run `gem install html2rss` in your terminal.
20
16
 
21
- | 🤩 Like it? | Star it! ⭐️ |
22
- | -------------------------------: | -------------------- |
23
- | Add this line to your `Gemfile`: | `gem 'html2rss'` |
24
- | Then execute: | `bundle` |
25
- | In your code: | `require 'html2rss'` |
26
-
27
- ## Generating a feed on the CLI
17
+ After the installation has finished, `html2rss help` will print usage information.
28
18
 
29
19
  ### using automatic generation
30
20
 
@@ -59,6 +49,14 @@ Build the feed from this config with: `html2rss feed ./my_config_file.yml`.
59
49
 
60
50
  ## Generating a feed with Ruby
61
51
 
52
+ You can also install it as a dependency in your Ruby project:
53
+
54
+ | 🤩 Like it? | Star it! ⭐️ |
55
+ | -------------------------------: | -------------------- |
56
+ | Add this line to your `Gemfile`: | `gem 'html2rss'` |
57
+ | Then execute: | `bundle` |
58
+ | In your code: | `require 'html2rss'` |
59
+
62
60
  Here's a minimal working example using Ruby:
63
61
 
64
62
  ```ruby
@@ -117,7 +115,7 @@ channel:
117
115
  Command line usage example:
118
116
 
119
117
  ```sh
120
- bundle exec html2rss feed the_feed_config.yml id=42
118
+ html2rss feed the_feed_config.yml id=42
121
119
  ```
122
120
 
123
121
  <details><summary>See a Ruby example</summary>
@@ -154,9 +152,9 @@ Your `selectors` hash can contain arbitrary named selectors, but only a few will
154
152
  | `comments` | `comments` | A URL. |
155
153
  | `source` | ~~source~~ | Not yet supported. |
156
154
 
157
- ### The `selector` hash
155
+ ### Build RSS 2.0 item attributes by specifying selectors
158
156
 
159
- Every named selector in your `selectors` hash can have these attributes:
157
 + Every named selector (e.g. `title`, `description`, see table above) in your `selectors` hash can have these attributes:
160
158
 
161
159
  | name | value |
162
160
  | -------------- | -------------------------------------------------------- |
@@ -164,7 +162,7 @@ Every named selector in your `selectors` hash can have these attributes:
164
162
  | `extractor` | Name of the extractor. See notes below. |
165
163
  | `post_process` | A hash or array of hashes. See notes below. |
166
164
 
167
- ## Using extractors
165
+ #### Using extractors
168
166
 
169
167
  Extractors help with extracting the information from the selected HTML tag.
170
168
 
@@ -201,7 +199,7 @@ selectors:
201
199
 
202
200
  </details>
203
201
 
204
- ## Using post processors
202
+ ### Using post processors
205
203
 
206
204
  Extracted information can be further manipulated with post processors.
207
205
 
@@ -218,7 +216,7 @@ Extracted information can be further manipulated with post processors.
218
216
 
219
217
  ⚠️ Always make use of the `sanitize_html` post processor for HTML content. _Never trust the internet!_ ⚠️
220
218
 
221
- ### Chaining post processors
219
+ #### Chaining post processors
222
220
 
223
221
  Pass an array to `post_process` to chain the post processors.
224
222
 
@@ -244,14 +242,14 @@ selectors:
244
242
 
245
243
  </details>
246
244
 
247
- ### Post processor `gsub`
245
+ ##### Post processor `gsub`
248
246
 
249
247
  The post processor `gsub` makes use of Ruby's [`gsub`](https://apidock.com/ruby/String/gsub) method.
250
248
 
251
- | key | type | required | note |
252
- | ------------- | ------ | -------- | --------------------------- |
253
- | `pattern` | String | yes | Can be Regexp or String. |
254
- | `replacement` | String | yes | Can be a [backreference](). |
249
+ | key | type | required | note |
250
+ | ------------- | ------ | -------- | ------------------------ |
251
+ | `pattern` | String | yes | Can be Regexp or String. |
252
+ | `replacement` | String | yes | Can be a backreference. |
255
253
 
256
254
  <details><summary>See a Ruby example</summary>
257
255
 
@@ -283,7 +281,7 @@ selectors:
283
281
 
284
282
  </details>
285
283
 
286
- ## Adding `<category>` tags to an item
284
+ #### Adding `<category>` tags to an item
287
285
 
288
286
  The `categories` selector takes an array of selector names. Each value of those
289
287
  selectors will become a `<category>` on the RSS item.
@@ -326,7 +324,7 @@ selectors:
326
324
 
327
325
  </details>
328
326
 
329
- ## Custom item GUID
327
+ #### Custom item GUID
330
328
 
331
329
  By default, html2rss generates a GUID from the `title` or `description`.
332
330
 
@@ -371,7 +369,7 @@ selectors:
371
369
 
372
370
  </details>
373
371
 
374
- ## Adding an `<enclosure>` tag to an item
372
+ #### Adding an `<enclosure>` tag to an item
375
373
 
376
374
  An enclosure can be any file, e.g. a image, audio or video - think Podcast.
377
375
 
@@ -379,7 +377,7 @@ The `enclosure` selector needs to return a URL of the content to enclose. If the
379
377
 
380
378
  Since `html2rss` does no further inspection of the enclosure, its support comes with trade-offs:
381
379
 
382
- 1. The content-type is guessed from the file extension of the URL.
380
+ 1. The content-type is guessed from the file extension of the URL, unless one is specified in `content_type`.
383
381
  2. If the content-type guessing fails, it will default to `application/octet-stream`.
384
382
  3. The content-length will always be undetermined and therefore stated as `0` bytes.
385
383
 
@@ -392,7 +390,12 @@ Read the [RSS 2.0 spec](http://www.rssboard.org/rss-profile#element-channel-item
392
390
  Html2rss.feed(
393
391
  channel: {},
394
392
  selectors: {
395
- enclosure: { selector: 'audio', extractor: 'attribute', attribute: 'src' }
393
+ enclosure: {
394
+ selector: 'audio',
395
+ extractor: 'attribute',
396
+ attribute: 'src',
397
+ content_type: 'audio/mp3'
398
+ }
396
399
  }
397
400
  )
398
401
  ```
@@ -411,17 +414,16 @@ selectors:
411
414
  selector: "audio"
412
415
  extractor: "attribute"
413
416
  attribute: "src"
417
+ content_type: "audio/mp3"
414
418
  ```
415
419
 
416
420
  </details>
421
+
417
422
  ## Scraping and handling JSON responses
418
423
 
419
- By default, `html2rss` assumes the URL responds with HTML. However, it can also handle JSON responses. The JSON must return an Array or Hash.
424
+ By default, `html2rss` assumes the URL responds with HTML. However, it can also handle JSON responses. The JSON response must be an Array or Hash.
420
425
 
421
- | key | required | default | note |
422
- | ---------- | -------- | ------- | ---------------------------------------------------- |
423
- | `json` | optional | false | If set to `true`, the response is parsed as JSON. |
424
- | `jsonpath` | optional | $ | Use [JSONPath syntax]() to select nodes of interest. |
426
+ The JSON is converted to XML which you can query using CSS selectors.
425
427
 
426
428
  <details><summary>See a Ruby example</summary>
427
429
 
@@ -447,7 +449,73 @@ selectors:
447
449
 
448
450
  </details>
449
451
 
450
- ## Set any HTTP header in the request
452
+ ## Customization of how requests to the channel URL are sent
453
+
454
 + By default, html2rss issues a naive HTTP request and extracts information from the response. That is performant and works for many websites.
455
+
456
+ However, modern websites often do not render much HTML on the server, but evaluate JavaScript on the client to create the HTML. In such cases, the default strategy will not find the "juicy content".
457
+
458
+ ### Use Browserless.io
459
+
460
+ You can use _Browserless.io_ to run a Chrome browser and return the website's source code after the website generated it.
461
+ For this, you can either run your own Browserless.io instance (Docker image available -- [read their license](https://github.com/browserless/browserless/pkgs/container/chromium#licensing)!) or pay them for a hosted instance.
462
+
463
+ To run a local Browserless.io instance, you can use the following Docker command:
464
+
465
+ ```sh
466
+ docker run \
467
+ --rm \
468
+ -p 3000:3000 \
469
+ -e "CONCURRENT=10" \
470
+ -e "TOKEN=6R0W53R135510" \
471
+ ghcr.io/browserless/chromium
472
+ ```
473
+
474
+ To make html2rss use your instance,
475
+
476
+ 1. specify the environment variables accordingly, and
477
+ 2. use the `browserless` strategy for those websites.
478
+
479
+ When running locally with commands from above, you can skip setting the environment variables, as they are aligned with the default values.
480
+
481
+ ```sh
482
+ BROWSERLESS_IO_WEBSOCKET_URL="ws://127.0.0.1:3000" BROWSERLESS_IO_API_TOKEN="6R0W53R135510" \
483
+ html2rss auto --strategy=browserless https://example.com
484
+ ```
485
+
486
+ When using traditional feed configs, inside your channel config set `strategy: browserless`.
487
+
488
+ <details><summary>See a YAML feed config example</summary>
489
+
490
+ ```yml
491
+ channel:
492
+ url: https://www.imdb.com/user/ur67728460/ratings
493
+ time_zone: UTC
494
+ ttl: 1440
495
+ strategy: browserless
496
+ headers:
497
+ User-Agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
498
+ selectors:
499
+ items:
500
+ selector: "li.ipc-metadata-list-summary-item"
501
+ title:
502
+ selector: ".ipc-title__text"
503
+ post_process:
504
+ - name: gsub
505
+ pattern: "/^(\\d+.)\\s/"
506
+ replacement: ""
507
+ - name: template
508
+ string: "%{self} rated with: %{user_rating}"
509
+ link:
510
+ selector: "a.ipc-title-link-wrapper"
511
+ extractor: "href"
512
+ user_rating:
513
+ selector: "[data-testid='ratingGroup--other-user-rating'] > .ipc-rating-star--rating"
514
+ ```
515
+
516
+ </details>
517
+
518
+ ### Set any HTTP header in the request
451
519
 
452
520
  To set HTTP request headers, you can add them to the channel's `headers` hash. This is useful for APIs that require an Authorization header.
453
521
 
@@ -595,24 +663,24 @@ Recommended further readings:
595
663
  - Fiddling with [`curl`](https://github.com/curl/curl) and [`pup`](https://github.com/ericchiang/pup) to find the selectors seems efficient (`curl URL | pup`).
596
664
  - [CSS selectors are versatile. Here's an overview.](https://www.w3.org/TR/selectors-4/#overview)
597
665
 
598
- ### Contributing
666
+ ## Contributing
599
667
 
600
668
  Find ideas what to contribute in:
601
669
 
602
670
  1. <https://github.com/orgs/html2rss/discussions>
603
671
  2. the issues tracker: <https://github.com/html2rss/html2rss/issues>
604
672
 
605
- #### Development Helpers
606
-
607
- 1. `bin/setup`: installs dependencies and sets up the development environment.
608
- 2. `bin/guard`: automatically runs rspec, rubocop and reek when a file changes.
609
- 3. for a modern Ruby development experience: install [`ruby-lsp`](https://github.com/Shopify/ruby-lsp) and integrate it to your IDE:
610
- a. [Ruby in Visual Studio Code](https://code.visualstudio.com/docs/languages/ruby)
611
-
612
- #### How to submit changes
673
+ To submit changes:
613
674
 
614
675
  1. Fork this repo ( <https://github.com/html2rss/html2rss/fork> )
615
676
  2. Create your feature branch (`git checkout -b my-new-feature`)
616
677
  3. Implement and commit your changes (`git commit -am 'feat: add XYZ'`)
617
678
  4. Push to the branch (`git push origin my-new-feature`)
618
679
  5. Create a new Pull Request using the Github web UI
680
+
681
+ ## Development Helpers
682
+
683
+ 1. `bin/setup`: installs dependencies and sets up the development environment.
684
 + 2. for a modern Ruby development experience: install [`ruby-lsp`](https://github.com/Shopify/ruby-lsp) and integrate it into your IDE.
685
+
686
+ For example: [Ruby in Visual Studio Code](https://code.visualstudio.com/docs/languages/ruby).
data/html2rss.gemspec CHANGED
@@ -14,7 +14,7 @@ Gem::Specification.new do |spec|
14
14
  spec.description = 'Supports JSON content, custom HTTP headers, and post-processing of extracted content.'
15
15
  spec.homepage = 'https://github.com/html2rss/html2rss'
16
16
  spec.license = 'MIT'
17
- spec.required_ruby_version = '>= 3.1'
17
+ spec.required_ruby_version = '>= 3.2'
18
18
 
19
19
  if spec.respond_to?(:metadata)
20
20
  spec.metadata['allowed_push_host'] = 'https://rubygems.org'
@@ -39,8 +39,9 @@ Gem::Specification.new do |spec|
39
39
  spec.add_dependency 'mime-types', '> 3.0'
40
40
  spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
41
41
  spec.add_dependency 'parallel'
42
+ spec.add_dependency 'puppeteer-ruby'
42
43
  spec.add_dependency 'regexp_parser'
43
- spec.add_dependency 'reverse_markdown', '~> 2.0'
44
+ spec.add_dependency 'reverse_markdown', '~> 3.0'
44
45
  spec.add_dependency 'rss'
45
46
  spec.add_dependency 'sanitize', '~> 6.0'
46
47
  spec.add_dependency 'thor'
@@ -77,10 +77,17 @@ module Html2rss
77
77
  )
78
78
  end
79
79
 
80
+ ##
81
+ # @return [Hash]
82
+ # @see https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referrer-Policy
80
83
  def add_attributes
81
84
  {
82
85
  'a' => { 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank' },
83
- 'img' => { 'referrer-policy' => 'no-referrer' }
86
+ 'area' => { 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank' },
87
+ 'img' => { 'referrerpolicy' => 'no-referrer' },
88
+ 'iframe' => { 'referrerpolicy' => 'no-referrer' },
89
+ 'video' => { 'referrerpolicy' => 'no-referrer' },
90
+ 'audio' => { 'referrerpolicy' => 'no-referrer' }
84
91
  }
85
92
  end
86
93
 
@@ -2,6 +2,7 @@
2
2
 
3
3
  require 'zlib'
4
4
  require 'sanitize'
5
+ require 'nokogiri'
5
6
 
6
7
  module Html2rss
7
8
  class AutoSource
@@ -14,6 +15,31 @@ module Html2rss
14
15
 
15
16
  PROVIDED_KEYS = %i[id title description url image guid published_at scraper].freeze
16
17
 
18
+ ##
19
+ # Removes the specified pattern from the beginning of the text
20
+ # within a given range if the pattern occurs before the range's end.
21
+ #
22
+ # @param text [String]
23
+ # @param pattern [String]
24
+ # @param end_of_range [Integer] - Optional, defaults to half the size of the text
25
+ # @return [String]
26
+ def self.remove_pattern_from_start(text, pattern, end_of_range: (text.size * 0.5).to_i)
27
+ return text unless text.is_a?(String) && pattern.is_a?(String)
28
+
29
+ index = text.index(pattern)
30
+ return text if index.nil? || index >= end_of_range
31
+
32
+ text.gsub(/^(.{0,#{end_of_range}})#{Regexp.escape(pattern)}/, '\1')
33
+ end
34
+
35
+ ##
36
+ # Checks if the text contains HTML tags.
37
+ # @param text [String]
38
+ # @return [Boolean]
39
+ def self.contains_html?(text)
40
+ Nokogiri::HTML.fragment(text).children.any?(&:element?)
41
+ end
42
+
17
43
  # @param options [Hash<Symbol, String>]
18
44
  def initialize(**options)
19
45
  @to_h = {}
@@ -50,9 +76,15 @@ module Html2rss
50
76
  def description
51
77
  return @description if defined?(@description)
52
78
 
53
- return if url.to_s.empty? || @to_h[:description].to_s.empty?
79
+ return if (description = @to_h[:description]).to_s.empty?
80
+
81
+ @description = self.class.remove_pattern_from_start(description, title) if title
54
82
 
55
- @description ||= Html2rss::AttributePostProcessors::SanitizeHtml.get(@to_h[:description], url)
83
+ if self.class.contains_html?(@description) && url
84
+ @description = Html2rss::AttributePostProcessors::SanitizeHtml.get(description, url)
85
+ else
86
+ @description
87
+ end
56
88
  end
57
89
 
58
90
  # @return [Addressable::URI, nil]
@@ -72,11 +104,11 @@ module Html2rss
72
104
  end
73
105
 
74
106
  # Parses and returns the published_at time.
75
- # @return [Time, nil]
107
+ # @return [DateTime, nil]
76
108
  def published_at
77
- return if (string = @to_h[:published_at].to_s).strip.empty?
109
+ return if (string = @to_h[:published_at].to_s.strip).empty?
78
110
 
79
- @published_at ||= Time.parse(string)
111
+ @published_at ||= DateTime.parse(string)
80
112
  rescue ArgumentError
81
113
  nil
82
114
  end
@@ -24,52 +24,45 @@ module Html2rss
24
24
  attr_writer :articles
25
25
  attr_reader :stylesheets
26
26
 
27
- def url = extract_url
28
- def title = extract_title
29
- def language = extract_language
30
- def description = extract_description
31
- def image = extract_image
32
- def ttl = extract_ttl
33
- def last_build_date = headers['last-modified']
34
-
35
- def generator
36
- "html2rss V. #{::Html2rss::VERSION} (using auto_source scrapers: #{scraper_counts})"
27
+ def url = @url.normalize.to_s
28
+
29
+ def title
30
+ @title ||= if (title = parsed_body.at_css('head > title')&.text.to_s) && !title.empty?
31
+ title.gsub(/\s+/, ' ').strip
32
+ else
33
+ Utils.titleized_channel_url(@url)
34
+ end
37
35
  end
38
36
 
39
- private
40
-
41
- attr_reader :parsed_body, :headers
42
-
43
- def extract_url
44
- @url.normalize.to_s
45
- end
46
-
47
- def extract_title
48
- parsed_body.at_css('head > title')&.text
49
- end
37
+ def description = parsed_body.at_css('meta[name="description"]')&.[]('content')
38
+ def last_build_date = headers['last-modified']
50
39
 
51
- def extract_language
40
+ def language
52
41
  return parsed_body['lang'] if parsed_body.name == 'html' && parsed_body['lang']
53
42
 
54
43
  parsed_body.at_css('[lang]')&.[]('lang')
55
44
  end
56
45
 
57
- def extract_description
58
- parsed_body.at_css('meta[name="description"]')&.[]('content') || ''
59
- end
60
-
61
- def extract_image
46
+ def image
62
47
  url = parsed_body.at_css('meta[property="og:image"]')&.[]('content')
63
48
  Html2rss::Utils.sanitize_url(url) if url
64
49
  end
65
50
 
66
- def extract_ttl
51
+ def ttl
67
52
  ttl = headers['cache-control']&.match(/max-age=(\d+)/)&.[](1)
68
53
  return unless ttl
69
54
 
70
55
  ttl.to_i.fdiv(60).ceil
71
56
  end
72
57
 
58
+ def generator
59
+ "html2rss V. #{::Html2rss::VERSION} (using auto_source scrapers: #{scraper_counts})"
60
+ end
61
+
62
+ private
63
+
64
+ attr_reader :parsed_body, :headers
65
+
73
66
  def scraper_counts
74
67
  scraper_counts = +''
75
68
 
@@ -13,10 +13,7 @@ module Html2rss
13
13
 
14
14
  articles.select!(&:valid?)
15
15
 
16
- remove_short!(articles, :title)
17
-
18
16
  deduplicate_by!(articles, :url)
19
- deduplicate_by!(articles, :title)
20
17
 
21
18
  keep_only_http_urls!(articles)
22
19
  reject_different_domain!(articles, url) unless keep_different_domain
@@ -27,19 +24,6 @@ module Html2rss
27
24
 
28
25
  private
29
26
 
30
- ##
31
- # Removes articles with short values for a given key.
32
- #
33
- # @param articles [Array<Article>] The list of articles to process.
34
- # @param key [Symbol] The key to check for short values.
35
- # @param min_words [Integer] The minimum number of words required.
36
- def remove_short!(articles, key = :title, min_words: 2)
37
- articles.reject! do |article|
38
- value = article.public_send(key)
39
- value.nil? || value.to_s.split.size < min_words
40
- end
41
- end
42
-
43
27
  ##
44
28
  # Deduplicates articles by a given key.
45
29
  #
@@ -60,7 +60,7 @@ module Html2rss
60
60
 
61
61
  item_maker.title = article.title
62
62
  item_maker.description = article.description
63
- item_maker.pubDate = article.published_at
63
+ item_maker.pubDate = article.published_at&.rfc2822
64
64
  item_maker.link = article.url
65
65
  end
66
66
  end
@@ -1,7 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'nokogiri'
4
- require 'set'
5
4
 
6
5
  module Html2rss
7
6
  class AutoSource
@@ -12,12 +11,14 @@ module Html2rss
12
11
  class Html
13
12
  include Enumerable
14
13
 
14
+ TAGS_TO_IGNORE = /(nav|footer|header)/i
15
+
15
16
  def self.articles?(parsed_body)
16
17
  new(parsed_body, url: '').any?
17
18
  end
18
19
 
19
20
  def self.parent_until_condition(node, condition)
20
- return nil if !node || node.parent.name == 'html'
21
+ return nil if !node || node.document? || node.parent.name == 'html'
21
22
  return node if condition.call(node)
22
23
 
23
24
  parent_until_condition(node.parent, condition)
@@ -32,7 +33,7 @@ module Html2rss
32
33
  def initialize(parsed_body, url:)
33
34
  @parsed_body = parsed_body
34
35
  @url = url
35
- @css_selectors = Hash.new(0)
36
+ @selectors = Hash.new(0)
36
37
  end
37
38
 
38
39
  attr_reader :parsed_body
@@ -48,9 +49,10 @@ module Html2rss
48
49
  frequent_selectors.each do |selector|
49
50
  parsed_body.xpath(selector).each do |selected_tag|
50
51
  article_tag = self.class.parent_until_condition(selected_tag, method(:article_condition))
51
- article_hash = SemanticHtml::Extractor.new(article_tag, url: @url).call
52
52
 
53
- yield article_hash if article_hash
53
+ if article_tag && (article_hash = SemanticHtml::Extractor.new(article_tag, url: @url).call)
54
+ yield article_hash
55
+ end
54
56
  end
55
57
  end
56
58
  end
@@ -58,25 +60,32 @@ module Html2rss
58
60
  ##
59
61
  # Find all the anchors in root.
60
62
  # @param root [Nokogiri::XML::Node] The root node to search for anchors
61
- # @return [Set<String>] The set of CSS selectors which exist at least min_frequency times
63
+ # @return [Set<String>] The set of XPath selectors which exist at least min_frequency times
62
64
  def frequent_selectors(root = @parsed_body.at_css('body'), min_frequency: 2)
63
65
  @frequent_selectors ||= begin
64
66
  root.traverse do |node|
65
67
  next if !node.element? || node.name != 'a'
66
68
 
67
- @css_selectors[self.class.simplify_xpath(node.path)] += 1
69
+ @selectors[self.class.simplify_xpath(node.path)] += 1
68
70
  end
69
71
 
70
- @css_selectors.keys
71
- .select { |selector| (@css_selectors[selector]).to_i >= min_frequency }
72
- .to_set
72
+ @selectors.keys
73
+ .select { |selector| (@selectors[selector]).to_i >= min_frequency }
74
+ .to_set
73
75
  end
74
76
  end
75
77
 
76
- private
77
-
78
78
  def article_condition(node)
79
+ # Ignore tags that are below a tag which is in TAGS_TO_IGNORE.
80
+ return false if node.path.match?(TAGS_TO_IGNORE)
81
+
82
+ # Ignore tags that are below a tag which has a class which matches TAGS_TO_IGNORE.
83
+ return false if self.class.parent_until_condition(node, proc do |current_node|
84
+ current_node.classes.any? { |klass| klass.match?(TAGS_TO_IGNORE) }
85
+ end)
86
+
79
87
  return true if %w[body html].include?(node.name)
88
+
80
89
  return true if node.parent.css('a').size > 1
81
90
 
82
91
  false
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class AutoSource
5
+ module Scraper
6
+ class Schema
7
+ ##
8
+ # Handles schema.org ItemList objects, which contain
9
+ # 1. itemListElements, and/or
10
+ # 2. interesting attributes, i.e. description, url, image, itself.
11
+ #
12
+ # @see https://schema.org/ItemList
13
+ class ItemList < Thing
14
+ SUPPORTED_TYPES = Set['ItemList']
15
+
16
+ # @return [Array<Hash>] the scraped article hashes with DEFAULT_ATTRIBUTES
17
+ def call
18
+ hashes = [super]
19
+
20
+ return hashes if (elements = @schema_object[:itemListElement]).nil?
21
+
22
+ elements = [elements] unless elements.is_a?(Array)
23
+
24
+ elements.each do |schema_object|
25
+ hashes << ListItem.new(schema_object, url: @url).call
26
+ end
27
+
28
+ hashes
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class AutoSource
5
+ module Scraper
6
+ class Schema
7
+ ##
8
+ #
9
+ # @see https://schema.org/ListItem
10
+ class ListItem < Thing
11
+ def id = (id = (schema_object.dig(:item, :@id) || super).to_s).empty? ? nil : id
12
+ def title = schema_object.dig(:item, :name) || super || (url ? Utils.titleized_url(url) : nil)
13
+ def description = schema_object.dig(:item, :description) || super
14
+
15
+ # @return [Addressable::URI, nil]
16
+ def url
17
+ url = schema_object.dig(:item, :url) || super
18
+
19
+ Utils.build_absolute_url_from_relative(url, @url) if url
20
+ end
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end