html2rss 0.12.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +39 -11
- data/html2rss.gemspec +1 -0
- data/lib/html2rss/attribute_post_processors/base.rb +9 -6
- data/lib/html2rss/attribute_post_processors/gsub.rb +2 -2
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +2 -2
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +2 -2
- data/lib/html2rss/attribute_post_processors/parse_time.rb +2 -2
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +2 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +13 -2
- data/lib/html2rss/attribute_post_processors/substring.rb +3 -3
- data/lib/html2rss/attribute_post_processors/template.rb +4 -4
- data/lib/html2rss/auto_source/article.rb +95 -0
- data/lib/html2rss/auto_source/channel.rb +85 -0
- data/lib/html2rss/auto_source/cleanup.rb +76 -0
- data/lib/html2rss/auto_source/reducer.rb +48 -0
- data/lib/html2rss/auto_source/rss_builder.rb +70 -0
- data/lib/html2rss/auto_source/scraper/schema/base.rb +61 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +128 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +123 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +54 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +118 -0
- data/lib/html2rss/auto_source/scraper.rb +33 -0
- data/lib/html2rss/auto_source.rb +80 -0
- data/lib/html2rss/cli.rb +10 -0
- data/lib/html2rss/config/channel.rb +4 -2
- data/lib/html2rss/config/selectors.rb +2 -2
- data/lib/html2rss/config.rb +1 -4
- data/lib/html2rss/item.rb +9 -3
- data/lib/html2rss/rss_builder/stylesheet.rb +38 -23
- data/lib/html2rss/utils.rb +11 -10
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +27 -11
- metadata +30 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a2ce9bbe8640372b5e98672760d76aee5f6f23373dd4b22ca067d2cdaa6f2b15
|
4
|
+
data.tar.gz: ff280d9466ee6b15b1149f582dadf9b209f0e99e4fb02e6b82f91b25a7ca0b7a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d516b897253374425ccd3b26d21362df46c18c26694fe1a8aaddc06b956f93e36111d3310dc635b84d7626a2072014d27dacb075cd98d15a79d39aed40991bcb
|
7
|
+
data.tar.gz: 91ae4190d04967c1bc9d3f46b0a0bdbbd23f3dbda6559c3ff10391ab0d63d4b984a44789c8427dbace345af617a33e86c2f441c0a0d58dbcf2b734bd78b73b87
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
[](http://rubygems.org/gems/html2rss/) [](https://www.rubydoc.info/gems/html2rss) 
|
4
4
|
|
5
|
-
`html2rss` is a Ruby gem that generates RSS 2.0 feeds from a _feed config_.
|
5
|
+
`html2rss` is a Ruby gem that generates RSS 2.0 feeds from websites automatically, and as a fallback via _feed config_.
|
6
6
|
|
7
7
|
With the _feed config_, you provide a URL to scrape and CSS selectors for extracting information (like title, URL, etc.). The gem builds the RSS feed accordingly. [Extractors](#using-extractors) and chainable [post processors](#using-post-processors) make information extraction, processing, and sanitizing a breeze. The gem also supports [scraping JSON](#scraping-and-handling-json-responses) responses and [setting HTTP request headers](#set-any-http-header-in-the-request).
|
8
8
|
|
@@ -26,26 +26,40 @@ You can also install it as a dependency in your Ruby project:
|
|
26
26
|
|
27
27
|
## Generating a feed on the CLI
|
28
28
|
|
29
|
-
|
29
|
+
### using automatic generation
|
30
|
+
|
31
|
+
html2rss offers an automatic RSS generation feature. Try it with:
|
32
|
+
|
33
|
+
`html2rss auto https://unmatchedstyle.com/`
|
34
|
+
|
35
|
+
### creating a feed config file and using it
|
36
|
+
|
37
|
+
If the results are not to your satisfaction, you can create a feed config file.
|
38
|
+
|
39
|
+
Create a file called `my_config_file.yml` with this sample content:
|
30
40
|
|
31
41
|
```yml
|
32
42
|
channel:
|
33
|
-
url: https://
|
43
|
+
url: https://unmatchedstyle.com
|
34
44
|
selectors:
|
35
45
|
items:
|
36
|
-
selector: "
|
46
|
+
selector: "article[id^='post-']"
|
37
47
|
title:
|
38
|
-
selector:
|
48
|
+
selector: h2
|
39
49
|
link:
|
40
50
|
selector: a
|
41
51
|
extractor: href
|
52
|
+
description:
|
53
|
+
selector: ".post-content"
|
54
|
+
post_process:
|
55
|
+
- name: sanitize_html
|
42
56
|
```
|
43
57
|
|
44
|
-
Build the
|
58
|
+
Build the feed from this config with: `html2rss feed ./my_config_file.yml`.
|
45
59
|
|
46
60
|
## Generating a feed with Ruby
|
47
61
|
|
48
|
-
Here's a minimal working example
|
62
|
+
Here's a minimal working example using Ruby:
|
49
63
|
|
50
64
|
```ruby
|
51
65
|
require 'html2rss'
|
@@ -481,7 +495,7 @@ feeds:
|
|
481
495
|
|
482
496
|
Your feed configs go below `feeds`. Everything else is part of the global config.
|
483
497
|
|
484
|
-
Find a full example of a `feeds.yml` at [`spec/feeds.test.yml`](https://github.com/html2rss/html2rss/blob/master/spec/feeds.test.yml).
|
498
|
+
Find a full example of a `feeds.yml` at [`spec/fixtures/feeds.test.yml`](https://github.com/html2rss/html2rss/blob/master/spec/fixtures/feeds.test.yml).
|
485
499
|
|
486
500
|
Now you can build your feeds like this:
|
487
501
|
|
@@ -583,8 +597,22 @@ Recommended further readings:
|
|
583
597
|
|
584
598
|
### Contributing
|
585
599
|
|
586
|
-
|
600
|
+
Find ideas for what to contribute in:
|
601
|
+
|
602
|
+
1. <https://github.com/orgs/html2rss/discussions>
|
603
|
+
2. the issues tracker: <https://github.com/html2rss/html2rss/issues>
|
604
|
+
|
605
|
+
#### Development Helpers
|
606
|
+
|
607
|
+
1. `bin/setup`: installs dependencies and sets up the development environment.
|
608
|
+
2. `bin/guard`: automatically runs rspec, rubocop and reek when a file changes.
|
609
|
+
3. for a modern Ruby development experience: install [`ruby-lsp`](https://github.com/Shopify/ruby-lsp) and integrate it into your IDE:
|
610
|
+
a. [Ruby in Visual Studio Code](https://code.visualstudio.com/docs/languages/ruby)
|
611
|
+
|
612
|
+
#### How to submit changes
|
613
|
+
|
614
|
+
1. Fork this repo ( <https://github.com/html2rss/html2rss/fork> )
|
587
615
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
588
|
-
3.
|
616
|
+
3. Implement and commit your changes (`git commit -am 'feat: add XYZ'`)
|
589
617
|
4. Push to the branch (`git push origin my-new-feature`)
|
590
|
-
5. Create a new Pull Request
|
618
|
+
5. Create a new Pull Request using the GitHub web UI
|
data/html2rss.gemspec
CHANGED
@@ -38,6 +38,7 @@ Gem::Specification.new do |spec|
|
|
38
38
|
spec.add_dependency 'kramdown'
|
39
39
|
spec.add_dependency 'mime-types', '> 3.0'
|
40
40
|
spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
|
41
|
+
spec.add_dependency 'parallel'
|
41
42
|
spec.add_dependency 'regexp_parser'
|
42
43
|
spec.add_dependency 'reverse_markdown', '~> 2.0'
|
43
44
|
spec.add_dependency 'rss'
|
@@ -26,17 +26,20 @@ module Html2rss
|
|
26
26
|
# @param value [Object] the value to check
|
27
27
|
# @param types [Array<Class>, Class] the expected type(s)
|
28
28
|
# @param name [String] the name of the option being checked
|
29
|
+
# @param context [Item::Context] the context
|
29
30
|
# @raise [InvalidType] if the value is not of the expected type(s)
|
30
|
-
def self.assert_type(value, types = [], name)
|
31
|
+
def self.assert_type(value, types = [], name, context:)
|
31
32
|
types = [types] unless types.is_a?(Array)
|
32
33
|
|
33
34
|
return if types.any? { |type| value.is_a?(type) }
|
34
35
|
|
35
|
-
|
36
|
-
|
37
|
-
end
|
36
|
+
options = context[:options] if context.is_a?(Hash)
|
37
|
+
options ||= { file: File.basename(caller_locations(1, 1).first.absolute_path) }
|
38
38
|
|
39
|
-
|
39
|
+
raise InvalidType, format('The type of `%<name>s` must be %<types>s, but is: %<type>s in: %<options>s',
|
40
|
+
name:, types: types.join(' or '), type: value.class, options: options.inspect),
|
41
|
+
[], cause: nil
|
42
|
+
end
|
40
43
|
|
41
44
|
##
|
42
45
|
# This method validates the arguments passed to the post processor. Must be implemented by subclasses.
|
@@ -51,7 +54,7 @@ module Html2rss
|
|
51
54
|
def initialize(value, context)
|
52
55
|
klass = self.class
|
53
56
|
# TODO: get rid of Hash
|
54
|
-
klass.assert_type(context, [Item::Context, Hash], 'context')
|
57
|
+
klass.assert_type(context, [Item::Context, Hash], 'context', context:)
|
55
58
|
klass.validate_args!(value, context)
|
56
59
|
|
57
60
|
@value = value
|
@@ -27,9 +27,9 @@ module Html2rss
|
|
27
27
|
# See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
|
28
28
|
class Gsub < Base
|
29
29
|
def self.validate_args!(value, context)
|
30
|
-
assert_type value, String, :value
|
30
|
+
assert_type value, String, :value, context:
|
31
31
|
expect_options(%i[replacement pattern], context)
|
32
|
-
assert_type context.dig(:options, :replacement), [String, Hash], :replacement
|
32
|
+
assert_type context.dig(:options, :replacement), [String, Hash], :replacement, context:
|
33
33
|
end
|
34
34
|
|
35
35
|
##
|
@@ -27,8 +27,8 @@ module Html2rss
|
|
27
27
|
# Would return:
|
28
28
|
# 'Lorem **ipsum** dolor'
|
29
29
|
class HtmlToMarkdown < Base
|
30
|
-
def self.validate_args!(value,
|
31
|
-
assert_type value, String, :value
|
30
|
+
def self.validate_args!(value, context)
|
31
|
+
assert_type value, String, :value, context:
|
32
32
|
end
|
33
33
|
|
34
34
|
##
|
@@ -33,8 +33,8 @@ module Html2rss
|
|
33
33
|
#
|
34
34
|
# <p>Price: 12.34</p>
|
35
35
|
class MarkdownToHtml < Base
|
36
|
-
def self.validate_args!(value,
|
37
|
-
assert_type value, String, :value
|
36
|
+
def self.validate_args!(value, context)
|
37
|
+
assert_type value, String, :value, context:
|
38
38
|
end
|
39
39
|
|
40
40
|
##
|
@@ -27,8 +27,8 @@ module Html2rss
|
|
27
27
|
# It uses `Time.parse`.
|
28
28
|
class ParseTime < Base
|
29
29
|
def self.validate_args!(value, context)
|
30
|
-
assert_type
|
31
|
-
assert_type
|
30
|
+
assert_type(value, String, :value, context:)
|
31
|
+
assert_type(context[:config].time_zone, String, :time_zone, context:)
|
32
32
|
end
|
33
33
|
|
34
34
|
##
|
@@ -25,8 +25,8 @@ module Html2rss
|
|
25
25
|
def self.validate_args!(value, context)
|
26
26
|
url_types = [String, URI::HTTP, Addressable::URI].freeze
|
27
27
|
|
28
|
-
assert_type(value, url_types, :value)
|
29
|
-
assert_type(context.config.url, url_types, :url)
|
28
|
+
assert_type(value, url_types, :value, context:)
|
29
|
+
assert_type(context.config.url, url_types, :url, context:)
|
30
30
|
|
31
31
|
raise ArgumentError, 'The `value` option is missing or empty.' if value.to_s.empty?
|
32
32
|
end
|
@@ -39,8 +39,19 @@ module Html2rss
|
|
39
39
|
# Would return:
|
40
40
|
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
|
41
41
|
class SanitizeHtml < Base
|
42
|
-
def self.validate_args!(value,
|
43
|
-
assert_type value, String, :value
|
42
|
+
def self.validate_args!(value, context)
|
43
|
+
assert_type value, String, :value, context:
|
44
|
+
end
|
45
|
+
|
46
|
+
##
|
47
|
+
# Shorthand method to get the sanitized HTML.
|
48
|
+
# @param html [String]
|
49
|
+
# @param url [String, Addressable::URI]
|
50
|
+
def self.get(html, url)
|
51
|
+
raise ArgumentError, 'url must be a String or Addressable::URI' if url.to_s.empty?
|
52
|
+
return nil if html.to_s.empty?
|
53
|
+
|
54
|
+
new(html, { config: Config::Channel.new({ url: }) }).get
|
44
55
|
end
|
45
56
|
|
46
57
|
##
|
@@ -30,13 +30,13 @@ module Html2rss
|
|
30
30
|
# 'bar'
|
31
31
|
class Substring < Base
|
32
32
|
def self.validate_args!(value, context)
|
33
|
-
assert_type value, String, :value
|
33
|
+
assert_type value, String, :value, context:
|
34
34
|
|
35
35
|
options = context[:options]
|
36
|
-
assert_type options[:start], Integer, :start
|
36
|
+
assert_type options[:start], Integer, :start, context:
|
37
37
|
|
38
38
|
end_index = options[:end]
|
39
|
-
assert_type
|
39
|
+
assert_type(end_index, Integer, :end, context:) if end_index
|
40
40
|
end
|
41
41
|
|
42
42
|
##
|
@@ -33,7 +33,7 @@ module Html2rss
|
|
33
33
|
# 'Product (23,42€)'
|
34
34
|
class Template < Base
|
35
35
|
def self.validate_args!(value, context)
|
36
|
-
assert_type value, String, :value
|
36
|
+
assert_type value, String, :value, context:
|
37
37
|
|
38
38
|
string = context[:options]&.dig(:string).to_s
|
39
39
|
raise InvalidType, 'The `string` template is absent.' if string.empty?
|
@@ -74,9 +74,9 @@ module Html2rss
|
|
74
74
|
# @return [String]
|
75
75
|
# @deprecated Use %<id>s formatting instead. Will be removed in version 1.0.0. See README / Dynamic parameters.
|
76
76
|
def format_string_with_methods
  # Single-quoted literals do not treat a trailing backslash as a line
  # continuation (the backslash and the newline stay in the string), so the
  # message is assembled from adjacent literals instead.
  Log.warn '[DEPRECATION] This method of using params is deprecated and ' \
           'support for it will be removed in version 1.0.0. ' \
           'Please use dynamic parameters (i.e. %<id>s, see README.md) instead.'

  string % methods
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'zlib'
|
4
|
+
require 'sanitize'
|
5
|
+
|
6
|
+
module Html2rss
|
7
|
+
class AutoSource
|
8
|
+
##
|
9
|
+
# Article is a simple data object representing an article extracted from a page.
|
10
|
+
# It is enumerable and responds to all keys specified in PROVIDED_KEYS.
|
11
|
+
class Article
  include Enumerable
  include Comparable

  PROVIDED_KEYS = %i[id title description url image guid published_at scraper].freeze

  # Stores every truthy option value (frozen in place) in an immutable hash.
  # Keys outside PROVIDED_KEYS are still stored, but a warning is logged.
  #
  # @param options [Hash<Symbol, String>]
  def initialize(**options)
    @to_h = options.select { |_key, value| value }.transform_values(&:freeze).freeze

    unknown_keys = options.keys - PROVIDED_KEYS
    return if unknown_keys.empty?

    Log.warn "Article: unknown keys found: #{unknown_keys.join(', ')}"
  end

  # Checks if the article is valid based on the presence of URL, ID, and either title or description.
  # @return [Boolean] True if the article is valid, otherwise false.
  def valid?
    return false if url.to_s.empty? || id.to_s.empty?

    !title.to_s.empty? || !description.to_s.empty?
  end

  # Yields every key of PROVIDED_KEYS together with the value of the accessor of the same name.
  #
  # @yield [key, value]
  # @return [Enumerator] if no block is given
  def each
    return enum_for(:each) unless block_given?

    PROVIDED_KEYS.each { |key| yield(key, public_send(key)) }
  end

  # @return [Object, nil] the id as given to the constructor
  def id = @to_h[:id]

  # @return [Object, nil] the title as given to the constructor
  def title = @to_h[:title]

  # Returns the description passed through SanitizeHtml.get (memoized once computed).
  #
  # @return [String, nil] nil when the URL or the raw description is empty
  def description
    return @description if defined?(@description)
    return if url.to_s.empty? || @to_h[:description].to_s.empty?

    @description = Html2rss::AttributePostProcessors::SanitizeHtml.get(@to_h[:description], url)
  end

  # @return [Addressable::URI, nil]
  def url
    @url ||= Html2rss::Utils.sanitize_url(@to_h[:url])
  end

  # @return [Addressable::URI, nil]
  def image
    @image ||= Html2rss::Utils.sanitize_url(@to_h[:image])
  end

  # Generates an identifier from the CRC32 checksum of URL and ID (joined by '#!/'),
  # rendered in base 36. Any `guid` passed to the constructor is not used.
  #
  # @return [String]
  def guid
    @guid ||= Zlib.crc32([url, id].join('#!/')).to_s(36).encode('utf-8')
  end

  # Parses and returns the published_at time.
  #
  # @return [Time, nil] nil when the value is blank or Time.parse rejects it
  def published_at
    return @published_at if @published_at

    raw = @to_h[:published_at].to_s
    return if raw.strip.empty?

    @published_at = Time.parse(raw)
  rescue ArgumentError
    nil
  end

  # @return [Object, nil] the scraper as given to the constructor
  def scraper = @to_h[:scraper]

  # Articles are only ever "equal" or "not comparable": there is no ordering.
  #
  # @param other [Object]
  # @return [Integer, nil] 0 when every attribute of `other` equals this article's, otherwise nil
  def <=>(other)
    return nil unless other.is_a?(Article)

    0 if other.all? { |key, value| value == public_send(key) }
  end
end
|
94
|
+
end
|
95
|
+
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
class AutoSource
|
5
|
+
##
|
6
|
+
# Extracts channel information from
|
7
|
+
# 1. the HTML document's <head>.
|
8
|
+
# 2. the HTTP response
|
9
|
+
class Channel
  ##
  #
  # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document.
  # @param url [Addressable::URI] The URL of the channel.
  # @param headers [Hash<String, String>] the http headers
  # @param articles [Array<Html2rss::AutoSource::Article>] The articles.
  # @param stylesheets [Array] stylesheets, exposed unchanged through #stylesheets
  def initialize(parsed_body, url:, headers:, articles: [], stylesheets: [])
    @parsed_body = parsed_body
    @url = url
    @headers = headers
    @articles = articles
    @stylesheets = stylesheets
  end

  attr_writer :articles
  attr_reader :stylesheets

  def url = extract_url
  def title = extract_title
  def language = extract_language
  def description = extract_description
  def image = extract_image
  def ttl = extract_ttl

  # @return [String, nil] the raw value of the 'last-modified' header
  def last_build_date = headers['last-modified']

  # @return [String] generator label including the gem version and the per-scraper article counts
  def generator
    "html2rss V. #{::Html2rss::VERSION} (using auto_source scrapers: #{scraper_counts})"
  end

  private

  attr_reader :parsed_body, :headers

  # @return [String] the normalized channel URL
  def extract_url
    @url.normalize.to_s
  end

  # @return [String, nil] text of the <title> element inside <head>
  def extract_title
    parsed_body.at_css('head > title')&.text
  end

  # Uses the `lang` attribute of `parsed_body` itself when it is an <html> node,
  # otherwise the `lang` attribute of the first element that carries one.
  #
  # @return [String, nil]
  def extract_language
    return parsed_body['lang'] if parsed_body.name == 'html' && parsed_body['lang']

    parsed_body.at_css('[lang]')&.[]('lang')
  end

  # @return [String] content of the description meta tag, or '' when there is none
  def extract_description
    parsed_body.at_css('meta[name="description"]')&.[]('content') || ''
  end

  # @return [Object, nil] the sanitized og:image URL, nil when the meta tag has no content
  def extract_image
    url = parsed_body.at_css('meta[property="og:image"]')&.[]('content')
    Html2rss::Utils.sanitize_url(url) if url
  end

  # Converts the `max-age` seconds of the cache-control header to minutes, rounding up.
  #
  # @return [Integer, nil] nil when the header carries no max-age
  def extract_ttl
    max_age = headers['cache-control']&.match(/max-age=(\d+)/)&.[](1)
    return unless max_age

    max_age.to_i.fdiv(60).ceil
  end

  # Renders "[Scraper=count]" for every distinct scraper, in order of first appearance.
  #
  # @return [String]
  def scraper_counts
    @articles.map(&:scraper).tally.map do |klass, count|
      "[#{klass.to_s.gsub('Html2rss::AutoSource::Scraper::', '')}=#{count}]"
    end.join
  end
end
|
84
|
+
end
|
85
|
+
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
class AutoSource
|
5
|
+
##
|
6
|
+
# Cleanup is responsible for cleaning up the extracted articles.
|
7
|
+
# :reek:MissingSafeMethod { enabled: false }
|
8
|
+
# It applies various strategies to filter and refine the article list.
|
9
|
+
class Cleanup
  class << self
    ##
    # Runs all cleanup steps, in order, on the given array. The array is modified in place.
    #
    # @param articles [Array<Article>] The list of articles to process.
    # @param url [Addressable::URI] The source URL; its host is used for the domain filter.
    # @param keep_different_domain [Boolean] when true, the domain filter is skipped.
    # @return [Array<Article>] the same (mutated) array
    def call(articles, url:, keep_different_domain: false)
      Log.debug "Cleanup: start with #{articles.size} articles"

      articles.select!(&:valid?)
      remove_short!(articles, :title)
      %i[url title].each { |key| deduplicate_by!(articles, key) }
      keep_only_http_urls!(articles)
      reject_different_domain!(articles, url) unless keep_different_domain

      Log.debug "Cleanup: end with #{articles.size} articles"
      articles
    end

    private

    ##
    # Drops articles whose value for `key` is nil or has fewer than `min_words` words.
    #
    # @param articles [Array<Article>] The list of articles to process.
    # @param key [Symbol] The key to check for short values.
    # @param min_words [Integer] The minimum number of words required.
    def remove_short!(articles, key = :title, min_words: 2)
      articles.select! do |article|
        candidate = article.public_send(key)
        !candidate.nil? && candidate.to_s.split.size >= min_words
      end
    end

    ##
    # Keeps the first article for every distinct value of `key`; articles with a nil value are dropped.
    #
    # @param articles [Array<Article>] The list of articles to process.
    # @param key [Symbol] The key to deduplicate by.
    def deduplicate_by!(articles, key)
      already_seen = {}

      articles.select! do |article|
        candidate = article.public_send(key)
        next false if candidate.nil?

        first_occurrence = !already_seen.key?(candidate)
        already_seen[candidate] = true
        first_occurrence
      end
    end

    ##
    # Drops every article whose URL scheme is neither http nor https.
    #
    # @param articles [Array<Article>] The list of articles to process.
    def keep_only_http_urls!(articles)
      allowed_schemes = %w[http https]
      articles.reject! { |article| !allowed_schemes.include?(article.url&.scheme) }
    end

    ##
    # Drops every article whose URL host differs from the host of `base_url`.
    #
    # @param articles [Array<Article>] The list of articles to process.
    # @param base_url [Addressable::URI] The source URL to compare against.
    def reject_different_domain!(articles, base_url)
      expected_host = base_url.host
      articles.reject! { |article| article.url&.host != expected_host }
    end
  end
end
|
75
|
+
end
|
76
|
+
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
class AutoSource
|
5
|
+
##
# Reducer is responsible for reducing the list of articles.
# Articles sharing the same URL path are merged into a single article which
# keeps, per attribute, the longest value found in the group.
# The `scraper` attribute is not reduced: the differing scrapers of a group are collected in an Array.
# No validity check happens here; every group yields one article.
class Reducer
  class << self
    # @param articles [Array<Article>]
    # @return [Array<Article>] one merged article per distinct URL path
    def call(articles, **_options)
      Log.debug "Reducer: inited with #{articles.size} articles"

      reduce_by_keeping_longest_values(articles, keep: [:scraper]) { |article| article.url&.path }
    end

    private

    # Groups the articles by the block's return value and merges every group into one article.
    #
    # @param articles [Array<Article>]
    # @param keep [Array<Symbol>] keys whose values are collected instead of reduced
    # @return [Array<Article>] reduced articles
    def reduce_by_keeping_longest_values(articles, keep:, &)
      articles.group_by(&).map do |_group_key, grouped_articles|
        merged = {}
        grouped_articles.each { |article| keep_longest_values(merged, article, keep:) }

        Article.new(**merged)
      end
    end

    # Merges the key/value pairs yielded by `article_hash.each` into `memo_object`.
    #
    # @param memo_object [Hash] accumulator, modified in place
    # @param article_hash [#each] yields key/value pairs
    # @param keep [Array<Symbol>] keys whose values are appended to an Array
    def keep_longest_values(memo_object, article_hash, keep:)
      article_hash.each do |key, value|
        # Nothing to do when the accumulator already holds this very value.
        next if value.eql?(memo_object[key])

        if keep.include?(key)
          (memo_object[key] ||= []) << value
        elsif value && value.to_s.size > memo_object[key].to_s.size
          memo_object[key] = value
        end
      end
    end
  end
end
|
47
|
+
end
|
48
|
+
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rss'
|
4
|
+
|
5
|
+
module Html2rss
|
6
|
+
class AutoSource
|
7
|
+
##
|
8
|
+
# Converts the autosourced channel and articles to an RSS feed.
|
9
|
+
class RssBuilder
  # Fills the item's guid element with the article's guid, flagged as not being a permalink.
  #
  # @param article [Article]
  # @param maker [Object] the item maker
  # @return [Object] the guid element
  def self.add_guid(article, maker)
    guid = maker.guid
    guid.content = article.guid
    guid.isPermaLink = false
    guid
  end

  # Fills the item's enclosure element from the article's image; does nothing without an image.
  #
  # @param article [Article]
  # @param maker [Object] the item maker
  # @return [Object, nil] the enclosure element, nil when the article has no image
  def self.add_image(article, maker)
    image_url = article.image
    return unless image_url

    enclosure = maker.enclosure
    enclosure.url = image_url
    enclosure.type = Html2rss::Utils.guess_content_type_from_url(image_url)
    enclosure.length = 0
    enclosure
  end

  # @param channel [Object] provides the channel attributes and the stylesheets
  # @param articles [Array<Article>]
  def initialize(channel:, articles:)
    @channel = channel
    @articles = articles
  end

  # Builds the RSS 2.0 feed: stylesheets first, then the channel, then one item per article.
  #
  # @return [Object] the result of RSS::Maker.make
  def call
    RSS::Maker.make('2.0') do |maker|
      Html2rss::RssBuilder::Stylesheet.add(maker, channel.stylesheets)

      make_channel(maker.channel)
      make_items(maker)
    end
  end

  private

  attr_reader :channel, :articles

  # Copies the channel attributes onto the channel maker.
  def make_channel(maker)
    maker.language = channel.language
    maker.title = channel.title
    maker.description = channel.description
    maker.ttl = channel.ttl

    maker.link = channel.url
    maker.generator = channel.generator
    maker.updated = channel.last_build_date
  end

  # Adds one feed item per article.
  def make_items(maker)
    articles.each do |article|
      maker.items.new_item do |item|
        RssBuilder.add_guid(article, item)
        RssBuilder.add_image(article, item)

        item.title = article.title
        item.description = article.description
        item.pubDate = article.published_at
        item.link = article.url
      end
    end
  end
end
|
69
|
+
end
|
70
|
+
end
|