html2rss 0.12.0 → 0.13.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +38 -10
- data/html2rss.gemspec +1 -0
- data/lib/html2rss/attribute_post_processors/base.rb +9 -6
- data/lib/html2rss/attribute_post_processors/gsub.rb +2 -2
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +2 -2
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +2 -2
- data/lib/html2rss/attribute_post_processors/parse_time.rb +2 -2
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +2 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +13 -2
- data/lib/html2rss/attribute_post_processors/substring.rb +3 -3
- data/lib/html2rss/attribute_post_processors/template.rb +4 -4
- data/lib/html2rss/auto_source/article.rb +95 -0
- data/lib/html2rss/auto_source/channel.rb +79 -0
- data/lib/html2rss/auto_source/cleanup.rb +76 -0
- data/lib/html2rss/auto_source/reducer.rb +48 -0
- data/lib/html2rss/auto_source/rss_builder.rb +68 -0
- data/lib/html2rss/auto_source/scraper/schema/base.rb +61 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +122 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +123 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +54 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +118 -0
- data/lib/html2rss/auto_source/scraper.rb +33 -0
- data/lib/html2rss/auto_source.rb +77 -0
- data/lib/html2rss/cli.rb +10 -0
- data/lib/html2rss/config/channel.rb +4 -2
- data/lib/html2rss/config/selectors.rb +2 -2
- data/lib/html2rss/item.rb +8 -2
- data/lib/html2rss/utils.rb +5 -10
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +21 -0
- metadata +29 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7a2bf557dd65533533e07b4581e195f2d2b32ff906831526a4d7aed27a558d71
|
4
|
+
data.tar.gz: f42e5f03649a08219d310a2545413c371f851530c4d323fd68ef783b4b3b5e13
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 724a1fa8ab15ae140278eb9b055f22e7aad12e94627795f7a2f13c78f5421607e39d6ba040821b4c47b69f963cc0180bf8e964ff0b896403cb6305ed1d67dbb5
|
7
|
+
data.tar.gz: a06c2e16b0b51c6b6d2184430efc2a4e8b2812fee413163aa2991567e7608141f1c18189fdded58c8c3383940c4790478cd631abc6a1470ad648b2030fdefaab
|
data/README.md
CHANGED
@@ -26,26 +26,40 @@ You can also install it as a dependency in your Ruby project:
|
|
26
26
|
|
27
27
|
## Generating a feed on the CLI
|
28
28
|
|
29
|
-
|
29
|
+
### using automatic scraping
|
30
|
+
|
31
|
+
html2rss offers an automatic scraping feature. Try it with:
|
32
|
+
|
33
|
+
`html2rss auto https://unmatchedstyle.com/`
|
34
|
+
|
35
|
+
### creating a feed config file and using it
|
36
|
+
|
37
|
+
If the results are not to your satisfaction, you can create a feed config file.
|
38
|
+
|
39
|
+
Create a file called `my_config_file.yml` with this sample content:
|
30
40
|
|
31
41
|
```yml
|
32
42
|
channel:
|
33
|
-
url: https://
|
43
|
+
url: https://unmatchedstyle.com
|
34
44
|
selectors:
|
35
45
|
items:
|
36
|
-
selector: "
|
46
|
+
selector: "article[id^='post-']"
|
37
47
|
title:
|
38
|
-
selector:
|
48
|
+
selector: h2
|
39
49
|
link:
|
40
50
|
selector: a
|
41
51
|
extractor: href
|
52
|
+
description:
|
53
|
+
selector: ".post-content"
|
54
|
+
post_process:
|
55
|
+
- name: sanitize_html
|
42
56
|
```
|
43
57
|
|
44
|
-
Build the
|
58
|
+
Build the feed from this config with: `html2rss feed ./my_config_file.yml`.
|
45
59
|
|
46
60
|
## Generating a feed with Ruby
|
47
61
|
|
48
|
-
Here's a minimal working example
|
62
|
+
Here's a minimal working example using Ruby:
|
49
63
|
|
50
64
|
```ruby
|
51
65
|
require 'html2rss'
|
@@ -481,7 +495,7 @@ feeds:
|
|
481
495
|
|
482
496
|
Your feed configs go below `feeds`. Everything else is part of the global config.
|
483
497
|
|
484
|
-
Find a full example of a `feeds.yml` at [`spec/feeds.test.yml`](https://github.com/html2rss/html2rss/blob/master/spec/feeds.test.yml).
|
498
|
+
Find a full example of a `feeds.yml` at [`spec/fixtures/feeds.test.yml`](https://github.com/html2rss/html2rss/blob/master/spec/fixtures/feeds.test.yml).
|
485
499
|
|
486
500
|
Now you can build your feeds like this:
|
487
501
|
|
@@ -583,8 +597,22 @@ Recommended further readings:
|
|
583
597
|
|
584
598
|
### Contributing
|
585
599
|
|
586
|
-
|
600
|
+
Find ideas for what to contribute in:
|
601
|
+
|
602
|
+
1. <https://github.com/orgs/html2rss/discussions>
|
603
|
+
2. the issues tracker: <https://github.com/html2rss/html2rss/issues>
|
604
|
+
|
605
|
+
#### Development Helpers
|
606
|
+
|
607
|
+
1. `bin/setup`: installs dependencies and sets up the development environment.
|
608
|
+
2. `bin/guard`: automatically runs rspec, rubocop and reek when a file changes.
|
609
|
+
3. for a modern Ruby development experience: install [`ruby-lsp`](https://github.com/Shopify/ruby-lsp) and integrate it into your IDE:
|
610
|
+
a. [Ruby in Visual Studio Code](https://code.visualstudio.com/docs/languages/ruby)
|
611
|
+
|
612
|
+
#### How to submit changes
|
613
|
+
|
614
|
+
1. Fork this repo ( <https://github.com/html2rss/html2rss/fork> )
|
587
615
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
588
|
-
3.
|
616
|
+
3. Implement and commit your changes (`git commit -am 'feat: add XYZ'`)
|
589
617
|
4. Push to the branch (`git push origin my-new-feature`)
|
590
|
-
5. Create a new Pull Request
|
618
|
+
5. Create a new Pull Request using the GitHub web UI
|
data/html2rss.gemspec
CHANGED
@@ -38,6 +38,7 @@ Gem::Specification.new do |spec|
|
|
38
38
|
spec.add_dependency 'kramdown'
|
39
39
|
spec.add_dependency 'mime-types', '> 3.0'
|
40
40
|
spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
|
41
|
+
spec.add_dependency 'parallel'
|
41
42
|
spec.add_dependency 'regexp_parser'
|
42
43
|
spec.add_dependency 'reverse_markdown', '~> 2.0'
|
43
44
|
spec.add_dependency 'rss'
|
@@ -26,17 +26,20 @@ module Html2rss
|
|
26
26
|
# @param value [Object] the value to check
|
27
27
|
# @param types [Array<Class>, Class] the expected type(s)
|
28
28
|
# @param name [String] the name of the option being checked
|
29
|
+
# @param context [Item::Context] the context
|
29
30
|
# @raise [InvalidType] if the value is not of the expected type(s)
|
30
|
-
def self.assert_type(value, types = [], name)
|
31
|
+
def self.assert_type(value, types = [], name, context:)
|
31
32
|
types = [types] unless types.is_a?(Array)
|
32
33
|
|
33
34
|
return if types.any? { |type| value.is_a?(type) }
|
34
35
|
|
35
|
-
|
36
|
-
|
37
|
-
end
|
36
|
+
options = context[:options] if context.is_a?(Hash)
|
37
|
+
options ||= { file: File.basename(caller_locations(1, 1).first.absolute_path) }
|
38
38
|
|
39
|
-
|
39
|
+
raise InvalidType, format('The type of `%<name>s` must be %<types>s, but is: %<type>s in: %<options>s',
|
40
|
+
name:, types: types.join(' or '), type: value.class, options: options.inspect),
|
41
|
+
[], cause: nil
|
42
|
+
end
|
40
43
|
|
41
44
|
##
|
42
45
|
# This method validates the arguments passed to the post processor. Must be implemented by subclasses.
|
@@ -51,7 +54,7 @@ module Html2rss
|
|
51
54
|
def initialize(value, context)
|
52
55
|
klass = self.class
|
53
56
|
# TODO: get rid of Hash
|
54
|
-
klass.assert_type(context, [Item::Context, Hash], 'context')
|
57
|
+
klass.assert_type(context, [Item::Context, Hash], 'context', context:)
|
55
58
|
klass.validate_args!(value, context)
|
56
59
|
|
57
60
|
@value = value
|
@@ -27,9 +27,9 @@ module Html2rss
|
|
27
27
|
# See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
|
28
28
|
class Gsub < Base
|
29
29
|
def self.validate_args!(value, context)
|
30
|
-
assert_type value, String, :value
|
30
|
+
assert_type value, String, :value, context:
|
31
31
|
expect_options(%i[replacement pattern], context)
|
32
|
-
assert_type context.dig(:options, :replacement), [String, Hash], :replacement
|
32
|
+
assert_type context.dig(:options, :replacement), [String, Hash], :replacement, context:
|
33
33
|
end
|
34
34
|
|
35
35
|
##
|
@@ -27,8 +27,8 @@ module Html2rss
|
|
27
27
|
# Would return:
|
28
28
|
# 'Lorem **ipsum** dolor'
|
29
29
|
class HtmlToMarkdown < Base
|
30
|
-
def self.validate_args!(value,
|
31
|
-
assert_type value, String, :value
|
30
|
+
def self.validate_args!(value, context)
|
31
|
+
assert_type value, String, :value, context:
|
32
32
|
end
|
33
33
|
|
34
34
|
##
|
@@ -33,8 +33,8 @@ module Html2rss
|
|
33
33
|
#
|
34
34
|
# <p>Price: 12.34</p>
|
35
35
|
class MarkdownToHtml < Base
|
36
|
-
def self.validate_args!(value,
|
37
|
-
assert_type value, String, :value
|
36
|
+
def self.validate_args!(value, context)
|
37
|
+
assert_type value, String, :value, context:
|
38
38
|
end
|
39
39
|
|
40
40
|
##
|
@@ -27,8 +27,8 @@ module Html2rss
|
|
27
27
|
# It uses `Time.parse`.
|
28
28
|
class ParseTime < Base
|
29
29
|
def self.validate_args!(value, context)
|
30
|
-
assert_type
|
31
|
-
assert_type
|
30
|
+
assert_type(value, String, :value, context:)
|
31
|
+
assert_type(context[:config].time_zone, String, :time_zone, context:)
|
32
32
|
end
|
33
33
|
|
34
34
|
##
|
@@ -25,8 +25,8 @@ module Html2rss
|
|
25
25
|
def self.validate_args!(value, context)
|
26
26
|
url_types = [String, URI::HTTP, Addressable::URI].freeze
|
27
27
|
|
28
|
-
assert_type(value, url_types, :value)
|
29
|
-
assert_type(context.config.url, url_types, :url)
|
28
|
+
assert_type(value, url_types, :value, context:)
|
29
|
+
assert_type(context.config.url, url_types, :url, context:)
|
30
30
|
|
31
31
|
raise ArgumentError, 'The `value` option is missing or empty.' if value.to_s.empty?
|
32
32
|
end
|
@@ -39,8 +39,19 @@ module Html2rss
|
|
39
39
|
# Would return:
|
40
40
|
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
|
41
41
|
class SanitizeHtml < Base
|
42
|
-
def self.validate_args!(value,
|
43
|
-
assert_type value, String, :value
|
42
|
+
def self.validate_args!(value, context)
|
43
|
+
assert_type value, String, :value, context:
|
44
|
+
end
|
45
|
+
|
46
|
+
##
|
47
|
+
# Shorthand method to get the sanitized HTML.
|
48
|
+
# @param html [String]
|
49
|
+
# @param url [String, Addressable::URI]
|
50
|
+
def self.get(html, url)
|
51
|
+
raise ArgumentError, 'url must be a String or Addressable::URI' if url.to_s.empty?
|
52
|
+
return nil if html.to_s.empty?
|
53
|
+
|
54
|
+
new(html, { config: Config::Channel.new({ url: }) }).get
|
44
55
|
end
|
45
56
|
|
46
57
|
##
|
@@ -30,13 +30,13 @@ module Html2rss
|
|
30
30
|
# 'bar'
|
31
31
|
class Substring < Base
|
32
32
|
def self.validate_args!(value, context)
|
33
|
-
assert_type value, String, :value
|
33
|
+
assert_type value, String, :value, context:
|
34
34
|
|
35
35
|
options = context[:options]
|
36
|
-
assert_type options[:start], Integer, :start
|
36
|
+
assert_type options[:start], Integer, :start, context:
|
37
37
|
|
38
38
|
end_index = options[:end]
|
39
|
-
assert_type
|
39
|
+
assert_type(end_index, Integer, :end, context:) if end_index
|
40
40
|
end
|
41
41
|
|
42
42
|
##
|
@@ -33,7 +33,7 @@ module Html2rss
|
|
33
33
|
# 'Product (23,42€)'
|
34
34
|
class Template < Base
|
35
35
|
def self.validate_args!(value, context)
|
36
|
-
assert_type value, String, :value
|
36
|
+
assert_type value, String, :value, context:
|
37
37
|
|
38
38
|
string = context[:options]&.dig(:string).to_s
|
39
39
|
raise InvalidType, 'The `string` template is absent.' if string.empty?
|
@@ -74,9 +74,9 @@ module Html2rss
|
|
74
74
|
# @return [String]
|
75
75
|
# @deprecated Use %<id>s formatting instead. Will be removed in version 1.0.0. See README / Dynamic parameters.
|
76
76
|
def format_string_with_methods
|
77
|
-
warn '[DEPRECATION] This method of using params is deprecated and \
|
78
|
-
|
79
|
-
|
77
|
+
Log.warn '[DEPRECATION] This method of using params is deprecated and \
|
78
|
+
support for it will be removed in version 1.0.0.\
|
79
|
+
Please use dynamic parameters (i.e. %<id>s, see README.md) instead.'
|
80
80
|
|
81
81
|
string % methods
|
82
82
|
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'zlib'
|
4
|
+
require 'sanitize'
|
5
|
+
|
6
|
+
module Html2rss
|
7
|
+
class AutoSource
|
8
|
+
##
|
9
|
+
# Article is a simple data object representing an article extracted from a page.
|
10
|
+
# It is enumerable and responds to all keys specified in PROVIDED_KEYS.
|
11
|
+
class Article
|
12
|
+
include Enumerable
|
13
|
+
include Comparable
|
14
|
+
|
15
|
+
PROVIDED_KEYS = %i[id title description url image guid published_at scraper].freeze
|
16
|
+
|
17
|
+
# @param options [Hash<Symbol, String>]
|
18
|
+
def initialize(**options)
|
19
|
+
@to_h = {}
|
20
|
+
options.each_pair { |key, value| @to_h[key] = value.freeze if value }
|
21
|
+
@to_h.freeze
|
22
|
+
|
23
|
+
return unless (unknown_keys = options.keys - PROVIDED_KEYS).any?
|
24
|
+
|
25
|
+
Log.warn "Article: unknown keys found: #{unknown_keys.join(', ')}"
|
26
|
+
end
|
27
|
+
|
28
|
+
# Checks if the article is valid based on the presence of URL, ID, and either title or description.
|
29
|
+
# @return [Boolean] True if the article is valid, otherwise false.
|
30
|
+
def valid?
|
31
|
+
!url.to_s.empty? && (!title.to_s.empty? || !description.to_s.empty?) && !id.to_s.empty?
|
32
|
+
end
|
33
|
+
|
34
|
+
# @yield [key, value]
|
35
|
+
# @return [Enumerator] if no block is given
|
36
|
+
def each
|
37
|
+
return enum_for(:each) unless block_given?
|
38
|
+
|
39
|
+
PROVIDED_KEYS.each { |key| yield(key, public_send(key)) }
|
40
|
+
end
|
41
|
+
|
42
|
+
def id
|
43
|
+
@to_h[:id]
|
44
|
+
end
|
45
|
+
|
46
|
+
def title
|
47
|
+
@to_h[:title]
|
48
|
+
end
|
49
|
+
|
50
|
+
def description
|
51
|
+
return @description if defined?(@description)
|
52
|
+
|
53
|
+
return if url.to_s.empty? || @to_h[:description].to_s.empty?
|
54
|
+
|
55
|
+
@description ||= Html2rss::AttributePostProcessors::SanitizeHtml.get(@to_h[:description], url)
|
56
|
+
end
|
57
|
+
|
58
|
+
# @return [Addressable::URI, nil]
|
59
|
+
def url
|
60
|
+
@url ||= Html2rss::Utils.sanitize_url(@to_h[:url])
|
61
|
+
end
|
62
|
+
|
63
|
+
# @return [Addressable::URI, nil]
|
64
|
+
def image
|
65
|
+
@image ||= Html2rss::Utils.sanitize_url(@to_h[:image])
|
66
|
+
end
|
67
|
+
|
68
|
+
# Generates a unique identifier based on the URL and ID using CRC32.
|
69
|
+
# @return [String]
|
70
|
+
def guid
|
71
|
+
@guid ||= Zlib.crc32([url, id].join('#!/')).to_s(36).encode('utf-8')
|
72
|
+
end
|
73
|
+
|
74
|
+
# Parses and returns the published_at time.
|
75
|
+
# @return [Time, nil]
|
76
|
+
def published_at
|
77
|
+
return if (string = @to_h[:published_at].to_s).strip.empty?
|
78
|
+
|
79
|
+
@published_at ||= Time.parse(string)
|
80
|
+
rescue ArgumentError
|
81
|
+
nil
|
82
|
+
end
|
83
|
+
|
84
|
+
def scraper
|
85
|
+
@to_h[:scraper]
|
86
|
+
end
|
87
|
+
|
88
|
+
def <=>(other)
|
89
|
+
return nil unless other.is_a?(Article)
|
90
|
+
|
91
|
+
0 if other.all? { |key, value| value == public_send(key) ? public_send(key) <=> value : false }
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
class AutoSource
|
5
|
+
##
|
6
|
+
# Extracts channel information from
|
7
|
+
# 1. the HTML document's <head>.
|
8
|
+
# 2. the HTTP response
|
9
|
+
class Channel
|
10
|
+
##
|
11
|
+
#
|
12
|
+
# @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document.
|
13
|
+
# @param response [Faraday::Response] The HTTP response received for the HTML document.
|
14
|
+
def initialize(parsed_body, url:, response:, articles: [])
|
15
|
+
@parsed_body = parsed_body
|
16
|
+
@url = url
|
17
|
+
@response = response
|
18
|
+
@articles = articles
|
19
|
+
end
|
20
|
+
|
21
|
+
def url = extract_url
|
22
|
+
def title = extract_title
|
23
|
+
def language = extract_language
|
24
|
+
def description = extract_description
|
25
|
+
def image = extract_image
|
26
|
+
def ttl = extract_ttl
|
27
|
+
def last_build_date = response.headers['last-modified']
|
28
|
+
|
29
|
+
def generator
|
30
|
+
"html2rss V. #{::Html2rss::VERSION} (using auto_source scrapers: #{scraper_counts})"
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
attr_reader :parsed_body, :response
|
36
|
+
|
37
|
+
def extract_url
|
38
|
+
@url.normalize.to_s
|
39
|
+
end
|
40
|
+
|
41
|
+
def extract_title
|
42
|
+
parsed_body.at_css('head > title')&.text
|
43
|
+
end
|
44
|
+
|
45
|
+
def extract_language
|
46
|
+
return parsed_body['lang'] if parsed_body.name == 'html' && parsed_body['lang']
|
47
|
+
|
48
|
+
parsed_body.at_css('[lang]')&.[]('lang')
|
49
|
+
end
|
50
|
+
|
51
|
+
def extract_description
|
52
|
+
parsed_body.at_css('meta[name="description"]')&.[]('content') || ''
|
53
|
+
end
|
54
|
+
|
55
|
+
def extract_image
|
56
|
+
url = parsed_body.at_css('meta[property="og:image"]')&.[]('content')
|
57
|
+
Html2rss::Utils.sanitize_url(url) if url
|
58
|
+
end
|
59
|
+
|
60
|
+
def extract_ttl
|
61
|
+
ttl = response.headers['cache-control']&.match(/max-age=(\d+)/)&.[](1)
|
62
|
+
return unless ttl
|
63
|
+
|
64
|
+
ttl.to_i.fdiv(60).ceil
|
65
|
+
end
|
66
|
+
|
67
|
+
def scraper_counts
|
68
|
+
scraper_counts = +''
|
69
|
+
|
70
|
+
@articles.each_with_object(Hash.new(0)) { |article, counts| counts[article.scraper] += 1 }
|
71
|
+
.each do |klass, count|
|
72
|
+
scraper_counts.concat("[#{klass.to_s.gsub('Html2rss::AutoSource::Scraper::', '')}=#{count}]")
|
73
|
+
end
|
74
|
+
|
75
|
+
scraper_counts
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
class AutoSource
|
5
|
+
##
|
6
|
+
# Cleanup is responsible for cleaning up the extracted articles.
|
7
|
+
# :reek:MissingSafeMethod { enabled: false }
|
8
|
+
# It applies various strategies to filter and refine the article list.
|
9
|
+
class Cleanup
|
10
|
+
class << self
|
11
|
+
def call(articles, url:, keep_different_domain: false)
|
12
|
+
Log.debug "Cleanup: start with #{articles.size} articles"
|
13
|
+
|
14
|
+
articles.select!(&:valid?)
|
15
|
+
|
16
|
+
remove_short!(articles, :title)
|
17
|
+
|
18
|
+
deduplicate_by!(articles, :url)
|
19
|
+
deduplicate_by!(articles, :title)
|
20
|
+
|
21
|
+
keep_only_http_urls!(articles)
|
22
|
+
reject_different_domain!(articles, url) unless keep_different_domain
|
23
|
+
|
24
|
+
Log.debug "Cleanup: end with #{articles.size} articles"
|
25
|
+
articles
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
##
|
31
|
+
# Removes articles with short values for a given key.
|
32
|
+
#
|
33
|
+
# @param articles [Array<Article>] The list of articles to process.
|
34
|
+
# @param key [Symbol] The key to check for short values.
|
35
|
+
# @param min_words [Integer] The minimum number of words required.
|
36
|
+
def remove_short!(articles, key = :title, min_words: 2)
|
37
|
+
articles.reject! do |article|
|
38
|
+
value = article.public_send(key)
|
39
|
+
value.nil? || value.to_s.split.size < min_words
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
##
|
44
|
+
# Deduplicates articles by a given key.
|
45
|
+
#
|
46
|
+
# @param articles [Array<Article>] The list of articles to process.
|
47
|
+
# @param key [Symbol] The key to deduplicate by.
|
48
|
+
def deduplicate_by!(articles, key)
|
49
|
+
seen = {}
|
50
|
+
articles.reject! do |article|
|
51
|
+
value = article.public_send(key)
|
52
|
+
value.nil? || seen.key?(value).tap { seen[value] = true }
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
##
|
57
|
+
# Keeps only articles with HTTP or HTTPS URLs.
|
58
|
+
#
|
59
|
+
# @param articles [Array<Article>] The list of articles to process.
|
60
|
+
def keep_only_http_urls!(articles)
|
61
|
+
articles.select! { |article| %w[http https].include?(article.url&.scheme) }
|
62
|
+
end
|
63
|
+
|
64
|
+
##
|
65
|
+
# Rejects articles that have a URL not on the same domain as the source.
|
66
|
+
#
|
67
|
+
# @param articles [Array<Article>] The list of articles to process.
|
68
|
+
# @param base_url [Addressable::URI] The source URL to compare against.
|
69
|
+
def reject_different_domain!(articles, base_url)
|
70
|
+
base_host = base_url.host
|
71
|
+
articles.select! { |article| article.url&.host == base_host }
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
class AutoSource
|
5
|
+
##
|
6
|
+
# Reducer is responsible for reducing the list of articles.
|
7
|
+
# It keeps only the longest attributes of articles with the same URL.
|
8
|
+
# It also filters out invalid articles.
|
9
|
+
class Reducer
|
10
|
+
class << self
|
11
|
+
def call(articles, **_options)
|
12
|
+
Log.debug "Reducer: inited with #{articles.size} articles"
|
13
|
+
|
14
|
+
reduce_by_keeping_longest_values(articles, keep: [:scraper]) { |article| article.url&.path }
|
15
|
+
end
|
16
|
+
|
17
|
+
private
|
18
|
+
|
19
|
+
# @param articles [Array<Article>]
|
20
|
+
# @return [Array<Article>] reduced articles
|
21
|
+
def reduce_by_keeping_longest_values(articles, keep:, &)
|
22
|
+
grouped_by_block = articles.group_by(&)
|
23
|
+
grouped_by_block.each_with_object([]) do |(_key, grouped_articles), result|
|
24
|
+
memo_object = {}
|
25
|
+
grouped_articles.each do |article_hash|
|
26
|
+
keep_longest_values(memo_object, article_hash, keep:)
|
27
|
+
end
|
28
|
+
|
29
|
+
result << Article.new(**memo_object)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def keep_longest_values(memo_object, article_hash, keep:)
|
34
|
+
article_hash.each do |key, value|
|
35
|
+
next if value.eql?(memo_object[key])
|
36
|
+
|
37
|
+
if keep.include?(key)
|
38
|
+
memo_object[key] ||= []
|
39
|
+
memo_object[key] << value
|
40
|
+
elsif value && value.to_s.size > memo_object[key].to_s.size
|
41
|
+
memo_object[key] = value
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rss'
|
4
|
+
|
5
|
+
module Html2rss
|
6
|
+
class AutoSource
|
7
|
+
##
|
8
|
+
# Converts the autosourced channel and articles to an RSS feed.
|
9
|
+
class RssBuilder
|
10
|
+
def self.add_guid(article, maker)
|
11
|
+
maker.guid.tap do |guid|
|
12
|
+
guid.content = article.guid
|
13
|
+
guid.isPermaLink = false
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
def self.add_image(article, maker)
|
18
|
+
url = article.image || return
|
19
|
+
|
20
|
+
maker.enclosure.tap do |enclosure|
|
21
|
+
enclosure.url = url
|
22
|
+
enclosure.type = Html2rss::Utils.guess_content_type_from_url(url)
|
23
|
+
enclosure.length = 0
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def initialize(channel:, articles:)
|
28
|
+
@channel = channel
|
29
|
+
@articles = articles
|
30
|
+
end
|
31
|
+
|
32
|
+
def call
|
33
|
+
RSS::Maker.make('2.0') do |maker|
|
34
|
+
make_channel(maker.channel)
|
35
|
+
make_items(maker)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
private
|
40
|
+
|
41
|
+
attr_reader :channel, :articles
|
42
|
+
|
43
|
+
def make_channel(maker)
|
44
|
+
%i[language title description ttl].each do |key|
|
45
|
+
maker.public_send(:"#{key}=", channel.public_send(key))
|
46
|
+
end
|
47
|
+
|
48
|
+
maker.link = channel.url
|
49
|
+
maker.generator = channel.generator
|
50
|
+
maker.updated = channel.last_build_date
|
51
|
+
end
|
52
|
+
|
53
|
+
def make_items(maker)
|
54
|
+
articles.each do |article|
|
55
|
+
maker.items.new_item do |item_maker|
|
56
|
+
RssBuilder.add_guid(article, item_maker)
|
57
|
+
RssBuilder.add_image(article, item_maker)
|
58
|
+
|
59
|
+
item_maker.title = article.title
|
60
|
+
item_maker.description = article.description
|
61
|
+
item_maker.pubDate = article.published_at
|
62
|
+
item_maker.link = article.url
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|