html2rss 0.11.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +38 -10
- data/html2rss.gemspec +1 -0
- data/lib/html2rss/attribute_post_processors/base.rb +74 -0
- data/lib/html2rss/attribute_post_processors/gsub.rb +17 -17
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +6 -7
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +5 -9
- data/lib/html2rss/attribute_post_processors/parse_time.rb +10 -10
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +12 -9
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +17 -8
- data/lib/html2rss/attribute_post_processors/substring.rb +30 -10
- data/lib/html2rss/attribute_post_processors/template.rb +19 -11
- data/lib/html2rss/attribute_post_processors.rb +8 -0
- data/lib/html2rss/auto_source/article.rb +95 -0
- data/lib/html2rss/auto_source/channel.rb +79 -0
- data/lib/html2rss/auto_source/cleanup.rb +76 -0
- data/lib/html2rss/auto_source/reducer.rb +48 -0
- data/lib/html2rss/auto_source/rss_builder.rb +68 -0
- data/lib/html2rss/auto_source/scraper/schema/base.rb +61 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +122 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +123 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +54 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +118 -0
- data/lib/html2rss/auto_source/scraper.rb +33 -0
- data/lib/html2rss/auto_source.rb +77 -0
- data/lib/html2rss/cli.rb +10 -0
- data/lib/html2rss/config/channel.rb +4 -2
- data/lib/html2rss/config/selectors.rb +13 -2
- data/lib/html2rss/item.rb +8 -2
- data/lib/html2rss/utils.rb +5 -10
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +21 -0
- metadata +30 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7a2bf557dd65533533e07b4581e195f2d2b32ff906831526a4d7aed27a558d71
|
4
|
+
data.tar.gz: f42e5f03649a08219d310a2545413c371f851530c4d323fd68ef783b4b3b5e13
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 724a1fa8ab15ae140278eb9b055f22e7aad12e94627795f7a2f13c78f5421607e39d6ba040821b4c47b69f963cc0180bf8e964ff0b896403cb6305ed1d67dbb5
|
7
|
+
data.tar.gz: a06c2e16b0b51c6b6d2184430efc2a4e8b2812fee413163aa2991567e7608141f1c18189fdded58c8c3383940c4790478cd631abc6a1470ad648b2030fdefaab
|
data/README.md
CHANGED
@@ -26,26 +26,40 @@ You can also install it as a dependency in your Ruby project:
|
|
26
26
|
|
27
27
|
## Generating a feed on the CLI
|
28
28
|
|
29
|
-
|
29
|
+
### using automatic scraping
|
30
|
+
|
31
|
+
html2rss offers an automatic scraping feature. Try it with:
|
32
|
+
|
33
|
+
`html2rss auto https://unmatchedstyle.com/`
|
34
|
+
|
35
|
+
### creating a feed config file and using it
|
36
|
+
|
37
|
+
If the results are not to your satisfaction, you can create a feed config file.
|
38
|
+
|
39
|
+
Create a file called `my_config_file.yml` with this sample content:
|
30
40
|
|
31
41
|
```yml
|
32
42
|
channel:
|
33
|
-
url: https://
|
43
|
+
url: https://unmatchedstyle.com
|
34
44
|
selectors:
|
35
45
|
items:
|
36
|
-
selector: "
|
46
|
+
selector: "article[id^='post-']"
|
37
47
|
title:
|
38
|
-
selector:
|
48
|
+
selector: h2
|
39
49
|
link:
|
40
50
|
selector: a
|
41
51
|
extractor: href
|
52
|
+
description:
|
53
|
+
selector: ".post-content"
|
54
|
+
post_process:
|
55
|
+
- name: sanitize_html
|
42
56
|
```
|
43
57
|
|
44
|
-
Build the
|
58
|
+
Build the feed from this config with: `html2rss feed ./my_config_file.yml`.
|
45
59
|
|
46
60
|
## Generating a feed with Ruby
|
47
61
|
|
48
|
-
Here's a minimal working example
|
62
|
+
Here's a minimal working example using Ruby:
|
49
63
|
|
50
64
|
```ruby
|
51
65
|
require 'html2rss'
|
@@ -481,7 +495,7 @@ feeds:
|
|
481
495
|
|
482
496
|
Your feed configs go below `feeds`. Everything else is part of the global config.
|
483
497
|
|
484
|
-
Find a full example of a `feeds.yml` at [`spec/feeds.test.yml`](https://github.com/html2rss/html2rss/blob/master/spec/feeds.test.yml).
|
498
|
+
Find a full example of a `feeds.yml` at [`spec/fixtures/feeds.test.yml`](https://github.com/html2rss/html2rss/blob/master/spec/fixtures/feeds.test.yml).
|
485
499
|
|
486
500
|
Now you can build your feeds like this:
|
487
501
|
|
@@ -583,8 +597,22 @@ Recommended further readings:
|
|
583
597
|
|
584
598
|
### Contributing
|
585
599
|
|
586
|
-
|
600
|
+
Find ideas for what to contribute in:
|
601
|
+
|
602
|
+
1. <https://github.com/orgs/html2rss/discussions>
|
603
|
+
2. the issues tracker: <https://github.com/html2rss/html2rss/issues>
|
604
|
+
|
605
|
+
#### Development Helpers
|
606
|
+
|
607
|
+
1. `bin/setup`: installs dependencies and sets up the development environment.
|
608
|
+
2. `bin/guard`: automatically runs rspec, rubocop and reek when a file changes.
|
609
|
+
3. for a modern Ruby development experience: install [`ruby-lsp`](https://github.com/Shopify/ruby-lsp) and integrate it into your IDE:
|
610
|
+
a. [Ruby in Visual Studio Code](https://code.visualstudio.com/docs/languages/ruby)
|
611
|
+
|
612
|
+
#### How to submit changes
|
613
|
+
|
614
|
+
1. Fork this repo ( <https://github.com/html2rss/html2rss/fork> )
|
587
615
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
588
|
-
3.
|
616
|
+
3. Implement and commit your changes (`git commit -am 'feat: add XYZ'`)
|
589
617
|
4. Push to the branch (`git push origin my-new-feature`)
|
590
|
-
5. Create a new Pull Request
|
618
|
+
5. Create a new Pull Request using the GitHub web UI
|
data/html2rss.gemspec
CHANGED
@@ -38,6 +38,7 @@ Gem::Specification.new do |spec|
|
|
38
38
|
spec.add_dependency 'kramdown'
|
39
39
|
spec.add_dependency 'mime-types', '> 3.0'
|
40
40
|
spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
|
41
|
+
spec.add_dependency 'parallel'
|
41
42
|
spec.add_dependency 'regexp_parser'
|
42
43
|
spec.add_dependency 'reverse_markdown', '~> 2.0'
|
43
44
|
spec.add_dependency 'rss'
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
##
|
5
|
+
# Provides a namespace for attribute post processors.
|
6
|
+
module AttributePostProcessors
|
7
|
+
##
|
8
|
+
# All post processors must inherit from this base class and implement `self.validate_args!` and `#get`.
|
9
|
+
class Base
|
10
|
+
# Validates the presence of required options in the context
|
11
|
+
#
|
12
|
+
# @param keys [Array<Symbol>] the keys to check for presence
|
13
|
+
# @param context [Hash] the context containing options
|
14
|
+
# @raise [MissingOption] if any key is missing
|
15
|
+
def self.expect_options(keys, context)
|
16
|
+
keys.each do |key|
|
17
|
+
unless (options = context[:options]).key?(key)
|
18
|
+
raise MissingOption, "The `#{key}` option is missing in: #{options.inspect}", [],
|
19
|
+
cause: nil
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
# Asserts that the value is of the expected type(s)
|
25
|
+
#
|
26
|
+
# @param value [Object] the value to check
|
27
|
+
# @param types [Array<Class>, Class] the expected type(s)
|
28
|
+
# @param name [String] the name of the option being checked
|
29
|
+
# @param context [Item::Context] the context
|
30
|
+
# @raise [InvalidType] if the value is not of the expected type(s)
|
31
|
+
def self.assert_type(value, types = [], name, context:)
|
32
|
+
types = [types] unless types.is_a?(Array)
|
33
|
+
|
34
|
+
return if types.any? { |type| value.is_a?(type) }
|
35
|
+
|
36
|
+
options = context[:options] if context.is_a?(Hash)
|
37
|
+
options ||= { file: File.basename(caller_locations(1, 1).first.absolute_path) }
|
38
|
+
|
39
|
+
raise InvalidType, format('The type of `%<name>s` must be %<types>s, but is: %<type>s in: %<options>s',
|
40
|
+
name:, types: types.join(' or '), type: value.class, options: options.inspect),
|
41
|
+
[], cause: nil
|
42
|
+
end
|
43
|
+
|
44
|
+
##
|
45
|
+
# This method validates the arguments passed to the post processor. Must be implemented by subclasses.
|
46
|
+
def self.validate_args!(_value, _context)
|
47
|
+
raise NotImplementedError, 'You must implement the `validate_args!` method in the post processor'
|
48
|
+
end
|
49
|
+
|
50
|
+
# Initializes the post processor
|
51
|
+
#
|
52
|
+
# @param value [Object] the value to be processed
|
53
|
+
# @param context [Item::Context] the context
|
54
|
+
def initialize(value, context)
|
55
|
+
klass = self.class
|
56
|
+
# TODO: get rid of Hash
|
57
|
+
klass.assert_type(context, [Item::Context, Hash], 'context', context:)
|
58
|
+
klass.validate_args!(value, context)
|
59
|
+
|
60
|
+
@value = value
|
61
|
+
@context = context
|
62
|
+
end
|
63
|
+
|
64
|
+
attr_reader :value, :context
|
65
|
+
|
66
|
+
# Abstract method to be implemented by subclasses
|
67
|
+
#
|
68
|
+
# @raise [NotImplementedError] if not implemented in subclass
|
69
|
+
def get
|
70
|
+
raise NotImplementedError, 'You must implement the `get` method in the post processor'
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
@@ -25,39 +25,39 @@ module Html2rss
|
|
25
25
|
# `replacement` can be a String or a Hash.
|
26
26
|
#
|
27
27
|
# See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
|
28
|
-
class Gsub
|
28
|
+
class Gsub < Base
|
29
|
+
def self.validate_args!(value, context)
|
30
|
+
assert_type value, String, :value, context:
|
31
|
+
expect_options(%i[replacement pattern], context)
|
32
|
+
assert_type context.dig(:options, :replacement), [String, Hash], :replacement, context:
|
33
|
+
end
|
34
|
+
|
29
35
|
##
|
30
36
|
# @param value [String]
|
31
37
|
# @param context [Item::Context]
|
32
38
|
def initialize(value, context)
|
33
|
-
|
34
|
-
|
39
|
+
super
|
40
|
+
|
41
|
+
options = context[:options]
|
42
|
+
|
43
|
+
@replacement = options[:replacement]
|
44
|
+
@pattern = options[:pattern]
|
35
45
|
end
|
36
46
|
|
37
47
|
##
|
38
48
|
# @return [String]
|
39
49
|
def get
|
40
|
-
|
50
|
+
value.to_s.gsub(pattern, replacement)
|
41
51
|
end
|
42
52
|
|
43
53
|
private
|
44
54
|
|
55
|
+
attr_accessor :replacement
|
56
|
+
|
45
57
|
##
|
46
58
|
# @return [Regexp]
|
47
59
|
def pattern
|
48
|
-
pattern
|
49
|
-
raise ArgumentError, 'The `pattern` option is missing' unless pattern
|
50
|
-
|
51
|
-
pattern.is_a?(String) ? Utils.build_regexp_from_string(pattern) : pattern
|
52
|
-
end
|
53
|
-
|
54
|
-
##
|
55
|
-
# @return [Hash, String]
|
56
|
-
def replacement
|
57
|
-
replacement = @options[:replacement]
|
58
|
-
return replacement if replacement.is_a?(String) || replacement.is_a?(Hash)
|
59
|
-
|
60
|
-
raise ArgumentError, 'The `replacement` option must be a String or Hash'
|
60
|
+
@pattern.is_a?(String) ? Utils.build_regexp_from_string(@pattern) : @pattern
|
61
61
|
end
|
62
62
|
end
|
63
63
|
end
|
@@ -26,18 +26,17 @@ module Html2rss
|
|
26
26
|
#
|
27
27
|
# Would return:
|
28
28
|
# 'Lorem **ipsum** dolor'
|
29
|
-
class HtmlToMarkdown
|
30
|
-
|
31
|
-
|
32
|
-
# @param env [Item::Context]
|
33
|
-
def initialize(value, env)
|
34
|
-
@sanitized_value = SanitizeHtml.new(value, env).get
|
29
|
+
class HtmlToMarkdown < Base
|
30
|
+
def self.validate_args!(value, context)
|
31
|
+
assert_type value, String, :value, context:
|
35
32
|
end
|
36
33
|
|
37
34
|
##
|
38
35
|
# @return [String] formatted in Markdown
|
39
36
|
def get
|
40
|
-
|
37
|
+
sanitized_value = SanitizeHtml.new(value, context).get
|
38
|
+
|
39
|
+
ReverseMarkdown.convert(sanitized_value)
|
41
40
|
end
|
42
41
|
end
|
43
42
|
end
|
@@ -32,13 +32,9 @@ module Html2rss
|
|
32
32
|
# <h1>Section</h1>
|
33
33
|
#
|
34
34
|
# <p>Price: 12.34</p>
|
35
|
-
class MarkdownToHtml
|
36
|
-
|
37
|
-
|
38
|
-
# @param env [Item::Context] Context object providing additional environment details
|
39
|
-
def initialize(value, env)
|
40
|
-
@value = value
|
41
|
-
@env = env
|
35
|
+
class MarkdownToHtml < Base
|
36
|
+
def self.validate_args!(value, context)
|
37
|
+
assert_type value, String, :value, context:
|
42
38
|
end
|
43
39
|
|
44
40
|
##
|
@@ -46,8 +42,8 @@ module Html2rss
|
|
46
42
|
#
|
47
43
|
# @return [String] Sanitized HTML content
|
48
44
|
def get
|
49
|
-
html_content = Kramdown::Document.new(
|
50
|
-
SanitizeHtml.new(html_content,
|
45
|
+
html_content = Kramdown::Document.new(value).to_html
|
46
|
+
SanitizeHtml.new(html_content, context).get
|
51
47
|
end
|
52
48
|
end
|
53
49
|
end
|
@@ -24,22 +24,22 @@ module Html2rss
|
|
24
24
|
# Would return:
|
25
25
|
# "Tue, 02 Jul 2019 00:00:00 +0200"
|
26
26
|
#
|
27
|
-
# It uses
|
28
|
-
class ParseTime
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
def initialize(value, env)
|
33
|
-
@value = value.to_s
|
34
|
-
@time_zone = env[:config].time_zone
|
27
|
+
# It uses `Time.parse`.
|
28
|
+
class ParseTime < Base
|
29
|
+
def self.validate_args!(value, context)
|
30
|
+
assert_type(value, String, :value, context:)
|
31
|
+
assert_type(context[:config].time_zone, String, :time_zone, context:)
|
35
32
|
end
|
36
33
|
|
37
34
|
##
|
38
|
-
# Converts the provided time string to RFC822 format, taking into account the
|
35
|
+
# Converts the provided time string to RFC822 format, taking into account the time_zone.
|
39
36
|
#
|
40
37
|
# @return [String] RFC822 formatted time
|
38
|
+
# @raise [TZInfo::InvalidTimezoneIdentifier] if the configured time zone is invalid
|
41
39
|
def get
|
42
|
-
|
40
|
+
time_zone = context[:config].time_zone
|
41
|
+
|
42
|
+
Utils.use_zone(time_zone) { Time.parse(value).rfc822 }
|
43
43
|
end
|
44
44
|
end
|
45
45
|
end
|
@@ -21,21 +21,24 @@ module Html2rss
|
|
21
21
|
#
|
22
22
|
# Would return:
|
23
23
|
# 'http://why-not-use-a-link.uh'
|
24
|
-
class ParseUri
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
24
|
+
class ParseUri < Base
|
25
|
+
def self.validate_args!(value, context)
|
26
|
+
url_types = [String, URI::HTTP, Addressable::URI].freeze
|
27
|
+
|
28
|
+
assert_type(value, url_types, :value, context:)
|
29
|
+
assert_type(context.config.url, url_types, :url, context:)
|
30
|
+
|
31
|
+
raise ArgumentError, 'The `value` option is missing or empty.' if value.to_s.empty?
|
31
32
|
end
|
32
33
|
|
33
34
|
##
|
34
35
|
# @return [String]
|
35
36
|
def get
|
37
|
+
config_url = context.config.url
|
38
|
+
|
36
39
|
Html2rss::Utils.build_absolute_url_from_relative(
|
37
|
-
Html2rss::Utils.sanitize_url(
|
38
|
-
|
40
|
+
Html2rss::Utils.sanitize_url(value),
|
41
|
+
config_url
|
39
42
|
).to_s
|
40
43
|
end
|
41
44
|
end
|
@@ -38,19 +38,26 @@ module Html2rss
|
|
38
38
|
#
|
39
39
|
# Would return:
|
40
40
|
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
|
41
|
-
class SanitizeHtml
|
41
|
+
class SanitizeHtml < Base
|
42
|
+
def self.validate_args!(value, context)
|
43
|
+
assert_type value, String, :value, context:
|
44
|
+
end
|
45
|
+
|
42
46
|
##
|
43
|
-
#
|
44
|
-
# @param
|
45
|
-
|
46
|
-
|
47
|
-
|
47
|
+
# Shorthand method to get the sanitized HTML.
|
48
|
+
# @param html [String]
|
49
|
+
# @param url [String, Addressable::URI]
|
50
|
+
def self.get(html, url)
|
51
|
+
raise ArgumentError, 'url must be a String or Addressable::URI' if url.to_s.empty?
|
52
|
+
return nil if html.to_s.empty?
|
53
|
+
|
54
|
+
new(html, { config: Config::Channel.new({ url: }) }).get
|
48
55
|
end
|
49
56
|
|
50
57
|
##
|
51
58
|
# @return [String]
|
52
59
|
def get
|
53
|
-
sanitized_html = Sanitize.fragment(
|
60
|
+
sanitized_html = Sanitize.fragment(value, sanitize_config)
|
54
61
|
sanitized_html.to_s.gsub(/\s+/, ' ').strip
|
55
62
|
end
|
56
63
|
|
@@ -77,13 +84,15 @@ module Html2rss
|
|
77
84
|
}
|
78
85
|
end
|
79
86
|
|
87
|
+
def channel_url = context[:config].url
|
88
|
+
|
80
89
|
##
|
81
90
|
# Wrapper for transform_urls_to_absolute_ones to pass the channel_url.
|
82
91
|
#
|
83
92
|
# @param env [Hash]
|
84
93
|
# @return [nil]
|
85
94
|
def transform_urls_to_absolute_ones(env)
|
86
|
-
HtmlTransformers::TransformUrlsToAbsoluteOnes.new(
|
95
|
+
HtmlTransformers::TransformUrlsToAbsoluteOnes.new(channel_url).call(**env)
|
87
96
|
end
|
88
97
|
|
89
98
|
##
|
@@ -28,13 +28,15 @@ module Html2rss
|
|
28
28
|
#
|
29
29
|
# Would return:
|
30
30
|
# 'bar'
|
31
|
-
class Substring
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
31
|
+
class Substring < Base
|
32
|
+
def self.validate_args!(value, context)
|
33
|
+
assert_type value, String, :value, context:
|
34
|
+
|
35
|
+
options = context[:options]
|
36
|
+
assert_type options[:start], Integer, :start, context:
|
37
|
+
|
38
|
+
end_index = options[:end]
|
39
|
+
assert_type(end_index, Integer, :end, context:) if end_index
|
38
40
|
end
|
39
41
|
|
40
42
|
##
|
@@ -42,11 +44,29 @@ module Html2rss
|
|
42
44
|
#
|
43
45
|
# @return [String] The extracted substring.
|
44
46
|
def get
|
45
|
-
|
46
|
-
|
47
|
+
value[range]
|
48
|
+
end
|
49
|
+
|
50
|
+
##
|
51
|
+
# Determines the range for the substring extraction based on the provided start and end indices.
|
52
|
+
#
|
53
|
+
# @return [Range] The range object representing the start and end/Infinity (integers).
|
54
|
+
def range
|
55
|
+
return (start_index..) unless end_index?
|
56
|
+
|
57
|
+
if start_index == end_index
|
58
|
+
raise ArgumentError,
|
59
|
+
'The `start` value must be unequal to the `end` value.'
|
60
|
+
end
|
47
61
|
|
48
|
-
|
62
|
+
(start_index..end_index)
|
49
63
|
end
|
64
|
+
|
65
|
+
private
|
66
|
+
|
67
|
+
def end_index? = !context[:options][:end].to_s.empty?
|
68
|
+
def end_index = context[:options][:end].to_i
|
69
|
+
def start_index = context[:options][:start].to_i
|
50
70
|
end
|
51
71
|
end
|
52
72
|
end
|
@@ -31,15 +31,23 @@ module Html2rss
|
|
31
31
|
#
|
32
32
|
# Would return:
|
33
33
|
# 'Product (23,42€)'
|
34
|
-
class Template
|
34
|
+
class Template < Base
|
35
|
+
def self.validate_args!(value, context)
|
36
|
+
assert_type value, String, :value, context:
|
37
|
+
|
38
|
+
string = context[:options]&.dig(:string).to_s
|
39
|
+
raise InvalidType, 'The `string` template is absent.' if string.empty?
|
40
|
+
end
|
41
|
+
|
35
42
|
##
|
36
43
|
# @param value [String]
|
37
|
-
# @param
|
38
|
-
def initialize(value,
|
39
|
-
|
40
|
-
|
41
|
-
@
|
42
|
-
@
|
44
|
+
# @param context [Item::Context]
|
45
|
+
def initialize(value, context)
|
46
|
+
super
|
47
|
+
|
48
|
+
@options = context[:options] || {}
|
49
|
+
@item = context[:item]
|
50
|
+
@string = @options[:string].to_s
|
43
51
|
end
|
44
52
|
|
45
53
|
##
|
@@ -66,9 +74,9 @@ module Html2rss
|
|
66
74
|
# @return [String]
|
67
75
|
# @deprecated Use %<id>s formatting instead. Will be removed in version 1.0.0. See README / Dynamic parameters.
|
68
76
|
def format_string_with_methods
|
69
|
-
warn '[DEPRECATION] This method of using params is deprecated and \
|
70
|
-
|
71
|
-
|
77
|
+
Log.warn '[DEPRECATION] This method of using params is deprecated and \
|
78
|
+
support for it will be removed in version 1.0.0.\
|
79
|
+
Please use dynamic parameters (i.e. %<id>s, see README.md) instead.'
|
72
80
|
|
73
81
|
string % methods
|
74
82
|
end
|
@@ -86,7 +94,7 @@ module Html2rss
|
|
86
94
|
# @param method_name [String, Symbol]
|
87
95
|
# @return [String]
|
88
96
|
def item_value(method_name)
|
89
|
-
method_name.to_sym == :self ?
|
97
|
+
method_name.to_sym == :self ? value : @item.public_send(method_name).to_s
|
90
98
|
end
|
91
99
|
end
|
92
100
|
end
|
@@ -8,6 +8,14 @@ module Html2rss
|
|
8
8
|
# Error raised when an unknown post processor name is requested.
|
9
9
|
class UnknownPostProcessorName < Html2rss::Error; end
|
10
10
|
|
11
|
+
##
|
12
|
+
# Error raised when a required option is missing.
|
13
|
+
class MissingOption < Html2rss::Error; end
|
14
|
+
|
15
|
+
##
|
16
|
+
# Error raised when an invalid type is provided.
|
17
|
+
class InvalidType < Html2rss::Error; end
|
18
|
+
|
11
19
|
##
|
12
20
|
# Maps the post processor name to the class implementing the post processor.
|
13
21
|
#
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'zlib'
|
4
|
+
require 'sanitize'
|
5
|
+
|
6
|
+
module Html2rss
|
7
|
+
class AutoSource
|
8
|
+
##
|
9
|
+
# Article is a simple data object representing an article extracted from a page.
|
10
|
+
# It is enumerable and responds to all keys specified in PROVIDED_KEYS.
|
11
|
+
class Article
|
12
|
+
include Enumerable
|
13
|
+
include Comparable
|
14
|
+
|
15
|
+
PROVIDED_KEYS = %i[id title description url image guid published_at scraper].freeze
|
16
|
+
|
17
|
+
# @param options [Hash<Symbol, String>]
|
18
|
+
def initialize(**options)
|
19
|
+
@to_h = {}
|
20
|
+
options.each_pair { |key, value| @to_h[key] = value.freeze if value }
|
21
|
+
@to_h.freeze
|
22
|
+
|
23
|
+
return unless (unknown_keys = options.keys - PROVIDED_KEYS).any?
|
24
|
+
|
25
|
+
Log.warn "Article: unknown keys found: #{unknown_keys.join(', ')}"
|
26
|
+
end
|
27
|
+
|
28
|
+
# Checks if the article is valid based on the presence of URL, ID, and either title or description.
|
29
|
+
# @return [Boolean] True if the article is valid, otherwise false.
|
30
|
+
def valid?
|
31
|
+
!url.to_s.empty? && (!title.to_s.empty? || !description.to_s.empty?) && !id.to_s.empty?
|
32
|
+
end
|
33
|
+
|
34
|
+
# @yield [key, value]
|
35
|
+
# @return [Enumerator] if no block is given
|
36
|
+
def each
|
37
|
+
return enum_for(:each) unless block_given?
|
38
|
+
|
39
|
+
PROVIDED_KEYS.each { |key| yield(key, public_send(key)) }
|
40
|
+
end
|
41
|
+
|
42
|
+
def id
|
43
|
+
@to_h[:id]
|
44
|
+
end
|
45
|
+
|
46
|
+
def title
|
47
|
+
@to_h[:title]
|
48
|
+
end
|
49
|
+
|
50
|
+
def description
|
51
|
+
return @description if defined?(@description)
|
52
|
+
|
53
|
+
return if url.to_s.empty? || @to_h[:description].to_s.empty?
|
54
|
+
|
55
|
+
@description ||= Html2rss::AttributePostProcessors::SanitizeHtml.get(@to_h[:description], url)
|
56
|
+
end
|
57
|
+
|
58
|
+
# @return [Addressable::URI, nil]
|
59
|
+
def url
|
60
|
+
@url ||= Html2rss::Utils.sanitize_url(@to_h[:url])
|
61
|
+
end
|
62
|
+
|
63
|
+
# @return [Addressable::URI, nil]
|
64
|
+
def image
|
65
|
+
@image ||= Html2rss::Utils.sanitize_url(@to_h[:image])
|
66
|
+
end
|
67
|
+
|
68
|
+
# Generates a unique identifier based on the URL and ID using CRC32.
|
69
|
+
# @return [String]
|
70
|
+
def guid
|
71
|
+
@guid ||= Zlib.crc32([url, id].join('#!/')).to_s(36).encode('utf-8')
|
72
|
+
end
|
73
|
+
|
74
|
+
# Parses and returns the published_at time.
|
75
|
+
# @return [Time, nil]
|
76
|
+
def published_at
|
77
|
+
return if (string = @to_h[:published_at].to_s).strip.empty?
|
78
|
+
|
79
|
+
@published_at ||= Time.parse(string)
|
80
|
+
rescue ArgumentError
|
81
|
+
nil
|
82
|
+
end
|
83
|
+
|
84
|
+
def scraper
|
85
|
+
@to_h[:scraper]
|
86
|
+
end
|
87
|
+
|
88
|
+
def <=>(other)
|
89
|
+
return nil unless other.is_a?(Article)
|
90
|
+
|
91
|
+
0 if other.all? { |key, value| value == public_send(key) ? public_send(key) <=> value : false }
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|