html2rss 0.6.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +122 -17
- data/.travis.yml +3 -3
- data/CHANGELOG.md +97 -42
- data/Gemfile +2 -0
- data/Gemfile.lock +84 -53
- data/README.md +461 -47
- data/html2rss.gemspec +11 -7
- data/lib/html2rss.rb +5 -4
- data/lib/html2rss/attribute_post_processors.rb +4 -10
- data/lib/html2rss/attribute_post_processors/gsub.rb +42 -0
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +45 -0
- data/lib/html2rss/attribute_post_processors/parse_time.rb +1 -1
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +3 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +42 -22
- data/lib/html2rss/attribute_post_processors/substring.rb +11 -5
- data/lib/html2rss/attribute_post_processors/template.rb +23 -14
- data/lib/html2rss/config.rb +40 -20
- data/lib/html2rss/feed_builder.rb +42 -20
- data/lib/html2rss/item.rb +24 -18
- data/lib/html2rss/item_extractors.rb +6 -13
- data/lib/html2rss/item_extractors/attribute.rb +1 -1
- data/lib/html2rss/item_extractors/href.rb +2 -2
- data/lib/html2rss/item_extractors/static.rb +2 -2
- data/lib/html2rss/utils.rb +18 -12
- data/lib/html2rss/version.rb +2 -1
- metadata +88 -23
@@ -1,5 +1,5 @@
|
|
1
1
|
require 'rss'
|
2
|
-
|
2
|
+
require 'mime/types'
|
3
3
|
|
4
4
|
module Html2rss
|
5
5
|
##
|
@@ -18,42 +18,64 @@ module Html2rss
|
|
18
18
|
# @return [RSS:Rss]
|
19
19
|
def rss
|
20
20
|
RSS::Maker.make('2.0') do |maker|
|
21
|
-
add_channel(maker)
|
21
|
+
add_channel(maker.channel)
|
22
22
|
|
23
|
-
|
24
|
-
add_item(feed_item, maker.items.new_item)
|
25
|
-
end
|
23
|
+
items.each { |item| add_item(item, maker.items.new_item) }
|
26
24
|
end
|
27
25
|
end
|
28
26
|
|
27
|
+
def self.add_categories(categories, item_maker)
|
28
|
+
categories.each { |category| item_maker.categories.new_category.content = category }
|
29
|
+
end
|
30
|
+
|
31
|
+
def self.add_enclosure_from_url(url, item_maker)
|
32
|
+
return unless url
|
33
|
+
|
34
|
+
enclosure = item_maker.enclosure
|
35
|
+
content_type = MIME::Types.type_for(File.extname(url).delete('.'))
|
36
|
+
|
37
|
+
enclosure.type = content_type.any? ? content_type.first.to_s : 'application/octet-stream'
|
38
|
+
enclosure.length = 0
|
39
|
+
enclosure.url = url
|
40
|
+
end
|
41
|
+
|
42
|
+
def self.add_guid(item, item_maker)
|
43
|
+
guid = item_maker.guid
|
44
|
+
guid.content = Digest::SHA1.hexdigest(item.title)
|
45
|
+
guid.isPermaLink = false
|
46
|
+
end
|
47
|
+
|
29
48
|
private
|
30
49
|
|
31
50
|
attr_reader :config
|
32
51
|
|
33
|
-
def add_channel(
|
52
|
+
def add_channel(channel_maker)
|
34
53
|
%i[language author title description link ttl].each do |attribute_name|
|
35
|
-
|
54
|
+
channel_maker.public_send("#{attribute_name}=", config.public_send(attribute_name))
|
36
55
|
end
|
37
56
|
|
38
|
-
|
39
|
-
|
57
|
+
channel_maker.generator = "html2rss V. #{::Html2rss::VERSION}"
|
58
|
+
channel_maker.lastBuildDate = Time.now
|
40
59
|
end
|
41
60
|
|
42
|
-
def
|
43
|
-
@
|
44
|
-
end
|
61
|
+
def items
|
62
|
+
return @items if defined?(@items)
|
45
63
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
64
|
+
items = Item.from_url(config.url, config)
|
65
|
+
|
66
|
+
items.reverse! if config.items_order == :reverse
|
67
|
+
|
68
|
+
@items = items
|
69
|
+
end
|
50
70
|
|
51
|
-
|
52
|
-
|
71
|
+
def add_item(item, item_maker)
|
72
|
+
item.available_attributes.each do |attribute_name|
|
73
|
+
item_maker.public_send("#{attribute_name}=", item.public_send(attribute_name))
|
53
74
|
end
|
54
75
|
|
55
|
-
|
56
|
-
|
76
|
+
self.class.add_categories(item.categories, item_maker)
|
77
|
+
self.class.add_enclosure_from_url(item.enclosure_url, item_maker) if item.enclosure?
|
78
|
+
self.class.add_guid(item, item_maker)
|
57
79
|
end
|
58
80
|
end
|
59
81
|
end
|
data/lib/html2rss/item.rb
CHANGED
@@ -1,9 +1,6 @@
|
|
1
1
|
require 'faraday'
|
2
2
|
require 'faraday_middleware'
|
3
|
-
require 'open-uri'
|
4
3
|
require 'nokogiri'
|
5
|
-
require_relative 'item_extractors'
|
6
|
-
require_relative 'attribute_post_processors'
|
7
4
|
|
8
5
|
module Html2rss
|
9
6
|
##
|
@@ -26,15 +23,15 @@ module Html2rss
|
|
26
23
|
|
27
24
|
attribute_options = config.attribute_options(method_name)
|
28
25
|
|
29
|
-
extractor = ItemExtractors.get_extractor(attribute_options[
|
26
|
+
extractor = ItemExtractors.get_extractor(attribute_options[:extractor])
|
30
27
|
value = extractor.new(xml, attribute_options).get
|
31
28
|
|
32
|
-
post_process(value, attribute_options.fetch(
|
29
|
+
post_process(value, attribute_options.fetch(:post_process, false))
|
33
30
|
end
|
34
31
|
|
35
32
|
def available_attributes
|
36
|
-
@available_attributes ||= (%
|
37
|
-
@config.attribute_names) - [
|
33
|
+
@available_attributes ||= (%i[title link description author comments updated] &
|
34
|
+
@config.attribute_names) - %i[categories enclosure]
|
38
35
|
end
|
39
36
|
|
40
37
|
##
|
@@ -48,10 +45,17 @@ module Html2rss
|
|
48
45
|
##
|
49
46
|
# @return [Array]
|
50
47
|
def categories
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
48
|
+
config.category_selectors.map(&method(:method_missing))
|
49
|
+
end
|
50
|
+
|
51
|
+
def enclosure?
|
52
|
+
config.attribute?(:enclosure)
|
53
|
+
end
|
54
|
+
|
55
|
+
def enclosure_url
|
56
|
+
enclosure = Html2rss::Utils.sanitize_url(method_missing(:enclosure))
|
57
|
+
|
58
|
+
Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url).to_s if enclosure
|
55
59
|
end
|
56
60
|
|
57
61
|
##
|
@@ -59,20 +63,22 @@ module Html2rss
|
|
59
63
|
def self.from_url(url, config)
|
60
64
|
body = get_body_from_url(url, config)
|
61
65
|
|
62
|
-
Nokogiri
|
63
|
-
|
64
|
-
|
66
|
+
Nokogiri.HTML(body).css(config.selector(:items))
|
67
|
+
.map { |xml_item| new xml_item, config }
|
68
|
+
.keep_if(&:valid?)
|
65
69
|
end
|
66
70
|
|
67
71
|
private
|
68
72
|
|
69
73
|
def self.get_body_from_url(url, config)
|
70
|
-
|
74
|
+
request = Faraday.new(url: url, headers: config.headers) do |faraday|
|
71
75
|
faraday.use FaradayMiddleware::FollowRedirects
|
72
76
|
faraday.adapter Faraday.default_adapter
|
73
|
-
end
|
77
|
+
end
|
78
|
+
|
79
|
+
body = request.get.body
|
74
80
|
|
75
|
-
config.json? ? Html2rss::Utils.
|
81
|
+
config.json? ? Html2rss::Utils.object_to_xml(JSON.parse(body)) : body
|
76
82
|
end
|
77
83
|
private_class_method :get_body_from_url
|
78
84
|
|
@@ -82,7 +88,7 @@ module Html2rss
|
|
82
88
|
return value unless post_process_options
|
83
89
|
|
84
90
|
[post_process_options].flatten.each do |options|
|
85
|
-
value = AttributePostProcessors.get_processor(options[
|
91
|
+
value = AttributePostProcessors.get_processor(options[:name])
|
86
92
|
.new(value, options: options, item: self, config: @config)
|
87
93
|
.get
|
88
94
|
end
|
@@ -1,29 +1,22 @@
|
|
1
|
-
require_relative 'item_extractors/attribute'
|
2
|
-
require_relative 'item_extractors/current_time'
|
3
|
-
require_relative 'item_extractors/href'
|
4
|
-
require_relative 'item_extractors/html'
|
5
|
-
require_relative 'item_extractors/static'
|
6
|
-
require_relative 'item_extractors/text'
|
7
|
-
|
8
1
|
module Html2rss
|
9
2
|
##
|
10
3
|
# Provides a namespace for item extractors.
|
11
4
|
module ItemExtractors
|
12
5
|
DEFAULT = 'text'.freeze
|
6
|
+
private_constant :DEFAULT
|
13
7
|
|
14
8
|
def self.get_extractor(name)
|
15
|
-
@
|
16
|
-
|
17
|
-
|
9
|
+
@get_extractor ||= Hash.new do |extractors, key|
|
10
|
+
extractors[key] = Utils.get_class_from_name(key || DEFAULT, 'ItemExtractors')
|
11
|
+
end
|
18
12
|
|
19
|
-
|
20
|
-
end[name || DEFAULT]
|
13
|
+
@get_extractor[name]
|
21
14
|
end
|
22
15
|
|
23
16
|
##
|
24
17
|
# @return [Nokogiri::XML::Element]
|
25
18
|
def self.element(xml, options)
|
26
|
-
selector = options[
|
19
|
+
selector = options[:selector]
|
27
20
|
selector ? xml.css(selector) : xml
|
28
21
|
end
|
29
22
|
end
|
@@ -24,12 +24,12 @@ module Html2rss
|
|
24
24
|
def initialize(xml, options)
|
25
25
|
@options = options
|
26
26
|
element = ItemExtractors.element(xml, options)
|
27
|
-
@href = element.attr('href')
|
27
|
+
@href = Html2rss::Utils.sanitize_url(element.attr('href'))
|
28
28
|
end
|
29
29
|
|
30
30
|
# @return [URI::HTTPS, URI::HTTP]
|
31
31
|
def get
|
32
|
-
Html2rss::Utils.build_absolute_url_from_relative(@href, @options[
|
32
|
+
Html2rss::Utils.build_absolute_url_from_relative(@href, @options[:channel][:url])
|
33
33
|
end
|
34
34
|
end
|
35
35
|
end
|
@@ -15,13 +15,13 @@ module Html2rss
|
|
15
15
|
@options = options
|
16
16
|
end
|
17
17
|
|
18
|
-
# Returns what options[
|
18
|
+
# Returns what options[:static] holds.
|
19
19
|
#
|
20
20
|
# options = { static: 'Foobar' }
|
21
21
|
# Static.new(xml, options).get
|
22
22
|
# # => 'Foobar'
|
23
23
|
def get
|
24
|
-
@options[
|
24
|
+
@options[:static]
|
25
25
|
end
|
26
26
|
end
|
27
27
|
end
|
data/lib/html2rss/utils.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'active_support/core_ext/hash'
|
2
|
+
require 'addressable/uri'
|
2
3
|
require 'builder'
|
3
|
-
require 'hashie'
|
4
4
|
require 'json'
|
5
5
|
require 'nokogiri'
|
6
6
|
|
@@ -8,27 +8,33 @@ module Html2rss
|
|
8
8
|
##
|
9
9
|
# The collecting tank for utility methods.
|
10
10
|
module Utils
|
11
|
-
|
12
|
-
# A Hash with indifferent access, build with {https://github.com/intridea/hashie Hashie}.
|
13
|
-
class IndifferentAccessHash < Hash
|
14
|
-
include Hashie::Extensions::MergeInitializer
|
15
|
-
include Hashie::Extensions::IndifferentAccess
|
16
|
-
end
|
17
|
-
|
18
|
-
def self.build_absolute_url_from_relative(url, channel_url)
|
11
|
+
def self.build_absolute_url_from_relative(url, base_url)
|
19
12
|
url = URI(url) if url.is_a?(String)
|
20
13
|
|
21
14
|
return url if url.absolute?
|
22
15
|
|
23
|
-
URI(
|
16
|
+
URI(base_url).tap do |uri|
|
24
17
|
uri.path = url.path.to_s.start_with?('/') ? url.path : "/#{url.path}"
|
25
18
|
uri.query = url.query
|
26
19
|
uri.fragment = url.fragment if url.fragment
|
27
20
|
end
|
28
21
|
end
|
29
22
|
|
30
|
-
def self.
|
31
|
-
|
23
|
+
def self.object_to_xml(object)
|
24
|
+
object.to_xml(skip_instruct: true, skip_types: true)
|
25
|
+
end
|
26
|
+
|
27
|
+
def self.get_class_from_name(snake_cased_name, module_name)
|
28
|
+
camel_cased_name = snake_cased_name.split('_').map(&:capitalize).join
|
29
|
+
class_name = ['Html2rss', module_name, camel_cased_name].join('::')
|
30
|
+
Object.const_get(class_name)
|
31
|
+
end
|
32
|
+
|
33
|
+
def self.sanitize_url(url)
|
34
|
+
squished_url = url.to_s.split(' ').join
|
35
|
+
return if squished_url.to_s == ''
|
36
|
+
|
37
|
+
Addressable::URI.parse(squished_url).normalize.to_s
|
32
38
|
end
|
33
39
|
end
|
34
40
|
end
|
data/lib/html2rss/version.rb
CHANGED
metadata
CHANGED
@@ -1,29 +1,49 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-06-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '5'
|
20
|
+
- - "<"
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '7'
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '5'
|
30
|
+
- - "<"
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '7'
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: addressable
|
15
35
|
requirement: !ruby/object:Gem::Requirement
|
16
36
|
requirements:
|
17
37
|
- - "~>"
|
18
38
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
39
|
+
version: '2.7'
|
20
40
|
type: :runtime
|
21
41
|
prerelease: false
|
22
42
|
version_requirements: !ruby/object:Gem::Requirement
|
23
43
|
requirements:
|
24
44
|
- - "~>"
|
25
45
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
46
|
+
version: '2.7'
|
27
47
|
- !ruby/object:Gem::Dependency
|
28
48
|
name: builder
|
29
49
|
requirement: !ruby/object:Gem::Requirement
|
@@ -44,42 +64,56 @@ dependencies:
|
|
44
64
|
requirements:
|
45
65
|
- - "~>"
|
46
66
|
- !ruby/object:Gem::Version
|
47
|
-
version: '0
|
67
|
+
version: '1.0'
|
48
68
|
type: :runtime
|
49
69
|
prerelease: false
|
50
70
|
version_requirements: !ruby/object:Gem::Requirement
|
51
71
|
requirements:
|
52
72
|
- - "~>"
|
53
73
|
- !ruby/object:Gem::Version
|
54
|
-
version: '0
|
74
|
+
version: '1.0'
|
55
75
|
- !ruby/object:Gem::Dependency
|
56
76
|
name: faraday_middleware
|
57
77
|
requirement: !ruby/object:Gem::Requirement
|
58
78
|
requirements:
|
59
|
-
- - "
|
79
|
+
- - ">="
|
60
80
|
- !ruby/object:Gem::Version
|
61
|
-
version: '0
|
81
|
+
version: '0'
|
62
82
|
type: :runtime
|
63
83
|
prerelease: false
|
64
84
|
version_requirements: !ruby/object:Gem::Requirement
|
65
85
|
requirements:
|
66
|
-
- - "
|
86
|
+
- - ">="
|
67
87
|
- !ruby/object:Gem::Version
|
68
|
-
version: '0
|
88
|
+
version: '0'
|
69
89
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
90
|
+
name: kramdown
|
71
91
|
requirement: !ruby/object:Gem::Requirement
|
72
92
|
requirements:
|
73
|
-
- - "
|
93
|
+
- - ">="
|
74
94
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
95
|
+
version: '0'
|
76
96
|
type: :runtime
|
77
97
|
prerelease: false
|
78
98
|
version_requirements: !ruby/object:Gem::Requirement
|
79
99
|
requirements:
|
80
|
-
- - "
|
100
|
+
- - ">="
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '0'
|
103
|
+
- !ruby/object:Gem::Dependency
|
104
|
+
name: mime-types
|
105
|
+
requirement: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - ">"
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '3.0'
|
110
|
+
type: :runtime
|
111
|
+
prerelease: false
|
112
|
+
version_requirements: !ruby/object:Gem::Requirement
|
113
|
+
requirements:
|
114
|
+
- - ">"
|
81
115
|
- !ruby/object:Gem::Version
|
82
|
-
version: '3.
|
116
|
+
version: '3.0'
|
83
117
|
- !ruby/object:Gem::Dependency
|
84
118
|
name: nokogiri
|
85
119
|
requirement: !ruby/object:Gem::Requirement
|
@@ -106,14 +140,14 @@ dependencies:
|
|
106
140
|
requirements:
|
107
141
|
- - "~>"
|
108
142
|
- !ruby/object:Gem::Version
|
109
|
-
version: '
|
143
|
+
version: '2.0'
|
110
144
|
type: :runtime
|
111
145
|
prerelease: false
|
112
146
|
version_requirements: !ruby/object:Gem::Requirement
|
113
147
|
requirements:
|
114
148
|
- - "~>"
|
115
149
|
- !ruby/object:Gem::Version
|
116
|
-
version: '
|
150
|
+
version: '2.0'
|
117
151
|
- !ruby/object:Gem::Dependency
|
118
152
|
name: sanitize
|
119
153
|
requirement: !ruby/object:Gem::Requirement
|
@@ -128,20 +162,48 @@ dependencies:
|
|
128
162
|
- - "~>"
|
129
163
|
- !ruby/object:Gem::Version
|
130
164
|
version: '5.0'
|
165
|
+
- !ruby/object:Gem::Dependency
|
166
|
+
name: to_regexp
|
167
|
+
requirement: !ruby/object:Gem::Requirement
|
168
|
+
requirements:
|
169
|
+
- - ">="
|
170
|
+
- !ruby/object:Gem::Version
|
171
|
+
version: '0'
|
172
|
+
type: :runtime
|
173
|
+
prerelease: false
|
174
|
+
version_requirements: !ruby/object:Gem::Requirement
|
175
|
+
requirements:
|
176
|
+
- - ">="
|
177
|
+
- !ruby/object:Gem::Version
|
178
|
+
version: '0'
|
179
|
+
- !ruby/object:Gem::Dependency
|
180
|
+
name: zeitwerk
|
181
|
+
requirement: !ruby/object:Gem::Requirement
|
182
|
+
requirements:
|
183
|
+
- - ">="
|
184
|
+
- !ruby/object:Gem::Version
|
185
|
+
version: '0'
|
186
|
+
type: :runtime
|
187
|
+
prerelease: false
|
188
|
+
version_requirements: !ruby/object:Gem::Requirement
|
189
|
+
requirements:
|
190
|
+
- - ">="
|
191
|
+
- !ruby/object:Gem::Version
|
192
|
+
version: '0'
|
131
193
|
- !ruby/object:Gem::Dependency
|
132
194
|
name: bundler
|
133
195
|
requirement: !ruby/object:Gem::Requirement
|
134
196
|
requirements:
|
135
|
-
- - "
|
197
|
+
- - ">="
|
136
198
|
- !ruby/object:Gem::Version
|
137
|
-
version: '
|
199
|
+
version: '0'
|
138
200
|
type: :development
|
139
201
|
prerelease: false
|
140
202
|
version_requirements: !ruby/object:Gem::Requirement
|
141
203
|
requirements:
|
142
|
-
- - "
|
204
|
+
- - ">="
|
143
205
|
- !ruby/object:Gem::Version
|
144
|
-
version: '
|
206
|
+
version: '0'
|
145
207
|
- !ruby/object:Gem::Dependency
|
146
208
|
name: byebug
|
147
209
|
requirement: !ruby/object:Gem::Requirement
|
@@ -277,7 +339,9 @@ files:
|
|
277
339
|
- html2rss.gemspec
|
278
340
|
- lib/html2rss.rb
|
279
341
|
- lib/html2rss/attribute_post_processors.rb
|
342
|
+
- lib/html2rss/attribute_post_processors/gsub.rb
|
280
343
|
- lib/html2rss/attribute_post_processors/html_to_markdown.rb
|
344
|
+
- lib/html2rss/attribute_post_processors/markdown_to_html.rb
|
281
345
|
- lib/html2rss/attribute_post_processors/parse_time.rb
|
282
346
|
- lib/html2rss/attribute_post_processors/parse_uri.rb
|
283
347
|
- lib/html2rss/attribute_post_processors/sanitize_html.rb
|
@@ -310,14 +374,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
310
374
|
requirements:
|
311
375
|
- - ">="
|
312
376
|
- !ruby/object:Gem::Version
|
313
|
-
version: 2.
|
377
|
+
version: 2.5.0
|
314
378
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
315
379
|
requirements:
|
316
380
|
- - ">="
|
317
381
|
- !ruby/object:Gem::Version
|
318
382
|
version: '0'
|
319
383
|
requirements: []
|
320
|
-
|
384
|
+
rubyforge_project:
|
385
|
+
rubygems_version: 2.7.7
|
321
386
|
signing_key:
|
322
387
|
specification_version: 4
|
323
388
|
summary: Returns an RSS::Rss object by scraping a URL.
|