html2rss 0.6.0 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +122 -17
- data/.travis.yml +3 -3
- data/CHANGELOG.md +97 -42
- data/Gemfile +2 -0
- data/Gemfile.lock +84 -53
- data/README.md +461 -47
- data/html2rss.gemspec +11 -7
- data/lib/html2rss.rb +5 -4
- data/lib/html2rss/attribute_post_processors.rb +4 -10
- data/lib/html2rss/attribute_post_processors/gsub.rb +42 -0
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +45 -0
- data/lib/html2rss/attribute_post_processors/parse_time.rb +1 -1
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +3 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +42 -22
- data/lib/html2rss/attribute_post_processors/substring.rb +11 -5
- data/lib/html2rss/attribute_post_processors/template.rb +23 -14
- data/lib/html2rss/config.rb +40 -20
- data/lib/html2rss/feed_builder.rb +42 -20
- data/lib/html2rss/item.rb +24 -18
- data/lib/html2rss/item_extractors.rb +6 -13
- data/lib/html2rss/item_extractors/attribute.rb +1 -1
- data/lib/html2rss/item_extractors/href.rb +2 -2
- data/lib/html2rss/item_extractors/static.rb +2 -2
- data/lib/html2rss/utils.rb +18 -12
- data/lib/html2rss/version.rb +2 -1
- metadata +88 -23
@@ -1,5 +1,5 @@
|
|
1
1
|
require 'rss'
|
2
|
-
|
2
|
+
require 'mime/types'
|
3
3
|
|
4
4
|
module Html2rss
|
5
5
|
##
|
@@ -18,42 +18,64 @@ module Html2rss
|
|
18
18
|
# @return [RSS:Rss]
|
19
19
|
def rss
|
20
20
|
RSS::Maker.make('2.0') do |maker|
|
21
|
-
add_channel(maker)
|
21
|
+
add_channel(maker.channel)
|
22
22
|
|
23
|
-
|
24
|
-
add_item(feed_item, maker.items.new_item)
|
25
|
-
end
|
23
|
+
items.each { |item| add_item(item, maker.items.new_item) }
|
26
24
|
end
|
27
25
|
end
|
28
26
|
|
27
|
+
def self.add_categories(categories, item_maker)
|
28
|
+
categories.each { |category| item_maker.categories.new_category.content = category }
|
29
|
+
end
|
30
|
+
|
31
|
+
def self.add_enclosure_from_url(url, item_maker)
|
32
|
+
return unless url
|
33
|
+
|
34
|
+
enclosure = item_maker.enclosure
|
35
|
+
content_type = MIME::Types.type_for(File.extname(url).delete('.'))
|
36
|
+
|
37
|
+
enclosure.type = content_type.any? ? content_type.first.to_s : 'application/octet-stream'
|
38
|
+
enclosure.length = 0
|
39
|
+
enclosure.url = url
|
40
|
+
end
|
41
|
+
|
42
|
+
def self.add_guid(item, item_maker)
|
43
|
+
guid = item_maker.guid
|
44
|
+
guid.content = Digest::SHA1.hexdigest(item.title)
|
45
|
+
guid.isPermaLink = false
|
46
|
+
end
|
47
|
+
|
29
48
|
private
|
30
49
|
|
31
50
|
attr_reader :config
|
32
51
|
|
33
|
-
def add_channel(
|
52
|
+
def add_channel(channel_maker)
|
34
53
|
%i[language author title description link ttl].each do |attribute_name|
|
35
|
-
|
54
|
+
channel_maker.public_send("#{attribute_name}=", config.public_send(attribute_name))
|
36
55
|
end
|
37
56
|
|
38
|
-
|
39
|
-
|
57
|
+
channel_maker.generator = "html2rss V. #{::Html2rss::VERSION}"
|
58
|
+
channel_maker.lastBuildDate = Time.now
|
40
59
|
end
|
41
60
|
|
42
|
-
def
|
43
|
-
@
|
44
|
-
end
|
61
|
+
def items
|
62
|
+
return @items if defined?(@items)
|
45
63
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
64
|
+
items = Item.from_url(config.url, config)
|
65
|
+
|
66
|
+
items.reverse! if config.items_order == :reverse
|
67
|
+
|
68
|
+
@items = items
|
69
|
+
end
|
50
70
|
|
51
|
-
|
52
|
-
|
71
|
+
def add_item(item, item_maker)
|
72
|
+
item.available_attributes.each do |attribute_name|
|
73
|
+
item_maker.public_send("#{attribute_name}=", item.public_send(attribute_name))
|
53
74
|
end
|
54
75
|
|
55
|
-
|
56
|
-
|
76
|
+
self.class.add_categories(item.categories, item_maker)
|
77
|
+
self.class.add_enclosure_from_url(item.enclosure_url, item_maker) if item.enclosure?
|
78
|
+
self.class.add_guid(item, item_maker)
|
57
79
|
end
|
58
80
|
end
|
59
81
|
end
|
data/lib/html2rss/item.rb
CHANGED
@@ -1,9 +1,6 @@
|
|
1
1
|
require 'faraday'
|
2
2
|
require 'faraday_middleware'
|
3
|
-
require 'open-uri'
|
4
3
|
require 'nokogiri'
|
5
|
-
require_relative 'item_extractors'
|
6
|
-
require_relative 'attribute_post_processors'
|
7
4
|
|
8
5
|
module Html2rss
|
9
6
|
##
|
@@ -26,15 +23,15 @@ module Html2rss
|
|
26
23
|
|
27
24
|
attribute_options = config.attribute_options(method_name)
|
28
25
|
|
29
|
-
extractor = ItemExtractors.get_extractor(attribute_options[
|
26
|
+
extractor = ItemExtractors.get_extractor(attribute_options[:extractor])
|
30
27
|
value = extractor.new(xml, attribute_options).get
|
31
28
|
|
32
|
-
post_process(value, attribute_options.fetch(
|
29
|
+
post_process(value, attribute_options.fetch(:post_process, false))
|
33
30
|
end
|
34
31
|
|
35
32
|
def available_attributes
|
36
|
-
@available_attributes ||= (%
|
37
|
-
@config.attribute_names) - [
|
33
|
+
@available_attributes ||= (%i[title link description author comments updated] &
|
34
|
+
@config.attribute_names) - %i[categories enclosure]
|
38
35
|
end
|
39
36
|
|
40
37
|
##
|
@@ -48,10 +45,17 @@ module Html2rss
|
|
48
45
|
##
|
49
46
|
# @return [Array]
|
50
47
|
def categories
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
48
|
+
config.category_selectors.map(&method(:method_missing))
|
49
|
+
end
|
50
|
+
|
51
|
+
def enclosure?
|
52
|
+
config.attribute?(:enclosure)
|
53
|
+
end
|
54
|
+
|
55
|
+
def enclosure_url
|
56
|
+
enclosure = Html2rss::Utils.sanitize_url(method_missing(:enclosure))
|
57
|
+
|
58
|
+
Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url).to_s if enclosure
|
55
59
|
end
|
56
60
|
|
57
61
|
##
|
@@ -59,20 +63,22 @@ module Html2rss
|
|
59
63
|
def self.from_url(url, config)
|
60
64
|
body = get_body_from_url(url, config)
|
61
65
|
|
62
|
-
Nokogiri
|
63
|
-
|
64
|
-
|
66
|
+
Nokogiri.HTML(body).css(config.selector(:items))
|
67
|
+
.map { |xml_item| new xml_item, config }
|
68
|
+
.keep_if(&:valid?)
|
65
69
|
end
|
66
70
|
|
67
71
|
private
|
68
72
|
|
69
73
|
def self.get_body_from_url(url, config)
|
70
|
-
|
74
|
+
request = Faraday.new(url: url, headers: config.headers) do |faraday|
|
71
75
|
faraday.use FaradayMiddleware::FollowRedirects
|
72
76
|
faraday.adapter Faraday.default_adapter
|
73
|
-
end
|
77
|
+
end
|
78
|
+
|
79
|
+
body = request.get.body
|
74
80
|
|
75
|
-
config.json? ? Html2rss::Utils.
|
81
|
+
config.json? ? Html2rss::Utils.object_to_xml(JSON.parse(body)) : body
|
76
82
|
end
|
77
83
|
private_class_method :get_body_from_url
|
78
84
|
|
@@ -82,7 +88,7 @@ module Html2rss
|
|
82
88
|
return value unless post_process_options
|
83
89
|
|
84
90
|
[post_process_options].flatten.each do |options|
|
85
|
-
value = AttributePostProcessors.get_processor(options[
|
91
|
+
value = AttributePostProcessors.get_processor(options[:name])
|
86
92
|
.new(value, options: options, item: self, config: @config)
|
87
93
|
.get
|
88
94
|
end
|
@@ -1,29 +1,22 @@
|
|
1
|
-
require_relative 'item_extractors/attribute'
|
2
|
-
require_relative 'item_extractors/current_time'
|
3
|
-
require_relative 'item_extractors/href'
|
4
|
-
require_relative 'item_extractors/html'
|
5
|
-
require_relative 'item_extractors/static'
|
6
|
-
require_relative 'item_extractors/text'
|
7
|
-
|
8
1
|
module Html2rss
|
9
2
|
##
|
10
3
|
# Provides a namespace for item extractors.
|
11
4
|
module ItemExtractors
|
12
5
|
DEFAULT = 'text'.freeze
|
6
|
+
private_constant :DEFAULT
|
13
7
|
|
14
8
|
def self.get_extractor(name)
|
15
|
-
@
|
16
|
-
|
17
|
-
|
9
|
+
@get_extractor ||= Hash.new do |extractors, key|
|
10
|
+
extractors[key] = Utils.get_class_from_name(key || DEFAULT, 'ItemExtractors')
|
11
|
+
end
|
18
12
|
|
19
|
-
|
20
|
-
end[name || DEFAULT]
|
13
|
+
@get_extractor[name]
|
21
14
|
end
|
22
15
|
|
23
16
|
##
|
24
17
|
# @return [Nokogiri::XML::Element]
|
25
18
|
def self.element(xml, options)
|
26
|
-
selector = options[
|
19
|
+
selector = options[:selector]
|
27
20
|
selector ? xml.css(selector) : xml
|
28
21
|
end
|
29
22
|
end
|
@@ -24,12 +24,12 @@ module Html2rss
|
|
24
24
|
def initialize(xml, options)
|
25
25
|
@options = options
|
26
26
|
element = ItemExtractors.element(xml, options)
|
27
|
-
@href = element.attr('href')
|
27
|
+
@href = Html2rss::Utils.sanitize_url(element.attr('href'))
|
28
28
|
end
|
29
29
|
|
30
30
|
# @return [URI::HTTPS, URI::HTTP]
|
31
31
|
def get
|
32
|
-
Html2rss::Utils.build_absolute_url_from_relative(@href, @options[
|
32
|
+
Html2rss::Utils.build_absolute_url_from_relative(@href, @options[:channel][:url])
|
33
33
|
end
|
34
34
|
end
|
35
35
|
end
|
@@ -15,13 +15,13 @@ module Html2rss
|
|
15
15
|
@options = options
|
16
16
|
end
|
17
17
|
|
18
|
-
# Returns what options[
|
18
|
+
# Returns what options[:static] holds.
|
19
19
|
#
|
20
20
|
# options = { static: 'Foobar' }
|
21
21
|
# Static.new(xml, options).get
|
22
22
|
# # => 'Foobar'
|
23
23
|
def get
|
24
|
-
@options[
|
24
|
+
@options[:static]
|
25
25
|
end
|
26
26
|
end
|
27
27
|
end
|
data/lib/html2rss/utils.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'active_support/core_ext/hash'
|
2
|
+
require 'addressable/uri'
|
2
3
|
require 'builder'
|
3
|
-
require 'hashie'
|
4
4
|
require 'json'
|
5
5
|
require 'nokogiri'
|
6
6
|
|
@@ -8,27 +8,33 @@ module Html2rss
|
|
8
8
|
##
|
9
9
|
# The collecting tank for utility methods.
|
10
10
|
module Utils
|
11
|
-
|
12
|
-
# A Hash with indifferent access, build with {https://github.com/intridea/hashie Hashie}.
|
13
|
-
class IndifferentAccessHash < Hash
|
14
|
-
include Hashie::Extensions::MergeInitializer
|
15
|
-
include Hashie::Extensions::IndifferentAccess
|
16
|
-
end
|
17
|
-
|
18
|
-
def self.build_absolute_url_from_relative(url, channel_url)
|
11
|
+
def self.build_absolute_url_from_relative(url, base_url)
|
19
12
|
url = URI(url) if url.is_a?(String)
|
20
13
|
|
21
14
|
return url if url.absolute?
|
22
15
|
|
23
|
-
URI(
|
16
|
+
URI(base_url).tap do |uri|
|
24
17
|
uri.path = url.path.to_s.start_with?('/') ? url.path : "/#{url.path}"
|
25
18
|
uri.query = url.query
|
26
19
|
uri.fragment = url.fragment if url.fragment
|
27
20
|
end
|
28
21
|
end
|
29
22
|
|
30
|
-
def self.
|
31
|
-
|
23
|
+
def self.object_to_xml(object)
|
24
|
+
object.to_xml(skip_instruct: true, skip_types: true)
|
25
|
+
end
|
26
|
+
|
27
|
+
def self.get_class_from_name(snake_cased_name, module_name)
|
28
|
+
camel_cased_name = snake_cased_name.split('_').map(&:capitalize).join
|
29
|
+
class_name = ['Html2rss', module_name, camel_cased_name].join('::')
|
30
|
+
Object.const_get(class_name)
|
31
|
+
end
|
32
|
+
|
33
|
+
def self.sanitize_url(url)
|
34
|
+
squished_url = url.to_s.split(' ').join
|
35
|
+
return if squished_url.to_s == ''
|
36
|
+
|
37
|
+
Addressable::URI.parse(squished_url).normalize.to_s
|
32
38
|
end
|
33
39
|
end
|
34
40
|
end
|
data/lib/html2rss/version.rb
CHANGED
metadata
CHANGED
@@ -1,29 +1,49 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-06-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '5'
|
20
|
+
- - "<"
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '7'
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '5'
|
30
|
+
- - "<"
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '7'
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: addressable
|
15
35
|
requirement: !ruby/object:Gem::Requirement
|
16
36
|
requirements:
|
17
37
|
- - "~>"
|
18
38
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
39
|
+
version: '2.7'
|
20
40
|
type: :runtime
|
21
41
|
prerelease: false
|
22
42
|
version_requirements: !ruby/object:Gem::Requirement
|
23
43
|
requirements:
|
24
44
|
- - "~>"
|
25
45
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
46
|
+
version: '2.7'
|
27
47
|
- !ruby/object:Gem::Dependency
|
28
48
|
name: builder
|
29
49
|
requirement: !ruby/object:Gem::Requirement
|
@@ -44,42 +64,56 @@ dependencies:
|
|
44
64
|
requirements:
|
45
65
|
- - "~>"
|
46
66
|
- !ruby/object:Gem::Version
|
47
|
-
version: '0
|
67
|
+
version: '1.0'
|
48
68
|
type: :runtime
|
49
69
|
prerelease: false
|
50
70
|
version_requirements: !ruby/object:Gem::Requirement
|
51
71
|
requirements:
|
52
72
|
- - "~>"
|
53
73
|
- !ruby/object:Gem::Version
|
54
|
-
version: '0
|
74
|
+
version: '1.0'
|
55
75
|
- !ruby/object:Gem::Dependency
|
56
76
|
name: faraday_middleware
|
57
77
|
requirement: !ruby/object:Gem::Requirement
|
58
78
|
requirements:
|
59
|
-
- - "
|
79
|
+
- - ">="
|
60
80
|
- !ruby/object:Gem::Version
|
61
|
-
version: '0
|
81
|
+
version: '0'
|
62
82
|
type: :runtime
|
63
83
|
prerelease: false
|
64
84
|
version_requirements: !ruby/object:Gem::Requirement
|
65
85
|
requirements:
|
66
|
-
- - "
|
86
|
+
- - ">="
|
67
87
|
- !ruby/object:Gem::Version
|
68
|
-
version: '0
|
88
|
+
version: '0'
|
69
89
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
90
|
+
name: kramdown
|
71
91
|
requirement: !ruby/object:Gem::Requirement
|
72
92
|
requirements:
|
73
|
-
- - "
|
93
|
+
- - ">="
|
74
94
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
95
|
+
version: '0'
|
76
96
|
type: :runtime
|
77
97
|
prerelease: false
|
78
98
|
version_requirements: !ruby/object:Gem::Requirement
|
79
99
|
requirements:
|
80
|
-
- - "
|
100
|
+
- - ">="
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '0'
|
103
|
+
- !ruby/object:Gem::Dependency
|
104
|
+
name: mime-types
|
105
|
+
requirement: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - ">"
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '3.0'
|
110
|
+
type: :runtime
|
111
|
+
prerelease: false
|
112
|
+
version_requirements: !ruby/object:Gem::Requirement
|
113
|
+
requirements:
|
114
|
+
- - ">"
|
81
115
|
- !ruby/object:Gem::Version
|
82
|
-
version: '3.
|
116
|
+
version: '3.0'
|
83
117
|
- !ruby/object:Gem::Dependency
|
84
118
|
name: nokogiri
|
85
119
|
requirement: !ruby/object:Gem::Requirement
|
@@ -106,14 +140,14 @@ dependencies:
|
|
106
140
|
requirements:
|
107
141
|
- - "~>"
|
108
142
|
- !ruby/object:Gem::Version
|
109
|
-
version: '
|
143
|
+
version: '2.0'
|
110
144
|
type: :runtime
|
111
145
|
prerelease: false
|
112
146
|
version_requirements: !ruby/object:Gem::Requirement
|
113
147
|
requirements:
|
114
148
|
- - "~>"
|
115
149
|
- !ruby/object:Gem::Version
|
116
|
-
version: '
|
150
|
+
version: '2.0'
|
117
151
|
- !ruby/object:Gem::Dependency
|
118
152
|
name: sanitize
|
119
153
|
requirement: !ruby/object:Gem::Requirement
|
@@ -128,20 +162,48 @@ dependencies:
|
|
128
162
|
- - "~>"
|
129
163
|
- !ruby/object:Gem::Version
|
130
164
|
version: '5.0'
|
165
|
+
- !ruby/object:Gem::Dependency
|
166
|
+
name: to_regexp
|
167
|
+
requirement: !ruby/object:Gem::Requirement
|
168
|
+
requirements:
|
169
|
+
- - ">="
|
170
|
+
- !ruby/object:Gem::Version
|
171
|
+
version: '0'
|
172
|
+
type: :runtime
|
173
|
+
prerelease: false
|
174
|
+
version_requirements: !ruby/object:Gem::Requirement
|
175
|
+
requirements:
|
176
|
+
- - ">="
|
177
|
+
- !ruby/object:Gem::Version
|
178
|
+
version: '0'
|
179
|
+
- !ruby/object:Gem::Dependency
|
180
|
+
name: zeitwerk
|
181
|
+
requirement: !ruby/object:Gem::Requirement
|
182
|
+
requirements:
|
183
|
+
- - ">="
|
184
|
+
- !ruby/object:Gem::Version
|
185
|
+
version: '0'
|
186
|
+
type: :runtime
|
187
|
+
prerelease: false
|
188
|
+
version_requirements: !ruby/object:Gem::Requirement
|
189
|
+
requirements:
|
190
|
+
- - ">="
|
191
|
+
- !ruby/object:Gem::Version
|
192
|
+
version: '0'
|
131
193
|
- !ruby/object:Gem::Dependency
|
132
194
|
name: bundler
|
133
195
|
requirement: !ruby/object:Gem::Requirement
|
134
196
|
requirements:
|
135
|
-
- - "
|
197
|
+
- - ">="
|
136
198
|
- !ruby/object:Gem::Version
|
137
|
-
version: '
|
199
|
+
version: '0'
|
138
200
|
type: :development
|
139
201
|
prerelease: false
|
140
202
|
version_requirements: !ruby/object:Gem::Requirement
|
141
203
|
requirements:
|
142
|
-
- - "
|
204
|
+
- - ">="
|
143
205
|
- !ruby/object:Gem::Version
|
144
|
-
version: '
|
206
|
+
version: '0'
|
145
207
|
- !ruby/object:Gem::Dependency
|
146
208
|
name: byebug
|
147
209
|
requirement: !ruby/object:Gem::Requirement
|
@@ -277,7 +339,9 @@ files:
|
|
277
339
|
- html2rss.gemspec
|
278
340
|
- lib/html2rss.rb
|
279
341
|
- lib/html2rss/attribute_post_processors.rb
|
342
|
+
- lib/html2rss/attribute_post_processors/gsub.rb
|
280
343
|
- lib/html2rss/attribute_post_processors/html_to_markdown.rb
|
344
|
+
- lib/html2rss/attribute_post_processors/markdown_to_html.rb
|
281
345
|
- lib/html2rss/attribute_post_processors/parse_time.rb
|
282
346
|
- lib/html2rss/attribute_post_processors/parse_uri.rb
|
283
347
|
- lib/html2rss/attribute_post_processors/sanitize_html.rb
|
@@ -310,14 +374,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
310
374
|
requirements:
|
311
375
|
- - ">="
|
312
376
|
- !ruby/object:Gem::Version
|
313
|
-
version: 2.
|
377
|
+
version: 2.5.0
|
314
378
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
315
379
|
requirements:
|
316
380
|
- - ">="
|
317
381
|
- !ruby/object:Gem::Version
|
318
382
|
version: '0'
|
319
383
|
requirements: []
|
320
|
-
|
384
|
+
rubyforge_project:
|
385
|
+
rubygems_version: 2.7.7
|
321
386
|
signing_key:
|
322
387
|
specification_version: 4
|
323
388
|
summary: Returns an RSS::Rss object by scraping a URL.
|