html2rss 0.6.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +0 -3
- data/CHANGELOG.md +14 -1
- data/Gemfile.lock +23 -15
- data/README.md +100 -20
- data/html2rss.gemspec +4 -1
- data/lib/html2rss.rb +5 -4
- data/lib/html2rss/attribute_post_processors.rb +6 -10
- data/lib/html2rss/attribute_post_processors/gsub.rb +39 -0
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +37 -18
- data/lib/html2rss/feed_builder.rb +20 -7
- data/lib/html2rss/item.rb +9 -7
- data/lib/html2rss/item_extractors.rb +6 -12
- data/lib/html2rss/utils.rb +1 -1
- data/lib/html2rss/version.rb +1 -1
- metadata +46 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 645fe7ea0ebe4a733c9833e8c22a93aa3f2d2b3b8589160fb001ad6ffd4659d3
|
4
|
+
data.tar.gz: 0fb454258a1f334243984bdb73a699ffcfc676c529d87bb0886fbba75127b7cf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 350f0fd0e11bd35963c89b56aea9eb1dc90ae72956d849fd85b0b67d39e0bb7625176558becc0712f5e573a7605b541dea6497557699f727e4c27ef510d6d58f
|
7
|
+
data.tar.gz: e83905ddf24fc4c63793392cd47aa2d02e984c44ba489610119cff6a25bb70be57f4a95f5b9fb78229d9dc6366e0181d2322743eeee8ba59ee840831f8790bb1
|
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,4 +1,17 @@
|
|
1
|
-
# [](https://github.com/gildesmarais/html2rss/compare/v0.
|
1
|
+
# [](https://github.com/gildesmarais/html2rss/compare/v0.7.0...v) (2019-10-28)
|
2
|
+
|
3
|
+
|
4
|
+
|
5
|
+
# [0.7.0](https://github.com/gildesmarais/html2rss/compare/v0.6.0...v0.7.0) (2019-10-28)
|
6
|
+
|
7
|
+
|
8
|
+
### Features
|
9
|
+
|
10
|
+
* handle json array response ([#49](https://github.com/gildesmarais/html2rss/issues/49)) ([288c2af](https://github.com/gildesmarais/html2rss/commit/288c2af))
|
11
|
+
* support enclosure on items ([#52](https://github.com/gildesmarais/html2rss/issues/52)) ([80a30a1](https://github.com/gildesmarais/html2rss/commit/80a30a1)), closes [#50](https://github.com/gildesmarais/html2rss/issues/50)
|
12
|
+
* use zeitwerk for autoloading ([#47](https://github.com/gildesmarais/html2rss/issues/47)) ([bce523d](https://github.com/gildesmarais/html2rss/commit/bce523d))
|
13
|
+
* **post_processors:** add gsub ([#53](https://github.com/gildesmarais/html2rss/issues/53)) ([de268ae](https://github.com/gildesmarais/html2rss/commit/de268ae))
|
14
|
+
* **postprocessor:** always wrap img tag in an a tag in sanitize html ([#51](https://github.com/gildesmarais/html2rss/issues/51)) ([6c7fb88](https://github.com/gildesmarais/html2rss/commit/6c7fb88))
|
2
15
|
|
3
16
|
|
4
17
|
|
data/Gemfile.lock
CHANGED
@@ -1,15 +1,18 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
html2rss (0.
|
4
|
+
html2rss (0.7.0)
|
5
5
|
activesupport (~> 5.0)
|
6
6
|
builder
|
7
7
|
faraday (~> 0.15)
|
8
8
|
faraday_middleware (~> 0.13)
|
9
9
|
hashie (~> 3.6)
|
10
|
+
mime-types (> 3.0)
|
10
11
|
nokogiri (>= 1.10, < 2.0)
|
11
12
|
reverse_markdown (~> 1.3)
|
12
13
|
sanitize (~> 5.0)
|
14
|
+
to_regexp
|
15
|
+
zeitwerk
|
13
16
|
|
14
17
|
GEM
|
15
18
|
remote: https://rubygems.org/
|
@@ -23,10 +26,10 @@ GEM
|
|
23
26
|
builder (3.2.3)
|
24
27
|
byebug (11.0.1)
|
25
28
|
concurrent-ruby (1.1.5)
|
26
|
-
crass (1.0.
|
29
|
+
crass (1.0.5)
|
27
30
|
diff-lcs (1.3)
|
28
31
|
docile (1.3.2)
|
29
|
-
faraday (0.
|
32
|
+
faraday (0.17.0)
|
30
33
|
multipart-post (>= 1.2, < 3)
|
31
34
|
faraday_middleware (0.13.1)
|
32
35
|
faraday (>= 0.7.4, < 1.0)
|
@@ -35,6 +38,9 @@ GEM
|
|
35
38
|
concurrent-ruby (~> 1.0)
|
36
39
|
jaro_winkler (1.5.3)
|
37
40
|
json (2.2.0)
|
41
|
+
mime-types (3.3)
|
42
|
+
mime-types-data (~> 3.2015)
|
43
|
+
mime-types-data (3.2019.1009)
|
38
44
|
mini_portile2 (2.4.0)
|
39
45
|
minitest (5.12.2)
|
40
46
|
multipart-post (2.1.1)
|
@@ -42,25 +48,25 @@ GEM
|
|
42
48
|
mini_portile2 (~> 2.4.0)
|
43
49
|
nokogumbo (2.0.1)
|
44
50
|
nokogiri (~> 1.8, >= 1.8.4)
|
45
|
-
parallel (1.
|
51
|
+
parallel (1.18.0)
|
46
52
|
parser (2.6.5.0)
|
47
53
|
ast (~> 2.4.0)
|
48
54
|
rainbow (3.0.0)
|
49
55
|
reverse_markdown (1.3.0)
|
50
56
|
nokogiri
|
51
|
-
rspec (3.
|
52
|
-
rspec-core (~> 3.
|
53
|
-
rspec-expectations (~> 3.
|
54
|
-
rspec-mocks (~> 3.
|
55
|
-
rspec-core (3.
|
56
|
-
rspec-support (~> 3.
|
57
|
-
rspec-expectations (3.
|
57
|
+
rspec (3.9.0)
|
58
|
+
rspec-core (~> 3.9.0)
|
59
|
+
rspec-expectations (~> 3.9.0)
|
60
|
+
rspec-mocks (~> 3.9.0)
|
61
|
+
rspec-core (3.9.0)
|
62
|
+
rspec-support (~> 3.9.0)
|
63
|
+
rspec-expectations (3.9.0)
|
58
64
|
diff-lcs (>= 1.2.0, < 2.0)
|
59
|
-
rspec-support (~> 3.
|
60
|
-
rspec-mocks (3.
|
65
|
+
rspec-support (~> 3.9.0)
|
66
|
+
rspec-mocks (3.9.0)
|
61
67
|
diff-lcs (>= 1.2.0, < 2.0)
|
62
|
-
rspec-support (~> 3.
|
63
|
-
rspec-support (3.
|
68
|
+
rspec-support (~> 3.9.0)
|
69
|
+
rspec-support (3.9.0)
|
64
70
|
rubocop (0.75.0)
|
65
71
|
jaro_winkler (~> 1.5.1)
|
66
72
|
parallel (~> 1.10)
|
@@ -83,11 +89,13 @@ GEM
|
|
83
89
|
simplecov-html (~> 0.10.0)
|
84
90
|
simplecov-html (0.10.2)
|
85
91
|
thread_safe (0.3.6)
|
92
|
+
to_regexp (0.2.1)
|
86
93
|
tzinfo (1.2.5)
|
87
94
|
thread_safe (~> 0.1)
|
88
95
|
unicode-display_width (1.6.0)
|
89
96
|
vcr (5.0.0)
|
90
97
|
yard (0.9.20)
|
98
|
+
zeitwerk (2.2.0)
|
91
99
|
|
92
100
|
PLATFORMS
|
93
101
|
ruby
|
data/README.md
CHANGED
@@ -20,45 +20,106 @@ Add this line to your application's Gemfile: `gem 'html2rss'`
|
|
20
20
|
Then execute: `bundle`
|
21
21
|
|
22
22
|
```ruby
|
23
|
-
rss =
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
23
|
+
rss =
|
24
|
+
Html2rss.feed(
|
25
|
+
channel: { title: 'StackOverflow: Hot Network Questions', url: 'https://stackoverflow.com/questions' },
|
26
|
+
selectors: {
|
27
|
+
items: { selector: '#hot-network-questions > ul > li' },
|
28
|
+
title: { selector: 'a' },
|
29
|
+
link: { selector: 'a', extractor: 'href' }
|
30
|
+
}
|
31
|
+
)
|
31
32
|
|
32
33
|
puts rss.to_s
|
33
34
|
```
|
34
35
|
|
35
36
|
## Usage with a YAML config file
|
36
37
|
|
37
|
-
Create a YAML config file. Find an example at [`
|
38
|
+
Create a YAML config file. Find an example at [`spec/config.test.yml`](https://github.com/gildesmarais/html2rss/blob/master/spec/config.test.yml).
|
38
39
|
|
39
|
-
`Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')`
|
40
|
-
|
41
|
-
an `RSS:Rss` object.
|
40
|
+
`Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')`
|
41
|
+
returns an `RSS::Rss` object.
|
42
42
|
|
43
43
|
**Too complicated?** See [`html2rss-configs`](https://github.com/gildesmarais/html2rss-configs) for ready-made feed configs!
|
44
44
|
|
45
|
+
## Assigning categories to an item
|
46
|
+
|
47
|
+
The `categories` selector takes an array of selector names. The value of those
|
48
|
+
selectors will become a category on the item.
|
49
|
+
|
50
|
+
<details>
|
51
|
+
<summary>See a YAML config example</summary>
|
52
|
+
|
53
|
+
```yml
|
54
|
+
channel:
|
55
|
+
# ... omitted
|
56
|
+
selectors:
|
57
|
+
#... omitted
|
58
|
+
genre:
|
59
|
+
selector: '.genre'
|
60
|
+
branch:
|
61
|
+
selector: '.branch'
|
62
|
+
categories:
|
63
|
+
- genre
|
64
|
+
- branch
|
65
|
+
```
|
66
|
+
|
67
|
+
</details>
|
68
|
+
|
69
|
+
## Adding an enclosure to each item
|
70
|
+
|
71
|
+
An enclosure can be 'anything', e.g. an image, audio or video file.
|
72
|
+
|
73
|
+
The config's `enclosure` selector needs to return a URL of the content to enclose. If the extracted URL is relative, it will be converted to an absolute one using the channel's url as a base.
|
74
|
+
|
75
|
+
Since html2rss does no further inspection of the enclosure, the support of this tag comes with trade-offs:
|
76
|
+
|
77
|
+
1. The content-type is guessed from the file extension of the URL.
|
78
|
+
2. If the content-type guessing fails, it will default to `application/octet-stream`.
|
79
|
+
3. The content-length will always be undetermined and thus stated as `0` bytes.
|
80
|
+
|
81
|
+
Read the [RSS 2.0 spec](http://www.rssboard.org/rss-profile#element-channel-item-enclosure) for further information on enclosing content.
|
82
|
+
|
83
|
+
<details>
|
84
|
+
<summary>See a YAML config example</summary>
|
85
|
+
|
86
|
+
```yml
|
87
|
+
channel:
|
88
|
+
# ... omitted
|
89
|
+
selectors:
|
90
|
+
#... omitted
|
91
|
+
enclosure:
|
92
|
+
selector: 'img'
|
93
|
+
extractor: 'attribute'
|
94
|
+
attribute: 'src'
|
95
|
+
```
|
96
|
+
|
97
|
+
</details>
|
98
|
+
|
45
99
|
## Scraping JSON
|
46
100
|
|
47
|
-
Since 0.5.0 it
|
101
|
+
Since 0.5.0 it's possible to scrape and process JSON.
|
48
102
|
|
49
103
|
Adding `json: true` to the channel config will convert the JSON response to XML.
|
50
104
|
|
51
|
-
|
105
|
+
<details>
|
106
|
+
<summary>See a YAML feed config example</summary>
|
52
107
|
|
53
108
|
```yaml
|
54
109
|
channel:
|
55
110
|
url: https://example.com
|
56
|
-
title:
|
111
|
+
title: 'Example with JSON'
|
57
112
|
json: true
|
58
113
|
# ...
|
59
114
|
```
|
60
115
|
|
61
|
-
|
116
|
+
</details>
|
117
|
+
|
118
|
+
Under the hood it uses ActiveSupport's [`Hash.to_xml`](https://apidock.com/rails/Hash/to_xml) core extension for the JSON to XML conversion.
|
119
|
+
|
120
|
+
### Conversion of JSON objects
|
121
|
+
|
122
|
+
This JSON object:
|
62
123
|
|
63
124
|
```json
|
64
125
|
{
|
@@ -69,19 +130,38 @@ Imagine this HTTP response:
|
|
69
130
|
will be converted to:
|
70
131
|
|
71
132
|
```xml
|
72
|
-
<
|
133
|
+
<hash>
|
73
134
|
<data>
|
74
135
|
<datum>
|
75
136
|
<title>Headline</title>
|
76
137
|
<url>https://example.com</url>
|
77
138
|
</datum>
|
78
139
|
</data>
|
79
|
-
</
|
140
|
+
</hash>
|
80
141
|
```
|
81
142
|
|
82
|
-
Your items selector would be `data > datum`, the item's link selector would be `url`.
|
143
|
+
Your items selector would be `data > datum`, the item's `link` selector would be `url`.
|
83
144
|
|
84
|
-
|
145
|
+
### Conversion of JSON arrays
|
146
|
+
|
147
|
+
This JSON array:
|
148
|
+
|
149
|
+
```json
|
150
|
+
[{ "title": "Headline", "url": "https://example.com" }]
|
151
|
+
```
|
152
|
+
|
153
|
+
will be converted to:
|
154
|
+
|
155
|
+
```xml
|
156
|
+
<objects>
|
157
|
+
<object>
|
158
|
+
<title>Headline</title>
|
159
|
+
<url>https://example.com</url>
|
160
|
+
</object>
|
161
|
+
</objects>
|
162
|
+
```
|
163
|
+
|
164
|
+
Your items selector would be `objects > object`, the item's `link` selector would be `url`.
|
85
165
|
|
86
166
|
## Set any HTTP header in the request
|
87
167
|
|
data/html2rss.gemspec
CHANGED
@@ -12,7 +12,7 @@ Gem::Specification.new do |spec|
|
|
12
12
|
spec.description = 'Give the URL to scrape and some CSS selectors. Get a RSS::Rss instance in return.'
|
13
13
|
spec.homepage = 'https://github.com/gildesmarais/html2rss'
|
14
14
|
spec.license = 'MIT'
|
15
|
-
spec.required_ruby_version = '>= 2.4.
|
15
|
+
spec.required_ruby_version = '>= 2.4.4'
|
16
16
|
|
17
17
|
if spec.respond_to?(:metadata)
|
18
18
|
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
@@ -34,9 +34,12 @@ Gem::Specification.new do |spec|
|
|
34
34
|
spec.add_dependency 'faraday', '~> 0.15'
|
35
35
|
spec.add_dependency 'faraday_middleware', '~> 0.13'
|
36
36
|
spec.add_dependency 'hashie', '~> 3.6'
|
37
|
+
spec.add_dependency 'mime-types', '> 3.0'
|
37
38
|
spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
|
38
39
|
spec.add_dependency 'reverse_markdown', '~> 1.3'
|
39
40
|
spec.add_dependency 'sanitize', '~> 5.0'
|
41
|
+
spec.add_dependency 'to_regexp'
|
42
|
+
spec.add_dependency 'zeitwerk'
|
40
43
|
spec.add_development_dependency 'bundler', '~> 1.16'
|
41
44
|
spec.add_development_dependency 'byebug'
|
42
45
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
data/lib/html2rss.rb
CHANGED
@@ -1,19 +1,15 @@
|
|
1
|
-
require_relative 'attribute_post_processors/html_to_markdown'
|
2
|
-
require_relative 'attribute_post_processors/parse_time'
|
3
|
-
require_relative 'attribute_post_processors/parse_uri'
|
4
|
-
require_relative 'attribute_post_processors/sanitize_html'
|
5
|
-
require_relative 'attribute_post_processors/substring'
|
6
|
-
require_relative 'attribute_post_processors/template'
|
7
|
-
|
8
1
|
module Html2rss
|
9
2
|
##
|
10
3
|
# Provides a namespace for attribute post processors.
|
11
4
|
module AttributePostProcessors
|
12
5
|
def self.get_processor(name)
|
13
|
-
|
14
|
-
|
6
|
+
@get_processor ||= Hash.new do |processors, key|
|
7
|
+
camel_cased_name = key.split('_').map(&:capitalize).join
|
8
|
+
class_name = ['Html2rss', 'AttributePostProcessors', camel_cased_name].join('::')
|
9
|
+
processors[key] = Object.const_get(class_name)
|
10
|
+
end
|
15
11
|
|
16
|
-
|
12
|
+
@get_processor[name]
|
17
13
|
end
|
18
14
|
end
|
19
15
|
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'to_regexp'
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
module AttributePostProcessors
|
5
|
+
##
|
6
|
+
#
|
7
|
+
# Imagine this HTML:
|
8
|
+
# <h1>Foo bar and boo</h1>
|
9
|
+
#
|
10
|
+
# YAML usage example:
|
11
|
+
# selectors:
|
12
|
+
# title:
|
13
|
+
# selector: h1
|
14
|
+
# post_process:
|
15
|
+
# name: gsub
|
16
|
+
# pattern: boo
|
17
|
+
# replacement: baz
|
18
|
+
#
|
19
|
+
# Would return:
|
20
|
+
# 'Foo bar and baz'
|
21
|
+
#
|
22
|
+
# `pattern` can be a Regexp or a String.
|
23
|
+
# `replacement` can be a String or a Hash.
|
24
|
+
# See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
|
25
|
+
class Gsub
|
26
|
+
def initialize(value, env)
|
27
|
+
@value = value
|
28
|
+
@pattern = env[:options]['pattern'].to_regexp || env[:options]['pattern']
|
29
|
+
@replacement = env[:options]['replacement']
|
30
|
+
end
|
31
|
+
|
32
|
+
##
|
33
|
+
# @return [String]
|
34
|
+
def get
|
35
|
+
@value.to_s.gsub(@pattern, @replacement)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -17,6 +17,10 @@ module Html2rss
|
|
17
17
|
# <script>alert();</script>
|
18
18
|
# </section>
|
19
19
|
#
|
20
|
+
# It also:
|
21
|
+
#
|
22
|
+
# - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
|
23
|
+
#
|
20
24
|
# YAML usage example:
|
21
25
|
#
|
22
26
|
# selectors:
|
@@ -41,28 +45,26 @@ module Html2rss
|
|
41
45
|
# - adds target="_blank" to a elements
|
42
46
|
# @return [String]
|
43
47
|
def get
|
44
|
-
Sanitize.fragment(
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
48
|
+
Sanitize.fragment(
|
49
|
+
@value,
|
50
|
+
Sanitize::Config.merge(
|
51
|
+
Sanitize::Config::RELAXED,
|
52
|
+
attributes: { all: %w[dir lang alt title translate] },
|
53
|
+
add_attributes: {
|
54
|
+
'a' => { 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank' },
|
55
|
+
'img' => { 'referrer-policy' => 'no-referrer' }
|
56
|
+
},
|
57
|
+
transformers: [transform_urls_to_absolute_ones, wrap_img_in_a]
|
58
|
+
)
|
59
|
+
)
|
60
|
+
.to_s
|
61
|
+
.split
|
62
|
+
.join(' ')
|
58
63
|
end
|
59
64
|
|
60
65
|
private
|
61
66
|
|
62
|
-
URL_ELEMENTS_WITH_URL_ATTRIBUTE = {
|
63
|
-
'a' => :href,
|
64
|
-
'img' => :src
|
65
|
-
}.freeze
|
67
|
+
URL_ELEMENTS_WITH_URL_ATTRIBUTE = { 'a' => :href, 'img' => :src }.freeze
|
66
68
|
|
67
69
|
def transform_urls_to_absolute_ones
|
68
70
|
lambda do |env|
|
@@ -78,6 +80,23 @@ module Html2rss
|
|
78
80
|
env[:node][url_attribute] = absolute_url
|
79
81
|
end
|
80
82
|
end
|
83
|
+
|
84
|
+
def wrap_img_in_a
|
85
|
+
lambda do |env|
|
86
|
+
return if env[:node_name] != 'img'
|
87
|
+
|
88
|
+
img = env[:node]
|
89
|
+
|
90
|
+
return if img.parent.name == 'a'
|
91
|
+
|
92
|
+
anchor = Nokogiri::XML::Node.new('a', img)
|
93
|
+
anchor[:href] = img[:src]
|
94
|
+
|
95
|
+
anchor.add_child img.dup
|
96
|
+
|
97
|
+
img.replace(anchor)
|
98
|
+
end
|
99
|
+
end
|
81
100
|
end
|
82
101
|
end
|
83
102
|
end
|
@@ -1,5 +1,5 @@
|
|
1
1
|
require 'rss'
|
2
|
-
|
2
|
+
require 'mime/types'
|
3
3
|
|
4
4
|
module Html2rss
|
5
5
|
##
|
@@ -20,9 +20,7 @@ module Html2rss
|
|
20
20
|
RSS::Maker.make('2.0') do |maker|
|
21
21
|
add_channel(maker)
|
22
22
|
|
23
|
-
feed_items.map
|
24
|
-
add_item(feed_item, maker.items.new_item)
|
25
|
-
end
|
23
|
+
feed_items.map { |feed_item| add_item(feed_item, maker.items.new_item) }
|
26
24
|
end
|
27
25
|
end
|
28
26
|
|
@@ -48,10 +46,25 @@ module Html2rss
|
|
48
46
|
rss_item.public_send("#{attribute_name}=".to_sym, feed_item.public_send(attribute_name))
|
49
47
|
end
|
50
48
|
|
51
|
-
feed_item.categories.each
|
52
|
-
|
53
|
-
|
49
|
+
feed_item.categories.each { |category| rss_item.categories.new_category.content = category }
|
50
|
+
add_enclosure_from_url(feed_item.enclosure_url, rss_item) if config.attribute?(:enclosure)
|
51
|
+
|
52
|
+
add_guid(feed_item, rss_item)
|
53
|
+
end
|
54
|
+
|
55
|
+
def add_enclosure_from_url(url, rss_item)
|
56
|
+
content_type = MIME::Types.type_for(File.extname(url).delete('.'))
|
57
|
+
|
58
|
+
rss_item.enclosure.type = if content_type && content_type.first
|
59
|
+
content_type.first.to_s
|
60
|
+
else
|
61
|
+
'application/octet-stream'
|
62
|
+
end
|
63
|
+
rss_item.enclosure.length = 0
|
64
|
+
rss_item.enclosure.url = url
|
65
|
+
end
|
54
66
|
|
67
|
+
def add_guid(feed_item, rss_item)
|
55
68
|
rss_item.guid.content = Digest::SHA1.hexdigest(feed_item.title)
|
56
69
|
rss_item.guid.isPermaLink = false
|
57
70
|
end
|
data/lib/html2rss/item.rb
CHANGED
@@ -1,9 +1,6 @@
|
|
1
1
|
require 'faraday'
|
2
2
|
require 'faraday_middleware'
|
3
|
-
require 'open-uri'
|
4
3
|
require 'nokogiri'
|
5
|
-
require_relative 'item_extractors'
|
6
|
-
require_relative 'attribute_post_processors'
|
7
4
|
|
8
5
|
module Html2rss
|
9
6
|
##
|
@@ -34,7 +31,7 @@ module Html2rss
|
|
34
31
|
|
35
32
|
def available_attributes
|
36
33
|
@available_attributes ||= (%w[title link description author comments updated] &
|
37
|
-
@config.attribute_names) - [
|
34
|
+
@config.attribute_names) - %w[categories enclosure]
|
38
35
|
end
|
39
36
|
|
40
37
|
##
|
@@ -54,14 +51,19 @@ module Html2rss
|
|
54
51
|
categories.keep_if { |category| category.to_s != '' }
|
55
52
|
end
|
56
53
|
|
54
|
+
def enclosure_url
|
55
|
+
enclosure = method_missing(:enclosure)
|
56
|
+
return if enclosure.to_s == ''
|
57
|
+
|
58
|
+
Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url).to_s
|
59
|
+
end
|
60
|
+
|
57
61
|
##
|
58
62
|
# @return [Array]
|
59
63
|
def self.from_url(url, config)
|
60
64
|
body = get_body_from_url(url, config)
|
61
65
|
|
62
|
-
Nokogiri
|
63
|
-
new xml_item, config
|
64
|
-
end
|
66
|
+
Nokogiri.HTML(body).css(config.selector('items')).map { |xml_item| new xml_item, config }
|
65
67
|
end
|
66
68
|
|
67
69
|
private
|
@@ -1,23 +1,17 @@
|
|
1
|
-
require_relative 'item_extractors/attribute'
|
2
|
-
require_relative 'item_extractors/current_time'
|
3
|
-
require_relative 'item_extractors/href'
|
4
|
-
require_relative 'item_extractors/html'
|
5
|
-
require_relative 'item_extractors/static'
|
6
|
-
require_relative 'item_extractors/text'
|
7
|
-
|
8
1
|
module Html2rss
|
9
2
|
##
|
10
3
|
# Provides a namespace for item extractors.
|
11
4
|
module ItemExtractors
|
12
|
-
DEFAULT = '
|
5
|
+
DEFAULT = 'Text'.freeze
|
13
6
|
|
14
7
|
def self.get_extractor(name)
|
15
|
-
@
|
16
|
-
camel_cased_name = key.split('_').map(&:capitalize).join
|
8
|
+
@get_extractor ||= Hash.new do |extractors, key|
|
9
|
+
camel_cased_name = (key || DEFAULT).split('_').map(&:capitalize).join
|
17
10
|
class_name = ['Html2rss', 'ItemExtractors', camel_cased_name].join('::')
|
11
|
+
extractors[key] = Object.const_get(class_name)
|
12
|
+
end
|
18
13
|
|
19
|
-
|
20
|
-
end[name || DEFAULT]
|
14
|
+
@get_extractor[name]
|
21
15
|
end
|
22
16
|
|
23
17
|
##
|
data/lib/html2rss/utils.rb
CHANGED
data/lib/html2rss/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-10-
|
11
|
+
date: 2019-10-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '3.6'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: mime-types
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '3.0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '3.0'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: nokogiri
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -128,6 +142,34 @@ dependencies:
|
|
128
142
|
- - "~>"
|
129
143
|
- !ruby/object:Gem::Version
|
130
144
|
version: '5.0'
|
145
|
+
- !ruby/object:Gem::Dependency
|
146
|
+
name: to_regexp
|
147
|
+
requirement: !ruby/object:Gem::Requirement
|
148
|
+
requirements:
|
149
|
+
- - ">="
|
150
|
+
- !ruby/object:Gem::Version
|
151
|
+
version: '0'
|
152
|
+
type: :runtime
|
153
|
+
prerelease: false
|
154
|
+
version_requirements: !ruby/object:Gem::Requirement
|
155
|
+
requirements:
|
156
|
+
- - ">="
|
157
|
+
- !ruby/object:Gem::Version
|
158
|
+
version: '0'
|
159
|
+
- !ruby/object:Gem::Dependency
|
160
|
+
name: zeitwerk
|
161
|
+
requirement: !ruby/object:Gem::Requirement
|
162
|
+
requirements:
|
163
|
+
- - ">="
|
164
|
+
- !ruby/object:Gem::Version
|
165
|
+
version: '0'
|
166
|
+
type: :runtime
|
167
|
+
prerelease: false
|
168
|
+
version_requirements: !ruby/object:Gem::Requirement
|
169
|
+
requirements:
|
170
|
+
- - ">="
|
171
|
+
- !ruby/object:Gem::Version
|
172
|
+
version: '0'
|
131
173
|
- !ruby/object:Gem::Dependency
|
132
174
|
name: bundler
|
133
175
|
requirement: !ruby/object:Gem::Requirement
|
@@ -277,6 +319,7 @@ files:
|
|
277
319
|
- html2rss.gemspec
|
278
320
|
- lib/html2rss.rb
|
279
321
|
- lib/html2rss/attribute_post_processors.rb
|
322
|
+
- lib/html2rss/attribute_post_processors/gsub.rb
|
280
323
|
- lib/html2rss/attribute_post_processors/html_to_markdown.rb
|
281
324
|
- lib/html2rss/attribute_post_processors/parse_time.rb
|
282
325
|
- lib/html2rss/attribute_post_processors/parse_uri.rb
|
@@ -310,7 +353,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
310
353
|
requirements:
|
311
354
|
- - ">="
|
312
355
|
- !ruby/object:Gem::Version
|
313
|
-
version: 2.4.
|
356
|
+
version: 2.4.4
|
314
357
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
315
358
|
requirements:
|
316
359
|
- - ">="
|