html2rss 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +0 -3
- data/CHANGELOG.md +14 -1
- data/Gemfile.lock +23 -15
- data/README.md +100 -20
- data/html2rss.gemspec +4 -1
- data/lib/html2rss.rb +5 -4
- data/lib/html2rss/attribute_post_processors.rb +6 -10
- data/lib/html2rss/attribute_post_processors/gsub.rb +39 -0
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +37 -18
- data/lib/html2rss/feed_builder.rb +20 -7
- data/lib/html2rss/item.rb +9 -7
- data/lib/html2rss/item_extractors.rb +6 -12
- data/lib/html2rss/utils.rb +1 -1
- data/lib/html2rss/version.rb +1 -1
- metadata +46 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 645fe7ea0ebe4a733c9833e8c22a93aa3f2d2b3b8589160fb001ad6ffd4659d3
|
|
4
|
+
data.tar.gz: 0fb454258a1f334243984bdb73a699ffcfc676c529d87bb0886fbba75127b7cf
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 350f0fd0e11bd35963c89b56aea9eb1dc90ae72956d849fd85b0b67d39e0bb7625176558becc0712f5e573a7605b541dea6497557699f727e4c27ef510d6d58f
|
|
7
|
+
data.tar.gz: e83905ddf24fc4c63793392cd47aa2d02e984c44ba489610119cff6a25bb70be57f4a95f5b9fb78229d9dc6366e0181d2322743eeee8ba59ee840831f8790bb1
|
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
|
@@ -1,4 +1,17 @@
|
|
|
1
|
-
# [](https://github.com/gildesmarais/html2rss/compare/v0.
|
|
1
|
+
# [](https://github.com/gildesmarais/html2rss/compare/v0.7.0...v) (2019-10-28)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
# [0.7.0](https://github.com/gildesmarais/html2rss/compare/v0.6.0...v0.7.0) (2019-10-28)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
### Features
|
|
9
|
+
|
|
10
|
+
* handle json array response ([#49](https://github.com/gildesmarais/html2rss/issues/49)) ([288c2af](https://github.com/gildesmarais/html2rss/commit/288c2af))
|
|
11
|
+
* support enclosure on items ([#52](https://github.com/gildesmarais/html2rss/issues/52)) ([80a30a1](https://github.com/gildesmarais/html2rss/commit/80a30a1)), closes [#50](https://github.com/gildesmarais/html2rss/issues/50)
|
|
12
|
+
* use zeitwerk for autoloading ([#47](https://github.com/gildesmarais/html2rss/issues/47)) ([bce523d](https://github.com/gildesmarais/html2rss/commit/bce523d))
|
|
13
|
+
* **post_processors:** add gsub ([#53](https://github.com/gildesmarais/html2rss/issues/53)) ([de268ae](https://github.com/gildesmarais/html2rss/commit/de268ae))
|
|
14
|
+
* **postprocessor:** always wrap img tag in an a tag in sanitize html ([#51](https://github.com/gildesmarais/html2rss/issues/51)) ([6c7fb88](https://github.com/gildesmarais/html2rss/commit/6c7fb88))
|
|
2
15
|
|
|
3
16
|
|
|
4
17
|
|
data/Gemfile.lock
CHANGED
|
@@ -1,15 +1,18 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html2rss (0.
|
|
4
|
+
html2rss (0.7.0)
|
|
5
5
|
activesupport (~> 5.0)
|
|
6
6
|
builder
|
|
7
7
|
faraday (~> 0.15)
|
|
8
8
|
faraday_middleware (~> 0.13)
|
|
9
9
|
hashie (~> 3.6)
|
|
10
|
+
mime-types (> 3.0)
|
|
10
11
|
nokogiri (>= 1.10, < 2.0)
|
|
11
12
|
reverse_markdown (~> 1.3)
|
|
12
13
|
sanitize (~> 5.0)
|
|
14
|
+
to_regexp
|
|
15
|
+
zeitwerk
|
|
13
16
|
|
|
14
17
|
GEM
|
|
15
18
|
remote: https://rubygems.org/
|
|
@@ -23,10 +26,10 @@ GEM
|
|
|
23
26
|
builder (3.2.3)
|
|
24
27
|
byebug (11.0.1)
|
|
25
28
|
concurrent-ruby (1.1.5)
|
|
26
|
-
crass (1.0.
|
|
29
|
+
crass (1.0.5)
|
|
27
30
|
diff-lcs (1.3)
|
|
28
31
|
docile (1.3.2)
|
|
29
|
-
faraday (0.
|
|
32
|
+
faraday (0.17.0)
|
|
30
33
|
multipart-post (>= 1.2, < 3)
|
|
31
34
|
faraday_middleware (0.13.1)
|
|
32
35
|
faraday (>= 0.7.4, < 1.0)
|
|
@@ -35,6 +38,9 @@ GEM
|
|
|
35
38
|
concurrent-ruby (~> 1.0)
|
|
36
39
|
jaro_winkler (1.5.3)
|
|
37
40
|
json (2.2.0)
|
|
41
|
+
mime-types (3.3)
|
|
42
|
+
mime-types-data (~> 3.2015)
|
|
43
|
+
mime-types-data (3.2019.1009)
|
|
38
44
|
mini_portile2 (2.4.0)
|
|
39
45
|
minitest (5.12.2)
|
|
40
46
|
multipart-post (2.1.1)
|
|
@@ -42,25 +48,25 @@ GEM
|
|
|
42
48
|
mini_portile2 (~> 2.4.0)
|
|
43
49
|
nokogumbo (2.0.1)
|
|
44
50
|
nokogiri (~> 1.8, >= 1.8.4)
|
|
45
|
-
parallel (1.
|
|
51
|
+
parallel (1.18.0)
|
|
46
52
|
parser (2.6.5.0)
|
|
47
53
|
ast (~> 2.4.0)
|
|
48
54
|
rainbow (3.0.0)
|
|
49
55
|
reverse_markdown (1.3.0)
|
|
50
56
|
nokogiri
|
|
51
|
-
rspec (3.
|
|
52
|
-
rspec-core (~> 3.
|
|
53
|
-
rspec-expectations (~> 3.
|
|
54
|
-
rspec-mocks (~> 3.
|
|
55
|
-
rspec-core (3.
|
|
56
|
-
rspec-support (~> 3.
|
|
57
|
-
rspec-expectations (3.
|
|
57
|
+
rspec (3.9.0)
|
|
58
|
+
rspec-core (~> 3.9.0)
|
|
59
|
+
rspec-expectations (~> 3.9.0)
|
|
60
|
+
rspec-mocks (~> 3.9.0)
|
|
61
|
+
rspec-core (3.9.0)
|
|
62
|
+
rspec-support (~> 3.9.0)
|
|
63
|
+
rspec-expectations (3.9.0)
|
|
58
64
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
59
|
-
rspec-support (~> 3.
|
|
60
|
-
rspec-mocks (3.
|
|
65
|
+
rspec-support (~> 3.9.0)
|
|
66
|
+
rspec-mocks (3.9.0)
|
|
61
67
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
62
|
-
rspec-support (~> 3.
|
|
63
|
-
rspec-support (3.
|
|
68
|
+
rspec-support (~> 3.9.0)
|
|
69
|
+
rspec-support (3.9.0)
|
|
64
70
|
rubocop (0.75.0)
|
|
65
71
|
jaro_winkler (~> 1.5.1)
|
|
66
72
|
parallel (~> 1.10)
|
|
@@ -83,11 +89,13 @@ GEM
|
|
|
83
89
|
simplecov-html (~> 0.10.0)
|
|
84
90
|
simplecov-html (0.10.2)
|
|
85
91
|
thread_safe (0.3.6)
|
|
92
|
+
to_regexp (0.2.1)
|
|
86
93
|
tzinfo (1.2.5)
|
|
87
94
|
thread_safe (~> 0.1)
|
|
88
95
|
unicode-display_width (1.6.0)
|
|
89
96
|
vcr (5.0.0)
|
|
90
97
|
yard (0.9.20)
|
|
98
|
+
zeitwerk (2.2.0)
|
|
91
99
|
|
|
92
100
|
PLATFORMS
|
|
93
101
|
ruby
|
data/README.md
CHANGED
|
@@ -20,45 +20,106 @@ Add this line to your application's Gemfile: `gem 'html2rss'`
|
|
|
20
20
|
Then execute: `bundle`
|
|
21
21
|
|
|
22
22
|
```ruby
|
|
23
|
-
rss =
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
23
|
+
rss =
|
|
24
|
+
Html2rss.feed(
|
|
25
|
+
channel: { title: 'StackOverflow: Hot Network Questions', url: 'https://stackoverflow.com/questions' },
|
|
26
|
+
selectors: {
|
|
27
|
+
items: { selector: '#hot-network-questions > ul > li' },
|
|
28
|
+
title: { selector: 'a' },
|
|
29
|
+
link: { selector: 'a', extractor: 'href' }
|
|
30
|
+
}
|
|
31
|
+
)
|
|
31
32
|
|
|
32
33
|
puts rss.to_s
|
|
33
34
|
```
|
|
34
35
|
|
|
35
36
|
## Usage with a YAML config file
|
|
36
37
|
|
|
37
|
-
Create a YAML config file. Find an example at [`
|
|
38
|
+
Create a YAML config file. Find an example at [`spec/config.test.yml`](https://github.com/gildesmarais/html2rss/blob/master/spec/config.test.yml).
|
|
38
39
|
|
|
39
|
-
`Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')`
|
|
40
|
-
|
|
41
|
-
an `RSS:Rss` object.
|
|
40
|
+
`Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')`
|
|
41
|
+
returns an `RSS::Rss` object.
|
|
42
42
|
|
|
43
43
|
**Too complicated?** See [`html2rss-configs`](https://github.com/gildesmarais/html2rss-configs) for ready-made feed configs!
|
|
44
44
|
|
|
45
|
+
## Assigning categories to an item
|
|
46
|
+
|
|
47
|
+
The `categories` selector takes an array of selector names. The value of those
|
|
48
|
+
selectors will become a category on the item.
|
|
49
|
+
|
|
50
|
+
<details>
|
|
51
|
+
<summary>See a YAML config example</summary>
|
|
52
|
+
|
|
53
|
+
```yml
|
|
54
|
+
channel:
|
|
55
|
+
# ... omitted
|
|
56
|
+
selectors:
|
|
57
|
+
#... omitted
|
|
58
|
+
genre:
|
|
59
|
+
selector: '.genre'
|
|
60
|
+
branch:
|
|
61
|
+
selector: '.branch'
|
|
62
|
+
categories:
|
|
63
|
+
- genre
|
|
64
|
+
- branch
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
</details>
|
|
68
|
+
|
|
69
|
+
## Adding an enclosure to each item
|
|
70
|
+
|
|
71
|
+
An enclosure can be 'anything', e.g. an image, audio or video file.
|
|
72
|
+
|
|
73
|
+
The config's `enclosure` selector needs to return a URL of the content to enclose. If the extracted URL is relative, it will be converted to an absolute one using the channel's url as a base.
|
|
74
|
+
|
|
75
|
+
Since html2rss does no further inspection of the enclosure, the support of this tag comes with trade-offs:
|
|
76
|
+
|
|
77
|
+
1. The content-type is guessed from the file extension of the URL.
|
|
78
|
+
2. If the content-type guessing fails, it will default to `application/octet-stream`.
|
|
79
|
+
3. The content-length will always be undetermined and thus stated as `0` bytes.
|
|
80
|
+
|
|
81
|
+
Read the [RSS 2.0 spec](http://www.rssboard.org/rss-profile#element-channel-item-enclosure) for further information on enclosing content.
|
|
82
|
+
|
|
83
|
+
<details>
|
|
84
|
+
<summary>See a YAML config example</summary>
|
|
85
|
+
|
|
86
|
+
```yml
|
|
87
|
+
channel:
|
|
88
|
+
# ... omitted
|
|
89
|
+
selectors:
|
|
90
|
+
#... omitted
|
|
91
|
+
enclosure:
|
|
92
|
+
selector: 'img'
|
|
93
|
+
extractor: 'attribute'
|
|
94
|
+
attribute: 'src'
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
</details>
|
|
98
|
+
|
|
45
99
|
## Scraping JSON
|
|
46
100
|
|
|
47
|
-
Since 0.5.0 it
|
|
101
|
+
Since 0.5.0 it's possible to scrape and process JSON.
|
|
48
102
|
|
|
49
103
|
Adding `json: true` to the channel config will convert the JSON response to XML.
|
|
50
104
|
|
|
51
|
-
|
|
105
|
+
<details>
|
|
106
|
+
<summary>See a YAML feed config example</summary>
|
|
52
107
|
|
|
53
108
|
```yaml
|
|
54
109
|
channel:
|
|
55
110
|
url: https://example.com
|
|
56
|
-
title:
|
|
111
|
+
title: 'Example with JSON'
|
|
57
112
|
json: true
|
|
58
113
|
# ...
|
|
59
114
|
```
|
|
60
115
|
|
|
61
|
-
|
|
116
|
+
</details>
|
|
117
|
+
|
|
118
|
+
Under the hood it uses ActiveSupport's [`Hash.to_xml`](https://apidock.com/rails/Hash/to_xml) core extension for the JSON to XML conversion.
|
|
119
|
+
|
|
120
|
+
### Conversion of JSON objects
|
|
121
|
+
|
|
122
|
+
This JSON object:
|
|
62
123
|
|
|
63
124
|
```json
|
|
64
125
|
{
|
|
@@ -69,19 +130,38 @@ Imagine this HTTP response:
|
|
|
69
130
|
will be converted to:
|
|
70
131
|
|
|
71
132
|
```xml
|
|
72
|
-
<
|
|
133
|
+
<hash>
|
|
73
134
|
<data>
|
|
74
135
|
<datum>
|
|
75
136
|
<title>Headline</title>
|
|
76
137
|
<url>https://example.com</url>
|
|
77
138
|
</datum>
|
|
78
139
|
</data>
|
|
79
|
-
</
|
|
140
|
+
</hash>
|
|
80
141
|
```
|
|
81
142
|
|
|
82
|
-
Your items selector would be `data > datum`, the item's link selector would be `url`.
|
|
143
|
+
Your items selector would be `data > datum`, the item's `link` selector would be `url`.
|
|
83
144
|
|
|
84
|
-
|
|
145
|
+
### Conversion of JSON arrays
|
|
146
|
+
|
|
147
|
+
This JSON array:
|
|
148
|
+
|
|
149
|
+
```json
|
|
150
|
+
[{ "title": "Headline", "url": "https://example.com" }]
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
will be converted to:
|
|
154
|
+
|
|
155
|
+
```xml
|
|
156
|
+
<objects>
|
|
157
|
+
<object>
|
|
158
|
+
<title>Headline</title>
|
|
159
|
+
<url>https://example.com</url>
|
|
160
|
+
</object>
|
|
161
|
+
</objects>
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
Your items selector would be `objects > object`, the item's `link` selector would be `url`.
|
|
85
165
|
|
|
86
166
|
## Set any HTTP header in the request
|
|
87
167
|
|
data/html2rss.gemspec
CHANGED
|
@@ -12,7 +12,7 @@ Gem::Specification.new do |spec|
|
|
|
12
12
|
spec.description = 'Give the URL to scrape and some CSS selectors. Get a RSS::Rss instance in return.'
|
|
13
13
|
spec.homepage = 'https://github.com/gildesmarais/html2rss'
|
|
14
14
|
spec.license = 'MIT'
|
|
15
|
-
spec.required_ruby_version = '>= 2.4.
|
|
15
|
+
spec.required_ruby_version = '>= 2.4.4'
|
|
16
16
|
|
|
17
17
|
if spec.respond_to?(:metadata)
|
|
18
18
|
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
|
@@ -34,9 +34,12 @@ Gem::Specification.new do |spec|
|
|
|
34
34
|
spec.add_dependency 'faraday', '~> 0.15'
|
|
35
35
|
spec.add_dependency 'faraday_middleware', '~> 0.13'
|
|
36
36
|
spec.add_dependency 'hashie', '~> 3.6'
|
|
37
|
+
spec.add_dependency 'mime-types', '> 3.0'
|
|
37
38
|
spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
|
|
38
39
|
spec.add_dependency 'reverse_markdown', '~> 1.3'
|
|
39
40
|
spec.add_dependency 'sanitize', '~> 5.0'
|
|
41
|
+
spec.add_dependency 'to_regexp'
|
|
42
|
+
spec.add_dependency 'zeitwerk'
|
|
40
43
|
spec.add_development_dependency 'bundler', '~> 1.16'
|
|
41
44
|
spec.add_development_dependency 'byebug'
|
|
42
45
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
data/lib/html2rss.rb
CHANGED
|
@@ -1,19 +1,15 @@
|
|
|
1
|
-
require_relative 'attribute_post_processors/html_to_markdown'
|
|
2
|
-
require_relative 'attribute_post_processors/parse_time'
|
|
3
|
-
require_relative 'attribute_post_processors/parse_uri'
|
|
4
|
-
require_relative 'attribute_post_processors/sanitize_html'
|
|
5
|
-
require_relative 'attribute_post_processors/substring'
|
|
6
|
-
require_relative 'attribute_post_processors/template'
|
|
7
|
-
|
|
8
1
|
module Html2rss
|
|
9
2
|
##
|
|
10
3
|
# Provides a namespace for attribute post processors.
|
|
11
4
|
module AttributePostProcessors
|
|
12
5
|
def self.get_processor(name)
|
|
13
|
-
|
|
14
|
-
|
|
6
|
+
@get_processor ||= Hash.new do |processors, key|
|
|
7
|
+
camel_cased_name = key.split('_').map(&:capitalize).join
|
|
8
|
+
class_name = ['Html2rss', 'AttributePostProcessors', camel_cased_name].join('::')
|
|
9
|
+
processors[key] = Object.const_get(class_name)
|
|
10
|
+
end
|
|
15
11
|
|
|
16
|
-
|
|
12
|
+
@get_processor[name]
|
|
17
13
|
end
|
|
18
14
|
end
|
|
19
15
|
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
require 'to_regexp'
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
module AttributePostProcessors
|
|
5
|
+
##
|
|
6
|
+
#
|
|
7
|
+
# Imagine this HTML:
|
|
8
|
+
# <h1>Foo bar and boo</h1>
|
|
9
|
+
#
|
|
10
|
+
# YAML usage example:
|
|
11
|
+
# selectors:
|
|
12
|
+
# title:
|
|
13
|
+
# selector: h1
|
|
14
|
+
# post_process:
|
|
15
|
+
# name: gsub
|
|
16
|
+
# pattern: boo
|
|
17
|
+
# replacement: baz
|
|
18
|
+
#
|
|
19
|
+
# Would return:
|
|
20
|
+
# 'Foo bar and baz'
|
|
21
|
+
#
|
|
22
|
+
# `pattern` can be a Regexp or a String.
|
|
23
|
+
# `replacement` can be a String or a Hash.
|
|
24
|
+
# See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
|
|
25
|
+
class Gsub
|
|
26
|
+
def initialize(value, env)
|
|
27
|
+
@value = value
|
|
28
|
+
@pattern = env[:options]['pattern'].to_regexp || env[:options]['pattern']
|
|
29
|
+
@replacement = env[:options]['replacement']
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
##
|
|
33
|
+
# @return [String]
|
|
34
|
+
def get
|
|
35
|
+
@value.to_s.gsub(@pattern, @replacement)
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
@@ -17,6 +17,10 @@ module Html2rss
|
|
|
17
17
|
# <script>alert();</script>
|
|
18
18
|
# </section>
|
|
19
19
|
#
|
|
20
|
+
# It also:
|
|
21
|
+
#
|
|
22
|
+
# - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
|
|
23
|
+
#
|
|
20
24
|
# YAML usage example:
|
|
21
25
|
#
|
|
22
26
|
# selectors:
|
|
@@ -41,28 +45,26 @@ module Html2rss
|
|
|
41
45
|
# - adds target="_blank" to a elements
|
|
42
46
|
# @return [String]
|
|
43
47
|
def get
|
|
44
|
-
Sanitize.fragment(
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
48
|
+
Sanitize.fragment(
|
|
49
|
+
@value,
|
|
50
|
+
Sanitize::Config.merge(
|
|
51
|
+
Sanitize::Config::RELAXED,
|
|
52
|
+
attributes: { all: %w[dir lang alt title translate] },
|
|
53
|
+
add_attributes: {
|
|
54
|
+
'a' => { 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank' },
|
|
55
|
+
'img' => { 'referrer-policy' => 'no-referrer' }
|
|
56
|
+
},
|
|
57
|
+
transformers: [transform_urls_to_absolute_ones, wrap_img_in_a]
|
|
58
|
+
)
|
|
59
|
+
)
|
|
60
|
+
.to_s
|
|
61
|
+
.split
|
|
62
|
+
.join(' ')
|
|
58
63
|
end
|
|
59
64
|
|
|
60
65
|
private
|
|
61
66
|
|
|
62
|
-
URL_ELEMENTS_WITH_URL_ATTRIBUTE = {
|
|
63
|
-
'a' => :href,
|
|
64
|
-
'img' => :src
|
|
65
|
-
}.freeze
|
|
67
|
+
URL_ELEMENTS_WITH_URL_ATTRIBUTE = { 'a' => :href, 'img' => :src }.freeze
|
|
66
68
|
|
|
67
69
|
def transform_urls_to_absolute_ones
|
|
68
70
|
lambda do |env|
|
|
@@ -78,6 +80,23 @@ module Html2rss
|
|
|
78
80
|
env[:node][url_attribute] = absolute_url
|
|
79
81
|
end
|
|
80
82
|
end
|
|
83
|
+
|
|
84
|
+
def wrap_img_in_a
|
|
85
|
+
lambda do |env|
|
|
86
|
+
return if env[:node_name] != 'img'
|
|
87
|
+
|
|
88
|
+
img = env[:node]
|
|
89
|
+
|
|
90
|
+
return if img.parent.name == 'a'
|
|
91
|
+
|
|
92
|
+
anchor = Nokogiri::XML::Node.new('a', img)
|
|
93
|
+
anchor[:href] = img[:src]
|
|
94
|
+
|
|
95
|
+
anchor.add_child img.dup
|
|
96
|
+
|
|
97
|
+
img.replace(anchor)
|
|
98
|
+
end
|
|
99
|
+
end
|
|
81
100
|
end
|
|
82
101
|
end
|
|
83
102
|
end
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
require 'rss'
|
|
2
|
-
|
|
2
|
+
require 'mime/types'
|
|
3
3
|
|
|
4
4
|
module Html2rss
|
|
5
5
|
##
|
|
@@ -20,9 +20,7 @@ module Html2rss
|
|
|
20
20
|
RSS::Maker.make('2.0') do |maker|
|
|
21
21
|
add_channel(maker)
|
|
22
22
|
|
|
23
|
-
feed_items.map
|
|
24
|
-
add_item(feed_item, maker.items.new_item)
|
|
25
|
-
end
|
|
23
|
+
feed_items.map { |feed_item| add_item(feed_item, maker.items.new_item) }
|
|
26
24
|
end
|
|
27
25
|
end
|
|
28
26
|
|
|
@@ -48,10 +46,25 @@ module Html2rss
|
|
|
48
46
|
rss_item.public_send("#{attribute_name}=".to_sym, feed_item.public_send(attribute_name))
|
|
49
47
|
end
|
|
50
48
|
|
|
51
|
-
feed_item.categories.each
|
|
52
|
-
|
|
53
|
-
|
|
49
|
+
feed_item.categories.each { |category| rss_item.categories.new_category.content = category }
|
|
50
|
+
add_enclosure_from_url(feed_item.enclosure_url, rss_item) if config.attribute?(:enclosure)
|
|
51
|
+
|
|
52
|
+
add_guid(feed_item, rss_item)
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def add_enclosure_from_url(url, rss_item)
|
|
56
|
+
content_type = MIME::Types.type_for(File.extname(url).delete('.'))
|
|
57
|
+
|
|
58
|
+
rss_item.enclosure.type = if content_type && content_type.first
|
|
59
|
+
content_type.first.to_s
|
|
60
|
+
else
|
|
61
|
+
'application/octet-stream'
|
|
62
|
+
end
|
|
63
|
+
rss_item.enclosure.length = 0
|
|
64
|
+
rss_item.enclosure.url = url
|
|
65
|
+
end
|
|
54
66
|
|
|
67
|
+
def add_guid(feed_item, rss_item)
|
|
55
68
|
rss_item.guid.content = Digest::SHA1.hexdigest(feed_item.title)
|
|
56
69
|
rss_item.guid.isPermaLink = false
|
|
57
70
|
end
|
data/lib/html2rss/item.rb
CHANGED
|
@@ -1,9 +1,6 @@
|
|
|
1
1
|
require 'faraday'
|
|
2
2
|
require 'faraday_middleware'
|
|
3
|
-
require 'open-uri'
|
|
4
3
|
require 'nokogiri'
|
|
5
|
-
require_relative 'item_extractors'
|
|
6
|
-
require_relative 'attribute_post_processors'
|
|
7
4
|
|
|
8
5
|
module Html2rss
|
|
9
6
|
##
|
|
@@ -34,7 +31,7 @@ module Html2rss
|
|
|
34
31
|
|
|
35
32
|
def available_attributes
|
|
36
33
|
@available_attributes ||= (%w[title link description author comments updated] &
|
|
37
|
-
@config.attribute_names) - [
|
|
34
|
+
@config.attribute_names) - %w[categories enclosure]
|
|
38
35
|
end
|
|
39
36
|
|
|
40
37
|
##
|
|
@@ -54,14 +51,19 @@ module Html2rss
|
|
|
54
51
|
categories.keep_if { |category| category.to_s != '' }
|
|
55
52
|
end
|
|
56
53
|
|
|
54
|
+
def enclosure_url
|
|
55
|
+
enclosure = method_missing(:enclosure)
|
|
56
|
+
return if enclosure.to_s == ''
|
|
57
|
+
|
|
58
|
+
Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url).to_s
|
|
59
|
+
end
|
|
60
|
+
|
|
57
61
|
##
|
|
58
62
|
# @return [Array]
|
|
59
63
|
def self.from_url(url, config)
|
|
60
64
|
body = get_body_from_url(url, config)
|
|
61
65
|
|
|
62
|
-
Nokogiri
|
|
63
|
-
new xml_item, config
|
|
64
|
-
end
|
|
66
|
+
Nokogiri.HTML(body).css(config.selector('items')).map { |xml_item| new xml_item, config }
|
|
65
67
|
end
|
|
66
68
|
|
|
67
69
|
private
|
|
@@ -1,23 +1,17 @@
|
|
|
1
|
-
require_relative 'item_extractors/attribute'
|
|
2
|
-
require_relative 'item_extractors/current_time'
|
|
3
|
-
require_relative 'item_extractors/href'
|
|
4
|
-
require_relative 'item_extractors/html'
|
|
5
|
-
require_relative 'item_extractors/static'
|
|
6
|
-
require_relative 'item_extractors/text'
|
|
7
|
-
|
|
8
1
|
module Html2rss
|
|
9
2
|
##
|
|
10
3
|
# Provides a namespace for item extractors.
|
|
11
4
|
module ItemExtractors
|
|
12
|
-
DEFAULT = '
|
|
5
|
+
DEFAULT = 'Text'.freeze
|
|
13
6
|
|
|
14
7
|
def self.get_extractor(name)
|
|
15
|
-
@
|
|
16
|
-
camel_cased_name = key.split('_').map(&:capitalize).join
|
|
8
|
+
@get_extractor ||= Hash.new do |extractors, key|
|
|
9
|
+
camel_cased_name = (key || DEFAULT).split('_').map(&:capitalize).join
|
|
17
10
|
class_name = ['Html2rss', 'ItemExtractors', camel_cased_name].join('::')
|
|
11
|
+
extractors[key] = Object.const_get(class_name)
|
|
12
|
+
end
|
|
18
13
|
|
|
19
|
-
|
|
20
|
-
end[name || DEFAULT]
|
|
14
|
+
@get_extractor[name]
|
|
21
15
|
end
|
|
22
16
|
|
|
23
17
|
##
|
data/lib/html2rss/utils.rb
CHANGED
data/lib/html2rss/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html2rss
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.7.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gil Desmarais
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2019-10-
|
|
11
|
+
date: 2019-10-28 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: activesupport
|
|
@@ -80,6 +80,20 @@ dependencies:
|
|
|
80
80
|
- - "~>"
|
|
81
81
|
- !ruby/object:Gem::Version
|
|
82
82
|
version: '3.6'
|
|
83
|
+
- !ruby/object:Gem::Dependency
|
|
84
|
+
name: mime-types
|
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
|
86
|
+
requirements:
|
|
87
|
+
- - ">"
|
|
88
|
+
- !ruby/object:Gem::Version
|
|
89
|
+
version: '3.0'
|
|
90
|
+
type: :runtime
|
|
91
|
+
prerelease: false
|
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
93
|
+
requirements:
|
|
94
|
+
- - ">"
|
|
95
|
+
- !ruby/object:Gem::Version
|
|
96
|
+
version: '3.0'
|
|
83
97
|
- !ruby/object:Gem::Dependency
|
|
84
98
|
name: nokogiri
|
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -128,6 +142,34 @@ dependencies:
|
|
|
128
142
|
- - "~>"
|
|
129
143
|
- !ruby/object:Gem::Version
|
|
130
144
|
version: '5.0'
|
|
145
|
+
- !ruby/object:Gem::Dependency
|
|
146
|
+
name: to_regexp
|
|
147
|
+
requirement: !ruby/object:Gem::Requirement
|
|
148
|
+
requirements:
|
|
149
|
+
- - ">="
|
|
150
|
+
- !ruby/object:Gem::Version
|
|
151
|
+
version: '0'
|
|
152
|
+
type: :runtime
|
|
153
|
+
prerelease: false
|
|
154
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
155
|
+
requirements:
|
|
156
|
+
- - ">="
|
|
157
|
+
- !ruby/object:Gem::Version
|
|
158
|
+
version: '0'
|
|
159
|
+
- !ruby/object:Gem::Dependency
|
|
160
|
+
name: zeitwerk
|
|
161
|
+
requirement: !ruby/object:Gem::Requirement
|
|
162
|
+
requirements:
|
|
163
|
+
- - ">="
|
|
164
|
+
- !ruby/object:Gem::Version
|
|
165
|
+
version: '0'
|
|
166
|
+
type: :runtime
|
|
167
|
+
prerelease: false
|
|
168
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
169
|
+
requirements:
|
|
170
|
+
- - ">="
|
|
171
|
+
- !ruby/object:Gem::Version
|
|
172
|
+
version: '0'
|
|
131
173
|
- !ruby/object:Gem::Dependency
|
|
132
174
|
name: bundler
|
|
133
175
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -277,6 +319,7 @@ files:
|
|
|
277
319
|
- html2rss.gemspec
|
|
278
320
|
- lib/html2rss.rb
|
|
279
321
|
- lib/html2rss/attribute_post_processors.rb
|
|
322
|
+
- lib/html2rss/attribute_post_processors/gsub.rb
|
|
280
323
|
- lib/html2rss/attribute_post_processors/html_to_markdown.rb
|
|
281
324
|
- lib/html2rss/attribute_post_processors/parse_time.rb
|
|
282
325
|
- lib/html2rss/attribute_post_processors/parse_uri.rb
|
|
@@ -310,7 +353,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
310
353
|
requirements:
|
|
311
354
|
- - ">="
|
|
312
355
|
- !ruby/object:Gem::Version
|
|
313
|
-
version: 2.4.
|
|
356
|
+
version: 2.4.4
|
|
314
357
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
315
358
|
requirements:
|
|
316
359
|
- - ">="
|