html2rss 0.6.0 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +122 -17
- data/.travis.yml +3 -3
- data/CHANGELOG.md +97 -42
- data/Gemfile +2 -0
- data/Gemfile.lock +84 -53
- data/README.md +461 -47
- data/html2rss.gemspec +11 -7
- data/lib/html2rss.rb +5 -4
- data/lib/html2rss/attribute_post_processors.rb +4 -10
- data/lib/html2rss/attribute_post_processors/gsub.rb +42 -0
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +45 -0
- data/lib/html2rss/attribute_post_processors/parse_time.rb +1 -1
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +3 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +42 -22
- data/lib/html2rss/attribute_post_processors/substring.rb +11 -5
- data/lib/html2rss/attribute_post_processors/template.rb +23 -14
- data/lib/html2rss/config.rb +40 -20
- data/lib/html2rss/feed_builder.rb +42 -20
- data/lib/html2rss/item.rb +24 -18
- data/lib/html2rss/item_extractors.rb +6 -13
- data/lib/html2rss/item_extractors/attribute.rb +1 -1
- data/lib/html2rss/item_extractors/href.rb +2 -2
- data/lib/html2rss/item_extractors/static.rb +2 -2
- data/lib/html2rss/utils.rb +18 -12
- data/lib/html2rss/version.rb +2 -1
- metadata +88 -23
data/Gemfile.lock
CHANGED
@@ -1,100 +1,131 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
html2rss (0.
|
5
|
-
activesupport (
|
4
|
+
html2rss (0.9.0)
|
5
|
+
activesupport (>= 5, < 7)
|
6
|
+
addressable (~> 2.7)
|
6
7
|
builder
|
7
|
-
faraday (~> 0
|
8
|
-
faraday_middleware
|
9
|
-
|
8
|
+
faraday (~> 1.0)
|
9
|
+
faraday_middleware
|
10
|
+
kramdown
|
11
|
+
mime-types (> 3.0)
|
10
12
|
nokogiri (>= 1.10, < 2.0)
|
11
|
-
reverse_markdown (~>
|
13
|
+
reverse_markdown (~> 2.0)
|
12
14
|
sanitize (~> 5.0)
|
15
|
+
to_regexp
|
16
|
+
zeitwerk
|
13
17
|
|
14
18
|
GEM
|
15
19
|
remote: https://rubygems.org/
|
16
20
|
specs:
|
17
|
-
activesupport (
|
21
|
+
activesupport (6.0.3.2)
|
18
22
|
concurrent-ruby (~> 1.0, >= 1.0.2)
|
19
23
|
i18n (>= 0.7, < 2)
|
20
24
|
minitest (~> 5.1)
|
21
25
|
tzinfo (~> 1.1)
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
26
|
+
zeitwerk (~> 2.2, >= 2.2.2)
|
27
|
+
addressable (2.7.0)
|
28
|
+
public_suffix (>= 2.0.2, < 5.0)
|
29
|
+
ast (2.4.1)
|
30
|
+
builder (3.2.4)
|
31
|
+
byebug (11.1.3)
|
32
|
+
concurrent-ruby (1.1.6)
|
33
|
+
coveralls (0.7.2)
|
34
|
+
multi_json (~> 1.3)
|
35
|
+
rest-client (= 1.6.7)
|
36
|
+
simplecov (>= 0.7)
|
37
|
+
term-ansicolor (= 1.2.2)
|
38
|
+
thor (= 0.18.1)
|
39
|
+
crass (1.0.6)
|
27
40
|
diff-lcs (1.3)
|
28
41
|
docile (1.3.2)
|
29
|
-
faraday (0.
|
42
|
+
faraday (1.0.1)
|
30
43
|
multipart-post (>= 1.2, < 3)
|
31
|
-
faraday_middleware (0.
|
32
|
-
faraday (
|
33
|
-
|
34
|
-
i18n (1.7.0)
|
44
|
+
faraday_middleware (1.0.0)
|
45
|
+
faraday (~> 1.0)
|
46
|
+
i18n (1.8.3)
|
35
47
|
concurrent-ruby (~> 1.0)
|
36
|
-
|
37
|
-
|
48
|
+
kramdown (2.2.1)
|
49
|
+
rexml
|
50
|
+
mime-types (3.3.1)
|
51
|
+
mime-types-data (~> 3.2015)
|
52
|
+
mime-types-data (3.2020.0512)
|
38
53
|
mini_portile2 (2.4.0)
|
39
|
-
minitest (5.
|
54
|
+
minitest (5.14.1)
|
55
|
+
multi_json (1.14.1)
|
40
56
|
multipart-post (2.1.1)
|
41
|
-
nokogiri (1.10.
|
57
|
+
nokogiri (1.10.9)
|
42
58
|
mini_portile2 (~> 2.4.0)
|
43
|
-
nokogumbo (2.0.
|
59
|
+
nokogumbo (2.0.2)
|
44
60
|
nokogiri (~> 1.8, >= 1.8.4)
|
45
|
-
parallel (1.
|
46
|
-
parser (2.
|
61
|
+
parallel (1.19.2)
|
62
|
+
parser (2.7.1.3)
|
47
63
|
ast (~> 2.4.0)
|
64
|
+
public_suffix (4.0.5)
|
48
65
|
rainbow (3.0.0)
|
49
|
-
|
66
|
+
regexp_parser (1.7.1)
|
67
|
+
rest-client (1.6.7)
|
68
|
+
mime-types (>= 1.16)
|
69
|
+
reverse_markdown (2.0.0)
|
50
70
|
nokogiri
|
51
|
-
|
52
|
-
|
53
|
-
rspec-
|
54
|
-
rspec-
|
55
|
-
|
56
|
-
|
57
|
-
|
71
|
+
rexml (3.2.4)
|
72
|
+
rspec (3.9.0)
|
73
|
+
rspec-core (~> 3.9.0)
|
74
|
+
rspec-expectations (~> 3.9.0)
|
75
|
+
rspec-mocks (~> 3.9.0)
|
76
|
+
rspec-core (3.9.2)
|
77
|
+
rspec-support (~> 3.9.3)
|
78
|
+
rspec-expectations (3.9.2)
|
58
79
|
diff-lcs (>= 1.2.0, < 2.0)
|
59
|
-
rspec-support (~> 3.
|
60
|
-
rspec-mocks (3.
|
80
|
+
rspec-support (~> 3.9.0)
|
81
|
+
rspec-mocks (3.9.1)
|
61
82
|
diff-lcs (>= 1.2.0, < 2.0)
|
62
|
-
rspec-support (~> 3.
|
63
|
-
rspec-support (3.
|
64
|
-
rubocop (0.
|
65
|
-
jaro_winkler (~> 1.5.1)
|
83
|
+
rspec-support (~> 3.9.0)
|
84
|
+
rspec-support (3.9.3)
|
85
|
+
rubocop (0.85.1)
|
66
86
|
parallel (~> 1.10)
|
67
|
-
parser (>= 2.
|
87
|
+
parser (>= 2.7.0.1)
|
68
88
|
rainbow (>= 2.2.2, < 4.0)
|
89
|
+
regexp_parser (>= 1.7)
|
90
|
+
rexml
|
91
|
+
rubocop-ast (>= 0.0.3)
|
69
92
|
ruby-progressbar (~> 1.7)
|
70
|
-
unicode-display_width (>= 1.4.0, <
|
71
|
-
rubocop-
|
93
|
+
unicode-display_width (>= 1.4.0, < 2.0)
|
94
|
+
rubocop-ast (0.0.3)
|
95
|
+
parser (>= 2.7.0.1)
|
96
|
+
rubocop-performance (1.6.1)
|
72
97
|
rubocop (>= 0.71.0)
|
73
|
-
rubocop-rspec (1.
|
98
|
+
rubocop-rspec (1.40.0)
|
74
99
|
rubocop (>= 0.68.1)
|
75
100
|
ruby-progressbar (1.10.1)
|
76
|
-
sanitize (5.1
|
101
|
+
sanitize (5.2.1)
|
77
102
|
crass (~> 1.0.2)
|
78
103
|
nokogiri (>= 1.8.0)
|
79
104
|
nokogumbo (~> 2.0)
|
80
|
-
simplecov (0.
|
105
|
+
simplecov (0.18.5)
|
81
106
|
docile (~> 1.1)
|
82
|
-
|
83
|
-
|
84
|
-
|
107
|
+
simplecov-html (~> 0.11)
|
108
|
+
simplecov-html (0.12.2)
|
109
|
+
term-ansicolor (1.2.2)
|
110
|
+
tins (~> 0.8)
|
111
|
+
thor (0.18.1)
|
85
112
|
thread_safe (0.3.6)
|
86
|
-
|
113
|
+
tins (0.13.2)
|
114
|
+
to_regexp (0.2.1)
|
115
|
+
tzinfo (1.2.7)
|
87
116
|
thread_safe (~> 0.1)
|
88
|
-
unicode-display_width (1.
|
89
|
-
vcr (
|
90
|
-
yard (0.9.
|
117
|
+
unicode-display_width (1.7.0)
|
118
|
+
vcr (6.0.0)
|
119
|
+
yard (0.9.25)
|
120
|
+
zeitwerk (2.3.0)
|
91
121
|
|
92
122
|
PLATFORMS
|
93
123
|
ruby
|
94
124
|
|
95
125
|
DEPENDENCIES
|
96
|
-
bundler
|
126
|
+
bundler
|
97
127
|
byebug
|
128
|
+
coveralls
|
98
129
|
html2rss!
|
99
130
|
rspec (~> 3.0)
|
100
131
|
rubocop
|
@@ -105,4 +136,4 @@ DEPENDENCIES
|
|
105
136
|
yard
|
106
137
|
|
107
138
|
BUNDLED WITH
|
108
|
-
1.
|
139
|
+
2.1.4
|
data/README.md
CHANGED
@@ -2,63 +2,375 @@
|
|
2
2
|
|
3
3
|
[![Build Status](https://travis-ci.org/gildesmarais/html2rss.svg?branch=master)](https://travis-ci.org/gildesmarais/html2rss)
|
4
4
|
[![Gem Version](https://badge.fury.io/rb/html2rss.svg)](http://rubygems.org/gems/html2rss/)
|
5
|
-
[
|
5
|
+
[![Coverage Status](https://coveralls.io/repos/github/gildesmarais/html2rss/badge.svg?branch=master)](https://coveralls.io/github/gildesmarais/html2rss?branch=master)
|
6
|
+
[![Yard Docs](http://img.shields.io/badge/yard-docs-blue.svg)](https://www.rubydoc.info/gems/html2rss)
|
7
|
+
![Retro Badge: valid RSS](https://validator.w3.org/feed/images/valid-rss-rogers.png)
|
8
|
+
[![](http://img.shields.io/liberapay/goal/gildesmarais.svg?logo=liberapay)](https://liberapay.com/gildesmarais/donate)
|
6
9
|
|
7
|
-
|
10
|
+
**Searching for a ready-to-use app which serves generated feeds via HTTP?**
|
11
|
+
[Head over to `html2rss-web`!](https://github.com/gildesmarais/html2rss-web)
|
8
12
|
|
9
|
-
|
10
|
-
[Check out `html2rss-web`!](https://github.com/gildesmarais/html2rss-web)
|
13
|
+
This Ruby gem builds RSS 2.0 feeds from a _feed config_.
|
11
14
|
|
12
|
-
|
13
|
-
CSS selectors
|
14
|
-
|
15
|
-
|
15
|
+
With the _feed config_ containing the URL to scrape and
|
16
|
+
CSS selectors for information extraction (like title, URL, ...), `html2rss` builds your RSS feed.
|
17
|
+
[Extractors](#using-extractors) and chain-able [post processors](#using-post-processors)
|
18
|
+
make information extraction, processing and sanitizing a breeze.
|
19
|
+
[Scraping JSON](#scraping-and-handling-json-responses) responses and
|
20
|
+
[setting HTTP request headers](#set-any-http-header-in-the-request) is
|
21
|
+
supported, too.
|
16
22
|
|
17
23
|
## Installation
|
18
24
|
|
19
|
-
|
20
|
-
|
25
|
+
| 🤩 Like it? | Star it! ⭐️ |
|
26
|
+
| ---------------------------------------------: | -------------------- |
|
27
|
+
| Add this line to your application's `Gemfile`: | `gem 'html2rss'` |
|
28
|
+
| Then execute: | `bundle` |
|
29
|
+
| In your code: | `require 'html2rss'` |
|
30
|
+
|
31
|
+
😍 Love it? Feel free [to donate](https://liberapay.com/gildesmarais/donate). Thank you! 💓
|
32
|
+
|
33
|
+
## Building a feed config
|
34
|
+
|
35
|
+
Here's a minimal working example:
|
36
|
+
|
37
|
+
```ruby
|
38
|
+
require 'html2rss'
|
39
|
+
|
40
|
+
rss =
|
41
|
+
Html2rss.feed(
|
42
|
+
channel: { url: 'https://stackoverflow.com/questions' },
|
43
|
+
selectors: {
|
44
|
+
items: { selector: '#hot-network-questions > ul > li' },
|
45
|
+
title: { selector: 'a' },
|
46
|
+
link: { selector: 'a', extractor: 'href' }
|
47
|
+
}
|
48
|
+
)
|
49
|
+
|
50
|
+
puts rss
|
51
|
+
```
|
52
|
+
|
53
|
+
A _feed config_ consists of a `channel` and a `selectors` Hash.
|
54
|
+
The contents of both hashes are explained below.
|
55
|
+
|
56
|
+
**Looks too complicated?** See [`html2rss-configs`](https://github.com/gildesmarais/html2rss-configs) for ready-made feed configs!
|
57
|
+
|
58
|
+
### The `channel`
|
59
|
+
|
60
|
+
| attribute | | type | default | remark |
|
61
|
+
| ------------- | -------- | ------- | -------------: | ------------------------------------------ |
|
62
|
+
| `url` | required | String | | |
|
63
|
+
| `title` | optional | String | auto-generated | |
|
64
|
+
| `description` | optional | String | auto-generated | |
|
65
|
+
| `ttl` | optional | Integer | `360` | TTL in _minutes_ |
|
66
|
+
| `time_zone` | optional | String | `'UTC'` | TimeZone name |
|
67
|
+
| `language` | optional | String | `'en'` | Language code |
|
68
|
+
| `author` | optional | String | | Format: `email (Name)` |
|
69
|
+
| `headers` | optional | Hash | `{}` | Set HTTP request headers. See notes below. |
|
70
|
+
| `json` | optional | Boolean | `false` | Handle JSON response. See notes below. |
|
71
|
+
|
72
|
+
### The `selectors`
|
73
|
+
|
74
|
+
You must provide an `items` selector hash which contains the CSS selector.
|
75
|
+
`items` needs to return a collection of HTML tags.
|
76
|
+
The other selectors are scoped to the tags of the items' collection.
|
77
|
+
|
78
|
+
To build a
|
79
|
+
[valid RSS 2.0 item](http://www.rssboard.org/rss-profile#element-channel-item)
|
80
|
+
each item has to have at least a `title` or a `description`.
|
81
|
+
|
82
|
+
Your `selectors` can contain arbitrary selector names, but only these
|
83
|
+
will make it into the RSS feed:
|
84
|
+
|
85
|
+
| RSS 2.0 tag | name in `html2rss` | remark |
|
86
|
+
| ------------- | ------------------ | --------------------------- |
|
87
|
+
| `title` | `title` | |
|
88
|
+
| `description` | `description` | Supports HTML. |
|
89
|
+
| `link` | `link` | A URL. |
|
90
|
+
| `author` | `author` | |
|
91
|
+
| `category` | `categories` | See notes below. |
|
92
|
+
| `enclosure` | `enclosure` | See notes below. |
|
93
|
+
| `pubDate` | `update` | An instance of `Time`. |
|
94
|
+
| `guid` | `guid` | Generated from the `title`. |
|
95
|
+
| `comments` | `comments` | A URL. |
|
96
|
+
| `source` | ~~source~~ | Not yet supported. |
|
97
|
+
|
98
|
+
### The `selector` hash
|
99
|
+
|
100
|
+
Your selector hash can have these attributes:
|
101
|
+
|
102
|
+
| name | value |
|
103
|
+
| -------------- | -------------------------------------------------------- |
|
104
|
+
| `selector` | The CSS selector to select the tag with the information. |
|
105
|
+
| `extractor` | Name of the extractor. See notes below. |
|
106
|
+
| `post_process` | A hash or array of hashes. See notes below. |
|
107
|
+
|
108
|
+
#### Reverse ordering of items
|
109
|
+
|
110
|
+
The `items` selector hash can have an `order` attribute.
|
111
|
+
If the value is `reverse` the order of items in the RSS will be reversed.
|
112
|
+
|
113
|
+
<details>
|
114
|
+
<summary>See a YAML feed config example</summary>
|
115
|
+
|
116
|
+
```yml
|
117
|
+
channel:
|
118
|
+
# ... omitted
|
119
|
+
selectors:
|
120
|
+
items:
|
121
|
+
selector: 'ul > li'
|
122
|
+
order: 'reverse'
|
123
|
+
# ... omitted
|
124
|
+
```
|
125
|
+
|
126
|
+
</details>
|
127
|
+
|
128
|
+
## Using extractors
|
129
|
+
|
130
|
+
Extractors help with extracting the information from the selected HTML tag.
|
131
|
+
|
132
|
+
- The default extractor is `text`, which returns the tag's inner text.
|
133
|
+
- The `html` extractor returns the tag's outer HTML.
|
134
|
+
- The `href` extractor returns a URL from the tag's `href` attribute and corrects relative ones to absolute ones.
|
135
|
+
- The `attribute` extractor returns the value of that tag's attribute.
|
136
|
+
- The `static` extractor returns the configured static value (it doesn't extract anything).
|
137
|
+
- [See file list of extractors](https://github.com/gildesmarais/html2rss/tree/master/lib/html2rss/item_extractors).
|
138
|
+
|
139
|
+
Extractors can require additional attributes on the selector hash.
|
140
|
+
👉 [Read their docs for usage examples](https://www.rubydoc.info/gems/html2rss/Html2rss/ItemExtractors).
|
141
|
+
|
142
|
+
<details>
|
143
|
+
<summary>See a Ruby example</summary>
|
144
|
+
|
145
|
+
```ruby
|
146
|
+
Html2rss.feed(
|
147
|
+
channel: {}, selectors: { link: { selector: 'a', extractor: 'href' } }
|
148
|
+
)
|
149
|
+
```
|
150
|
+
|
151
|
+
</details>
|
152
|
+
|
153
|
+
<details>
|
154
|
+
<summary>See a YAML feed config example</summary>
|
155
|
+
|
156
|
+
```yml
|
157
|
+
channel:
|
158
|
+
# ... omitted
|
159
|
+
selectors:
|
160
|
+
# ... omitted
|
161
|
+
link:
|
162
|
+
selector: 'a'
|
163
|
+
extractor: 'href'
|
164
|
+
```
|
165
|
+
|
166
|
+
</details>
|
167
|
+
|
168
|
+
## Using post processors
|
169
|
+
|
170
|
+
Extracted information can be further manipulated with post processors.
|
171
|
+
|
172
|
+
| name | |
|
173
|
+
| ------------------ | ------------------------------------------------------------------------------------- |
|
174
|
+
| `gsub` | Allows global substitution operations on Strings (Regexp or simple pattern). |
|
175
|
+
| `html_to_markdown` | HTML to Markdown, using [reverse_markdown](https://github.com/xijo/reverse_markdown). |
|
176
|
+
| `markdown_to_html` | Converts Markdown to HTML, using [kramdown](https://github.com/gettalong/kramdown). |
|
177
|
+
| `parse_time` | Parses a String containing a time in a time zone. |
|
178
|
+
| `parse_uri` | Parses a String as URL. |
|
179
|
+
| `sanitize_html` | Strips unsafe and unneeded HTML and adds security-related attributes. |
|
180
|
+
| `substring` | Cuts a part off of a String, starting at a position. |
|
181
|
+
| `template` | Based on a template, it creates a new String filled with other selectors' values. |
|
182
|
+
|
183
|
+
⚠️ Always make use of the `sanitize_html` post processor for HTML content. _Never trust the internet!_ ⚠️
|
184
|
+
|
185
|
+
- [See file list of post processors](https://github.com/gildesmarais/html2rss/tree/master/lib/html2rss/attribute_post_processors).
|
186
|
+
|
187
|
+
👉 [Read their docs for usage examples.](https://www.rubydoc.info/gems/html2rss/Html2rss/AttributePostProcessors)
|
188
|
+
|
189
|
+
<details>
|
190
|
+
<summary>See a Ruby example</summary>
|
21
191
|
|
22
192
|
```ruby
|
23
|
-
|
24
|
-
channel: {
|
193
|
+
Html2rss.feed(
|
194
|
+
channel: {},
|
25
195
|
selectors: {
|
26
|
-
|
27
|
-
|
28
|
-
|
196
|
+
description: {
|
197
|
+
selector: '.content', post_process: { name: 'sanitize_html' }
|
198
|
+
}
|
29
199
|
}
|
30
200
|
)
|
201
|
+
```
|
202
|
+
|
203
|
+
</details>
|
204
|
+
|
205
|
+
<details>
|
206
|
+
<summary>See a YAML feed config example</summary>
|
31
207
|
|
32
|
-
|
208
|
+
```yml
|
209
|
+
channel:
|
210
|
+
# ... omitted
|
211
|
+
selectors:
|
212
|
+
# ... omitted
|
213
|
+
description:
|
214
|
+
selector: '.content'
|
215
|
+
post_process:
|
216
|
+
- name: sanitize_html
|
33
217
|
```
|
34
218
|
|
35
|
-
|
219
|
+
</details>
|
220
|
+
|
221
|
+
### Chaining post processors
|
222
|
+
|
223
|
+
Pass an array to `post_process` to chain the post processors.
|
224
|
+
|
225
|
+
<details>
|
226
|
+
<summary>YAML example: build the description from a template String (in Markdown) and convert that Markdown to HTML</summary>
|
227
|
+
|
228
|
+
```yml
|
229
|
+
channel:
|
230
|
+
# ... omitted
|
231
|
+
selectors:
|
232
|
+
# ... omitted
|
233
|
+
price:
|
234
|
+
selector: '.price'
|
235
|
+
description:
|
236
|
+
selector: '.section'
|
237
|
+
post_process:
|
238
|
+
- name: template
|
239
|
+
string: |
|
240
|
+
# %{self}
|
241
|
+
|
242
|
+
Price: %{price}
|
243
|
+
- name: markdown_to_html
|
244
|
+
```
|
245
|
+
|
246
|
+
Note the use of `|` for a multi-line String in YAML.
|
247
|
+
|
248
|
+
</details>
|
249
|
+
|
250
|
+
## Adding `<category>` tags to an item
|
251
|
+
|
252
|
+
The `categories` selector takes an array of selector names. Each value of those
|
253
|
+
selectors will become a `<category>` on the RSS item.
|
254
|
+
|
255
|
+
<details>
|
256
|
+
<summary>See a Ruby example</summary>
|
257
|
+
|
258
|
+
```ruby
|
259
|
+
Html2rss.feed(
|
260
|
+
channel: {},
|
261
|
+
selectors: {
|
262
|
+
genre: {
|
263
|
+
# ... omitted
|
264
|
+
selector: '.genre'
|
265
|
+
},
|
266
|
+
branch: { selector: '.branch' },
|
267
|
+
categories: %i[genre branch]
|
268
|
+
}
|
269
|
+
)
|
270
|
+
```
|
271
|
+
|
272
|
+
</details>
|
273
|
+
|
274
|
+
<details>
|
275
|
+
<summary>See a YAML feed config example</summary>
|
276
|
+
|
277
|
+
```yml
|
278
|
+
channel:
|
279
|
+
# ... omitted
|
280
|
+
selectors:
|
281
|
+
# ... omitted
|
282
|
+
genre:
|
283
|
+
selector: ".genre"
|
284
|
+
branch:
|
285
|
+
selector: ".branch"
|
286
|
+
categories:
|
287
|
+
- genre
|
288
|
+
- branch
|
289
|
+
```
|
290
|
+
|
291
|
+
</details>
|
36
292
|
|
37
|
-
|
293
|
+
## Adding an `<enclosure>` tag to an item
|
38
294
|
|
39
|
-
|
295
|
+
An enclosure can be any file, e.g. an image, audio or video.
|
40
296
|
|
41
|
-
|
297
|
+
The `enclosure` selector needs to return a URL of the content to enclose. If the extracted URL is relative, it will be converted to an absolute one using the channel's URL as base.
|
42
298
|
|
43
|
-
|
299
|
+
Since `html2rss` does no further inspection of the enclosure, its support comes with trade-offs:
|
44
300
|
|
45
|
-
|
301
|
+
1. The content-type is guessed from the file extension of the URL.
|
302
|
+
2. If the content-type guessing fails, it will default to `application/octet-stream`.
|
303
|
+
3. The content-length will always be undetermined and thus stated as `0` bytes.
|
304
|
+
|
305
|
+
Read the [RSS 2.0 spec](http://www.rssboard.org/rss-profile#element-channel-item-enclosure) for further information on enclosing content.
|
306
|
+
|
307
|
+
<details>
|
308
|
+
<summary>See a Ruby example</summary>
|
309
|
+
|
310
|
+
```ruby
|
311
|
+
Html2rss.feed(
|
312
|
+
channel: {},
|
313
|
+
selectors: {
|
314
|
+
enclosure: { selector: 'img', extractor: 'attribute', attribute: 'src' }
|
315
|
+
}
|
316
|
+
)
|
317
|
+
```
|
46
318
|
|
47
|
-
|
319
|
+
</details>
|
320
|
+
|
321
|
+
<details>
|
322
|
+
<summary>See a YAML feed config example</summary>
|
323
|
+
|
324
|
+
```yml
|
325
|
+
channel:
|
326
|
+
# ... omitted
|
327
|
+
selectors:
|
328
|
+
# ... omitted
|
329
|
+
enclosure:
|
330
|
+
selector: "img"
|
331
|
+
extractor: "attribute"
|
332
|
+
attribute: "src"
|
333
|
+
```
|
334
|
+
|
335
|
+
</details>
|
336
|
+
|
337
|
+
## Scraping and handling JSON responses
|
338
|
+
|
339
|
+
Although this gem is called **html***2rss*, it's possible to scrape and process JSON.
|
48
340
|
|
49
341
|
Adding `json: true` to the channel config will convert the JSON response to XML.
|
50
342
|
|
51
|
-
|
343
|
+
<details>
|
344
|
+
<summary>See a Ruby example</summary>
|
345
|
+
|
346
|
+
```ruby
|
347
|
+
Html2rss.feed(
|
348
|
+
channel: {
|
349
|
+
url: 'https://example.com', json: true
|
350
|
+
},
|
351
|
+
selectors: {} # ... omitted
|
352
|
+
)
|
353
|
+
```
|
354
|
+
|
355
|
+
</details>
|
356
|
+
|
357
|
+
<details>
|
358
|
+
<summary>See a YAML feed config example</summary>
|
52
359
|
|
53
360
|
```yaml
|
54
361
|
channel:
|
55
362
|
url: https://example.com
|
56
|
-
title: "Example with JSON"
|
57
363
|
json: true
|
58
|
-
|
364
|
+
selectors:
|
365
|
+
# ... omitted
|
59
366
|
```
|
60
367
|
|
61
|
-
|
368
|
+
</details>
|
369
|
+
|
370
|
+
<details>
|
371
|
+
<summary>See example of a converted JSON object</summary>
|
372
|
+
|
373
|
+
This JSON object:
|
62
374
|
|
63
375
|
```json
|
64
376
|
{
|
@@ -66,58 +378,160 @@ Imagine this HTTP response:
|
|
66
378
|
}
|
67
379
|
```
|
68
380
|
|
69
|
-
|
381
|
+
converts to:
|
70
382
|
|
71
383
|
```xml
|
72
|
-
<
|
384
|
+
<hash>
|
73
385
|
<data>
|
74
386
|
<datum>
|
75
387
|
<title>Headline</title>
|
76
388
|
<url>https://example.com</url>
|
77
389
|
</datum>
|
78
390
|
</data>
|
79
|
-
</
|
391
|
+
</hash>
|
392
|
+
```
|
393
|
+
|
394
|
+
Your items selector would be `data > datum`, the item's `link` selector would be `url`.
|
395
|
+
|
396
|
+
Find further information in [ActiveSupport's `Hash.to_xml` documentation](https://apidock.com/rails/Hash/to_xml).
|
397
|
+
|
398
|
+
</details>
|
399
|
+
|
400
|
+
<details>
|
401
|
+
<summary>See example of a converted JSON array</summary>
|
402
|
+
|
403
|
+
This JSON array:
|
404
|
+
|
405
|
+
```json
|
406
|
+
[{ "title": "Headline", "url": "https://example.com" }]
|
80
407
|
```
|
81
408
|
|
82
|
-
|
409
|
+
converts to:
|
83
410
|
|
84
|
-
|
411
|
+
```xml
|
412
|
+
<objects>
|
413
|
+
<object>
|
414
|
+
<title>Headline</title>
|
415
|
+
<url>https://example.com</url>
|
416
|
+
</object>
|
417
|
+
</objects>
|
418
|
+
```
|
419
|
+
|
420
|
+
Your items selector would be `objects > object`, the item's `link` selector would be `url`.
|
421
|
+
|
422
|
+
Find further information in [ActiveSupport's `Array.to_xml` documentation](https://apidock.com/rails/Array/to_xml).
|
423
|
+
|
424
|
+
</details>
|
85
425
|
|
86
426
|
## Set any HTTP header in the request
|
87
427
|
|
88
428
|
You can add any HTTP headers to the request to the channel URL.
|
89
|
-
|
429
|
+
Use this to e.g. have Cookie or Authorization information sent or to spoof the User-Agent.
|
430
|
+
|
431
|
+
<details>
|
432
|
+
<summary>See a Ruby example</summary>
|
433
|
+
|
434
|
+
```ruby
|
435
|
+
Html2rss.feed(
|
436
|
+
channel: {
|
437
|
+
url: 'https://example.com',
|
438
|
+
headers: {
|
439
|
+
"User-Agent": "html2rss-request",
|
440
|
+
"X-Something": "Foobar",
|
441
|
+
"Authorization": "Token deadbea7",
|
442
|
+
"Cookie": "monster=MeWantCookie"
|
443
|
+
}
|
444
|
+
},
|
445
|
+
selectors: {}
|
446
|
+
)
|
447
|
+
```
|
448
|
+
|
449
|
+
</details>
|
450
|
+
|
451
|
+
<details>
|
452
|
+
<summary>See a YAML feed config example</summary>
|
90
453
|
|
91
454
|
```yaml
|
92
455
|
channel:
|
93
456
|
url: https://example.com
|
94
|
-
title: "Example with http headers"
|
95
457
|
headers:
|
96
458
|
"User-Agent": "html2rss-request"
|
97
459
|
"X-Something": "Foobar"
|
98
460
|
"Authorization": "Token deadbea7"
|
99
461
|
"Cookie": "monster=MeWantCookie"
|
100
|
-
|
462
|
+
selectors:
|
463
|
+
# ...
|
101
464
|
```
|
102
465
|
|
103
|
-
|
466
|
+
</details>
|
104
467
|
|
105
|
-
|
468
|
+
The headers provided by the channel are merged into the global headers.
|
106
469
|
|
107
|
-
|
470
|
+
## Usage with a YAML config file
|
108
471
|
|
109
|
-
|
472
|
+
This step is not required to work with this gem. If you're using
|
473
|
+
[`html2rss-web`](https://github.com/gildesmarais/html2rss-web)
|
474
|
+
and want to create your private feed configs, keep on reading!
|
475
|
+
|
476
|
+
First, create your YAML file, e.g. called `feeds.yml`.
|
477
|
+
This file will contain your global config and feed configs.
|
478
|
+
|
479
|
+
Example:
|
480
|
+
|
481
|
+
```yml
|
482
|
+
headers:
|
483
|
+
'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1"
|
484
|
+
feeds:
|
485
|
+
myfeed:
|
486
|
+
channel:
|
487
|
+
selectors:
|
488
|
+
myotherfeed:
|
489
|
+
channel:
|
490
|
+
selectors:
|
491
|
+
```
|
110
492
|
|
111
|
-
|
493
|
+
Your feed configs go below `feeds`. Everything else is part of the global config.
|
112
494
|
|
113
|
-
|
495
|
+
Build your feeds like this:
|
496
|
+
|
497
|
+
```ruby
|
498
|
+
require 'html2rss'
|
499
|
+
|
500
|
+
myfeed = Html2rss.feed_from_yaml_config('feeds.yml', 'myfeed')
|
501
|
+
myotherfeed = Html2rss.feed_from_yaml_config('feeds.yml', 'myotherfeed')
|
502
|
+
```
|
503
|
+
|
504
|
+
Find a full example of a `feeds.yml` at [`spec/config.test.yml`](https://github.com/gildesmarais/html2rss/blob/master/spec/config.test.yml).
|
505
|
+
|
506
|
+
## Gotchas and tips & tricks
|
507
|
+
|
508
|
+
- Check that the channel URL does not redirect to a mobile page with a different markup structure.
|
509
|
+
- Do not rely on your web browser's developer console. `html2rss` does not execute JavaScript.
|
510
|
+
- Fiddling with [`curl`](https://github.com/curl/curl) and [`pup`](https://github.com/ericchiang/pup) to find the selectors seems efficient (`curl URL | pup`).
|
511
|
+
- [CSS selectors are quite versatile, here's an overview.](https://www.w3.org/TR/selectors-4/#overview)
|
512
|
+
|
513
|
+
## Development
|
514
|
+
|
515
|
+
After checking out the repository, run `bin/setup` to install dependencies. Then, run `bundle exec rspec` to run the tests.
|
516
|
+
You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
517
|
+
|
518
|
+
<details>
|
519
|
+
<summary>Releasing a new version</summary>
|
114
520
|
|
115
521
|
1. `git pull`
|
116
522
|
2. increase version in `lib/html2rss/version.rb`
|
117
523
|
3. `bundle`
|
118
|
-
4.
|
119
|
-
5. `
|
120
|
-
6.
|
121
|
-
7. `git
|
122
|
-
8. `
|
123
|
-
9. `git
|
524
|
+
4. `git add Gemfile.lock lib/html2rss/version.rb`
|
525
|
+
5. `VERSION=$(ruby -e 'require "./lib/html2rss/version.rb"; puts Html2rss::VERSION')`
|
526
|
+
6. `git commit -m "chore: release $VERSION"`
|
527
|
+
7. `git tag v$VERSION`
|
528
|
+
8. [`standard-changelog -f`](https://github.com/conventional-changelog/conventional-changelog/tree/master/packages/standard-changelog)
|
529
|
+
9. `git add CHANGELOG.md && git commit --amend`
|
530
|
+
10. `git tag v$VERSION -f`
|
531
|
+
11. `git push && git push --tags`
|
532
|
+
|
533
|
+
</details>
|
534
|
+
|
535
|
+
## Contributing
|
536
|
+
|
537
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/gildesmarais/html2rss.
|