html2rss 0.7.0 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/Gemfile +2 -0
- data/Gemfile.lock +20 -4
- data/README.md +370 -57
- data/html2rss.gemspec +1 -0
- data/lib/html2rss/attribute_post_processors.rb +1 -3
- data/lib/html2rss/attribute_post_processors/gsub.rb +5 -3
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +48 -0
- data/lib/html2rss/attribute_post_processors/parse_time.rb +1 -1
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +1 -0
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +10 -8
- data/lib/html2rss/attribute_post_processors/substring.rb +9 -3
- data/lib/html2rss/attribute_post_processors/template.rb +17 -11
- data/lib/html2rss/item_extractors.rb +2 -4
- data/lib/html2rss/utils.rb +6 -0
- data/lib/html2rss/version.rb +1 -1
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 38b547bdfe0799d71348690ae20d29d74d88c3cbb0828cdd807a382dec14f895
|
4
|
+
data.tar.gz: 8e9ae611d6bbf0174bd9038c3aa7a7b214ec8ffa152e990b2224e98d5022bec0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c0c4dc94d9e339d054ec4ee1c586ac21355cdfa1c9664dd05ef01ca474668f7e483baec35bb8abfd1971eff296b7b5c270816c992ec9fc6abd51a89a7ce28000
|
7
|
+
data.tar.gz: 3f45fb28e10360055ead9b757b4972bd2c52ad3285b94b506c3a6ce66b7bd5ec6364da63cc83b066fab6bc69396ed7408108536b5b9178d4a4a34c28436b4d2d
|
data/CHANGELOG.md
CHANGED
@@ -1,4 +1,14 @@
|
|
1
|
-
# [](https://github.com/gildesmarais/html2rss/compare/v0.
|
1
|
+
# [](https://github.com/gildesmarais/html2rss/compare/v0.8.0...v) (2019-11-02)
|
2
|
+
|
3
|
+
|
4
|
+
|
5
|
+
# [0.8.0](https://github.com/gildesmarais/html2rss/compare/v0.7.0...v0.8.0) (2019-11-02)
|
6
|
+
|
7
|
+
|
8
|
+
### Features
|
9
|
+
|
10
|
+
* **post_processors:** add markdown to html ([#54](https://github.com/gildesmarais/html2rss/issues/54)) ([cdf77b8](https://github.com/gildesmarais/html2rss/commit/cdf77b8))
|
11
|
+
* **post_processors:** support annotated tokens ([#62](https://github.com/gildesmarais/html2rss/issues/62)) ([b57bd7b](https://github.com/gildesmarais/html2rss/commit/b57bd7b)), closes [#56](https://github.com/gildesmarais/html2rss/issues/56)
|
2
12
|
|
3
13
|
|
4
14
|
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
html2rss (0.
|
4
|
+
html2rss (0.8.0)
|
5
5
|
activesupport (~> 5.0)
|
6
6
|
builder
|
7
7
|
faraday (~> 0.15)
|
8
8
|
faraday_middleware (~> 0.13)
|
9
9
|
hashie (~> 3.6)
|
10
|
+
kramdown
|
10
11
|
mime-types (> 3.0)
|
11
12
|
nokogiri (>= 1.10, < 2.0)
|
12
13
|
reverse_markdown (~> 1.3)
|
@@ -26,6 +27,12 @@ GEM
|
|
26
27
|
builder (3.2.3)
|
27
28
|
byebug (11.0.1)
|
28
29
|
concurrent-ruby (1.1.5)
|
30
|
+
coveralls (0.7.2)
|
31
|
+
multi_json (~> 1.3)
|
32
|
+
rest-client (= 1.6.7)
|
33
|
+
simplecov (>= 0.7)
|
34
|
+
term-ansicolor (= 1.2.2)
|
35
|
+
thor (= 0.18.1)
|
29
36
|
crass (1.0.5)
|
30
37
|
diff-lcs (1.3)
|
31
38
|
docile (1.3.2)
|
@@ -38,13 +45,15 @@ GEM
|
|
38
45
|
concurrent-ruby (~> 1.0)
|
39
46
|
jaro_winkler (1.5.3)
|
40
47
|
json (2.2.0)
|
48
|
+
kramdown (2.1.0)
|
41
49
|
mime-types (3.3)
|
42
50
|
mime-types-data (~> 3.2015)
|
43
51
|
mime-types-data (3.2019.1009)
|
44
52
|
mini_portile2 (2.4.0)
|
45
|
-
minitest (5.
|
53
|
+
minitest (5.13.0)
|
54
|
+
multi_json (1.14.1)
|
46
55
|
multipart-post (2.1.1)
|
47
|
-
nokogiri (1.10.
|
56
|
+
nokogiri (1.10.5)
|
48
57
|
mini_portile2 (~> 2.4.0)
|
49
58
|
nokogumbo (2.0.1)
|
50
59
|
nokogiri (~> 1.8, >= 1.8.4)
|
@@ -52,6 +61,8 @@ GEM
|
|
52
61
|
parser (2.6.5.0)
|
53
62
|
ast (~> 2.4.0)
|
54
63
|
rainbow (3.0.0)
|
64
|
+
rest-client (1.6.7)
|
65
|
+
mime-types (>= 1.16)
|
55
66
|
reverse_markdown (1.3.0)
|
56
67
|
nokogiri
|
57
68
|
rspec (3.9.0)
|
@@ -88,14 +99,18 @@ GEM
|
|
88
99
|
json (>= 1.8, < 3)
|
89
100
|
simplecov-html (~> 0.10.0)
|
90
101
|
simplecov-html (0.10.2)
|
102
|
+
term-ansicolor (1.2.2)
|
103
|
+
tins (~> 0.8)
|
104
|
+
thor (0.18.1)
|
91
105
|
thread_safe (0.3.6)
|
106
|
+
tins (0.13.2)
|
92
107
|
to_regexp (0.2.1)
|
93
108
|
tzinfo (1.2.5)
|
94
109
|
thread_safe (~> 0.1)
|
95
110
|
unicode-display_width (1.6.0)
|
96
111
|
vcr (5.0.0)
|
97
112
|
yard (0.9.20)
|
98
|
-
zeitwerk (2.2.
|
113
|
+
zeitwerk (2.2.1)
|
99
114
|
|
100
115
|
PLATFORMS
|
101
116
|
ruby
|
@@ -103,6 +118,7 @@ PLATFORMS
|
|
103
118
|
DEPENDENCIES
|
104
119
|
bundler (~> 1.16)
|
105
120
|
byebug
|
121
|
+
coveralls
|
106
122
|
html2rss!
|
107
123
|
rspec (~> 3.0)
|
108
124
|
rubocop
|
data/README.md
CHANGED
@@ -2,27 +2,44 @@
|
|
2
2
|
|
3
3
|
[![Build Status](https://travis-ci.org/gildesmarais/html2rss.svg?branch=master)](https://travis-ci.org/gildesmarais/html2rss)
|
4
4
|
[![Gem Version](https://badge.fury.io/rb/html2rss.svg)](http://rubygems.org/gems/html2rss/)
|
5
|
-
[
|
5
|
+
[![Coverage Status](https://coveralls.io/repos/github/gildesmarais/html2rss/badge.svg?branch=master)](https://coveralls.io/github/gildesmarais/html2rss?branch=master)
|
6
|
+
[![Yard Docs](http://img.shields.io/badge/yard-docs-blue.svg)](https://www.rubydoc.info/gems/html2rss)
|
7
|
+
![Retro Badge: valid RSS](https://validator.w3.org/feed/images/valid-rss-rogers.png)
|
6
8
|
|
7
|
-
|
9
|
+
**Searching for a ready to use app which serves generated feeds via HTTP?**
|
10
|
+
[Head over to `html2rss-web`!](https://github.com/gildesmarais/html2rss-web)
|
8
11
|
|
9
|
-
|
10
|
-
[Check out `html2rss-web`!](https://github.com/gildesmarais/html2rss-web)
|
12
|
+
This Ruby gem builds RSS 2.0 feeds from a _feed config_.
|
11
13
|
|
12
|
-
|
13
|
-
CSS selectors
|
14
|
-
|
15
|
-
|
14
|
+
With the _feed config_ containing the URL to scrape and
|
15
|
+
CSS selectors for information extraction (like title, URL, ...) your RSS builds.
|
16
|
+
[Extractors](#using-extractors) and chain-able [post processors](#using-post-processors)
|
17
|
+
make information extraction, processing and sanitizing a breeze.
|
18
|
+
[Scraping JSON](#scraping-json) responses and
|
19
|
+
[setting HTTP request headers](#set-any-http-header-in-the-request) is
|
20
|
+
supported, too.
|
16
21
|
|
17
22
|
## Installation
|
18
23
|
|
19
|
-
|
20
|
-
|
24
|
+
| 🤩 Like it? | Star it! ⭐️ |
|
25
|
+
| ---------------------------------------------: | -------------------- |
|
26
|
+
| Add this line to your application's `Gemfile`: | `gem 'html2rss'` |
|
27
|
+
| Then execute: | `bundle` |
|
28
|
+
| In your code: | `require 'html2rss'` |
|
29
|
+
|
30
|
+
## Building a feed config
|
31
|
+
|
32
|
+
Here's a minimal working example:
|
21
33
|
|
22
34
|
```ruby
|
35
|
+
require 'html2rss'
|
36
|
+
|
23
37
|
rss =
|
24
38
|
Html2rss.feed(
|
25
|
-
channel: {
|
39
|
+
channel: {
|
40
|
+
title: 'StackOverflow: Hot Network Questions',
|
41
|
+
url: 'https://stackoverflow.com/questions'
|
42
|
+
},
|
26
43
|
selectors: {
|
27
44
|
items: { selector: '#hot-network-questions > ul > li' },
|
28
45
|
title: { selector: 'a' },
|
@@ -30,35 +47,218 @@ rss =
|
|
30
47
|
}
|
31
48
|
)
|
32
49
|
|
33
|
-
puts rss
|
50
|
+
puts rss
|
34
51
|
```
|
35
52
|
|
36
|
-
|
53
|
+
A _feed config_ consists of a `channel` and a `selectors` Hash.
|
54
|
+
The contents of both hashes are explained below.
|
55
|
+
|
56
|
+
**Looks too complicated?** See [`html2rss-configs`](https://github.com/gildesmarais/html2rss-configs) for ready-made feed configs!
|
57
|
+
|
58
|
+
### The `channel`
|
59
|
+
|
60
|
+
| attribute | | type | remark |
|
61
|
+
| ------------- | -------- | ------- | ----------------------- |
|
62
|
+
| `title` | required | String | |
|
63
|
+
| `url` | required | String | |
|
64
|
+
| `ttl` | optional | Integer | time to live in minutes |
|
65
|
+
| `description` | optional | String | |
|
66
|
+
| `headers` | optional | Hash | See notes below. |
|
67
|
+
|
68
|
+
### The `selectors`
|
69
|
+
|
70
|
+
You must provide an `items` selector hash which contains the CSS selector.
|
71
|
+
`items` needs to return a collection of HTML tags.
|
72
|
+
The other selectors are scoped to the tags of the items' collection.
|
73
|
+
|
74
|
+
To build a
|
75
|
+
[valid RSS 2.0 item](http://www.rssboard.org/rss-profile#element-channel-item)
|
76
|
+
each item has to have at least a `title` or a `description`.
|
77
|
+
|
78
|
+
Your `selectors` can contain arbitrary selector names, but only these
|
79
|
+
will make it into the RSS feed:
|
80
|
+
|
81
|
+
| RSS 2.0 tag | name in html2rss | remark |
|
82
|
+
| ------------- | ---------------- | --------------------------- |
|
83
|
+
| `title` | `title` | |
|
84
|
+
| `description` | `description` | Supports HTML. |
|
85
|
+
| `link` | `link` | A URL. |
|
86
|
+
| `author` | `author` | |
|
87
|
+
| `category` | `categories` | See notes below. |
|
88
|
+
| `enclosure` | `enclosure` | See notes below. |
|
89
|
+
| `pubDate` | `update` | An instance of `Time`. |
|
90
|
+
| `guid` | `guid` | Generated from the `title`. |
|
91
|
+
| `comments` | `comments` | A URL. |
|
92
|
+
| `source` | ~~source~~ | Not yet supported. |
|
93
|
+
|
94
|
+
### The `selector` hash
|
95
|
+
|
96
|
+
Your selector hash can have these attributes:
|
97
|
+
|
98
|
+
| name | value |
|
99
|
+
| -------------- | -------------------------------------------------------- |
|
100
|
+
| `selector` | The CSS selector to select the tag with the information. |
|
101
|
+
| `extractor` | Name of the extractor. See notes below. |
|
102
|
+
| `post_process` | A hash or array of hashes. See notes below. |
|
103
|
+
|
104
|
+
## Using extractors
|
105
|
+
|
106
|
+
Extractors help with extracting the information from the selected HTML tag.
|
107
|
+
|
108
|
+
- The default extractor is `text`, which returns the tag's inner text.
|
109
|
+
- The `html` extractor returns the tag's outer HTML.
|
110
|
+
- The `href` extractor returns a URL from the tag's `href` attribute and corrects relative ones to absolute ones.
|
111
|
+
- The `attribute` extractor returns the value of that tag's attribute.
|
112
|
+
- The `static` extractor returns the configured static value (it doesn't extract anything).
|
113
|
+
- [See file list of extractors](https://github.com/gildesmarais/html2rss/tree/master/lib/html2rss/item_extractors).
|
114
|
+
|
115
|
+
Extractors can require additional attributes on the selector hash.
|
116
|
+
👉 [Read their docs for usage examples](https://www.rubydoc.info/gems/html2rss/Html2rss/ItemExtractors).
|
117
|
+
|
118
|
+
<details>
|
119
|
+
<summary>See a Ruby example</summary>
|
120
|
+
|
121
|
+
```ruby
|
122
|
+
Html2rss.feed(
|
123
|
+
channel: {}, selectors: { link: { selector: 'a', extractor: 'href' } }
|
124
|
+
)
|
125
|
+
```
|
126
|
+
|
127
|
+
</details>
|
128
|
+
|
129
|
+
<details>
|
130
|
+
<summary>See a YAML feed config example</summary>
|
131
|
+
|
132
|
+
```yml
|
133
|
+
channel:
|
134
|
+
# ... omitted
|
135
|
+
selectors:
|
136
|
+
# ... omitted
|
137
|
+
link:
|
138
|
+
selector: 'a'
|
139
|
+
extractor: 'href'
|
140
|
+
```
|
141
|
+
|
142
|
+
</details>
|
143
|
+
|
144
|
+
## Using post processors
|
145
|
+
|
146
|
+
Extracted information can be further manipulated with post processors.
|
147
|
+
|
148
|
+
| name | |
|
149
|
+
| ------------------ | ------------------------------------------------------------------------------------- |
|
150
|
+
| `gsub` | Allows global substitution operations on Strings (Regexp or simple pattern). |
|
151
|
+
| `html_to_markdown` | HTML to Markdown, using [reverse_markdown](https://github.com/xijo/reverse_markdown). |
|
152
|
+
| `markdown_to_html` | converts Markdown to HTML, using [kramdown](https://github.com/gettalong/kramdown). |
|
153
|
+
| `parse_time` | Parses a String containing a time in a time zone. |
|
154
|
+
| `parse_uri` | Parses a String as URL. |
|
155
|
+
| `sanitize_html` | Strips unsafe and unneeded HTML and adds security-related attributes. |
|
156
|
+
| `substring` | Cuts a part off of a String, starting at a position. |
|
157
|
+
| `template` | Based on a template, it creates a new String filled with other selectors' values. |
|
158
|
+
|
159
|
+
⚠️ Always make use of the `sanitize_html` post processor for HTML content. _Never trust the internet!_ ⚠️
|
37
160
|
|
38
|
-
|
161
|
+
- [See file list of post processors](https://github.com/gildesmarais/html2rss/tree/master/lib/html2rss/attribute_post_processors).
|
39
162
|
|
40
|
-
|
41
|
-
|
163
|
+
👉 [Read their docs for usage examples.](https://www.rubydoc.info/gems/html2rss/Html2rss/AttributePostProcessors)
|
164
|
+
|
165
|
+
<details>
|
166
|
+
<summary>See a Ruby example</summary>
|
167
|
+
|
168
|
+
```ruby
|
169
|
+
Html2rss.feed(
|
170
|
+
channel: {},
|
171
|
+
selectors: {
|
172
|
+
description: {
|
173
|
+
selector: '.content', post_process: { name: 'sanitize_html' }
|
174
|
+
}
|
175
|
+
}
|
176
|
+
)
|
177
|
+
```
|
42
178
|
|
43
|
-
|
179
|
+
</details>
|
44
180
|
|
45
|
-
|
181
|
+
<details>
|
182
|
+
<summary>See a YAML feed config example</summary>
|
183
|
+
|
184
|
+
```yml
|
185
|
+
channel:
|
186
|
+
# ... omitted
|
187
|
+
selectors:
|
188
|
+
# ... omitted
|
189
|
+
description:
|
190
|
+
selector: '.content'
|
191
|
+
post_process:
|
192
|
+
- name: sanitize_html
|
193
|
+
```
|
194
|
+
|
195
|
+
</details>
|
196
|
+
|
197
|
+
### Chaining post processors
|
198
|
+
|
199
|
+
Pass an array to `post_process` to chain the post processors.
|
200
|
+
|
201
|
+
<details>
|
202
|
+
<summary>YAML example: build the description from a template String (in Markdown) and convert that Markdown to HTML</summary>
|
203
|
+
|
204
|
+
```yml
|
205
|
+
channel:
|
206
|
+
# ... omitted
|
207
|
+
selectors:
|
208
|
+
# ... omitted
|
209
|
+
price:
|
210
|
+
selector: '.price'
|
211
|
+
description:
|
212
|
+
selector: '.section'
|
213
|
+
post_process:
|
214
|
+
- name: template
|
215
|
+
string: |
|
216
|
+
# %{self}
|
217
|
+
|
218
|
+
Price: %{price}
|
219
|
+
- name: markdown_to_html
|
220
|
+
```
|
221
|
+
|
222
|
+
Note the use of `|` for a multi-line String in YAML.
|
223
|
+
|
224
|
+
</details>
|
225
|
+
|
226
|
+
## Adding `<category>` tags to an item
|
46
227
|
|
47
228
|
The `categories` selector takes an array of selector names. The value of those
|
48
|
-
selectors will become a category on the item.
|
229
|
+
selectors will become a `<category>` on the RSS item.
|
230
|
+
|
231
|
+
<details>
|
232
|
+
<summary>See a Ruby example</summary>
|
233
|
+
|
234
|
+
```ruby
|
235
|
+
Html2rss.feed(
|
236
|
+
channel: {},
|
237
|
+
selectors: {
|
238
|
+
genre: {
|
239
|
+
# ... omitted
|
240
|
+
selector: '.genre'
|
241
|
+
},
|
242
|
+
branch: { selector: '.branch' },
|
243
|
+
categories: %i[genre branch]
|
244
|
+
}
|
245
|
+
)
|
246
|
+
```
|
247
|
+
|
248
|
+
</details>
|
49
249
|
|
50
250
|
<details>
|
51
|
-
<summary>See a YAML config example</summary>
|
251
|
+
<summary>See a YAML feed config example</summary>
|
52
252
|
|
53
253
|
```yml
|
54
254
|
channel:
|
55
|
-
# ... omitted
|
255
|
+
# ... omitted
|
56
256
|
selectors:
|
57
|
-
|
257
|
+
# ... omitted
|
58
258
|
genre:
|
59
|
-
selector:
|
259
|
+
selector: ".genre"
|
60
260
|
branch:
|
61
|
-
selector:
|
261
|
+
selector: ".branch"
|
62
262
|
categories:
|
63
263
|
- genre
|
64
264
|
- branch
|
@@ -66,13 +266,13 @@ selectors:
|
|
66
266
|
|
67
267
|
</details>
|
68
268
|
|
69
|
-
## Adding an enclosure to
|
269
|
+
## Adding an `<enclosure>` tag to an item
|
70
270
|
|
71
271
|
An enclosure can be 'anything', e.g. an image, audio or video file.
|
72
272
|
|
73
|
-
The
|
273
|
+
The `enclosure` selector needs to return a URL of the content to enclose. If the extracted URL is relative, it will be converted to an absolute one using the channel's URL as base.
|
74
274
|
|
75
|
-
Since html2rss does no further inspection of the enclosure,
|
275
|
+
Since html2rss does no further inspection of the enclosure, its support comes with trade-offs:
|
76
276
|
|
77
277
|
1. The content-type is guessed from the file extension of the URL.
|
78
278
|
2. If the content-type guessing fails, it will default to `application/octet-stream`.
|
@@ -81,43 +281,71 @@ Since html2rss does no further inspection of the enclosure, the support of this
|
|
81
281
|
Read the [RSS 2.0 spec](http://www.rssboard.org/rss-profile#element-channel-item-enclosure) for further information on enclosing content.
|
82
282
|
|
83
283
|
<details>
|
84
|
-
<summary>See a
|
284
|
+
<summary>See a Ruby example</summary>
|
285
|
+
|
286
|
+
```ruby
|
287
|
+
Html2rss.feed(
|
288
|
+
channel: {},
|
289
|
+
selectors: {
|
290
|
+
enclosure: { selector: 'img', extractor: 'attribute', attribute: 'src' }
|
291
|
+
}
|
292
|
+
)
|
293
|
+
```
|
294
|
+
|
295
|
+
</details>
|
296
|
+
|
297
|
+
<details>
|
298
|
+
<summary>See a YAML feed config example</summary>
|
85
299
|
|
86
300
|
```yml
|
87
301
|
channel:
|
88
|
-
# ... omitted
|
302
|
+
# ... omitted
|
89
303
|
selectors:
|
90
|
-
|
91
|
-
enclosure:
|
92
|
-
|
93
|
-
|
94
|
-
|
304
|
+
# ... omitted
|
305
|
+
enclosure:
|
306
|
+
selector: "img"
|
307
|
+
extractor: "attribute"
|
308
|
+
attribute: "src"
|
95
309
|
```
|
96
310
|
|
97
311
|
</details>
|
98
312
|
|
99
313
|
## Scraping JSON
|
100
314
|
|
101
|
-
|
315
|
+
Although this gem is called **html***2rss*, it's possible to scrape and process JSON.
|
102
316
|
|
103
317
|
Adding `json: true` to the channel config will convert the JSON response to XML.
|
104
318
|
|
319
|
+
<details>
|
320
|
+
<summary>See a Ruby example</summary>
|
321
|
+
|
322
|
+
```ruby
|
323
|
+
Html2rss.feed(
|
324
|
+
channel: {
|
325
|
+
url: 'https://example.com', title: 'Example with JSON', json: true
|
326
|
+
},
|
327
|
+
selectors: {} # ... omitted
|
328
|
+
)
|
329
|
+
```
|
330
|
+
|
331
|
+
</details>
|
332
|
+
|
105
333
|
<details>
|
106
334
|
<summary>See a YAML feed config example</summary>
|
107
335
|
|
108
336
|
```yaml
|
109
337
|
channel:
|
110
338
|
url: https://example.com
|
111
|
-
title:
|
339
|
+
title: "Example with JSON"
|
112
340
|
json: true
|
113
|
-
|
341
|
+
selectors:
|
342
|
+
# ... omitted
|
114
343
|
```
|
115
344
|
|
116
345
|
</details>
|
117
346
|
|
118
|
-
|
119
|
-
|
120
|
-
### Conversion of JSON objects
|
347
|
+
<details>
|
348
|
+
<summary>See example of a converted JSON object</summary>
|
121
349
|
|
122
350
|
This JSON object:
|
123
351
|
|
@@ -127,7 +355,7 @@ This JSON object:
|
|
127
355
|
}
|
128
356
|
```
|
129
357
|
|
130
|
-
|
358
|
+
converts to:
|
131
359
|
|
132
360
|
```xml
|
133
361
|
<hash>
|
@@ -142,7 +370,12 @@ will be converted to:
|
|
142
370
|
|
143
371
|
Your items selector would be `data > datum`, the item's `link` selector would be `url`.
|
144
372
|
|
145
|
-
|
373
|
+
Find further information in [ActiveSupport's `Hash.to_xml` documentation](https://apidock.com/rails/Hash/to_xml).
|
374
|
+
|
375
|
+
</details>
|
376
|
+
|
377
|
+
<details>
|
378
|
+
<summary>See example of a converted JSON array</summary>
|
146
379
|
|
147
380
|
This JSON array:
|
148
381
|
|
@@ -150,7 +383,7 @@ This JSON array:
|
|
150
383
|
[{ "title": "Headline", "url": "https://example.com" }]
|
151
384
|
```
|
152
385
|
|
153
|
-
|
386
|
+
converts to:
|
154
387
|
|
155
388
|
```xml
|
156
389
|
<objects>
|
@@ -163,10 +396,38 @@ will be converted to:
|
|
163
396
|
|
164
397
|
Your items selector would be `objects > object`, the item's `link` selector would be `url`.
|
165
398
|
|
399
|
+
Find further information in [ActiveSupport's `Array.to_xml` documentation](https://apidock.com/rails/Array/to_xml).
|
400
|
+
|
401
|
+
</details>
|
402
|
+
|
166
403
|
## Set any HTTP header in the request
|
167
404
|
|
168
405
|
You can add any HTTP headers to the request to the channel URL.
|
169
|
-
|
406
|
+
Use this to e.g. have Cookie or Authorization information sent or to spoof the User-Agent.
|
407
|
+
|
408
|
+
<details>
|
409
|
+
<summary>See a Ruby example</summary>
|
410
|
+
|
411
|
+
```ruby
|
412
|
+
Html2rss.feed(
|
413
|
+
channel: {
|
414
|
+
url: 'https://example.com',
|
415
|
+
title: "Example with http headers",
|
416
|
+
headers: {
|
417
|
+
"User-Agent" => "html2rss-request",
|
418
|
+
"X-Something" => "Foobar",
|
419
|
+
"Authorization" => "Token deadbea7",
|
420
|
+
"Cookie" => "monster=MeWantCookie"
|
421
|
+
}
|
422
|
+
},
|
423
|
+
selectors: {}
|
424
|
+
)
|
425
|
+
```
|
426
|
+
|
427
|
+
</details>
|
428
|
+
|
429
|
+
<details>
|
430
|
+
<summary>See a YAML feed config example</summary>
|
170
431
|
|
171
432
|
```yaml
|
172
433
|
channel:
|
@@ -177,27 +438,79 @@ channel:
|
|
177
438
|
"X-Something": "Foobar"
|
178
439
|
"Authorization": "Token deadbea7"
|
179
440
|
"Cookie": "monster=MeWantCookie"
|
180
|
-
|
441
|
+
selectors:
|
442
|
+
# ...
|
181
443
|
```
|
182
444
|
|
183
|
-
|
445
|
+
</details>
|
184
446
|
|
185
|
-
|
447
|
+
The headers provided by the channel are merged into the global headers.
|
186
448
|
|
187
|
-
|
449
|
+
## Usage with a YAML config file
|
188
450
|
|
189
|
-
|
451
|
+
This step is not required to work with this gem. If you're using
|
452
|
+
[`html2rss-web`](https://github.com/gildesmarais/html2rss-web)
|
453
|
+
and want to create your private feed configs, keep on reading!
|
190
454
|
|
191
|
-
|
455
|
+
First, create your YAML file, e.g. called `config.yml`.
|
456
|
+
This file will contain your global config and feed configs.
|
192
457
|
|
193
|
-
|
458
|
+
Example:
|
459
|
+
|
460
|
+
```yml
|
461
|
+
headers:
|
462
|
+
'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1"
|
463
|
+
feeds:
|
464
|
+
myfeed:
|
465
|
+
channel:
|
466
|
+
selectors:
|
467
|
+
myotherfeed:
|
468
|
+
channel:
|
469
|
+
selectors:
|
470
|
+
```
|
471
|
+
|
472
|
+
Your feed configs go below `feeds`. Everything else is part of the global config.
|
473
|
+
|
474
|
+
Build your feeds like this:
|
475
|
+
|
476
|
+
```ruby
|
477
|
+
require 'html2rss'
|
478
|
+
|
479
|
+
myfeed = Html2rss.feed_from_yaml_config('config.yml', 'myfeed')
|
480
|
+
myotherfeed = Html2rss.feed_from_yaml_config('config.yml', 'myotherfeed')
|
481
|
+
```
|
482
|
+
|
483
|
+
Find a full example of a `config.yml` at [`spec/config.test.yml`](https://github.com/gildesmarais/html2rss/blob/master/spec/config.test.yml).
|
484
|
+
|
485
|
+
## Gotchas and tips & tricks
|
486
|
+
|
487
|
+
- Check that the channel URL does not redirect to a mobile page with a different markup structure.
|
488
|
+
- Do not rely on your web browser's developer console. html2rss does not execute JavaScript.
|
489
|
+
- Fiddling with [`curl`](https://github.com/curl/curl) and [`pup`](https://github.com/ericchiang/pup) to find the selectors seems efficient (`curl URL | pup`).
|
490
|
+
- [CSS selectors are quite versatile, here's an overview.](https://www.w3.org/TR/selectors-4/#overview)
|
491
|
+
|
492
|
+
## Development
|
493
|
+
|
494
|
+
After checking out the repository, run `bin/setup` to install dependencies. Then, run `bundle exec rspec` to run the tests.
|
495
|
+
You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
496
|
+
|
497
|
+
<details>
|
498
|
+
<summary>Releasing a new version</summary>
|
194
499
|
|
195
500
|
1. `git pull`
|
196
501
|
2. increase version in `lib/html2rss/version.rb`
|
197
502
|
3. `bundle`
|
198
|
-
4.
|
199
|
-
5. `
|
200
|
-
6.
|
201
|
-
7. `git
|
202
|
-
8. `
|
203
|
-
9. `git
|
503
|
+
4. `git add Gemfile.lock lib/html2rss/version.rb`
|
504
|
+
5. `VERSION=$(ruby -e 'require "./lib/html2rss/version.rb"; puts Html2rss::VERSION')`
|
505
|
+
6. `git commit -m "chore: release $VERSION"`
|
506
|
+
7. `git tag v$VERSION`
|
507
|
+
8. [`standard-changelog -f`](https://github.com/conventional-changelog/conventional-changelog/tree/master/packages/standard-changelog)
|
508
|
+
9. `git add CHANGELOG.md && git commit --amend`
|
509
|
+
10. `git tag v$VERSION -f`
|
510
|
+
11. `git push && git push --tags`
|
511
|
+
|
512
|
+
</details>
|
513
|
+
|
514
|
+
## Contributing
|
515
|
+
|
516
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/gildesmarais/html2rss.
|
data/html2rss.gemspec
CHANGED
@@ -34,6 +34,7 @@ Gem::Specification.new do |spec|
|
|
34
34
|
spec.add_dependency 'faraday', '~> 0.15'
|
35
35
|
spec.add_dependency 'faraday_middleware', '~> 0.13'
|
36
36
|
spec.add_dependency 'hashie', '~> 3.6'
|
37
|
+
spec.add_dependency 'kramdown'
|
37
38
|
spec.add_dependency 'mime-types', '> 3.0'
|
38
39
|
spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
|
39
40
|
spec.add_dependency 'reverse_markdown', '~> 1.3'
|
@@ -4,9 +4,7 @@ module Html2rss
|
|
4
4
|
module AttributePostProcessors
|
5
5
|
def self.get_processor(name)
|
6
6
|
@get_processor ||= Hash.new do |processors, key|
|
7
|
-
|
8
|
-
class_name = ['Html2rss', 'AttributePostProcessors', camel_cased_name].join('::')
|
9
|
-
processors[key] = Object.const_get(class_name)
|
7
|
+
processors[key] = Utils.get_class_from_name(key, 'AttributePostProcessors')
|
10
8
|
end
|
11
9
|
|
12
10
|
@get_processor[name]
|
@@ -12,15 +12,17 @@ module Html2rss
|
|
12
12
|
# title:
|
13
13
|
# selector: h1
|
14
14
|
# post_process:
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
15
|
+
# name: gsub
|
16
|
+
# pattern: boo
|
17
|
+
# replacement: baz
|
18
18
|
#
|
19
19
|
# Would return:
|
20
20
|
# 'Foo bar and baz'
|
21
21
|
#
|
22
22
|
# `pattern` can be a Regexp or a String.
|
23
|
+
#
|
23
24
|
# `replacement` can be a String or a Hash.
|
25
|
+
#
|
24
26
|
# See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
|
25
27
|
class Gsub
|
26
28
|
def initialize(value, env)
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'kramdown'
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
module AttributePostProcessors
|
5
|
+
##
|
6
|
+
# Generates HTML from Markdown.
|
7
|
+
#
|
8
|
+
# It's particularly useful in conjunction with the Template post processor
|
9
|
+
# to generate a description from other selectors.
|
10
|
+
#
|
11
|
+
# YAML usage example:
|
12
|
+
#
|
13
|
+
# selectors:
|
14
|
+
# description:
|
15
|
+
# selector: section
|
16
|
+
# post_process:
|
17
|
+
# - name: template
|
18
|
+
# string: |
|
19
|
+
# # %s
|
20
|
+
#
|
21
|
+
# Price: %s
|
22
|
+
# methods:
|
23
|
+
# - self
|
24
|
+
# - price
|
25
|
+
# - name: markdown_to_html
|
26
|
+
#
|
27
|
+
# Would e.g. return:
|
28
|
+
#
|
29
|
+
# <h1>Section</h1>
|
30
|
+
#
|
31
|
+
# <p>Price: 12.34</p>
|
32
|
+
class MarkdownToHtml
|
33
|
+
def initialize(value, env)
|
34
|
+
@value = value
|
35
|
+
@env = env
|
36
|
+
end
|
37
|
+
|
38
|
+
##
|
39
|
+
# @return [String] formatted in HTML
|
40
|
+
def get
|
41
|
+
SanitizeHtml.new(
|
42
|
+
Kramdown::Document.new(@value).to_html,
|
43
|
+
@env
|
44
|
+
).get
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
@@ -17,12 +17,12 @@ module Html2rss
|
|
17
17
|
# selector: span
|
18
18
|
# post_process:
|
19
19
|
# name: 'parse_time'
|
20
|
+
# time_zone: 'Europe/Berlin'
|
20
21
|
#
|
21
22
|
# Would return:
|
22
23
|
# "Tue, 02 Jul 2019 00:00:00 +0200"
|
23
24
|
#
|
24
25
|
# It uses {https://ruby-doc.org/stdlib-2.5.3/libdoc/time/rdoc/Time.html#method-c-parse Time.parse}.
|
25
|
-
# As of now it ignores time zones and always falls back to the UTC time zone.
|
26
26
|
class ParseTime
|
27
27
|
def initialize(value, env)
|
28
28
|
@value = value.to_s
|
@@ -4,10 +4,16 @@ module Html2rss
|
|
4
4
|
module AttributePostProcessors
|
5
5
|
##
|
6
6
|
# Returns sanitized HTML code as String.
|
7
|
-
# Adds
|
8
7
|
#
|
9
|
-
#
|
10
|
-
#
|
8
|
+
# It adds:
|
9
|
+
#
|
10
|
+
# - `rel="nofollow noopener noreferrer"` to <a> tags
|
11
|
+
# - `referrer-policy='no-referrer'` to <img> tags
|
12
|
+
#
|
13
|
+
# It also:
|
14
|
+
#
|
15
|
+
# - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
|
16
|
+
# linking to the <img>'s `src`.
|
11
17
|
#
|
12
18
|
# Imagine this HTML structure:
|
13
19
|
#
|
@@ -17,15 +23,11 @@ module Html2rss
|
|
17
23
|
# <script>alert();</script>
|
18
24
|
# </section>
|
19
25
|
#
|
20
|
-
# It also:
|
21
|
-
#
|
22
|
-
# - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
|
23
|
-
#
|
24
26
|
# YAML usage example:
|
25
27
|
#
|
26
28
|
# selectors:
|
27
29
|
# description:
|
28
|
-
# selector: section
|
30
|
+
# selector: '.section'
|
29
31
|
# extractor: html
|
30
32
|
# post_process:
|
31
33
|
# name: sanitize_html
|
@@ -2,9 +2,15 @@ module Html2rss
|
|
2
2
|
module AttributePostProcessors
|
3
3
|
## Returns a defined part of a String.
|
4
4
|
#
|
5
|
+
# Both parameters must be an Integer and they can be negative.
|
5
6
|
# The +end+ parameter can be omitted, in that case it will not cut the
|
6
7
|
# String at the end.
|
7
8
|
#
|
9
|
+
# A Regexp or a MatchString is not supported.
|
10
|
+
#
|
11
|
+
# See the [`String#[]`](https://ruby-doc.org/core/String.html#method-i-5B-5D)
|
12
|
+
# documentation for more information.
|
13
|
+
#
|
8
14
|
# Imagine this HTML:
|
9
15
|
# <h1>Foo bar and baz</h1>
|
10
16
|
#
|
@@ -13,9 +19,9 @@ module Html2rss
|
|
13
19
|
# title:
|
14
20
|
# selector: h1
|
15
21
|
# post_process:
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
22
|
+
# name: substring
|
23
|
+
# start: 4
|
24
|
+
# end: 6
|
19
25
|
#
|
20
26
|
# Would return:
|
21
27
|
# 'bar'
|
@@ -4,7 +4,8 @@ module Html2rss
|
|
4
4
|
module AttributePostProcessors
|
5
5
|
## Returns a formatted String according to the string pattern.
|
6
6
|
#
|
7
|
-
# If +self+ is
|
7
|
+
# If +self+ is used, the selector's extracted value will be used.
|
8
|
+
# It uses [Kernel#format](https://ruby-doc.org/core/Kernel.html#method-i-format)
|
8
9
|
#
|
9
10
|
# Imagine this HTML:
|
10
11
|
# <li>
|
@@ -22,11 +23,8 @@ module Html2rss
|
|
22
23
|
# title:
|
23
24
|
# selector: h1
|
24
25
|
# post_process:
|
25
|
-
#
|
26
|
-
#
|
27
|
-
# methods:
|
28
|
-
# - self
|
29
|
-
# - price
|
26
|
+
# name: template
|
27
|
+
# string: '%{self} (%{price})'
|
30
28
|
#
|
31
29
|
# Would return:
|
32
30
|
# 'Product (23,42€)'
|
@@ -38,10 +36,16 @@ module Html2rss
|
|
38
36
|
end
|
39
37
|
|
40
38
|
##
|
41
|
-
# - uses {http://ruby-doc.org/core-2.6.3/String.html#method-i-25 String#%}
|
42
39
|
# @return [String]
|
43
40
|
def get
|
44
|
-
|
41
|
+
if @options['methods']
|
42
|
+
string % methods
|
43
|
+
else
|
44
|
+
names = string.scan(/%[<|{](\w*)[>|}]/).flatten
|
45
|
+
names.uniq!
|
46
|
+
|
47
|
+
format(string, names.map { |name| [name.to_sym, item_value(name)] }.to_h)
|
48
|
+
end
|
45
49
|
end
|
46
50
|
|
47
51
|
private
|
@@ -51,9 +55,11 @@ module Html2rss
|
|
51
55
|
end
|
52
56
|
|
53
57
|
def methods
|
54
|
-
@methods ||= @options['methods'].map
|
55
|
-
|
56
|
-
|
58
|
+
@methods ||= @options['methods'].map(&method(:item_value))
|
59
|
+
end
|
60
|
+
|
61
|
+
def item_value(method_name)
|
62
|
+
method_name.to_s == 'self' ? @value.to_s : @item.public_send(method_name.to_sym).to_s
|
57
63
|
end
|
58
64
|
end
|
59
65
|
end
|
@@ -2,13 +2,11 @@ module Html2rss
|
|
2
2
|
##
|
3
3
|
# Provides a namespace for item extractors.
|
4
4
|
module ItemExtractors
|
5
|
-
DEFAULT = '
|
5
|
+
DEFAULT = 'text'.freeze
|
6
6
|
|
7
7
|
def self.get_extractor(name)
|
8
8
|
@get_extractor ||= Hash.new do |extractors, key|
|
9
|
-
|
10
|
-
class_name = ['Html2rss', 'ItemExtractors', camel_cased_name].join('::')
|
11
|
-
extractors[key] = Object.const_get(class_name)
|
9
|
+
extractors[key] = Utils.get_class_from_name(key || DEFAULT, 'ItemExtractors')
|
12
10
|
end
|
13
11
|
|
14
12
|
@get_extractor[name]
|
data/lib/html2rss/utils.rb
CHANGED
@@ -30,5 +30,11 @@ module Html2rss
|
|
30
30
|
def self.hash_to_xml(hash)
|
31
31
|
hash.to_xml(skip_instruct: true, skip_types: true)
|
32
32
|
end
|
33
|
+
|
34
|
+
def self.get_class_from_name(snake_cased_name, module_name)
|
35
|
+
camel_cased_name = snake_cased_name.split('_').map(&:capitalize).join
|
36
|
+
class_name = ['Html2rss', module_name, camel_cased_name].join('::')
|
37
|
+
Object.const_get(class_name)
|
38
|
+
end
|
33
39
|
end
|
34
40
|
end
|
data/lib/html2rss/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-11-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '3.6'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: kramdown
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: mime-types
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -321,6 +335,7 @@ files:
|
|
321
335
|
- lib/html2rss/attribute_post_processors.rb
|
322
336
|
- lib/html2rss/attribute_post_processors/gsub.rb
|
323
337
|
- lib/html2rss/attribute_post_processors/html_to_markdown.rb
|
338
|
+
- lib/html2rss/attribute_post_processors/markdown_to_html.rb
|
324
339
|
- lib/html2rss/attribute_post_processors/parse_time.rb
|
325
340
|
- lib/html2rss/attribute_post_processors/parse_uri.rb
|
326
341
|
- lib/html2rss/attribute_post_processors/sanitize_html.rb
|