html2rss 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/Gemfile +2 -0
- data/Gemfile.lock +20 -4
- data/README.md +370 -57
- data/html2rss.gemspec +1 -0
- data/lib/html2rss/attribute_post_processors.rb +1 -3
- data/lib/html2rss/attribute_post_processors/gsub.rb +5 -3
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +48 -0
- data/lib/html2rss/attribute_post_processors/parse_time.rb +1 -1
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +1 -0
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +10 -8
- data/lib/html2rss/attribute_post_processors/substring.rb +9 -3
- data/lib/html2rss/attribute_post_processors/template.rb +17 -11
- data/lib/html2rss/item_extractors.rb +2 -4
- data/lib/html2rss/utils.rb +6 -0
- data/lib/html2rss/version.rb +1 -1
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 38b547bdfe0799d71348690ae20d29d74d88c3cbb0828cdd807a382dec14f895
|
4
|
+
data.tar.gz: 8e9ae611d6bbf0174bd9038c3aa7a7b214ec8ffa152e990b2224e98d5022bec0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c0c4dc94d9e339d054ec4ee1c586ac21355cdfa1c9664dd05ef01ca474668f7e483baec35bb8abfd1971eff296b7b5c270816c992ec9fc6abd51a89a7ce28000
|
7
|
+
data.tar.gz: 3f45fb28e10360055ead9b757b4972bd2c52ad3285b94b506c3a6ce66b7bd5ec6364da63cc83b066fab6bc69396ed7408108536b5b9178d4a4a34c28436b4d2d
|
data/CHANGELOG.md
CHANGED
@@ -1,4 +1,14 @@
|
|
1
|
-
# [](https://github.com/gildesmarais/html2rss/compare/v0.
|
1
|
+
# [](https://github.com/gildesmarais/html2rss/compare/v0.8.0...v) (2019-11-02)
|
2
|
+
|
3
|
+
|
4
|
+
|
5
|
+
# [0.8.0](https://github.com/gildesmarais/html2rss/compare/v0.7.0...v0.8.0) (2019-11-02)
|
6
|
+
|
7
|
+
|
8
|
+
### Features
|
9
|
+
|
10
|
+
* **post_processors:** add markdown to html ([#54](https://github.com/gildesmarais/html2rss/issues/54)) ([cdf77b8](https://github.com/gildesmarais/html2rss/commit/cdf77b8))
|
11
|
+
* **post_processors:** support annotated tokens ([#62](https://github.com/gildesmarais/html2rss/issues/62)) ([b57bd7b](https://github.com/gildesmarais/html2rss/commit/b57bd7b)), closes [#56](https://github.com/gildesmarais/html2rss/issues/56)
|
2
12
|
|
3
13
|
|
4
14
|
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
html2rss (0.
|
4
|
+
html2rss (0.8.0)
|
5
5
|
activesupport (~> 5.0)
|
6
6
|
builder
|
7
7
|
faraday (~> 0.15)
|
8
8
|
faraday_middleware (~> 0.13)
|
9
9
|
hashie (~> 3.6)
|
10
|
+
kramdown
|
10
11
|
mime-types (> 3.0)
|
11
12
|
nokogiri (>= 1.10, < 2.0)
|
12
13
|
reverse_markdown (~> 1.3)
|
@@ -26,6 +27,12 @@ GEM
|
|
26
27
|
builder (3.2.3)
|
27
28
|
byebug (11.0.1)
|
28
29
|
concurrent-ruby (1.1.5)
|
30
|
+
coveralls (0.7.2)
|
31
|
+
multi_json (~> 1.3)
|
32
|
+
rest-client (= 1.6.7)
|
33
|
+
simplecov (>= 0.7)
|
34
|
+
term-ansicolor (= 1.2.2)
|
35
|
+
thor (= 0.18.1)
|
29
36
|
crass (1.0.5)
|
30
37
|
diff-lcs (1.3)
|
31
38
|
docile (1.3.2)
|
@@ -38,13 +45,15 @@ GEM
|
|
38
45
|
concurrent-ruby (~> 1.0)
|
39
46
|
jaro_winkler (1.5.3)
|
40
47
|
json (2.2.0)
|
48
|
+
kramdown (2.1.0)
|
41
49
|
mime-types (3.3)
|
42
50
|
mime-types-data (~> 3.2015)
|
43
51
|
mime-types-data (3.2019.1009)
|
44
52
|
mini_portile2 (2.4.0)
|
45
|
-
minitest (5.
|
53
|
+
minitest (5.13.0)
|
54
|
+
multi_json (1.14.1)
|
46
55
|
multipart-post (2.1.1)
|
47
|
-
nokogiri (1.10.
|
56
|
+
nokogiri (1.10.5)
|
48
57
|
mini_portile2 (~> 2.4.0)
|
49
58
|
nokogumbo (2.0.1)
|
50
59
|
nokogiri (~> 1.8, >= 1.8.4)
|
@@ -52,6 +61,8 @@ GEM
|
|
52
61
|
parser (2.6.5.0)
|
53
62
|
ast (~> 2.4.0)
|
54
63
|
rainbow (3.0.0)
|
64
|
+
rest-client (1.6.7)
|
65
|
+
mime-types (>= 1.16)
|
55
66
|
reverse_markdown (1.3.0)
|
56
67
|
nokogiri
|
57
68
|
rspec (3.9.0)
|
@@ -88,14 +99,18 @@ GEM
|
|
88
99
|
json (>= 1.8, < 3)
|
89
100
|
simplecov-html (~> 0.10.0)
|
90
101
|
simplecov-html (0.10.2)
|
102
|
+
term-ansicolor (1.2.2)
|
103
|
+
tins (~> 0.8)
|
104
|
+
thor (0.18.1)
|
91
105
|
thread_safe (0.3.6)
|
106
|
+
tins (0.13.2)
|
92
107
|
to_regexp (0.2.1)
|
93
108
|
tzinfo (1.2.5)
|
94
109
|
thread_safe (~> 0.1)
|
95
110
|
unicode-display_width (1.6.0)
|
96
111
|
vcr (5.0.0)
|
97
112
|
yard (0.9.20)
|
98
|
-
zeitwerk (2.2.
|
113
|
+
zeitwerk (2.2.1)
|
99
114
|
|
100
115
|
PLATFORMS
|
101
116
|
ruby
|
@@ -103,6 +118,7 @@ PLATFORMS
|
|
103
118
|
DEPENDENCIES
|
104
119
|
bundler (~> 1.16)
|
105
120
|
byebug
|
121
|
+
coveralls
|
106
122
|
html2rss!
|
107
123
|
rspec (~> 3.0)
|
108
124
|
rubocop
|
data/README.md
CHANGED
@@ -2,27 +2,44 @@
|
|
2
2
|
|
3
3
|
[](https://travis-ci.org/gildesmarais/html2rss)
|
4
4
|
[](http://rubygems.org/gems/html2rss/)
|
5
|
-
[
|
5
|
+
[](https://coveralls.io/github/gildesmarais/html2rss?branch=master)
|
6
|
+
[](https://www.rubydoc.info/gems/html2rss)
|
7
|
+

|
6
8
|
|
7
|
-
|
9
|
+
**Searching for a ready to use app which serves generated feeds via HTTP?**
|
10
|
+
[Head over to `html2rss-web`!](https://github.com/gildesmarais/html2rss-web)
|
8
11
|
|
9
|
-
|
10
|
-
[Check out `html2rss-web`!](https://github.com/gildesmarais/html2rss-web)
|
12
|
+
This Ruby gem builds RSS 2.0 feeds from a _feed config_.
|
11
13
|
|
12
|
-
|
13
|
-
CSS selectors
|
14
|
-
|
15
|
-
|
14
|
+
With the _feed config_ containing the URL to scrape and
|
15
|
+
CSS selectors for information extraction (like title, URL, ...) your RSS builds.
|
16
|
+
[Extractors](#using-extractors) and chain-able [post processors](#using-post-processors)
|
17
|
+
make information extraction, processing and sanitizing a breeze.
|
18
|
+
[Scraping JSON](#scraping-json) responses and
|
19
|
+
[setting HTTP request headers](#set-any-http-header-in-the-request) is
|
20
|
+
supported, too.
|
16
21
|
|
17
22
|
## Installation
|
18
23
|
|
19
|
-
|
20
|
-
|
24
|
+
| 🤩 Like it? | Star it! ⭐️ |
|
25
|
+
| ---------------------------------------------: | -------------------- |
|
26
|
+
| Add this line to your application's `Gemfile`: | `gem 'html2rss'` |
|
27
|
+
| Then execute: | `bundle` |
|
28
|
+
| In your code: | `require 'html2rss'` |
|
29
|
+
|
30
|
+
## Building a feed config
|
31
|
+
|
32
|
+
Here's a minimal working example:
|
21
33
|
|
22
34
|
```ruby
|
35
|
+
require 'html2rss'
|
36
|
+
|
23
37
|
rss =
|
24
38
|
Html2rss.feed(
|
25
|
-
channel: {
|
39
|
+
channel: {
|
40
|
+
title: 'StackOverflow: Hot Network Questions',
|
41
|
+
url: 'https://stackoverflow.com/questions'
|
42
|
+
},
|
26
43
|
selectors: {
|
27
44
|
items: { selector: '#hot-network-questions > ul > li' },
|
28
45
|
title: { selector: 'a' },
|
@@ -30,35 +47,218 @@ rss =
|
|
30
47
|
}
|
31
48
|
)
|
32
49
|
|
33
|
-
puts rss
|
50
|
+
puts rss
|
34
51
|
```
|
35
52
|
|
36
|
-
|
53
|
+
A _feed config_ consists of a `channel` and a `selectors` Hash.
|
54
|
+
The contents of both hashes are explained below.
|
55
|
+
|
56
|
+
**Looks too complicated?** See [`html2rss-configs`](https://github.com/gildesmarais/html2rss-configs) for ready-made feed configs!
|
57
|
+
|
58
|
+
### The `channel`
|
59
|
+
|
60
|
+
| attribute | | type | remark |
|
61
|
+
| ------------- | -------- | ------- | ----------------------- |
|
62
|
+
| `title` | required | String | |
|
63
|
+
| `url` | required | String | |
|
64
|
+
| `ttl` | optional | Integer | time to live in minutes |
|
65
|
+
| `description` | optional | String | |
|
66
|
+
| `headers` | optional | Hash | See notes below. |
|
67
|
+
|
68
|
+
### The `selectors`
|
69
|
+
|
70
|
+
You must provide an `items` selector hash which contains the CSS selector.
|
71
|
+
`items` needs to return a collection of HTML tags.
|
72
|
+
The other selectors are scoped to the tags of the items' collection.
|
73
|
+
|
74
|
+
To build a
|
75
|
+
[valid RSS 2.0 item](http://www.rssboard.org/rss-profile#element-channel-item)
|
76
|
+
each item has to have at least a `title` or a `description`.
|
77
|
+
|
78
|
+
Your `selectors` can contain arbitrary selector names, but only these
|
79
|
+
will make it into the RSS feed:
|
80
|
+
|
81
|
+
| RSS 2.0 tag | name in html2rss | remark |
|
82
|
+
| ------------- | ---------------- | --------------------------- |
|
83
|
+
| `title` | `title` | |
|
84
|
+
| `description` | `description` | Supports HTML. |
|
85
|
+
| `link` | `link` | A URL. |
|
86
|
+
| `author` | `author` | |
|
87
|
+
| `category` | `categories` | See notes below. |
|
88
|
+
| `enclosure` | `enclosure` | See notes below. |
|
89
|
+
| `pubDate` | `update` | An instance of `Time`. |
|
90
|
+
| `guid` | `guid` | Generated from the `title`. |
|
91
|
+
| `comments` | `comments` | A URL. |
|
92
|
+
| `source` | ~~source~~ | Not yet supported. |
|
93
|
+
|
94
|
+
### The `selector` hash
|
95
|
+
|
96
|
+
Your selector hash can have these attributes:
|
97
|
+
|
98
|
+
| name | value |
|
99
|
+
| -------------- | -------------------------------------------------------- |
|
100
|
+
| `selector` | The CSS selector to select the tag with the information. |
|
101
|
+
| `extractor` | Name of the extractor. See notes below. |
|
102
|
+
| `post_process` | A hash or array of hashes. See notes below. |
|
103
|
+
|
104
|
+
## Using extractors
|
105
|
+
|
106
|
+
Extractors help with extracting the information from the selected HTML tag.
|
107
|
+
|
108
|
+
- The default extractor is `text`, which returns the tag's inner text.
|
109
|
+
- The `html` extractor returns the tag's outer HTML.
|
110
|
+
- The `href` extractor returns a URL from the tag's `href` attribute and corrects relative ones to absolute ones.
|
111
|
+
- The `attribute` extractor returns the value of that tag's attribute.
|
112
|
+
- The `static` extractor returns the configured static value (it doesn't extract anything).
|
113
|
+
- [See file list of extractors](https://github.com/gildesmarais/html2rss/tree/master/lib/html2rss/item_extractors).
|
114
|
+
|
115
|
+
Extractors can require additional attributes on the selector hash.
|
116
|
+
👉 [Read their docs for usage examples](https://www.rubydoc.info/gems/html2rss/Html2rss/ItemExtractors).
|
117
|
+
|
118
|
+
<details>
|
119
|
+
<summary>See a Ruby example</summary>
|
120
|
+
|
121
|
+
```ruby
|
122
|
+
Html2rss.feed(
|
123
|
+
channel: {}, selectors: { link: { selector: 'a', extractor: 'href' } }
|
124
|
+
)
|
125
|
+
```
|
126
|
+
|
127
|
+
</details>
|
128
|
+
|
129
|
+
<details>
|
130
|
+
<summary>See a YAML feed config example</summary>
|
131
|
+
|
132
|
+
```yml
|
133
|
+
channel:
|
134
|
+
# ... omitted
|
135
|
+
selectors:
|
136
|
+
# ... omitted
|
137
|
+
link:
|
138
|
+
selector: 'a'
|
139
|
+
extractor: 'href'
|
140
|
+
```
|
141
|
+
|
142
|
+
</details>
|
143
|
+
|
144
|
+
## Using post processors
|
145
|
+
|
146
|
+
Extracted information can be further manipulated with post processors.
|
147
|
+
|
148
|
+
| name | |
|
149
|
+
| ------------------ | ------------------------------------------------------------------------------------- |
|
150
|
+
| `gsub` | Allows global substitution operations on Strings (Regexp or simple pattern). |
|
151
|
+
| `html_to_markdown` | HTML to Markdown, using [reverse_markdown](https://github.com/xijo/reverse_markdown). |
|
152
|
+
| `markdown_to_html` | converts Markdown to HTML, using [kramdown](https://github.com/gettalong/kramdown). |
|
153
|
+
| `parse_time` | Parses a String containing a time in a time zone. |
|
154
|
+
| `parse_uri` | Parses a String as URL. |
|
155
|
+
| `sanitize_html`    | Strips unsafe and unneeded HTML and adds security related attributes.                 |
|
156
|
+
| `substring` | Cuts a part off of a String, starting at a position. |
|
157
|
+
| `template`         | Based on a template, it creates a new String filled with other selectors' values.     |
|
158
|
+
|
159
|
+
⚠️ Always make use of the `sanitize_html` post processor for HTML content. _Never trust the internet!_ ⚠️
|
37
160
|
|
38
|
-
|
161
|
+
- [See file list of post processors](https://github.com/gildesmarais/html2rss/tree/master/lib/html2rss/attribute_post_processors).
|
39
162
|
|
40
|
-
|
41
|
-
|
163
|
+
👉 [Read their docs for usage examples.](https://www.rubydoc.info/gems/html2rss/Html2rss/AttributePostProcessors)
|
164
|
+
|
165
|
+
<details>
|
166
|
+
<summary>See a Ruby example</summary>
|
167
|
+
|
168
|
+
```ruby
|
169
|
+
Html2rss.feed(
|
170
|
+
channel: {},
|
171
|
+
selectors: {
|
172
|
+
description: {
|
173
|
+
selector: '.content', post_process: { name: 'sanitize_html' }
|
174
|
+
}
|
175
|
+
}
|
176
|
+
)
|
177
|
+
```
|
42
178
|
|
43
|
-
|
179
|
+
</details>
|
44
180
|
|
45
|
-
|
181
|
+
<details>
|
182
|
+
<summary>See a YAML feed config example</summary>
|
183
|
+
|
184
|
+
```yml
|
185
|
+
channel:
|
186
|
+
# ... omitted
|
187
|
+
selectors:
|
188
|
+
# ... omitted
|
189
|
+
description:
|
190
|
+
selector: '.content'
|
191
|
+
post_process:
|
192
|
+
- name: sanitize_html
|
193
|
+
```
|
194
|
+
|
195
|
+
</details>
|
196
|
+
|
197
|
+
### Chaining post processors
|
198
|
+
|
199
|
+
Pass an array to `post_process` to chain the post processors.
|
200
|
+
|
201
|
+
<details>
|
202
|
+
<summary>YAML example: build the description from a template String (in Markdown) and convert that Markdown to HTML</summary>
|
203
|
+
|
204
|
+
```yml
|
205
|
+
channel:
|
206
|
+
# ... omitted
|
207
|
+
selectors:
|
208
|
+
# ... omitted
|
209
|
+
price:
|
210
|
+
selector: '.price'
|
211
|
+
description:
|
212
|
+
selector: '.section'
|
213
|
+
post_process:
|
214
|
+
- name: template
|
215
|
+
string: |
|
216
|
+
# %{self}
|
217
|
+
|
218
|
+
Price: %{price}
|
219
|
+
- name: markdown_to_html
|
220
|
+
```
|
221
|
+
|
222
|
+
Note the use of `|` for a multi-line String in YAML.
|
223
|
+
|
224
|
+
</details>
|
225
|
+
|
226
|
+
## Adding `<category>` tags to an item
|
46
227
|
|
47
228
|
The `categories` selector takes an array of selector names. The value of those
|
48
|
-
selectors will become a category on the item.
|
229
|
+
selectors will become a `<category>` on the RSS item.
|
230
|
+
|
231
|
+
<details>
|
232
|
+
<summary>See a Ruby example</summary>
|
233
|
+
|
234
|
+
```ruby
|
235
|
+
Html2rss.feed(
|
236
|
+
channel: {},
|
237
|
+
selectors: {
|
238
|
+
genre: {
|
239
|
+
# ... omitted
|
240
|
+
selector: '.genre'
|
241
|
+
},
|
242
|
+
branch: { selector: '.branch' },
|
243
|
+
categories: %i[genre branch]
|
244
|
+
}
|
245
|
+
)
|
246
|
+
```
|
247
|
+
|
248
|
+
</details>
|
49
249
|
|
50
250
|
<details>
|
51
|
-
<summary>See a YAML config example</summary>
|
251
|
+
<summary>See a YAML feed config example</summary>
|
52
252
|
|
53
253
|
```yml
|
54
254
|
channel:
|
55
|
-
# ... omitted
|
255
|
+
# ... omitted
|
56
256
|
selectors:
|
57
|
-
|
257
|
+
# ... omitted
|
58
258
|
genre:
|
59
|
-
selector:
|
259
|
+
selector: ".genre"
|
60
260
|
branch:
|
61
|
-
selector:
|
261
|
+
selector: ".branch"
|
62
262
|
categories:
|
63
263
|
- genre
|
64
264
|
- branch
|
@@ -66,13 +266,13 @@ selectors:
|
|
66
266
|
|
67
267
|
</details>
|
68
268
|
|
69
|
-
## Adding an enclosure to
|
269
|
+
## Adding an `<enclosure>` tag to an item
|
70
270
|
|
71
271
|
An enclosure can be 'anything', e.g. an image, audio or video file.
|
72
272
|
|
73
|
-
The
|
273
|
+
The `enclosure` selector needs to return a URL of the content to enclose. If the extracted URL is relative, it will be converted to an absolute one using the channel's URL as base.
|
74
274
|
|
75
|
-
Since html2rss does no further inspection of the enclosure,
|
275
|
+
Since html2rss does no further inspection of the enclosure, its support comes with trade-offs:
|
76
276
|
|
77
277
|
1. The content-type is guessed from the file extension of the URL.
|
78
278
|
2. If the content-type guessing fails, it will default to `application/octet-stream`.
|
@@ -81,43 +281,71 @@ Since html2rss does no further inspection of the enclosure, the support of this
|
|
81
281
|
Read the [RSS 2.0 spec](http://www.rssboard.org/rss-profile#element-channel-item-enclosure) for further information on enclosing content.
|
82
282
|
|
83
283
|
<details>
|
84
|
-
<summary>See a
|
284
|
+
<summary>See a Ruby example</summary>
|
285
|
+
|
286
|
+
```ruby
|
287
|
+
Html2rss.feed(
|
288
|
+
channel: {},
|
289
|
+
selectors: {
|
290
|
+
enclosure: { selector: 'img', extractor: 'attribute', attribute: 'src' }
|
291
|
+
}
|
292
|
+
)
|
293
|
+
```
|
294
|
+
|
295
|
+
</details>
|
296
|
+
|
297
|
+
<details>
|
298
|
+
<summary>See a YAML feed config example</summary>
|
85
299
|
|
86
300
|
```yml
|
87
301
|
channel:
|
88
|
-
# ... omitted
|
302
|
+
# ... omitted
|
89
303
|
selectors:
|
90
|
-
|
91
|
-
enclosure:
|
92
|
-
|
93
|
-
|
94
|
-
|
304
|
+
# ... omitted
|
305
|
+
enclosure:
|
306
|
+
selector: "img"
|
307
|
+
extractor: "attribute"
|
308
|
+
attribute: "src"
|
95
309
|
```
|
96
310
|
|
97
311
|
</details>
|
98
312
|
|
99
313
|
## Scraping JSON
|
100
314
|
|
101
|
-
|
315
|
+
Although this gem is called **html***2rss*, it's possible to scrape and process JSON.
|
102
316
|
|
103
317
|
Adding `json: true` to the channel config will convert the JSON response to XML.
|
104
318
|
|
319
|
+
<details>
|
320
|
+
<summary>See a Ruby example</summary>
|
321
|
+
|
322
|
+
```ruby
|
323
|
+
Html2rss.feed(
|
324
|
+
channel: {
|
325
|
+
url: 'https://example.com', title: 'Example with JSON', json: true
|
326
|
+
},
|
327
|
+
selectors: {} # ... omitted
|
328
|
+
)
|
329
|
+
```
|
330
|
+
|
331
|
+
</details>
|
332
|
+
|
105
333
|
<details>
|
106
334
|
<summary>See a YAML feed config example</summary>
|
107
335
|
|
108
336
|
```yaml
|
109
337
|
channel:
|
110
338
|
url: https://example.com
|
111
|
-
title:
|
339
|
+
title: "Example with JSON"
|
112
340
|
json: true
|
113
|
-
|
341
|
+
selectors:
|
342
|
+
# ... omitted
|
114
343
|
```
|
115
344
|
|
116
345
|
</details>
|
117
346
|
|
118
|
-
|
119
|
-
|
120
|
-
### Conversion of JSON objects
|
347
|
+
<details>
|
348
|
+
<summary>See example of a converted JSON object</summary>
|
121
349
|
|
122
350
|
This JSON object:
|
123
351
|
|
@@ -127,7 +355,7 @@ This JSON object:
|
|
127
355
|
}
|
128
356
|
```
|
129
357
|
|
130
|
-
|
358
|
+
converts to:
|
131
359
|
|
132
360
|
```xml
|
133
361
|
<hash>
|
@@ -142,7 +370,12 @@ will be converted to:
|
|
142
370
|
|
143
371
|
Your items selector would be `data > datum`, the item's `link` selector would be `url`.
|
144
372
|
|
145
|
-
|
373
|
+
Find further information in [ActiveSupport's `Hash.to_xml` documentation](https://apidock.com/rails/Hash/to_xml).
|
374
|
+
|
375
|
+
</details>
|
376
|
+
|
377
|
+
<details>
|
378
|
+
<summary>See example of a converted JSON array</summary>
|
146
379
|
|
147
380
|
This JSON array:
|
148
381
|
|
@@ -150,7 +383,7 @@ This JSON array:
|
|
150
383
|
[{ "title": "Headline", "url": "https://example.com" }]
|
151
384
|
```
|
152
385
|
|
153
|
-
|
386
|
+
converts to:
|
154
387
|
|
155
388
|
```xml
|
156
389
|
<objects>
|
@@ -163,10 +396,38 @@ will be converted to:
|
|
163
396
|
|
164
397
|
Your items selector would be `objects > object`, the item's `link` selector would be `url`.
|
165
398
|
|
399
|
+
Find further information in [ActiveSupport's `Array.to_xml` documentation](https://apidock.com/rails/Array/to_xml).
|
400
|
+
|
401
|
+
</details>
|
402
|
+
|
166
403
|
## Set any HTTP header in the request
|
167
404
|
|
168
405
|
You can add any HTTP headers to the request to the channel URL.
|
169
|
-
|
406
|
+
Use this to e.g. have Cookie or Authorization information sent or to spoof the User-Agent.
|
407
|
+
|
408
|
+
<details>
|
409
|
+
<summary>See a Ruby example</summary>
|
410
|
+
|
411
|
+
```ruby
|
412
|
+
Html2rss.feed(
|
413
|
+
channel: {
|
414
|
+
url: 'https://example.com',
|
415
|
+
title: "Example with http headers",
|
416
|
+
headers: {
|
417
|
+
"User-Agent" => "html2rss-request",
|
418
|
+
"X-Something" => "Foobar",
|
419
|
+
"Authorization" => "Token deadbea7",
|
420
|
+
"Cookie" => "monster=MeWantCookie"
|
421
|
+
}
|
422
|
+
},
|
423
|
+
selectors: {}
|
424
|
+
)
|
425
|
+
```
|
426
|
+
|
427
|
+
</details>
|
428
|
+
|
429
|
+
<details>
|
430
|
+
<summary>See a YAML feed config example</summary>
|
170
431
|
|
171
432
|
```yaml
|
172
433
|
channel:
|
@@ -177,27 +438,79 @@ channel:
|
|
177
438
|
"X-Something": "Foobar"
|
178
439
|
"Authorization": "Token deadbea7"
|
179
440
|
"Cookie": "monster=MeWantCookie"
|
180
|
-
|
441
|
+
selectors:
|
442
|
+
# ...
|
181
443
|
```
|
182
444
|
|
183
|
-
|
445
|
+
</details>
|
184
446
|
|
185
|
-
|
447
|
+
The headers provided by the channel are merged into the global headers.
|
186
448
|
|
187
|
-
|
449
|
+
## Usage with a YAML config file
|
188
450
|
|
189
|
-
|
451
|
+
This step is not required to work with this gem. If you're using
|
452
|
+
[`html2rss-web`](https://github.com/gildesmarais/html2rss-web)
|
453
|
+
and want to create your private feed configs, keep on reading!
|
190
454
|
|
191
|
-
|
455
|
+
First, create your YAML file, e.g. called `config.yml`.
|
456
|
+
This file will contain your global config and feed configs.
|
192
457
|
|
193
|
-
|
458
|
+
Example:
|
459
|
+
|
460
|
+
```yml
|
461
|
+
headers:
|
462
|
+
'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1"
|
463
|
+
feeds:
|
464
|
+
myfeed:
|
465
|
+
channel:
|
466
|
+
selectors:
|
467
|
+
myotherfeed:
|
468
|
+
channel:
|
469
|
+
selectors:
|
470
|
+
```
|
471
|
+
|
472
|
+
Your feed configs go below `feeds`. Everything else is part of the global config.
|
473
|
+
|
474
|
+
Build your feeds like this:
|
475
|
+
|
476
|
+
```ruby
|
477
|
+
require 'html2rss'
|
478
|
+
|
479
|
+
myfeed = Html2rss.feed_from_yaml_config('config.yml', 'myfeed')
|
480
|
+
myotherfeed = Html2rss.feed_from_yaml_config('config.yml', 'myotherfeed')
|
481
|
+
```
|
482
|
+
|
483
|
+
Find a full example of a `config.yml` at [`spec/config.test.yml`](https://github.com/gildesmarais/html2rss/blob/master/spec/config.test.yml).
|
484
|
+
|
485
|
+
## Gotchas and tips & tricks
|
486
|
+
|
487
|
+
- Check that the channel URL does not redirect to a mobile page with a different markup structure.
|
488
|
+
- Do not rely on your web browser's developer console. html2rss does not execute JavaScript.
|
489
|
+
- Fiddling with [`curl`](https://github.com/curl/curl) and [`pup`](https://github.com/ericchiang/pup) to find the selectors seems efficient (`curl URL | pup`).
|
490
|
+
- [CSS selectors are quite versatile, here's an overview.](https://www.w3.org/TR/selectors-4/#overview)
|
491
|
+
|
492
|
+
## Development
|
493
|
+
|
494
|
+
After checking out the repository, run `bin/setup` to install dependencies. Then, run `bundle exec rspec` to run the tests.
|
495
|
+
You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
496
|
+
|
497
|
+
<details>
|
498
|
+
<summary>Releasing a new version</summary>
|
194
499
|
|
195
500
|
1. `git pull`
|
196
501
|
2. increase version in `lib/html2rss/version.rb`
|
197
502
|
3. `bundle`
|
198
|
-
4.
|
199
|
-
5. `
|
200
|
-
6.
|
201
|
-
7. `git
|
202
|
-
8. `
|
203
|
-
9. `git
|
503
|
+
4. `git add Gemfile.lock lib/html2rss/version.rb`
|
504
|
+
5. `VERSION=$(ruby -e 'require "./lib/html2rss/version.rb"; puts Html2rss::VERSION')`
|
505
|
+
6. `git commit -m "chore: release $VERSION"`
|
506
|
+
7. `git tag v$VERSION`
|
507
|
+
8. [`standard-changelog -f`](https://github.com/conventional-changelog/conventional-changelog/tree/master/packages/standard-changelog)
|
508
|
+
9. `git add CHANGELOG.md && git commit --amend`
|
509
|
+
10. `git tag v$VERSION -f`
|
510
|
+
11. `git push && git push --tags`
|
511
|
+
|
512
|
+
</details>
|
513
|
+
|
514
|
+
## Contributing
|
515
|
+
|
516
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/gildesmarais/html2rss.
|
data/html2rss.gemspec
CHANGED
@@ -34,6 +34,7 @@ Gem::Specification.new do |spec|
|
|
34
34
|
spec.add_dependency 'faraday', '~> 0.15'
|
35
35
|
spec.add_dependency 'faraday_middleware', '~> 0.13'
|
36
36
|
spec.add_dependency 'hashie', '~> 3.6'
|
37
|
+
spec.add_dependency 'kramdown'
|
37
38
|
spec.add_dependency 'mime-types', '> 3.0'
|
38
39
|
spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
|
39
40
|
spec.add_dependency 'reverse_markdown', '~> 1.3'
|
@@ -4,9 +4,7 @@ module Html2rss
|
|
4
4
|
module AttributePostProcessors
|
5
5
|
def self.get_processor(name)
|
6
6
|
@get_processor ||= Hash.new do |processors, key|
|
7
|
-
|
8
|
-
class_name = ['Html2rss', 'AttributePostProcessors', camel_cased_name].join('::')
|
9
|
-
processors[key] = Object.const_get(class_name)
|
7
|
+
processors[key] = Utils.get_class_from_name(key, 'AttributePostProcessors')
|
10
8
|
end
|
11
9
|
|
12
10
|
@get_processor[name]
|
@@ -12,15 +12,17 @@ module Html2rss
|
|
12
12
|
# title:
|
13
13
|
# selector: h1
|
14
14
|
# post_process:
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
15
|
+
# name: gsub
|
16
|
+
# pattern: boo
|
17
|
+
# replacement: baz
|
18
18
|
#
|
19
19
|
# Would return:
|
20
20
|
# 'Foo bar and baz'
|
21
21
|
#
|
22
22
|
# `pattern` can be a Regexp or a String.
|
23
|
+
#
|
23
24
|
# `replacement` can be a String or a Hash.
|
25
|
+
#
|
24
26
|
# See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
|
25
27
|
class Gsub
|
26
28
|
def initialize(value, env)
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'kramdown'
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
module AttributePostProcessors
|
5
|
+
##
|
6
|
+
# Generates HTML from Markdown.
|
7
|
+
#
|
8
|
+
# It's particularly useful in conjunction with the Template post processor
|
9
|
+
# to generate a description from other selectors.
|
10
|
+
#
|
11
|
+
# YAML usage example:
|
12
|
+
#
|
13
|
+
# selectors:
|
14
|
+
# description:
|
15
|
+
# selector: section
|
16
|
+
# post_process:
|
17
|
+
# - name: template
|
18
|
+
# string: |
|
19
|
+
# # %s
|
20
|
+
#
|
21
|
+
# Price: %s
|
22
|
+
# methods:
|
23
|
+
# - self
|
24
|
+
# - price
|
25
|
+
# - name: markdown_to_html
|
26
|
+
#
|
27
|
+
# Would e.g. return:
|
28
|
+
#
|
29
|
+
# <h1>Section</h1>
|
30
|
+
#
|
31
|
+
# <p>Price: 12.34</p>
|
32
|
+
class MarkdownToHtml
|
33
|
+
def initialize(value, env)
|
34
|
+
@value = value
|
35
|
+
@env = env
|
36
|
+
end
|
37
|
+
|
38
|
+
##
|
39
|
+
# @return [String] sanitized HTML generated from the Markdown value
|
40
|
+
def get
|
41
|
+
SanitizeHtml.new(
|
42
|
+
Kramdown::Document.new(@value).to_html,
|
43
|
+
@env
|
44
|
+
).get
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
@@ -17,12 +17,12 @@ module Html2rss
|
|
17
17
|
# selector: span
|
18
18
|
# post_process:
|
19
19
|
# name: 'parse_time'
|
20
|
+
# time_zone: 'Europe/Berlin'
|
20
21
|
#
|
21
22
|
# Would return:
|
22
23
|
# "Tue, 02 Jul 2019 00:00:00 +0200"
|
23
24
|
#
|
24
25
|
# It uses {https://ruby-doc.org/stdlib-2.5.3/libdoc/time/rdoc/Time.html#method-c-parse Time.parse}.
|
25
|
-
# As of now it ignores time zones and always falls back to the UTC time zone.
|
26
26
|
class ParseTime
|
27
27
|
def initialize(value, env)
|
28
28
|
@value = value.to_s
|
@@ -4,10 +4,16 @@ module Html2rss
|
|
4
4
|
module AttributePostProcessors
|
5
5
|
##
|
6
6
|
# Returns sanitized HTML code as String.
|
7
|
-
# Adds
|
8
7
|
#
|
9
|
-
#
|
10
|
-
#
|
8
|
+
# It adds:
|
9
|
+
#
|
10
|
+
# - `rel="nofollow noopener noreferrer"` to <a> tags
|
11
|
+
# - `referrer-policy='no-referrer'` to <img> tags
|
12
|
+
#
|
13
|
+
# It also:
|
14
|
+
#
|
15
|
+
# - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
|
16
|
+
# linking to the <img>'s `src`.
|
11
17
|
#
|
12
18
|
# Imagine this HTML structure:
|
13
19
|
#
|
@@ -17,15 +23,11 @@ module Html2rss
|
|
17
23
|
# <script>alert();</script>
|
18
24
|
# </section>
|
19
25
|
#
|
20
|
-
# It also:
|
21
|
-
#
|
22
|
-
# - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
|
23
|
-
#
|
24
26
|
# YAML usage example:
|
25
27
|
#
|
26
28
|
# selectors:
|
27
29
|
# description:
|
28
|
-
# selector: section
|
30
|
+
# selector: '.section'
|
29
31
|
# extractor: html
|
30
32
|
# post_process:
|
31
33
|
# name: sanitize_html
|
@@ -2,9 +2,15 @@ module Html2rss
|
|
2
2
|
module AttributePostProcessors
|
3
3
|
## Returns a defined part of a String.
|
4
4
|
#
|
5
|
+
# Both parameters must be an Integer and they can be negative.
|
5
6
|
# The +end+ parameter can be omitted, in that case it will not cut the
|
6
7
|
# String at the end.
|
7
8
|
#
|
9
|
+
# A Regexp or a MatchString is not supported.
|
10
|
+
#
|
11
|
+
# See the [`String#[]`](https://ruby-doc.org/core/String.html#method-i-5B-5D)
|
12
|
+
# documentation for more information.
|
13
|
+
#
|
8
14
|
# Imagine this HTML:
|
9
15
|
# <h1>Foo bar and baz</h1>
|
10
16
|
#
|
@@ -13,9 +19,9 @@ module Html2rss
|
|
13
19
|
# title:
|
14
20
|
# selector: h1
|
15
21
|
# post_process:
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
22
|
+
# name: substring
|
23
|
+
# start: 4
|
24
|
+
# end: 6
|
19
25
|
#
|
20
26
|
# Would return:
|
21
27
|
# 'bar'
|
@@ -4,7 +4,8 @@ module Html2rss
|
|
4
4
|
module AttributePostProcessors
|
5
5
|
## Returns a formatted String according to the string pattern.
|
6
6
|
#
|
7
|
-
# If +self+ is
|
7
|
+
# If +self+ is used, the selector's extracted value will be used.
|
8
|
+
# It uses [Kernel#format](https://ruby-doc.org/core/Kernel.html#method-i-format)
|
8
9
|
#
|
9
10
|
# Imagine this HTML:
|
10
11
|
# <li>
|
@@ -22,11 +23,8 @@ module Html2rss
|
|
22
23
|
# title:
|
23
24
|
# selector: h1
|
24
25
|
# post_process:
|
25
|
-
#
|
26
|
-
#
|
27
|
-
# methods:
|
28
|
-
# - self
|
29
|
-
# - price
|
26
|
+
# name: template
|
27
|
+
# string: '%{self} (%{price})'
|
30
28
|
#
|
31
29
|
# Would return:
|
32
30
|
# 'Product (23,42€)'
|
@@ -38,10 +36,16 @@ module Html2rss
|
|
38
36
|
end
|
39
37
|
|
40
38
|
##
|
41
|
-
# - uses {http://ruby-doc.org/core-2.6.3/String.html#method-i-25 String#%}
|
42
39
|
# @return [String]
|
43
40
|
def get
|
44
|
-
|
41
|
+
if @options['methods']
|
42
|
+
string % methods
|
43
|
+
else
|
44
|
+
names = string.scan(/%[<|{](\w*)[>|}]/).flatten
|
45
|
+
names.uniq!
|
46
|
+
|
47
|
+
format(string, names.map { |name| [name.to_sym, item_value(name)] }.to_h)
|
48
|
+
end
|
45
49
|
end
|
46
50
|
|
47
51
|
private
|
@@ -51,9 +55,11 @@ module Html2rss
|
|
51
55
|
end
|
52
56
|
|
53
57
|
def methods
|
54
|
-
@methods ||= @options['methods'].map
|
55
|
-
|
56
|
-
|
58
|
+
@methods ||= @options['methods'].map(&method(:item_value))
|
59
|
+
end
|
60
|
+
|
61
|
+
def item_value(method_name)
|
62
|
+
method_name.to_s == 'self' ? @value.to_s : @item.public_send(method_name.to_sym).to_s
|
57
63
|
end
|
58
64
|
end
|
59
65
|
end
|
@@ -2,13 +2,11 @@ module Html2rss
|
|
2
2
|
##
|
3
3
|
# Provides a namespace for item extractors.
|
4
4
|
module ItemExtractors
|
5
|
-
DEFAULT = '
|
5
|
+
DEFAULT = 'text'.freeze
|
6
6
|
|
7
7
|
def self.get_extractor(name)
|
8
8
|
@get_extractor ||= Hash.new do |extractors, key|
|
9
|
-
|
10
|
-
class_name = ['Html2rss', 'ItemExtractors', camel_cased_name].join('::')
|
11
|
-
extractors[key] = Object.const_get(class_name)
|
9
|
+
extractors[key] = Utils.get_class_from_name(key || DEFAULT, 'ItemExtractors')
|
12
10
|
end
|
13
11
|
|
14
12
|
@get_extractor[name]
|
data/lib/html2rss/utils.rb
CHANGED
@@ -30,5 +30,11 @@ module Html2rss
|
|
30
30
|
def self.hash_to_xml(hash)
|
31
31
|
hash.to_xml(skip_instruct: true, skip_types: true)
|
32
32
|
end
|
33
|
+
|
34
|
+
def self.get_class_from_name(snake_cased_name, module_name)
|
35
|
+
camel_cased_name = snake_cased_name.split('_').map(&:capitalize).join
|
36
|
+
class_name = ['Html2rss', module_name, camel_cased_name].join('::')
|
37
|
+
Object.const_get(class_name)
|
38
|
+
end
|
33
39
|
end
|
34
40
|
end
|
data/lib/html2rss/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-11-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '3.6'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: kramdown
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: mime-types
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -321,6 +335,7 @@ files:
|
|
321
335
|
- lib/html2rss/attribute_post_processors.rb
|
322
336
|
- lib/html2rss/attribute_post_processors/gsub.rb
|
323
337
|
- lib/html2rss/attribute_post_processors/html_to_markdown.rb
|
338
|
+
- lib/html2rss/attribute_post_processors/markdown_to_html.rb
|
324
339
|
- lib/html2rss/attribute_post_processors/parse_time.rb
|
325
340
|
- lib/html2rss/attribute_post_processors/parse_uri.rb
|
326
341
|
- lib/html2rss/attribute_post_processors/sanitize_html.rb
|