html2rss 0.8.0 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -0
- data/CHANGELOG.md +13 -1
- data/Gemfile.lock +14 -17
- data/README.md +28 -29
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +1 -4
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +2 -2
- data/lib/html2rss/config.rb +11 -3
- data/lib/html2rss/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 134aacbabf1b61aadcdb6d96757d329bde20429027f97c2c3bbd0328bb02cd7b
|
|
4
|
+
data.tar.gz: 2e6ef3c8a38df3e54983670b17d574bfe771069f662414e58a5c0572ab922b1f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4e3b80225d14b44820f742f484a418e2d0b90060f7e329f2c89a2786795a1d87bf372f194e873d4a109d8c3649399e0716a41f65479c1b60ef9b8202154a8e48
|
|
7
|
+
data.tar.gz: ed5c095e4457d11208fa67634e526f1345c64f9d48d48df3eb11b8a5ebeb9cb203a62ae329f5388491d57435b1326bf25771349e3a374dc424094d3da60822e0
|
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
|
@@ -1,4 +1,16 @@
|
|
|
1
|
-
# [](https://github.com/gildesmarais/html2rss/compare/v0.8.
|
|
1
|
+
# [](https://github.com/gildesmarais/html2rss/compare/v0.8.1...v) (2019-11-08)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
## [0.8.1](https://github.com/gildesmarais/html2rss/compare/v0.8.0...v0.8.1) (2019-11-08)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
### Features
|
|
9
|
+
|
|
10
|
+
* auto generate nicer channel's title and description ([#63](https://github.com/gildesmarais/html2rss/issues/63)) ([6db28f6](https://github.com/gildesmarais/html2rss/commit/6db28f6))
|
|
11
|
+
* change default ttl to 360 ([#65](https://github.com/gildesmarais/html2rss/issues/65)) ([605c8db](https://github.com/gildesmarais/html2rss/commit/605c8db))
|
|
12
|
+
* **config:** improve generation of channel.title from channel.url ([#68](https://github.com/gildesmarais/html2rss/issues/68)) ([bc8ecbb](https://github.com/gildesmarais/html2rss/commit/bc8ecbb))
|
|
13
|
+
* **parse_uri:** squish url to not fail on url with padding spaces ([#67](https://github.com/gildesmarais/html2rss/issues/67)) ([e349449](https://github.com/gildesmarais/html2rss/commit/e349449))
|
|
2
14
|
|
|
3
15
|
|
|
4
16
|
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html2rss (0.8.0)
|
|
4
|
+
html2rss (0.8.1)
|
|
5
5
|
activesupport (~> 5.0)
|
|
6
6
|
builder
|
|
7
7
|
faraday (~> 0.15)
|
|
@@ -27,12 +27,12 @@ GEM
|
|
|
27
27
|
builder (3.2.3)
|
|
28
28
|
byebug (11.0.1)
|
|
29
29
|
concurrent-ruby (1.1.5)
|
|
30
|
-
coveralls (0.
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
30
|
+
coveralls (0.8.23)
|
|
31
|
+
json (>= 1.8, < 3)
|
|
32
|
+
simplecov (~> 0.16.1)
|
|
33
|
+
term-ansicolor (~> 1.3)
|
|
34
|
+
thor (>= 0.19.4, < 2.0)
|
|
35
|
+
tins (~> 1.6)
|
|
36
36
|
crass (1.0.5)
|
|
37
37
|
diff-lcs (1.3)
|
|
38
38
|
docile (1.3.2)
|
|
@@ -43,7 +43,7 @@ GEM
|
|
|
43
43
|
hashie (3.6.0)
|
|
44
44
|
i18n (1.7.0)
|
|
45
45
|
concurrent-ruby (~> 1.0)
|
|
46
|
-
jaro_winkler (1.5.
|
|
46
|
+
jaro_winkler (1.5.4)
|
|
47
47
|
json (2.2.0)
|
|
48
48
|
kramdown (2.1.0)
|
|
49
49
|
mime-types (3.3)
|
|
@@ -51,7 +51,6 @@ GEM
|
|
|
51
51
|
mime-types-data (3.2019.1009)
|
|
52
52
|
mini_portile2 (2.4.0)
|
|
53
53
|
minitest (5.13.0)
|
|
54
|
-
multi_json (1.14.1)
|
|
55
54
|
multipart-post (2.1.1)
|
|
56
55
|
nokogiri (1.10.5)
|
|
57
56
|
mini_portile2 (~> 2.4.0)
|
|
@@ -61,8 +60,6 @@ GEM
|
|
|
61
60
|
parser (2.6.5.0)
|
|
62
61
|
ast (~> 2.4.0)
|
|
63
62
|
rainbow (3.0.0)
|
|
64
|
-
rest-client (1.6.7)
|
|
65
|
-
mime-types (>= 1.16)
|
|
66
63
|
reverse_markdown (1.3.0)
|
|
67
64
|
nokogiri
|
|
68
65
|
rspec (3.9.0)
|
|
@@ -78,7 +75,7 @@ GEM
|
|
|
78
75
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
79
76
|
rspec-support (~> 3.9.0)
|
|
80
77
|
rspec-support (3.9.0)
|
|
81
|
-
rubocop (0.
|
|
78
|
+
rubocop (0.76.0)
|
|
82
79
|
jaro_winkler (~> 1.5.1)
|
|
83
80
|
parallel (~> 1.10)
|
|
84
81
|
parser (>= 2.6)
|
|
@@ -94,16 +91,16 @@ GEM
|
|
|
94
91
|
crass (~> 1.0.2)
|
|
95
92
|
nokogiri (>= 1.8.0)
|
|
96
93
|
nokogumbo (~> 2.0)
|
|
97
|
-
simplecov (0.
|
|
94
|
+
simplecov (0.16.1)
|
|
98
95
|
docile (~> 1.1)
|
|
99
96
|
json (>= 1.8, < 3)
|
|
100
97
|
simplecov-html (~> 0.10.0)
|
|
101
98
|
simplecov-html (0.10.2)
|
|
102
|
-
term-ansicolor (1.
|
|
103
|
-
tins (~> 0
|
|
104
|
-
thor (0.
|
|
99
|
+
term-ansicolor (1.7.1)
|
|
100
|
+
tins (~> 1.0)
|
|
101
|
+
thor (0.20.3)
|
|
105
102
|
thread_safe (0.3.6)
|
|
106
|
-
tins (
|
|
103
|
+
tins (1.22.0)
|
|
107
104
|
to_regexp (0.2.1)
|
|
108
105
|
tzinfo (1.2.5)
|
|
109
106
|
thread_safe (~> 0.1)
|
data/README.md
CHANGED
|
@@ -15,7 +15,7 @@ With the _feed config_ containing the URL to scrape and
|
|
|
15
15
|
CSS selectors for information extraction (like title, URL, ...) your RSS builds.
|
|
16
16
|
[Extractors](#using-extractors) and chain-able [post processors](#using-post-processors)
|
|
17
17
|
make information extraction, processing and sanitizing a breeze.
|
|
18
|
-
[Scraping JSON](#scraping-json) responses and
|
|
18
|
+
[Scraping JSON](#scraping-and-handling-json-responses) responses and
|
|
19
19
|
[setting HTTP request headers](#set-any-http-header-in-the-request) is
|
|
20
20
|
supported, too.
|
|
21
21
|
|
|
@@ -36,10 +36,7 @@ require 'html2rss'
|
|
|
36
36
|
|
|
37
37
|
rss =
|
|
38
38
|
Html2rss.feed(
|
|
39
|
-
channel: {
|
|
40
|
-
title: 'StackOverflow: Hot Network Questions',
|
|
41
|
-
url: 'https://stackoverflow.com/questions'
|
|
42
|
-
},
|
|
39
|
+
channel: { url: 'https://stackoverflow.com/questions' },
|
|
43
40
|
selectors: {
|
|
44
41
|
items: { selector: '#hot-network-questions > ul > li' },
|
|
45
42
|
title: { selector: 'a' },
|
|
@@ -57,13 +54,15 @@ The contents of both hashes are explained below.
|
|
|
57
54
|
|
|
58
55
|
### The `channel`
|
|
59
56
|
|
|
60
|
-
| attribute | | type | remark
|
|
61
|
-
| ------------- | -------- | ------- |
|
|
62
|
-
| `
|
|
63
|
-
| `
|
|
64
|
-
| `
|
|
65
|
-
| `
|
|
66
|
-
| `
|
|
57
|
+
| attribute | | type | default | remark |
|
|
58
|
+
| ------------- | -------- | ------- | -------------: | ------------------------------------------ |
|
|
59
|
+
| `url` | required | String | | |
|
|
60
|
+
| `title` | optional | String | auto-generated | |
|
|
61
|
+
| `description` | optional | String | auto-generated | |
|
|
62
|
+
| `ttl` | optional | Integer | `360` | TTL in _minutes_ |
|
|
63
|
+
| `time_zone` | optional | String | `'UTC'` | TimeZone name |
|
|
64
|
+
| `headers` | optional | Hash | `{}` | Set HTTP request headers. See notes below. |
|
|
65
|
+
| `json` | optional | Boolean | `false` | Handle JSON response. See notes below. |
|
|
67
66
|
|
|
68
67
|
### The `selectors`
|
|
69
68
|
|
|
@@ -78,18 +77,18 @@ each item has to have at least a `title` or a `description`.
|
|
|
78
77
|
Your `selectors` can contain arbitrary selector names, but only these
|
|
79
78
|
will make it into the RSS feed:
|
|
80
79
|
|
|
81
|
-
| RSS 2.0 tag | name in html2rss | remark |
|
|
82
|
-
| ------------- |
|
|
83
|
-
| `title` | `title`
|
|
84
|
-
| `description` | `description`
|
|
85
|
-
| `link` | `link`
|
|
86
|
-
| `author` | `author`
|
|
87
|
-
| `category` | `categories`
|
|
88
|
-
| `enclosure` | `enclosure`
|
|
89
|
-
| `pubDate` | `update`
|
|
90
|
-
| `guid` | `guid`
|
|
91
|
-
| `comments` | `comments`
|
|
92
|
-
| `source` | ~~source~~
|
|
80
|
+
| RSS 2.0 tag | name in `html2rss` | remark |
|
|
81
|
+
| ------------- | ------------------ | --------------------------- |
|
|
82
|
+
| `title` | `title` | |
|
|
83
|
+
| `description` | `description` | Supports HTML. |
|
|
84
|
+
| `link` | `link` | A URL. |
|
|
85
|
+
| `author` | `author` | |
|
|
86
|
+
| `category` | `categories` | See notes below. |
|
|
87
|
+
| `enclosure` | `enclosure` | See notes below. |
|
|
88
|
+
| `pubDate` | `update` | An instance of `Time`. |
|
|
89
|
+
| `guid` | `guid` | Generated from the `title`. |
|
|
90
|
+
| `comments` | `comments` | A URL. |
|
|
91
|
+
| `source` | ~~source~~ | Not yet supported. |
|
|
93
92
|
|
|
94
93
|
### The `selector` hash
|
|
95
94
|
|
|
@@ -225,7 +224,7 @@ Note the use of `|` for a multi-line String in YAML.
|
|
|
225
224
|
|
|
226
225
|
## Adding `<category>` tags to an item
|
|
227
226
|
|
|
228
|
-
The `categories` selector takes an array of selector names.
|
|
227
|
+
The `categories` selector takes an array of selector names. Each value of those
|
|
229
228
|
selectors will become a `<category>` on the RSS item.
|
|
230
229
|
|
|
231
230
|
<details>
|
|
@@ -268,11 +267,11 @@ selectors:
|
|
|
268
267
|
|
|
269
268
|
## Adding an `<enclosure>` tag to an item
|
|
270
269
|
|
|
271
|
-
An enclosure can be
|
|
270
|
+
An enclosure can be any file, e.g. a image, audio or video.
|
|
272
271
|
|
|
273
272
|
The `enclosure` selector needs to return a URL of the content to enclose. If the extracted URL is relative, it will be converted to an absolute one using the channel's URL as base.
|
|
274
273
|
|
|
275
|
-
Since html2rss does no further inspection of the enclosure, its support comes with trade-offs:
|
|
274
|
+
Since `html2rss` does no further inspection of the enclosure, its support comes with trade-offs:
|
|
276
275
|
|
|
277
276
|
1. The content-type is guessed from the file extension of the URL.
|
|
278
277
|
2. If the content-type guessing fails, it will default to `application/octet-stream`.
|
|
@@ -310,7 +309,7 @@ selectors:
|
|
|
310
309
|
|
|
311
310
|
</details>
|
|
312
311
|
|
|
313
|
-
## Scraping JSON
|
|
312
|
+
## Scraping and handling JSON responses
|
|
314
313
|
|
|
315
314
|
Although this gem is called **html***2rss*, it's possible to scrape and process JSON.
|
|
316
315
|
|
|
@@ -485,7 +484,7 @@ Find a full example of a `config.yml` at [`spec/config.test.yml`](https://github
|
|
|
485
484
|
## Gotchas and tips & tricks
|
|
486
485
|
|
|
487
486
|
- Check that the channel URL does not redirect to a mobile page with a different markup structure.
|
|
488
|
-
- Do not rely on your web browser's developer console. html2rss does not execute JavaScript.
|
|
487
|
+
- Do not rely on your web browser's developer console. `html2rss` does not execute JavaScript.
|
|
489
488
|
- Fiddling with [`curl`](https://github.com/curl/curl) and [`pup`](https://github.com/ericchiang/pup) to find the selectors seems efficient (`curl URL | pup`).
|
|
490
489
|
- [CSS selectors are quite versatile, here's an overview.](https://www.w3.org/TR/selectors-4/#overview)
|
|
491
490
|
|
|
data/lib/html2rss/attribute_post_processors/parse_uri.rb
CHANGED
|
@@ -5,7 +5,7 @@ module Html2rss
|
|
|
5
5
|
#
|
|
6
6
|
# Imagine this HTML structure:
|
|
7
7
|
#
|
|
8
|
-
# <span>http://why-not-use-a-link.uh</span>
|
|
8
|
+
# <span>http://why-not-use-a-link.uh </span>
|
|
9
9
|
#
|
|
10
10
|
# YAML usage example:
|
|
11
11
|
#
|
|
@@ -26,7 +26,7 @@ module Html2rss
|
|
|
26
26
|
##
|
|
27
27
|
# @return [String]
|
|
28
28
|
def get
|
|
29
|
-
URI(@value).to_s
|
|
29
|
+
URI(@value.to_s.split(' ').join).to_s
|
|
30
30
|
end
|
|
31
31
|
end
|
|
32
32
|
end
|
data/lib/html2rss/config.rb
CHANGED
|
@@ -14,11 +14,19 @@ module Html2rss
|
|
|
14
14
|
end
|
|
15
15
|
|
|
16
16
|
def ttl
|
|
17
|
-
channel_config.fetch 'ttl',
|
|
17
|
+
channel_config.fetch 'ttl', 360
|
|
18
18
|
end
|
|
19
19
|
|
|
20
20
|
def title
|
|
21
|
-
channel_config.fetch 'title'
|
|
21
|
+
channel_config.fetch 'title' do
|
|
22
|
+
uri = URI(url)
|
|
23
|
+
|
|
24
|
+
nicer_path = uri.path.split('/')
|
|
25
|
+
nicer_path.reject! { |p| p == '' }
|
|
26
|
+
nicer_path.map!(&:titleize)
|
|
27
|
+
|
|
28
|
+
nicer_path.any? ? "#{uri.host}: #{nicer_path.join(' ')}" : uri.host
|
|
29
|
+
end
|
|
22
30
|
end
|
|
23
31
|
|
|
24
32
|
def language
|
|
@@ -26,7 +34,7 @@ module Html2rss
|
|
|
26
34
|
end
|
|
27
35
|
|
|
28
36
|
def description
|
|
29
|
-
channel_config.fetch 'description',
|
|
37
|
+
channel_config.fetch 'description', "Latest items from #{url}."
|
|
30
38
|
end
|
|
31
39
|
|
|
32
40
|
def url
|
data/lib/html2rss/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html2rss
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.8.0
|
|
4
|
+
version: 0.8.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gil Desmarais
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2019-11-
|
|
11
|
+
date: 2019-11-08 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: activesupport
|