html2rss 0.8.0 → 0.8.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -0
- data/CHANGELOG.md +13 -1
- data/Gemfile.lock +14 -17
- data/README.md +28 -29
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +1 -4
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +2 -2
- data/lib/html2rss/config.rb +11 -3
- data/lib/html2rss/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 134aacbabf1b61aadcdb6d96757d329bde20429027f97c2c3bbd0328bb02cd7b
|
4
|
+
data.tar.gz: 2e6ef3c8a38df3e54983670b17d574bfe771069f662414e58a5c0572ab922b1f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4e3b80225d14b44820f742f484a418e2d0b90060f7e329f2c89a2786795a1d87bf372f194e873d4a109d8c3649399e0716a41f65479c1b60ef9b8202154a8e48
|
7
|
+
data.tar.gz: ed5c095e4457d11208fa67634e526f1345c64f9d48d48df3eb11b8a5ebeb9cb203a62ae329f5388491d57435b1326bf25771349e3a374dc424094d3da60822e0
|
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,4 +1,16 @@
|
|
1
|
-
# [](https://github.com/gildesmarais/html2rss/compare/v0.8.
|
1
|
+
# [](https://github.com/gildesmarais/html2rss/compare/v0.8.1...v) (2019-11-08)
|
2
|
+
|
3
|
+
|
4
|
+
|
5
|
+
## [0.8.1](https://github.com/gildesmarais/html2rss/compare/v0.8.0...v0.8.1) (2019-11-08)
|
6
|
+
|
7
|
+
|
8
|
+
### Features
|
9
|
+
|
10
|
+
* auto generate nicer channel's title and description ([#63](https://github.com/gildesmarais/html2rss/issues/63)) ([6db28f6](https://github.com/gildesmarais/html2rss/commit/6db28f6))
|
11
|
+
* change default ttl to 360 ([#65](https://github.com/gildesmarais/html2rss/issues/65)) ([605c8db](https://github.com/gildesmarais/html2rss/commit/605c8db))
|
12
|
+
* **config:** improve generation of channel.title from channel.url ([#68](https://github.com/gildesmarais/html2rss/issues/68)) ([bc8ecbb](https://github.com/gildesmarais/html2rss/commit/bc8ecbb))
|
13
|
+
* **parse_uri:** squish url to not fail on url with padding spaces ([#67](https://github.com/gildesmarais/html2rss/issues/67)) ([e349449](https://github.com/gildesmarais/html2rss/commit/e349449))
|
2
14
|
|
3
15
|
|
4
16
|
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
html2rss (0.8.0)
|
4
|
+
html2rss (0.8.1)
|
5
5
|
activesupport (~> 5.0)
|
6
6
|
builder
|
7
7
|
faraday (~> 0.15)
|
@@ -27,12 +27,12 @@ GEM
|
|
27
27
|
builder (3.2.3)
|
28
28
|
byebug (11.0.1)
|
29
29
|
concurrent-ruby (1.1.5)
|
30
|
-
coveralls (0.
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
30
|
+
coveralls (0.8.23)
|
31
|
+
json (>= 1.8, < 3)
|
32
|
+
simplecov (~> 0.16.1)
|
33
|
+
term-ansicolor (~> 1.3)
|
34
|
+
thor (>= 0.19.4, < 2.0)
|
35
|
+
tins (~> 1.6)
|
36
36
|
crass (1.0.5)
|
37
37
|
diff-lcs (1.3)
|
38
38
|
docile (1.3.2)
|
@@ -43,7 +43,7 @@ GEM
|
|
43
43
|
hashie (3.6.0)
|
44
44
|
i18n (1.7.0)
|
45
45
|
concurrent-ruby (~> 1.0)
|
46
|
-
jaro_winkler (1.5.
|
46
|
+
jaro_winkler (1.5.4)
|
47
47
|
json (2.2.0)
|
48
48
|
kramdown (2.1.0)
|
49
49
|
mime-types (3.3)
|
@@ -51,7 +51,6 @@ GEM
|
|
51
51
|
mime-types-data (3.2019.1009)
|
52
52
|
mini_portile2 (2.4.0)
|
53
53
|
minitest (5.13.0)
|
54
|
-
multi_json (1.14.1)
|
55
54
|
multipart-post (2.1.1)
|
56
55
|
nokogiri (1.10.5)
|
57
56
|
mini_portile2 (~> 2.4.0)
|
@@ -61,8 +60,6 @@ GEM
|
|
61
60
|
parser (2.6.5.0)
|
62
61
|
ast (~> 2.4.0)
|
63
62
|
rainbow (3.0.0)
|
64
|
-
rest-client (1.6.7)
|
65
|
-
mime-types (>= 1.16)
|
66
63
|
reverse_markdown (1.3.0)
|
67
64
|
nokogiri
|
68
65
|
rspec (3.9.0)
|
@@ -78,7 +75,7 @@ GEM
|
|
78
75
|
diff-lcs (>= 1.2.0, < 2.0)
|
79
76
|
rspec-support (~> 3.9.0)
|
80
77
|
rspec-support (3.9.0)
|
81
|
-
rubocop (0.
|
78
|
+
rubocop (0.76.0)
|
82
79
|
jaro_winkler (~> 1.5.1)
|
83
80
|
parallel (~> 1.10)
|
84
81
|
parser (>= 2.6)
|
@@ -94,16 +91,16 @@ GEM
|
|
94
91
|
crass (~> 1.0.2)
|
95
92
|
nokogiri (>= 1.8.0)
|
96
93
|
nokogumbo (~> 2.0)
|
97
|
-
simplecov (0.
|
94
|
+
simplecov (0.16.1)
|
98
95
|
docile (~> 1.1)
|
99
96
|
json (>= 1.8, < 3)
|
100
97
|
simplecov-html (~> 0.10.0)
|
101
98
|
simplecov-html (0.10.2)
|
102
|
-
term-ansicolor (1.
|
103
|
-
tins (~> 0
|
104
|
-
thor (0.
|
99
|
+
term-ansicolor (1.7.1)
|
100
|
+
tins (~> 1.0)
|
101
|
+
thor (0.20.3)
|
105
102
|
thread_safe (0.3.6)
|
106
|
-
tins (
|
103
|
+
tins (1.22.0)
|
107
104
|
to_regexp (0.2.1)
|
108
105
|
tzinfo (1.2.5)
|
109
106
|
thread_safe (~> 0.1)
|
data/README.md
CHANGED
@@ -15,7 +15,7 @@ With the _feed config_ containing the URL to scrape and
|
|
15
15
|
CSS selectors for information extraction (like title, URL, ...) your RSS builds.
|
16
16
|
[Extractors](#using-extractors) and chain-able [post processors](#using-post-processors)
|
17
17
|
make information extraction, processing and sanitizing a breeze.
|
18
|
-
[Scraping JSON](#scraping-json) responses and
|
18
|
+
[Scraping JSON](#scraping-and-handling-json-responses) responses and
|
19
19
|
[setting HTTP request headers](#set-any-http-header-in-the-request) is
|
20
20
|
supported, too.
|
21
21
|
|
@@ -36,10 +36,7 @@ require 'html2rss'
|
|
36
36
|
|
37
37
|
rss =
|
38
38
|
Html2rss.feed(
|
39
|
-
channel: {
|
40
|
-
title: 'StackOverflow: Hot Network Questions',
|
41
|
-
url: 'https://stackoverflow.com/questions'
|
42
|
-
},
|
39
|
+
channel: { url: 'https://stackoverflow.com/questions' },
|
43
40
|
selectors: {
|
44
41
|
items: { selector: '#hot-network-questions > ul > li' },
|
45
42
|
title: { selector: 'a' },
|
@@ -57,13 +54,15 @@ The contents of both hashes are explained below.
|
|
57
54
|
|
58
55
|
### The `channel`
|
59
56
|
|
60
|
-
| attribute | | type | remark
|
61
|
-
| ------------- | -------- | ------- |
|
62
|
-
| `
|
63
|
-
| `
|
64
|
-
| `
|
65
|
-
| `
|
66
|
-
| `
|
57
|
+
| attribute | | type | default | remark |
|
58
|
+
| ------------- | -------- | ------- | -------------: | ------------------------------------------ |
|
59
|
+
| `url` | required | String | | |
|
60
|
+
| `title` | optional | String | auto-generated | |
|
61
|
+
| `description` | optional | String | auto-generated | |
|
62
|
+
| `ttl` | optional | Integer | `360` | TTL in _minutes_ |
|
63
|
+
| `time_zone` | optional | String | `'UTC'` | TimeZone name |
|
64
|
+
| `headers` | optional | Hash | `{}` | Set HTTP request headers. See notes below. |
|
65
|
+
| `json` | optional | Boolean | `false` | Handle JSON response. See notes below. |
|
67
66
|
|
68
67
|
### The `selectors`
|
69
68
|
|
@@ -78,18 +77,18 @@ each item has to have at least a `title` or a `description`.
|
|
78
77
|
Your `selectors` can contain arbitrary selector names, but only these
|
79
78
|
will make it into the RSS feed:
|
80
79
|
|
81
|
-
| RSS 2.0 tag | name in html2rss | remark |
|
82
|
-
| ------------- |
|
83
|
-
| `title` | `title`
|
84
|
-
| `description` | `description`
|
85
|
-
| `link` | `link`
|
86
|
-
| `author` | `author`
|
87
|
-
| `category` | `categories`
|
88
|
-
| `enclosure` | `enclosure`
|
89
|
-
| `pubDate` | `update`
|
90
|
-
| `guid` | `guid`
|
91
|
-
| `comments` | `comments`
|
92
|
-
| `source` | ~~source~~
|
80
|
+
| RSS 2.0 tag | name in `html2rss` | remark |
|
81
|
+
| ------------- | ------------------ | --------------------------- |
|
82
|
+
| `title` | `title` | |
|
83
|
+
| `description` | `description` | Supports HTML. |
|
84
|
+
| `link` | `link` | A URL. |
|
85
|
+
| `author` | `author` | |
|
86
|
+
| `category` | `categories` | See notes below. |
|
87
|
+
| `enclosure` | `enclosure` | See notes below. |
|
88
|
+
| `pubDate` | `update` | An instance of `Time`. |
|
89
|
+
| `guid` | `guid` | Generated from the `title`. |
|
90
|
+
| `comments` | `comments` | A URL. |
|
91
|
+
| `source` | ~~source~~ | Not yet supported. |
|
93
92
|
|
94
93
|
### The `selector` hash
|
95
94
|
|
@@ -225,7 +224,7 @@ Note the use of `|` for a multi-line String in YAML.
|
|
225
224
|
|
226
225
|
## Adding `<category>` tags to an item
|
227
226
|
|
228
|
-
The `categories` selector takes an array of selector names.
|
227
|
+
The `categories` selector takes an array of selector names. Each value of those
|
229
228
|
selectors will become a `<category>` on the RSS item.
|
230
229
|
|
231
230
|
<details>
|
@@ -268,11 +267,11 @@ selectors:
|
|
268
267
|
|
269
268
|
## Adding an `<enclosure>` tag to an item
|
270
269
|
|
271
|
-
An enclosure can be
|
270
|
+
An enclosure can be any file, e.g. an image, audio or video.
|
272
271
|
|
273
272
|
The `enclosure` selector needs to return a URL of the content to enclose. If the extracted URL is relative, it will be converted to an absolute one using the channel's URL as base.
|
274
273
|
|
275
|
-
Since html2rss does no further inspection of the enclosure, its support comes with trade-offs:
|
274
|
+
Since `html2rss` does no further inspection of the enclosure, its support comes with trade-offs:
|
276
275
|
|
277
276
|
1. The content-type is guessed from the file extension of the URL.
|
278
277
|
2. If the content-type guessing fails, it will default to `application/octet-stream`.
|
@@ -310,7 +309,7 @@ selectors:
|
|
310
309
|
|
311
310
|
</details>
|
312
311
|
|
313
|
-
## Scraping JSON
|
312
|
+
## Scraping and handling JSON responses
|
314
313
|
|
315
314
|
Although this gem is called **html***2rss*, it's possible to scrape and process JSON.
|
316
315
|
|
@@ -485,7 +484,7 @@ Find a full example of a `config.yml` at [`spec/config.test.yml`](https://github
|
|
485
484
|
## Gotchas and tips & tricks
|
486
485
|
|
487
486
|
- Check that the channel URL does not redirect to a mobile page with a different markup structure.
|
488
|
-
- Do not rely on your web browser's developer console. html2rss does not execute JavaScript.
|
487
|
+
- Do not rely on your web browser's developer console. `html2rss` does not execute JavaScript.
|
489
488
|
- Fiddling with [`curl`](https://github.com/curl/curl) and [`pup`](https://github.com/ericchiang/pup) to find the selectors seems efficient (`curl URL | pup`).
|
490
489
|
- [CSS selectors are quite versatile, here's an overview.](https://www.w3.org/TR/selectors-4/#overview)
|
491
490
|
|
@@ -5,7 +5,7 @@ module Html2rss
|
|
5
5
|
#
|
6
6
|
# Imagine this HTML structure:
|
7
7
|
#
|
8
|
-
# <span>http://why-not-use-a-link.uh</span>
|
8
|
+
# <span>http://why-not-use-a-link.uh </span>
|
9
9
|
#
|
10
10
|
# YAML usage example:
|
11
11
|
#
|
@@ -26,7 +26,7 @@ module Html2rss
|
|
26
26
|
##
|
27
27
|
# @return [String]
|
28
28
|
def get
|
29
|
-
URI(@value).to_s
|
29
|
+
URI(@value.to_s.split(' ').join).to_s
|
30
30
|
end
|
31
31
|
end
|
32
32
|
end
|
data/lib/html2rss/config.rb
CHANGED
@@ -14,11 +14,19 @@ module Html2rss
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def ttl
|
17
|
-
channel_config.fetch 'ttl',
|
17
|
+
channel_config.fetch 'ttl', 360
|
18
18
|
end
|
19
19
|
|
20
20
|
def title
|
21
|
-
channel_config.fetch 'title'
|
21
|
+
channel_config.fetch 'title' do
|
22
|
+
uri = URI(url)
|
23
|
+
|
24
|
+
nicer_path = uri.path.split('/')
|
25
|
+
nicer_path.reject! { |p| p == '' }
|
26
|
+
nicer_path.map!(&:titleize)
|
27
|
+
|
28
|
+
nicer_path.any? ? "#{uri.host}: #{nicer_path.join(' ')}" : uri.host
|
29
|
+
end
|
22
30
|
end
|
23
31
|
|
24
32
|
def language
|
@@ -26,7 +34,7 @@ module Html2rss
|
|
26
34
|
end
|
27
35
|
|
28
36
|
def description
|
29
|
-
channel_config.fetch 'description',
|
37
|
+
channel_config.fetch 'description', "Latest items from #{url}."
|
30
38
|
end
|
31
39
|
|
32
40
|
def url
|
data/lib/html2rss/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.0
|
4
|
+
version: 0.8.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-11-
|
11
|
+
date: 2019-11-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|