html2rss 0.5.2 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -1
- data/.travis.yml +4 -3
- data/CHANGELOG.md +17 -1
- data/Gemfile.lock +12 -12
- data/README.md +30 -1
- data/html2rss.gemspec +2 -0
- data/lib/html2rss.rb +3 -0
- data/lib/html2rss/config.rb +4 -1
- data/lib/html2rss/feed_builder.rb +20 -17
- data/lib/html2rss/item.rb +3 -0
- data/lib/html2rss/item_extractors.rb +5 -4
- data/lib/html2rss/utils.rb +2 -0
- data/lib/html2rss/version.rb +1 -1
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fdb35d5375dda349c92fb7ccaed71747de5e49b2adfb3a1cfca9a9c251aa86a9
|
4
|
+
data.tar.gz: 6532e1df3c67108c6ae8bf7eaef7a6d5eb228c4d3020f99aa02f8a9d507a625e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 86fbfe46d9ec7b0f21bc0e756da94ed96ecfa635f858336424034875a646f81c08e64610bf8b478d9e0212ea272ea06ee13d6bf33c32cf01dd25363e4ba44868
|
7
|
+
data.tar.gz: dfad9fc0ba59cc8ecd323299b330c5cc6831336da7fd913f4f72a52ee807861d4b604927e73477f33e353314f2107477457cfa4f320024cd63c19e4b0a577cd5
|
data/.rubocop.yml
CHANGED
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,4 +1,20 @@
|
|
1
|
-
# [](https://github.com/gildesmarais/html2rss/compare/v0.
|
1
|
+
# [](https://github.com/gildesmarais/html2rss/compare/v0.6.0...v) (2019-10-05)
|
2
|
+
|
3
|
+
|
4
|
+
|
5
|
+
# [0.6.0](https://github.com/gildesmarais/html2rss/compare/v0.5.2...v0.6.0) (2019-10-05)
|
6
|
+
|
7
|
+
|
8
|
+
### Bug Fixes
|
9
|
+
|
10
|
+
* **specs:** simplecov does not exclude files from spec/ ([#44](https://github.com/gildesmarais/html2rss/issues/44)) ([b0ca780](https://github.com/gildesmarais/html2rss/commit/b0ca780))
|
11
|
+
|
12
|
+
|
13
|
+
### Features
|
14
|
+
|
15
|
+
* **ci:** run rubocop on ci ([#40](https://github.com/gildesmarais/html2rss/issues/40)) ([f4ec8d1](https://github.com/gildesmarais/html2rss/commit/f4ec8d1))
|
16
|
+
* memoize ItemExtractor lookups ([#45](https://github.com/gildesmarais/html2rss/issues/45)) ([e88321c](https://github.com/gildesmarais/html2rss/commit/e88321c))
|
17
|
+
* support setting of request headers in feed config ([#41](https://github.com/gildesmarais/html2rss/issues/41)) ([a7aca11](https://github.com/gildesmarais/html2rss/commit/a7aca11)), closes [#38](https://github.com/gildesmarais/html2rss/issues/38)
|
2
18
|
|
3
19
|
|
4
20
|
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
html2rss (0.5.2)
|
4
|
+
html2rss (0.6.0)
|
5
5
|
activesupport (~> 5.0)
|
6
6
|
builder
|
7
7
|
faraday (~> 0.15)
|
@@ -26,24 +26,24 @@ GEM
|
|
26
26
|
crass (1.0.4)
|
27
27
|
diff-lcs (1.3)
|
28
28
|
docile (1.3.2)
|
29
|
-
faraday (0.
|
29
|
+
faraday (0.16.2)
|
30
30
|
multipart-post (>= 1.2, < 3)
|
31
31
|
faraday_middleware (0.13.1)
|
32
32
|
faraday (>= 0.7.4, < 1.0)
|
33
33
|
hashie (3.6.0)
|
34
|
-
i18n (1.
|
34
|
+
i18n (1.7.0)
|
35
35
|
concurrent-ruby (~> 1.0)
|
36
36
|
jaro_winkler (1.5.3)
|
37
37
|
json (2.2.0)
|
38
38
|
mini_portile2 (2.4.0)
|
39
|
-
minitest (5.
|
39
|
+
minitest (5.12.2)
|
40
40
|
multipart-post (2.1.1)
|
41
41
|
nokogiri (1.10.4)
|
42
42
|
mini_portile2 (~> 2.4.0)
|
43
43
|
nokogumbo (2.0.1)
|
44
44
|
nokogiri (~> 1.8, >= 1.8.4)
|
45
45
|
parallel (1.17.0)
|
46
|
-
parser (2.6.
|
46
|
+
parser (2.6.5.0)
|
47
47
|
ast (~> 2.4.0)
|
48
48
|
rainbow (3.0.0)
|
49
49
|
reverse_markdown (1.3.0)
|
@@ -54,24 +54,24 @@ GEM
|
|
54
54
|
rspec-mocks (~> 3.8.0)
|
55
55
|
rspec-core (3.8.2)
|
56
56
|
rspec-support (~> 3.8.0)
|
57
|
-
rspec-expectations (3.8.
|
57
|
+
rspec-expectations (3.8.5)
|
58
58
|
diff-lcs (>= 1.2.0, < 2.0)
|
59
59
|
rspec-support (~> 3.8.0)
|
60
|
-
rspec-mocks (3.8.
|
60
|
+
rspec-mocks (3.8.2)
|
61
61
|
diff-lcs (>= 1.2.0, < 2.0)
|
62
62
|
rspec-support (~> 3.8.0)
|
63
|
-
rspec-support (3.8.
|
64
|
-
rubocop (0.
|
63
|
+
rspec-support (3.8.3)
|
64
|
+
rubocop (0.75.0)
|
65
65
|
jaro_winkler (~> 1.5.1)
|
66
66
|
parallel (~> 1.10)
|
67
67
|
parser (>= 2.6)
|
68
68
|
rainbow (>= 2.2.2, < 4.0)
|
69
69
|
ruby-progressbar (~> 1.7)
|
70
70
|
unicode-display_width (>= 1.4.0, < 1.7)
|
71
|
-
rubocop-performance (1.
|
71
|
+
rubocop-performance (1.5.0)
|
72
72
|
rubocop (>= 0.71.0)
|
73
|
-
rubocop-rspec (1.
|
74
|
-
rubocop (>= 0.
|
73
|
+
rubocop-rspec (1.36.0)
|
74
|
+
rubocop (>= 0.68.1)
|
75
75
|
ruby-progressbar (1.10.1)
|
76
76
|
sanitize (5.1.0)
|
77
77
|
crass (~> 1.0.2)
|
data/README.md
CHANGED
@@ -48,7 +48,17 @@ Since 0.5.0 it is possible to scrape and process JSON.
|
|
48
48
|
|
49
49
|
Adding `json: true` to the channel config will convert the JSON response to XML.
|
50
50
|
|
51
|
-
|
51
|
+
Feed config:
|
52
|
+
|
53
|
+
```yaml
|
54
|
+
channel:
|
55
|
+
url: https://example.com
|
56
|
+
title: "Example with JSON"
|
57
|
+
json: true
|
58
|
+
# ...
|
59
|
+
```
|
60
|
+
|
61
|
+
Imagine this HTTP response:
|
52
62
|
|
53
63
|
```json
|
54
64
|
{
|
@@ -73,6 +83,25 @@ Your items selector would be `data > datum`, the item's link selector would be `
|
|
73
83
|
|
74
84
|
Under the hood it uses ActiveSupport's [`Hash.to_xml`](https://apidock.com/rails/Hash/to_xml) core extension for the JSON to XML conversion.
|
75
85
|
|
86
|
+
## Set any HTTP header in the request
|
87
|
+
|
88
|
+
You can add any HTTP headers to the request to the channel URL.
|
89
|
+
You can use this to e.g. have Cookie or Authorization information being sent or to overwrite the User-Agent.
|
90
|
+
|
91
|
+
```yaml
|
92
|
+
channel:
|
93
|
+
url: https://example.com
|
94
|
+
title: "Example with http headers"
|
95
|
+
headers:
|
96
|
+
"User-Agent": "html2rss-request"
|
97
|
+
"X-Something": "Foobar"
|
98
|
+
"Authorization": "Token deadbea7"
|
99
|
+
"Cookie": "monster=MeWantCookie"
|
100
|
+
# ...
|
101
|
+
```
|
102
|
+
|
103
|
+
The headers provided by the channel will be merged into the global headers.
|
104
|
+
|
76
105
|
## Development
|
77
106
|
|
78
107
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/html2rss.gemspec
CHANGED
@@ -12,9 +12,11 @@ Gem::Specification.new do |spec|
|
|
12
12
|
spec.description = 'Give the URL to scrape and some CSS selectors. Get a RSS::Rss instance in return.'
|
13
13
|
spec.homepage = 'https://github.com/gildesmarais/html2rss'
|
14
14
|
spec.license = 'MIT'
|
15
|
+
spec.required_ruby_version = '>= 2.4.0'
|
15
16
|
|
16
17
|
if spec.respond_to?(:metadata)
|
17
18
|
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
19
|
+
spec.metadata['changelog_uri'] = 'https://github.com/gildesmarais/html2rss/blob/master/CHANGELOG.md'
|
18
20
|
else
|
19
21
|
raise 'RubyGems 2.0 or newer is required to protect against ' \
|
20
22
|
'public gem pushes.'
|
data/lib/html2rss.rb
CHANGED
@@ -4,6 +4,9 @@ require 'html2rss/version'
|
|
4
4
|
require 'html2rss/utils'
|
5
5
|
require 'yaml'
|
6
6
|
|
7
|
+
##
|
8
|
+
# The Html2rss namespace.
|
9
|
+
# Request HTML from an URL and transform it to a RSS 2.0 object.
|
7
10
|
module Html2rss
|
8
11
|
##
|
9
12
|
# Returns a RSS object which is generated from the provided file.
|
data/lib/html2rss/config.rb
CHANGED
@@ -1,4 +1,7 @@
|
|
1
1
|
module Html2rss
|
2
|
+
##
|
3
|
+
# The Config class abstracts from the config data structure and
|
4
|
+
# provides default values.
|
2
5
|
class Config
|
3
6
|
def initialize(feed_config, global_config = {})
|
4
7
|
@global_config = Utils::IndifferentAccessHash.new global_config
|
@@ -40,7 +43,7 @@ module Html2rss
|
|
40
43
|
end
|
41
44
|
|
42
45
|
def headers
|
43
|
-
global_config.fetch('headers', {})
|
46
|
+
global_config.fetch('headers', {}).merge(channel_config.fetch('headers', {}))
|
44
47
|
end
|
45
48
|
|
46
49
|
def attribute_options(name)
|
@@ -2,6 +2,13 @@ require 'rss'
|
|
2
2
|
require_relative 'item'
|
3
3
|
|
4
4
|
module Html2rss
|
5
|
+
##
|
6
|
+
# The purpose is to build the feed, consisting of
|
7
|
+
#
|
8
|
+
# - the 'channel' and
|
9
|
+
# - the 'item'
|
10
|
+
#
|
11
|
+
# parts.
|
5
12
|
class FeedBuilder
|
6
13
|
def initialize(config)
|
7
14
|
@config = config
|
@@ -11,10 +18,10 @@ module Html2rss
|
|
11
18
|
# @return [RSS:Rss]
|
12
19
|
def rss
|
13
20
|
RSS::Maker.make('2.0') do |maker|
|
14
|
-
|
21
|
+
add_channel(maker)
|
15
22
|
|
16
23
|
feed_items.map do |feed_item|
|
17
|
-
|
24
|
+
add_item(feed_item, maker.items.new_item)
|
18
25
|
end
|
19
26
|
end
|
20
27
|
end
|
@@ -23,7 +30,7 @@ module Html2rss
|
|
23
30
|
|
24
31
|
attr_reader :config
|
25
32
|
|
26
|
-
def
|
33
|
+
def add_channel(maker)
|
27
34
|
%i[language author title description link ttl].each do |attribute_name|
|
28
35
|
maker.channel.public_send("#{attribute_name}=".to_sym, config.public_send(attribute_name))
|
29
36
|
end
|
@@ -33,24 +40,20 @@ module Html2rss
|
|
33
40
|
end
|
34
41
|
|
35
42
|
def feed_items
|
36
|
-
@feed_items ||= Item.from_url
|
43
|
+
@feed_items ||= Item.from_url(config.url, config).keep_if(&:valid?)
|
37
44
|
end
|
38
45
|
|
39
|
-
def
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
feed_item.available_attributes.each do |attribute_name|
|
44
|
-
rss_item.public_send("#{attribute_name}=".to_sym, feed_item.public_send(attribute_name))
|
45
|
-
end
|
46
|
-
|
47
|
-
feed_item.categories.each do |category|
|
48
|
-
rss_item.categories.new_category.content = category
|
49
|
-
end
|
46
|
+
def add_item(feed_item, rss_item)
|
47
|
+
feed_item.available_attributes.each do |attribute_name|
|
48
|
+
rss_item.public_send("#{attribute_name}=".to_sym, feed_item.public_send(attribute_name))
|
49
|
+
end
|
50
50
|
|
51
|
-
|
52
|
-
rss_item.
|
51
|
+
feed_item.categories.each do |category|
|
52
|
+
rss_item.categories.new_category.content = category
|
53
53
|
end
|
54
|
+
|
55
|
+
rss_item.guid.content = Digest::SHA1.hexdigest(feed_item.title)
|
56
|
+
rss_item.guid.isPermaLink = false
|
54
57
|
end
|
55
58
|
end
|
56
59
|
end
|
data/lib/html2rss/item.rb
CHANGED
@@ -6,6 +6,9 @@ require_relative 'item_extractors'
|
|
6
6
|
require_relative 'attribute_post_processors'
|
7
7
|
|
8
8
|
module Html2rss
|
9
|
+
##
|
10
|
+
# Takes the selected Nokogiri::HTML and responds to accessors names
|
11
|
+
# defined in the feed config.
|
9
12
|
class Item
|
10
13
|
def initialize(xml, config)
|
11
14
|
@xml = xml
|
@@ -12,11 +12,12 @@ module Html2rss
|
|
12
12
|
DEFAULT = 'text'.freeze
|
13
13
|
|
14
14
|
def self.get_extractor(name)
|
15
|
-
|
16
|
-
|
17
|
-
|
15
|
+
@extractors = Hash.new do |hash, key|
|
16
|
+
camel_cased_name = key.split('_').map(&:capitalize).join
|
17
|
+
class_name = ['Html2rss', 'ItemExtractors', camel_cased_name].join('::')
|
18
18
|
|
19
|
-
|
19
|
+
hash[key] = Object.const_get(class_name)
|
20
|
+
end[name || DEFAULT]
|
20
21
|
end
|
21
22
|
|
22
23
|
##
|
data/lib/html2rss/utils.rb
CHANGED
data/lib/html2rss/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.2
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-10-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -301,6 +301,7 @@ licenses:
|
|
301
301
|
- MIT
|
302
302
|
metadata:
|
303
303
|
allowed_push_host: https://rubygems.org
|
304
|
+
changelog_uri: https://github.com/gildesmarais/html2rss/blob/master/CHANGELOG.md
|
304
305
|
post_install_message:
|
305
306
|
rdoc_options: []
|
306
307
|
require_paths:
|
@@ -309,15 +310,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
309
310
|
requirements:
|
310
311
|
- - ">="
|
311
312
|
- !ruby/object:Gem::Version
|
312
|
-
version:
|
313
|
+
version: 2.4.0
|
313
314
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
314
315
|
requirements:
|
315
316
|
- - ">="
|
316
317
|
- !ruby/object:Gem::Version
|
317
318
|
version: '0'
|
318
319
|
requirements: []
|
319
|
-
|
320
|
-
rubygems_version: 2.7.7
|
320
|
+
rubygems_version: 3.0.6
|
321
321
|
signing_key:
|
322
322
|
specification_version: 4
|
323
323
|
summary: Returns an RSS::Rss object by scraping a URL.
|