html2rss 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -1
- data/.travis.yml +4 -3
- data/CHANGELOG.md +17 -1
- data/Gemfile.lock +12 -12
- data/README.md +30 -1
- data/html2rss.gemspec +2 -0
- data/lib/html2rss.rb +3 -0
- data/lib/html2rss/config.rb +4 -1
- data/lib/html2rss/feed_builder.rb +20 -17
- data/lib/html2rss/item.rb +3 -0
- data/lib/html2rss/item_extractors.rb +5 -4
- data/lib/html2rss/utils.rb +2 -0
- data/lib/html2rss/version.rb +1 -1
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fdb35d5375dda349c92fb7ccaed71747de5e49b2adfb3a1cfca9a9c251aa86a9
|
4
|
+
data.tar.gz: 6532e1df3c67108c6ae8bf7eaef7a6d5eb228c4d3020f99aa02f8a9d507a625e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 86fbfe46d9ec7b0f21bc0e756da94ed96ecfa635f858336424034875a646f81c08e64610bf8b478d9e0212ea272ea06ee13d6bf33c32cf01dd25363e4ba44868
|
7
|
+
data.tar.gz: dfad9fc0ba59cc8ecd323299b330c5cc6831336da7fd913f4f72a52ee807861d4b604927e73477f33e353314f2107477457cfa4f320024cd63c19e4b0a577cd5
|
data/.rubocop.yml
CHANGED
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,4 +1,20 @@
|
|
1
|
-
# [](https://github.com/gildesmarais/html2rss/compare/v0.
|
1
|
+
# [](https://github.com/gildesmarais/html2rss/compare/v0.6.0...v) (2019-10-05)
|
2
|
+
|
3
|
+
|
4
|
+
|
5
|
+
# [0.6.0](https://github.com/gildesmarais/html2rss/compare/v0.5.2...v0.6.0) (2019-10-05)
|
6
|
+
|
7
|
+
|
8
|
+
### Bug Fixes
|
9
|
+
|
10
|
+
* **specs:** simplecov does not exclude files from spec/ ([#44](https://github.com/gildesmarais/html2rss/issues/44)) ([b0ca780](https://github.com/gildesmarais/html2rss/commit/b0ca780))
|
11
|
+
|
12
|
+
|
13
|
+
### Features
|
14
|
+
|
15
|
+
* **ci:** run rubocop on ci ([#40](https://github.com/gildesmarais/html2rss/issues/40)) ([f4ec8d1](https://github.com/gildesmarais/html2rss/commit/f4ec8d1))
|
16
|
+
* memoize ItemExtractor lookups ([#45](https://github.com/gildesmarais/html2rss/issues/45)) ([e88321c](https://github.com/gildesmarais/html2rss/commit/e88321c))
|
17
|
+
* support setting of request headers in feed config ([#41](https://github.com/gildesmarais/html2rss/issues/41)) ([a7aca11](https://github.com/gildesmarais/html2rss/commit/a7aca11)), closes [#38](https://github.com/gildesmarais/html2rss/issues/38)
|
2
18
|
|
3
19
|
|
4
20
|
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
html2rss (0.5.2)
|
4
|
+
html2rss (0.6.0)
|
5
5
|
activesupport (~> 5.0)
|
6
6
|
builder
|
7
7
|
faraday (~> 0.15)
|
@@ -26,24 +26,24 @@ GEM
|
|
26
26
|
crass (1.0.4)
|
27
27
|
diff-lcs (1.3)
|
28
28
|
docile (1.3.2)
|
29
|
-
faraday (0.
|
29
|
+
faraday (0.16.2)
|
30
30
|
multipart-post (>= 1.2, < 3)
|
31
31
|
faraday_middleware (0.13.1)
|
32
32
|
faraday (>= 0.7.4, < 1.0)
|
33
33
|
hashie (3.6.0)
|
34
|
-
i18n (1.
|
34
|
+
i18n (1.7.0)
|
35
35
|
concurrent-ruby (~> 1.0)
|
36
36
|
jaro_winkler (1.5.3)
|
37
37
|
json (2.2.0)
|
38
38
|
mini_portile2 (2.4.0)
|
39
|
-
minitest (5.
|
39
|
+
minitest (5.12.2)
|
40
40
|
multipart-post (2.1.1)
|
41
41
|
nokogiri (1.10.4)
|
42
42
|
mini_portile2 (~> 2.4.0)
|
43
43
|
nokogumbo (2.0.1)
|
44
44
|
nokogiri (~> 1.8, >= 1.8.4)
|
45
45
|
parallel (1.17.0)
|
46
|
-
parser (2.6.
|
46
|
+
parser (2.6.5.0)
|
47
47
|
ast (~> 2.4.0)
|
48
48
|
rainbow (3.0.0)
|
49
49
|
reverse_markdown (1.3.0)
|
@@ -54,24 +54,24 @@ GEM
|
|
54
54
|
rspec-mocks (~> 3.8.0)
|
55
55
|
rspec-core (3.8.2)
|
56
56
|
rspec-support (~> 3.8.0)
|
57
|
-
rspec-expectations (3.8.
|
57
|
+
rspec-expectations (3.8.5)
|
58
58
|
diff-lcs (>= 1.2.0, < 2.0)
|
59
59
|
rspec-support (~> 3.8.0)
|
60
|
-
rspec-mocks (3.8.
|
60
|
+
rspec-mocks (3.8.2)
|
61
61
|
diff-lcs (>= 1.2.0, < 2.0)
|
62
62
|
rspec-support (~> 3.8.0)
|
63
|
-
rspec-support (3.8.
|
64
|
-
rubocop (0.
|
63
|
+
rspec-support (3.8.3)
|
64
|
+
rubocop (0.75.0)
|
65
65
|
jaro_winkler (~> 1.5.1)
|
66
66
|
parallel (~> 1.10)
|
67
67
|
parser (>= 2.6)
|
68
68
|
rainbow (>= 2.2.2, < 4.0)
|
69
69
|
ruby-progressbar (~> 1.7)
|
70
70
|
unicode-display_width (>= 1.4.0, < 1.7)
|
71
|
-
rubocop-performance (1.
|
71
|
+
rubocop-performance (1.5.0)
|
72
72
|
rubocop (>= 0.71.0)
|
73
|
-
rubocop-rspec (1.
|
74
|
-
rubocop (>= 0.
|
73
|
+
rubocop-rspec (1.36.0)
|
74
|
+
rubocop (>= 0.68.1)
|
75
75
|
ruby-progressbar (1.10.1)
|
76
76
|
sanitize (5.1.0)
|
77
77
|
crass (~> 1.0.2)
|
data/README.md
CHANGED
@@ -48,7 +48,17 @@ Since 0.5.0 it is possible to scrape and process JSON.
|
|
48
48
|
|
49
49
|
Adding `json: true` to the channel config will convert the JSON response to XML.
|
50
50
|
|
51
|
-
|
51
|
+
Feed config:
|
52
|
+
|
53
|
+
```yaml
|
54
|
+
channel:
|
55
|
+
url: https://example.com
|
56
|
+
title: "Example with JSON"
|
57
|
+
json: true
|
58
|
+
# ...
|
59
|
+
```
|
60
|
+
|
61
|
+
Imagine this HTTP response:
|
52
62
|
|
53
63
|
```json
|
54
64
|
{
|
@@ -73,6 +83,25 @@ Your items selector would be `data > datum`, the item's link selector would be `
|
|
73
83
|
|
74
84
|
Under the hood it uses ActiveSupport's [`Hash.to_xml`](https://apidock.com/rails/Hash/to_xml) core extension for the JSON to XML conversion.
|
75
85
|
|
86
|
+
## Set any HTTP header in the request
|
87
|
+
|
88
|
+
You can add any HTTP headers to the request to the channel URL.
|
89
|
+
You can use this to e.g. have Cookie or Authorization information being sent or to overwrite the User-Agent.
|
90
|
+
|
91
|
+
```yaml
|
92
|
+
channel:
|
93
|
+
url: https://example.com
|
94
|
+
title: "Example with http headers"
|
95
|
+
headers:
|
96
|
+
"User-Agent": "html2rss-request"
|
97
|
+
"X-Something": "Foobar"
|
98
|
+
"Authorization": "Token deadbea7"
|
99
|
+
"Cookie": "monster=MeWantCookie"
|
100
|
+
# ...
|
101
|
+
```
|
102
|
+
|
103
|
+
The headers provided by the channel will be merged into the global headers.
|
104
|
+
|
76
105
|
## Development
|
77
106
|
|
78
107
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/html2rss.gemspec
CHANGED
@@ -12,9 +12,11 @@ Gem::Specification.new do |spec|
|
|
12
12
|
spec.description = 'Give the URL to scrape and some CSS selectors. Get a RSS::Rss instance in return.'
|
13
13
|
spec.homepage = 'https://github.com/gildesmarais/html2rss'
|
14
14
|
spec.license = 'MIT'
|
15
|
+
spec.required_ruby_version = '>= 2.4.0'
|
15
16
|
|
16
17
|
if spec.respond_to?(:metadata)
|
17
18
|
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
19
|
+
spec.metadata['changelog_uri'] = 'https://github.com/gildesmarais/html2rss/blob/master/CHANGELOG.md'
|
18
20
|
else
|
19
21
|
raise 'RubyGems 2.0 or newer is required to protect against ' \
|
20
22
|
'public gem pushes.'
|
data/lib/html2rss.rb
CHANGED
@@ -4,6 +4,9 @@ require 'html2rss/version'
|
|
4
4
|
require 'html2rss/utils'
|
5
5
|
require 'yaml'
|
6
6
|
|
7
|
+
##
|
8
|
+
# The Html2rss namespace.
|
9
|
+
# Request HTML from an URL and transform it to a RSS 2.0 object.
|
7
10
|
module Html2rss
|
8
11
|
##
|
9
12
|
# Returns a RSS object which is generated from the provided file.
|
data/lib/html2rss/config.rb
CHANGED
@@ -1,4 +1,7 @@
|
|
1
1
|
module Html2rss
|
2
|
+
##
|
3
|
+
# The Config class abstracts from the config data structure and
|
4
|
+
# provides default values.
|
2
5
|
class Config
|
3
6
|
def initialize(feed_config, global_config = {})
|
4
7
|
@global_config = Utils::IndifferentAccessHash.new global_config
|
@@ -40,7 +43,7 @@ module Html2rss
|
|
40
43
|
end
|
41
44
|
|
42
45
|
def headers
|
43
|
-
global_config.fetch('headers', {})
|
46
|
+
global_config.fetch('headers', {}).merge(channel_config.fetch('headers', {}))
|
44
47
|
end
|
45
48
|
|
46
49
|
def attribute_options(name)
|
data/lib/html2rss/feed_builder.rb
CHANGED
@@ -2,6 +2,13 @@ require 'rss'
|
|
2
2
|
require_relative 'item'
|
3
3
|
|
4
4
|
module Html2rss
|
5
|
+
##
|
6
|
+
# The purpose is to build the feed, consisting of
|
7
|
+
#
|
8
|
+
# - the 'channel' and
|
9
|
+
# - the 'item'
|
10
|
+
#
|
11
|
+
# parts.
|
5
12
|
class FeedBuilder
|
6
13
|
def initialize(config)
|
7
14
|
@config = config
|
@@ -11,10 +18,10 @@ module Html2rss
|
|
11
18
|
# @return [RSS:Rss]
|
12
19
|
def rss
|
13
20
|
RSS::Maker.make('2.0') do |maker|
|
14
|
-
|
21
|
+
add_channel(maker)
|
15
22
|
|
16
23
|
feed_items.map do |feed_item|
|
17
|
-
|
24
|
+
add_item(feed_item, maker.items.new_item)
|
18
25
|
end
|
19
26
|
end
|
20
27
|
end
|
@@ -23,7 +30,7 @@ module Html2rss
|
|
23
30
|
|
24
31
|
attr_reader :config
|
25
32
|
|
26
|
-
def
|
33
|
+
def add_channel(maker)
|
27
34
|
%i[language author title description link ttl].each do |attribute_name|
|
28
35
|
maker.channel.public_send("#{attribute_name}=".to_sym, config.public_send(attribute_name))
|
29
36
|
end
|
@@ -33,24 +40,20 @@ module Html2rss
|
|
33
40
|
end
|
34
41
|
|
35
42
|
def feed_items
|
36
|
-
@feed_items ||= Item.from_url
|
43
|
+
@feed_items ||= Item.from_url(config.url, config).keep_if(&:valid?)
|
37
44
|
end
|
38
45
|
|
39
|
-
def
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
feed_item.available_attributes.each do |attribute_name|
|
44
|
-
rss_item.public_send("#{attribute_name}=".to_sym, feed_item.public_send(attribute_name))
|
45
|
-
end
|
46
|
-
|
47
|
-
feed_item.categories.each do |category|
|
48
|
-
rss_item.categories.new_category.content = category
|
49
|
-
end
|
46
|
+
def add_item(feed_item, rss_item)
|
47
|
+
feed_item.available_attributes.each do |attribute_name|
|
48
|
+
rss_item.public_send("#{attribute_name}=".to_sym, feed_item.public_send(attribute_name))
|
49
|
+
end
|
50
50
|
|
51
|
-
|
52
|
-
rss_item.
|
51
|
+
feed_item.categories.each do |category|
|
52
|
+
rss_item.categories.new_category.content = category
|
53
53
|
end
|
54
|
+
|
55
|
+
rss_item.guid.content = Digest::SHA1.hexdigest(feed_item.title)
|
56
|
+
rss_item.guid.isPermaLink = false
|
54
57
|
end
|
55
58
|
end
|
56
59
|
end
|
data/lib/html2rss/item.rb
CHANGED
@@ -6,6 +6,9 @@ require_relative 'item_extractors'
|
|
6
6
|
require_relative 'attribute_post_processors'
|
7
7
|
|
8
8
|
module Html2rss
|
9
|
+
##
|
10
|
+
# Takes the selected Nokogiri::HTML and responds to accessors names
|
11
|
+
# defined in the feed config.
|
9
12
|
class Item
|
10
13
|
def initialize(xml, config)
|
11
14
|
@xml = xml
|
data/lib/html2rss/item_extractors.rb
CHANGED
@@ -12,11 +12,12 @@ module Html2rss
|
|
12
12
|
DEFAULT = 'text'.freeze
|
13
13
|
|
14
14
|
def self.get_extractor(name)
|
15
|
-
|
16
|
-
|
17
|
-
|
15
|
+
@extractors = Hash.new do |hash, key|
|
16
|
+
camel_cased_name = key.split('_').map(&:capitalize).join
|
17
|
+
class_name = ['Html2rss', 'ItemExtractors', camel_cased_name].join('::')
|
18
18
|
|
19
|
-
|
19
|
+
hash[key] = Object.const_get(class_name)
|
20
|
+
end[name || DEFAULT]
|
20
21
|
end
|
21
22
|
|
22
23
|
##
|
data/lib/html2rss/utils.rb
CHANGED
data/lib/html2rss/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.2
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-10-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -301,6 +301,7 @@ licenses:
|
|
301
301
|
- MIT
|
302
302
|
metadata:
|
303
303
|
allowed_push_host: https://rubygems.org
|
304
|
+
changelog_uri: https://github.com/gildesmarais/html2rss/blob/master/CHANGELOG.md
|
304
305
|
post_install_message:
|
305
306
|
rdoc_options: []
|
306
307
|
require_paths:
|
@@ -309,15 +310,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
309
310
|
requirements:
|
310
311
|
- - ">="
|
311
312
|
- !ruby/object:Gem::Version
|
312
|
-
version:
|
313
|
+
version: 2.4.0
|
313
314
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
314
315
|
requirements:
|
315
316
|
- - ">="
|
316
317
|
- !ruby/object:Gem::Version
|
317
318
|
version: '0'
|
318
319
|
requirements: []
|
319
|
-
|
320
|
-
rubygems_version: 2.7.7
|
320
|
+
rubygems_version: 3.0.6
|
321
321
|
signing_key:
|
322
322
|
specification_version: 4
|
323
323
|
summary: Returns an RSS::Rss object by scraping a URL.
|