html2rss 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.changelogrc +19 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +46 -0
- data/.travis.yml +13 -4
- data/CHANGELOG.md +115 -0
- data/Gemfile.lock +28 -20
- data/README.md +22 -3
- data/html2rss.gemspec +5 -4
- data/lib/html2rss/attribute_post_processors.rb +16 -0
- data/lib/html2rss/attribute_post_processors/parse_time.rb +13 -0
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +13 -0
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +23 -0
- data/lib/html2rss/attribute_post_processors/substring.rb +15 -0
- data/lib/html2rss/attribute_post_processors/template.rb +33 -0
- data/lib/html2rss/config.rb +3 -3
- data/lib/html2rss/feed_builder.rb +5 -3
- data/lib/html2rss/item.rb +24 -12
- data/lib/html2rss/item_extractor.rb +16 -16
- data/lib/html2rss/version.rb +1 -1
- data/support/changelog.md +15 -0
- data/support/logo.png +0 -0
- metadata +45 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1cc71ee2722757d0f4cceaf8aea1b9007496acf35fdf4da37a5dea998233258d
|
4
|
+
data.tar.gz: '0009a02fe85a6e2b088aec00944e1dc46233ce6878b8558782286be0e020511b'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2de4befd1f3beec2ee094f2c792ec94a1bde37c3da1f30eb737403ae04eddb36b54a43b86163f8b348229c42a5d38aae0bc7657c677bdbe95a54bee26b1fa6d9
|
7
|
+
data.tar.gz: f344bb3046c0493d9979e9daf53d5f3725798c96093dbbff6164a1339fd80bf54ad60868eb55dbc5a67239584f825cb0f5f6ca18c4e3e7f8f67b6698d26f3adf
|
data/.changelogrc
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
{
|
2
|
+
"app_name": "html2rss",
|
3
|
+
"logo": "https://github.com/gildesmarais/html2rss/raw/master/support/logo.png",
|
4
|
+
"intro": "Generate RSS feeds by scraping websites by providing a config.",
|
5
|
+
"debug": "true",
|
6
|
+
"template": "support/changelog.md",
|
7
|
+
"sections": [
|
8
|
+
{ "title": "Bugfixes", "grep": "^fix" },
|
9
|
+
{ "title": "Features", "grep": "^feat" },
|
10
|
+
{ "title": "Documentation", "grep": "^docs" },
|
11
|
+
{ "title": "Breaking changes", "grep": "BREAKING" },
|
12
|
+
{ "title": "Refactorings", "grep": "^refactor" },
|
13
|
+
{ "title": "Code style", "grep": "^style" },
|
14
|
+
{ "title": "Test", "grep": "^spec" },
|
15
|
+
{ "title": "Chore", "grep": "^chore" },
|
16
|
+
{ "title": "Branches merged", "grep": "^Merge branch" },
|
17
|
+
{ "title": "Pull requests merged", "grep": "^Merge pull request" }
|
18
|
+
]
|
19
|
+
}
|
data/.gitignore
CHANGED
data/.rubocop.yml
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
AllCops:
|
2
|
+
TargetRubyVersion: 2.4
|
3
|
+
DisplayCopNames: true
|
4
|
+
|
5
|
+
Metrics/LineLength:
|
6
|
+
Max: 110
|
7
|
+
|
8
|
+
Metrics/BlockLength:
|
9
|
+
Exclude:
|
10
|
+
- "**/*_spec.rb"
|
11
|
+
- html2rss.gemspec
|
12
|
+
|
13
|
+
Metrics/ModuleLength:
|
14
|
+
Exclude:
|
15
|
+
- "**/*_spec.rb"
|
16
|
+
|
17
|
+
Documentation:
|
18
|
+
Enabled: false
|
19
|
+
|
20
|
+
Style/BlockDelimiters:
|
21
|
+
Enabled: false
|
22
|
+
|
23
|
+
Style/FrozenStringLiteralComment:
|
24
|
+
Enabled: false
|
25
|
+
|
26
|
+
Style/ParallelAssignment:
|
27
|
+
Enabled: false
|
28
|
+
|
29
|
+
Style/AsciiComments:
|
30
|
+
Enabled: false
|
31
|
+
|
32
|
+
Style/BracesAroundHashParameters:
|
33
|
+
Description: 'Enforce braces style around hash parameters.'
|
34
|
+
Enabled: true
|
35
|
+
|
36
|
+
Style/HashSyntax:
|
37
|
+
Description: >-
|
38
|
+
Prefer Ruby 1.9 hash syntax { a: 1, b: 2 } over 1.8 syntax
|
39
|
+
{ :a => 1, :b => 2 }.
|
40
|
+
StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#hash-literals'
|
41
|
+
Enabled: true
|
42
|
+
|
43
|
+
Layout/SpaceInsideParens:
|
44
|
+
Description: 'No spaces after ( or before ).'
|
45
|
+
StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#no-spaces-braces'
|
46
|
+
Enabled: true
|
data/.travis.yml
CHANGED
@@ -1,15 +1,24 @@
|
|
1
1
|
sudo: false
|
2
2
|
language: ruby
|
3
|
+
cache: bundler
|
3
4
|
|
4
5
|
before_install:
|
5
6
|
- gem update --system
|
6
7
|
- gem install bundler
|
7
|
-
|
8
|
+
|
9
|
+
bundler_args: "--jobs=3 --retry=3"
|
8
10
|
|
9
11
|
rvm:
|
10
|
-
- 2.3.
|
11
|
-
- 2.4.
|
12
|
-
- 2.5.
|
12
|
+
- 2.3.8
|
13
|
+
- 2.4.5
|
14
|
+
- 2.5.3
|
13
15
|
|
14
16
|
script:
|
15
17
|
- bundle exec rspec
|
18
|
+
|
19
|
+
deploy:
|
20
|
+
provider: rubygems
|
21
|
+
api_key:
|
22
|
+
secure: bM3Yl8iWdB1Amra3Bm6bIH/mTwHcRhZrX8etFFbJANxIbkhzUOyTKcDMYiWUVM/mBzzv0NOuRejrDR6R0v7E2udrKcLQFCBtv7HqPAXIlkEEyxZy+M1kTqcPzP872E+ZKTn93vCzbiXBLYoMmqgCzqvcO87IBYNzTURHkfFjaYJJdVyZ5EVtbpXf4FhBvuQf9LTk/ocClgwYeuqd+45lO7qHoPatsvbY0vCOfKaiwkdOkBt+hjc56awcYSc9CXn0DCatebPQmQmdrqFd8fKgyCatWS3n+8TPmvzVfNJe44wg3oNfHbWruP85I2LE9ei1iG+iGQIF60fMhGgMJ4EM3REXDE5Mg+GA5uJcgH9Poirut3Ih65jtAzYNGohlmEmc7ysKc0dmG1O3ndwrHjh5KePrOAGDaW6QKG+m5ebIZ+mgrEA+ZVU1mjDM8FlbSKAayoPloslZdllSv7miwGzh6xrHWGQSCURZAkygFh+Kd+Kg1eVlEs+n6aObod82mEOfBPvWPacOrE2fY4B0ocFOKotZBCZSD0ZIixlyslRTnmcJfpRNlYLsQ56oy5uPNUccPQ86NSmmE+qbRdPCLQCKLPm2iYBgOa5iQrfHR/fUgcO0skAZiW4o9QflDgIFS/G+BE6FMHIvjkKA6Ae4KbqGzlF5pGFdo6p4MhlvubwjsVI=
|
23
|
+
on:
|
24
|
+
tags: true
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
<img width="300px" src="https://github.com/gildesmarais/html2rss/raw/master/support/logo.png" />
|
2
|
+
|
3
|
+
# html2rss
|
4
|
+
|
5
|
+
_Generate RSS feeds by scraping websites by providing a config._
|
6
|
+
|
7
|
+
|
8
|
+
|
9
|
+
## Bugfixes
|
10
|
+
- handling of url query breaks processing
|
11
|
+
([ace289e9](git@github.com:gildesmarais/html2rss/commit/ace289e911b69cb92433cac6f1ca0403715d8286))
|
12
|
+
- only set supported attributes on rss item
|
13
|
+
([dae0d8e7](git@github.com:gildesmarais/html2rss/commit/dae0d8e75541e810275e789a23971a61e60a2154))
|
14
|
+
|
15
|
+
- **config**
|
16
|
+
- feed generation fails
|
17
|
+
([7dd55869](git@github.com:gildesmarais/html2rss/commit/7dd55869f79b1de76c004bf0e82d13b16b5b3f0d))
|
18
|
+
|
19
|
+
- **parse_uri**
|
20
|
+
- handle non-absolute paths
|
21
|
+
([92150257](git@github.com:gildesmarais/html2rss/commit/921502574e4436d65a30e1d34b9b31f238336247))
|
22
|
+
|
23
|
+
|
24
|
+
|
25
|
+
|
26
|
+
## Features
|
27
|
+
- add logo [skip ci]
|
28
|
+
([857a55fd](git@github.com:gildesmarais/html2rss/commit/857a55fd8c932930d96c47c5abe57f0507356df1))
|
29
|
+
- require updated to be present
|
30
|
+
([e1bedaec](git@github.com:gildesmarais/html2rss/commit/e1bedaecc91e874fe24e96000612abb9cd11e9fe))
|
31
|
+
- do not fail on invalid item, just skip it
|
32
|
+
([3b83d715](git@github.com:gildesmarais/html2rss/commit/3b83d715619abbc33b124de1945d17cb0dc7edb0))
|
33
|
+
|
34
|
+
- **item_extractor**
|
35
|
+
- text strips strings
|
36
|
+
([f5982859](git@github.com:gildesmarais/html2rss/commit/f59828593dca663bdbe8699392594e2d18658f8f))
|
37
|
+
- add static and current_time
|
38
|
+
([25043dcb](git@github.com:gildesmarais/html2rss/commit/25043dcbd8f0f4901202f4a2f66b355ac48825a8))
|
39
|
+
- handle absolute urls
|
40
|
+
([f96be008](git@github.com:gildesmarais/html2rss/commit/f96be00857bdcded02d52dd62ec22b9b52c803ed))
|
41
|
+
|
42
|
+
- **post_processing**
|
43
|
+
- add configurable post_processing (#5)
|
44
|
+
([4cf6caca](git@github.com:gildesmarais/html2rss/commit/4cf6cacac00bd3c0c53d584ca11274ba24b03ef7),
|
45
|
+
[#1](git@github.com:gildesmarais/html2rss/issues/1))
|
46
|
+
|
47
|
+
- **post_processor**
|
48
|
+
- add substring
|
49
|
+
([6f2a32a6](git@github.com:gildesmarais/html2rss/commit/6f2a32a6304ef9956577711173de681daf93f55f))
|
50
|
+
|
51
|
+
- **postprocessors**
|
52
|
+
- add Template (#6)
|
53
|
+
([f1db542e](git@github.com:gildesmarais/html2rss/commit/f1db542e8c1e9e09a066a3cd6c8514a6ca0aa871),
|
54
|
+
[#4](git@github.com:gildesmarais/html2rss/issues/4))
|
55
|
+
|
56
|
+
- **sanitize_html**
|
57
|
+
- add target="_blank" to anchors
|
58
|
+
([975a73bf](git@github.com:gildesmarais/html2rss/commit/975a73bfd396ba5942bc0ea80eebd14cc37ad776))
|
59
|
+
|
60
|
+
|
61
|
+
|
62
|
+
|
63
|
+
## Documentation
|
64
|
+
- create the changelog
|
65
|
+
([5c561db5](git@github.com:gildesmarais/html2rss/commit/5c561db51d4e0b8592b1c82812ab5cdbe9320b70))
|
66
|
+
- add tips and tricks
|
67
|
+
([ea978240](git@github.com:gildesmarais/html2rss/commit/ea9782408107f3637a4c9665396f511fc07be19b))
|
68
|
+
- update readme
|
69
|
+
([4743167c](git@github.com:gildesmarais/html2rss/commit/4743167c86959e83524ffb7282c562413a651797))
|
70
|
+
- add note about html2rss-web
|
71
|
+
([3371c12f](git@github.com:gildesmarais/html2rss/commit/3371c12ffc6c8d3c29073d03ff206886a39401cd))
|
72
|
+
- add a badge for travis-ci
|
73
|
+
([8818d4f4](git@github.com:gildesmarais/html2rss/commit/8818d4f464a9c163ebc9665d01719e2bab132bd6))
|
74
|
+
|
75
|
+
|
76
|
+
|
77
|
+
|
78
|
+
## Test
|
79
|
+
- don't be so lazy when matching strings
|
80
|
+
([6a0eb627](git@github.com:gildesmarais/html2rss/commit/6a0eb62765523a1405fd269466b2fc57794eac7a))
|
81
|
+
|
82
|
+
|
83
|
+
|
84
|
+
|
85
|
+
## Chore
|
86
|
+
- upgrade sanitize gem to version 5.0.0
|
87
|
+
([8c4bf3a4](git@github.com:gildesmarais/html2rss/commit/8c4bf3a44885758e395568ec452a7cffdb9a0389))
|
88
|
+
- rubocop autocorrect
|
89
|
+
([af6b9fac](git@github.com:gildesmarais/html2rss/commit/af6b9facca547d3ca3ce9ef0d1227707cd16eaea))
|
90
|
+
- build against latest ruby releases
|
91
|
+
([17ba79ac](git@github.com:gildesmarais/html2rss/commit/17ba79acd2f68da1fcc984368d3e6de3437cbf1b))
|
92
|
+
- update dependencies
|
93
|
+
([855279b4](git@github.com:gildesmarais/html2rss/commit/855279b46d584a8a8c2a317529f7a4be550eaf15))
|
94
|
+
- update dependencies
|
95
|
+
([46e5a283](git@github.com:gildesmarais/html2rss/commit/46e5a2832d1f2fe1353dcc2a8d82a9786f15f6bd))
|
96
|
+
- add simplecov
|
97
|
+
([b4e1144b](git@github.com:gildesmarais/html2rss/commit/b4e1144b7f8f90126e528cc4a4ec048113d93634))
|
98
|
+
|
99
|
+
- **changelog**
|
100
|
+
- add generation with git-changelog
|
101
|
+
([07ad5a51](git@github.com:gildesmarais/html2rss/commit/07ad5a513f0951ee988426abda4b8c233411ead7))
|
102
|
+
|
103
|
+
- **travis**
|
104
|
+
- use cache for bundler
|
105
|
+
([ac76b3b2](git@github.com:gildesmarais/html2rss/commit/ac76b3b2dd94adecd4927de18651800438a7e7ba))
|
106
|
+
- setup autorelease to rubygems
|
107
|
+
([eb9c8e1b](git@github.com:gildesmarais/html2rss/commit/eb9c8e1b16902dc0e174a0cccb6eb9227307ce82))
|
108
|
+
|
109
|
+
|
110
|
+
|
111
|
+
|
112
|
+
|
113
|
+
---
|
114
|
+
<sub><sup>*Generated with [git-changelog](https://github.com/rafinskipg/git-changelog). If you have any problems or suggestions, create an issue.* :) **Thanks** </sup></sub>
|
115
|
+
|
data/Gemfile.lock
CHANGED
@@ -4,7 +4,7 @@ PATH
|
|
4
4
|
html2rss (0.0.1)
|
5
5
|
faraday (~> 0.15)
|
6
6
|
nokogiri (~> 1.8)
|
7
|
-
sanitize (~>
|
7
|
+
sanitize (~> 5.0)
|
8
8
|
|
9
9
|
GEM
|
10
10
|
remote: https://rubygems.org/
|
@@ -12,31 +12,38 @@ GEM
|
|
12
12
|
byebug (10.0.2)
|
13
13
|
crass (1.0.4)
|
14
14
|
diff-lcs (1.3)
|
15
|
-
|
15
|
+
docile (1.3.1)
|
16
|
+
faraday (0.15.3)
|
16
17
|
multipart-post (>= 1.2, < 3)
|
18
|
+
json (2.1.0)
|
17
19
|
mini_portile2 (2.3.0)
|
18
20
|
multipart-post (2.0.0)
|
19
|
-
nokogiri (1.8.
|
21
|
+
nokogiri (1.8.5)
|
20
22
|
mini_portile2 (~> 2.3.0)
|
21
|
-
nokogumbo (
|
22
|
-
nokogiri
|
23
|
-
rspec (3.
|
24
|
-
rspec-core (~> 3.
|
25
|
-
rspec-expectations (~> 3.
|
26
|
-
rspec-mocks (~> 3.
|
27
|
-
rspec-core (3.
|
28
|
-
rspec-support (~> 3.
|
29
|
-
rspec-expectations (3.
|
23
|
+
nokogumbo (2.0.0)
|
24
|
+
nokogiri (~> 1.8, >= 1.8.4)
|
25
|
+
rspec (3.8.0)
|
26
|
+
rspec-core (~> 3.8.0)
|
27
|
+
rspec-expectations (~> 3.8.0)
|
28
|
+
rspec-mocks (~> 3.8.0)
|
29
|
+
rspec-core (3.8.0)
|
30
|
+
rspec-support (~> 3.8.0)
|
31
|
+
rspec-expectations (3.8.2)
|
30
32
|
diff-lcs (>= 1.2.0, < 2.0)
|
31
|
-
rspec-support (~> 3.
|
32
|
-
rspec-mocks (3.
|
33
|
+
rspec-support (~> 3.8.0)
|
34
|
+
rspec-mocks (3.8.0)
|
33
35
|
diff-lcs (>= 1.2.0, < 2.0)
|
34
|
-
rspec-support (~> 3.
|
35
|
-
rspec-support (3.
|
36
|
-
sanitize (
|
36
|
+
rspec-support (~> 3.8.0)
|
37
|
+
rspec-support (3.8.0)
|
38
|
+
sanitize (5.0.0)
|
37
39
|
crass (~> 1.0.2)
|
38
|
-
nokogiri (>= 1.
|
39
|
-
nokogumbo (~>
|
40
|
+
nokogiri (>= 1.8.0)
|
41
|
+
nokogumbo (~> 2.0)
|
42
|
+
simplecov (0.16.1)
|
43
|
+
docile (~> 1.1)
|
44
|
+
json (>= 1.8, < 3)
|
45
|
+
simplecov-html (~> 0.10.0)
|
46
|
+
simplecov-html (0.10.2)
|
40
47
|
vcr (4.0.0)
|
41
48
|
|
42
49
|
PLATFORMS
|
@@ -47,7 +54,8 @@ DEPENDENCIES
|
|
47
54
|
byebug (~> 10.0)
|
48
55
|
html2rss!
|
49
56
|
rspec (~> 3.0)
|
57
|
+
simplecov
|
50
58
|
vcr (~> 4.0)
|
51
59
|
|
52
60
|
BUNDLED WITH
|
53
|
-
1.16.
|
61
|
+
1.16.6
|
data/README.md
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
|
1
|
+
![html2rss logo](https://github.com/gildesmarais/html2rss/raw/master/support/logo.png)
|
2
|
+
|
3
|
+
# html2rss [![Build Status](https://travis-ci.org/gildesmarais/html2rss.svg?branch=master)](https://travis-ci.org/gildesmarais/html2rss)
|
2
4
|
|
3
5
|
Request and convert an HTML document to an RSS feed via a config object.
|
4
6
|
The config contains the URL to scrape and the selectors needed to extract
|
@@ -24,14 +26,26 @@ Or install it yourself as:
|
|
24
26
|
|
25
27
|
$ gem install html2rss
|
26
28
|
|
27
|
-
## Usage
|
29
|
+
## Usage
|
30
|
+
|
31
|
+
## Usage with a YAML file
|
28
32
|
|
29
|
-
Create a YAML config file. Find an example at `rspec/config.test.yml
|
33
|
+
Create a YAML config file. Find an example at [`spec/config.test.yml`](https://github.com/gildesmarais/html2rss/blob/master/spec/config.test.yml).
|
30
34
|
|
31
35
|
`Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')` returns
|
32
36
|
|
33
37
|
an `RSS::Rss` object.
|
34
38
|
|
39
|
+
## Usage in a web application
|
40
|
+
|
41
|
+
Find a minimal Sinatra app which exposes your feeds to HTTP endpoints here:
|
42
|
+
[gildesmarais/html2rss-web](https://github.com/gildesmarais/html2rss-web)
|
43
|
+
|
44
|
+
### Tips and tricks
|
45
|
+
|
46
|
+
- Check that the channel url does not redirect to a mobile page
|
47
|
+
- fiddling with [`curl`](https://github.com/curl/curl) and [`pup`](https://github.com/ericchiang/pup) to find the selectors seems quite efficient
|
48
|
+
|
35
49
|
## Development
|
36
50
|
|
37
51
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
@@ -40,6 +54,11 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
|
|
40
54
|
|
41
55
|
Bug reports and pull requests are welcome on GitHub at https://github.com/gildesmarais/html2rss.
|
42
56
|
|
57
|
+
## Changelog generation
|
58
|
+
|
59
|
+
The `CHANGELOG.md` can be generated automatically.
|
60
|
+
Install [git-changelog](https://www.npmjs.com/package/git-changelog) globally and run `git-changelog` afterwards.
|
61
|
+
|
43
62
|
## License
|
44
63
|
|
45
64
|
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/html2rss.gemspec
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
lib = File.expand_path('
|
1
|
+
lib = File.expand_path('lib', __dir__)
|
2
2
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
3
3
|
require 'html2rss/version'
|
4
4
|
|
@@ -28,11 +28,12 @@ Gem::Specification.new do |spec|
|
|
28
28
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
29
29
|
spec.require_paths = ['lib']
|
30
30
|
|
31
|
-
spec.add_dependency 'nokogiri', '~> 1.8'
|
32
|
-
spec.add_dependency 'sanitize', '~> 4.6'
|
33
31
|
spec.add_dependency 'faraday', '~> 0.15'
|
32
|
+
spec.add_dependency 'nokogiri', '~> 1.8'
|
33
|
+
spec.add_dependency 'sanitize', '~> 5.0'
|
34
34
|
spec.add_development_dependency 'bundler', '~> 1.16'
|
35
|
+
spec.add_development_dependency 'byebug', '~> 10.0'
|
35
36
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
37
|
+
spec.add_development_dependency 'simplecov'
|
36
38
|
spec.add_development_dependency 'vcr', '~> 4.0'
|
37
|
-
spec.add_development_dependency 'byebug', '~> 10.0'
|
38
39
|
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require_relative 'attribute_post_processors/parse_time'
|
2
|
+
require_relative 'attribute_post_processors/parse_uri'
|
3
|
+
require_relative 'attribute_post_processors/sanitize_html'
|
4
|
+
require_relative 'attribute_post_processors/substring'
|
5
|
+
require_relative 'attribute_post_processors/template'
|
6
|
+
|
7
|
+
module Html2rss
|
8
|
+
module AttributePostProcessors
|
9
|
+
def self.get_processor(options)
|
10
|
+
camel_cased_option = options['name'].split('_').collect(&:capitalize).join
|
11
|
+
class_name = ['Html2rss', 'AttributePostProcessors', camel_cased_option].join('::')
|
12
|
+
|
13
|
+
Object.const_get(class_name)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'sanitize'
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
module AttributePostProcessors
|
5
|
+
class SanitizeHtml
|
6
|
+
def initialize(value, _options, _item)
|
7
|
+
@value = value
|
8
|
+
end
|
9
|
+
|
10
|
+
def get
|
11
|
+
Sanitize.fragment(@value, Sanitize::Config.merge(
|
12
|
+
Sanitize::Config::RELAXED,
|
13
|
+
add_attributes: {
|
14
|
+
'a' => {
|
15
|
+
'rel' => 'nofollow noopener noreferrer',
|
16
|
+
'target' => '_blank'
|
17
|
+
}
|
18
|
+
}
|
19
|
+
))
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Html2rss
|
2
|
+
module AttributePostProcessors
|
3
|
+
class Substring
|
4
|
+
def initialize(value, options, _item)
|
5
|
+
@value = value
|
6
|
+
@options = options
|
7
|
+
end
|
8
|
+
|
9
|
+
def get
|
10
|
+
ending = @options['end'].to_i.positive? ? @options['end'].to_i : @value.length
|
11
|
+
@value[@options['start'].to_i..ending]
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'sanitize'
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
module AttributePostProcessors
|
5
|
+
class Template
|
6
|
+
def initialize(value, options, item)
|
7
|
+
@value = value
|
8
|
+
@options = options
|
9
|
+
@item = item
|
10
|
+
end
|
11
|
+
|
12
|
+
def get
|
13
|
+
string % methods
|
14
|
+
end
|
15
|
+
|
16
|
+
private
|
17
|
+
|
18
|
+
def string
|
19
|
+
@options['string']
|
20
|
+
end
|
21
|
+
|
22
|
+
def methods
|
23
|
+
@methods ||= @options['methods'].map { |method|
|
24
|
+
if method == 'self'
|
25
|
+
@value
|
26
|
+
else
|
27
|
+
@item.send(method.to_sym)&.to_s
|
28
|
+
end
|
29
|
+
}
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
data/lib/html2rss/config.rb
CHANGED
@@ -38,7 +38,7 @@ module Html2rss
|
|
38
38
|
end
|
39
39
|
|
40
40
|
def options(name)
|
41
|
-
feed_config.dig('selectors',
|
41
|
+
feed_config.dig('selectors').fetch(name, {}).merge('channel' => channel_config)
|
42
42
|
end
|
43
43
|
|
44
44
|
def selector(name)
|
@@ -46,8 +46,8 @@ module Html2rss
|
|
46
46
|
end
|
47
47
|
|
48
48
|
def attribute_names
|
49
|
-
attribute_names = feed_config.fetch('selectors', {}).keys.map(&:
|
50
|
-
attribute_names.delete(
|
49
|
+
attribute_names = feed_config.fetch('selectors', {}).keys.map(&:to_s)
|
50
|
+
attribute_names.delete('items')
|
51
51
|
attribute_names
|
52
52
|
end
|
53
53
|
end
|
@@ -22,7 +22,7 @@ module Html2rss
|
|
22
22
|
private
|
23
23
|
|
24
24
|
def add_channel_to_maker(maker)
|
25
|
-
[
|
25
|
+
%i[language author title description link ttl].each do |attribute_name|
|
26
26
|
maker.channel.send("#{attribute_name}=".to_sym, config.send(attribute_name))
|
27
27
|
end
|
28
28
|
|
@@ -31,12 +31,14 @@ module Html2rss
|
|
31
31
|
end
|
32
32
|
|
33
33
|
def feed_items
|
34
|
-
Item.from_url config.url, config
|
34
|
+
@feed_items ||= Item.from_url config.url, config
|
35
35
|
end
|
36
36
|
|
37
37
|
def add_item_to_items(feed_item, items)
|
38
|
+
return unless feed_item.valid?
|
39
|
+
|
38
40
|
items.new_item do |rss_item|
|
39
|
-
|
41
|
+
feed_item.available_attributes.each do |attribute_name|
|
40
42
|
rss_item.send("#{attribute_name}=".to_sym, feed_item.send(attribute_name))
|
41
43
|
|
42
44
|
rss_item.guid.content = Digest::SHA1.hexdigest(feed_item.title)
|
data/lib/html2rss/item.rb
CHANGED
@@ -2,6 +2,7 @@ require 'faraday'
|
|
2
2
|
require 'open-uri'
|
3
3
|
require 'nokogiri'
|
4
4
|
require_relative 'item_extractor'
|
5
|
+
require_relative 'attribute_post_processors'
|
5
6
|
|
6
7
|
module Html2rss
|
7
8
|
class Item
|
@@ -24,26 +25,37 @@ module Html2rss
|
|
24
25
|
proc = ItemExtractor.const_get extractor.upcase.to_sym
|
25
26
|
value = proc.call(xml, attribute_config)
|
26
27
|
|
27
|
-
post_process
|
28
|
+
post_process_options = attribute_config.fetch('post_process', false)
|
29
|
+
value = post_process(value, post_process_options) if post_process_options
|
30
|
+
|
31
|
+
value
|
28
32
|
end
|
29
33
|
|
30
|
-
def
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
34
|
+
def available_attributes
|
35
|
+
# TODO: support optional attributes, e.g. category, enclosure, source
|
36
|
+
@available_attributes ||= (%w[title link description author comments updated] & @config.attribute_names)
|
37
|
+
end
|
38
|
+
|
39
|
+
def valid?
|
40
|
+
return false if [title.to_s, description.to_s].join('') == ''
|
41
|
+
|
42
|
+
true
|
39
43
|
end
|
40
44
|
|
41
45
|
def self.from_url(url, config)
|
42
46
|
connection = Faraday.new(url: url, headers: config.headers)
|
43
47
|
page = Nokogiri::HTML(connection.get.body)
|
44
|
-
page.css(config.selector('items')).map
|
48
|
+
page.css(config.selector('items')).map do |xml_item|
|
45
49
|
new xml_item, config
|
46
|
-
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
def post_process(value, options)
|
56
|
+
Html2rss::AttributePostProcessors.get_processor(options)
|
57
|
+
.new(value, options, self)
|
58
|
+
.get
|
47
59
|
end
|
48
60
|
end
|
49
61
|
end
|
@@ -1,25 +1,25 @@
|
|
1
|
-
require 'sanitize'
|
2
|
-
|
3
1
|
module Html2rss
|
4
2
|
module ItemExtractor
|
5
|
-
TEXT = proc { |xml, options| xml.css(options['selector'])&.text }
|
6
|
-
ATTRIBUTE = proc { |xml, options| xml.css(options['selector']).attr(options['attribute']) }
|
3
|
+
TEXT = proc { |xml, options| xml.css(options['selector'])&.text&.strip }
|
4
|
+
ATTRIBUTE = proc { |xml, options| xml.css(options['selector']).attr(options['attribute']).to_s }
|
7
5
|
|
8
6
|
HREF = proc { |xml, options|
|
9
|
-
|
10
|
-
|
11
|
-
uri
|
12
|
-
}
|
7
|
+
href = xml.css(options['selector']).attr('href').to_s
|
8
|
+
path, query = href.split('?')
|
13
9
|
|
14
|
-
|
15
|
-
|
10
|
+
if href.start_with?('http')
|
11
|
+
uri = URI(href)
|
12
|
+
else
|
13
|
+
uri = URI(options['channel']['url'])
|
14
|
+
uri.path = path.start_with?('/') ? path : "/#{path}"
|
15
|
+
uri.query = query
|
16
|
+
end
|
16
17
|
|
17
|
-
|
18
|
-
Sanitize::Config::RELAXED,
|
19
|
-
add_attributes: {
|
20
|
-
'a' => { 'rel' => 'nofollow noopener noreferrer' }
|
21
|
-
}
|
22
|
-
))
|
18
|
+
uri
|
23
19
|
}
|
20
|
+
|
21
|
+
HTML = proc { |xml, options| xml.css(options['selector']).to_s }
|
22
|
+
STATIC = proc { |_xml, options| options['static'] }
|
23
|
+
CURRENT_TIME = proc { |_xml, _options| Time.new }
|
24
24
|
end
|
25
25
|
end
|
data/lib/html2rss/version.rb
CHANGED
@@ -0,0 +1,15 @@
|
|
1
|
+
<% if(logo) { %><img width="300px" src="<%= logo %>" /><%= '\n\n' %><% } %># <%= title %>
|
2
|
+
<% if(intro) { %><%= '\n' %>_<%= intro %>_<%= '\n' %><% } %>
|
3
|
+
<% if(version && (version.name || version.number)) { %>##<% if(version.name){%> <%= version.name %><% } %> <%= version.number %> <% if(version.date){ %>( <%= version.date %> )<% } %><%= '\n' %><% } %>
|
4
|
+
<% _.forEach(sections, function(section){
|
5
|
+
if(section.commitsCount > 0) { %>
|
6
|
+
## <%= section.title %>
|
7
|
+
<% _.forEach(section.commits, function(commit){ %> - <%= printCommit(commit, true) %><% }) %>
|
8
|
+
<% _.forEach(section.components, function(component){ %> - **<%= component.name %>**
|
9
|
+
<% _.forEach(component.commits, function(commit){ %> - <%= printCommit(commit, true) %><% }) %>
|
10
|
+
<% }) %>
|
11
|
+
<% } %>
|
12
|
+
<% }) %>
|
13
|
+
|
14
|
+
---
|
15
|
+
<sub><sup>*Generated with [git-changelog](https://github.com/rafinskipg/git-changelog). If you have any problems or suggestions, create an issue.* :) **Thanks** </sup></sub>
|
data/support/logo.png
ADDED
Binary file
|
metadata
CHANGED
@@ -1,57 +1,57 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-11-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: faraday
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '0.15'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '0.15'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: nokogiri
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '1.8'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '1.8'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: sanitize
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '0
|
47
|
+
version: '5.0'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '0
|
54
|
+
version: '5.0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: bundler
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '1.16'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: byebug
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '10.0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '10.0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: rspec
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -81,33 +95,33 @@ dependencies:
|
|
81
95
|
- !ruby/object:Gem::Version
|
82
96
|
version: '3.0'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
98
|
+
name: simplecov
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
86
100
|
requirements:
|
87
|
-
- - "
|
101
|
+
- - ">="
|
88
102
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
103
|
+
version: '0'
|
90
104
|
type: :development
|
91
105
|
prerelease: false
|
92
106
|
version_requirements: !ruby/object:Gem::Requirement
|
93
107
|
requirements:
|
94
|
-
- - "
|
108
|
+
- - ">="
|
95
109
|
- !ruby/object:Gem::Version
|
96
|
-
version: '
|
110
|
+
version: '0'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
112
|
+
name: vcr
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
100
114
|
requirements:
|
101
115
|
- - "~>"
|
102
116
|
- !ruby/object:Gem::Version
|
103
|
-
version: '
|
117
|
+
version: '4.0'
|
104
118
|
type: :development
|
105
119
|
prerelease: false
|
106
120
|
version_requirements: !ruby/object:Gem::Requirement
|
107
121
|
requirements:
|
108
122
|
- - "~>"
|
109
123
|
- !ruby/object:Gem::Version
|
110
|
-
version: '
|
124
|
+
version: '4.0'
|
111
125
|
description: |-
|
112
126
|
Create your config object, include the url to scrape,
|
113
127
|
some selectors and get a RSS2 feed in return.
|
@@ -117,9 +131,12 @@ executables: []
|
|
117
131
|
extensions: []
|
118
132
|
extra_rdoc_files: []
|
119
133
|
files:
|
134
|
+
- ".changelogrc"
|
120
135
|
- ".gitignore"
|
121
136
|
- ".rspec"
|
137
|
+
- ".rubocop.yml"
|
122
138
|
- ".travis.yml"
|
139
|
+
- CHANGELOG.md
|
123
140
|
- Gemfile
|
124
141
|
- Gemfile.lock
|
125
142
|
- LICENSE
|
@@ -128,11 +145,19 @@ files:
|
|
128
145
|
- bin/setup
|
129
146
|
- html2rss.gemspec
|
130
147
|
- lib/html2rss.rb
|
148
|
+
- lib/html2rss/attribute_post_processors.rb
|
149
|
+
- lib/html2rss/attribute_post_processors/parse_time.rb
|
150
|
+
- lib/html2rss/attribute_post_processors/parse_uri.rb
|
151
|
+
- lib/html2rss/attribute_post_processors/sanitize_html.rb
|
152
|
+
- lib/html2rss/attribute_post_processors/substring.rb
|
153
|
+
- lib/html2rss/attribute_post_processors/template.rb
|
131
154
|
- lib/html2rss/config.rb
|
132
155
|
- lib/html2rss/feed_builder.rb
|
133
156
|
- lib/html2rss/item.rb
|
134
157
|
- lib/html2rss/item_extractor.rb
|
135
158
|
- lib/html2rss/version.rb
|
159
|
+
- support/changelog.md
|
160
|
+
- support/logo.png
|
136
161
|
homepage: https://github.com/gildesmarais/html2rss
|
137
162
|
licenses:
|
138
163
|
- MIT
|
@@ -154,7 +179,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
154
179
|
version: '0'
|
155
180
|
requirements: []
|
156
181
|
rubyforge_project:
|
157
|
-
rubygems_version: 2.7.
|
182
|
+
rubygems_version: 2.7.8
|
158
183
|
signing_key:
|
159
184
|
specification_version: 4
|
160
185
|
summary: Generate RSS feeds by scraping websites by providing a config.
|