html2rss 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.changelogrc +19 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +46 -0
- data/.travis.yml +13 -4
- data/CHANGELOG.md +115 -0
- data/Gemfile.lock +28 -20
- data/README.md +22 -3
- data/html2rss.gemspec +5 -4
- data/lib/html2rss/attribute_post_processors.rb +16 -0
- data/lib/html2rss/attribute_post_processors/parse_time.rb +13 -0
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +13 -0
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +23 -0
- data/lib/html2rss/attribute_post_processors/substring.rb +15 -0
- data/lib/html2rss/attribute_post_processors/template.rb +33 -0
- data/lib/html2rss/config.rb +3 -3
- data/lib/html2rss/feed_builder.rb +5 -3
- data/lib/html2rss/item.rb +24 -12
- data/lib/html2rss/item_extractor.rb +16 -16
- data/lib/html2rss/version.rb +1 -1
- data/support/changelog.md +15 -0
- data/support/logo.png +0 -0
- metadata +45 -20
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1cc71ee2722757d0f4cceaf8aea1b9007496acf35fdf4da37a5dea998233258d
|
|
4
|
+
data.tar.gz: '0009a02fe85a6e2b088aec00944e1dc46233ce6878b8558782286be0e020511b'
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2de4befd1f3beec2ee094f2c792ec94a1bde37c3da1f30eb737403ae04eddb36b54a43b86163f8b348229c42a5d38aae0bc7657c677bdbe95a54bee26b1fa6d9
|
|
7
|
+
data.tar.gz: f344bb3046c0493d9979e9daf53d5f3725798c96093dbbff6164a1339fd80bf54ad60868eb55dbc5a67239584f825cb0f5f6ca18c4e3e7f8f67b6698d26f3adf
|
data/.changelogrc
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
{
|
|
2
|
+
"app_name": "html2rss",
|
|
3
|
+
"logo": "https://github.com/gildesmarais/html2rss/raw/master/support/logo.png",
|
|
4
|
+
"intro": "Generate RSS feeds by scraping websites by providing a config.",
|
|
5
|
+
"debug": "true",
|
|
6
|
+
"template": "support/changelog.md",
|
|
7
|
+
"sections": [
|
|
8
|
+
{ "title": "Bugfixes", "grep": "^fix" },
|
|
9
|
+
{ "title": "Features", "grep": "^feat" },
|
|
10
|
+
{ "title": "Documentation", "grep": "^docs" },
|
|
11
|
+
{ "title": "Breaking changes", "grep": "BREAKING" },
|
|
12
|
+
{ "title": "Refactorings", "grep": "^refactor" },
|
|
13
|
+
{ "title": "Code style", "grep": "^style" },
|
|
14
|
+
{ "title": "Test", "grep": "^spec" },
|
|
15
|
+
{ "title": "Chore", "grep": "^chore" },
|
|
16
|
+
{ "title": "Branches merged", "grep": "^Merge branch" },
|
|
17
|
+
{ "title": "Pull requests merged", "grep": "^Merge pull request" }
|
|
18
|
+
]
|
|
19
|
+
}
|
data/.gitignore
CHANGED
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
AllCops:
|
|
2
|
+
TargetRubyVersion: 2.4
|
|
3
|
+
DisplayCopNames: true
|
|
4
|
+
|
|
5
|
+
Metrics/LineLength:
|
|
6
|
+
Max: 110
|
|
7
|
+
|
|
8
|
+
Metrics/BlockLength:
|
|
9
|
+
Exclude:
|
|
10
|
+
- "**/*_spec.rb"
|
|
11
|
+
- html2rss.gemspec
|
|
12
|
+
|
|
13
|
+
Metrics/ModuleLength:
|
|
14
|
+
Exclude:
|
|
15
|
+
- "**/*_spec.rb"
|
|
16
|
+
|
|
17
|
+
Documentation:
|
|
18
|
+
Enabled: false
|
|
19
|
+
|
|
20
|
+
Style/BlockDelimiters:
|
|
21
|
+
Enabled: false
|
|
22
|
+
|
|
23
|
+
Style/FrozenStringLiteralComment:
|
|
24
|
+
Enabled: false
|
|
25
|
+
|
|
26
|
+
Style/ParallelAssignment:
|
|
27
|
+
Enabled: false
|
|
28
|
+
|
|
29
|
+
Style/AsciiComments:
|
|
30
|
+
Enabled: false
|
|
31
|
+
|
|
32
|
+
Style/BracesAroundHashParameters:
|
|
33
|
+
Description: 'Enforce braces style around hash parameters.'
|
|
34
|
+
Enabled: true
|
|
35
|
+
|
|
36
|
+
Style/HashSyntax:
|
|
37
|
+
Description: >-
|
|
38
|
+
Prefer Ruby 1.9 hash syntax { a: 1, b: 2 } over 1.8 syntax
|
|
39
|
+
{ :a => 1, :b => 2 }.
|
|
40
|
+
StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#hash-literals'
|
|
41
|
+
Enabled: true
|
|
42
|
+
|
|
43
|
+
Layout/SpaceInsideParens:
|
|
44
|
+
Description: 'No spaces after ( or before ).'
|
|
45
|
+
StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#no-spaces-braces'
|
|
46
|
+
Enabled: true
|
data/.travis.yml
CHANGED
|
@@ -1,15 +1,24 @@
|
|
|
1
1
|
sudo: false
|
|
2
2
|
language: ruby
|
|
3
|
+
cache: bundler
|
|
3
4
|
|
|
4
5
|
before_install:
|
|
5
6
|
- gem update --system
|
|
6
7
|
- gem install bundler
|
|
7
|
-
|
|
8
|
+
|
|
9
|
+
bundler_args: "--jobs=3 --retry=3"
|
|
8
10
|
|
|
9
11
|
rvm:
|
|
10
|
-
- 2.3.
|
|
11
|
-
- 2.4.
|
|
12
|
-
- 2.5.
|
|
12
|
+
- 2.3.8
|
|
13
|
+
- 2.4.5
|
|
14
|
+
- 2.5.3
|
|
13
15
|
|
|
14
16
|
script:
|
|
15
17
|
- bundle exec rspec
|
|
18
|
+
|
|
19
|
+
deploy:
|
|
20
|
+
provider: rubygems
|
|
21
|
+
api_key:
|
|
22
|
+
secure: bM3Yl8iWdB1Amra3Bm6bIH/mTwHcRhZrX8etFFbJANxIbkhzUOyTKcDMYiWUVM/mBzzv0NOuRejrDR6R0v7E2udrKcLQFCBtv7HqPAXIlkEEyxZy+M1kTqcPzP872E+ZKTn93vCzbiXBLYoMmqgCzqvcO87IBYNzTURHkfFjaYJJdVyZ5EVtbpXf4FhBvuQf9LTk/ocClgwYeuqd+45lO7qHoPatsvbY0vCOfKaiwkdOkBt+hjc56awcYSc9CXn0DCatebPQmQmdrqFd8fKgyCatWS3n+8TPmvzVfNJe44wg3oNfHbWruP85I2LE9ei1iG+iGQIF60fMhGgMJ4EM3REXDE5Mg+GA5uJcgH9Poirut3Ih65jtAzYNGohlmEmc7ysKc0dmG1O3ndwrHjh5KePrOAGDaW6QKG+m5ebIZ+mgrEA+ZVU1mjDM8FlbSKAayoPloslZdllSv7miwGzh6xrHWGQSCURZAkygFh+Kd+Kg1eVlEs+n6aObod82mEOfBPvWPacOrE2fY4B0ocFOKotZBCZSD0ZIixlyslRTnmcJfpRNlYLsQ56oy5uPNUccPQ86NSmmE+qbRdPCLQCKLPm2iYBgOa5iQrfHR/fUgcO0skAZiW4o9QflDgIFS/G+BE6FMHIvjkKA6Ae4KbqGzlF5pGFdo6p4MhlvubwjsVI=
|
|
23
|
+
on:
|
|
24
|
+
tags: true
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
<img width="300px" src="https://github.com/gildesmarais/html2rss/raw/master/support/logo.png" />
|
|
2
|
+
|
|
3
|
+
# html2rss
|
|
4
|
+
|
|
5
|
+
_Generate RSS feeds by scraping websites by providing a config._
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
## Bugfixes
|
|
10
|
+
- handling of url query breaks processing
|
|
11
|
+
([ace289e9](git@github.com:gildesmarais/html2rss/commit/ace289e911b69cb92433cac6f1ca0403715d8286))
|
|
12
|
+
- only set supported attributes on rss item
|
|
13
|
+
([dae0d8e7](git@github.com:gildesmarais/html2rss/commit/dae0d8e75541e810275e789a23971a61e60a2154))
|
|
14
|
+
|
|
15
|
+
- **config**
|
|
16
|
+
- feed generation fails
|
|
17
|
+
([7dd55869](git@github.com:gildesmarais/html2rss/commit/7dd55869f79b1de76c004bf0e82d13b16b5b3f0d))
|
|
18
|
+
|
|
19
|
+
- **parse_uri**
|
|
20
|
+
- handle non-absolute paths
|
|
21
|
+
([92150257](git@github.com:gildesmarais/html2rss/commit/921502574e4436d65a30e1d34b9b31f238336247))
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
## Features
|
|
27
|
+
- add logo [skip ci]
|
|
28
|
+
([857a55fd](git@github.com:gildesmarais/html2rss/commit/857a55fd8c932930d96c47c5abe57f0507356df1))
|
|
29
|
+
- require updated to be present
|
|
30
|
+
([e1bedaec](git@github.com:gildesmarais/html2rss/commit/e1bedaecc91e874fe24e96000612abb9cd11e9fe))
|
|
31
|
+
- do not fail on invalid item, just skip it
|
|
32
|
+
([3b83d715](git@github.com:gildesmarais/html2rss/commit/3b83d715619abbc33b124de1945d17cb0dc7edb0))
|
|
33
|
+
|
|
34
|
+
- **item_extractor**
|
|
35
|
+
- text strips strings
|
|
36
|
+
([f5982859](git@github.com:gildesmarais/html2rss/commit/f59828593dca663bdbe8699392594e2d18658f8f))
|
|
37
|
+
- add static and current_time
|
|
38
|
+
([25043dcb](git@github.com:gildesmarais/html2rss/commit/25043dcbd8f0f4901202f4a2f66b355ac48825a8))
|
|
39
|
+
- handle absolute urls
|
|
40
|
+
([f96be008](git@github.com:gildesmarais/html2rss/commit/f96be00857bdcded02d52dd62ec22b9b52c803ed))
|
|
41
|
+
|
|
42
|
+
- **post_processing**
|
|
43
|
+
- add configurable post_processing (#5)
|
|
44
|
+
([4cf6caca](git@github.com:gildesmarais/html2rss/commit/4cf6cacac00bd3c0c53d584ca11274ba24b03ef7),
|
|
45
|
+
[#1](git@github.com:gildesmarais/html2rss/issues/1))
|
|
46
|
+
|
|
47
|
+
- **post_processor**
|
|
48
|
+
- add substring
|
|
49
|
+
([6f2a32a6](git@github.com:gildesmarais/html2rss/commit/6f2a32a6304ef9956577711173de681daf93f55f))
|
|
50
|
+
|
|
51
|
+
- **postprocessors**
|
|
52
|
+
- add Template (#6)
|
|
53
|
+
([f1db542e](git@github.com:gildesmarais/html2rss/commit/f1db542e8c1e9e09a066a3cd6c8514a6ca0aa871),
|
|
54
|
+
[#4](git@github.com:gildesmarais/html2rss/issues/4))
|
|
55
|
+
|
|
56
|
+
- **sanitize_html**
|
|
57
|
+
- add target="_blank" to anchors
|
|
58
|
+
([975a73bf](git@github.com:gildesmarais/html2rss/commit/975a73bfd396ba5942bc0ea80eebd14cc37ad776))
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
## Documentation
|
|
64
|
+
- create the changelog
|
|
65
|
+
([5c561db5](git@github.com:gildesmarais/html2rss/commit/5c561db51d4e0b8592b1c82812ab5cdbe9320b70))
|
|
66
|
+
- add tips and tricks
|
|
67
|
+
([ea978240](git@github.com:gildesmarais/html2rss/commit/ea9782408107f3637a4c9665396f511fc07be19b))
|
|
68
|
+
- update readme
|
|
69
|
+
([4743167c](git@github.com:gildesmarais/html2rss/commit/4743167c86959e83524ffb7282c562413a651797))
|
|
70
|
+
- add note about html2rss-web
|
|
71
|
+
([3371c12f](git@github.com:gildesmarais/html2rss/commit/3371c12ffc6c8d3c29073d03ff206886a39401cd))
|
|
72
|
+
- add a badge for travis-ci
|
|
73
|
+
([8818d4f4](git@github.com:gildesmarais/html2rss/commit/8818d4f464a9c163ebc9665d01719e2bab132bd6))
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
## Test
|
|
79
|
+
- don't be so lazy when matching strings
|
|
80
|
+
([6a0eb627](git@github.com:gildesmarais/html2rss/commit/6a0eb62765523a1405fd269466b2fc57794eac7a))
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
## Chore
|
|
86
|
+
- upgrade sanitize gem to version 5.0.0
|
|
87
|
+
([8c4bf3a4](git@github.com:gildesmarais/html2rss/commit/8c4bf3a44885758e395568ec452a7cffdb9a0389))
|
|
88
|
+
- rubocop autocorrect
|
|
89
|
+
([af6b9fac](git@github.com:gildesmarais/html2rss/commit/af6b9facca547d3ca3ce9ef0d1227707cd16eaea))
|
|
90
|
+
- build against latest ruby releases
|
|
91
|
+
([17ba79ac](git@github.com:gildesmarais/html2rss/commit/17ba79acd2f68da1fcc984368d3e6de3437cbf1b))
|
|
92
|
+
- update dependencies
|
|
93
|
+
([855279b4](git@github.com:gildesmarais/html2rss/commit/855279b46d584a8a8c2a317529f7a4be550eaf15))
|
|
94
|
+
- update dependencies
|
|
95
|
+
([46e5a283](git@github.com:gildesmarais/html2rss/commit/46e5a2832d1f2fe1353dcc2a8d82a9786f15f6bd))
|
|
96
|
+
- add simplecov
|
|
97
|
+
([b4e1144b](git@github.com:gildesmarais/html2rss/commit/b4e1144b7f8f90126e528cc4a4ec048113d93634))
|
|
98
|
+
|
|
99
|
+
- **changelog**
|
|
100
|
+
- add generation with git-changelog
|
|
101
|
+
([07ad5a51](git@github.com:gildesmarais/html2rss/commit/07ad5a513f0951ee988426abda4b8c233411ead7))
|
|
102
|
+
|
|
103
|
+
- **travis**
|
|
104
|
+
- use cache for bundler
|
|
105
|
+
([ac76b3b2](git@github.com:gildesmarais/html2rss/commit/ac76b3b2dd94adecd4927de18651800438a7e7ba))
|
|
106
|
+
- setup autorelease to rubygems
|
|
107
|
+
([eb9c8e1b](git@github.com:gildesmarais/html2rss/commit/eb9c8e1b16902dc0e174a0cccb6eb9227307ce82))
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
<sub><sup>*Generated with [git-changelog](https://github.com/rafinskipg/git-changelog). If you have any problems or suggestions, create an issue.* :) **Thanks** </sup></sub>
|
|
115
|
+
|
data/Gemfile.lock
CHANGED
|
@@ -4,7 +4,7 @@ PATH
|
|
|
4
4
|
html2rss (0.0.1)
|
|
5
5
|
faraday (~> 0.15)
|
|
6
6
|
nokogiri (~> 1.8)
|
|
7
|
-
sanitize (~>
|
|
7
|
+
sanitize (~> 5.0)
|
|
8
8
|
|
|
9
9
|
GEM
|
|
10
10
|
remote: https://rubygems.org/
|
|
@@ -12,31 +12,38 @@ GEM
|
|
|
12
12
|
byebug (10.0.2)
|
|
13
13
|
crass (1.0.4)
|
|
14
14
|
diff-lcs (1.3)
|
|
15
|
-
|
|
15
|
+
docile (1.3.1)
|
|
16
|
+
faraday (0.15.3)
|
|
16
17
|
multipart-post (>= 1.2, < 3)
|
|
18
|
+
json (2.1.0)
|
|
17
19
|
mini_portile2 (2.3.0)
|
|
18
20
|
multipart-post (2.0.0)
|
|
19
|
-
nokogiri (1.8.
|
|
21
|
+
nokogiri (1.8.5)
|
|
20
22
|
mini_portile2 (~> 2.3.0)
|
|
21
|
-
nokogumbo (
|
|
22
|
-
nokogiri
|
|
23
|
-
rspec (3.
|
|
24
|
-
rspec-core (~> 3.
|
|
25
|
-
rspec-expectations (~> 3.
|
|
26
|
-
rspec-mocks (~> 3.
|
|
27
|
-
rspec-core (3.
|
|
28
|
-
rspec-support (~> 3.
|
|
29
|
-
rspec-expectations (3.
|
|
23
|
+
nokogumbo (2.0.0)
|
|
24
|
+
nokogiri (~> 1.8, >= 1.8.4)
|
|
25
|
+
rspec (3.8.0)
|
|
26
|
+
rspec-core (~> 3.8.0)
|
|
27
|
+
rspec-expectations (~> 3.8.0)
|
|
28
|
+
rspec-mocks (~> 3.8.0)
|
|
29
|
+
rspec-core (3.8.0)
|
|
30
|
+
rspec-support (~> 3.8.0)
|
|
31
|
+
rspec-expectations (3.8.2)
|
|
30
32
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
31
|
-
rspec-support (~> 3.
|
|
32
|
-
rspec-mocks (3.
|
|
33
|
+
rspec-support (~> 3.8.0)
|
|
34
|
+
rspec-mocks (3.8.0)
|
|
33
35
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
34
|
-
rspec-support (~> 3.
|
|
35
|
-
rspec-support (3.
|
|
36
|
-
sanitize (
|
|
36
|
+
rspec-support (~> 3.8.0)
|
|
37
|
+
rspec-support (3.8.0)
|
|
38
|
+
sanitize (5.0.0)
|
|
37
39
|
crass (~> 1.0.2)
|
|
38
|
-
nokogiri (>= 1.
|
|
39
|
-
nokogumbo (~>
|
|
40
|
+
nokogiri (>= 1.8.0)
|
|
41
|
+
nokogumbo (~> 2.0)
|
|
42
|
+
simplecov (0.16.1)
|
|
43
|
+
docile (~> 1.1)
|
|
44
|
+
json (>= 1.8, < 3)
|
|
45
|
+
simplecov-html (~> 0.10.0)
|
|
46
|
+
simplecov-html (0.10.2)
|
|
40
47
|
vcr (4.0.0)
|
|
41
48
|
|
|
42
49
|
PLATFORMS
|
|
@@ -47,7 +54,8 @@ DEPENDENCIES
|
|
|
47
54
|
byebug (~> 10.0)
|
|
48
55
|
html2rss!
|
|
49
56
|
rspec (~> 3.0)
|
|
57
|
+
simplecov
|
|
50
58
|
vcr (~> 4.0)
|
|
51
59
|
|
|
52
60
|
BUNDLED WITH
|
|
53
|
-
1.16.
|
|
61
|
+
1.16.6
|
data/README.md
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
|
|
1
|
+

|
|
2
|
+
|
|
3
|
+
# html2rss [](https://travis-ci.org/gildesmarais/html2rss)
|
|
2
4
|
|
|
3
5
|
Request and convert an HTML document to an RSS feed via a config object.
|
|
4
6
|
The config contains the URL to scrape and the selectors needed to extract
|
|
@@ -24,14 +26,26 @@ Or install it yourself as:
|
|
|
24
26
|
|
|
25
27
|
$ gem install html2rss
|
|
26
28
|
|
|
27
|
-
## Usage
|
|
29
|
+
## Usage
|
|
30
|
+
|
|
31
|
+
## Usage with a YAML file
|
|
28
32
|
|
|
29
|
-
Create a YAML config file. Find an example at `rspec/config.test.yml
|
|
33
|
+
Create a YAML config file. Find an example at [`spec/config.test.yml`](https://github.com/gildesmarais/html2rss/blob/master/spec/config.test.yml).
|
|
30
34
|
|
|
31
35
|
`Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')` returns
|
|
32
36
|
|
|
33
37
|
an `RSS::Rss` object.
|
|
34
38
|
|
|
39
|
+
## Usage in a web application
|
|
40
|
+
|
|
41
|
+
Find a minimal Sinatra app which exposes your feeds to HTTP endpoints here:
|
|
42
|
+
[gildesmarais/html2rss-web](https://github.com/gildesmarais/html2rss-web)
|
|
43
|
+
|
|
44
|
+
### Tips and tricks
|
|
45
|
+
|
|
46
|
+
- Check that the channel url does not redirect to a mobile page
|
|
47
|
+
- Fiddling with [`curl`](https://github.com/curl/curl) and [`pup`](https://github.com/ericchiang/pup) to find the selectors seems quite efficient.
|
|
48
|
+
|
|
35
49
|
## Development
|
|
36
50
|
|
|
37
51
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
|
@@ -40,6 +54,11 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
|
|
|
40
54
|
|
|
41
55
|
Bug reports and pull requests are welcome on GitHub at https://github.com/gildesmarais/html2rss.
|
|
42
56
|
|
|
57
|
+
## Changelog generation
|
|
58
|
+
|
|
59
|
+
The `CHANGELOG.md` can be generated automatically.
|
|
60
|
+
Install [git-changelog](https://www.npmjs.com/package/git-changelog) globally and run `git-changelog` afterwards.
|
|
61
|
+
|
|
43
62
|
## License
|
|
44
63
|
|
|
45
64
|
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/html2rss.gemspec
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
lib = File.expand_path('
|
|
1
|
+
lib = File.expand_path('lib', __dir__)
|
|
2
2
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
3
3
|
require 'html2rss/version'
|
|
4
4
|
|
|
@@ -28,11 +28,12 @@ Gem::Specification.new do |spec|
|
|
|
28
28
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
|
29
29
|
spec.require_paths = ['lib']
|
|
30
30
|
|
|
31
|
-
spec.add_dependency 'nokogiri', '~> 1.8'
|
|
32
|
-
spec.add_dependency 'sanitize', '~> 4.6'
|
|
33
31
|
spec.add_dependency 'faraday', '~> 0.15'
|
|
32
|
+
spec.add_dependency 'nokogiri', '~> 1.8'
|
|
33
|
+
spec.add_dependency 'sanitize', '~> 5.0'
|
|
34
34
|
spec.add_development_dependency 'bundler', '~> 1.16'
|
|
35
|
+
spec.add_development_dependency 'byebug', '~> 10.0'
|
|
35
36
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
|
37
|
+
spec.add_development_dependency 'simplecov'
|
|
36
38
|
spec.add_development_dependency 'vcr', '~> 4.0'
|
|
37
|
-
spec.add_development_dependency 'byebug', '~> 10.0'
|
|
38
39
|
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
require_relative 'attribute_post_processors/parse_time'
|
|
2
|
+
require_relative 'attribute_post_processors/parse_uri'
|
|
3
|
+
require_relative 'attribute_post_processors/sanitize_html'
|
|
4
|
+
require_relative 'attribute_post_processors/substring'
|
|
5
|
+
require_relative 'attribute_post_processors/template'
|
|
6
|
+
|
|
7
|
+
module Html2rss
|
|
8
|
+
module AttributePostProcessors
|
|
9
|
+
def self.get_processor(options)
|
|
10
|
+
camel_cased_option = options['name'].split('_').collect(&:capitalize).join
|
|
11
|
+
class_name = ['Html2rss', 'AttributePostProcessors', camel_cased_option].join('::')
|
|
12
|
+
|
|
13
|
+
Object.const_get(class_name)
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
require 'sanitize'
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
module AttributePostProcessors
|
|
5
|
+
class SanitizeHtml
|
|
6
|
+
def initialize(value, _options, _item)
|
|
7
|
+
@value = value
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
def get
|
|
11
|
+
Sanitize.fragment(@value, Sanitize::Config.merge(
|
|
12
|
+
Sanitize::Config::RELAXED,
|
|
13
|
+
add_attributes: {
|
|
14
|
+
'a' => {
|
|
15
|
+
'rel' => 'nofollow noopener noreferrer',
|
|
16
|
+
'target' => '_blank'
|
|
17
|
+
}
|
|
18
|
+
}
|
|
19
|
+
))
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
module Html2rss
|
|
2
|
+
module AttributePostProcessors
|
|
3
|
+
class Substring
|
|
4
|
+
def initialize(value, options, _item)
|
|
5
|
+
@value = value
|
|
6
|
+
@options = options
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
def get
|
|
10
|
+
ending = @options['end'].to_i.positive? ? @options['end'].to_i : @value.length
|
|
11
|
+
@value[@options['start'].to_i..ending]
|
|
12
|
+
end
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
require 'sanitize'
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
module AttributePostProcessors
|
|
5
|
+
class Template
|
|
6
|
+
def initialize(value, options, item)
|
|
7
|
+
@value = value
|
|
8
|
+
@options = options
|
|
9
|
+
@item = item
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def get
|
|
13
|
+
string % methods
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
private
|
|
17
|
+
|
|
18
|
+
def string
|
|
19
|
+
@options['string']
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def methods
|
|
23
|
+
@methods ||= @options['methods'].map { |method|
|
|
24
|
+
if method == 'self'
|
|
25
|
+
@value
|
|
26
|
+
else
|
|
27
|
+
@item.send(method.to_sym)&.to_s
|
|
28
|
+
end
|
|
29
|
+
}
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
end
|
data/lib/html2rss/config.rb
CHANGED
|
@@ -38,7 +38,7 @@ module Html2rss
|
|
|
38
38
|
end
|
|
39
39
|
|
|
40
40
|
def options(name)
|
|
41
|
-
feed_config.dig('selectors',
|
|
41
|
+
feed_config.dig('selectors').fetch(name, {}).merge('channel' => channel_config)
|
|
42
42
|
end
|
|
43
43
|
|
|
44
44
|
def selector(name)
|
|
@@ -46,8 +46,8 @@ module Html2rss
|
|
|
46
46
|
end
|
|
47
47
|
|
|
48
48
|
def attribute_names
|
|
49
|
-
attribute_names = feed_config.fetch('selectors', {}).keys.map(&:
|
|
50
|
-
attribute_names.delete(
|
|
49
|
+
attribute_names = feed_config.fetch('selectors', {}).keys.map(&:to_s)
|
|
50
|
+
attribute_names.delete('items')
|
|
51
51
|
attribute_names
|
|
52
52
|
end
|
|
53
53
|
end
|
|
@@ -22,7 +22,7 @@ module Html2rss
|
|
|
22
22
|
private
|
|
23
23
|
|
|
24
24
|
def add_channel_to_maker(maker)
|
|
25
|
-
[
|
|
25
|
+
%i[language author title description link ttl].each do |attribute_name|
|
|
26
26
|
maker.channel.send("#{attribute_name}=".to_sym, config.send(attribute_name))
|
|
27
27
|
end
|
|
28
28
|
|
|
@@ -31,12 +31,14 @@ module Html2rss
|
|
|
31
31
|
end
|
|
32
32
|
|
|
33
33
|
def feed_items
|
|
34
|
-
Item.from_url config.url, config
|
|
34
|
+
@feed_items ||= Item.from_url config.url, config
|
|
35
35
|
end
|
|
36
36
|
|
|
37
37
|
def add_item_to_items(feed_item, items)
|
|
38
|
+
return unless feed_item.valid?
|
|
39
|
+
|
|
38
40
|
items.new_item do |rss_item|
|
|
39
|
-
|
|
41
|
+
feed_item.available_attributes.each do |attribute_name|
|
|
40
42
|
rss_item.send("#{attribute_name}=".to_sym, feed_item.send(attribute_name))
|
|
41
43
|
|
|
42
44
|
rss_item.guid.content = Digest::SHA1.hexdigest(feed_item.title)
|
data/lib/html2rss/item.rb
CHANGED
|
@@ -2,6 +2,7 @@ require 'faraday'
|
|
|
2
2
|
require 'open-uri'
|
|
3
3
|
require 'nokogiri'
|
|
4
4
|
require_relative 'item_extractor'
|
|
5
|
+
require_relative 'attribute_post_processors'
|
|
5
6
|
|
|
6
7
|
module Html2rss
|
|
7
8
|
class Item
|
|
@@ -24,26 +25,37 @@ module Html2rss
|
|
|
24
25
|
proc = ItemExtractor.const_get extractor.upcase.to_sym
|
|
25
26
|
value = proc.call(xml, attribute_config)
|
|
26
27
|
|
|
27
|
-
post_process
|
|
28
|
+
post_process_options = attribute_config.fetch('post_process', false)
|
|
29
|
+
value = post_process(value, post_process_options) if post_process_options
|
|
30
|
+
|
|
31
|
+
value
|
|
28
32
|
end
|
|
29
33
|
|
|
30
|
-
def
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
34
|
+
def available_attributes
|
|
35
|
+
# TODO: support optional attributes, e.g. category, enclosure, source
|
|
36
|
+
@available_attributes ||= (%w[title link description author comments updated] & @config.attribute_names)
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def valid?
|
|
40
|
+
return false if [title.to_s, description.to_s].join('') == ''
|
|
41
|
+
|
|
42
|
+
true
|
|
39
43
|
end
|
|
40
44
|
|
|
41
45
|
def self.from_url(url, config)
|
|
42
46
|
connection = Faraday.new(url: url, headers: config.headers)
|
|
43
47
|
page = Nokogiri::HTML(connection.get.body)
|
|
44
|
-
page.css(config.selector('items')).map
|
|
48
|
+
page.css(config.selector('items')).map do |xml_item|
|
|
45
49
|
new xml_item, config
|
|
46
|
-
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
private
|
|
54
|
+
|
|
55
|
+
def post_process(value, options)
|
|
56
|
+
Html2rss::AttributePostProcessors.get_processor(options)
|
|
57
|
+
.new(value, options, self)
|
|
58
|
+
.get
|
|
47
59
|
end
|
|
48
60
|
end
|
|
49
61
|
end
|
|
@@ -1,25 +1,25 @@
|
|
|
1
|
-
require 'sanitize'
|
|
2
|
-
|
|
3
1
|
module Html2rss
|
|
4
2
|
module ItemExtractor
|
|
5
|
-
TEXT = proc { |xml, options| xml.css(options['selector'])&.text }
|
|
6
|
-
ATTRIBUTE = proc { |xml, options| xml.css(options['selector']).attr(options['attribute']) }
|
|
3
|
+
TEXT = proc { |xml, options| xml.css(options['selector'])&.text&.strip }
|
|
4
|
+
ATTRIBUTE = proc { |xml, options| xml.css(options['selector']).attr(options['attribute']).to_s }
|
|
7
5
|
|
|
8
6
|
HREF = proc { |xml, options|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
uri
|
|
12
|
-
}
|
|
7
|
+
href = xml.css(options['selector']).attr('href').to_s
|
|
8
|
+
path, query = href.split('?')
|
|
13
9
|
|
|
14
|
-
|
|
15
|
-
|
|
10
|
+
if href.start_with?('http')
|
|
11
|
+
uri = URI(href)
|
|
12
|
+
else
|
|
13
|
+
uri = URI(options['channel']['url'])
|
|
14
|
+
uri.path = path.start_with?('/') ? path : "/#{path}"
|
|
15
|
+
uri.query = query
|
|
16
|
+
end
|
|
16
17
|
|
|
17
|
-
|
|
18
|
-
Sanitize::Config::RELAXED,
|
|
19
|
-
add_attributes: {
|
|
20
|
-
'a' => { 'rel' => 'nofollow noopener noreferrer' }
|
|
21
|
-
}
|
|
22
|
-
))
|
|
18
|
+
uri
|
|
23
19
|
}
|
|
20
|
+
|
|
21
|
+
HTML = proc { |xml, options| xml.css(options['selector']).to_s }
|
|
22
|
+
STATIC = proc { |_xml, options| options['static'] }
|
|
23
|
+
CURRENT_TIME = proc { |_xml, _options| Time.new }
|
|
24
24
|
end
|
|
25
25
|
end
|
data/lib/html2rss/version.rb
CHANGED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
<% if(logo) { %><img width="300px" src="<%= logo %>" /><%= '\n\n' %><% } %># <%= title %>
|
|
2
|
+
<% if(intro) { %><%= '\n' %>_<%= intro %>_<%= '\n' %><% } %>
|
|
3
|
+
<% if(version && (version.name || version.number)) { %>##<% if(version.name){%> <%= version.name %><% } %> <%= version.number %> <% if(version.date){ %>( <%= version.date %> )<% } %><%= '\n' %><% } %>
|
|
4
|
+
<% _.forEach(sections, function(section){
|
|
5
|
+
if(section.commitsCount > 0) { %>
|
|
6
|
+
## <%= section.title %>
|
|
7
|
+
<% _.forEach(section.commits, function(commit){ %> - <%= printCommit(commit, true) %><% }) %>
|
|
8
|
+
<% _.forEach(section.components, function(component){ %> - **<%= component.name %>**
|
|
9
|
+
<% _.forEach(component.commits, function(commit){ %> - <%= printCommit(commit, true) %><% }) %>
|
|
10
|
+
<% }) %>
|
|
11
|
+
<% } %>
|
|
12
|
+
<% }) %>
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
<sub><sup>*Generated with [git-changelog](https://github.com/rafinskipg/git-changelog). If you have any problems or suggestions, create an issue.* :) **Thanks** </sup></sub>
|
data/support/logo.png
ADDED
|
Binary file
|
metadata
CHANGED
|
@@ -1,57 +1,57 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html2rss
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0
|
|
4
|
+
version: 0.1.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gil Desmarais
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2018-
|
|
11
|
+
date: 2018-11-04 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
|
-
name:
|
|
14
|
+
name: faraday
|
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
|
16
16
|
requirements:
|
|
17
17
|
- - "~>"
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
|
-
version: '
|
|
19
|
+
version: '0.15'
|
|
20
20
|
type: :runtime
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
24
|
- - "~>"
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
|
-
version: '
|
|
26
|
+
version: '0.15'
|
|
27
27
|
- !ruby/object:Gem::Dependency
|
|
28
|
-
name:
|
|
28
|
+
name: nokogiri
|
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
|
30
30
|
requirements:
|
|
31
31
|
- - "~>"
|
|
32
32
|
- !ruby/object:Gem::Version
|
|
33
|
-
version: '
|
|
33
|
+
version: '1.8'
|
|
34
34
|
type: :runtime
|
|
35
35
|
prerelease: false
|
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
|
37
37
|
requirements:
|
|
38
38
|
- - "~>"
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
|
-
version: '
|
|
40
|
+
version: '1.8'
|
|
41
41
|
- !ruby/object:Gem::Dependency
|
|
42
|
-
name:
|
|
42
|
+
name: sanitize
|
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
|
44
44
|
requirements:
|
|
45
45
|
- - "~>"
|
|
46
46
|
- !ruby/object:Gem::Version
|
|
47
|
-
version: '0
|
|
47
|
+
version: '5.0'
|
|
48
48
|
type: :runtime
|
|
49
49
|
prerelease: false
|
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
|
51
51
|
requirements:
|
|
52
52
|
- - "~>"
|
|
53
53
|
- !ruby/object:Gem::Version
|
|
54
|
-
version: '0
|
|
54
|
+
version: '5.0'
|
|
55
55
|
- !ruby/object:Gem::Dependency
|
|
56
56
|
name: bundler
|
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -66,6 +66,20 @@ dependencies:
|
|
|
66
66
|
- - "~>"
|
|
67
67
|
- !ruby/object:Gem::Version
|
|
68
68
|
version: '1.16'
|
|
69
|
+
- !ruby/object:Gem::Dependency
|
|
70
|
+
name: byebug
|
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
|
72
|
+
requirements:
|
|
73
|
+
- - "~>"
|
|
74
|
+
- !ruby/object:Gem::Version
|
|
75
|
+
version: '10.0'
|
|
76
|
+
type: :development
|
|
77
|
+
prerelease: false
|
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
79
|
+
requirements:
|
|
80
|
+
- - "~>"
|
|
81
|
+
- !ruby/object:Gem::Version
|
|
82
|
+
version: '10.0'
|
|
69
83
|
- !ruby/object:Gem::Dependency
|
|
70
84
|
name: rspec
|
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -81,33 +95,33 @@ dependencies:
|
|
|
81
95
|
- !ruby/object:Gem::Version
|
|
82
96
|
version: '3.0'
|
|
83
97
|
- !ruby/object:Gem::Dependency
|
|
84
|
-
name:
|
|
98
|
+
name: simplecov
|
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
|
86
100
|
requirements:
|
|
87
|
-
- - "
|
|
101
|
+
- - ">="
|
|
88
102
|
- !ruby/object:Gem::Version
|
|
89
|
-
version: '
|
|
103
|
+
version: '0'
|
|
90
104
|
type: :development
|
|
91
105
|
prerelease: false
|
|
92
106
|
version_requirements: !ruby/object:Gem::Requirement
|
|
93
107
|
requirements:
|
|
94
|
-
- - "
|
|
108
|
+
- - ">="
|
|
95
109
|
- !ruby/object:Gem::Version
|
|
96
|
-
version: '
|
|
110
|
+
version: '0'
|
|
97
111
|
- !ruby/object:Gem::Dependency
|
|
98
|
-
name:
|
|
112
|
+
name: vcr
|
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
|
100
114
|
requirements:
|
|
101
115
|
- - "~>"
|
|
102
116
|
- !ruby/object:Gem::Version
|
|
103
|
-
version: '
|
|
117
|
+
version: '4.0'
|
|
104
118
|
type: :development
|
|
105
119
|
prerelease: false
|
|
106
120
|
version_requirements: !ruby/object:Gem::Requirement
|
|
107
121
|
requirements:
|
|
108
122
|
- - "~>"
|
|
109
123
|
- !ruby/object:Gem::Version
|
|
110
|
-
version: '
|
|
124
|
+
version: '4.0'
|
|
111
125
|
description: |-
|
|
112
126
|
Create your config object, include the url to scrape,
|
|
113
127
|
some selectors and get a RSS2 feed in return.
|
|
@@ -117,9 +131,12 @@ executables: []
|
|
|
117
131
|
extensions: []
|
|
118
132
|
extra_rdoc_files: []
|
|
119
133
|
files:
|
|
134
|
+
- ".changelogrc"
|
|
120
135
|
- ".gitignore"
|
|
121
136
|
- ".rspec"
|
|
137
|
+
- ".rubocop.yml"
|
|
122
138
|
- ".travis.yml"
|
|
139
|
+
- CHANGELOG.md
|
|
123
140
|
- Gemfile
|
|
124
141
|
- Gemfile.lock
|
|
125
142
|
- LICENSE
|
|
@@ -128,11 +145,19 @@ files:
|
|
|
128
145
|
- bin/setup
|
|
129
146
|
- html2rss.gemspec
|
|
130
147
|
- lib/html2rss.rb
|
|
148
|
+
- lib/html2rss/attribute_post_processors.rb
|
|
149
|
+
- lib/html2rss/attribute_post_processors/parse_time.rb
|
|
150
|
+
- lib/html2rss/attribute_post_processors/parse_uri.rb
|
|
151
|
+
- lib/html2rss/attribute_post_processors/sanitize_html.rb
|
|
152
|
+
- lib/html2rss/attribute_post_processors/substring.rb
|
|
153
|
+
- lib/html2rss/attribute_post_processors/template.rb
|
|
131
154
|
- lib/html2rss/config.rb
|
|
132
155
|
- lib/html2rss/feed_builder.rb
|
|
133
156
|
- lib/html2rss/item.rb
|
|
134
157
|
- lib/html2rss/item_extractor.rb
|
|
135
158
|
- lib/html2rss/version.rb
|
|
159
|
+
- support/changelog.md
|
|
160
|
+
- support/logo.png
|
|
136
161
|
homepage: https://github.com/gildesmarais/html2rss
|
|
137
162
|
licenses:
|
|
138
163
|
- MIT
|
|
@@ -154,7 +179,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
154
179
|
version: '0'
|
|
155
180
|
requirements: []
|
|
156
181
|
rubyforge_project:
|
|
157
|
-
rubygems_version: 2.7.
|
|
182
|
+
rubygems_version: 2.7.8
|
|
158
183
|
signing_key:
|
|
159
184
|
specification_version: 4
|
|
160
185
|
summary: Generate RSS feeds by scraping websites by providing a config.
|