html2rss 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 48c6ef4f6636bac787600fa7b2efa3d313361016279d9895b50895c00b74eb8b
4
- data.tar.gz: 58ea03df84c7c8f3c0a73e5186b49f766f06e476a1606b4837d3749444df1551
3
+ metadata.gz: 1cc71ee2722757d0f4cceaf8aea1b9007496acf35fdf4da37a5dea998233258d
4
+ data.tar.gz: '0009a02fe85a6e2b088aec00944e1dc46233ce6878b8558782286be0e020511b'
5
5
  SHA512:
6
- metadata.gz: 5dd2ba0fb71dcb16bf08632feedf1d5edfe666225bcfeca409be4be153de44a9a04900e0cfd3d188b18b7e9e48ae37051a48448ed06649deb281440a1cb3fbad
7
- data.tar.gz: '0908d37c6a8b3b2b0dd620c69bdfea808533b862f5bf6892f4a96bb2b9ce9c36f211bb83072ca92bcd310a42c78a7270b593acc10caa7833f67060f487636ad2'
6
+ metadata.gz: 2de4befd1f3beec2ee094f2c792ec94a1bde37c3da1f30eb737403ae04eddb36b54a43b86163f8b348229c42a5d38aae0bc7657c677bdbe95a54bee26b1fa6d9
7
+ data.tar.gz: f344bb3046c0493d9979e9daf53d5f3725798c96093dbbff6164a1339fd80bf54ad60868eb55dbc5a67239584f825cb0f5f6ca18c4e3e7f8f67b6698d26f3adf
@@ -0,0 +1,19 @@
1
+ {
2
+ "app_name": "html2rss",
3
+ "logo": "https://github.com/gildesmarais/html2rss/raw/master/support/logo.png",
4
+ "intro": "Generate RSS feeds by scraping websites by providing a config.",
5
+ "debug": "true",
6
+ "template": "support/changelog.md",
7
+ "sections": [
8
+ { "title": "Bugfixes", "grep": "^fix" },
9
+ { "title": "Features", "grep": "^feat" },
10
+ { "title": "Documentation", "grep": "^docs" },
11
+ { "title": "Breaking changes", "grep": "BREAKING" },
12
+ { "title": "Refactorings", "grep": "^refactor" },
13
+ { "title": "Code style", "grep": "^style" },
14
+ { "title": "Test", "grep": "^spec" },
15
+ { "title": "Chore", "grep": "^chore" },
16
+ { "title": "Branches merged", "grep": "^Merge branch" },
17
+ { "title": "Pull requests merged", "grep": "^Merge pull request" }
18
+ ]
19
+ }
data/.gitignore CHANGED
@@ -9,3 +9,4 @@
9
9
 
10
10
  # rspec failure tracking
11
11
  .rspec_status
12
+ coverage
@@ -0,0 +1,46 @@
1
+ AllCops:
2
+ TargetRubyVersion: 2.4
3
+ DisplayCopNames: true
4
+
5
+ Metrics/LineLength:
6
+ Max: 110
7
+
8
+ Metrics/BlockLength:
9
+ Exclude:
10
+ - "**/*_spec.rb"
11
+ - html2rss.gemspec
12
+
13
+ Metrics/ModuleLength:
14
+ Exclude:
15
+ - "**/*_spec.rb"
16
+
17
+ Documentation:
18
+ Enabled: false
19
+
20
+ Style/BlockDelimiters:
21
+ Enabled: false
22
+
23
+ Style/FrozenStringLiteralComment:
24
+ Enabled: false
25
+
26
+ Style/ParallelAssignment:
27
+ Enabled: false
28
+
29
+ Style/AsciiComments:
30
+ Enabled: false
31
+
32
+ Style/BracesAroundHashParameters:
33
+ Description: 'Enforce braces style around hash parameters.'
34
+ Enabled: true
35
+
36
+ Style/HashSyntax:
37
+ Description: >-
38
+ Prefer Ruby 1.9 hash syntax { a: 1, b: 2 } over 1.8 syntax
39
+ { :a => 1, :b => 2 }.
40
+ StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#hash-literals'
41
+ Enabled: true
42
+
43
+ Layout/SpaceInsideParens:
44
+ Description: 'No spaces after ( or before ).'
45
+ StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#no-spaces-braces'
46
+ Enabled: true
@@ -1,15 +1,24 @@
1
1
  sudo: false
2
2
  language: ruby
3
+ cache: bundler
3
4
 
4
5
  before_install:
5
6
  - gem update --system
6
7
  - gem install bundler
7
- bundler_args: --jobs=3 --retry=3
8
+
9
+ bundler_args: "--jobs=3 --retry=3"
8
10
 
9
11
  rvm:
10
- - 2.3.7
11
- - 2.4.4
12
- - 2.5.1
12
+ - 2.3.8
13
+ - 2.4.5
14
+ - 2.5.3
13
15
 
14
16
  script:
15
17
  - bundle exec rspec
18
+
19
+ deploy:
20
+ provider: rubygems
21
+ api_key:
22
+ secure: bM3Yl8iWdB1Amra3Bm6bIH/mTwHcRhZrX8etFFbJANxIbkhzUOyTKcDMYiWUVM/mBzzv0NOuRejrDR6R0v7E2udrKcLQFCBtv7HqPAXIlkEEyxZy+M1kTqcPzP872E+ZKTn93vCzbiXBLYoMmqgCzqvcO87IBYNzTURHkfFjaYJJdVyZ5EVtbpXf4FhBvuQf9LTk/ocClgwYeuqd+45lO7qHoPatsvbY0vCOfKaiwkdOkBt+hjc56awcYSc9CXn0DCatebPQmQmdrqFd8fKgyCatWS3n+8TPmvzVfNJe44wg3oNfHbWruP85I2LE9ei1iG+iGQIF60fMhGgMJ4EM3REXDE5Mg+GA5uJcgH9Poirut3Ih65jtAzYNGohlmEmc7ysKc0dmG1O3ndwrHjh5KePrOAGDaW6QKG+m5ebIZ+mgrEA+ZVU1mjDM8FlbSKAayoPloslZdllSv7miwGzh6xrHWGQSCURZAkygFh+Kd+Kg1eVlEs+n6aObod82mEOfBPvWPacOrE2fY4B0ocFOKotZBCZSD0ZIixlyslRTnmcJfpRNlYLsQ56oy5uPNUccPQ86NSmmE+qbRdPCLQCKLPm2iYBgOa5iQrfHR/fUgcO0skAZiW4o9QflDgIFS/G+BE6FMHIvjkKA6Ae4KbqGzlF5pGFdo6p4MhlvubwjsVI=
23
+ on:
24
+ tags: true
@@ -0,0 +1,115 @@
1
+ <img width="300px" src="https://github.com/gildesmarais/html2rss/raw/master/support/logo.png" />
2
+
3
+ # html2rss
4
+
5
+ _Generate RSS feeds by scraping websites by providing a config._
6
+
7
+
8
+
9
+ ## Bugfixes
10
+ - handling of url query breaks processing
11
+ ([ace289e9](git@github.com:gildesmarais/html2rss/commit/ace289e911b69cb92433cac6f1ca0403715d8286))
12
+ - only set supported attributes on rss item
13
+ ([dae0d8e7](git@github.com:gildesmarais/html2rss/commit/dae0d8e75541e810275e789a23971a61e60a2154))
14
+
15
+ - **config**
16
+ - feed generation fails
17
+ ([7dd55869](git@github.com:gildesmarais/html2rss/commit/7dd55869f79b1de76c004bf0e82d13b16b5b3f0d))
18
+
19
+ - **parse_uri**
20
+ - handle non-absolute paths
21
+ ([92150257](git@github.com:gildesmarais/html2rss/commit/921502574e4436d65a30e1d34b9b31f238336247))
22
+
23
+
24
+
25
+
26
+ ## Features
27
+ - add logo [skip ci]
28
+ ([857a55fd](git@github.com:gildesmarais/html2rss/commit/857a55fd8c932930d96c47c5abe57f0507356df1))
29
+ - require updated to be present
30
+ ([e1bedaec](git@github.com:gildesmarais/html2rss/commit/e1bedaecc91e874fe24e96000612abb9cd11e9fe))
31
+ - do not fail on invalid item, just skip it
32
+ ([3b83d715](git@github.com:gildesmarais/html2rss/commit/3b83d715619abbc33b124de1945d17cb0dc7edb0))
33
+
34
+ - **item_extractor**
35
+ - text strips strings
36
+ ([f5982859](git@github.com:gildesmarais/html2rss/commit/f59828593dca663bdbe8699392594e2d18658f8f))
37
+ - add static and current_time
38
+ ([25043dcb](git@github.com:gildesmarais/html2rss/commit/25043dcbd8f0f4901202f4a2f66b355ac48825a8))
39
+ - handle absolute urls
40
+ ([f96be008](git@github.com:gildesmarais/html2rss/commit/f96be00857bdcded02d52dd62ec22b9b52c803ed))
41
+
42
+ - **post_processing**
43
+ - add configurable post_processing (#5)
44
+ ([4cf6caca](git@github.com:gildesmarais/html2rss/commit/4cf6cacac00bd3c0c53d584ca11274ba24b03ef7),
45
+ [#1](git@github.com:gildesmarais/html2rss/issues/1))
46
+
47
+ - **post_processor**
48
+ - add substring
49
+ ([6f2a32a6](git@github.com:gildesmarais/html2rss/commit/6f2a32a6304ef9956577711173de681daf93f55f))
50
+
51
+ - **postprocessors**
52
+ - add Template (#6)
53
+ ([f1db542e](git@github.com:gildesmarais/html2rss/commit/f1db542e8c1e9e09a066a3cd6c8514a6ca0aa871),
54
+ [#4](git@github.com:gildesmarais/html2rss/issues/4))
55
+
56
+ - **sanitize_html**
57
+ - add target="_blank" to anchors
58
+ ([975a73bf](git@github.com:gildesmarais/html2rss/commit/975a73bfd396ba5942bc0ea80eebd14cc37ad776))
59
+
60
+
61
+
62
+
63
+ ## Documentation
64
+ - create the changelog
65
+ ([5c561db5](git@github.com:gildesmarais/html2rss/commit/5c561db51d4e0b8592b1c82812ab5cdbe9320b70))
66
+ - add tips and tricks
67
+ ([ea978240](git@github.com:gildesmarais/html2rss/commit/ea9782408107f3637a4c9665396f511fc07be19b))
68
+ - update readme
69
+ ([4743167c](git@github.com:gildesmarais/html2rss/commit/4743167c86959e83524ffb7282c562413a651797))
70
+ - add note about html2rss-web
71
+ ([3371c12f](git@github.com:gildesmarais/html2rss/commit/3371c12ffc6c8d3c29073d03ff206886a39401cd))
72
+ - add a badge for travis-ci
73
+ ([8818d4f4](git@github.com:gildesmarais/html2rss/commit/8818d4f464a9c163ebc9665d01719e2bab132bd6))
74
+
75
+
76
+
77
+
78
+ ## Test
79
+ - don't be so lazy when matching strings
80
+ ([6a0eb627](git@github.com:gildesmarais/html2rss/commit/6a0eb62765523a1405fd269466b2fc57794eac7a))
81
+
82
+
83
+
84
+
85
+ ## Chore
86
+ - upgrade sanitize gem to version 5.0.0
87
+ ([8c4bf3a4](git@github.com:gildesmarais/html2rss/commit/8c4bf3a44885758e395568ec452a7cffdb9a0389))
88
+ - rubocop autocorrect
89
+ ([af6b9fac](git@github.com:gildesmarais/html2rss/commit/af6b9facca547d3ca3ce9ef0d1227707cd16eaea))
90
+ - build against latest ruby releases
91
+ ([17ba79ac](git@github.com:gildesmarais/html2rss/commit/17ba79acd2f68da1fcc984368d3e6de3437cbf1b))
92
+ - update dependencies
93
+ ([855279b4](git@github.com:gildesmarais/html2rss/commit/855279b46d584a8a8c2a317529f7a4be550eaf15))
94
+ - update dependencies
95
+ ([46e5a283](git@github.com:gildesmarais/html2rss/commit/46e5a2832d1f2fe1353dcc2a8d82a9786f15f6bd))
96
+ - add simplecov
97
+ ([b4e1144b](git@github.com:gildesmarais/html2rss/commit/b4e1144b7f8f90126e528cc4a4ec048113d93634))
98
+
99
+ - **changelog**
100
+ - add generation with git-changelog
101
+ ([07ad5a51](git@github.com:gildesmarais/html2rss/commit/07ad5a513f0951ee988426abda4b8c233411ead7))
102
+
103
+ - **travis**
104
+ - use cache for bundler
105
+ ([ac76b3b2](git@github.com:gildesmarais/html2rss/commit/ac76b3b2dd94adecd4927de18651800438a7e7ba))
106
+ - setup autorelease to rubygems
107
+ ([eb9c8e1b](git@github.com:gildesmarais/html2rss/commit/eb9c8e1b16902dc0e174a0cccb6eb9227307ce82))
108
+
109
+
110
+
111
+
112
+
113
+ ---
114
+ <sub><sup>*Generated with [git-changelog](https://github.com/rafinskipg/git-changelog). If you have any problems or suggestions, create an issue.* :) **Thanks** </sup></sub>
115
+
@@ -4,7 +4,7 @@ PATH
4
4
  html2rss (0.0.1)
5
5
  faraday (~> 0.15)
6
6
  nokogiri (~> 1.8)
7
- sanitize (~> 4.6)
7
+ sanitize (~> 5.0)
8
8
 
9
9
  GEM
10
10
  remote: https://rubygems.org/
@@ -12,31 +12,38 @@ GEM
12
12
  byebug (10.0.2)
13
13
  crass (1.0.4)
14
14
  diff-lcs (1.3)
15
- faraday (0.15.2)
15
+ docile (1.3.1)
16
+ faraday (0.15.3)
16
17
  multipart-post (>= 1.2, < 3)
18
+ json (2.1.0)
17
19
  mini_portile2 (2.3.0)
18
20
  multipart-post (2.0.0)
19
- nokogiri (1.8.2)
21
+ nokogiri (1.8.5)
20
22
  mini_portile2 (~> 2.3.0)
21
- nokogumbo (1.5.0)
22
- nokogiri
23
- rspec (3.7.0)
24
- rspec-core (~> 3.7.0)
25
- rspec-expectations (~> 3.7.0)
26
- rspec-mocks (~> 3.7.0)
27
- rspec-core (3.7.1)
28
- rspec-support (~> 3.7.0)
29
- rspec-expectations (3.7.0)
23
+ nokogumbo (2.0.0)
24
+ nokogiri (~> 1.8, >= 1.8.4)
25
+ rspec (3.8.0)
26
+ rspec-core (~> 3.8.0)
27
+ rspec-expectations (~> 3.8.0)
28
+ rspec-mocks (~> 3.8.0)
29
+ rspec-core (3.8.0)
30
+ rspec-support (~> 3.8.0)
31
+ rspec-expectations (3.8.2)
30
32
  diff-lcs (>= 1.2.0, < 2.0)
31
- rspec-support (~> 3.7.0)
32
- rspec-mocks (3.7.0)
33
+ rspec-support (~> 3.8.0)
34
+ rspec-mocks (3.8.0)
33
35
  diff-lcs (>= 1.2.0, < 2.0)
34
- rspec-support (~> 3.7.0)
35
- rspec-support (3.7.1)
36
- sanitize (4.6.5)
36
+ rspec-support (~> 3.8.0)
37
+ rspec-support (3.8.0)
38
+ sanitize (5.0.0)
37
39
  crass (~> 1.0.2)
38
- nokogiri (>= 1.4.4)
39
- nokogumbo (~> 1.4)
40
+ nokogiri (>= 1.8.0)
41
+ nokogumbo (~> 2.0)
42
+ simplecov (0.16.1)
43
+ docile (~> 1.1)
44
+ json (>= 1.8, < 3)
45
+ simplecov-html (~> 0.10.0)
46
+ simplecov-html (0.10.2)
40
47
  vcr (4.0.0)
41
48
 
42
49
  PLATFORMS
@@ -47,7 +54,8 @@ DEPENDENCIES
47
54
  byebug (~> 10.0)
48
55
  html2rss!
49
56
  rspec (~> 3.0)
57
+ simplecov
50
58
  vcr (~> 4.0)
51
59
 
52
60
  BUNDLED WITH
53
- 1.16.2
61
+ 1.16.6
data/README.md CHANGED
@@ -1,4 +1,6 @@
1
- # Html2rss
1
+ ![html2rss logo](https://github.com/gildesmarais/html2rss/raw/master/support/logo.png)
2
+
3
+ # html2rss [![Build Status](https://travis-ci.org/gildesmarais/html2rss.svg?branch=master)](https://travis-ci.org/gildesmarais/html2rss)
2
4
 
3
5
  Request and convert an HTML document to an RSS feed via a config object.
4
6
  The config contains the URL to scrape and the selectors needed to extract
@@ -24,14 +26,26 @@ Or install it yourself as:
24
26
 
25
27
  $ gem install html2rss
26
28
 
27
- ## Usage example with a YAML file
29
+ ## Usage
30
+
31
+ ## Usage with a YAML file
28
32
 
29
- Create a YAML config file. Find an example at `rspec/config.test.yml`.
33
+ Create a YAML config file. Find an example at [`spec/config.test.yml`](https://github.com/gildesmarais/html2rss/blob/master/spec/config.test.yml).
30
34
 
31
35
  `Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')` returns
32
36
 
33
37
  an `RSS::Rss` object.
34
38
 
39
+ ## Usage in a web application
40
+
41
+ Find a minimal Sinatra app which exposes your feeds to HTTP endpoints here:
42
+ [gildesmarais/html2rss-web](https://github.com/gildesmarais/html2rss-web)
43
+
44
+ ### Tips and tricks
45
+
46
+ - Check that the channel url does not redirect to a mobile page
47
+ - fiddling with [`curl`](https://github.com/curl/curl) and [`pup`](https://github.com/ericchiang/pup) to find the selectors seems quite efficient
48
+
35
49
  ## Development
36
50
 
37
51
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -40,6 +54,11 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
40
54
 
41
55
  Bug reports and pull requests are welcome on GitHub at https://github.com/gildesmarais/html2rss.
42
56
 
57
+ ## Changelog generation
58
+
59
+ The `CHANGELOG.md` can be generated automatically.
60
+ Install [git-changelog](https://www.npmjs.com/package/git-changelog) globally and run `git-changelog` afterwards.
61
+
43
62
  ## License
44
63
 
45
64
  The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -1,4 +1,4 @@
1
- lib = File.expand_path('../lib', __FILE__)
1
+ lib = File.expand_path('lib', __dir__)
2
2
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
3
  require 'html2rss/version'
4
4
 
@@ -28,11 +28,12 @@ Gem::Specification.new do |spec|
28
28
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
29
29
  spec.require_paths = ['lib']
30
30
 
31
- spec.add_dependency 'nokogiri', '~> 1.8'
32
- spec.add_dependency 'sanitize', '~> 4.6'
33
31
  spec.add_dependency 'faraday', '~> 0.15'
32
+ spec.add_dependency 'nokogiri', '~> 1.8'
33
+ spec.add_dependency 'sanitize', '~> 5.0'
34
34
  spec.add_development_dependency 'bundler', '~> 1.16'
35
+ spec.add_development_dependency 'byebug', '~> 10.0'
35
36
  spec.add_development_dependency 'rspec', '~> 3.0'
37
+ spec.add_development_dependency 'simplecov'
36
38
  spec.add_development_dependency 'vcr', '~> 4.0'
37
- spec.add_development_dependency 'byebug', '~> 10.0'
38
39
  end
@@ -0,0 +1,16 @@
1
+ require_relative 'attribute_post_processors/parse_time'
2
+ require_relative 'attribute_post_processors/parse_uri'
3
+ require_relative 'attribute_post_processors/sanitize_html'
4
+ require_relative 'attribute_post_processors/substring'
5
+ require_relative 'attribute_post_processors/template'
6
+
7
+ module Html2rss
8
+ module AttributePostProcessors
9
+ def self.get_processor(options)
10
+ camel_cased_option = options['name'].split('_').collect(&:capitalize).join
11
+ class_name = ['Html2rss', 'AttributePostProcessors', camel_cased_option].join('::')
12
+
13
+ Object.const_get(class_name)
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,13 @@
1
+ module Html2rss
2
+ module AttributePostProcessors
3
+ class ParseTime
4
+ def initialize(value, _options, _item)
5
+ @value = value
6
+ end
7
+
8
+ def get
9
+ Time.parse(@value).rfc822
10
+ end
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,13 @@
1
+ module Html2rss
2
+ module AttributePostProcessors
3
+ class ParseUri
4
+ def initialize(value, _options, _item)
5
+ @value = value
6
+ end
7
+
8
+ def get
9
+ URI(@value).to_s
10
+ end
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,23 @@
1
+ require 'sanitize'
2
+
3
+ module Html2rss
4
+ module AttributePostProcessors
5
+ class SanitizeHtml
6
+ def initialize(value, _options, _item)
7
+ @value = value
8
+ end
9
+
10
+ def get
11
+ Sanitize.fragment(@value, Sanitize::Config.merge(
12
+ Sanitize::Config::RELAXED,
13
+ add_attributes: {
14
+ 'a' => {
15
+ 'rel' => 'nofollow noopener noreferrer',
16
+ 'target' => '_blank'
17
+ }
18
+ }
19
+ ))
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,15 @@
1
+ module Html2rss
2
+ module AttributePostProcessors
3
+ class Substring
4
+ def initialize(value, options, _item)
5
+ @value = value
6
+ @options = options
7
+ end
8
+
9
+ def get
10
+ ending = @options['end'].to_i.positive? ? @options['end'].to_i : @value.length
11
+ @value[@options['start'].to_i..ending]
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,33 @@
1
+ require 'sanitize'
2
+
3
+ module Html2rss
4
+ module AttributePostProcessors
5
+ class Template
6
+ def initialize(value, options, item)
7
+ @value = value
8
+ @options = options
9
+ @item = item
10
+ end
11
+
12
+ def get
13
+ string % methods
14
+ end
15
+
16
+ private
17
+
18
+ def string
19
+ @options['string']
20
+ end
21
+
22
+ def methods
23
+ @methods ||= @options['methods'].map { |method|
24
+ if method == 'self'
25
+ @value
26
+ else
27
+ @item.send(method.to_sym)&.to_s
28
+ end
29
+ }
30
+ end
31
+ end
32
+ end
33
+ end
@@ -38,7 +38,7 @@ module Html2rss
38
38
  end
39
39
 
40
40
  def options(name)
41
- feed_config.dig('selectors', name).merge('channel' => channel_config)
41
+ feed_config.dig('selectors').fetch(name, {}).merge('channel' => channel_config)
42
42
  end
43
43
 
44
44
  def selector(name)
@@ -46,8 +46,8 @@ module Html2rss
46
46
  end
47
47
 
48
48
  def attribute_names
49
- attribute_names = feed_config.fetch('selectors', {}).keys.map(&:to_sym)
50
- attribute_names.delete(:items)
49
+ attribute_names = feed_config.fetch('selectors', {}).keys.map(&:to_s)
50
+ attribute_names.delete('items')
51
51
  attribute_names
52
52
  end
53
53
  end
@@ -22,7 +22,7 @@ module Html2rss
22
22
  private
23
23
 
24
24
  def add_channel_to_maker(maker)
25
- [:language, :author, :title, :description, :link, :ttl].each do |attribute_name|
25
+ %i[language author title description link ttl].each do |attribute_name|
26
26
  maker.channel.send("#{attribute_name}=".to_sym, config.send(attribute_name))
27
27
  end
28
28
 
@@ -31,12 +31,14 @@ module Html2rss
31
31
  end
32
32
 
33
33
  def feed_items
34
- Item.from_url config.url, config
34
+ @feed_items ||= Item.from_url config.url, config
35
35
  end
36
36
 
37
37
  def add_item_to_items(feed_item, items)
38
+ return unless feed_item.valid?
39
+
38
40
  items.new_item do |rss_item|
39
- config.attribute_names.each do |attribute_name|
41
+ feed_item.available_attributes.each do |attribute_name|
40
42
  rss_item.send("#{attribute_name}=".to_sym, feed_item.send(attribute_name))
41
43
 
42
44
  rss_item.guid.content = Digest::SHA1.hexdigest(feed_item.title)
@@ -2,6 +2,7 @@ require 'faraday'
2
2
  require 'open-uri'
3
3
  require 'nokogiri'
4
4
  require_relative 'item_extractor'
5
+ require_relative 'attribute_post_processors'
5
6
 
6
7
  module Html2rss
7
8
  class Item
@@ -24,26 +25,37 @@ module Html2rss
24
25
  proc = ItemExtractor.const_get extractor.upcase.to_sym
25
26
  value = proc.call(xml, attribute_config)
26
27
 
27
- post_process(method_name, value)
28
+ post_process_options = attribute_config.fetch('post_process', false)
29
+ value = post_process(value, post_process_options) if post_process_options
30
+
31
+ value
28
32
  end
29
33
 
30
- def post_process(method_name, value)
31
- case method_name
32
- when :link
33
- URI(value)
34
- when :updated
35
- Time.parse(value).to_s
36
- else
37
- value
38
- end
34
+ def available_attributes
35
+ # TODO: support optional attributes, e.g. category, enclosure, source
36
+ @available_attributes ||= (%w[title link description author comments updated] & @config.attribute_names)
37
+ end
38
+
39
+ def valid?
40
+ return false if [title.to_s, description.to_s].join('') == ''
41
+
42
+ true
39
43
  end
40
44
 
41
45
  def self.from_url(url, config)
42
46
  connection = Faraday.new(url: url, headers: config.headers)
43
47
  page = Nokogiri::HTML(connection.get.body)
44
- page.css(config.selector('items')).map { |xml_item|
48
+ page.css(config.selector('items')).map do |xml_item|
45
49
  new xml_item, config
46
- }
50
+ end
51
+ end
52
+
53
+ private
54
+
55
+ def post_process(value, options)
56
+ Html2rss::AttributePostProcessors.get_processor(options)
57
+ .new(value, options, self)
58
+ .get
47
59
  end
48
60
  end
49
61
  end
@@ -1,25 +1,25 @@
1
- require 'sanitize'
2
-
3
1
  module Html2rss
4
2
  module ItemExtractor
5
- TEXT = proc { |xml, options| xml.css(options['selector'])&.text }
6
- ATTRIBUTE = proc { |xml, options| xml.css(options['selector']).attr(options['attribute']) }
3
+ TEXT = proc { |xml, options| xml.css(options['selector'])&.text&.strip }
4
+ ATTRIBUTE = proc { |xml, options| xml.css(options['selector']).attr(options['attribute']).to_s }
7
5
 
8
6
  HREF = proc { |xml, options|
9
- uri = URI(options['channel']['url'])
10
- uri.path = xml.css(options['selector']).attr('href')
11
- uri
12
- }
7
+ href = xml.css(options['selector']).attr('href').to_s
8
+ path, query = href.split('?')
13
9
 
14
- HTML = proc { |xml, options|
15
- html = xml.css(options['selector']).to_s
10
+ if href.start_with?('http')
11
+ uri = URI(href)
12
+ else
13
+ uri = URI(options['channel']['url'])
14
+ uri.path = path.start_with?('/') ? path : "/#{path}"
15
+ uri.query = query
16
+ end
16
17
 
17
- Sanitize.fragment(html, Sanitize::Config.merge(
18
- Sanitize::Config::RELAXED,
19
- add_attributes: {
20
- 'a' => { 'rel' => 'nofollow noopener noreferrer' }
21
- }
22
- ))
18
+ uri
23
19
  }
20
+
21
+ HTML = proc { |xml, options| xml.css(options['selector']).to_s }
22
+ STATIC = proc { |_xml, options| options['static'] }
23
+ CURRENT_TIME = proc { |_xml, _options| Time.new }
24
24
  end
25
25
  end
@@ -1,3 +1,3 @@
1
1
  module Html2rss
2
- VERSION = '0.0.1'.freeze
2
+ VERSION = '0.1.0'.freeze
3
3
  end
@@ -0,0 +1,15 @@
1
+ <% if(logo) { %><img width="300px" src="<%= logo %>" /><%= '\n\n' %><% } %># <%= title %>
2
+ <% if(intro) { %><%= '\n' %>_<%= intro %>_<%= '\n' %><% } %>
3
+ <% if(version && (version.name || version.number)) { %>##<% if(version.name){%> <%= version.name %><% } %> <%= version.number %> <% if(version.date){ %>( <%= version.date %> )<% } %><%= '\n' %><% } %>
4
+ <% _.forEach(sections, function(section){
5
+ if(section.commitsCount > 0) { %>
6
+ ## <%= section.title %>
7
+ <% _.forEach(section.commits, function(commit){ %> - <%= printCommit(commit, true) %><% }) %>
8
+ <% _.forEach(section.components, function(component){ %> - **<%= component.name %>**
9
+ <% _.forEach(component.commits, function(commit){ %> - <%= printCommit(commit, true) %><% }) %>
10
+ <% }) %>
11
+ <% } %>
12
+ <% }) %>
13
+
14
+ ---
15
+ <sub><sup>*Generated with [git-changelog](https://github.com/rafinskipg/git-changelog). If you have any problems or suggestions, create an issue.* :) **Thanks** </sup></sub>
Binary file
metadata CHANGED
@@ -1,57 +1,57 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-06-03 00:00:00.000000000 Z
11
+ date: 2018-11-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: nokogiri
14
+ name: faraday
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.8'
19
+ version: '0.15'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.8'
26
+ version: '0.15'
27
27
  - !ruby/object:Gem::Dependency
28
- name: sanitize
28
+ name: nokogiri
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '4.6'
33
+ version: '1.8'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '4.6'
40
+ version: '1.8'
41
41
  - !ruby/object:Gem::Dependency
42
- name: faraday
42
+ name: sanitize
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '0.15'
47
+ version: '5.0'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '0.15'
54
+ version: '5.0'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: bundler
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '1.16'
69
+ - !ruby/object:Gem::Dependency
70
+ name: byebug
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '10.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '10.0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: rspec
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -81,33 +95,33 @@ dependencies:
81
95
  - !ruby/object:Gem::Version
82
96
  version: '3.0'
83
97
  - !ruby/object:Gem::Dependency
84
- name: vcr
98
+ name: simplecov
85
99
  requirement: !ruby/object:Gem::Requirement
86
100
  requirements:
87
- - - "~>"
101
+ - - ">="
88
102
  - !ruby/object:Gem::Version
89
- version: '4.0'
103
+ version: '0'
90
104
  type: :development
91
105
  prerelease: false
92
106
  version_requirements: !ruby/object:Gem::Requirement
93
107
  requirements:
94
- - - "~>"
108
+ - - ">="
95
109
  - !ruby/object:Gem::Version
96
- version: '4.0'
110
+ version: '0'
97
111
  - !ruby/object:Gem::Dependency
98
- name: byebug
112
+ name: vcr
99
113
  requirement: !ruby/object:Gem::Requirement
100
114
  requirements:
101
115
  - - "~>"
102
116
  - !ruby/object:Gem::Version
103
- version: '10.0'
117
+ version: '4.0'
104
118
  type: :development
105
119
  prerelease: false
106
120
  version_requirements: !ruby/object:Gem::Requirement
107
121
  requirements:
108
122
  - - "~>"
109
123
  - !ruby/object:Gem::Version
110
- version: '10.0'
124
+ version: '4.0'
111
125
  description: |-
112
126
  Create your config object, include the url to scrape,
113
127
  some selectors and get a RSS2 feed in return.
@@ -117,9 +131,12 @@ executables: []
117
131
  extensions: []
118
132
  extra_rdoc_files: []
119
133
  files:
134
+ - ".changelogrc"
120
135
  - ".gitignore"
121
136
  - ".rspec"
137
+ - ".rubocop.yml"
122
138
  - ".travis.yml"
139
+ - CHANGELOG.md
123
140
  - Gemfile
124
141
  - Gemfile.lock
125
142
  - LICENSE
@@ -128,11 +145,19 @@ files:
128
145
  - bin/setup
129
146
  - html2rss.gemspec
130
147
  - lib/html2rss.rb
148
+ - lib/html2rss/attribute_post_processors.rb
149
+ - lib/html2rss/attribute_post_processors/parse_time.rb
150
+ - lib/html2rss/attribute_post_processors/parse_uri.rb
151
+ - lib/html2rss/attribute_post_processors/sanitize_html.rb
152
+ - lib/html2rss/attribute_post_processors/substring.rb
153
+ - lib/html2rss/attribute_post_processors/template.rb
131
154
  - lib/html2rss/config.rb
132
155
  - lib/html2rss/feed_builder.rb
133
156
  - lib/html2rss/item.rb
134
157
  - lib/html2rss/item_extractor.rb
135
158
  - lib/html2rss/version.rb
159
+ - support/changelog.md
160
+ - support/logo.png
136
161
  homepage: https://github.com/gildesmarais/html2rss
137
162
  licenses:
138
163
  - MIT
@@ -154,7 +179,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
154
179
  version: '0'
155
180
  requirements: []
156
181
  rubyforge_project:
157
- rubygems_version: 2.7.6
182
+ rubygems_version: 2.7.8
158
183
  signing_key:
159
184
  specification_version: 4
160
185
  summary: Generate RSS feeds by scraping websites by providing a config.