html2rss 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 48c6ef4f6636bac787600fa7b2efa3d313361016279d9895b50895c00b74eb8b
4
- data.tar.gz: 58ea03df84c7c8f3c0a73e5186b49f766f06e476a1606b4837d3749444df1551
3
+ metadata.gz: 1cc71ee2722757d0f4cceaf8aea1b9007496acf35fdf4da37a5dea998233258d
4
+ data.tar.gz: '0009a02fe85a6e2b088aec00944e1dc46233ce6878b8558782286be0e020511b'
5
5
  SHA512:
6
- metadata.gz: 5dd2ba0fb71dcb16bf08632feedf1d5edfe666225bcfeca409be4be153de44a9a04900e0cfd3d188b18b7e9e48ae37051a48448ed06649deb281440a1cb3fbad
7
- data.tar.gz: '0908d37c6a8b3b2b0dd620c69bdfea808533b862f5bf6892f4a96bb2b9ce9c36f211bb83072ca92bcd310a42c78a7270b593acc10caa7833f67060f487636ad2'
6
+ metadata.gz: 2de4befd1f3beec2ee094f2c792ec94a1bde37c3da1f30eb737403ae04eddb36b54a43b86163f8b348229c42a5d38aae0bc7657c677bdbe95a54bee26b1fa6d9
7
+ data.tar.gz: f344bb3046c0493d9979e9daf53d5f3725798c96093dbbff6164a1339fd80bf54ad60868eb55dbc5a67239584f825cb0f5f6ca18c4e3e7f8f67b6698d26f3adf
@@ -0,0 +1,19 @@
1
+ {
2
+ "app_name": "html2rss",
3
+ "logo": "https://github.com/gildesmarais/html2rss/raw/master/support/logo.png",
4
+ "intro": "Generate RSS feeds by scraping websites by providing a config.",
5
+ "debug": "true",
6
+ "template": "support/changelog.md",
7
+ "sections": [
8
+ { "title": "Bugfixes", "grep": "^fix" },
9
+ { "title": "Features", "grep": "^feat" },
10
+ { "title": "Documentation", "grep": "^docs" },
11
+ { "title": "Breaking changes", "grep": "BREAKING" },
12
+ { "title": "Refactorings", "grep": "^refactor" },
13
+ { "title": "Code style", "grep": "^style" },
14
+ { "title": "Test", "grep": "^spec" },
15
+ { "title": "Chore", "grep": "^chore" },
16
+ { "title": "Branches merged", "grep": "^Merge branch" },
17
+ { "title": "Pull requests merged", "grep": "^Merge pull request" }
18
+ ]
19
+ }
data/.gitignore CHANGED
@@ -9,3 +9,4 @@
9
9
 
10
10
  # rspec failure tracking
11
11
  .rspec_status
12
+ coverage
@@ -0,0 +1,46 @@
1
+ AllCops:
2
+ TargetRubyVersion: 2.4
3
+ DisplayCopNames: true
4
+
5
+ Metrics/LineLength:
6
+ Max: 110
7
+
8
+ Metrics/BlockLength:
9
+ Exclude:
10
+ - "**/*_spec.rb"
11
+ - html2rss.gemspec
12
+
13
+ Metrics/ModuleLength:
14
+ Exclude:
15
+ - "**/*_spec.rb"
16
+
17
+ Documentation:
18
+ Enabled: false
19
+
20
+ Style/BlockDelimiters:
21
+ Enabled: false
22
+
23
+ Style/FrozenStringLiteralComment:
24
+ Enabled: false
25
+
26
+ Style/ParallelAssignment:
27
+ Enabled: false
28
+
29
+ Style/AsciiComments:
30
+ Enabled: false
31
+
32
+ Style/BracesAroundHashParameters:
33
+ Description: 'Enforce braces style around hash parameters.'
34
+ Enabled: true
35
+
36
+ Style/HashSyntax:
37
+ Description: >-
38
+ Prefer Ruby 1.9 hash syntax { a: 1, b: 2 } over 1.8 syntax
39
+ { :a => 1, :b => 2 }.
40
+ StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#hash-literals'
41
+ Enabled: true
42
+
43
+ Layout/SpaceInsideParens:
44
+ Description: 'No spaces after ( or before ).'
45
+ StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#no-spaces-braces'
46
+ Enabled: true
@@ -1,15 +1,24 @@
1
1
  sudo: false
2
2
  language: ruby
3
+ cache: bundler
3
4
 
4
5
  before_install:
5
6
  - gem update --system
6
7
  - gem install bundler
7
- bundler_args: --jobs=3 --retry=3
8
+
9
+ bundler_args: "--jobs=3 --retry=3"
8
10
 
9
11
  rvm:
10
- - 2.3.7
11
- - 2.4.4
12
- - 2.5.1
12
+ - 2.3.8
13
+ - 2.4.5
14
+ - 2.5.3
13
15
 
14
16
  script:
15
17
  - bundle exec rspec
18
+
19
+ deploy:
20
+ provider: rubygems
21
+ api_key:
22
+ secure: bM3Yl8iWdB1Amra3Bm6bIH/mTwHcRhZrX8etFFbJANxIbkhzUOyTKcDMYiWUVM/mBzzv0NOuRejrDR6R0v7E2udrKcLQFCBtv7HqPAXIlkEEyxZy+M1kTqcPzP872E+ZKTn93vCzbiXBLYoMmqgCzqvcO87IBYNzTURHkfFjaYJJdVyZ5EVtbpXf4FhBvuQf9LTk/ocClgwYeuqd+45lO7qHoPatsvbY0vCOfKaiwkdOkBt+hjc56awcYSc9CXn0DCatebPQmQmdrqFd8fKgyCatWS3n+8TPmvzVfNJe44wg3oNfHbWruP85I2LE9ei1iG+iGQIF60fMhGgMJ4EM3REXDE5Mg+GA5uJcgH9Poirut3Ih65jtAzYNGohlmEmc7ysKc0dmG1O3ndwrHjh5KePrOAGDaW6QKG+m5ebIZ+mgrEA+ZVU1mjDM8FlbSKAayoPloslZdllSv7miwGzh6xrHWGQSCURZAkygFh+Kd+Kg1eVlEs+n6aObod82mEOfBPvWPacOrE2fY4B0ocFOKotZBCZSD0ZIixlyslRTnmcJfpRNlYLsQ56oy5uPNUccPQ86NSmmE+qbRdPCLQCKLPm2iYBgOa5iQrfHR/fUgcO0skAZiW4o9QflDgIFS/G+BE6FMHIvjkKA6Ae4KbqGzlF5pGFdo6p4MhlvubwjsVI=
23
+ on:
24
+ tags: true
@@ -0,0 +1,115 @@
1
+ <img width="300px" src="https://github.com/gildesmarais/html2rss/raw/master/support/logo.png" />
2
+
3
+ # html2rss
4
+
5
+ _Generate RSS feeds by scraping websites by providing a config._
6
+
7
+
8
+
9
+ ## Bugfixes
10
+ - handling of url query breaks processing
11
+ ([ace289e9](git@github.com:gildesmarais/html2rss/commit/ace289e911b69cb92433cac6f1ca0403715d8286))
12
+ - only set supported attributes on rss item
13
+ ([dae0d8e7](git@github.com:gildesmarais/html2rss/commit/dae0d8e75541e810275e789a23971a61e60a2154))
14
+
15
+ - **config**
16
+ - feed generation fails
17
+ ([7dd55869](git@github.com:gildesmarais/html2rss/commit/7dd55869f79b1de76c004bf0e82d13b16b5b3f0d))
18
+
19
+ - **parse_uri**
20
+ - handle non-absolute paths
21
+ ([92150257](git@github.com:gildesmarais/html2rss/commit/921502574e4436d65a30e1d34b9b31f238336247))
22
+
23
+
24
+
25
+
26
+ ## Features
27
+ - add logo [skip ci]
28
+ ([857a55fd](git@github.com:gildesmarais/html2rss/commit/857a55fd8c932930d96c47c5abe57f0507356df1))
29
+ - require updated to be present
30
+ ([e1bedaec](git@github.com:gildesmarais/html2rss/commit/e1bedaecc91e874fe24e96000612abb9cd11e9fe))
31
+ - do not fail on invalid item, just skip it
32
+ ([3b83d715](git@github.com:gildesmarais/html2rss/commit/3b83d715619abbc33b124de1945d17cb0dc7edb0))
33
+
34
+ - **item_extractor**
35
+ - text strips strings
36
+ ([f5982859](git@github.com:gildesmarais/html2rss/commit/f59828593dca663bdbe8699392594e2d18658f8f))
37
+ - add static and current_time
38
+ ([25043dcb](git@github.com:gildesmarais/html2rss/commit/25043dcbd8f0f4901202f4a2f66b355ac48825a8))
39
+ - handle absolute urls
40
+ ([f96be008](git@github.com:gildesmarais/html2rss/commit/f96be00857bdcded02d52dd62ec22b9b52c803ed))
41
+
42
+ - **post_processing**
43
+ - add configurable post_processing (#5)
44
+ ([4cf6caca](git@github.com:gildesmarais/html2rss/commit/4cf6cacac00bd3c0c53d584ca11274ba24b03ef7),
45
+ [#1](git@github.com:gildesmarais/html2rss/issues/1))
46
+
47
+ - **post_processor**
48
+ - add substring
49
+ ([6f2a32a6](git@github.com:gildesmarais/html2rss/commit/6f2a32a6304ef9956577711173de681daf93f55f))
50
+
51
+ - **postprocessors**
52
+ - add Template (#6)
53
+ ([f1db542e](git@github.com:gildesmarais/html2rss/commit/f1db542e8c1e9e09a066a3cd6c8514a6ca0aa871),
54
+ [#4](git@github.com:gildesmarais/html2rss/issues/4))
55
+
56
+ - **sanitize_html**
57
+ - add target="_blank" to anchors
58
+ ([975a73bf](git@github.com:gildesmarais/html2rss/commit/975a73bfd396ba5942bc0ea80eebd14cc37ad776))
59
+
60
+
61
+
62
+
63
+ ## Documentation
64
+ - create the changelog
65
+ ([5c561db5](git@github.com:gildesmarais/html2rss/commit/5c561db51d4e0b8592b1c82812ab5cdbe9320b70))
66
+ - add tips and tricks
67
+ ([ea978240](git@github.com:gildesmarais/html2rss/commit/ea9782408107f3637a4c9665396f511fc07be19b))
68
+ - update readme
69
+ ([4743167c](git@github.com:gildesmarais/html2rss/commit/4743167c86959e83524ffb7282c562413a651797))
70
+ - add note about html2rss-web
71
+ ([3371c12f](git@github.com:gildesmarais/html2rss/commit/3371c12ffc6c8d3c29073d03ff206886a39401cd))
72
+ - add a badge for travis-ci
73
+ ([8818d4f4](git@github.com:gildesmarais/html2rss/commit/8818d4f464a9c163ebc9665d01719e2bab132bd6))
74
+
75
+
76
+
77
+
78
+ ## Test
79
+ - don't be so lazy when matching strings
80
+ ([6a0eb627](git@github.com:gildesmarais/html2rss/commit/6a0eb62765523a1405fd269466b2fc57794eac7a))
81
+
82
+
83
+
84
+
85
+ ## Chore
86
+ - upgrade sanitize gem to version 5.0.0
87
+ ([8c4bf3a4](git@github.com:gildesmarais/html2rss/commit/8c4bf3a44885758e395568ec452a7cffdb9a0389))
88
+ - rubocop autocorrect
89
+ ([af6b9fac](git@github.com:gildesmarais/html2rss/commit/af6b9facca547d3ca3ce9ef0d1227707cd16eaea))
90
+ - build against latest ruby releases
91
+ ([17ba79ac](git@github.com:gildesmarais/html2rss/commit/17ba79acd2f68da1fcc984368d3e6de3437cbf1b))
92
+ - update dependencies
93
+ ([855279b4](git@github.com:gildesmarais/html2rss/commit/855279b46d584a8a8c2a317529f7a4be550eaf15))
94
+ - update dependencies
95
+ ([46e5a283](git@github.com:gildesmarais/html2rss/commit/46e5a2832d1f2fe1353dcc2a8d82a9786f15f6bd))
96
+ - add simplecov
97
+ ([b4e1144b](git@github.com:gildesmarais/html2rss/commit/b4e1144b7f8f90126e528cc4a4ec048113d93634))
98
+
99
+ - **changelog**
100
+ - add generation with git-changelog
101
+ ([07ad5a51](git@github.com:gildesmarais/html2rss/commit/07ad5a513f0951ee988426abda4b8c233411ead7))
102
+
103
+ - **travis**
104
+ - use cache for bundler
105
+ ([ac76b3b2](git@github.com:gildesmarais/html2rss/commit/ac76b3b2dd94adecd4927de18651800438a7e7ba))
106
+ - setup autorelease to rubygems
107
+ ([eb9c8e1b](git@github.com:gildesmarais/html2rss/commit/eb9c8e1b16902dc0e174a0cccb6eb9227307ce82))
108
+
109
+
110
+
111
+
112
+
113
+ ---
114
+ <sub><sup>*Generated with [git-changelog](https://github.com/rafinskipg/git-changelog). If you have any problems or suggestions, create an issue.* :) **Thanks** </sup></sub>
115
+
@@ -4,7 +4,7 @@ PATH
4
4
  html2rss (0.0.1)
5
5
  faraday (~> 0.15)
6
6
  nokogiri (~> 1.8)
7
- sanitize (~> 4.6)
7
+ sanitize (~> 5.0)
8
8
 
9
9
  GEM
10
10
  remote: https://rubygems.org/
@@ -12,31 +12,38 @@ GEM
12
12
  byebug (10.0.2)
13
13
  crass (1.0.4)
14
14
  diff-lcs (1.3)
15
- faraday (0.15.2)
15
+ docile (1.3.1)
16
+ faraday (0.15.3)
16
17
  multipart-post (>= 1.2, < 3)
18
+ json (2.1.0)
17
19
  mini_portile2 (2.3.0)
18
20
  multipart-post (2.0.0)
19
- nokogiri (1.8.2)
21
+ nokogiri (1.8.5)
20
22
  mini_portile2 (~> 2.3.0)
21
- nokogumbo (1.5.0)
22
- nokogiri
23
- rspec (3.7.0)
24
- rspec-core (~> 3.7.0)
25
- rspec-expectations (~> 3.7.0)
26
- rspec-mocks (~> 3.7.0)
27
- rspec-core (3.7.1)
28
- rspec-support (~> 3.7.0)
29
- rspec-expectations (3.7.0)
23
+ nokogumbo (2.0.0)
24
+ nokogiri (~> 1.8, >= 1.8.4)
25
+ rspec (3.8.0)
26
+ rspec-core (~> 3.8.0)
27
+ rspec-expectations (~> 3.8.0)
28
+ rspec-mocks (~> 3.8.0)
29
+ rspec-core (3.8.0)
30
+ rspec-support (~> 3.8.0)
31
+ rspec-expectations (3.8.2)
30
32
  diff-lcs (>= 1.2.0, < 2.0)
31
- rspec-support (~> 3.7.0)
32
- rspec-mocks (3.7.0)
33
+ rspec-support (~> 3.8.0)
34
+ rspec-mocks (3.8.0)
33
35
  diff-lcs (>= 1.2.0, < 2.0)
34
- rspec-support (~> 3.7.0)
35
- rspec-support (3.7.1)
36
- sanitize (4.6.5)
36
+ rspec-support (~> 3.8.0)
37
+ rspec-support (3.8.0)
38
+ sanitize (5.0.0)
37
39
  crass (~> 1.0.2)
38
- nokogiri (>= 1.4.4)
39
- nokogumbo (~> 1.4)
40
+ nokogiri (>= 1.8.0)
41
+ nokogumbo (~> 2.0)
42
+ simplecov (0.16.1)
43
+ docile (~> 1.1)
44
+ json (>= 1.8, < 3)
45
+ simplecov-html (~> 0.10.0)
46
+ simplecov-html (0.10.2)
40
47
  vcr (4.0.0)
41
48
 
42
49
  PLATFORMS
@@ -47,7 +54,8 @@ DEPENDENCIES
47
54
  byebug (~> 10.0)
48
55
  html2rss!
49
56
  rspec (~> 3.0)
57
+ simplecov
50
58
  vcr (~> 4.0)
51
59
 
52
60
  BUNDLED WITH
53
- 1.16.2
61
+ 1.16.6
data/README.md CHANGED
@@ -1,4 +1,6 @@
1
- # Html2rss
1
+ ![html2rss logo](https://github.com/gildesmarais/html2rss/raw/master/support/logo.png)
2
+
3
+ # html2rss [![Build Status](https://travis-ci.org/gildesmarais/html2rss.svg?branch=master)](https://travis-ci.org/gildesmarais/html2rss)
2
4
 
3
5
  Request and convert an HTML document to an RSS feed via a config object.
4
6
  The config contains the URL to scrape and the selectors needed to extract
@@ -24,14 +26,26 @@ Or install it yourself as:
24
26
 
25
27
  $ gem install html2rss
26
28
 
27
- ## Usage example with a YAML file
29
+ ## Usage
30
+
31
+ ## Usage with a YAML file
28
32
 
29
- Create a YAML config file. Find an example at `rspec/config.test.yml`.
33
+ Create a YAML config file. Find an example at [`spec/config.test.yml`](https://github.com/gildesmarais/html2rss/blob/master/spec/config.test.yml).
30
34
 
31
35
  `Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')` returns
32
36
 
33
37
  an `RSS::Rss` object.
34
38
 
39
+ ## Usage in a web application
40
+
41
+ Find a minimal Sinatra app which exposes your feeds to HTTP endpoints here:
42
+ [gildesmarais/html2rss-web](https://github.com/gildesmarais/html2rss-web)
43
+
44
+ ### Tips and tricks
45
+
46
+ - Check that the channel url does not redirect to a mobile page
47
+ - Fiddling with [`curl`](https://github.com/curl/curl) and [`pup`](https://github.com/ericchiang/pup) to find the selectors seems quite efficient.
48
+
35
49
  ## Development
36
50
 
37
51
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -40,6 +54,11 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
40
54
 
41
55
  Bug reports and pull requests are welcome on GitHub at https://github.com/gildesmarais/html2rss.
42
56
 
57
+ ## Changelog generation
58
+
59
+ The `CHANGELOG.md` can be generated automatically.
60
+ Install [git-changelog](https://www.npmjs.com/package/git-changelog) globally and run `git-changelog` afterwards.
61
+
43
62
  ## License
44
63
 
45
64
  The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -1,4 +1,4 @@
1
- lib = File.expand_path('../lib', __FILE__)
1
+ lib = File.expand_path('lib', __dir__)
2
2
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
3
  require 'html2rss/version'
4
4
 
@@ -28,11 +28,12 @@ Gem::Specification.new do |spec|
28
28
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
29
29
  spec.require_paths = ['lib']
30
30
 
31
- spec.add_dependency 'nokogiri', '~> 1.8'
32
- spec.add_dependency 'sanitize', '~> 4.6'
33
31
  spec.add_dependency 'faraday', '~> 0.15'
32
+ spec.add_dependency 'nokogiri', '~> 1.8'
33
+ spec.add_dependency 'sanitize', '~> 5.0'
34
34
  spec.add_development_dependency 'bundler', '~> 1.16'
35
+ spec.add_development_dependency 'byebug', '~> 10.0'
35
36
  spec.add_development_dependency 'rspec', '~> 3.0'
37
+ spec.add_development_dependency 'simplecov'
36
38
  spec.add_development_dependency 'vcr', '~> 4.0'
37
- spec.add_development_dependency 'byebug', '~> 10.0'
38
39
  end
@@ -0,0 +1,16 @@
1
+ require_relative 'attribute_post_processors/parse_time'
2
+ require_relative 'attribute_post_processors/parse_uri'
3
+ require_relative 'attribute_post_processors/sanitize_html'
4
+ require_relative 'attribute_post_processors/substring'
5
+ require_relative 'attribute_post_processors/template'
6
+
7
+ module Html2rss
8
+ module AttributePostProcessors
9
+ def self.get_processor(options)
10
+ camel_cased_option = options['name'].split('_').collect(&:capitalize).join
11
+ class_name = ['Html2rss', 'AttributePostProcessors', camel_cased_option].join('::')
12
+
13
+ Object.const_get(class_name)
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,13 @@
1
+ module Html2rss
2
+ module AttributePostProcessors
3
+ class ParseTime
4
+ def initialize(value, _options, _item)
5
+ @value = value
6
+ end
7
+
8
+ def get
9
+ Time.parse(@value).rfc822
10
+ end
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,13 @@
1
+ module Html2rss
2
+ module AttributePostProcessors
3
+ class ParseUri
4
+ def initialize(value, _options, _item)
5
+ @value = value
6
+ end
7
+
8
+ def get
9
+ URI(@value).to_s
10
+ end
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,23 @@
1
+ require 'sanitize'
2
+
3
+ module Html2rss
4
+ module AttributePostProcessors
5
+ class SanitizeHtml
6
+ def initialize(value, _options, _item)
7
+ @value = value
8
+ end
9
+
10
+ def get
11
+ Sanitize.fragment(@value, Sanitize::Config.merge(
12
+ Sanitize::Config::RELAXED,
13
+ add_attributes: {
14
+ 'a' => {
15
+ 'rel' => 'nofollow noopener noreferrer',
16
+ 'target' => '_blank'
17
+ }
18
+ }
19
+ ))
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,15 @@
1
+ module Html2rss
2
+ module AttributePostProcessors
3
+ class Substring
4
+ def initialize(value, options, _item)
5
+ @value = value
6
+ @options = options
7
+ end
8
+
9
+ def get
10
+ ending = @options['end'].to_i.positive? ? @options['end'].to_i : @value.length
11
+ @value[@options['start'].to_i..ending]
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,33 @@
1
+ require 'sanitize'
2
+
3
+ module Html2rss
4
+ module AttributePostProcessors
5
+ class Template
6
+ def initialize(value, options, item)
7
+ @value = value
8
+ @options = options
9
+ @item = item
10
+ end
11
+
12
+ def get
13
+ string % methods
14
+ end
15
+
16
+ private
17
+
18
+ def string
19
+ @options['string']
20
+ end
21
+
22
+ def methods
23
+ @methods ||= @options['methods'].map { |method|
24
+ if method == 'self'
25
+ @value
26
+ else
27
+ @item.send(method.to_sym)&.to_s
28
+ end
29
+ }
30
+ end
31
+ end
32
+ end
33
+ end
@@ -38,7 +38,7 @@ module Html2rss
38
38
  end
39
39
 
40
40
  def options(name)
41
- feed_config.dig('selectors', name).merge('channel' => channel_config)
41
+ feed_config.dig('selectors').fetch(name, {}).merge('channel' => channel_config)
42
42
  end
43
43
 
44
44
  def selector(name)
@@ -46,8 +46,8 @@ module Html2rss
46
46
  end
47
47
 
48
48
  def attribute_names
49
- attribute_names = feed_config.fetch('selectors', {}).keys.map(&:to_sym)
50
- attribute_names.delete(:items)
49
+ attribute_names = feed_config.fetch('selectors', {}).keys.map(&:to_s)
50
+ attribute_names.delete('items')
51
51
  attribute_names
52
52
  end
53
53
  end
@@ -22,7 +22,7 @@ module Html2rss
22
22
  private
23
23
 
24
24
  def add_channel_to_maker(maker)
25
- [:language, :author, :title, :description, :link, :ttl].each do |attribute_name|
25
+ %i[language author title description link ttl].each do |attribute_name|
26
26
  maker.channel.send("#{attribute_name}=".to_sym, config.send(attribute_name))
27
27
  end
28
28
 
@@ -31,12 +31,14 @@ module Html2rss
31
31
  end
32
32
 
33
33
  def feed_items
34
- Item.from_url config.url, config
34
+ @feed_items ||= Item.from_url config.url, config
35
35
  end
36
36
 
37
37
  def add_item_to_items(feed_item, items)
38
+ return unless feed_item.valid?
39
+
38
40
  items.new_item do |rss_item|
39
- config.attribute_names.each do |attribute_name|
41
+ feed_item.available_attributes.each do |attribute_name|
40
42
  rss_item.send("#{attribute_name}=".to_sym, feed_item.send(attribute_name))
41
43
 
42
44
  rss_item.guid.content = Digest::SHA1.hexdigest(feed_item.title)
@@ -2,6 +2,7 @@ require 'faraday'
2
2
  require 'open-uri'
3
3
  require 'nokogiri'
4
4
  require_relative 'item_extractor'
5
+ require_relative 'attribute_post_processors'
5
6
 
6
7
  module Html2rss
7
8
  class Item
@@ -24,26 +25,37 @@ module Html2rss
24
25
  proc = ItemExtractor.const_get extractor.upcase.to_sym
25
26
  value = proc.call(xml, attribute_config)
26
27
 
27
- post_process(method_name, value)
28
+ post_process_options = attribute_config.fetch('post_process', false)
29
+ value = post_process(value, post_process_options) if post_process_options
30
+
31
+ value
28
32
  end
29
33
 
30
- def post_process(method_name, value)
31
- case method_name
32
- when :link
33
- URI(value)
34
- when :updated
35
- Time.parse(value).to_s
36
- else
37
- value
38
- end
34
+ def available_attributes
35
+ # TODO: support optional attributes, e.g. category, enclosure, source
36
+ @available_attributes ||= (%w[title link description author comments updated] & @config.attribute_names)
37
+ end
38
+
39
+ def valid?
40
+ return false if [title.to_s, description.to_s].join('') == ''
41
+
42
+ true
39
43
  end
40
44
 
41
45
  def self.from_url(url, config)
42
46
  connection = Faraday.new(url: url, headers: config.headers)
43
47
  page = Nokogiri::HTML(connection.get.body)
44
- page.css(config.selector('items')).map { |xml_item|
48
+ page.css(config.selector('items')).map do |xml_item|
45
49
  new xml_item, config
46
- }
50
+ end
51
+ end
52
+
53
+ private
54
+
55
+ def post_process(value, options)
56
+ Html2rss::AttributePostProcessors.get_processor(options)
57
+ .new(value, options, self)
58
+ .get
47
59
  end
48
60
  end
49
61
  end
@@ -1,25 +1,25 @@
1
- require 'sanitize'
2
-
3
1
  module Html2rss
4
2
  module ItemExtractor
5
- TEXT = proc { |xml, options| xml.css(options['selector'])&.text }
6
- ATTRIBUTE = proc { |xml, options| xml.css(options['selector']).attr(options['attribute']) }
3
+ TEXT = proc { |xml, options| xml.css(options['selector'])&.text&.strip }
4
+ ATTRIBUTE = proc { |xml, options| xml.css(options['selector']).attr(options['attribute']).to_s }
7
5
 
8
6
  HREF = proc { |xml, options|
9
- uri = URI(options['channel']['url'])
10
- uri.path = xml.css(options['selector']).attr('href')
11
- uri
12
- }
7
+ href = xml.css(options['selector']).attr('href').to_s
8
+ path, query = href.split('?')
13
9
 
14
- HTML = proc { |xml, options|
15
- html = xml.css(options['selector']).to_s
10
+ if href.start_with?('http')
11
+ uri = URI(href)
12
+ else
13
+ uri = URI(options['channel']['url'])
14
+ uri.path = path.start_with?('/') ? path : "/#{path}"
15
+ uri.query = query
16
+ end
16
17
 
17
- Sanitize.fragment(html, Sanitize::Config.merge(
18
- Sanitize::Config::RELAXED,
19
- add_attributes: {
20
- 'a' => { 'rel' => 'nofollow noopener noreferrer' }
21
- }
22
- ))
18
+ uri
23
19
  }
20
+
21
+ HTML = proc { |xml, options| xml.css(options['selector']).to_s }
22
+ STATIC = proc { |_xml, options| options['static'] }
23
+ CURRENT_TIME = proc { |_xml, _options| Time.new }
24
24
  end
25
25
  end
@@ -1,3 +1,3 @@
1
1
  module Html2rss
2
- VERSION = '0.0.1'.freeze
2
+ VERSION = '0.1.0'.freeze
3
3
  end
@@ -0,0 +1,15 @@
1
+ <% if(logo) { %><img width="300px" src="<%= logo %>" /><%= '\n\n' %><% } %># <%= title %>
2
+ <% if(intro) { %><%= '\n' %>_<%= intro %>_<%= '\n' %><% } %>
3
+ <% if(version && (version.name || version.number)) { %>##<% if(version.name){%> <%= version.name %><% } %> <%= version.number %> <% if(version.date){ %>( <%= version.date %> )<% } %><%= '\n' %><% } %>
4
+ <% _.forEach(sections, function(section){
5
+ if(section.commitsCount > 0) { %>
6
+ ## <%= section.title %>
7
+ <% _.forEach(section.commits, function(commit){ %> - <%= printCommit(commit, true) %><% }) %>
8
+ <% _.forEach(section.components, function(component){ %> - **<%= component.name %>**
9
+ <% _.forEach(component.commits, function(commit){ %> - <%= printCommit(commit, true) %><% }) %>
10
+ <% }) %>
11
+ <% } %>
12
+ <% }) %>
13
+
14
+ ---
15
+ <sub><sup>*Generated with [git-changelog](https://github.com/rafinskipg/git-changelog). If you have any problems or suggestions, create an issue.* :) **Thanks** </sup></sub>
Binary file
metadata CHANGED
@@ -1,57 +1,57 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-06-03 00:00:00.000000000 Z
11
+ date: 2018-11-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: nokogiri
14
+ name: faraday
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.8'
19
+ version: '0.15'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.8'
26
+ version: '0.15'
27
27
  - !ruby/object:Gem::Dependency
28
- name: sanitize
28
+ name: nokogiri
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '4.6'
33
+ version: '1.8'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '4.6'
40
+ version: '1.8'
41
41
  - !ruby/object:Gem::Dependency
42
- name: faraday
42
+ name: sanitize
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '0.15'
47
+ version: '5.0'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '0.15'
54
+ version: '5.0'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: bundler
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '1.16'
69
+ - !ruby/object:Gem::Dependency
70
+ name: byebug
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '10.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '10.0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: rspec
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -81,33 +95,33 @@ dependencies:
81
95
  - !ruby/object:Gem::Version
82
96
  version: '3.0'
83
97
  - !ruby/object:Gem::Dependency
84
- name: vcr
98
+ name: simplecov
85
99
  requirement: !ruby/object:Gem::Requirement
86
100
  requirements:
87
- - - "~>"
101
+ - - ">="
88
102
  - !ruby/object:Gem::Version
89
- version: '4.0'
103
+ version: '0'
90
104
  type: :development
91
105
  prerelease: false
92
106
  version_requirements: !ruby/object:Gem::Requirement
93
107
  requirements:
94
- - - "~>"
108
+ - - ">="
95
109
  - !ruby/object:Gem::Version
96
- version: '4.0'
110
+ version: '0'
97
111
  - !ruby/object:Gem::Dependency
98
- name: byebug
112
+ name: vcr
99
113
  requirement: !ruby/object:Gem::Requirement
100
114
  requirements:
101
115
  - - "~>"
102
116
  - !ruby/object:Gem::Version
103
- version: '10.0'
117
+ version: '4.0'
104
118
  type: :development
105
119
  prerelease: false
106
120
  version_requirements: !ruby/object:Gem::Requirement
107
121
  requirements:
108
122
  - - "~>"
109
123
  - !ruby/object:Gem::Version
110
- version: '10.0'
124
+ version: '4.0'
111
125
  description: |-
112
126
  Create your config object, include the url to scrape,
113
127
  some selectors and get a RSS2 feed in return.
@@ -117,9 +131,12 @@ executables: []
117
131
  extensions: []
118
132
  extra_rdoc_files: []
119
133
  files:
134
+ - ".changelogrc"
120
135
  - ".gitignore"
121
136
  - ".rspec"
137
+ - ".rubocop.yml"
122
138
  - ".travis.yml"
139
+ - CHANGELOG.md
123
140
  - Gemfile
124
141
  - Gemfile.lock
125
142
  - LICENSE
@@ -128,11 +145,19 @@ files:
128
145
  - bin/setup
129
146
  - html2rss.gemspec
130
147
  - lib/html2rss.rb
148
+ - lib/html2rss/attribute_post_processors.rb
149
+ - lib/html2rss/attribute_post_processors/parse_time.rb
150
+ - lib/html2rss/attribute_post_processors/parse_uri.rb
151
+ - lib/html2rss/attribute_post_processors/sanitize_html.rb
152
+ - lib/html2rss/attribute_post_processors/substring.rb
153
+ - lib/html2rss/attribute_post_processors/template.rb
131
154
  - lib/html2rss/config.rb
132
155
  - lib/html2rss/feed_builder.rb
133
156
  - lib/html2rss/item.rb
134
157
  - lib/html2rss/item_extractor.rb
135
158
  - lib/html2rss/version.rb
159
+ - support/changelog.md
160
+ - support/logo.png
136
161
  homepage: https://github.com/gildesmarais/html2rss
137
162
  licenses:
138
163
  - MIT
@@ -154,7 +179,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
154
179
  version: '0'
155
180
  requirements: []
156
181
  rubyforge_project:
157
- rubygems_version: 2.7.6
182
+ rubygems_version: 2.7.8
158
183
  signing_key:
159
184
  specification_version: 4
160
185
  summary: Generate RSS feeds by scraping websites by providing a config.