html2rss 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 48c6ef4f6636bac787600fa7b2efa3d313361016279d9895b50895c00b74eb8b
4
+ data.tar.gz: 58ea03df84c7c8f3c0a73e5186b49f766f06e476a1606b4837d3749444df1551
5
+ SHA512:
6
+ metadata.gz: 5dd2ba0fb71dcb16bf08632feedf1d5edfe666225bcfeca409be4be153de44a9a04900e0cfd3d188b18b7e9e48ae37051a48448ed06649deb281440a1cb3fbad
7
+ data.tar.gz: '0908d37c6a8b3b2b0dd620c69bdfea808533b862f5bf6892f4a96bb2b9ce9c36f211bb83072ca92bcd310a42c78a7270b593acc10caa7833f67060f487636ad2'
@@ -0,0 +1,11 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,4 @@
1
+ --format documentation
2
+ --color
3
+ --order random
4
+ --require spec_helper
@@ -0,0 +1,15 @@
1
+ sudo: false
2
+ language: ruby
3
+
4
+ before_install:
5
+ - gem update --system
6
+ - gem install bundler
7
+ bundler_args: --jobs=3 --retry=3
8
+
9
+ rvm:
10
+ - 2.3.7
11
+ - 2.4.4
12
+ - 2.5.1
13
+
14
+ script:
15
+ - bundle exec rspec
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+
3
+ git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in html2rss.gemspec
6
+ gemspec
@@ -0,0 +1,53 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ html2rss (0.0.1)
5
+ faraday (~> 0.15)
6
+ nokogiri (~> 1.8)
7
+ sanitize (~> 4.6)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ byebug (10.0.2)
13
+ crass (1.0.4)
14
+ diff-lcs (1.3)
15
+ faraday (0.15.2)
16
+ multipart-post (>= 1.2, < 3)
17
+ mini_portile2 (2.3.0)
18
+ multipart-post (2.0.0)
19
+ nokogiri (1.8.2)
20
+ mini_portile2 (~> 2.3.0)
21
+ nokogumbo (1.5.0)
22
+ nokogiri
23
+ rspec (3.7.0)
24
+ rspec-core (~> 3.7.0)
25
+ rspec-expectations (~> 3.7.0)
26
+ rspec-mocks (~> 3.7.0)
27
+ rspec-core (3.7.1)
28
+ rspec-support (~> 3.7.0)
29
+ rspec-expectations (3.7.0)
30
+ diff-lcs (>= 1.2.0, < 2.0)
31
+ rspec-support (~> 3.7.0)
32
+ rspec-mocks (3.7.0)
33
+ diff-lcs (>= 1.2.0, < 2.0)
34
+ rspec-support (~> 3.7.0)
35
+ rspec-support (3.7.1)
36
+ sanitize (4.6.5)
37
+ crass (~> 1.0.2)
38
+ nokogiri (>= 1.4.4)
39
+ nokogumbo (~> 1.4)
40
+ vcr (4.0.0)
41
+
42
+ PLATFORMS
43
+ ruby
44
+
45
+ DEPENDENCIES
46
+ bundler (~> 1.16)
47
+ byebug (~> 10.0)
48
+ html2rss!
49
+ rspec (~> 3.0)
50
+ vcr (~> 4.0)
51
+
52
+ BUNDLED WITH
53
+ 1.16.2
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2018 Gil Desmarais
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,45 @@
1
+ # Html2rss
2
+
3
+ Request and convert an HTML document to an RSS feed via a config object.
4
+ The config contains the URL to scrape and the selectors needed to extract
5
+ the required information. This gem provides some extractors (e.g. extract
6
+ the information from an HTML attribute).
7
+
8
+ Please always check the website's Terms of Service first to see whether you are allowed to
9
+ scrape their content!
10
+
11
+ ## Installation
12
+
13
+ Add this line to your application's Gemfile:
14
+
15
+ ```ruby
16
+ gem 'html2rss'
17
+ ```
18
+
19
+ And then execute:
20
+
21
+ $ bundle
22
+
23
+ Or install it yourself as:
24
+
25
+ $ gem install html2rss
26
+
27
+ ## Usage example with a YAML file
28
+
29
+ Create a YAML config file. Find an example at `spec/config.test.yml`.
30
+
31
+ `Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')` returns
32
+
33
+ an `RSS::Rss` object.
34
+
35
+ ## Development
36
+
37
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
38
+
39
+ ## Contributing
40
+
41
+ Bug reports and pull requests are welcome on GitHub at https://github.com/gildesmarais/html2rss.
42
+
43
+ ## License
44
+
45
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bundler/setup'
4
+ require 'html2rss'
5
+ require 'byebug'
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ # require "pry"
12
+ # Pry.start
13
+
14
+ require 'irb'
15
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,38 @@
1
# Put lib/ on the load path so the version constant can be required below.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'html2rss/version'

Gem::Specification.new do |spec|
  spec.name    = 'html2rss'
  spec.version = Html2rss::VERSION
  spec.authors = ['Gil Desmarais']
  spec.email   = ['html2rss@desmarais.de']

  spec.summary     = 'Generate RSS feeds by scraping websites by providing a config.'
  spec.description = "Create your config object, include the url to scrape,\n" \
                     'some selectors and get a RSS2 feed in return.'
  spec.homepage    = 'https://github.com/gildesmarais/html2rss'
  spec.license     = 'MIT'

  # Refuse to build when the push host cannot be restricted.
  unless spec.respond_to?(:metadata)
    raise 'RubyGems 2.0 or newer is required to protect against public gem pushes.'
  end

  spec.metadata['allowed_push_host'] = 'https://rubygems.org'

  # Package every git-tracked file except the test directories.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files = tracked_files.reject { |path| path =~ %r{^(test|spec|features)/} }

  spec.bindir        = 'exe'
  spec.executables   = spec.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  spec.require_paths = ['lib']

  # Runtime dependencies, declared in the original order.
  [
    ['nokogiri', '~> 1.8'],
    ['sanitize', '~> 4.6'],
    ['faraday', '~> 0.15']
  ].each { |gem_name, requirement| spec.add_dependency gem_name, requirement }

  # Development-only dependencies, declared in the original order.
  [
    ['bundler', '~> 1.16'],
    ['rspec', '~> 3.0'],
    ['vcr', '~> 4.0'],
    ['byebug', '~> 10.0']
  ].each { |gem_name, requirement| spec.add_development_dependency gem_name, requirement }
end
@@ -0,0 +1,16 @@
1
+ require 'html2rss/config'
2
+ require 'html2rss/feed_builder'
3
+ require 'html2rss/version'
4
+ require 'yaml'
5
+
6
# Entry points for turning a feed configuration into an RSS feed.
module Html2rss
  # Builds the feed described under +name+ in a YAML config file.
  #
  # NOTE(review): the file is parsed with the regular YAML loader, so it must
  # come from a trusted source; consider YAML.safe_load for untrusted input.
  #
  # @param file [String] path of the YAML config file
  # @param name [#to_s] key of the feed below the top-level 'feeds' entry
  # @return the object returned by FeedBuilder#rss
  def self.feed_from_yaml_config(file, name)
    # load_file opens the file in block form, so the handle is closed again
    # (the previous File.open(file) was never closed).
    config = Config.new(YAML.load_file(file).freeze, name)
    feed(config)
  end

  # Builds the feed for an already constructed config object.
  #
  # @param config [Config] feed configuration
  # @return the object returned by FeedBuilder#rss
  def self.feed(config)
    FeedBuilder.new(config).rss
  end
end
@@ -0,0 +1,54 @@
1
module Html2rss
  # Read-only view on the configuration of a single named feed.
  #
  # The raw config is a Hash with string keys: a top-level 'feeds' entry maps
  # feed names to feed configs, each holding a 'channel' Hash and (optionally)
  # a 'selectors' Hash. An optional top-level 'headers' entry is exposed too.
  class Config
    attr_reader :feed_config, :channel_config

    # @param config [Hash] the complete configuration
    # @param name [#to_s] key of the feed below 'feeds'
    def initialize(config, name)
      @config = config
      @feed_config = @config['feeds'][name.to_s]
      @channel_config = @feed_config['channel']
    end

    # @return [String] channel author, 'html2rss' when not configured
    def author
      channel_config.fetch('author', 'html2rss')
    end

    # @return [Integer, nil] channel ttl, nil when not configured
    def ttl
      # A missing 'ttl' used to raise KeyError, and `to_i || nil` could never
      # yield nil; the safe navigation makes the value truly optional.
      channel_config['ttl']&.to_i
    end

    # @return [String] channel title, a generic default when not configured
    def title
      channel_config.fetch('title', 'html2rss generated title')
    end

    # @return [String] channel language, 'en' when not configured
    def language
      channel_config.fetch('language', 'en')
    end

    # @return [String] channel description, a generic default when not configured
    def description
      channel_config.fetch('description', 'A description of my html2rss feed.')
    end

    # @return [String, nil] the channel's 'url' entry
    def url
      channel_config['url']
    end
    alias link url

    # @return [Hash] the top-level 'headers' entry, empty when not configured
    def headers
      @config.fetch('headers', {})
    end

    # Options of one selector, enriched with the channel config.
    #
    # @param name [String] selector name
    # @return [Hash, nil] nil when no such selector is configured
    def options(name)
      selector_options = feed_config.dig('selectors', name)
      return unless selector_options

      selector_options.merge('channel' => channel_config)
    end

    # @param name [String] selector name
    # @return [String, nil] the 'selector' entry of the named selector
    def selector(name)
      feed_config.dig('selectors', name, 'selector')
    end

    # @return [Array<Symbol>] names of all configured selectors except :items
    def attribute_names
      feed_config.fetch('selectors', {}).keys.map(&:to_sym) - [:items]
    end
  end
end
@@ -0,0 +1,47 @@
1
+ require 'rss'
2
+ require_relative 'item'
3
+
4
# Digest::SHA1 is used for the item guid below and was not required anywhere.
require 'digest'

module Html2rss
  # Assembles an RSS 2.0 feed from a feed config and the items scraped from
  # the configured URL.
  class FeedBuilder
    # Channel attributes copied one-to-one from the config onto the channel.
    CHANNEL_ATTRIBUTES = %i[language author title description link ttl].freeze

    attr_reader :config

    # @param feed_config [Config] configuration of the feed to build
    def initialize(feed_config)
      @config = feed_config
    end

    # Fetches the items and builds the feed.
    #
    # @return the feed object produced by RSS::Maker.make('2.0')
    def rss
      RSS::Maker.make('2.0') do |maker|
        add_channel_to_maker(maker)

        feed_items.each { |feed_item| add_item_to_items(feed_item, maker.items) }
      end
    end

    private

    # Copies the channel attributes from the config and stamps generator and
    # build date.
    def add_channel_to_maker(maker)
      CHANNEL_ATTRIBUTES.each do |attribute_name|
        maker.channel.public_send("#{attribute_name}=", config.public_send(attribute_name))
      end

      maker.channel.generator = "html2rss V. #{::Html2rss::VERSION}"
      maker.channel.lastBuildDate = Time.now.to_s
    end

    # @return [Array<Item>] items extracted from the configured URL
    def feed_items
      Item.from_url config.url, config
    end

    # Appends one feed item to the maker's items, copying every configured
    # attribute and deriving the guid from the item's title.
    def add_item_to_items(feed_item, items)
      attribute_names = config.attribute_names

      items.new_item do |rss_item|
        attribute_names.each do |attribute_name|
          rss_item.public_send("#{attribute_name}=", feed_item.public_send(attribute_name))
        end

        # The guid used to be recomputed after every single attribute; once
        # per item is enough. As before, no guid is set when no attributes
        # are configured.
        next if attribute_names.empty?

        rss_item.guid.content = Digest::SHA1.hexdigest(feed_item.title)
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ require 'faraday'
2
+ require 'open-uri'
3
+ require 'nokogiri'
4
+ require_relative 'item_extractor'
5
+
6
module Html2rss
  # One feed item, backed by the markup node it was found in.
  #
  # Every attribute name reported by the config (Config#attribute_names) is
  # answered dynamically: the value is pulled out of the node by the
  # extractor named in the attribute's options (default: 'text').
  class Item
    attr_reader :xml, :config

    # @param xml the node the item's values are extracted from
    # @param config [Config] feed configuration
    def initialize(xml, config)
      @xml = xml
      @config = config
    end

    def respond_to_missing?(method_name, _include_private = false)
      config.attribute_names.include?(method_name) || super
    end

    # Answers the configured attribute names; everything else is handed to
    # the default implementation and ends in NoMethodError.
    def method_missing(method_name, *_args)
      # Decide first whether this is an attribute at all, instead of asking
      # the config for the options of arbitrary method names.
      return super unless config.attribute_names.include?(method_name)

      attribute_config = config.options(method_name.to_s)
      return super unless attribute_config

      extractor_name = attribute_config['extractor'] || 'text'
      # inherit = false keeps the lookup inside ItemExtractor, so a config
      # value cannot resolve to an unrelated top-level constant.
      extractor = ItemExtractor.const_get(extractor_name.upcase.to_sym, false)

      post_process(method_name, extractor.call(xml, attribute_config))
    end

    # Converts an extracted value into the form handed to the feed.
    #
    # @param method_name [Symbol] attribute name
    # @param value extracted raw value
    # @return a URI for :link, a time string for :updated, else the value itself
    def post_process(method_name, value)
      case method_name
      when :link
        URI(value)
      when :updated
        Time.parse(value).to_s
      else
        value
      end
    end

    # Requests +url+ and wraps every node matching the 'items' selector.
    #
    # @param url [String] page to request
    # @param config [Config] feed configuration (headers and selectors)
    # @return [Array<Item>]
    def self.from_url(url, config)
      connection = Faraday.new(url: url, headers: config.headers)
      page = Nokogiri::HTML(connection.get.body)
      page.css(config.selector('items')).map { |xml_item| new(xml_item, config) }
    end
  end
end
@@ -0,0 +1,25 @@
1
+ require 'sanitize'
2
+
3
module Html2rss
  # Extraction strategies. Each constant is a proc called with the item's
  # node and the selector options (which carry the channel config under
  # 'channel') and returns the extracted value.
  module ItemExtractor
    # Text content of the nodes matching options['selector'].
    TEXT = proc { |xml, options| xml.css(options['selector'])&.text }

    # Attribute options['attribute'] of the nodes matching options['selector'].
    ATTRIBUTE = proc { |xml, options| xml.css(options['selector']).attr(options['attribute']) }

    # The matched nodes' href, resolved against the channel URL.
    #
    # Resolving with URI.join (instead of assigning the href to the channel
    # URI's path) also handles hrefs that carry a query string, are relative
    # to the page, or are already absolute, and it no longer leaks the channel
    # URL's own query into the item link.
    HREF = proc { |xml, options|
      href = xml.css(options['selector']).attr('href').to_s
      URI.join(options['channel']['url'], href)
    }

    # Sanitized markup of the nodes matching options['selector']; every link
    # gets rel="nofollow noopener noreferrer".
    HTML = proc { |xml, options|
      html = xml.css(options['selector']).to_s

      sanitize_config = Sanitize::Config.merge(
        Sanitize::Config::RELAXED,
        add_attributes: {
          'a' => { 'rel' => 'nofollow noopener noreferrer' }
        }
      )

      Sanitize.fragment(html, sanitize_config)
    }
  end
end
@@ -0,0 +1,3 @@
1
# Namespace of the html2rss gem.
module Html2rss
  # Version string of the gem; the gemspec reads it as the gem version.
  VERSION = String.new('0.0.1').freeze
end
metadata ADDED
@@ -0,0 +1,161 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html2rss
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Gil Desmarais
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-06-03 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.8'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.8'
27
+ - !ruby/object:Gem::Dependency
28
+ name: sanitize
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '4.6'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '4.6'
41
+ - !ruby/object:Gem::Dependency
42
+ name: faraday
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.15'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.15'
55
+ - !ruby/object:Gem::Dependency
56
+ name: bundler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.16'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.16'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: vcr
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '4.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '4.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: byebug
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '10.0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '10.0'
111
+ description: |-
112
+ Create your config object, include the url to scrape,
113
+ some selectors and get a RSS2 feed in return.
114
+ email:
115
+ - html2rss@desmarais.de
116
+ executables: []
117
+ extensions: []
118
+ extra_rdoc_files: []
119
+ files:
120
+ - ".gitignore"
121
+ - ".rspec"
122
+ - ".travis.yml"
123
+ - Gemfile
124
+ - Gemfile.lock
125
+ - LICENSE
126
+ - README.md
127
+ - bin/console
128
+ - bin/setup
129
+ - html2rss.gemspec
130
+ - lib/html2rss.rb
131
+ - lib/html2rss/config.rb
132
+ - lib/html2rss/feed_builder.rb
133
+ - lib/html2rss/item.rb
134
+ - lib/html2rss/item_extractor.rb
135
+ - lib/html2rss/version.rb
136
+ homepage: https://github.com/gildesmarais/html2rss
137
+ licenses:
138
+ - MIT
139
+ metadata:
140
+ allowed_push_host: https://rubygems.org
141
+ post_install_message:
142
+ rdoc_options: []
143
+ require_paths:
144
+ - lib
145
+ required_ruby_version: !ruby/object:Gem::Requirement
146
+ requirements:
147
+ - - ">="
148
+ - !ruby/object:Gem::Version
149
+ version: '0'
150
+ required_rubygems_version: !ruby/object:Gem::Requirement
151
+ requirements:
152
+ - - ">="
153
+ - !ruby/object:Gem::Version
154
+ version: '0'
155
+ requirements: []
156
+ rubyforge_project:
157
+ rubygems_version: 2.7.6
158
+ signing_key:
159
+ specification_version: 4
160
+ summary: Generate RSS feeds by scraping websites by providing a config.
161
+ test_files: []