html2rss 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 48c6ef4f6636bac787600fa7b2efa3d313361016279d9895b50895c00b74eb8b
4
+ data.tar.gz: 58ea03df84c7c8f3c0a73e5186b49f766f06e476a1606b4837d3749444df1551
5
+ SHA512:
6
+ metadata.gz: 5dd2ba0fb71dcb16bf08632feedf1d5edfe666225bcfeca409be4be153de44a9a04900e0cfd3d188b18b7e9e48ae37051a48448ed06649deb281440a1cb3fbad
7
+ data.tar.gz: '0908d37c6a8b3b2b0dd620c69bdfea808533b862f5bf6892f4a96bb2b9ce9c36f211bb83072ca92bcd310a42c78a7270b593acc10caa7833f67060f487636ad2'
@@ -0,0 +1,11 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,4 @@
1
+ --format documentation
2
+ --color
3
+ --order random
4
+ --require spec_helper
@@ -0,0 +1,15 @@
1
+ sudo: false
2
+ language: ruby
3
+
4
+ before_install:
5
+ - gem update --system
6
+ - gem install bundler
7
+ bundler_args: --jobs=3 --retry=3
8
+
9
+ rvm:
10
+ - 2.3.7
11
+ - 2.4.4
12
+ - 2.5.1
13
+
14
+ script:
15
+ - bundle exec rspec
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+
3
+ git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in html2rss.gemspec
6
+ gemspec
@@ -0,0 +1,53 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ html2rss (0.0.1)
5
+ faraday (~> 0.15)
6
+ nokogiri (~> 1.8)
7
+ sanitize (~> 4.6)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ byebug (10.0.2)
13
+ crass (1.0.4)
14
+ diff-lcs (1.3)
15
+ faraday (0.15.2)
16
+ multipart-post (>= 1.2, < 3)
17
+ mini_portile2 (2.3.0)
18
+ multipart-post (2.0.0)
19
+ nokogiri (1.8.2)
20
+ mini_portile2 (~> 2.3.0)
21
+ nokogumbo (1.5.0)
22
+ nokogiri
23
+ rspec (3.7.0)
24
+ rspec-core (~> 3.7.0)
25
+ rspec-expectations (~> 3.7.0)
26
+ rspec-mocks (~> 3.7.0)
27
+ rspec-core (3.7.1)
28
+ rspec-support (~> 3.7.0)
29
+ rspec-expectations (3.7.0)
30
+ diff-lcs (>= 1.2.0, < 2.0)
31
+ rspec-support (~> 3.7.0)
32
+ rspec-mocks (3.7.0)
33
+ diff-lcs (>= 1.2.0, < 2.0)
34
+ rspec-support (~> 3.7.0)
35
+ rspec-support (3.7.1)
36
+ sanitize (4.6.5)
37
+ crass (~> 1.0.2)
38
+ nokogiri (>= 1.4.4)
39
+ nokogumbo (~> 1.4)
40
+ vcr (4.0.0)
41
+
42
+ PLATFORMS
43
+ ruby
44
+
45
+ DEPENDENCIES
46
+ bundler (~> 1.16)
47
+ byebug (~> 10.0)
48
+ html2rss!
49
+ rspec (~> 3.0)
50
+ vcr (~> 4.0)
51
+
52
+ BUNDLED WITH
53
+ 1.16.2
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2018 Gil Desmarais
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,45 @@
1
+ # Html2rss
2
+
3
+ Request and convert an HTML document to an RSS feed via a config object.
4
+ The config contains the URL to scrape and the selectors needed to extract
5
+ the required information. This gem provides some extractors (e.g. extract
6
+ the information from an HTML attribute).
7
+
8
+ Please always check the website's Terms of Service first to confirm that you
9
+ are allowed to scrape its content!
10
+
11
+ ## Installation
12
+
13
+ Add this line to your application's Gemfile:
14
+
15
+ ```ruby
16
+ gem 'html2rss'
17
+ ```
18
+
19
+ And then execute:
20
+
21
+ $ bundle
22
+
23
+ Or install it yourself as:
24
+
25
+ $ gem install html2rss
26
+
27
+ ## Usage example with a YAML file
28
+
29
+ Create a YAML config file. Find an example at `spec/config.test.yml`.
30
+
31
+ `Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')` returns
32
+
33
+ an `RSS::Rss` object.
34
+
35
+ ## Development
36
+
37
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
38
+
39
+ ## Contributing
40
+
41
+ Bug reports and pull requests are welcome on GitHub at https://github.com/gildesmarais/html2rss.
42
+
43
+ ## License
44
+
45
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bundler/setup'
4
+ require 'html2rss'
5
+ require 'byebug'
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ # require "pry"
12
+ # Pry.start
13
+
14
+ require 'irb'
15
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,38 @@
1
+ lib = File.expand_path('../lib', __FILE__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+ require 'html2rss/version'
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = 'html2rss'
7
+ spec.version = Html2rss::VERSION
8
+ spec.authors = ['Gil Desmarais']
9
+ spec.email = ['html2rss@desmarais.de']
10
+
11
+ spec.summary = 'Generate RSS feeds by scraping websites by providing a config.'
12
+ spec.description = 'Create your config object, include the url to scrape,
13
+ some selectors and get a RSS2 feed in return.'
14
+ spec.homepage = 'https://github.com/gildesmarais/html2rss'
15
+ spec.license = 'MIT'
16
+
17
+ if spec.respond_to?(:metadata)
18
+ spec.metadata['allowed_push_host'] = 'https://rubygems.org'
19
+ else
20
+ raise 'RubyGems 2.0 or newer is required to protect against ' \
21
+ 'public gem pushes.'
22
+ end
23
+
24
+ spec.files = `git ls-files -z`.split("\x0").reject do |f|
25
+ f.match(%r{^(test|spec|features)/})
26
+ end
27
+ spec.bindir = 'exe'
28
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
29
+ spec.require_paths = ['lib']
30
+
31
+ spec.add_dependency 'nokogiri', '~> 1.8'
32
+ spec.add_dependency 'sanitize', '~> 4.6'
33
+ spec.add_dependency 'faraday', '~> 0.15'
34
+ spec.add_development_dependency 'bundler', '~> 1.16'
35
+ spec.add_development_dependency 'rspec', '~> 3.0'
36
+ spec.add_development_dependency 'vcr', '~> 4.0'
37
+ spec.add_development_dependency 'byebug', '~> 10.0'
38
+ end
@@ -0,0 +1,16 @@
1
+ require 'html2rss/config'
2
+ require 'html2rss/feed_builder'
3
+ require 'html2rss/version'
4
+ require 'yaml'
5
+
6
# Top-level entry points for turning a scraping config into an RSS feed.
module Html2rss
  # Builds the feed registered under +name+ in the YAML config at +file+.
  #
  # @param file [String] path to a YAML file; its parsed content is handed to
  #   Config.new together with +name+
  # @param name [String, Symbol] identifier of the feed to build
  # @return [Object] the result of FeedBuilder#rss for that feed
  def self.feed_from_yaml_config(file, name)
    # The block form closes the file handle as soon as parsing is done; a bare
    # File.open would keep it open until the object is garbage collected.
    # NOTE(review): YAML.load may instantiate arbitrary objects on older Psych
    # versions - only point this at config files you trust.
    yaml = File.open(file) { |io| YAML.load(io) }

    feed(Config.new(yaml.freeze, name))
  end

  # Builds the feed described by an already constructed config object.
  #
  # @param config [Html2rss::Config]
  # @return [Object] the result of FeedBuilder#rss
  def self.feed(config)
    FeedBuilder.new(config).rss
  end
end
@@ -0,0 +1,54 @@
1
module Html2rss
  # Read-only view on the configuration of a single feed.
  #
  # The raw config is expected to be a Hash with string keys: `feeds` maps a
  # feed name to a Hash holding `channel` (feed level settings) and
  # `selectors` (one entry per item attribute plus the special `items` entry);
  # an optional top-level `headers` Hash is exposed through #headers.
  class Config
    attr_reader :feed_config, :channel_config

    # @param config [Hash] the complete parsed configuration
    # @param name [String, Symbol] key of the feed inside `config['feeds']`
    def initialize(config, name)
      @config = config
      @feed_config = @config['feeds'][name.to_s]
      @channel_config = @feed_config['channel']
    end

    # @return [String] configured author, 'html2rss' when absent
    def author
      channel_config.fetch('author', 'html2rss')
    end

    # @return [Integer, nil] configured ttl as integer, nil when no ttl is set
    def ttl
      # A missing ttl used to raise KeyError although every other channel
      # attribute is optional; treat it as "not set" instead.
      channel_config['ttl']&.to_i
    end

    # @return [String] configured title or a generic placeholder
    def title
      channel_config.fetch('title', 'html2rss generated title')
    end

    # @return [String] configured language, 'en' when absent
    def language
      channel_config.fetch('language', 'en')
    end

    # @return [String] configured description or a generic placeholder
    def description
      channel_config.fetch('description', 'A description of my html2rss feed.')
    end

    # @return [String, nil] the channel's `url` entry
    def url
      channel_config['url']
    end
    alias link url

    # @return [Hash] the top-level `headers` entry, empty Hash when absent
    def headers
      @config.fetch('headers', {})
    end

    # Options of one selector, enriched with the channel config under the
    # 'channel' key.
    #
    # @param name [String] selector name
    # @return [Hash, nil] merged options, nil when no such selector exists
    def options(name)
      # Returning nil (instead of failing on nil.merge) lets callers decide
      # how to handle an unknown selector.
      feed_config.dig('selectors', name)&.merge('channel' => channel_config)
    end

    # @param name [String] selector name
    # @return [String, nil] the `selector` entry of that selector
    def selector(name)
      feed_config.dig('selectors', name, 'selector')
    end

    # @return [Array<Symbol>] names of all configured selectors except :items
    def attribute_names
      feed_config.fetch('selectors', {}).keys.map(&:to_sym) - [:items]
    end
  end
end
@@ -0,0 +1,47 @@
1
+ require 'rss'
2
+ require_relative 'item'
3
+
4
require 'digest'

module Html2rss
  # Assembles an RSS 2.0 feed from a config object: channel attributes are
  # copied from the config, items are obtained through Item.from_url.
  class FeedBuilder
    # Channel attributes copied one-to-one from the config onto the channel.
    CHANNEL_ATTRIBUTES = %i[language author title description link ttl].freeze

    attr_reader :config

    # @param feed_config [Html2rss::Config] config of the feed to build
    def initialize(feed_config)
      @config = feed_config
    end

    # @return [Object] the feed produced by RSS::Maker.make('2.0')
    def rss
      RSS::Maker.make('2.0') do |maker|
        add_channel_to_maker(maker)

        feed_items.each { |feed_item| add_item_to_items(feed_item, maker.items) }
      end
    end

    private

    # Copies the channel attributes from the config and stamps the generator
    # and build date.
    def add_channel_to_maker(maker)
      channel = maker.channel

      CHANNEL_ATTRIBUTES.each do |attribute_name|
        channel.public_send("#{attribute_name}=", config.public_send(attribute_name))
      end

      channel.generator = "html2rss V. #{::Html2rss::VERSION}"
      channel.lastBuildDate = Time.now.to_s
    end

    # @return [Enumerable] the items found at the configured url
    def feed_items
      Item.from_url(config.url, config)
    end

    # Adds one feed item to the maker's item list.
    def add_item_to_items(feed_item, items)
      attribute_names = config.attribute_names

      items.new_item do |rss_item|
        attribute_names.each do |attribute_name|
          # public_send keeps config-provided names from reaching private
          # Kernel methods (e.g. an attribute called `format` or `system`).
          rss_item.public_send("#{attribute_name}=", feed_item.public_send(attribute_name))
        end

        # The guid is derived from the title, so it is computed once per item
        # (not once per attribute) and only when a title selector exists.
        next unless attribute_names.include?(:title)

        rss_item.guid.content = Digest::SHA1.hexdigest(feed_item.title)
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ require 'faraday'
2
+ require 'open-uri'
3
+ require 'nokogiri'
4
+ require_relative 'item_extractor'
5
+
6
require 'time'

module Html2rss
  # One feed item backed by a parsed document fragment. Every selector name
  # reported by the config's #attribute_names is available as a reader method
  # which runs the configured extractor on the fragment.
  class Item
    attr_reader :xml, :config

    # @param xml [Object] the fragment the extractors operate on
    # @param config [Html2rss::Config] config providing selectors and options
    def initialize(xml, config)
      @xml = xml
      @config = config
    end

    def respond_to_missing?(method_name, _include_private = false)
      config.attribute_names.include?(method_name) || super
    end

    # Resolves a configured attribute: looks up its options, picks the
    # extractor constant named by the 'extractor' option (default 'text') and
    # post-processes the extracted value.
    #
    # @raise [NoMethodError] when +method_name+ is not a configured attribute
    # @raise [NameError] when the configured extractor does not exist
    def method_missing(method_name, *_args)
      # Keep this in step with respond_to_missing? so unknown names end in a
      # regular NoMethodError.
      return super unless config.attribute_names.include?(method_name)

      attribute_config = config.options(method_name.to_s)
      return super unless attribute_config

      extractor_name = attribute_config['extractor'] || 'text'
      # inherit = false: only constants defined on ItemExtractor itself count,
      # so a name such as 'env' cannot resolve to a global constant.
      extractor = ItemExtractor.const_get(extractor_name.upcase.to_sym, false)

      post_process(method_name, extractor.call(xml, attribute_config))
    end

    # Converts the raw value of a few well-known attributes.
    #
    # @param method_name [Symbol] attribute the value belongs to
    # @param value [Object] extracted value
    # @return [Object] a URI for :link, a time string for :updated, otherwise
    #   the value itself
    def post_process(method_name, value)
      case method_name
      when :link then URI(value)
      when :updated then Time.parse(value).to_s
      else value
      end
    end

    # Requests +url+ (sending the configured headers) and wraps every node
    # matched by the 'items' selector in an Item.
    #
    # @param url [String] address of the page to scrape
    # @param config [Html2rss::Config]
    # @return [Array<Html2rss::Item>]
    def self.from_url(url, config)
      connection = Faraday.new(url: url, headers: config.headers)
      page = Nokogiri::HTML(connection.get.body)

      page.css(config.selector('items')).map { |xml_item| new(xml_item, config) }
    end
  end
end
@@ -0,0 +1,25 @@
1
+ require 'sanitize'
2
+
3
require 'uri'

module Html2rss
  # Extraction strategies. Each constant is a proc called with the item
  # fragment and the selector's options Hash (string keys; 'selector' is
  # always read, 'channel' holds the channel config).
  module ItemExtractor
    # Text content of the nodes matched by the selector.
    TEXT = proc { |xml, options| xml.css(options['selector'])&.text }

    # Value of the attribute named by options['attribute'].
    ATTRIBUTE = proc { |xml, options| xml.css(options['selector']).attr(options['attribute']) }

    # The matched node's href resolved against the channel url.
    # URI.join copes with hrefs that carry a query string, are relative or are
    # already absolute - assigning the raw href to URI#path raised for all of
    # those and leaked the channel url's query into the item link.
    HREF = proc { |xml, options|
      href = xml.css(options['selector']).attr('href').to_s

      URI.join(options['channel']['url'], href)
    }

    # Sanitized HTML of the matched nodes; links get a restrictive rel value.
    HTML = proc { |xml, options|
      sanitize_config = Sanitize::Config.merge(
        Sanitize::Config::RELAXED,
        add_attributes: {
          'a' => { 'rel' => 'nofollow noopener noreferrer' }
        }
      )

      Sanitize.fragment(xml.css(options['selector']).to_s, sanitize_config)
    }
  end
end
@@ -0,0 +1,3 @@
1
# Namespace of the html2rss gem.
module Html2rss
  # Release number of the gem as an immutable string.
  VERSION = '0.0.1'.freeze
end
metadata ADDED
@@ -0,0 +1,161 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html2rss
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Gil Desmarais
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-06-03 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.8'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.8'
27
+ - !ruby/object:Gem::Dependency
28
+ name: sanitize
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '4.6'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '4.6'
41
+ - !ruby/object:Gem::Dependency
42
+ name: faraday
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.15'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.15'
55
+ - !ruby/object:Gem::Dependency
56
+ name: bundler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.16'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.16'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: vcr
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '4.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '4.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: byebug
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '10.0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '10.0'
111
+ description: |-
112
+ Create your config object, include the url to scrape,
113
+ some selectors and get a RSS2 feed in return.
114
+ email:
115
+ - html2rss@desmarais.de
116
+ executables: []
117
+ extensions: []
118
+ extra_rdoc_files: []
119
+ files:
120
+ - ".gitignore"
121
+ - ".rspec"
122
+ - ".travis.yml"
123
+ - Gemfile
124
+ - Gemfile.lock
125
+ - LICENSE
126
+ - README.md
127
+ - bin/console
128
+ - bin/setup
129
+ - html2rss.gemspec
130
+ - lib/html2rss.rb
131
+ - lib/html2rss/config.rb
132
+ - lib/html2rss/feed_builder.rb
133
+ - lib/html2rss/item.rb
134
+ - lib/html2rss/item_extractor.rb
135
+ - lib/html2rss/version.rb
136
+ homepage: https://github.com/gildesmarais/html2rss
137
+ licenses:
138
+ - MIT
139
+ metadata:
140
+ allowed_push_host: https://rubygems.org
141
+ post_install_message:
142
+ rdoc_options: []
143
+ require_paths:
144
+ - lib
145
+ required_ruby_version: !ruby/object:Gem::Requirement
146
+ requirements:
147
+ - - ">="
148
+ - !ruby/object:Gem::Version
149
+ version: '0'
150
+ required_rubygems_version: !ruby/object:Gem::Requirement
151
+ requirements:
152
+ - - ">="
153
+ - !ruby/object:Gem::Version
154
+ version: '0'
155
+ requirements: []
156
+ rubyforge_project:
157
+ rubygems_version: 2.7.6
158
+ signing_key:
159
+ specification_version: 4
160
+ summary: Generate RSS feeds by scraping websites by providing a config.
161
+ test_files: []