news_scraper 0.1.1
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rubocop.yml +96 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +21 -0
- data/README.md +105 -0
- data/Rakefile +24 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/circle.yml +3 -0
- data/config/article_scrape_patterns.yml +116 -0
- data/config/temp_dirs.yml +4 -0
- data/dev.yml +13 -0
- data/lib/news_scraper/active_support_lite/string.rb +11 -0
- data/lib/news_scraper/cli.rb +106 -0
- data/lib/news_scraper/constants.rb +6 -0
- data/lib/news_scraper/errors.rb +16 -0
- data/lib/news_scraper/extractors/article.rb +17 -0
- data/lib/news_scraper/extractors/google_news_rss.rb +41 -0
- data/lib/news_scraper/extractors_helpers.rb +27 -0
- data/lib/news_scraper/scraper.rb +42 -0
- data/lib/news_scraper/trainer/preset_selector.rb +77 -0
- data/lib/news_scraper/trainer/url_trainer.rb +74 -0
- data/lib/news_scraper/trainer.rb +25 -0
- data/lib/news_scraper/transformers/article.rb +77 -0
- data/lib/news_scraper/transformers/trainer_article.rb +17 -0
- data/lib/news_scraper/uri_parser.rb +41 -0
- data/lib/news_scraper/version.rb +3 -0
- data/lib/news_scraper.rb +42 -0
- data/news_scraper.gemspec +41 -0
- metadata +337 -0
data/lib/news_scraper/extractors/google_news_rss.rb
ADDED
@@ -0,0 +1,41 @@
require 'rss'
require 'nokogiri'

module NewsScraper
  module Extractors
    class GoogleNewsRss
      include ExtractorsHelpers

      BASE_URL = 'https://news.google.com/news?cf=all&hl=en&pz=1&ned=us&output=rss'.freeze
      def initialize(query:)
        @query = query
      end

      def extract
        http_request "#{BASE_URL}&q=#{@query}" do |response|
          google_urls = google_urls_from_resp(response.body)
          extract_article_urls(google_urls)
        end
      end

      private

      def google_urls_from_resp(body)
        rss = RSS::Parser.parse(body)

        rss.items.flat_map do |rss_item|
          Nokogiri::HTML(rss_item.description).xpath('//a').map do |anchor|
            anchor['href']
          end
        end
      end

      def extract_article_urls(google_urls)
        google_urls.map do |google_url|
          regex = google_url.match(%r{&url=(?<url>https?://.*)})
          regex.nil? ? nil : regex['url']
        end.compact.uniq
      end
    end
  end
end
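A minimal usage sketch of the extractor on its own (the query string below is illustrative, not taken from the source):

# Hypothetical example: extract article URLs for a query via the Google News RSS feed.
extractor = NewsScraper::Extractors::GoogleNewsRss.new(query: 'ruby')
article_urls = extractor.extract
# => an array of deduplicated article URLs pulled from the RSS item descriptions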
data/lib/news_scraper/extractors_helpers.rb
ADDED
@@ -0,0 +1,27 @@
module NewsScraper
  module ExtractorsHelpers
    # Perform an HTTP request with a standardized response
    #
    # *Params*
    # - <code>url</code>: the url on which to perform a get request
    #
    def http_request(url)
      url = URIParser.new(url).with_scheme

      CLI.put_header(url)
      CLI.log "Beginning HTTP request for #{url}"
      response = HTTParty.get(url)

      raise ResponseError.new("#{response.code} - #{response.message}") unless response.code == 200

      CLI.log "#{response.code} - #{response.message}. Request successful for #{url}"
      CLI.put_footer

      if block_given?
        yield response
      else
        response
      end
    end
  end
end
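The helper supports both a block form and a plain return. A sketch of an extractor that mixes it in (the class and method names here are hypothetical):

class ExampleExtractor
  include NewsScraper::ExtractorsHelpers

  def body_for(url)
    # Block form: the HTTParty response is yielded once a 200 is confirmed.
    http_request(url) { |response| response.body }
  end

  def response_for(url)
    # Non-block form: the response object itself is returned.
    http_request(url)
  end
end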
data/lib/news_scraper/scraper.rb
ADDED
@@ -0,0 +1,42 @@
module NewsScraper
  class Scraper
    # Initialize a Scraper object
    #
    # *Params*
    # - <code>query</code>: a keyword argument specifying the query to scrape
    #
    def initialize(query:)
      @query = query
    end

    # Fetches articles from Extraction sources and scrapes the results
    #
    # *Yields*
    # - Will yield individually extracted articles
    #
    # *Raises*
    # - Will raise a <code>Transformers::ScrapePatternNotDefined</code> if an article is not in the root domains
    #   - Root domains are specified by the <code>article_scrape_patterns.yml</code> file
    #   - This root domain will need to be trained; it would be helpful to have a PR created to train the domain
    #   - You can train the domain by running <code>NewsScraper::Trainer::UrlTrainer.new(URL_TO_TRAIN).train</code>
    #
    # *Returns*
    # - <code>transformed_articles</code>: The transformed articles fetched from the extracted sources
    #
    def scrape
      article_urls = Extractors::GoogleNewsRss.new(query: @query).extract

      transformed_articles = []
      article_urls.each do |article_url|
        payload = Extractors::Article.new(url: article_url).extract

        transformed_article = Transformers::Article.new(url: article_url, payload: payload).transform
        transformed_articles << transformed_article

        yield transformed_article if block_given?
      end

      transformed_articles
    end
  end
end
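A sketch of how the scraper is driven, assuming every extracted article's root domain is already trained (the query is illustrative):

scraper = NewsScraper::Scraper.new(query: 'Tesla Motors')
articles = scraper.scrape do |article|
  # Each transformed article is a hash keyed by the configured data_types,
  # merged with :uri and :root_domain.
  puts article[:root_domain]
end
# `articles` holds the same transformed hashes, in extraction order.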
data/lib/news_scraper/trainer/preset_selector.rb
ADDED
@@ -0,0 +1,77 @@
module NewsScraper
  module Trainer
    class PresetSelector
      PROVIDER_PHRASE = 'I will provide a pattern using'.freeze

      def initialize(data_type:, data_type_presets:, url:, payload:)
        @url = url
        @payload = payload
        @data_type_presets = data_type_presets
        @data_type = data_type
      end

      def select
        return unless @data_type_presets

        selected_option = CLI.prompt_with_options(
          "Select which preset to use for #{@data_type}:",
          pattern_options.keys
        )

        if selected_option.start_with?(PROVIDER_PHRASE)
          pattern_type = pattern_options[selected_option]
          return {
            'method' => pattern_type,
            'pattern' => CLI.get_input("Provide the #{pattern_type} pattern:")
          }
        end
        return if selected_option == 'skip'

        selected_index = pattern_options[selected_option]
        selected_preset_code = transform_results[selected_index].first
        @data_type_presets[selected_preset_code].merge('variable' => [selected_preset_code, @data_type].join('_'))
      end

      private

      def pattern_options
        return {} unless @data_type_presets

        @pattern_options ||= begin
          temp_options = transform_results.each_with_object({}).with_index do |(results, options_hash), index|
            preset_name = "#{results[0]}_#{@data_type}"
            extracted_text = results[1]
            options_hash["#{preset_name}: #{extracted_text}"] = index
          end
          %w(xpath css).each do |pattern_provider|
            temp_options["#{PROVIDER_PHRASE} #{pattern_provider}"] = pattern_provider
          end
          temp_options.merge('skip' => 'skip')
        end
      end

      def transform_results
        return {} unless @data_type_presets

        scrape_details = blank_scrape_details
        @results ||= @data_type_presets.each_with_object({}) do |(preset_name, preset_details), hash|
          scrape_details[@data_type] = preset_details
          train_transformer = Transformers::TrainerArticle.new(
            url: @url,
            payload: @payload,
            scrape_details: scrape_details,
          )

          transformed_result = train_transformer.transform[@data_type.to_sym]
          hash[preset_name] = transformed_result if transformed_result && !transformed_result.empty?
        end.to_a
      end

      def blank_scrape_details
        @blank_scrape_details ||= Constants::SCRAPE_PATTERNS.each_with_object({}) do |data_type, hash|
          hash[data_type] = nil
        end
      end
    end
  end
end
data/lib/news_scraper/trainer/url_trainer.rb
ADDED
@@ -0,0 +1,74 @@
module NewsScraper
  module Trainer
    class UrlTrainer
      def initialize(url)
        @url = url
        @root_domain = URIParser.new(@url).host
        @payload = Extractors::Article.new(url: @url).extract
      end

      def train
        return if article_scrape_patterns['domains'].key?(@root_domain)

        CLI.put_header(@root_domain)
        CLI.log("There is no scrape pattern defined for #{@root_domain} in #{Constants::SCRAPE_PATTERN_FILEPATH}")
        CLI.log "Fetching information..."
        CLI.put_footer

        selected_presets = {}
        article_scrape_patterns['data_types'].each do |data_type|
          selected_presets[data_type] = selected_pattern(data_type)
        end

        save_selected_presets(selected_presets)
      end

      private

      def selected_pattern(data_type)
        CLI.put_header("Determining information for #{data_type}")
        data_type_presets = article_scrape_patterns['presets'][data_type]
        pattern = if data_type_presets.nil?
          CLI.log("No presets were found for #{data_type}. Skipping to next.")
          nil
        else
          PresetSelector.new(
            url: @url,
            payload: @payload,
            data_type_presets: data_type_presets,
            data_type: data_type
          ).select
        end
        CLI.put_footer

        pattern || { 'method' => "<<<<< TODO >>>>>", 'pattern' => "<<<<< TODO >>>>>" }
      end

      def save_selected_presets(selected_presets)
        current_content = File.read(Constants::SCRAPE_PATTERN_FILEPATH).chomp
        new_content = "#{current_content}\n#{build_domain_yaml(selected_presets)}\n"

        File.write(Constants::SCRAPE_PATTERN_FILEPATH, new_content)
        CLI.log("Successfully wrote presets for #{@root_domain} to #{Constants::SCRAPE_PATTERN_FILEPATH}.")
      end

      def build_domain_yaml(selected_presets)
        spacer = " "
        output_string = ["#{spacer}#{@root_domain}:"]
        selected_presets.each do |data_type, spec|
          if spec.include?('variable')
            output_string << (spacer * 2) + "#{data_type}: *#{spec['variable']}"
          else
            output_string << (spacer * 2) + "#{data_type}:"
            spec.each { |k, v| output_string << (spacer * 3) + "#{k}: #{v}" }
          end
        end
        output_string.join("\n")
      end

      def article_scrape_patterns
        @article_scrape_patterns ||= YAML.load_file(Constants::SCRAPE_PATTERN_FILEPATH)
      end
    end
  end
end
data/lib/news_scraper/trainer.rb
ADDED
@@ -0,0 +1,25 @@
require 'news_scraper/trainer/preset_selector'
require 'news_scraper/trainer/url_trainer'

module NewsScraper
  module Trainer
    extend self

    # Fetches articles from Extraction sources and trains on the results
    #
    # *Training* is a process where we take an untrained url (root domain
    # is not in <code>article_scrape_patterns.yml</code>) and determine patterns and methods
    # to match the data_types listed in <code>article_scrape_patterns.yml</code>, then record
    # them to the <code>article_scrape_patterns.yml</code> file
    #
    # *Params*
    # - <code>query</code>: a keyword argument specifying the query to train on
    #
    def train(query: '')
      article_urls = Extractors::GoogleNewsRss.new(query: query).extract
      article_urls.each do |url|
        Trainer::UrlTrainer.new(url).train
      end
    end
  end
end
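A sketch of the two training entry points (the query and URL are placeholders):

# Train on every article extracted for a query (interactive CLI prompts).
NewsScraper::Trainer.train(query: 'Tesla Motors')

# Or train a single untrained URL directly, as suggested in Scraper#scrape's docs.
NewsScraper::Trainer::UrlTrainer.new('http://some-news-site.com/an-article').train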
data/lib/news_scraper/transformers/article.rb
ADDED
@@ -0,0 +1,77 @@
require 'nokogiri'
require 'sanitize'
require 'readability'
require 'htmlbeautifier'

module NewsScraper
  module Transformers
    class Article
      # Initialize an Article object
      #
      # *Params*
      # - <code>url</code>: keyword arg - the url on which scraping was done
      # - <code>payload</code>: keyword arg - the result of the scrape
      #
      def initialize(url:, payload:)
        uri_parser = URIParser.new(url)
        @uri = uri_parser.without_scheme
        @root_domain = uri_parser.host
        @payload = payload
      end

      # Transform the article
      #
      # *Raises*
      # - ScrapePatternNotDefined: will raise this error if the root domain is not in the article_scrape_patterns.yml
      #
      # *Returns*
      # - <code>transformed_response</code>: the response that has been parsed and transformed to a hash
      #
      def transform
        raise ScrapePatternNotDefined.new(uri: @uri, root_domain: @root_domain) unless scrape_details

        transformed_response.merge(uri: @uri, root_domain: @root_domain)
      end

      private

      def scrape_details
        @scrape_details ||= Constants::SCRAPE_PATTERNS['domains'][@root_domain]
      end

      def transformed_response
        Constants::SCRAPE_PATTERNS['data_types'].each_with_object({}) do |data_type, response|
          response[data_type.to_sym] = parsed_data(data_type)
        end
      end

      def parsed_data(data_type)
        return nil unless scrape_details[data_type]

        scrape_method = scrape_details[data_type]['method'].to_sym
        case scrape_method
        when :xpath
          noko_html = Nokogiri::HTML(@payload)
          Sanitize.fragment(
            noko_html.send(scrape_method, "(#{scrape_details[data_type]['pattern']})[1]")
          ).squish
        when :css
          noko_html = Nokogiri::HTML(@payload)
          Sanitize.fragment(
            noko_html.send(scrape_method, scrape_details[data_type]['pattern'])
          ).squish
        when :readability
          content = Readability::Document.new(
            @payload,
            remove_empty_nodes: true,
            tags: %w(div p img a table tr th tbody td h1 h2 h3 h4 h5 h6),
            attributes: %w(src href colspan rowspan)
          ).content
          # Remove any newlines in the text
          content = content.squeeze("\n").strip
          HtmlBeautifier.beautify(content)
        end
      end
    end
  end
end
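A sketch of running the transformer by hand, assuming the URL's root domain already has a scrape pattern defined (the URL is a placeholder):

url = 'http://some-news-site.com/an-article'
payload = NewsScraper::Extractors::Article.new(url: url).extract

transformed = NewsScraper::Transformers::Article.new(url: url, payload: payload).transform
transformed[:root_domain] # => "some-news-site.com"
# The remaining keys mirror the data_types in article_scrape_patterns.yml.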
data/lib/news_scraper/transformers/trainer_article.rb
ADDED
@@ -0,0 +1,17 @@
module NewsScraper
  module Transformers
    class TrainerArticle < Article
      # Initialize a TrainerArticle object
      #
      # *Params*
      # - <code>url</code>: keyword arg - the url on which scraping was done
      # - <code>payload</code>: keyword arg - the result of the scrape
      # - <code>scrape_details</code>: keyword arg - The pattern/methods for the domain to use in the transformation
      #
      def initialize(url:, payload:, scrape_details:)
        @scrape_details = scrape_details
        super(url: url, payload: payload)
      end
    end
  end
end
data/lib/news_scraper/uri_parser.rb
ADDED
@@ -0,0 +1,41 @@
require 'uri'

module NewsScraper
  class URIParser
    # Initialize a URIParser
    #
    # *Params*
    # - <code>url</code>: the url to parse to a uri
    #
    def initialize(url)
      @uri = URI.parse(url)
    end

    # Removes the scheme from the URI
    #
    # *Returns*
    # - A schemeless URI string, e.g. https://google.ca will return google.ca
    #
    def without_scheme
      @uri.scheme ? @uri.to_s.gsub(%r{^#{@uri.scheme}://}, '') : @uri.to_s
    end

    # Returns the URI with a scheme, adding http:// if no scheme is present
    #
    # *Returns*
    # - A URI string, with http:// if no scheme was specified
    #
    def with_scheme
      @uri.scheme ? @uri.to_s : "http://#{@uri}"
    end

    # Returns the URI's host, removing paths, params, and schemes
    #
    # *Returns*
    # - The URI's host, e.g. https://google.ca/search&q=query will return google.ca
    #
    def host
      without_scheme.downcase.match(/^(?:[\w\d-]+\.)?(?<host>[\w\d-]+\.\w{2,})/)['host']
    end
  end
end
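Putting the three methods together, using the example URLs from the doc comments above:

parser = NewsScraper::URIParser.new('https://google.ca/search&q=query')
parser.with_scheme    # => "https://google.ca/search&q=query"
parser.without_scheme # => "google.ca/search&q=query"
parser.host           # => "google.ca"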
data/lib/news_scraper.rb
ADDED
@@ -0,0 +1,42 @@
require 'httparty'
require 'yaml'

require 'news_scraper/constants'
require 'news_scraper/uri_parser'
require 'news_scraper/active_support_lite/string'

require 'news_scraper/errors'
require 'news_scraper/version'

require 'news_scraper/extractors_helpers'

require 'news_scraper/extractors/google_news_rss'
require 'news_scraper/extractors/article'

require 'news_scraper/transformers/article'
require 'news_scraper/transformers/trainer_article'

require 'news_scraper/scraper'

require 'news_scraper/cli'
require 'news_scraper/trainer'

module NewsScraper
  extend self

  # <code>NewsScraper::train</code> is an interactive command-line prompt that:
  #
  # 1. Collates all articles for the given :query
  # 2. Greps for <code>:data_types</code> using <code>:presets</code> in <code>config/article_scrape_patterns.yml</code>
  # 3. Displays the results of each <code>:preset</code> grep for a given <code>:data_type</code>
  # 4. Prompts to select one of the <code>:presets</code> or define a pattern for that domain's <code>:data_type</code>
  #    N.B: User may ignore all presets and manually configure it in the YAML file
  # 5. Saves the selected <code>:preset</code> to <code>config/article_scrape_patterns.yml</code>
  #
  # *Params*
  # - <code>query</code>: a keyword argument specifying the query to train on
  #
  def train(query:)
    Trainer.train(query: query)
  end
end
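An end-to-end sketch of the gem's two top-level flows, training followed by scraping (the query is illustrative):

require 'news_scraper'

# Interactive training pass: records scrape patterns for any untrained domains.
NewsScraper.train(query: 'Tesla Motors')

# Scrape the same query once the domains are trained.
articles = NewsScraper::Scraper.new(query: 'Tesla Motors').scrape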
data/news_scraper.gemspec
ADDED
@@ -0,0 +1,41 @@
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'news_scraper/version'

Gem::Specification.new do |spec|
  spec.name = "news_scraper"
  spec.version = NewsScraper::VERSION
  spec.authors = ["Richard Wu", "Julian Nadeau"]
  spec.email = ["richardwu1997@gmail.com"]

  spec.summary = 'Simple ETL news scraper in Ruby'
  spec.description = 'A collection of extractors, transformers and loaders for scraping news websites and syndicates.'
  spec.homepage = 'https://github.com/richardwu/news_scraper'
  spec.license = "MIT"

  # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
  # to allow pushing to a single host or delete this section to allow pushing to any host.
  raise "RubyGems 2.0 or newer is required to protect against public gem pushes." unless spec.respond_to?(:metadata)
  spec.metadata['allowed_push_host'] = 'https://rubygems.org'

  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_dependency 'nokogiri', '~> 1.6', '>= 1.6.8'
  spec.add_dependency 'httparty', '~> 0.14', '>= 0.14.0'
  spec.add_dependency 'sanitize', '~> 4.2', '>= 4.2.0'
  spec.add_dependency 'ruby-readability', '~> 0.7', '>= 0.7.0'
  spec.add_dependency 'htmlbeautifier', '~> 1.1', '>= 1.1.1'

  spec.add_development_dependency 'bundler', '~> 1.12', '>= 1.12.0'
  spec.add_development_dependency 'rake', '~> 10.0', '>= 10.0.0'
  spec.add_development_dependency 'minitest', '~> 5.9', '>= 5.9.0'
  spec.add_development_dependency 'pry', '~> 0.10', '>= 0.10.4'
  spec.add_development_dependency 'mocha', '~> 1.1', '>= 1.1.0'
  spec.add_development_dependency 'timecop', '~> 0.8', '>= 0.8.0'
  spec.add_development_dependency 'rubocop', '~> 0.42', '>= 0.42.0'
  spec.add_development_dependency 'rdoc', '~> 4.2', '>= 4.2.2'
end