news_scraper 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rubocop.yml +96 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +21 -0
- data/README.md +105 -0
- data/Rakefile +24 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/circle.yml +3 -0
- data/config/article_scrape_patterns.yml +116 -0
- data/config/temp_dirs.yml +4 -0
- data/dev.yml +13 -0
- data/lib/news_scraper/active_support_lite/string.rb +11 -0
- data/lib/news_scraper/cli.rb +106 -0
- data/lib/news_scraper/constants.rb +6 -0
- data/lib/news_scraper/errors.rb +16 -0
- data/lib/news_scraper/extractors/article.rb +17 -0
- data/lib/news_scraper/extractors/google_news_rss.rb +41 -0
- data/lib/news_scraper/extractors_helpers.rb +27 -0
- data/lib/news_scraper/scraper.rb +42 -0
- data/lib/news_scraper/trainer/preset_selector.rb +77 -0
- data/lib/news_scraper/trainer/url_trainer.rb +74 -0
- data/lib/news_scraper/trainer.rb +25 -0
- data/lib/news_scraper/transformers/article.rb +77 -0
- data/lib/news_scraper/transformers/trainer_article.rb +17 -0
- data/lib/news_scraper/uri_parser.rb +41 -0
- data/lib/news_scraper/version.rb +3 -0
- data/lib/news_scraper.rb +42 -0
- data/news_scraper.gemspec +41 -0
- metadata +337 -0
data/lib/news_scraper/extractors/google_news_rss.rb
ADDED
@@ -0,0 +1,41 @@
require 'rss'
require 'nokogiri'

module NewsScraper
  module Extractors
    class GoogleNewsRss
      include ExtractorsHelpers

      BASE_URL = 'https://news.google.com/news?cf=all&hl=en&pz=1&ned=us&output=rss'.freeze
      def initialize(query:)
        @query = query
      end

      def extract
        http_request "#{BASE_URL}&q=#{@query}" do |response|
          google_urls = google_urls_from_resp(response.body)
          extract_article_urls(google_urls)
        end
      end

      private

      def google_urls_from_resp(body)
        rss = RSS::Parser.parse(body)

        rss.items.flat_map do |rss_item|
          Nokogiri::HTML(rss_item.description).xpath('//a').map do |anchor|
            anchor['href']
          end
        end
      end

      def extract_article_urls(google_urls)
        google_urls.map do |google_url|
          regex = google_url.match(%r{&url=(?<url>https?://.*)})
          regex.nil? ? nil : regex['url']
        end.compact.uniq
      end
    end
  end
end
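The regex in extract_article_urls strips the Google News redirect wrapper from each anchor href. A minimal sketch of that step; the redirect URL and its sa/usg parameters are made up for illustration, not taken from the gem:

google_url = 'https://news.google.com/news/url?sa=t&usg=AFQjCNxyz&url=https://example.com/some-article'
match = google_url.match(%r{&url=(?<url>https?://.*)})
match['url'] # => "https://example.com/some-article"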
data/lib/news_scraper/extractors_helpers.rb
ADDED
@@ -0,0 +1,27 @@
module NewsScraper
  module ExtractorsHelpers
    # Perform an HTTP request with a standardized response
    #
    # *Params*
    # - <code>url</code>: the url on which to perform a get request
    #
    def http_request(url)
      url = URIParser.new(url).with_scheme

      CLI.put_header(url)
      CLI.log "Beginning HTTP request for #{url}"
      response = HTTParty.get(url)

      raise ResponseError.new("#{response.code} - #{response.message}") unless response.code == 200

      CLI.log "#{response.code} - #{response.message}. Request successful for #{url}"
      CLI.put_footer

      if block_given?
        yield response
      else
        response
      end
    end
  end
end
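A brief usage sketch of this helper; the ExampleExtractor class and the URL are hypothetical, not part of the gem:

class ExampleExtractor
  include NewsScraper::ExtractorsHelpers

  # Block form: the HTTParty response is yielded once the 200 check passes
  def page_body(url)
    http_request(url) { |response| response.body }
  end

  # Without a block, the response object itself is returned
  def page_response(url)
    http_request(url)
  end
end

ExampleExtractor.new.page_body('http://example.com')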
data/lib/news_scraper/scraper.rb
ADDED
@@ -0,0 +1,42 @@
module NewsScraper
  class Scraper
    # Initialize a Scraper object
    #
    # *Params*
    # - <code>query</code>: a keyword argument specifying the query to scrape
    #
    def initialize(query:)
      @query = query
    end

    # Fetches articles from Extraction sources and scrapes the results
    #
    # *Yields*
    # - Will yield individually extracted articles
    #
    # *Raises*
    # - Will raise a <code>Transformers::ScrapePatternNotDefined</code> if an article is not in the root domains
    #   - Root domains are specified by the <code>article_scrape_patterns.yml</code> file
    #   - This root domain will need to be trained; it would be helpful to have a PR created to train the domain
    #   - You can train the domain by running <code>NewsScraper::Trainer::UrlTrainer.new(URL_TO_TRAIN).train</code>
    #
    # *Returns*
    # - <code>transformed_articles</code>: The transformed articles fetched from the extracted sources
    #
    def scrape
      article_urls = Extractors::GoogleNewsRss.new(query: @query).extract

      transformed_articles = []
      article_urls.each do |article_url|
        payload = Extractors::Article.new(url: article_url).extract

        transformed_article = Transformers::Article.new(url: article_url, payload: payload).transform
        transformed_articles << transformed_article

        yield transformed_article if block_given?
      end

      transformed_articles
    end
  end
end
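A minimal usage sketch of the scraper (the query string is illustrative):

NewsScraper::Scraper.new(query: 'Ruby on Rails').scrape do |article|
  # Each yielded article is a hash of the configured data_types plus :uri and :root_domain
  puts article[:root_domain]
end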
data/lib/news_scraper/trainer/preset_selector.rb
ADDED
@@ -0,0 +1,77 @@
module NewsScraper
  module Trainer
    class PresetSelector
      PROVIDER_PHRASE = 'I will provide a pattern using'.freeze

      def initialize(data_type:, data_type_presets:, url:, payload:)
        @url = url
        @payload = payload
        @data_type_presets = data_type_presets
        @data_type = data_type
      end

      def select
        return unless @data_type_presets

        selected_option = CLI.prompt_with_options(
          "Select which preset to use for #{@data_type}:",
          pattern_options.keys
        )

        if selected_option.start_with?(PROVIDER_PHRASE)
          pattern_type = pattern_options[selected_option]
          return {
            'method' => pattern_type,
            'pattern' => CLI.get_input("Provide the #{pattern_type} pattern:")
          }
        end
        return if selected_option == 'skip'

        selected_index = pattern_options[selected_option]
        selected_preset_code = transform_results[selected_index].first
        @data_type_presets[selected_preset_code].merge('variable' => [selected_preset_code, @data_type].join('_'))
      end

      private

      def pattern_options
        return {} unless @data_type_presets

        @pattern_options ||= begin
          temp_options = transform_results.each_with_object({}).with_index do |(results, options_hash), index|
            preset_name = "#{results[0]}_#{@data_type}"
            extracted_text = results[1]
            options_hash["#{preset_name}: #{extracted_text}"] = index
          end
          %w(xpath css).each do |pattern_provider|
            temp_options["#{PROVIDER_PHRASE} #{pattern_provider}"] = pattern_provider
          end
          temp_options.merge('skip' => 'skip')
        end
      end

      def transform_results
        return {} unless @data_type_presets

        scrape_details = blank_scrape_details
        @results ||= @data_type_presets.each_with_object({}) do |(preset_name, preset_details), hash|
          scrape_details[@data_type] = preset_details
          train_transformer = Transformers::TrainerArticle.new(
            url: @url,
            payload: @payload,
            scrape_details: scrape_details,
          )

          transformed_result = train_transformer.transform[@data_type.to_sym]
          hash[preset_name] = transformed_result if transformed_result && !transformed_result.empty?
        end.to_a
      end

      def blank_scrape_details
        @blank_scrape_details ||= Constants::SCRAPE_PATTERNS.each_with_object({}) do |data_type, hash|
          hash[data_type] = nil
        end
      end
    end
  end
end
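For reference, a sketch of the three possible outcomes of PresetSelector#select; the 'title' data type, the xpath pattern, and the local variables presets/url/payload are assumptions for illustration, not values from the gem:

selector = NewsScraper::Trainer::PresetSelector.new(
  data_type: 'title',
  data_type_presets: presets, # e.g. the presets defined for that data_type in article_scrape_patterns.yml
  url: url,
  payload: payload
)
selector.select
# => { 'method' => 'xpath', 'pattern' => '//h1' }  # the user typed their own pattern
# => nil                                           # the user chose 'skip'
# => preset details merged with 'variable' => 'some_preset_title'  # the user picked a preset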
data/lib/news_scraper/trainer/url_trainer.rb
ADDED
@@ -0,0 +1,74 @@
module NewsScraper
  module Trainer
    class UrlTrainer
      def initialize(url)
        @url = url
        @root_domain = URIParser.new(@url).host
        @payload = Extractors::Article.new(url: @url).extract
      end

      def train
        return if article_scrape_patterns['domains'].key?(@root_domain)

        CLI.put_header(@root_domain)
        CLI.log("There is no scrape pattern defined for #{@root_domain} in #{Constants::SCRAPE_PATTERN_FILEPATH}")
        CLI.log "Fetching information..."
        CLI.put_footer

        selected_presets = {}
        article_scrape_patterns['data_types'].each do |data_type|
          selected_presets[data_type] = selected_pattern(data_type)
        end

        save_selected_presets(selected_presets)
      end

      private

      def selected_pattern(data_type)
        CLI.put_header("Determining information for #{data_type}")
        data_type_presets = article_scrape_patterns['presets'][data_type]
        pattern = if data_type_presets.nil?
          CLI.log("No presets were found for #{data_type}. Skipping to next.")
          nil
        else
          PresetSelector.new(
            url: @url,
            payload: @payload,
            data_type_presets: data_type_presets,
            data_type: data_type
          ).select
        end
        CLI.put_footer

        pattern || { 'method' => "<<<<< TODO >>>>>", 'pattern' => "<<<<< TODO >>>>>" }
      end

      def save_selected_presets(selected_presets)
        current_content = File.read(Constants::SCRAPE_PATTERN_FILEPATH).chomp
        new_content = "#{current_content}\n#{build_domain_yaml(selected_presets)}\n"

        File.write(Constants::SCRAPE_PATTERN_FILEPATH, new_content)
        CLI.log("Successfully wrote presets for #{@root_domain} to #{Constants::SCRAPE_PATTERN_FILEPATH}.")
      end

      def build_domain_yaml(selected_presets)
        spacer = " "
        output_string = ["#{spacer}#{@root_domain}:"]
        selected_presets.each do |data_type, spec|
          if spec.include?('variable')
            output_string << (spacer * 2) + "#{data_type}: *#{spec['variable']}"
          else
            output_string << (spacer * 2) + "#{data_type}:"
            spec.each { |k, v| output_string << (spacer * 3) + "#{k}: #{v}" }
          end
        end
        output_string.join("\n")
      end

      def article_scrape_patterns
        @article_scrape_patterns ||= YAML.load_file(Constants::SCRAPE_PATTERN_FILEPATH)
      end
    end
  end
end
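As the Scraper#scrape docs note, a single untrained domain can be trained directly; the URL below is illustrative:

NewsScraper::Trainer::UrlTrainer.new('http://example.com/some-article').train
# If example.com has no entry under 'domains', this prompts for a preset per data_type
# and appends the chosen patterns to article_scrape_patterns.yml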
data/lib/news_scraper/trainer.rb
ADDED
@@ -0,0 +1,25 @@
require 'news_scraper/trainer/preset_selector'
require 'news_scraper/trainer/url_trainer'

module NewsScraper
  module Trainer
    extend self

    # Fetches articles from Extraction sources and trains on the results
    #
    # *Training* is a process where we take an untrained url (root domain
    # is not in <code>article_scrape_patterns.yml</code>) and determine patterns and methods
    # to match the data_types listed in <code>article_scrape_patterns.yml</code>, then record
    # them to the <code>article_scrape_patterns.yml</code> file
    #
    # *Params*
    # - <code>query</code>: a keyword argument specifying the query to train on
    #
    def train(query: '')
      article_urls = Extractors::GoogleNewsRss.new(query: query).extract
      article_urls.each do |url|
        Trainer::UrlTrainer.new(url).train
      end
    end
  end
end
data/lib/news_scraper/transformers/article.rb
ADDED
@@ -0,0 +1,77 @@
require 'nokogiri'
require 'sanitize'
require 'readability'
require 'htmlbeautifier'

module NewsScraper
  module Transformers
    class Article
      # Initialize an Article object
      #
      # *Params*
      # - <code>url</code>: keyword arg - the url on which scraping was done
      # - <code>payload</code>: keyword arg - the result of the scrape
      #
      def initialize(url:, payload:)
        uri_parser = URIParser.new(url)
        @uri = uri_parser.without_scheme
        @root_domain = uri_parser.host
        @payload = payload
      end

      # Transform the article
      #
      # *Raises*
      # - ScrapePatternNotDefined: will raise this error if the root domain is not in the article_scrape_patterns.yml
      #
      # *Returns*
      # - <code>transformed_response</code>: the response that has been parsed and transformed to a hash
      #
      def transform
        raise ScrapePatternNotDefined.new(uri: @uri, root_domain: @root_domain) unless scrape_details

        transformed_response.merge(uri: @uri, root_domain: @root_domain)
      end

      private

      def scrape_details
        @scrape_details ||= Constants::SCRAPE_PATTERNS['domains'][@root_domain]
      end

      def transformed_response
        Constants::SCRAPE_PATTERNS['data_types'].each_with_object({}) do |data_type, response|
          response[data_type.to_sym] = parsed_data(data_type)
        end
      end

      def parsed_data(data_type)
        return nil unless scrape_details[data_type]

        scrape_method = scrape_details[data_type]['method'].to_sym
        case scrape_method
        when :xpath
          noko_html = Nokogiri::HTML(@payload)
          Sanitize.fragment(
            noko_html.send(scrape_method, "(#{scrape_details[data_type]['pattern']})[1]")
          ).squish
        when :css
          noko_html = Nokogiri::HTML(@payload)
          Sanitize.fragment(
            noko_html.send(scrape_method, scrape_details[data_type]['pattern'])
          ).squish
        when :readability
          content = Readability::Document.new(
            @payload,
            remove_empty_nodes: true,
            tags: %w(div p img a table tr th tbody td h1 h2 h3 h4 h5 h6),
            attributes: %w(src href colspan rowspan)
          ).content
          # Remove any newlines in the text
          content = content.squeeze("\n").strip
          HtmlBeautifier.beautify(content)
        end
      end
    end
  end
end
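A rough sketch of what Transformers::Article#transform returns: one symbolized key per configured data_type, merged with :uri and :root_domain. The title and body keys below are assumed for illustration; the real keys come from the 'data_types' list in article_scrape_patterns.yml:

article = NewsScraper::Transformers::Article.new(url: url, payload: payload).transform
# => { title: '...', body: '...', uri: 'example.com/some-article', root_domain: 'example.com' }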
data/lib/news_scraper/transformers/trainer_article.rb
ADDED
@@ -0,0 +1,17 @@
module NewsScraper
  module Transformers
    class TrainerArticle < Article
      # Initialize a TrainerArticle object
      #
      # *Params*
      # - <code>url</code>: keyword arg - the url on which scraping was done
      # - <code>payload</code>: keyword arg - the result of the scrape
      # - <code>scrape_details</code>: keyword arg - The pattern/methods for the domain to use in the transformation
      #
      def initialize(url:, payload:, scrape_details:)
        @scrape_details = scrape_details
        super(url: url, payload: payload)
      end
    end
  end
end
data/lib/news_scraper/uri_parser.rb
ADDED
@@ -0,0 +1,41 @@
require 'uri'

module NewsScraper
  class URIParser
    # Initialize a URIParser
    #
    # *Params*
    # - <code>url</code>: the url to parse to a uri
    #
    def initialize(url)
      @uri = URI.parse(url)
    end

    # Removes the scheme from the URI
    #
    # *Returns*
    # - A schemeless URI string, e.g. https://google.ca will return google.ca
    #
    def without_scheme
      @uri.scheme ? @uri.to_s.gsub(%r{^#{@uri.scheme}://}, '') : @uri.to_s
    end

    # Returns the URI with a scheme, adding http:// if no scheme is present
    #
    # *Returns*
    # - A URI string, with http:// if no scheme was specified
    #
    def with_scheme
      @uri.scheme ? @uri.to_s : "http://#{@uri}"
    end

    # Returns the URI's host, removing paths, params, and schemes
    #
    # *Returns*
    # - The URI's host, e.g. https://google.ca/search&q=query will return google.ca
    #
    def host
      without_scheme.downcase.match(/^(?:[\w\d-]+\.)?(?<host>[\w\d-]+\.\w{2,})/)['host']
    end
  end
end
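The doc comments above translate into the following behaviour (example URL taken from those comments):

parser = NewsScraper::URIParser.new('https://google.ca/search&q=query')
parser.without_scheme # => "google.ca/search&q=query"
parser.with_scheme    # => "https://google.ca/search&q=query"
parser.host           # => "google.ca"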
data/lib/news_scraper.rb
ADDED
@@ -0,0 +1,42 @@
require 'httparty'
require 'yaml'

require 'news_scraper/constants'
require 'news_scraper/uri_parser'
require 'news_scraper/active_support_lite/string'

require 'news_scraper/errors'
require 'news_scraper/version'

require 'news_scraper/extractors_helpers'

require 'news_scraper/extractors/google_news_rss'
require 'news_scraper/extractors/article'

require 'news_scraper/transformers/article'
require 'news_scraper/transformers/trainer_article'

require 'news_scraper/scraper'

require 'news_scraper/cli'
require 'news_scraper/trainer'

module NewsScraper
  extend self

  # <code>NewsScraper::train</code> is an interactive command-line prompt that:
  #
  # 1. Collates all articles for the given :query
  # 2. Greps for <code>:data_types</code> using <code>:presets</code> in <code>config/article_scrape_patterns.yml</code>
  # 3. Displays the results of each <code>:preset</code> grep for a given <code>:data_type</code>
  # 4. Prompts to select one of the <code>:presets</code> or define a pattern for that domain's <code>:data_type</code>
  #    N.B: User may ignore all presets and manually configure it in the YAML file
  # 5. Saves the selected <code>:preset</code> to <code>config/article_scrape_patterns.yml</code>
  #
  # *Params*
  # - <code>query</code>: a keyword argument specifying the query to train on
  #
  def train(query:)
    Trainer.train(query: query)
  end
end
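Starting the interactive training flow described above (the query value is illustrative):

require 'news_scraper'

NewsScraper.train(query: 'Ruby on Rails')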
data/news_scraper.gemspec
ADDED
@@ -0,0 +1,41 @@
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'news_scraper/version'

Gem::Specification.new do |spec|
  spec.name = "news_scraper"
  spec.version = NewsScraper::VERSION
  spec.authors = ["Richard Wu", "Julian Nadeau"]
  spec.email = ["richardwu1997@gmail.com"]

  spec.summary = 'Simple ETL news scraper in Ruby'
  spec.description = 'A collection of extractors, transformers and loaders for scraping news websites and syndicates.'
  spec.homepage = 'https://github.com/richardwu/news_scraper'
  spec.license = "MIT"

  # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
  # to allow pushing to a single host or delete this section to allow pushing to any host.
  raise "RubyGems 2.0 or newer is required to protect against public gem pushes." unless spec.respond_to?(:metadata)
  spec.metadata['allowed_push_host'] = 'https://rubygems.org'

  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_dependency 'nokogiri', '~> 1.6', '>= 1.6.8'
  spec.add_dependency 'httparty', '~> 0.14', '>= 0.14.0'
  spec.add_dependency 'sanitize', '~> 4.2', '>= 4.2.0'
  spec.add_dependency 'ruby-readability', '~> 0.7', '>= 0.7.0'
  spec.add_dependency 'htmlbeautifier', '~> 1.1', '>= 1.1.1'

  spec.add_development_dependency 'bundler', '~> 1.12', '>= 1.12.0'
  spec.add_development_dependency 'rake', '~> 10.0', '>= 10.0.0'
  spec.add_development_dependency 'minitest', '~> 5.9', '>= 5.9.0'
  spec.add_development_dependency 'pry', '~> 0.10', '>= 0.10.4'
  spec.add_development_dependency 'mocha', '~> 1.1', '>= 1.1.0'
  spec.add_development_dependency 'timecop', '~> 0.8', '>= 0.8.0'
  spec.add_development_dependency 'rubocop', '~> 0.42', '>= 0.42.0'
  spec.add_development_dependency 'rdoc', '~> 4.2', '>= 4.2.2'
end
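Consuming the released gem follows the usual RubyGems flow; this is not part of the diff itself:

# In an application's Gemfile
gem 'news_scraper'

# or directly from the command line:
#   gem install news_scraper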