news_scraper 0.1.1

lib/news_scraper/extractors/google_news_rss.rb
@@ -0,0 +1,41 @@
+ require 'rss'
+ require 'nokogiri'
+
+ module NewsScraper
+   module Extractors
+     class GoogleNewsRss
+       include ExtractorsHelpers
+
+       BASE_URL = 'https://news.google.com/news?cf=all&hl=en&pz=1&ned=us&output=rss'.freeze
+       def initialize(query:)
+         @query = query
+       end
+
+       def extract
+         http_request "#{BASE_URL}&q=#{@query}" do |response|
+           google_urls = google_urls_from_resp(response.body)
+           extract_article_urls(google_urls)
+         end
+       end
+
+       private
+
+       def google_urls_from_resp(body)
+         rss = RSS::Parser.parse(body)
+
+         rss.items.flat_map do |rss_item|
+           Nokogiri::HTML(rss_item.description).xpath('//a').map do |anchor|
+             anchor['href']
+           end
+         end
+       end
+
+       def extract_article_urls(google_urls)
+         google_urls.map do |google_url|
+           regex = google_url.match(%r{&url=(?<url>https?://.*)})
+           regex.nil? ? nil : regex['url']
+         end.compact.uniq
+       end
+     end
+   end
+ end
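
A minimal usage sketch of the extractor above (the query value is an arbitrary example). extract issues a single RSS request to Google News and returns the de-duplicated article URLs unpacked from the &url= parameter of each item's redirect links.

    require 'news_scraper'

    # Arbitrary example query; returns an array of plain article URLs.
    urls = NewsScraper::Extractors::GoogleNewsRss.new(query: 'ruby').extract
    urls.each { |url| puts url }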
lib/news_scraper/extractors_helpers.rb
@@ -0,0 +1,27 @@
+ module NewsScraper
+   module ExtractorsHelpers
+     # Perform an HTTP request with a standardized response
+     #
+     # *Params*
+     # - <code>url</code>: the url on which to perform a GET request
+     #
+     def http_request(url)
+       url = URIParser.new(url).with_scheme
+
+       CLI.put_header(url)
+       CLI.log "Beginning HTTP request for #{url}"
+       response = HTTParty.get(url)
+
+       raise ResponseError.new("#{response.code} - #{response.message}") unless response.code == 200
+
+       CLI.log "#{response.code} - #{response.message}. Request successful for #{url}"
+       CLI.put_footer
+
+       if block_given?
+         yield response
+       else
+         response
+       end
+     end
+   end
+ end
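
A short sketch of how the mixin is consumed (the class here is hypothetical; within the gem it is the extractors that include ExtractorsHelpers). With a block, http_request yields the HTTParty response and returns the block's value; without one, it returns the response itself.

    require 'news_scraper'

    # Hypothetical consumer of the mixin, shown only to illustrate both call styles.
    class ExampleExtractor
      include NewsScraper::ExtractorsHelpers

      def fetch_body(url)
        # Block form: the HTTParty response is yielded; the block's value is returned.
        http_request(url) { |response| response.body }
      end

      def fetch_response(url)
        # Blockless form: the HTTParty response itself is returned.
        http_request(url)
      end
    end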
lib/news_scraper/scraper.rb
@@ -0,0 +1,42 @@
+ module NewsScraper
+   class Scraper
+     # Initialize a Scraper object
+     #
+     # *Params*
+     # - <code>query</code>: a keyword argument specifying the query to scrape
+     #
+     def initialize(query:)
+       @query = query
+     end
+
+     # Fetches articles from Extraction sources and scrapes the results
+     #
+     # *Yields*
+     # - Will yield individually extracted articles
+     #
+     # *Raises*
+     # - Will raise a <code>Transformers::ScrapePatternNotDefined</code> if an article is not in the root domains
+     #   - Root domains are specified by the <code>article_scrape_patterns.yml</code> file
+     #   - This root domain will need to be trained; it would be helpful to have a PR created to train the domain
+     #   - You can train the domain by running <code>NewsScraper::Trainer::UrlTrainer.new(URL_TO_TRAIN).train</code>
+     #
+     # *Returns*
+     # - <code>transformed_articles</code>: The transformed articles fetched from the extracted sources
+     #
+     def scrape
+       article_urls = Extractors::GoogleNewsRss.new(query: @query).extract
+
+       transformed_articles = []
+       article_urls.each do |article_url|
+         payload = Extractors::Article.new(url: article_url).extract
+
+         transformed_article = Transformers::Article.new(url: article_url, payload: payload).transform
+         transformed_articles << transformed_article
+
+         yield transformed_article if block_given?
+       end
+
+       transformed_articles
+     end
+   end
+ end
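
A usage sketch for the scraper (the query value is an arbitrary example). Each transformed article is a hash containing :uri, :root_domain, and one key per configured data type; with a block, articles are yielded as they are scraped.

    require 'news_scraper'

    # Eager form: returns the full array of transformed article hashes.
    articles = NewsScraper::Scraper.new(query: 'ruby').scrape

    # Block form: each transformed article is yielded as soon as it is ready.
    NewsScraper::Scraper.new(query: 'ruby').scrape do |article|
      puts article[:root_domain]
    end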
lib/news_scraper/trainer/preset_selector.rb
@@ -0,0 +1,77 @@
+ module NewsScraper
+   module Trainer
+     class PresetSelector
+       PROVIDER_PHRASE = 'I will provide a pattern using'.freeze
+
+       def initialize(data_type:, data_type_presets:, url:, payload:)
+         @url = url
+         @payload = payload
+         @data_type_presets = data_type_presets
+         @data_type = data_type
+       end
+
+       def select
+         return unless @data_type_presets
+
+         selected_option = CLI.prompt_with_options(
+           "Select which preset to use for #{@data_type}:",
+           pattern_options.keys
+         )
+
+         if selected_option.start_with?(PROVIDER_PHRASE)
+           pattern_type = pattern_options[selected_option]
+           return {
+             'method' => pattern_type,
+             'pattern' => CLI.get_input("Provide the #{pattern_type} pattern:")
+           }
+         end
+         return if selected_option == 'skip'
+
+         selected_index = pattern_options[selected_option]
+         selected_preset_code = transform_results[selected_index].first
+         @data_type_presets[selected_preset_code].merge('variable' => [selected_preset_code, @data_type].join('_'))
+       end
+
+       private
+
+       def pattern_options
+         return {} unless @data_type_presets
+
+         @pattern_options ||= begin
+           temp_options = transform_results.each_with_object({}).with_index do |(results, options_hash), index|
+             preset_name = "#{results[0]}_#{@data_type}"
+             extracted_text = results[1]
+             options_hash["#{preset_name}: #{extracted_text}"] = index
+           end
+           %w(xpath css).each do |pattern_provider|
+             temp_options["#{PROVIDER_PHRASE} #{pattern_provider}"] = pattern_provider
+           end
+           temp_options.merge('skip' => 'skip')
+         end
+       end
+
+       def transform_results
+         return {} unless @data_type_presets
+
+         scrape_details = blank_scrape_details
+         @results ||= @data_type_presets.each_with_object({}) do |(preset_name, preset_details), hash|
+           scrape_details[@data_type] = preset_details
+           train_transformer = Transformers::TrainerArticle.new(
+             url: @url,
+             payload: @payload,
+             scrape_details: scrape_details,
+           )
+
+           transformed_result = train_transformer.transform[@data_type.to_sym]
+           hash[preset_name] = transformed_result if transformed_result && !transformed_result.empty?
+         end.to_a
+       end
+
+       def blank_scrape_details
+         @blank_scrape_details ||= Constants::SCRAPE_PATTERNS.each_with_object({}) do |data_type, hash|
+           hash[data_type] = nil
+         end
+       end
+     end
+   end
+ end
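
For reference, the three shapes select can return, based on the branches above (the method, pattern, and preset values here are hypothetical examples):

    # 1. An existing preset was chosen: the preset details merged with a
    #    'variable' key of the form "<preset_code>_<data_type>".
    { 'method' => 'xpath', 'pattern' => "//meta[@name='author']/@content", 'variable' => 'meta_author' }

    # 2. The user chose to provide their own xpath or css pattern.
    { 'method' => 'css', 'pattern' => 'h1.headline' }

    # 3. 'skip' was selected, or no presets exist for the data type.
    nil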
lib/news_scraper/trainer/url_trainer.rb
@@ -0,0 +1,74 @@
+ module NewsScraper
+   module Trainer
+     class UrlTrainer
+       def initialize(url)
+         @url = url
+         @root_domain = URIParser.new(@url).host
+         @payload = Extractors::Article.new(url: @url).extract
+       end
+
+       def train
+         return if article_scrape_patterns['domains'].key?(@root_domain)
+
+         CLI.put_header(@root_domain)
+         CLI.log("There is no scrape pattern defined for #{@root_domain} in #{Constants::SCRAPE_PATTERN_FILEPATH}")
+         CLI.log "Fetching information..."
+         CLI.put_footer
+
+         selected_presets = {}
+         article_scrape_patterns['data_types'].each do |data_type|
+           selected_presets[data_type] = selected_pattern(data_type)
+         end
+
+         save_selected_presets(selected_presets)
+       end
+
+       private
+
+       def selected_pattern(data_type)
+         CLI.put_header("Determining information for #{data_type}")
+         data_type_presets = article_scrape_patterns['presets'][data_type]
+         pattern = if data_type_presets.nil?
+           CLI.log("No presets were found for #{data_type}. Skipping to next.")
+           nil
+         else
+           PresetSelector.new(
+             url: @url,
+             payload: @payload,
+             data_type_presets: data_type_presets,
+             data_type: data_type
+           ).select
+         end
+         CLI.put_footer
+
+         pattern || { 'method' => "<<<<< TODO >>>>>", 'pattern' => "<<<<< TODO >>>>>" }
+       end
+
+       def save_selected_presets(selected_presets)
+         current_content = File.read(Constants::SCRAPE_PATTERN_FILEPATH).chomp
+         new_content = "#{current_content}\n#{build_domain_yaml(selected_presets)}\n"
+
+         File.write(Constants::SCRAPE_PATTERN_FILEPATH, new_content)
+         CLI.log("Successfully wrote presets for #{@root_domain} to #{Constants::SCRAPE_PATTERN_FILEPATH}.")
+       end
+
+       def build_domain_yaml(selected_presets)
+         spacer = " "
+         output_string = ["#{spacer}#{@root_domain}:"]
+         selected_presets.each do |data_type, spec|
+           if spec.include?('variable')
+             output_string << (spacer * 2) + "#{data_type}: *#{spec['variable']}"
+           else
+             output_string << (spacer * 2) + "#{data_type}:"
+             spec.each { |k, v| output_string << (spacer * 3) + "#{k}: #{v}" }
+           end
+         end
+         output_string.join("\n")
+       end
+
+       def article_scrape_patterns
+         @article_scrape_patterns ||= YAML.load_file(Constants::SCRAPE_PATTERN_FILEPATH)
+       end
+     end
+   end
+ end
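
A sketch of training a single URL with the class above (the URL is a hypothetical example). If the root domain already appears under 'domains' in article_scrape_patterns.yml, train returns immediately; otherwise the selected presets are appended to that file under the root domain.

    require 'news_scraper'

    # Hypothetical URL to train on; prompts interactively for each data type.
    NewsScraper::Trainer::UrlTrainer.new('https://example.com/some-article').train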
lib/news_scraper/trainer.rb
@@ -0,0 +1,25 @@
+ require 'news_scraper/trainer/preset_selector'
+ require 'news_scraper/trainer/url_trainer'
+
+ module NewsScraper
+   module Trainer
+     extend self
+
+     # Fetches articles from Extraction sources and trains on the results
+     #
+     # *Training* is a process where we take an untrained url (root domain
+     # is not in <code>article_scrape_patterns.yml</code>) and determine patterns and methods
+     # to match the data_types listed in <code>article_scrape_patterns.yml</code>, then record
+     # them to the <code>article_scrape_patterns.yml</code> file
+     #
+     # *Params*
+     # - <code>query</code>: a keyword argument specifying the query to train on
+     #
+     def train(query: '')
+       article_urls = Extractors::GoogleNewsRss.new(query: query).extract
+       article_urls.each do |url|
+         Trainer::UrlTrainer.new(url).train
+       end
+     end
+   end
+ end
lib/news_scraper/transformers/article.rb
@@ -0,0 +1,77 @@
+ require 'nokogiri'
+ require 'sanitize'
+ require 'readability'
+ require 'htmlbeautifier'
+
+ module NewsScraper
+   module Transformers
+     class Article
+       # Initialize an Article object
+       #
+       # *Params*
+       # - <code>url</code>: keyword arg - the url on which scraping was done
+       # - <code>payload</code>: keyword arg - the result of the scrape
+       #
+       def initialize(url:, payload:)
+         uri_parser = URIParser.new(url)
+         @uri = uri_parser.without_scheme
+         @root_domain = uri_parser.host
+         @payload = payload
+       end
+
+       # Transform the article
+       #
+       # *Raises*
+       # - ScrapePatternNotDefined: will raise this error if the root domain is not in the article_scrape_patterns.yml
+       #
+       # *Returns*
+       # - <code>transformed_response</code>: the response that has been parsed and transformed to a hash
+       #
+       def transform
+         raise ScrapePatternNotDefined.new(uri: @uri, root_domain: @root_domain) unless scrape_details
+
+         transformed_response.merge(uri: @uri, root_domain: @root_domain)
+       end
+
+       private
+
+       def scrape_details
+         @scrape_details ||= Constants::SCRAPE_PATTERNS['domains'][@root_domain]
+       end
+
+       def transformed_response
+         Constants::SCRAPE_PATTERNS['data_types'].each_with_object({}) do |data_type, response|
+           response[data_type.to_sym] = parsed_data(data_type)
+         end
+       end
+
+       def parsed_data(data_type)
+         return nil unless scrape_details[data_type]
+
+         scrape_method = scrape_details[data_type]['method'].to_sym
+         case scrape_method
+         when :xpath
+           noko_html = Nokogiri::HTML(@payload)
+           Sanitize.fragment(
+             noko_html.send(scrape_method, "(#{scrape_details[data_type]['pattern']})[1]")
+           ).squish
+         when :css
+           noko_html = Nokogiri::HTML(@payload)
+           Sanitize.fragment(
+             noko_html.send(scrape_method, scrape_details[data_type]['pattern'])
+           ).squish
+         when :readability
+           content = Readability::Document.new(
+             @payload,
+             remove_empty_nodes: true,
+             tags: %w(div p img a table tr th tbody td h1 h2 h3 h4 h5 h6),
+             attributes: %w(src href colspan rowspan)
+           ).content
+           # Collapse repeated newlines and strip surrounding whitespace
+           content = content.squeeze("\n").strip
+           HtmlBeautifier.beautify(content)
+         end
+       end
+     end
+   end
+ end
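
A sketch of transforming a single article with the classes above (the URL is a hypothetical example). transform raises ScrapePatternNotDefined when the root domain has not been trained; otherwise it returns a hash keyed by the configured data types plus :uri and :root_domain.

    require 'news_scraper'

    url = 'https://example.com/some-article'
    payload = NewsScraper::Extractors::Article.new(url: url).extract

    article = NewsScraper::Transformers::Article.new(url: url, payload: payload).transform
    article[:root_domain] # => "example.com" (assuming the domain has a trained scrape pattern)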
lib/news_scraper/transformers/trainer_article.rb
@@ -0,0 +1,17 @@
+ module NewsScraper
+   module Transformers
+     class TrainerArticle < Article
+       # Initialize a TrainerArticle object
+       #
+       # *Params*
+       # - <code>url</code>: keyword arg - the url on which scraping was done
+       # - <code>payload</code>: keyword arg - the result of the scrape
+       # - <code>scrape_details</code>: keyword arg - The pattern/methods for the domain to use in the transformation
+       #
+       def initialize(url:, payload:, scrape_details:)
+         @scrape_details = scrape_details
+         super(url: url, payload: payload)
+       end
+     end
+   end
+ end
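
A minimal sketch with hypothetical values: TrainerArticle applies candidate scrape details supplied in memory instead of looking the root domain up in article_scrape_patterns.yml, which is how PresetSelector previews each preset during training.

    require 'news_scraper'

    # Both the payload and the scrape details are hypothetical examples;
    # 'title' stands in for one of the configured data types.
    payload = '<html><body><h1>Example headline</h1></body></html>'
    scrape_details = { 'title' => { 'method' => 'css', 'pattern' => 'h1' } }

    NewsScraper::Transformers::TrainerArticle.new(
      url: 'https://example.com/some-article',
      payload: payload,
      scrape_details: scrape_details
    ).transform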
lib/news_scraper/uri_parser.rb
@@ -0,0 +1,41 @@
+ require 'uri'
+
+ module NewsScraper
+   class URIParser
+     # Initialize a URIParser
+     #
+     # *Params*
+     # - <code>url</code>: the url to parse to a uri
+     #
+     def initialize(url)
+       @uri = URI.parse(url)
+     end
+
+     # Removes the scheme from the URI
+     #
+     # *Returns*
+     # - A schemeless URI string, e.g. https://google.ca will return google.ca
+     #
+     def without_scheme
+       @uri.scheme ? @uri.to_s.gsub(%r{^#{@uri.scheme}://}, '') : @uri.to_s
+     end
+
+     # Returns the URI with a scheme, adding http:// if no scheme is present
+     #
+     # *Returns*
+     # - A URI string, with http:// if no scheme was specified
+     #
+     def with_scheme
+       @uri.scheme ? @uri.to_s : "http://#{@uri}"
+     end
+
+     # Returns the URI's host, removing paths, params, and schemes
+     #
+     # *Returns*
+     # - The URI's host, e.g. https://google.ca/search&q=query will return google.ca
+     #
+     def host
+       without_scheme.downcase.match(/^(?:[\w\d-]+\.)?(?<host>[\w\d-]+\.\w{2,})/)['host']
+     end
+   end
+ end
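
A quick sketch of the three helpers (the URL is an arbitrary example):

    require 'news_scraper'

    parser = NewsScraper::URIParser.new('https://news.google.ca/search&q=query')
    parser.without_scheme # => "news.google.ca/search&q=query"
    parser.with_scheme    # => "https://news.google.ca/search&q=query"
    parser.host           # => "google.ca"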
lib/news_scraper/version.rb
@@ -0,0 +1,3 @@
+ module NewsScraper
+   VERSION = "0.1.1".freeze
+ end
lib/news_scraper.rb
@@ -0,0 +1,42 @@
+ require 'httparty'
+ require 'yaml'
+
+ require 'news_scraper/constants'
+ require 'news_scraper/uri_parser'
+ require 'news_scraper/active_support_lite/string'
+
+ require 'news_scraper/errors'
+ require 'news_scraper/version'
+
+ require 'news_scraper/extractors_helpers'
+
+ require 'news_scraper/extractors/google_news_rss'
+ require 'news_scraper/extractors/article'
+
+ require 'news_scraper/transformers/article'
+ require 'news_scraper/transformers/trainer_article'
+
+ require 'news_scraper/scraper'
+
+ require 'news_scraper/cli'
+ require 'news_scraper/trainer'
+
+ module NewsScraper
+   extend self
+
+   # <code>NewsScraper::train</code> is an interactive command-line prompt that:
+   #
+   # 1. Collates all articles for the given :query
+   # 2. Greps for <code>:data_types</code> using <code>:presets</code> in <code>config/article_scrape_patterns.yml</code>
+   # 3. Displays the results of each <code>:preset</code> grep for a given <code>:data_type</code>
+   # 4. Prompts to select one of the <code>:presets</code> or define a pattern for that domain's <code>:data_type</code>
+   #    N.B.: The user may ignore all presets and manually configure it in the YAML file
+   # 5. Saves the selected <code>:preset</code> to <code>config/article_scrape_patterns.yml</code>
+   #
+   # *Params*
+   # - <code>query</code>: a keyword argument specifying the query to train on
+   #
+   def train(query:)
+     Trainer.train(query: query)
+   end
+ end
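
A usage sketch of the public entry point (the query value is an arbitrary example). Training is interactive and writes the chosen presets to config/article_scrape_patterns.yml.

    require 'news_scraper'

    # Interactive training session for an arbitrary example query; follow the
    # prompts to pick a preset for each data type per untrained domain.
    NewsScraper.train(query: 'ruby')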
news_scraper.gemspec
@@ -0,0 +1,41 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'news_scraper/version'
+
+ Gem::Specification.new do |spec|
+   spec.name = "news_scraper"
+   spec.version = NewsScraper::VERSION
+   spec.authors = ["Richard Wu", "Julian Nadeau"]
+   spec.email = ["richardwu1997@gmail.com"]
+
+   spec.summary = 'Simple ETL news scraper in Ruby'
+   spec.description = 'A collection of extractors, transformers and loaders for scraping news websites and syndicates.'
+   spec.homepage = 'https://github.com/richardwu/news_scraper'
+   spec.license = "MIT"
+
+   # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
+   # to allow pushing to a single host or delete this section to allow pushing to any host.
+   raise "RubyGems 2.0 or newer is required to protect against public gem pushes." unless spec.respond_to?(:metadata)
+   spec.metadata['allowed_push_host'] = 'https://rubygems.org'
+
+   spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+   spec.bindir = "exe"
+   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+   spec.require_paths = ["lib"]
+
+   spec.add_dependency 'nokogiri', '~> 1.6', '>= 1.6.8'
+   spec.add_dependency 'httparty', '~> 0.14', '>= 0.14.0'
+   spec.add_dependency 'sanitize', '~> 4.2', '>= 4.2.0'
+   spec.add_dependency 'ruby-readability', '~> 0.7', '>= 0.7.0'
+   spec.add_dependency 'htmlbeautifier', '~> 1.1', '>= 1.1.1'
+
+   spec.add_development_dependency 'bundler', '~> 1.12', '>= 1.12.0'
+   spec.add_development_dependency 'rake', '~> 10.0', '>= 10.0.0'
+   spec.add_development_dependency 'minitest', '~> 5.9', '>= 5.9.0'
+   spec.add_development_dependency 'pry', '~> 0.10', '>= 0.10.4'
+   spec.add_development_dependency 'mocha', '~> 1.1', '>= 1.1.0'
+   spec.add_development_dependency 'timecop', '~> 0.8', '>= 0.8.0'
+   spec.add_development_dependency 'rubocop', '~> 0.42', '>= 0.42.0'
+   spec.add_development_dependency 'rdoc', '~> 4.2', '>= 4.2.2'
+ end