news_scraper 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,41 @@
+ require 'rss'
+ require 'nokogiri'
+
+ module NewsScraper
+   module Extractors
+     class GoogleNewsRss
+       include ExtractorsHelpers
+
+       BASE_URL = 'https://news.google.com/news?cf=all&hl=en&pz=1&ned=us&output=rss'.freeze
+       def initialize(query:)
+         @query = query
+       end
+
+       def extract
+         http_request "#{BASE_URL}&q=#{@query}" do |response|
+           google_urls = google_urls_from_resp(response.body)
+           extract_article_urls(google_urls)
+         end
+       end
+
+       private
+
+       def google_urls_from_resp(body)
+         rss = RSS::Parser.parse(body)
+
+         rss.items.flat_map do |rss_item|
+           Nokogiri::HTML(rss_item.description).xpath('//a').map do |anchor|
+             anchor['href']
+           end
+         end
+       end
+
+       def extract_article_urls(google_urls)
+         google_urls.map do |google_url|
+           regex = google_url.match(%r{&url=(?<url>https?://.*)})
+           regex.nil? ? nil : regex['url']
+         end.compact.uniq
+       end
+     end
+   end
+ end
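
A minimal usage sketch for the extractor above (the query and return values are illustrative; network access and the rest of the gem are assumed):

    extractor = NewsScraper::Extractors::GoogleNewsRss.new(query: 'ruby')
    extractor.extract
    # => ["http://example.com/article-1", "http://example.com/article-2"]
    # Unique article URLs recovered from the &url= parameter of each Google News item.
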
@@ -0,0 +1,27 @@
+ module NewsScraper
+   module ExtractorsHelpers
+     # Perform an HTTP request with a standardized response
+     #
+     # *Params*
+     # - <code>url</code>: the url on which to perform a get request
+     #
+     def http_request(url)
+       url = URIParser.new(url).with_scheme
+
+       CLI.put_header(url)
+       CLI.log "Beginning HTTP request for #{url}"
+       response = HTTParty.get(url)
+
+       raise ResponseError.new("#{response.code} - #{response.message}") unless response.code == 200
+
+       CLI.log "#{response.code} - #{response.message}. Request successful for #{url}"
+       CLI.put_footer
+
+       if block_given?
+         yield response
+       else
+         response
+       end
+     end
+   end
+ end
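
A sketch of how the helper above is called from an including class (the URL is illustrative; CLI output is omitted):

    # Inside a class that includes NewsScraper::ExtractorsHelpers:

    # Block form: the helper yields the HTTParty response and returns the block's result.
    body = http_request('http://example.com/feed') { |response| response.body }

    # Blockless form: the HTTParty response object itself is returned.
    response = http_request('http://example.com/feed')
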
@@ -0,0 +1,42 @@
+ module NewsScraper
+   class Scraper
+     # Initialize a Scraper object
+     #
+     # *Params*
+     # - <code>query</code>: a keyword argument specifying the query to scrape
+     #
+     def initialize(query:)
+       @query = query
+     end
+
+     # Fetches articles from Extraction sources and scrapes the results
+     #
+     # *Yields*
+     # - Will yield individually extracted articles
+     #
+     # *Raises*
+     # - Will raise a <code>Transformers::ScrapePatternNotDefined</code> if an article is not in the root domains
+     #   - Root domains are specified by the <code>article_scrape_patterns.yml</code> file
+     #   - This root domain will need to be trained; it would be helpful to have a PR created to train the domain
+     #   - You can train the domain by running <code>NewsScraper::Trainer::UrlTrainer.new(URL_TO_TRAIN).train</code>
+     #
+     # *Returns*
+     # - <code>transformed_articles</code>: The transformed articles fetched from the extracted sources
+     #
+     def scrape
+       article_urls = Extractors::GoogleNewsRss.new(query: @query).extract
+
+       transformed_articles = []
+       article_urls.each do |article_url|
+         payload = Extractors::Article.new(url: article_url).extract
+
+         transformed_article = Transformers::Article.new(url: article_url, payload: payload).transform
+         transformed_articles << transformed_article
+
+         yield transformed_article if block_given?
+       end
+
+       transformed_articles
+     end
+   end
+ end
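
A usage sketch for the scraper above (the query, block, and hash keys are illustrative; actual keys come from the data_types configured in article_scrape_patterns.yml):

    scraper = NewsScraper::Scraper.new(query: 'electric cars')

    # Block form: handle each article as soon as it is transformed.
    scraper.scrape do |article|
      puts article[:title] # assumes 'title' is one of the configured data_types
    end

    # Blockless form: collect the transformed articles into an array.
    articles = scraper.scrape
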
@@ -0,0 +1,77 @@
+ module NewsScraper
+   module Trainer
+     class PresetSelector
+       PROVIDER_PHRASE = 'I will provide a pattern using'.freeze
+
+       def initialize(data_type:, data_type_presets:, url:, payload:)
+         @url = url
+         @payload = payload
+         @data_type_presets = data_type_presets
+         @data_type = data_type
+       end
+
+       def select
+         return unless @data_type_presets
+
+         selected_option = CLI.prompt_with_options(
+           "Select which preset to use for #{@data_type}:",
+           pattern_options.keys
+         )
+
+         if selected_option.start_with?(PROVIDER_PHRASE)
+           pattern_type = pattern_options[selected_option]
+           return {
+             'method' => pattern_type,
+             'pattern' => CLI.get_input("Provide the #{pattern_type} pattern:")
+           }
+         end
+         return if selected_option == 'skip'
+
+         selected_index = pattern_options[selected_option]
+         selected_preset_code = transform_results[selected_index].first
+         @data_type_presets[selected_preset_code].merge('variable' => [selected_preset_code, @data_type].join('_'))
+       end
+
+       private
+
+       def pattern_options
+         return {} unless @data_type_presets
+
+         @pattern_options ||= begin
+           temp_options = transform_results.each_with_object({}).with_index do |(results, options_hash), index|
+             preset_name = "#{results[0]}_#{@data_type}"
+             extracted_text = results[1]
+             options_hash["#{preset_name}: #{extracted_text}"] = index
+           end
+           %w(xpath css).each do |pattern_provider|
+             temp_options["#{PROVIDER_PHRASE} #{pattern_provider}"] = pattern_provider
+           end
+           temp_options.merge('skip' => 'skip')
+         end
+       end
+
+       def transform_results
+         return {} unless @data_type_presets
+
+         scrape_details = blank_scrape_details
+         @results ||= @data_type_presets.each_with_object({}) do |(preset_name, preset_details), hash|
+           scrape_details[@data_type] = preset_details
+           train_transformer = Transformers::TrainerArticle.new(
+             url: @url,
+             payload: @payload,
+             scrape_details: scrape_details,
+           )
+
+           transformed_result = train_transformer.transform[@data_type.to_sym]
+           hash[preset_name] = transformed_result if transformed_result && !transformed_result.empty?
+         end.to_a
+       end
+
+       def blank_scrape_details
+         @blank_scrape_details ||= Constants::SCRAPE_PATTERNS.each_with_object({}) do |data_type, hash|
+           hash[data_type] = nil
+         end
+       end
+     end
+   end
+ end
@@ -0,0 +1,74 @@
+ module NewsScraper
+   module Trainer
+     class UrlTrainer
+       def initialize(url)
+         @url = url
+         @root_domain = URIParser.new(@url).host
+         @payload = Extractors::Article.new(url: @url).extract
+       end
+
+       def train
+         return if article_scrape_patterns['domains'].key?(@root_domain)
+
+         CLI.put_header(@root_domain)
+         CLI.log("There is no scrape pattern defined for #{@root_domain} in #{Constants::SCRAPE_PATTERN_FILEPATH}")
+         CLI.log "Fetching information..."
+         CLI.put_footer
+
+         selected_presets = {}
+         article_scrape_patterns['data_types'].each do |data_type|
+           selected_presets[data_type] = selected_pattern(data_type)
+         end
+
+         save_selected_presets(selected_presets)
+       end
+
+       private
+
+       def selected_pattern(data_type)
+         CLI.put_header("Determining information for #{data_type}")
+         data_type_presets = article_scrape_patterns['presets'][data_type]
+         pattern = if data_type_presets.nil?
+           CLI.log("No presets were found for #{data_type}. Skipping to next.")
+           nil
+         else
+           PresetSelector.new(
+             url: @url,
+             payload: @payload,
+             data_type_presets: data_type_presets,
+             data_type: data_type
+           ).select
+         end
+         CLI.put_footer
+
+         pattern || { 'method' => "<<<<< TODO >>>>>", 'pattern' => "<<<<< TODO >>>>>" }
+       end
+
+       def save_selected_presets(selected_presets)
+         current_content = File.read(Constants::SCRAPE_PATTERN_FILEPATH).chomp
+         new_content = "#{current_content}\n#{build_domain_yaml(selected_presets)}\n"
+
+         File.write(Constants::SCRAPE_PATTERN_FILEPATH, new_content)
+         CLI.log("Successfully wrote presets for #{@root_domain} to #{Constants::SCRAPE_PATTERN_FILEPATH}.")
+       end
+
+       def build_domain_yaml(selected_presets)
+         spacer = " "
+         output_string = ["#{spacer}#{@root_domain}:"]
+         selected_presets.each do |data_type, spec|
+           if spec.include?('variable')
+             output_string << (spacer * 2) + "#{data_type}: *#{spec['variable']}"
+           else
+             output_string << (spacer * 2) + "#{data_type}:"
+             spec.each { |k, v| output_string << (spacer * 3) + "#{k}: #{v}" }
+           end
+         end
+         output_string.join("\n")
+       end
+
+       def article_scrape_patterns
+         @article_scrape_patterns ||= YAML.load_file(Constants::SCRAPE_PATTERN_FILEPATH)
+       end
+     end
+   end
+ end
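
For illustration, the fragment that build_domain_yaml appends to article_scrape_patterns.yml is shaped roughly like the following (the domain, preset alias, and pattern are made up, and the exact indentation depends on the spacer constant above):

    example.com:
     title: *meta_title
     body:
      method: xpath
      pattern: //article/p
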
@@ -0,0 +1,25 @@
+ require 'news_scraper/trainer/preset_selector'
+ require 'news_scraper/trainer/url_trainer'
+
+ module NewsScraper
+   module Trainer
+     extend self
+
+     # Fetches articles from Extraction sources and trains on the results
+     #
+     # *Training* is a process where we take an untrained url (root domain
+     # is not in <code>article_scrape_patterns.yml</code>) and determine patterns and methods
+     # to match the data_types listed in <code>article_scrape_patterns.yml</code>, then record
+     # them to the <code>article_scrape_patterns.yml</code> file
+     #
+     # *Params*
+     # - <code>query</code>: a keyword argument specifying the query to train on
+     #
+     def train(query: '')
+       article_urls = Extractors::GoogleNewsRss.new(query: query).extract
+       article_urls.each do |url|
+         Trainer::UrlTrainer.new(url).train
+       end
+     end
+   end
+ end
@@ -0,0 +1,77 @@
+ require 'nokogiri'
+ require 'sanitize'
+ require 'readability'
+ require 'htmlbeautifier'
+
+ module NewsScraper
+   module Transformers
+     class Article
+       # Initialize an Article object
+       #
+       # *Params*
+       # - <code>url</code>: keyword arg - the url on which scraping was done
+       # - <code>payload</code>: keyword arg - the result of the scrape
+       #
+       def initialize(url:, payload:)
+         uri_parser = URIParser.new(url)
+         @uri = uri_parser.without_scheme
+         @root_domain = uri_parser.host
+         @payload = payload
+       end
+
+       # Transform the article
+       #
+       # *Raises*
+       # - ScrapePatternNotDefined: will raise this error if the root domain is not in the article_scrape_patterns.yml
+       #
+       # *Returns*
+       # - <code>transformed_response</code>: the response that has been parsed and transformed to a hash
+       #
+       def transform
+         raise ScrapePatternNotDefined.new(uri: @uri, root_domain: @root_domain) unless scrape_details
+
+         transformed_response.merge(uri: @uri, root_domain: @root_domain)
+       end
+
+       private
+
+       def scrape_details
+         @scrape_details ||= Constants::SCRAPE_PATTERNS['domains'][@root_domain]
+       end
+
+       def transformed_response
+         Constants::SCRAPE_PATTERNS['data_types'].each_with_object({}) do |data_type, response|
+           response[data_type.to_sym] = parsed_data(data_type)
+         end
+       end
+
+       def parsed_data(data_type)
+         return nil unless scrape_details[data_type]
+
+         scrape_method = scrape_details[data_type]['method'].to_sym
+         case scrape_method
+         when :xpath
+           noko_html = Nokogiri::HTML(@payload)
+           Sanitize.fragment(
+             noko_html.send(scrape_method, "(#{scrape_details[data_type]['pattern']})[1]")
+           ).squish
+         when :css
+           noko_html = Nokogiri::HTML(@payload)
+           Sanitize.fragment(
+             noko_html.send(scrape_method, scrape_details[data_type]['pattern'])
+           ).squish
+         when :readability
+           content = Readability::Document.new(
+             @payload,
+             remove_empty_nodes: true,
+             tags: %w(div p img a table tr th tbody td h1 h2 h3 h4 h5 h6),
+             attributes: %w(src href colspan rowspan)
+           ).content
+           # Collapse runs of newlines and strip surrounding whitespace
+           content = content.squeeze("\n").strip
+           HtmlBeautifier.beautify(content)
+         end
+       end
+     end
+   end
+ end
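
A sketch of using the transformer directly (the URL, payload, and result keys are illustrative; Extractors::Article is part of the gem but not shown in this diff):

    payload = NewsScraper::Extractors::Article.new(url: 'http://example.com/story').extract
    article = NewsScraper::Transformers::Article.new(url: 'http://example.com/story', payload: payload).transform
    # => { uri: 'example.com/story', root_domain: 'example.com', title: '...', body: '...', ... }
    # Raises ScrapePatternNotDefined when example.com has no entry under 'domains' in the YAML config.
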
@@ -0,0 +1,17 @@
+ module NewsScraper
+   module Transformers
+     class TrainerArticle < Article
+       # Initialize a TrainerArticle object
+       #
+       # *Params*
+       # - <code>url</code>: keyword arg - the url on which scraping was done
+       # - <code>payload</code>: keyword arg - the result of the scrape
+       # - <code>scrape_details</code>: keyword arg - The pattern/methods for the domain to use in the transformation
+       #
+       def initialize(url:, payload:, scrape_details:)
+         @scrape_details = scrape_details
+         super(url: url, payload: payload)
+       end
+     end
+   end
+ end
@@ -0,0 +1,41 @@
+ require 'uri'
+
+ module NewsScraper
+   class URIParser
+     # Initialize a URIParser
+     #
+     # *Params*
+     # - <code>url</code>: the url to parse to a uri
+     #
+     def initialize(url)
+       @uri = URI.parse(url)
+     end
+
+     # Removes the scheme from the URI
+     #
+     # *Returns*
+     # - A schemeless URI string, e.g. https://google.ca will return google.ca
+     #
+     def without_scheme
+       @uri.scheme ? @uri.to_s.gsub(%r{^#{@uri.scheme}://}, '') : @uri.to_s
+     end
+
+     # Returns the URI with a scheme, adding http:// if no scheme is present
+     #
+     # *Returns*
+     # - A URI string, with http:// if no scheme was specified
+     #
+     def with_scheme
+       @uri.scheme ? @uri.to_s : "http://#{@uri}"
+     end
+
+     # Returns the URI's host, removing paths, params, and schemes
+     #
+     # *Returns*
+     # - The URI's host, e.g. https://google.ca/search&q=query will return google.ca
+     #
+     def host
+       without_scheme.downcase.match(/^(?:[\w\d-]+\.)?(?<host>[\w\d-]+\.\w{2,})/)['host']
+     end
+   end
+ end
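
A quick sketch of the parser above (return values follow from the code and are shown for illustration):

    parser = NewsScraper::URIParser.new('https://google.ca/search&q=query')
    parser.without_scheme # => "google.ca/search&q=query"
    parser.with_scheme    # => "https://google.ca/search&q=query"
    parser.host           # => "google.ca"

    # A scheme is only added when one is missing:
    NewsScraper::URIParser.new('google.ca').with_scheme # => "http://google.ca"
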
@@ -0,0 +1,3 @@
+ module NewsScraper
+   VERSION = "0.1.1".freeze
+ end
@@ -0,0 +1,42 @@
+ require 'httparty'
+ require 'yaml'
+
+ require 'news_scraper/constants'
+ require 'news_scraper/uri_parser'
+ require 'news_scraper/active_support_lite/string'
+
+ require 'news_scraper/errors'
+ require 'news_scraper/version'
+
+ require 'news_scraper/extractors_helpers'
+
+ require 'news_scraper/extractors/google_news_rss'
+ require 'news_scraper/extractors/article'
+
+ require 'news_scraper/transformers/article'
+ require 'news_scraper/transformers/trainer_article'
+
+ require 'news_scraper/scraper'
+
+ require 'news_scraper/cli'
+ require 'news_scraper/trainer'
+
+ module NewsScraper
+   extend self
+
+   # <code>NewsScraper::train</code> is an interactive command-line prompt that:
+   #
+   # 1. Collates all articles for the given :query
+   # 2. Greps for <code>:data_types</code> using <code>:presets</code> in <code>config/article_scrape_patterns.yml</code>
+   # 3. Displays the results of each <code>:preset</code> grep for a given <code>:data_type</code>
+   # 4. Prompts to select one of the <code>:presets</code> or define a pattern for that domain's <code>:data_type</code>
+   #    N.B: The user may ignore all presets and configure the pattern manually in the YAML file
+   # 5. Saves the selected <code>:preset</code> to <code>config/article_scrape_patterns.yml</code>
+   #
+   # *Params*
+   # - <code>query</code>: a keyword argument specifying the query to train on
+   #
+   def train(query:)
+     Trainer.train(query: query)
+   end
+ end
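
A top-level usage sketch tying the pieces together (the query strings are illustrative):

    require 'news_scraper'

    # Interactively train scrape patterns for any untrained domains returned by the query.
    NewsScraper.train(query: 'bitcoin')

    # Once the domains are trained, scrape and transform matching articles.
    NewsScraper::Scraper.new(query: 'bitcoin').scrape
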
@@ -0,0 +1,41 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'news_scraper/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "news_scraper"
+   spec.version       = NewsScraper::VERSION
+   spec.authors       = ["Richard Wu", "Julian Nadeau"]
+   spec.email         = ["richardwu1997@gmail.com"]
+
+   spec.summary       = 'Simple ETL news scraper in Ruby'
+   spec.description   = 'A collection of extractors, transformers and loaders for scraping news websites and syndicates.'
+   spec.homepage      = 'https://github.com/richardwu/news_scraper'
+   spec.license       = "MIT"
+
+   # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
+   # to allow pushing to a single host or delete this section to allow pushing to any host.
+   raise "RubyGems 2.0 or newer is required to protect against public gem pushes." unless spec.respond_to?(:metadata)
+   spec.metadata['allowed_push_host'] = 'https://rubygems.org'
+
+   spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+   spec.bindir        = "exe"
+   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+   spec.require_paths = ["lib"]
+
+   spec.add_dependency 'nokogiri', '~> 1.6', '>= 1.6.8'
+   spec.add_dependency 'httparty', '~> 0.14', '>= 0.14.0'
+   spec.add_dependency 'sanitize', '~> 4.2', '>= 4.2.0'
+   spec.add_dependency 'ruby-readability', '~> 0.7', '>= 0.7.0'
+   spec.add_dependency 'htmlbeautifier', '~> 1.1', '>= 1.1.1'
+
+   spec.add_development_dependency 'bundler', '~> 1.12', '>= 1.12.0'
+   spec.add_development_dependency 'rake', '~> 10.0', '>= 10.0.0'
+   spec.add_development_dependency 'minitest', '~> 5.9', '>= 5.9.0'
+   spec.add_development_dependency 'pry', '~> 0.10', '>= 0.10.4'
+   spec.add_development_dependency 'mocha', '~> 1.1', '>= 1.1.0'
+   spec.add_development_dependency 'timecop', '~> 0.8', '>= 0.8.0'
+   spec.add_development_dependency 'rubocop', '~> 0.42', '>= 0.42.0'
+   spec.add_development_dependency 'rdoc', '~> 4.2', '>= 4.2.2'
+ end