news_scraper 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,41 @@
+ require 'rss'
+ require 'nokogiri'
+
+ module NewsScraper
+   module Extractors
+     class GoogleNewsRss
+       include ExtractorsHelpers
+
+       BASE_URL = 'https://news.google.com/news?cf=all&hl=en&pz=1&ned=us&output=rss'.freeze
+       def initialize(query:)
+         @query = query
+       end
+
+       def extract
+         http_request "#{BASE_URL}&q=#{@query}" do |response|
+           google_urls = google_urls_from_resp(response.body)
+           extract_article_urls(google_urls)
+         end
+       end
+
+       private
+
+       def google_urls_from_resp(body)
+         rss = RSS::Parser.parse(body)
+
+         rss.items.flat_map do |rss_item|
+           Nokogiri::HTML(rss_item.description).xpath('//a').map do |anchor|
+             anchor['href']
+           end
+         end
+       end
+
+       def extract_article_urls(google_urls)
+         google_urls.map do |google_url|
+           regex = google_url.match(%r{&url=(?<url>https?://.*)})
+           regex.nil? ? nil : regex['url']
+         end.compact.uniq
+       end
+     end
+   end
+ end
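
A minimal usage sketch for the extractor above (the query and return values are illustrative; network access and the rest of the gem are assumed):

    extractor = NewsScraper::Extractors::GoogleNewsRss.new(query: 'ruby')
    extractor.extract
    # => ["http://example.com/article-1", "http://example.com/article-2"]
    # Unique article URLs recovered from the &url= parameter of each Google News item.
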
@@ -0,0 +1,27 @@
+ module NewsScraper
+   module ExtractorsHelpers
+     # Perform an HTTP request with a standardized response
+     #
+     # *Params*
+     # - <code>url</code>: the url on which to perform a get request
+     #
+     def http_request(url)
+       url = URIParser.new(url).with_scheme
+
+       CLI.put_header(url)
+       CLI.log "Beginning HTTP request for #{url}"
+       response = HTTParty.get(url)
+
+       raise ResponseError.new("#{response.code} - #{response.message}") unless response.code == 200
+
+       CLI.log "#{response.code} - #{response.message}. Request successful for #{url}"
+       CLI.put_footer
+
+       if block_given?
+         yield response
+       else
+         response
+       end
+     end
+   end
+ end
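
A sketch of how the helper above is called from an including class (the URL is illustrative; CLI output is omitted):

    # Inside a class that includes NewsScraper::ExtractorsHelpers:

    # Block form: the helper yields the HTTParty response and returns the block's result.
    body = http_request('http://example.com/feed') { |response| response.body }

    # Blockless form: the HTTParty response object itself is returned.
    response = http_request('http://example.com/feed')
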
@@ -0,0 +1,42 @@
+ module NewsScraper
+   class Scraper
+     # Initialize a Scraper object
+     #
+     # *Params*
+     # - <code>query</code>: a keyword argument specifying the query to scrape
+     #
+     def initialize(query:)
+       @query = query
+     end
+
+     # Fetches articles from Extraction sources and scrapes the results
+     #
+     # *Yields*
+     # - Will yield individually extracted articles
+     #
+     # *Raises*
+     # - Will raise a <code>Transformers::ScrapePatternNotDefined</code> if an article is not in the root domains
+     #   - Root domains are specified by the <code>article_scrape_patterns.yml</code> file
+     #   - This root domain will need to be trained; it would be helpful to have a PR created to train the domain
+     #   - You can train the domain by running <code>NewsScraper::Trainer::UrlTrainer.new(URL_TO_TRAIN).train</code>
+     #
+     # *Returns*
+     # - <code>transformed_articles</code>: The transformed articles fetched from the extracted sources
+     #
+     def scrape
+       article_urls = Extractors::GoogleNewsRss.new(query: @query).extract
+
+       transformed_articles = []
+       article_urls.each do |article_url|
+         payload = Extractors::Article.new(url: article_url).extract
+
+         transformed_article = Transformers::Article.new(url: article_url, payload: payload).transform
+         transformed_articles << transformed_article
+
+         yield transformed_article if block_given?
+       end
+
+       transformed_articles
+     end
+   end
+ end
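
A usage sketch for the scraper above (the query, block, and hash keys are illustrative; actual keys come from the data_types configured in article_scrape_patterns.yml):

    scraper = NewsScraper::Scraper.new(query: 'electric cars')

    # Block form: handle each article as soon as it is transformed.
    scraper.scrape do |article|
      puts article[:title] # assumes 'title' is one of the configured data_types
    end

    # Blockless form: collect the transformed articles into an array.
    articles = scraper.scrape
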
@@ -0,0 +1,77 @@
+ module NewsScraper
+   module Trainer
+     class PresetSelector
+       PROVIDER_PHRASE = 'I will provide a pattern using'.freeze
+
+       def initialize(data_type:, data_type_presets:, url:, payload:)
+         @url = url
+         @payload = payload
+         @data_type_presets = data_type_presets
+         @data_type = data_type
+       end
+
+       def select
+         return unless @data_type_presets
+
+         selected_option = CLI.prompt_with_options(
+           "Select which preset to use for #{@data_type}:",
+           pattern_options.keys
+         )
+
+         if selected_option.start_with?(PROVIDER_PHRASE)
+           pattern_type = pattern_options[selected_option]
+           return {
+             'method' => pattern_type,
+             'pattern' => CLI.get_input("Provide the #{pattern_type} pattern:")
+           }
+         end
+         return if selected_option == 'skip'
+
+         selected_index = pattern_options[selected_option]
+         selected_preset_code = transform_results[selected_index].first
+         @data_type_presets[selected_preset_code].merge('variable' => [selected_preset_code, @data_type].join('_'))
+       end
+
+       private
+
+       def pattern_options
+         return {} unless @data_type_presets
+
+         @pattern_options ||= begin
+           temp_options = transform_results.each_with_object({}).with_index do |(results, options_hash), index|
+             preset_name = "#{results[0]}_#{@data_type}"
+             extracted_text = results[1]
+             options_hash["#{preset_name}: #{extracted_text}"] = index
+           end
+           %w(xpath css).each do |pattern_provider|
+             temp_options["#{PROVIDER_PHRASE} #{pattern_provider}"] = pattern_provider
+           end
+           temp_options.merge('skip' => 'skip')
+         end
+       end
+
+       def transform_results
+         return {} unless @data_type_presets
+
+         scrape_details = blank_scrape_details
+         @results ||= @data_type_presets.each_with_object({}) do |(preset_name, preset_details), hash|
+           scrape_details[@data_type] = preset_details
+           train_transformer = Transformers::TrainerArticle.new(
+             url: @url,
+             payload: @payload,
+             scrape_details: scrape_details,
+           )
+
+           transformed_result = train_transformer.transform[@data_type.to_sym]
+           hash[preset_name] = transformed_result if transformed_result && !transformed_result.empty?
+         end.to_a
+       end
+
+       def blank_scrape_details
+         @blank_scrape_details ||= Constants::SCRAPE_PATTERNS.each_with_object({}) do |data_type, hash|
+           hash[data_type] = nil
+         end
+       end
+     end
+   end
+ end
@@ -0,0 +1,74 @@
+ module NewsScraper
+   module Trainer
+     class UrlTrainer
+       def initialize(url)
+         @url = url
+         @root_domain = URIParser.new(@url).host
+         @payload = Extractors::Article.new(url: @url).extract
+       end
+
+       def train
+         return if article_scrape_patterns['domains'].key?(@root_domain)
+
+         CLI.put_header(@root_domain)
+         CLI.log("There is no scrape pattern defined for #{@root_domain} in #{Constants::SCRAPE_PATTERN_FILEPATH}")
+         CLI.log "Fetching information..."
+         CLI.put_footer
+
+         selected_presets = {}
+         article_scrape_patterns['data_types'].each do |data_type|
+           selected_presets[data_type] = selected_pattern(data_type)
+         end
+
+         save_selected_presets(selected_presets)
+       end
+
+       private
+
+       def selected_pattern(data_type)
+         CLI.put_header("Determining information for #{data_type}")
+         data_type_presets = article_scrape_patterns['presets'][data_type]
+         pattern = if data_type_presets.nil?
+           CLI.log("No presets were found for #{data_type}. Skipping to next.")
+           nil
+         else
+           PresetSelector.new(
+             url: @url,
+             payload: @payload,
+             data_type_presets: data_type_presets,
+             data_type: data_type
+           ).select
+         end
+         CLI.put_footer
+
+         pattern || { 'method' => "<<<<< TODO >>>>>", 'pattern' => "<<<<< TODO >>>>>" }
+       end
+
+       def save_selected_presets(selected_presets)
+         current_content = File.read(Constants::SCRAPE_PATTERN_FILEPATH).chomp
+         new_content = "#{current_content}\n#{build_domain_yaml(selected_presets)}\n"
+
+         File.write(Constants::SCRAPE_PATTERN_FILEPATH, new_content)
+         CLI.log("Successfully wrote presets for #{@root_domain} to #{Constants::SCRAPE_PATTERN_FILEPATH}.")
+       end
+
+       def build_domain_yaml(selected_presets)
+         spacer = " "
+         output_string = ["#{spacer}#{@root_domain}:"]
+         selected_presets.each do |data_type, spec|
+           if spec.include?('variable')
+             output_string << (spacer * 2) + "#{data_type}: *#{spec['variable']}"
+           else
+             output_string << (spacer * 2) + "#{data_type}:"
+             spec.each { |k, v| output_string << (spacer * 3) + "#{k}: #{v}" }
+           end
+         end
+         output_string.join("\n")
+       end
+
+       def article_scrape_patterns
+         @article_scrape_patterns ||= YAML.load_file(Constants::SCRAPE_PATTERN_FILEPATH)
+       end
+     end
+   end
+ end
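
For illustration, the fragment that build_domain_yaml appends to article_scrape_patterns.yml is shaped roughly like the following (the domain, preset alias, and pattern are made up, and the exact indentation depends on the spacer constant above):

    example.com:
     title: *meta_title
     body:
      method: xpath
      pattern: //article/p
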
@@ -0,0 +1,25 @@
+ require 'news_scraper/trainer/preset_selector'
+ require 'news_scraper/trainer/url_trainer'
+
+ module NewsScraper
+   module Trainer
+     extend self
+
+     # Fetches articles from Extraction sources and trains on the results
+     #
+     # *Training* is a process where we take an untrained url (root domain
+     # is not in <code>article_scrape_patterns.yml</code>) and determine patterns and methods
+     # to match the data_types listed in <code>article_scrape_patterns.yml</code>, then record
+     # them to the <code>article_scrape_patterns.yml</code> file
+     #
+     # *Params*
+     # - <code>query</code>: a keyword argument specifying the query to train on
+     #
+     def train(query: '')
+       article_urls = Extractors::GoogleNewsRss.new(query: query).extract
+       article_urls.each do |url|
+         Trainer::UrlTrainer.new(url).train
+       end
+     end
+   end
+ end
@@ -0,0 +1,77 @@
+ require 'nokogiri'
+ require 'sanitize'
+ require 'readability'
+ require 'htmlbeautifier'
+
+ module NewsScraper
+   module Transformers
+     class Article
+       # Initialize an Article object
+       #
+       # *Params*
+       # - <code>url</code>: keyword arg - the url on which scraping was done
+       # - <code>payload</code>: keyword arg - the result of the scrape
+       #
+       def initialize(url:, payload:)
+         uri_parser = URIParser.new(url)
+         @uri = uri_parser.without_scheme
+         @root_domain = uri_parser.host
+         @payload = payload
+       end
+
+       # Transform the article
+       #
+       # *Raises*
+       # - ScrapePatternNotDefined: will raise this error if the root domain is not in the article_scrape_patterns.yml
+       #
+       # *Returns*
+       # - <code>transformed_response</code>: the response that has been parsed and transformed to a hash
+       #
+       def transform
+         raise ScrapePatternNotDefined.new(uri: @uri, root_domain: @root_domain) unless scrape_details
+
+         transformed_response.merge(uri: @uri, root_domain: @root_domain)
+       end
+
+       private
+
+       def scrape_details
+         @scrape_details ||= Constants::SCRAPE_PATTERNS['domains'][@root_domain]
+       end
+
+       def transformed_response
+         Constants::SCRAPE_PATTERNS['data_types'].each_with_object({}) do |data_type, response|
+           response[data_type.to_sym] = parsed_data(data_type)
+         end
+       end
+
+       def parsed_data(data_type)
+         return nil unless scrape_details[data_type]
+
+         scrape_method = scrape_details[data_type]['method'].to_sym
+         case scrape_method
+         when :xpath
+           noko_html = Nokogiri::HTML(@payload)
+           Sanitize.fragment(
+             noko_html.send(scrape_method, "(#{scrape_details[data_type]['pattern']})[1]")
+           ).squish
+         when :css
+           noko_html = Nokogiri::HTML(@payload)
+           Sanitize.fragment(
+             noko_html.send(scrape_method, scrape_details[data_type]['pattern'])
+           ).squish
+         when :readability
+           content = Readability::Document.new(
+             @payload,
+             remove_empty_nodes: true,
+             tags: %w(div p img a table tr th tbody td h1 h2 h3 h4 h5 h6),
+             attributes: %w(src href colspan rowspan)
+           ).content
+           # Collapse runs of newlines and strip surrounding whitespace
+           content = content.squeeze("\n").strip
+           HtmlBeautifier.beautify(content)
+         end
+       end
+     end
+   end
+ end
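
A sketch of using the transformer directly (the URL, payload, and result keys are illustrative; Extractors::Article is part of the gem but not shown in this diff):

    payload = NewsScraper::Extractors::Article.new(url: 'http://example.com/story').extract
    article = NewsScraper::Transformers::Article.new(url: 'http://example.com/story', payload: payload).transform
    # => { uri: 'example.com/story', root_domain: 'example.com', title: '...', body: '...', ... }
    # Raises ScrapePatternNotDefined when example.com has no entry under 'domains' in the YAML config.
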
@@ -0,0 +1,17 @@
+ module NewsScraper
+   module Transformers
+     class TrainerArticle < Article
+       # Initialize a TrainerArticle object
+       #
+       # *Params*
+       # - <code>url</code>: keyword arg - the url on which scraping was done
+       # - <code>payload</code>: keyword arg - the result of the scrape
+       # - <code>scrape_details</code>: keyword arg - The pattern/methods for the domain to use in the transformation
+       #
+       def initialize(url:, payload:, scrape_details:)
+         @scrape_details = scrape_details
+         super(url: url, payload: payload)
+       end
+     end
+   end
+ end
@@ -0,0 +1,41 @@
+ require 'uri'
+
+ module NewsScraper
+   class URIParser
+     # Initialize a URIParser
+     #
+     # *Params*
+     # - <code>url</code>: the url to parse to a uri
+     #
+     def initialize(url)
+       @uri = URI.parse(url)
+     end
+
+     # Removes the scheme from the URI
+     #
+     # *Returns*
+     # - A schemeless URI string, e.g. https://google.ca will return google.ca
+     #
+     def without_scheme
+       @uri.scheme ? @uri.to_s.gsub(%r{^#{@uri.scheme}://}, '') : @uri.to_s
+     end
+
+     # Returns the URI with a scheme, adding http:// if no scheme is present
+     #
+     # *Returns*
+     # - A URI string, with http:// if no scheme was specified
+     #
+     def with_scheme
+       @uri.scheme ? @uri.to_s : "http://#{@uri}"
+     end
+
+     # Returns the URI's host, removing paths, params, and schemes
+     #
+     # *Returns*
+     # - The URI's host, e.g. https://google.ca/search&q=query will return google.ca
+     #
+     def host
+       without_scheme.downcase.match(/^(?:[\w\d-]+\.)?(?<host>[\w\d-]+\.\w{2,})/)['host']
+     end
+   end
+ end
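
A quick sketch of the parser above (return values follow from the code and are shown for illustration):

    parser = NewsScraper::URIParser.new('https://google.ca/search&q=query')
    parser.without_scheme # => "google.ca/search&q=query"
    parser.with_scheme    # => "https://google.ca/search&q=query"
    parser.host           # => "google.ca"

    # A scheme is only added when one is missing:
    NewsScraper::URIParser.new('google.ca').with_scheme # => "http://google.ca"
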
@@ -0,0 +1,3 @@
+ module NewsScraper
+   VERSION = "0.1.1".freeze
+ end
@@ -0,0 +1,42 @@
+ require 'httparty'
+ require 'yaml'
+
+ require 'news_scraper/constants'
+ require 'news_scraper/uri_parser'
+ require 'news_scraper/active_support_lite/string'
+
+ require 'news_scraper/errors'
+ require 'news_scraper/version'
+
+ require 'news_scraper/extractors_helpers'
+
+ require 'news_scraper/extractors/google_news_rss'
+ require 'news_scraper/extractors/article'
+
+ require 'news_scraper/transformers/article'
+ require 'news_scraper/transformers/trainer_article'
+
+ require 'news_scraper/scraper'
+
+ require 'news_scraper/cli'
+ require 'news_scraper/trainer'
+
+ module NewsScraper
+   extend self
+
+   # <code>NewsScraper::train</code> is an interactive command-line prompt that:
+   #
+   # 1. Collates all articles for the given :query
+   # 2. Greps for <code>:data_types</code> using <code>:presets</code> in <code>config/article_scrape_patterns.yml</code>
+   # 3. Displays the results of each <code>:preset</code> grep for a given <code>:data_type</code>
+   # 4. Prompts to select one of the <code>:presets</code> or define a pattern for that domain's <code>:data_type</code>
+   #    N.B: The user may ignore all presets and configure the pattern manually in the YAML file
+   # 5. Saves the selected <code>:preset</code> to <code>config/article_scrape_patterns.yml</code>
+   #
+   # *Params*
+   # - <code>query</code>: a keyword argument specifying the query to train on
+   #
+   def train(query:)
+     Trainer.train(query: query)
+   end
+ end
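
A top-level usage sketch tying the pieces together (the query strings are illustrative):

    require 'news_scraper'

    # Interactively train scrape patterns for any untrained domains returned by the query.
    NewsScraper.train(query: 'bitcoin')

    # Once the domains are trained, scrape and transform matching articles.
    NewsScraper::Scraper.new(query: 'bitcoin').scrape
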
@@ -0,0 +1,41 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'news_scraper/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "news_scraper"
+   spec.version       = NewsScraper::VERSION
+   spec.authors       = ["Richard Wu", "Julian Nadeau"]
+   spec.email         = ["richardwu1997@gmail.com"]
+
+   spec.summary       = 'Simple ETL news scraper in Ruby'
+   spec.description   = 'A collection of extractors, transformers and loaders for scraping news websites and syndicates.'
+   spec.homepage      = 'https://github.com/richardwu/news_scraper'
+   spec.license       = "MIT"
+
+   # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
+   # to allow pushing to a single host or delete this section to allow pushing to any host.
+   raise "RubyGems 2.0 or newer is required to protect against public gem pushes." unless spec.respond_to?(:metadata)
+   spec.metadata['allowed_push_host'] = 'https://rubygems.org'
+
+   spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+   spec.bindir        = "exe"
+   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+   spec.require_paths = ["lib"]
+
+   spec.add_dependency 'nokogiri', '~> 1.6', '>= 1.6.8'
+   spec.add_dependency 'httparty', '~> 0.14', '>= 0.14.0'
+   spec.add_dependency 'sanitize', '~> 4.2', '>= 4.2.0'
+   spec.add_dependency 'ruby-readability', '~> 0.7', '>= 0.7.0'
+   spec.add_dependency 'htmlbeautifier', '~> 1.1', '>= 1.1.1'
+
+   spec.add_development_dependency 'bundler', '~> 1.12', '>= 1.12.0'
+   spec.add_development_dependency 'rake', '~> 10.0', '>= 10.0.0'
+   spec.add_development_dependency 'minitest', '~> 5.9', '>= 5.9.0'
+   spec.add_development_dependency 'pry', '~> 0.10', '>= 0.10.4'
+   spec.add_development_dependency 'mocha', '~> 1.1', '>= 1.1.0'
+   spec.add_development_dependency 'timecop', '~> 0.8', '>= 0.8.0'
+   spec.add_development_dependency 'rubocop', '~> 0.42', '>= 0.42.0'
+   spec.add_development_dependency 'rdoc', '~> 4.2', '>= 4.2.2'
+ end