RubyGems - news_scraper - Versions diffs - 0.1.1 - Mend

news_scraper 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

checksums.yaml +7 -0
data/.gitignore +10 -0
data/.rubocop.yml +96 -0
data/CODE_OF_CONDUCT.md +49 -0
data/Gemfile +3 -0
data/LICENSE.txt +21 -0
data/README.md +105 -0
data/Rakefile +24 -0
data/bin/console +14 -0
data/bin/setup +8 -0
data/circle.yml +3 -0
data/config/article_scrape_patterns.yml +116 -0
data/config/temp_dirs.yml +4 -0
data/dev.yml +13 -0
data/lib/news_scraper/active_support_lite/string.rb +11 -0
data/lib/news_scraper/cli.rb +106 -0
data/lib/news_scraper/constants.rb +6 -0
data/lib/news_scraper/errors.rb +16 -0
data/lib/news_scraper/extractors/article.rb +17 -0
data/lib/news_scraper/extractors/google_news_rss.rb +41 -0
data/lib/news_scraper/extractors_helpers.rb +27 -0
data/lib/news_scraper/scraper.rb +42 -0
data/lib/news_scraper/trainer/preset_selector.rb +77 -0
data/lib/news_scraper/trainer/url_trainer.rb +74 -0
data/lib/news_scraper/trainer.rb +25 -0
data/lib/news_scraper/transformers/article.rb +77 -0
data/lib/news_scraper/transformers/trainer_article.rb +17 -0
data/lib/news_scraper/uri_parser.rb +41 -0
data/lib/news_scraper/version.rb +3 -0
data/lib/news_scraper.rb +42 -0
data/news_scraper.gemspec +41 -0
metadata +337 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: cadeecdf95b6d5fd907671773fc3cc521fe756a2
+  data.tar.gz: 150d667251c00f96b01b195c99fe2a44f3da8da9
+SHA512:
+  metadata.gz: 8af8f251dce23589d5e08af5f9c0510d031b91ed56188a0e6b3418b26397ad9ba98835a516c8e174d8aed3ea791793e5ceec8c3422a6e7731ac84196b49f287b
+  data.tar.gz: d55a513397be97f08b0adcbc975db4ad1c39d0887576979b138a50b5cb3601eb03036a9cfe75d1b0d4dcd2ab0303256c9112c865749905e5540f35bdefef18eb

data/.gitignore ADDED Viewed

@@ -0,0 +1,10 @@
+/.bundle/
+/.yardoc
+/Gemfile.lock
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/
+*.gem

data/.rubocop.yml ADDED Viewed

@@ -0,0 +1,96 @@
+AllCops:
+  TargetRubyVersion: 2.3
+ClassLength:
+  Max: 500
+ModuleLength:
+  Max: 500
+Rails:
+  Enabled: false
+Lint/AssignmentInCondition:
+  Enabled: false
+Style/Documentation:
+  Enabled: false
+Style/MultilineOperationIndentation:
+  Enabled: true
+Style/AlignParameters:
+  EnforcedStyle: with_fixed_indentation
+Style/FirstParameterIndentation:
+  EnforcedStyle: consistent
+Style/TrailingCommaInLiteral:
+  Enabled: false
+Style/TrailingCommaInArguments:
+  Enabled: false
+Style/SignalException:
+  EnforcedStyle: only_raise
+Style/NumericLiterals:
+  Enabled: true
+Style/CaseIndentation:
+  IndentWhenRelativeTo: end
+Style/IndentHash:
+  EnforcedStyle: consistent
+Style/WordArray:
+  Enabled: true
+Style/ModuleFunction:
+  Enabled: false
+Style/RaiseArgs:
+  EnforcedStyle: compact
+Metrics/AbcSize:
+  Enabled: false
+Metrics/CyclomaticComplexity:
+  Enabled: false
+Style/StringLiterals:
+  Enabled: false
+Metrics/LineLength:
+  Max: 120
+Metrics/ClassLength:
+  Enabled: false
+Metrics/MethodLength:
+  Enabled: false
+Metrics/ParameterLists:
+  Max: 5
+  CountKeywordArgs: false
+Metrics/PerceivedComplexity:
+  Enabled: false
+Lint/EndAlignment:
+  AlignWith: variable
+Style/FrozenStringLiteralComment:
+  Enabled: false
+Style/Alias:
+  EnforcedStyle: prefer_alias_method
+Style/MutableConstant:
+  Enabled: true
+Performance/Casecmp:
+  Enabled: true
+Style/GuardClause:
+  Enabled: true

data/CODE_OF_CONDUCT.md ADDED Viewed

@@ -0,0 +1,49 @@
+# Contributor Code of Conduct
+As contributors and maintainers of this project, and in the interest of
+fostering an open and welcoming community, we pledge to respect all people who
+contribute through reporting issues, posting feature requests, updating
+documentation, submitting pull requests or patches, and other activities.
+We are committed to making participation in this project a harassment-free
+experience for everyone, regardless of level of experience, gender, gender
+identity and expression, sexual orientation, disability, personal appearance,
+body size, race, ethnicity, age, religion, or nationality.
+Examples of unacceptable behavior by participants include:
+* The use of sexualized language or imagery
+* Personal attacks
+* Trolling or insulting/derogatory comments
+* Public or private harassment
+* Publishing other's private information, such as physical or electronic
+  addresses, without explicit permission
+* Other unethical or unprofessional conduct
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+By adopting this Code of Conduct, project maintainers commit themselves to
+fairly and consistently applying these principles to every aspect of managing
+this project. Project maintainers who do not follow or enforce the Code of
+Conduct may be permanently removed from the project team.
+This code of conduct applies both within project spaces and in public spaces
+when an individual is representing the project or its community.
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting a project maintainer at richardwu1997@gmail.com. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. Maintainers are
+obligated to maintain confidentiality with regard to the reporter of an
+incident.
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 1.3.0, available at
+[http://contributor-covenant.org/version/1/3/0/][version]
+[homepage]: http://contributor-covenant.org
+[version]: http://contributor-covenant.org/version/1/3/0/

data/Gemfile ADDED Viewed

@@ -0,0 +1,3 @@
+source "https://rubygems.org"
+gemspec

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,21 @@
+The MIT License (MIT)
+Copyright (c) 2016 Richard Wu
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,105 @@
+# NewsScraper
+### Simple ETL news scraper in Ruby
+[RubyGems](https://rubygems.org/gems/news_scraper)
+A collection of extractors, transformers and loaders for a variety of news feeds and outlets.
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'news_scraper'
+```
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install news_scraper
+## Usage
+### Scraping
+`NewsScraper::Scraper#scrape` will return an array of the transformed data for all Google News RSS articles for the given query.
+Optionally, you can pass in a block and it will yield the transformed data on a per-article basis.
+It takes in 1 parameter `query:`.
+Array notation
+```
+article_hashes = NewsScraper::Scraper.new(query: 'Shopify').scrape # [ { author: ... }, { author: ... } ... ]
+```
+Block notation
+```
+NewsScraper::Scraper.new(query: 'Shopify').scrape do |article_hash|
+  # { author: ... }
+end
+```
+How the `Scraper` extracts and parses the information is determined by scrape patterns (see **Scrape Patterns**).
+### Transformed Data
+Calling `NewsScraper::Scraper#scrape` with either the array or block notation will yield `transformed_data` hashes. [`article_scrape_patterns.yml`](https://github.com/richardwu/news_scraper/blob/master/config/article_scrape_patterns.yml) defines the data types that will be scraped for.
+In addition, the `uri` and `root_domain` (hostname) of the article will be returned in the hash too.
+Example
+```
+{
+  author: 'Linus Torvald',
+  body: 'The Linux kernel developed by Linus Torvald has become the backbone of most electronic devices we use to-date. It powers mobile phones, laptops, embedded devices, and even rockets...',
+  description: 'The Linux kernel is one of the most important contributions to the world of technology.',
+  keywords: 'linux,kernel,linus,torvald',
+  section: 'technology',
+  datetime: '1991-10-05T12:00:00+00:00',
+  title: 'Linus Linux',
+  uri: 'linusworld.com/the-linux-kernel',
+  root_domain: 'linusworld.com'
+}
+```
+### Scrape Patterns
+Scrape patterns are xpath or CSS patterns used by Nokogiri to extract relevant HTML elements.
+Extracting each `:data_type` (see Example under **Transformed Data**) requires a scrape pattern. A few `:presets` are specified in [`article_scrape_patterns.yml`](https://github.com/richardwu/news_scraper/blob/master/config/article_scrape_patterns.yml).
+Since each news site (identified with `:root_domain`) uses a different markup, scrape patterns are defined on a per-`:root_domain` basis.
+Specifying scrape patterns for new, undefined `:root_domains` is called training (see **Training**).
+### Training
+For each `:root_domain`, it is necessary to specify a scrape pattern for each of the `:data_type`s. A rake task was written to provide a CLI for appending new `:root_domain`s using `:preset` scrape patterns.
+Simply run
+```
+bundle exec rake scraper:train QUERY=<query>
+```
+where the CLI will step through the articles and `:root_domain`s of the articles relevant to `<query>`.
+## Development
+After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+## Contributing
+Bug reports and pull requests are welcome on GitHub at https://github.com/richardwu/news_scraper. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
+## License
+The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).

data/Rakefile ADDED Viewed

@@ -0,0 +1,24 @@
+require 'bundler/gem_tasks'
+require 'rake/testtask'
+require 'rdoc/task'
+require 'news_scraper'
+Rake::TestTask.new do |t|
+  t.libs << 'test'
+  t.pattern = 'test/**/*_test.rb'
+end
+namespace :scraper do
+  desc 'CLI that steps through articles for a given query and displays preset scrape pattern results; parameters: QUERY'
+  task :train do
+    raise "QUERY param not given.\n\tUsage: bundle exec rake scraper:train QUERY=<query>" unless ENV['QUERY']
+    NewsScraper::Trainer.train(query: ENV['QUERY'])
+  end
+end
+RDoc::Task.new do |rdoc|
+  rdoc.main = "README.md"
+  rdoc.rdoc_files.include("README.md", "lib/*.rb", "lib/**/*.rb")
+  rdoc.rdoc_dir = "doc"
+end

data/bin/console ADDED Viewed

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+require "bundler/setup"
+require "news_scraper"
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+# (If you use this, don't forget to add pry to your Gemfile!)
+# require "pry"
+# Pry.start
+require "irb"
+IRB.start

data/bin/setup ADDED Viewed

@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+set -euo pipefail
+IFS=$'\n\t'
+set -vx
+bundle install
+# Do any other automated setup that you need to do here

data/circle.yml ADDED Viewed

@@ -0,0 +1,3 @@
+test:
+  pre:
+    - bundle exec rubocop

data/config/article_scrape_patterns.yml ADDED Viewed

@@ -0,0 +1,116 @@
+# All domains should include the scrape method/pattern for data_types
+data_types:
+  - "author"
+  - "body"
+  - "description"
+  - "keywords"
+  - "section"
+  - "datetime"
+  - "title"
+# All data types must include :method and :pattern
+#
+presets:
+  author:
+    class: &class_author
+      method: css
+      pattern: ".author"
+    id: &id_author
+      method: css
+      pattern: "#author"
+    name: &name_author
+      method: css
+      pattern: ".author-name"
+    link: &link_author
+      method: xpath
+      pattern: "//a[contains(@href, 'author')]"
+    meta: &meta_author
+      method: xpath
+      pattern: "//meta[@name='author']/@content"
+    rel_link: &rel_link_author
+      method: xpath
+      pattern: "//a[@rel='author']"
+    vcard: &vcard_author
+      method: css
+      pattern: ".vcard .fn"
+  body:
+    readability: &readability_body
+      method: "readability"
+      pattern: ""
+  description:
+    meta: &meta_description
+      method: "xpath"
+      pattern: "//meta[@name='description']/@content"
+    og: &og_description
+      method: "xpath"
+      pattern: "//meta[@property='og:description']/@content"
+  keywords:
+    meta: &meta_keywords
+      method: "xpath"
+      pattern: "//meta[@name='keywords']/@content"
+    article_tag: &article_tag_keywords
+      method: "xpath"
+      pattern: "//meta[@property='article:tag']/@content"
+  section:
+    meta: &meta_section
+      method: "xpath"
+      pattern: "//meta[@property='article:section']/@content"
+  datetime:
+    article_date_original: &article_date_original_datetime
+      method: xpath
+      pattern: //meta[@name='article_date_original']/@content
+    article_published_time: &article_published_time_datetime
+      method: "xpath"
+      pattern: "//meta[@property='article:published_time']/@content"
+    date: &date_datetime
+      method: xpath
+      pattern: //meta[@name='date']/@content
+    date_published: &date_published_datetime
+      method: xpath
+      pattern: //*[@itemprop='datePublished']/@datetime
+    og_published_time: &og_published_time_datetime
+      method: xpath
+      pattern: //meta[@property='og:published_time']/@content
+    original_publication_date: &original_publication_date_datetime
+      method: xpath
+      pattern: //meta[@name='OriginalPublicationDate']/@content
+    publication_date: &publication_date_datetime
+      method: xpath
+      pattern: //meta[@name='publication_date']/@content
+    publish_date: &publish_date_datetime
+      method: xpath
+      pattern: //meta[@name='PublishDate']/@content
+    rnews_date_published: &rnews_date_published_datetime
+      method: xpath
+      pattern: //meta[@property='rnews:datePublished']/@content
+    sailthru_date: &sailthru_date_datetime
+      method: xpath
+      pattern: //meta[@name='sailthru.date']/@content
+  title:
+    html: &html_title
+      method: "xpath"
+      pattern: "//title"
+    og: &og_title
+      method: "xpath"
+      pattern: "//meta[@property='og:title']/@content"
+domains:
+  investors.com:
+    author: *rel_link_author
+    body:
+      method: "css"
+      pattern: ".single-post-content"
+    description: *og_description
+    keywords: *meta_keywords
+    section: *meta_section
+    datetime: *article_published_time_datetime
+    title: *og_title
+  fool.com:
+    author: *meta_author
+    body: *readability_body
+    description: *meta_description
+    keywords: *article_tag_keywords
+    section: *meta_section
+    datetime: *date_datetime
+    title: *og_title

data/config/temp_dirs.yml ADDED Viewed

@@ -0,0 +1,4 @@
+extractors:
+  google_news_rss:
+    article_urls: "tmp/google_news_rss/article_urls"

data/dev.yml ADDED Viewed

@@ -0,0 +1,13 @@
+name: news-scraper
+up:
+  - ruby: 2.3.1
+  - bundler
+commands:
+  rubocop:
+    desc:   'Lint the Ruby code with Rubocop'
+    run:    bundle exec rubocop
+    aliases: [rubo, lint, l]
+  test: bundle exec rake test
+  docs: bundle exec rake rdoc

data/lib/news_scraper/active_support_lite/string.rb ADDED Viewed

@@ -0,0 +1,11 @@
+class String
+  def squish
+    dup.squish!
+  end
+  def squish!
+    gsub!(/[[:space:]]+/, ' ')
+    strip!
+    self
+  end
+end

data/lib/news_scraper/cli.rb ADDED Viewed

@@ -0,0 +1,106 @@
+require 'readline'
+module NewsScraper
+  module CLI
+    extend self
+    DEFAULT_COLOR = "\x1b[36m".freeze
+    def log(message, color: DEFAULT_COLOR, new_line: false)
+      message += "\n" if new_line
+      $stdout.puts "#{color}┃\x1b[0m " + message
+    end
+    def log_lines(message, color: DEFAULT_COLOR, new_line: false)
+      message.split("\n").each do |line|
+        log(line, color: color, new_line: new_line)
+      end
+    end
+    def confirm(msg, color: DEFAULT_COLOR)
+      print "#{color}┃\x1b[0m #{msg} (y/n) "
+      $stdin.gets.chomp =~ /[Yy]/
+    end
+    def get_input(msg = nil)
+      log(msg) if msg
+      Readline.completion_append_character = " "
+      Readline.completion_proc = nil
+      result = begin
+        Readline.readline("\x1b[34m┃ > \x1b[33m", true)
+      rescue Interrupt
+        nil
+      end
+      print "\e[0m" # reset colour
+      result
+    end
+    def prompt_with_options(question, options)
+      log(question)
+      log("Your options are:")
+      options.each.with_index(1) do |v, idx|
+        log("#{idx}) #{v}")
+      end
+      log("Choose a number between 1 and #{options.length}")
+      Readline.completion_append_character = " "
+      Readline.completion_proc = nil
+      buf = -1
+      available = (1..options.length).to_a
+      until available.include?(buf.to_i)
+        begin
+          buf = Readline.readline("\x1b[34m┃ > \x1b[33m", true)
+        rescue Interrupt
+          nil
+        end
+        if buf.nil?
+          STDERR.puts
+          next
+        end
+        buf = buf.chomp
+        buf = -1 if buf.empty?
+        buf = -1 if buf.to_i.to_s != buf
+      end
+      print "\e[0m" # reset colour
+      options[buf.to_i - 1]
+    end
+    ## Fancy Headers and Footers
+    def put_header(text = "", color = DEFAULT_COLOR)
+      put_edge(color, "┏━━ ", text)
+    end
+    def put_footer(color = DEFAULT_COLOR)
+      put_edge(color, "┗", "")
+    end
+    def put_edge(color, prefix, text)
+      ptext = "#{color}#{prefix}#{text}"
+      textwidth = printing_width(ptext)
+      termwidth = IO.respond_to?(:console) && IO.console ? IO.console.winsize[1] : 80
+      termwidth = 30 if termwidth < 30
+      if textwidth > termwidth
+        ptext = ptext[0...termwidth]
+        textwidth = termwidth
+      end
+      padwidth = termwidth - textwidth
+      pad = "━" * padwidth
+      formatted = "#{ptext}#{color}#{pad}\x1b[0m\n"
+      $stdout.puts formatted
+    end
+    # ANSI escape sequences (like \x1b[31m) have zero width.
+    # when calculating the padding width, we must exclude them.
+    def printing_width(str)
+      str.gsub(/\x1b\[[\d;]+[A-z]/, '').size
+    end
+  end
+end

data/lib/news_scraper/constants.rb ADDED Viewed

@@ -0,0 +1,6 @@
+module NewsScraper
+  module Constants
+    SCRAPE_PATTERN_FILEPATH = File.expand_path('../../../config/article_scrape_patterns.yml', __FILE__)
+    SCRAPE_PATTERNS = YAML.load_file(SCRAPE_PATTERN_FILEPATH)
+  end
+end

data/lib/news_scraper/errors.rb ADDED Viewed

@@ -0,0 +1,16 @@
+module NewsScraper
+  class ResponseError < StandardError; end
+  module Transformers
+    class ScrapePatternNotDefined < StandardError
+      attr_reader :root_domain, :uri
+      def initialize(opts = {})
+        @root_domain = opts[:root_domain]
+        @uri = opts[:uri]
+        super
+      end
+    end
+  end
+end

data/lib/news_scraper/extractors/article.rb ADDED Viewed

@@ -0,0 +1,17 @@
+require 'nokogiri'
+module NewsScraper
+  module Extractors
+    class Article
+      include ExtractorsHelpers
+      def initialize(url:)
+        @url = url
+      end
+      def extract
+        http_request(@url).body
+      end
+    end
+  end
+end