RubyGems - scrapix - Versions diffs - 0.1.3 - Mend

scrapix 0.1.3

Files changed (15) hide show

checksums.yaml +7 -0
data/.gitignore +17 -0
data/.rvmrc +52 -0
data/Gemfile +4 -0
data/LICENSE.txt +22 -0
data/README.md +94 -0
data/Rakefile +2 -0
data/bin/scrapix +46 -0
data/lib/scrapix.rb +5 -0
data/lib/scrapix/drivers/capybara.rb +20 -0
data/lib/scrapix/google_images.rb +113 -0
data/lib/scrapix/vbulletin.rb +72 -0
data/lib/scrapix/version.rb +3 -0
data/scrapix.gemspec +28 -0
metadata +145 -0

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 7229eb9784d2c1e6a49ebc62c56cd2639b8b18aa
+  data.tar.gz: 766a3bda0863e5126ca10188a3d629250097392b
+SHA512:
+  metadata.gz: 31237fb06e6cb6413305d1fae6f3f593072c40225619e3975f883ce805f89e8f1c2a1eddb2aaae045e37fab0427b1d6c7b07c7fe040c072b2ac5c373387c6a2c
+  data.tar.gz: 57cd2684ac04991a4ff5ae28b5a37c48362d6db4338fec0af575cc2b0288cedeb056c170781d10d17605f119f46024e9aa82388c16ea51672dc4cfdea9b27d0d

data/.gitignore ADDED

@@ -0,0 +1,17 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp

data/.rvmrc ADDED

@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# This is an RVM Project .rvmrc file, used to automatically load the ruby
+# development environment upon cd'ing into the directory
+# First we specify our desired <ruby>[@<gemset>], the @gemset name is optional,
+# Only full ruby name is supported here, for short names use:
+#     echo "rvm use 2.0.0" > .rvmrc
+# Ruby version and gemset this project expects, as <ruby>@<gemset>.
+environment_id="ruby-2.0.0-p0@scrapix"
+# Uncomment the following lines if you want to verify rvm version per project
+# rvmrc_rvm_version="1.18.15 (stable)" # 1.10.1 seems like a safe start
+# eval "$(echo ${rvm_version}.${rvmrc_rvm_version} | awk -F. '{print "[[ "$1*65536+$2*256+$3" -ge "$4*65536+$5*256+$6" ]]"}' )" || {
+#   echo "This .rvmrc file requires at least RVM ${rvmrc_rvm_version}, aborting loading."
+#   return 1
+# }
+# First we attempt to load the desired environment directly from the environment
+# file. This is very fast and efficient compared to running through the entire
+# CLI and selector. If you want feedback on which environment was used then
+# insert the word 'use' after --create as this triggers verbose mode.
+# The fast path is taken only when the environments directory exists and the
+# environment file for $environment_id is present and non-empty.
+if [[ -d "${rvm_path:-$HOME/.rvm}/environments"
+  && -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
+then
+  # Source the environment file into the current shell.
+  \. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
+  # Source the after_use hook when it exists; `|| true` keeps a missing hook
+  # from leaving a failing status behind.
+  [[ -s "${rvm_path:-$HOME/.rvm}/hooks/after_use" ]] &&
+    \. "${rvm_path:-$HOME/.rvm}/hooks/after_use" || true
+  if [[ $- == *i* ]] # check for interactive shells
+  then echo "Using: $(tput setaf 2)$GEM_HOME$(tput sgr0)" # show the user the ruby and gemset they are using in green
+  else echo "Using: $GEM_HOME" # don't use colors in non-interactive shells
+  fi
+else
+  # If the environment file has not yet been created, use the RVM CLI to select.
+  rvm --create use  "$environment_id" || {
+    echo "Failed to create RVM environment '${environment_id}'."
+    return 1
+  }
+fi
+# If you use bundler, this might be useful to you:
+# if [[ -s Gemfile ]] && {
+#   ! builtin command -v bundle >/dev/null ||
+#   builtin command -v bundle | GREP_OPTIONS= \grep $rvm_path/bin/bundle >/dev/null
+# }
+# then
+#   printf "%b" "The rubygem 'bundler' is not installed. Installing it now.\n"
+#   gem install bundler
+# fi
+# if [[ -s Gemfile ]] && builtin command -v bundle >/dev/null
+# then
+#   bundle install | GREP_OPTIONS= \grep -vE '^Using|Your bundle is complete'
+# fi

data/Gemfile ADDED

@@ -0,0 +1,4 @@
+# Fetch gems from rubygems.org
+source 'https://rubygems.org'
+# Specify your gem's dependencies in scrapix.gemspec
+gemspec

data/LICENSE.txt ADDED

@@ -0,0 +1,22 @@
+Copyright (c) 2013 Nikhil Gupta
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED

@@ -0,0 +1,94 @@
+# Scrapix
+A gem that is able to scrape images from various sources. The gem provides you with the
+results of these searches in a neat way, which you can then use to download these images,
+or simply obtain a list of such images.
+You can also use the API to call these scraping methods inside your own applications.
+## Installation
+Add this line to your application's Gemfile:
+    gem 'scrapix'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install scrapix
+## Usage :: Google Images
+This gem is able to scrape images from Google Images search. It uses `Capybara` along with the
+`Poltergeist` driver (which works on top of `PhantomJS`) for this purpose.
+To use the `Google Images Scraper` inside your ruby applications, simply do:
+    scraper = Scrapix::GoogleImages.new # create the scraper
+    scraper.query = "programmer"      # find images for keyword: "programmer"
+    scraper.total = 30                # search is limited to 30 images (default: 100)
+    scraper.find                      # return a list of such images
+    # search for 'large' images, and put safesearch to off!
+    scraper.options = { safe: false, size: "large" }
+    scraper.find
+    # everything:
+    scraper = Scrapix::GoogleImages.new "programmer", safe: false, size: "large"
+    scraper.total = 30 # limits to 30 images - default: 100 images
+    scraper.find
+The `size` option can be supplied in the following ways:
+  - __icon__, __small__, __medium__, or __large__
+  - __&lt;n&gt;__: searches for square images with exact dimensions (width: _&lt;n&gt;_, height: _&lt;n&gt;_)
+  - __&lt;m&gt;x&lt;n&gt;__: searches for images with exact dimensions (width: _&lt;m&gt;_, height: _&lt;n&gt;_)
+  - __&lt;n&gt;mp__: searches for images larger than _&lt;n&gt;_ MP. If _&lt;n&gt;_ is not in the
+  list of sizes supported for this search, it is adjusted to the closest
+  available option.
+You can also use the scraper on CLI:
+    scrapix google_images "programmer" --no-safe --total=30 --size=large
+## Usage :: vBulletin Threads
+This gem is able to scrape vBulletin threads for images. It uses `Mechanize` gem for this purpose.
+To use the `vBulletin Thread Scraper` inside your ruby applications, simply do:
+    scraper = Scrapix::VBulletin.new # create the scraper
+    # find images for the following thread
+    scraper.url = "http://www.wetacollectors.com/forum/showthread.php?t=40085"
+    scraper.find     # return a list of such images
+    # start searching from page 2 of this thread till we find 10 images
+    scraper.options = { start: 2, total: 10 }
+    scraper.find
+    # everything:
+    url = "http://www.wetacollectors.com/forum/showthread.php?t=40085"
+    scraper = Scrapix::VBulletin.new url, start: 2, end: 3, total: 10
+    scraper.find
+You can also use the scraper on CLI:
+    scrapix vbulletin "http://www.wetacollectors.com/forum/showthread.php?t=40085" --total=10 --start=2
+## Contributing
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request
+## TODO
+1. Check if `mechanize` can be used instead of `capybara + poltergeist` combination for scraping Google Images.

data/Rakefile ADDED

@@ -0,0 +1,2 @@
+#!/usr/bin/env rake
+require "bundler/gem_tasks"

data/bin/scrapix ADDED

@@ -0,0 +1,46 @@
+#!/usr/bin/env ruby
+# encoding: utf-8
+require 'thor'
+require 'scrapix'
+module Scrapix
+  class CLI < Thor
+    desc "google_images [KEYWORD]", "scrape images from Google Images"
+    method_option :safe, type: :boolean, default: true, description: "use safe search?"
+    method_option :size, default: "any", description: "size of the images to search for"
+    method_option :ref, type: :boolean, default: false, description: "provide a list of reference urls, instead"
+    method_option :verbose, type: :boolean, default: false, description: "provide all info", alias: "-v"
+    method_option :total, default: 100, description: "number of images to search", aliases: "-n"
+    def google_images(keyword)
+      scraper = Scrapix::GoogleImages.new keyword, options
+      scraper.total = options["total"].to_i
+      images = scraper.find
+      if images.empty?
+        puts "No images were found! :("
+      else
+        puts "URL, WIDTH, HEIGHT, REFERENCE_URL" if options["verbose"]
+        images.each do |image|
+          if options["verbose"]
+            puts "#{image[:url]},#{image[:width]},#{image[:height]},#{image[:reference_url]}"
+          else
+            puts options["ref"] ? image[:reference_url] : image[:url]
+          end
+        end
+      end
+    end
+    desc "vbulletin [THREAD_URL]", "scrape images from a vBulletin Thread"
+    method_option :total, default: 100000, description: "number of images to search", aliases: "-n"
+    method_option :start, default: 1, description: "starting page number"
+    method_option :end, default: 10000, description: "ending page number"
+    method_option :verbose, type: :boolean, default: false, description: "be verbose", alias: "-v"
+    def vbulletin(thread_url)
+      scraper = Scrapix::VBulletin.new thread_url, options.merge({"cli" => true})
+      images = scraper.find
+      puts "No images were found! :(" if images.empty?
+    end
+  end
+end
+Scrapix::CLI.start

data/lib/scrapix.rb ADDED

@@ -0,0 +1,5 @@
+require "scrapix/version"
+require "scrapix/drivers/capybara"
+require "scrapix/google_images"
+require "scrapix/vbulletin"
+require 'mechanize'

data/lib/scrapix/drivers/capybara.rb ADDED

@@ -0,0 +1,20 @@
+# capybara for scraping
+require 'capybara'
+require 'capybara/dsl'
+require 'capybara/poltergeist'
+Capybara.register_driver :poltergeist_debug do |app|
+  Capybara::Poltergeist::Driver.new app, {
+    timeout:            600,
+    inspector:          true,
+    # js_errors:          false,
+    phantomjs_options:  ["--web-security=no"]
+  }
+end
+# use javascript driver
+Capybara.current_driver = :poltergeist_debug
+Scrapix::UserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) " +
+                     "AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"

data/lib/scrapix/google_images.rb ADDED

@@ -0,0 +1,113 @@
+module Scrapix
+  # download images from a Google Image Search
+  class GoogleImages
+    include Capybara::DSL
+    # options can be:
+    #   size: named size, e.g. icon, small, medium, large, 13mp, 1280x800, etc.
+    #   safe: true or false
+    #
+    def initialize(query = nil, options = {})
+      self.options = options
+      self.query   = query
+      self.total   = 100
+    end
+    def search_url(page_no = 1)
+      "http://google.com/search?tbm=isch&q=#{@query}#{@params}&start=#{(page_no - 1)*20}"
+    end
+    def query=(q)
+      @query = URI.escape(q) if q
+    end
+    def total=(n)
+      @num = n.to_i
+    end
+    def options=(opts)
+      # convert symbolic keys to string keys
+      options = {}
+      opts.each { |k,v| options[k.to_s] = v }
+      # merge the options with defaults!
+      @options ||= { "safe" => true, "size" => "any" }
+      @options.merge!(options)
+      sanitize_size
+      # parametrize for url purposes
+      @params = create_params
+    end
+    # params: page_no => starting page number for google results
+    def find(page_no = 1)
+      images = {}
+      return images unless @query
+      while images.count < @num
+        visit search_url(page_no)
+        links = Capybara.page.all("a")
+        links = links.select{|x| x["href"] =~ /^\/imgres/} if links.any?
+        return images unless links.any?
+        page_counter = 0
+        links.each do |link|
+          attribs = CGI.parse(URI.parse(link["href"]).query) rescue nil
+          next if attribs.nil?
+          hash = Digest::MD5.hexdigest(attribs["imgurl"][0])
+          unless images.has_key?(hash)
+            images[hash] = {
+              width:          attribs["w"][0],
+              height:         attribs["h"][0],
+              url:            attribs["imgurl"][0],
+              reference_url:  attribs["imgrefurl"][0]
+            }
+            page_counter += 1
+          end
+        end
+        page_no += 1
+        break if page_counter == 0
+      end
+      images.take(@num).map{|x| x[1]}
+    end
+    private
+    def validate_mp_size(mp)
+      mp = mp.to_i
+      lower_bound = 0; upper_bound = 9999;
+      valid_mp_sizes = [ 2, 4, 6, 8, 10, 12, 15, 20, 40, 70 ]
+      valid_mp_sizes.each do |s|
+        return s if s == mp
+        lower_bound = s if s < mp
+        upper_bound = s if s > mp && s < upper_bound
+      end
+      mp - lower_bound > upper_bound - mp ? upper_bound : lower_bound
+    end
+    # if width or height is specified, use them as 'exact' size
+    # otherwise, use a MP size for finding images larger than that size
+    # otherwise, use a given named size
+    def sanitize_size
+      @options["size"] = case
+                         when m = @options["size"].match(/^(\d*)x(\d*)$/)
+                           then "isz:ex,iszw:#{m[1]},iszh:#{m[2]}"
+                         when m = @options["size"].match(/^(\d*)$/)
+                           then "isz:ex,iszw:#{m[1]},iszh:#{m[1]}"
+                         when m = @options["size"].match(/^(\d*)mp$/)
+                           then "isz:lt,islt:#{validate_mp_size(m[1])}mp"
+                         when @options["size"] == "large" then "isz:l"
+                         when @options["size"] == "medium" then "isz:m"
+                         when @options["size"] == "small" then "isz:s"
+                         when @options["size"] == "icon" then "isz:i"
+                         else nil
+                         end
+    end
+    def create_params
+      string  = ""
+      string += "&tbs=#{@options["size"]}" if @options["size"]
+      string += "&safe=off" unless @options["safe"]
+      string
+    end
+  end
+end

data/lib/scrapix/vbulletin.rb ADDED

@@ -0,0 +1,72 @@
+module Scrapix
+  # download images from a vBulletin thread
+  class VBulletin
+    attr_reader :title, :max_pages, :options, :page_no, :images, :url
+    def initialize(url = nil, options = {})
+      @images                 = {}
+      @agent                  = Mechanize.new
+      @agent.user_agent_alias = 'Mac Safari'
+      self.options            = options
+      self.url                = url
+    end
+    # find images for this thread, specified by starting page_no
+    def find
+      reset; return @images unless @url
+      @page_no = @options["start"]
+      until @images.count > @options["total"] || thread_has_ended?
+        page      = @agent.get "#{@url}&page=#{@page_no}"
+        puts "[VERBOSE] Searching: #{@url}&page=#{@page_no}" if @options["verbose"] && options["cli"]
+        sources   = page.image_urls.map{|x| x.to_s}
+        sources   = filter_images sources # hook for sub-classes
+        @page_no += 1
+        continue if sources.empty?
+        sources.each do |source|
+          hash = Digest::MD5.hexdigest(source)
+          unless @images.has_key?(hash)
+            @images[hash] = {url: source}
+            puts source if options["cli"]
+          end
+        end
+      end
+      @images = @images.map{|x, y| y}
+    end
+    def thread_has_ended?
+      @page_no > @options["end"] || @page_no > @max_pages
+    end
+    def filter_images(sources)
+      # useful for filtering the image by sub-classes
+      return sources
+    end
+    def url=(url)
+      @url = url
+      return unless @url
+      page = @agent.get @url
+      @title = page.title.strip
+      puts @title + "\n" + ("=" * @title.length) if self.options["cli"]
+      begin
+        text = page.search(".pagenav .vbmenu_control").first.inner_text
+        @max_pages = text.match(/Page \d* of (\d*)/)[1].to_i
+      rescue
+        @max_pages = 1
+      end
+    end
+    def reset
+      @images  = {}
+      @page_no = @options["start"]
+    end
+    def options=(options = {})
+      @options = { "start" => 1, "end" => 10000, "total" => 100000, "verbose" => false, "cli" => false }
+      options.each { |k,v| @options[k.to_s] = v }
+      ["start", "end", "total"].each {|k| @options[k] = @options[k].to_i}
+      @options
+    end
+  end
+end

data/lib/scrapix/version.rb ADDED

@@ -0,0 +1,3 @@
+module Scrapix
+  # Version string of the gem; read by scrapix.gemspec as Scrapix::VERSION.
+  VERSION = "0.1.3"
+end

data/scrapix.gemspec ADDED

@@ -0,0 +1,28 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'scrapix/version'
+Gem::Specification.new do |spec|
+  spec.name          = "scrapix"
+  spec.version       = Scrapix::VERSION
+  spec.authors       = ["Nikhil Gupta"]
+  spec.email         = ["me@nikhgupta.com"]
+  spec.description   = %q{Scrapes images from various sources e.g. Google Images, vBulletin threads, etc.}
+  spec.summary       = %q{A gem that is able to scrape images from various sources. The gem provides you with the results of these searches in a neat way, which you can then use to download these images.}
+  spec.homepage      = "http://github.com/nikhgupta/scrapix"
+  spec.license       = "MIT"
+  spec.files         = `git ls-files`.split($/)
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+  spec.add_dependency "thor"
+  spec.add_dependency "capybara"
+  spec.add_dependency "mechanize"
+  spec.add_dependency "poltergeist"
+  spec.add_development_dependency "rake"
+  spec.add_development_dependency "bundler", "~> 1.3"
+end

metadata ADDED

@@ -0,0 +1,145 @@
+--- !ruby/object:Gem::Specification
+name: scrapix
+version: !ruby/object:Gem::Version
+  version: 0.1.3
+platform: ruby
+authors:
+- Nikhil Gupta
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-05-03 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: thor
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: capybara
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: poltergeist
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+description: Scrapes images from various sources e.g. Google Images, vBulletin threads,
+  etc.
+email:
+- me@nikhgupta.com
+executables:
+- scrapix
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- .rvmrc
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- bin/scrapix
+- lib/scrapix.rb
+- lib/scrapix/drivers/capybara.rb
+- lib/scrapix/google_images.rb
+- lib/scrapix/vbulletin.rb
+- lib/scrapix/version.rb
+- scrapix.gemspec
+homepage: http://github.com/nikhgupta/scrapix
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.0.0
+signing_key:
+specification_version: 4
+summary: A gem that is able to scrape images from various sources. The gem provides
+  you with the results of these searches in a neat way, which you can then use to
+  download these images.
+test_files: []