scrapix 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rvmrc +52 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +94 -0
- data/Rakefile +2 -0
- data/bin/scrapix +46 -0
- data/lib/scrapix.rb +5 -0
- data/lib/scrapix/drivers/capybara.rb +20 -0
- data/lib/scrapix/google_images.rb +113 -0
- data/lib/scrapix/vbulletin.rb +72 -0
- data/lib/scrapix/version.rb +3 -0
- data/scrapix.gemspec +28 -0
- metadata +145 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 7229eb9784d2c1e6a49ebc62c56cd2639b8b18aa
+  data.tar.gz: 766a3bda0863e5126ca10188a3d629250097392b
+SHA512:
+  metadata.gz: 31237fb06e6cb6413305d1fae6f3f593072c40225619e3975f883ce805f89e8f1c2a1eddb2aaae045e37fab0427b1d6c7b07c7fe040c072b2ac5c373387c6a2c
+  data.tar.gz: 57cd2684ac04991a4ff5ae28b5a37c48362d6db4338fec0af575cc2b0288cedeb056c170781d10d17605f119f46024e9aa82388c16ea51672dc4cfdea9b27d0d
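These digests cover the two archives inside the packaged gem (a `.gem` file is a tar containing `metadata.gz` and `data.tar.gz`). A minimal sketch of checking them by hand, assuming the gem has already been unpacked so that both archives and `checksums.yaml` sit in the current directory (the paths are illustrative):

    require 'yaml'
    require 'digest'

    checksums = YAML.load_file("checksums.yaml")
    %w[metadata.gz data.tar.gz].each do |file|
      actual   = Digest::SHA512.hexdigest(File.binread(file))
      expected = checksums["SHA512"][file]
      puts "#{file}: #{actual == expected ? 'OK' : 'MISMATCH'}"
    end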
data/.gitignore
ADDED
data/.rvmrc
ADDED
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+
+# This is an RVM Project .rvmrc file, used to automatically load the ruby
+# development environment upon cd'ing into the directory
+
+# First we specify our desired <ruby>[@<gemset>], the @gemset name is optional,
+# Only full ruby name is supported here, for short names use:
+#     echo "rvm use 2.0.0" > .rvmrc
+environment_id="ruby-2.0.0-p0@scrapix"
+
+# Uncomment the following lines if you want to verify rvm version per project
+# rvmrc_rvm_version="1.18.15 (stable)" # 1.10.1 seems a safe start
+# eval "$(echo ${rvm_version}.${rvmrc_rvm_version} | awk -F. '{print "[[ "$1*65536+$2*256+$3" -ge "$4*65536+$5*256+$6" ]]"}' )" || {
+#   echo "This .rvmrc file requires at least RVM ${rvmrc_rvm_version}, aborting loading."
+#   return 1
+# }
+
+# First we attempt to load the desired environment directly from the environment
+# file. This is very fast and efficient compared to running through the entire
+# CLI and selector. If you want feedback on which environment was used then
+# insert the word 'use' after --create as this triggers verbose mode.
+if [[ -d "${rvm_path:-$HOME/.rvm}/environments"
+  && -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
+then
+  \. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
+  [[ -s "${rvm_path:-$HOME/.rvm}/hooks/after_use" ]] &&
+    \. "${rvm_path:-$HOME/.rvm}/hooks/after_use" || true
+  if [[ $- == *i* ]] # check for interactive shells
+  then echo "Using: $(tput setaf 2)$GEM_HOME$(tput sgr0)" # show the user the ruby and gemset they are using in green
+  else echo "Using: $GEM_HOME" # don't use colors in non-interactive shells
+  fi
+else
+  # If the environment file has not yet been created, use the RVM CLI to select.
+  rvm --create use "$environment_id" || {
+    echo "Failed to create RVM environment '${environment_id}'."
+    return 1
+  }
+fi
+
+# If you use bundler, this might be useful to you:
+# if [[ -s Gemfile ]] && {
+#   ! builtin command -v bundle >/dev/null ||
+#   builtin command -v bundle | GREP_OPTIONS= \grep $rvm_path/bin/bundle >/dev/null
+# }
+# then
+#   printf "%b" "The rubygem 'bundler' is not installed. Installing it now.\n"
+#   gem install bundler
+# fi
+# if [[ -s Gemfile ]] && builtin command -v bundle >/dev/null
+# then
+#   bundle install | GREP_OPTIONS= \grep -vE '^Using|Your bundle is complete'
+# fi
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2013 Nikhil Gupta
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,94 @@
+# Scrapix
+
+A gem that scrapes images from various sources. It provides you with the
+results of these searches in a neat way, which you can then use to download
+the images, or simply to obtain a list of them.
+
+You can also use the API to call these scraping methods inside your own applications.
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+    gem 'scrapix'
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install scrapix
+
+## Usage :: Google Images
+
+This gem is able to scrape images from Google Images search. It uses `Capybara` along with the
+`Poltergeist` driver (which works on top of `PhantomJS`) for this purpose.
+
+To use the `Google Images Scraper` inside your ruby applications, simply do:
+
+    scraper = Scrapix::GoogleImages.new   # create the scraper
+
+    scraper.query = "programmer"          # find images for keyword: "programmer"
+    scraper.total = 30                    # search is limited to 30 images (default: 100)
+    scraper.find                          # return a list of such images
+
+    # search for 'large' images, and turn safe search off!
+    scraper.options = { safe: false, size: "large" }
+    scraper.find
+
+    # everything:
+    scraper = Scrapix::GoogleImages.new "programmer", safe: false, size: "large"
+    scraper.total = 30                    # limits to 30 images - default: 100 images
+    scraper.find
+
+
+The `size` option can be supplied in the following ways:
+
+- __icon__, __small__, __medium__, or __large__
+- __<n>__: searches for images with exact dimensions (width: _<n>_, height: _<n>_)
+- __<m>x<n>__: searches for images with exact dimensions (width: _<m>_, height: _<n>_)
+- __<n>mp__: searches for images larger than _<n>_ MP. If _<n>_ is not in the
+  supported list of sizes for this search, it intelligently adjusts to the
+  closest available option.
+
+You can also use the scraper on CLI:
+
+    scrapix google_images "programmer" --no-safe --total=30 --size=large
+
+## Usage :: vBulletin Threads
+
+This gem is able to scrape vBulletin threads for images. It uses the `Mechanize` gem for this purpose.
+
+To use the `vBulletin Thread Scraper` inside your ruby applications, simply do:
+
+    scraper = Scrapix::VBulletin.new      # create the scraper
+
+    # find images for the following thread
+    scraper.url = "http://www.wetacollectors.com/forum/showthread.php?t=40085"
+    scraper.find                          # return a list of such images
+
+    # start searching from page 2 of this thread till we find 10 images
+    scraper.options = { start: 2, total: 10 }
+    scraper.find
+
+    # everything:
+    url = "http://www.wetacollectors.com/forum/showthread.php?t=40085"
+    scraper = Scrapix::VBulletin.new url, start: 2, end: 3, total: 10
+    scraper.find
+
+You can also use the scraper on CLI:
+
+    scrapix vbulletin "http://www.wetacollectors.com/forum/showthread.php?t=40085" --total=10 --start=2
+
+## Contributing
+
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request
+
+## TODO
+
+1. Check if `mechanize` can be used instead of the `capybara + poltergeist` combination for scraping Google Images.
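The README above promises that the returned list can be used to download the images; the gem itself only returns hashes of URLs. A minimal downloader sketch on top of `find`, assuming only the `:url` key that `bin/scrapix` (below) also relies on; the `downloads/` directory and file-naming scheme are illustrative:

    require 'open-uri'
    require 'fileutils'
    require 'scrapix'

    scraper = Scrapix::GoogleImages.new "programmer"
    scraper.total = 10

    FileUtils.mkdir_p "downloads"
    scraper.find.each_with_index do |image, index|
      extension = File.extname(URI.parse(image[:url]).path)
      File.open("downloads/image-#{index}#{extension}", "wb") do |file|
        file.write open(image[:url]).read # Kernel#open via open-uri
      end
    end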
data/Rakefile
ADDED
data/bin/scrapix
ADDED
@@ -0,0 +1,46 @@
+#!/usr/bin/env ruby
+# encoding: utf-8
+
+require 'thor'
+require 'scrapix'
+
+module Scrapix
+  class CLI < Thor
+    desc "google_images [KEYWORD]", "scrape images from Google Images"
+    method_option :safe, type: :boolean, default: true, desc: "use safe search?"
+    method_option :size, default: "any", desc: "size of the images to search for"
+    method_option :ref, type: :boolean, default: false, desc: "provide a list of reference urls, instead"
+    method_option :verbose, type: :boolean, default: false, desc: "provide all info", aliases: "-v"
+    method_option :total, default: 100, desc: "number of images to search", aliases: "-n"
+    def google_images(keyword)
+      scraper = Scrapix::GoogleImages.new keyword, options
+      scraper.total = options["total"].to_i
+      images = scraper.find
+      if images.empty?
+        puts "No images were found! :("
+      else
+        puts "URL, WIDTH, HEIGHT, REFERENCE_URL" if options["verbose"]
+        images.each do |image|
+          if options["verbose"]
+            puts "#{image[:url]},#{image[:width]},#{image[:height]},#{image[:reference_url]}"
+          else
+            puts options["ref"] ? image[:reference_url] : image[:url]
+          end
+        end
+      end
+    end
+
+    desc "vbulletin [THREAD_URL]", "scrape images from a vBulletin Thread"
+    method_option :total, default: 100000, desc: "number of images to search", aliases: "-n"
+    method_option :start, default: 1, desc: "starting page number"
+    method_option :end, default: 10000, desc: "ending page number"
+    method_option :verbose, type: :boolean, default: false, desc: "be verbose", aliases: "-v"
+    def vbulletin(thread_url)
+      scraper = Scrapix::VBulletin.new thread_url, options.merge({"cli" => true})
+      images = scraper.find
+      puts "No images were found! :(" if images.empty?
+    end
+  end
+end
+
+Scrapix::CLI.start
data/lib/scrapix.rb
ADDED
data/lib/scrapix/drivers/capybara.rb
ADDED
@@ -0,0 +1,20 @@
+# capybara for scraping
+require 'capybara'
+require 'capybara/dsl'
+require 'capybara/poltergeist'
+
+Capybara.register_driver :poltergeist_debug do |app|
+  Capybara::Poltergeist::Driver.new app, {
+    timeout: 600,
+    inspector: true,
+    # js_errors: false,
+    phantomjs_options: ["--web-security=no"]
+  }
+end
+
+# use javascript driver
+Capybara.current_driver = :poltergeist_debug
+
+Scrapix::UserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) " +
+  "AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"
+
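The `:poltergeist_debug` driver registered above is what `GoogleImages#find` (next hunk) drives through `Capybara::DSL`. A minimal standalone check that the driver boots, assuming PhantomJS is on the PATH and that `require 'scrapix'` loads this driver file (the top-level `lib/scrapix.rb` body is not shown in this diff, so that is an assumption):

    require 'scrapix' # assumed to load the driver registration above

    session = Capybara::Session.new(:poltergeist_debug)
    session.visit "http://google.com/search?tbm=isch&q=test"
    puts session.status_code # 200 when PhantomJS boots and the page loads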
data/lib/scrapix/google_images.rb
ADDED
@@ -0,0 +1,113 @@
+module Scrapix
+  # download images from a Google Image Search
+  class GoogleImages
+    include Capybara::DSL
+
+    # options can be:
+    #   size: named size, e.g. icon, small, medium, large, 13mp, 1280x800, etc.
+    #   safe: true or false
+    #
+    def initialize(query = nil, options = {})
+      self.options = options
+      self.query = query
+      self.total = 100
+    end
+
+    def search_url(page_no = 1)
+      "http://google.com/search?tbm=isch&q=#{@query}#{@params}&start=#{(page_no - 1)*20}"
+    end
+
+    def query=(q)
+      @query = URI.escape(q) if q
+    end
+
+    def total=(n)
+      @num = n.to_i
+    end
+
+    def options=(opts)
+      # convert symbolic keys to string keys
+      options = {}
+      opts.each { |k,v| options[k.to_s] = v }
+
+      # merge the options with defaults!
+      @options ||= { "safe" => true, "size" => "any" }
+      @options.merge!(options)
+      sanitize_size
+
+      # parametrize for url purposes
+      @params = create_params
+    end
+
+    # params: page_no => starting page number for google results
+    def find(page_no = 1)
+      images = {}
+      return images unless @query
+
+      while images.count < @num
+        visit search_url(page_no)
+        links = Capybara.page.all("a")
+        links = links.select{|x| x["href"] =~ /^\/imgres/} if links.any?
+        return images unless links.any?
+        page_counter = 0
+        links.each do |link|
+          attribs = CGI.parse(URI.parse(link["href"]).query) rescue nil
+          next if attribs.nil?
+          hash = Digest::MD5.hexdigest(attribs["imgurl"][0])
+          unless images.has_key?(hash)
+            images[hash] = {
+              width: attribs["w"][0],
+              height: attribs["h"][0],
+              url: attribs["imgurl"][0],
+              reference_url: attribs["imgrefurl"][0]
+            }
+            page_counter += 1
+          end
+        end
+        page_no += 1
+        break if page_counter == 0
+      end
+      images.take(@num).map{|x| x[1]}
+    end
+
+    private
+
+    def validate_mp_size(mp)
+      mp = mp.to_i
+      lower_bound = 0; upper_bound = 9999;
+      valid_mp_sizes = [ 2, 4, 6, 8, 10, 12, 15, 20, 40, 70 ]
+      valid_mp_sizes.each do |s|
+        return s if s == mp
+        lower_bound = s if s < mp
+        upper_bound = s if s > mp && s < upper_bound
+      end
+      mp - lower_bound > upper_bound - mp ? upper_bound : lower_bound
+    end
+
+    # if width or height is specified, use them as 'exact' size
+    # otherwise, use a MP size for finding images larger than that size
+    # otherwise, use a given named size
+    def sanitize_size
+      @options["size"] = case
+        when m = @options["size"].match(/^(\d*)x(\d*)$/)
+          then "isz:ex,iszw:#{m[1]},iszh:#{m[2]}"
+        when m = @options["size"].match(/^(\d*)$/)
+          then "isz:ex,iszw:#{m[1]},iszh:#{m[1]}"
+        when m = @options["size"].match(/^(\d*)mp$/)
+          then "isz:lt,islt:#{validate_mp_size(m[1])}mp"
+        when @options["size"] == "large"  then "isz:l"
+        when @options["size"] == "medium" then "isz:m"
+        when @options["size"] == "small"  then "isz:s"
+        when @options["size"] == "icon"   then "isz:i"
+        else nil
+      end
+    end
+
+    def create_params
+      string = ""
+      string += "&tbs=#{@options["size"]}" if @options["size"]
+      string += "&safe=off" unless @options["safe"]
+      string
+    end
+  end
+end
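In `sanitize_size` above, an `<n>mp` size is passed through `validate_mp_size`, which snaps `<n>` to the nearest megapixel bucket the search supports, rounding ties downwards. A standalone sketch of that rounding with the logic copied from the diff (the method name `nearest_supported_mp` is only for illustration):

    def nearest_supported_mp(mp)
      mp = mp.to_i
      lower, upper = 0, 9999
      [2, 4, 6, 8, 10, 12, 15, 20, 40, 70].each do |s|
        return s if s == mp
        lower = s if s < mp
        upper = s if s > mp && s < upper
      end
      mp - lower > upper - mp ? upper : lower # ties round down
    end

    nearest_supported_mp(13)  # => 12 (closer to 12 than to 15)
    nearest_supported_mp(30)  # => 20 (tie between 20 and 40 rounds down)
    nearest_supported_mp(100) # => 70 (clamps to the largest bucket)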
data/lib/scrapix/vbulletin.rb
ADDED
@@ -0,0 +1,72 @@
+module Scrapix
+  # download images from a vBulletin thread
+  class VBulletin
+
+    attr_reader :title, :max_pages, :options, :page_no, :images, :url
+
+    def initialize(url = nil, options = {})
+      @images = {}
+      @agent = Mechanize.new
+      @agent.user_agent_alias = 'Mac Safari'
+      self.options = options
+      self.url = url
+    end
+
+    # find images for this thread, specified by starting page_no
+    def find
+      reset; return @images unless @url
+      @page_no = @options["start"]
+      until @images.count > @options["total"] || thread_has_ended?
+        page = @agent.get "#{@url}&page=#{@page_no}"
+        puts "[VERBOSE] Searching: #{@url}&page=#{@page_no}" if @options["verbose"] && options["cli"]
+        sources = page.image_urls.map{|x| x.to_s}
+        sources = filter_images sources # hook for sub-classes
+        @page_no += 1
+        next if sources.empty?
+        sources.each do |source|
+          hash = Digest::MD5.hexdigest(source)
+          unless @images.has_key?(hash)
+            @images[hash] = {url: source}
+            puts source if options["cli"]
+          end
+        end
+      end
+      @images = @images.map{|x, y| y}
+    end
+
+    def thread_has_ended?
+      @page_no > @options["end"] || @page_no > @max_pages
+    end
+
+    def filter_images(sources)
+      # useful for filtering the images by sub-classes
+      return sources
+    end
+
+    def url=(url)
+      @url = url
+      return unless @url
+      page = @agent.get @url
+      @title = page.title.strip
+      puts @title + "\n" + ("=" * @title.length) if self.options["cli"]
+      begin
+        text = page.search(".pagenav .vbmenu_control").first.inner_text
+        @max_pages = text.match(/Page \d* of (\d*)/)[1].to_i
+      rescue
+        @max_pages = 1
+      end
+    end
+
+    def reset
+      @images = {}
+      @page_no = @options["start"]
+    end
+
+    def options=(options = {})
+      @options = { "start" => 1, "end" => 10000, "total" => 100000, "verbose" => false, "cli" => false }
+      options.each { |k,v| @options[k.to_s] = v }
+      ["start", "end", "total"].each {|k| @options[k] = @options[k].to_i}
+      @options
+    end
+  end
+end
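`filter_images` above is an explicit hook for sub-classes. A sketch of how a sub-class might use it, e.g. to keep only vBulletin attachment images; the class name and URL pattern here are hypothetical:

    module Scrapix
      # hypothetical sub-class narrowing results to attachment images
      class VBulletinAttachments < VBulletin
        def filter_images(sources)
          sources.select { |source| source =~ /attachment\.php/ }
        end
      end
    end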
data/lib/scrapix/version.rb
ADDED
data/scrapix.gemspec
ADDED
@@ -0,0 +1,28 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'scrapix/version'
+
+Gem::Specification.new do |spec|
+  spec.name          = "scrapix"
+  spec.version       = Scrapix::VERSION
+  spec.authors       = ["Nikhil Gupta"]
+  spec.email         = ["me@nikhgupta.com"]
+  spec.description   = %q{Scrapes images from various sources e.g. Google Images, vBulletin threads, etc.}
+  spec.summary       = %q{A gem that is able to scrape images from various sources. The gem provides you with the results of these searches in a neat way, which you can then use to download these images.}
+  spec.homepage      = "http://github.com/nikhgupta/scrapix"
+  spec.license       = "MIT"
+
+  spec.files         = `git ls-files`.split($/)
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+
+  spec.add_dependency "thor"
+  spec.add_dependency "capybara"
+  spec.add_dependency "mechanize"
+  spec.add_dependency "poltergeist"
+
+  spec.add_development_dependency "rake"
+  spec.add_development_dependency "bundler", "~> 1.3"
+end
metadata
ADDED
@@ -0,0 +1,145 @@
+--- !ruby/object:Gem::Specification
+name: scrapix
+version: !ruby/object:Gem::Version
+  version: 0.1.3
+platform: ruby
+authors:
+- Nikhil Gupta
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-05-03 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: thor
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: capybara
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: poltergeist
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+description: Scrapes images from various sources e.g. Google Images, vBulletin threads,
+  etc.
+email:
+- me@nikhgupta.com
+executables:
+- scrapix
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- .rvmrc
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- bin/scrapix
+- lib/scrapix.rb
+- lib/scrapix/drivers/capybara.rb
+- lib/scrapix/google_images.rb
+- lib/scrapix/vbulletin.rb
+- lib/scrapix/version.rb
+- scrapix.gemspec
+homepage: http://github.com/nikhgupta/scrapix
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.0.0
+signing_key:
+specification_version: 4
+summary: A gem that is able to scrape images from various sources. The gem provides
+  you with the results of these searches in a neat way, which you can then use to
+  download these images.
+test_files: []