scrapey 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -1,37 +1,103 @@
1
1
  # Scrapey
2
2
 
3
- TODO: Write a gem description
3
+ A simple framework for solving common scraping problems
4
4
 
5
- ## Installation
5
+ ## Install latest version
6
+ ### Add to Gemfile
6
7
 
7
- Add this line to your application's Gemfile:
8
+ gem "scrapey", :git => 'https://github.com/monkeysuffrage/scrapey.git'
8
9
 
9
- gem 'scrapey'
10
+ ### Then run:
11
+ $ bundle install
10
12
 
11
- And then execute:
13
+ ## Create a new scrapey project
12
14
 
13
- $ bundle
15
+ $ scrapey my_scraper
14
16
 
15
- Or install it yourself as:
17
+ ## Examples
16
18
 
17
- $ gem install scrapey
19
+ ### CSV
18
20
 
19
- ## Examples
21
+ ```ruby
22
+ require 'scrapey'
23
+ # By default scrapey will save as 'output.csv'
24
+ # You can change this with:
25
+ # @output = 'mycsv.csv'
20
26
 
21
- ### Concurrent downloads
27
+ page = get 'http://www.alexa.com/topsites'
28
+ page.search('li.site-listing').each do |li|
29
+ save [li.at('a').text, li.at('.description').text, li.at('.stars')[:title]]
30
+ end
31
+ ```
32
+
33
+ ### Database
34
+ ```ruby
35
+ require 'scrapey'
36
+ # if you created a scrapey project you can fill out the database connection
37
+ # information in config/config.yml
38
+
39
+ tables 'Movie', 'Actor' # create ActiveRecord models
40
+
41
+ page = get 'http://www.imdb.com/movies-in-theaters/'
42
+
43
+ page.search('div.list_item').each do |div|
44
+ movie = Movie.find_or_create_by_title div.at('h4 a').text
45
+ div.search('span[@itemprop="actors"] a').each do |a|
46
+ actor = Actor.find_or_create_by_name a.text
47
+ end
48
+ end
49
+ ```
50
+
51
+ ### Caching
52
+ Scrapey can cache responses so that next time they don't hit the network
53
+ ```ruby
54
+ use_cache
55
+ ```
22
56
 
57
+ You can use redis for caching if you have lots of memory
58
+ ```ruby
59
+ require 'redis'
60
+ use_cache :redis => Redis.new
61
+ ```
62
+
63
+ ### Retries
64
+ Retry downloads on error a max of 3 times and sleep 30 seconds between retries.
65
+ ```ruby
66
+ get 'some_url', :retries => 3, :sleep => 30
67
+ ```
68
+ Or just handle errors in an on_error method (Scrapey will call it automatically if it's defined)
69
+ ```ruby
70
+ def on_error e, method, url, options, *args
71
+ puts "retrying #{url} again in 30 seconds..."
72
+ sleep 30
73
+ send method, url, options, *args
74
+ end
75
+ ```
76
+
77
+ ### Proxy switching
78
+
79
+ ```ruby
80
+ def on_error e, method, url, options, *args
81
+ host, port = @config['proxies'].sample.split(':')
82
+ set_proxy host, port.to_i
83
+ send method, url, options, *args
84
+ end
85
+
86
+ get 'some_throttled_website_url'
87
+ ```
88
+
89
+ ### Concurrent downloads
90
+ Scrapey will ensure that the callbacks are threadsafe
23
91
  ```ruby
24
92
  require 'scrapey'
25
93
  require 'scrapey/multi'
26
94
 
27
95
  fields 'url', 'title'
28
96
 
29
- def scrape url, response
97
+ def scrape url, response, header
30
98
  doc = Nokogiri::HTML response
31
- @items << {'url' => url, 'title' => doc.at('title').text}
99
+ save({'url' => url, 'title' => doc.at('title').text})
32
100
  end
33
101
 
34
- @items = []
35
- multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], 3, :scrape
36
- @items.each{|item| save item}
102
+ multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], :threads => 3, :callback => :scrape
37
103
  ```
data/examples/imdb.rb ADDED
@@ -0,0 +1,14 @@
1
+ require 'scrapey'
2
+ # if you created a scrapey project you can fill out the database connection
3
+ # information in config/config.yml
4
+
5
+ tables 'Movie', 'Actor' # create ActiveRecord models
6
+
7
+ page = get 'http://www.imdb.com/movies-in-theaters/'
8
+
9
+ page.search('div.list_item').each do |div|
10
+ movie = Movie.find_or_create_by_title div.at('h4 a').text
11
+ div.search('span[@itemprop="actors"] a').each do |a|
12
+ actor = Actor.find_or_create_by_name a.text
13
+ end
14
+ end
data/examples/multi.rb CHANGED
@@ -3,11 +3,9 @@ require 'scrapey/multi'
3
3
 
4
4
  fields 'url', 'title'
5
5
 
6
- def scrape url, response
6
+ def scrape url, response, header
7
7
  doc = Nokogiri::HTML response
8
- @items << {'url' => url, 'title' => doc.at('title').text}
8
+ save({'url' => url, 'title' => doc.at('title').text})
9
9
  end
10
10
 
11
- @items = []
12
- multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], 3, :scrape
13
- @items.each{|item| save item}
11
+ multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], :threads => 3, :callback => :scrape
@@ -0,0 +1,25 @@
1
+ require 'scrapey'
2
+ require 'scrapey/multi'
3
+
4
+ fields 'url', 'title'
5
+
6
+ def scrape url, response, header
7
+ doc = Nokogiri::HTML response
8
+ save({'url' => url, 'title' => doc.at('title').text})
9
+ puts "scraped #{url}."
10
+ end
11
+
12
+ options = {
13
+ :threads => 3,
14
+ :callback => :scrape,
15
+ :proxy => {:host => 'localhost', :port => 8888},
16
+ :head => {
17
+ "Accept" => "*/*",
18
+ #"User-Agent" => "Scrapey #{Scrapey::VERSION}",
19
+ "Keep-alive" => "true"
20
+ }
21
+ }
22
+
23
+ multi_get ['http://www.yahoo.com/', 'http://www.google.com/', 'http://www.bing.com/'], options
24
+
25
+ puts "this happens after all callbacks."
data/examples/redis.rb ADDED
@@ -0,0 +1,20 @@
1
+ require 'scrapey'
2
+ require 'redis'
3
+ require 'pry'
4
+
5
+ @debug = true
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+ use_cache :redis => Redis.new
14
+
15
+ url = 'http://www.yahoo.com/'
16
+ google = get url
17
+ puts google.at('title').text, (x = google.encoding rescue 'foo'), (y = google.body.encoding rescue 'foo'), '--'
18
+
19
+ google = get url
20
+ puts google.at('title').text, (x = google.encoding rescue 'foo'), (y = google.body.encoding rescue 'foo'), '--'
@@ -0,0 +1,10 @@
1
+ require 'scrapey'
2
+ require 'scrapey/multi'
3
+
4
+ fields 'url', 'status'
5
+
6
+ def scrape url, response, header
7
+ save({'url' => url, 'status' => header.status})
8
+ end
9
+
10
+ multi_head ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/', 'http://www.bing.com/404.html'], :threads => 4, :callback => :scrape
data/lib/scrapey.rb CHANGED
@@ -4,7 +4,6 @@ require 'json'
4
4
  require 'yaml'
5
5
 
6
6
  require "scrapey/scrapey"
7
- require "scrapey/version"
8
7
  require "scrapey/constants"
9
8
  require "scrapey/cache"
10
9
  require "scrapey/database"
@@ -13,7 +12,7 @@ include Scrapey
13
12
 
14
13
  # some defaults that I like
15
14
  @agent ||= Mechanize.new{|a| a.history.max_size = 10}
16
- @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
15
+ @agent.user_agent = "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
17
16
 
18
17
  # default output file
19
18
  @output = 'output.csv'
@@ -22,8 +21,4 @@ include Scrapey
22
21
  config_file = "#{BASEDIR}/config/config.yml"
23
22
  @config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
24
23
 
25
- if @config['database']
26
- ['active_record', @config['database']['adapter'], 'tzinfo', 'active_support/all'].each{|lib| require lib}
27
- ActiveRecord::Base.establish_connection(@config['database'])
28
- end
29
-
24
+ init_db if @config['database']
data/lib/scrapey/cache.rb CHANGED
@@ -1,22 +1,14 @@
1
1
  module Scrapey
2
- def use_cache
3
- @use_cache = true
4
- @config['cache_dir'] ||= "#{BASEDIR}/cache"
5
- FileUtils.mkdir_p @config['cache_dir']
6
- end
7
-
8
- def cache_filename url
9
- @config['cache_dir'] + "/" + Digest::MD5.hexdigest(url) + ".cache"
10
- end
11
2
 
12
- def load_cache url
13
- filename = cache_filename url
14
- return nil unless File::exists?(filename)
15
- puts "Loading #{url} from cache"
16
- Nokogiri::HTML File.read(filename)
3
+ def use_cache options = {}
4
+ @use_cache = true
5
+ if @redis = options.delete(:redis)
6
+ require 'scrapey/cache/redis'
7
+ else
8
+ require 'scrapey/cache/disk'
9
+ @config['cache_dir'] ||= "#{BASEDIR}/cache"
10
+ FileUtils.mkdir_p @config['cache_dir']
11
+ end
17
12
  end
18
13
 
19
- def save_cache url,doc
20
- File.open(cache_filename(url), 'wb') {|f| f.write(doc) }
21
- end
22
14
  end
@@ -0,0 +1,21 @@
1
+ module Scrapey
2
+
3
+ def cache_filename url
4
+ @config['cache_dir'] + "/" + Digest::MD5.hexdigest(url) + ".cache"
5
+ end
6
+
7
+ def is_cached? url
8
+ File.exists? cache_filename(url)
9
+ end
10
+
11
+ def load_cache url
12
+ filename = cache_filename url
13
+ return nil unless File::exists?(filename)
14
+ debug "Loading #{url} from cache"
15
+ Nokogiri::HTML Marshal.load(File.read(filename))
16
+ end
17
+
18
+ def save_cache url, doc, options = {}
19
+ File.open(cache_filename(url), "w") {|f| f << Marshal.dump(doc) }
20
+ end
21
+ end
@@ -0,0 +1,20 @@
1
+ require 'redis'
2
+
3
+ module Scrapey
4
+
5
+ def is_cached? url
6
+ !!@redis.get(url)
7
+ end
8
+
9
+ def load_cache url
10
+ debug "Loading #{url} from cache"
11
+ return nil unless str = @redis.get(url)
12
+ debug "found it"
13
+ #binding.pry
14
+ Nokogiri::HTML Marshal.load(str)
15
+ end
16
+
17
+ def save_cache url, body, options = {}
18
+ @redis.set url, Marshal.dump(body)
19
+ end
20
+ end
@@ -1,4 +1,6 @@
1
1
  module Scrapey
2
+ VERSION = "0.0.4"
2
3
  BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
3
- ENV['SSL_FILE'] = "#{Gem.dir}/gems/scrapey-#{Scrapey::VERSION}/ssl/cacert.pem"
4
+ URL = "https://github.com/monkeysuffrage/scrapey"
5
+ #ENV['SSL_FILE'] = "#{Gem.dir}/gems/scrapey-#{Scrapey::VERSION}/ssl/cacert.pem"
4
6
  end
@@ -1,13 +1,28 @@
1
1
  module Scrapey
2
+ def check_db_config
3
+ raise 'No database configured' unless @config['database']
4
+ end
5
+
2
6
  def tables *args
7
+ check_db_config
8
+ missing_tables = false
3
9
  args.each do |arg|
4
- Object.const_set(arg, Class.new(ActiveRecord::Base) {})
10
+ model = Object.const_set(arg, Class.new(ActiveRecord::Base) {})
11
+ missing_tables = true unless model.table_exists?
5
12
  end
13
+ schema = "#{BASEDIR}/src/schema.rb"
14
+ require schema if missing_tables && File.exists?(schema)
6
15
  end
7
16
 
8
17
  def truncate *args
18
+ check_db_config
9
19
  args.each do |arg|
10
20
  ActiveRecord::Base.connection.execute("TRUNCATE TABLE #{Object.const_get(arg).table_name}")
11
21
  end
12
22
  end
23
+
24
+ def init_db
25
+ ['active_record', @config['database']['adapter'], 'tzinfo', 'active_support/all', 'active_support/multibyte/chars'].each{|lib| require lib}
26
+ ActiveRecord::Base.establish_connection(@config['database'])
27
+ end
13
28
  end
data/lib/scrapey/multi.rb CHANGED
@@ -1,18 +1,25 @@
1
1
  require 'em-http-request'
2
2
 
3
3
  module Scrapey
4
- def multi_get all_urls, num_threads = 20, callback = :save_cache
5
- all_urls.each_slice(num_threads) do |urls|
4
+ def multi_get_or_post method, all_urls, options = {}
5
+ request_options = {:redirects => 10, :head => {"User-Agent" => "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"}.merge(options.delete(:head))}
6
+ threads = options[:threads] || 20
7
+ callback = options[:callback] || :save_cache
8
+ all_urls.reject!{|url| is_cached? url} if @use_cache
9
+ @lock = Mutex.new
10
+ all_urls.each_slice(threads) do |urls|
6
11
  next unless urls.size > 0
7
12
  EventMachine.run do
8
13
  multi = EventMachine::MultiRequest.new
9
14
  urls.each_with_index do |url, i|
10
- multi.add i, EventMachine::HttpRequest.new(url).get(:redirects => 10)
15
+ multi.add i, EventMachine::HttpRequest.new(url, options).send(method, request_options)
11
16
  end
12
17
  multi.callback do
13
18
  (0...multi.requests.length).each do |i|
14
19
  if multi.responses[:callback][i]
15
- send callback, urls[i], multi.responses[:callback][i].response
20
+ @lock.synchronize do
21
+ send callback, urls[i], multi.responses[:callback][i].response, multi.responses[:callback][i].response_header
22
+ end
16
23
  else
17
24
  puts "problem downloading #{urls[i]}!"
18
25
  end
@@ -22,4 +29,9 @@ module Scrapey
22
29
  end
23
30
  end
24
31
  end
32
+
33
+ def multi_get *args; multi_get_or_post 'get', *args; end
34
+ def multi_post *args; multi_get_or_post 'post', *args; end
35
+ def multi_head *args; multi_get_or_post 'head', *args; end
36
+
25
37
  end
@@ -15,7 +15,7 @@ module Scrapey
15
15
  return doc if doc
16
16
 
17
17
  page = agent.send *new_args
18
- save_cache(url, page.body) if @use_cache
18
+ save_cache(url, page.root.to_s) if @use_cache
19
19
 
20
20
  #exit if Object.const_defined? :Ocra
21
21
  page
@@ -64,6 +64,10 @@ module Scrapey
64
64
  false
65
65
  end
66
66
 
67
+ def debug msg
68
+ puts msg if @debug
69
+ end
70
+
67
71
  def ts
68
72
  Time.now.to_i.to_s
69
73
  end
@@ -7,9 +7,14 @@ module Scrapey
7
7
  template = File.expand_path('../../../template', __FILE__)
8
8
  FileUtils.cp_r template, name
9
9
  Dir.chdir name
10
+
10
11
  Dir.glob(['*/*.*', '*.*']).grep(/template/).each do |fn|
11
12
  FileUtils.mv fn, fn.gsub('template', name)
12
13
  end
14
+ buf = File.read "#{name}.iss"
15
+ buf.gsub! /Template/, "rightmove_rentals".tr('_', ' ').gsub(/\w+/){|x| x.capitalize}
16
+ buf.gsub! /template/, name
17
+ File.open("#{name}.iss", 'w'){|f| f << buf}
13
18
 
14
19
  end
15
20
  end
data/scrapey.gemspec CHANGED
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
- require File.expand_path('../lib/scrapey/version', __FILE__)
2
+ require File.expand_path('../lib/scrapey/constants', __FILE__)
3
3
 
4
4
  Gem::Specification.new do |gem|
5
5
  gem.authors = ["P Guardiario"]
@@ -15,7 +15,7 @@ Gem::Specification.new do |gem|
15
15
  gem.name = "scrapey"
16
16
  gem.require_paths = ["lib"]
17
17
  gem.version = Scrapey::VERSION
18
- gem.add_dependency(%q<mechanize>, ["~> 2.5.0"])
18
+ gem.add_dependency(%q<mechanize>)
19
19
  gem.add_dependency(%q<json>, ["~> 1.7.0"])
20
20
  end
21
21
 
data/template/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org'
2
+ gem "scrapey", :git => 'https://github.com/monkeysuffrage/scrapey.git'
data/template/Rakefile ADDED
@@ -0,0 +1,22 @@
1
+ #!/usr/bin/env rake
2
+ #Rake.application.options.trace = true
3
+ require 'fileutils'
4
+
5
+ def name
6
+ @name ||= Dir.pwd[/[^\/]+$/]
7
+ end
8
+
9
+ desc "Build project with ocra"
10
+ task 'build' do
11
+ system "ocra --icon icon.ico src/#{name}.rb --no-lzma --chdir-first --no-autoload --innosetup #{name}.iss"
12
+ end
13
+
14
+ desc "Copy installer to dropbox folder"
15
+ task 'dropbox' do
16
+ raise 'no dropbox folder!' unless ENV['DROPBOX']
17
+ folder = [ENV['DROPBOX'], name].join('/').squeeze('/')
18
+ FileUtils.mkdir(folder) unless File.exists?(folder)
19
+ FileUtils.cp "Output/setup.exe", folder
20
+ url = [ENV['DROPBOX_public_url'], name, 'setup.exe'].join('/').squeeze('/')
21
+ puts "uploaded to #{url}"
22
+ end
data/template/icon.ico ADDED
Binary file
@@ -0,0 +1,16 @@
1
+ =begin
2
+ # put table schemas here. this will be included if the table is not found.
3
+ ActiveRecord::Schema.define do
4
+ create_table "items" do |t|
5
+ t.string "string_field"
6
+ t.text "text_field"
7
+ t.integer "number_field"
8
+ t.boolean "boolean_field"
9
+ t.float "float_field"
10
+ t.date "created_at"
11
+ t.datetime "created_on"
12
+ end
13
+
14
+ add_index "items", ["number_field"], :name => "number_field_idx", :unique => true
15
+ end
16
+ =end
@@ -1,4 +1,6 @@
1
1
  require 'scrapey'
2
+ # require 'scrapey/multi' #=> requires em-http-request
2
3
 
3
- # customizations...
4
+ # sample customizations...
5
+ # @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
4
6
  # @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
@@ -0,0 +1,12 @@
1
+ [Setup]
2
+ AppName=Template Scraper
3
+ AppVersion=1.0
4
+ DefaultDirName={localappdata}\Template Scraper
5
+ DefaultGroupName=Template Scraper
6
+
7
+ [Files]
8
+ Source: "config\*"; DestDir: "{app}\config";
9
+ Source: "src\*"; DestDir: "{app}\src";
10
+
11
+ [Icons]
12
+ Name: "{group}\Template Scraper"; Filename: "{app}\template.exe"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapey
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,24 +9,24 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-27 00:00:00.000000000 Z
12
+ date: 2012-08-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
16
  requirement: !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
- - - ~>
19
+ - - ! '>='
20
20
  - !ruby/object:Gem::Version
21
- version: 2.5.0
21
+ version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
24
  version_requirements: !ruby/object:Gem::Requirement
25
25
  none: false
26
26
  requirements:
27
- - - ~>
27
+ - - ! '>='
28
28
  - !ruby/object:Gem::Version
29
- version: 2.5.0
29
+ version: '0'
30
30
  - !ruby/object:Gem::Dependency
31
31
  name: json
32
32
  requirement: !ruby/object:Gem::Requirement
@@ -56,18 +56,28 @@ files:
56
56
  - README.md
57
57
  - Rakefile
58
58
  - bin/scrapey
59
+ - examples/imdb.rb
59
60
  - examples/multi.rb
61
+ - examples/multi2.rb
62
+ - examples/redis.rb
63
+ - examples/status_check.rb
64
+ - lib/scrapey/cache/disk.rb
65
+ - lib/scrapey/cache/redis.rb
60
66
  - lib/scrapey/cache.rb
61
67
  - lib/scrapey/constants.rb
62
68
  - lib/scrapey/database.rb
63
69
  - lib/scrapey/multi.rb
64
70
  - lib/scrapey/scrapey.rb
65
71
  - lib/scrapey/template.rb
66
- - lib/scrapey/version.rb
67
72
  - lib/scrapey.rb
68
73
  - scrapey.gemspec
69
74
  - template/config/config.yml
75
+ - template/Gemfile
76
+ - template/icon.ico
77
+ - template/Rakefile
78
+ - template/src/schema.rb
70
79
  - template/src/template.rb
80
+ - template/template.iss
71
81
  homepage: ''
72
82
  licenses: []
73
83
  post_install_message:
@@ -88,7 +98,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
88
98
  version: '0'
89
99
  requirements: []
90
100
  rubyforge_project:
91
- rubygems_version: 1.8.23
101
+ rubygems_version: 1.8.24
92
102
  signing_key:
93
103
  specification_version: 3
94
104
  summary: A simple scraping framework
@@ -1,3 +0,0 @@
1
- module Scrapey
2
- VERSION = "0.0.3"
3
- end