scrapey 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,37 +1,103 @@
1
1
  # Scrapey
2
2
 
3
- TODO: Write a gem description
3
+ A simple framework for solving common scraping problems
4
4
 
5
- ## Installation
5
+ ## Install latest version
6
+ ### Add to Gemfile
6
7
 
7
- Add this line to your application's Gemfile:
8
+ gem "scrapey", :git => 'https://github.com/monkeysuffrage/scrapey.git'
8
9
 
9
- gem 'scrapey'
10
+ ### Then run:
11
+ $ bundle install
10
12
 
11
- And then execute:
13
+ ## Create a new scrapey project
12
14
 
13
- $ bundle
15
+ $ scrapey my_scraper
14
16
 
15
- Or install it yourself as:
17
+ ## Examples
16
18
 
17
- $ gem install scrapey
19
+ ### CSV
18
20
 
19
- ## Examples
21
+ ```ruby
22
+ require 'scrapey'
23
+ # By default scrapey will save as 'output.csv'
24
+ # You can change this with:
25
+ # @output = 'mycsv.csv'
20
26
 
21
- ### Concurrent downloads
27
+ page = get 'http://www.alexa.com/topsites'
28
+ page.search('li.site-listing').each do |li|
29
+ save [li.at('a').text, li.at('.description').text, li.at('.stars')[:title]]
30
+ end
31
+ ```
32
+
33
+ ### Database
34
+ ```ruby
35
+ require 'scrapey'
36
+ # if you created a scrapey project you can fill out the database connection
37
+ # information in config/config.yml
38
+
39
+ tables 'Movie', 'Actor' # create ActiveRecord models
40
+
41
+ page = get 'http://www.imdb.com/movies-in-theaters/'
42
+
43
+ page.search('div.list_item').each do |div|
44
+ movie = Movie.find_or_create_by_title div.at('h4 a').text
45
+ div.search('span[@itemprop="actors"] a').each do |a|
46
+ actor = Actor.find_or_create_by_name a.text
47
+ end
48
+ end
49
+ ```
50
+
51
+ ### Caching
52
+ Scrapey can cache responses so that repeated requests for the same URL don't hit the network
53
+ ```ruby
54
+ use_cache
55
+ ```
22
56
 
57
+ You can use Redis for caching if you have lots of memory
58
+ ```ruby
59
+ require 'redis'
60
+ use_cache :redis => Redis.new
61
+ ```
62
+
63
+ ### Retries
64
+ Retry failed downloads up to 3 times, sleeping 30 seconds between retries.
65
+ ```ruby
66
+ get 'some_url', :retries => 3, :sleep => 30
67
+ ```
68
+ Or just handle errors in an on_error method (Scrapey will call it automatically if it's defined)
69
+ ```ruby
70
+ def on_error e, method, url, options, *args
71
+ puts "retrying #{url} again in 30 seconds..."
72
+ sleep 30
73
+ send method, url, options, *args
74
+ end
75
+ ```
76
+
77
+ ### Proxy switching
78
+
79
+ ```ruby
80
+ def on_error e, method, url, options, *args
81
+ host, port = @config['proxies'].sample.split(':')
82
+ set_proxy host, port.to_i
83
+ send method, url, options, *args
84
+ end
85
+
86
+ get 'some_throttled_website_url'
87
+ ```
88
+
89
+ ### Concurrent downloads
90
+ Scrapey will ensure that the callbacks are thread-safe
23
91
  ```ruby
24
92
  require 'scrapey'
25
93
  require 'scrapey/multi'
26
94
 
27
95
  fields 'url', 'title'
28
96
 
29
- def scrape url, response
97
+ def scrape url, response, header
30
98
  doc = Nokogiri::HTML response
31
- @items << {'url' => url, 'title' => doc.at('title').text}
99
+ save({'url' => url, 'title' => doc.at('title').text})
32
100
  end
33
101
 
34
- @items = []
35
- multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], 3, :scrape
36
- @items.each{|item| save item}
102
+ multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], :threads => 3, :callback => :scrape
37
103
  ```
data/examples/imdb.rb ADDED
@@ -0,0 +1,14 @@
1
+ require 'scrapey'
2
+ # if you created a scrapey project you can fill out the database connection
3
+ # information in config/config.yml
4
+
5
+ tables 'Movie', 'Actor' # create ActiveRecord models
6
+
7
+ page = get 'http://www.imdb.com/movies-in-theaters/'
8
+
9
+ page.search('div.list_item').each do |div|
10
+ movie = Movie.find_or_create_by_title div.at('h4 a').text
11
+ div.search('span[@itemprop="actors"] a').each do |a|
12
+ actor = Actor.find_or_create_by_name a.text
13
+ end
14
+ end
data/examples/multi.rb CHANGED
@@ -3,11 +3,9 @@ require 'scrapey/multi'
3
3
 
4
4
  fields 'url', 'title'
5
5
 
6
- def scrape url, response
6
+ def scrape url, response, header
7
7
  doc = Nokogiri::HTML response
8
- @items << {'url' => url, 'title' => doc.at('title').text}
8
+ save({'url' => url, 'title' => doc.at('title').text})
9
9
  end
10
10
 
11
- @items = []
12
- multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], 3, :scrape
13
- @items.each{|item| save item}
11
+ multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], :threads => 3, :callback => :scrape
@@ -0,0 +1,25 @@
1
+ require 'scrapey'
2
+ require 'scrapey/multi'
3
+
4
+ fields 'url', 'title'
5
+
6
+ def scrape url, response, header
7
+ doc = Nokogiri::HTML response
8
+ save({'url' => url, 'title' => doc.at('title').text})
9
+ puts "scraped #{url}."
10
+ end
11
+
12
+ options = {
13
+ :threads => 3,
14
+ :callback => :scrape,
15
+ :proxy => {:host => 'localhost', :port => 8888},
16
+ :head => {
17
+ "Accept" => "*/*",
18
+ #"User-Agent" => "Scrapey #{Scrapey::VERSION}",
19
+ "Keep-alive" => "true"
20
+ }
21
+ }
22
+
23
+ multi_get ['http://www.yahoo.com/', 'http://www.google.com/', 'http://www.bing.com/'], options
24
+
25
+ puts "this happens after all callbacks."
data/examples/redis.rb ADDED
@@ -0,0 +1,20 @@
1
+ require 'scrapey'
2
+ require 'redis'
3
+ require 'pry'
4
+
5
+ @debug = true
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+ use_cache :redis => Redis.new
14
+
15
+ url = 'http://www.yahoo.com/'
16
+ google = get url
17
+ puts google.at('title').text, (x = google.encoding rescue 'foo'), (y = google.body.encoding rescue 'foo'), '--'
18
+
19
+ google = get url
20
+ puts google.at('title').text, (x = google.encoding rescue 'foo'), (y = google.body.encoding rescue 'foo'), '--'
@@ -0,0 +1,10 @@
1
+ require 'scrapey'
2
+ require 'scrapey/multi'
3
+
4
+ fields 'url', 'status'
5
+
6
+ def scrape url, response, header
7
+ save({'url' => url, 'status' => header.status})
8
+ end
9
+
10
+ multi_head ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/', 'http://www.bing.com/404.html'], :threads => 4, :callback => :scrape
data/lib/scrapey.rb CHANGED
@@ -4,7 +4,6 @@ require 'json'
4
4
  require 'yaml'
5
5
 
6
6
  require "scrapey/scrapey"
7
- require "scrapey/version"
8
7
  require "scrapey/constants"
9
8
  require "scrapey/cache"
10
9
  require "scrapey/database"
@@ -13,7 +12,7 @@ include Scrapey
13
12
 
14
13
  # some defaults that I like
15
14
  @agent ||= Mechanize.new{|a| a.history.max_size = 10}
16
- @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
15
+ @agent.user_agent = "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
17
16
 
18
17
  # default output file
19
18
  @output = 'output.csv'
@@ -22,8 +21,4 @@ include Scrapey
22
21
  config_file = "#{BASEDIR}/config/config.yml"
23
22
  @config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
24
23
 
25
- if @config['database']
26
- ['active_record', @config['database']['adapter'], 'tzinfo', 'active_support/all'].each{|lib| require lib}
27
- ActiveRecord::Base.establish_connection(@config['database'])
28
- end
29
-
24
+ init_db if @config['database']
data/lib/scrapey/cache.rb CHANGED
@@ -1,22 +1,14 @@
1
1
  module Scrapey
2
- def use_cache
3
- @use_cache = true
4
- @config['cache_dir'] ||= "#{BASEDIR}/cache"
5
- FileUtils.mkdir_p @config['cache_dir']
6
- end
7
-
8
- def cache_filename url
9
- @config['cache_dir'] + "/" + Digest::MD5.hexdigest(url) + ".cache"
10
- end
11
2
 
12
- def load_cache url
13
- filename = cache_filename url
14
- return nil unless File::exists?(filename)
15
- puts "Loading #{filename} from cache"
16
- Nokogiri::HTML File.read(filename)
3
+ def use_cache options = {}
4
+ @use_cache = true
5
+ if @redis = options.delete(:redis)
6
+ require 'scrapey/cache/redis'
7
+ else
8
+ require 'scrapey/cache/disk'
9
+ @config['cache_dir'] ||= "#{BASEDIR}/cache"
10
+ FileUtils.mkdir_p @config['cache_dir']
11
+ end
17
12
  end
18
13
 
19
- def save_cache url,doc
20
- File.open(cache_filename(url), 'wb') {|f| f.write(doc) }
21
- end
22
14
  end
@@ -0,0 +1,21 @@
1
+ module Scrapey
2
+
3
+ def cache_filename url
4
+ @config['cache_dir'] + "/" + Digest::MD5.hexdigest(url) + ".cache"
5
+ end
6
+
7
+ def is_cached? url
8
+ File.exists? cache_filename(url)
9
+ end
10
+
11
+ def load_cache url
12
+ filename = cache_filename url
13
+ return nil unless File::exists?(filename)
14
+ debug "Loading #{filename} from cache"
15
+ Nokogiri::HTML Marshal.load(File.read(filename))
16
+ end
17
+
18
+ def save_cache url, doc, options = {}
19
+ File.open(cache_filename(url), "w") {|f| f << Marshal.dump(doc) }
20
+ end
21
+ end
@@ -0,0 +1,20 @@
1
+ require 'redis'
2
+
3
+ module Scrapey
4
+
5
+ def is_cached? url
6
+ !!@redis.get(url)
7
+ end
8
+
9
+ def load_cache url
10
+ debug "Loading #{url} from cache"
11
+ return nil unless str = @redis.get(url)
12
+ debug "found it"
13
+ #binding.pry
14
+ Nokogiri::HTML Marshal.load(str)
15
+ end
16
+
17
+ def save_cache url, body, options = {}
18
+ @redis.set url, Marshal.dump(body)
19
+ end
20
+ end
@@ -1,4 +1,6 @@
1
1
  module Scrapey
2
+ VERSION = "0.0.4"
2
3
  BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
3
- ENV['SSL_FILE'] = "#{Gem.dir}/gems/scrapey-#{Scrapey::VERSION}/ssl/cacert.pem"
4
+ URL = "https://github.com/monkeysuffrage/scrapey"
5
+ #ENV['SSL_FILE'] = "#{Gem.dir}/gems/scrapey-#{Scrapey::VERSION}/ssl/cacert.pem"
4
6
  end
@@ -1,13 +1,28 @@
1
1
  module Scrapey
2
+ def check_db_config
3
+ raise 'No database configured' unless @config['database']
4
+ end
5
+
2
6
  def tables *args
7
+ check_db_config
8
+ missing_tables = false
3
9
  args.each do |arg|
4
- Object.const_set(arg, Class.new(ActiveRecord::Base) {})
10
+ model = Object.const_set(arg, Class.new(ActiveRecord::Base) {})
11
+ missing_tables = true unless model.table_exists?
5
12
  end
13
+ schema = "#{BASEDIR}/src/schema.rb"
14
+ require schema if missing_tables && File.exists?(schema)
6
15
  end
7
16
 
8
17
  def truncate *args
18
+ check_db_config
9
19
  args.each do |arg|
10
20
  ActiveRecord::Base.connection.execute("TRUNCATE TABLE #{Object.const_get(arg).table_name}")
11
21
  end
12
22
  end
23
+
24
+ def init_db
25
+ ['active_record', @config['database']['adapter'], 'tzinfo', 'active_support/all', 'active_support/multibyte/chars'].each{|lib| require lib}
26
+ ActiveRecord::Base.establish_connection(@config['database'])
27
+ end
13
28
  end
data/lib/scrapey/multi.rb CHANGED
@@ -1,18 +1,25 @@
1
1
  require 'em-http-request'
2
2
 
3
3
  module Scrapey
4
- def multi_get all_urls, num_threads = 20, callback = :save_cache
5
- all_urls.each_slice(num_threads) do |urls|
4
+ def multi_get_or_post method, all_urls, options = {}
5
+ request_options = {:redirects => 10, :head => {"User-Agent" => "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"}.merge(options.delete(:head))}
6
+ threads = options[:threads] || 20
7
+ callback = options[:callback] || :save_cache
8
+ all_urls.reject!{|url| is_cached? url} if @use_cache
9
+ @lock = Mutex.new
10
+ all_urls.each_slice(threads) do |urls|
6
11
  next unless urls.size > 0
7
12
  EventMachine.run do
8
13
  multi = EventMachine::MultiRequest.new
9
14
  urls.each_with_index do |url, i|
10
- multi.add i, EventMachine::HttpRequest.new(url).get(:redirects => 10)
15
+ multi.add i, EventMachine::HttpRequest.new(url, options).send(method, request_options)
11
16
  end
12
17
  multi.callback do
13
18
  (0...multi.requests.length).each do |i|
14
19
  if multi.responses[:callback][i]
15
- send callback, urls[i], multi.responses[:callback][i].response
20
+ @lock.synchronize do
21
+ send callback, urls[i], multi.responses[:callback][i].response, multi.responses[:callback][i].response_header
22
+ end
16
23
  else
17
24
  puts "problem downloading #{urls[i]}!"
18
25
  end
@@ -22,4 +29,9 @@ module Scrapey
22
29
  end
23
30
  end
24
31
  end
32
+
33
+ def multi_get *args; multi_get_or_post 'get', *args; end
34
+ def multi_post *args; multi_get_or_post 'post', *args; end
35
+ def multi_head *args; multi_get_or_post 'head', *args; end
36
+
25
37
  end
@@ -15,7 +15,7 @@ module Scrapey
15
15
  return doc if doc
16
16
 
17
17
  page = agent.send *new_args
18
- save_cache(url, page.body) if @use_cache
18
+ save_cache(url, page.root.to_s) if @use_cache
19
19
 
20
20
  #exit if Object.const_defined? :Ocra
21
21
  page
@@ -64,6 +64,10 @@ module Scrapey
64
64
  false
65
65
  end
66
66
 
67
+ def debug msg
68
+ puts msg if @debug
69
+ end
70
+
67
71
  def ts
68
72
  Time.now.to_i.to_s
69
73
  end
@@ -7,9 +7,14 @@ module Scrapey
7
7
  template = File.expand_path('../../../template', __FILE__)
8
8
  FileUtils.cp_r template, name
9
9
  Dir.chdir name
10
+
10
11
  Dir.glob(['*/*.*', '*.*']).grep(/template/).each do |fn|
11
12
  FileUtils.mv fn, fn.gsub('template', name)
12
13
  end
14
+ buf = File.read "#{name}.iss"
15
+ buf.gsub! /Template/, "rightmove_rentals".tr('_', ' ').gsub(/\w+/){|x| x.capitalize}
16
+ buf.gsub! /template/, name
17
+ File.open("#{name}.iss", 'w'){|f| f << buf}
13
18
 
14
19
  end
15
20
  end
data/scrapey.gemspec CHANGED
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
- require File.expand_path('../lib/scrapey/version', __FILE__)
2
+ require File.expand_path('../lib/scrapey/constants', __FILE__)
3
3
 
4
4
  Gem::Specification.new do |gem|
5
5
  gem.authors = ["P Guardiario"]
@@ -15,7 +15,7 @@ Gem::Specification.new do |gem|
15
15
  gem.name = "scrapey"
16
16
  gem.require_paths = ["lib"]
17
17
  gem.version = Scrapey::VERSION
18
- gem.add_dependency(%q<mechanize>, ["~> 2.5.0"])
18
+ gem.add_dependency(%q<mechanize>)
19
19
  gem.add_dependency(%q<json>, ["~> 1.7.0"])
20
20
  end
21
21
 
data/template/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org'
2
+ gem "scrapey", :git => 'https://github.com/monkeysuffrage/scrapey.git'
data/template/Rakefile ADDED
@@ -0,0 +1,22 @@
1
+ #!/usr/bin/env rake
2
+ #Rake.application.options.trace = true
3
+ require 'fileutils'
4
+
5
+ def name
6
+ @name ||= Dir.pwd[/[^\/]+$/]
7
+ end
8
+
9
+ desc "Build project with ocra"
10
+ task 'build' do
11
+ system "ocra --icon icon.ico src/#{name}.rb --no-lzma --chdir-first --no-autoload --innosetup #{name}.iss"
12
+ end
13
+
14
+ desc "Copy installer to dropbox folder"
15
+ task 'dropbox' do
16
+ raise 'no dropbox folder!' unless ENV['DROPBOX']
17
+ folder = [ENV['DROPBOX'], name].join('/').squeeze('/')
18
+ FileUtils.mkdir(folder) unless File.exists?(folder)
19
+ FileUtils.cp "Output/setup.exe", folder
20
+ url = [ENV['DROPBOX_public_url'], name, 'setup.exe'].join('/').squeeze('/')
21
+ puts "uploaded to #{url}"
22
+ end
data/template/icon.ico ADDED
Binary file
@@ -0,0 +1,16 @@
1
+ =begin
2
+ # put table schemas here. this will be included if the table is not found.
3
+ ActiveRecord::Schema.define do
4
+ create_table "items" do |t|
5
+ t.string "string_field"
6
+ t.text "text_field"
7
+ t.integer "number_field"
8
+ t.boolean "boolean_field"
9
+ t.float "float_field"
10
+ t.date "created_at"
11
+ t.datetime "created_on"
12
+ end
13
+
14
+ add_index "items", ["number_field"], :name => "number_field_idx", :unique => true
15
+ end
16
+ =end
@@ -1,4 +1,6 @@
1
1
  require 'scrapey'
2
+ # require 'scrapey/multi' #=> requires em-http-request
2
3
 
3
- # customizations...
4
+ # sample customizations...
5
+ # @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
4
6
  # @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
@@ -0,0 +1,12 @@
1
+ [Setup]
2
+ AppName=Template Scraper
3
+ AppVersion=1.0
4
+ DefaultDirName={localappdata}\Template Scraper
5
+ DefaultGroupName=Template Scraper
6
+
7
+ [Files]
8
+ Source: "config\*"; DestDir: "{app}\config";
9
+ Source: "src\*"; DestDir: "{app}\src";
10
+
11
+ [Icons]
12
+ Name: "{group}\Template Scraper"; Filename: "{app}\template.exe"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapey
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,24 +9,24 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-27 00:00:00.000000000 Z
12
+ date: 2012-08-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
16
  requirement: !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
- - - ~>
19
+ - - ! '>='
20
20
  - !ruby/object:Gem::Version
21
- version: 2.5.0
21
+ version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
24
  version_requirements: !ruby/object:Gem::Requirement
25
25
  none: false
26
26
  requirements:
27
- - - ~>
27
+ - - ! '>='
28
28
  - !ruby/object:Gem::Version
29
- version: 2.5.0
29
+ version: '0'
30
30
  - !ruby/object:Gem::Dependency
31
31
  name: json
32
32
  requirement: !ruby/object:Gem::Requirement
@@ -56,18 +56,28 @@ files:
56
56
  - README.md
57
57
  - Rakefile
58
58
  - bin/scrapey
59
+ - examples/imdb.rb
59
60
  - examples/multi.rb
61
+ - examples/multi2.rb
62
+ - examples/redis.rb
63
+ - examples/status_check.rb
64
+ - lib/scrapey/cache/disk.rb
65
+ - lib/scrapey/cache/redis.rb
60
66
  - lib/scrapey/cache.rb
61
67
  - lib/scrapey/constants.rb
62
68
  - lib/scrapey/database.rb
63
69
  - lib/scrapey/multi.rb
64
70
  - lib/scrapey/scrapey.rb
65
71
  - lib/scrapey/template.rb
66
- - lib/scrapey/version.rb
67
72
  - lib/scrapey.rb
68
73
  - scrapey.gemspec
69
74
  - template/config/config.yml
75
+ - template/Gemfile
76
+ - template/icon.ico
77
+ - template/Rakefile
78
+ - template/src/schema.rb
70
79
  - template/src/template.rb
80
+ - template/template.iss
71
81
  homepage: ''
72
82
  licenses: []
73
83
  post_install_message:
@@ -88,7 +98,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
88
98
  version: '0'
89
99
  requirements: []
90
100
  rubyforge_project:
91
- rubygems_version: 1.8.23
101
+ rubygems_version: 1.8.24
92
102
  signing_key:
93
103
  specification_version: 3
94
104
  summary: A simple scraping framework
@@ -1,3 +0,0 @@
1
- module Scrapey
2
- VERSION = "0.0.3"
3
- end