scrapey 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -1,37 +1,103 @@
1
1
  # Scrapey
2
2
 
3
- TODO: Write a gem description
3
+ A simple framework for solving common scraping problems
4
4
 
5
- ## Installation
5
+ ## Install latest version
6
+ ### Add to Gemfile
6
7
 
7
- Add this line to your application's Gemfile:
8
+ gem "scrapey", :git => 'https://github.com/monkeysuffrage/scrapey.git'
8
9
 
9
- gem 'scrapey'
10
+ ### Then run:
11
+ $ bundle install
10
12
 
11
- And then execute:
13
+ ## Create a new scrapey project
12
14
 
13
- $ bundle
15
+ $ scrapey my_scraper
14
16
 
15
- Or install it yourself as:
17
+ ## Examples
16
18
 
17
- $ gem install scrapey
19
+ ### CSV
18
20
 
19
- ## Examples
21
+ ```ruby
22
+ require 'scrapey'
23
+ # By default scrapey will save as 'output.csv'
24
+ # You can change this with:
25
+ # @output = 'mycsv.csv'
20
26
 
21
- ### Concurrent downloads
27
+ page = get 'http://www.alexa.com/topsites'
28
+ page.search('li.site-listing').each do |li|
29
+ save [li.at('a').text, li.at('.description').text, li.at('.stars')[:title]]
30
+ end
31
+ ```
32
+
33
+ ### Database
34
+ ```ruby
35
+ require 'scrapey'
36
+ # if you created a scrapey project you can fill out the database connection
37
+ # information in config/config.yml
38
+
39
+ tables 'Movie', 'Actor' # create ActiveRecord models
40
+
41
+ page = get 'http://www.imdb.com/movies-in-theaters/'
42
+
43
+ page.search('div.list_item').each do |div|
44
+ movie = Movie.find_or_create_by_title div.at('h4 a').text
45
+ div.search('span[@itemprop="actors"] a').each do |a|
46
+ actor = Actor.find_or_create_by_name a.text
47
+ end
48
+ end
49
+ ```
50
+
51
+ ### Caching
52
+ Scrapey can cache responses so that next time they don't hit the network
53
+ ```ruby
54
+ use_cache
55
+ ```
22
56
 
57
+ You can use redis for caching if you have lots of memory
58
+ ```ruby
59
+ require 'redis'
60
+ use_cache :redis => Redis.new
61
+ ```
62
+
63
+ ### Retries
64
+ Retry downloads on error a max of 3 times and sleep 30 seconds between retries.
65
+ ```ruby
66
+ get 'some_url', :retries => 3, :sleep => 30
67
+ ```
68
+ Or just handle errors in an on_error method (Scrapey will call it automatically if it's defined)
69
+ ```ruby
70
+ def on_error e, method, url, options, *args
71
+ puts "retrying #{url} again in 30 seconds..."
72
+ sleep 30
73
+ send method, url, options, *args
74
+ end
75
+ ```
76
+
77
+ ### Proxy switching
78
+
79
+ ```ruby
80
+ def on_error e, method, url, options, *args
81
+ host, port = @config['proxies'].sample.split(':')
82
+ set_proxy host, port.to_i
83
+ send method, url, options, *args
84
+ end
85
+
86
+ get 'some_throttled_website_url'
87
+ ```
88
+
89
+ ### Concurrent downloads
90
+ Scrapey will ensure that the callbacks are threadsafe
23
91
  ```ruby
24
92
  require 'scrapey'
25
93
  require 'scrapey/multi'
26
94
 
27
95
  fields 'url', 'title'
28
96
 
29
- def scrape url, response
97
+ def scrape url, response, header
30
98
  doc = Nokogiri::HTML response
31
- @items << {'url' => url, 'title' => doc.at('title').text}
99
+ save({'url' => url, 'title' => doc.at('title').text})
32
100
  end
33
101
 
34
- @items = []
35
- multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], 3, :scrape
36
- @items.each{|item| save item}
102
+ multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], :threads => 3, :callback => :scrape
37
103
  ```
data/examples/imdb.rb ADDED
@@ -0,0 +1,14 @@
1
+ require 'scrapey'
2
+ # if you created a scrapey project you can fill out the database connection
3
+ # information in config/config.yml
4
+
5
+ tables 'Movie', 'Actor' # create ActiveRecord models
6
+
7
+ page = get 'http://www.imdb.com/movies-in-theaters/'
8
+
9
+ page.search('div.list_item').each do |div|
10
+ movie = Movie.find_or_create_by_title div.at('h4 a').text
11
+ div.search('span[@itemprop="actors"] a').each do |a|
12
+ actor = Actor.find_or_create_by_name a.text
13
+ end
14
+ end
data/examples/multi.rb CHANGED
@@ -3,11 +3,9 @@ require 'scrapey/multi'
3
3
 
4
4
  fields 'url', 'title'
5
5
 
6
- def scrape url, response
6
+ def scrape url, response, header
7
7
  doc = Nokogiri::HTML response
8
- @items << {'url' => url, 'title' => doc.at('title').text}
8
+ save({'url' => url, 'title' => doc.at('title').text})
9
9
  end
10
10
 
11
- @items = []
12
- multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], 3, :scrape
13
- @items.each{|item| save item}
11
+ multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], :threads => 3, :callback => :scrape
@@ -0,0 +1,25 @@
1
+ require 'scrapey'
2
+ require 'scrapey/multi'
3
+
4
+ fields 'url', 'title'
5
+
6
+ def scrape url, response, header
7
+ doc = Nokogiri::HTML response
8
+ save({'url' => url, 'title' => doc.at('title').text})
9
+ puts "scraped #{url}."
10
+ end
11
+
12
+ options = {
13
+ :threads => 3,
14
+ :callback => :scrape,
15
+ :proxy => {:host => 'localhost', :port => 8888},
16
+ :head => {
17
+ "Accept" => "*/*",
18
+ #"User-Agent" => "Scrapey #{Scrapey::VERSION}",
19
+ "Keep-alive" => "true"
20
+ }
21
+ }
22
+
23
+ multi_get ['http://www.yahoo.com/', 'http://www.google.com/', 'http://www.bing.com/'], options
24
+
25
+ puts "this happens after all callbacks."
data/examples/redis.rb ADDED
@@ -0,0 +1,20 @@
1
+ require 'scrapey'
2
+ require 'redis'
3
+ require 'pry'
4
+
5
+ @debug = true
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+ use_cache :redis => Redis.new
14
+
15
+ url = 'http://www.yahoo.com/'
16
+ google = get url
17
+ puts google.at('title').text, (x = google.encoding rescue 'foo'), (y = google.body.encoding rescue 'foo'), '--'
18
+
19
+ google = get url
20
+ puts google.at('title').text, (x = google.encoding rescue 'foo'), (y = google.body.encoding rescue 'foo'), '--'
@@ -0,0 +1,10 @@
1
+ require 'scrapey'
2
+ require 'scrapey/multi'
3
+
4
+ fields 'url', 'status'
5
+
6
+ def scrape url, response, header
7
+ save({'url' => url, 'status' => header.status})
8
+ end
9
+
10
+ multi_head ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/', 'http://www.bing.com/404.html'], :threads => 4, :callback => :scrape
data/lib/scrapey.rb CHANGED
@@ -4,7 +4,6 @@ require 'json'
4
4
  require 'yaml'
5
5
 
6
6
  require "scrapey/scrapey"
7
- require "scrapey/version"
8
7
  require "scrapey/constants"
9
8
  require "scrapey/cache"
10
9
  require "scrapey/database"
@@ -13,7 +12,7 @@ include Scrapey
13
12
 
14
13
  # some defaults that I like
15
14
  @agent ||= Mechanize.new{|a| a.history.max_size = 10}
16
- @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
15
+ @agent.user_agent = "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
17
16
 
18
17
  # default output file
19
18
  @output = 'output.csv'
@@ -22,8 +21,4 @@ include Scrapey
22
21
  config_file = "#{BASEDIR}/config/config.yml"
23
22
  @config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
24
23
 
25
- if @config['database']
26
- ['active_record', @config['database']['adapter'], 'tzinfo', 'active_support/all'].each{|lib| require lib}
27
- ActiveRecord::Base.establish_connection(@config['database'])
28
- end
29
-
24
+ init_db if @config['database']
data/lib/scrapey/cache.rb CHANGED
@@ -1,22 +1,14 @@
1
1
  module Scrapey
2
- def use_cache
3
- @use_cache = true
4
- @config['cache_dir'] ||= "#{BASEDIR}/cache"
5
- FileUtils.mkdir_p @config['cache_dir']
6
- end
7
-
8
- def cache_filename url
9
- @config['cache_dir'] + "/" + Digest::MD5.hexdigest(url) + ".cache"
10
- end
11
2
 
12
- def load_cache url
13
- filename = cache_filename url
14
- return nil unless File::exists?(filename)
15
- puts "Loading #{url} from cache"
16
- Nokogiri::HTML File.read(filename)
3
+ def use_cache options = {}
4
+ @use_cache = true
5
+ if @redis = options.delete(:redis)
6
+ require 'scrapey/cache/redis'
7
+ else
8
+ require 'scrapey/cache/disk'
9
+ @config['cache_dir'] ||= "#{BASEDIR}/cache"
10
+ FileUtils.mkdir_p @config['cache_dir']
11
+ end
17
12
  end
18
13
 
19
- def save_cache url,doc
20
- File.open(cache_filename(url), 'wb') {|f| f.write(doc) }
21
- end
22
14
  end
@@ -0,0 +1,21 @@
1
+ module Scrapey
2
+
3
+ def cache_filename url
4
+ @config['cache_dir'] + "/" + Digest::MD5.hexdigest(url) + ".cache"
5
+ end
6
+
7
+ def is_cached? url
8
+ File.exists? cache_filename(url)
9
+ end
10
+
11
+ def load_cache url
12
+ filename = cache_filename url
13
+ return nil unless File::exists?(filename)
14
+ debug "Loading #{url} from cache"
15
+ Nokogiri::HTML Marshal.load(File.read(filename))
16
+ end
17
+
18
+ def save_cache url, doc, options = {}
19
+ File.open(cache_filename(url), "w") {|f| f << Marshal.dump(doc) }
20
+ end
21
+ end
@@ -0,0 +1,20 @@
1
+ require 'redis'
2
+
3
+ module Scrapey
4
+
5
+ def is_cached? url
6
+ !!@redis.get(url)
7
+ end
8
+
9
+ def load_cache url
10
+ debug "Loading #{url} from cache"
11
+ return nil unless str = @redis.get(url)
12
+ debug "found it"
13
+ #binding.pry
14
+ Nokogiri::HTML Marshal.load(str)
15
+ end
16
+
17
+ def save_cache url, body, options = {}
18
+ @redis.set url, Marshal.dump(body)
19
+ end
20
+ end
@@ -1,4 +1,6 @@
1
1
  module Scrapey
2
+ VERSION = "0.0.4"
2
3
  BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
3
- ENV['SSL_FILE'] = "#{Gem.dir}/gems/scrapey-#{Scrapey::VERSION}/ssl/cacert.pem"
4
+ URL = "https://github.com/monkeysuffrage/scrapey"
5
+ #ENV['SSL_FILE'] = "#{Gem.dir}/gems/scrapey-#{Scrapey::VERSION}/ssl/cacert.pem"
4
6
  end
@@ -1,13 +1,28 @@
1
1
  module Scrapey
2
+ def check_db_config
3
+ raise 'No database configured' unless @config['database']
4
+ end
5
+
2
6
  def tables *args
7
+ check_db_config
8
+ missing_tables = false
3
9
  args.each do |arg|
4
- Object.const_set(arg, Class.new(ActiveRecord::Base) {})
10
+ model = Object.const_set(arg, Class.new(ActiveRecord::Base) {})
11
+ missing_tables = true unless model.table_exists?
5
12
  end
13
+ schema = "#{BASEDIR}/src/schema.rb"
14
+ require schema if missing_tables && File.exists?(schema)
6
15
  end
7
16
 
8
17
  def truncate *args
18
+ check_db_config
9
19
  args.each do |arg|
10
20
  ActiveRecord::Base.connection.execute("TRUNCATE TABLE #{Object.const_get(arg).table_name}")
11
21
  end
12
22
  end
23
+
24
+ def init_db
25
+ ['active_record', @config['database']['adapter'], 'tzinfo', 'active_support/all', 'active_support/multibyte/chars'].each{|lib| require lib}
26
+ ActiveRecord::Base.establish_connection(@config['database'])
27
+ end
13
28
  end
data/lib/scrapey/multi.rb CHANGED
@@ -1,18 +1,25 @@
1
1
  require 'em-http-request'
2
2
 
3
3
  module Scrapey
4
- def multi_get all_urls, num_threads = 20, callback = :save_cache
5
- all_urls.each_slice(num_threads) do |urls|
4
+ def multi_get_or_post method, all_urls, options = {}
5
+ request_options = {:redirects => 10, :head => {"User-Agent" => "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"}.merge(options.delete(:head))}
6
+ threads = options[:threads] || 20
7
+ callback = options[:callback] || :save_cache
8
+ all_urls.reject!{|url| is_cached? url} if @use_cache
9
+ @lock = Mutex.new
10
+ all_urls.each_slice(threads) do |urls|
6
11
  next unless urls.size > 0
7
12
  EventMachine.run do
8
13
  multi = EventMachine::MultiRequest.new
9
14
  urls.each_with_index do |url, i|
10
- multi.add i, EventMachine::HttpRequest.new(url).get(:redirects => 10)
15
+ multi.add i, EventMachine::HttpRequest.new(url, options).send(method, request_options)
11
16
  end
12
17
  multi.callback do
13
18
  (0...multi.requests.length).each do |i|
14
19
  if multi.responses[:callback][i]
15
- send callback, urls[i], multi.responses[:callback][i].response
20
+ @lock.synchronize do
21
+ send callback, urls[i], multi.responses[:callback][i].response, multi.responses[:callback][i].response_header
22
+ end
16
23
  else
17
24
  puts "problem downloading #{urls[i]}!"
18
25
  end
@@ -22,4 +29,9 @@ module Scrapey
22
29
  end
23
30
  end
24
31
  end
32
+
33
+ def multi_get *args; multi_get_or_post 'get', *args; end
34
+ def multi_post *args; multi_get_or_post 'post', *args; end
35
+ def multi_head *args; multi_get_or_post 'head', *args; end
36
+
25
37
  end
@@ -15,7 +15,7 @@ module Scrapey
15
15
  return doc if doc
16
16
 
17
17
  page = agent.send *new_args
18
- save_cache(url, page.body) if @use_cache
18
+ save_cache(url, page.root.to_s) if @use_cache
19
19
 
20
20
  #exit if Object.const_defined? :Ocra
21
21
  page
@@ -64,6 +64,10 @@ module Scrapey
64
64
  false
65
65
  end
66
66
 
67
+ def debug msg
68
+ puts msg if @debug
69
+ end
70
+
67
71
  def ts
68
72
  Time.now.to_i.to_s
69
73
  end
@@ -7,9 +7,14 @@ module Scrapey
7
7
  template = File.expand_path('../../../template', __FILE__)
8
8
  FileUtils.cp_r template, name
9
9
  Dir.chdir name
10
+
10
11
  Dir.glob(['*/*.*', '*.*']).grep(/template/).each do |fn|
11
12
  FileUtils.mv fn, fn.gsub('template', name)
12
13
  end
14
+ buf = File.read "#{name}.iss"
15
+ buf.gsub! /Template/, "rightmove_rentals".tr('_', ' ').gsub(/\w+/){|x| x.capitalize}
16
+ buf.gsub! /template/, name
17
+ File.open("#{name}.iss", 'w'){|f| f << buf}
13
18
 
14
19
  end
15
20
  end
data/scrapey.gemspec CHANGED
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
- require File.expand_path('../lib/scrapey/version', __FILE__)
2
+ require File.expand_path('../lib/scrapey/constants', __FILE__)
3
3
 
4
4
  Gem::Specification.new do |gem|
5
5
  gem.authors = ["P Guardiario"]
@@ -15,7 +15,7 @@ Gem::Specification.new do |gem|
15
15
  gem.name = "scrapey"
16
16
  gem.require_paths = ["lib"]
17
17
  gem.version = Scrapey::VERSION
18
- gem.add_dependency(%q<mechanize>, ["~> 2.5.0"])
18
+ gem.add_dependency(%q<mechanize>)
19
19
  gem.add_dependency(%q<json>, ["~> 1.7.0"])
20
20
  end
21
21
 
data/template/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org'
2
+ gem "scrapey", :git => 'https://github.com/monkeysuffrage/scrapey.git'
data/template/Rakefile ADDED
@@ -0,0 +1,22 @@
1
+ #!/usr/bin/env rake
2
+ #Rake.application.options.trace = true
3
+ require 'fileutils'
4
+
5
+ def name
6
+ @name ||= Dir.pwd[/[^\/]+$/]
7
+ end
8
+
9
+ desc "Build project with ocra"
10
+ task 'build' do
11
+ system "ocra --icon icon.ico src/#{name}.rb --no-lzma --chdir-first --no-autoload --innosetup #{name}.iss"
12
+ end
13
+
14
+ desc "Copy installer to dropbox folder"
15
+ task 'dropbox' do
16
+ raise 'no dropbox folder!' unless ENV['DROPBOX']
17
+ folder = [ENV['DROPBOX'], name].join('/').squeeze('/')
18
+ FileUtils.mkdir(folder) unless File.exists?(folder)
19
+ FileUtils.cp "Output/setup.exe", folder
20
+ url = [ENV['DROPBOX_public_url'], name, 'setup.exe'].join('/').squeeze('/')
21
+ puts "uploaded to #{url}"
22
+ end
data/template/icon.ico ADDED
Binary file
@@ -0,0 +1,16 @@
1
+ =begin
2
+ # put table schemas here. this will be included if the table is not found.
3
+ ActiveRecord::Schema.define do
4
+ create_table "items" do |t|
5
+ t.string "string_field"
6
+ t.text "text_field"
7
+ t.integer "number_field"
8
+ t.boolean "boolean_field"
9
+ t.float "float_field"
10
+ t.date "created_at"
11
+ t.datetime "created_on"
12
+ end
13
+
14
+ add_index "items", ["number_field"], :name => "number_field_idx", :unique => true
15
+ end
16
+ =end
@@ -1,4 +1,6 @@
1
1
  require 'scrapey'
2
+ # require 'scrapey/multi' #=> requires em-http-request
2
3
 
3
- # customizations...
4
+ # sample customizations...
5
+ # @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
4
6
  # @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
@@ -0,0 +1,12 @@
1
+ [Setup]
2
+ AppName=Template Scraper
3
+ AppVersion=1.0
4
+ DefaultDirName={localappdata}\Template Scraper
5
+ DefaultGroupName=Template Scraper
6
+
7
+ [Files]
8
+ Source: "config\*"; DestDir: "{app}\config";
9
+ Source: "src\*"; DestDir: "{app}\src";
10
+
11
+ [Icons]
12
+ Name: "{group}\Template Scraper"; Filename: "{app}\template.exe"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapey
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,24 +9,24 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-27 00:00:00.000000000 Z
12
+ date: 2012-08-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
16
  requirement: !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
- - - ~>
19
+ - - ! '>='
20
20
  - !ruby/object:Gem::Version
21
- version: 2.5.0
21
+ version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
24
  version_requirements: !ruby/object:Gem::Requirement
25
25
  none: false
26
26
  requirements:
27
- - - ~>
27
+ - - ! '>='
28
28
  - !ruby/object:Gem::Version
29
- version: 2.5.0
29
+ version: '0'
30
30
  - !ruby/object:Gem::Dependency
31
31
  name: json
32
32
  requirement: !ruby/object:Gem::Requirement
@@ -56,18 +56,28 @@ files:
56
56
  - README.md
57
57
  - Rakefile
58
58
  - bin/scrapey
59
+ - examples/imdb.rb
59
60
  - examples/multi.rb
61
+ - examples/multi2.rb
62
+ - examples/redis.rb
63
+ - examples/status_check.rb
64
+ - lib/scrapey/cache/disk.rb
65
+ - lib/scrapey/cache/redis.rb
60
66
  - lib/scrapey/cache.rb
61
67
  - lib/scrapey/constants.rb
62
68
  - lib/scrapey/database.rb
63
69
  - lib/scrapey/multi.rb
64
70
  - lib/scrapey/scrapey.rb
65
71
  - lib/scrapey/template.rb
66
- - lib/scrapey/version.rb
67
72
  - lib/scrapey.rb
68
73
  - scrapey.gemspec
69
74
  - template/config/config.yml
75
+ - template/Gemfile
76
+ - template/icon.ico
77
+ - template/Rakefile
78
+ - template/src/schema.rb
70
79
  - template/src/template.rb
80
+ - template/template.iss
71
81
  homepage: ''
72
82
  licenses: []
73
83
  post_install_message:
@@ -88,7 +98,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
88
98
  version: '0'
89
99
  requirements: []
90
100
  rubyforge_project:
91
- rubygems_version: 1.8.23
101
+ rubygems_version: 1.8.24
92
102
  signing_key:
93
103
  specification_version: 3
94
104
  summary: A simple scraping framework
@@ -1,3 +0,0 @@
1
- module Scrapey
2
- VERSION = "0.0.3"
3
- end