scrapey 0.0.12 → 0.0.13

data/lib/scrapey.rb CHANGED
@@ -3,7 +3,6 @@ require 'csv'
 require 'json'
 require 'yaml'
 require 'unf_ext'
-require 'coderay/tokens_proxy'
 
 require "scrapey/scrapey"
 require "scrapey/constants"
@@ -12,19 +11,18 @@ require "scrapey/database"
 require "scrapey/multi"
 require "scrapey/tee"
 
-include Scrapey
+# don't do this stuff in rails:
+unless defined? Rails
+  Scrapey::init binding
 
-# some defaults that I like
-@agent ||= Mechanize.new{|a| a.history.max_size = 10}
-@agent.user_agent = "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
-@agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
-# default output file
-@output = File.join BASEDIR, 'output.csv'
+  # default output file
+  @output = File.join BASEDIR, 'output.csv'
 
-# read config file
-config_file = "#{BASEDIR}/config/config.yml"
-@config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
+  # read config file
+  config_file = "#{BASEDIR}/config/config.yml"
+  @config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
 
-init_db if @config['database']
+  init_db if @config['database']
 
-$stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "w"))
+  $stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "w"))
+end
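
The upshot of this hunk: when scrapey is required from a plain script, Scrapey::init seeds the top-level scope with the old defaults; under Rails the whole block is skipped. Because top-level instance variables belong to Ruby's main object and main is shared by every required file, the @agent created during require is visible in the requiring script. A minimal sketch of a consumer script (the URL is hypothetical):

    # scraper.rb (not a Rails app, so the `unless defined? Rails`
    # branch above ran at require time)
    require 'scrapey'

    # @agent was seeded by Scrapey::init with the gem's defaults:
    # capped history, custom user agent, SSL verification off
    page = @agent.get 'http://www.example.com/'  # hypothetical URL
    puts page.title
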
data/lib/scrapey/constants.rb CHANGED
@@ -1,5 +1,5 @@
 module Scrapey
-  VERSION = "0.0.12"
+  VERSION = "0.0.13"
   BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
   URL = "https://github.com/monkeysuffrage/scrapey"
 end
data/lib/scrapey/scrapey.rb CHANGED
@@ -1,5 +1,15 @@
 module Scrapey
 
+  def self.init b
+    eval "include Scrapey", b
+
+    # some defaults that I like
+    eval "@agent ||= Mechanize.new{|a| a.history.max_size = 10}", b
+    eval "@agent.user_agent = 'Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}'", b
+    eval "@agent.verify_mode = OpenSSL::SSL::VERIFY_NONE", b
+  end
+
+
   def get_or_post method, url, options={}, *args
     agent = ['goto', 'visit'].include?(method) ? @browser : @agent
     _retries = options.delete :retries
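
The new Scrapey.init works by eval'ing each setup line against the Binding passed in from lib/scrapey.rb, so the instance variables land in the caller's scope rather than inside the module. A self-contained illustration of the mechanism (Injector and @greeting are made-up names, not part of scrapey):

    # demo.rb: eval with an explicit binding runs in that scope
    module Injector
      def self.init(b)
        eval "@greeting = 'hi'", b  # @greeting lands in the caller
      end
    end

    Injector.init binding
    puts @greeting  # => hi
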
data/template/Rakefile CHANGED
@@ -13,13 +13,14 @@ end
 
 desc "Copy installer to dropbox folder"
 task 'dropbox' do
-  file = 'setup.exe'
-  raise 'no dropbox folder!' unless ENV['DROPBOX']
-  folder = [ENV['DROPBOX'], name].join('/').squeeze('/')
-  FileUtils.mkdir(folder) unless File.exists?(folder)
-  FileUtils.cp "Output/#{file}", folder
-  url = [ENV['DROPBOX_public_url'], name, file].join('/').squeeze('/')
-  puts "uploaded to #{url}"
+  Dir.glob('Output/*').map{|x| x[/[^\/]*$/]}.each do |file|
+    raise 'no dropbox folder!' unless ENV['DROPBOX']
+    folder = [ENV['DROPBOX'], name].join('/').squeeze('/')
+    FileUtils.mkdir(folder) unless File.exists?(folder)
+    FileUtils.cp "Output/#{file}", folder
+    url = [ENV['DROPBOX_public_url'], name, file].join('/').squeeze('/')
+    puts "uploaded to #{url}"
+  end
 end
 
 task 'git' do
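
The dropbox task now publishes every build artifact under Output/ instead of only setup.exe. The glob-plus-regexp step reduces each path to its basename; a quick check of what it yields, assuming Output/ holds two hypothetical files:

    # with Output/setup.exe and Output/setup.zip present:
    Dir.glob('Output/*').map { |x| x[/[^\/]*$/] }
    # => ["setup.exe", "setup.zip"]  (same result as File.basename)
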
data/template/src/downloader.rb ADDED
@@ -0,0 +1,28 @@
+require 'scrapey'
+require 'pry'
+
+use_cache
+
+# File.open("#{BASEDIR}/config/urls.txt", 'w'){|f| f<< (0..100).map{|i| "http://www.example.com/id=#{i}"} * "\n"}
+@queue = File.read("#{BASEDIR}/config/urls.txt").split("\n").shuffle
+
+def download agent
+  while url = @queue.shift
+    if is_cached? url
+      puts 'skipping'
+      next
+    end
+    page = agent.get url
+    save_cache url, page.body
+    puts url
+  end
+end
+
+threads = []
+5.times do
+  threads << Thread.new { download Mechanize.new{|a| a.history.max_size, a.verify_mode = 10, OpenSSL::SSL::VERIFY_NONE}}
+end
+
+threads.each { |t| t.join }
+
+binding.pry
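
The new downloader template gives each of the five threads its own Mechanize instance, since a single agent is not safe to share across threads. The dense block argument uses Ruby parallel assignment; spelled out, it is equivalent to:

    agent = Mechanize.new do |a|
      a.history.max_size = 10                    # bound memory use
      a.verify_mode = OpenSSL::SSL::VERIFY_NONE  # skip SSL checks
    end

Note that popping URLs with Array#shift from several threads relies on MRI's global interpreter lock making each shift atomic; Thread::Queue would be the strictly portable choice.
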
data/template/src/template.rb CHANGED
@@ -14,17 +14,19 @@ def scrape div
   save item
   exit if defined? Ocra
 rescue StandardError => e
+  puts e.message, e.backtrace
   binding.pry
 end
 
-#use_cache :redis
 
 fields 'name', 'address', 'zip'
 
 @url = "http://www.example.com/"
 
-page = get @url
-scrape page.at('div')
+with_cache do
+  page = get @url
+  binding.pry
+end
 
 #@csv.close
 #%x{call #{@output}}
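
with_cache replaces the commented-out use_cache :redis line, but its implementation is not part of this diff. Purely as a labeled assumption, a block-scoped helper along these lines would account for the usage above:

    # HYPOTHETICAL sketch; scrapey's real with_cache is not shown
    # in this diff, this only illustrates the block-scoped idea
    def with_cache(*args)
      use_cache *args  # use_cache appears in the downloader template
      yield
    end
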
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrapey
 version: !ruby/object:Gem::Version
-  version: 0.0.12
+  version: 0.0.13
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-01-04 00:00:00.000000000 Z
+date: 2013-03-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -96,6 +96,7 @@ files:
 - template/icon.ico
 - template/output.csv
 - template/Rakefile
+- template/src/downloader.rb
 - template/src/schema.rb
 - template/src/template.rb
 - template/template.iss