scrapey 0.0.12 → 0.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/scrapey.rb CHANGED
@@ -3,7 +3,6 @@ require 'csv'
 require 'json'
 require 'yaml'
 require 'unf_ext'
-require 'coderay/tokens_proxy'
 
 require "scrapey/scrapey"
 require "scrapey/constants"
@@ -12,19 +11,18 @@ require "scrapey/database"
 require "scrapey/multi"
 require "scrapey/tee"
 
-include Scrapey
+# don't do this stuff in rails:
+unless defined? Rails
+  Scrapey::init binding
 
-# some defaults that I like
-@agent ||= Mechanize.new{|a| a.history.max_size = 10}
-@agent.user_agent = "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
-@agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
-# default output file
-@output = File.join BASEDIR, 'output.csv'
+  # default output file
+  @output = File.join BASEDIR, 'output.csv'
 
-# read config file
-config_file = "#{BASEDIR}/config/config.yml"
-@config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
+  # read config file
+  config_file = "#{BASEDIR}/config/config.yml"
+  @config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
 
-init_db if @config['database']
+  init_db if @config['database']
 
-$stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "w"))
+  $stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "w"))
+end
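Note on the change above: as of 0.0.13 the require-time setup (include Scrapey, the default Mechanize agent, the output path, config loading and the error log) only runs outside Rails. A Rails app that still wants the helpers has to opt in itself; a minimal sketch, assuming you call the new init from your own setup code (the placement is illustrative, not prescribed by the gem):

  require 'scrapey'

  # e.g. in a rake task or plain script inside the app; the binding you pass
  # decides where the Scrapey helpers and @agent end up.
  Scrapey::init binding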
data/lib/scrapey/constants.rb CHANGED
@@ -1,5 +1,5 @@
 module Scrapey
-  VERSION = "0.0.12"
+  VERSION = "0.0.13"
   BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
   URL = "https://github.com/monkeysuffrage/scrapey"
 end
data/lib/scrapey/scrapey.rb CHANGED
@@ -1,5 +1,15 @@
 module Scrapey
 
+  def self.init b
+    eval "include Scrapey", b
+
+    # some defaults that I like
+    eval "@agent ||= Mechanize.new{|a| a.history.max_size = 10}", b
+    eval "@agent.user_agent = 'Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}'", b
+    eval "@agent.verify_mode = OpenSSL::SSL::VERIFY_NONE", b
+  end
+
+
   def get_or_post method, url, options={}, *args
     agent = ['goto', 'visit'].include?(method) ? @browser : @agent
     _retries = options.delete :retries
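The new Scrapey::init receives a Binding and evals the old require-time defaults into it, so @agent is created in the caller's scope rather than inside the module. Calling it at the top level of a script is roughly equivalent to writing the 0.0.12 setup by hand:

  # roughly what `Scrapey::init binding` does when called from a script's top level
  include Scrapey
  @agent ||= Mechanize.new { |a| a.history.max_size = 10 }
  @agent.user_agent = "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
  @agent.verify_mode = OpenSSL::SSL::VERIFY_NONE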
data/template/Rakefile CHANGED
@@ -13,13 +13,14 @@ end
 
 desc "Copy installer to dropbox folder"
 task 'dropbox' do
-  file = 'setup.exe'
-  raise 'no dropbox folder!' unless ENV['DROPBOX']
-  folder = [ENV['DROPBOX'], name].join('/').squeeze('/')
-  FileUtils.mkdir(folder) unless File.exists?(folder)
-  FileUtils.cp "Output/#{file}", folder
-  url = [ENV['DROPBOX_public_url'], name, file].join('/').squeeze('/')
-  puts "uploaded to #{url}"
+  Dir.glob('Output/*').map{|x| x[/[^\/]*$/]}.each do |file|
+    raise 'no dropbox folder!' unless ENV['DROPBOX']
+    folder = [ENV['DROPBOX'], name].join('/').squeeze('/')
+    FileUtils.mkdir(folder) unless File.exists?(folder)
+    FileUtils.cp "Output/#{file}", folder
+    url = [ENV['DROPBOX_public_url'], name, file].join('/').squeeze('/')
+    puts "uploaded to #{url}"
+  end
 end
 
 task 'git' do
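The dropbox task now copies every file found under Output/ instead of only setup.exe. The x[/[^\/]*$/] regex just takes the part of each path after the last slash; an equivalent, arguably clearer spelling (not what the Rakefile uses) would be:

  # same result as the regex: "Output/setup.exe" -> "setup.exe"
  Dir.glob('Output/*').map { |x| File.basename(x) }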
data/template/src/downloader.rb ADDED
@@ -0,0 +1,28 @@
+require 'scrapey'
+require 'pry'
+
+use_cache
+
+# File.open("#{BASEDIR}/config/urls.txt", 'w'){|f| f<< (0..100).map{|i| "http://www.example.com/id=#{i}"} * "\n"}
+@queue = File.read("#{BASEDIR}/config/urls.txt").split("\n").shuffle
+
+def download agent
+  while url = @queue.shift
+    if is_cached? url
+      puts 'skipping'
+      next
+    end
+    page = agent.get url
+    save_cache url, page.body
+    puts url
+  end
+end
+
+threads = []
+5.times do
+  threads << Thread.new { download Mechanize.new{|a| a.history.max_size, a.verify_mode = 10, OpenSSL::SSL::VERIFY_NONE}}
+end
+
+threads.each { |t| t.join }
+
+binding.pry
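The new downloader template shares @queue across five threads and drains it with Array#shift. Under MRI the global interpreter lock makes that work in practice, but it is not guaranteed to be safe on other Ruby implementations. A variation using the thread-safe Queue from the standard library (an alternative sketch, not what the template ships; it reuses the gem's BASEDIR, is_cached? and save_cache helpers as the template does) could look like:

  require 'thread'   # Queue is part of Ruby's standard library

  queue = Queue.new
  File.read("#{BASEDIR}/config/urls.txt").split("\n").shuffle.each { |url| queue << url }

  workers = 5.times.map do
    Thread.new do
      agent = Mechanize.new { |a| a.history.max_size = 10 }
      loop do
        url = queue.pop(true) rescue break   # non-blocking pop; stop once the queue is drained
        next if is_cached? url
        save_cache url, agent.get(url).body
      end
    end
  end
  workers.each(&:join)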
data/template/src/template.rb CHANGED
@@ -14,17 +14,19 @@ def scrape div
   save item
   exit if defined? Ocra
 rescue StandardError => e
+  puts e.message, e.backtrace
   binding.pry
 end
 
-#use_cache :redis
 
 fields 'name', 'address', 'zip'
 
 @url = "http://www.example.com/"
 
-page = get @url
-scrape page.at('div')
+with_cache do
+  page = get @url
+  binding.pry
+end
 
 #@csv.close
 #%x{call #{@output}}
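The template now fetches inside with_cache and drops into pry instead of calling scrape directly, which reads as a starting point for interactive development. A filled-in version would presumably put the scrape call back inside the block; a sketch, assuming with_cache simply enables caching for the requests made within it:

  with_cache do
    page = get @url
    scrape page.at('div')   # the same call the 0.0.12 template made, now behind the cache
  end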
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrapey
 version: !ruby/object:Gem::Version
-  version: 0.0.12
+  version: 0.0.13
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-01-04 00:00:00.000000000 Z
+date: 2013-03-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -96,6 +96,7 @@ files:
 - template/icon.ico
 - template/output.csv
 - template/Rakefile
+- template/src/downloader.rb
 - template/src/schema.rb
 - template/src/template.rb
 - template/template.iss