scrapey 0.0.12 → 0.0.13
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/scrapey.rb +11 -13
- data/lib/scrapey/constants.rb +1 -1
- data/lib/scrapey/scrapey.rb +10 -0
- data/template/Rakefile +8 -7
- data/template/src/downloader.rb +28 -0
- data/template/src/template.rb +5 -3
- metadata +3 -2
data/lib/scrapey.rb
CHANGED
@@ -3,7 +3,6 @@ require 'csv'
|
|
3
3
|
require 'json'
|
4
4
|
require 'yaml'
|
5
5
|
require 'unf_ext'
|
6
|
-
require 'coderay/tokens_proxy'
|
7
6
|
|
8
7
|
require "scrapey/scrapey"
|
9
8
|
require "scrapey/constants"
|
@@ -12,19 +11,18 @@ require "scrapey/database"
|
|
12
11
|
require "scrapey/multi"
|
13
12
|
require "scrapey/tee"
|
14
13
|
|
15
|
-
|
14
|
+
# don't do this stuff in rails:
|
15
|
+
unless defined? Rails
|
16
|
+
Scrapey::init binding
|
16
17
|
|
17
|
-
#
|
18
|
-
@
|
19
|
-
@agent.user_agent = "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
|
20
|
-
@agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
21
|
-
# default output file
|
22
|
-
@output = File.join BASEDIR, 'output.csv'
|
18
|
+
# default output file
|
19
|
+
@output = File.join BASEDIR, 'output.csv'
|
23
20
|
|
24
|
-
# read config file
|
25
|
-
config_file = "#{BASEDIR}/config/config.yml"
|
26
|
-
@config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
|
21
|
+
# read config file
|
22
|
+
config_file = "#{BASEDIR}/config/config.yml"
|
23
|
+
@config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
|
27
24
|
|
28
|
-
init_db if @config['database']
|
25
|
+
init_db if @config['database']
|
29
26
|
|
30
|
-
$stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "w"))
|
27
|
+
$stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "w"))
|
28
|
+
end
|
data/lib/scrapey/constants.rb
CHANGED
data/lib/scrapey/scrapey.rb
CHANGED
@@ -1,5 +1,15 @@
|
|
1
1
|
module Scrapey
|
2
2
|
|
3
|
+
def self.init b
|
4
|
+
eval "include Scrapey", b
|
5
|
+
|
6
|
+
# some defaults that I like
|
7
|
+
eval "@agent ||= Mechanize.new{|a| a.history.max_size = 10}", b
|
8
|
+
eval "@agent.user_agent = 'Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}'", b
|
9
|
+
eval "@agent.verify_mode = OpenSSL::SSL::VERIFY_NONE", b
|
10
|
+
end
|
11
|
+
|
12
|
+
|
3
13
|
def get_or_post method, url, options={}, *args
|
4
14
|
agent = ['goto', 'visit'].include?(method) ? @browser : @agent
|
5
15
|
_retries = options.delete :retries
|
data/template/Rakefile
CHANGED
@@ -13,13 +13,14 @@ end
|
|
13
13
|
|
14
14
|
desc "Copy installer to dropbox folder"
|
15
15
|
task 'dropbox' do
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
16
|
+
Dir.glob('Output/*').map{|x| x[/[^\/]*$/]}.each do |file|
|
17
|
+
raise 'no dropbox folder!' unless ENV['DROPBOX']
|
18
|
+
folder = [ENV['DROPBOX'], name].join('/').squeeze('/')
|
19
|
+
FileUtils.mkdir(folder) unless File.exists?(folder)
|
20
|
+
FileUtils.cp "Output/#{file}", folder
|
21
|
+
url = [ENV['DROPBOX_public_url'], name, file].join('/').squeeze('/')
|
22
|
+
puts "uploaded to #{url}"
|
23
|
+
end
|
23
24
|
end
|
24
25
|
|
25
26
|
task 'git' do
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'scrapey'
|
2
|
+
require 'pry'
|
3
|
+
|
4
|
+
use_cache
|
5
|
+
|
6
|
+
# File.open("#{BASEDIR}/config/urls.txt", 'w'){|f| f<< (0..100).map{|i| "http://www.example.com/id=#{i}"} * "\n"}
|
7
|
+
@queue = File.read("#{BASEDIR}/config/urls.txt").split("\n").shuffle
|
8
|
+
|
9
|
+
def download agent
|
10
|
+
while url = @queue.shift
|
11
|
+
if is_cached? url
|
12
|
+
puts 'skipping'
|
13
|
+
next
|
14
|
+
end
|
15
|
+
page = agent.get url
|
16
|
+
save_cache url, page.body
|
17
|
+
puts url
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
threads = []
|
22
|
+
5.times do
|
23
|
+
threads << Thread.new { download Mechanize.new{|a| a.history.max_size, a.verify_mode = 10, OpenSSL::SSL::VERIFY_NONE}}
|
24
|
+
end
|
25
|
+
|
26
|
+
threads.each { |t| t.join }
|
27
|
+
|
28
|
+
binding.pry
|
data/template/src/template.rb
CHANGED
@@ -14,17 +14,19 @@ def scrape div
|
|
14
14
|
save item
|
15
15
|
exit if defined? Ocra
|
16
16
|
rescue StandardError => e
|
17
|
+
puts e.message, e.backtrace
|
17
18
|
binding.pry
|
18
19
|
end
|
19
20
|
|
20
|
-
#use_cache :redis
|
21
21
|
|
22
22
|
fields 'name', 'address', 'zip'
|
23
23
|
|
24
24
|
@url = "http://www.example.com/"
|
25
25
|
|
26
|
-
|
27
|
-
|
26
|
+
with_cache do
|
27
|
+
page = get @url
|
28
|
+
binding.pry
|
29
|
+
end
|
28
30
|
|
29
31
|
#@csv.close
|
30
32
|
#%x{call #{@output}}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapey
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.12
|
4
|
+
version: 0.0.13
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-03-26 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
@@ -96,6 +96,7 @@ files:
|
|
96
96
|
- template/icon.ico
|
97
97
|
- template/output.csv
|
98
98
|
- template/Rakefile
|
99
|
+
- template/src/downloader.rb
|
99
100
|
- template/src/schema.rb
|
100
101
|
- template/src/template.rb
|
101
102
|
- template/template.iss
|