scrapey 0.0.17 → 0.0.19
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/lib/scrapey.rb +3 -3
- data/lib/scrapey/constants.rb +1 -1
- data/lib/scrapey/scrapey.rb +8 -16
- data/scrapey.gemspec +0 -2
- data/template/src/downloader.rb +211 -26
- metadata +2 -30
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
ZWMwZWY5N2ExNjliMDVhNmQzNDEyNjJmZjExNjZkNjMyM2VkMGFhZA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
YmNjZDFkMTg0MDZjZmZmMDM4ZjRmMzNiYjVjZWE3NTk3Y2YwYWY1Mg==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
NmEwY2RjZTAxNDBlM2JmNmI0MGExZDg0MGI3OWJkZjlmZTI0YTQ2ZDkyNTI4
|
10
|
+
ZDEyOGVhNDAxYjIxYWVkOTkyZTUwNDM3ZDY2MzJiZTEwYjZkN2M2ZGExYjNh
|
11
|
+
NWYwOWJmNzFiMjYwNjAyYzRlNmRiNDA2MjUzNGJlOTNmNWMzNDc=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ZjkzYmMzYjJlZTFkNmU0NWVjMTQ3YjBhZGE5NzY5ZTdmN2Q2N2U0NGI1YmEx
|
14
|
+
NTVmN2MzYTVkN2FjYTA2YWRkNGY1Y2RjNDY5MjE4YjIxOGNjMmVmNGQ4MzA3
|
15
|
+
MjE2MTRkNzkwMDc3MGYwMjE0M2Y3YmI2M2RhZWUzZDJlMGVhNWY=
|
data/lib/scrapey.rb
CHANGED
@@ -31,7 +31,7 @@ unless defined? Rails
|
|
31
31
|
@output = File.join BASEDIR, 'output.csv'
|
32
32
|
|
33
33
|
# read config file
|
34
|
-
config_file = "#{BASEDIR}/config/config.yml"
|
34
|
+
config_file = @config_file_path || "#{BASEDIR}/config/config.yml"
|
35
35
|
@config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
|
36
36
|
|
37
37
|
init_db if @config['database']
|
@@ -50,11 +50,11 @@ if defined?(Ocra)
|
|
50
50
|
'active_record',
|
51
51
|
'active_record/schema',
|
52
52
|
'active_record/connection_adapters/abstract/schema_definitions',
|
53
|
-
@config['database'] ? @config['database']['adapter'] : '
|
53
|
+
@config['database'] ? @config['database']['adapter'] : 'mysql2',
|
54
54
|
'tzinfo',
|
55
55
|
'active_support/all',
|
56
56
|
'active_support/multibyte/chars'
|
57
|
-
].each{|lib| require lib}
|
57
|
+
].each{|lib| puts lib; require lib}
|
58
58
|
end
|
59
59
|
end
|
60
60
|
|
data/lib/scrapey/constants.rb
CHANGED
data/lib/scrapey/scrapey.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# require 'phantom_mechanize'
|
2
|
+
|
1
3
|
module Scrapey
|
2
4
|
|
3
5
|
def self.init b
|
@@ -12,8 +14,6 @@ module Scrapey
|
|
12
14
|
|
13
15
|
def get_or_post method, url, options={}, *args
|
14
16
|
agent = ['goto', 'visit'].include?(method) ? @browser : @agent
|
15
|
-
_retries = options.delete :retries
|
16
|
-
_sleep = options.delete :sleep
|
17
17
|
begin
|
18
18
|
new_args = method, url
|
19
19
|
unless options.empty? && args.empty?
|
@@ -21,34 +21,26 @@ module Scrapey
|
|
21
21
|
args.each{|arg| new_args << arg}
|
22
22
|
end
|
23
23
|
|
24
|
-
|
24
|
+
key = method == 'post' ? url + options.to_s : url
|
25
|
+
doc = load_cache(key) if @use_cache
|
25
26
|
return doc if doc
|
26
27
|
|
27
28
|
page = agent.send *new_args
|
28
29
|
# str = page.respond_to?('root') ? page.root.to_s : page.body
|
29
30
|
# save_cache(url, str) if @use_cache
|
30
|
-
save_cache(
|
31
|
+
save_cache(key, page.body) if @use_cache
|
31
32
|
|
32
33
|
#exit if Object.const_defined? :Ocra
|
33
34
|
page
|
34
35
|
rescue Exception => e
|
35
|
-
|
36
|
-
|
37
|
-
return on_error e, method, url, options, *args
|
38
|
-
when _retries && _retries > 0
|
39
|
-
puts "Error. Retries remaining: #{options[:retries]}"
|
40
|
-
sleep _sleep if _sleep
|
41
|
-
get_or_post method, url, options.merge({:retries => _retries - 1, :sleep => _sleep}), *args
|
42
|
-
else raise e
|
43
|
-
end
|
36
|
+
puts e.message
|
37
|
+
raise e
|
44
38
|
end
|
45
39
|
end
|
46
40
|
|
47
41
|
def get *args; get_or_post 'get', *args; end
|
48
42
|
def post *args; get_or_post 'post', *args; end
|
49
|
-
def
|
50
|
-
def goto *args; get_or_post 'goto', *args; end
|
51
|
-
def visit *args; get_or_post 'visit', *args; end
|
43
|
+
def phget *args; get_or_post 'phget', *args; end
|
52
44
|
|
53
45
|
def set_proxy *args
|
54
46
|
@agent.set_proxy *args
|
data/scrapey.gemspec
CHANGED
data/template/src/downloader.rb
CHANGED
@@ -1,25 +1,168 @@
|
|
1
1
|
require 'scrapey'
|
2
|
+
require 'watir-webdriver'
|
3
|
+
require 'pry'
|
4
|
+
require "socksify"
|
5
|
+
require 'socksify/http'
|
6
|
+
require 'net/https'
|
7
|
+
|
8
|
+
# Mechanize: call @agent.set_socks(addr, port) before using
|
9
|
+
# any of its methods; it might work in other cases,
|
10
|
+
# but I just haven't tried :)
|
11
|
+
class Mechanize::HTTP::Agent
|
12
|
+
public
|
13
|
+
def set_socks addr, port
|
14
|
+
set_http unless @http
|
15
|
+
class << @http
|
16
|
+
attr_accessor :socks_addr, :socks_port
|
17
|
+
|
18
|
+
def http_class
|
19
|
+
Net::HTTP.SOCKSProxy(socks_addr, socks_port)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
@http.socks_addr = addr
|
23
|
+
@http.socks_port = port
|
24
|
+
@http.open_timeout = 100
|
25
|
+
@http.read_timeout = 100
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
at_exit do
|
30
|
+
Process.kill 9, Process.pid
|
31
|
+
@threads.each do |t|
|
32
|
+
Thread.kill t
|
33
|
+
print 'k'
|
34
|
+
end
|
35
|
+
end
|
2
36
|
|
3
37
|
use_cache
|
4
38
|
|
39
|
+
@failures = {}
|
40
|
+
@max_failures = 5
|
41
|
+
@max_threads = 50
|
42
|
+
|
43
|
+
if arg = ARGV.find{|x| x[/--retries=(\d+)/]}
|
44
|
+
@max_failures = $1.to_i
|
45
|
+
ARGV.delete arg
|
46
|
+
end
|
47
|
+
|
48
|
+
if arg = ARGV.find{|x| x[/--threads=(\d+)/]}
|
49
|
+
@max_threads = $1.to_i
|
50
|
+
ARGV.delete arg
|
51
|
+
end
|
52
|
+
|
53
|
+
@socks = false
|
54
|
+
if arg = ARGV.find{|x| x[/socks/]}
|
55
|
+
@socks = true
|
56
|
+
ARGV.delete arg
|
57
|
+
end
|
58
|
+
|
59
|
+
|
5
60
|
# File.open("#{BASEDIR}/config/urls.txt", 'w'){|f| f<< (0..100).map{|i| "http://www.example.com/id=#{i}"} * "\n"}
|
6
|
-
@queue
|
7
|
-
|
61
|
+
@queue ||= File.read("#{BASEDIR}/config/urls.txt").split(/[[:space:]]+/).reject{|url| is_cached?(url)}.shuffle
|
62
|
+
|
63
|
+
if arg = ARGV.find{|x| x[/nopattern/]}
|
64
|
+
@queue.reject!{|x| x[/google|facebook|twitter|findthebest|linkedin|yellowpages|bizapedia|dandb|manta|indeed|hoovers|cortera|yelp|yellowpages|whitepages|angieslist/i]}
|
65
|
+
ARGV.delete arg
|
66
|
+
end
|
67
|
+
|
68
|
+
|
69
|
+
if @socks
|
70
|
+
@proxies = File.read("#{BASEDIR}/config/socks.txt").scan(/[\w.]+:\d+/).shuffle
|
71
|
+
else
|
72
|
+
@proxies = File.read("#{BASEDIR}/config/proxies.txt").scan(/[\w.]+:\d+/).shuffle
|
73
|
+
end
|
74
|
+
|
75
|
+
if @pattern = ARGV[0]
|
76
|
+
@queue = @queue.select{|x| x[/#{@pattern}/]}
|
77
|
+
end
|
8
78
|
|
9
|
-
|
10
|
-
|
79
|
+
# binding.pry
|
80
|
+
|
81
|
+
def response_ok? page, url = nil
|
82
|
+
if $0[/get_emails/]
|
83
|
+
return !page.body[/zscaler|captcha/i]
|
84
|
+
end
|
85
|
+
|
86
|
+
return false if page.body[/Welcome To Zscaler/]
|
87
|
+
|
88
|
+
case url
|
89
|
+
when /google.com\/search/
|
90
|
+
return page.body[/ - Google Search/i]
|
91
|
+
when /facebook/
|
92
|
+
return page.body[/akamai/i] && !page.body[/Security Check Required/i]
|
93
|
+
when /twitter/
|
94
|
+
return page.body[/tweets/i]
|
95
|
+
when /findthebest/
|
96
|
+
return page.body[/findthebest/i] && !page.body[/Captcha/i]
|
97
|
+
when /linkedin/
|
98
|
+
return page.body[/linkedin/i] && !page.body[/Captcha/i]
|
99
|
+
when /yellowpages/
|
100
|
+
return page.body[/yellowpages/i] && !page.body[/Captcha|IP Address/i]
|
101
|
+
when /bizapedia.com/
|
102
|
+
return page.body[/bizapedia/i] && !page.body[/Captcha|IP Address/i]
|
103
|
+
when /dandb.com/
|
104
|
+
return page.body[/dandb/i] && !page.body[/Captcha/i]
|
105
|
+
when /topdrz.com/
|
106
|
+
return page.body[/topdrz/i] && !page.body[/Captcha/i]
|
107
|
+
when /businessfinder\.[a-z]{2}\.com/
|
108
|
+
return page.body[/DC.title/i]
|
109
|
+
when /hipaaspace.com/
|
110
|
+
return page.body[/Fax/i]
|
111
|
+
when /manta.com/
|
112
|
+
if page.body[/(Zscaler|Captcha|IP Address|distil_ident_block)/i]
|
113
|
+
puts $1
|
114
|
+
return false
|
115
|
+
end
|
116
|
+
return page.body[/UA-10299948/]
|
117
|
+
when /indeed.com\/cmp.*$(?<!review)/
|
118
|
+
return page.body[/indeed/i] && !page.body[/Captcha|IP Address/i]
|
119
|
+
when /hoovers.com\/company-information/
|
120
|
+
return page.body[/hoovers/i] && !page.body[/Captcha|IP Address/i]
|
121
|
+
when /cortera.com/
|
122
|
+
return page.body[/cortera/i] && !page.body[/Captcha|IP Address/i]
|
123
|
+
when /yelp.com/
|
124
|
+
return !!((page.title[/Yelp/i] && !page.title[/Captcha/i]) || page.body['yelp-biz-id'])
|
125
|
+
when /yellowpages.com.au/
|
126
|
+
return !!page.body['listing-name']
|
127
|
+
when /whitepages.com\/business/
|
128
|
+
return !!page.body['app-id=287734809']
|
129
|
+
when /angieslist.com.*\d.htm/
|
130
|
+
return !!page.title['Angies List']
|
131
|
+
when /addresssearch/
|
132
|
+
return page.body['g-plusone']
|
133
|
+
|
134
|
+
|
135
|
+
|
136
|
+
end
|
137
|
+
return false if page.body[/exceeded your daily request/]
|
138
|
+
begin
|
139
|
+
result = JSON.parse(page.body)['results'][0]
|
140
|
+
return true if result['address_components'].find{|x|x['types'].include?('country')}['short_name'] == 'US'
|
141
|
+
rescue
|
142
|
+
end
|
143
|
+
return !page.body[/zscaler|captcha/i]
|
144
|
+
puts "no match: #{url}"
|
145
|
+
page.body[/UA-10299948/i] && !page.body[/Authentication Required/i]
|
11
146
|
end
|
12
147
|
|
13
148
|
def clean str
|
14
149
|
str.gsub(/[[:space:]]+/, ' ').strip
|
15
150
|
end
|
16
151
|
|
152
|
+
def check browser
|
153
|
+
html = browser.html.to_s
|
154
|
+
return true if html[/Pardon Our Interruption|Zscaler|captcha/i]
|
155
|
+
return true if browser.html.length > 5000
|
156
|
+
false
|
157
|
+
end
|
158
|
+
|
17
159
|
def download
|
18
160
|
loop do
|
19
161
|
Mechanize.start do |agent|
|
20
|
-
agent.read_timeout = agent.open_timeout =
|
162
|
+
agent.read_timeout = agent.open_timeout = agent.idle_timeout = 10000
|
163
|
+
keep_alive = false
|
21
164
|
agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
22
|
-
agent.user_agent = [
|
165
|
+
ua = agent.user_agent = [
|
23
166
|
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
|
24
167
|
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36',
|
25
168
|
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1',
|
@@ -40,26 +183,58 @@ def download
|
|
40
183
|
end
|
41
184
|
unless proxy = @proxies.shift
|
42
185
|
puts "no more proxies"
|
43
|
-
|
186
|
+
exit
|
44
187
|
end
|
45
188
|
@proxies.push proxy
|
46
189
|
host, port = proxy.split(':')
|
47
|
-
|
190
|
+
if @socks
|
191
|
+
agent.agent.set_socks host, port.to_i
|
192
|
+
else
|
193
|
+
agent.set_proxy host, port.to_i, 'user', 'pass'
|
194
|
+
end
|
48
195
|
begin
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
196
|
+
agent.request_headers = {'Referer' => 'http://www.google.com/search'}
|
197
|
+
page = nil
|
198
|
+
if url[/manta/]
|
199
|
+
html = `phantomjs --proxy=#{proxy} #{BASEDIR}/src/cookies.js #{url}`
|
200
|
+
page = Mechanize::Page.new URI.parse(url), [], html, nil, Mechanize.new
|
201
|
+
else
|
202
|
+
page = agent.get url
|
203
|
+
end
|
204
|
+
|
205
|
+
unless response_ok?(page, url)
|
206
|
+
# binding.pry if url[/manta/] && !page.body[/timed out|blocked|forbidden/i]
|
207
|
+
if page.title
|
208
|
+
puts page.title.strip
|
209
|
+
else
|
210
|
+
raise "no title for: #{url}"
|
211
|
+
end
|
53
212
|
raise 'str'
|
54
213
|
end
|
55
214
|
save_cache url, page.body
|
56
215
|
|
57
216
|
@good += 1
|
58
|
-
puts url
|
217
|
+
puts "- [#{@queue.length + @threads.select(&:alive?).length}/#{@proxies.length}] #{url}"
|
59
218
|
rescue StandardError => e
|
60
|
-
|
61
|
-
@
|
62
|
-
@
|
219
|
+
@failures[url] ||= 0
|
220
|
+
@failures[url] += 1
|
221
|
+
unless @failures[url] >= @max_failures
|
222
|
+
@queue.push(url) # unless e.message[/no title for/]
|
223
|
+
end
|
224
|
+
# binding.pry
|
225
|
+
if e.message[/execurtion exeprrred/]
|
226
|
+
print 'r'
|
227
|
+
elsif e.message[/403/] && !@pattern
|
228
|
+
if (rand * 3).to_i == 0
|
229
|
+
@proxies -= [proxy]
|
230
|
+
print '!'
|
231
|
+
end
|
232
|
+
else
|
233
|
+
@proxies -= [proxy]
|
234
|
+
print '!'
|
235
|
+
end
|
236
|
+
puts "! - #{@failures[url]} - #{e.message[0..99]}"
|
237
|
+
|
63
238
|
agent.cookie_jar.clear!
|
64
239
|
end
|
65
240
|
end
|
@@ -67,18 +242,28 @@ def download
|
|
67
242
|
|
68
243
|
end
|
69
244
|
|
70
|
-
|
71
|
-
@
|
245
|
+
def run
|
246
|
+
puts @queue.length
|
247
|
+
@num_threads = [@max_threads, @queue.length].min
|
248
|
+
puts "#{@proxies.length} proxies, #{@queue.length} urls, #{@num_threads} threads"
|
72
249
|
|
73
|
-
|
74
|
-
@good = 0
|
75
|
-
start_time = Time.now
|
250
|
+
@banned_for = []
|
76
251
|
|
77
|
-
@
|
252
|
+
@threads = []
|
253
|
+
@deficit = 0
|
78
254
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
255
|
+
until @queue.empty? || @proxies.empty?
|
256
|
+
@good = 0
|
257
|
+
start_time = Time.now
|
258
|
+
|
259
|
+
@proxies.shuffle!
|
260
|
+
|
261
|
+
@num_threads.times do
|
262
|
+
@threads << Thread.new { download }
|
263
|
+
end
|
264
|
+
@threads.each { |t| t.join }
|
83
265
|
|
266
|
+
end
|
84
267
|
end
|
268
|
+
run
|
269
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapey
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.17
|
4
|
+
version: 0.0.19
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- P Guardiario
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-04-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -24,34 +24,6 @@ dependencies:
|
|
24
24
|
- - ! '>='
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: httpclient
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - ! '>='
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '0'
|
34
|
-
type: :runtime
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - ! '>='
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '0'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: json
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - ~>
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: 1.7.0
|
48
|
-
type: :runtime
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - ~>
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: 1.7.0
|
55
27
|
description: A simple scraping framework
|
56
28
|
email:
|
57
29
|
- pguardiario@gmail.com
|