scrapey 0.0.17 → 0.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/lib/scrapey.rb +3 -3
- data/lib/scrapey/constants.rb +1 -1
- data/lib/scrapey/scrapey.rb +8 -16
- data/scrapey.gemspec +0 -2
- data/template/src/downloader.rb +211 -26
- metadata +2 -30
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-
+    ZWMwZWY5N2ExNjliMDVhNmQzNDEyNjJmZjExNjZkNjMyM2VkMGFhZA==
   data.tar.gz: !binary |-
-
+    YmNjZDFkMTg0MDZjZmZmMDM4ZjRmMzNiYjVjZWE3NTk3Y2YwYWY1Mg==
 SHA512:
   metadata.gz: !binary |-
-
-
-
+    NmEwY2RjZTAxNDBlM2JmNmI0MGExZDg0MGI3OWJkZjlmZTI0YTQ2ZDkyNTI4
+    ZDEyOGVhNDAxYjIxYWVkOTkyZTUwNDM3ZDY2MzJiZTEwYjZkN2M2ZGExYjNh
+    NWYwOWJmNzFiMjYwNjAyYzRlNmRiNDA2MjUzNGJlOTNmNWMzNDc=
   data.tar.gz: !binary |-
-
-
-
+    ZjkzYmMzYjJlZTFkNmU0NWVjMTQ3YjBhZGE5NzY5ZTdmN2Q2N2U0NGI1YmEx
+    NTVmN2MzYTVkN2FjYTA2YWRkNGY1Y2RjNDY5MjE4YjIxOGNjMmVmNGQ4MzA3
+    MjE2MTRkNzkwMDc3MGYwMjE0M2Y3YmI2M2RhZWUzZDJlMGVhNWY=
data/lib/scrapey.rb
CHANGED
@@ -31,7 +31,7 @@ unless defined? Rails
   @output = File.join BASEDIR, 'output.csv'
 
   # read config file
-  config_file = "#{BASEDIR}/config/config.yml"
+  config_file = @config_file_path || "#{BASEDIR}/config/config.yml"
   @config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
 
   init_db if @config['database']
@@ -50,11 +50,11 @@ if defined?(Ocra)
     'active_record',
     'active_record/schema',
     'active_record/connection_adapters/abstract/schema_definitions',
-    @config['database'] ? @config['database']['adapter'] : '
+    @config['database'] ? @config['database']['adapter'] : 'mysql2',
     'tzinfo',
     'active_support/all',
     'active_support/multibyte/chars'
-  ].each{|lib| require lib}
+  ].each{|lib| puts lib; require lib}
  end
 end
 
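The config change above adds an override hook: when `@config_file_path` is set before the library loads, it takes precedence over the default `config/config.yml`. A minimal sketch of how a caller might use it; the staging path is hypothetical, only the fallback path comes from the diff:

# Top-level instance variables live on `main`, the same object whose
# @config_file_path scrapey's loader reads, so setting it before the
# require is enough. (Hypothetical path, for illustration only.)
@config_file_path = File.expand_path('config/staging.yml', __dir__)

require 'scrapey'  # falls back to "#{BASEDIR}/config/config.yml" when unset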
data/lib/scrapey/constants.rb
CHANGED
data/lib/scrapey/scrapey.rb
CHANGED
@@ -1,3 +1,5 @@
+# require 'phantom_mechanize'
+
 module Scrapey
 
 def self.init b
@@ -12,8 +14,6 @@ module Scrapey
 
 def get_or_post method, url, options={}, *args
   agent = ['goto', 'visit'].include?(method) ? @browser : @agent
-  _retries = options.delete :retries
-  _sleep = options.delete :sleep
   begin
     new_args = method, url
     unless options.empty? && args.empty?
@@ -21,34 +21,26 @@ module Scrapey
       args.each{|arg| new_args << arg}
     end
 
-
+    key = method == 'post' ? url + options.to_s : url
+    doc = load_cache(key) if @use_cache
     return doc if doc
 
     page = agent.send *new_args
    # str = page.respond_to?('root') ? page.root.to_s : page.body
    # save_cache(url, str) if @use_cache
-    save_cache(
+    save_cache(key, page.body) if @use_cache
 
    #exit if Object.const_defined? :Ocra
    page
  rescue Exception => e
-
-
-    return on_error e, method, url, options, *args
-    when _retries && _retries > 0
-      puts "Error. Retries remaining: #{options[:retries]}"
-      sleep _sleep if _sleep
-      get_or_post method, url, options.merge({:retries => _retries - 1, :sleep => _sleep}), *args
-    else raise e
-    end
+    puts e.message
+    raise e
  end
 end
 
 def get *args; get_or_post 'get', *args; end
 def post *args; get_or_post 'post', *args; end
-def
-def goto *args; get_or_post 'goto', *args; end
-def visit *args; get_or_post 'visit', *args; end
+def phget *args; get_or_post 'phget', *args; end
 
 def set_proxy *args
   @agent.set_proxy *args
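Two behavioral changes land in this file: the per-request `:retries`/`:sleep` handling is gone (errors are now printed and re-raised to the caller), and the cache key now folds POST parameters into the URL so different payloads to the same endpoint no longer collide in the cache. A sketch of the new key rule restated outside the library; the helper name is illustrative:

# GETs cache by URL alone; POSTs by URL plus the stringified options
# hash (the exact rendering of Hash#to_s varies by Ruby version).
def cache_key(method, url, options = {})
  method == 'post' ? url + options.to_s : url
end

cache_key('get',  'http://example.com/api')             # => "http://example.com/api"
cache_key('post', 'http://example.com/api', :q => 'a')  # => "http://example.com/api{:q=>\"a\"}"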
data/scrapey.gemspec
CHANGED
data/template/src/downloader.rb
CHANGED
@@ -1,25 +1,168 @@
 require 'scrapey'
+require 'watir-webdriver'
+require 'pry'
+require "socksify"
+require 'socksify/http'
+require 'net/https'
+
+# Mechanize: call @agent.set_socks(addr, port) before using
+# any of it's methods; it might be working in other cases,
+# but I just didn't tried :)
+class Mechanize::HTTP::Agent
+  public
+  def set_socks addr, port
+    set_http unless @http
+    class << @http
+      attr_accessor :socks_addr, :socks_port
+
+      def http_class
+        Net::HTTP.SOCKSProxy(socks_addr, socks_port)
+      end
+    end
+    @http.socks_addr = addr
+    @http.socks_port = port
+    @http.open_timeout = 100
+    @http.read_timeout = 100
+  end
+end
+
+at_exit do
+  Process.kill 9, Process.pid
+  @threads.each do |t|
+    Thread.kill t
+    print 'k'
+  end
+end
 
 use_cache
 
+@failures = {}
+@max_failures = 5
+@max_threads = 50
+
+if arg = ARGV.find{|x| x[/--retries=(\d+)/]}
+  @max_failures = $1.to_i
+  ARGV.delete arg
+end
+
+if arg = ARGV.find{|x| x[/--threads=(\d+)/]}
+  @max_threads = $1.to_i
+  ARGV.delete arg
+end
+
+@socks = false
+if arg = ARGV.find{|x| x[/socks/]}
+  @socks = true
+  ARGV.delete arg
+end
+
+
 # File.open("#{BASEDIR}/config/urls.txt", 'w'){|f| f<< (0..100).map{|i| "http://www.example.com/id=#{i}"} * "\n"}
-@queue
-
+@queue ||= File.read("#{BASEDIR}/config/urls.txt").split(/[[:space:]]+/).reject{|url| is_cached?(url)}.shuffle
+
+if arg = ARGV.find{|x| x[/nopattern/]}
+  @queue.reject!{|x| x[/google|facebook|twitter|findthebest|linkedin|yellowpages|bizapedia|dandb|manta|indeed|hoovers|cortera|yelp|yellowpages|whitepages|angieslist/i]}
+  ARGV.delete arg
+end
+
+
+if @socks
+  @proxies = File.read("#{BASEDIR}/config/socks.txt").scan(/[\w.]+:\d+/).shuffle
+else
+  @proxies = File.read("#{BASEDIR}/config/proxies.txt").scan(/[\w.]+:\d+/).shuffle
+end
+
+if @pattern = ARGV[0]
+  @queue = @queue.select{|x| x[/#{@pattern}/]}
+end
 
-
-
+# binding.pry
+
+def response_ok? page, url = nil
+  if $0[/get_emails/]
+    return !page.body[/zscaler|captcha/i]
+  end
+
+  return false if page.body[/Welcome To Zscaler/]
+
+  case url
+  when /google.com\/search/
+    return page.body[/ - Google Search/i]
+  when /facebook/
+    return page.body[/akamai/i] && !page.body[/Security Check Required/i]
+  when /twitter/
+    return page.body[/tweets/i]
+  when /findthebest/
+    return page.body[/findthebest/i] && !page.body[/Captcha/i]
+  when /linkedin/
+    return page.body[/linkedin/i] && !page.body[/Captcha/i]
+  when /yellowpages/
+    return page.body[/yellowpages/i] && !page.body[/Captcha|IP Address/i]
+  when /bizapedia.com/
+    return page.body[/bizapedia/i] && !page.body[/Captcha|IP Address/i]
+  when /dandb.com/
+    return page.body[/dandb/i] && !page.body[/Captcha/i]
+  when /topdrz.com/
+    return page.body[/topdrz/i] && !page.body[/Captcha/i]
+  when /businessfinder\.[a-z]{2}\.com/
+    return page.body[/DC.title/i]
+  when /hipaaspace.com/
+    return page.body[/Fax/i]
+  when /manta.com/
+    if page.body[/(Zscaler|Captcha|IP Address|distil_ident_block)/i]
+      puts $1
+      return false
+    end
+    return page.body[/UA-10299948/]
+  when /indeed.com\/cmp.*$(?<!review)/
+    return page.body[/indeed/i] && !page.body[/Captcha|IP Address/i]
+  when /hoovers.com\/company-information/
+    return page.body[/hoovers/i] && !page.body[/Captcha|IP Address/i]
+  when /cortera.com/
+    return page.body[/cortera/i] && !page.body[/Captcha|IP Address/i]
+  when /yelp.com/
+    return !!((page.title[/Yelp/i] && !page.title[/Captcha/i]) || page.body['yelp-biz-id'])
+  when /yellowpages.com.au/
+    return !!page.body['listing-name']
+  when /whitepages.com\/business/
+    return !!page.body['app-id=287734809']
+  when /angieslist.com.*\d.htm/
+    return !!page.title['Angies List']
+  when /addresssearch/
+    return page.body['g-plusone']
+
+
+
+  end
+  return false if page.body[/exceeded your daily request/]
+  begin
+    result = JSON.parse(page.body)['results'][0]
+    return true if result['address_components'].find{|x|x['types'].include?('country')}['short_name'] == 'US'
+  rescue
+  end
+  return !page.body[/zscaler|captcha/i]
+  puts "no match: #{url}"
+  page.body[/UA-10299948/i] && !page.body[/Authentication Required/i]
 end
 
 def clean str
   str.gsub(/[[:space:]]+/, ' ').strip
 end
 
+def check browser
+  html = browser.html.to_s
+  return true if html[/Pardon Our Interruption|Zscaler|captcha/i]
+  return true if browser.html.length > 5000
+  false
+end
+
 def download
   loop do
     Mechanize.start do |agent|
-      agent.read_timeout = agent.open_timeout =
+      agent.read_timeout = agent.open_timeout = agent.idle_timeout = 10000
+      keep_alive = false
       agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
-      agent.user_agent = [
+      ua = agent.user_agent = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1',
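The `set_socks` patch above swaps the agent's persistent-HTTP connection factory for `Net::HTTP.SOCKSProxy` from the socksify gem. A minimal usage sketch, assuming socksify is installed and the patch above has been loaded; host and port are placeholders:

require 'mechanize'
require 'socksify'
require 'socksify/http'
# ... plus the Mechanize::HTTP::Agent#set_socks patch shown above ...

agent = Mechanize.new
# Mechanize#agent returns the inner Mechanize::HTTP::Agent; after
# set_socks, its http_class builds connections through the proxy.
agent.agent.set_socks '127.0.0.1', 9050
puts agent.get('http://example.com/').title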
@@ -40,26 +183,58 @@ def download
       end
       unless proxy = @proxies.shift
         puts "no more proxies"
-
+        exit
       end
       @proxies.push proxy
       host, port = proxy.split(':')
-
+      if @socks
+        agent.agent.set_socks host, port.to_i
+      else
+        agent.set_proxy host, port.to_i, 'user', 'pass'
+      end
       begin
-
-
-
-
+        agent.request_headers = {'Referer' => 'http://www.google.com/search'}
+        page = nil
+        if url[/manta/]
+          html = `phantomjs --proxy=#{proxy} #{BASEDIR}/src/cookies.js #{url}`
+          page = Mechanize::Page.new URI.parse(url), [], html, nil, Mechanize.new
+        else
+          page = agent.get url
+        end
+
+        unless response_ok?(page, url)
+          # binding.pry if url[/manta/] && !page.body[/timed out|blocked|forbidden/i]
+          if page.title
+            puts page.title.strip
+          else
+            raise "no title for: #{url}"
+          end
          raise 'str'
        end
        save_cache url, page.body
 
        @good += 1
-        puts url
+        puts "- [#{@queue.length + @threads.select(&:alive?).length}/#{@proxies.length}] #{url}"
      rescue StandardError => e
-
-        @
-        @
+        @failures[url] ||= 0
+        @failures[url] += 1
+        unless @failures[url] >= @max_failures
+          @queue.push(url) # unless e.message[/no title for/]
+        end
+        # binding.pry
+        if e.message[/execurtion exeprrred/]
+          print 'r'
+        elsif e.message[/403/] && !@pattern
+          if (rand * 3).to_i == 0
+            @proxies -= [proxy]
+            print '!'
+          end
+        else
+          @proxies -= [proxy]
+          print '!'
+        end
+        puts "! - #{@failures[url]} - #{e.message[0..99]}"
+
        agent.cookie_jar.clear!
      end
    end
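This rescue block replaces the library-level retry option removed from scrapey.rb: failures are counted per URL, the URL is requeued until it hits `@max_failures`, and the offending proxy is dropped (with probability 1/3 for plain 403s when no pattern filter is active, unconditionally otherwise). The same accounting pattern in isolation, with illustrative names:

# Requeue-with-failure-cap sketch; Hash.new(0) stands in for the
# diff's `@failures[url] ||= 0` initialization.
@failures     = Hash.new(0)
@max_failures = 5
@queue        = []

def record_failure(url, error)
  @failures[url] += 1
  # Give the URL another chance until it burns its retry budget.
  @queue.push(url) if @failures[url] < @max_failures
  puts "! - #{@failures[url]} - #{error.message[0..99]}"
end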
@@ -67,18 +242,28 @@ def download
 
 end
 
-
-@
+def run
+  puts @queue.length
+  @num_threads = [@max_threads, @queue.length].min
+  puts "#{@proxies.length} proxies, #{@queue.length} urls, #{@num_threads} threads"
 
-
-@good = 0
-start_time = Time.now
+  @banned_for = []
 
-@
+  @threads = []
+  @deficit = 0
 
-
-
-
-
+  until @queue.empty? || @proxies.empty?
+    @good = 0
+    start_time = Time.now
+
+    @proxies.shuffle!
+
+    @num_threads.times do
+      @threads << Thread.new { download }
+    end
+    @threads.each { |t| t.join }
 
+  end
 end
+run
+
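The new `run` driver sizes its pool at `min(@max_threads, queue length)` and launches batches until the queue or the proxy list runs dry. Per the ARGV parsing earlier in the file, `--retries=N` sets the failure cap, `--threads=N` the pool size, a bare `socks` argument switches the proxy source to `config/socks.txt`, `nopattern` drops the big-name domains from the queue, and the first leftover argument filters URLs as a regex. A stripped-down sketch of the same pool shape; it substitutes a thread-safe Queue for the file's plain Array, since a bare Array shared across threads is racy:

require 'thread'

queue = Queue.new
%w[http://example.com/1 http://example.com/2].each { |u| queue << u }

num_threads = [50, queue.size].min
threads = num_threads.times.map do
  Thread.new do
    until queue.empty?
      url = queue.pop(true) rescue break  # non-blocking pop; stop when drained
      puts "fetch #{url}"                 # placeholder for the download work
    end
  end
end
threads.each(&:join)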
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scrapey
 version: !ruby/object:Gem::Version
-  version: 0.0.17
+  version: 0.0.19
 platform: ruby
 authors:
 - P Guardiario
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2016-04-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -24,34 +24,6 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
-- !ruby/object:Gem::Dependency
-  name: httpclient
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ! '>='
-      - !ruby/object:Gem::Version
-        version: '0'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ! '>='
-      - !ruby/object:Gem::Version
-        version: '0'
-- !ruby/object:Gem::Dependency
-  name: json
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ~>
-      - !ruby/object:Gem::Version
-        version: 1.7.0
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ~>
-      - !ruby/object:Gem::Version
-        version: 1.7.0
 description: A simple scraping framework
 email:
 - pguardiario@gmail.com