scrapey 0.0.17 → 0.0.19

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- ZWYyNGI3OGE3MTA2ZmQxMGE0MzE1MmE2ZjA5YTFhYTliOTY1OTY5ZQ==
4
+ ZWMwZWY5N2ExNjliMDVhNmQzNDEyNjJmZjExNjZkNjMyM2VkMGFhZA==
5
5
  data.tar.gz: !binary |-
6
- YjRjOTVjOTkzNmEwOGE2NmMzYTVkNmNjMGRkODRjZjQ2OWM3OWNhNw==
6
+ YmNjZDFkMTg0MDZjZmZmMDM4ZjRmMzNiYjVjZWE3NTk3Y2YwYWY1Mg==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- NzkzOWNhMTA2MGQ3MDYwYjA2ZjQ2M2Y1OTQ4YTczMDljMWQ2YjRhYjcyMTk0
10
- Yzc4ZjczNjU5MTBjN2MyOTczM2Y0NDZkNzY0MDdhOGU4MDQ1ODA3ODMwZTJi
11
- MzMyZGFlNDc4N2MxMmViYjM5MjE2N2Y1MjFiNDY0ODJiNGM3ZDE=
9
+ NmEwY2RjZTAxNDBlM2JmNmI0MGExZDg0MGI3OWJkZjlmZTI0YTQ2ZDkyNTI4
10
+ ZDEyOGVhNDAxYjIxYWVkOTkyZTUwNDM3ZDY2MzJiZTEwYjZkN2M2ZGExYjNh
11
+ NWYwOWJmNzFiMjYwNjAyYzRlNmRiNDA2MjUzNGJlOTNmNWMzNDc=
12
12
  data.tar.gz: !binary |-
13
- MWQyZjQ1NTA4NjA4ZGVmNjNjYmQ0MDY2ZDJhZWZlMGJhYWI1NDIyOTcwNzhi
14
- MDg0YWU0NmIyMmNhY2E1MTE3NGY3ODE0NDhmNDE3NTc4OGVhNjg0NjA4OWRk
15
- MTVmNmVmNDUyZGU5ZmFiMjg0N2Y5ZmVhM2UyMWRmYjM2MmQwMDE=
13
+ ZjkzYmMzYjJlZTFkNmU0NWVjMTQ3YjBhZGE5NzY5ZTdmN2Q2N2U0NGI1YmEx
14
+ NTVmN2MzYTVkN2FjYTA2YWRkNGY1Y2RjNDY5MjE4YjIxOGNjMmVmNGQ4MzA3
15
+ MjE2MTRkNzkwMDc3MGYwMjE0M2Y3YmI2M2RhZWUzZDJlMGVhNWY=
@@ -31,7 +31,7 @@ unless defined? Rails
31
31
  @output = File.join BASEDIR, 'output.csv'
32
32
 
33
33
  # read config file
34
- config_file = "#{BASEDIR}/config/config.yml"
34
+ config_file = @config_file_path || "#{BASEDIR}/config/config.yml"
35
35
  @config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
36
36
 
37
37
  init_db if @config['database']
@@ -50,11 +50,11 @@ if defined?(Ocra)
50
50
  'active_record',
51
51
  'active_record/schema',
52
52
  'active_record/connection_adapters/abstract/schema_definitions',
53
- @config['database'] ? @config['database']['adapter'] : 'mysql',
53
+ @config['database'] ? @config['database']['adapter'] : 'mysql2',
54
54
  'tzinfo',
55
55
  'active_support/all',
56
56
  'active_support/multibyte/chars'
57
- ].each{|lib| require lib}
57
+ ].each{|lib| puts lib; require lib}
58
58
  end
59
59
  end
60
60
 
@@ -1,5 +1,5 @@
1
1
  module Scrapey
2
- VERSION = "0.0.17"
2
+ VERSION = "0.0.19"
3
3
  BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
4
4
  URL = "https://github.com/monkeysuffrage/scrapey"
5
5
  end
@@ -1,3 +1,5 @@
1
+ # require 'phantom_mechanize'
2
+
1
3
  module Scrapey
2
4
 
3
5
  def self.init b
@@ -12,8 +14,6 @@ module Scrapey
12
14
 
13
15
  def get_or_post method, url, options={}, *args
14
16
  agent = ['goto', 'visit'].include?(method) ? @browser : @agent
15
- _retries = options.delete :retries
16
- _sleep = options.delete :sleep
17
17
  begin
18
18
  new_args = method, url
19
19
  unless options.empty? && args.empty?
@@ -21,34 +21,26 @@ module Scrapey
21
21
  args.each{|arg| new_args << arg}
22
22
  end
23
23
 
24
- doc = load_cache(url) if @use_cache
24
+ key = method == 'post' ? url + options.to_s : url
25
+ doc = load_cache(key) if @use_cache
25
26
  return doc if doc
26
27
 
27
28
  page = agent.send *new_args
28
29
  # str = page.respond_to?('root') ? page.root.to_s : page.body
29
30
  # save_cache(url, str) if @use_cache
30
- save_cache(url, page.body) if @use_cache
31
+ save_cache(key, page.body) if @use_cache
31
32
 
32
33
  #exit if Object.const_defined? :Ocra
33
34
  page
34
35
  rescue Exception => e
35
- case
36
- when defined? on_error
37
- return on_error e, method, url, options, *args
38
- when _retries && _retries > 0
39
- puts "Error. Retries remaining: #{options[:retries]}"
40
- sleep _sleep if _sleep
41
- get_or_post method, url, options.merge({:retries => _retries - 1, :sleep => _sleep}), *args
42
- else raise e
43
- end
36
+ puts e.message
37
+ raise e
44
38
  end
45
39
  end
46
40
 
47
41
  def get *args; get_or_post 'get', *args; end
48
42
  def post *args; get_or_post 'post', *args; end
49
- def head *args; get_or_post 'head', *args; end
50
- def goto *args; get_or_post 'goto', *args; end
51
- def visit *args; get_or_post 'visit', *args; end
43
+ def phget *args; get_or_post 'phget', *args; end
52
44
 
53
45
  def set_proxy *args
54
46
  @agent.set_proxy *args
@@ -16,7 +16,5 @@ Gem::Specification.new do |gem|
16
16
  gem.require_paths = ["lib"]
17
17
  gem.version = Scrapey::VERSION
18
18
  gem.add_dependency(%q<mechanize>)
19
- gem.add_dependency(%q<httpclient>)
20
- gem.add_dependency(%q<json>, ["~> 1.7.0"])
21
19
  end
22
20
 
@@ -1,25 +1,168 @@
1
1
  require 'scrapey'
2
+ require 'watir-webdriver'
3
+ require 'pry'
4
+ require "socksify"
5
+ require 'socksify/http'
6
+ require 'net/https'
7
+
8
+ # Mechanize: call @agent.set_socks(addr, port) before using
9
+ # any of its methods; it might work in other cases,
10
+ # but I just haven't tried :)
11
+ class Mechanize::HTTP::Agent
12
+ public
13
+ def set_socks addr, port
14
+ set_http unless @http
15
+ class << @http
16
+ attr_accessor :socks_addr, :socks_port
17
+
18
+ def http_class
19
+ Net::HTTP.SOCKSProxy(socks_addr, socks_port)
20
+ end
21
+ end
22
+ @http.socks_addr = addr
23
+ @http.socks_port = port
24
+ @http.open_timeout = 100
25
+ @http.read_timeout = 100
26
+ end
27
+ end
28
+
29
+ at_exit do
30
+ Process.kill 9, Process.pid
31
+ @threads.each do |t|
32
+ Thread.kill t
33
+ print 'k'
34
+ end
35
+ end
2
36
 
3
37
  use_cache
4
38
 
39
+ @failures = {}
40
+ @max_failures = 5
41
+ @max_threads = 50
42
+
43
+ if arg = ARGV.find{|x| x[/--retries=(\d+)/]}
44
+ @max_failures = $1.to_i
45
+ ARGV.delete arg
46
+ end
47
+
48
+ if arg = ARGV.find{|x| x[/--threads=(\d+)/]}
49
+ @max_threads = $1.to_i
50
+ ARGV.delete arg
51
+ end
52
+
53
+ @socks = false
54
+ if arg = ARGV.find{|x| x[/socks/]}
55
+ @socks = true
56
+ ARGV.delete arg
57
+ end
58
+
59
+
5
60
  # File.open("#{BASEDIR}/config/urls.txt", 'w'){|f| f<< (0..100).map{|i| "http://www.example.com/id=#{i}"} * "\n"}
6
- @queue = File.read("#{BASEDIR}/config/urls.txt").split("\n").reject{|url| is_cached?(url)}.shuffle
7
- @proxies ||= File.read("#{BASEDIR}/config/proxies.txt").scan(/[\w.]+:\d+/)
61
+ @queue ||= File.read("#{BASEDIR}/config/urls.txt").split(/[[:space:]]+/).reject{|url| is_cached?(url)}.shuffle
62
+
63
+ if arg = ARGV.find{|x| x[/nopattern/]}
64
+ @queue.reject!{|x| x[/google|facebook|twitter|findthebest|linkedin|yellowpages|bizapedia|dandb|manta|indeed|hoovers|cortera|yelp|yellowpages|whitepages|angieslist/i]}
65
+ ARGV.delete arg
66
+ end
67
+
68
+
69
+ if @socks
70
+ @proxies = File.read("#{BASEDIR}/config/socks.txt").scan(/[\w.]+:\d+/).shuffle
71
+ else
72
+ @proxies = File.read("#{BASEDIR}/config/proxies.txt").scan(/[\w.]+:\d+/).shuffle
73
+ end
74
+
75
+ if @pattern = ARGV[0]
76
+ @queue = @queue.select{|x| x[/#{@pattern}/]}
77
+ end
8
78
 
9
- def response_ok? page
10
- page.body[/pub-9059175907567062/] && !page.body[/IP address/i]
79
+ # binding.pry
80
+
81
+ def response_ok? page, url = nil
82
+ if $0[/get_emails/]
83
+ return !page.body[/zscaler|captcha/i]
84
+ end
85
+
86
+ return false if page.body[/Welcome To Zscaler/]
87
+
88
+ case url
89
+ when /google.com\/search/
90
+ return page.body[/ - Google Search/i]
91
+ when /facebook/
92
+ return page.body[/akamai/i] && !page.body[/Security Check Required/i]
93
+ when /twitter/
94
+ return page.body[/tweets/i]
95
+ when /findthebest/
96
+ return page.body[/findthebest/i] && !page.body[/Captcha/i]
97
+ when /linkedin/
98
+ return page.body[/linkedin/i] && !page.body[/Captcha/i]
99
+ when /yellowpages/
100
+ return page.body[/yellowpages/i] && !page.body[/Captcha|IP Address/i]
101
+ when /bizapedia.com/
102
+ return page.body[/bizapedia/i] && !page.body[/Captcha|IP Address/i]
103
+ when /dandb.com/
104
+ return page.body[/dandb/i] && !page.body[/Captcha/i]
105
+ when /topdrz.com/
106
+ return page.body[/topdrz/i] && !page.body[/Captcha/i]
107
+ when /businessfinder\.[a-z]{2}\.com/
108
+ return page.body[/DC.title/i]
109
+ when /hipaaspace.com/
110
+ return page.body[/Fax/i]
111
+ when /manta.com/
112
+ if page.body[/(Zscaler|Captcha|IP Address|distil_ident_block)/i]
113
+ puts $1
114
+ return false
115
+ end
116
+ return page.body[/UA-10299948/]
117
+ when /indeed.com\/cmp.*$(?<!review)/
118
+ return page.body[/indeed/i] && !page.body[/Captcha|IP Address/i]
119
+ when /hoovers.com\/company-information/
120
+ return page.body[/hoovers/i] && !page.body[/Captcha|IP Address/i]
121
+ when /cortera.com/
122
+ return page.body[/cortera/i] && !page.body[/Captcha|IP Address/i]
123
+ when /yelp.com/
124
+ return !!((page.title[/Yelp/i] && !page.title[/Captcha/i]) || page.body['yelp-biz-id'])
125
+ when /yellowpages.com.au/
126
+ return !!page.body['listing-name']
127
+ when /whitepages.com\/business/
128
+ return !!page.body['app-id=287734809']
129
+ when /angieslist.com.*\d.htm/
130
+ return !!page.title['Angies List']
131
+ when /addresssearch/
132
+ return page.body['g-plusone']
133
+
134
+
135
+
136
+ end
137
+ return false if page.body[/exceeded your daily request/]
138
+ begin
139
+ result = JSON.parse(page.body)['results'][0]
140
+ return true if result['address_components'].find{|x|x['types'].include?('country')}['short_name'] == 'US'
141
+ rescue
142
+ end
143
+ return !page.body[/zscaler|captcha/i]
144
+ puts "no match: #{url}"
145
+ page.body[/UA-10299948/i] && !page.body[/Authentication Required/i]
11
146
  end
12
147
 
13
148
  def clean str
14
149
  str.gsub(/[[:space:]]+/, ' ').strip
15
150
  end
16
151
 
152
+ def check browser
153
+ html = browser.html.to_s
154
+ return true if html[/Pardon Our Interruption|Zscaler|captcha/i]
155
+ return true if browser.html.length > 5000
156
+ false
157
+ end
158
+
17
159
  def download
18
160
  loop do
19
161
  Mechanize.start do |agent|
20
- agent.read_timeout = agent.open_timeout = 30
162
+ agent.read_timeout = agent.open_timeout = agent.idle_timeout = 10000
163
+ keep_alive = false
21
164
  agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
22
- agent.user_agent = [
165
+ ua = agent.user_agent = [
23
166
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
24
167
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36',
25
168
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1',
@@ -40,26 +183,58 @@ def download
40
183
  end
41
184
  unless proxy = @proxies.shift
42
185
  puts "no more proxies"
43
- return
186
+ exit
44
187
  end
45
188
  @proxies.push proxy
46
189
  host, port = proxy.split(':')
47
- agent.set_proxy host, port.to_i
190
+ if @socks
191
+ agent.agent.set_socks host, port.to_i
192
+ else
193
+ agent.set_proxy host, port.to_i, 'user', 'pass'
194
+ end
48
195
  begin
49
- page = agent.get url
50
- unless response_ok?(page)
51
- page.search('script,style').remove
52
- puts clean(page.body)
196
+ agent.request_headers = {'Referer' => 'http://www.google.com/search'}
197
+ page = nil
198
+ if url[/manta/]
199
+ html = `phantomjs --proxy=#{proxy} #{BASEDIR}/src/cookies.js #{url}`
200
+ page = Mechanize::Page.new URI.parse(url), [], html, nil, Mechanize.new
201
+ else
202
+ page = agent.get url
203
+ end
204
+
205
+ unless response_ok?(page, url)
206
+ # binding.pry if url[/manta/] && !page.body[/timed out|blocked|forbidden/i]
207
+ if page.title
208
+ puts page.title.strip
209
+ else
210
+ raise "no title for: #{url}"
211
+ end
53
212
  raise 'str'
54
213
  end
55
214
  save_cache url, page.body
56
215
 
57
216
  @good += 1
58
- puts url
217
+ puts "- [#{@queue.length + @threads.select(&:alive?).length}/#{@proxies.length}] #{url}"
59
218
  rescue StandardError => e
60
- puts e.message[0..99]
61
- @queue.push url
62
- @proxies -= [proxy]
219
+ @failures[url] ||= 0
220
+ @failures[url] += 1
221
+ unless @failures[url] >= @max_failures
222
+ @queue.push(url) # unless e.message[/no title for/]
223
+ end
224
+ # binding.pry
225
+ if e.message[/execurtion exeprrred/]
226
+ print 'r'
227
+ elsif e.message[/403/] && !@pattern
228
+ if (rand * 3).to_i == 0
229
+ @proxies -= [proxy]
230
+ print '!'
231
+ end
232
+ else
233
+ @proxies -= [proxy]
234
+ print '!'
235
+ end
236
+ puts "! - #{@failures[url]} - #{e.message[0..99]}"
237
+
63
238
  agent.cookie_jar.clear!
64
239
  end
65
240
  end
@@ -67,18 +242,28 @@ def download
67
242
 
68
243
  end
69
244
 
70
- threads = []
71
- @deficit = 0
245
+ def run
246
+ puts @queue.length
247
+ @num_threads = [@max_threads, @queue.length].min
248
+ puts "#{@proxies.length} proxies, #{@queue.length} urls, #{@num_threads} threads"
72
249
 
73
- until @queue.empty?
74
- @good = 0
75
- start_time = Time.now
250
+ @banned_for = []
76
251
 
77
- @proxies.shuffle!
252
+ @threads = []
253
+ @deficit = 0
78
254
 
79
- 10.times do
80
- threads << Thread.new { download }
81
- end
82
- threads.each { |t| t.join }
255
+ until @queue.empty? || @proxies.empty?
256
+ @good = 0
257
+ start_time = Time.now
258
+
259
+ @proxies.shuffle!
260
+
261
+ @num_threads.times do
262
+ @threads << Thread.new { download }
263
+ end
264
+ @threads.each { |t| t.join }
83
265
 
266
+ end
84
267
  end
268
+ run
269
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapey
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.17
4
+ version: 0.0.19
5
5
  platform: ruby
6
6
  authors:
7
7
  - P Guardiario
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-29 00:00:00.000000000 Z
11
+ date: 2016-04-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -24,34 +24,6 @@ dependencies:
24
24
  - - ! '>='
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
- - !ruby/object:Gem::Dependency
28
- name: httpclient
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ! '>='
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ! '>='
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: json
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - ~>
46
- - !ruby/object:Gem::Version
47
- version: 1.7.0
48
- type: :runtime
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - ~>
53
- - !ruby/object:Gem::Version
54
- version: 1.7.0
55
27
  description: A simple scraping framework
56
28
  email:
57
29
  - pguardiario@gmail.com