scrapey 0.0.17 → 0.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- ZWYyNGI3OGE3MTA2ZmQxMGE0MzE1MmE2ZjA5YTFhYTliOTY1OTY5ZQ==
4
+ ZWMwZWY5N2ExNjliMDVhNmQzNDEyNjJmZjExNjZkNjMyM2VkMGFhZA==
5
5
  data.tar.gz: !binary |-
6
- YjRjOTVjOTkzNmEwOGE2NmMzYTVkNmNjMGRkODRjZjQ2OWM3OWNhNw==
6
+ YmNjZDFkMTg0MDZjZmZmMDM4ZjRmMzNiYjVjZWE3NTk3Y2YwYWY1Mg==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- NzkzOWNhMTA2MGQ3MDYwYjA2ZjQ2M2Y1OTQ4YTczMDljMWQ2YjRhYjcyMTk0
10
- Yzc4ZjczNjU5MTBjN2MyOTczM2Y0NDZkNzY0MDdhOGU4MDQ1ODA3ODMwZTJi
11
- MzMyZGFlNDc4N2MxMmViYjM5MjE2N2Y1MjFiNDY0ODJiNGM3ZDE=
9
+ NmEwY2RjZTAxNDBlM2JmNmI0MGExZDg0MGI3OWJkZjlmZTI0YTQ2ZDkyNTI4
10
+ ZDEyOGVhNDAxYjIxYWVkOTkyZTUwNDM3ZDY2MzJiZTEwYjZkN2M2ZGExYjNh
11
+ NWYwOWJmNzFiMjYwNjAyYzRlNmRiNDA2MjUzNGJlOTNmNWMzNDc=
12
12
  data.tar.gz: !binary |-
13
- MWQyZjQ1NTA4NjA4ZGVmNjNjYmQ0MDY2ZDJhZWZlMGJhYWI1NDIyOTcwNzhi
14
- MDg0YWU0NmIyMmNhY2E1MTE3NGY3ODE0NDhmNDE3NTc4OGVhNjg0NjA4OWRk
15
- MTVmNmVmNDUyZGU5ZmFiMjg0N2Y5ZmVhM2UyMWRmYjM2MmQwMDE=
13
+ ZjkzYmMzYjJlZTFkNmU0NWVjMTQ3YjBhZGE5NzY5ZTdmN2Q2N2U0NGI1YmEx
14
+ NTVmN2MzYTVkN2FjYTA2YWRkNGY1Y2RjNDY5MjE4YjIxOGNjMmVmNGQ4MzA3
15
+ MjE2MTRkNzkwMDc3MGYwMjE0M2Y3YmI2M2RhZWUzZDJlMGVhNWY=
@@ -31,7 +31,7 @@ unless defined? Rails
31
31
  @output = File.join BASEDIR, 'output.csv'
32
32
 
33
33
  # read config file
34
- config_file = "#{BASEDIR}/config/config.yml"
34
+ config_file = @config_file_path || "#{BASEDIR}/config/config.yml"
35
35
  @config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
36
36
 
37
37
  init_db if @config['database']
@@ -50,11 +50,11 @@ if defined?(Ocra)
50
50
  'active_record',
51
51
  'active_record/schema',
52
52
  'active_record/connection_adapters/abstract/schema_definitions',
53
- @config['database'] ? @config['database']['adapter'] : 'mysql',
53
+ @config['database'] ? @config['database']['adapter'] : 'mysql2',
54
54
  'tzinfo',
55
55
  'active_support/all',
56
56
  'active_support/multibyte/chars'
57
- ].each{|lib| require lib}
57
+ ].each{|lib| puts lib; require lib}
58
58
  end
59
59
  end
60
60
 
@@ -1,5 +1,5 @@
1
1
  module Scrapey
2
- VERSION = "0.0.17"
2
+ VERSION = "0.0.19"
3
3
  BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
4
4
  URL = "https://github.com/monkeysuffrage/scrapey"
5
5
  end
@@ -1,3 +1,5 @@
1
+ # require 'phantom_mechanize'
2
+
1
3
  module Scrapey
2
4
 
3
5
  def self.init b
@@ -12,8 +14,6 @@ module Scrapey
12
14
 
13
15
  def get_or_post method, url, options={}, *args
14
16
  agent = ['goto', 'visit'].include?(method) ? @browser : @agent
15
- _retries = options.delete :retries
16
- _sleep = options.delete :sleep
17
17
  begin
18
18
  new_args = method, url
19
19
  unless options.empty? && args.empty?
@@ -21,34 +21,26 @@ module Scrapey
21
21
  args.each{|arg| new_args << arg}
22
22
  end
23
23
 
24
- doc = load_cache(url) if @use_cache
24
+ key = method == 'post' ? url + options.to_s : url
25
+ doc = load_cache(key) if @use_cache
25
26
  return doc if doc
26
27
 
27
28
  page = agent.send *new_args
28
29
  # str = page.respond_to?('root') ? page.root.to_s : page.body
29
30
  # save_cache(url, str) if @use_cache
30
- save_cache(url, page.body) if @use_cache
31
+ save_cache(key, page.body) if @use_cache
31
32
 
32
33
  #exit if Object.const_defined? :Ocra
33
34
  page
34
35
  rescue Exception => e
35
- case
36
- when defined? on_error
37
- return on_error e, method, url, options, *args
38
- when _retries && _retries > 0
39
- puts "Error. Retries remaining: #{options[:retries]}"
40
- sleep _sleep if _sleep
41
- get_or_post method, url, options.merge({:retries => _retries - 1, :sleep => _sleep}), *args
42
- else raise e
43
- end
36
+ puts e.message
37
+ raise e
44
38
  end
45
39
  end
46
40
 
47
41
  def get *args; get_or_post 'get', *args; end
48
42
  def post *args; get_or_post 'post', *args; end
49
- def head *args; get_or_post 'head', *args; end
50
- def goto *args; get_or_post 'goto', *args; end
51
- def visit *args; get_or_post 'visit', *args; end
43
+ def phget *args; get_or_post 'phget', *args; end
52
44
 
53
45
  def set_proxy *args
54
46
  @agent.set_proxy *args
@@ -16,7 +16,5 @@ Gem::Specification.new do |gem|
16
16
  gem.require_paths = ["lib"]
17
17
  gem.version = Scrapey::VERSION
18
18
  gem.add_dependency(%q<mechanize>)
19
- gem.add_dependency(%q<httpclient>)
20
- gem.add_dependency(%q<json>, ["~> 1.7.0"])
21
19
  end
22
20
 
@@ -1,25 +1,168 @@
1
1
  require 'scrapey'
2
+ require 'watir-webdriver'
3
+ require 'pry'
4
+ require "socksify"
5
+ require 'socksify/http'
6
+ require 'net/https'
7
+
8
+ # Mechanize: call @agent.set_socks(addr, port) before using
9
+ # any of its methods; it might be working in other cases,
10
+ # but I just didn't try :)
11
+ class Mechanize::HTTP::Agent
12
+ public
13
+ def set_socks addr, port
14
+ set_http unless @http
15
+ class << @http
16
+ attr_accessor :socks_addr, :socks_port
17
+
18
+ def http_class
19
+ Net::HTTP.SOCKSProxy(socks_addr, socks_port)
20
+ end
21
+ end
22
+ @http.socks_addr = addr
23
+ @http.socks_port = port
24
+ @http.open_timeout = 100
25
+ @http.read_timeout = 100
26
+ end
27
+ end
28
+
29
+ at_exit do
30
+ Process.kill 9, Process.pid
31
+ @threads.each do |t|
32
+ Thread.kill t
33
+ print 'k'
34
+ end
35
+ end
2
36
 
3
37
  use_cache
4
38
 
39
+ @failures = {}
40
+ @max_failures = 5
41
+ @max_threads = 50
42
+
43
+ if arg = ARGV.find{|x| x[/--retries=(\d+)/]}
44
+ @max_failures = $1.to_i
45
+ ARGV.delete arg
46
+ end
47
+
48
+ if arg = ARGV.find{|x| x[/--threads=(\d+)/]}
49
+ @max_threads = $1.to_i
50
+ ARGV.delete arg
51
+ end
52
+
53
+ @socks = false
54
+ if arg = ARGV.find{|x| x[/socks/]}
55
+ @socks = true
56
+ ARGV.delete arg
57
+ end
58
+
59
+
5
60
  # File.open("#{BASEDIR}/config/urls.txt", 'w'){|f| f<< (0..100).map{|i| "http://www.example.com/id=#{i}"} * "\n"}
6
- @queue = File.read("#{BASEDIR}/config/urls.txt").split("\n").reject{|url| is_cached?(url)}.shuffle
7
- @proxies ||= File.read("#{BASEDIR}/config/proxies.txt").scan(/[\w.]+:\d+/)
61
+ @queue ||= File.read("#{BASEDIR}/config/urls.txt").split(/[[:space:]]+/).reject{|url| is_cached?(url)}.shuffle
62
+
63
+ if arg = ARGV.find{|x| x[/nopattern/]}
64
+ @queue.reject!{|x| x[/google|facebook|twitter|findthebest|linkedin|yellowpages|bizapedia|dandb|manta|indeed|hoovers|cortera|yelp|yellowpages|whitepages|angieslist/i]}
65
+ ARGV.delete arg
66
+ end
67
+
68
+
69
+ if @socks
70
+ @proxies = File.read("#{BASEDIR}/config/socks.txt").scan(/[\w.]+:\d+/).shuffle
71
+ else
72
+ @proxies = File.read("#{BASEDIR}/config/proxies.txt").scan(/[\w.]+:\d+/).shuffle
73
+ end
74
+
75
+ if @pattern = ARGV[0]
76
+ @queue = @queue.select{|x| x[/#{@pattern}/]}
77
+ end
8
78
 
9
- def response_ok? page
10
- page.body[/pub-9059175907567062/] && !page.body[/IP address/i]
79
+ # binding.pry
80
+
81
+ def response_ok? page, url = nil
82
+ if $0[/get_emails/]
83
+ return !page.body[/zscaler|captcha/i]
84
+ end
85
+
86
+ return false if page.body[/Welcome To Zscaler/]
87
+
88
+ case url
89
+ when /google.com\/search/
90
+ return page.body[/ - Google Search/i]
91
+ when /facebook/
92
+ return page.body[/akamai/i] && !page.body[/Security Check Required/i]
93
+ when /twitter/
94
+ return page.body[/tweets/i]
95
+ when /findthebest/
96
+ return page.body[/findthebest/i] && !page.body[/Captcha/i]
97
+ when /linkedin/
98
+ return page.body[/linkedin/i] && !page.body[/Captcha/i]
99
+ when /yellowpages/
100
+ return page.body[/yellowpages/i] && !page.body[/Captcha|IP Address/i]
101
+ when /bizapedia.com/
102
+ return page.body[/bizapedia/i] && !page.body[/Captcha|IP Address/i]
103
+ when /dandb.com/
104
+ return page.body[/dandb/i] && !page.body[/Captcha/i]
105
+ when /topdrz.com/
106
+ return page.body[/topdrz/i] && !page.body[/Captcha/i]
107
+ when /businessfinder\.[a-z]{2}\.com/
108
+ return page.body[/DC.title/i]
109
+ when /hipaaspace.com/
110
+ return page.body[/Fax/i]
111
+ when /manta.com/
112
+ if page.body[/(Zscaler|Captcha|IP Address|distil_ident_block)/i]
113
+ puts $1
114
+ return false
115
+ end
116
+ return page.body[/UA-10299948/]
117
+ when /indeed.com\/cmp.*$(?<!review)/
118
+ return page.body[/indeed/i] && !page.body[/Captcha|IP Address/i]
119
+ when /hoovers.com\/company-information/
120
+ return page.body[/hoovers/i] && !page.body[/Captcha|IP Address/i]
121
+ when /cortera.com/
122
+ return page.body[/cortera/i] && !page.body[/Captcha|IP Address/i]
123
+ when /yelp.com/
124
+ return !!((page.title[/Yelp/i] && !page.title[/Captcha/i]) || page.body['yelp-biz-id'])
125
+ when /yellowpages.com.au/
126
+ return !!page.body['listing-name']
127
+ when /whitepages.com\/business/
128
+ return !!page.body['app-id=287734809']
129
+ when /angieslist.com.*\d.htm/
130
+ return !!page.title['Angies List']
131
+ when /addresssearch/
132
+ return page.body['g-plusone']
133
+
134
+
135
+
136
+ end
137
+ return false if page.body[/exceeded your daily request/]
138
+ begin
139
+ result = JSON.parse(page.body)['results'][0]
140
+ return true if result['address_components'].find{|x|x['types'].include?('country')}['short_name'] == 'US'
141
+ rescue
142
+ end
143
+ return !page.body[/zscaler|captcha/i]
144
+ puts "no match: #{url}"
145
+ page.body[/UA-10299948/i] && !page.body[/Authentication Required/i]
11
146
  end
12
147
 
13
148
  def clean str
14
149
  str.gsub(/[[:space:]]+/, ' ').strip
15
150
  end
16
151
 
152
+ def check browser
153
+ html = browser.html.to_s
154
+ return true if html[/Pardon Our Interruption|Zscaler|captcha/i]
155
+ return true if browser.html.length > 5000
156
+ false
157
+ end
158
+
17
159
  def download
18
160
  loop do
19
161
  Mechanize.start do |agent|
20
- agent.read_timeout = agent.open_timeout = 30
162
+ agent.read_timeout = agent.open_timeout = agent.idle_timeout = 10000
163
+ keep_alive = false
21
164
  agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
22
- agent.user_agent = [
165
+ ua = agent.user_agent = [
23
166
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
24
167
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36',
25
168
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1',
@@ -40,26 +183,58 @@ def download
40
183
  end
41
184
  unless proxy = @proxies.shift
42
185
  puts "no more proxies"
43
- return
186
+ exit
44
187
  end
45
188
  @proxies.push proxy
46
189
  host, port = proxy.split(':')
47
- agent.set_proxy host, port.to_i
190
+ if @socks
191
+ agent.agent.set_socks host, port.to_i
192
+ else
193
+ agent.set_proxy host, port.to_i, 'user', 'pass'
194
+ end
48
195
  begin
49
- page = agent.get url
50
- unless response_ok?(page)
51
- page.search('script,style').remove
52
- puts clean(page.body)
196
+ agent.request_headers = {'Referer' => 'http://www.google.com/search'}
197
+ page = nil
198
+ if url[/manta/]
199
+ html = `phantomjs --proxy=#{proxy} #{BASEDIR}/src/cookies.js #{url}`
200
+ page = Mechanize::Page.new URI.parse(url), [], html, nil, Mechanize.new
201
+ else
202
+ page = agent.get url
203
+ end
204
+
205
+ unless response_ok?(page, url)
206
+ # binding.pry if url[/manta/] && !page.body[/timed out|blocked|forbidden/i]
207
+ if page.title
208
+ puts page.title.strip
209
+ else
210
+ raise "no title for: #{url}"
211
+ end
53
212
  raise 'str'
54
213
  end
55
214
  save_cache url, page.body
56
215
 
57
216
  @good += 1
58
- puts url
217
+ puts "- [#{@queue.length + @threads.select(&:alive?).length}/#{@proxies.length}] #{url}"
59
218
  rescue StandardError => e
60
- puts e.message[0..99]
61
- @queue.push url
62
- @proxies -= [proxy]
219
+ @failures[url] ||= 0
220
+ @failures[url] += 1
221
+ unless @failures[url] >= @max_failures
222
+ @queue.push(url) # unless e.message[/no title for/]
223
+ end
224
+ # binding.pry
225
+ if e.message[/execurtion exeprrred/]
226
+ print 'r'
227
+ elsif e.message[/403/] && !@pattern
228
+ if (rand * 3).to_i == 0
229
+ @proxies -= [proxy]
230
+ print '!'
231
+ end
232
+ else
233
+ @proxies -= [proxy]
234
+ print '!'
235
+ end
236
+ puts "! - #{@failures[url]} - #{e.message[0..99]}"
237
+
63
238
  agent.cookie_jar.clear!
64
239
  end
65
240
  end
@@ -67,18 +242,28 @@ def download
67
242
 
68
243
  end
69
244
 
70
- threads = []
71
- @deficit = 0
245
+ def run
246
+ puts @queue.length
247
+ @num_threads = [@max_threads, @queue.length].min
248
+ puts "#{@proxies.length} proxies, #{@queue.length} urls, #{@num_threads} threads"
72
249
 
73
- until @queue.empty?
74
- @good = 0
75
- start_time = Time.now
250
+ @banned_for = []
76
251
 
77
- @proxies.shuffle!
252
+ @threads = []
253
+ @deficit = 0
78
254
 
79
- 10.times do
80
- threads << Thread.new { download }
81
- end
82
- threads.each { |t| t.join }
255
+ until @queue.empty? || @proxies.empty?
256
+ @good = 0
257
+ start_time = Time.now
258
+
259
+ @proxies.shuffle!
260
+
261
+ @num_threads.times do
262
+ @threads << Thread.new { download }
263
+ end
264
+ @threads.each { |t| t.join }
83
265
 
266
+ end
84
267
  end
268
+ run
269
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapey
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.17
4
+ version: 0.0.19
5
5
  platform: ruby
6
6
  authors:
7
7
  - P Guardiario
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-29 00:00:00.000000000 Z
11
+ date: 2016-04-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -24,34 +24,6 @@ dependencies:
24
24
  - - ! '>='
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
- - !ruby/object:Gem::Dependency
28
- name: httpclient
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ! '>='
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ! '>='
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: json
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - ~>
46
- - !ruby/object:Gem::Version
47
- version: 1.7.0
48
- type: :runtime
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - ~>
53
- - !ruby/object:Gem::Version
54
- version: 1.7.0
55
27
  description: A simple scraping framework
56
28
  email:
57
29
  - pguardiario@gmail.com