scrapey 0.0.13 → 0.0.16

@@ -0,0 +1,15 @@
+ ---
+ !binary "U0hBMQ==":
+   metadata.gz: !binary |-
+     Njk0YTkyMDY0MGYxMzM1ZDMwOWM5Yzg4YmQ4YTA2NGM3N2Q1ZGUxMw==
+   data.tar.gz: !binary |-
+     YjE2NjNlYjQwMTIwMWE3YmRjNWQzNDVlZWI0MjhjNWI1NWMzNTg1OQ==
+ SHA512:
+   metadata.gz: !binary |-
+     NTcyZWFmYjdhZmMyMjVjNzc1ZTk5ZDRkMTM2YjYxMTE5NjZmMThmNTc2MGVl
+     MTI4Yjc0ODkwZWQ3NGNkNWM4NmI5ZGFlMmJiYWNiMjFhMDEyMGIzMjhkZmYz
+     ZDNlN2Y3NGFjYzdjNjlhMWE4Y2FiMzcxZmQyNDBlZTM4MTA1Yzg=
+   data.tar.gz: !binary |-
+     NmIwOGRjZTczMTE1YjE2MDQzNzc5MmJmYTQ4MzQzMGNlNGQ1Y2YxMzA1MDZk
+     NzkzMTdhOTI3MzAxODNiYmZhNzlhNjkzNGUwODAwZTVmYmRjZjY1ZjdkOGJm
+     ZTFmOWQ5OTFiZmQwY2U3NTBjYzA2ZjBkNjcwZmMzNjcxY2E2ODY=
@@ -11,6 +11,16 @@ require "scrapey/database"
  require "scrapey/multi"
  require "scrapey/tee"

+ require 'addressable/uri'
+
+ class URI::Parser
+   def split url
+     a = Addressable::URI::parse url
+     [a.scheme, a.userinfo, a.host, a.port, nil, a.path, nil, a.query, a.fragment]
+   end
+ end
+
+
  # don't do this stuff in rails:
  unless defined? Rails
    Scrapey::init binding
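
The monkey-patch above replaces stdlib URI's strict splitting with Addressable's more forgiving parser, so URI.parse stops rejecting the malformed URLs scrapers routinely meet. A minimal sketch of the effect (assuming the addressable gem is installed; the example.com URL is illustrative):

    require 'uri'
    require 'addressable/uri'  # the patched URI::Parser#split delegates to this

    # plain stdlib URI raises URI::InvalidURIError on the unencoded space;
    # with the patch in place the same call succeeds:
    uri = URI.parse('http://example.com/some path?q=1')
    uri.host  # => "example.com"
    uri.path  # => "/some path"
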
@@ -24,5 +34,26 @@ unless defined? Rails

    init_db if @config['database']

-   $stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "w"))
- end
+   #$stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "w"))
+ end
+
+ if defined?(Ocra)
+   puts "doing ocra stuff..."
+   Mechanize.new.cookies
+   HTTP::Cookie::Scanner.new ''
+   if @config['database'] || @config['databases']
+     puts "doing ocra db stuff..."
+     ActiveRecord::Relation::PredicateBuilder.new rescue nil
+     [
+       'active_record',
+       'active_record/schema',
+       'active_record/connection_adapters/abstract/schema_definitions',
+       @config['database'] ? @config['database']['adapter'] : 'mysql',
+       'tzinfo',
+       'active_support/all',
+       'active_support/multibyte/chars'
+     ].each{|lib| require lib}
+   end
+ end
+
+ Dir.chdir BASEDIR
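
For context on the Ocra block: Ocra bundles into the .exe only the files that get loaded while it traces the script, so the block deliberately touches Mechanize's cookie machinery and requires the ActiveRecord/tzinfo files up front. A hedged sketch of the same trick for any runtime-only dependency ('json' here is just an illustration, not part of the gem):

    if defined?(Ocra)
      # anything required or exercised here ends up inside the packaged .exe
      require 'json'
      JSON.parse('{}')  # run the code path so autoloaded files get pulled in too
    end
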
@@ -15,7 +15,7 @@ module Scrapey
      return nil unless File::exists?(filename)
      debug "Loading #{filename} from cache"
      begin
-       Nokogiri::HTML Marshal.load(File.open(filename, "rb"){|f| f.read})
+       Mechanize::Page.new URI.parse(url), [], Marshal.load(File.open(filename, "rb"){|f| f.read}), nil, @agent
      rescue Exception => e
        puts e.message
      end
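
Rehydrating the cached body as a Mechanize::Page instead of a bare Nokogiri document means cache hits behave like live fetches: Nokogiri-style querying still works, and Mechanize's link/form helpers come back too, with relative links resolving against the original URL. A small sketch mirroring the constructor call used above (Mechanize::Page.new takes uri, response, body, code, mech):

    require 'mechanize'

    body = '<a href="/next">next</a>'
    page = Mechanize::Page.new(URI.parse('http://example.com/'), [], body, nil, Mechanize.new)
    page.search('a').size  # => 1, Nokogiri querying intact
    page.links.first.href  # => "/next", plus the Mechanize helpers
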
@@ -9,7 +9,7 @@ module Scrapey
    def load_cache url
      debug "Loading #{url} from cache"
      return nil unless str = @redis.get(url)
-     Nokogiri::HTML Marshal.load(str) rescue nil
+     Mechanize::Page.new(URI.parse(url), [], Marshal.load(str), nil, @agent) rescue nil
    end

    def save_cache url, body, options = {}
@@ -1,5 +1,5 @@
  module Scrapey
-   VERSION = "0.0.13"
+   VERSION = "0.0.16"
    BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
    URL = "https://github.com/monkeysuffrage/scrapey"
  end
@@ -0,0 +1,34 @@
+ require 'addressable/uri'
+
+ class URI::Parser
+   def split url
+     a = Addressable::URI::parse url
+     [a.scheme, a.userinfo, a.host, a.port, nil, a.path, nil, a.query, a.fragment]
+   end
+ end
+
+ class Hash
+   def shuffle
+     Hash[self.to_a.shuffle]
+   end
+ end
+
+ class Nokogiri::XML::NodeSet
+   def shuffle
+     self.to_a.shuffle
+   end
+ end
+
+ class Enumerator
+   def shuffle
+     self.to_a.shuffle
+   end
+ end
+
+ class CSV::Table
+   def shuffle
+     arr = self.to_a
+     k = arr.shift
+     arr.map{|v| Hash[k.zip v]}.shuffle
+   end
+ end
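
This new file gives Hash, Nokogiri node sets, Enumerators, and CSV tables a uniform #shuffle for randomizing scrape order. The CSV::Table version is the least obvious: it peels off the header row and returns the data rows as header-keyed Hashes in random order. A quick sketch with a tiny table:

    require 'csv'

    table = CSV.parse("name,age\nalice,30\nbob,25", :headers => true)
    table.shuffle
    # => e.g. [{"name"=>"bob", "age"=>"25"}, {"name"=>"alice", "age"=>"30"}]
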
@@ -85,4 +85,10 @@ module Scrapey
    def ts
      Time.now.to_i.to_s
    end
+
+   def enqueue url
+     @url_list ||= File.open("#{BASEDIR}/config/urls.txt", 'w')
+     @url_list << url
+     @url_list << "\n"
+   end
  end
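
enqueue gives scripts a one-liner for queueing URLs into config/urls.txt for a later pass; note the 'w' mode means the queue starts fresh each run. A hedged usage sketch (page, @url, and the CSS selector are stand-ins for whatever the calling scraper has in scope):

    page.search('a.result').each do |a|
      enqueue URI.join(@url, a[:href]).to_s
    end
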
@@ -0,0 +1 @@
+ config
@@ -18,7 +18,7 @@ task 'dropbox' do
      folder = [ENV['DROPBOX'], name].join('/').squeeze('/')
      FileUtils.mkdir(folder) unless File.exists?(folder)
      FileUtils.cp "Output/#{file}", folder
-     url = [ENV['DROPBOX_public_url'], name, file].join('/').squeeze('/')
+     url = [ENV['DROPBOX_public_url'], name, file].join('/')
      puts "uploaded to #{url}"
    end
  end
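
Dropping .squeeze('/') is the actual fix in this hunk: squeezing the joined string also collapsed the '//' after the scheme. A worked example with a hypothetical public URL:

    ['https://dl.example.com', 'proj', 'out.csv'].join('/').squeeze('/')
    # => "https:/dl.example.com/proj/out.csv"   (scheme separator mangled)
    ['https://dl.example.com', 'proj', 'out.csv'].join('/')
    # => "https://dl.example.com/proj/out.csv"
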
@@ -12,6 +12,9 @@ password: 12345
  # host: localhost
  # encoding: 'utf8'

+ #category: xxx
+ #dataset_name: yyy
+
  # example proxies section
  #proxies:
  #- www.host1.com:80
@@ -0,0 +1,48 @@
+ require 'scrapey'
+ require 'pry'
+
+ =begin
+ @config = {
+   'category' => 'businesses',
+   'dataset_name' => 'brazilian_companies',
+   'database' => {
+     'adapter' => 'mysql',
+     'database' => 'stefan',
+     'username' => 'root',
+     'password' => '12345',
+     'host' => 'localhost',
+     'encoding' => 'utf8'
+   }
+ }
+ =end
+
+ def post url, body
+   page = @agent.post url, body
+   JSON.parse(page.body).each{|k, v|}
+   raise 'x' unless page.body
+   page
+ rescue Exception => e
+   print '!'
+   sleep 10
+   return post url, body
+ end
+
+ @agent.open_timeout = @agent.read_timeout = 10000
+
+ tables = ActiveRecord::Base.connection.tables
+
+ tables.each do |table|
+   puts table
+   tables table.camelize
+   klass = table.camelize.constantize
+   return unless klass.column_names.include?('website')
+
+   klass.where("website is not null and email is null").find_in_batches(:batch_size => 10) do |group|
+     page = post('http://www.pay4data.com/lookup/email_for_url', {urls: group.map(&:website).compact}.to_json)
+     JSON.parse(page.body).each do |k, v|
+       group.find{|r| r['website'] == k}.update_attributes(:email => v)
+       puts k
+     end
+   end
+ end
+
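
The post helper above retries failed requests by calling itself, so every failure adds a stack frame; the JSON.parse call doubles as a validity check on the response. A hedged, behavior-equivalent sketch using retry instead of recursion:

    def post url, body
      page = @agent.post url, body
      JSON.parse(page.body)  # raises unless the response is valid JSON
      page
    rescue Exception
      print '!'
      sleep 10
      retry
    end
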
@@ -0,0 +1,133 @@
+ require 'aws-sdk'
+ require 'scrapey'
+ require 'pry'
+
+ =begin
+ @config = {
+   'category' => 'businesses',
+   'dataset_name' => 'brazilian_companies',
+   'database' => {
+     'adapter' => 'mysql',
+     'database' => 'stefan',
+     'username' => 'root',
+     'password' => '12345',
+     'host' => 'localhost',
+     'encoding' => 'utf8'
+   }
+ }
+
+
+
+ CSV.open("#{BASEDIR}/#{table}.csv", 'w') do |csv|
+   csv << fields
+   klass.where(:found => true).find_each do |row|
+     csv << fields.map{|f| row[f]}
+   end
+ end
+
+ =end
+
+
+
+ def new_csv filename
+   File.open(filename, 'w') do |file|
+     file << 0xEF.chr + 0xBB.chr + 0xBF.chr
+   end
+   CSV.open(filename, 'a') do |csv|
+     yield csv
+   end
+ end
+
+ unless @config['dataset_name'] && @config['category']
+   puts 'Please fill out dataset_name and category in config.yml to continue'
+   exit
+ end
+ init_db
+ @tables = ActiveRecord::Base.connection.tables
+
+ all_fields = []
+
+ @tables.each do |table|
+   puts table
+   tables table.camelize
+   klass = table.camelize.constantize
+
+   all_fields << klass.column_names
+   fields = klass.column_names - ['id', 'updated_at', 'created_at', 'updated_on', 'created_on']
+
+   new_csv("#{BASEDIR}/#{table}.csv") do |csv|
+     csv << fields
+     klass.all.find_each do |row|
+       csv << fields.map{|f| row[f]}
+     end
+   end
+
+   new_csv("#{BASEDIR}/#{table}_sample.csv") do |csv|
+     csv << fields
+     klass.order(:id).order('rand()').limit(50).each do |row|
+       csv << fields.map{|f| row[f]}
+     end
+   end
+
+ end
+
+ if @tables.length == 0
+   table = @tables.first
+   `gzip -f #{BASEDIR}/#{table}_sample.csv`
+   `gzip -f #{BASEDIR}/#{table}.csv`
+   csv_name = "#{table}.csv.gz"
+   sample_name = "#{table}_sample.csv.gz"
+
+   csv_name = "#{@config['dataset_name']}.csv.gz"
+   `mv #{BASEDIR}/#{table}.csv.gz #{csv_name}`
+   sample_name = "#{@config['dataset_name']}_sample.csv.gz"
+   `mv #{BASEDIR}/#{table}_sample.csv.gz #{sample_name}`
+
+ else
+   csv_name = "#{@config['dataset_name']}.csv.tar.gz"
+   sample_name = "#{@config['dataset_name']}.sample.tar.gz"
+   sample_sql = "#{@config['dataset_name']}_sample.sql"
+
+   cmd = "tar -czf #{csv_name} " + @tables.map{|x| x + '.csv'}.join(' ')
+   `#{cmd}`
+   File.open(sample_sql, 'w') do |f|
+     f << `"C:\\Program Files\\MySQL\\MySQL Server 5.6\\bin\\mysqldump.exe" -uroot -p12345 --where="true limit 100" #{@config['database']['database']}`
+   end
+   cmd = "tar -czf #{sample_name} #{sample_sql} " + @tables.map{|x| x + '_sample.csv'}.join(' ')
+   `#{cmd}`
+ end
+
+ # --where="true limit 100"
+ File.open("#{@config['dataset_name']}.sql", 'w') do |f|
+   f << `"C:\\Program Files\\MySQL\\MySQL Server 5.6\\bin\\mysqldump.exe" -uroot -p12345 #{@config['database']['database']}`
+ end
+ `gzip -f #{@config['dataset_name']}.sql`
+ sql_name = "#{@config['dataset_name']}.sql.gz"
+
+ s3 = AWS::S3.new :access_key_id => ENV['AMAZON_ACCESS_KEY_ID'], :secret_access_key => ENV['AMAZON_SECRET_ACCESS_KEY']
+ bucket = s3.buckets['pay4data']
+
+ sample_object = bucket.objects["#{@config['category']}/#{sample_name}"].write :file => sample_name, :content_type => 'application/gzip', :acl => :public_read
+ csv_object = bucket.objects["#{@config['category']}/#{csv_name}"].write :file => csv_name, :content_type => 'application/gzip'
+ sql_object = bucket.objects["#{@config['category']}/#{sql_name}"].write :file => sql_name, :content_type => 'application/gzip'
+
+ sql = <<EOF
+ insert into datasets(sample_url, csv_url, sql_url, last_crawled, fields) values(
+ '#{sample_object.public_url.to_s}',
+ '#{csv_object.public_url.to_s}',
+ '#{sql_object.public_url.to_s}',
+ now(),
+ '#{fields.map{|t| (t - ['id', 'updated_at', 'created_at', 'updated_on', 'created_on']).join ', '}.join ", "}'
+ );
+
+ update datasets set category_id=5, name='', description='', price='', button_html='' where id=
+
+
+ mysqldump pay4data datasets categories | mysql2 pay4data
+
+
+ EOF
+
+ puts sql
+
+
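
One non-obvious detail in this export script: the three bytes new_csv writes before any rows are the UTF-8 byte order mark, which is what convinces Excel to open the CSV as UTF-8 rather than the local codepage:

    0xEF.chr + 0xBB.chr + 0xBF.chr  # => "\xEF\xBB\xBF", the UTF-8 BOM
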
@@ -0,0 +1,14 @@
+ require 'scrapey'
+ require 'pry'
+ require 'syck'
+ require "#{BASEDIR}/src/proxy.rb"
+
+
+ YAML::ENGINE.yamler='syck'
+
+ #proxies = Proxy::get_proxies :proxy_list
+ proxies = Proxy::get_proxies :all
+
+ @config['proxies'] = proxies.uniq
+ File.open("#{BASEDIR}/config/config.yml", 'w') { |f| YAML.dump(@config, f) }
+
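
The Syck switch only matters on Rubies that ship both YAML engines: it routes the config.yml rewrite through the legacy Syck emitter instead of Psych, the 1.9+ default. Isolated, the switch is just:

    require 'syck'
    YAML::ENGINE.yamler = 'syck'  # subsequent YAML.dump calls use Syck
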
@@ -0,0 +1,278 @@
+ require "base64"
+
+ class Proxy
+   attr_reader :current
+   BOOM = 'boom'
+
+   def initialize agent = nil, options = {}
+     @user_agents = [
+       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36',
+       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1',
+       'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
+       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.69 Safari/537.36',
+       'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
+       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.69 Safari/537.36',
+       'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+       'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)'
+     ]
+     @agent = agent
+     @min = options[:min] || 5
+     @sleep = options[:sleep] || 60 * 60 # 1 hour
+     @verbose = options[:verbose] || false
+     @timeout = options[:timeout] || 30
+     @round_time = options[:round_time] || 5 * 60 # 5 minutes
+     @agent.open_timeout = @agent.read_timeout = @timeout
+     proxies = options[:proxies] || []
+     set_proxies proxies
+   end
+
+   def set_proxies proxies
+     @proxies = proxies.select{|x| x[/:/]}.uniq{|x| x[/.*:/]}
+     self.shuffle
+   end
+
+   def debug str
+     puts str if @verbose
+   end
+
+   def shuffle
+     @proxies = [BOOM] + (@proxies - [BOOM]).shuffle
+     start_round
+     self.rotate
+   end
+
+   def to_yaml
+     @proxies.to_yaml
+   end
+
+   def start_round
+     now = Time.now.to_i
+     if @round_start
+       sleep_time = @round_time - (now - @round_start)
+       if sleep_time > 0
+         puts "sleeping for #{sleep_time}"
+         sleep sleep_time
+       end
+     end
+     @round_start = Time.now.to_i
+   end
+
+   def get_more_proxies
+     puts 'getting more proxies'
+     proxies = Proxy::get_proxies
+     set_proxies proxies
+   end
+
+   def rotate
+     debug "rotating"
+     @proxies.rotate!
+     @user_agents.rotate!
+     if @proxies.length < @min
+       get_more_proxies
+     end
+     @current = @proxies.first
+     if @current == BOOM
+       start_round
+       rotate
+       return
+     end
+
+     host, port = @current.split ':'
+     debug "setting proxy to #{host}:#{port}"
+     @agent.set_proxy host, port.to_i
+     debug "setting user_agent to #{@user_agents.first}"
+     @agent.user_agent = @user_agents.first
+   end
+
+   def remove
+     debug "--- removing #{@current}"
+     @proxies.shift
+     rotate
+     debug @proxies.join(', ')
+     debug @current
+   end
+
+   def pause
+     time = @sleep / @proxies.length
+     debug "sleeping for #{time}"
+     sleep time
+   end
+
+   def length
+     @proxies.length
+   end
+
+
+
+   def self.get_idcloak
+     proxies = []
+     ['http://www.idcloak.com/proxylist/free-proxy-servers-list.html'].each do |url|
+       page = @agent.get url
+
+       page.search('#sort td[7]').each do |td|
+         port = td.text.strip
+         host = td.at('+ td').text.strip
+         proxies << "#{host}:#{port}"
+       end
+
+     end
+     proxies
+   end
+
+   def self.get_proxynova
+     proxies = []
+     ['http://www.proxynova.com/proxy-server-list/'].each do |url|
+       page = @agent.get url
+
+       page.search('.row_proxy_ip').each do |span|
+         str = span.text[/long2ip\((.*?)\)/, 1]
+         next if str[/a-z/i]
+         i = eval str
+         host = Proxy::long2ip(i)
+         port = span.parent.at('+ td').text.strip
+         proxies << "#{host}:#{port}"
+       end
+     end
+     proxies
+   end
+
+   def self.get_proxy_list
+     proxies = []
+     ['http://proxy-list.org/en/index.php',
+      'http://proxy-list.org/en/index.php?sp=20',
+      'http://proxy-list.org/en/index.php?sp=40',
+      'http://proxy-list.org/en/index.php?sp=60',
+      'http://proxy-list.org/en/index.php?sp=80',
+      'http://proxy-list.org/en/index.php?sp=100',
+      'http://proxy-list.org/en/index.php?sp=120'].each do |url|
+       page = @agent.get url
+       proxies += page.body.scan(/(?:\d+\.){3}\d+:\d+/)
+     end
+     proxies
+   end
+
+   def self.get_hidemyass
+     proxies = []
+     ['http://hidemyass.com/proxy-list/search-227752',
+      'http://hidemyass.com/proxy-list/search-227752/2',
+      'http://hidemyass.com/proxy-list/search-227752/3',
+      'http://hidemyass.com/proxy-list/search-227752/4',
+      'http://hidemyass.com/proxy-list/search-227752/5',
+      'http://hidemyass.com/proxy-list/search-227752/6'].each do |url|
+       page = @agent.get url
+       page.search('*[style*="display:none"]').remove
+       page.search(page.body.scan(/(\..*?)\{display:none\}/).flatten.join(', ')).remove
+       page.search('style').remove
+       proxies += page.search('td[2]').map{|x| x.text.strip}.zip(page.search('td[3]').map{|x| x.text.strip}).map{|h,p| "#{h}:#{p}"}[1..-1]
+     end
+     proxies
+   end
+
+   def self.get_cool_proxy
+     proxies = []
+     page = @agent.get 'http://www.cool-proxy.net/proxies/http_proxy_list/sort:score/direction:desc'
+     page.search('tr')[1..-2].each do |tr|
+       next unless tr.at('td[2]')
+       host = Base64.decode64 tr.at('td[1]').text[/"(.*?)"/, 1]
+       port = tr.at('td[2]').text
+       proxies << [host, port].join(':')
+     end
+
+     while a = page.at('a[rel=next]')
+       url = URI.join('http://www.freeproxylists.net/', a[:href]).to_s
+       begin
+         page = @agent.get url
+       rescue
+         return proxies
+       end
+       page.search('tr')[1..-2].each do |tr|
+         next unless tr.at('td[2]')
+         host = Base64.decode64 tr.at('td[1]').text[/"(.*?)"/, 1]
+         port = tr.at('td[2]').text
+         proxies << [host, port].join(':')
+       end
+     end
+
+     proxies
+   end
+
+
+   def self.get_freeproxylists
+     proxies = []
+
+     @agent.follow_meta_refresh = true
+     page = @agent.get 'http://www.freeproxylists.net/'
+
+     page.body.scan(/IPDecode\("([^"]+)"\)<\/script><\/td><td align="center">(\d+)/).each do |row|
+       proxies << [URI.decode(row[0]), row[1]].join(':')
+     end
+
+     while a = page.at('a[text()^=Next]')
+       url = URI.join('http://www.freeproxylists.net/', a[:href]).to_s
+       puts url
+       page = @agent.get url
+       page.body.scan(/IPDecode\("([^"]+)"\)<\/script><\/td><td align="center">(\d+)/).each do |row|
+         proxies << [URI.decode(row[0]), row[1]].join(':')
+       end
+     end
+
+     proxies
+   end
+
+   def self.long2ip(long)
+     ip = []
+     4.times do |i|
+       ip.push(long.to_i & 255)
+       long = long.to_i >> 8
+     end
+     ip.join(".")
+   end
+
+   def self.get_proxies provider = :all
+
+     @agent ||= Mechanize.new{|a| a.history.max_size = 10}
+     @agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
+     @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
+
+     case provider
+     when :proxy_list then return get_proxy_list
+     when :hidemyass then return get_hidemyass
+     when :freeproxylists then return get_freeproxylists
+     when :cool_proxy then return get_cool_proxy
+     when :proxynova then return get_proxynova
+     when :idcloak then return get_idcloak
+     when :all
+       proxies = []
+       [:proxy_list, :hidemyass, :freeproxylists, :cool_proxy, :proxynova, :idcloak].each do |key|
+         puts key
+         begin
+           part = get_proxies(key)
+         rescue Exception => e
+           part = []
+           puts e.message
+         end
+         puts part.length
+         proxies += part
+       end
+       proxies
+     end
+   end
+ end
+
+ if ARGV.include?('-p')
+   puts "refreshing proxies, please wait..."
+   require "#{BASEDIR}/src/get_proxies.rb"
+   puts "#{@config['proxies'].length} proxies found."
+   puts "Hit [enter] to exit"
+   $stdin.gets
+   exit
+ end
+
+ # for testing
+ if __FILE__ == $0
+   require 'mechanize'
+   @agent = Mechanize.new
+   proxy = Proxy.new @agent, :verbose => true, :min => 5
+ end
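
Beyond the self-test at the bottom, a hedged sketch of how Proxy is meant to be driven from a scrape loop (method names are from the class above; url is a stand-in):

    require 'mechanize'

    @agent = Mechanize.new
    proxy = Proxy.new @agent, :proxies => @config['proxies'], :verbose => true

    begin
      page = @agent.get url
      proxy.rotate   # fresh proxy + user agent before the next request
    rescue Exception
      proxy.remove   # discard the dead proxy and retry through the next one
      retry
    end
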
@@ -1,7 +1,7 @@
  =begin
  # put table schemas here. this will be included if the table is not found.
  ActiveRecord::Schema.define do
-   create_table "items" do |t|
+   create_table "items", options: 'ENGINE=InnoDB DEFAULT CHARSET=utf8' do |t|
      t.string "string_field"
      t.text "text_field"
      t.integer "number_field"
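
The :options string is handed straight through to MySQL's CREATE TABLE, so the commented schema would now generate roughly (a sketch, not captured output):

    # CREATE TABLE `items` (...) ENGINE=InnoDB DEFAULT CHARSET=utf8
    # i.e. InnoDB storage plus a utf8 default charset for scraped text
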
@@ -5,6 +5,12 @@ require 'pry'
  # @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
  # @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")

+ EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i
+
+ def clean str
+   str.gsub(/[[:space:]]+/, ' ').strip
+ end
+
  def scrape div
    a = div.at('a')
    url = URI.join(@url, a[:href]).to_s
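
Quick examples of the two helpers added to the template (outputs are what the regex scan and whitespace gsub produce):

    'Contact: Bob <bob@example.com>'.scan(EMAIL_REGEX)
    # => ["bob@example.com"]

    clean "  too   much\n whitespace "
    # => "too much whitespace"
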
metadata CHANGED
@@ -1,20 +1,18 @@
  --- !ruby/object:Gem::Specification
  name: scrapey
  version: !ruby/object:Gem::Version
-   version: 0.0.13
-   prerelease:
+   version: 0.0.16
  platform: ruby
  authors:
  - P Guardiario
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2013-03-26 00:00:00.000000000 Z
+ date: 2014-04-22 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: mechanize
    requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -30,7 +27,6 @@ dependencies:
  - !ruby/object:Gem::Dependency
    name: httpclient
    requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -38,7 +34,6 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -46,7 +41,6 @@ dependencies:
  - !ruby/object:Gem::Dependency
    name: json
    requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ~>
        - !ruby/object:Gem::Version
@@ -54,7 +48,6 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ~>
        - !ruby/object:Gem::Version
@@ -83,6 +76,7 @@ files:
  - lib/scrapey/cache/redis.rb
  - lib/scrapey/cache.rb
  - lib/scrapey/constants.rb
+ - lib/scrapey/core.rb
  - lib/scrapey/database.rb
  - lib/scrapey/multi.rb
  - lib/scrapey/scrapey.rb
@@ -91,37 +85,42 @@ files:
  - lib/scrapey/tor.rb
  - lib/scrapey.rb
  - scrapey.gemspec
+ - template/.gitignore
  - template/config/config.yml
  - template/Gemfile
  - template/icon.ico
  - template/output.csv
  - template/Rakefile
  - template/src/downloader.rb
+ - template/src/emails.rb
+ - template/src/export.rb
+ - template/src/get_proxies.rb
+ - template/src/proxy.rb
  - template/src/schema.rb
  - template/src/template.rb
  - template/template.iss
  homepage: ''
  licenses: []
+ metadata: {}
  post_install_message:
  rdoc_options: []
  require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
-   none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  required_rubygems_version: !ruby/object:Gem::Requirement
-   none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 1.8.24
+ rubygems_version: 2.1.5
  signing_key:
- specification_version: 3
+ specification_version: 4
  summary: A simple scraping framework
  test_files: []
+ has_rdoc: