scrapey 0.0.13 → 0.0.16

This diff shows the changes between publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,15 @@
+ ---
+ !binary "U0hBMQ==":
+ metadata.gz: !binary |-
+ Njk0YTkyMDY0MGYxMzM1ZDMwOWM5Yzg4YmQ4YTA2NGM3N2Q1ZGUxMw==
+ data.tar.gz: !binary |-
+ YjE2NjNlYjQwMTIwMWE3YmRjNWQzNDVlZWI0MjhjNWI1NWMzNTg1OQ==
+ SHA512:
+ metadata.gz: !binary |-
+ NTcyZWFmYjdhZmMyMjVjNzc1ZTk5ZDRkMTM2YjYxMTE5NjZmMThmNTc2MGVl
+ MTI4Yjc0ODkwZWQ3NGNkNWM4NmI5ZGFlMmJiYWNiMjFhMDEyMGIzMjhkZmYz
+ ZDNlN2Y3NGFjYzdjNjlhMWE4Y2FiMzcxZmQyNDBlZTM4MTA1Yzg=
+ data.tar.gz: !binary |-
+ NmIwOGRjZTczMTE1YjE2MDQzNzc5MmJmYTQ4MzQzMGNlNGQ1Y2YxMzA1MDZk
+ NzkzMTdhOTI3MzAxODNiYmZhNzlhNjkzNGUwODAwZTVmYmRjZjY1ZjdkOGJm
+ ZTFmOWQ5OTFiZmQwY2U3NTBjYzA2ZjBkNjcwZmMzNjcxY2E2ODY=
@@ -11,6 +11,16 @@ require "scrapey/database"
  require "scrapey/multi"
  require "scrapey/tee"
 
+ require 'addressable/uri'
+
+ class URI::Parser
+ def split url
+ a = Addressable::URI::parse url
+ [a.scheme, a.userinfo, a.host, a.port, nil, a.path, nil, a.query, a.fragment]
+ end
+ end
+
+
  # don't do this stuff in rails:
  unless defined? Rails
  Scrapey::init binding
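The URI::Parser patch above (repeated in the new lib/scrapey/core.rb below) swaps the component-splitting step of Ruby's stdlib URI parsing for Addressable, so URLs that the stricter RFC 2396 parser rejects can still be fetched. A minimal sketch of the effect, assuming the addressable gem and a Ruby of that era (1.9/2.x, where URI.parse goes through URI::Parser); the example URL is made up:

    require 'uri'
    require 'addressable/uri'

    # same patch as in the diff
    class URI::Parser
      def split url
        a = Addressable::URI::parse url
        [a.scheme, a.userinfo, a.host, a.port, nil, a.path, nil, a.query, a.fragment]
      end
    end

    # the unescaped "|" makes the stock RFC 2396 parser raise URI::InvalidURIError;
    # with the patch the components come from Addressable instead
    uri = URI.parse('http://example.com/search?q=foo|bar')
    uri.host   # => "example.com"
    uri.query  # => "q=foo|bar"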
@@ -24,5 +34,26 @@ unless defined? Rails
 
  init_db if @config['database']
 
- $stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "w"))
- end
+ #$stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "w"))
+ end
+
+ if defined?(Ocra)
+ puts "doing ocra stuff..."
+ Mechanize.new.cookies
+ HTTP::Cookie::Scanner.new ''
+ if @config['database'] || @config['databases']
+ puts "doing ocra db stuff..."
+ ActiveRecord::Relation::PredicateBuilder.new rescue nil
+ [
+ 'active_record',
+ 'active_record/schema',
+ 'active_record/connection_adapters/abstract/schema_definitions',
+ @config['database'] ? @config['database']['adapter'] : 'mysql',
+ 'tzinfo',
+ 'active_support/all',
+ 'active_support/multibyte/chars'
+ ].each{|lib| require lib}
+ end
+ end
+
+ Dir.chdir BASEDIR
@@ -15,7 +15,7 @@ module Scrapey
  return nil unless File::exists?(filename)
  debug "Loading #{filename} from cache"
  begin
- Nokogiri::HTML Marshal.load(File.open(filename, "rb"){|f| f.read})
+ Mechanize::Page.new URI.parse(url), [], Marshal.load(File.open(filename, "rb"){|f| f.read}), nil, @agent
  rescue Exception => e
  puts e.message
  end
@@ -9,7 +9,7 @@ module Scrapey
  def load_cache url
  debug "Loading #{url} from cache"
  return nil unless str = @redis.get(url)
- Nokogiri::HTML Marshal.load(str) rescue nil
+ Mechanize::Page.new(URI.parse(url), [], Marshal.load(str), nil, @agent) rescue nil
  end
 
  def save_cache url, body, options = {}
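Both cache back ends (disk and Redis) now rebuild a Mechanize::Page from the marshalled body instead of returning a bare Nokogiri document, so a cache hit keeps page.uri and Mechanize's link and form helpers working. A rough sketch of the round trip, mirroring the constructor call used in the diff (Mechanize::Page.new(uri, response, body, code, mech)) and assuming a Mechanize agent in @agent; the URL is made up:

    require 'mechanize'

    @agent = Mechanize.new
    url    = 'http://example.com/'
    cached = Marshal.dump(@agent.get(url).body)   # roughly what save_cache persists

    page = Mechanize::Page.new(URI.parse(url), [], Marshal.load(cached), nil, @agent)
    page.uri          # => #<URI::HTTP http://example.com/>, unlike a bare Nokogiri doc
    page.search('a')  # Nokogiri search is still available
    page.links        # relative links resolve against page.uri again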
@@ -1,5 +1,5 @@
  module Scrapey
- VERSION = "0.0.13"
+ VERSION = "0.0.16"
  BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
  URL = "https://github.com/monkeysuffrage/scrapey"
  end
@@ -0,0 +1,34 @@
+ require 'addressable/uri'
+
+ class URI::Parser
+ def split url
+ a = Addressable::URI::parse url
+ [a.scheme, a.userinfo, a.host, a.port, nil, a.path, nil, a.query, a.fragment]
+ end
+ end
+
+ class Hash
+ def shuffle
+ Hash[self.to_a.shuffle]
+ end
+ end
+
+ class Nokogiri::XML::NodeSet
+ def shuffle
+ self.to_a.shuffle
+ end
+ end
+
+ class Enumerator
+ def shuffle
+ self.to_a.shuffle
+ end
+ end
+
+ class CSV::Table
+ def shuffle
+ arr = self.to_a
+ k = arr.shift
+ arr.map{|v| Hash[k.zip v]}.shuffle
+ end
+ end
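Besides the URI patch, the new core.rb gives Hash, Nokogiri::XML::NodeSet, Enumerator and CSV::Table a uniform #shuffle so input order can be randomized the same way regardless of where the records come from. A quick illustration, assuming these patches are loaded (for example via require 'scrapey'); the CSV data is invented:

    require 'scrapey'   # pulls in the monkey patches above
    require 'csv'

    {a: 1, b: 2, c: 3}.shuffle
    # => a new Hash with the same pairs in random order, e.g. {:c=>3, :a=>1, :b=>2}

    table = CSV.parse("id,url\n1,http://example.com\n2,http://example.org", headers: true)
    table.shuffle
    # => rows as {header => value} hashes in random order, with the header row removed

    doc = Nokogiri::HTML('<a href="/1">one</a> <a href="/2">two</a>')
    doc.search('a').shuffle.each { |a| puts a[:href] }   # plain Array of nodes, shuffled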
@@ -85,4 +85,10 @@ module Scrapey
  def ts
  Time.now.to_i.to_s
  end
+
+ def enqueue url
+ @url_list ||= File.open("#{BASEDIR}/config/urls.txt", 'w')
+ @url_list << url
+ @url_list << "\n"
+ end
  end
@@ -0,0 +1 @@
+ config
@@ -18,7 +18,7 @@ task 'dropbox' do
  folder = [ENV['DROPBOX'], name].join('/').squeeze('/')
  FileUtils.mkdir(folder) unless File.exists?(folder)
  FileUtils.cp "Output/#{file}", folder
- url = [ENV['DROPBOX_public_url'], name, file].join('/').squeeze('/')
+ url = [ENV['DROPBOX_public_url'], name, file].join('/')
  puts "uploaded to #{url}"
  end
  end
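The Rakefile change stops calling .squeeze('/') on the shareable URL. Squeezing is harmless for the local folder path, but on a full URL it also collapses the "//" after the scheme, which is presumably why it was dropped. With made-up values:

    parts = ['https://dl.dropboxusercontent.com/u/12345', 'myproject', 'output.csv']
    parts.join('/').squeeze('/')
    # => "https:/dl.dropboxusercontent.com/u/12345/myproject/output.csv"  (scheme mangled)
    parts.join('/')
    # => "https://dl.dropboxusercontent.com/u/12345/myproject/output.csv"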
@@ -12,6 +12,9 @@ password: 12345
  # host: localhost
  # encoding: 'utf8'
 
+ #category: xxx
+ #dataset_name: yyy
+
  # example proxies section
  #proxies:
  #- www.host1.com:80
@@ -0,0 +1,48 @@
+ require 'scrapey'
+ require 'pry'
+
+ =begin
+ @config = {
+ 'category' => 'businesses',
+ 'dataset_name' => 'brazilian_companies',
+ 'database' => {
+ 'adapter' => 'mysql',
+ 'database' => 'stefan',
+ 'username' => 'root',
+ 'password' => '12345',
+ 'host' => 'localhost',
+ 'encoding' => 'utf8'
+ }
+ }
+ =end
+
+ def post url, body
+ page = @agent.post url, body
+ JSON.parse(page.body).each{|k, v|}
+ raise 'x' unless page.body
+ page
+ rescue Exception => e
+ print '!'
+ sleep 10
+ return post url, body
+ end
+
+ @agent.open_timeout = @agent.read_timeout = 10000
+
+ tables = ActiveRecord::Base.connection.tables
+
+ tables.each do |table|
+ puts table
+ tables table.camelize
+ klass = table.camelize.constantize
+ return unless klass.column_names.include?('website')
+
+ klass.where("website is not null and email is null").find_in_batches(:batch_size => 10) do |group|
+ page = post('http://www.pay4data.com/lookup/email_for_url', {urls: group.map(&:website).compact}.to_json)
+ JSON.parse(page.body).each do |k, v|
+ group.find{|r| r['website'] == k}.update_attributes(:email => v)
+ puts k
+ end
+ end
+ end
+
@@ -0,0 +1,133 @@
+ require 'aws-sdk'
+ require 'scrapey'
+ require 'pry'
+
+ =begin
+ @config = {
+ 'category' => 'businesses',
+ 'dataset_name' => 'brazilian_companies',
+ 'database' => {
+ 'adapter' => 'mysql',
+ 'database' => 'stefan',
+ 'username' => 'root',
+ 'password' => '12345',
+ 'host' => 'localhost',
+ 'encoding' => 'utf8'
+ }
+ }
+
+
+
+ CSV.open("#{BASEDIR}/#{table}.csv", 'w') do |csv|
+ csv << fields
+ klass.where(:found => true).find_each do |row|
+ csv << fields.map{|f| row[f]}
+ end
+ end
+
+ =end
+
+
+
+ def new_csv filename
+ File.open(filename, 'w') do |file|
+ file << 0xEF.chr + 0xBB.chr + 0xBF.chr
+ end
+ CSV.open(filename, 'a') do |csv|
+ yield csv
+ end
+ end
+
+ unless @config['dataset_name'] && @config['category']
+ puts 'Please fill out dataset_name and category in config.yml to continue'
+ exit
+ end
+ init_db
+ @tables = ActiveRecord::Base.connection.tables
+
+ all_fields = []
+
+ @tables.each do |table|
+ puts table
+ tables table.camelize
+ klass = table.camelize.constantize
+
+ all_fields << klass.column_names
+ fields = klass.column_names - ['id', 'updated_at', 'created_at', 'updated_on', 'created_on']
+
+ new_csv("#{BASEDIR}/#{table}.csv") do |csv|
+ csv << fields
+ klass.all.find_each do |row|
+ csv << fields.map{|f| row[f]}
+ end
+ end
+
+ new_csv("#{BASEDIR}/#{table}_sample.csv") do |csv|
+ csv << fields
+ klass.order(:id).order('rand()').limit(50).each do |row|
+ csv << fields.map{|f| row[f]}
+ end
+ end
+
+ end
+
+ if @tables.length == 0
+ table = @tables.first
+ `gzip -f #{BASEDIR}/#{table}_sample.csv`
+ `gzip -f #{BASEDIR}/#{table}.csv`
+ csv_name = "#{table}.csv.gz"
+ sample_name = "#{table}_sample.csv.gz"
+
+ csv_name = "#{@config['dataset_name']}.csv.gz"
+ `mv #{BASEDIR}/#{table}.csv.gz #{csv_name}`
+ sample_name = "#{@config['dataset_name']}_sample.csv.gz"
+ `mv #{BASEDIR}/#{table}_sample.csv.gz #{sample_name}`
+
+ else
+ csv_name = "#{@config['dataset_name']}.csv.tar.gz"
+ sample_name = "#{@config['dataset_name']}.sample.tar.gz"
+ sample_sql = "#{@config['dataset_name']}_sample.sql"
+
+ cmd = "tar -czf #{csv_name} " + @tables.map{|x| x + '.csv'}.join(' ')
+ `#{cmd}`
+ File.open(sample_sql, 'w') do |f|
+ f << `"C:\\Program Files\\MySQL\\MySQL Server 5.6\\bin\\mysqldump.exe" -uroot -p12345 --where="true limit 100" #{@config['database']['database']}`
+ end
+ cmd = "tar -czf #{sample_name} #{sample_sql} " + @tables.map{|x| x + '_sample.csv'}.join(' ')
+ `#{cmd}`
+ end
+
+ # --where="true limit 100"
+ File.open("#{@config['dataset_name']}.sql", 'w') do |f|
+ f << `"C:\\Program Files\\MySQL\\MySQL Server 5.6\\bin\\mysqldump.exe" -uroot -p12345 #{@config['database']['database']}`
+ end
+ `gzip -f #{@config['dataset_name']}.sql`
+ sql_name = "#{@config['dataset_name']}.sql.gz"
+
+ s3 = AWS::S3.new :access_key_id => ENV['AMAZON_ACCESS_KEY_ID'], :secret_access_key => ENV['AMAZON_SECRET_ACCESS_KEY']
+ bucket = s3.buckets['pay4data']
+
+ sample_object = bucket.objects["#{@config['category']}/#{sample_name}"].write :file => sample_name, :content_type => 'application/gzip', :acl => :public_read
+ csv_object = bucket.objects["#{@config['category']}/#{csv_name}"].write :file => csv_name, :content_type => 'application/gzip'
+ sql_object = bucket.objects["#{@config['category']}/#{sql_name}"].write :file => sql_name, :content_type => 'application/gzip'
+
+ sql = <<EOF
+ insert into datasets(sample_url, csv_url, sql_url, last_crawled, fields) values(
+ '#{sample_object.public_url.to_s}',
+ '#{csv_object.public_url.to_s}',
+ '#{sql_object.public_url.to_s}',
+ now(),
+ '#{fields.map{|t| (t - ['id', 'updated_at', 'created_at', 'updated_on', 'created_on']).join ', '}.join ", "}'
+ );
+
+ update datasets set category_id=5, name='', description='', price='', button_html='' where id=
+
+
+ mysqldump pay4data datasets categories | mysql2 pay4data
+
+
+ EOF
+
+ puts sql
+
+
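In the new export script, new_csv writes a UTF-8 byte order mark before reopening the file in append mode for CSV output; the BOM is what lets Excel recognize the exported CSV as UTF-8 when it is opened directly. A standalone sketch of the same idea (file name and rows are invented):

    require 'csv'

    def new_csv filename
      # "\xEF\xBB\xBF" is the UTF-8 BOM, the same bytes as 0xEF.chr + 0xBB.chr + 0xBF.chr
      File.open(filename, 'w') { |f| f << "\xEF\xBB\xBF" }
      CSV.open(filename, 'a') { |csv| yield csv }
    end

    new_csv('sample.csv') do |csv|
      csv << ['name', 'city']
      csv << ['São Paulo Ltda.', 'São Paulo']
    end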
@@ -0,0 +1,14 @@
+ require 'scrapey'
+ require 'pry'
+ require 'syck'
+ require "#{BASEDIR}/src/proxy.rb"
+
+
+ YAML::ENGINE.yamler='syck'
+
+ #proxies = Proxy::get_proxies :proxy_list
+ proxies = Proxy::get_proxies :all
+
+ @config['proxies'] = proxies.uniq
+ File.open("#{BASEDIR}/config/config.yml", 'w') { |f| YAML.dump(@config, f) }
+
@@ -0,0 +1,278 @@
+ require "base64"
+
+ class Proxy
+ attr_reader :current
+ BOOM = 'boom'
+
+ def initialize agent = nil, options = {}
+ @user_agents = [
+ 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+ 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36',
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1',
+ 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
+ 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.69 Safari/537.36',
+ 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.69 Safari/537.36',
+ 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+ 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)'
+ ]
+ @agent = agent
+ @min = options[:min] || 5
+ @sleep = options[:sleep] || 60 * 60 # 1 hour
+ @verbose = options[:verbose] || false
+ @timeout = options[:timeout] || 30
+ @round_time = options[:round_time] || 5 * 60 # 5 minutes
+ @agent.open_timeout = @agent.read_timeout = @timeout
+ proxies = options[:proxies] || []
+ set_proxies proxies
+ end
+
+ def set_proxies proxies
+ @proxies = proxies.select{|x| x[/:/]}.uniq{|x| x[/.*:/]}
+ self.shuffle
+ end
+
+ def debug str
+ puts str if @verbose
+ end
+
+ def shuffle
+ @proxies = [BOOM] + (@proxies - [BOOM]).shuffle
+ start_round
+ self.rotate
+ end
+
+ def to_yaml
+ @proxies.to_yaml
+ end
+
+ def start_round
+ now = Time.now.to_i
+ if @round_start
+ sleep_time = @round_time - (now - @round_start)
+ if sleep_time > 0
+ puts "sleeping for #{sleep_time}"
+ sleep sleep_time
+ end
+ end
+ @round_start = Time.now.to_i
+ end
+
+ def get_more_proxies
+ puts 'getting more proxies'
+ proxies = Proxy::get_proxies
+ set_proxies proxies
+ end
+
+ def rotate
+ debug "rotating"
+ @proxies.rotate!
+ @user_agents.rotate!
+ if @proxies.length < @min
+ get_more_proxies
+ end
+ @current = @proxies.first
+ if @current == BOOM
+ start_round
+ rotate
+ return
+ end
+
+ host, port = @current.split ':'
+ debug "setting proxy to #{host}:#{port}"
+ @agent.set_proxy host, port.to_i
+ debug "setting user_agent to #{@user_agents.first}"
+ @agent.user_agent = @user_agents.first
+ end
+
+ def remove
+ debug "--- removing #{@current}"
+ @proxies.shift
+ rotate
+ debug @proxies.join(', ')
+ debug @current
+ end
+
+ def pause
+ time = @sleep / @proxies.length
+ debug "sleeping for #{time}"
+ sleep time
+ end
+
+ def length
+ @proxies.length
+ end
+
+
+
+ def self.get_idcloak
+ proxies = []
+ ['http://www.idcloak.com/proxylist/free-proxy-servers-list.html'].each do |url|
+ page = @agent.get url
+
+ page.search('#sort td[7]').each do |td|
+ port = td.text.strip
+ host = td.at('+ td').text.strip
+ proxies << "#{host}:#{port}"
+ end
+
+ end
+ proxies
+ end
+
+ def self.get_proxynova
+ proxies = []
+ ['http://www.proxynova.com/proxy-server-list/'].each do |url|
+ page = @agent.get url
+
+ page.search('.row_proxy_ip').each do |span|
+ str = span.text[/long2ip\((.*?)\)/, 1]
+ next if str[/a-z/i]
+ i = eval str
+ host = Proxy::long2ip(i)
+ port = span.parent.at('+ td').text.strip
+ proxies << "#{host}:#{port}"
+ end
+ end
+ proxies
+ end
+
+ def self.get_proxy_list
+ proxies = []
+ ['http://proxy-list.org/en/index.php',
+ 'http://proxy-list.org/en/index.php?sp=20',
+ 'http://proxy-list.org/en/index.php?sp=40',
+ 'http://proxy-list.org/en/index.php?sp=60',
+ 'http://proxy-list.org/en/index.php?sp=80',
+ 'http://proxy-list.org/en/index.php?sp=100',
+ 'http://proxy-list.org/en/index.php?sp=120'].each do |url|
+ page = @agent.get url
+ proxies += page.body.scan(/(?:\d+\.){3}\d+:\d+/)
+ end
+ proxies
+ end
+
+ def self.get_hidemyass
+ proxies = []
+ ['http://hidemyass.com/proxy-list/search-227752',
+ 'http://hidemyass.com/proxy-list/search-227752/2',
+ 'http://hidemyass.com/proxy-list/search-227752/3',
+ 'http://hidemyass.com/proxy-list/search-227752/4',
+ 'http://hidemyass.com/proxy-list/search-227752/5',
+ 'http://hidemyass.com/proxy-list/search-227752/6'].each do |url|
+ page = @agent.get url
+ page.search('*[style*="display:none"]').remove
+ page.search(page.body.scan(/(\..*?)\{display:none\}/).flatten.join(', ')).remove
+ page.search('style').remove
+ proxies += page.search('td[2]').map{|x| x.text.strip}.zip(page.search('td[3]').map{|x| x.text.strip}).map{|h,p| "#{h}:#{p}"}[1..-1]
+ end
+ proxies
+ end
+
+ def self.get_cool_proxy
+ proxies = []
+ page = @agent.get 'http://www.cool-proxy.net/proxies/http_proxy_list/sort:score/direction:desc'
+ page.search('tr')[1..-2].each do |tr|
+ next unless tr.at('td[2]')
+ host = Base64.decode64 tr.at('td[1]').text[/"(.*?)"/, 1]
+ port = tr.at('td[2]').text
+ proxies << [host, port].join(':')
+ end
+
+ while a = page.at('a[rel=next]')
+ url = URI.join('http://www.freeproxylists.net/', a[:href]).to_s
+ begin
+ page = @agent.get url
+ rescue
+ return proxies
+ end
+ page.search('tr')[1..-2].each do |tr|
+ next unless tr.at('td[2]')
+ host = Base64.decode64 tr.at('td[1]').text[/"(.*?)"/, 1]
+ port = tr.at('td[2]').text
+ proxies << [host, port].join(':')
+ end
+ end
+
+ proxies
+ end
+
+
+ def self.get_freeproxylists
+ proxies = []
+
+ @agent.follow_meta_refresh = true
+ page = @agent.get 'http://www.freeproxylists.net/'
+
+ page.body.scan(/IPDecode\("([^"]+)"\)<\/script><\/td><td align="center">(\d+)/).each do |row|
+ proxies << [URI.decode(row[0]), row[1]].join(':')
+ end
+
+ while a = page.at('a[text()^=Next]')
+ url = URI.join('http://www.freeproxylists.net/', a[:href]).to_s
+ puts url
+ page = @agent.get url
+ page.body.scan(/IPDecode\("([^"]+)"\)<\/script><\/td><td align="center">(\d+)/).each do |row|
+ proxies << [URI.decode(row[0]), row[1]].join(':')
+ end
+ end
+
+ proxies
+ end
+
+ def self.long2ip(long)
+ ip = []
+ 4.times do |i|
+ ip.push(long.to_i & 255)
+ long = long.to_i >> 8
+ end
+ ip.join(".")
+ end
+
+ def self.get_proxies provider = :all
+
+ @agent ||= Mechanize.new{|a| a.history.max_size = 10}
+ @agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
+ @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
+
+ case provider
+ when :proxy_list then return get_proxy_list
+ when :hidemyass then return get_hidemyass
+ when :freeproxylists then return get_freeproxylists
+ when :cool_proxy then return get_cool_proxy
+ when :proxynova then return get_proxynova
+ when :idcloak then return get_idcloak
+ when :all
+ proxies = []
+ [:proxy_list, :hidemyass, :freeproxylists, :cool_proxy, :proxynova, :idcloak].each do |key|
+ puts key
+ begin
+ part = get_proxies(key)
+ rescue Exception => e
+ part = []
+ puts e.message
+ end
+ puts part.length
+ proxies += part
+ end
+ proxies
+ end
+ end
+ end
+
+ if ARGV.include?('-p')
+ puts "refreshing proxies, please wait..."
+ require "#{BASEDIR}/src/get_proxies.rb"
+ puts "#{@config['proxies'].length} proxies found."
+ puts "Hit [enter] to exit"
+ $stdin.gets
+ exit
+ end
+
+ # for testing
+ if __FILE__ == $0
+ require 'mechanize'
+ @agent = Mechanize.new
+ proxy = Proxy.new @agent, :verbose => true, :min => 5
+ end
@@ -1,7 +1,7 @@
  =begin
  # put table schemas here. this will be included if the table is not found.
  ActiveRecord::Schema.define do
- create_table "items" do |t|
+ create_table "items", options: 'ENGINE=InnoDB DEFAULT CHARSET=utf8' do |t|
  t.string "string_field"
  t.text "text_field"
  t.integer "number_field"
@@ -5,6 +5,12 @@ require 'pry'
  # @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
  # @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
 
+ EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i
+
+ def clean str
+ str.gsub(/[[:space:]]+/, ' ').strip
+ end
+
  def scrape div
  a = div.at('a')
  url = URI.join(@url, a[:href]).to_s
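The template now ships an EMAIL_REGEX constant and a clean helper that collapses any run of whitespace (including non-breaking spaces, via [[:space:]]) into a single space. A quick illustration with invented input:

    EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i

    def clean str
      str.gsub(/[[:space:]]+/, ' ').strip
    end

    text = "  Contact:\n\tsales@example.com \u00A0or support@example.org  "
    clean(text)             # => "Contact: sales@example.com or support@example.org"
    text.scan(EMAIL_REGEX)  # => ["sales@example.com", "support@example.org"]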
metadata CHANGED
@@ -1,20 +1,18 @@
  --- !ruby/object:Gem::Specification
  name: scrapey
  version: !ruby/object:Gem::Version
- version: 0.0.13
- prerelease:
+ version: 0.0.16
  platform: ruby
  authors:
  - P Guardiario
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2013-03-26 00:00:00.000000000 Z
+ date: 2014-04-22 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: mechanize
  requirement: !ruby/object:Gem::Requirement
- none: false
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
- none: false
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
@@ -30,7 +27,6 @@ dependencies:
  - !ruby/object:Gem::Dependency
  name: httpclient
  requirement: !ruby/object:Gem::Requirement
- none: false
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
@@ -38,7 +34,6 @@ dependencies:
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
- none: false
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
@@ -46,7 +41,6 @@ dependencies:
  - !ruby/object:Gem::Dependency
  name: json
  requirement: !ruby/object:Gem::Requirement
- none: false
  requirements:
  - - ~>
  - !ruby/object:Gem::Version
@@ -54,7 +48,6 @@ dependencies:
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
- none: false
  requirements:
  - - ~>
  - !ruby/object:Gem::Version
@@ -83,6 +76,7 @@ files:
  - lib/scrapey/cache/redis.rb
  - lib/scrapey/cache.rb
  - lib/scrapey/constants.rb
+ - lib/scrapey/core.rb
  - lib/scrapey/database.rb
  - lib/scrapey/multi.rb
  - lib/scrapey/scrapey.rb
@@ -91,37 +85,42 @@ files:
  - lib/scrapey/tor.rb
  - lib/scrapey.rb
  - scrapey.gemspec
+ - template/.gitignore
  - template/config/config.yml
  - template/Gemfile
  - template/icon.ico
  - template/output.csv
  - template/Rakefile
  - template/src/downloader.rb
+ - template/src/emails.rb
+ - template/src/export.rb
+ - template/src/get_proxies.rb
+ - template/src/proxy.rb
  - template/src/schema.rb
  - template/src/template.rb
  - template/template.iss
  homepage: ''
  licenses: []
+ metadata: {}
  post_install_message:
  rdoc_options: []
  require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
- none: false
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
  required_rubygems_version: !ruby/object:Gem::Requirement
- none: false
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 1.8.24
+ rubygems_version: 2.1.5
  signing_key:
- specification_version: 3
+ specification_version: 4
  summary: A simple scraping framework
  test_files: []
+ has_rdoc: