scrapey 0.0.13 → 0.0.16
- checksums.yaml +15 -0
- data/lib/scrapey.rb +33 -2
- data/lib/scrapey/cache/disk.rb +1 -1
- data/lib/scrapey/cache/redis.rb +1 -1
- data/lib/scrapey/constants.rb +1 -1
- data/lib/scrapey/core.rb +34 -0
- data/lib/scrapey/scrapey.rb +6 -0
- data/template/.gitignore +1 -0
- data/template/Rakefile +1 -1
- data/template/config/config.yml +3 -0
- data/template/src/emails.rb +48 -0
- data/template/src/export.rb +133 -0
- data/template/src/get_proxies.rb +14 -0
- data/template/src/proxy.rb +278 -0
- data/template/src/schema.rb +1 -1
- data/template/src/template.rb +6 -0
- metadata +12 -13
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: !binary |-
+    Njk0YTkyMDY0MGYxMzM1ZDMwOWM5Yzg4YmQ4YTA2NGM3N2Q1ZGUxMw==
+  data.tar.gz: !binary |-
+    YjE2NjNlYjQwMTIwMWE3YmRjNWQzNDVlZWI0MjhjNWI1NWMzNTg1OQ==
+SHA512:
+  metadata.gz: !binary |-
+    NTcyZWFmYjdhZmMyMjVjNzc1ZTk5ZDRkMTM2YjYxMTE5NjZmMThmNTc2MGVl
+    MTI4Yjc0ODkwZWQ3NGNkNWM4NmI5ZGFlMmJiYWNiMjFhMDEyMGIzMjhkZmYz
+    ZDNlN2Y3NGFjYzdjNjlhMWE4Y2FiMzcxZmQyNDBlZTM4MTA1Yzg=
+  data.tar.gz: !binary |-
+    NmIwOGRjZTczMTE1YjE2MDQzNzc5MmJmYTQ4MzQzMGNlNGQ1Y2YxMzA1MDZk
+    NzkzMTdhOTI3MzAxODNiYmZhNzlhNjkzNGUwODAwZTVmYmRjZjY1ZjdkOGJm
+    ZTFmOWQ5OTFiZmQwY2U3NTBjYzA2ZjBkNjcwZmMzNjcxY2E2ODY=
data/lib/scrapey.rb
CHANGED
@@ -11,6 +11,16 @@ require "scrapey/database"
 require "scrapey/multi"
 require "scrapey/tee"
 
+require 'addressable/uri'
+
+class URI::Parser
+  def split url
+    a = Addressable::URI::parse url
+    [a.scheme, a.userinfo, a.host, a.port, nil, a.path, nil, a.query, a.fragment]
+  end
+end
+
+
 # don't do this stuff in rails:
 unless defined? Rails
   Scrapey::init binding
@@ -24,5 +34,26 @@ unless defined? Rails
 
   init_db if @config['database']
 
-
-end
+  #$stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "w"))
+end
+
+if defined?(Ocra)
+  puts "doing ocra stuff..."
+  Mechanize.new.cookies
+  HTTP::Cookie::Scanner.new ''
+  if @config['database'] || @config['databases']
+    puts "doing ocra db stuff..."
+    ActiveRecord::Relation::PredicateBuilder.new rescue nil
+    [
+      'active_record',
+      'active_record/schema',
+      'active_record/connection_adapters/abstract/schema_definitions',
+      @config['database'] ? @config['database']['adapter'] : 'mysql',
+      'tzinfo',
+      'active_support/all',
+      'active_support/multibyte/chars'
+    ].each{|lib| require lib}
+  end
+end
+
+Dir.chdir BASEDIR
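The URI::Parser patch above reroutes Ruby's URL splitting through Addressable, whose parser accepts real-world URLs (embedded spaces and similar characters) that the stock RFC-strict parser rejects, so Mechanize can follow such links. A minimal sketch of the Addressable call it delegates to (the example URL is made up):

  require 'addressable/uri'
  a = Addressable::URI.parse 'http://example.com/some path'
  a.host  # => "example.com"
  a.path  # => "/some path" (stock URI.parse raises URI::InvalidURIError on this URL)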
data/lib/scrapey/cache/disk.rb
CHANGED
@@ -15,7 +15,7 @@ module Scrapey
     return nil unless File::exists?(filename)
     debug "Loading #{filename} from cache"
     begin
-
+      Mechanize::Page.new URI.parse(url), [], Marshal.load(File.open(filename, "rb"){|f| f.read}), nil, @agent
     rescue Exception => e
       puts e.message
     end
data/lib/scrapey/cache/redis.rb
CHANGED
@@ -9,7 +9,7 @@ module Scrapey
     def load_cache url
       debug "Loading #{url} from cache"
      return nil unless str = @redis.get(url)
-
+      Mechanize::Page.new(URI.parse(url), [], Marshal.load(str), nil, @agent) rescue nil
    end
 
    def save_cache url, body, options = {}
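Both cache back ends now rebuild a full Mechanize::Page from the marshaled body rather than returning a raw string, so cached and live responses can be handled identically. A round-trip sketch for the disk variant (filename, url and @agent are assumed to be set up as in load_cache above):

  File.open(filename, 'wb') {|f| f.write Marshal.dump(page.body) }  # save side
  page = Mechanize::Page.new URI.parse(url), [], Marshal.load(File.binread(filename)), nil, @agent
  page.search('a')  # Nokogiri queries work as if the page were freshly fetched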
data/lib/scrapey/constants.rb
CHANGED
data/lib/scrapey/core.rb
ADDED
@@ -0,0 +1,34 @@
+require 'addressable/uri'
+
+class URI::Parser
+  def split url
+    a = Addressable::URI::parse url
+    [a.scheme, a.userinfo, a.host, a.port, nil, a.path, nil, a.query, a.fragment]
+  end
+end
+
+class Hash
+  def shuffle
+    Hash[self.to_a.shuffle]
+  end
+end
+
+class Nokogiri::XML::NodeSet
+  def shuffle
+    self.to_a.shuffle
+  end
+end
+
+class Enumerator
+  def shuffle
+    self.to_a.shuffle
+  end
+end
+
+class CSV::Table
+  def shuffle
+    arr = self.to_a
+    k = arr.shift
+    arr.map{|v| Hash[k.zip v]}.shuffle
+  end
+end
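core.rb gives Hash, Nokogiri node sets, Enumerator and CSV::Table a uniform #shuffle; the CSV::Table version also reshapes each row into a header-keyed hash. A quick sketch of that behavior:

  require 'csv'
  table = CSV.parse("name,age\nalice,30\nbob,25", :headers => true)
  table.shuffle
  # => [{"name"=>"bob", "age"=>"25"}, {"name"=>"alice", "age"=>"30"}] (order is random)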
data/lib/scrapey/scrapey.rb
CHANGED
data/template/.gitignore
ADDED
@@ -0,0 +1 @@
+config
data/template/Rakefile
CHANGED
@@ -18,7 +18,7 @@ task 'dropbox' do
   folder = [ENV['DROPBOX'], name].join('/').squeeze('/')
   FileUtils.mkdir(folder) unless File.exists?(folder)
   FileUtils.cp "Output/#{file}", folder
-  url = [ENV['DROPBOX_public_url'], name, file].join('/')
+  url = [ENV['DROPBOX_public_url'], name, file].join('/')
   puts "uploaded to #{url}"
 end
 end
data/template/config/config.yml
CHANGED
data/template/src/emails.rb
ADDED
@@ -0,0 +1,48 @@
+require 'scrapey'
+require 'pry'
+
+=begin
+@config = {
+  'category' => 'businesses',
+  'dataset_name' => 'brazilian_companies',
+  'database' => {
+    'adapter' => 'mysql',
+    'database' => 'stefan',
+    'username' => 'root',
+    'password' => '12345',
+    'host' => 'localhost',
+    'encoding' => 'utf8'
+  }
+}
+=end
+
+def post url, body
+  page = @agent.post url, body
+  JSON.parse(page.body).each{|k, v|}
+  raise 'x' unless page.body
+  page
+rescue Exception => e
+  print '!'
+  sleep 10
+  return post url, body
+end
+
+@agent.open_timeout = @agent.read_timeout = 10000
+
+tables = ActiveRecord::Base.connection.tables
+
+tables.each do |table|
+  puts table
+  tables table.camelize
+  klass = table.camelize.constantize
+  return unless klass.column_names.include?('website')
+
+  klass.where("website is not null and email is null").find_in_batches(:batch_size => 10) do |group|
+    page = post('http://www.pay4data.com/lookup/email_for_url', {urls: group.map(&:website).compact}.to_json)
+    JSON.parse(page.body).each do |k, v|
+      group.find{|r| r['website'] == k}.update_attributes(:email => v)
+      puts k
+    end
+  end
+end
+
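emails.rb batches records that have a website but no email and posts them to the lookup endpoint as JSON. The payload it builds looks like this (sites made up); the response is expected to be a {website => email} hash, which the loop writes back via update_attributes:

  require 'json'
  {urls: ['http://a.example', 'http://b.example']}.to_json
  # => '{"urls":["http://a.example","http://b.example"]}'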
data/template/src/export.rb
ADDED
@@ -0,0 +1,133 @@
+require 'aws-sdk'
+require 'scrapey'
+require 'pry'
+
+=begin
+@config = {
+  'category' => 'businesses',
+  'dataset_name' => 'brazilian_companies',
+  'database' => {
+    'adapter' => 'mysql',
+    'database' => 'stefan',
+    'username' => 'root',
+    'password' => '12345',
+    'host' => 'localhost',
+    'encoding' => 'utf8'
+  }
+}
+
+
+
+CSV.open("#{BASEDIR}/#{table}.csv", 'w') do |csv|
+  csv << fields
+  klass.where(:found => true).find_each do |row|
+    csv << fields.map{|f| row[f]}
+  end
+end
+
+=end
+
+
+
+def new_csv filename
+  File.open(filename, 'w') do |file|
+    file << 0xEF.chr + 0xBB.chr + 0xBF.chr
+  end
+  CSV.open(filename, 'a') do |csv|
+    yield csv
+  end
+end
+
+unless @config['dataset_name'] && @config['category']
+  puts 'Please fill out dataset_name and category in config.yml to continue'
+  exit
+end
+init_db
+@tables = ActiveRecord::Base.connection.tables
+
+all_fields = []
+
+@tables.each do |table|
+  puts table
+  tables table.camelize
+  klass = table.camelize.constantize
+
+  all_fields << klass.column_names
+  fields = klass.column_names - ['id', 'updated_at', 'created_at', 'updated_on', 'created_on']
+
+  new_csv("#{BASEDIR}/#{table}.csv") do |csv|
+    csv << fields
+    klass.all.find_each do |row|
+      csv << fields.map{|f| row[f]}
+    end
+  end
+
+  new_csv("#{BASEDIR}/#{table}_sample.csv") do |csv|
+    csv << fields
+    klass.order(:id).order('rand()').limit(50).each do |row|
+      csv << fields.map{|f| row[f]}
+    end
+  end
+
+end
+
+if @tables.length == 0
+  table = @tables.first
+  `gzip -f #{BASEDIR}/#{table}_sample.csv`
+  `gzip -f #{BASEDIR}/#{table}.csv`
+  csv_name = "#{table}.csv.gz"
+  sample_name = "#{table}_sample.csv.gz"
+
+  csv_name = "#{@config['dataset_name']}.csv.gz"
+  `mv #{BASEDIR}/#{table}.csv.gz #{csv_name}`
+  sample_name = "#{@config['dataset_name']}_sample.csv.gz"
+  `mv #{BASEDIR}/#{table}_sample.csv.gz #{sample_name}`
+
+else
+  csv_name = "#{@config['dataset_name']}.csv.tar.gz"
+  sample_name = "#{@config['dataset_name']}.sample.tar.gz"
+  sample_sql = "#{@config['dataset_name']}_sample.sql"
+
+  cmd = "tar -czf #{csv_name} " + @tables.map{|x| x + '.csv'}.join(' ')
+  `#{cmd}`
+  File.open(sample_sql, 'w') do |f|
+    f << `"C:\\Program Files\\MySQL\\MySQL Server 5.6\\bin\\mysqldump.exe" -uroot -p12345 --where="true limit 100" #{@config['database']['database']}`
+  end
+  cmd = "tar -czf #{sample_name} #{sample_sql} " + @tables.map{|x| x + '_sample.csv'}.join(' ')
+  `#{cmd}`
+end
+
+# --where="true limit 100"
+File.open("#{@config['dataset_name']}.sql", 'w') do |f|
+  f << `"C:\\Program Files\\MySQL\\MySQL Server 5.6\\bin\\mysqldump.exe" -uroot -p12345 #{@config['database']['database']}`
+end
+`gzip -f #{@config['dataset_name']}.sql`
+sql_name = "#{@config['dataset_name']}.sql.gz"
+
+s3 = AWS::S3.new :access_key_id => ENV['AMAZON_ACCESS_KEY_ID'], :secret_access_key => ENV['AMAZON_SECRET_ACCESS_KEY']
+bucket = s3.buckets['pay4data']
+
+sample_object = bucket.objects["#{@config['category']}/#{sample_name}"].write :file => sample_name, :content_type => 'application/gzip', :acl => :public_read
+csv_object = bucket.objects["#{@config['category']}/#{csv_name}"].write :file => csv_name, :content_type => 'application/gzip'
+sql_object = bucket.objects["#{@config['category']}/#{sql_name}"].write :file => sql_name, :content_type => 'application/gzip'
+
+sql = <<EOF
+insert into datasets(sample_url, csv_url, sql_url, last_crawled, fields) values(
+'#{sample_object.public_url.to_s}',
+'#{csv_object.public_url.to_s}',
+'#{sql_object.public_url.to_s}',
+now(),
+'#{fields.map{|t| (t - ['id', 'updated_at', 'created_at', 'updated_on', 'created_on']).join ', '}.join ", "}'
+);
+
+update datasets set category_id=5, name='', description='', price='', button_html='' where id=
+
+
+mysqldump pay4data datasets categories | mysql2 pay4data
+
+
+EOF
+
+puts sql
+
+
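The three bytes new_csv writes first are the UTF-8 byte order mark, which lets Excel open the exported CSVs with the right encoding; 0xEF.chr + 0xBB.chr + 0xBF.chr is a 1.8-compatible way of spelling it. Equivalent standalone form (output name made up):

  File.open('output.csv', 'w') {|f| f << "\xEF\xBB\xBF" }
  CSV.open('output.csv', 'a') {|csv| csv << ['name', 'city'] }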
data/template/src/get_proxies.rb
ADDED
@@ -0,0 +1,14 @@
+require 'scrapey'
+require 'pry'
+require 'syck'
+require "#{BASEDIR}/src/proxy.rb"
+
+
+YAML::ENGINE.yamler='syck'
+
+#proxies = Proxy::get_proxies :proxy_list
+proxies = Proxy::get_proxies :all
+
+@config['proxies'] = proxies.uniq
+File.open("#{BASEDIR}/config/config.yml", 'w') { |f| YAML.dump(@config, f) }
+
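The refreshed list lands in the template's config/config.yml, where it is read back on the next run; a read-back sketch (assuming the standard template layout):

  require 'yaml'
  config = YAML.load_file "#{BASEDIR}/config/config.yml"
  config['proxies'].length  # => number of unique proxies saved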
data/template/src/proxy.rb
ADDED
@@ -0,0 +1,278 @@
+require "base64"
+
+class Proxy
+  attr_reader :current
+  BOOM = 'boom'
+
+  def initialize agent = nil, options = {}
+    @user_agents = [
+      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36',
+      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1',
+      'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
+      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.69 Safari/537.36',
+      'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
+      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.69 Safari/537.36',
+      'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+      'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)'
+    ]
+    @agent = agent
+    @min = options[:min] || 5
+    @sleep = options[:sleep] || 60 * 60 # 1 hour
+    @verbose = options[:verbose] || false
+    @timeout = options[:timeout] || 30
+    @round_time = options[:round_time] || 5 * 60 # 5 minutes
+    @agent.open_timeout = @agent.read_timeout = @timeout
+    proxies = options[:proxies] || []
+    set_proxies proxies
+  end
+
+  def set_proxies proxies
+    @proxies = proxies.select{|x| x[/:/]}.uniq{|x| x[/.*:/]}
+    self.shuffle
+  end
+
+  def debug str
+    puts str if @verbose
+  end
+
+  def shuffle
+    @proxies = [BOOM] + (@proxies - [BOOM]).shuffle
+    start_round
+    self.rotate
+  end
+
+  def to_yaml
+    @proxies.to_yaml
+  end
+
+  def start_round
+    now = Time.now.to_i
+    if @round_start
+      sleep_time = @round_time - (now - @round_start)
+      if sleep_time > 0
+        puts "sleeping for #{sleep_time}"
+        sleep sleep_time
+      end
+    end
+    @round_start = Time.now.to_i
+  end
+
+  def get_more_proxies
+    puts 'getting more proxies'
+    proxies = Proxy::get_proxies
+    set_proxies proxies
+  end
+
+  def rotate
+    debug "rotating"
+    @proxies.rotate!
+    @user_agents.rotate!
+    if @proxies.length < @min
+      get_more_proxies
+    end
+    @current = @proxies.first
+    if @current == BOOM
+      start_round
+      rotate
+      return
+    end
+
+    host, port = @current.split ':'
+    debug "setting proxy to #{host}:#{port}"
+    @agent.set_proxy host, port.to_i
+    debug "setting user_agent to #{@user_agents.first}"
+    @agent.user_agent = @user_agents.first
+  end
+
+  def remove
+    debug "--- removing #{@current}"
+    @proxies.shift
+    rotate
+    debug @proxies.join(', ')
+    debug @current
+  end
+
+  def pause
+    time = @sleep / @proxies.length
+    debug "sleeping for #{time}"
+    sleep time
+  end
+
+  def length
+    @proxies.length
+  end
+
+
+
+  def self.get_idcloak
+    proxies = []
+    ['http://www.idcloak.com/proxylist/free-proxy-servers-list.html'].each do |url|
+      page = @agent.get url
+
+      page.search('#sort td[7]').each do |td|
+        port = td.text.strip
+        host = td.at('+ td').text.strip
+        proxies << "#{host}:#{port}"
+      end
+
+    end
+    proxies
+  end
+
+  def self.get_proxynova
+    proxies = []
+    ['http://www.proxynova.com/proxy-server-list/'].each do |url|
+      page = @agent.get url
+
+      page.search('.row_proxy_ip').each do |span|
+        str = span.text[/long2ip\((.*?)\)/, 1]
+        next if str[/a-z/i]
+        i = eval str
+        host = Proxy::long2ip(i)
+        port = span.parent.at('+ td').text.strip
+        proxies << "#{host}:#{port}"
+      end
+    end
+    proxies
+  end
+
+  def self.get_proxy_list
+    proxies = []
+    ['http://proxy-list.org/en/index.php',
+     'http://proxy-list.org/en/index.php?sp=20',
+     'http://proxy-list.org/en/index.php?sp=40',
+     'http://proxy-list.org/en/index.php?sp=60',
+     'http://proxy-list.org/en/index.php?sp=80',
+     'http://proxy-list.org/en/index.php?sp=100',
+     'http://proxy-list.org/en/index.php?sp=120'].each do |url|
+      page = @agent.get url
+      proxies += page.body.scan(/(?:\d+\.){3}\d+:\d+/)
+    end
+    proxies
+  end
+
+  def self.get_hidemyass
+    proxies = []
+    ['http://hidemyass.com/proxy-list/search-227752',
+     'http://hidemyass.com/proxy-list/search-227752/2',
+     'http://hidemyass.com/proxy-list/search-227752/3',
+     'http://hidemyass.com/proxy-list/search-227752/4',
+     'http://hidemyass.com/proxy-list/search-227752/5',
+     'http://hidemyass.com/proxy-list/search-227752/6'].each do |url|
+      page = @agent.get url
+      page.search('*[style*="display:none"]').remove
+      page.search(page.body.scan(/(\..*?)\{display:none\}/).flatten.join(', ')).remove
+      page.search('style').remove
+      proxies += page.search('td[2]').map{|x| x.text.strip}.zip(page.search('td[3]').map{|x| x.text.strip}).map{|h,p| "#{h}:#{p}"}[1..-1]
+    end
+    proxies
+  end
+
+  def self.get_cool_proxy
+    proxies = []
+    page = @agent.get 'http://www.cool-proxy.net/proxies/http_proxy_list/sort:score/direction:desc'
+    page.search('tr')[1..-2].each do |tr|
+      next unless tr.at('td[2]')
+      host = Base64.decode64 tr.at('td[1]').text[/"(.*?)"/, 1]
+      port = tr.at('td[2]').text
+      proxies << [host, port].join(':')
+    end
+
+    while a = page.at('a[rel=next]')
+      url = URI.join('http://www.freeproxylists.net/', a[:href]).to_s
+      begin
+        page = @agent.get url
+      rescue
+        return proxies
+      end
+      page.search('tr')[1..-2].each do |tr|
+        next unless tr.at('td[2]')
+        host = Base64.decode64 tr.at('td[1]').text[/"(.*?)"/, 1]
+        port = tr.at('td[2]').text
+        proxies << [host, port].join(':')
+      end
+    end
+
+    proxies
+  end
+
+
+  def self.get_freeproxylists
+    proxies = []
+
+    @agent.follow_meta_refresh = true
+    page = @agent.get 'http://www.freeproxylists.net/'
+
+    page.body.scan(/IPDecode\("([^"]+)"\)<\/script><\/td><td align="center">(\d+)/).each do |row|
+      proxies << [URI.decode(row[0]), row[1]].join(':')
+    end
+
+    while a = page.at('a[text()^=Next]')
+      url = URI.join('http://www.freeproxylists.net/', a[:href]).to_s
+      puts url
+      page = @agent.get url
+      page.body.scan(/IPDecode\("([^"]+)"\)<\/script><\/td><td align="center">(\d+)/).each do |row|
+        proxies << [URI.decode(row[0]), row[1]].join(':')
+      end
+    end
+
+    proxies
+  end
+
+  def self.long2ip(long)
+    ip = []
+    4.times do |i|
+      ip.push(long.to_i & 255)
+      long = long.to_i >> 8
+    end
+    ip.join(".")
+  end
+
+  def self.get_proxies provider = :all
+
+    @agent ||= Mechanize.new{|a| a.history.max_size = 10}
+    @agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
+    @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
+
+    case provider
+    when :proxy_list then return get_proxy_list
+    when :hidemyass then return get_hidemyass
+    when :freeproxylists then return get_freeproxylists
+    when :cool_proxy then return get_cool_proxy
+    when :proxynova then return get_proxynova
+    when :idcloak then return get_idcloak
+    when :all
+      proxies = []
+      [:proxy_list, :hidemyass, :freeproxylists, :cool_proxy, :proxynova, :idcloak].each do |key|
+        puts key
+        begin
+          part = get_proxies(key)
+        rescue Exception => e
+          part = []
+          puts e.message
+        end
+        puts part.length
+        proxies += part
+      end
+      proxies
+    end
+  end
+end
+
+if ARGV.include?('-p')
+  puts "refreshing proxies, please wait..."
+  require "#{BASEDIR}/src/get_proxies.rb"
+  puts "#{@config['proxies'].length} proxies found."
+  puts "Hit [enter] to exit"
+  $stdin.gets
+  exit
+end
+
+# for testing
+if __FILE__ == $0
+  require 'mechanize'
+  @agent = Mechanize.new
+  proxy = Proxy.new @agent, :verbose => true, :min => 5
+end
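Proxy keeps BOOM as a sentinel in the rotation: each time it comes around, start_round enforces the minimum round time, rate-limiting a full pass through the proxy list. Typical use in a scraper loop (the rescue list is an assumption; proxy.rb itself only shows construction):

  require 'mechanize'
  agent = Mechanize.new
  proxy = Proxy.new agent, :verbose => true, :proxies => @config['proxies']
  begin
    page = agent.get url
  rescue Mechanize::Error, Timeout::Error, SystemCallError
    proxy.remove   # drop the dead proxy and retry with the next one
    retry
  end
  proxy.rotate     # new proxy + user agent for the next request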
data/template/src/schema.rb
CHANGED
@@ -1,7 +1,7 @@
 =begin
 # put table schemas here. this will be included if the table is not found.
 ActiveRecord::Schema.define do
-  create_table "items" do |t|
+  create_table "items", options: 'ENGINE=InnoDB DEFAULT CHARSET=utf8' do |t|
     t.string "string_field"
     t.text "text_field"
     t.integer "number_field"
data/template/src/template.rb
CHANGED
@@ -5,6 +5,12 @@ require 'pry'
 # @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
 # @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
 
+EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i
+
+def clean str
+  str.gsub(/[[:space:]]+/, ' ').strip
+end
+
 def scrape div
   a = div.at('a')
   url = URI.join(@url, a[:href]).to_s
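template.rb's new helpers normalize scraped text and match email addresses; [[:space:]] also catches Unicode whitespace such as non-breaking spaces, which plain \s misses. Usage sketch (sample strings made up):

  clean "  Foo\u00A0 Bar "                        # => "Foo Bar"
  'contact: bob@example.com'.scan(EMAIL_REGEX)    # => ["bob@example.com"]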
metadata
CHANGED
@@ -1,20 +1,18 @@
 --- !ruby/object:Gem::Specification
 name: scrapey
 version: !ruby/object:Gem::Version
-  version: 0.0.13
-  prerelease:
+  version: 0.0.16
 platform: ruby
 authors:
 - P Guardiario
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2014-04-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -30,7 +27,6 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: httpclient
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -38,7 +34,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -46,7 +41,6 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: json
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ~>
      - !ruby/object:Gem::Version
@@ -54,7 +48,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ~>
      - !ruby/object:Gem::Version
@@ -83,6 +76,7 @@ files:
 - lib/scrapey/cache/redis.rb
 - lib/scrapey/cache.rb
 - lib/scrapey/constants.rb
+- lib/scrapey/core.rb
 - lib/scrapey/database.rb
 - lib/scrapey/multi.rb
 - lib/scrapey/scrapey.rb
@@ -91,37 +85,42 @@ files:
 - lib/scrapey/tor.rb
 - lib/scrapey.rb
 - scrapey.gemspec
+- template/.gitignore
 - template/config/config.yml
 - template/Gemfile
 - template/icon.ico
 - template/output.csv
 - template/Rakefile
 - template/src/downloader.rb
+- template/src/emails.rb
+- template/src/export.rb
+- template/src/get_proxies.rb
+- template/src/proxy.rb
 - template/src/schema.rb
 - template/src/template.rb
 - template/template.iss
 homepage: ''
 licenses: []
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.
+rubygems_version: 2.1.5
 signing_key:
-specification_version:
+specification_version: 4
 summary: A simple scraping framework
 test_files: []
+has_rdoc: