scrapey 0.0.13 → 0.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/lib/scrapey.rb +33 -2
- data/lib/scrapey/cache/disk.rb +1 -1
- data/lib/scrapey/cache/redis.rb +1 -1
- data/lib/scrapey/constants.rb +1 -1
- data/lib/scrapey/core.rb +34 -0
- data/lib/scrapey/scrapey.rb +6 -0
- data/template/.gitignore +1 -0
- data/template/Rakefile +1 -1
- data/template/config/config.yml +3 -0
- data/template/src/emails.rb +48 -0
- data/template/src/export.rb +133 -0
- data/template/src/get_proxies.rb +14 -0
- data/template/src/proxy.rb +278 -0
- data/template/src/schema.rb +1 -1
- data/template/src/template.rb +6 -0
- metadata +12 -13
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: !binary |-
+    Njk0YTkyMDY0MGYxMzM1ZDMwOWM5Yzg4YmQ4YTA2NGM3N2Q1ZGUxMw==
+  data.tar.gz: !binary |-
+    YjE2NjNlYjQwMTIwMWE3YmRjNWQzNDVlZWI0MjhjNWI1NWMzNTg1OQ==
+SHA512:
+  metadata.gz: !binary |-
+    NTcyZWFmYjdhZmMyMjVjNzc1ZTk5ZDRkMTM2YjYxMTE5NjZmMThmNTc2MGVl
+    MTI4Yjc0ODkwZWQ3NGNkNWM4NmI5ZGFlMmJiYWNiMjFhMDEyMGIzMjhkZmYz
+    ZDNlN2Y3NGFjYzdjNjlhMWE4Y2FiMzcxZmQyNDBlZTM4MTA1Yzg=
+  data.tar.gz: !binary |-
+    NmIwOGRjZTczMTE1YjE2MDQzNzc5MmJmYTQ4MzQzMGNlNGQ1Y2YxMzA1MDZk
+    NzkzMTdhOTI3MzAxODNiYmZhNzlhNjkzNGUwODAwZTVmYmRjZjY1ZjdkOGJm
+    ZTFmOWQ5OTFiZmQwY2U3NTBjYzA2ZjBkNjcwZmMzNjcxY2E2ODY=
data/lib/scrapey.rb
CHANGED
@@ -11,6 +11,16 @@ require "scrapey/database"
 require "scrapey/multi"
 require "scrapey/tee"

+require 'addressable/uri'
+
+class URI::Parser
+  def split url
+    a = Addressable::URI::parse url
+    [a.scheme, a.userinfo, a.host, a.port, nil, a.path, nil, a.query, a.fragment]
+  end
+end
+
+
 # don't do this stuff in rails:
 unless defined? Rails
   Scrapey::init binding
@@ -24,5 +34,26 @@ unless defined? Rails

   init_db if @config['database']

-
-end
+  #$stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "w"))
+end
+
+if defined?(Ocra)
+  puts "doing ocra stuff..."
+  Mechanize.new.cookies
+  HTTP::Cookie::Scanner.new ''
+  if @config['database'] || @config['databases']
+    puts "doing ocra db stuff..."
+    ActiveRecord::Relation::PredicateBuilder.new rescue nil
+    [
+      'active_record',
+      'active_record/schema',
+      'active_record/connection_adapters/abstract/schema_definitions',
+      @config['database'] ? @config['database']['adapter'] : 'mysql',
+      'tzinfo',
+      'active_support/all',
+      'active_support/multibyte/chars'
+    ].each{|lib| require lib}
+  end
+end
+
+Dir.chdir BASEDIR
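Note: the URI::Parser#split patch above (duplicated in lib/scrapey/core.rb below) reroutes all stdlib URI parsing through Addressable, so URLs that URI::parse would reject no longer raise URI::InvalidURIError inside Mechanize. A minimal sketch of the effect; the example URL is illustrative, not from the gem:

    require 'addressable/uri'

    class URI::Parser
      def split url
        a = Addressable::URI::parse url
        [a.scheme, a.userinfo, a.host, a.port, nil, a.path, nil, a.query, a.fragment]
      end
    end

    # plain stdlib URI.parse would raise URI::InvalidURIError on the unescaped pipe
    uri = URI.parse 'http://example.com/search?q=a|b'
    puts uri.query  # => "q=a|b"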
data/lib/scrapey/cache/disk.rb
CHANGED
@@ -15,7 +15,7 @@ module Scrapey
     return nil unless File::exists?(filename)
     debug "Loading #{filename} from cache"
     begin
-
+      Mechanize::Page.new URI.parse(url), [], Marshal.load(File.open(filename, "rb"){|f| f.read}), nil, @agent
     rescue Exception => e
       puts e.message
     end
data/lib/scrapey/cache/redis.rb
CHANGED
@@ -9,7 +9,7 @@ module Scrapey
   def load_cache url
     debug "Loading #{url} from cache"
     return nil unless str = @redis.get(url)
-
+    Mechanize::Page.new(URI.parse(url), [], Marshal.load(str), nil, @agent) rescue nil
   end

   def save_cache url, body, options = {}
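Note on the two cache fixes above: load_cache now reinflates the stored body into a full Mechanize::Page with @agent attached, so links and forms on a cached page behave like a live fetch; the Marshal.load implies the matching save_cache (not shown in this diff) dumps the body with Marshal. A rough sketch of the disk round trip; the cache_filename helper here is illustrative, not the gem's exact code:

    require 'mechanize'
    require 'digest/md5'

    @agent = Mechanize.new

    def cache_filename url
      "cache/#{Digest::MD5.hexdigest(url)}"   # illustrative naming scheme
    end

    def save_cache url, body
      File.open(cache_filename(url), 'wb') { |f| f.write Marshal.dump(body) }
    end

    def load_cache url
      filename = cache_filename url
      return nil unless File.exist?(filename)
      # same reconstruction as the diff: body + originating agent => usable page
      Mechanize::Page.new URI.parse(url), [], Marshal.load(File.binread(filename)), nil, @agent
    end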
data/lib/scrapey/constants.rb
CHANGED
data/lib/scrapey/core.rb
ADDED
@@ -0,0 +1,34 @@
+require 'addressable/uri'
+
+class URI::Parser
+  def split url
+    a = Addressable::URI::parse url
+    [a.scheme, a.userinfo, a.host, a.port, nil, a.path, nil, a.query, a.fragment]
+  end
+end
+
+class Hash
+  def shuffle
+    Hash[self.to_a.shuffle]
+  end
+end
+
+class Nokogiri::XML::NodeSet
+  def shuffle
+    self.to_a.shuffle
+  end
+end
+
+class Enumerator
+  def shuffle
+    self.to_a.shuffle
+  end
+end
+
+class CSV::Table
+  def shuffle
+    arr = self.to_a
+    k = arr.shift
+    arr.map{|v| Hash[k.zip v]}.shuffle
+  end
+end
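Note: core.rb gives Hash, Nokogiri::XML::NodeSet, Enumerator, and CSV::Table a uniform #shuffle so scraping loops can randomize their work order whatever the input type; CSV::Table#shuffle additionally turns each data row into a header-keyed Hash. A small usage sketch (sample data is illustrative; assumes core.rb is loaded):

    require 'csv'

    table = CSV.parse("name,url\nacme,http://acme.test\nglobex,http://globex.test", headers: true)
    # CSV::Table#shuffle: rows come back as header-keyed hashes in random order
    table.shuffle.each do |row|
      puts row['name']
    end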
data/lib/scrapey/scrapey.rb
CHANGED
data/template/.gitignore
ADDED
@@ -0,0 +1 @@
+config
data/template/Rakefile
CHANGED
@@ -18,7 +18,7 @@ task 'dropbox' do
     folder = [ENV['DROPBOX'], name].join('/').squeeze('/')
     FileUtils.mkdir(folder) unless File.exists?(folder)
     FileUtils.cp "Output/#{file}", folder
-    url = [ENV['DROPBOX_public_url'], name, file].join('/')
+    url = [ENV['DROPBOX_public_url'], name, file].join('/')
     puts "uploaded to #{url}"
   end
 end
data/template/config/config.yml
CHANGED
data/template/src/emails.rb
ADDED
@@ -0,0 +1,48 @@
+require 'scrapey'
+require 'pry'
+
+=begin
+@config = {
+  'category' => 'businesses',
+  'dataset_name' => 'brazilian_companies',
+  'database' => {
+    'adapter' => 'mysql',
+    'database' => 'stefan',
+    'username' => 'root',
+    'password' => '12345',
+    'host' => 'localhost',
+    'encoding' => 'utf8'
+  }
+}
+=end
+
+def post url, body
+  page = @agent.post url, body
+  JSON.parse(page.body).each{|k, v|}
+  raise 'x' unless page.body
+  page
+rescue Exception => e
+  print '!'
+  sleep 10
+  return post url, body
+end
+
+@agent.open_timeout = @agent.read_timeout = 10000
+
+tables = ActiveRecord::Base.connection.tables
+
+tables.each do |table|
+  puts table
+  tables table.camelize
+  klass = table.camelize.constantize
+  return unless klass.column_names.include?('website')
+
+  klass.where("website is not null and email is null").find_in_batches(:batch_size => 10) do |group|
+    page = post('http://www.pay4data.com/lookup/email_for_url', {urls: group.map(&:website).compact}.to_json)
+    JSON.parse(page.body).each do |k, v|
+      group.find{|r| r['website'] == k}.update_attributes(:email => v)
+      puts k
+    end
+  end
+end
+
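Note: the post helper in emails.rb retries without limit: any exception, including a JSON parse failure on a bad response, prints '!', sleeps 10 seconds, and recurses, so a permanently broken endpoint loops forever while the recursion slowly deepens the stack. A bounded variant for comparison (a sketch, not what the gem ships):

    def post url, body, attempts = 5
      page = @agent.post url, body
      JSON.parse(page.body)          # raises unless the response is valid JSON
      page
    rescue Exception => e
      raise if attempts <= 0         # give up instead of retrying forever
      print '!'
      sleep 10
      post url, body, attempts - 1
    end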
data/template/src/export.rb
ADDED
@@ -0,0 +1,133 @@
+require 'aws-sdk'
+require 'scrapey'
+require 'pry'
+
+=begin
+@config = {
+  'category' => 'businesses',
+  'dataset_name' => 'brazilian_companies',
+  'database' => {
+    'adapter' => 'mysql',
+    'database' => 'stefan',
+    'username' => 'root',
+    'password' => '12345',
+    'host' => 'localhost',
+    'encoding' => 'utf8'
+  }
+}
+
+
+
+CSV.open("#{BASEDIR}/#{table}.csv", 'w') do |csv|
+  csv << fields
+  klass.where(:found => true).find_each do |row|
+    csv << fields.map{|f| row[f]}
+  end
+end
+
+=end
+
+
+
+def new_csv filename
+  File.open(filename, 'w') do |file|
+    file << 0xEF.chr + 0xBB.chr + 0xBF.chr
+  end
+  CSV.open(filename, 'a') do |csv|
+    yield csv
+  end
+end
+
+unless @config['dataset_name'] && @config['category']
+  puts 'Please fill out dataset_name and category in config.yml to continue'
+  exit
+end
+init_db
+@tables = ActiveRecord::Base.connection.tables
+
+all_fields = []
+
+@tables.each do |table|
+  puts table
+  tables table.camelize
+  klass = table.camelize.constantize
+
+  all_fields << klass.column_names
+  fields = klass.column_names - ['id', 'updated_at', 'created_at', 'updated_on', 'created_on']
+
+  new_csv("#{BASEDIR}/#{table}.csv") do |csv|
+    csv << fields
+    klass.all.find_each do |row|
+      csv << fields.map{|f| row[f]}
+    end
+  end
+
+  new_csv("#{BASEDIR}/#{table}_sample.csv") do |csv|
+    csv << fields
+    klass.order(:id).order('rand()').limit(50).each do |row|
+      csv << fields.map{|f| row[f]}
+    end
+  end
+
+end
+
+if @tables.length == 0
+  table = @tables.first
+  `gzip -f #{BASEDIR}/#{table}_sample.csv`
+  `gzip -f #{BASEDIR}/#{table}.csv`
+  csv_name = "#{table}.csv.gz"
+  sample_name = "#{table}_sample.csv.gz"
+
+  csv_name = "#{@config['dataset_name']}.csv.gz"
+  `mv #{BASEDIR}/#{table}.csv.gz #{csv_name}`
+  sample_name = "#{@config['dataset_name']}_sample.csv.gz"
+  `mv #{BASEDIR}/#{table}_sample.csv.gz #{sample_name}`
+
+else
+  csv_name = "#{@config['dataset_name']}.csv.tar.gz"
+  sample_name = "#{@config['dataset_name']}.sample.tar.gz"
+  sample_sql = "#{@config['dataset_name']}_sample.sql"
+
+  cmd = "tar -czf #{csv_name} " + @tables.map{|x| x + '.csv'}.join(' ')
+  `#{cmd}`
+  File.open(sample_sql, 'w') do |f|
+    f << `"C:\\Program Files\\MySQL\\MySQL Server 5.6\\bin\\mysqldump.exe" -uroot -p12345 --where="true limit 100" #{@config['database']['database']}`
+  end
+  cmd = "tar -czf #{sample_name} #{sample_sql} " + @tables.map{|x| x + '_sample.csv'}.join(' ')
+  `#{cmd}`
+end
+
+# --where="true limit 100"
+File.open("#{@config['dataset_name']}.sql", 'w') do |f|
+  f << `"C:\\Program Files\\MySQL\\MySQL Server 5.6\\bin\\mysqldump.exe" -uroot -p12345 #{@config['database']['database']}`
+end
+`gzip -f #{@config['dataset_name']}.sql`
+sql_name = "#{@config['dataset_name']}.sql.gz"
+
+s3 = AWS::S3.new :access_key_id => ENV['AMAZON_ACCESS_KEY_ID'], :secret_access_key => ENV['AMAZON_SECRET_ACCESS_KEY']
+bucket = s3.buckets['pay4data']
+
+sample_object = bucket.objects["#{@config['category']}/#{sample_name}"].write :file => sample_name, :content_type => 'application/gzip', :acl => :public_read
+csv_object = bucket.objects["#{@config['category']}/#{csv_name}"].write :file => csv_name, :content_type => 'application/gzip'
+sql_object = bucket.objects["#{@config['category']}/#{sql_name}"].write :file => sql_name, :content_type => 'application/gzip'
+
+sql = <<EOF
+insert into datasets(sample_url, csv_url, sql_url, last_crawled, fields) values(
+'#{sample_object.public_url.to_s}',
+'#{csv_object.public_url.to_s}',
+'#{sql_object.public_url.to_s}',
+now(),
+'#{fields.map{|t| (t - ['id', 'updated_at', 'created_at', 'updated_on', 'created_on']).join ', '}.join ", "}'
+);
+
+update datasets set category_id=5, name='', description='', price='', button_html='' where id=
+
+
+mysqldump pay4data datasets categories | mysql2 pay4data
+
+
+EOF
+
+puts sql
+
+
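Note: new_csv in export.rb first writes the bytes 0xEF 0xBB 0xBF and then reopens the file in append mode; those three bytes are the UTF-8 byte-order mark, which lets Excel detect the encoding of the exported CSV. An equivalent sketch with an illustrative header row:

    require 'csv'

    def new_csv filename
      # "\xEF\xBB\xBF" is the UTF-8 BOM; Excel uses it to pick the right encoding
      File.open(filename, 'wb') { |f| f << "\xEF\xBB\xBF" }
      CSV.open(filename, 'a') { |csv| yield csv }
    end

    new_csv('items.csv') do |csv|
      csv << ['name', 'url']
      csv << ['acme', 'http://acme.test']
    end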
data/template/src/get_proxies.rb
ADDED
@@ -0,0 +1,14 @@
+require 'scrapey'
+require 'pry'
+require 'syck'
+require "#{BASEDIR}/src/proxy.rb"
+
+
+YAML::ENGINE.yamler='syck'
+
+#proxies = Proxy::get_proxies :proxy_list
+proxies = Proxy::get_proxies :all
+
+@config['proxies'] = proxies.uniq
+File.open("#{BASEDIR}/config/config.yml", 'w') { |f| YAML.dump(@config, f) }
+
data/template/src/proxy.rb
ADDED
@@ -0,0 +1,278 @@
+require "base64"
+
+class Proxy
+  attr_reader :current
+  BOOM = 'boom'
+
+  def initialize agent = nil, options = {}
+    @user_agents = [
+      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36',
+      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1',
+      'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
+      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.69 Safari/537.36',
+      'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
+      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.69 Safari/537.36',
+      'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+      'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)'
+    ]
+    @agent = agent
+    @min = options[:min] || 5
+    @sleep = options[:sleep] || 60 * 60 # 1 hour
+    @verbose = options[:verbose] || false
+    @timeout = options[:timeout] || 30
+    @round_time = options[:round_time] || 5 * 60 # 5 minutes
+    @agent.open_timeout = @agent.read_timeout = @timeout
+    proxies = options[:proxies] || []
+    set_proxies proxies
+  end
+
+  def set_proxies proxies
+    @proxies = proxies.select{|x| x[/:/]}.uniq{|x| x[/.*:/]}
+    self.shuffle
+  end
+
+  def debug str
+    puts str if @verbose
+  end
+
+  def shuffle
+    @proxies = [BOOM] + (@proxies - [BOOM]).shuffle
+    start_round
+    self.rotate
+  end
+
+  def to_yaml
+    @proxies.to_yaml
+  end
+
+  def start_round
+    now = Time.now.to_i
+    if @round_start
+      sleep_time = @round_time - (now - @round_start)
+      if sleep_time > 0
+        puts "sleeping for #{sleep_time}"
+        sleep sleep_time
+      end
+    end
+    @round_start = Time.now.to_i
+  end
+
+  def get_more_proxies
+    puts 'getting more proxies'
+    proxies = Proxy::get_proxies
+    set_proxies proxies
+  end
+
+  def rotate
+    debug "rotating"
+    @proxies.rotate!
+    @user_agents.rotate!
+    if @proxies.length < @min
+      get_more_proxies
+    end
+    @current = @proxies.first
+    if @current == BOOM
+      start_round
+      rotate
+      return
+    end
+
+    host, port = @current.split ':'
+    debug "setting proxy to #{host}:#{port}"
+    @agent.set_proxy host, port.to_i
+    debug "setting user_agent to #{@user_agents.first}"
+    @agent.user_agent = @user_agents.first
+  end
+
+  def remove
+    debug "--- removing #{@current}"
+    @proxies.shift
+    rotate
+    debug @proxies.join(', ')
+    debug @current
+  end
+
+  def pause
+    time = @sleep / @proxies.length
+    debug "sleeping for #{time}"
+    sleep time
+  end
+
+  def length
+    @proxies.length
+  end
+
+
+
+  def self.get_idcloak
+    proxies = []
+    ['http://www.idcloak.com/proxylist/free-proxy-servers-list.html'].each do |url|
+      page = @agent.get url
+
+      page.search('#sort td[7]').each do |td|
+        port = td.text.strip
+        host = td.at('+ td').text.strip
+        proxies << "#{host}:#{port}"
+      end
+
+    end
+    proxies
+  end
+
+  def self.get_proxynova
+    proxies = []
+    ['http://www.proxynova.com/proxy-server-list/'].each do |url|
+      page = @agent.get url
+
+      page.search('.row_proxy_ip').each do |span|
+        str = span.text[/long2ip\((.*?)\)/, 1]
+        next if str[/a-z/i]
+        i = eval str
+        host = Proxy::long2ip(i)
+        port = span.parent.at('+ td').text.strip
+        proxies << "#{host}:#{port}"
+      end
+    end
+    proxies
+  end
+
+  def self.get_proxy_list
+    proxies = []
+    ['http://proxy-list.org/en/index.php',
+     'http://proxy-list.org/en/index.php?sp=20',
+     'http://proxy-list.org/en/index.php?sp=40',
+     'http://proxy-list.org/en/index.php?sp=60',
+     'http://proxy-list.org/en/index.php?sp=80',
+     'http://proxy-list.org/en/index.php?sp=100',
+     'http://proxy-list.org/en/index.php?sp=120'].each do |url|
+      page = @agent.get url
+      proxies += page.body.scan(/(?:\d+\.){3}\d+:\d+/)
+    end
+    proxies
+  end
+
+  def self.get_hidemyass
+    proxies = []
+    ['http://hidemyass.com/proxy-list/search-227752',
+     'http://hidemyass.com/proxy-list/search-227752/2',
+     'http://hidemyass.com/proxy-list/search-227752/3',
+     'http://hidemyass.com/proxy-list/search-227752/4',
+     'http://hidemyass.com/proxy-list/search-227752/5',
+     'http://hidemyass.com/proxy-list/search-227752/6'].each do |url|
+      page = @agent.get url
+      page.search('*[style*="display:none"]').remove
+      page.search(page.body.scan(/(\..*?)\{display:none\}/).flatten.join(', ')).remove
+      page.search('style').remove
+      proxies += page.search('td[2]').map{|x| x.text.strip}.zip(page.search('td[3]').map{|x| x.text.strip}).map{|h,p| "#{h}:#{p}"}[1..-1]
+    end
+    proxies
+  end
+
+  def self.get_cool_proxy
+    proxies = []
+    page = @agent.get 'http://www.cool-proxy.net/proxies/http_proxy_list/sort:score/direction:desc'
+    page.search('tr')[1..-2].each do |tr|
+      next unless tr.at('td[2]')
+      host = Base64.decode64 tr.at('td[1]').text[/"(.*?)"/, 1]
+      port = tr.at('td[2]').text
+      proxies << [host, port].join(':')
+    end
+
+    while a = page.at('a[rel=next]')
+      url = URI.join('http://www.freeproxylists.net/', a[:href]).to_s
+      begin
+        page = @agent.get url
+      rescue
+        return proxies
+      end
+      page.search('tr')[1..-2].each do |tr|
+        next unless tr.at('td[2]')
+        host = Base64.decode64 tr.at('td[1]').text[/"(.*?)"/, 1]
+        port = tr.at('td[2]').text
+        proxies << [host, port].join(':')
+      end
+    end

+    proxies
+  end
+
+
+  def self.get_freeproxylists
+    proxies = []
+
+    @agent.follow_meta_refresh = true
+    page = @agent.get 'http://www.freeproxylists.net/'
+
+    page.body.scan(/IPDecode\("([^"]+)"\)<\/script><\/td><td align="center">(\d+)/).each do |row|
+      proxies << [URI.decode(row[0]), row[1]].join(':')
+    end
+
+    while a = page.at('a[text()^=Next]')
+      url = URI.join('http://www.freeproxylists.net/', a[:href]).to_s
+      puts url
+      page = @agent.get url
+      page.body.scan(/IPDecode\("([^"]+)"\)<\/script><\/td><td align="center">(\d+)/).each do |row|
+        proxies << [URI.decode(row[0]), row[1]].join(':')
+      end
+    end
+
+    proxies
+  end
+
+  def self.long2ip(long)
+    ip = []
+    4.times do |i|
+      ip.push(long.to_i & 255)
+      long = long.to_i >> 8
+    end
+    ip.join(".")
+  end
+
+  def self.get_proxies provider = :all
+
+    @agent ||= Mechanize.new{|a| a.history.max_size = 10}
+    @agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
+    @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
+
+    case provider
+    when :proxy_list then return get_proxy_list
+    when :hidemyass then return get_hidemyass
+    when :freeproxylists then return get_freeproxylists
+    when :cool_proxy then return get_cool_proxy
+    when :proxynova then return get_proxynova
+    when :idcloak then return get_idcloak
+    when :all
+      proxies = []
+      [:proxy_list, :hidemyass, :freeproxylists, :cool_proxy, :proxynova, :idcloak].each do |key|
+        puts key
+        begin
+          part = get_proxies(key)
+        rescue Exception => e
+          part = []
+          puts e.message
+        end
+        puts part.length
+        proxies += part
+      end
+      proxies
+    end
+  end
+end
+
+if ARGV.include?('-p')
+  puts "refreshing proxies, please wait..."
+  require "#{BASEDIR}/src/get_proxies.rb"
+  puts "#{@config['proxies'].length} proxies found."
+  puts "Hit [enter] to exit"
+  $stdin.gets
+  exit
+end
+
+# for testing
+if __FILE__ == $0
+  require 'mechanize'
+  @agent = Mechanize.new
+  proxy = Proxy.new @agent, :verbose => true, :min => 5
+end
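Note: in proxy.rb the BOOM string is a pacing sentinel rather than a proxy: shuffle pins it into the rotation, and each time rotate brings it to the front, start_round sleeps out whatever remains of @round_time, so one full pass through the proxy list can never take less than a round. Typical driving code, mirroring the testing block at the end of the file (the proxy addresses and URL are illustrative):

    require 'mechanize'
    require_relative 'proxy'

    agent = Mechanize.new
    proxy = Proxy.new agent, :verbose => true, :min => 1,
                      :proxies => ['203.0.113.1:8080', '203.0.113.2:3128']

    begin
      page = agent.get 'http://example.com/'
      proxy.rotate   # next proxy and user agent for the following request
    rescue StandardError
      proxy.remove   # drop the current proxy (presumed dead) and move on
    end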
data/template/src/schema.rb
CHANGED
@@ -1,7 +1,7 @@
 =begin
 # put table schemas here. this will be included if the table is not found.
 ActiveRecord::Schema.define do
-  create_table "items" do |t|
+  create_table "items", options: 'ENGINE=InnoDB DEFAULT CHARSET=utf8' do |t|
     t.string "string_field"
     t.text "text_field"
     t.integer "number_field"
data/template/src/template.rb
CHANGED
@@ -5,6 +5,12 @@ require 'pry'
 # @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
 # @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")

+EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i
+
+def clean str
+  str.gsub(/[[:space:]]+/, ' ').strip
+end
+
 def scrape div
   a = div.at('a')
   url = URI.join(@url, a[:href]).to_s
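Note: the template.rb additions give every generated scraper a shared EMAIL_REGEX plus a clean helper; [[:space:]] matches Unicode whitespace such as non-breaking spaces, which String#strip alone leaves behind. For example:

    EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i

    def clean str
      str.gsub(/[[:space:]]+/, ' ').strip
    end

    clean("  Acme\u00A0 Widgets \n Ltd ")           # => "Acme Widgets Ltd"
    "contact: sales@example.com".scan(EMAIL_REGEX)  # => ["sales@example.com"]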
metadata
CHANGED
@@ -1,20 +1,18 @@
 --- !ruby/object:Gem::Specification
 name: scrapey
 version: !ruby/object:Gem::Version
-  version: 0.0.
-  prerelease:
+  version: 0.0.16
 platform: ruby
 authors:
 - P Guardiario
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2014-04-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -30,7 +27,6 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: httpclient
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -38,7 +34,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -46,7 +41,6 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: json
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
@@ -54,7 +48,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
@@ -83,6 +76,7 @@ files:
 - lib/scrapey/cache/redis.rb
 - lib/scrapey/cache.rb
 - lib/scrapey/constants.rb
+- lib/scrapey/core.rb
 - lib/scrapey/database.rb
 - lib/scrapey/multi.rb
 - lib/scrapey/scrapey.rb
@@ -91,37 +85,42 @@ files:
 - lib/scrapey/tor.rb
 - lib/scrapey.rb
 - scrapey.gemspec
+- template/.gitignore
 - template/config/config.yml
 - template/Gemfile
 - template/icon.ico
 - template/output.csv
 - template/Rakefile
 - template/src/downloader.rb
+- template/src/emails.rb
+- template/src/export.rb
+- template/src/get_proxies.rb
+- template/src/proxy.rb
 - template/src/schema.rb
 - template/src/template.rb
 - template/template.iss
 homepage: ''
 licenses: []
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.
+rubygems_version: 2.1.5
 signing_key:
-specification_version:
+specification_version: 4
 summary: A simple scraping framework
 test_files: []
+has_rdoc: