scrapey 0.0.16 → 0.0.17

checksums.yaml CHANGED
@@ -1,15 +1,15 @@
  ---
  !binary "U0hBMQ==":
  metadata.gz: !binary |-
- Njk0YTkyMDY0MGYxMzM1ZDMwOWM5Yzg4YmQ4YTA2NGM3N2Q1ZGUxMw==
+ ZWYyNGI3OGE3MTA2ZmQxMGE0MzE1MmE2ZjA5YTFhYTliOTY1OTY5ZQ==
  data.tar.gz: !binary |-
- YjE2NjNlYjQwMTIwMWE3YmRjNWQzNDVlZWI0MjhjNWI1NWMzNTg1OQ==
+ YjRjOTVjOTkzNmEwOGE2NmMzYTVkNmNjMGRkODRjZjQ2OWM3OWNhNw==
  SHA512:
  metadata.gz: !binary |-
- NTcyZWFmYjdhZmMyMjVjNzc1ZTk5ZDRkMTM2YjYxMTE5NjZmMThmNTc2MGVl
- MTI4Yjc0ODkwZWQ3NGNkNWM4NmI5ZGFlMmJiYWNiMjFhMDEyMGIzMjhkZmYz
- ZDNlN2Y3NGFjYzdjNjlhMWE4Y2FiMzcxZmQyNDBlZTM4MTA1Yzg=
+ NzkzOWNhMTA2MGQ3MDYwYjA2ZjQ2M2Y1OTQ4YTczMDljMWQ2YjRhYjcyMTk0
+ Yzc4ZjczNjU5MTBjN2MyOTczM2Y0NDZkNzY0MDdhOGU4MDQ1ODA3ODMwZTJi
+ MzMyZGFlNDc4N2MxMmViYjM5MjE2N2Y1MjFiNDY0ODJiNGM3ZDE=
  data.tar.gz: !binary |-
- NmIwOGRjZTczMTE1YjE2MDQzNzc5MmJmYTQ4MzQzMGNlNGQ1Y2YxMzA1MDZk
- NzkzMTdhOTI3MzAxODNiYmZhNzlhNjkzNGUwODAwZTVmYmRjZjY1ZjdkOGJm
- ZTFmOWQ5OTFiZmQwY2U3NTBjYzA2ZjBkNjcwZmMzNjcxY2E2ODY=
+ MWQyZjQ1NTA4NjA4ZGVmNjNjYmQ0MDY2ZDJhZWZlMGJhYWI1NDIyOTcwNzhi
+ MDg0YWU0NmIyMmNhY2E1MTE3NGY3ODE0NDhmNDE3NTc4OGVhNjg0NjA4OWRk
+ MTVmNmVmNDUyZGU5ZmFiMjg0N2Y5ZmVhM2UyMWRmYjM2MmQwMDE=
@@ -2,7 +2,7 @@ require 'mechanize'
  require 'csv'
  require 'json'
  require 'yaml'
- require 'unf_ext'
+ # require 'unf_ext'
 
  require "scrapey/scrapey"
  require "scrapey/constants"
@@ -13,6 +13,8 @@ require "scrapey/tee"
 
  require 'addressable/uri'
 
+ EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i
+
  class URI::Parser
  def split url
  a = Addressable::URI::parse url
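
The EMAIL_REGEX constant added above is defined at the top level, so it is visible to any script that requires scrapey. A minimal, hypothetical usage sketch (the sample text is made up):

    text = 'Contact sales@example.com or support@example.org for details.'
    text.scan(EMAIL_REGEX)  # => ["sales@example.com", "support@example.org"]
    text[EMAIL_REGEX]       # => "sales@example.com" (first match only)
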
@@ -10,6 +10,7 @@ module Scrapey
  File.exists? cache_filename(url)
  end
 
+ =begin
  def load_cache url
  filename = cache_filename url
  return nil unless File::exists?(filename)
@@ -24,9 +25,26 @@ module Scrapey
  def save_cache url, doc, options = {}
  File.open(cache_filename(url), "wb") {|f| f << Marshal.dump(doc) }
  end
+ =end
+
+ def load_cache url
+ filename = cache_filename url
+ return nil unless File::exists?(filename)
+ debug "Loading #{filename} from cache"
+ begin
+ Mechanize::Page.new URI.parse(url), [], Marshal.load(Zlib::Inflate.inflate(File.open(filename, "rb"){|f| f.read})), nil, @agent
+ rescue Exception => e
+ puts e.message
+ end
+ end
+
+ def save_cache url, doc, options = {}
+ File.open(cache_filename(url), "wb") {|f| f << Zlib::Deflate.deflate(Marshal.dump(doc)) }
+ end
+
 
  def delete_cache url
- FileUtils.rm cache_filename(url)
+ FileUtils.rm(cache_filename(url)) rescue nil
  end
 
  end
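
The rewritten load_cache / save_cache above store the Marshal-dumped response body compressed with Zlib and rebuild a Mechanize::Page on the way back out. A rough sketch of the round-trip, assuming an illustrative file name and body:

    require 'zlib'
    body = '<html><body>hello</body></html>'
    # write the way the new save_cache does
    File.open('example_cache.bin', 'wb') { |f| f << Zlib::Deflate.deflate(Marshal.dump(body)) }
    # read the way the new load_cache does
    restored = Marshal.load(Zlib::Inflate.inflate(File.open('example_cache.bin', 'rb') { |f| f.read }))
    restored == body  # => true
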
@@ -1,5 +1,5 @@
  module Scrapey
- VERSION = "0.0.16"
+ VERSION = "0.0.17"
  BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
  URL = "https://github.com/monkeysuffrage/scrapey"
  end
@@ -25,8 +25,9 @@ module Scrapey
  return doc if doc
 
  page = agent.send *new_args
- str = page.respond_to?('root') ? page.root.to_s : page.body
- save_cache(url, str) if @use_cache
+ # str = page.respond_to?('root') ? page.root.to_s : page.body
+ # save_cache(url, str) if @use_cache
+ save_cache(url, page.body) if @use_cache
 
  #exit if Object.const_defined? :Ocra
  page
@@ -57,20 +58,46 @@ module Scrapey
  @fields = args
  end
 
- def save item
- unless @csv && !@csv.closed?
- @csv = CSV.open @output, 'w'
- @csv << @fields if @fields
+ def save_images urls
+ folder = "#{BASEDIR}/images"
+ Dir.mkdir(folder) unless Dir.exists?(folder)
+ names = []
+ urls.each do |url|
+ name = url[/[^\/]+$/]
+ binding.pry unless name
+ names << name
+ fn = "#{folder}/#{name}"
+ next if File.exists?(fn)
+ file = @agent.get(url)
+ File.open(fn, 'wb'){|f| f << file.body}
  end
- case
- when item.is_a?(Array) then @csv << item
- when item.is_a?(Hash) || item.is_a?(CSV::Row)
- raise 'No fields defined!' unless @fields
- @csv << @fields.map{|f| item[f]}
- else raise "unsupported type: #{item.class}"
+ names
+ end
+
+ def save item, output = nil
+ output ||= @output
+ @csvs ||= {}
+ unless @csvs[output]
+ obj = {}
+ begin
+ fn = output.gsub(/(?<!csv)$/, '.csv')
+ obj[:csv] = CSV.open fn, 'w'
+ rescue Exception => e
+ if e.is_a?(Errno::EACCES)
+ puts "Unable to access #{fn} - is it locked?"
+ exit
+ else
+ raise e
+ end
+ end
+ obj[:fields] = output == @output && @fields && !@fields.empty? ? @fields : item.keys
+ obj[:csv] << obj[:fields]
+ @csvs[output] = obj
  end
+ @csvs[output][:csv] << @csvs[output][:fields].map{|f| item[f]}
  end
 
+
  def visited? url
  @visited ||= []
  return true if @visited.include? url
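
With the reworked save above, each distinct output name lazily gets its own CSV handle in @csvs, the ".csv" extension is appended when missing, and the header row comes from @fields for the default output or from the item's keys otherwise. A hedged usage sketch (field names and values are illustrative):

    fields 'name', 'address', 'zip'
    save({'name' => 'Acme', 'address' => '1 Main St', 'zip' => '12345'})        # rows go to @output
    save({'url' => 'http://www.example.com/', 'status' => 'ok'}, 'report')      # rows go to report.csv
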
@@ -0,0 +1,63 @@
+ require 'scrapey'
+ require 'watir-webdriver'
+
+ use_cache
+
+ # File.open("#{BASEDIR}/config/urls.txt", 'w'){|f| f<< (0..100).map{|i| "http://www.example.com/id=#{i}"} * "\n"}
+ @queue = File.read("#{BASEDIR}/config/urls.txt").split("\n").reject{|url| is_cached?(url)}.shuffle
+ @proxies = File.read("#{BASEDIR}/config/proxies.txt").scan(/[\w.]+:\d+/)
+
+ @lock = Mutex.new
+
+ def response_ok? str
+ str[/Lidnummer/] && !str[/IP address/i]
+ end
+
+ def clean str
+ str.gsub(/[[:space:]]+/, ' ').strip
+ end
+
+ def download
+ browser = nil
+ @lock.synchronize do
+ browser = Watir::Browser.new
+ end
+ loop do
+ return unless url = @queue.shift
+
+ if is_cached?(url)
+ puts 'skipping'
+ next
+ end
+
+ begin
+ browser.goto url
+ unless response_ok?(browser.html)
+ raise 'str'
+ end
+ save_cache url, browser.html
+
+ puts browser.html[EMAIL_REGEX]
+ rescue StandardError => e
+ puts e.message[0..99]
+ @queue.push url
+ end
+ end
+
+ end
+
+ threads = []
+ @deficit = 0
+
+ until @queue.empty?
+ @good = 0
+ start_time = Time.now
+
+ @proxies.shuffle!
+
+ 1.times do
+ threads << Thread.new { download }
+ end
+ threads.each { |t| t.join }
+
+ end
@@ -1,28 +1,84 @@
  require 'scrapey'
- require 'pry'
 
  use_cache
 
  # File.open("#{BASEDIR}/config/urls.txt", 'w'){|f| f<< (0..100).map{|i| "http://www.example.com/id=#{i}"} * "\n"}
- @queue = File.read("#{BASEDIR}/config/urls.txt").split("\n").shuffle
+ @queue = File.read("#{BASEDIR}/config/urls.txt").split("\n").reject{|url| is_cached?(url)}.shuffle
+ @proxies ||= File.read("#{BASEDIR}/config/proxies.txt").scan(/[\w.]+:\d+/)
 
- def download agent
- while url = @queue.shift
- if is_cached? url
- puts 'skipping'
- next
+ def response_ok? page
+ page.body[/pub-9059175907567062/] && !page.body[/IP address/i]
+ end
+
+ def clean str
+ str.gsub(/[[:space:]]+/, ' ').strip
+ end
+
+ def download
+ loop do
+ Mechanize.start do |agent|
+ agent.read_timeout = agent.open_timeout = 30
+ agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
+ agent.user_agent = [
+ 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+ 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36',
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1',
+ 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
+ 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.69 Safari/537.36',
+ 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.69 Safari/537.36',
+ 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+ 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)'
+ ].sample
+
+ return unless url = @queue.shift
+
+ if is_cached?(url)
+ puts 'skipping'
+ next
+ end
+ unless proxy = @proxies.shift
+ puts "no more proxies"
+ return
+ end
+ @proxies.push proxy
+ host, port = proxy.split(':')
+ agent.set_proxy host, port.to_i
+ begin
+ page = agent.get url
+ unless response_ok?(page)
+ page.search('script,style').remove
+ puts clean(page.body)
+ raise 'str'
+ end
+ save_cache url, page.body
+
+ @good += 1
+ puts url
+ rescue StandardError => e
+ puts e.message[0..99]
+ @queue.push url
+ @proxies -= [proxy]
+ agent.cookie_jar.clear!
+ end
  end
- page = agent.get url
- save_cache url, page.body
- puts url
  end
+
  end
 
  threads = []
- 5.times do
- threads << Thread.new { download Mechanize.new{|a| a.history.max_size, a.verify_mode = 10, OpenSSL::SSL::VERIFY_NONE}}
- end
+ @deficit = 0
+
+ until @queue.empty?
+ @good = 0
+ start_time = Time.now
 
- threads.each { |t| t.join }
+ @proxies.shuffle!
 
- binding.pry
+ 10.times do
+ threads << Thread.new { download }
+ end
+ threads.each { |t| t.join }
+
+ end
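
The rewritten downloader reads its queue and proxy pool from files under BASEDIR/config; the scan above picks host:port pairs out of proxies.txt regardless of surrounding text. For example (the file contents are illustrative):

    # a proxies.txt containing "10.0.0.1:8080\nproxy.example.com:3128"
    File.read("#{BASEDIR}/config/proxies.txt").scan(/[\w.]+:\d+/)
    # => ["10.0.0.1:8080", "proxy.example.com:3128"]
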
@@ -0,0 +1,77 @@
+ require 'scrapey'
+ require 'chronic'
+ require 'pry'
+
+ # sample customizations...
+ # @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
+ # @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
+
+ def guess_type column
+ case column
+ when /RaceId/i then 'integer'
+ when /date/i then 'datetime'
+ when /is_/i then 'boolean'
+ when /descr/i then 'text'
+ when /price/i then 'float'
+ else 'string'
+ end
+ end
+
+ def new_table name, columns
+
+ ActiveRecord::Schema.define do
+ create_table name, options: 'ENGINE=InnoDB DEFAULT CHARSET=utf8' do |t|
+ columns.each do |column|
+ type = guess_type column
+ t.send type, column
+ end
+
+ =begin
+ t.string "string_field"
+ t.text "text_field"
+ t.integer "number_field"
+ t.boolean "boolean_field"
+ t.float "float_field"
+ t.date "created_at"
+ t.datetime "created_on"
+ =end
+ end
+ end
+
+ end
+
+ def map row
+ item = {}
+ row.headers.each do |k|
+ v = row[k]
+ item[k] = case guess_type(k)
+ when /date/ then Chronic.parse(v)
+ when 'boolean' then v && v != 0
+ else v
+ end
+ end
+ item
+ end
+
+ Dir.glob('input/*.csv').each do |fn|
+ @table = nil
+ table_name = fn[/\/(.*)\.csv/, 1].gsub(/\W+/,'_')
+ puts table_name
+
+ CSV.foreach(fn, :headers => true, :header_converters => lambda{|h| h.downcase.gsub(/\W+/, '_')}) do |row|
+
+ if !@table
+ new_table table_name, row.headers
+ tables table_name.singularize.camelize
+ @table = table_name.singularize.camelize.constantize
+ end
+
+ data = map row
+ #binding.pry
+
+ @table.new(data).save
+
+ print '.'
+ end
+ end
+
@@ -270,6 +270,42 @@ if ARGV.include?('-p')
  exit
  end
 
+ def pget url, skip_ok = false
+ raise 'no gaq' unless @gaq
+ return nil unless url[/^http/]
+ if @use_cache && is_cached?(url)
+ return get(url)
+ end
+ @proxy.rotate
+ begin
+ page = get url
+ rescue StandardError => e
+ puts e.message
+ @proxy.remove
+ @agent.cookie_jar.clear!
+ return pget(url)
+ end
+
+ case
+ when page.respond_to?(:title) && page.title && page.body[@gaq] && page.code == '200'
+ return page
+ else
+ delete_cache url
+ puts page.code
+ @proxy.remove
+ @agent.cookie_jar.clear!
+ return pget(url)
+ end
+ end
+
+ @config['proxies'] = File.read("#{BASEDIR}/config/proxies.txt").scan /[\w.]+:\d+/
+
+ puts "starting with #{@config['proxies'].length} proxies..."
+ @proxy = Proxy.new @agent, :proxies => @config['proxies'], :round_time => 60, :min => 0
+
+
+
+
  # for testing
  if __FILE__ == $0
  require 'mechanize'
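
The new pget helper rotates @proxy before each request and keeps retrying until it gets a 200 page with a title whose body contains the @gaq marker; anything else drops the current proxy, clears cookies, and recurses. A minimal, hypothetical call site (the marker value is only an example, borrowed from the downloader script above):

    @gaq = 'pub-9059175907567062'  # marker expected in a "good" response body
    page = pget 'http://www.example.com/listing?id=1'
    puts page.title if page
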
@@ -1,4 +1,5 @@
  =begin
+ # ActiveRecord::SchemaDumper.dump ActiveRecord::Base.connection, File.open('schema.rb', 'w')
  # put table schemas here. this will be included if the table is not found.
  ActiveRecord::Schema.define do
  create_table "items", options: 'ENGINE=InnoDB DEFAULT CHARSET=utf8' do |t|
@@ -2,10 +2,10 @@ require 'scrapey'
  require 'pry'
 
  # sample customizations...
- # @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
+ @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
  # @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
+ @output = "template.csv"
 
- EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i
 
  def clean str
  str.gsub(/[[:space:]]+/, ' ').strip
@@ -25,14 +25,15 @@ rescue StandardError => e
  end
 
 
- fields 'name', 'address', 'zip'
+ # fields 'name', 'address', 'zip'
 
  @url = "http://www.example.com/"
 
- with_cache do
- page = get @url
- binding.pry
- end
+ use_cache
+
+ page = get @url
+ binding.pry
+
 
  #@csv.close
  #%x{call #{@output}}
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: scrapey
  version: !ruby/object:Gem::Version
- version: 0.0.16
+ version: 0.0.17
  platform: ruby
  authors:
  - P Guardiario
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-04-22 00:00:00.000000000 Z
+ date: 2014-10-29 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: mechanize
@@ -91,10 +91,12 @@ files:
  - template/icon.ico
  - template/output.csv
  - template/Rakefile
+ - template/src/brownloader.rb
  - template/src/downloader.rb
  - template/src/emails.rb
  - template/src/export.rb
  - template/src/get_proxies.rb
+ - template/src/import.rb
  - template/src/proxy.rb
  - template/src/schema.rb
  - template/src/template.rb
@@ -118,9 +120,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.1.5
+ rubygems_version: 2.1.0
  signing_key:
  specification_version: 4
  summary: A simple scraping framework
  test_files: []
- has_rdoc: