scrapey 0.0.16 → 0.0.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-    Njk0YTkyMDY0MGYxMzM1ZDMwOWM5Yzg4YmQ4YTA2NGM3N2Q1ZGUxMw==
+    ZWYyNGI3OGE3MTA2ZmQxMGE0MzE1MmE2ZjA5YTFhYTliOTY1OTY5ZQ==
   data.tar.gz: !binary |-
-    YjE2NjNlYjQwMTIwMWE3YmRjNWQzNDVlZWI0MjhjNWI1NWMzNTg1OQ==
+    YjRjOTVjOTkzNmEwOGE2NmMzYTVkNmNjMGRkODRjZjQ2OWM3OWNhNw==
 SHA512:
   metadata.gz: !binary |-
-    NTcyZWFmYjdhZmMyMjVjNzc1ZTk5ZDRkMTM2YjYxMTE5NjZmMThmNTc2MGVl
-    MTI4Yjc0ODkwZWQ3NGNkNWM4NmI5ZGFlMmJiYWNiMjFhMDEyMGIzMjhkZmYz
-    ZDNlN2Y3NGFjYzdjNjlhMWE4Y2FiMzcxZmQyNDBlZTM4MTA1Yzg=
+    NzkzOWNhMTA2MGQ3MDYwYjA2ZjQ2M2Y1OTQ4YTczMDljMWQ2YjRhYjcyMTk0
+    Yzc4ZjczNjU5MTBjN2MyOTczM2Y0NDZkNzY0MDdhOGU4MDQ1ODA3ODMwZTJi
+    MzMyZGFlNDc4N2MxMmViYjM5MjE2N2Y1MjFiNDY0ODJiNGM3ZDE=
   data.tar.gz: !binary |-
-    NmIwOGRjZTczMTE1YjE2MDQzNzc5MmJmYTQ4MzQzMGNlNGQ1Y2YxMzA1MDZk
-    NzkzMTdhOTI3MzAxODNiYmZhNzlhNjkzNGUwODAwZTVmYmRjZjY1ZjdkOGJm
-    ZTFmOWQ5OTFiZmQwY2U3NTBjYzA2ZjBkNjcwZmMzNjcxY2E2ODY=
+    MWQyZjQ1NTA4NjA4ZGVmNjNjYmQ0MDY2ZDJhZWZlMGJhYWI1NDIyOTcwNzhi
+    MDg0YWU0NmIyMmNhY2E1MTE3NGY3ODE0NDhmNDE3NTc4OGVhNjg0NjA4OWRk
+    MTVmNmVmNDUyZGU5ZmFiMjg0N2Y5ZmVhM2UyMWRmYjM2MmQwMDE=
@@ -2,7 +2,7 @@ require 'mechanize'
 require 'csv'
 require 'json'
 require 'yaml'
-require 'unf_ext'
+# require 'unf_ext'
 
 require "scrapey/scrapey"
 require "scrapey/constants"
@@ -13,6 +13,8 @@ require "scrapey/tee"
 
 require 'addressable/uri'
 
+EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i
+
 class URI::Parser
   def split url
     a = Addressable::URI::parse url
@@ -10,6 +10,7 @@ module Scrapey
     File.exists? cache_filename(url)
   end
 
+=begin
   def load_cache url
     filename = cache_filename url
     return nil unless File::exists?(filename)
@@ -24,9 +25,26 @@ module Scrapey
   def save_cache url, doc, options = {}
     File.open(cache_filename(url), "wb") {|f| f << Marshal.dump(doc) }
   end
+=end
+
+  def load_cache url
+    filename = cache_filename url
+    return nil unless File::exists?(filename)
+    debug "Loading #{filename} from cache"
+    begin
+      Mechanize::Page.new URI.parse(url), [], Marshal.load(Zlib::Inflate.inflate(File.open(filename, "rb"){|f| f.read})), nil, @agent
+    rescue Exception => e
+      puts e.message
+    end
+  end
+
+  def save_cache url, doc, options = {}
+    File.open(cache_filename(url), "wb") {|f| f << Zlib::Deflate.deflate(Marshal.dump(doc)) }
+  end
+
 
   def delete_cache url
-    FileUtils.rm cache_filename(url)
+    FileUtils.rm(cache_filename(url)) rescue nil
   end
 
 end
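Note on the cache change above: pages are now stored as a Zlib-deflated Marshal dump and rehydrated into a Mechanize::Page on load. A minimal standalone sketch of that round trip, with a hypothetical cache_filename helper standing in for the gem's own URL-to-path mapping:

    require 'zlib'
    require 'digest'

    # Hypothetical stand-in for Scrapey's cache_filename helper.
    def cache_filename(url)
      "cache/#{Digest::MD5.hexdigest(url)}"
    end

    url  = 'http://www.example.com/'
    body = '<html><body>hello</body></html>'

    # Write: Marshal the body, then deflate it, as the new save_cache does.
    Dir.mkdir('cache') unless Dir.exist?('cache')
    File.open(cache_filename(url), 'wb') { |f| f << Zlib::Deflate.deflate(Marshal.dump(body)) }

    # Read: inflate and un-Marshal, as the new load_cache does before wrapping
    # the result in a Mechanize::Page.
    restored = Marshal.load(Zlib::Inflate.inflate(File.binread(cache_filename(url))))
    puts restored == body   # => true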
@@ -1,5 +1,5 @@
 module Scrapey
-  VERSION = "0.0.16"
+  VERSION = "0.0.17"
   BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
   URL = "https://github.com/monkeysuffrage/scrapey"
 end
@@ -25,8 +25,9 @@ module Scrapey
     return doc if doc
 
     page = agent.send *new_args
-    str = page.respond_to?('root') ? page.root.to_s : page.body
-    save_cache(url, str) if @use_cache
+    # str = page.respond_to?('root') ? page.root.to_s : page.body
+    # save_cache(url, str) if @use_cache
+    save_cache(url, page.body) if @use_cache
 
     #exit if Object.const_defined? :Ocra
     page
@@ -57,20 +58,46 @@ module Scrapey
     @fields = args
   end
 
-  def save item
-    unless @csv && !@csv.closed?
-      @csv = CSV.open @output, 'w'
-      @csv << @fields if @fields
+  def save_images urls
+    folder = "#{BASEDIR}/images"
+    Dir.mkdir(folder) unless Dir.exists?(folder)
+    names = []
+    urls.each do |url|
+      name = url[/[^\/]+$/]
+      binding.pry unless name
+      names << name
+      fn = "#{folder}/#{name}"
+      next if File.exists?(fn)
+      file = @agent.get(url)
+      File.open(fn, 'wb'){|f| f << file.body}
     end
-    case
-    when item.is_a?(Array) then @csv << item
-    when item.is_a?(Hash) || item.is_a?(CSV::Row)
-      raise 'No fields defined!' unless @fields
-      @csv << @fields.map{|f| item[f]}
-    else raise "unsupported type: #{item.class}"
+    names
+  end
+
+  def save item, output = nil
+    output ||= @output
+    @csvs ||= {}
+    unless @csvs[output]
+      obj = {}
+      begin
+        fn = output.gsub(/(?<!csv)$/, '.csv')
+        obj[:csv] = CSV.open fn, 'w'
+      rescue Exception => e
+        if e.is_a?(Errno::EACCES)
+          puts "Unable to access #{fn} - is it locked?"
+          exit
+        else
+          raise e
+        end
+      end
+      obj[:fields] = output == @output && @fields && !@fields.empty? ? @fields : item.keys
+      obj[:csv] << obj[:fields]
+      @csvs[output] = obj
     end
+    @csvs[output][:csv] << @csvs[output][:fields].map{|f| item[f]}
   end
 
+
   def visited? url
     @visited ||= []
     return true if @visited.include? url
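The reworked save above keeps one CSV handle per output name and writes a header row from the declared fields (or, for other outputs, the first item's keys). A rough usage sketch of that routing, with illustrative filenames and a simplified stand-in for the @csvs bookkeeping:

    require 'csv'

    # Simplified model of the per-output handle cache the new #save builds in @csvs.
    def write(csvs, item, output)
      unless csvs[output]
        fn  = output.gsub(/(?<!csv)$/, '.csv')   # append .csv unless the name already ends in csv
        csv = CSV.open(fn, 'w')
        csv << item.keys                         # header row from the first item's keys
        csvs[output] = { csv: csv, fields: item.keys }
      end
      csvs[output][:csv] << csvs[output][:fields].map { |f| item[f] }
    end

    csvs = {}
    write(csvs, { 'name' => 'Alice', 'zip' => '90210' }, 'people')   # -> people.csv
    write(csvs, { 'sku' => 'A1', 'price' => '9.99' }, 'orders')      # -> orders.csv
    csvs.each_value { |h| h[:csv].close }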
@@ -0,0 +1,63 @@
+require 'scrapey'
+require 'watir-webdriver'
+
+use_cache
+
+# File.open("#{BASEDIR}/config/urls.txt", 'w'){|f| f<< (0..100).map{|i| "http://www.example.com/id=#{i}"} * "\n"}
+@queue = File.read("#{BASEDIR}/config/urls.txt").split("\n").reject{|url| is_cached?(url)}.shuffle
+@proxies = File.read("#{BASEDIR}/config/proxies.txt").scan(/[\w.]+:\d+/)
+
+@lock = Mutex.new
+
+def response_ok? str
+  str[/Lidnummer/] && !str[/IP address/i]
+end
+
+def clean str
+  str.gsub(/[[:space:]]+/, ' ').strip
+end
+
+def download
+  browser = nil
+  @lock.synchronize do
+    browser = Watir::Browser.new
+  end
+  loop do
+    return unless url = @queue.shift
+
+    if is_cached?(url)
+      puts 'skipping'
+      next
+    end
+
+    begin
+      browser.goto url
+      unless response_ok?(browser.html)
+        raise 'str'
+      end
+      save_cache url, browser.html
+
+      puts browser.html[EMAIL_REGEX]
+    rescue StandardError => e
+      puts e.message[0..99]
+      @queue.push url
+    end
+  end
+
+end
+
+threads = []
+@deficit = 0
+
+until @queue.empty?
+  @good = 0
+  start_time = Time.now
+
+  @proxies.shuffle!
+
+  1.times do
+    threads << Thread.new { download }
+  end
+  threads.each { |t| t.join }
+
+end
@@ -1,28 +1,84 @@
 require 'scrapey'
-require 'pry'
 
 use_cache
 
 # File.open("#{BASEDIR}/config/urls.txt", 'w'){|f| f<< (0..100).map{|i| "http://www.example.com/id=#{i}"} * "\n"}
-@queue = File.read("#{BASEDIR}/config/urls.txt").split("\n").shuffle
+@queue = File.read("#{BASEDIR}/config/urls.txt").split("\n").reject{|url| is_cached?(url)}.shuffle
+@proxies ||= File.read("#{BASEDIR}/config/proxies.txt").scan(/[\w.]+:\d+/)
 
-def download agent
-  while url = @queue.shift
-    if is_cached? url
-      puts 'skipping'
-      next
+def response_ok? page
+  page.body[/pub-9059175907567062/] && !page.body[/IP address/i]
+end
+
+def clean str
+  str.gsub(/[[:space:]]+/, ' ').strip
+end
+
+def download
+  loop do
+    Mechanize.start do |agent|
+      agent.read_timeout = agent.open_timeout = 30
+      agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
+      agent.user_agent = [
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.69 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.69 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)'
+      ].sample
+
+      return unless url = @queue.shift
+
+      if is_cached?(url)
+        puts 'skipping'
+        next
+      end
+      unless proxy = @proxies.shift
+        puts "no more proxies"
+        return
+      end
+      @proxies.push proxy
+      host, port = proxy.split(':')
+      agent.set_proxy host, port.to_i
+      begin
+        page = agent.get url
+        unless response_ok?(page)
+          page.search('script,style').remove
+          puts clean(page.body)
+          raise 'str'
+        end
+        save_cache url, page.body
+
+        @good += 1
+        puts url
+      rescue StandardError => e
+        puts e.message[0..99]
+        @queue.push url
+        @proxies -= [proxy]
+        agent.cookie_jar.clear!
+      end
     end
-    page = agent.get url
-    save_cache url, page.body
-    puts url
   end
+
 end
 
 threads = []
-5.times do
-  threads << Thread.new { download Mechanize.new{|a| a.history.max_size, a.verify_mode = 10, OpenSSL::SSL::VERIFY_NONE}}
-end
+@deficit = 0
+
+until @queue.empty?
+  @good = 0
+  start_time = Time.now
 
-threads.each { |t| t.join }
+  @proxies.shuffle!
 
-binding.pry
+  10.times do
+    threads << Thread.new { download }
+  end
+  threads.each { |t| t.join }
+
+end
@@ -0,0 +1,77 @@
+require 'scrapey'
+require 'chronic'
+require 'pry'
+
+# sample customizations...
+# @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
+# @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
+
+def guess_type column
+  case column
+  when /RaceId/i then 'integer'
+  when /date/i then 'datetime'
+  when /is_/i then 'boolean'
+  when /descr/i then 'text'
+  when /price/i then 'float'
+  else 'string'
+  end
+end
+
+def new_table name, columns
+
+  ActiveRecord::Schema.define do
+    create_table name, options: 'ENGINE=InnoDB DEFAULT CHARSET=utf8' do |t|
+      columns.each do |column|
+        type = guess_type column
+        t.send type, column
+      end
+
+=begin
+      t.string "string_field"
+      t.text "text_field"
+      t.integer "number_field"
+      t.boolean "boolean_field"
+      t.float "float_field"
+      t.date "created_at"
+      t.datetime "created_on"
+=end
+    end
+  end
+
+end
+
+def map row
+  item = {}
+  row.headers.each do |k|
+    v = row[k]
+    item[k] = case guess_type(k)
+    when /date/ then Chronic.parse(v)
+    when 'boolean' then v && v != 0
+    else v
+    end
+  end
+  item
+end
+
+Dir.glob('input/*.csv').each do |fn|
+  @table = nil
+  table_name = fn[/\/(.*)\.csv/, 1].gsub(/\W+/,'_')
+  puts table_name
+
+  CSV.foreach(fn, :headers => true, :header_converters => lambda{|h| h.downcase.gsub(/\W+/, '_')}) do |row|
+
+    if !@table
+      new_table table_name, row.headers
+      tables table_name.singularize.camelize
+      @table = table_name.singularize.camelize.constantize
+    end
+
+    data = map row
+    #binding.pry
+
+    @table.new(data).save
+
+    print '.'
+  end
+end
+
@@ -270,6 +270,42 @@ if ARGV.include?('-p')
   exit
 end
 
+def pget url, skip_ok = false
+  raise 'no gaq' unless @gaq
+  return nil unless url[/^http/]
+  if @use_cache && is_cached?(url)
+    return get(url)
+  end
+  @proxy.rotate
+  begin
+    page = get url
+  rescue StandardError => e
+    puts e.message
+    @proxy.remove
+    @agent.cookie_jar.clear!
+    return pget(url)
+  end
+
+  case
+  when page.respond_to?(:title) && page.title && page.body[@gaq] && page.code == '200'
+    return page
+  else
+    delete_cache url
+    puts page.code
+    @proxy.remove
+    @agent.cookie_jar.clear!
+    return pget(url)
+  end
+end
+
+@config['proxies'] = File.read("#{BASEDIR}/config/proxies.txt").scan /[\w.]+:\d+/
+
+puts "starting with #{@config['proxies'].length} proxies..."
+@proxy = Proxy.new @agent, :proxies => @config['proxies'], :round_time => 60, :min => 0
+
+
+
+
 # for testing
 if __FILE__ == $0
   require 'mechanize'
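The pget helper added above fetches through a rotating proxy and, on an error or a bad response, drops that proxy, clears cookies, and retries. A self-contained sketch of that rotate/remove/retry flow; ProxyPool is a hypothetical stand-in for the gem's Proxy class (which is constructed with the agent plus :round_time and :min options), and plain Net::HTTP stands in for the Mechanize agent:

    require 'net/http'
    require 'uri'

    # Hypothetical minimal proxy pool, only modelling rotate and remove.
    class ProxyPool
      def initialize(proxies)
        @proxies = proxies
      end

      def rotate
        @proxies.rotate!
        @current = @proxies.first
      end

      def remove
        @proxies.delete(@current)
      end

      attr_reader :current
    end

    # Fetch url through the current proxy; on error drop that proxy and try the next,
    # mirroring pget's rotate / remove / retry behaviour.
    def pget(pool, url)
      loop do
        raise 'no proxies left' unless pool.rotate
        host, port = pool.current.split(':')
        begin
          uri = URI(url)
          return Net::HTTP.start(uri.host, uri.port, host, port.to_i) { |http| http.get(uri.request_uri) }
        rescue StandardError => e
          puts e.message
          pool.remove
        end
      end
    end

    # pget(ProxyPool.new(['127.0.0.1:8080']), 'http://www.example.com/')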
@@ -1,4 +1,5 @@
 =begin
+# ActiveRecord::SchemaDumper.dump ActiveRecord::Base.connection, File.open('schema.rb', 'w')
 # put table schemas here. this will be included if the table is not found.
 ActiveRecord::Schema.define do
   create_table "items", options: 'ENGINE=InnoDB DEFAULT CHARSET=utf8' do |t|
@@ -2,10 +2,10 @@ require 'scrapey'
 require 'pry'
 
 # sample customizations...
-# @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
+@agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
 # @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
+@output = "template.csv"
 
-EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i
 
 def clean str
   str.gsub(/[[:space:]]+/, ' ').strip
@@ -25,14 +25,15 @@ rescue StandardError => e
 end
 
 
-fields 'name', 'address', 'zip'
+# fields 'name', 'address', 'zip'
 
 @url = "http://www.example.com/"
 
-with_cache do
-  page = get @url
-  binding.pry
-end
+use_cache
+
+page = get @url
+binding.pry
+
 
 #@csv.close
 #%x{call #{@output}}
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scrapey
 version: !ruby/object:Gem::Version
-  version: 0.0.16
+  version: 0.0.17
 platform: ruby
 authors:
 - P Guardiario
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-04-22 00:00:00.000000000 Z
+date: 2014-10-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -91,10 +91,12 @@ files:
 - template/icon.ico
 - template/output.csv
 - template/Rakefile
+- template/src/brownloader.rb
 - template/src/downloader.rb
 - template/src/emails.rb
 - template/src/export.rb
 - template/src/get_proxies.rb
+- template/src/import.rb
 - template/src/proxy.rb
 - template/src/schema.rb
 - template/src/template.rb
@@ -118,9 +120,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.1.5
+rubygems_version: 2.1.0
 signing_key:
 specification_version: 4
 summary: A simple scraping framework
 test_files: []
-has_rdoc: