scrapey 0.0.16 → 0.0.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-    Njk0YTkyMDY0MGYxMzM1ZDMwOWM5Yzg4YmQ4YTA2NGM3N2Q1ZGUxMw==
+    ZWYyNGI3OGE3MTA2ZmQxMGE0MzE1MmE2ZjA5YTFhYTliOTY1OTY5ZQ==
   data.tar.gz: !binary |-
-    YjE2NjNlYjQwMTIwMWE3YmRjNWQzNDVlZWI0MjhjNWI1NWMzNTg1OQ==
+    YjRjOTVjOTkzNmEwOGE2NmMzYTVkNmNjMGRkODRjZjQ2OWM3OWNhNw==
 SHA512:
   metadata.gz: !binary |-
-    NTcyZWFmYjdhZmMyMjVjNzc1ZTk5ZDRkMTM2YjYxMTE5NjZmMThmNTc2MGVl
-    MTI4Yjc0ODkwZWQ3NGNkNWM4NmI5ZGFlMmJiYWNiMjFhMDEyMGIzMjhkZmYz
-    ZDNlN2Y3NGFjYzdjNjlhMWE4Y2FiMzcxZmQyNDBlZTM4MTA1Yzg=
+    NzkzOWNhMTA2MGQ3MDYwYjA2ZjQ2M2Y1OTQ4YTczMDljMWQ2YjRhYjcyMTk0
+    Yzc4ZjczNjU5MTBjN2MyOTczM2Y0NDZkNzY0MDdhOGU4MDQ1ODA3ODMwZTJi
+    MzMyZGFlNDc4N2MxMmViYjM5MjE2N2Y1MjFiNDY0ODJiNGM3ZDE=
   data.tar.gz: !binary |-
-    NmIwOGRjZTczMTE1YjE2MDQzNzc5MmJmYTQ4MzQzMGNlNGQ1Y2YxMzA1MDZk
-    NzkzMTdhOTI3MzAxODNiYmZhNzlhNjkzNGUwODAwZTVmYmRjZjY1ZjdkOGJm
-    ZTFmOWQ5OTFiZmQwY2U3NTBjYzA2ZjBkNjcwZmMzNjcxY2E2ODY=
+    MWQyZjQ1NTA4NjA4ZGVmNjNjYmQ0MDY2ZDJhZWZlMGJhYWI1NDIyOTcwNzhi
+    MDg0YWU0NmIyMmNhY2E1MTE3NGY3ODE0NDhmNDE3NTc4OGVhNjg0NjA4OWRk
+    MTVmNmVmNDUyZGU5ZmFiMjg0N2Y5ZmVhM2UyMWRmYjM2MmQwMDE=
@@ -2,7 +2,7 @@ require 'mechanize'
 require 'csv'
 require 'json'
 require 'yaml'
-require 'unf_ext'
+# require 'unf_ext'
 
 require "scrapey/scrapey"
 require "scrapey/constants"
@@ -13,6 +13,8 @@ require "scrapey/tee"
 
 require 'addressable/uri'
 
+EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i
+
 class URI::Parser
   def split url
     a = Addressable::URI::parse url
@@ -10,6 +10,7 @@ module Scrapey
     File.exists? cache_filename(url)
   end
 
+=begin
   def load_cache url
     filename = cache_filename url
     return nil unless File::exists?(filename)
@@ -24,9 +25,26 @@ module Scrapey
   def save_cache url, doc, options = {}
     File.open(cache_filename(url), "wb") {|f| f << Marshal.dump(doc) }
   end
+=end
+
+  def load_cache url
+    filename = cache_filename url
+    return nil unless File::exists?(filename)
+    debug "Loading #{filename} from cache"
+    begin
+      Mechanize::Page.new URI.parse(url), [], Marshal.load(Zlib::Inflate.inflate(File.open(filename, "rb"){|f| f.read})), nil, @agent
+    rescue Exception => e
+      puts e.message
+    end
+  end
+
+  def save_cache url, doc, options = {}
+    File.open(cache_filename(url), "wb") {|f| f << Zlib::Deflate.deflate(Marshal.dump(doc)) }
+  end
+
 
   def delete_cache url
-    FileUtils.rm cache_filename(url)
+    FileUtils.rm(cache_filename(url)) rescue nil
   end
 
 end
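Note on the cache change above: pages are now stored as a Zlib-deflated Marshal dump and rehydrated into a Mechanize::Page on load. A minimal standalone sketch of that round trip, with a hypothetical cache_filename helper standing in for the gem's own URL-to-path mapping:

    require 'zlib'
    require 'digest'

    # Hypothetical stand-in for Scrapey's cache_filename helper.
    def cache_filename(url)
      "cache/#{Digest::MD5.hexdigest(url)}"
    end

    url  = 'http://www.example.com/'
    body = '<html><body>hello</body></html>'

    # Write: Marshal the body, then deflate it, as the new save_cache does.
    Dir.mkdir('cache') unless Dir.exist?('cache')
    File.open(cache_filename(url), 'wb') { |f| f << Zlib::Deflate.deflate(Marshal.dump(body)) }

    # Read: inflate and un-Marshal, as the new load_cache does before wrapping
    # the result in a Mechanize::Page.
    restored = Marshal.load(Zlib::Inflate.inflate(File.binread(cache_filename(url))))
    puts restored == body   # => true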
@@ -1,5 +1,5 @@
 module Scrapey
-  VERSION = "0.0.16"
+  VERSION = "0.0.17"
   BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
   URL = "https://github.com/monkeysuffrage/scrapey"
 end
@@ -25,8 +25,9 @@ module Scrapey
     return doc if doc
 
     page = agent.send *new_args
-    str = page.respond_to?('root') ? page.root.to_s : page.body
-    save_cache(url, str) if @use_cache
+    # str = page.respond_to?('root') ? page.root.to_s : page.body
+    # save_cache(url, str) if @use_cache
+    save_cache(url, page.body) if @use_cache
 
     #exit if Object.const_defined? :Ocra
     page
@@ -57,20 +58,46 @@ module Scrapey
     @fields = args
   end
 
-  def save item
-    unless @csv && !@csv.closed?
-      @csv = CSV.open @output, 'w'
-      @csv << @fields if @fields
+  def save_images urls
+    folder = "#{BASEDIR}/images"
+    Dir.mkdir(folder) unless Dir.exists?(folder)
+    names = []
+    urls.each do |url|
+      name = url[/[^\/]+$/]
+      binding.pry unless name
+      names << name
+      fn = "#{folder}/#{name}"
+      next if File.exists?(fn)
+      file = @agent.get(url)
+      File.open(fn, 'wb'){|f| f << file.body}
     end
-    case
-    when item.is_a?(Array) then @csv << item
-    when item.is_a?(Hash) || item.is_a?(CSV::Row)
-      raise 'No fields defined!' unless @fields
-      @csv << @fields.map{|f| item[f]}
-    else raise "unsupported type: #{item.class}"
+    names
+  end
+
+  def save item, output = nil
+    output ||= @output
+    @csvs ||= {}
+    unless @csvs[output]
+      obj = {}
+      begin
+        fn = output.gsub(/(?<!csv)$/, '.csv')
+        obj[:csv] = CSV.open fn, 'w'
+      rescue Exception => e
+        if e.is_a?(Errno::EACCES)
+          puts "Unable to access #{fn} - is it locked?"
+          exit
+        else
+          raise e
+        end
+      end
+      obj[:fields] = output == @output && @fields && !@fields.empty? ? @fields : item.keys
+      obj[:csv] << obj[:fields]
+      @csvs[output] = obj
     end
+    @csvs[output][:csv] << @csvs[output][:fields].map{|f| item[f]}
   end
 
+
   def visited? url
     @visited ||= []
     return true if @visited.include? url
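The reworked save above keeps one CSV handle per output name and writes a header row from the declared fields (or, for other outputs, the first item's keys). A rough usage sketch of that routing, with illustrative filenames and a simplified stand-in for the @csvs bookkeeping:

    require 'csv'

    # Simplified model of the per-output handle cache the new #save builds in @csvs.
    def write(csvs, item, output)
      unless csvs[output]
        fn  = output.gsub(/(?<!csv)$/, '.csv')   # append .csv unless the name already ends in csv
        csv = CSV.open(fn, 'w')
        csv << item.keys                         # header row from the first item's keys
        csvs[output] = { csv: csv, fields: item.keys }
      end
      csvs[output][:csv] << csvs[output][:fields].map { |f| item[f] }
    end

    csvs = {}
    write(csvs, { 'name' => 'Alice', 'zip' => '90210' }, 'people')   # -> people.csv
    write(csvs, { 'sku' => 'A1', 'price' => '9.99' }, 'orders')      # -> orders.csv
    csvs.each_value { |h| h[:csv].close }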
@@ -0,0 +1,63 @@
+require 'scrapey'
+require 'watir-webdriver'
+
+use_cache
+
+# File.open("#{BASEDIR}/config/urls.txt", 'w'){|f| f<< (0..100).map{|i| "http://www.example.com/id=#{i}"} * "\n"}
+@queue = File.read("#{BASEDIR}/config/urls.txt").split("\n").reject{|url| is_cached?(url)}.shuffle
+@proxies = File.read("#{BASEDIR}/config/proxies.txt").scan(/[\w.]+:\d+/)
+
+@lock = Mutex.new
+
+def response_ok? str
+  str[/Lidnummer/] && !str[/IP address/i]
+end
+
+def clean str
+  str.gsub(/[[:space:]]+/, ' ').strip
+end
+
+def download
+  browser = nil
+  @lock.synchronize do
+    browser = Watir::Browser.new
+  end
+  loop do
+    return unless url = @queue.shift
+
+    if is_cached?(url)
+      puts 'skipping'
+      next
+    end
+
+    begin
+      browser.goto url
+      unless response_ok?(browser.html)
+        raise 'str'
+      end
+      save_cache url, browser.html
+
+      puts browser.html[EMAIL_REGEX]
+    rescue StandardError => e
+      puts e.message[0..99]
+      @queue.push url
+    end
+  end
+
+end
+
+threads = []
+@deficit = 0
+
+until @queue.empty?
+  @good = 0
+  start_time = Time.now
+
+  @proxies.shuffle!
+
+  1.times do
+    threads << Thread.new { download }
+  end
+  threads.each { |t| t.join }
+
+end
@@ -1,28 +1,84 @@
 require 'scrapey'
-require 'pry'
 
 use_cache
 
 # File.open("#{BASEDIR}/config/urls.txt", 'w'){|f| f<< (0..100).map{|i| "http://www.example.com/id=#{i}"} * "\n"}
-@queue = File.read("#{BASEDIR}/config/urls.txt").split("\n").shuffle
+@queue = File.read("#{BASEDIR}/config/urls.txt").split("\n").reject{|url| is_cached?(url)}.shuffle
+@proxies ||= File.read("#{BASEDIR}/config/proxies.txt").scan(/[\w.]+:\d+/)
 
-def download agent
-  while url = @queue.shift
-    if is_cached? url
-      puts 'skipping'
-      next
+def response_ok? page
+  page.body[/pub-9059175907567062/] && !page.body[/IP address/i]
+end
+
+def clean str
+  str.gsub(/[[:space:]]+/, ' ').strip
+end
+
+def download
+  loop do
+    Mechanize.start do |agent|
+      agent.read_timeout = agent.open_timeout = 30
+      agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
+      agent.user_agent = [
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.69 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.69 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)'
+      ].sample
+
+      return unless url = @queue.shift
+
+      if is_cached?(url)
+        puts 'skipping'
+        next
+      end
+      unless proxy = @proxies.shift
+        puts "no more proxies"
+        return
+      end
+      @proxies.push proxy
+      host, port = proxy.split(':')
+      agent.set_proxy host, port.to_i
+      begin
+        page = agent.get url
+        unless response_ok?(page)
+          page.search('script,style').remove
+          puts clean(page.body)
+          raise 'str'
+        end
+        save_cache url, page.body
+
+        @good += 1
+        puts url
+      rescue StandardError => e
+        puts e.message[0..99]
+        @queue.push url
+        @proxies -= [proxy]
+        agent.cookie_jar.clear!
+      end
     end
-    page = agent.get url
-    save_cache url, page.body
-    puts url
   end
+
 end
 
 threads = []
-5.times do
-  threads << Thread.new { download Mechanize.new{|a| a.history.max_size, a.verify_mode = 10, OpenSSL::SSL::VERIFY_NONE}}
-end
+@deficit = 0
+
+until @queue.empty?
+  @good = 0
+  start_time = Time.now
 
-threads.each { |t| t.join }
+  @proxies.shuffle!
 
-binding.pry
+  10.times do
+    threads << Thread.new { download }
+  end
+  threads.each { |t| t.join }
+
+end
@@ -0,0 +1,77 @@
+require 'scrapey'
+require 'chronic'
+require 'pry'
+
+# sample customizations...
+# @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
+# @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
+
+def guess_type column
+  case column
+  when /RaceId/i then 'integer'
+  when /date/i then 'datetime'
+  when /is_/i then 'boolean'
+  when /descr/i then 'text'
+  when /price/i then 'float'
+  else 'string'
+  end
+end
+
+def new_table name, columns
+
+  ActiveRecord::Schema.define do
+    create_table name, options: 'ENGINE=InnoDB DEFAULT CHARSET=utf8' do |t|
+      columns.each do |column|
+        type = guess_type column
+        t.send type, column
+      end
+
+=begin
+      t.string "string_field"
+      t.text "text_field"
+      t.integer "number_field"
+      t.boolean "boolean_field"
+      t.float "float_field"
+      t.date "created_at"
+      t.datetime "created_on"
+=end
+    end
+  end
+
+end
+
+def map row
+  item = {}
+  row.headers.each do |k|
+    v = row[k]
+    item[k] = case guess_type(k)
+    when /date/ then Chronic.parse(v)
+    when 'boolean' then v && v != 0
+    else v
+    end
+  end
+  item
+end
+
+Dir.glob('input/*.csv').each do |fn|
+  @table = nil
+  table_name = fn[/\/(.*)\.csv/, 1].gsub(/\W+/,'_')
+  puts table_name
+
+  CSV.foreach(fn, :headers => true, :header_converters => lambda{|h| h.downcase.gsub(/\W+/, '_')}) do |row|
+
+    if !@table
+      new_table table_name, row.headers
+      tables table_name.singularize.camelize
+      @table = table_name.singularize.camelize.constantize
+    end
+
+    data = map row
+    #binding.pry
+
+    @table.new(data).save
+
+    print '.'
+  end
+end
+
@@ -270,6 +270,42 @@ if ARGV.include?('-p')
   exit
 end
 
+def pget url, skip_ok = false
+  raise 'no gaq' unless @gaq
+  return nil unless url[/^http/]
+  if @use_cache && is_cached?(url)
+    return get(url)
+  end
+  @proxy.rotate
+  begin
+    page = get url
+  rescue StandardError => e
+    puts e.message
+    @proxy.remove
+    @agent.cookie_jar.clear!
+    return pget(url)
+  end
+
+  case
+  when page.respond_to?(:title) && page.title && page.body[@gaq] && page.code == '200'
+    return page
+  else
+    delete_cache url
+    puts page.code
+    @proxy.remove
+    @agent.cookie_jar.clear!
+    return pget(url)
+  end
+end
+
+@config['proxies'] = File.read("#{BASEDIR}/config/proxies.txt").scan /[\w.]+:\d+/
+
+puts "starting with #{@config['proxies'].length} proxies..."
+@proxy = Proxy.new @agent, :proxies => @config['proxies'], :round_time => 60, :min => 0
+
+
+
+
 # for testing
 if __FILE__ == $0
   require 'mechanize'
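The pget helper added above fetches through a rotating proxy and, on an error or a bad response, drops that proxy, clears cookies, and retries. A self-contained sketch of that rotate/remove/retry flow; ProxyPool is a hypothetical stand-in for the gem's Proxy class (which is constructed with the agent plus :round_time and :min options), and plain Net::HTTP stands in for the Mechanize agent:

    require 'net/http'
    require 'uri'

    # Hypothetical minimal proxy pool, only modelling rotate and remove.
    class ProxyPool
      def initialize(proxies)
        @proxies = proxies
      end

      def rotate
        @proxies.rotate!
        @current = @proxies.first
      end

      def remove
        @proxies.delete(@current)
      end

      attr_reader :current
    end

    # Fetch url through the current proxy; on error drop that proxy and try the next,
    # mirroring pget's rotate / remove / retry behaviour.
    def pget(pool, url)
      loop do
        raise 'no proxies left' unless pool.rotate
        host, port = pool.current.split(':')
        begin
          uri = URI(url)
          return Net::HTTP.start(uri.host, uri.port, host, port.to_i) { |http| http.get(uri.request_uri) }
        rescue StandardError => e
          puts e.message
          pool.remove
        end
      end
    end

    # pget(ProxyPool.new(['127.0.0.1:8080']), 'http://www.example.com/')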
@@ -1,4 +1,5 @@
 =begin
+# ActiveRecord::SchemaDumper.dump ActiveRecord::Base.connection, File.open('schema.rb', 'w')
 # put table schemas here. this will be included if the table is not found.
 ActiveRecord::Schema.define do
   create_table "items", options: 'ENGINE=InnoDB DEFAULT CHARSET=utf8' do |t|
@@ -2,10 +2,10 @@ require 'scrapey'
 require 'pry'
 
 # sample customizations...
-# @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
+@agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
 # @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
+@output = "template.csv"
 
-EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i
 
 def clean str
   str.gsub(/[[:space:]]+/, ' ').strip
@@ -25,14 +25,15 @@ rescue StandardError => e
 end
 
 
-fields 'name', 'address', 'zip'
+# fields 'name', 'address', 'zip'
 
 @url = "http://www.example.com/"
 
-with_cache do
-  page = get @url
-  binding.pry
-end
+use_cache
+
+page = get @url
+binding.pry
+
 
 #@csv.close
 #%x{call #{@output}}
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scrapey
 version: !ruby/object:Gem::Version
-  version: 0.0.16
+  version: 0.0.17
 platform: ruby
 authors:
 - P Guardiario
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-04-22 00:00:00.000000000 Z
+date: 2014-10-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -91,10 +91,12 @@ files:
 - template/icon.ico
 - template/output.csv
 - template/Rakefile
+- template/src/brownloader.rb
 - template/src/downloader.rb
 - template/src/emails.rb
 - template/src/export.rb
 - template/src/get_proxies.rb
+- template/src/import.rb
 - template/src/proxy.rb
 - template/src/schema.rb
 - template/src/template.rb
@@ -118,9 +120,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.1.5
+rubygems_version: 2.1.0
 signing_key:
 specification_version: 4
 summary: A simple scraping framework
 test_files: []
-has_rdoc: