scrapey 0.0.16 → 0.0.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/lib/scrapey.rb +3 -1
- data/lib/scrapey/cache/disk.rb +19 -1
- data/lib/scrapey/constants.rb +1 -1
- data/lib/scrapey/scrapey.rb +39 -12
- data/template/src/brownloader.rb +63 -0
- data/template/src/downloader.rb +71 -15
- data/template/src/import.rb +77 -0
- data/template/src/proxy.rb +36 -0
- data/template/src/schema.rb +1 -0
- data/template/src/template.rb +8 -7
- metadata +5 -4
    
        checksums.yaml
    CHANGED
    
@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-    …
+    ZWYyNGI3OGE3MTA2ZmQxMGE0MzE1MmE2ZjA5YTFhYTliOTY1OTY5ZQ==
   data.tar.gz: !binary |-
-    …
+    YjRjOTVjOTkzNmEwOGE2NmMzYTVkNmNjMGRkODRjZjQ2OWM3OWNhNw==
 SHA512:
   metadata.gz: !binary |-
-    …
-    …
-    …
+    NzkzOWNhMTA2MGQ3MDYwYjA2ZjQ2M2Y1OTQ4YTczMDljMWQ2YjRhYjcyMTk0
+    Yzc4ZjczNjU5MTBjN2MyOTczM2Y0NDZkNzY0MDdhOGU4MDQ1ODA3ODMwZTJi
+    MzMyZGFlNDc4N2MxMmViYjM5MjE2N2Y1MjFiNDY0ODJiNGM3ZDE=
   data.tar.gz: !binary |-
-    …
-    …
-    …
+    MWQyZjQ1NTA4NjA4ZGVmNjNjYmQ0MDY2ZDJhZWZlMGJhYWI1NDIyOTcwNzhi
+    MDg0YWU0NmIyMmNhY2E1MTE3NGY3ODE0NDhmNDE3NTc4OGVhNjg0NjA4OWRk
+    MTVmNmVmNDUyZGU5ZmFiMjg0N2Y5ZmVhM2UyMWRmYjM2MmQwMDE=
    
        data/lib/scrapey.rb
    CHANGED
    
@@ -2,7 +2,7 @@ require 'mechanize'
 require 'csv'
 require 'json'
 require 'yaml'
-require 'unf_ext'
+# require 'unf_ext'
 
 require "scrapey/scrapey"
 require "scrapey/constants"
@@ -13,6 +13,8 @@ require "scrapey/tee"
 
 require 'addressable/uri'
 
+EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i
+
 class URI::Parser
   def split url
     a = Addressable::URI::parse url
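
EMAIL_REGEX now lives in the library itself rather than in the generated template (see template.rb below), so any scraper that requires 'scrapey' can match addresses directly. A minimal sketch of how it can be used against a page body string (the sample text is made up):

body = "Questions? Write to sales@example.com or support@example.com."
body[EMAIL_REGEX]        # => "sales@example.com"  (first match)
body.scan(EMAIL_REGEX)   # => ["sales@example.com", "support@example.com"]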
    
        data/lib/scrapey/cache/disk.rb
    CHANGED
    
@@ -10,6 +10,7 @@ module Scrapey
     File.exists? cache_filename(url)
   end
 
+=begin
   def load_cache url
     filename = cache_filename url
     return nil unless File::exists?(filename)
@@ -24,9 +25,26 @@ module Scrapey
   def save_cache url, doc, options = {}
     File.open(cache_filename(url), "wb") {|f| f << Marshal.dump(doc) }
   end
+=end
+
+  def load_cache url
+    filename = cache_filename url
+    return nil unless File::exists?(filename)
+    debug "Loading #{filename} from cache"
+    begin
+      Mechanize::Page.new URI.parse(url), [], Marshal.load(Zlib::Inflate.inflate(File.open(filename, "rb"){|f| f.read})), nil, @agent
+    rescue Exception => e
+      puts e.message
+    end
+  end
+
+  def save_cache url, doc, options = {}
+    File.open(cache_filename(url), "wb") {|f| f << Zlib::Deflate.deflate(Marshal.dump(doc)) }
+  end
+
 
   def delete_cache url
-    FileUtils.rm …
+    FileUtils.rm(cache_filename(url)) rescue nil
   end
 
 end
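
The cache format changes in this release: save_cache now deflates the marshalled page body with Zlib before writing, and load_cache inflates it and rebuilds a Mechanize::Page. A minimal round-trip sketch of the encoding itself, using only the Ruby standard library (no Scrapey state assumed):

require 'zlib'

body       = "<html><body>cached page</body></html>"
on_disk    = Zlib::Deflate.deflate(Marshal.dump(body))        # what the new save_cache writes
round_trip = Marshal.load(Zlib::Inflate.inflate(on_disk))     # what the new load_cache reads back
round_trip == body   # => true

One apparent consequence: cache files written by 0.0.16 (plain Marshal, uncompressed) will fail to inflate, so previously cached pages are effectively re-fetched and re-saved in the new format.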
    
        data/lib/scrapey/constants.rb
    CHANGED
    
    
    
        data/lib/scrapey/scrapey.rb
    CHANGED
    
@@ -25,8 +25,9 @@ module Scrapey
       return doc if doc
 
       page = agent.send *new_args
-      str = page.respond_to?('root') ? page.root.to_s : page.body
-      save_cache(url, str) if @use_cache
+      # str = page.respond_to?('root') ? page.root.to_s : page.body
+      # save_cache(url, str) if @use_cache
+      save_cache(url, page.body) if @use_cache
 
       #exit if Object.const_defined? :Ocra
       page
@@ -57,20 +58,46 @@ module Scrapey
     @fields = args
   end
 
-  def …
-  …
-
-
+  def save_images urls
+    folder = "#{BASEDIR}/images"
+    Dir.mkdir(folder) unless Dir.exists?(folder)
+    names = []
+    urls.each do |url|
+      name = url[/[^\/]+$/]
+      binding.pry unless name
+      names << name
+      fn = "#{folder}/#{name}"
+      next if File.exists?(fn)
+      file = @agent.get(url)
+      File.open(fn, 'wb'){|f| f << file.body}
     end
-  …
-
-
-
-
-
+    names
+  end
+
+  def save item, output = nil
+    output ||= @output
+    @csvs ||= {}
+    unless @csvs[output]
+      obj = {}
+      begin
+        fn = output.gsub(/(?<!csv)$/, '.csv')
+        obj[:csv] = CSV.open fn, 'w'
+      rescue Exception => e
+        if e.is_a?(Errno::EACCES)
+          puts "Unable to access #{fn} - is it locked?"
+          exit
+        else
+          raise e
+        end
+      end
+      obj[:fields] = output == @output && @fields && !@fields.empty? ? @fields : item.keys
+      obj[:csv] << obj[:fields]
+      @csvs[output] = obj
     end
+    @csvs[output][:csv] << @csvs[output][:fields].map{|f| item[f]}
   end
 
+
   def visited? url
     @visited ||= []
     return true if @visited.include? url
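
The old save implementation is replaced by one that keeps a CSV handle per output name, writing the header row on first use, plus a save_images helper that downloads image URLs into "#{BASEDIR}/images" and returns the derived file names. A small usage sketch inside a scraper (the item hash and the 'extra' output name are hypothetical):

item = { 'name' => 'Acme', 'address' => '1 Main St', 'zip' => '90210' }

save item            # appends to @output; header comes from fields(...) if set, otherwise item.keys
save item, 'extra'   # writes the same row to extra.csv through its own cached CSV handle

# names = save_images ['http://www.example.com/images/logo.png']   # would fetch the file and return ["logo.png"]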

        data/template/src/brownloader.rb
    ADDED

@@ -0,0 +1,63 @@
+require 'scrapey'
+require 'watir-webdriver'
+
+use_cache
+
+# File.open("#{BASEDIR}/config/urls.txt", 'w'){|f| f<< (0..100).map{|i| "http://www.example.com/id=#{i}"} * "\n"}
+@queue = File.read("#{BASEDIR}/config/urls.txt").split("\n").reject{|url| is_cached?(url)}.shuffle
+@proxies = File.read("#{BASEDIR}/config/proxies.txt").scan(/[\w.]+:\d+/)
+
+@lock = Mutex.new
+
+def response_ok? str
+  str[/Lidnummer/] && !str[/IP address/i]
+end
+
+def clean str
+  str.gsub(/[[:space:]]+/, ' ').strip
+end
+
+def download
+  browser = nil
+  @lock.synchronize do
+    browser = Watir::Browser.new
+  end
+  loop do
+    return unless url = @queue.shift
+
+    if is_cached?(url)
+      puts 'skipping'
+      next
+    end
+
+    begin
+      browser.goto url
+      unless response_ok?(browser.html)
+        raise 'str'
+      end
+      save_cache url, browser.html
+
+      puts browser.html[EMAIL_REGEX]
+    rescue StandardError => e
+      puts e.message[0..99]
+      @queue.push url
+    end
+  end
+
+end
+
+threads = []
+@deficit = 0
+
+until @queue.empty?
+  @good = 0
+  start_time = Time.now
+
+  @proxies.shuffle!
+
+  1.times do
+    threads << Thread.new { download }
+  end
+  threads.each { |t| t.join }
+
+end
        data/template/src/downloader.rb
    CHANGED
    
@@ -1,28 +1,84 @@
 require 'scrapey'
-require 'pry'
 
 use_cache
 
 # File.open("#{BASEDIR}/config/urls.txt", 'w'){|f| f<< (0..100).map{|i| "http://www.example.com/id=#{i}"} * "\n"}
-@queue = File.read("#{BASEDIR}/config/urls.txt").split("\n").shuffle
+@queue = File.read("#{BASEDIR}/config/urls.txt").split("\n").reject{|url| is_cached?(url)}.shuffle
+@proxies ||= File.read("#{BASEDIR}/config/proxies.txt").scan(/[\w.]+:\d+/)
 
-def …
-…
-
-
-
+def response_ok? page
+  page.body[/pub-9059175907567062/] && !page.body[/IP address/i]
+end
+
+def clean str
+  str.gsub(/[[:space:]]+/, ' ').strip
+end
+
+def download
+  loop do
+    Mechanize.start do |agent|
+      agent.read_timeout = agent.open_timeout = 30
+      agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
+      agent.user_agent = [
+      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36',
+      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1',
+      'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
+      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.69 Safari/537.36',
+      'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
+      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.69 Safari/537.36',
+      'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
+      'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)'
+      ].sample
+
+      return unless url = @queue.shift
+
+      if is_cached?(url)
+        puts 'skipping'
+        next
+      end
+      unless proxy = @proxies.shift
+        puts "no more proxies"
+        return
+      end
+      @proxies.push proxy
+      host, port = proxy.split(':')
+      agent.set_proxy host, port.to_i
+      begin
+        page = agent.get url
+        unless response_ok?(page)
+          page.search('script,style').remove
+          puts clean(page.body)
+          raise 'str'
+        end
+        save_cache url, page.body
+
+        @good += 1
+        puts url
+      rescue StandardError => e
+        puts e.message[0..99]
+        @queue.push url
+        @proxies -= [proxy]
+        agent.cookie_jar.clear!
+      end
     end
-  page = agent.get url
-  save_cache url, page.body
-  puts url
   end
+
 end
 
 threads = []
-…
-…
-…
+@deficit = 0
+
+until @queue.empty?
+  @good = 0
+  start_time = Time.now
 
-…
+  @proxies.shuffle!
 
-…
+  10.times do
+    threads << Thread.new { download }
+  end
+  threads.each { |t| t.join }
+
+end
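
The rewritten downloader round-robins proxies by shifting one off @proxies, pushing it straight back onto the end of the array, and subtracting it from the pool when a request fails. A stripped-down sketch of just that rotation logic, independent of Mechanize (the proxy addresses are made up):

@proxies = ['10.0.0.1:8080', '10.0.0.2:8080', '10.0.0.3:8080']

def next_proxy
  proxy = @proxies.shift        # take the least recently used proxy
  return nil unless proxy       # pool exhausted
  @proxies.push proxy           # re-queue it immediately so usage rotates
  proxy
end

proxy = next_proxy              # => "10.0.0.1:8080"
@proxies -= [proxy]             # what the rescue branch does when a request through this proxy fails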

        data/template/src/import.rb
    ADDED

@@ -0,0 +1,77 @@
+require 'scrapey'
+require 'chronic'
+require 'pry'
+
+# sample customizations...
+# @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
+# @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
+
+def guess_type column
+  case column
+    when /RaceId/i then 'integer'
+    when /date/i then 'datetime'
+    when /is_/i then 'boolean'
+    when /descr/i then 'text'
+    when /price/i then 'float'
+    else 'string'
+  end
+end
+
+def new_table name, columns
+
+  ActiveRecord::Schema.define do
+    create_table name, options: 'ENGINE=InnoDB DEFAULT CHARSET=utf8' do |t|
+      columns.each do |column|
+        type = guess_type column
+        t.send type, column
+      end
+
+=begin
+      t.string   "string_field"
+      t.text     "text_field"
+      t.integer  "number_field"
+      t.boolean  "boolean_field"
+      t.float    "float_field"
+      t.date     "created_at"
+      t.datetime "created_on"
+=end
+    end
+  end
+
+end
+
+def map row
+  item = {}
+  row.headers.each do |k|
+    v = row[k]
+    item[k] = case guess_type(k)
+      when /date/ then Chronic.parse(v)
+      when 'boolean' then v && v != 0
+      else v
+    end
+  end
+  item
+end
+
+Dir.glob('input/*.csv').each do |fn|
+  @table = nil
+  table_name = fn[/\/(.*)\.csv/, 1].gsub(/\W+/,'_')
+  puts table_name
+
+  CSV.foreach(fn, :headers => true, :header_converters => lambda{|h| h.downcase.gsub(/\W+/, '_')}) do |row|
+
+    if !@table
+      new_table table_name, row.headers
+      tables table_name.singularize.camelize
+      @table = table_name.singularize.camelize.constantize
+    end
+
+    data = map row
+    #binding.pry
+
+    @table.new(data).save
+
+    print '.'
+  end
+end
+
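
The new import.rb template builds one table per input CSV and infers each column's ActiveRecord type from its header name via guess_type. A quick sketch of the mapping for some hypothetical headers:

%w[RaceId start_date is_active description price name].map { |h| guess_type(h) }
# => ["integer", "datetime", "boolean", "text", "float", "string"]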
    
        data/template/src/proxy.rb
    CHANGED
    
@@ -270,6 +270,42 @@ if ARGV.include?('-p')
   exit
 end
 
+def pget url, skip_ok = false
+  raise 'no gaq' unless @gaq
+  return nil unless url[/^http/]
+  if @use_cache && is_cached?(url)
+    return get(url)
+  end
+  @proxy.rotate
+  begin
+    page = get url
+  rescue StandardError => e
+    puts e.message
+    @proxy.remove
+    @agent.cookie_jar.clear!
+    return pget(url)
+  end
+
+  case
+    when page.respond_to?(:title) && page.title && page.body[@gaq] && page.code == '200'
+      return page
+    else
+      delete_cache url
+      puts page.code
+      @proxy.remove
+      @agent.cookie_jar.clear!
+      return pget(url)
+  end
+end
+
+@config['proxies'] = File.read("#{BASEDIR}/config/proxies.txt").scan /[\w.]+:\d+/
+
+puts "starting with #{@config['proxies'].length} proxies..."
+@proxy = Proxy.new @agent, :proxies => @config['proxies'], :round_time => 60, :min => 0
+
+
+
+
 # for testing
 if __FILE__ == $0
   require 'mechanize'
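
The new pget helper wraps Scrapey's get: it rotates the proxy before each request, and on an error or an unexpected page it drops the proxy, clears cookies, deletes any cached copy, and retries recursively until the body contains the marker stored in @gaq and the response code is 200. A minimal usage sketch, assuming this file has been loaded and @gaq has been set to some text the target pages are known to contain (the value below is just an example):

@gaq = 'UA-12345'                         # hypothetical marker expected on every valid page
page = pget 'http://www.example.com/'     # keeps rotating proxies until a valid page (or cache hit) comes back
puts page.title if page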
    
        data/template/src/schema.rb
    CHANGED
    
@@ -1,4 +1,5 @@
 =begin
+# ActiveRecord::SchemaDumper.dump ActiveRecord::Base.connection, File.open('schema.rb', 'w')
 # put table schemas here. this will be included if the table is not found.
 ActiveRecord::Schema.define do
   create_table "items", options: 'ENGINE=InnoDB DEFAULT CHARSET=utf8' do |t|
    
        data/template/src/template.rb
    CHANGED
    
@@ -2,10 +2,10 @@ require 'scrapey'
 require 'pry'
 
 # sample customizations...
-…
+@agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
 # @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
+@output = "template.csv"
 
-EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i
 
 def clean str
   str.gsub(/[[:space:]]+/, ' ').strip
@@ -25,14 +25,15 @@ rescue StandardError => e
 end
 
 
-fields 'name', 'address', 'zip'
+# fields 'name', 'address', 'zip'
 
 @url = "http://www.example.com/"
 
-…
-…
-…
-…
+use_cache
+
+page = get @url
+binding.pry
+
 
 #@csv.close
 #%x{call #{@output}}
    
        metadata
    CHANGED
    
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scrapey
 version: !ruby/object:Gem::Version
-  version: 0.0.16
+  version: 0.0.17
 platform: ruby
 authors:
 - P Guardiario
 autorequire: 
 bindir: bin
 cert_chain: []
-date: 2014-…
+date: 2014-10-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -91,10 +91,12 @@ files:
 - template/icon.ico
 - template/output.csv
 - template/Rakefile
+- template/src/brownloader.rb
 - template/src/downloader.rb
 - template/src/emails.rb
 - template/src/export.rb
 - template/src/get_proxies.rb
+- template/src/import.rb
 - template/src/proxy.rb
 - template/src/schema.rb
 - template/src/template.rb
@@ -118,9 +120,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project: 
-rubygems_version: 2.1.…
+rubygems_version: 2.1.0
 signing_key: 
 specification_version: 4
 summary: A simple scraping framework
 test_files: []
-has_rdoc: 