list_spider 0.1.1 → 0.1.2

Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/list_spider.rb +209 -57
  3. metadata +2 -3
  4. data/lib/spider_base.rb +0 -298
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: f73a2e9b358cac55336907ac76ebdb666b9d31f5
-  data.tar.gz: 2df50eff29a1963224ca3f7d0cd9b3ac0c89156f
+  metadata.gz: 45ea1dba6db98ca7a9cdaecde7f744728cd20b03
+  data.tar.gz: 118764345cebb58a37e15af591b3f007451c2486
 SHA512:
-  metadata.gz: 2cb02f9eb8593a05cc6b0a0c9d015ad93bf08663750d3dfe3007c30febfaa47d57c222960eba3b8e9275fd1f5acb942278180c8b613d8ffc0d983333f059ea8a
-  data.tar.gz: a1800a9b27c769adbae11bc8e3f08e5d57d15b3b345d19acbd329142048e742acab8878328e3c8a9053a957c3c27ecf14cd067848d1bc056f552386243c33730
+  metadata.gz: 673150361b67fd16cf7dc86560c0bbe17d3d432f3f40dc4456019e9700d0d68f3b1d9eea8d6c036fc3ea904866497d248b51a36007e345a9233a43b827d0846b
+  data.tar.gz: 5c2b99885733c979d9e1f9f2426521b125fce8dd951a3f51c96d25c33ae1c180b0aeb70654b5b4422b0691bb337fdd517834cb28fa0edbee2798e895c6aa2465
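These are the digests of the two archives packed inside the .gem file. A minimal Ruby sketch of how such digests can be recomputed locally, assuming an unpacked gem with metadata.gz and data.tar.gz in the current directory (the paths are illustrative):

  require 'digest'

  # Print the SHA1 and SHA512 of each archive so they can be compared
  # against the values recorded in checksums.yaml above.
  %w[metadata.gz data.tar.gz].each do |name|
    next unless File.exist?(name)
    puts "#{name} SHA1:   #{Digest::SHA1.file(name).hexdigest}"
    puts "#{name} SHA512: #{Digest::SHA512.file(name).hexdigest}"
  end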
data/lib/list_spider.rb CHANGED
@@ -1,81 +1,233 @@
-require File.expand_path('../spider_base', __FILE__)
+require 'em-http-request'
+require 'nokogiri'
+require 'fileutils'
+require 'set'
+require "addressable/uri"
+require File.expand_path('../spider_helper', __FILE__)
 require File.expand_path('../delete_unvalid', __FILE__)
 
-class ListSpider
-
-  RANDOM_TIME = -1
-  NO_LIMIT_CONCURRENT = -1
+class TaskStruct
+  def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil)
+    @origin_href = href
+    @href = href
+    if @href.class == "".class
+      @href = SpiderHelper.string_to_uri(@href)
+    end
+    @local_path = local_path
+    @http_method = http_method
+    @params = params
+    @extra_data = extra_data
+    @parse_method = parse_method
+  end
 
-  @@random_time_range = 3..10
+  def ==(o)
+    o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data
+  end
 
-  include SpiderBase
+  attr_accessor :origin_href, :href, :local_path, :http_method, :params, :extra_data, :parse_method
 
-  def initialize(down_list, inter_val: 0, max: 30)
-    @down_list = down_list
-    @inter_val = inter_val
-    @max = max
-    @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
-    @succeed_size = 0
-    @failed_size = 0
-  end
+end
+
+module ListSpider
+
+  RANDOM_TIME = -1
+  NO_LIMIT_CONCURRENT = -1
 
-  attr_reader :succeed_size, :failed_size
+  @@random_time_range = 3..10
+  @@conver_to_utf8 = false
+  @@connection_opts = {:connect_timeout => 2*60}
+  @@overwrite_exist = false
+  @@max_redirects = 10
+  @@url_set = Set.new
 
   class << self
 
-    attr_accessor :random_time_range
+    attr_accessor :random_time_range, :conver_to_utf8, :overwrite_exist, :max_redirects
 
-  end
+    def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
+      @@connection_opts = {
+        :proxy => {
+          :host => proxy_addr,
+          :port => proxy_port
+        }
+      }
+      @@connection_opts[:proxy][:authorization] = [username, password] if username && password
+    end
 
-  def add_task(task)
-    if task.is_a?(Array)
-      @down_list = @down_list + task
-    elsif task.is_a?(TaskStruct)
-      @down_list << task
-    else
-      puts "error task type:#{task.class}"
+    def connect_timeout(max_connect_time)
+      @@connection_opts[:connect_timeout] = max_connect_time
     end
-  end
 
-  def complete(multi, success_list, failed_list)
-    @succeed_size += success_list.size
-    @failed_size += failed_list.size
-    # puts "success size:#{success_list.size}"
-    # puts "failed size:#{failed_list.size}"
-    success_list.each do |e|
-      e.parse_method.call(e.local_path, e.extra_data, self) if e.parse_method
+    def set_header_option(header_option)
+      @@header_option = header_option
     end
-
-    todo = @down_list.slice!(0, @max)
-    if todo.empty?
-      puts "success size:#{@succeed_size}"
-      puts "failed size:#{@failed_size}"
-      EventMachine.stop
-    else
-      if @inter_val != 0
-        if success_list.size != 0 || failed_list.size != 0
-          if @inter_val == RANDOM_TIME
-            sleep(rand(@@random_time_range))
+
+    def event_machine_down(link_struct_list, callback = nil)
+      failed_list = []
+      succeed_list = []
+      multi = EventMachine::MultiRequest.new
+      begin_time = Time.now
+
+      for_each_proc = proc do |e|
+        # the overwrite/exist check now happens up front in filter_list
+        next unless @@url_set.add?(e.href)
+        opt = {:redirects => @@max_redirects}
+        opt[:head] = @@header_option if defined? @@header_option
+        if e.http_method == :post
+          opt[:body] = e.params unless e.params.empty?
+          if @@connection_opts
+            w = EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
+          else
+            w = EventMachine::HttpRequest.new(e.href).post opt
+          end
         else
-          sleep(@inter_val)
+          if @@connection_opts
+            opt[:query] = e.params unless e.params.empty?
+            w = EventMachine::HttpRequest.new(e.href, @@connection_opts).get opt
+          else
+            w = EventMachine::HttpRequest.new(e.href).get opt
+          end
         end
+
+        w.callback {
+          @@url_set.delete(e.href)
+          s = w.response_header.status
+          puts s
+          if s != 404
+            local_dir = File.dirname(e.local_path)
+            FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
+            begin
+              File.open(e.local_path, "w") do |f|
+                if @@conver_to_utf8 == true
+                  f << SpiderHelper.to_utf8(w.response)
+                else
+                  f << w.response
+                end
+              end
+              succeed_list << e
+            rescue Exception => ex
+              puts ex
+            end
+          end
+        }
+        w.errback {
+          @@url_set.delete(e.href)
+          puts "errback:#{w.response_header}"
+          puts e.origin_href
+          puts e.href
+          puts w.response_header.status
+          failed_list << e
+          if e.http_method == :get
+            SpiderHelper.direct_http_get(e.href, e.local_path)
+          elsif e.http_method == :post
+            SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
+          end
+        }
+        multi.add e.local_path, w
+      end
+
+      cb = Proc.new do
+        end_time = Time.now
+        puts "use time:#{end_time - begin_time} seconds"
+        if callback.nil?
+          puts "success size:#{@@succeed_size}"
+          puts "failed size:#{@@failed_size}"
+          EventMachine.stop
+        else
+          callback.call(multi, succeed_list, failed_list)
         end
       end
-      batch_down_list(todo, method(:complete))
+
+      link_struct_list.each(&for_each_proc)
+      multi.callback(&cb)
     end
-  end
 
-  def start
-    puts "total size:#{@down_list.size}"
-    event_machine_start_list(@down_list.slice!(0, @max), method(:complete))
-  end
+    def complete(multi, success_list, failed_list)
+      @@succeed_size += success_list.size
+      @@failed_size += failed_list.size
+      success_list.each do |e|
+        e.parse_method.call(e.local_path, e.extra_data) if e.parse_method
+      end
 
-  def self.get_list(down_list, inter_val: 0, max: 30)
-    ListSpider.new(down_list, inter_val: inter_val, max: max).start
-  end
+      todo = @@down_list.slice!(0, @@max)
+      if todo.empty?
+        puts "success size:#{@@succeed_size}"
+        puts "failed size:#{@@failed_size}"
+        EventMachine.stop
+      else
+        if @@inter_val != 0
+          if success_list.size != 0 || failed_list.size != 0
+            if @@inter_val == RANDOM_TIME
+              sleep(rand(@@random_time_range))
+            else
+              sleep(@@inter_val)
+            end
+          end
+        end
+        event_machine_down(todo, method(:complete))
+      end
+    end
 
-  def self.get_one(task)
-    ListSpider.new([task]).start
-  end
+    def event_machine_start_list(down_list, callback = nil)
+      EventMachine.run {
+        if down_list.empty?
+          callback.call(nil, [], []) if callback
+        else
+          event_machine_down(down_list, callback)
+        end
+      }
+    end
 
+    def filter_list(down_list)
+      need_down_list = []
+      down_list.each do |ts|
+        if !@@overwrite_exist && File.exist?(ts.local_path)
+          ts.parse_method.call(ts.local_path, ts.extra_data) if ts.parse_method
+        else
+          need_down_list << ts
+        end
+      end
+      return need_down_list
+    end
+
+    def get_list(down_list, inter_val: 0, max: 30)
+      @@down_list = []
+
+      need_down_list = filter_list(down_list)
+
+      @@down_list = @@down_list + need_down_list
+      @@inter_val = inter_val
+      @@max = max
+      @@max = @@down_list.size if @@max == NO_LIMIT_CONCURRENT
+      @@succeed_size = 0
+      @@failed_size = 0
+
+      puts "total size:#{@@down_list.size}"
+      event_machine_start_list(@@down_list.slice!(0, @@max), method(:complete))
+    end
+
+    def get_one(task)
+      get_list([task])
+    end
+
+    def add_task(task)
+      if task.is_a?(Array)
+        need_down_list = filter_list(task)
+        @@down_list = @@down_list + need_down_list
+      elsif task.is_a?(TaskStruct)
+        need_down_list = filter_list([task])
+        @@down_list = @@down_list + need_down_list
+      else
+        puts "error task type:#{task.class}"
+      end
+    end
+
+  end
 end
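With this change ListSpider becomes a module whose entry points (get_list, get_one, add_task) live on the module itself rather than on instances. A minimal usage sketch based on the code above; the URLs, local paths, and the parse callback are illustrative:

  require 'list_spider'

  # Invoked for each successfully downloaded file (see TaskStruct#parse_method).
  parse = proc do |local_path, extra_data|
    puts "saved #{local_path} (#{extra_data})"
  end

  tasks = [
    TaskStruct.new('http://example.com/page1.html', 'down/page1.html',
                   extra_data: :page1, parse_method: parse),
    TaskStruct.new('http://example.com/page2.html', 'down/page2.html',
                   extra_data: :page2, parse_method: parse)
  ]

  # Run at most 10 downloads per batch; RANDOM_TIME sleeps a random
  # 3..10 seconds (@@random_time_range) between batches.
  ListSpider.get_list(tasks, inter_val: ListSpider::RANDOM_TIME, max: 10)

Note that get_list blocks inside EventMachine.run until every task has finished, and tasks whose local_path already exists are skipped up front by filter_list unless overwrite_exist is set.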
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: list_spider
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - Charles Zhang
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-04-29 00:00:00.000000000 Z
+date: 2016-05-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: em-http-request
@@ -58,7 +58,6 @@ extra_rdoc_files: []
 files:
 - lib/delete_unvalid.rb
 - lib/list_spider.rb
-- lib/spider_base.rb
 - lib/spider_helper.rb
 homepage: https://github.com/chinazhangchao/list_spider
 licenses:
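The only functional changes here are the version bump and the removal of lib/spider_base.rb from the packaged file list. A sketch of a Gemfile entry that picks up the new release (the version constraint is a suggestion):

  # Gemfile
  source 'https://rubygems.org'

  gem 'list_spider', '~> 0.1.2'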
data/lib/spider_base.rb DELETED
@@ -1,298 +0,0 @@
-require 'em-http-request'
-require 'nokogiri'
-require 'fileutils'
-require 'set'
-require File.expand_path('../spider_helper', __FILE__)
-require "addressable/uri"
-
-class TaskStruct
-  def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil)
-    @origin_href = href
-    @href = href
-    if @href.class == "".class
-      @href = SpiderHelper.string_to_uri(@href)
-    end
-    @local_path = local_path
-    @http_method = http_method
-    @params = params
-    @extra_data = extra_data
-    @parse_method = parse_method
-  end
-
-  def ==(o)
-    o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data
-  end
-
-  attr_accessor :origin_href, :href, :local_path, :http_method, :params, :extra_data, :parse_method
-
-end
-
-module SpiderBase
-
-  @@conver_to_utf8 = false
-  @@connection_opts = {:connect_timeout => 2*60}
-  @@overwrite_exist = false
-  @@max_redirects = 10
-
-  class << self
-
-    attr_accessor :conver_to_utf8, :overwrite_exist, :max_redirects
-
-    def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
-      @@connection_opts = {
-        :proxy => {
-          :host => proxy_addr,
-          :port => proxy_port
-        }
-      }
-      @@connection_opts[:proxy][:authorization] = [username, password] if username && password
-    end
-
-    def connect_timeout(max_connect_time)
-      @@connection_opts[:connect_timeout] = max_connect_time
-    end
-
-    def set_header_option(header_option)
-      @@header_option = optHash
-    end
-
-    def event_machine_down(link_struct_list, callback = nil)
-      failed_list = []
-      succeed_list = []
-      multi = EventMachine::MultiRequest.new
-      no_job = true
-      begin_time = Time.now
-
-      for_each_proc = proc do |e|
-        if !@@overwrite_exist && File.exist?(e.local_path)
-          succeed_list << e
-        else
-          no_job = false
-          opt = {:redirects => @@max_redirects}
-          opt[:head] = @@header_option if defined? @@header_option
-          if e.http_method == :post
-            opt[:body] = e.params unless e.params.empty?
-            if @@connection_opts
-              w = EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
-            else
-              w = EventMachine::HttpRequest.new(e.href).post opt
-            end
-          else
-            if @@connection_opts
-              opt[:query] = e.params unless e.params.empty?
-              w = EventMachine::HttpRequest.new(e.href, @@connection_opts).get opt
-            else
-              w = EventMachine::HttpRequest.new(e.href).get opt
-            end
-          end
-
-          w.callback {
-            s = w.response_header.status
-            puts s
-            if s == 403 || s == 502 # Forbidden
-              # EventMachine.stop
-            elsif s != 404
-              local_dir = File.dirname(e.local_path)
-              FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
-              begin
-                File.open(e.local_path, "w") do |f|
-                  if @@conver_to_utf8 == true
-                    f << SpiderHelper.to_utf8(w.response)
-                  else
-                    f << w.response
-                  end
-                end
-                succeed_list << e
-              rescue Exception => ex
-                puts ex
-              end
-            end
-          }
-          w.errback {
-            puts "errback:#{w.response_header}"
-            puts e.origin_href
-            puts e.href
-            puts w.response_header.status
-            failed_list << e
-            if e.http_method == :get
-              SpiderHelper.direct_http_get(e.href, e.local_path)
-            elsif e.http_method == :post
-              SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
-            end
-          }
-          multi.add e.local_path, w
-        end
-      end
-
-      cb = Proc.new do
-        end_time = Time.now
-        puts "use time:#{end_time - begin_time} seconds"
-        if callback.nil?
-          puts "success size:#{self.succeed_size}"
-          puts "failed size:#{self.failed_size}"
-          EventMachine.stop
-        else
-          callback.call(multi, succeed_list, failed_list)
-        end
-      end
-
-      after_proc = proc {
-        if no_job # no pending requests, invoke the callback directly
-          cb.call
-        else
-          multi.callback(&cb)
-        end
-      }
-
-      link_struct_list.each(&for_each_proc)
-      after_proc.call
-    end
-
-    def event_machine_start(url, down_dir, file_name, callback = nil)
-      down_dir << "/" unless down_dir.end_with?("/")
-      FileUtils.mkdir_p(down_dir) unless Dir.exist?(down_dir)
-      down_list = []
-      down_list << TaskStruct.new(url, down_dir + file_name)
-      EventMachine.run {
-        event_machine_down(down_list, callback)
-      }
-    end
-
-    def event_machine_start_list(down_list, callback = nil)
-      EventMachine.run {
-        event_machine_down(down_list, callback)
-      }
-    end
-
-  end # self end
-end # SpiderBase end
-
-def batch_down_list(down_list, callback = nil)
-  SpiderBase.event_machine_down(down_list, callback)
-end
-
-def event_machine_start_list(down_list, callback = nil)
-  SpiderBase.event_machine_start_list(down_list, callback)
-end
-
-def parse_down_load_url(url, down_dir, file_name, callback = nil)
-  SpiderBase.event_machine_start(url, down_dir, file_name, callback)
-end
-
-class GetRelative
-
-  def initialize(base_url, down_dir, get_depth = 2, suffix = ".html")
-    @get_depth = get_depth
-    @base_url = base_url
-    @down_dir = down_dir
-    @suffix = suffix
-  end
-
-  def down_node(multi, succeed_list, failed_list, base_url, down_dir, callback)
-    puts "success"
-    puts succeed_list.size
-    puts "error"
-    puts failed_list.size
-    puts failed_list
-    puts "get index complete"
-    if succeed_list.size > 0
-      link_list = []
-      succeed_list.each do |e|
-        doc = Nokogiri::HTML(open(e.local_path))
-        link_list.concat(doc.css("a"))
-      end
-      puts "extract href complete"
-
-      down_dir << "/" unless down_dir.end_with?("/")
-      FileUtils.mkdir_p(down_dir) unless Dir.exist?(down_dir)
-
-      down_list = []
-      set_list = Set.new
-      link_list.each do |link|
-        href = link['href']
-        next if href.nil? || !href.include?(@suffix)
-        # strip anchors such as "scheme_2.html#SEC15"
-        href = href[0, href.index(@suffix) + 5]
-        # strip the leading "./" from paths such as "./preface.html"
-        href = href[2..-1] if href.start_with?("./")
-
-        next if !set_list.add?(href)
-        unless base_url.end_with?("/")
-          i = base_url.rindex("/")
-          base_url = base_url[0..i]
-        end
-
-        # skip absolute links such as "http://www.ccs.neu.edu/~dorai"
-        next if href.start_with?("http:") || href.start_with?("https:")
-
-        local_path = down_dir + href
-
-        down_list.push(TaskStruct.new(base_url + href, local_path))
-      end
-      puts "down list complete,size:#{down_list.size}"
-      batch_down_list(down_list, callback)
-    end
-  end
-
-  def down_other_node(multi, succeed_list, failed_list)
-    puts "down_other_node"
-    @get_depth = @get_depth - 1
-    puts "depth:#{@get_depth}"
-    if @get_depth <= 0
-      down_node(multi, succeed_list, failed_list, @base_url, @down_dir, method(:event_all_complete))
-    else
-      down_node(multi, succeed_list, failed_list, @base_url, @down_dir, method(:down_other_node))
-    end
-  end
-
-  def event_all_complete(multi, succeed_list, failed_list)
-    puts "all complete"
-    puts "success"
-    puts succeed_list.size
-    puts "error"
-    puts failed_list.size
-    puts failed_list
-    EventMachine.stop
-  end
-
-  attr_writer :get_depth, :base_url, :down_dir
-
-  def start
-    index_file_name = "index.html"
-    # e.g. http://www.ccs.neu.edu/home/dorai/t-y-scheme/t-y-scheme-Z-H-1.html
-    unless @base_url.end_with?("/")
-      i = @base_url.rindex("/")
-      index_file_name = @base_url[i+1 .. -1]
-    end
-
-    @get_depth = @get_depth - 1
-    puts @get_depth
-    if @get_depth <= 0
-      parse_down_load_url(@base_url, @down_dir, index_file_name, method(:event_all_complete))
-    else
-      parse_down_load_url(@base_url, @down_dir, index_file_name, method(:down_other_node))
-    end
-  end
-
-  def self.Get(base_url, down_dir, get_depth = 2, suffix = ".html")
-    GetRelative.new(base_url, down_dir, get_depth, suffix).start
-  end
-end # GetRelative
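Since spider_base.rb is gone, code written against 0.1.1 needs small adjustments: TaskStruct now lives in list_spider.rb, the SpiderBase module and the global helpers (batch_down_list, parse_down_load_url) have no direct replacement, and the GetRelative crawler is dropped entirely. A before/after sketch of the most common call site (the tasks array is illustrative):

  # 0.1.1: ListSpider was a class wrapping SpiderBase
  ListSpider.new(tasks, inter_val: 0, max: 30).start

  # 0.1.2: module-level API; already-downloaded files are filtered
  # out up front by ListSpider.filter_list
  ListSpider.get_list(tasks, inter_val: 0, max: 30)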