list_spider 0.1.1 → 0.1.2

This diff shows the content changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/list_spider.rb +209 -57
  3. metadata +2 -3
  4. data/lib/spider_base.rb +0 -298
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: f73a2e9b358cac55336907ac76ebdb666b9d31f5
-  data.tar.gz: 2df50eff29a1963224ca3f7d0cd9b3ac0c89156f
+  metadata.gz: 45ea1dba6db98ca7a9cdaecde7f744728cd20b03
+  data.tar.gz: 118764345cebb58a37e15af591b3f007451c2486
 SHA512:
-  metadata.gz: 2cb02f9eb8593a05cc6b0a0c9d015ad93bf08663750d3dfe3007c30febfaa47d57c222960eba3b8e9275fd1f5acb942278180c8b613d8ffc0d983333f059ea8a
-  data.tar.gz: a1800a9b27c769adbae11bc8e3f08e5d57d15b3b345d19acbd329142048e742acab8878328e3c8a9053a957c3c27ecf14cd067848d1bc056f552386243c33730
+  metadata.gz: 673150361b67fd16cf7dc86560c0bbe17d3d432f3f40dc4456019e9700d0d68f3b1d9eea8d6c036fc3ea904866497d248b51a36007e345a9233a43b827d0846b
+  data.tar.gz: 5c2b99885733c979d9e1f9f2426521b125fce8dd951a3f51c96d25c33ae1c180b0aeb70654b5b4422b0691bb337fdd517834cb28fa0edbee2798e895c6aa2465
data/lib/list_spider.rb CHANGED
@@ -1,81 +1,233 @@
-require File.expand_path('../spider_base', __FILE__)
+require 'em-http-request'
+require 'nokogiri'
+require 'fileutils'
+require 'set'
+require "addressable/uri"
+require File.expand_path('../spider_helper', __FILE__)
 require File.expand_path('../delete_unvalid', __FILE__)
 
-class ListSpider
-
-  RANDOM_TIME = -1
-  NO_LIMIT_CONCURRENT = -1
+class TaskStruct
+  def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil)
+    @origin_href = href
+    @href = href
+    if @href.class == "".class
+      @href = SpiderHelper.string_to_uri(@href)
+    end
+    @local_path = local_path
+    @http_method = http_method
+    @params = params
+    @extra_data = extra_data
+    @parse_method = parse_method
+  end
 
-  @@random_time_range = 3..10
+  def ==(o)
+    o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data
+  end
 
-  include SpiderBase
+  attr_accessor :origin_href, :href, :local_path, :http_method, :params, :extra_data, :parse_method
 
-  def initialize(down_list, inter_val: 0, max: 30)
-    @down_list = down_list
-    @inter_val = inter_val
-    @max = max
-    @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
-    @succeed_size = 0
-    @failed_size = 0
-  end
+end
+
+module ListSpider
+
+  RANDOM_TIME = -1
+  NO_LIMIT_CONCURRENT = -1
 
-  attr_reader :succeed_size, :failed_size
+  @@random_time_range = 3..10
+  @@conver_to_utf8 = false
+  @@connection_opts = {:connect_timeout => 2*60}
+  @@overwrite_exist = false
+  @@max_redirects = 10
+  @@url_set = Set.new
 
   class << self
 
-    attr_accessor :random_time_range
+    attr_accessor :random_time_range, :conver_to_utf8, :overwrite_exist, :max_redirects
 
-  end
+    def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
+      @@connection_opts = {
+        :proxy => {
+          :host => proxy_addr,
+          :port => proxy_port
+        }
+      }
+      @@connection_opts[:proxy][:authorization] = [username, password] if username && password
+    end
 
-  def add_task(task)
-    if task.is_a?Array
-      @down_list = @down_list + task
-    elsif task.is_a?TaskStruct
-      @down_list << task
-    else
-      puts "error task type:#{task.class}"
+    def connect_timeout(max_connect_time)
+      @@connection_opts[:connect_timeout] = max_connect_time
     end
-  end
 
-  def complete(multi, success_list, failed_list)
-    @succeed_size += success_list.size
-    @failed_size += failed_list.size
-    # puts "success size:#{success_list.size}"
-    # puts "failed size:#{failed_list.size}"
-    success_list.each do |e|
-      e.parse_method.call(e.local_path, e.extra_data, self) if e.parse_method
+    def set_header_option(header_option)
+      @@header_option = header_option
     end
-
-    todo = @down_list.slice!(0, @max)
-    if todo.empty?
-      puts "success size:#{@succeed_size}"
-      puts "failed size:#{@failed_size}"
-      EventMachine.stop
-    else
-      if @inter_val != 0
-        if success_list.size != 0 || failed_list.size != 0
-          if @inter_val == RANDOM_TIME
-            sleep(rand(@@random_time_range))
+
+    def event_machine_down(link_struct_list, callback = nil)
+      failed_list = []
+      succeed_list = []
+      multi = EventMachine::MultiRequest.new
+      # no_job = true
+      begin_time = Time.now
+
+      for_each_proc = proc do |e|
+        # if !@@overwrite_exist && File.exist?(e.local_path)
+        #   succeed_list << e
+        # else
+          next unless @@url_set.add?(e.href)
+          # no_job = false
+          opt = {}
+          opt = {:redirects => @@max_redirects}
+          opt[:head] = @@header_option if defined? @@header_option
+          if e.http_method == :post
+            opt[:body] = e.params unless e.params.empty?
+            if @@connection_opts
+              w = EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
+            else
+              w = EventMachine::HttpRequest.new(e.href).post opt
+            end
           else
-            sleep(@inter_val)
+            if @@connection_opts
+              opt[:query] = e.params unless e.params.empty?
+              w = EventMachine::HttpRequest.new(e.href, @@connection_opts).get opt
+            else
+              w = EventMachine::HttpRequest.new(e.href).get opt
+            end
           end
+
+          w.callback {
+            @@url_set.delete(e.href)
+            # puts "complete:#{w.response_header}"
+            s = w.response_header.status
+            puts s
+            if s != 404
+              local_dir = File.dirname(e.local_path)
+              FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
+              begin
+                File.open(e.local_path, "w") do |f|
+                  if @@conver_to_utf8 == true
+                    f << SpiderHelper.to_utf8(w.response)
+                  else
+                    f << w.response
+                  end
+                end
+                succeed_list << e
+              rescue Exception => e
+                puts e
+              end
+            end
+          }
+          w.errback {
+            @@url_set.delete(e.href)
+            puts "errback:#{w.response_header}"
+            puts e.origin_href
+            puts e.href
+            puts w.response_header.status
+            failed_list << e
+            if e.http_method == :get
+              SpiderHelper.direct_http_get(e.href, e.local_path)
+            elsif e.http_method == :post
+              SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
+            end
+          }
+          multi.add e.local_path, w
+        # end
+      end
+
+      cb = Proc.new do
+        end_time = Time.now
+        puts "use time:#{end_time-begin_time} seconds"
+        if callback.nil?
+          puts "success size:#{self.succeed_size}"
+          puts "failed size:#{self.failed_size}"
+          EventMachine.stop
+        else
+          callback.call(multi, succeed_list, failed_list)
         end
       end
-      batch_down_list(todo, method(:complete))
+
+      link_struct_list.each &for_each_proc
+      multi.callback &cb
     end
-  end
 
-  def start
-    puts "total size:#{@down_list.size}"
-    event_machine_start_list(@down_list.slice!(0, @max), method(:complete))
-  end
+    def complete(multi, success_list, failed_list)
+      @@succeed_size += success_list.size
+      @@failed_size += failed_list.size
+      success_list.each do |e|
+        e.parse_method.call(e.local_path, e.extra_data) if e.parse_method
+      end
 
-  def self.get_list(down_list, inter_val: 0, max: 30)
-    ListSpider.new(down_list, inter_val: inter_val, max: max).start
-  end
+      todo = @@down_list.slice!(0, @@max)
+      if todo.empty?
+        puts "success size:#{@@succeed_size}"
+        puts "failed size:#{@@failed_size}"
+        EventMachine.stop
+      else
+        if @@inter_val != 0
+          if success_list.size != 0 || failed_list.size != 0
+            if @@inter_val == RANDOM_TIME
+              sleep(rand(@@random_time_range))
+            else
+              sleep(@@inter_val)
+            end
+          end
+        end
+        event_machine_down(todo, method(:complete))
+      end
+    end
 
-  def self.get_one(task)
-    ListSpider.new([task]).start
-  end
+    def event_machine_start_list(down_list, callback = nil)
+      EventMachine.run {
+        if down_list.empty?
+          callback.call(nil, [], []) if callback
+        else
+          event_machine_down(down_list, callback)
+        end
+      }
+    end
 
+    def filter_list(down_list)
+      need_down_list = []
+      down_list.each do |ts|
+        if !@@overwrite_exist && File.exist?(ts.local_path)
+          ts.parse_method.call(ts.local_path, ts.extra_data) if ts.parse_method
+        else
+          need_down_list << ts
+        end
+      end
+      return need_down_list
+    end
+
+    def get_list(down_list, inter_val: 0, max: 30)
+      @@down_list = []
+
+      need_down_list = filter_list(down_list)
+
+      @@down_list = @@down_list + need_down_list
+      @@inter_val = inter_val
+      @@max = max
+      @@max = @@down_list.size if @@max == NO_LIMIT_CONCURRENT
+      @@succeed_size = 0
+      @@failed_size = 0
+
+      puts "total size:#{@@down_list.size}"
+      event_machine_start_list(@@down_list.slice!(0, @@max), method(:complete))
+    end
+
+    def get_one(task)
+      get_list([task])
+    end
+
+    def add_task(task)
+      if task.is_a?Array
+        need_down_list = filter_list(task)
+        @@down_list = @@down_list + need_down_list
+      elsif task.is_a?TaskStruct
+        need_down_list = filter_list([task])
+        @@down_list = @@down_list + need_down_list
+      else
+        puts "error task type:#{task.class}"
+      end
+    end
+
+  end
 end
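
Taken together, this file's changes fold TaskStruct in from the deleted spider_base.rb and turn ListSpider from a class you instantiate into a module driven entirely through singleton methods, with the new filter_list step skipping tasks whose local file already exists. A minimal sketch of how the 0.1.2 entry point would be called, inferred only from the signatures above; the URL, paths, and callback body are hypothetical:

    require 'list_spider'

    # One TaskStruct per download: remote URL, local save path, plus an
    # optional parse_method invoked as (local_path, extra_data) on success.
    tasks = (1..3).map do |i|
      TaskStruct.new(
        "http://example.com/page_#{i}.html", # hypothetical URL
        "download/page_#{i}.html",           # hypothetical save path
        parse_method: proc { |path, _extra| puts "downloaded #{path}" }
      )
    end

    # Module-level call replacing 0.1.1's ListSpider.new(tasks, ...).start.
    # inter_val is the sleep between batches (RANDOM_TIME samples from
    # random_time_range); max caps concurrent requests (NO_LIMIT_CONCURRENT
    # lifts the cap).
    ListSpider.get_list(tasks, inter_val: 0, max: 30)
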
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: list_spider
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - Charles Zhang
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-04-29 00:00:00.000000000 Z
+date: 2016-05-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: em-http-request
@@ -58,7 +58,6 @@ extra_rdoc_files: []
 files:
 - lib/delete_unvalid.rb
 - lib/list_spider.rb
-- lib/spider_base.rb
 - lib/spider_helper.rb
 homepage: https://github.com/chinazhangchao/list_spider
 licenses:
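
Only the version, release date, and file list change here; the dependencies block is unchanged in this diff. Consumers would pick up the release with an ordinary Gemfile entry (standard Bundler syntax, shown for context):

    gem 'list_spider', '0.1.2'
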
data/lib/spider_base.rb DELETED
@@ -1,298 +0,0 @@
-require 'em-http-request'
-require 'nokogiri'
-require 'fileutils'
-require 'set'
-require File.expand_path('../spider_helper', __FILE__)
-require "addressable/uri"
-
-class TaskStruct
-  def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil)
-    @origin_href = href
-    @href = href
-    if @href.class == "".class
-      @href = SpiderHelper.string_to_uri(@href)
-    end
-    @local_path = local_path
-    @http_method = http_method
-    @params = params
-    @extra_data = extra_data
-    @parse_method = parse_method
-  end
-
-  def ==(o)
-    o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data
-  end
-
-  attr_accessor :origin_href, :href, :local_path, :http_method, :params, :extra_data, :parse_method
-
-end
-
-module SpiderBase
-
-  @@conver_to_utf8 = false
-  @@connection_opts = {:connect_timeout => 2*60}
-  @@overwrite_exist = false
-  @@max_redirects = 10
-
-  class << self
-
-    attr_accessor :conver_to_utf8, :overwrite_exist, :max_redirects
-
-    def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
-      @@connection_opts = {
-        :proxy => {
-          :host => proxy_addr,
-          :port => proxy_port
-        }
-      }
-      @@connection_opts[:proxy][:authorization] = [username, password] if username && password
-    end
-
-    def connect_timeout(max_connect_time)
-      @@connection_opts[:connect_timeout] = max_connect_time
-    end
-
-    def set_header_option(header_option)
-      @@header_option = header_option
-    end
-
-    def event_machine_down(link_struct_list, callback = nil)
-      failed_list = []
-      succeed_list = []
-      # puts "event_machine_down callback:#{callback}"
-      multi = EventMachine::MultiRequest.new
-      no_job = true
-      begin_time = Time.now
-
-      for_each_proc = proc do |e|
-        if !@@overwrite_exist && File.exist?(e.local_path)
-          succeed_list << e
-        else
-          no_job = false
-          opt = {}
-          opt = {:redirects => @@max_redirects}
-          opt[:head] = @@header_option if defined? @@header_option
-          if e.http_method == :post
-            opt[:body] = e.params unless e.params.empty?
-            if @@connection_opts
-              w = EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
-            else
-              w = EventMachine::HttpRequest.new(e.href).post opt
-            end
-          else
-            if @@connection_opts
-              opt[:query] = e.params unless e.params.empty?
-              w = EventMachine::HttpRequest.new(e.href, @@connection_opts).get opt
-            else
-              w = EventMachine::HttpRequest.new(e.href).get opt
-            end
-          end
-
-          w.callback {
-            # puts "complete:#{w.response_header}"
-            s = w.response_header.status
-            puts s
-            if s == 403 || s == 502 # Forbidden
-              # EventMachine.stop
-            elsif s != 404
-              local_dir = File.dirname(e.local_path)
-              FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
-              begin
-                File.open(e.local_path, "w") do |f|
-                  if @@conver_to_utf8 == true
-                    f << SpiderHelper.to_utf8(w.response)
-                  else
-                    f << w.response
-                  end
-                end
-                succeed_list << e
-              rescue Exception => e
-                puts e
-              end
-            end
-          }
-          w.errback {
-            puts "errback:#{w.response_header}"
-            puts e.origin_href
-            puts e.href
-            puts w.response_header.status
-            failed_list << e
-            if e.http_method == :get
-              SpiderHelper.direct_http_get(e.href, e.local_path)
-            elsif e.http_method == :post
-              SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
-            end
-          }
-          multi.add e.local_path, w
-        end
-      end
-
-      # em_for_each_proc = proc do |e, iter|
-      #   for_each_proc.call(e)
-      #   iter.next
-      # end
-
-      cb = Proc.new do
-        end_time = Time.now
-        puts "use time:#{end_time-begin_time} seconds"
-        if callback.nil?
-          puts "success size:#{self.succeed_size}"
-          puts "failed size:#{self.failed_size}"
-          EventMachine.stop
-        else
-          callback.call(multi, succeed_list, failed_list)
-        end
-      end
-
-      after_proc = proc {
-        if no_job # no work to do, invoke the callback directly
-          cb.call
-        else
-          multi.callback &cb
-        end
-      }
-
-      # if DownLoadConfig::MaxConcurrent <= 0
-      link_struct_list.each &for_each_proc
-      after_proc.call
-      # else
-      #   EM::Iterator.new(link_struct_list, DownLoadConfig::MaxConcurrent).each(em_for_each_proc, after_proc)
-      # end
-    end
-
-    def event_machine_start(url, down_dir, file_name, callback = nil)
-      down_dir << "/" unless down_dir.end_with?("/")
-      FileUtils.mkdir_p(down_dir) unless Dir.exist?(down_dir)
-      down_list = []
-      down_list << TaskStruct.new(url, down_dir + file_name)
-      EventMachine.run {
-        index = 0
-        begin_time = Time.now
-        event_machine_down(down_list, callback)
-        end_time = Time.now
-      }
-    end
-
-    def event_machine_start_list(down_list, callback = nil)
-      EventMachine.run {
-        index = 0
-        begin_time = Time.now
-        event_machine_down(down_list, callback)
-        end_time = Time.now
-      }
-    end
-
-  end # self end
-end # SpiderBase end
-
-def batch_down_list(down_list, callback = nil)
-  SpiderBase.event_machine_down(down_list, callback)
-end
-
-def event_machine_start_list(down_list, callback = nil)
-  SpiderBase.event_machine_start_list(down_list, callback)
-end
-
-def parse_down_load_url(url, down_dir, file_name, callback = nil)
-  SpiderBase.event_machine_start(url, down_dir, file_name, callback)
-end
-
-class GetRelative
-
-  def initialize(base_url, down_dir, get_depth = 2, suffix = ".html")
-    @get_depth = get_depth
-    @base_url = base_url
-    @down_dir = down_dir
-    @suffix = suffix
-  end
-
-  def down_node(multi, succeed_list, failed_list, base_url, down_dir, callback)
-    puts "success"
-    puts succeed_list.size
-    puts "error"
-    puts failed_list.size
-    puts failed_list
-    puts "get index complete"
-    if succeed_list.size > 0
-      link_list = []
-      succeed_list.each do |e|
-        doc = Nokogiri::HTML(open(e.local_path))
-        link_list.concat(doc.css("a"))
-      end
-      puts "extract href complete"
-
-      down_dir << "/" unless down_dir.end_with?("/")
-      FileUtils.mkdir_p(down_dir) unless Dir.exist?(down_dir)
-
-      down_list = []
-      set_list = Set.new
-      link_list.each do |link|
-        href = link['href']
-        next if href.nil? || !href.include?(@suffix)
-        # process anchors such as "scheme_2.html#SEC15"
-        href = href[0, href.index(@suffix) + 5]
-        # process relative paths such as "./preface.html"
-        href = href[2..-1] if href.start_with?("./")
-
-        next if !set_list.add?(href)
-        unless base_url.end_with?("/")
-          i = base_url.rindex "/"
-          base_url = base_url[0..i]
-        end
-
-        # skip absolute links such as "http://www.ccs.neu.edu/~dorai"
-        next if href.start_with?("http:") || href.start_with?("https:")
-
-        local_path = down_dir + href
-
-        down_list.push(TaskStruct.new(base_url + href, local_path))
-      end
-      puts "down list complete,size:#{down_list.size}"
-      batch_down_list(down_list, callback)
-    end
-  end
-
-  def down_other_node(multi, succeed_list, failed_list)
-    puts "down_other_node"
-    @get_depth = @get_depth - 1
-    puts "depth:#{@get_depth}"
-    if @get_depth <= 0
-      down_node(multi, succeed_list, failed_list, @base_url, @down_dir, method(:event_all_complete))
-    else
-      down_node(multi, succeed_list, failed_list, @base_url, @down_dir, method(:down_other_node))
-    end
-  end
-
-  def event_all_complete(multi, succeed_list, failed_list)
-    puts "all complete"
-    puts "success"
-    puts succeed_list.size
-    puts "error"
-    puts failed_list.size
-    puts failed_list
-    EventMachine.stop
-  end
-
-  attr_writer :get_depth, :base_url, :down_dir
-
-  def start
-    index_file_name = "index.html"
-    # e.g. http://www.ccs.neu.edu/home/dorai/t-y-scheme/t-y-scheme-Z-H-1.html
-    unless @base_url.end_with?("/")
-      i = @base_url.rindex "/"
-      index_file_name = @base_url[i+1 .. -1]
-    end
-
-    @get_depth = @get_depth - 1
-    puts @get_depth
-    if @get_depth <= 0
-      parse_down_load_url(@base_url, @down_dir, index_file_name, method(:event_all_complete))
-    else
-      parse_down_load_url(@base_url, @down_dir, index_file_name, method(:down_other_node))
-    end
-  end
-
-  def self.Get(base_url, down_dir, get_depth = 2, suffix = ".html")
-    GetRelative.new(base_url, down_dir, get_depth, suffix).start
-  end
-end # GetRelative
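
Note that the deletion is not a pure refactor: TaskStruct and the download loop move into list_spider.rb, but the GetRelative depth-limited crawler and the top-level wrappers (batch_down_list, parse_down_load_url) have no counterpart in 0.1.2. Code that relied on the 0.1.1 helper below stops working after the upgrade (arguments hypothetical):

    # 0.1.1 only: fetch base_url, then follow relative .html links up to
    # two levels deep, mirroring them under down_dir.
    GetRelative.Get('http://example.com/docs/', 'download/docs/', 2, '.html')
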