spider_bot 0.0.4

checksums.yaml ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 6c8009ebd495001ed425bd236e2b894d1b3124dd
  data.tar.gz: 97e5ce361bc6165956fac9f879c526ffb7da922d
SHA512:
  metadata.gz: 951b3f8a49ec3034b6b3031a40270eea9fcb64f1685f1982878523005ae292c0f45dbee000f67bc67ecd16a4a569b39c166d5be5559503ecdde2daa2e4ece00e
  data.tar.gz: 26d664fb1edc26e8edcd8c5aab9f38cd4bc48b3618c098384e000c71aaa7f8f2c357e8dd447c87940d55f64c40ff73db84a01b356e09cbc88ec8c36a8a6f8818
data/.gitignore ADDED
@@ -0,0 +1,16 @@
/.bundle/
/.yardoc
/Gemfile.lock
/_yardoc/
/coverage/
/doc/
/pkg/
/spec/reports/
/tmp/
*.bundle
*.so
*.o
*.a
*.swp
*.swo
mkmf.log
data/.yardopts ADDED
@@ -0,0 +1,2 @@
--markup-provider=redcarpet
--markup=markdown
data/Gemfile ADDED
@@ -0,0 +1,10 @@
source 'https://rubygems.org'

# Specify your gem's dependencies in spider_bot.gemspec
gemspec

group :doc do
  gem 'yard'
  gem 'redcarpet'
  gem 'github-markup'
end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
Copyright (c) 2015 yee.li

MIT License

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,148 @@
# SpiderBot

A simple crawler bot.

## SpiderBot Installation

Add this line to your application's Gemfile:

```
gem 'spider_bot'
```

And then execute:

    $ bundle

Or install it directly with:

    $ gem install spider_bot

## SpiderBot Files

#### File formats

1. Single-site, single-page crawl; returns the HTML text

```
SpiderBot.crawl("http://example.com", origin_options)
```

2. Single-site, multi-page crawl

```
SpiderBot.crawl(url, data: Proc.new{ |data| data }, since: Proc.new{ |data| data }) do

  paginate do
    option :type, :json
    option :path, path

    # pagination page-number settings
    option :start, 0
    option :add, 10
    option :expire, 100 # if set to -1, crawling continues indefinitely
    option :sleep, 6

    # settings for extracting data after each page
    option :data, Proc.new{ |data| data }
    option :since, Proc.new{ |since| since }

    option :query, { page: "%{page}", since_id: "%{since}" }
  end

  crawl_data do |data|
    # parse the crawled data...
  end
end
```

3. Multi-site, multi-page crawl; can be used with Rails or Padrino for crawl tasks

```
class Mybot < SpiderBot::Base

  # run automatically by "spider start" or "spider crawl"

  auto do
    origin url, data: Proc.new{ |data| data }, since: Proc.new{ |since| since }
    execute do

      paginate do
        option :type, :json
        option :path, path

        # pagination page-number settings
        option :start, 0
        option :add, 10
        option :expire, 100
        option :sleep, 6

        # settings for extracting data after each page
        option :data, Proc.new{ |data| data }
        option :since, Proc.new{ |since| since }

        option :query, { page: "%{page}", since_id: "%{since}" }
      end

      crawl_data do |data|
        # parse the crawled data...
      end
    end
  end
end
```

#### Initial page settings (origin_options)

* path
* type
* headers
* query
* data: extracts the data from the initial page
* since: extracts the last item of the initial page's data, used for pagination

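For example, a minimal call passing these options might look like the following sketch (the URL, query, and CSS selectors are placeholders, not part of the gem):

```
SpiderBot.crawl("http://example.com/articles",
  type: :html,                                      # parse the response as HTML
  headers: { "Accept" => "text/html" },             # extra request headers
  query: { category: "news" },                      # query string for the first request
  data: Proc.new { |body| body.css(".article") },   # extract the item list from the page
  since: Proc.new { |data| data.last && data.last["id"] } # remember the last item for paging
)
```
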
104
+ ####翻页参数设置
105
+
106
+ 1.翻页后文本设置
107
+
108
+ * paginate_type 翻页后类型[:html, :json, :xml]
109
+ * paginate_path 翻页后的Path
110
+ * paginate_query 翻页后的参数设置 {page: "%{page}", since: "%{since}"}
111
+
112
+
113
+ 2.翻页设置
114
+
115
+ * paginate_start #翻页起始页, 默认为0
116
+ * paginate_add #翻页增加数, 默认为 1
117
+ * paginate_expire #翻页总结数, 默认为30
118
+ * paginate_sleep #翻页休息数, 默认为 0
119
+
120
+ 3.翻页信息获取
121
+
122
+ * paginate_data 获取翻页后的数据, 不填写,默认为origin data
123
+ * paginate_since 获取翻页后最后数据, 不填写, 默认为 origin_since
124
+
125
+
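As a rough sketch of how these settings fit together inside a `paginate` block (the path and JSON field names below are made up for illustration), the intent is that "%{page}" is filled with the current page number and "%{since}" with the value captured by the since option on the previous page:

```
paginate do
  option :type,  :json
  option :path,  "/api/articles"
  option :start, 0
  option :add,   10
  option :expire, 20
  option :sleep, 2

  # template placeholders filled in before each paginated request
  option :query, { page: "%{page}", since_id: "%{since}" }

  option :data,  Proc.new { |body| body["articles"] }
  option :since, Proc.new { |data| data.last && data.last["id"] }
end
```
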
## SpiderBot Commands

* spider url: crawl a url directly from the command line, returns the HTML text
  - -q query: set the request query
  - -d data: extract data from the page
  - -o out: write the output to a file

* spider crawl: run bot files
  - -b bot: run a single bot file
  - -d dir: run the bot files in the given directory
  - -p expire_page: total number of pages to crawl (used in place of `option :expire`)

* spider start: run the crawl service
  - -d daemon: run in the background
  - -t time: set the crawl interval, defaults to 10
  - -r random: use a random interval up to the given interval, defaults to a random number up to 10
  - -e env: set the Spider environment; when used with Rails or Padrino, the given environment is used
  - -p expire_page: total number of pages to crawl (used in place of `option :expire`)

* spider stop: stop the crawl service

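For example (the paths and URL below are only placeholders):

    $ spider url http://example.com -o page.html
    $ spider crawl -d ./bots -p 5
    $ spider stop
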
data/Rakefile ADDED
@@ -0,0 +1,2 @@
require "bundler/gem_tasks"
data/bin/spider ADDED
@@ -0,0 +1,12 @@
#!/usr/bin/env ruby
require_relative '../lib/spider_bot/cli'

begin
  SpiderBot::CLI.start
rescue SystemExit => e
  Kernel.exit(e.status)
rescue Exception => e
  STDERR.puts e.message
  STDERR.puts e.backtrace.join("\n")
  Kernel.exit 1
end
@@ -0,0 +1,31 @@
module SpiderBot
  class Base
    class << self
      #
      # Entry point run by the "spider start" and "spider crawl" commands
      #

      def auto(&block)
        if defined?(BOTCONSOLE)
          klass = Class.new do
            def origin(url, options = {})
              @origin_url = url
              @origin_options = options
            end

            def execute(name = nil, &block)
              crawl_instance = Crawl.new(@origin_url, @origin_options)
              crawl_instance.instance_eval(&block)
            end
          end
          klass.allocate.instance_eval(&block)
        end
      end

      def crawl(url, options = {})
        crawl_instance = Crawl.new(url, options)
        crawl_instance.crawl_data
      end
    end
  end
end
@@ -0,0 +1,183 @@
# encoding: utf-8

$LOAD_PATH.unshift(File.expand_path('../..', __FILE__))
require "thor"
require 'spider_bot'
require 'daemons'

BOTCONSOLE = true

module SpiderBot
  class CLI < Thor
    desc "url", "Crawl a url"

    method_option :query,
      aliases: "-q",
      desc: "Set url query"

    method_option :data,
      aliases: "-d",
      desc: "Match html data"

    method_option :out,
      aliases: ["-o"],
      desc: "Write to file"

    def url(arg)
      data = Crawl.new(arg, options).crawl_data
      return File.open(options[:out], "w") { |file| file.puts data } if options[:out]
      return puts data
    end

    desc "crawl", "Run spider bot file"

    method_option :bot,
      aliases: ["-b"],
      desc: "Read bot file"

    method_option :dir,
      aliases: ["-d"],
      desc: "Read bot directory"

    method_option :expire,
      aliases: ["-p"],
      desc: "Set the maximum number of pages to crawl"

    def crawl
      $expire_num = options[:expire].to_i if options[:expire]
      require File.join(File.expand_path('../..', __FILE__), "spider_bot/load")

      if options[:bot]
        bot_file = File.expand_path(options[:bot])
        return raise "Bot file not found" if !File.exists?(bot_file)
        load bot_file
      end

      if options[:dir]
        bot_dir = File.expand_path(options[:dir])
        return raise "Dir is not found" if !Dir.exists?(bot_dir)

        threads = []
        Dir.glob("#{bot_dir}/*_bot.rb").each do |file|
          threads << Thread.new do
            begin
              SpiderBot.logger.info "loading bot file #{file}."
              load file
            rescue Exception => e
              SpiderBot.logger.error "error while loading bot file #{ file }"
              SpiderBot.logger.error e.to_s
            end
          end
        end
        threads.each { |t| t.join }
      end
    end


    desc "start", "Run spider bot service"

    method_option :daemon,
      aliases: ["-d"],
      desc: "Run spider bot service in background"

    method_option :time,
      aliases: ["-t"],
      desc: "Set crawl interval"

    method_option :random,
      aliases: ["-r"],
      desc: "Set crawl interval to a random value"

    method_option :env,
      aliases: ["-e"],
      desc: "Set spider service environment"

    method_option :expire,
      aliases: ["-p"],
      desc: "Set the maximum number of pages to crawl"

    def start
      puts "start....."

      $expire_num = options[:expire].to_i if options[:expire]

      if options[:env]
        ENV['RACK_ENV'] = options[:env]
      else
        ENV['RACK_ENV'] = 'development'
      end

      require File.join(File.expand_path('../..', __FILE__), "spider_bot/load")

      FileUtils.mkdir_p("tmp/pids") if !File.exists?("tmp/pids")

      daemon_options = {
        app_name: 'spider',
        ontop: true,
        dir: 'tmp/pids',
      }

      sleep_time = 10

      if options[:daemon]
        daemon_options[:ontop] = false
      else
        puts "press ctrl-c to exit"
      end

      stop if File.exists?("tmp/pids/spider.pid")

      if option_time = options[:time]
        parse_time = option_time.match(/[d|h|m]/)
        sleep_time = if parse_time
          case parse_time[0]
          when "d"
            option_time.to_i * 60 * 60 * 24
          when "h"
            option_time.to_i * 60 * 60
          when "m"
            option_time.to_i * 60
          end
        else
          option_time.to_i
        end
      end

      Daemons.daemonize(daemon_options)

      loop do
        threads = []

        BOTDIR.each do |file|
          threads << Thread.new do
            begin
              SpiderBot.logger.info "loading bot file #{file}."
              load file
            rescue Exception => e
              SpiderBot.logger.error "error while loading bot file #{ file }"
              SpiderBot.logger.error e.to_s
            end
            sleep(10)
          end
        end

        threads.each { |t| t.join }

        if options[:random]
          random_time = Random.new.rand(sleep_time)
          sleep(random_time.to_i)
        else
          sleep(sleep_time.to_i)
        end
      end
    end

    desc 'stop', "Stop spider bot service"

    def stop
      pid = File.read("tmp/pids/spider.pid").to_i
      Process.kill(9, pid)
      File.delete("tmp/pids/spider.pid")
    end
  end
end
@@ -0,0 +1,235 @@
module SpiderBot
  class Crawl

    # Initialize a new Spider Bot
    #
    # @param url [String] the spider target website url
    # @param options [Hash] the spider crawl configuration options
    # @option options :type [Symbol] the request body format, `:html` or `:json`
    # @option options :headers [Hash] the custom request headers
    # @option options :path [String] the custom request path
    # @option options :query [Hash] the request query
    # @option options :user_agent [String] the custom request user agent
    # @option options :source [Boolean] return the raw body instead of the parsed document
    # @option options :data [Proc] get the crawl data list from the body
    # @option options :first [Proc] get the first item of the crawl data list
    # @option options :last [Proc] get the last item of the crawl data list
    # @option options :encode [String] custom request encoding

    def initialize(url, options = {})
      parse_uri = URI.parse url
      @uri = parse_uri.scheme + "://" + parse_uri.host

      # don't append the port to the url for the default ports (80 and 443)
      if !["80", "443"].include?(parse_uri.port.to_s)
        @uri = @uri + ":" + parse_uri.port.to_s
      end

      @origin_path = parse_uri.path || "/"

      @origin_type = options[:type] || :html
      @origin_headers = options[:headers] || {}
      @origin_query = options[:query] || {}

      @origin_user_agent = options[:user_agent] || "Mac Safari"
      @origin_source = options[:source] || false

      @origin_data = options[:data]
      @origin_first = options[:first]
      @origin_last = options[:last]

      @origin_encode = options[:encode]

      @page_path = @origin_path
      @page_type = @origin_type
      @page_headers = @origin_headers || {}
      @page_query = {}

      @page_data = @origin_data
      @page_first = @origin_first
      @page_last = @origin_last

      @page_start = 1
      @page_add = 1
      @page_expire = 10
      @page_sleep = 0

      @paginate_last = nil
      @paginate_error = 0
      @paginate_type = :html
      @paginate_path = ""
      @paginate_query = {}

      @connection = Http::Client.new do |http|
        http.url = @uri
        http.user_agent = @origin_user_agent
        http.headers = @origin_headers
      end
    end

    # Process crawl data
    #
    # @param block [Proc] block called with each crawled page

    def crawl_data(&block)
      @paginate_num = @page_start

      catch :all do
        begin
          crawl_response = crawl_request(@origin_path, @origin_query, @origin_type, @origin_data, @origin_first, @origin_last, &block)
          return crawl_response if !block_given?
          process_response(crawl_response, &block)
        rescue Exception => e
          handle_error(e)
          crawl_data(&block)
        end

        @paginate_error = 0
        return if @page_query.blank? && @page_path == @origin_path

        crawl_paginate(&block)
      end
    end

    private

    def crawl_paginate(&block)
      @page_headers.merge!({"X-Requested-With" => "XMLHttpRequest"}) if @page_type.to_s == 'json'
      @connection.headers = @page_headers
      begin
        loop do
          real_page_num = (@page_start == 0 && @page_add > 1) ? (@paginate_num / @page_add) + 1 : @paginate_num
          if defined?($expire_num)
            if $expire_num > 1
              break if real_page_num > $expire_num.to_i
            else
              break if real_page_num > 1
            end
          end
          # stop paginating once the current page number exceeds @page_expire
          if real_page_num > @page_expire && @page_expire != -1
            SpiderBot.logger.info "Crawl finished..."
            SpiderBot.logger.info "Finish reason: the current page exceeds the configured paginate expire"
            break
          end

          sleep(@page_sleep) if @page_sleep > 0

          path = @page_path.to_s % {page: @paginate_num}
          query_str = @page_query.to_s % { page: @paginate_num, last: @paginate_last, first: @paginate_first }
          query = eval(query_str)

          crawl_response = crawl_request(path, query, @page_type, @page_data, @page_first, @page_last, &block)
          process_response(crawl_response, &block)
        end
      rescue Exception => e
        @paginate_num += @page_add if @paginate_error == 2
        handle_error(e)
        crawl_paginate(&block)
      end
    end

    def crawl_request(path, query, type, data, first, last, &block)
      @paginate_path = path
      @paginate_query = query

      response = @connection.get(path, query)

      return if !response
      return if response.status != 200

      options = { encode: @origin_encode } if @origin_encode

      if @origin_source && !block_given?
        return response.body(options)
      end

      if type.to_s == "html"
        @paginate_type = :html
        body = Nokogiri::HTML response.body(options)
      elsif type.to_s == "json"
        @paginate_type = :json
        body = MultiJson.load response.body(options)
      else
        @paginate_type = response.parser
        body = response.parsed
      end

      return if body.nil?
      return body if data.nil?

      body_data = data.call(body) if data
      @paginate_first = first.call(body_data, body) if first
      @paginate_last = last.call(body_data, body) if last

      return body_data
    end

    def get_page_url
      if !@paginate_query.blank?
        @uri + @paginate_path + "?" + @paginate_query.map{ |k,v| "#{k}=#{v}" }.join("&")
      else
        @uri + @paginate_path
      end
    end

    def set_paginate_headers(arg)
      @page_headers = arg || {}
    end

    # set crawl paginate settings
    #
    # @example
    #   paginate do
    #     option :path, '/path'
    #     option :query, {page: "%{page}"}
    #     option :first, Proc.new{|data| data.css("#item")}
    #     option :last, Proc.new{|data| data.css("#item")}
    #     option :type, :html
    #     option :data, Proc.new{|body| body.css("#item")}
    #     option :start, 1
    #     option :add, 1
    #     option :expire, 100
    #     option :sleep, 100
    #   end

    def paginate(&block)
      block.call
    end

    def option(name, params)
      raise "invalid paginate option: #{name}" unless %i(path query first last type data start add expire sleep).include?(name.to_sym)
      eval("@page_#{name} = params")
    end

    def break_all
      throw :all
    end

    def handle_error(error)
      SpiderBot.logger.error "crawling url #{ get_page_url } has an error..."
      SpiderBot.logger.error error.to_s

      break_all if @paginate_error == 3
      @paginate_error += 1

      sleep( 60 * @paginate_error )
    end

    # Log the finish message when the http client response is blank
    #
    # @param response [Object] the crawled response body

    def process_response(response, &block)
      if response.blank?
        SpiderBot.logger.info "Crawl finished..."
        SpiderBot.logger.info "Finish reason: the crawl response body is blank..."
        break_all
      end
      SpiderBot.logger.info "crawling page for #{get_page_url}"
      yield response, @paginate_num, @paginate_type
      @paginate_num += @page_add
      @paginate_error = 0
    end
  end
end
@@ -0,0 +1,5 @@
module SpiderBot
  class Error < StandardError; end
  class TimeoutError < Faraday::TimeoutError; end
  class ConnectionFaild < Faraday::ConnectionFailed; end
end