spider2 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +20 -0
- data/README +15 -0
- data/Rakefile +23 -0
- data/init.rb +3 -0
- data/install.rb +2 -0
- data/lib/generators/spider/spider_generator.rb +42 -0
- data/lib/generators/spider/templates/base_page.rb +6 -0
- data/lib/generators/spider/templates/base_page_spec.rb +13 -0
- data/lib/generators/spider/templates/index_page.rb +6 -0
- data/lib/generators/spider/templates/index_page_spec.rb +14 -0
- data/lib/generators/spider/templates/index_page_test.rb +10 -0
- data/lib/generators/spider/templates/list_page.rb +6 -0
- data/lib/generators/spider/templates/list_page_spec.rb +22 -0
- data/lib/generators/spider/templates/list_page_test.rb +10 -0
- data/lib/generators/spider/templates/show_page.rb +14 -0
- data/lib/generators/spider/templates/show_page_spec.rb +19 -0
- data/lib/generators/spider/templates/show_page_test.rb +10 -0
- data/lib/generators/spider/templates/site.rb +7 -0
- data/lib/generators/spider/templates/site_spec.rb +13 -0
- data/lib/generators/spider/templates/test.rb +10 -0
- data/lib/generators/spider_migration/spider_migration_generator.rb +11 -0
- data/lib/generators/spider_migration/templates/migration.rb +42 -0
- data/lib/spider/active_record_methods.rb +60 -0
- data/lib/spider/http.rb +43 -0
- data/lib/spider/page/filter.rb +132 -0
- data/lib/spider/page/label.rb +28 -0
- data/lib/spider/page/pagination.rb +142 -0
- data/lib/spider/page/proxy.rb +149 -0
- data/lib/spider/page/publish.rb +78 -0
- data/lib/spider/page/validation.rb +136 -0
- data/lib/spider/page.rb +759 -0
- data/lib/spider/site.rb +225 -0
- data/lib/spider/spider_page.rb +18 -0
- data/lib/spider/spider_page_label.rb +5 -0
- data/lib/spider/version.rb +3 -0
- data/lib/spider.rb +81 -0
- data/lib/tasks/spider_tasks.rake +86 -0
- data/test/spider_fu_test.rb +9 -0
- data/test/test_helper.rb +4 -0
- data/uninstall.rb +2 -0
- metadata +151 -0
| @@ -0,0 +1,142 @@ | |
| 1 | 
            +
            # encoding: utf-8
         | 
| 2 | 
            +
            # 分页处理
         | 
| 3 | 
            +
            # 分页一般有三种形式
         | 
| 4 | 
            +
            # 1)
         | 
| 5 | 
            +
            # 上页 1 2 3 4 5 6 下页
         | 
| 6 | 
            +
            # 可以使用
         | 
| 7 | 
            +
            # paginate :scope=>"#paginte-div"
         | 
| 8 | 
            +
            # 还有一种形式是 一共有多少页
         | 
| 9 | 
            +
            # 比如 88 页
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            module Spider::Page::Pagination
         | 
| 12 | 
            +
              module ClassMethods
         | 
| 13 | 
            +
                # 实现一个简单的可以解决大部分分页的功能
         | 
| 14 | 
            +
                # paginate :body,:links=>".paginate",:to=>[:body] do |page|
         | 
| 15 | 
            +
                # end
         | 
| 16 | 
            +
                # options:
         | 
| 17 | 
            +
                #  scope: 指定分页链接范围
         | 
| 18 | 
            +
                #  比如:<div class="pagination"><a href="?1">1</a></div>
         | 
| 19 | 
            +
                #  可以指定scope为 div.pagination
         | 
| 20 | 
            +
                #  那么所有 div.pagination下的链接都认为是分页链接
         | 
| 21 | 
            +
                # include_self: boolean
         | 
| 22 | 
            +
                #  是否包含当前页面,默认为false
         | 
| 23 | 
            +
                # sort: boolean
         | 
| 24 | 
            +
                #  是否对链接进行排序,默认为false
         | 
| 25 | 
            +
                # Declares a paginated attribute on the page class.
                #
                # Two instance methods are defined:
                #   "<to>_paginated_pages" - the pages returned by #pages for options[:scope]
                #   "<to>"                 - every page's "<to>_partial" value joined with
                #                            Spider::Page.paginate_symbol
                #
                # Options:
                #   :scope        - handed to #pages as :scope (default ".pagination")
                #   :include_self - put the current page first in the list (default false)
                #   :sort         - order the pages by their url (default false)
                #
                # The block is accepted but not used here, and "<to>_partial" is not
                # defined by this method.
                def paginate(to = :body, options = {}, &block)
                  options.assert_valid_keys :scope, :include_self, :sort
                  options.reverse_merge! :include_self => false, :sort => false, :scope => ".pagination"

                  partial_reader = "#{to}_partial"
                  pages_reader   = "#{to}_paginated_pages"

                  # Pages that together make up the paginated content.
                  define_method pages_reader do
                    linked = self.pages(:scope => options[:scope])
                    linked.unshift(self) if options[:include_self]
                    linked = linked.sort_by { |linked_page| linked_page.url } if options[:sort]
                    linked
                  end

                  # Full content: the partial of every page glued together.
                  define_method to do
                    parts = send(pages_reader).map { |linked_page| linked_page.send(partial_reader) }
                    parts * Spider::Page.paginate_symbol
                  end
                end
         | 
| 54 | 
            +
                
         | 
| 55 | 
            +
                # 产生分页链接
         | 
| 56 | 
            +
                # generate_paginated_urls 10,"http://www.google.com/article_%d.html",:first=>"http://www.google.com/article.html"
         | 
| 57 | 
            +
                # Builds +page_count+ urls by formatting +url+ with the indexes
                # 0, 1, ... page_count - 1.
                #
                # Options:
                #   :first - url used in place of the formatted one for index 0
                #   :until - when set, 1000 urls are generated regardless of +page_count+
                def generate_paginated_urls(page_count, url, options = {})
                  options.assert_valid_keys :first, :until
                  total = options[:until] ? 1000 : page_count
                  total.times.map do |index|
                    if index.zero? && options[:first]
                      options[:first]
                    else
                      url % index
                    end
                  end
                end
         | 
| 70 | 
            +
             | 
| 71 | 
            +
              end
         | 
| 72 | 
            +
              
         | 
| 73 | 
            +
              module InstanceMethods
         | 
| 74 | 
            +
                # http://www.google.com/article_1.html
         | 
| 75 | 
            +
                # page.generate_paginated_urls(10,/article(_\d+?)\.html/)
         | 
| 76 | 
            +
                # 一般有几种方式
         | 
| 77 | 
            +
                # 1):
         | 
| 78 | 
            +
                # xxxxx.html
         | 
| 79 | 
            +
                # xxxxx_1.html
         | 
| 80 | 
            +
                # xxxxx_2.html
         | 
| 81 | 
            +
                # 2):
         | 
| 82 | 
            +
                # xxx.asp
         | 
| 83 | 
            +
                # xxx.asp?page=2
         | 
| 84 | 
            +
                # xxx.asp?page=3
         | 
| 85 | 
            +
                #  xxxxx(_%d).html
         | 
| 86 | 
            +
                #  xxx.asp(?page=%d)
         | 
| 87 | 
            +
                # 比如 http://www.powerapple.com/articles/11111.html
         | 
| 88 | 
            +
                #      http://www.powerapple.com/articles/11111_1.html
         | 
| 89 | 
            +
                # 如果已知是 5 页
         | 
| 90 | 
            +
                # 那么:http://www.powerapple.com/articles/11111.html
         | 
| 91 | 
            +
                #       http://www.powerapple.com/articles/11111_2.html
         | 
| 92 | 
            +
                #       http://www.powerapple.com/articles/11111_3.html
         | 
| 93 | 
            +
                #       http://www.powerapple.com/articles/11111_4.html
         | 
| 94 | 
            +
                #       http://www.powerapple.com/articles/11111_5.html
         | 
| 95 | 
            +
                # generate_paginated_urls 5,
         | 
| 96 | 
            +
                #   :url=>"http://www.powerapple.com/articles/1111_%d.html",
         | 
| 97 | 
            +
                #   :start=>1,
         | 
| 98 | 
            +
                #   :unshift=>"http://www.powerapple.com/articles/1111.html"
         | 
| 99 | 
            +
                # Builds a list of page urls by formatting +url+ (a format string such
                # as "http://host/article_%d.html") with the page numbers 1..count.
                #
                # Options:
                #   :count         - number of the last page to generate (nil counts as 0)
                #   :start         - page numbers below this value are skipped
                #   :until_failure - probe up to 1000 pages with Spider::Http.head and stop
                #                    at the first response whose code is not 200
                #   :prepend       - url or array of urls placed before the generated ones
                #   :append        - url or array of urls placed after the generated ones
                #
                # Both :prepend and :append keep the order in which they were given.
                # Returns the array of urls.
                def generate_paginated_urls(url,options={})
                  options.assert_valid_keys :start,:prepend,:append,:until_failure,:count
                  last_page = options[:until_failure] ? 1000 : options[:count].to_i
                  start     = options[:start]

                  urls = []
                  1.upto(last_page) do |index|
                    next if start && index < start
                    page_url = url % index
                    if options[:until_failure]
                      # Stop at the first page that does not answer with 200.
                      break unless Spider::Http.head(page_url).code == 200
                    end
                    urls << page_url
                  end

                  urls = [options[:prepend]].flatten + urls if options[:prepend]
                  # Appended urls keep their given order.
                  urls += [options[:append]].flatten if options[:append]
                  urls
                end
         | 
| 130 | 
            +
             | 
| 131 | 
            +
                # Same arguments as #generate_paginated_urls; every generated url is
                # passed to #go and the results are returned as an array.
                def generate_paginated_pages(*args)
                  generate_paginated_urls(*args).map { |page_url| go(page_url) }
                end
         | 
| 135 | 
            +
              end
         | 
| 136 | 
            +
              
         | 
| 137 | 
            +
              # Hook run when the module is included: +base+ receives InstanceMethods
              # as instance methods and ClassMethods as class-level methods.
              def self.included(base)
                base.__send__(:include, InstanceMethods)
                base.extend(ClassMethods)
              end
         | 
| 141 | 
            +
              
         | 
| 142 | 
            +
            end
         | 
| @@ -0,0 +1,149 @@ | |
| 1 | 
            +
            # encoding: utf-8
         | 
| 2 | 
            +
            module Spider::Page::Proxy
         | 
| 3 | 
            +
              # Hook run when the module is mixed into +base+.
              #
              # Mixes in InstanceMethods / ClassMethods, declares the +proxies+ and
              # +disabled_proxies+ class attributes (both start empty) and registers
              # two fetch callbacks:
              #
              # * before_fetch picks one configured proxy at random and hands it to
              #   Spider::Http.http_proxy, or clears the proxy when none is configured.
              # * after_fetch treats a blank page.content as a proxy failure: the proxy
              #   whose host equals Spider::Http.default_options[:http_proxyaddr] is
              #   moved to +disabled_proxies+ and the request is repeated while proxies
              #   remain. Once none are left, the disabled ones are restored so that a
              #   single failure does not exclude them permanently.
              def self.included(base)
                base.send(:include,InstanceMethods)
                base.send(:extend,ClassMethods)
                base.class_eval do
                  class_attribute :proxies
                  class_attribute :disabled_proxies
                  self.proxies = []
                  self.disabled_proxies = []

                  before_fetch do |page|
                    proxies.compact!
                    proxies.uniq!
                    host,port = proxies.shuffle.first
                    port ||= 80
                    if host
                      logger.debug "set proxy: #{host}:#{port}"
                      Spider::Http.http_proxy host,port
                    else
                      Spider::Http.clear_proxy
                    end
                  end

                  after_fetch do |page|
                    logger.debug "reset proxy"
                    if page.content.blank?
                      logger.debug "proxies before:#{self.proxies.inspect}"
                      failed_proxy = proxies.find{|proxy| proxy.first == Spider::Http.default_options[:http_proxyaddr] }
                      # Only record a proxy that was actually found; otherwise nil
                      # would end up in disabled_proxies and later in proxies.
                      if failed_proxy
                        proxies.delete failed_proxy
                        self.disabled_proxies += [failed_proxy]
                      end
                      logger.debug "proxies after:#{self.proxies.inspect}"
                      if proxies.empty?
                        logger.debug 'finished retry.'
                        # No proxy left: restore the disabled ones so they are tried
                        # again next time, and fall through to clear the proxy.
                        self.proxies += self.disabled_proxies
                        self.disabled_proxies = []
                      else
                        logger.debug 'retry'
                        page.request
                        next
                      end
                    end
                    Spider::Http.clear_proxy
                  end
                end
              end
         | 
| 56 | 
            +
             | 
| 57 | 
            +
              module ClassMethods
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                # Removes every configured proxy for this class.
                #
                # The previous body called proxy(nil,nil), which neither adds nor
                # removes anything, so the configured proxies stayed active.
                def disable_proxy
                  self.proxies = []
                end
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                # Runs valid_proxy? on every configured proxy and returns a hash with
                # the passing entries under :valid and the remaining ones under :invalid.
                def validate_proxies
                  working = proxies.select { |entry| valid_proxy?(*entry) }
                  { :valid => working, :invalid => proxies - working }
                end
         | 
| 70 | 
            +
             | 
| 71 | 
            +
                # 指定一个 file 作为 proxy 来源
         | 
| 72 | 
            +
                # # ip:port
         | 
| 73 | 
            +
                # Loads proxies from a text file and adds them to +proxies+.
                #
                # file - an absolute path (starting with "/"), or a name resolved
                #        against Rails.root/config/spiders.
                #
                # File format: one "ip" or "ip:port" entry per line. Lines starting with
                # "#" and lines that contain no dotted quad are ignored. A missing port
                # defaults to 80; ports are stored as integers.
                #
                # Returns the array of [ip, port] pairs read from the file.
                def proxy_file(file)
                  path = if file =~ /^\//
                           file
                         else
                           File.join(File.join(Rails.root,"config","spiders"),file)
                         end

                  parsed = []
                  File.read(path).each_line do |line|
                    line = line.strip
                    next if line.start_with?("#")                 # comment line
                    next unless line =~ /\d+?\.\d+?\.\d+?\.\d+?/  # needs something ip-like
                    ip,port = line.split(":")
                    parsed << [ip,(port || 80).to_i]
                  end

                  # Assign through the accessor; reassigning a block variable inside
                  # `proxy { |list| list += ... }` never reached the stored list.
                  self.proxies += parsed
                  parsed
                end
         | 
| 100 | 
            +
             | 
| 101 | 
            +
                # Checks whether a proxy works by requesting a url through it.
                #
                # ip, port - the proxy to test (port defaults to 80)
                # options:
                #   :url     - url to request (default "http://www.google.com")
                #   :code    - expected response code (default 200)
                #   :timeout - seconds before the attempt is abandoned (default 10)
                #   :match   - optional pattern the response text must match
                #
                # The request runs inside Spider::Http.with_proxy and the block's
                # result is true or false. Errors raised by the request, including a
                # timeout, count as an invalid proxy; interrupts and exits are no
                # longer swallowed. The caller's options hash is left untouched.
                def valid_proxy?(ip,port = 80,options = {})
                  url     = options[:url]     || "http://www.google.com"
                  code    = options[:code]    || 200
                  seconds = options[:timeout] || 10
                  pattern = options[:match]

                  Spider::Http.with_proxy ip,port do
                    begin
                      Timeout.timeout(seconds) do
                        response = Spider::Http.get url
                        ok = response.code == code
                        ok &&= !(response.to_s =~ pattern).nil? if pattern
                        ok
                      end
                    rescue StandardError, Timeout::Error
                      false
                    end
                  end
                end
         | 
| 122 | 
            +
             | 
| 123 | 
            +
                # 直接设置 proxies
         | 
| 124 | 
            +
                # Adds every entry of +arr+ to the configured proxies.
                #
                # The list yielded by #proxy is extended in place; the previous
                # `ps += [a]` only rebound the block variable and stored nothing.
                # NOTE(review): the class_attribute :proxies declared in the included
                # hook also defines a proxies= writer, which presumably takes
                # precedence over this method - confirm which one callers reach.
                def proxies=(arr)
                  proxy do |ps|
                    ps.concat(arr)
                  end
                end
         | 
| 131 | 
            +
             | 
| 132 | 
            +
                # Registers a proxy and/or exposes the proxy list.
                #
                # host, port - appended to +proxies+ as [host, port] when host is given
                # block      - when given, receives the current proxy list; its value
                #              becomes the return value (nil without a block)
                def proxy(host = nil,port = 80,&block)
                  if host
                    self.proxies = self.proxies + [[host,port]]
                  end

                  return unless block_given?
                  yield self.proxies
                end
         | 
| 140 | 
            +
             | 
| 141 | 
            +
              end
         | 
| 142 | 
            +
             | 
| 143 | 
            +
             | 
| 144 | 
            +
             | 
| 145 | 
            +
              module InstanceMethods
         | 
| 146 | 
            +
             | 
| 147 | 
            +
              end
         | 
| 148 | 
            +
             | 
| 149 | 
            +
            end
         | 
| @@ -0,0 +1,78 @@ | |
| 1 | 
            +
            # encoding: utf-8
         | 
| 2 | 
            +
            module Spider::Page::Publish
         | 
| 3 | 
            +
             | 
| 4 | 
            +
              extend ActiveSupport::Concern
         | 
| 5 | 
            +
             | 
| 6 | 
            +
              included do
                # Declare the :publish callback chain used by run_callbacks :publish.
                define_model_callbacks :publish

                # Registered publishers; starts out empty.
                cattr_accessor :publishers
                self.publishers = []

                # Run #publish after every crawl.
                after_crawl :publish
              end
         | 
| 15 | 
            +
             | 
| 16 | 
            +
              module ClassMethods
         | 
| 17 | 
            +
             | 
| 18 | 
            +
                # publish_to Article
         | 
| 19 | 
            +
                # Article will set to publisher
         | 
| 20 | 
            +
                # publish_to Article will called when #crawl
         | 
| 21 | 
            +
                # Registers one or more publishers for this page class, e.g.
                #   publish_to Article
                # The given publishers are added to the existing +publishers+ list.
                def publish_to(*publishers)
                  logger.debug "[#{self}] set publisher: #{publishers}"
                  self.publishers = self.publishers + publishers
                end
         | 
| 25 | 
            +
             | 
| 26 | 
            +
              end
         | 
| 27 | 
            +
             | 
| 28 | 
            +
              module InstanceMethods
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                # Sends this page to each of the given publishers inside the :publish
                # callbacks.
                #
                # publishers - objects, or String/Symbol names that are resolved with
                #              classify.constantize. Nested arrays are flattened.
                #
                # A publisher that responds to :receive_spider_page is called with the
                # page; its return value is collected. Publishers that do not respond,
                # or that raise a StandardError (logged with its backtrace), contribute
                # nil. Interrupts and exits are not rescued.
                #
                # The block handed to run_callbacks evaluates to the array of results,
                # one entry per publisher.
                def publish_to(*publishers)
                  run_callbacks :publish do
                    logger.debug "publish to #{publishers}"
                    publishers.flatten.map do |publisher|
                      logger.info "send self to #{publisher}"
                      logger.debug "class:#{publisher.class.name}"
                      publisher = case publisher
                                  when String,Symbol
                                    publisher.to_s.classify.constantize
                                  else
                                    publisher
                                  end
                      logger.debug "publisher: #{publisher}"
                      result = nil
                      begin
                        if publisher.respond_to?(:receive_spider_page)
                          logger.debug "#{publisher} receive spider page #{self}"
                          result = publisher.receive_spider_page self
                          logger.debug "#{publisher} return #{result}"
                        else
                          logger.debug "publisher: #{publisher} not respond to :receive_spider_page"
                        end
                      rescue StandardError => e
                        # Best effort: one failing publisher must not stop the others.
                        logger.error e.message
                        logger.error Array(e.backtrace).join("\n")
                      end
                      result
                    end
                  end
                end
         | 
| 64 | 
            +
             | 
| 65 | 
            +
                # Publishes the page to all registered publishers (duplicates removed),
                # but only when attribute_names contains both :title and :body;
                # otherwise a debug message is logged and nothing is published.
                def publish
                  targets = self.publishers.uniq
                  complete = [:title,:body].all? { |required| attribute_names.include?(required) }

                  unless complete
                    return logger.debug "attribute names not include :title, :body,so publish canceled."
                  end

                  logger.debug "[#{self} publish to #{targets}"
                  publish_to(targets)
                end
         | 
| 74 | 
            +
             | 
| 75 | 
            +
             | 
| 76 | 
            +
              end
         | 
| 77 | 
            +
             | 
| 78 | 
            +
            end
         | 
| @@ -0,0 +1,136 @@ | |
| 1 | 
            +
            # encoding: utf-8
         | 
| 2 | 
            +
            module Spider::Page::Validation
         | 
| 3 | 
            +
              
         | 
| 4 | 
            +
              # Raised when a url fails validation. Derives from StandardError so that
              # a plain `rescue` catches it.
              class ValidationError < StandardError; end
         | 
| 5 | 
            +
              
         | 
| 6 | 
            +
              module InstanceMethods
         | 
| 7 | 
            +
                
         | 
| 8 | 
            +
                # Asks the page's class whether it accepts this page's url.
                def valid_url?
                  page_class = self.class
                  page_class.valid_url?(url)
                end
         | 
| 11 | 
            +
                
         | 
| 12 | 
            +
              end
         | 
| 13 | 
            +
              
         | 
| 14 | 
            +
              module ClassMethods
         | 
| 15 | 
            +
                
         | 
| 16 | 
            +
                # 根据 url 来返回合适的 Spider::Page 类
         | 
| 17 | 
            +
                # Returns the first entry of Spider::Site.find_pages whose valid_url?
                # accepts +url+, or nil when none does.
                def find_by_url(url)
                  Spider::Site.find_pages.detect { |candidate| candidate.valid_url?(url) }
                end
         | 
| 22 | 
            +
                
         | 
| 23 | 
            +
                # 返回一个数组,匹配url的所有page
         | 
| 24 | 
            +
                # Returns every entry of Spider::Site.find_pages whose valid_url?
                # accepts +url+. When there is none, falls back to the (flattened)
                # pages of all sites in Spider::Site.all whose valid_domain? accepts
                # +url+.
                def find_all_by_url(url)
                  matches = Spider::Site.find_pages.select { |candidate| candidate.valid_url?(url) }
                  return matches unless matches.empty?

                  same_domain = Spider::Site.all.select { |site| site.valid_domain?(url) }
                  same_domain.map { |site| site.pages }.flatten
                end
         | 
| 37 | 
            +
             | 
| 38 | 
            +
                # 直接创建 page 的实例
         | 
| 39 | 
            +
                # Instantiates every page class returned by find_all_by_url with +url+.
                def create_all_by_url(url)
                  page_classes = find_all_by_url(url)
                  page_classes.map { |page_class| page_class.new(url) }
                end
         | 
| 42 | 
            +
             | 
| 43 | 
            +
                # 直接创建 page 的实例
         | 
| 44 | 
            +
                # Instantiates the page class returned by find_by_url with +url+;
                # nil when no class handles the url.
                def create_by_url(url)
                  page_class = find_by_url(url)
                  page_class.try(:new, url)
                end
         | 
| 47 | 
            +
                
         | 
| 48 | 
            +
                # 判断 url 是不是该类能处理的
         | 
| 49 | 
            +
                # True when at least one of the class's validate_url_procs accepts
                # +url+; false otherwise. Checking stops at the first accepting proc.
                def valid_url?(url)
                  self.validate_url_procs.any? { |check| check.call(url) }
                end
         | 
| 61 | 
            +
                
         | 
| 62 | 
            +
                # Registers a URL validator for this class.
                #
                # Examples:
                #   validate_url :domain=>"china.com",:match=>/suzhou/,:unmatch=>/beijing/
                #   validate_url :domain=>"google.com",:match=>/baidu/,:unmatch=>/yahoo/
                #
                # Options (each takes a single value or an array of values):
                #   :domain  - host suffix(es); the url host must end with one of them.
                #   :match   - patterns that must ALL match the url.
                #   :unmatch - patterns of which NONE may match the url.
                #   :example - sample url(s); handed to set_example_url and checked
                #              against the new validator right away.
                #
                # A pattern is a Regexp, a String (matched as a literal substring)
                # or a Proc (called with the url, truthy result means "matches").
                #
                # The block argument is accepted for compatibility but is not used.
                #
                # Raises ValidationError when an :example url is rejected by the
                # validator being registered. Returns the updated validate_url_procs.
                def validate_url(options={},&block)
                  options.assert_valid_keys :match,:unmatch,:example,:domain

                  # Normalise once instead of on every validator call.
                  domains   = [options[:domain]].flatten.compact.uniq
                  required  = [options[:match]].flatten.compact.uniq
                  forbidden = [options[:unmatch]].flatten.compact.uniq

                  # Strings are compared literally: the former
                  # `Regexp.escape(str) =~ url` produced String =~ String, which
                  # raises TypeError and made every String pattern reject all urls.
                  hit = lambda do |pattern, url|
                    case pattern
                    when Proc   then pattern.call(url)
                    when String then url.to_s.include?(pattern)
                    else pattern =~ url.to_s
                    end
                  end

                  checker = lambda do |url|
                    begin
                      # host is nil for relative urls; to_s turns that into a
                      # plain domain mismatch instead of a NoMethodError.
                      host = URI(url).host.to_s

                      unless domains.empty? || domains.any?{|d| host.end_with? d }
                        logger.debug " domain: #{host} not in #{domains.inspect} "
                        next false
                      end

                      unless required.all?{|pattern| hit.call(pattern, url) }
                        logger.debug "#{url} not match #{required.inspect}"
                        next false
                      end

                      if forbidden.any?{|pattern| hit.call(pattern, url) }
                        logger.debug "#{url} match #{forbidden.inspect}"
                        next false
                      end

                      true
                    rescue ValidationError=>e
                      # A user supplied Proc may signal rejection this way.
                      logger.debug e.message
                      false
                    rescue StandardError=>e
                      # Malformed urls and broken patterns count as "not valid".
                      # Signals and SystemExit are no longer swallowed.
                      logger.error e.message
                      logger.error e.backtrace.join("\n")
                      false
                    end
                  end

                  if options[:example]
                    set_example_url options[:example]
                    # Self check: every example must pass the validator just built.
                    [options[:example]].flatten.each do |url|
                      raise ValidationError, "#{url} is not a valid url for me." unless checker.call(url)
                    end
                  end

                  # Reassign rather than append in place (presumably so an array
                  # shared through class_attribute is not mutated - confirm).
                  self.validate_url_procs += [checker]
                end
         | 
| 121 | 
            +
             | 
| 122 | 
            +
              end
         | 
| 123 | 
            +
              
         | 
| 124 | 
            +
              # Hook invoked when this module is mixed into +base+.
              #
              # Declares the validate_url_procs class attribute on +base+, starts
              # it as an empty array, then mixes in InstanceMethods and extends
              # +base+ with ClassMethods.
              def self.included(base)
                base.class_attribute :validate_url_procs
                base.validate_url_procs = []

                base.send(:include, InstanceMethods)
                base.extend(ClassMethods)
              end
         | 
| 135 | 
            +
              
         | 
| 136 | 
            +
            end
         |