the_scrap 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
 - data/.gitignore +22 -0
 - data/Gemfile +4 -0
 - data/LICENSE.txt +22 -0
 - data/README.md +435 -0
 - data/Rakefile +2 -0
 - data/lib/the_scrap/detail_obj.rb +41 -0
 - data/lib/the_scrap/list_obj.rb +100 -0
 - data/lib/the_scrap/scrap.rb +100 -0
 - data/lib/the_scrap/version.rb +3 -0
 - data/lib/the_scrap.rb +5 -0
 - data/the_scrap.gemspec +24 -0
 - metadata +97 -0
 
    
        checksums.yaml
    ADDED
    
    | 
         @@ -0,0 +1,7 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            ---
         
     | 
| 
      
 2 
     | 
    
         
            +
            SHA1:
         
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 3b6dbb1e2bbe11284969c7a5bcc79e3bba665b96
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: acfc7d5ac75f238fc77c578b52a68954e71b64c7
         
     | 
| 
      
 5 
     | 
    
         
            +
            SHA512:
         
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: b14dd5d813c97c2a4c9f8c954d8d523004b3d3472e502c8e881301d4213d1539e417da1fb19ca0b864335356065e9b23d8ec8f546b4e464649f92a0814005ec9
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: c989f8ccca09cdef3f892ca78264f9c53c8ef52beaa0020e3482109625e92a14f6e3a49c0b64131cb7a174688ce4734272e0b8c59cd6052ee98e130e3c5fc4b0
         
     | 
    
        data/.gitignore
    ADDED
    
    | 
         @@ -0,0 +1,22 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            *.gem
         
     | 
| 
      
 2 
     | 
    
         
            +
            *.rbc
         
     | 
| 
      
 3 
     | 
    
         
            +
            .bundle
         
     | 
| 
      
 4 
     | 
    
         
            +
            .config
         
     | 
| 
      
 5 
     | 
    
         
            +
            .yardoc
         
     | 
| 
      
 6 
     | 
    
         
            +
            Gemfile.lock
         
     | 
| 
      
 7 
     | 
    
         
            +
            InstalledFiles
         
     | 
| 
      
 8 
     | 
    
         
            +
            _yardoc
         
     | 
| 
      
 9 
     | 
    
         
            +
            coverage
         
     | 
| 
      
 10 
     | 
    
         
            +
            doc/
         
     | 
| 
      
 11 
     | 
    
         
            +
            lib/bundler/man
         
     | 
| 
      
 12 
     | 
    
         
            +
            pkg
         
     | 
| 
      
 13 
     | 
    
         
            +
            rdoc
         
     | 
| 
      
 14 
     | 
    
         
            +
            spec/reports
         
     | 
| 
      
 15 
     | 
    
         
            +
            test/tmp
         
     | 
| 
      
 16 
     | 
    
         
            +
            test/version_tmp
         
     | 
| 
      
 17 
     | 
    
         
            +
            tmp
         
     | 
| 
      
 18 
     | 
    
         
            +
            *.bundle
         
     | 
| 
      
 19 
     | 
    
         
            +
            *.so
         
     | 
| 
      
 20 
     | 
    
         
            +
            *.o
         
     | 
| 
      
 21 
     | 
    
         
            +
            *.a
         
     | 
| 
      
 22 
     | 
    
         
            +
            mkmf.log
         
     | 
    
        data/Gemfile
    ADDED
    
    
    
        data/LICENSE.txt
    ADDED
    
    | 
         @@ -0,0 +1,22 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            Copyright (c) 2014 H.J.LeoChen
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            MIT License
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            Permission is hereby granted, free of charge, to any person obtaining
         
     | 
| 
      
 6 
     | 
    
         
            +
            a copy of this software and associated documentation files (the
         
     | 
| 
      
 7 
     | 
    
         
            +
            "Software"), to deal in the Software without restriction, including
         
     | 
| 
      
 8 
     | 
    
         
            +
            without limitation the rights to use, copy, modify, merge, publish,
         
     | 
| 
      
 9 
     | 
    
         
            +
            distribute, sublicense, and/or sell copies of the Software, and to
         
     | 
| 
      
 10 
     | 
    
         
            +
            permit persons to whom the Software is furnished to do so, subject to
         
     | 
| 
      
 11 
     | 
    
         
            +
            the following conditions:
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
            The above copyright notice and this permission notice shall be
         
     | 
| 
      
 14 
     | 
    
         
            +
            included in all copies or substantial portions of the Software.
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
            THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
         
     | 
| 
      
 17 
     | 
    
         
            +
            EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
         
     | 
| 
      
 18 
     | 
    
         
            +
            MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
         
     | 
| 
      
 19 
     | 
    
         
            +
            NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
         
     | 
| 
      
 20 
     | 
    
         
            +
            LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
         
     | 
| 
      
 21 
     | 
    
         
            +
            OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
         
     | 
| 
      
 22 
     | 
    
         
            +
            WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
         
     | 
    
        data/README.md
    ADDED
    
    | 
         @@ -0,0 +1,435 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            ## The Scrap
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            The Scrap 是一个基于Nokogiri的网页数据抓取的框架
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            目标是使用简单、高效、高自定义、高适配性。
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
            ## Installation
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
            Add this line to your application's Gemfile:
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
                gem 'the_scrap'
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
            And then execute:
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
                $ bundle    
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
      
 17 
     | 
    
         
            +
            Or install it yourself as:
         
     | 
| 
      
 18 
     | 
    
         
            +
             
     | 
| 
      
 19 
     | 
    
         
            +
                $ gem install the_scrap
         
     | 
| 
      
 20 
     | 
    
         
            +
             
     | 
| 
      
 21 
     | 
    
         
            +
            ## Usage
         
     | 
| 
      
 22 
     | 
    
         
            +
            ### 0. 全景
         
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
             
     | 
| 
      
 25 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 26 
     | 
    
         
            +
             
     | 
| 
      
 27 
     | 
    
         
            +
            # encoding: utf-8
         
     | 
| 
      
 28 
     | 
    
         
            +
            require 'rubygems'
         
     | 
| 
      
 29 
     | 
    
         
            +
            require 'the_scrap'
         
     | 
| 
      
 30 
     | 
    
         
            +
            require 'pp'
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
            #create Object
         
     | 
| 
      
 33 
     | 
    
         
            +
            scrap = TheScrap::ListObj.new
         
     | 
| 
      
 34 
     | 
    
         
            +
             
     | 
| 
      
 35 
     | 
    
         
            +
            #set start url
         
     | 
| 
      
 36 
     | 
    
         
            +
            scrap.url = "http://fz.ganji.com/shouji/"
         
     | 
| 
      
 37 
     | 
    
         
            +
             
     | 
| 
      
 38 
     | 
    
         
            +
            #fragment css selector
         
     | 
| 
      
 39 
     | 
    
         
            +
            #表示,表格的每一行,或者列表的每个元素
         
     | 
| 
      
 40 
     | 
    
         
            +
            #这个行或者元素里面应该包含这条记录的详细信息
         
     | 
| 
      
 41 
     | 
    
         
            +
            #详细信息通过attr列表来获取。
         
     | 
| 
      
 42 
     | 
    
         
            +
            scrap.item_frag = ".layoutlist .list-bigpic"
         
     | 
| 
      
 43 
     | 
    
         
            +
             
     | 
| 
      
 44 
     | 
    
         
            +
            #scrap attr list
         
     | 
| 
      
 45 
     | 
    
         
            +
            scrap.attr_name = ['.ft-tit',:inner_html]
         
     | 
| 
      
 46 
     | 
    
         
            +
            scrap.attr_detail_url = ['.ft-tit',:href]
         
     | 
| 
      
 47 
     | 
    
         
            +
            scrap.attr_img = ['dt a img',:src]
         
     | 
| 
      
 48 
     | 
    
         
            +
            scrap.attr_desc = '.feature p'
         
     | 
| 
      
 49 
     | 
    
         
            +
            scrap.attr_price = '.fc-org'
         
     | 
| 
      
 50 
     | 
    
         
            +
             
     | 
| 
      
 51 
     | 
    
         
            +
            #debug
         
     | 
| 
      
 52 
     | 
    
         
            +
            scrap.debug = true
         
     | 
| 
      
 53 
     | 
    
         
            +
            scrap.verbose = true
         
     | 
| 
      
 54 
     | 
    
         
            +
             
     | 
| 
      
 55 
     | 
    
         
            +
             
     | 
| 
      
 56 
     | 
    
         
            +
            #html preprocess
         
     | 
| 
      
 57 
     | 
    
         
            +
            scrap.html_proc << lambda { |html|
         
     | 
| 
      
 58 
     | 
    
         
            +
              #html.gsub(/abcd/,'efgh')
         
     | 
| 
      
 59 
     | 
    
         
            +
            }
         
     | 
| 
      
 60 
     | 
    
         
            +
             
     | 
| 
      
 61 
     | 
    
         
            +
            #filter scraped item
         
     | 
| 
      
 62 
     | 
    
         
            +
            scrap.item_filters << lambda { |item_info| 
         
     | 
| 
      
 63 
     | 
    
         
            +
              return false if item_info['name'].nil? || item_info['name'].length == 0
         
     | 
| 
      
 64 
     | 
    
         
            +
              return true
         
     | 
| 
      
 65 
     | 
    
         
            +
            }
         
     | 
| 
      
 66 
     | 
    
         
            +
             
     | 
| 
      
 67 
     | 
    
         
            +
            #data process
         
     | 
| 
      
 68 
     | 
    
         
            +
            scrap.data_proc << lambda {|url,i|
         
     | 
| 
      
 69 
     | 
    
         
            +
              i['name'] = i['name'].strip
         
     | 
| 
      
 70 
     | 
    
         
            +
            }
         
     | 
| 
      
 71 
     | 
    
         
            +
             
     | 
| 
      
 72 
     | 
    
         
            +
            #result process
         
     | 
| 
      
 73 
     | 
    
         
            +
            scrap.result_proc << lambda {|url,items|
         
     | 
| 
      
 74 
     | 
    
         
            +
              items.each do |item| 
         
     | 
| 
      
 75 
     | 
    
         
            +
                pp item
         
     | 
| 
      
 76 
     | 
    
         
            +
              end
         
     | 
| 
      
 77 
     | 
    
         
            +
            }
         
     | 
| 
      
 78 
     | 
    
         
            +
             
     | 
| 
      
 79 
     | 
    
         
            +
            ##### 此处可以添加 多页分页 抓取功能 参见 2
         
     | 
| 
      
 80 
     | 
    
         
            +
             
     | 
| 
      
 81 
     | 
    
         
            +
            ##### 此处可以添加 详细信息页面 抓取功能 参见 3
         
     | 
| 
      
 82 
     | 
    
         
            +
             
     | 
| 
      
 83 
     | 
    
         
            +
            #scrap
         
     | 
| 
      
 84 
     | 
    
         
            +
            scrap.scrap_list
         
     | 
| 
      
 85 
     | 
    
         
            +
             
     | 
| 
      
 86 
     | 
    
         
            +
            ```
         
     | 
| 
      
 87 
     | 
    
         
            +
             
     | 
| 
      
 88 
     | 
    
         
            +
            ### 1. 列表抓取
         
     | 
| 
      
 89 
     | 
    
         
            +
             
     | 
| 
      
 90 
     | 
    
         
            +
            参考上一节
         
     | 
| 
      
 91 
     | 
    
         
            +
             
     | 
| 
      
 92 
     | 
    
         
            +
            ### 2. 多页列表抓取
         
     | 
| 
      
 93 
     | 
    
         
            +
             
     | 
| 
      
 94 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 95 
     | 
    
         
            +
             
     | 
| 
      
 96 
     | 
    
         
            +
            #create ListObj
         
     | 
| 
      
 97 
     | 
    
         
            +
             
     | 
| 
      
 98 
     | 
    
         
            +
            #...
         
     | 
| 
      
 99 
     | 
    
         
            +
             
     | 
| 
      
 100 
     | 
    
         
            +
            ########### has many pages ###########
         
     | 
| 
      
 101 
     | 
    
         
            +
            #如果设置了可以根据不同的分页方式抓取多页列表
         
     | 
| 
      
 102 
     | 
    
         
            +
             
     | 
| 
      
 103 
     | 
    
         
            +
            scrap.has_many_pages = true
         
     | 
| 
      
 104 
     | 
    
         
            +
             
     | 
| 
      
 105 
     | 
    
         
            +
            #next page link
         
     | 
| 
      
 106 
     | 
    
         
            +
             
     | 
| 
      
 107 
     | 
    
         
            +
            # [:next_page, :total_pages, :total_records]
         
     | 
| 
      
 108 
     | 
    
         
            +
             
     | 
| 
      
 109 
     | 
    
         
            +
             
     | 
| 
      
 110 
     | 
    
         
            +
            #:next_page
         
     | 
| 
      
 111 
     | 
    
         
            +
            scrap.page_method = :next_page
         
     | 
| 
      
 112 
     | 
    
         
            +
            scrap.next_page_css = ".next_page a"
         
     | 
| 
      
 113 
     | 
    
         
            +
             
     | 
| 
      
 114 
     | 
    
         
            +
             
     | 
| 
      
 115 
     | 
    
         
            +
            #:total_pages
         
     | 
| 
      
 116 
     | 
    
         
            +
            scrap.page_method = :total_pages
         
     | 
| 
      
 117 
     | 
    
         
            +
            scrap.get_page_count = lambda { |doc|
         
     | 
| 
      
 118 
     | 
    
         
            +
              if doc.css('.total_page').text =~ /(\d+)页/
         
     | 
| 
      
 119 
     | 
    
         
            +
                $~[1].to_i
         
     | 
| 
      
 120 
     | 
    
         
            +
              else
         
     | 
| 
      
 121 
     | 
    
         
            +
                0
         
     | 
| 
      
 122 
     | 
    
         
            +
              end
         
     | 
| 
      
 123 
     | 
    
         
            +
            }
         
     | 
| 
      
 124 
     | 
    
         
            +
             
     | 
| 
      
 125 
     | 
    
         
            +
            scrap.get_next_url = lambda { |url,next_page_number|
         
     | 
| 
      
 126 
     | 
    
         
            +
              #url is  http://fz.ganji.com/shouji/
         
     | 
| 
      
 127 
     | 
    
         
            +
              #page url pattern http://fz.ganji.com/shouji/o#{page_number}/
         
     | 
| 
      
 128 
     | 
    
         
            +
              url += "o#{next_page_number}/"
         
     | 
| 
      
 129 
     | 
    
         
            +
            }
         
     | 
| 
      
 130 
     | 
    
         
            +
             
     | 
| 
      
 131 
     | 
    
         
            +
            #**total_record in progress
         
     | 
| 
      
 132 
     | 
    
         
            +
            scrap.page_method = :total_records
         
     | 
| 
      
 133 
     | 
    
         
            +
            #...
         
     | 
| 
      
 134 
     | 
    
         
            +
             
     | 
| 
      
 135 
     | 
    
         
            +
            scrap.scrap_list
         
     | 
| 
      
 136 
     | 
    
         
            +
             
     | 
| 
      
 137 
     | 
    
         
            +
            ```
         
     | 
| 
      
 138 
     | 
    
         
            +
             
     | 
| 
      
 139 
     | 
    
         
            +
            ### 3. 带详细页面信息提取
         
     | 
| 
      
 140 
     | 
    
         
            +
             
     | 
| 
      
 141 
     | 
    
         
            +
            **如果DetailObj不是单独运行而是在ListObj中运行,抓取的信息将合并到ListObj的结果中去**
         
     | 
| 
      
 142 
     | 
    
         
            +
             
     | 
| 
      
 143 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 144 
     | 
    
         
            +
             
     | 
| 
      
 145 
     | 
    
         
            +
            #create ListObj
         
     | 
| 
      
 146 
     | 
    
         
            +
             
     | 
| 
      
 147 
     | 
    
         
            +
            #extract detail page url
         
     | 
| 
      
 148 
     | 
    
         
            +
            scrap.attr_detail_url = [".list a",:href]
         
     | 
| 
      
 149 
     | 
    
         
            +
             
     | 
| 
      
 150 
     | 
    
         
            +
            #...
         
     | 
| 
      
 151 
     | 
    
         
            +
             
     | 
| 
      
 152 
     | 
    
         
            +
            ################# has detail page ################
         
     | 
| 
      
 153 
     | 
    
         
            +
            #如果设置了可以根据之前抓取的详细页面URL获取详细页面信息
         
     | 
| 
      
 154 
     | 
    
         
            +
             
     | 
| 
      
 155 
     | 
    
         
            +
            #1. define a detail object
         
     | 
| 
      
 156 
     | 
    
         
            +
            scrap_detail = TheScrap::DetailObj.new
         
     | 
| 
      
 157 
     | 
    
         
            +
            scrap_detail.attr_title = ".Tbox h3"
         
     | 
| 
      
 158 
     | 
    
         
            +
            scrap_detail.attr_detail = ".Tbox .newsatr"
         
     | 
| 
      
 159 
     | 
    
         
            +
            scrap_detail.attr_content = [".Tbox .view",:inner_html]
         
     | 
| 
      
 160 
     | 
    
         
            +
             
     | 
| 
      
 161 
     | 
    
         
            +
             
     | 
| 
      
 162 
     | 
    
         
            +
            #optional html preprocess
         
     | 
| 
      
 163 
     | 
    
         
            +
            scrap_detail.html_proc << lambda{ |response|
         
     | 
| 
      
 164 
     | 
    
         
            +
            }
         
     | 
| 
      
 165 
     | 
    
         
            +
             
     | 
| 
      
 166 
     | 
    
         
            +
            #optional data process
         
     | 
| 
      
 167 
     | 
    
         
            +
            scrap_detail.data_proc << lambda {|url,i|
         
     | 
| 
      
 168 
     | 
    
         
            +
            }
         
     | 
| 
      
 169 
     | 
    
         
            +
             
     | 
| 
      
 170 
     | 
    
         
            +
            #optional result process
         
     | 
| 
      
 171 
     | 
    
         
            +
            #此处可选,抓取的信息将合并到列表页面抓取的记录中去,也可以单独入库了。
         
     | 
| 
      
 172 
     | 
    
         
            +
            scrap_detail.result_proc << lambda {|url,items|
         
     | 
| 
      
 173 
     | 
    
         
            +
            }
         
     | 
| 
      
 174 
     | 
    
         
            +
             
     | 
| 
      
 175 
     | 
    
         
            +
            #get url from list attr and extract data by scrap_detail
         
     | 
| 
      
 176 
     | 
    
         
            +
            scrap.detail_info << [scrap_detail,'detail_url']
         
     | 
| 
      
 177 
     | 
    
         
            +
             
     | 
| 
      
 178 
     | 
    
         
            +
            #scrap.detail_info << [scrap_detail_1,'detail_url_1']
         
     | 
| 
      
 179 
     | 
    
         
            +
             
     | 
| 
      
 180 
     | 
    
         
            +
            #...
         
     | 
| 
      
 181 
     | 
    
         
            +
             
     | 
| 
      
 182 
     | 
    
         
            +
            scrap.scrap_list
         
     | 
| 
      
 183 
     | 
    
         
            +
             
     | 
| 
      
 184 
     | 
    
         
            +
            ```
         
     | 
| 
      
 185 
     | 
    
         
            +
             
     | 
| 
      
 186 
     | 
    
         
            +
             
     | 
| 
      
 187 
     | 
    
         
            +
            ### 4. 元素属性说明
         
     | 
| 
      
 188 
     | 
    
         
            +
             
     | 
| 
      
 189 
     | 
    
         
            +
            元素属性使用 **scrap.attr_#{元素名称} = 规则** 来表示
         
     | 
| 
      
 190 
     | 
    
         
            +
             
     | 
| 
      
 191 
     | 
    
         
            +
            **抓取后将全部放到一个Hash中,其中“元素名称”为Hash的Key,获取的数据为Hash的值**
         
     | 
| 
      
 192 
     | 
    
         
            +
             
     | 
| 
      
 193 
     | 
    
         
            +
            如
         
     | 
| 
      
 194 
     | 
    
         
            +
             
     | 
| 
      
 195 
     | 
    
         
            +
            	scrap.attr_name = ".title"
         
     | 
| 
      
 196 
     | 
    
         
            +
             
     | 
| 
      
 197 
     | 
    
         
            +
            则结果item['name'] = ".title 对应的节点内容"
         
     | 
| 
      
 198 
     | 
    
         
            +
             
     | 
| 
      
 199 
     | 
    
         
            +
            其中规则可以使用多种方式表示
         
     | 
| 
      
 200 
     | 
    
         
            +
             
     | 
| 
      
 201 
     | 
    
         
            +
            #### 4.1 直接使用CSS Selector
         
     | 
| 
      
 202 
     | 
    
         
            +
            	
         
     | 
| 
      
 203 
     | 
    
         
            +
            直接使用CSS Selector的情况下,则取得CSS节点对应的 文本内容(inner_text)
         
     | 
| 
      
 204 
     | 
    
         
            +
             
     | 
| 
      
 205 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 206 
     | 
    
         
            +
            @book_info.attr_author = "#divBookInfo .title a"
         
     | 
| 
      
 207 
     | 
    
         
            +
            ```
         
     | 
| 
      
 208 
     | 
    
         
            +
             
     | 
| 
      
 209 
     | 
    
         
            +
            #### 4.2 一个数组
         
     | 
| 
      
 210 
     | 
    
         
            +
             
     | 
| 
      
 211 
     | 
    
         
            +
            scrap.attr_name = [css_selector,attrs]
         
     | 
| 
      
 212 
     | 
    
         
            +
             
     | 
| 
      
 213 
     | 
    
         
            +
            其中数组的第一个元素为: css_selector
         
     | 
| 
      
 214 
     | 
    
         
            +
             
     | 
| 
      
 215 
     | 
    
         
            +
            第二个元素可选值为:
         
     | 
| 
      
 216 
     | 
    
         
            +
             
     | 
| 
      
 217 
     | 
    
         
            +
            **:frag_attr**
         
     | 
| 
      
 218 
     | 
    
         
            +
             
     | 
| 
      
 219 
     | 
    
         
            +
            直接取Fragment的属性,如list的属性,因为在实际使用过程中遇到过需要取列表或表格行的某个属性的情况。
         
     | 
| 
      
 220 
     | 
    
         
            +
             
     | 
| 
      
 221 
     | 
    
         
            +
            scrap.attr_name = [:frag_attr,'href']
         
     | 
| 
      
 222 
     | 
    
         
            +
             
     | 
| 
      
 223 
     | 
    
         
            +
            数组第一个元素为frag_attr而非css selector因为css selector 已经在 scrap.item_frag 中指定,此为特例仅此一处出现此用法。
         
     | 
| 
      
 224 
     | 
    
         
            +
             
     | 
| 
      
 225 
     | 
    
         
            +
            **:inner_html**
         
     | 
| 
      
 226 
     | 
    
         
            +
             
     | 
| 
      
 227 
     | 
    
         
            +
            取节点内的html
         
     | 
| 
      
 228 
     | 
    
         
            +
             
     | 
| 
      
 229 
     | 
    
         
            +
            **:join**
         
     | 
| 
      
 230 
     | 
    
         
            +
             
     | 
| 
      
 231 
     | 
    
         
            +
            遇到某个list时,需要把里面的元素全部获取并使用逗号分隔。如:tags
         
     | 
| 
      
 232 
     | 
    
         
            +
             
     | 
| 
      
 233 
     | 
    
         
            +
            ```html
         
     | 
| 
      
 234 
     | 
    
         
            +
            <ul class="tags">
         
     | 
| 
      
 235 
     | 
    
         
            +
            <li>ruby</li>
         
     | 
| 
      
 236 
     | 
    
         
            +
            <li>rails</li>
         
     | 
| 
      
 237 
     | 
    
         
            +
            <li>activerecord</li>
         
     | 
| 
      
 238 
     | 
    
         
            +
            </ul>
         
     | 
| 
      
 239 
     | 
    
         
            +
            ```
         
     | 
| 
      
 240 
     | 
    
         
            +
             
     | 
| 
      
 241 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 242 
     | 
    
         
            +
            scrap.attr_name = ['.tags', :join]
         
     | 
| 
      
 243 
     | 
    
         
            +
            ```
         
     | 
| 
      
 244 
     | 
    
         
            +
             
     | 
| 
      
 245 
     | 
    
         
            +
            使用上述取得一个字符串:
         
     | 
| 
      
 246 
     | 
    
         
            +
             
     | 
| 
      
 247 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 248 
     | 
    
         
            +
            "ruby,rails,activerecord"
         
     | 
| 
      
 249 
     | 
    
         
            +
            ```
         
     | 
| 
      
 250 
     | 
    
         
            +
             
     | 
| 
      
 251 
     | 
    
         
            +
            **:array**
         
     | 
| 
      
 252 
     | 
    
         
            +
             
     | 
| 
      
 253 
     | 
    
         
            +
            遇到某个list时,需要把里面的元素全部获取并返回一个Array
         
     | 
| 
      
 254 
     | 
    
         
            +
             
     | 
| 
      
 255 
     | 
    
         
            +
            ```html
         
     | 
| 
      
 256 
     | 
    
         
            +
            <ul class="tags">
         
     | 
| 
      
 257 
     | 
    
         
            +
            <li>ruby</li>
         
     | 
| 
      
 258 
     | 
    
         
            +
            <li>rails</li>
         
     | 
| 
      
 259 
     | 
    
         
            +
            <li>activerecord</li>
         
     | 
| 
      
 260 
     | 
    
         
            +
            </ul>
         
     | 
| 
      
 261 
     | 
    
         
            +
            ```
         
     | 
| 
      
 262 
     | 
    
         
            +
             
     | 
| 
      
 263 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 264 
     | 
    
         
            +
            scrap.attr_name = ['.tags', :array]
         
     | 
| 
      
 265 
     | 
    
         
            +
            ```
         
     | 
| 
      
 266 
     | 
    
         
            +
             
     | 
| 
      
 267 
     | 
    
         
            +
            使用上述配置取得一个数组:
         
     | 
| 
      
 268 
     | 
    
         
            +
             
     | 
| 
      
 269 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 270 
     | 
    
         
            +
            ['ruby','rails','activerecord']
         
     | 
| 
      
 271 
     | 
    
         
            +
            ```
         
     | 
| 
      
 272 
     | 
    
         
            +
             
     | 
| 
      
 273 
     | 
    
         
            +
            **:src**
         
     | 
| 
      
 274 
     | 
    
         
            +
             
     | 
| 
      
 275 
     | 
    
         
            +
            取得图片的SRC属性,并且使用URI.join(current_page_url,src_value)
         
     | 
| 
      
 276 
     | 
    
         
            +
             
     | 
| 
      
 277 
     | 
    
         
            +
            **:href**
         
     | 
| 
      
 278 
     | 
    
         
            +
             
     | 
| 
      
 279 
     | 
    
         
            +
            取得链接的href属性,并且使用URI.join(current_page_url,href_value)
         
     | 
| 
      
 280 
     | 
    
         
            +
             
     | 
| 
      
 281 
     | 
    
         
            +
            **"else"**
         
     | 
| 
      
 282 
     | 
    
         
            +
             
     | 
| 
      
 283 
     | 
    
         
            +
            直接获取元素的该属性值,不做任何其他处理。
         
     | 
| 
      
 284 
     | 
    
         
            +
             
     | 
| 
      
 285 
     | 
    
         
            +
             
     | 
| 
      
 286 
     | 
    
         
            +
            **实例**
         
     | 
| 
      
 287 
     | 
    
         
            +
             
     | 
| 
      
 288 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 289 
     | 
    
         
            +
            @book_info = TheScrap::DetailObj.new
         
     | 
| 
      
 290 
     | 
    
         
            +
            @book_info.attr_name = "#divBookInfo .title h1"
         
     | 
| 
      
 291 
     | 
    
         
            +
            @book_info.attr_author = "#divBookInfo .title a"
         
     | 
| 
      
 292 
     | 
    
         
            +
            @book_info.attr_desc = [".intro .txt",:inner_html]
         
     | 
| 
      
 293 
     | 
    
         
            +
            @book_info.attr_pic_url = ['.pic_box a img',:src]
         
     | 
| 
      
 294 
     | 
    
         
            +
            @book_info.attr_chapters_url = ['.book_pic .opt li[1] a',:href]
         
     | 
| 
      
 295 
     | 
    
         
            +
            @book_info.attr_book_info = ".info_box table tr"
         
     | 
| 
      
 296 
     | 
    
         
            +
            @book_info.attr_cat_1 = '.box_title .page_site a[2]'
         
     | 
| 
      
 297 
     | 
    
         
            +
            @book_info.attr_tags = ['.book_info .other .labels .box[1] a',:array]
         
     | 
| 
      
 298 
     | 
    
         
            +
            @book_info.attr_user_tags = ['.book_info .other .labels .box[2] a',:join]
         
     | 
| 
      
 299 
     | 
    
         
            +
            @book_info.attr_rate = '#bzhjshu'
         
     | 
| 
      
 300 
     | 
    
         
            +
            @book_info.attr_rate_cnt = ["#div_pingjiarenshu",'title']
         
     | 
| 
      
 301 
     | 
    
         
            +
            @book_info.attr_last_updated_at ="#divBookInfo .tabs .right"
         
     | 
| 
      
 302 
     | 
    
         
            +
            @book_info.attr_last_chapter = '.updata_cont .title a' 
         
     | 
| 
      
 303 
     | 
    
         
            +
            @book_info.attr_last_chapter_desc = ['.updata_cont .cont a',:inner_html]
         
     | 
| 
      
 304 
     | 
    
         
            +
            ```
         
     | 
| 
      
 305 
     | 
    
         
            +
             
     | 
| 
      
 306 
     | 
    
         
            +
            ### 5. 分页模式
         
     | 
| 
      
 307 
     | 
    
         
            +
             
     | 
| 
      
 308 
     | 
    
         
            +
            参考 2. 多页列表抓取
         
     | 
| 
      
 309 
     | 
    
         
            +
             
     | 
| 
      
 310 
     | 
    
         
            +
            ### 6. 获取的记录处理方法
         
     | 
| 
      
 311 
     | 
    
         
            +
             
     | 
| 
      
 312 
     | 
    
         
            +
            可以对获取的结果进行处理后再执行入库操作:
         
     | 
| 
      
 313 
     | 
    
         
            +
             
     | 
| 
      
 314 
     | 
    
         
            +
            简单举例:
         
     | 
| 
      
 315 
     | 
    
         
            +
             
     | 
| 
      
 316 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 317 
     | 
    
         
            +
            baidu.data_proc << lambda {|url,i|
         
     | 
| 
      
 318 
     | 
    
         
            +
              i['title'] = i['title'].strip
         
     | 
| 
      
 319 
     | 
    
         
            +
              if i['ori_url'] =~ /view.aspx\?id=(\d+)/
         
     | 
| 
      
 320 
     | 
    
         
            +
                i['ori_id'] = $~[1].to_i
         
     | 
| 
      
 321 
     | 
    
         
            +
              end
         
     | 
| 
      
 322 
     | 
    
         
            +
             
     | 
| 
      
 323 
     | 
    
         
            +
              if i['detail'] =~ /发布时间:(.*?) /
         
     | 
| 
      
 324 
     | 
    
         
            +
                i['updated_at'] = i['created_at'] = $~[1]
         
     | 
| 
      
 325 
     | 
    
         
            +
              end
         
     | 
| 
      
 326 
     | 
    
         
            +
             
     | 
| 
      
 327 
     | 
    
         
            +
              if i['detail'] =~ /来源:(.*?)作者:/
         
     | 
| 
      
 328 
     | 
    
         
            +
                i['description'] = $~[1].strip
         
     | 
| 
      
 329 
     | 
    
         
            +
              end
         
     | 
| 
      
 330 
     | 
    
         
            +
             
     | 
| 
      
 331 
     | 
    
         
            +
              i.delete('detail')
         
     | 
| 
      
 332 
     | 
    
         
            +
              
         
     | 
| 
      
 333 
     | 
    
         
            +
              i['content'].gsub!(/<script type="text\/javascript">.*?<\/script>/m,'');
         
     | 
| 
      
 334 
     | 
    
         
            +
              i['content'].gsub!(/<style>.*?<\/style>/m,'');
         
     | 
| 
      
 335 
     | 
    
         
            +
              i['content'].gsub!(/<img class="img_(sina|qq)_share".*?>/m,'');
         
     | 
| 
      
 336 
     | 
    
         
            +
              if i['content'] =~ /image=(.*?)"/
         
     | 
| 
      
 337 
     | 
    
         
            +
                #i['image'] = open($~[1]) if $~[1].length > 0
         
     | 
| 
      
 338 
     | 
    
         
            +
              end
         
     | 
| 
      
 339 
     | 
    
         
            +
             
     | 
| 
      
 340 
     | 
    
         
            +
              i['site_id'] = @site_id
         
     | 
| 
      
 341 
     | 
    
         
            +
              i['cat_id'] = @cat_id
         
     | 
| 
      
 342 
     | 
    
         
            +
             
     | 
| 
      
 343 
     | 
    
         
            +
              time = Time.parse(i['updated_at'])
         
     | 
| 
      
 344 
     | 
    
         
            +
              prep = '['+time.strftime('%y%m%d')+']'
         
     | 
| 
      
 345 
     | 
    
         
            +
            }
         
     | 
| 
      
 346 
     | 
    
         
            +
            ```
         
     | 
| 
      
 347 
     | 
    
         
            +
             
     | 
| 
      
 348 
     | 
    
         
            +
            ### 7. 结果处理
         
     | 
| 
      
 349 
     | 
    
         
            +
             
     | 
| 
      
 350 
     | 
    
         
            +
            #### mysql
         
     | 
| 
      
 351 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 352 
     | 
    
         
            +
            require 'active_record'
         
     | 
| 
      
 353 
     | 
    
         
            +
            require 'mysql2'
         
     | 
| 
      
 354 
     | 
    
         
            +
            require 'activerecord-import' #recommend
         
     | 
| 
      
 355 
     | 
    
         
            +
             
     | 
| 
      
 356 
     | 
    
         
            +
             
     | 
| 
      
 357 
     | 
    
         
            +
            ActiveRecord::Base.establish_connection( :adapter => "mysql2",  :host => "localhost",
         
     | 
| 
      
 358 
     | 
    
         
            +
             :database => "test", :username => "test", :password => ""  )
         
     | 
| 
      
 359 
     | 
    
         
            +
             
     | 
| 
      
 360 
     | 
    
         
            +
            ActiveRecord::Base.record_timestamps = false
         
     | 
| 
      
 361 
     | 
    
         
            +
            class Article < ActiveRecord::Base
         
     | 
| 
      
 362 
     | 
    
         
            +
              validates :ori_id, :uniqueness => true
         
     | 
| 
      
 363 
     | 
    
         
            +
            end
         
     | 
| 
      
 364 
     | 
    
         
            +
             
     | 
| 
      
 365 
     | 
    
         
            +
            # OR load Rails env!
         
     | 
| 
      
 366 
     | 
    
         
            +
             
     | 
| 
      
 367 
     | 
    
         
            +
            scrap.result_proc << lambda {|url,items|
         
     | 
| 
      
 368 
     | 
    
         
            +
              articles = []
         
     | 
| 
      
 369 
     | 
    
         
            +
              items.each do |item| 
         
     | 
| 
      
 370 
     | 
    
         
            +
            		#item[:user_id] = 1
         
     | 
| 
      
 371 
     | 
    
         
            +
            		articles << Article.new(item)
         
     | 
| 
      
 372 
     | 
    
         
            +
            	end
         
     | 
| 
      
 373 
     | 
    
         
            +
              Article.import articles
         
     | 
| 
      
 374 
     | 
    
         
            +
            }
         
     | 
| 
      
 375 
     | 
    
         
            +
            ```
         
     | 
| 
      
 376 
     | 
    
         
            +
            #### mongodb
         
     | 
| 
      
 377 
     | 
    
         
            +
             
     | 
| 
      
 378 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 379 
     | 
    
         
            +
            require 'mongoid'
         
     | 
| 
      
 380 
     | 
    
         
            +
             
     | 
| 
      
 381 
     | 
    
         
            +
            Mongoid.load!("./mongoid.yml", :production)
         
     | 
| 
      
 382 
     | 
    
         
            +
            Mongoid.allow_dynamic_fields = true
         
     | 
| 
      
 383 
     | 
    
         
            +
             
     | 
| 
      
 384 
     | 
    
         
            +
            class Article
         
     | 
| 
      
 385 
     | 
    
         
            +
              include Mongoid::Document
         
     | 
| 
      
 386 
     | 
    
         
            +
            	#....
         
     | 
| 
      
 387 
     | 
    
         
            +
            end
         
     | 
| 
      
 388 
     | 
    
         
            +
             
     | 
| 
      
 389 
     | 
    
         
            +
            # OR load Rails env!
         
     | 
| 
      
 390 
     | 
    
         
            +
             
     | 
| 
      
 391 
     | 
    
         
            +
            scrap.result_proc << lambda {|url,items|
         
     | 
| 
      
 392 
     | 
    
         
            +
              items.each do |item| 
         
     | 
| 
      
 393 
     | 
    
         
            +
            		#item[:user_id] = 1
         
     | 
| 
      
 394 
     | 
    
         
            +
            		Article.create(item)
         
     | 
| 
      
 395 
     | 
    
         
            +
            	end
         
     | 
| 
      
 396 
     | 
    
         
            +
            }
         
     | 
| 
      
 397 
     | 
    
         
            +
            ```
         
     | 
| 
      
 398 
     | 
    
         
            +
             
     | 
| 
      
 399 
     | 
    
         
            +
            #### json,xml...
         
     | 
| 
      
 400 
     | 
    
         
            +
             
     | 
| 
      
 401 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 402 
     | 
    
         
            +
            #json
         
     | 
| 
      
 403 
     | 
    
         
            +
            scrap.result_proc << lambda {|url,items|
         
     | 
| 
      
 404 
     | 
    
         
            +
            	File.open("xxx.json",'w').write(items.to_json)
         
     | 
| 
      
 405 
     | 
    
         
            +
            }
         
     | 
| 
      
 406 
     | 
    
         
            +
             
     | 
| 
      
 407 
     | 
    
         
            +
            #xml
         
     | 
| 
      
 408 
     | 
    
         
            +
            scrap.result_proc << lambda {|url,items|
         
     | 
| 
      
 409 
     | 
    
         
            +
            	articles = []
         
     | 
| 
      
 410 
     | 
    
         
            +
              items.each do |item| 
         
     | 
| 
      
 411 
     | 
    
         
            +
            		articles << item.to_xml
         
     | 
| 
      
 412 
     | 
    
         
            +
            	end
         
     | 
| 
      
 413 
     | 
    
         
            +
            	file  = File.open("xxx.xml",'w')
         
     | 
| 
      
 414 
     | 
    
         
            +
            	file.write('<articles>')
         
     | 
| 
      
 415 
     | 
    
         
            +
            	file.write(articles.join(''))
         
     | 
| 
      
 416 
     | 
    
         
            +
            	file.write('</articles>')
         
     | 
| 
      
 417 
     | 
    
         
            +
            	file.close
         
     | 
| 
      
 418 
     | 
    
         
            +
            }
         
     | 
| 
      
 419 
     | 
    
         
            +
            ```
         
     | 
| 
      
 420 
     | 
    
         
            +
             
     | 
| 
      
 421 
     | 
    
         
            +
            ## TODO
         
     | 
| 
      
 422 
     | 
    
         
            +
             
     | 
| 
      
 423 
     | 
    
         
            +
            1. 多线程抓取
         
     | 
| 
      
 424 
     | 
    
         
            +
            2. 线程管理
         
     | 
| 
      
 425 
     | 
    
         
            +
            3. 完善文档
         
     | 
| 
      
 426 
     | 
    
         
            +
             
     | 
| 
      
 427 
     | 
    
         
            +
             
     | 
| 
      
 428 
     | 
    
         
            +
            ## Contributing
         
     | 
| 
      
 429 
     | 
    
         
            +
             
     | 
| 
      
 430 
     | 
    
         
            +
            1. Fork it ( https://github.com/[my-github-username]/thescrap/fork )
         
     | 
| 
      
 431 
     | 
    
         
            +
            2. Create your feature branch (`git checkout -b my-new-feature`)
         
     | 
| 
      
 432 
     | 
    
         
            +
            3. Commit your changes (`git commit -am 'Add some feature'`)
         
     | 
| 
      
 433 
     | 
    
         
            +
            4. Push to the branch (`git push origin my-new-feature`)
         
     | 
| 
      
 434 
     | 
    
         
            +
            5. Create a new Pull Request
         
     | 
| 
      
 435 
     | 
    
         
            +
             
     | 
    
        data/Rakefile
    ADDED
    
    
| 
         @@ -0,0 +1,41 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: utf-8
         
     | 
| 
      
 2 
     | 
    
         
            +
            module TheScrap
              # Scrapes a single page and stores what it extracts in a record
              # supplied by the caller. The helpers used here (retryable, get_attrs,
              # html_proc, data_proc, result_proc, detail_info, encoding, debug?)
              # are not defined in this file; presumably they come from Scrap.
              class DetailObj < Scrap
                # Public entry point: runs do_scrap inside
                # retryable(:tries => 3, :on => Timeout::Error).
                #
                # url       - location of the page to fetch
                # item_info - record the extracted values are written into; it is
                #             indexed with [] below and handed to every callback
                #
                # Returns whatever retryable returns (presumably the block's value,
                # i.e. item_info -- confirm against retryable's definition).
                def scrap( url, item_info )
                  retryable(:tries => 3, :on => Timeout::Error) do
                    do_scrap(url,item_info)
                  end
                end

                # Fetches +url+, runs the html_proc callbacks over the raw page text,
                # parses it with Nokogiri, extracts attributes, follows nested detail
                # pages and finally runs the data_proc and result_proc callbacks.
                #
                # Returns item_info.
                def do_scrap( url, item_info )
                  # URI.open is the URL-aware opener on Ruby >= 2.5 and the only one
                  # on Ruby >= 3.0, where Kernel#open no longer accepts URLs. Older
                  # rubies only have the open-uri patched Kernel.open. The block form
                  # closes the underlying handle as soon as the body has been read.
                  opener = URI.respond_to?(:open) ? URI : Kernel
                  html = opener.open(url) { |io| io.read }

                  # Each callback receives the current page text and returns the
                  # text that the next step should use.
                  html_proc.each do |dp|
                    html = dp.call(html)
                  end

                  doc = Nokogiri::HTML(html,nil,encoding)
                  get_attrs(url,doc,item_info)

                  # Nested detail pages: each entry is indexed as
                  # [scraper, key of item_info holding that page's URL], so detail
                  # scraping can recurse into further levels.
                  detail_info.each do |detail|
                    detail[0].scrap(item_info[detail[1]],item_info)
                  end

                  # Manual post-processing of the collected data.
                  data_proc.each do |dp|
                    dp.call(url,item_info)
                  end

                  # Result handling (for example persistence) can be configured
                  # separately for detail records here. The record is wrapped in an
                  # Array, the same shape ListObj hands to its result_proc callbacks.
                  result_proc.each do |rp|
                    rp.call(url,[item_info])
                  end

                  pp item_info if debug?
                  return item_info
                end
              end
            end
         
     | 
| 
      
 41 
     | 
    
         
            +
             
     | 
| 
         @@ -0,0 +1,100 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: utf-8
         
     | 
| 
      
 2 
     | 
    
         
            +
            module TheScrap
              # Scrapes list pages: every node matching +item_frag+ becomes one
              # record, and further pages can be followed according to
              # +pager_method+. Helpers such as retryable, get_attrs, html_proc,
              # data_proc, result_proc, detail_info, item_frag, url, encoding,
              # debug? and verbose? are not defined in this file; presumably they
              # come from Scrap.
              class ListObj < Scrap
                attr_accessor :item_filters   # callables deciding whether a record is kept
                attr_accessor :has_many_pages # whether the list spans several pages
                attr_accessor :pager_method   # pagination mode: :next_page, :total_pages or :total_records
                attr_accessor :next_page_css  # :next_page mode: CSS selector of the next-page link
                attr_accessor :get_page_count # :total_pages mode: callable(doc) returning the page count
                                              # (a callable rather than a CSS selector because the
                                              # number will most likely need post-processing)
                attr_accessor :get_next_url   # :total_pages mode: callable(url, page_index) building a page URL

                def initialize()
                  super
                  @item_filters = []
                end

                # Fetches and parses one list page and builds one record per node
                # matching +item_frag+.
                #
                # url - location of the list page
                #
                # Returns [doc, items]: the parsed Nokogiri document and the Array
                # of records that passed every filter.
                def scrap( url )
                  items = []

                  # URI.open is the URL-aware opener on Ruby >= 2.5 and the only one
                  # on Ruby >= 3.0, where Kernel#open no longer accepts URLs. Older
                  # rubies only have the open-uri patched Kernel.open. Parsing inside
                  # the block lets the opener close the handle afterwards instead of
                  # leaking it.
                  opener = URI.respond_to?(:open) ? URI : Kernel
                  doc = opener.open(url) do |html|
                    # NOTE(review): the callbacks receive the opened IO-like object
                    # here, whereas DetailObj#do_scrap passes the page text (String).
                    # Kept as in the original; confirm which one html_proc expects.
                    html_proc.each do |dp|
                      html = dp.call(html)
                    end
                    Nokogiri::HTML(html,nil,encoding)
                  end

                  doc.css(item_frag).each do |item|
                    item_info = {}
                    get_attrs(url,item,item_info)

                    # A record is kept only if every filter returns a truthy value;
                    # evaluation stops at the first filter that rejects it.
                    next unless item_filters.all? { |filter| filter.call(item_info) }

                    # Detail pages: each entry is indexed as
                    # [scraper, key of item_info holding the detail page URL].
                    detail_info.each do |detail|
                      detail[0].scrap(item_info[detail[1]],item_info)
                    end

                    # Manual post-processing of the collected data.
                    data_proc.each do |dp|
                      dp.call(url,item_info)
                    end

                    items << item_info

                    # In debug mode print the first kept record and stop.
                    pp item_info if debug?
                    break if debug?
                  end

                  # Result handling (persistence, file generation, ...) per page.
                  result_proc.each do |rp|
                    rp.call(url,items)
                  end

                  return doc,items
                end

                # Scrapes the start page (+url+) and, when +has_many_pages+ is set,
                # the following pages according to +pager_method+.
                def scrap_list
                  doc,items = scrap_with_retry(url)

                  return unless has_many_pages

                  #TODO Refactor it
                  next_page_url = nil
                  if pager_method == :next_page # pages expose a "next page" link
                    while node = doc.css(next_page_css).first
                      current_url = (next_page_url||url).to_s
                      next_page_url = URI.join(current_url,node['href']).to_s
                      # A link that resolves to the page just fetched (for example
                      # an empty href) would otherwise be refetched forever.
                      break if next_page_url == current_url
                      puts next_page_url if verbose?
                      doc,items = scrap_with_retry(next_page_url)
                      break if items.count == 0
                      break if debug?
                    end
                  elsif pager_method == :total_pages # total page count is available; pages start at 1
                    page_cnt = get_page_count.call(doc)
                    # Page 1 is the start page scraped above.
                    (2..page_cnt).each do |idx|
                      next_page_url = get_next_url.call(url,idx)
                      puts next_page_url if verbose?
                      doc,items = scrap_with_retry(next_page_url)
                      break if items.count == 0
                      break if debug?
                    end
                  elsif pager_method == :total_records
                    #TODO
                    # Mode where the total record count is available. It can also be
                    # covered by :total_pages by computing the page count from the
                    # record count outside of this class.
                  end
                end

                private

                # Runs #scrap for +page_url+ inside
                # retryable(:tries => 3, :on => Timeout::Error) and returns its result.
                def scrap_with_retry( page_url )
                  retryable(:tries => 3, :on => Timeout::Error) do
                    scrap(page_url)
                  end
                end
              end
            end
         
     | 
| 
      
 100 
     | 
    
         
            +
             
     | 
| 
         @@ -0,0 +1,100 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: utf-8
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'rubygems'
         
     | 
| 
      
 3 
     | 
    
         
            +
            require 'nokogiri'
         
     | 
| 
      
 4 
     | 
    
         
            +
            require 'open-uri'
         
     | 
| 
      
 5 
     | 
    
         
            +
            require 'pp'
         
     | 
| 
      
 6 
     | 
    
         
            +
            require 'timeout'
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
            module TheScrap
              # Base class for scrapers: holds the shared configuration (URLs, hook
              # lists, flags), the dynamic +attr_<name>=+ extraction rules and the
              # helper that applies those rules to a parsed fragment.
              class Scrap
                attr_accessor :item_frag   # item (entry) fragment
                attr_accessor :url         # starting URL
                attr_accessor :base_url    # base URL used for images and links
                attr_accessor :html_proc   # processing applied after the page HTML is fetched
                attr_accessor :data_proc   # manual post-processing of the scraped data
                attr_accessor :result_proc # result handling: store in a database, generate files, etc.
                attr_accessor :detail_info # detail page objects

                attr_accessor :encoding

                attr_accessor :debug
                alias_method :debug?, :debug

                attr_accessor :verbose
                alias_method :verbose?, :verbose

                # Sets up empty rule/hook containers and switches the flags off.
                def initialize()
                  @attrs = {}
                  @more_info = []
                  @debug = false
                  @verbose = false
                  #@encoding = 'utf-8'
                  @result_proc = []
                  @detail_info = []
                  @data_proc = []
                  @html_proc = []
                end

                # Runs the given block and retries it when it raises.
                #
                # Options:
                #   :tries - total number of attempts (default 1, i.e. no retry)
                #   :on    - exception class, or array of classes, that triggers a
                #            retry (default Exception; pass something narrower to
                #            avoid retrying on Interrupt/SystemExit)
                #   :delay - seconds to sleep between attempts (default 2)
                #
                # Returns the block's value. Re-raises the last exception once the
                # attempts are used up.
                def retryable( options = {} )
                  opts = { :tries => 1, :on => Exception, :delay => 2 }.merge(options)

                  retry_exceptions = Array(opts[:on])
                  retries = opts[:tries]
                  delay = opts[:delay]

                  begin
                    return yield
                  rescue *retry_exceptions
                    raise unless (retries -= 1) > 0
                    sleep delay if delay && delay > 0
                    retry
                  end
                end

                # Implements the dynamic +attr_<name>=+ writers: the assigned value
                # is stored as the extraction rule for <name>. Every other unknown
                # method is passed to super so it raises NoMethodError as usual
                # instead of silently returning nil.
                def method_missing( method_id, *arguments, &block )
                  matched = /\Aattr_(.+)=\z/.match(method_id.to_s)
                  return super unless matched

                  @attrs[matched[1]] = arguments.first
                end

                # Keeps respond_to? in step with method_missing.
                def respond_to_missing?( method_id, include_private = false )
                  !!(method_id.to_s =~ /\Aattr_(.+)=\z/) || super
                end

                protected

                # Applies every rule registered through +attr_<name>=+ to +doc+ and
                # writes the extracted values into +item_info+ under the rule name.
                #
                # A rule is one of:
                #   "selector"                - stripped text of all matching nodes
                #   [:frag_attr, name]        - attribute +name+ of +doc+ itself
                #   [selector, :inner_html]   - inner HTML of the first match
                #   [selector, :join]         - texts of all matches joined by ","
                #   [selector, :array]        - texts of all matches as an array
                #   [selector, attribute]     - stripped attribute of the first match;
                #                               :href and :src are resolved against
                #                               base_url (or +url+ when it is unset)
                #
                # For array rules the key is left untouched when nothing matches the
                # selector, when the rule names no attribute, or when the matched
                # node does not carry the requested attribute.
                def get_attrs( url, doc, item_info )
                  @attrs.each do |key, rule|
                    unless rule.is_a? Array
                      item_info[key] = doc.css(rule).text.strip
                      next
                    end

                    selector, mode = rule
                    if selector == :frag_attr
                      item_info[key] = doc[mode]
                      next
                    end

                    nodes = doc.css(selector)
                    node = nodes.first
                    next unless node

                    case mode
                    when :inner_html
                      item_info[key] = node.inner_html
                    when :join
                      item_info[key] = nodes.map { |n| n.text }.join(',')
                    when :array
                      item_info[key] = nodes.map { |n| n.text }
                    else
                      # A rule without an attribute name has nothing to read.
                      next if mode.nil?

                      raw = node[mode]
                      # The attribute is absent on this node: skip, as for a missing node.
                      next if raw.nil?

                      value = raw.strip
                      if [:href, :src].include? mode.to_sym
                        # Percent-encode spaces before joining; presumably URI.join
                        # rejects raw spaces in the reference.
                        src = value.gsub(" ", "%20")
                        begin
                          item_info[key] = URI.join(base_url || url, src).to_s
                        rescue StandardError
                          # Best effort: keep the unresolved value when joining fails.
                          item_info[key] = src.to_s
                        end
                      else
                        item_info[key] = value
                      end
                    end
                  end
                end
              end
            end
         
     | 
| 
      
 100 
     | 
    
         
            +
             
     | 
    
        data/lib/the_scrap.rb
    ADDED
    
    
    
        data/the_scrap.gemspec
    ADDED
    
    | 
         @@ -0,0 +1,24 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # coding: utf-8
         
     | 
| 
      
 2 
     | 
    
         
            +
            lib = File.expand_path('../lib', __FILE__)
         
     | 
| 
      
 3 
     | 
    
         
            +
            $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
         
     | 
| 
      
 4 
     | 
    
         
            +
            require 'the_scrap/version'
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            # Gem specification for the_scrap.
            Gem::Specification.new do |gem|
              # Identity and descriptive fields.
              gem.name = "the_scrap"
              gem.version = TheScrap::VERSION
              gem.authors = ["H.J.LeoChen"]
              gem.email = ["hjleochen@hotmail.com"]
              gem.summary = "The webpage scrapping."
              gem.description = "The webpage scrapping based Nokogiri."
              gem.homepage = ""
              gem.license = "MIT"

              # Packaged files are whatever git tracks; executables and test files
              # are derived from that list by path prefix.
              gem.files = `git ls-files -z`.split("\x0")
              gem.executables = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
              gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
              gem.require_paths = ["lib"]

              # Development-only tooling, then the single runtime dependency.
              gem.add_development_dependency("bundler", "~> 1.6")
              gem.add_development_dependency("rake")
              gem.add_dependency("nokogiri")
            end
         
     | 
    
        metadata
    ADDED
    
    | 
         @@ -0,0 +1,97 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            --- !ruby/object:Gem::Specification
         
     | 
| 
      
 2 
     | 
    
         
            +
            name: the_scrap
         
     | 
| 
      
 3 
     | 
    
         
            +
            version: !ruby/object:Gem::Version
         
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.0.1
         
     | 
| 
      
 5 
     | 
    
         
            +
            platform: ruby
         
     | 
| 
      
 6 
     | 
    
         
            +
            authors:
         
     | 
| 
      
 7 
     | 
    
         
            +
            - H.J.LeoChen
         
     | 
| 
      
 8 
     | 
    
         
            +
            autorequire: 
         
     | 
| 
      
 9 
     | 
    
         
            +
            bindir: bin
         
     | 
| 
      
 10 
     | 
    
         
            +
            cert_chain: []
         
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2014-08-18 00:00:00.000000000 Z
         
     | 
| 
      
 12 
     | 
    
         
            +
            dependencies:
         
     | 
| 
      
 13 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency
         
     | 
| 
      
 14 
     | 
    
         
            +
              name: bundler
         
     | 
| 
      
 15 
     | 
    
         
            +
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
      
 16 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 17 
     | 
    
         
            +
                - - ~>
         
     | 
| 
      
 18 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 19 
     | 
    
         
            +
                    version: '1.6'
         
     | 
| 
      
 20 
     | 
    
         
            +
              type: :development
         
     | 
| 
      
 21 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 22 
     | 
    
         
            +
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
      
 23 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 24 
     | 
    
         
            +
                - - ~>
         
     | 
| 
      
 25 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 26 
     | 
    
         
            +
                    version: '1.6'
         
     | 
| 
      
 27 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency
         
     | 
| 
      
 28 
     | 
    
         
            +
              name: rake
         
     | 
| 
      
 29 
     | 
    
         
            +
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
      
 30 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 31 
     | 
    
         
            +
                - - '>='
         
     | 
| 
      
 32 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 33 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 34 
     | 
    
         
            +
              type: :development
         
     | 
| 
      
 35 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 36 
     | 
    
         
            +
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
      
 37 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 38 
     | 
    
         
            +
                - - '>='
         
     | 
| 
      
 39 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 40 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 41 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency
         
     | 
| 
      
 42 
     | 
    
         
            +
              name: nokogiri
         
     | 
| 
      
 43 
     | 
    
         
            +
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
      
 44 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 45 
     | 
    
         
            +
                - - '>='
         
     | 
| 
      
 46 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 47 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 48 
     | 
    
         
            +
              type: :runtime
         
     | 
| 
      
 49 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 50 
     | 
    
         
            +
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
      
 51 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 52 
     | 
    
         
            +
                - - '>='
         
     | 
| 
      
 53 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 54 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 55 
     | 
    
         
            +
            description: The webpage scrapping based Nokogiri.
         
     | 
| 
      
 56 
     | 
    
         
            +
            email:
         
     | 
| 
      
 57 
     | 
    
         
            +
            - hjleochen@hotmail.com
         
     | 
| 
      
 58 
     | 
    
         
            +
            executables: []
         
     | 
| 
      
 59 
     | 
    
         
            +
            extensions: []
         
     | 
| 
      
 60 
     | 
    
         
            +
            extra_rdoc_files: []
         
     | 
| 
      
 61 
     | 
    
         
            +
            files:
         
     | 
| 
      
 62 
     | 
    
         
            +
            - .gitignore
         
     | 
| 
      
 63 
     | 
    
         
            +
            - Gemfile
         
     | 
| 
      
 64 
     | 
    
         
            +
            - LICENSE.txt
         
     | 
| 
      
 65 
     | 
    
         
            +
            - README.md
         
     | 
| 
      
 66 
     | 
    
         
            +
            - Rakefile
         
     | 
| 
      
 67 
     | 
    
         
            +
            - lib/the_scrap.rb
         
     | 
| 
      
 68 
     | 
    
         
            +
            - lib/the_scrap/detail_obj.rb
         
     | 
| 
      
 69 
     | 
    
         
            +
            - lib/the_scrap/list_obj.rb
         
     | 
| 
      
 70 
     | 
    
         
            +
            - lib/the_scrap/scrap.rb
         
     | 
| 
      
 71 
     | 
    
         
            +
            - lib/the_scrap/version.rb
         
     | 
| 
      
 72 
     | 
    
         
            +
            - the_scrap.gemspec
         
     | 
| 
      
 73 
     | 
    
         
            +
            homepage: ''
         
     | 
| 
      
 74 
     | 
    
         
            +
            licenses:
         
     | 
| 
      
 75 
     | 
    
         
            +
            - MIT
         
     | 
| 
      
 76 
     | 
    
         
            +
            metadata: {}
         
     | 
| 
      
 77 
     | 
    
         
            +
            post_install_message: 
         
     | 
| 
      
 78 
     | 
    
         
            +
            rdoc_options: []
         
     | 
| 
      
 79 
     | 
    
         
            +
            require_paths:
         
     | 
| 
      
 80 
     | 
    
         
            +
            - lib
         
     | 
| 
      
 81 
     | 
    
         
            +
            required_ruby_version: !ruby/object:Gem::Requirement
         
     | 
| 
      
 82 
     | 
    
         
            +
              requirements:
         
     | 
| 
      
 83 
     | 
    
         
            +
              - - '>='
         
     | 
| 
      
 84 
     | 
    
         
            +
                - !ruby/object:Gem::Version
         
     | 
| 
      
 85 
     | 
    
         
            +
                  version: '0'
         
     | 
| 
      
 86 
     | 
    
         
            +
            required_rubygems_version: !ruby/object:Gem::Requirement
         
     | 
| 
      
 87 
     | 
    
         
            +
              requirements:
         
     | 
| 
      
 88 
     | 
    
         
            +
              - - '>='
         
     | 
| 
      
 89 
     | 
    
         
            +
                - !ruby/object:Gem::Version
         
     | 
| 
      
 90 
     | 
    
         
            +
                  version: '0'
         
     | 
| 
      
 91 
     | 
    
         
            +
            requirements: []
         
     | 
| 
      
 92 
     | 
    
         
            +
            rubyforge_project: 
         
     | 
| 
      
 93 
     | 
    
         
            +
            rubygems_version: 2.2.2
         
     | 
| 
      
 94 
     | 
    
         
            +
            signing_key: 
         
     | 
| 
      
 95 
     | 
    
         
            +
            specification_version: 4
         
     | 
| 
      
 96 
     | 
    
         
            +
            summary: The webpage scrapping.
         
     | 
| 
      
 97 
     | 
    
         
            +
            test_files: []
         
     |