page_by_page 0.1.9 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.md +21 -1
- data/lib/page_by_page.rb +24 -70
- data/lib/page_by_page/fetch.rb +81 -0
- data/lib/page_by_page/jump.rb +45 -0
- data/lib/page_by_page/version.rb +1 -1
- metadata +5 -3
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 | 
            -
             | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 2 | 
            +
            SHA256:
         | 
| 3 | 
            +
              metadata.gz: 47ea21770030654ef4b0d4a7b5d3dec3c1c20d7d43b11e39c5ea7e68be86478f
         | 
| 4 | 
            +
              data.tar.gz: 7341f8e5293250b308bb4c223a14f1ec8480513269f3b6c1ef06c5aa9f825e92
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: '0912271bc28adfd2e97313bbecdeed894976eb37814c84bbbb009ca8ec10dbf04c59b25ce4716dee1701b42cbc1015e7dd83441afeb9a8d7cf7678c477328beb'
         | 
| 7 | 
            +
              data.tar.gz: 6e46ac599c44dff52b9527de7d526bac874c166191f10e212168034b1477fead6194d1dabfddad46ec74494676e4fc7260b1f25616e5d3c6a08d0bc6250e5fab
         | 
    
        data/README.md
    CHANGED
    
    | @@ -20,6 +20,8 @@ Or install it yourself as: | |
| 20 20 |  | 
| 21 21 | 
             
            ## Usage
         | 
| 22 22 |  | 
| 23 | 
            +
            If you know the page number pattern, use fetch:
         | 
| 24 | 
            +
             | 
| 23 25 | 
             
            ```ruby
         | 
| 24 26 | 
             
            nodes = PageByPage.fetch do
         | 
| 25 27 | 
             
              url 'https://book.douban.com/subject/25846075/comments/hot?p=<%= n %>'
         | 
| @@ -27,12 +29,28 @@ nodes = PageByPage.fetch do | |
| 27 29 | 
             
              # from 2
         | 
| 28 30 | 
             
              # step 2
         | 
| 29 31 | 
             
              # to 100
         | 
| 32 | 
            +
              # interval 3
         | 
| 30 33 | 
             
              # threads 4
         | 
| 31 34 | 
             
              # no_progress
         | 
| 35 | 
            +
              # header Cookie: 'douban-fav-remind=1'
         | 
| 36 | 
            +
            end
         | 
| 37 | 
            +
            ```
         | 
| 38 | 
            +
             | 
| 39 | 
            +
            If you don't know the pattern, but you see a link to the next page, use jump:
         | 
| 40 | 
            +
             | 
| 41 | 
            +
            ```ruby
         | 
| 42 | 
            +
            nodes = PageByPage.jump do
         | 
| 43 | 
            +
              start 'https://book.douban.com/subject/25846075/comments/hot'
         | 
| 44 | 
            +
              iterate '.comment-paginator li:nth-child(3) a'
         | 
| 45 | 
            +
              selector '.comment-item'
         | 
| 46 | 
            +
              # to 100
         | 
| 47 | 
            +
              # interval 3
         | 
| 48 | 
            +
              # no_progress
         | 
| 49 | 
            +
              # header Cookie: 'douban-fav-remind=1'
         | 
| 32 50 | 
             
            end
         | 
| 33 51 | 
             
            ```
         | 
| 34 52 |  | 
| 35 | 
            -
             | 
| 53 | 
            +
            You may just pass parameters instead of a block:
         | 
| 36 54 |  | 
| 37 55 | 
             
            ```ruby
         | 
| 38 56 | 
             
            nodes = PageByPage.fetch(
         | 
| @@ -41,7 +59,9 @@ nodes = PageByPage.fetch( | |
| 41 59 | 
             
              # from: 2,
         | 
| 42 60 | 
             
              # step: 2,
         | 
| 43 61 | 
             
              # to: 100,
         | 
| 62 | 
            +
              # interval: 3,
         | 
| 44 63 | 
             
              # threads: 4,
         | 
| 45 64 | 
             
              # no_progress: true
         | 
| 65 | 
            +
              # header: {Cookie: 'douban-fav-remind=1'}
         | 
| 46 66 | 
             
            )
         | 
| 47 67 | 
             
            ```
         | 
    
        data/lib/page_by_page.rb
    CHANGED
    
    | @@ -1,16 +1,21 @@ | |
| 1 1 | 
             
            require 'page_by_page/version'
         | 
| 2 | 
            -
            require 'page_by_page/ | 
| 3 | 
            -
            require 'page_by_page/ | 
| 2 | 
            +
            require 'page_by_page/fetch'
         | 
| 3 | 
            +
            require 'page_by_page/jump'
         | 
| 4 4 | 
             
            require 'nokogiri'
         | 
| 5 5 | 
             
            require 'open-uri'
         | 
| 6 | 
            -
            require 'erb'
         | 
| 7 6 |  | 
| 8 7 | 
             
            class PageByPage
         | 
| 9 8 |  | 
| 9 | 
            +
              include Fetch
         | 
| 10 | 
            +
              include Jump
         | 
| 11 | 
            +
             | 
| 10 12 | 
             
              class << self
         | 
| 11 | 
            -
                def fetch( | 
| 12 | 
            -
                   | 
| 13 | 
            -
             | 
| 13 | 
            +
                def fetch(*args, &block)
         | 
| 14 | 
            +
                  new(*args, &block).fetch
         | 
| 15 | 
            +
                end
         | 
| 16 | 
            +
             | 
| 17 | 
            +
                def jump(*args, &block)
         | 
| 18 | 
            +
                  new(*args, &block).jump
         | 
| 14 19 | 
             
                end
         | 
| 15 20 | 
             
              end
         | 
| 16 21 |  | 
| @@ -21,81 +26,26 @@ class PageByPage | |
| 21 26 | 
             
                instance_eval &block if block
         | 
| 22 27 | 
             
              end
         | 
| 23 28 |  | 
| 24 | 
            -
              def url tmpl
         | 
| 25 | 
            -
                @tmpl = ERB.new tmpl
         | 
| 26 | 
            -
              end
         | 
| 27 | 
            -
             | 
| 28 | 
            -
              def selector sl
         | 
| 29 | 
            -
                @selector = sl
         | 
| 30 | 
            -
              end
         | 
| 31 | 
            -
             | 
| 32 | 
            -
              def from n
         | 
| 33 | 
            -
                @from = n
         | 
| 34 | 
            -
              end
         | 
| 35 | 
            -
             | 
| 36 | 
            -
              def step n
         | 
| 37 | 
            -
                @step = n
         | 
| 38 | 
            -
              end
         | 
| 39 | 
            -
             | 
| 40 29 | 
             
              def to n
         | 
| 41 30 | 
             
                @to = n
         | 
| 42 31 | 
             
              end
         | 
| 43 32 |  | 
| 44 | 
            -
              def  | 
| 45 | 
            -
                @ | 
| 33 | 
            +
              def selector sl
         | 
| 34 | 
            +
                @selector = sl
         | 
| 46 35 | 
             
              end
         | 
| 47 36 |  | 
| 48 | 
            -
              def  | 
| 49 | 
            -
                @ | 
| 37 | 
            +
              def header hash
         | 
| 38 | 
            +
                @header = hash
         | 
| 50 39 | 
             
              end
         | 
| 51 40 |  | 
| 52 | 
            -
              def  | 
| 53 | 
            -
                 | 
| 54 | 
            -
                  unless defined? @threads
         | 
| 55 | 
            -
                    @enum = Enum.new options
         | 
| 56 | 
            -
                    _fetch
         | 
| 57 | 
            -
                  else
         | 
| 58 | 
            -
                    @enum = MutexEnum.new options
         | 
| 59 | 
            -
                    parallel_fetch
         | 
| 60 | 
            -
                  end
         | 
| 61 | 
            -
                puts if @progress
         | 
| 62 | 
            -
                nodes_2d.reject(&:nil?).flatten
         | 
| 41 | 
            +
              def interval second
         | 
| 42 | 
            +
                @interval = second
         | 
| 63 43 | 
             
              end
         | 
| 64 44 |  | 
| 65 45 | 
             
              private
         | 
| 66 46 |  | 
| 67 | 
            -
              def _fetch
         | 
| 68 | 
            -
                items, pages = [nil], []
         | 
| 69 | 
            -
                catch :no_more do
         | 
| 70 | 
            -
                  until items.empty?
         | 
| 71 | 
            -
                    n = @enum.next
         | 
| 72 | 
            -
                    break if n > limit
         | 
| 73 | 
            -
                    url = @tmpl.result binding
         | 
| 74 | 
            -
                    doc = parse url
         | 
| 75 | 
            -
                    items = doc.css @selector
         | 
| 76 | 
            -
                    pages[n] = items
         | 
| 77 | 
            -
                    update_progress Thread.current, n if @progress
         | 
| 78 | 
            -
                  end
         | 
| 79 | 
            -
                end
         | 
| 80 | 
            -
                pages
         | 
| 81 | 
            -
              end
         | 
| 82 | 
            -
             | 
| 83 | 
            -
              def parallel_fetch
         | 
| 84 | 
            -
                ts = @threads.times.map do |n|
         | 
| 85 | 
            -
                  Thread.new do
         | 
| 86 | 
            -
                    Thread.current[:sub] = _fetch
         | 
| 87 | 
            -
                  end
         | 
| 88 | 
            -
                end
         | 
| 89 | 
            -
                ts.each_with_object([]) do |t, pages|
         | 
| 90 | 
            -
                  t.join
         | 
| 91 | 
            -
                  t[:sub].each_with_index do |items, i|
         | 
| 92 | 
            -
                    pages[i] = items if items
         | 
| 93 | 
            -
                  end
         | 
| 94 | 
            -
                end
         | 
| 95 | 
            -
              end
         | 
| 96 | 
            -
             | 
| 97 47 | 
             
              def parse url
         | 
| 98 | 
            -
                page = open(url)
         | 
| 48 | 
            +
                page = open(url, http_header)
         | 
| 99 49 | 
             
                Nokogiri::HTML page.read
         | 
| 100 50 | 
             
              rescue OpenURI::HTTPError => e
         | 
| 101 51 | 
             
                if e.message == '404 Not Found'
         | 
| @@ -105,8 +55,12 @@ class PageByPage | |
| 105 55 | 
             
                end
         | 
| 106 56 | 
             
              end
         | 
| 107 57 |  | 
| 108 | 
            -
              def  | 
| 109 | 
            -
                 | 
| 58 | 
            +
              def http_header
         | 
| 59 | 
            +
                @http_header ||= (
         | 
| 60 | 
            +
                  h = {}
         | 
| 61 | 
            +
                  Hash(@header).each_pair{ |k, v| h[k.to_s] = v }
         | 
| 62 | 
            +
                  h
         | 
| 63 | 
            +
                )
         | 
| 110 64 | 
             
              end
         | 
| 111 65 |  | 
| 112 66 | 
             
              def limit
         | 
| @@ -0,0 +1,81 @@ | |
| 1 | 
            +
            require 'page_by_page/enum'
         | 
| 2 | 
            +
            require 'page_by_page/mutex_enum'
         | 
| 3 | 
            +
            require 'erb'
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            class PageByPage
         | 
| 6 | 
            +
              module Fetch
         | 
| 7 | 
            +
             | 
| 8 | 
            +
                def url tmpl
         | 
| 9 | 
            +
                  @tmpl = ERB.new tmpl
         | 
| 10 | 
            +
                end
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                def from n
         | 
| 13 | 
            +
                  @from = n
         | 
| 14 | 
            +
                end
         | 
| 15 | 
            +
             | 
| 16 | 
            +
                def step n
         | 
| 17 | 
            +
                  @step = n
         | 
| 18 | 
            +
                end
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                def threads n
         | 
| 21 | 
            +
                  @threads = n
         | 
| 22 | 
            +
                end
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                def no_progress *arg
         | 
| 25 | 
            +
                  @progress = nil
         | 
| 26 | 
            +
                end
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                def fetch
         | 
| 29 | 
            +
                  nodes_2d =
         | 
| 30 | 
            +
                    unless defined? @threads
         | 
| 31 | 
            +
                      @enum = Enum.new enum_options
         | 
| 32 | 
            +
                      _fetch
         | 
| 33 | 
            +
                    else
         | 
| 34 | 
            +
                      @enum = MutexEnum.new enum_options
         | 
| 35 | 
            +
                      parallel_fetch
         | 
| 36 | 
            +
                    end
         | 
| 37 | 
            +
                  puts if @progress
         | 
| 38 | 
            +
                  nodes_2d.reject(&:nil?).flatten
         | 
| 39 | 
            +
                end
         | 
| 40 | 
            +
             | 
| 41 | 
            +
                protected
         | 
| 42 | 
            +
             | 
| 43 | 
            +
                def _fetch
         | 
| 44 | 
            +
                  items, pages = [nil], []
         | 
| 45 | 
            +
                  catch :no_more do
         | 
| 46 | 
            +
                    until items.empty?
         | 
| 47 | 
            +
                      n = @enum.next
         | 
| 48 | 
            +
                      break if n > limit
         | 
| 49 | 
            +
             | 
| 50 | 
            +
                      url = @tmpl.result binding
         | 
| 51 | 
            +
                      doc = parse url
         | 
| 52 | 
            +
                      items = doc.css @selector
         | 
| 53 | 
            +
                      pages[n] = items
         | 
| 54 | 
            +
             | 
| 55 | 
            +
                      update_progress Thread.current, n if @progress
         | 
| 56 | 
            +
                      sleep @interval if @interval
         | 
| 57 | 
            +
                    end
         | 
| 58 | 
            +
                  end
         | 
| 59 | 
            +
                  pages
         | 
| 60 | 
            +
                end
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                def parallel_fetch
         | 
| 63 | 
            +
                  ts = @threads.times.map do |n|
         | 
| 64 | 
            +
                    Thread.new do
         | 
| 65 | 
            +
                      Thread.current[:sub] = _fetch
         | 
| 66 | 
            +
                    end
         | 
| 67 | 
            +
                  end
         | 
| 68 | 
            +
                  ts.each_with_object([]) do |t, pages|
         | 
| 69 | 
            +
                    t.join
         | 
| 70 | 
            +
                    t[:sub].each_with_index do |items, i|
         | 
| 71 | 
            +
                      pages[i] = items if items
         | 
| 72 | 
            +
                    end
         | 
| 73 | 
            +
                  end
         | 
| 74 | 
            +
                end
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                def enum_options
         | 
| 77 | 
            +
                  {from: @from, step: @step}
         | 
| 78 | 
            +
                end
         | 
| 79 | 
            +
             | 
| 80 | 
            +
              end
         | 
| 81 | 
            +
            end
         | 
| @@ -0,0 +1,45 @@ | |
| 1 | 
            +
            class PageByPage
         | 
| 2 | 
            +
              module Jump
         | 
| 3 | 
            +
             | 
| 4 | 
            +
                def start url
         | 
| 5 | 
            +
                  @start = url
         | 
| 6 | 
            +
                end
         | 
| 7 | 
            +
             | 
| 8 | 
            +
                def iterate selector
         | 
| 9 | 
            +
                  @iterate = selector
         | 
| 10 | 
            +
                end
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                def jump
         | 
| 13 | 
            +
                  url, items, page_count = @start, [], 0
         | 
| 14 | 
            +
             | 
| 15 | 
            +
                  while true do
         | 
| 16 | 
            +
                    doc = parse url
         | 
| 17 | 
            +
                    doc.css(@selector).each{ |item| items << item }
         | 
| 18 | 
            +
             | 
| 19 | 
            +
                    next_url = doc.at_css(@iterate)
         | 
| 20 | 
            +
                    break unless next_url
         | 
| 21 | 
            +
             | 
| 22 | 
            +
                    path = next_url.attr('href')
         | 
| 23 | 
            +
                    url = concat_host path
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                    page_count += 1
         | 
| 26 | 
            +
                    update_progress Thread.current, page_count if @progress
         | 
| 27 | 
            +
                    break if page_count >= limit
         | 
| 28 | 
            +
             | 
| 29 | 
            +
                    sleep @interval if @interval
         | 
| 30 | 
            +
                  end
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                  items
         | 
| 33 | 
            +
                end
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                private
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                def concat_host path
         | 
| 38 | 
            +
                  @prefix = (
         | 
| 39 | 
            +
                    regex = path.start_with?('/') ? /([^:|\/])\/.*/ : /(.*[^:|\/])\/.*/
         | 
| 40 | 
            +
                    @start.gsub(regex, '\1')
         | 
| 41 | 
            +
                  )
         | 
| 42 | 
            +
                  File.join @prefix, path
         | 
| 43 | 
            +
                end
         | 
| 44 | 
            +
              end
         | 
| 45 | 
            +
            end
         | 
    
        data/lib/page_by_page/version.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: page_by_page
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.1. | 
| 4 | 
            +
              version: 0.1.10
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - ken
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: exe
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date:  | 
| 11 | 
            +
            date: 2019-02-11 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: bundler
         | 
| @@ -84,6 +84,8 @@ files: | |
| 84 84 | 
             
            - bin/setup
         | 
| 85 85 | 
             
            - lib/page_by_page.rb
         | 
| 86 86 | 
             
            - lib/page_by_page/enum.rb
         | 
| 87 | 
            +
            - lib/page_by_page/fetch.rb
         | 
| 88 | 
            +
            - lib/page_by_page/jump.rb
         | 
| 87 89 | 
             
            - lib/page_by_page/mutex_enum.rb
         | 
| 88 90 | 
             
            - lib/page_by_page/version.rb
         | 
| 89 91 | 
             
            - page_by_page.gemspec
         | 
| @@ -108,7 +110,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 108 110 | 
             
                  version: '0'
         | 
| 109 111 | 
             
            requirements: []
         | 
| 110 112 | 
             
            rubyforge_project: 
         | 
| 111 | 
            -
            rubygems_version: 2.6 | 
| 113 | 
            +
            rubygems_version: 2.7.6
         | 
| 112 114 | 
             
            signing_key: 
         | 
| 113 115 | 
             
            specification_version: 4
         | 
| 114 116 | 
             
            summary: scrape page by page , according to url pattern
         |