page_by_page 0.1.9 → 0.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: b7b7ac7eab723d81c77692420ccd3fd3e6a69d55
4
- data.tar.gz: cfc4977c9d1e3f635e0bc0bded551eb29ee97195
2
+ SHA256:
3
+ metadata.gz: 51cda3ebfc8bc9a353a51f1386a91cf6ea3d2fbbc219334f278844765b35ad98
4
+ data.tar.gz: 8329a8af72dd945284617f053cbb1ae686cd5eb9151dd1cf9be0c99b7cdd12ac
5
5
  SHA512:
6
- metadata.gz: 1840cd52ed71ad5f21c9647a75a9a8630d5df17c6c731e16220d7b8c92361711fe2fdc8eefa2afa458dc106342c133f46c9fde0246e21d6cd0cb7fac279e7be9
7
- data.tar.gz: c9625120e03c85610165c6f0b791a9f00ba9939c31c3097371bcd0a0756f62afab99001b976bfbb23beffec977eb3a3158afcffa3e3b383a2c7817c0a84748f1
6
+ metadata.gz: 6761cb5c91ed8736be1193237caa2725110070b9de332af5bd6175371064b3b3d939ba997634c8e3711f8f1d81c7dc3988436d807858951c6fd398e39249b8c7
7
+ data.tar.gz: 1cee71ddc3dab888aaf75a8cf7139bfd630ef63c29dc954d00c5cdac7c4a251e78a19dedc56ea3e6b8713289b514cabe2519322efc5accd977fb6084a1768088
data/README.md CHANGED
@@ -20,6 +20,10 @@ Or install it yourself as:
20
20
 
21
21
  ## Usage
22
22
 
23
+ ### number pattern
24
+
25
+ If you know the page number pattern, use `fetch`:
26
+
23
27
  ```ruby
24
28
  nodes = PageByPage.fetch do
25
29
  url 'https://book.douban.com/subject/25846075/comments/hot?p=<%= n %>'
@@ -27,12 +31,44 @@ nodes = PageByPage.fetch do
27
31
  # from 2
28
32
  # step 2
29
33
  # to 100
34
+ # interval 3
30
35
  # threads 4
31
36
  # no_progress
37
+ # header Cookie: 'douban-fav-remind=1'
38
+ end
39
+ ```
40
+
41
+ ### other pattern
42
+
43
+ If the pattern is not simple numbers, use `enumerator` in `fetch`:
44
+
45
+ ```ruby
46
+ nodes = PageByPage.fetch do
47
+ url 'http://mysql.taobao.org/monthly/<%= n %>'
48
+ selector 'h3'
49
+ enumerator ['2020/09/', '2020/08/'].to_enum
50
+ end
51
+ ```
52
+
53
+ ### unknown pattern
54
+
55
+ If you don't know the pattern, but you can see a link to the next page, use `jump`:
56
+
57
+ ```ruby
58
+ nodes = PageByPage.jump do
59
+ start 'https://book.douban.com/subject/25846075/comments/hot'
60
+ iterate '.comment-paginator li:nth-child(3) a'
61
+ selector '.comment-item'
62
+ # to 100
63
+ # interval 3
64
+ # no_progress
65
+ # header Cookie: 'douban-fav-remind=1'
32
66
  end
33
67
  ```
34
68
 
35
- Or just passing parameters:
69
+ ### parameters instead of block
70
+
71
+ You may just pass parameters instead of a block:
36
72
 
37
73
  ```ruby
38
74
  nodes = PageByPage.fetch(
@@ -41,7 +77,19 @@ nodes = PageByPage.fetch(
41
77
  # from: 2,
42
78
  # step: 2,
43
79
  # to: 100,
80
+ # interval: 3,
44
81
  # threads: 4,
45
82
  # no_progress: true
83
+ # header: {Cookie: 'douban-fav-remind=1'}
84
+ )
85
+ ```
86
+
87
+ ### lazy
88
+
89
+ Also note that `lazy_fetch` returns an Enumerator instead of an Array, so it is natively lazy-loading:
90
+
91
+ ```ruby
92
+ nodes = PageByPage.lazy_fetch(
93
+ #...
46
94
  )
47
95
  ```
@@ -10,5 +10,5 @@ require "page_by_page"
10
10
  # require "pry"
11
11
  # Pry.start
12
12
 
13
- require "irb"
14
- IRB.start
13
+ require "pry"
14
+ pry
@@ -1,121 +1,21 @@
1
1
  require 'page_by_page/version'
2
- require 'page_by_page/enum'
3
- require 'page_by_page/mutex_enum'
4
- require 'nokogiri'
5
- require 'open-uri'
6
- require 'erb'
2
+ require 'page_by_page/fetch'
3
+ require 'page_by_page/jump'
7
4
 
8
- class PageByPage
5
+ module PageByPage
9
6
 
10
7
  class << self
11
- def fetch(opt ={}, &block)
12
- pbp = self.new(opt, &block)
13
- pbp.fetch
8
+ def fetch(*args, &block)
9
+ Fetch.new(*args, &block).process
14
10
  end
15
- end
16
-
17
- def initialize(opt = {}, &block)
18
- @from, @step, @to = 1, 1, Float::INFINITY
19
- @progress = {}
20
- opt.each{ |name, value| send name, value }
21
- instance_eval &block if block
22
- end
23
-
24
- def url tmpl
25
- @tmpl = ERB.new tmpl
26
- end
27
-
28
- def selector sl
29
- @selector = sl
30
- end
31
-
32
- def from n
33
- @from = n
34
- end
35
-
36
- def step n
37
- @step = n
38
- end
39
-
40
- def to n
41
- @to = n
42
- end
43
-
44
- def threads n
45
- @threads = n
46
- end
47
11
 
48
- def no_progress *arg
49
- @progress = nil
50
- end
51
-
52
- def fetch
53
- nodes_2d =
54
- unless defined? @threads
55
- @enum = Enum.new options
56
- _fetch
57
- else
58
- @enum = MutexEnum.new options
59
- parallel_fetch
60
- end
61
- puts if @progress
62
- nodes_2d.reject(&:nil?).flatten
63
- end
64
-
65
- private
66
-
67
- def _fetch
68
- items, pages = [nil], []
69
- catch :no_more do
70
- until items.empty?
71
- n = @enum.next
72
- break if n > limit
73
- url = @tmpl.result binding
74
- doc = parse url
75
- items = doc.css @selector
76
- pages[n] = items
77
- update_progress Thread.current, n if @progress
78
- end
12
+ def lazy_fetch(*args, &block)
13
+ Fetch.new(*args, &block).iterator
79
14
  end
80
- pages
81
- end
82
15
 
83
- def parallel_fetch
84
- ts = @threads.times.map do |n|
85
- Thread.new do
86
- Thread.current[:sub] = _fetch
87
- end
88
- end
89
- ts.each_with_object([]) do |t, pages|
90
- t.join
91
- t[:sub].each_with_index do |items, i|
92
- pages[i] = items if items
93
- end
16
+ def jump(*args, &block)
17
+ Jump.new(*args, &block).process
94
18
  end
95
19
  end
96
20
 
97
- def parse url
98
- page = open(url)
99
- Nokogiri::HTML page.read
100
- rescue OpenURI::HTTPError => e
101
- if e.message == '404 Not Found'
102
- throw :no_more
103
- else
104
- raise e
105
- end
106
- end
107
-
108
- def options
109
- {from: @from, step: @step}
110
- end
111
-
112
- def limit
113
- @to ||= Float::INFINITY
114
- end
115
-
116
- def update_progress thread, page_num
117
- @progress[thread] = page_num
118
- printf "\r%s => %s", Time.now.strftime('%F %T'), @progress.values.sort
119
- end
120
-
121
21
  end
@@ -0,0 +1,64 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+
4
+ module PageByPage
5
+ module Common
6
+ def initialize(opt = {}, &block)
7
+ @progress = {}
8
+ opt.each{ |name, value| send name, value }
9
+ instance_eval &block if block
10
+ end
11
+
12
+ def to n
13
+ @to = n
14
+ end
15
+
16
+ def selector sl
17
+ @selector = sl
18
+ end
19
+
20
+ def header hash
21
+ @header = hash
22
+ end
23
+
24
+ def interval second
25
+ @interval = second
26
+ end
27
+
28
+ def no_progress *arg
29
+ @progress = nil
30
+ end
31
+
32
+ protected
33
+
34
+ def parse url
35
+ url = URI::encode url
36
+ page = open(url, http_header)
37
+ Nokogiri::HTML page.read
38
+ rescue OpenURI::HTTPError => e
39
+ if e.message == '404 Not Found'
40
+ throw :no_more
41
+ else
42
+ raise e
43
+ end
44
+ end
45
+
46
+ def http_header
47
+ @http_header ||= (
48
+ h = {}
49
+ Hash(@header).each_pair{ |k, v| h[k.to_s] = v }
50
+ h
51
+ )
52
+ end
53
+
54
+ def limit
55
+ @to ||= Float::INFINITY
56
+ end
57
+
58
+ def update_progress thread, page_num
59
+ @progress[thread] = page_num
60
+ printf "\r%s => %s", Time.now.strftime('%F %T'), @progress.values.sort
61
+ end
62
+
63
+ end
64
+ end
@@ -1,12 +1,12 @@
1
- class PageByPage
1
+ module PageByPage
2
2
  class Enum
3
3
 
4
- def initialize from: 1, step: 1
5
- @enum = (from..Float::INFINITY).step(step).lazy.map(&:to_i).to_enum
4
+ def initialize from: 1, step: 1, limit: nil, enumerator: nil
5
+ @enum = enumerator || (from..limit).step(step).lazy.map(&:to_i).to_enum
6
6
  end
7
7
 
8
8
  def next
9
- @enum.next
9
+ @enum.next rescue nil
10
10
  end
11
11
 
12
12
  end
@@ -0,0 +1,106 @@
1
+ require 'page_by_page/enum'
2
+ require 'page_by_page/mutex_enum'
3
+ require 'page_by_page/common'
4
+ require 'erb'
5
+
6
+ module PageByPage
7
+ class Fetch
8
+
9
+ include Common
10
+
11
+ def initialize(opt = {}, &block)
12
+ @from, @step, @to = 1, 1, Float::INFINITY
13
+ super
14
+ @enum = Enum.new(enum_options)
15
+ @enum = MutexEnum.new(@enum) if defined? @threads
16
+ end
17
+
18
+ def url tmpl
19
+ @tmpl = ERB.new tmpl
20
+ end
21
+
22
+ def from n
23
+ @from = n
24
+ end
25
+
26
+ def step n
27
+ @step = n
28
+ end
29
+
30
+ def threads n
31
+ @threads = n
32
+ end
33
+
34
+ def enumerator e
35
+ @enumerator = e
36
+ end
37
+
38
+ def process
39
+ nodes_2d = defined?(@threads) ? parallel_fetch : _fetch
40
+ puts if @progress
41
+
42
+ nodes_2d.sort.each_with_object([]) do |key_items, res|
43
+ res.concat key_items[1] unless key_items[1].nil?
44
+ end
45
+ end
46
+
47
+ def iterator
48
+ Enumerator.new do |yielder|
49
+ items_enum.each do |_, items|
50
+ items.each do |i|
51
+ yielder.yield(i)
52
+ end
53
+ end
54
+ end
55
+ end
56
+
57
+ protected
58
+
59
+ def _fetch
60
+ pages = {}
61
+
62
+ items_enum.each do |page_num, items|
63
+ pages[page_num] = items
64
+ end
65
+
66
+ pages
67
+ end
68
+
69
+ def items_enum
70
+ Enumerator.new do |yielder|
71
+ items = [nil]
72
+ catch :no_more do
73
+ until items.empty?
74
+ n = @enum.next
75
+ break if n.nil?
76
+
77
+ url = @tmpl.result binding
78
+ doc = parse url
79
+ items = doc.css @selector
80
+ yielder.yield(n, items)
81
+
82
+ update_progress Thread.current, n if @progress
83
+ sleep @interval if @interval
84
+ end
85
+ end
86
+ end
87
+ end
88
+
89
+ def parallel_fetch
90
+ ts = @threads.times.map do |n|
91
+ Thread.new do
92
+ Thread.current[:sub] = _fetch
93
+ end
94
+ end
95
+ ts.each_with_object({}) do |t, pages|
96
+ t.join
97
+ pages.merge! t[:sub]
98
+ end
99
+ end
100
+
101
+ def enum_options
102
+ {from: @from, step: @step, limit: limit, enumerator: @enumerator}
103
+ end
104
+
105
+ end
106
+ end
@@ -0,0 +1,50 @@
1
+ require 'page_by_page/common'
2
+
3
+ module PageByPage
4
+ class Jump
5
+
6
+ include Common
7
+
8
+ def start url
9
+ @start = url
10
+ end
11
+
12
+ def iterate selector
13
+ @iterate = selector
14
+ end
15
+
16
+ def process
17
+ url, items, page_count = @start, [], 0
18
+
19
+ while true do
20
+ doc = parse url
21
+ doc.css(@selector).each{ |item| items << item }
22
+
23
+ page_count += 1
24
+ update_progress Thread.current, page_count if @progress
25
+ break if page_count >= limit
26
+
27
+ next_url = doc.at_css(@iterate)
28
+ break unless next_url
29
+
30
+ path = next_url.attr('href')
31
+ url = path.start_with?('/') ? concat_host(path) : path
32
+
33
+ sleep @interval if @interval
34
+ end
35
+
36
+ puts if @progress
37
+ items
38
+ end
39
+
40
+ private
41
+
42
+ def concat_host path
43
+ @prefix = (
44
+ regex = path.start_with?('/') ? /([^:|\/])\/.*/ : /(.*[^:|\/])\/.*/
45
+ @start.gsub(regex, '\1')
46
+ )
47
+ File.join @prefix, path
48
+ end
49
+ end
50
+ end
@@ -1,15 +1,15 @@
1
1
  require 'page_by_page/enum'
2
+ require 'thread'
2
3
 
3
- class PageByPage
4
+ module PageByPage
4
5
  class MutexEnum < Enum
5
6
 
6
- def initialize from: 1, step: 1
7
- super
7
+ def initialize enum
8
8
  @q = SizedQueue.new 10
9
+ @enum = enum
9
10
  Thread.new do
10
11
  loop do
11
12
  @q << @enum.next
12
- sleep 0.1
13
13
  end
14
14
  end
15
15
  end
@@ -1,3 +1,3 @@
1
- class PageByPage
2
- VERSION = "0.1.9"
1
+ module PageByPage
2
+ VERSION = "0.1.14"
3
3
  end
@@ -32,5 +32,6 @@ Gem::Specification.new do |spec|
32
32
  spec.add_development_dependency "bundler", "~> 1.13"
33
33
  spec.add_development_dependency "rake", "~> 10.0"
34
34
  spec.add_development_dependency "minitest", "~> 5.0"
35
+ spec.add_development_dependency "pry"
35
36
  spec.add_dependency 'nokogiri', '~> 1.6'
36
37
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: page_by_page
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9
4
+ version: 0.1.14
5
5
  platform: ruby
6
6
  authors:
7
7
  - ken
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-10-08 00:00:00.000000000 Z
11
+ date: 2020-10-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '5.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pry
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: nokogiri
57
71
  requirement: !ruby/object:Gem::Requirement
@@ -83,7 +97,10 @@ files:
83
97
  - bin/console
84
98
  - bin/setup
85
99
  - lib/page_by_page.rb
100
+ - lib/page_by_page/common.rb
86
101
  - lib/page_by_page/enum.rb
102
+ - lib/page_by_page/fetch.rb
103
+ - lib/page_by_page/jump.rb
87
104
  - lib/page_by_page/mutex_enum.rb
88
105
  - lib/page_by_page/version.rb
89
106
  - page_by_page.gemspec
@@ -107,8 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
107
124
  - !ruby/object:Gem::Version
108
125
  version: '0'
109
126
  requirements: []
110
- rubyforge_project:
111
- rubygems_version: 2.6.8
127
+ rubygems_version: 3.0.3
112
128
  signing_key:
113
129
  specification_version: 4
114
130
  summary: scrape page by page , according to url pattern