page_by_page 0.1.9 → 0.1.14

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: b7b7ac7eab723d81c77692420ccd3fd3e6a69d55
4
- data.tar.gz: cfc4977c9d1e3f635e0bc0bded551eb29ee97195
2
+ SHA256:
3
+ metadata.gz: 51cda3ebfc8bc9a353a51f1386a91cf6ea3d2fbbc219334f278844765b35ad98
4
+ data.tar.gz: 8329a8af72dd945284617f053cbb1ae686cd5eb9151dd1cf9be0c99b7cdd12ac
5
5
  SHA512:
6
- metadata.gz: 1840cd52ed71ad5f21c9647a75a9a8630d5df17c6c731e16220d7b8c92361711fe2fdc8eefa2afa458dc106342c133f46c9fde0246e21d6cd0cb7fac279e7be9
7
- data.tar.gz: c9625120e03c85610165c6f0b791a9f00ba9939c31c3097371bcd0a0756f62afab99001b976bfbb23beffec977eb3a3158afcffa3e3b383a2c7817c0a84748f1
6
+ metadata.gz: 6761cb5c91ed8736be1193237caa2725110070b9de332af5bd6175371064b3b3d939ba997634c8e3711f8f1d81c7dc3988436d807858951c6fd398e39249b8c7
7
+ data.tar.gz: 1cee71ddc3dab888aaf75a8cf7139bfd630ef63c29dc954d00c5cdac7c4a251e78a19dedc56ea3e6b8713289b514cabe2519322efc5accd977fb6084a1768088
data/README.md CHANGED
@@ -20,6 +20,10 @@ Or install it yourself as:
20
20
 
21
21
  ## Usage
22
22
 
23
+ ### number pattern
24
+
25
+ If you know the page number pattern, use `fetch`:
26
+
23
27
  ```ruby
24
28
  nodes = PageByPage.fetch do
25
29
  url 'https://book.douban.com/subject/25846075/comments/hot?p=<%= n %>'
@@ -27,12 +31,44 @@ nodes = PageByPage.fetch do
27
31
  # from 2
28
32
  # step 2
29
33
  # to 100
34
+ # interval 3
30
35
  # threads 4
31
36
  # no_progress
37
+ # header Cookie: 'douban-fav-remind=1'
38
+ end
39
+ ```
40
+
41
+ ### other pattern
42
+
43
+ If the pattern is not simple numbers, use `enumerator` in `fetch`:
44
+
45
+ ```ruby
46
+ nodes = PageByPage.fetch do
47
+ url 'http://mysql.taobao.org/monthly/<%= n %>'
48
+ selector 'h3'
49
+ enumerator ['2020/09/', '2020/08/'].to_enum
50
+ end
51
+ ```
52
+
53
+ ### unknown pattern
54
+
55
+ If you don't know the pattern, but you can see a link to the next page, use `jump`:
56
+
57
+ ```ruby
58
+ nodes = PageByPage.jump do
59
+ start 'https://book.douban.com/subject/25846075/comments/hot'
60
+ iterate '.comment-paginator li:nth-child(3) a'
61
+ selector '.comment-item'
62
+ # to 100
63
+ # interval 3
64
+ # no_progress
65
+ # header Cookie: 'douban-fav-remind=1'
32
66
  end
33
67
  ```
34
68
 
35
- Or just passing parameters:
69
+ ### parameters instead of block
70
+
71
+ You may just pass parameters instead of a block:
36
72
 
37
73
  ```ruby
38
74
  nodes = PageByPage.fetch(
@@ -41,7 +77,19 @@ nodes = PageByPage.fetch(
41
77
  # from: 2,
42
78
  # step: 2,
43
79
  # to: 100,
80
+ # interval: 3
44
81
  # threads: 4,
45
82
  # no_progress: true
83
+ # header: {Cookie: 'douban-fav-remind=1'}
84
+ )
85
+ ```
86
+
87
+ ### lazy
88
+
89
+ Also note that, instead of an Array, `lazy_fetch` returns an Enumerator, which is natively lazy-loading:
90
+
91
+ ```ruby
92
+ nodes = PageByPage.lazy_fetch(
93
+ #...
46
94
  )
47
95
  ```
@@ -10,5 +10,5 @@ require "page_by_page"
10
10
  # require "pry"
11
11
  # Pry.start
12
12
 
13
- require "irb"
14
- IRB.start
13
+ require "pry"
14
+ pry
@@ -1,121 +1,21 @@
1
1
  require 'page_by_page/version'
2
- require 'page_by_page/enum'
3
- require 'page_by_page/mutex_enum'
4
- require 'nokogiri'
5
- require 'open-uri'
6
- require 'erb'
2
+ require 'page_by_page/fetch'
3
+ require 'page_by_page/jump'
7
4
 
8
- class PageByPage
5
+ module PageByPage
9
6
 
10
7
  class << self
11
- def fetch(opt ={}, &block)
12
- pbp = self.new(opt, &block)
13
- pbp.fetch
8
+ def fetch(*args, &block)
9
+ Fetch.new(*args, &block).process
14
10
  end
15
- end
16
-
17
- def initialize(opt = {}, &block)
18
- @from, @step, @to = 1, 1, Float::INFINITY
19
- @progress = {}
20
- opt.each{ |name, value| send name, value }
21
- instance_eval &block if block
22
- end
23
-
24
- def url tmpl
25
- @tmpl = ERB.new tmpl
26
- end
27
-
28
- def selector sl
29
- @selector = sl
30
- end
31
-
32
- def from n
33
- @from = n
34
- end
35
-
36
- def step n
37
- @step = n
38
- end
39
-
40
- def to n
41
- @to = n
42
- end
43
-
44
- def threads n
45
- @threads = n
46
- end
47
11
 
48
- def no_progress *arg
49
- @progress = nil
50
- end
51
-
52
- def fetch
53
- nodes_2d =
54
- unless defined? @threads
55
- @enum = Enum.new options
56
- _fetch
57
- else
58
- @enum = MutexEnum.new options
59
- parallel_fetch
60
- end
61
- puts if @progress
62
- nodes_2d.reject(&:nil?).flatten
63
- end
64
-
65
- private
66
-
67
- def _fetch
68
- items, pages = [nil], []
69
- catch :no_more do
70
- until items.empty?
71
- n = @enum.next
72
- break if n > limit
73
- url = @tmpl.result binding
74
- doc = parse url
75
- items = doc.css @selector
76
- pages[n] = items
77
- update_progress Thread.current, n if @progress
78
- end
12
+ def lazy_fetch(*args, &block)
13
+ Fetch.new(*args, &block).iterator
79
14
  end
80
- pages
81
- end
82
15
 
83
- def parallel_fetch
84
- ts = @threads.times.map do |n|
85
- Thread.new do
86
- Thread.current[:sub] = _fetch
87
- end
88
- end
89
- ts.each_with_object([]) do |t, pages|
90
- t.join
91
- t[:sub].each_with_index do |items, i|
92
- pages[i] = items if items
93
- end
16
+ def jump(*args, &block)
17
+ Jump.new(*args, &block).process
94
18
  end
95
19
  end
96
20
 
97
- def parse url
98
- page = open(url)
99
- Nokogiri::HTML page.read
100
- rescue OpenURI::HTTPError => e
101
- if e.message == '404 Not Found'
102
- throw :no_more
103
- else
104
- raise e
105
- end
106
- end
107
-
108
- def options
109
- {from: @from, step: @step}
110
- end
111
-
112
- def limit
113
- @to ||= Float::INFINITY
114
- end
115
-
116
- def update_progress thread, page_num
117
- @progress[thread] = page_num
118
- printf "\r%s => %s", Time.now.strftime('%F %T'), @progress.values.sort
119
- end
120
-
121
21
  end
@@ -0,0 +1,64 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+
4
+ module PageByPage
5
+ module Common
6
+ def initialize(opt = {}, &block)
7
+ @progress = {}
8
+ opt.each{ |name, value| send name, value }
9
+ instance_eval &block if block
10
+ end
11
+
12
+ def to n
13
+ @to = n
14
+ end
15
+
16
+ def selector sl
17
+ @selector = sl
18
+ end
19
+
20
+ def header hash
21
+ @header = hash
22
+ end
23
+
24
+ def interval second
25
+ @interval = second
26
+ end
27
+
28
+ def no_progress *arg
29
+ @progress = nil
30
+ end
31
+
32
+ protected
33
+
34
+ def parse url
35
+ url = URI::encode url
36
+ page = open(url, http_header)
37
+ Nokogiri::HTML page.read
38
+ rescue OpenURI::HTTPError => e
39
+ if e.message == '404 Not Found'
40
+ throw :no_more
41
+ else
42
+ raise e
43
+ end
44
+ end
45
+
46
+ def http_header
47
+ @http_header ||= (
48
+ h = {}
49
+ Hash(@header).each_pair{ |k, v| h[k.to_s] = v }
50
+ h
51
+ )
52
+ end
53
+
54
+ def limit
55
+ @to ||= Float::INFINITY
56
+ end
57
+
58
+ def update_progress thread, page_num
59
+ @progress[thread] = page_num
60
+ printf "\r%s => %s", Time.now.strftime('%F %T'), @progress.values.sort
61
+ end
62
+
63
+ end
64
+ end
@@ -1,12 +1,12 @@
1
- class PageByPage
1
+ module PageByPage
2
2
  class Enum
3
3
 
4
- def initialize from: 1, step: 1
5
- @enum = (from..Float::INFINITY).step(step).lazy.map(&:to_i).to_enum
4
+ def initialize from: 1, step: 1, limit: nil, enumerator: nil
5
+ @enum = enumerator || (from..limit).step(step).lazy.map(&:to_i).to_enum
6
6
  end
7
7
 
8
8
  def next
9
- @enum.next
9
+ @enum.next rescue nil
10
10
  end
11
11
 
12
12
  end
@@ -0,0 +1,106 @@
1
+ require 'page_by_page/enum'
2
+ require 'page_by_page/mutex_enum'
3
+ require 'page_by_page/common'
4
+ require 'erb'
5
+
6
+ module PageByPage
7
+ class Fetch
8
+
9
+ include Common
10
+
11
+ def initialize(opt = {}, &block)
12
+ @from, @step, @to = 1, 1, Float::INFINITY
13
+ super
14
+ @enum = Enum.new(enum_options)
15
+ @enum = MutexEnum.new(@enum) if defined? @threads
16
+ end
17
+
18
+ def url tmpl
19
+ @tmpl = ERB.new tmpl
20
+ end
21
+
22
+ def from n
23
+ @from = n
24
+ end
25
+
26
+ def step n
27
+ @step = n
28
+ end
29
+
30
+ def threads n
31
+ @threads = n
32
+ end
33
+
34
+ def enumerator e
35
+ @enumerator = e
36
+ end
37
+
38
+ def process
39
+ nodes_2d = defined?(@threads) ? parallel_fetch : _fetch
40
+ puts if @progress
41
+
42
+ nodes_2d.sort.each_with_object([]) do |key_items, res|
43
+ res.concat key_items[1] unless key_items[1].nil?
44
+ end
45
+ end
46
+
47
+ def iterator
48
+ Enumerator.new do |yielder|
49
+ items_enum.each do |_, items|
50
+ items.each do |i|
51
+ yielder.yield(i)
52
+ end
53
+ end
54
+ end
55
+ end
56
+
57
+ protected
58
+
59
+ def _fetch
60
+ pages = {}
61
+
62
+ items_enum.each do |page_num, items|
63
+ pages[page_num] = items
64
+ end
65
+
66
+ pages
67
+ end
68
+
69
+ def items_enum
70
+ Enumerator.new do |yielder|
71
+ items = [nil]
72
+ catch :no_more do
73
+ until items.empty?
74
+ n = @enum.next
75
+ break if n.nil?
76
+
77
+ url = @tmpl.result binding
78
+ doc = parse url
79
+ items = doc.css @selector
80
+ yielder.yield(n, items)
81
+
82
+ update_progress Thread.current, n if @progress
83
+ sleep @interval if @interval
84
+ end
85
+ end
86
+ end
87
+ end
88
+
89
+ def parallel_fetch
90
+ ts = @threads.times.map do |n|
91
+ Thread.new do
92
+ Thread.current[:sub] = _fetch
93
+ end
94
+ end
95
+ ts.each_with_object({}) do |t, pages|
96
+ t.join
97
+ pages.merge! t[:sub]
98
+ end
99
+ end
100
+
101
+ def enum_options
102
+ {from: @from, step: @step, limit: limit, enumerator: @enumerator}
103
+ end
104
+
105
+ end
106
+ end
@@ -0,0 +1,50 @@
1
+ require 'page_by_page/common'
2
+
3
+ module PageByPage
4
+ class Jump
5
+
6
+ include Common
7
+
8
+ def start url
9
+ @start = url
10
+ end
11
+
12
+ def iterate selector
13
+ @iterate = selector
14
+ end
15
+
16
+ def process
17
+ url, items, page_count = @start, [], 0
18
+
19
+ while true do
20
+ doc = parse url
21
+ doc.css(@selector).each{ |item| items << item }
22
+
23
+ page_count += 1
24
+ update_progress Thread.current, page_count if @progress
25
+ break if page_count >= limit
26
+
27
+ next_url = doc.at_css(@iterate)
28
+ break unless next_url
29
+
30
+ path = next_url.attr('href')
31
+ url = path.start_with?('/') ? concat_host(path) : path
32
+
33
+ sleep @interval if @interval
34
+ end
35
+
36
+ puts if @progress
37
+ items
38
+ end
39
+
40
+ private
41
+
42
+ def concat_host path
43
+ @prefix = (
44
+ regex = path.start_with?('/') ? /([^:|\/])\/.*/ : /(.*[^:|\/])\/.*/
45
+ @start.gsub(regex, '\1')
46
+ )
47
+ File.join @prefix, path
48
+ end
49
+ end
50
+ end
@@ -1,15 +1,15 @@
1
1
  require 'page_by_page/enum'
2
+ require 'thread'
2
3
 
3
- class PageByPage
4
+ module PageByPage
4
5
  class MutexEnum < Enum
5
6
 
6
- def initialize from: 1, step: 1
7
- super
7
+ def initialize enum
8
8
  @q = SizedQueue.new 10
9
+ @enum = enum
9
10
  Thread.new do
10
11
  loop do
11
12
  @q << @enum.next
12
- sleep 0.1
13
13
  end
14
14
  end
15
15
  end
@@ -1,3 +1,3 @@
1
- class PageByPage
2
- VERSION = "0.1.9"
1
+ module PageByPage
2
+ VERSION = "0.1.14"
3
3
  end
@@ -32,5 +32,6 @@ Gem::Specification.new do |spec|
32
32
  spec.add_development_dependency "bundler", "~> 1.13"
33
33
  spec.add_development_dependency "rake", "~> 10.0"
34
34
  spec.add_development_dependency "minitest", "~> 5.0"
35
+ spec.add_development_dependency "pry"
35
36
  spec.add_dependency 'nokogiri', '~> 1.6'
36
37
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: page_by_page
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9
4
+ version: 0.1.14
5
5
  platform: ruby
6
6
  authors:
7
7
  - ken
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-10-08 00:00:00.000000000 Z
11
+ date: 2020-10-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '5.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pry
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: nokogiri
57
71
  requirement: !ruby/object:Gem::Requirement
@@ -83,7 +97,10 @@ files:
83
97
  - bin/console
84
98
  - bin/setup
85
99
  - lib/page_by_page.rb
100
+ - lib/page_by_page/common.rb
86
101
  - lib/page_by_page/enum.rb
102
+ - lib/page_by_page/fetch.rb
103
+ - lib/page_by_page/jump.rb
87
104
  - lib/page_by_page/mutex_enum.rb
88
105
  - lib/page_by_page/version.rb
89
106
  - page_by_page.gemspec
@@ -107,8 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
107
124
  - !ruby/object:Gem::Version
108
125
  version: '0'
109
126
  requirements: []
110
- rubyforge_project:
111
- rubygems_version: 2.6.8
127
+ rubygems_version: 3.0.3
112
128
  signing_key:
113
129
  specification_version: 4
114
130
  summary: scrape page by page , according to url pattern