page_by_page 0.1.9 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: b7b7ac7eab723d81c77692420ccd3fd3e6a69d55
4
- data.tar.gz: cfc4977c9d1e3f635e0bc0bded551eb29ee97195
2
+ SHA256:
3
+ metadata.gz: 47ea21770030654ef4b0d4a7b5d3dec3c1c20d7d43b11e39c5ea7e68be86478f
4
+ data.tar.gz: 7341f8e5293250b308bb4c223a14f1ec8480513269f3b6c1ef06c5aa9f825e92
5
5
  SHA512:
6
- metadata.gz: 1840cd52ed71ad5f21c9647a75a9a8630d5df17c6c731e16220d7b8c92361711fe2fdc8eefa2afa458dc106342c133f46c9fde0246e21d6cd0cb7fac279e7be9
7
- data.tar.gz: c9625120e03c85610165c6f0b791a9f00ba9939c31c3097371bcd0a0756f62afab99001b976bfbb23beffec977eb3a3158afcffa3e3b383a2c7817c0a84748f1
6
+ metadata.gz: '0912271bc28adfd2e97313bbecdeed894976eb37814c84bbbb009ca8ec10dbf04c59b25ce4716dee1701b42cbc1015e7dd83441afeb9a8d7cf7678c477328beb'
7
+ data.tar.gz: 6e46ac599c44dff52b9527de7d526bac874c166191f10e212168034b1477fead6194d1dabfddad46ec74494676e4fc7260b1f25616e5d3c6a08d0bc6250e5fab
data/README.md CHANGED
@@ -20,6 +20,8 @@ Or install it yourself as:
20
20
 
21
21
  ## Usage
22
22
 
23
+ If you know the page number pattern, use fetch:
24
+
23
25
  ```ruby
24
26
  nodes = PageByPage.fetch do
25
27
  url 'https://book.douban.com/subject/25846075/comments/hot?p=<%= n %>'
@@ -27,12 +29,28 @@ nodes = PageByPage.fetch do
27
29
  # from 2
28
30
  # step 2
29
31
  # to 100
32
+ # interval 3
30
33
  # threads 4
31
34
  # no_progress
35
+ # header Cookie: 'douban-fav-remind=1'
36
+ end
37
+ ```
38
+
39
+ If you don't know the pattern, but you can see a link to the next page, use jump:
40
+
41
+ ```ruby
42
+ nodes = PageByPage.jump do
43
+ start 'https://book.douban.com/subject/25846075/comments/hot'
44
+ iterate '.comment-paginator li:nth-child(3) a'
45
+ selector '.comment-item'
46
+ # to 100
47
+ # interval 3
48
+ # no_progress
49
+ # header Cookie: 'douban-fav-remind=1'
32
50
  end
33
51
  ```
34
52
 
35
- Or just passing parameters:
53
+ You may just pass parameters instead of a block:
36
54
 
37
55
  ```ruby
38
56
  nodes = PageByPage.fetch(
@@ -41,7 +59,9 @@ nodes = PageByPage.fetch(
41
59
  # from: 2,
42
60
  # step: 2,
43
61
  # to: 100,
62
+ # interval: 3,
44
63
  # threads: 4,
45
64
  # no_progress: true
65
+ # header: {Cookie: 'douban-fav-remind=1'}
46
66
  )
47
67
  ```
data/lib/page_by_page.rb CHANGED
@@ -1,16 +1,21 @@
1
1
  require 'page_by_page/version'
2
- require 'page_by_page/enum'
3
- require 'page_by_page/mutex_enum'
2
+ require 'page_by_page/fetch'
3
+ require 'page_by_page/jump'
4
4
  require 'nokogiri'
5
5
  require 'open-uri'
6
- require 'erb'
7
6
 
8
7
  class PageByPage
9
8
 
9
+ include Fetch
10
+ include Jump
11
+
10
12
  class << self
11
- def fetch(opt ={}, &block)
12
- pbp = self.new(opt, &block)
13
- pbp.fetch
13
+ def fetch(*args, &block)
14
+ new(*args, &block).fetch
15
+ end
16
+
17
+ def jump(*args, &block)
18
+ new(*args, &block).jump
14
19
  end
15
20
  end
16
21
 
@@ -21,81 +26,26 @@ class PageByPage
21
26
  instance_eval &block if block
22
27
  end
23
28
 
24
- def url tmpl
25
- @tmpl = ERB.new tmpl
26
- end
27
-
28
- def selector sl
29
- @selector = sl
30
- end
31
-
32
- def from n
33
- @from = n
34
- end
35
-
36
- def step n
37
- @step = n
38
- end
39
-
40
29
  def to n
41
30
  @to = n
42
31
  end
43
32
 
44
- def threads n
45
- @threads = n
33
+ def selector sl
34
+ @selector = sl
46
35
  end
47
36
 
48
- def no_progress *arg
49
- @progress = nil
37
+ def header hash
38
+ @header = hash
50
39
  end
51
40
 
52
- def fetch
53
- nodes_2d =
54
- unless defined? @threads
55
- @enum = Enum.new options
56
- _fetch
57
- else
58
- @enum = MutexEnum.new options
59
- parallel_fetch
60
- end
61
- puts if @progress
62
- nodes_2d.reject(&:nil?).flatten
41
+ def interval second
42
+ @interval = second
63
43
  end
64
44
 
65
45
  private
66
46
 
67
- def _fetch
68
- items, pages = [nil], []
69
- catch :no_more do
70
- until items.empty?
71
- n = @enum.next
72
- break if n > limit
73
- url = @tmpl.result binding
74
- doc = parse url
75
- items = doc.css @selector
76
- pages[n] = items
77
- update_progress Thread.current, n if @progress
78
- end
79
- end
80
- pages
81
- end
82
-
83
- def parallel_fetch
84
- ts = @threads.times.map do |n|
85
- Thread.new do
86
- Thread.current[:sub] = _fetch
87
- end
88
- end
89
- ts.each_with_object([]) do |t, pages|
90
- t.join
91
- t[:sub].each_with_index do |items, i|
92
- pages[i] = items if items
93
- end
94
- end
95
- end
96
-
97
47
  def parse url
98
- page = open(url)
48
+ page = open(url, http_header)
99
49
  Nokogiri::HTML page.read
100
50
  rescue OpenURI::HTTPError => e
101
51
  if e.message == '404 Not Found'
@@ -105,8 +55,12 @@ class PageByPage
105
55
  end
106
56
  end
107
57
 
108
- def options
109
- {from: @from, step: @step}
58
+ def http_header
59
+ @http_header ||= (
60
+ h = {}
61
+ Hash(@header).each_pair{ |k, v| h[k.to_s] = v }
62
+ h
63
+ )
110
64
  end
111
65
 
112
66
  def limit
@@ -0,0 +1,81 @@
1
+ require 'page_by_page/enum'
2
+ require 'page_by_page/mutex_enum'
3
+ require 'erb'
4
+
5
+ class PageByPage
6
+ module Fetch
7
+
8
+ def url tmpl
9
+ @tmpl = ERB.new tmpl
10
+ end
11
+
12
+ def from n
13
+ @from = n
14
+ end
15
+
16
+ def step n
17
+ @step = n
18
+ end
19
+
20
+ def threads n
21
+ @threads = n
22
+ end
23
+
24
+ def no_progress *arg
25
+ @progress = nil
26
+ end
27
+
28
+ def fetch
29
+ nodes_2d =
30
+ unless defined? @threads
31
+ @enum = Enum.new enum_options
32
+ _fetch
33
+ else
34
+ @enum = MutexEnum.new enum_options
35
+ parallel_fetch
36
+ end
37
+ puts if @progress
38
+ nodes_2d.reject(&:nil?).flatten
39
+ end
40
+
41
+ protected
42
+
43
+ def _fetch
44
+ items, pages = [nil], []
45
+ catch :no_more do
46
+ until items.empty?
47
+ n = @enum.next
48
+ break if n > limit
49
+
50
+ url = @tmpl.result binding
51
+ doc = parse url
52
+ items = doc.css @selector
53
+ pages[n] = items
54
+
55
+ update_progress Thread.current, n if @progress
56
+ sleep @interval if @interval
57
+ end
58
+ end
59
+ pages
60
+ end
61
+
62
+ def parallel_fetch
63
+ ts = @threads.times.map do |n|
64
+ Thread.new do
65
+ Thread.current[:sub] = _fetch
66
+ end
67
+ end
68
+ ts.each_with_object([]) do |t, pages|
69
+ t.join
70
+ t[:sub].each_with_index do |items, i|
71
+ pages[i] = items if items
72
+ end
73
+ end
74
+ end
75
+
76
+ def enum_options
77
+ {from: @from, step: @step}
78
+ end
79
+
80
+ end
81
+ end
@@ -0,0 +1,45 @@
1
+ class PageByPage
2
+ module Jump
3
+
4
+ def start url
5
+ @start = url
6
+ end
7
+
8
+ def iterate selector
9
+ @iterate = selector
10
+ end
11
+
12
+ def jump
13
+ url, items, page_count = @start, [], 0
14
+
15
+ while true do
16
+ doc = parse url
17
+ doc.css(@selector).each{ |item| items << item }
18
+
19
+ next_url = doc.at_css(@iterate)
20
+ break unless next_url
21
+
22
+ path = next_url.attr('href')
23
+ url = concat_host path
24
+
25
+ page_count += 1
26
+ update_progress Thread.current, page_count if @progress
27
+ break if page_count >= limit
28
+
29
+ sleep @interval if @interval
30
+ end
31
+
32
+ items
33
+ end
34
+
35
+ private
36
+
37
+ def concat_host path
38
+ @prefix = (
39
+ regex = path.start_with?('/') ? /([^:|\/])\/.*/ : /(.*[^:|\/])\/.*/
40
+ @start.gsub(regex, '\1')
41
+ )
42
+ File.join @prefix, path
43
+ end
44
+ end
45
+ end
@@ -1,3 +1,3 @@
1
1
  class PageByPage
2
- VERSION = "0.1.9"
2
+ VERSION = "0.1.10"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: page_by_page
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9
4
+ version: 0.1.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - ken
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-10-08 00:00:00.000000000 Z
11
+ date: 2019-02-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -84,6 +84,8 @@ files:
84
84
  - bin/setup
85
85
  - lib/page_by_page.rb
86
86
  - lib/page_by_page/enum.rb
87
+ - lib/page_by_page/fetch.rb
88
+ - lib/page_by_page/jump.rb
87
89
  - lib/page_by_page/mutex_enum.rb
88
90
  - lib/page_by_page/version.rb
89
91
  - page_by_page.gemspec
@@ -108,7 +110,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
108
110
  version: '0'
109
111
  requirements: []
110
112
  rubyforge_project:
111
- rubygems_version: 2.6.8
113
+ rubygems_version: 2.7.6
112
114
  signing_key:
113
115
  specification_version: 4
114
116
  summary: scrape page by page , according to url pattern