page_by_page 0.1.9 → 0.1.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: b7b7ac7eab723d81c77692420ccd3fd3e6a69d55
4
- data.tar.gz: cfc4977c9d1e3f635e0bc0bded551eb29ee97195
2
+ SHA256:
3
+ metadata.gz: 47ea21770030654ef4b0d4a7b5d3dec3c1c20d7d43b11e39c5ea7e68be86478f
4
+ data.tar.gz: 7341f8e5293250b308bb4c223a14f1ec8480513269f3b6c1ef06c5aa9f825e92
5
5
  SHA512:
6
- metadata.gz: 1840cd52ed71ad5f21c9647a75a9a8630d5df17c6c731e16220d7b8c92361711fe2fdc8eefa2afa458dc106342c133f46c9fde0246e21d6cd0cb7fac279e7be9
7
- data.tar.gz: c9625120e03c85610165c6f0b791a9f00ba9939c31c3097371bcd0a0756f62afab99001b976bfbb23beffec977eb3a3158afcffa3e3b383a2c7817c0a84748f1
6
+ metadata.gz: '0912271bc28adfd2e97313bbecdeed894976eb37814c84bbbb009ca8ec10dbf04c59b25ce4716dee1701b42cbc1015e7dd83441afeb9a8d7cf7678c477328beb'
7
+ data.tar.gz: 6e46ac599c44dff52b9527de7d526bac874c166191f10e212168034b1477fead6194d1dabfddad46ec74494676e4fc7260b1f25616e5d3c6a08d0bc6250e5fab
data/README.md CHANGED
@@ -20,6 +20,8 @@ Or install it yourself as:
20
20
 
21
21
  ## Usage
22
22
 
23
+ If you know the page number pattern, use fetch:
24
+
23
25
  ```ruby
24
26
  nodes = PageByPage.fetch do
25
27
  url 'https://book.douban.com/subject/25846075/comments/hot?p=<%= n %>'
@@ -27,12 +29,28 @@ nodes = PageByPage.fetch do
27
29
  # from 2
28
30
  # step 2
29
31
  # to 100
32
+ # interval 3
30
33
  # threads 4
31
34
  # no_progress
35
+ # header Cookie: 'douban-fav-remind=1'
36
+ end
37
+ ```
38
+
39
+ If you don't know the pattern but can see a link to the next page, use jump:
40
+
41
+ ```ruby
42
+ nodes = PageByPage.jump do
43
+ start 'https://book.douban.com/subject/25846075/comments/hot'
44
+ iterate '.comment-paginator li:nth-child(3) a'
45
+ selector '.comment-item'
46
+ # to 100
47
+ # interval 3
48
+ # no_progress
49
+ # header Cookie: 'douban-fav-remind=1'
32
50
  end
33
51
  ```
34
52
 
35
- Or just passing parameters:
53
+ You may also pass parameters instead of a block:
36
54
 
37
55
  ```ruby
38
56
  nodes = PageByPage.fetch(
@@ -41,7 +59,9 @@ nodes = PageByPage.fetch(
41
59
  # from: 2,
42
60
  # step: 2,
43
61
  # to: 100,
62
+ # interval: 3,
44
63
  # threads: 4,
45
64
  # no_progress: true
65
+ # header: {Cookie: 'douban-fav-remind=1'}
46
66
  )
47
67
  ```
data/lib/page_by_page.rb CHANGED
@@ -1,16 +1,21 @@
1
1
  require 'page_by_page/version'
2
- require 'page_by_page/enum'
3
- require 'page_by_page/mutex_enum'
2
+ require 'page_by_page/fetch'
3
+ require 'page_by_page/jump'
4
4
  require 'nokogiri'
5
5
  require 'open-uri'
6
- require 'erb'
7
6
 
8
7
  class PageByPage
9
8
 
9
+ include Fetch
10
+ include Jump
11
+
10
12
  class << self
11
- def fetch(opt ={}, &block)
12
- pbp = self.new(opt, &block)
13
- pbp.fetch
13
+ def fetch(*args, &block)
14
+ new(*args, &block).fetch
15
+ end
16
+
17
+ def jump(*args, &block)
18
+ new(*args, &block).jump
14
19
  end
15
20
  end
16
21
 
@@ -21,81 +26,26 @@ class PageByPage
21
26
  instance_eval &block if block
22
27
  end
23
28
 
24
- def url tmpl
25
- @tmpl = ERB.new tmpl
26
- end
27
-
28
- def selector sl
29
- @selector = sl
30
- end
31
-
32
- def from n
33
- @from = n
34
- end
35
-
36
- def step n
37
- @step = n
38
- end
39
-
40
29
  def to n
41
30
  @to = n
42
31
  end
43
32
 
44
- def threads n
45
- @threads = n
33
+ def selector sl
34
+ @selector = sl
46
35
  end
47
36
 
48
- def no_progress *arg
49
- @progress = nil
37
+ def header hash
38
+ @header = hash
50
39
  end
51
40
 
52
- def fetch
53
- nodes_2d =
54
- unless defined? @threads
55
- @enum = Enum.new options
56
- _fetch
57
- else
58
- @enum = MutexEnum.new options
59
- parallel_fetch
60
- end
61
- puts if @progress
62
- nodes_2d.reject(&:nil?).flatten
41
+ def interval second
42
+ @interval = second
63
43
  end
64
44
 
65
45
  private
66
46
 
67
- def _fetch
68
- items, pages = [nil], []
69
- catch :no_more do
70
- until items.empty?
71
- n = @enum.next
72
- break if n > limit
73
- url = @tmpl.result binding
74
- doc = parse url
75
- items = doc.css @selector
76
- pages[n] = items
77
- update_progress Thread.current, n if @progress
78
- end
79
- end
80
- pages
81
- end
82
-
83
- def parallel_fetch
84
- ts = @threads.times.map do |n|
85
- Thread.new do
86
- Thread.current[:sub] = _fetch
87
- end
88
- end
89
- ts.each_with_object([]) do |t, pages|
90
- t.join
91
- t[:sub].each_with_index do |items, i|
92
- pages[i] = items if items
93
- end
94
- end
95
- end
96
-
97
47
  def parse url
98
- page = open(url)
48
+ page = open(url, http_header)
99
49
  Nokogiri::HTML page.read
100
50
  rescue OpenURI::HTTPError => e
101
51
  if e.message == '404 Not Found'
@@ -105,8 +55,12 @@ class PageByPage
105
55
  end
106
56
  end
107
57
 
108
- def options
109
- {from: @from, step: @step}
58
+ def http_header
59
+ @http_header ||= (
60
+ h = {}
61
+ Hash(@header).each_pair{ |k, v| h[k.to_s] = v }
62
+ h
63
+ )
110
64
  end
111
65
 
112
66
  def limit
@@ -0,0 +1,81 @@
1
+ require 'page_by_page/enum'
2
+ require 'page_by_page/mutex_enum'
3
+ require 'erb'
4
+
5
+ class PageByPage
6
+ module Fetch
7
+
8
+ def url tmpl
9
+ @tmpl = ERB.new tmpl
10
+ end
11
+
12
+ def from n
13
+ @from = n
14
+ end
15
+
16
+ def step n
17
+ @step = n
18
+ end
19
+
20
+ def threads n
21
+ @threads = n
22
+ end
23
+
24
+ def no_progress *arg
25
+ @progress = nil
26
+ end
27
+
28
+ def fetch
29
+ nodes_2d =
30
+ unless defined? @threads
31
+ @enum = Enum.new enum_options
32
+ _fetch
33
+ else
34
+ @enum = MutexEnum.new enum_options
35
+ parallel_fetch
36
+ end
37
+ puts if @progress
38
+ nodes_2d.reject(&:nil?).flatten
39
+ end
40
+
41
+ protected
42
+
43
+ def _fetch
44
+ items, pages = [nil], []
45
+ catch :no_more do
46
+ until items.empty?
47
+ n = @enum.next
48
+ break if n > limit
49
+
50
+ url = @tmpl.result binding
51
+ doc = parse url
52
+ items = doc.css @selector
53
+ pages[n] = items
54
+
55
+ update_progress Thread.current, n if @progress
56
+ sleep @interval if @interval
57
+ end
58
+ end
59
+ pages
60
+ end
61
+
62
+ def parallel_fetch
63
+ ts = @threads.times.map do |n|
64
+ Thread.new do
65
+ Thread.current[:sub] = _fetch
66
+ end
67
+ end
68
+ ts.each_with_object([]) do |t, pages|
69
+ t.join
70
+ t[:sub].each_with_index do |items, i|
71
+ pages[i] = items if items
72
+ end
73
+ end
74
+ end
75
+
76
+ def enum_options
77
+ {from: @from, step: @step}
78
+ end
79
+
80
+ end
81
+ end
@@ -0,0 +1,45 @@
1
+ class PageByPage
2
+ module Jump
3
+
4
+ def start url
5
+ @start = url
6
+ end
7
+
8
+ def iterate selector
9
+ @iterate = selector
10
+ end
11
+
12
+ def jump
13
+ url, items, page_count = @start, [], 0
14
+
15
+ while true do
16
+ doc = parse url
17
+ doc.css(@selector).each{ |item| items << item }
18
+
19
+ next_url = doc.at_css(@iterate)
20
+ break unless next_url
21
+
22
+ path = next_url.attr('href')
23
+ url = concat_host path
24
+
25
+ page_count += 1
26
+ update_progress Thread.current, page_count if @progress
27
+ break if page_count >= limit
28
+
29
+ sleep @interval if @interval
30
+ end
31
+
32
+ items
33
+ end
34
+
35
+ private
36
+
37
+ def concat_host path
38
+ @prefix = (
39
+ regex = path.start_with?('/') ? /([^:|\/])\/.*/ : /(.*[^:|\/])\/.*/
40
+ @start.gsub(regex, '\1')
41
+ )
42
+ File.join @prefix, path
43
+ end
44
+ end
45
+ end
@@ -1,3 +1,3 @@
1
1
  class PageByPage
2
- VERSION = "0.1.9"
2
+ VERSION = "0.1.10"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: page_by_page
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9
4
+ version: 0.1.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - ken
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-10-08 00:00:00.000000000 Z
11
+ date: 2019-02-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -84,6 +84,8 @@ files:
84
84
  - bin/setup
85
85
  - lib/page_by_page.rb
86
86
  - lib/page_by_page/enum.rb
87
+ - lib/page_by_page/fetch.rb
88
+ - lib/page_by_page/jump.rb
87
89
  - lib/page_by_page/mutex_enum.rb
88
90
  - lib/page_by_page/version.rb
89
91
  - page_by_page.gemspec
@@ -108,7 +110,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
108
110
  version: '0'
109
111
  requirements: []
110
112
  rubyforge_project:
111
- rubygems_version: 2.6.8
113
+ rubygems_version: 2.7.6
112
114
  signing_key:
113
115
  specification_version: 4
114
116
  summary: scrape page by page , according to url pattern