page_by_page 0.1.9 → 0.1.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/README.md +21 -1
- data/lib/page_by_page.rb +24 -70
- data/lib/page_by_page/fetch.rb +81 -0
- data/lib/page_by_page/jump.rb +45 -0
- data/lib/page_by_page/version.rb +1 -1
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 47ea21770030654ef4b0d4a7b5d3dec3c1c20d7d43b11e39c5ea7e68be86478f
|
4
|
+
data.tar.gz: 7341f8e5293250b308bb4c223a14f1ec8480513269f3b6c1ef06c5aa9f825e92
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '0912271bc28adfd2e97313bbecdeed894976eb37814c84bbbb009ca8ec10dbf04c59b25ce4716dee1701b42cbc1015e7dd83441afeb9a8d7cf7678c477328beb'
|
7
|
+
data.tar.gz: 6e46ac599c44dff52b9527de7d526bac874c166191f10e212168034b1477fead6194d1dabfddad46ec74494676e4fc7260b1f25616e5d3c6a08d0bc6250e5fab
|
data/README.md
CHANGED
@@ -20,6 +20,8 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
+
If you know the page number pattern, use fetch:
|
24
|
+
|
23
25
|
```ruby
|
24
26
|
nodes = PageByPage.fetch do
|
25
27
|
url 'https://book.douban.com/subject/25846075/comments/hot?p=<%= n %>'
|
@@ -27,12 +29,28 @@ nodes = PageByPage.fetch do
|
|
27
29
|
# from 2
|
28
30
|
# step 2
|
29
31
|
# to 100
|
32
|
+
# interval 3
|
30
33
|
# threads 4
|
31
34
|
# no_progress
|
35
|
+
# header Cookie: 'douban-fav-remind=1'
|
36
|
+
end
|
37
|
+
```
|
38
|
+
|
39
|
+
If you don't know the pattern but can see a link to the next page, use jump:
|
40
|
+
|
41
|
+
```ruby
|
42
|
+
nodes = PageByPage.jump do
|
43
|
+
start 'https://book.douban.com/subject/25846075/comments/hot'
|
44
|
+
iterate '.comment-paginator li:nth-child(3) a'
|
45
|
+
selector '.comment-item'
|
46
|
+
# to 100
|
47
|
+
# interval 3
|
48
|
+
# no_progress
|
49
|
+
# header Cookie: 'douban-fav-remind=1'
|
32
50
|
end
|
33
51
|
```
|
34
52
|
|
35
|
-
|
53
|
+
You may just pass parameters instead of a block:
|
36
54
|
|
37
55
|
```ruby
|
38
56
|
nodes = PageByPage.fetch(
|
@@ -41,7 +59,9 @@ nodes = PageByPage.fetch(
|
|
41
59
|
# from: 2,
|
42
60
|
# step: 2,
|
43
61
|
# to: 100,
|
62
|
+
# interval: 3,
|
44
63
|
# threads: 4,
|
45
64
|
# no_progress: true,
|
65
|
+
# header: {Cookie: 'douban-fav-remind=1'}
|
46
66
|
)
|
47
67
|
```
|
data/lib/page_by_page.rb
CHANGED
@@ -1,16 +1,21 @@
|
|
1
1
|
require 'page_by_page/version'
|
2
|
-
require 'page_by_page/
|
3
|
-
require 'page_by_page/
|
2
|
+
require 'page_by_page/fetch'
|
3
|
+
require 'page_by_page/jump'
|
4
4
|
require 'nokogiri'
|
5
5
|
require 'open-uri'
|
6
|
-
require 'erb'
|
7
6
|
|
8
7
|
class PageByPage
|
9
8
|
|
9
|
+
include Fetch
|
10
|
+
include Jump
|
11
|
+
|
10
12
|
class << self
|
11
|
-
def fetch(
|
12
|
-
|
13
|
-
|
13
|
+
def fetch(*args, &block)
|
14
|
+
new(*args, &block).fetch
|
15
|
+
end
|
16
|
+
|
17
|
+
def jump(*args, &block)
|
18
|
+
new(*args, &block).jump
|
14
19
|
end
|
15
20
|
end
|
16
21
|
|
@@ -21,81 +26,26 @@ class PageByPage
|
|
21
26
|
instance_eval &block if block
|
22
27
|
end
|
23
28
|
|
24
|
-
def url tmpl
|
25
|
-
@tmpl = ERB.new tmpl
|
26
|
-
end
|
27
|
-
|
28
|
-
def selector sl
|
29
|
-
@selector = sl
|
30
|
-
end
|
31
|
-
|
32
|
-
def from n
|
33
|
-
@from = n
|
34
|
-
end
|
35
|
-
|
36
|
-
def step n
|
37
|
-
@step = n
|
38
|
-
end
|
39
|
-
|
40
29
|
def to n
|
41
30
|
@to = n
|
42
31
|
end
|
43
32
|
|
44
|
-
def
|
45
|
-
@
|
33
|
+
def selector sl
|
34
|
+
@selector = sl
|
46
35
|
end
|
47
36
|
|
48
|
-
def
|
49
|
-
@
|
37
|
+
def header hash
|
38
|
+
@header = hash
|
50
39
|
end
|
51
40
|
|
52
|
-
def
|
53
|
-
|
54
|
-
unless defined? @threads
|
55
|
-
@enum = Enum.new options
|
56
|
-
_fetch
|
57
|
-
else
|
58
|
-
@enum = MutexEnum.new options
|
59
|
-
parallel_fetch
|
60
|
-
end
|
61
|
-
puts if @progress
|
62
|
-
nodes_2d.reject(&:nil?).flatten
|
41
|
+
def interval second
|
42
|
+
@interval = second
|
63
43
|
end
|
64
44
|
|
65
45
|
private
|
66
46
|
|
67
|
-
def _fetch
|
68
|
-
items, pages = [nil], []
|
69
|
-
catch :no_more do
|
70
|
-
until items.empty?
|
71
|
-
n = @enum.next
|
72
|
-
break if n > limit
|
73
|
-
url = @tmpl.result binding
|
74
|
-
doc = parse url
|
75
|
-
items = doc.css @selector
|
76
|
-
pages[n] = items
|
77
|
-
update_progress Thread.current, n if @progress
|
78
|
-
end
|
79
|
-
end
|
80
|
-
pages
|
81
|
-
end
|
82
|
-
|
83
|
-
def parallel_fetch
|
84
|
-
ts = @threads.times.map do |n|
|
85
|
-
Thread.new do
|
86
|
-
Thread.current[:sub] = _fetch
|
87
|
-
end
|
88
|
-
end
|
89
|
-
ts.each_with_object([]) do |t, pages|
|
90
|
-
t.join
|
91
|
-
t[:sub].each_with_index do |items, i|
|
92
|
-
pages[i] = items if items
|
93
|
-
end
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
47
|
def parse url
|
98
|
-
page = open(url)
|
48
|
+
page = open(url, http_header)
|
99
49
|
Nokogiri::HTML page.read
|
100
50
|
rescue OpenURI::HTTPError => e
|
101
51
|
if e.message == '404 Not Found'
|
@@ -105,8 +55,12 @@ class PageByPage
|
|
105
55
|
end
|
106
56
|
end
|
107
57
|
|
108
|
-
def
|
109
|
-
|
58
|
+
def http_header
|
59
|
+
@http_header ||= (
|
60
|
+
h = {}
|
61
|
+
Hash(@header).each_pair{ |k, v| h[k.to_s] = v }
|
62
|
+
h
|
63
|
+
)
|
110
64
|
end
|
111
65
|
|
112
66
|
def limit
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'page_by_page/enum'
|
2
|
+
require 'page_by_page/mutex_enum'
|
3
|
+
require 'erb'
|
4
|
+
|
5
|
+
class PageByPage
|
6
|
+
module Fetch
|
7
|
+
|
8
|
+
def url tmpl
|
9
|
+
@tmpl = ERB.new tmpl
|
10
|
+
end
|
11
|
+
|
12
|
+
def from n
|
13
|
+
@from = n
|
14
|
+
end
|
15
|
+
|
16
|
+
def step n
|
17
|
+
@step = n
|
18
|
+
end
|
19
|
+
|
20
|
+
def threads n
|
21
|
+
@threads = n
|
22
|
+
end
|
23
|
+
|
24
|
+
def no_progress *arg
|
25
|
+
@progress = nil
|
26
|
+
end
|
27
|
+
|
28
|
+
def fetch
|
29
|
+
nodes_2d =
|
30
|
+
unless defined? @threads
|
31
|
+
@enum = Enum.new enum_options
|
32
|
+
_fetch
|
33
|
+
else
|
34
|
+
@enum = MutexEnum.new enum_options
|
35
|
+
parallel_fetch
|
36
|
+
end
|
37
|
+
puts if @progress
|
38
|
+
nodes_2d.reject(&:nil?).flatten
|
39
|
+
end
|
40
|
+
|
41
|
+
protected
|
42
|
+
|
43
|
+
def _fetch
|
44
|
+
items, pages = [nil], []
|
45
|
+
catch :no_more do
|
46
|
+
until items.empty?
|
47
|
+
n = @enum.next
|
48
|
+
break if n > limit
|
49
|
+
|
50
|
+
url = @tmpl.result binding
|
51
|
+
doc = parse url
|
52
|
+
items = doc.css @selector
|
53
|
+
pages[n] = items
|
54
|
+
|
55
|
+
update_progress Thread.current, n if @progress
|
56
|
+
sleep @interval if @interval
|
57
|
+
end
|
58
|
+
end
|
59
|
+
pages
|
60
|
+
end
|
61
|
+
|
62
|
+
def parallel_fetch
|
63
|
+
ts = @threads.times.map do |n|
|
64
|
+
Thread.new do
|
65
|
+
Thread.current[:sub] = _fetch
|
66
|
+
end
|
67
|
+
end
|
68
|
+
ts.each_with_object([]) do |t, pages|
|
69
|
+
t.join
|
70
|
+
t[:sub].each_with_index do |items, i|
|
71
|
+
pages[i] = items if items
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
def enum_options
|
77
|
+
{from: @from, step: @step}
|
78
|
+
end
|
79
|
+
|
80
|
+
end
|
81
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
class PageByPage
|
2
|
+
module Jump
|
3
|
+
|
4
|
+
def start url
|
5
|
+
@start = url
|
6
|
+
end
|
7
|
+
|
8
|
+
def iterate selector
|
9
|
+
@iterate = selector
|
10
|
+
end
|
11
|
+
|
12
|
+
def jump
|
13
|
+
url, items, page_count = @start, [], 0
|
14
|
+
|
15
|
+
while true do
|
16
|
+
doc = parse url
|
17
|
+
doc.css(@selector).each{ |item| items << item }
|
18
|
+
|
19
|
+
next_url = doc.at_css(@iterate)
|
20
|
+
break unless next_url
|
21
|
+
|
22
|
+
path = next_url.attr('href')
|
23
|
+
url = concat_host path
|
24
|
+
|
25
|
+
page_count += 1
|
26
|
+
update_progress Thread.current, page_count if @progress
|
27
|
+
break if page_count >= limit
|
28
|
+
|
29
|
+
sleep @interval if @interval
|
30
|
+
end
|
31
|
+
|
32
|
+
items
|
33
|
+
end
|
34
|
+
|
35
|
+
private
|
36
|
+
|
37
|
+
def concat_host path
|
38
|
+
@prefix = (
|
39
|
+
regex = path.start_with?('/') ? /([^:|\/])\/.*/ : /(.*[^:|\/])\/.*/
|
40
|
+
@start.gsub(regex, '\1')
|
41
|
+
)
|
42
|
+
File.join @prefix, path
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
data/lib/page_by_page/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: page_by_page
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ken
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-02-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -84,6 +84,8 @@ files:
|
|
84
84
|
- bin/setup
|
85
85
|
- lib/page_by_page.rb
|
86
86
|
- lib/page_by_page/enum.rb
|
87
|
+
- lib/page_by_page/fetch.rb
|
88
|
+
- lib/page_by_page/jump.rb
|
87
89
|
- lib/page_by_page/mutex_enum.rb
|
88
90
|
- lib/page_by_page/version.rb
|
89
91
|
- page_by_page.gemspec
|
@@ -108,7 +110,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
108
110
|
version: '0'
|
109
111
|
requirements: []
|
110
112
|
rubyforge_project:
|
111
|
-
rubygems_version: 2.6
|
113
|
+
rubygems_version: 2.7.6
|
112
114
|
signing_key:
|
113
115
|
specification_version: 4
|
114
116
|
summary: scrape page by page , according to url pattern
|