page_by_page 0.1.9 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.md +21 -1
- data/lib/page_by_page.rb +24 -70
- data/lib/page_by_page/fetch.rb +81 -0
- data/lib/page_by_page/jump.rb +45 -0
- data/lib/page_by_page/version.rb +1 -1
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 47ea21770030654ef4b0d4a7b5d3dec3c1c20d7d43b11e39c5ea7e68be86478f
|
4
|
+
data.tar.gz: 7341f8e5293250b308bb4c223a14f1ec8480513269f3b6c1ef06c5aa9f825e92
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '0912271bc28adfd2e97313bbecdeed894976eb37814c84bbbb009ca8ec10dbf04c59b25ce4716dee1701b42cbc1015e7dd83441afeb9a8d7cf7678c477328beb'
|
7
|
+
data.tar.gz: 6e46ac599c44dff52b9527de7d526bac874c166191f10e212168034b1477fead6194d1dabfddad46ec74494676e4fc7260b1f25616e5d3c6a08d0bc6250e5fab
|
data/README.md
CHANGED
@@ -20,6 +20,8 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
+
If you know the page number pattern, use fetch:
|
24
|
+
|
23
25
|
```ruby
|
24
26
|
nodes = PageByPage.fetch do
|
25
27
|
url 'https://book.douban.com/subject/25846075/comments/hot?p=<%= n %>'
|
@@ -27,12 +29,28 @@ nodes = PageByPage.fetch do
|
|
27
29
|
# from 2
|
28
30
|
# step 2
|
29
31
|
# to 100
|
32
|
+
# interval 3
|
30
33
|
# threads 4
|
31
34
|
# no_progress
|
35
|
+
# header Cookie: 'douban-fav-remind=1'
|
36
|
+
end
|
37
|
+
```
|
38
|
+
|
39
|
+
If you don't know the pattern, but you can see a link to the next page, use jump:
|
40
|
+
|
41
|
+
```ruby
|
42
|
+
nodes = PageByPage.jump do
|
43
|
+
start 'https://book.douban.com/subject/25846075/comments/hot'
|
44
|
+
iterate '.comment-paginator li:nth-child(3) a'
|
45
|
+
selector '.comment-item'
|
46
|
+
# to 100
|
47
|
+
# interval 3
|
48
|
+
# no_progress
|
49
|
+
# header Cookie: 'douban-fav-remind=1'
|
32
50
|
end
|
33
51
|
```
|
34
52
|
|
35
|
-
|
53
|
+
You may just pass parameters instead of a block:
|
36
54
|
|
37
55
|
```ruby
|
38
56
|
nodes = PageByPage.fetch(
|
@@ -41,7 +59,9 @@ nodes = PageByPage.fetch(
|
|
41
59
|
# from: 2,
|
42
60
|
# step: 2,
|
43
61
|
# to: 100,
|
62
|
+
# interval: 3,
|
44
63
|
# threads: 4,
|
45
64
|
# no_progress: true,
|
65
|
+
# header: {Cookie: 'douban-fav-remind=1'}
|
46
66
|
)
|
47
67
|
```
|
data/lib/page_by_page.rb
CHANGED
@@ -1,16 +1,21 @@
|
|
1
1
|
require 'page_by_page/version'
|
2
|
-
require 'page_by_page/
|
3
|
-
require 'page_by_page/
|
2
|
+
require 'page_by_page/fetch'
|
3
|
+
require 'page_by_page/jump'
|
4
4
|
require 'nokogiri'
|
5
5
|
require 'open-uri'
|
6
|
-
require 'erb'
|
7
6
|
|
8
7
|
class PageByPage
|
9
8
|
|
9
|
+
include Fetch
|
10
|
+
include Jump
|
11
|
+
|
10
12
|
class << self
|
11
|
-
def fetch(
|
12
|
-
|
13
|
-
|
13
|
+
def fetch(*args, &block)
|
14
|
+
new(*args, &block).fetch
|
15
|
+
end
|
16
|
+
|
17
|
+
def jump(*args, &block)
|
18
|
+
new(*args, &block).jump
|
14
19
|
end
|
15
20
|
end
|
16
21
|
|
@@ -21,81 +26,26 @@ class PageByPage
|
|
21
26
|
instance_eval &block if block
|
22
27
|
end
|
23
28
|
|
24
|
-
def url tmpl
|
25
|
-
@tmpl = ERB.new tmpl
|
26
|
-
end
|
27
|
-
|
28
|
-
def selector sl
|
29
|
-
@selector = sl
|
30
|
-
end
|
31
|
-
|
32
|
-
def from n
|
33
|
-
@from = n
|
34
|
-
end
|
35
|
-
|
36
|
-
def step n
|
37
|
-
@step = n
|
38
|
-
end
|
39
|
-
|
40
29
|
def to n
|
41
30
|
@to = n
|
42
31
|
end
|
43
32
|
|
44
|
-
def
|
45
|
-
@
|
33
|
+
def selector sl
|
34
|
+
@selector = sl
|
46
35
|
end
|
47
36
|
|
48
|
-
def
|
49
|
-
@
|
37
|
+
def header hash
|
38
|
+
@header = hash
|
50
39
|
end
|
51
40
|
|
52
|
-
def
|
53
|
-
|
54
|
-
unless defined? @threads
|
55
|
-
@enum = Enum.new options
|
56
|
-
_fetch
|
57
|
-
else
|
58
|
-
@enum = MutexEnum.new options
|
59
|
-
parallel_fetch
|
60
|
-
end
|
61
|
-
puts if @progress
|
62
|
-
nodes_2d.reject(&:nil?).flatten
|
41
|
+
def interval second
|
42
|
+
@interval = second
|
63
43
|
end
|
64
44
|
|
65
45
|
private
|
66
46
|
|
67
|
-
def _fetch
|
68
|
-
items, pages = [nil], []
|
69
|
-
catch :no_more do
|
70
|
-
until items.empty?
|
71
|
-
n = @enum.next
|
72
|
-
break if n > limit
|
73
|
-
url = @tmpl.result binding
|
74
|
-
doc = parse url
|
75
|
-
items = doc.css @selector
|
76
|
-
pages[n] = items
|
77
|
-
update_progress Thread.current, n if @progress
|
78
|
-
end
|
79
|
-
end
|
80
|
-
pages
|
81
|
-
end
|
82
|
-
|
83
|
-
def parallel_fetch
|
84
|
-
ts = @threads.times.map do |n|
|
85
|
-
Thread.new do
|
86
|
-
Thread.current[:sub] = _fetch
|
87
|
-
end
|
88
|
-
end
|
89
|
-
ts.each_with_object([]) do |t, pages|
|
90
|
-
t.join
|
91
|
-
t[:sub].each_with_index do |items, i|
|
92
|
-
pages[i] = items if items
|
93
|
-
end
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
47
|
def parse url
|
98
|
-
page = open(url)
|
48
|
+
page = open(url, http_header)
|
99
49
|
Nokogiri::HTML page.read
|
100
50
|
rescue OpenURI::HTTPError => e
|
101
51
|
if e.message == '404 Not Found'
|
@@ -105,8 +55,12 @@ class PageByPage
|
|
105
55
|
end
|
106
56
|
end
|
107
57
|
|
108
|
-
def
|
109
|
-
|
58
|
+
def http_header
|
59
|
+
@http_header ||= (
|
60
|
+
h = {}
|
61
|
+
Hash(@header).each_pair{ |k, v| h[k.to_s] = v }
|
62
|
+
h
|
63
|
+
)
|
110
64
|
end
|
111
65
|
|
112
66
|
def limit
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'page_by_page/enum'
|
2
|
+
require 'page_by_page/mutex_enum'
|
3
|
+
require 'erb'
|
4
|
+
|
5
|
+
class PageByPage
  # Pagination strategy for sites whose page URLs follow a known numeric
  # pattern. Page numbers come from @enum, each one is rendered into the URL
  # template, and the nodes matching @selector are collected page by page.
  #
  # The including class must provide +parse+, +limit+ and +update_progress+,
  # and is where @selector, @progress and @interval are set.
  module Fetch

    # Stores the page URL as an ERB template. The template is rendered inside
    # +_fetch+, where the local variable +n+ holds the current page number.
    def url tmpl
      @tmpl = ERB.new tmpl
    end

    # Sets the value passed to the enumerator as :from.
    def from n
      @from = n
    end

    # Sets the value passed to the enumerator as :step.
    def step n
      @step = n
    end

    # Sets the number of worker threads; merely defining it switches +fetch+
    # to the threaded code path.
    def threads n
      @threads = n
    end

    # Turns progress output off. Any arguments are accepted and ignored.
    def no_progress *arg
      @progress = nil
    end

    # Runs the scrape and returns a flat Array of every matched node.
    # Pages that produced no entry (nil slots) are dropped before flattening.
    def fetch
      nodes_2d =
        if defined? @threads
          # presumably MutexEnum serialises access to the page counter
          # across threads -- confirm against mutex_enum.rb
          @enum = MutexEnum.new enum_options
          parallel_fetch
        else
          @enum = Enum.new enum_options
          _fetch
        end
      puts if @progress
      nodes_2d.compact.flatten
    end

    protected

    # Fetches pages one after another until a page yields no items or the
    # page number exceeds +limit+. Returns an Array indexed by page number.
    def _fetch
      items, pages = [nil], []
      # NOTE(review): :no_more is presumably thrown by +parse+ when a page
      # does not exist -- confirm against the main class.
      catch :no_more do
        until items.empty?
          # +n+ must keep this name: the ERB template reads it via +binding+.
          n = @enum.next
          break if n > limit

          url = @tmpl.result binding
          doc = parse url
          items = doc.css @selector
          pages[n] = items

          update_progress Thread.current, n if @progress
          sleep @interval if @interval
        end
      end
      pages
    end

    # Runs +_fetch+ in @threads threads that share @enum, then merges the
    # per-thread page arrays into one Array indexed by page number.
    def parallel_fetch
      workers = @threads.times.map do
        Thread.new { Thread.current[:sub] = _fetch }
      end

      merged = []
      workers.each do |worker|
        worker.join
        worker[:sub].each_with_index do |items, i|
          merged[i] = items if items
        end
      end
      merged
    end

    # Options handed to Enum / MutexEnum.
    def enum_options
      {from: @from, step: @step}
    end

  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'uri'

class PageByPage
  # Pagination strategy for sites whose page-number pattern is unknown:
  # start from one URL and keep following the "next page" link.
  #
  # The including class must provide +parse+, +limit+ and +update_progress+,
  # and is where @selector, @progress and @interval are set.
  module Jump

    # Sets the URL of the first page to visit.
    def start url
      @start = url
    end

    # Sets the CSS selector that locates the link to the next page.
    def iterate selector
      @iterate = selector
    end

    # Visits pages starting at @start, collecting every node that matches
    # @selector, and follows the first element matching @iterate to the next
    # page. Stops when no such element exists, when it carries no usable
    # href, or when the number of followed links reaches +limit+.
    #
    # Returns a flat Array of the collected nodes.
    def jump
      url, items, page_count = @start, [], 0

      loop do
        doc = parse url
        doc.css(@selector).each{ |item| items << item }

        next_link = doc.at_css(@iterate)
        break unless next_link

        # An element without an href (or with a blank one) cannot be followed.
        href = next_link.attr('href').to_s.strip
        break if href.empty?

        # Resolve against the page the link was found on, not the first page,
        # so relative links keep working after the first jump.
        url = concat_host href, url

        page_count += 1
        update_progress Thread.current, page_count if @progress
        break if page_count >= limit

        sleep @interval if @interval
      end

      items
    end

    private

    # Resolves +path+ (an href as written in the page) against +base+ using
    # standard URI reference resolution. This covers absolute URLs, absolute
    # paths, relative paths and query-only references such as "?p=2".
    #
    # path - String href taken from the page.
    # base - String URL the href is relative to (defaults to the start URL).
    #
    # Returns the absolute URL as a String.
    # Raises URI::InvalidURIError if either argument is not a valid URI.
    def concat_host path, base = @start
      URI.join(base, path).to_s
    end
  end
end
|
data/lib/page_by_page/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: page_by_page
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ken
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-02-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -84,6 +84,8 @@ files:
|
|
84
84
|
- bin/setup
|
85
85
|
- lib/page_by_page.rb
|
86
86
|
- lib/page_by_page/enum.rb
|
87
|
+
- lib/page_by_page/fetch.rb
|
88
|
+
- lib/page_by_page/jump.rb
|
87
89
|
- lib/page_by_page/mutex_enum.rb
|
88
90
|
- lib/page_by_page/version.rb
|
89
91
|
- page_by_page.gemspec
|
@@ -108,7 +110,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
108
110
|
version: '0'
|
109
111
|
requirements: []
|
110
112
|
rubyforge_project:
|
111
|
-
rubygems_version: 2.6
|
113
|
+
rubygems_version: 2.7.6
|
112
114
|
signing_key:
|
113
115
|
specification_version: 4
|
114
116
|
summary: scrape page by page , according to url pattern
|