page_by_page 0.1.9 → 0.1.14
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/README.md +49 -1
- data/bin/console +2 -2
- data/lib/page_by_page.rb +9 -109
- data/lib/page_by_page/common.rb +64 -0
- data/lib/page_by_page/enum.rb +4 -4
- data/lib/page_by_page/fetch.rb +106 -0
- data/lib/page_by_page/jump.rb +50 -0
- data/lib/page_by_page/mutex_enum.rb +4 -4
- data/lib/page_by_page/version.rb +2 -2
- data/page_by_page.gemspec +1 -0
- metadata +20 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 51cda3ebfc8bc9a353a51f1386a91cf6ea3d2fbbc219334f278844765b35ad98
|
4
|
+
data.tar.gz: 8329a8af72dd945284617f053cbb1ae686cd5eb9151dd1cf9be0c99b7cdd12ac
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6761cb5c91ed8736be1193237caa2725110070b9de332af5bd6175371064b3b3d939ba997634c8e3711f8f1d81c7dc3988436d807858951c6fd398e39249b8c7
|
7
|
+
data.tar.gz: 1cee71ddc3dab888aaf75a8cf7139bfd630ef63c29dc954d00c5cdac7c4a251e78a19dedc56ea3e6b8713289b514cabe2519322efc5accd977fb6084a1768088
|
data/README.md
CHANGED
@@ -20,6 +20,10 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
+
### number pattern
|
24
|
+
|
25
|
+
If you know the page number pattern, use `fetch`:
|
26
|
+
|
23
27
|
```ruby
|
24
28
|
nodes = PageByPage.fetch do
|
25
29
|
url 'https://book.douban.com/subject/25846075/comments/hot?p=<%= n %>'
|
@@ -27,12 +31,44 @@ nodes = PageByPage.fetch do
|
|
27
31
|
# from 2
|
28
32
|
# step 2
|
29
33
|
# to 100
|
34
|
+
# interval 3
|
30
35
|
# threads 4
|
31
36
|
# no_progress
|
37
|
+
# header Cookie: 'douban-fav-remind=1'
|
38
|
+
end
|
39
|
+
```
|
40
|
+
|
41
|
+
### other pattern
|
42
|
+
|
43
|
+
If the pattern is not simple numbers, use `enumerator` in `fetch`:
|
44
|
+
|
45
|
+
```ruby
|
46
|
+
nodes = PageByPage.fetch do
|
47
|
+
url 'http://mysql.taobao.org/monthly/<%= n %>'
|
48
|
+
selector 'h3'
|
49
|
+
enumerator ['2020/09/', '2020/08/'].to_enum
|
50
|
+
end
|
51
|
+
```
|
52
|
+
|
53
|
+
### unknown pattern
|
54
|
+
|
55
|
+
If you don't know the pattern, but you see a link to the next page, use `jump`:
|
56
|
+
|
57
|
+
```ruby
|
58
|
+
nodes = PageByPage.jump do
|
59
|
+
start 'https://book.douban.com/subject/25846075/comments/hot'
|
60
|
+
iterate '.comment-paginator li:nth-child(3) a'
|
61
|
+
selector '.comment-item'
|
62
|
+
# to 100
|
63
|
+
# interval 3
|
64
|
+
# no_progress
|
65
|
+
# header Cookie: 'douban-fav-remind=1'
|
32
66
|
end
|
33
67
|
```
|
34
68
|
|
35
|
-
|
69
|
+
### parameters instead of block
|
70
|
+
|
71
|
+
You may just pass parameters instead of a block:
|
36
72
|
|
37
73
|
```ruby
|
38
74
|
nodes = PageByPage.fetch(
|
@@ -41,7 +77,19 @@ nodes = PageByPage.fetch(
|
|
41
77
|
# from: 2,
|
42
78
|
# step: 2,
|
43
79
|
# to: 100,
|
80
|
+
# interval: 3,
|
44
81
|
# threads: 4,
|
45
82
|
# no_progress: true,
|
83
|
+
# header: {Cookie: 'douban-fav-remind=1'}
|
84
|
+
)
|
85
|
+
```
|
86
|
+
|
87
|
+
### lazy
|
88
|
+
|
89
|
+
Also note that, instead of an Array, `lazy_fetch` returns an Enumerator, which is natively lazy-loading:
|
90
|
+
|
91
|
+
```ruby
|
92
|
+
nodes = PageByPage.lazy_fetch(
|
93
|
+
#...
|
46
94
|
)
|
47
95
|
```
|
data/bin/console
CHANGED
data/lib/page_by_page.rb
CHANGED
@@ -1,121 +1,21 @@
|
|
1
1
|
require 'page_by_page/version'
|
2
|
-
require 'page_by_page/
|
3
|
-
require 'page_by_page/
|
4
|
-
require 'nokogiri'
|
5
|
-
require 'open-uri'
|
6
|
-
require 'erb'
|
2
|
+
require 'page_by_page/fetch'
|
3
|
+
require 'page_by_page/jump'
|
7
4
|
|
8
|
-
|
5
|
+
module PageByPage
|
9
6
|
|
10
7
|
class << self
|
11
|
-
def fetch(
|
12
|
-
|
13
|
-
pbp.fetch
|
8
|
+
def fetch(*args, &block)
|
9
|
+
Fetch.new(*args, &block).process
|
14
10
|
end
|
15
|
-
end
|
16
|
-
|
17
|
-
def initialize(opt = {}, &block)
|
18
|
-
@from, @step, @to = 1, 1, Float::INFINITY
|
19
|
-
@progress = {}
|
20
|
-
opt.each{ |name, value| send name, value }
|
21
|
-
instance_eval &block if block
|
22
|
-
end
|
23
|
-
|
24
|
-
def url tmpl
|
25
|
-
@tmpl = ERB.new tmpl
|
26
|
-
end
|
27
|
-
|
28
|
-
def selector sl
|
29
|
-
@selector = sl
|
30
|
-
end
|
31
|
-
|
32
|
-
def from n
|
33
|
-
@from = n
|
34
|
-
end
|
35
|
-
|
36
|
-
def step n
|
37
|
-
@step = n
|
38
|
-
end
|
39
|
-
|
40
|
-
def to n
|
41
|
-
@to = n
|
42
|
-
end
|
43
|
-
|
44
|
-
def threads n
|
45
|
-
@threads = n
|
46
|
-
end
|
47
11
|
|
48
|
-
|
49
|
-
|
50
|
-
end
|
51
|
-
|
52
|
-
def fetch
|
53
|
-
nodes_2d =
|
54
|
-
unless defined? @threads
|
55
|
-
@enum = Enum.new options
|
56
|
-
_fetch
|
57
|
-
else
|
58
|
-
@enum = MutexEnum.new options
|
59
|
-
parallel_fetch
|
60
|
-
end
|
61
|
-
puts if @progress
|
62
|
-
nodes_2d.reject(&:nil?).flatten
|
63
|
-
end
|
64
|
-
|
65
|
-
private
|
66
|
-
|
67
|
-
def _fetch
|
68
|
-
items, pages = [nil], []
|
69
|
-
catch :no_more do
|
70
|
-
until items.empty?
|
71
|
-
n = @enum.next
|
72
|
-
break if n > limit
|
73
|
-
url = @tmpl.result binding
|
74
|
-
doc = parse url
|
75
|
-
items = doc.css @selector
|
76
|
-
pages[n] = items
|
77
|
-
update_progress Thread.current, n if @progress
|
78
|
-
end
|
12
|
+
def lazy_fetch(*args, &block)
|
13
|
+
Fetch.new(*args, &block).iterator
|
79
14
|
end
|
80
|
-
pages
|
81
|
-
end
|
82
15
|
|
83
|
-
|
84
|
-
|
85
|
-
Thread.new do
|
86
|
-
Thread.current[:sub] = _fetch
|
87
|
-
end
|
88
|
-
end
|
89
|
-
ts.each_with_object([]) do |t, pages|
|
90
|
-
t.join
|
91
|
-
t[:sub].each_with_index do |items, i|
|
92
|
-
pages[i] = items if items
|
93
|
-
end
|
16
|
+
def jump(*args, &block)
|
17
|
+
Jump.new(*args, &block).process
|
94
18
|
end
|
95
19
|
end
|
96
20
|
|
97
|
-
def parse url
|
98
|
-
page = open(url)
|
99
|
-
Nokogiri::HTML page.read
|
100
|
-
rescue OpenURI::HTTPError => e
|
101
|
-
if e.message == '404 Not Found'
|
102
|
-
throw :no_more
|
103
|
-
else
|
104
|
-
raise e
|
105
|
-
end
|
106
|
-
end
|
107
|
-
|
108
|
-
def options
|
109
|
-
{from: @from, step: @step}
|
110
|
-
end
|
111
|
-
|
112
|
-
def limit
|
113
|
-
@to ||= Float::INFINITY
|
114
|
-
end
|
115
|
-
|
116
|
-
def update_progress thread, page_num
|
117
|
-
@progress[thread] = page_num
|
118
|
-
printf "\r%s => %s", Time.now.strftime('%F %T'), @progress.values.sort
|
119
|
-
end
|
120
|
-
|
121
21
|
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
module PageByPage
|
5
|
+
module Common
|
6
|
+
def initialize(opt = {}, &block)
|
7
|
+
@progress = {}
|
8
|
+
opt.each{ |name, value| send name, value }
|
9
|
+
instance_eval &block if block
|
10
|
+
end
|
11
|
+
|
12
|
+
def to n
|
13
|
+
@to = n
|
14
|
+
end
|
15
|
+
|
16
|
+
def selector sl
|
17
|
+
@selector = sl
|
18
|
+
end
|
19
|
+
|
20
|
+
def header hash
|
21
|
+
@header = hash
|
22
|
+
end
|
23
|
+
|
24
|
+
def interval second
|
25
|
+
@interval = second
|
26
|
+
end
|
27
|
+
|
28
|
+
def no_progress *arg
|
29
|
+
@progress = nil
|
30
|
+
end
|
31
|
+
|
32
|
+
protected
|
33
|
+
|
34
|
+
def parse url
|
35
|
+
url = URI::encode url
|
36
|
+
page = open(url, http_header)
|
37
|
+
Nokogiri::HTML page.read
|
38
|
+
rescue OpenURI::HTTPError => e
|
39
|
+
if e.message == '404 Not Found'
|
40
|
+
throw :no_more
|
41
|
+
else
|
42
|
+
raise e
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
def http_header
|
47
|
+
@http_header ||= (
|
48
|
+
h = {}
|
49
|
+
Hash(@header).each_pair{ |k, v| h[k.to_s] = v }
|
50
|
+
h
|
51
|
+
)
|
52
|
+
end
|
53
|
+
|
54
|
+
def limit
|
55
|
+
@to ||= Float::INFINITY
|
56
|
+
end
|
57
|
+
|
58
|
+
def update_progress thread, page_num
|
59
|
+
@progress[thread] = page_num
|
60
|
+
printf "\r%s => %s", Time.now.strftime('%F %T'), @progress.values.sort
|
61
|
+
end
|
62
|
+
|
63
|
+
end
|
64
|
+
end
|
data/lib/page_by_page/enum.rb
CHANGED
@@ -1,12 +1,12 @@
|
|
1
|
-
|
1
|
+
module PageByPage
|
2
2
|
class Enum
|
3
3
|
|
4
|
-
def initialize from: 1, step: 1
|
5
|
-
@enum = (from..
|
4
|
+
def initialize from: 1, step: 1, limit: nil, enumerator: nil
|
5
|
+
@enum = enumerator || (from..limit).step(step).lazy.map(&:to_i).to_enum
|
6
6
|
end
|
7
7
|
|
8
8
|
def next
|
9
|
-
@enum.next
|
9
|
+
@enum.next rescue nil
|
10
10
|
end
|
11
11
|
|
12
12
|
end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
require 'page_by_page/enum'
|
2
|
+
require 'page_by_page/mutex_enum'
|
3
|
+
require 'page_by_page/common'
|
4
|
+
require 'erb'
|
5
|
+
|
6
|
+
module PageByPage
|
7
|
+
class Fetch
|
8
|
+
|
9
|
+
include Common
|
10
|
+
|
11
|
+
def initialize(opt = {}, &block)
|
12
|
+
@from, @step, @to = 1, 1, Float::INFINITY
|
13
|
+
super
|
14
|
+
@enum = Enum.new(enum_options)
|
15
|
+
@enum = MutexEnum.new(@enum) if defined? @threads
|
16
|
+
end
|
17
|
+
|
18
|
+
def url tmpl
|
19
|
+
@tmpl = ERB.new tmpl
|
20
|
+
end
|
21
|
+
|
22
|
+
def from n
|
23
|
+
@from = n
|
24
|
+
end
|
25
|
+
|
26
|
+
def step n
|
27
|
+
@step = n
|
28
|
+
end
|
29
|
+
|
30
|
+
def threads n
|
31
|
+
@threads = n
|
32
|
+
end
|
33
|
+
|
34
|
+
def enumerator e
|
35
|
+
@enumerator = e
|
36
|
+
end
|
37
|
+
|
38
|
+
def process
|
39
|
+
nodes_2d = defined?(@threads) ? parallel_fetch : _fetch
|
40
|
+
puts if @progress
|
41
|
+
|
42
|
+
nodes_2d.sort.each_with_object([]) do |key_items, res|
|
43
|
+
res.concat key_items[1] unless key_items[1].nil?
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def iterator
|
48
|
+
Enumerator.new do |yielder|
|
49
|
+
items_enum.each do |_, items|
|
50
|
+
items.each do |i|
|
51
|
+
yielder.yield(i)
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
protected
|
58
|
+
|
59
|
+
def _fetch
|
60
|
+
pages = {}
|
61
|
+
|
62
|
+
items_enum.each do |page_num, items|
|
63
|
+
pages[page_num] = items
|
64
|
+
end
|
65
|
+
|
66
|
+
pages
|
67
|
+
end
|
68
|
+
|
69
|
+
def items_enum
|
70
|
+
Enumerator.new do |yielder|
|
71
|
+
items = [nil]
|
72
|
+
catch :no_more do
|
73
|
+
until items.empty?
|
74
|
+
n = @enum.next
|
75
|
+
break if n.nil?
|
76
|
+
|
77
|
+
url = @tmpl.result binding
|
78
|
+
doc = parse url
|
79
|
+
items = doc.css @selector
|
80
|
+
yielder.yield(n, items)
|
81
|
+
|
82
|
+
update_progress Thread.current, n if @progress
|
83
|
+
sleep @interval if @interval
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
def parallel_fetch
|
90
|
+
ts = @threads.times.map do |n|
|
91
|
+
Thread.new do
|
92
|
+
Thread.current[:sub] = _fetch
|
93
|
+
end
|
94
|
+
end
|
95
|
+
ts.each_with_object({}) do |t, pages|
|
96
|
+
t.join
|
97
|
+
pages.merge! t[:sub]
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
def enum_options
|
102
|
+
{from: @from, step: @step, limit: limit, enumerator: @enumerator}
|
103
|
+
end
|
104
|
+
|
105
|
+
end
|
106
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'page_by_page/common'
|
2
|
+
|
3
|
+
module PageByPage
|
4
|
+
class Jump
|
5
|
+
|
6
|
+
include Common
|
7
|
+
|
8
|
+
def start url
|
9
|
+
@start = url
|
10
|
+
end
|
11
|
+
|
12
|
+
def iterate selector
|
13
|
+
@iterate = selector
|
14
|
+
end
|
15
|
+
|
16
|
+
def process
|
17
|
+
url, items, page_count = @start, [], 0
|
18
|
+
|
19
|
+
while true do
|
20
|
+
doc = parse url
|
21
|
+
doc.css(@selector).each{ |item| items << item }
|
22
|
+
|
23
|
+
page_count += 1
|
24
|
+
update_progress Thread.current, page_count if @progress
|
25
|
+
break if page_count >= limit
|
26
|
+
|
27
|
+
next_url = doc.at_css(@iterate)
|
28
|
+
break unless next_url
|
29
|
+
|
30
|
+
path = next_url.attr('href')
|
31
|
+
url = path.start_with?('/') ? concat_host(path) : path
|
32
|
+
|
33
|
+
sleep @interval if @interval
|
34
|
+
end
|
35
|
+
|
36
|
+
puts if @progress
|
37
|
+
items
|
38
|
+
end
|
39
|
+
|
40
|
+
private
|
41
|
+
|
42
|
+
def concat_host path
|
43
|
+
@prefix = (
|
44
|
+
regex = path.start_with?('/') ? /([^:|\/])\/.*/ : /(.*[^:|\/])\/.*/
|
45
|
+
@start.gsub(regex, '\1')
|
46
|
+
)
|
47
|
+
File.join @prefix, path
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
@@ -1,15 +1,15 @@
|
|
1
1
|
require 'page_by_page/enum'
|
2
|
+
require 'thread'
|
2
3
|
|
3
|
-
|
4
|
+
module PageByPage
|
4
5
|
class MutexEnum < Enum
|
5
6
|
|
6
|
-
def initialize
|
7
|
-
super
|
7
|
+
def initialize enum
|
8
8
|
@q = SizedQueue.new 10
|
9
|
+
@enum = enum
|
9
10
|
Thread.new do
|
10
11
|
loop do
|
11
12
|
@q << @enum.next
|
12
|
-
sleep 0.1
|
13
13
|
end
|
14
14
|
end
|
15
15
|
end
|
data/lib/page_by_page/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
|
2
|
-
VERSION = "0.1.
|
1
|
+
module PageByPage
|
2
|
+
VERSION = "0.1.14"
|
3
3
|
end
|
data/page_by_page.gemspec
CHANGED
@@ -32,5 +32,6 @@ Gem::Specification.new do |spec|
|
|
32
32
|
spec.add_development_dependency "bundler", "~> 1.13"
|
33
33
|
spec.add_development_dependency "rake", "~> 10.0"
|
34
34
|
spec.add_development_dependency "minitest", "~> 5.0"
|
35
|
+
spec.add_development_dependency "pry"
|
35
36
|
spec.add_dependency 'nokogiri', '~> 1.6'
|
36
37
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: page_by_page
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.14
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ken
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '5.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: pry
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: nokogiri
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -83,7 +97,10 @@ files:
|
|
83
97
|
- bin/console
|
84
98
|
- bin/setup
|
85
99
|
- lib/page_by_page.rb
|
100
|
+
- lib/page_by_page/common.rb
|
86
101
|
- lib/page_by_page/enum.rb
|
102
|
+
- lib/page_by_page/fetch.rb
|
103
|
+
- lib/page_by_page/jump.rb
|
87
104
|
- lib/page_by_page/mutex_enum.rb
|
88
105
|
- lib/page_by_page/version.rb
|
89
106
|
- page_by_page.gemspec
|
@@ -107,8 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
107
124
|
- !ruby/object:Gem::Version
|
108
125
|
version: '0'
|
109
126
|
requirements: []
|
110
|
-
|
111
|
-
rubygems_version: 2.6.8
|
127
|
+
rubygems_version: 3.0.3
|
112
128
|
signing_key:
|
113
129
|
specification_version: 4
|
114
130
|
summary: scrape page by page , according to url pattern
|