page_by_page 0.1.9 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.md +49 -1
- data/bin/console +2 -2
- data/lib/page_by_page.rb +9 -109
- data/lib/page_by_page/common.rb +64 -0
- data/lib/page_by_page/enum.rb +4 -4
- data/lib/page_by_page/fetch.rb +106 -0
- data/lib/page_by_page/jump.rb +50 -0
- data/lib/page_by_page/mutex_enum.rb +4 -4
- data/lib/page_by_page/version.rb +2 -2
- data/page_by_page.gemspec +1 -0
- metadata +20 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 51cda3ebfc8bc9a353a51f1386a91cf6ea3d2fbbc219334f278844765b35ad98
|
4
|
+
data.tar.gz: 8329a8af72dd945284617f053cbb1ae686cd5eb9151dd1cf9be0c99b7cdd12ac
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6761cb5c91ed8736be1193237caa2725110070b9de332af5bd6175371064b3b3d939ba997634c8e3711f8f1d81c7dc3988436d807858951c6fd398e39249b8c7
|
7
|
+
data.tar.gz: 1cee71ddc3dab888aaf75a8cf7139bfd630ef63c29dc954d00c5cdac7c4a251e78a19dedc56ea3e6b8713289b514cabe2519322efc5accd977fb6084a1768088
|
data/README.md
CHANGED
@@ -20,6 +20,10 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
+
### number pattern
|
24
|
+
|
25
|
+
If you know the page number pattern, use `fetch`:
|
26
|
+
|
23
27
|
```ruby
|
24
28
|
nodes = PageByPage.fetch do
|
25
29
|
url 'https://book.douban.com/subject/25846075/comments/hot?p=<%= n %>'
|
@@ -27,12 +31,44 @@ nodes = PageByPage.fetch do
|
|
27
31
|
# from 2
|
28
32
|
# step 2
|
29
33
|
# to 100
|
34
|
+
# interval 3
|
30
35
|
# threads 4
|
31
36
|
# no_progress
|
37
|
+
# header Cookie: 'douban-fav-remind=1'
|
38
|
+
end
|
39
|
+
```
|
40
|
+
|
41
|
+
### other pattern
|
42
|
+
|
43
|
+
If the pattern is not a simple number sequence, use `enumerator` in `fetch`:
|
44
|
+
|
45
|
+
```ruby
|
46
|
+
nodes = PageByPage.fetch do
|
47
|
+
url 'http://mysql.taobao.org/monthly/<%= n %>'
|
48
|
+
selector 'h3'
|
49
|
+
enumerator ['2020/09/', '2020/08/'].to_enum
|
50
|
+
end
|
51
|
+
```
|
52
|
+
|
53
|
+
### unknown pattern
|
54
|
+
|
55
|
+
If you don't know the pattern, but you can see a link to the next page, use `jump`:
|
56
|
+
|
57
|
+
```ruby
|
58
|
+
nodes = PageByPage.jump do
|
59
|
+
start 'https://book.douban.com/subject/25846075/comments/hot'
|
60
|
+
iterate '.comment-paginator li:nth-child(3) a'
|
61
|
+
selector '.comment-item'
|
62
|
+
# to 100
|
63
|
+
# interval 3
|
64
|
+
# no_progress
|
65
|
+
# header Cookie: 'douban-fav-remind=1'
|
32
66
|
end
|
33
67
|
```
|
34
68
|
|
35
|
-
|
69
|
+
### parameters instead of block
|
70
|
+
|
71
|
+
You may also pass parameters instead of a block:
|
36
72
|
|
37
73
|
```ruby
|
38
74
|
nodes = PageByPage.fetch(
|
@@ -41,7 +77,19 @@ nodes = PageByPage.fetch(
|
|
41
77
|
# from: 2,
|
42
78
|
# step: 2,
|
43
79
|
# to: 100,
|
80
|
+
# interval: 3,
|
44
81
|
# threads: 4,
|
45
82
|
# no_progress: true
|
83
|
+
# header: {Cookie: 'douban-fav-remind=1'}
|
84
|
+
)
|
85
|
+
```
|
86
|
+
|
87
|
+
### lazy
|
88
|
+
|
89
|
+
Also note that, instead of an Array, `lazy_fetch` returns an Enumerator, which is natively lazy-loading:
|
90
|
+
|
91
|
+
```ruby
|
92
|
+
nodes = PageByPage.lazy_fetch(
|
93
|
+
#...
|
46
94
|
)
|
47
95
|
```
|
data/bin/console
CHANGED
data/lib/page_by_page.rb
CHANGED
@@ -1,121 +1,21 @@
|
|
1
1
|
require 'page_by_page/version'
|
2
|
-
require 'page_by_page/
|
3
|
-
require 'page_by_page/
|
4
|
-
require 'nokogiri'
|
5
|
-
require 'open-uri'
|
6
|
-
require 'erb'
|
2
|
+
require 'page_by_page/fetch'
|
3
|
+
require 'page_by_page/jump'
|
7
4
|
|
8
|
-
|
5
|
+
module PageByPage
|
9
6
|
|
10
7
|
class << self
|
11
|
-
def fetch(
|
12
|
-
|
13
|
-
pbp.fetch
|
8
|
+
def fetch(*args, &block)
|
9
|
+
Fetch.new(*args, &block).process
|
14
10
|
end
|
15
|
-
end
|
16
|
-
|
17
|
-
def initialize(opt = {}, &block)
|
18
|
-
@from, @step, @to = 1, 1, Float::INFINITY
|
19
|
-
@progress = {}
|
20
|
-
opt.each{ |name, value| send name, value }
|
21
|
-
instance_eval &block if block
|
22
|
-
end
|
23
|
-
|
24
|
-
def url tmpl
|
25
|
-
@tmpl = ERB.new tmpl
|
26
|
-
end
|
27
|
-
|
28
|
-
def selector sl
|
29
|
-
@selector = sl
|
30
|
-
end
|
31
|
-
|
32
|
-
def from n
|
33
|
-
@from = n
|
34
|
-
end
|
35
|
-
|
36
|
-
def step n
|
37
|
-
@step = n
|
38
|
-
end
|
39
|
-
|
40
|
-
def to n
|
41
|
-
@to = n
|
42
|
-
end
|
43
|
-
|
44
|
-
def threads n
|
45
|
-
@threads = n
|
46
|
-
end
|
47
11
|
|
48
|
-
|
49
|
-
|
50
|
-
end
|
51
|
-
|
52
|
-
def fetch
|
53
|
-
nodes_2d =
|
54
|
-
unless defined? @threads
|
55
|
-
@enum = Enum.new options
|
56
|
-
_fetch
|
57
|
-
else
|
58
|
-
@enum = MutexEnum.new options
|
59
|
-
parallel_fetch
|
60
|
-
end
|
61
|
-
puts if @progress
|
62
|
-
nodes_2d.reject(&:nil?).flatten
|
63
|
-
end
|
64
|
-
|
65
|
-
private
|
66
|
-
|
67
|
-
def _fetch
|
68
|
-
items, pages = [nil], []
|
69
|
-
catch :no_more do
|
70
|
-
until items.empty?
|
71
|
-
n = @enum.next
|
72
|
-
break if n > limit
|
73
|
-
url = @tmpl.result binding
|
74
|
-
doc = parse url
|
75
|
-
items = doc.css @selector
|
76
|
-
pages[n] = items
|
77
|
-
update_progress Thread.current, n if @progress
|
78
|
-
end
|
12
|
+
def lazy_fetch(*args, &block)
|
13
|
+
Fetch.new(*args, &block).iterator
|
79
14
|
end
|
80
|
-
pages
|
81
|
-
end
|
82
15
|
|
83
|
-
|
84
|
-
|
85
|
-
Thread.new do
|
86
|
-
Thread.current[:sub] = _fetch
|
87
|
-
end
|
88
|
-
end
|
89
|
-
ts.each_with_object([]) do |t, pages|
|
90
|
-
t.join
|
91
|
-
t[:sub].each_with_index do |items, i|
|
92
|
-
pages[i] = items if items
|
93
|
-
end
|
16
|
+
def jump(*args, &block)
|
17
|
+
Jump.new(*args, &block).process
|
94
18
|
end
|
95
19
|
end
|
96
20
|
|
97
|
-
def parse url
|
98
|
-
page = open(url)
|
99
|
-
Nokogiri::HTML page.read
|
100
|
-
rescue OpenURI::HTTPError => e
|
101
|
-
if e.message == '404 Not Found'
|
102
|
-
throw :no_more
|
103
|
-
else
|
104
|
-
raise e
|
105
|
-
end
|
106
|
-
end
|
107
|
-
|
108
|
-
def options
|
109
|
-
{from: @from, step: @step}
|
110
|
-
end
|
111
|
-
|
112
|
-
def limit
|
113
|
-
@to ||= Float::INFINITY
|
114
|
-
end
|
115
|
-
|
116
|
-
def update_progress thread, page_num
|
117
|
-
@progress[thread] = page_num
|
118
|
-
printf "\r%s => %s", Time.now.strftime('%F %T'), @progress.values.sort
|
119
|
-
end
|
120
|
-
|
121
21
|
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
module PageByPage
|
5
|
+
module Common
|
6
|
+
def initialize(opt = {}, &block)
|
7
|
+
@progress = {}
|
8
|
+
opt.each{ |name, value| send name, value }
|
9
|
+
instance_eval &block if block
|
10
|
+
end
|
11
|
+
|
12
|
+
def to n
|
13
|
+
@to = n
|
14
|
+
end
|
15
|
+
|
16
|
+
def selector sl
|
17
|
+
@selector = sl
|
18
|
+
end
|
19
|
+
|
20
|
+
def header hash
|
21
|
+
@header = hash
|
22
|
+
end
|
23
|
+
|
24
|
+
def interval second
|
25
|
+
@interval = second
|
26
|
+
end
|
27
|
+
|
28
|
+
def no_progress *arg
|
29
|
+
@progress = nil
|
30
|
+
end
|
31
|
+
|
32
|
+
protected
|
33
|
+
|
34
|
+
def parse url
|
35
|
+
url = URI::encode url
|
36
|
+
page = open(url, http_header)
|
37
|
+
Nokogiri::HTML page.read
|
38
|
+
rescue OpenURI::HTTPError => e
|
39
|
+
if e.message == '404 Not Found'
|
40
|
+
throw :no_more
|
41
|
+
else
|
42
|
+
raise e
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
def http_header
|
47
|
+
@http_header ||= (
|
48
|
+
h = {}
|
49
|
+
Hash(@header).each_pair{ |k, v| h[k.to_s] = v }
|
50
|
+
h
|
51
|
+
)
|
52
|
+
end
|
53
|
+
|
54
|
+
def limit
|
55
|
+
@to ||= Float::INFINITY
|
56
|
+
end
|
57
|
+
|
58
|
+
def update_progress thread, page_num
|
59
|
+
@progress[thread] = page_num
|
60
|
+
printf "\r%s => %s", Time.now.strftime('%F %T'), @progress.values.sort
|
61
|
+
end
|
62
|
+
|
63
|
+
end
|
64
|
+
end
|
data/lib/page_by_page/enum.rb
CHANGED
@@ -1,12 +1,12 @@
|
|
1
|
-
|
1
|
+
module PageByPage
|
2
2
|
class Enum
|
3
3
|
|
4
|
-
def initialize from: 1, step: 1
|
5
|
-
@enum = (from..
|
4
|
+
def initialize from: 1, step: 1, limit: nil, enumerator: nil
|
5
|
+
@enum = enumerator || (from..limit).step(step).lazy.map(&:to_i).to_enum
|
6
6
|
end
|
7
7
|
|
8
8
|
def next
|
9
|
-
@enum.next
|
9
|
+
@enum.next rescue nil
|
10
10
|
end
|
11
11
|
|
12
12
|
end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
require 'page_by_page/enum'
|
2
|
+
require 'page_by_page/mutex_enum'
|
3
|
+
require 'page_by_page/common'
|
4
|
+
require 'erb'
|
5
|
+
|
6
|
+
module PageByPage
|
7
|
+
class Fetch
|
8
|
+
|
9
|
+
include Common
|
10
|
+
|
11
|
+
def initialize(opt = {}, &block)
|
12
|
+
@from, @step, @to = 1, 1, Float::INFINITY
|
13
|
+
super
|
14
|
+
@enum = Enum.new(enum_options)
|
15
|
+
@enum = MutexEnum.new(@enum) if defined? @threads
|
16
|
+
end
|
17
|
+
|
18
|
+
def url tmpl
|
19
|
+
@tmpl = ERB.new tmpl
|
20
|
+
end
|
21
|
+
|
22
|
+
def from n
|
23
|
+
@from = n
|
24
|
+
end
|
25
|
+
|
26
|
+
def step n
|
27
|
+
@step = n
|
28
|
+
end
|
29
|
+
|
30
|
+
def threads n
|
31
|
+
@threads = n
|
32
|
+
end
|
33
|
+
|
34
|
+
def enumerator e
|
35
|
+
@enumerator = e
|
36
|
+
end
|
37
|
+
|
38
|
+
def process
|
39
|
+
nodes_2d = defined?(@threads) ? parallel_fetch : _fetch
|
40
|
+
puts if @progress
|
41
|
+
|
42
|
+
nodes_2d.sort.each_with_object([]) do |key_items, res|
|
43
|
+
res.concat key_items[1] unless key_items[1].nil?
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def iterator
|
48
|
+
Enumerator.new do |yielder|
|
49
|
+
items_enum.each do |_, items|
|
50
|
+
items.each do |i|
|
51
|
+
yielder.yield(i)
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
protected
|
58
|
+
|
59
|
+
def _fetch
|
60
|
+
pages = {}
|
61
|
+
|
62
|
+
items_enum.each do |page_num, items|
|
63
|
+
pages[page_num] = items
|
64
|
+
end
|
65
|
+
|
66
|
+
pages
|
67
|
+
end
|
68
|
+
|
69
|
+
def items_enum
|
70
|
+
Enumerator.new do |yielder|
|
71
|
+
items = [nil]
|
72
|
+
catch :no_more do
|
73
|
+
until items.empty?
|
74
|
+
n = @enum.next
|
75
|
+
break if n.nil?
|
76
|
+
|
77
|
+
url = @tmpl.result binding
|
78
|
+
doc = parse url
|
79
|
+
items = doc.css @selector
|
80
|
+
yielder.yield(n, items)
|
81
|
+
|
82
|
+
update_progress Thread.current, n if @progress
|
83
|
+
sleep @interval if @interval
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
def parallel_fetch
|
90
|
+
ts = @threads.times.map do |n|
|
91
|
+
Thread.new do
|
92
|
+
Thread.current[:sub] = _fetch
|
93
|
+
end
|
94
|
+
end
|
95
|
+
ts.each_with_object({}) do |t, pages|
|
96
|
+
t.join
|
97
|
+
pages.merge! t[:sub]
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
def enum_options
|
102
|
+
{from: @from, step: @step, limit: limit, enumerator: @enumerator}
|
103
|
+
end
|
104
|
+
|
105
|
+
end
|
106
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'page_by_page/common'
|
2
|
+
|
3
|
+
module PageByPage
|
4
|
+
class Jump
|
5
|
+
|
6
|
+
include Common
|
7
|
+
|
8
|
+
def start url
|
9
|
+
@start = url
|
10
|
+
end
|
11
|
+
|
12
|
+
def iterate selector
|
13
|
+
@iterate = selector
|
14
|
+
end
|
15
|
+
|
16
|
+
def process
|
17
|
+
url, items, page_count = @start, [], 0
|
18
|
+
|
19
|
+
while true do
|
20
|
+
doc = parse url
|
21
|
+
doc.css(@selector).each{ |item| items << item }
|
22
|
+
|
23
|
+
page_count += 1
|
24
|
+
update_progress Thread.current, page_count if @progress
|
25
|
+
break if page_count >= limit
|
26
|
+
|
27
|
+
next_url = doc.at_css(@iterate)
|
28
|
+
break unless next_url
|
29
|
+
|
30
|
+
path = next_url.attr('href')
|
31
|
+
url = path.start_with?('/') ? concat_host(path) : path
|
32
|
+
|
33
|
+
sleep @interval if @interval
|
34
|
+
end
|
35
|
+
|
36
|
+
puts if @progress
|
37
|
+
items
|
38
|
+
end
|
39
|
+
|
40
|
+
private
|
41
|
+
|
42
|
+
def concat_host path
|
43
|
+
@prefix = (
|
44
|
+
regex = path.start_with?('/') ? /([^:|\/])\/.*/ : /(.*[^:|\/])\/.*/
|
45
|
+
@start.gsub(regex, '\1')
|
46
|
+
)
|
47
|
+
File.join @prefix, path
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
@@ -1,15 +1,15 @@
|
|
1
1
|
require 'page_by_page/enum'
|
2
|
+
require 'thread'
|
2
3
|
|
3
|
-
|
4
|
+
module PageByPage
|
4
5
|
class MutexEnum < Enum
|
5
6
|
|
6
|
-
def initialize
|
7
|
-
super
|
7
|
+
def initialize enum
|
8
8
|
@q = SizedQueue.new 10
|
9
|
+
@enum = enum
|
9
10
|
Thread.new do
|
10
11
|
loop do
|
11
12
|
@q << @enum.next
|
12
|
-
sleep 0.1
|
13
13
|
end
|
14
14
|
end
|
15
15
|
end
|
data/lib/page_by_page/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
|
2
|
-
VERSION = "0.1.
|
1
|
+
module PageByPage
|
2
|
+
VERSION = "0.1.14"
|
3
3
|
end
|
data/page_by_page.gemspec
CHANGED
@@ -32,5 +32,6 @@ Gem::Specification.new do |spec|
|
|
32
32
|
spec.add_development_dependency "bundler", "~> 1.13"
|
33
33
|
spec.add_development_dependency "rake", "~> 10.0"
|
34
34
|
spec.add_development_dependency "minitest", "~> 5.0"
|
35
|
+
spec.add_development_dependency "pry"
|
35
36
|
spec.add_dependency 'nokogiri', '~> 1.6'
|
36
37
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: page_by_page
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.14
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ken
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '5.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: pry
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: nokogiri
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -83,7 +97,10 @@ files:
|
|
83
97
|
- bin/console
|
84
98
|
- bin/setup
|
85
99
|
- lib/page_by_page.rb
|
100
|
+
- lib/page_by_page/common.rb
|
86
101
|
- lib/page_by_page/enum.rb
|
102
|
+
- lib/page_by_page/fetch.rb
|
103
|
+
- lib/page_by_page/jump.rb
|
87
104
|
- lib/page_by_page/mutex_enum.rb
|
88
105
|
- lib/page_by_page/version.rb
|
89
106
|
- page_by_page.gemspec
|
@@ -107,8 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
107
124
|
- !ruby/object:Gem::Version
|
108
125
|
version: '0'
|
109
126
|
requirements: []
|
110
|
-
|
111
|
-
rubygems_version: 2.6.8
|
127
|
+
rubygems_version: 3.0.3
|
112
128
|
signing_key:
|
113
129
|
specification_version: 4
|
114
130
|
summary: scrape page by page , according to url pattern
|