page_by_page 0.1.13 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +22 -2
- data/lib/page_by_page/enum.rb +3 -3
- data/lib/page_by_page/fetch.rb +15 -9
- data/lib/page_by_page/jump.rb +1 -1
- data/lib/page_by_page/mutex_enum.rb +3 -3
- data/lib/page_by_page/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 51cda3ebfc8bc9a353a51f1386a91cf6ea3d2fbbc219334f278844765b35ad98
|
4
|
+
data.tar.gz: 8329a8af72dd945284617f053cbb1ae686cd5eb9151dd1cf9be0c99b7cdd12ac
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6761cb5c91ed8736be1193237caa2725110070b9de332af5bd6175371064b3b3d939ba997634c8e3711f8f1d81c7dc3988436d807858951c6fd398e39249b8c7
|
7
|
+
data.tar.gz: 1cee71ddc3dab888aaf75a8cf7139bfd630ef63c29dc954d00c5cdac7c4a251e78a19dedc56ea3e6b8713289b514cabe2519322efc5accd977fb6084a1768088
|
data/README.md
CHANGED
@@ -20,7 +20,9 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
-
|
23
|
+
### number pattern
|
24
|
+
|
25
|
+
If you know the page number pattern, use `fetch`:
|
24
26
|
|
25
27
|
```ruby
|
26
28
|
nodes = PageByPage.fetch do
|
@@ -36,7 +38,21 @@ nodes = PageByPage.fetch do
|
|
36
38
|
end
|
37
39
|
```
|
38
40
|
|
39
|
-
|
41
|
+
### other pattern
|
42
|
+
|
43
|
+
If the pattern is not simple numbers, use `enumerator` in `fetch`:
|
44
|
+
|
45
|
+
```ruby
|
46
|
+
nodes = PageByPage.fetch do
|
47
|
+
url 'http://mysql.taobao.org/monthly/<%= n %>'
|
48
|
+
selector 'h3'
|
49
|
+
enumerator ['2020/09/', '2020/08/'].to_enum
|
50
|
+
end
|
51
|
+
```
|
52
|
+
|
53
|
+
### unknown pattern
|
54
|
+
|
55
|
+
If you don't know the pattern, but you see a link to the next page, use `jump`:
|
40
56
|
|
41
57
|
```ruby
|
42
58
|
nodes = PageByPage.jump do
|
@@ -50,6 +66,8 @@ nodes = PageByPage.jump do
|
|
50
66
|
end
|
51
67
|
```
|
52
68
|
|
69
|
+
### parameters instead of block
|
70
|
+
|
53
71
|
You may just pass parameters instead of block:
|
54
72
|
|
55
73
|
```ruby
|
@@ -66,6 +84,8 @@ nodes = PageByPage.fetch(
|
|
66
84
|
)
|
67
85
|
```
|
68
86
|
|
87
|
+
### lazy
|
88
|
+
|
69
89
|
Also note that, instead of Array, `lazy_fetch` returns an Enumerator, which is native lazy-loading:
|
70
90
|
|
71
91
|
```ruby
|
data/lib/page_by_page/enum.rb
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
module PageByPage
|
2
2
|
class Enum
|
3
3
|
|
4
|
-
def initialize from: 1, step: 1
|
5
|
-
@enum = (from..
|
4
|
+
def initialize from: 1, step: 1, limit: nil, enumerator: nil
|
5
|
+
@enum = enumerator || (from..limit).step(step).lazy.map(&:to_i).to_enum
|
6
6
|
end
|
7
7
|
|
8
8
|
def next
|
9
|
-
@enum.next
|
9
|
+
@enum.next rescue nil
|
10
10
|
end
|
11
11
|
|
12
12
|
end
|
data/lib/page_by_page/fetch.rb
CHANGED
@@ -11,7 +11,8 @@ module PageByPage
|
|
11
11
|
def initialize(opt = {}, &block)
|
12
12
|
@from, @step, @to = 1, 1, Float::INFINITY
|
13
13
|
super
|
14
|
-
@enum =
|
14
|
+
@enum = Enum.new(enum_options)
|
15
|
+
@enum = MutexEnum.new(@enum) if defined? @threads
|
15
16
|
end
|
16
17
|
|
17
18
|
def url tmpl
|
@@ -30,10 +31,17 @@ module PageByPage
|
|
30
31
|
@threads = n
|
31
32
|
end
|
32
33
|
|
34
|
+
def enumerator e
|
35
|
+
@enumerator = e
|
36
|
+
end
|
37
|
+
|
33
38
|
def process
|
34
39
|
nodes_2d = defined?(@threads) ? parallel_fetch : _fetch
|
35
40
|
puts if @progress
|
36
|
-
|
41
|
+
|
42
|
+
nodes_2d.sort.each_with_object([]) do |key_items, res|
|
43
|
+
res.concat key_items[1] unless key_items[1].nil?
|
44
|
+
end
|
37
45
|
end
|
38
46
|
|
39
47
|
def iterator
|
@@ -49,7 +57,7 @@ module PageByPage
|
|
49
57
|
protected
|
50
58
|
|
51
59
|
def _fetch
|
52
|
-
pages =
|
60
|
+
pages = {}
|
53
61
|
|
54
62
|
items_enum.each do |page_num, items|
|
55
63
|
pages[page_num] = items
|
@@ -64,7 +72,7 @@ module PageByPage
|
|
64
72
|
catch :no_more do
|
65
73
|
until items.empty?
|
66
74
|
n = @enum.next
|
67
|
-
break if n
|
75
|
+
break if n.nil?
|
68
76
|
|
69
77
|
url = @tmpl.result binding
|
70
78
|
doc = parse url
|
@@ -84,16 +92,14 @@ module PageByPage
|
|
84
92
|
Thread.current[:sub] = _fetch
|
85
93
|
end
|
86
94
|
end
|
87
|
-
ts.each_with_object(
|
95
|
+
ts.each_with_object({}) do |t, pages|
|
88
96
|
t.join
|
89
|
-
t[:sub]
|
90
|
-
pages[i] = items if items
|
91
|
-
end
|
97
|
+
pages.merge! t[:sub]
|
92
98
|
end
|
93
99
|
end
|
94
100
|
|
95
101
|
def enum_options
|
96
|
-
{from: @from, step: @step}
|
102
|
+
{from: @from, step: @step, limit: limit, enumerator: @enumerator}
|
97
103
|
end
|
98
104
|
|
99
105
|
end
|
data/lib/page_by_page/jump.rb
CHANGED
data/lib/page_by_page/mutex_enum.rb
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
require 'page_by_page/enum'
|
2
|
+
require 'thread'
|
2
3
|
|
3
4
|
module PageByPage
|
4
5
|
class MutexEnum < Enum
|
5
6
|
|
6
|
-
def initialize
|
7
|
-
super
|
7
|
+
def initialize enum
|
8
8
|
@q = SizedQueue.new 10
|
9
|
+
@enum = enum
|
9
10
|
Thread.new do
|
10
11
|
loop do
|
11
12
|
@q << @enum.next
|
12
|
-
sleep 0.1
|
13
13
|
end
|
14
14
|
end
|
15
15
|
end
|
data/lib/page_by_page/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: page_by_page
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.14
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ken
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|