page_by_page 0.1.13 → 0.1.14

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a00d32af004cc151bf366b9498e18f2e7a2da73629a5db0373b7aad7992f995d
4
- data.tar.gz: 5301c76cc3968c6f187e3349d1de5bf2f4a19a48440f1484a29e25ea35af0342
3
+ metadata.gz: 51cda3ebfc8bc9a353a51f1386a91cf6ea3d2fbbc219334f278844765b35ad98
4
+ data.tar.gz: 8329a8af72dd945284617f053cbb1ae686cd5eb9151dd1cf9be0c99b7cdd12ac
5
5
  SHA512:
6
- metadata.gz: d1da5e785a40849da75b27c9067098e700444be462791654de47914e45b7f45adb3bd5e94945d7908384cd22668ed30cfbbac6a39a73ef6386d31f378ac6dad4
7
- data.tar.gz: 751d3f450cecbf45b2fe5cede0af225adee8948cc5352c936e279c88f765e589fe8c8cad201c320bab03dacb7bff66be70fc5b9325fbebd3e9a017474366d0ca
6
+ metadata.gz: 6761cb5c91ed8736be1193237caa2725110070b9de332af5bd6175371064b3b3d939ba997634c8e3711f8f1d81c7dc3988436d807858951c6fd398e39249b8c7
7
+ data.tar.gz: 1cee71ddc3dab888aaf75a8cf7139bfd630ef63c29dc954d00c5cdac7c4a251e78a19dedc56ea3e6b8713289b514cabe2519322efc5accd977fb6084a1768088
data/README.md CHANGED
@@ -20,7 +20,9 @@ Or install it yourself as:
20
20
 
21
21
  ## Usage
22
22
 
23
- If you know page number pattern, use fetch:
23
+ ### number pattern
24
+
25
+ If you know the page number pattern, use `fetch`:
24
26
 
25
27
  ```ruby
26
28
  nodes = PageByPage.fetch do
@@ -36,7 +38,21 @@ nodes = PageByPage.fetch do
36
38
  end
37
39
  ```
38
40
 
39
- If you don't know the pattern, but you see link to next page, use jump:
41
+ ### other pattern
42
+
43
+ If the pattern is not simple numbers, use `enumerator` in `fetch`:
44
+
45
+ ```ruby
46
+ nodes = PageByPage.fetch do
47
+ url 'http://mysql.taobao.org/monthly/<%= n %>'
48
+ selector 'h3'
49
+ enumerator ['2020/09/', '2020/08/'].to_enum
50
+ end
51
+ ```
52
+
53
+ ### unknown pattern
54
+
55
+ If you don't know the pattern, but you see a link to the next page, use `jump`:
40
56
 
41
57
  ```ruby
42
58
  nodes = PageByPage.jump do
@@ -50,6 +66,8 @@ nodes = PageByPage.jump do
50
66
  end
51
67
  ```
52
68
 
69
+ ### parameters instead of block
70
+
53
71
  You may just pass parameters instead of a block:
54
72
 
55
73
  ```ruby
@@ -66,6 +84,8 @@ nodes = PageByPage.fetch(
66
84
  )
67
85
  ```
68
86
 
87
+ ### lazy
88
+
69
89
  Also note that, instead of an Array, `lazy_fetch` returns an Enumerator, which is natively lazy:
70
90
 
71
91
  ```ruby
@@ -1,12 +1,12 @@
1
1
  module PageByPage
2
2
  class Enum
3
3
 
4
- def initialize from: 1, step: 1
5
- @enum = (from..Float::INFINITY).step(step).lazy.map(&:to_i).to_enum
4
+ def initialize from: 1, step: 1, limit: nil, enumerator: nil
5
+ @enum = enumerator || (from..limit).step(step).lazy.map(&:to_i).to_enum
6
6
  end
7
7
 
8
8
  def next
9
- @enum.next
9
+ @enum.next rescue nil
10
10
  end
11
11
 
12
12
  end
@@ -11,7 +11,8 @@ module PageByPage
11
11
  def initialize(opt = {}, &block)
12
12
  @from, @step, @to = 1, 1, Float::INFINITY
13
13
  super
14
- @enum = (defined?(@threads) ? MutexEnum : Enum).new(enum_options)
14
+ @enum = Enum.new(enum_options)
15
+ @enum = MutexEnum.new(@enum) if defined? @threads
15
16
  end
16
17
 
17
18
  def url tmpl
@@ -30,10 +31,17 @@ module PageByPage
30
31
  @threads = n
31
32
  end
32
33
 
34
+ def enumerator e
35
+ @enumerator = e
36
+ end
37
+
33
38
  def process
34
39
  nodes_2d = defined?(@threads) ? parallel_fetch : _fetch
35
40
  puts if @progress
36
- nodes_2d.reject(&:nil?).flatten
41
+
42
+ nodes_2d.sort.each_with_object([]) do |key_items, res|
43
+ res.concat key_items[1] unless key_items[1].nil?
44
+ end
37
45
  end
38
46
 
39
47
  def iterator
@@ -49,7 +57,7 @@ module PageByPage
49
57
  protected
50
58
 
51
59
  def _fetch
52
- pages = []
60
+ pages = {}
53
61
 
54
62
  items_enum.each do |page_num, items|
55
63
  pages[page_num] = items
@@ -64,7 +72,7 @@ module PageByPage
64
72
  catch :no_more do
65
73
  until items.empty?
66
74
  n = @enum.next
67
- break if n > limit
75
+ break if n.nil?
68
76
 
69
77
  url = @tmpl.result binding
70
78
  doc = parse url
@@ -84,16 +92,14 @@ module PageByPage
84
92
  Thread.current[:sub] = _fetch
85
93
  end
86
94
  end
87
- ts.each_with_object([]) do |t, pages|
95
+ ts.each_with_object({}) do |t, pages|
88
96
  t.join
89
- t[:sub].each_with_index do |items, i|
90
- pages[i] = items if items
91
- end
97
+ pages.merge! t[:sub]
92
98
  end
93
99
  end
94
100
 
95
101
  def enum_options
96
- {from: @from, step: @step}
102
+ {from: @from, step: @step, limit: limit, enumerator: @enumerator}
97
103
  end
98
104
 
99
105
  end
@@ -28,7 +28,7 @@ module PageByPage
28
28
  break unless next_url
29
29
 
30
30
  path = next_url.attr('href')
31
- url = concat_host path
31
+ url = path.start_with?('/') ? concat_host(path) : path
32
32
 
33
33
  sleep @interval if @interval
34
34
  end
@@ -1,15 +1,15 @@
1
1
  require 'page_by_page/enum'
2
+ require 'thread'
2
3
 
3
4
  module PageByPage
4
5
  class MutexEnum < Enum
5
6
 
6
- def initialize from: 1, step: 1
7
- super
7
+ def initialize enum
8
8
  @q = SizedQueue.new 10
9
+ @enum = enum
9
10
  Thread.new do
10
11
  loop do
11
12
  @q << @enum.next
12
- sleep 0.1
13
13
  end
14
14
  end
15
15
  end
@@ -1,3 +1,3 @@
1
1
  module PageByPage
2
- VERSION = "0.1.13"
2
+ VERSION = "0.1.14"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: page_by_page
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.13
4
+ version: 0.1.14
5
5
  platform: ruby
6
6
  authors:
7
7
  - ken
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-11-20 00:00:00.000000000 Z
11
+ date: 2020-10-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler