page_by_page 0.1.13 → 0.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a00d32af004cc151bf366b9498e18f2e7a2da73629a5db0373b7aad7992f995d
4
- data.tar.gz: 5301c76cc3968c6f187e3349d1de5bf2f4a19a48440f1484a29e25ea35af0342
3
+ metadata.gz: 51cda3ebfc8bc9a353a51f1386a91cf6ea3d2fbbc219334f278844765b35ad98
4
+ data.tar.gz: 8329a8af72dd945284617f053cbb1ae686cd5eb9151dd1cf9be0c99b7cdd12ac
5
5
  SHA512:
6
- metadata.gz: d1da5e785a40849da75b27c9067098e700444be462791654de47914e45b7f45adb3bd5e94945d7908384cd22668ed30cfbbac6a39a73ef6386d31f378ac6dad4
7
- data.tar.gz: 751d3f450cecbf45b2fe5cede0af225adee8948cc5352c936e279c88f765e589fe8c8cad201c320bab03dacb7bff66be70fc5b9325fbebd3e9a017474366d0ca
6
+ metadata.gz: 6761cb5c91ed8736be1193237caa2725110070b9de332af5bd6175371064b3b3d939ba997634c8e3711f8f1d81c7dc3988436d807858951c6fd398e39249b8c7
7
+ data.tar.gz: 1cee71ddc3dab888aaf75a8cf7139bfd630ef63c29dc954d00c5cdac7c4a251e78a19dedc56ea3e6b8713289b514cabe2519322efc5accd977fb6084a1768088
data/README.md CHANGED
@@ -20,7 +20,9 @@ Or install it yourself as:
20
20
 
21
21
  ## Usage
22
22
 
23
- If you know page number pattern, use fetch:
23
+ ### number pattern
24
+
25
+ If you know the page number pattern, use `fetch`:
24
26
 
25
27
  ```ruby
26
28
  nodes = PageByPage.fetch do
@@ -36,7 +38,21 @@ nodes = PageByPage.fetch do
36
38
  end
37
39
  ```
38
40
 
39
- If you don't know the pattern, but you see link to next page, use jump:
41
+ ### other pattern
42
+
43
+ If the pattern is not simple numbers, use `enumerator` in `fetch`:
44
+
45
+ ```ruby
46
+ nodes = PageByPage.fetch do
47
+ url 'http://mysql.taobao.org/monthly/<%= n %>'
48
+ selector 'h3'
49
+ enumerator ['2020/09/', '2020/08/'].to_enum
50
+ end
51
+ ```
52
+
53
+ ### unknown pattern
54
+
55
+ If you don't know the pattern, but you see a link to the next page, use `jump`:
40
56
 
41
57
  ```ruby
42
58
  nodes = PageByPage.jump do
@@ -50,6 +66,8 @@ nodes = PageByPage.jump do
50
66
  end
51
67
  ```
52
68
 
69
+ ### parameters instead of block
70
+
53
71
  You may just pass parameters instead of a block:
54
72
 
55
73
  ```ruby
@@ -66,6 +84,8 @@ nodes = PageByPage.fetch(
66
84
  )
67
85
  ```
68
86
 
87
+ ### lazy
88
+
69
89
  Also note that, instead of an Array, `lazy_fetch` returns an Enumerator, which is natively lazy:
70
90
 
71
91
  ```ruby
@@ -1,12 +1,12 @@
1
1
  module PageByPage
2
2
  class Enum
3
3
 
4
- def initialize from: 1, step: 1
5
- @enum = (from..Float::INFINITY).step(step).lazy.map(&:to_i).to_enum
4
+ def initialize from: 1, step: 1, limit: nil, enumerator: nil
5
+ @enum = enumerator || (from..limit).step(step).lazy.map(&:to_i).to_enum
6
6
  end
7
7
 
8
8
  def next
9
- @enum.next
9
+ @enum.next rescue nil
10
10
  end
11
11
 
12
12
  end
@@ -11,7 +11,8 @@ module PageByPage
11
11
  def initialize(opt = {}, &block)
12
12
  @from, @step, @to = 1, 1, Float::INFINITY
13
13
  super
14
- @enum = (defined?(@threads) ? MutexEnum : Enum).new(enum_options)
14
+ @enum = Enum.new(enum_options)
15
+ @enum = MutexEnum.new(@enum) if defined? @threads
15
16
  end
16
17
 
17
18
  def url tmpl
@@ -30,10 +31,17 @@ module PageByPage
30
31
  @threads = n
31
32
  end
32
33
 
34
+ def enumerator e
35
+ @enumerator = e
36
+ end
37
+
33
38
  def process
34
39
  nodes_2d = defined?(@threads) ? parallel_fetch : _fetch
35
40
  puts if @progress
36
- nodes_2d.reject(&:nil?).flatten
41
+
42
+ nodes_2d.sort.each_with_object([]) do |key_items, res|
43
+ res.concat key_items[1] unless key_items[1].nil?
44
+ end
37
45
  end
38
46
 
39
47
  def iterator
@@ -49,7 +57,7 @@ module PageByPage
49
57
  protected
50
58
 
51
59
  def _fetch
52
- pages = []
60
+ pages = {}
53
61
 
54
62
  items_enum.each do |page_num, items|
55
63
  pages[page_num] = items
@@ -64,7 +72,7 @@ module PageByPage
64
72
  catch :no_more do
65
73
  until items.empty?
66
74
  n = @enum.next
67
- break if n > limit
75
+ break if n.nil?
68
76
 
69
77
  url = @tmpl.result binding
70
78
  doc = parse url
@@ -84,16 +92,14 @@ module PageByPage
84
92
  Thread.current[:sub] = _fetch
85
93
  end
86
94
  end
87
- ts.each_with_object([]) do |t, pages|
95
+ ts.each_with_object({}) do |t, pages|
88
96
  t.join
89
- t[:sub].each_with_index do |items, i|
90
- pages[i] = items if items
91
- end
97
+ pages.merge! t[:sub]
92
98
  end
93
99
  end
94
100
 
95
101
  def enum_options
96
- {from: @from, step: @step}
102
+ {from: @from, step: @step, limit: limit, enumerator: @enumerator}
97
103
  end
98
104
 
99
105
  end
@@ -28,7 +28,7 @@ module PageByPage
28
28
  break unless next_url
29
29
 
30
30
  path = next_url.attr('href')
31
- url = concat_host path
31
+ url = path.start_with?('/') ? concat_host(path) : path
32
32
 
33
33
  sleep @interval if @interval
34
34
  end
@@ -1,15 +1,15 @@
1
1
  require 'page_by_page/enum'
2
+ require 'thread'
2
3
 
3
4
  module PageByPage
4
5
  class MutexEnum < Enum
5
6
 
6
- def initialize from: 1, step: 1
7
- super
7
+ def initialize enum
8
8
  @q = SizedQueue.new 10
9
+ @enum = enum
9
10
  Thread.new do
10
11
  loop do
11
12
  @q << @enum.next
12
- sleep 0.1
13
13
  end
14
14
  end
15
15
  end
@@ -1,3 +1,3 @@
1
1
  module PageByPage
2
- VERSION = "0.1.13"
2
+ VERSION = "0.1.14"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: page_by_page
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.13
4
+ version: 0.1.14
5
5
  platform: ruby
6
6
  authors:
7
7
  - ken
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-11-20 00:00:00.000000000 Z
11
+ date: 2020-10-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler