page_by_page 0.1.12 → 0.1.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 46a381dd7f98d7e663cc854796d1db3394b7a7b65f9a415669106f1c6be8f61d
4
- data.tar.gz: d8db8c6100555d3c7a015ec010e12fb291a67cd1fda762c04dc046419bd8ff6d
3
+ metadata.gz: a00d32af004cc151bf366b9498e18f2e7a2da73629a5db0373b7aad7992f995d
4
+ data.tar.gz: 5301c76cc3968c6f187e3349d1de5bf2f4a19a48440f1484a29e25ea35af0342
5
5
  SHA512:
6
- metadata.gz: 61a81b079f177ac91b9d4dbf17bbea9ec4f78b2f6ee17a4da240a6ed012309c2d7dd75bc6996af3f2b7ad8be947baa0c6531978317aac02f2dc5791ebeb8f2e3
7
- data.tar.gz: 85032f0d49c01bb26ec4c714aa499cdb152a67a75752cf57c60e71a65e77a0a0b83ef81d11f70a60374cdcc18915bf158f51d9646ddbe8c05bd1dfc564450e53
6
+ metadata.gz: d1da5e785a40849da75b27c9067098e700444be462791654de47914e45b7f45adb3bd5e94945d7908384cd22668ed30cfbbac6a39a73ef6386d31f378ac6dad4
7
+ data.tar.gz: 751d3f450cecbf45b2fe5cede0af225adee8948cc5352c936e279c88f765e589fe8c8cad201c320bab03dacb7bff66be70fc5b9325fbebd3e9a017474366d0ca
data/README.md CHANGED
@@ -65,3 +65,11 @@ nodes = PageByPage.fetch(
65
65
  # header: {Cookie: 'douban-fav-remind=1'}
66
66
  )
67
67
  ```
68
+
69
+ Also note that `lazy_fetch` returns an Enumerator instead of an Array, so results are loaded lazily as you iterate:
70
+
71
+ ```ruby
72
+ nodes = PageByPage.lazy_fetch(
73
+ #...
74
+ )
75
+ ```
data/bin/console CHANGED
@@ -10,5 +10,5 @@ require "page_by_page"
10
10
  # require "pry"
11
11
  # Pry.start
12
12
 
13
- require "irb"
14
- IRB.start
13
+ require "pry"
14
+ pry
data/lib/page_by_page.rb CHANGED
@@ -9,6 +9,10 @@ module PageByPage
9
9
  Fetch.new(*args, &block).process
10
10
  end
11
11
 
12
+ def lazy_fetch(*args, &block)
13
+ Fetch.new(*args, &block).iterator
14
+ end
15
+
12
16
  def jump(*args, &block)
13
17
  Jump.new(*args, &block).process
14
18
  end
@@ -11,6 +11,7 @@ module PageByPage
11
11
  def initialize(opt = {}, &block)
12
12
  @from, @step, @to = 1, 1, Float::INFINITY
13
13
  super
14
+ @enum = (defined?(@threads) ? MutexEnum : Enum).new(enum_options)
14
15
  end
15
16
 
16
17
  def url tmpl
@@ -30,39 +31,53 @@ module PageByPage
30
31
  end
31
32
 
32
33
  def process
33
- nodes_2d =
34
- unless defined? @threads
35
- @enum = Enum.new enum_options
36
- _fetch
37
- else
38
- @enum = MutexEnum.new enum_options
39
- parallel_fetch
40
- end
34
+ nodes_2d = defined?(@threads) ? parallel_fetch : _fetch
41
35
  puts if @progress
42
36
  nodes_2d.reject(&:nil?).flatten
43
37
  end
44
38
 
39
+ def iterator
40
+ Enumerator.new do |yielder|
41
+ items_enum.each do |_, items|
42
+ items.each do |i|
43
+ yielder.yield(i)
44
+ end
45
+ end
46
+ end
47
+ end
48
+
45
49
  protected
46
50
 
47
51
  def _fetch
48
- items, pages = [nil], []
49
- catch :no_more do
50
- until items.empty?
51
- n = @enum.next
52
- break if n > limit
53
-
54
- url = @tmpl.result binding
55
- doc = parse url
56
- items = doc.css @selector
57
- pages[n] = items
58
-
59
- update_progress Thread.current, n if @progress
60
- sleep @interval if @interval
61
- end
52
+ pages = []
53
+
54
+ items_enum.each do |page_num, items|
55
+ pages[page_num] = items
62
56
  end
57
+
63
58
  pages
64
59
  end
65
60
 
61
+ def items_enum
62
+ Enumerator.new do |yielder|
63
+ items = [nil]
64
+ catch :no_more do
65
+ until items.empty?
66
+ n = @enum.next
67
+ break if n > limit
68
+
69
+ url = @tmpl.result binding
70
+ doc = parse url
71
+ items = doc.css @selector
72
+ yielder.yield(n, items)
73
+
74
+ update_progress Thread.current, n if @progress
75
+ sleep @interval if @interval
76
+ end
77
+ end
78
+ end
79
+ end
80
+
66
81
  def parallel_fetch
67
82
  ts = @threads.times.map do |n|
68
83
  Thread.new do
@@ -1,3 +1,3 @@
1
1
  module PageByPage
2
- VERSION = "0.1.12"
2
+ VERSION = "0.1.13"
3
3
  end
data/page_by_page.gemspec CHANGED
@@ -32,5 +32,6 @@ Gem::Specification.new do |spec|
32
32
  spec.add_development_dependency "bundler", "~> 1.13"
33
33
  spec.add_development_dependency "rake", "~> 10.0"
34
34
  spec.add_development_dependency "minitest", "~> 5.0"
35
+ spec.add_development_dependency "pry"
35
36
  spec.add_dependency 'nokogiri', '~> 1.6'
36
37
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: page_by_page
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.12
4
+ version: 0.1.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - ken
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-03-02 00:00:00.000000000 Z
11
+ date: 2019-11-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '5.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pry
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: nokogiri
57
71
  requirement: !ruby/object:Gem::Requirement
@@ -110,8 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
110
124
  - !ruby/object:Gem::Version
111
125
  version: '0'
112
126
  requirements: []
113
- rubyforge_project:
114
- rubygems_version: 2.7.6
127
+ rubygems_version: 3.0.3
115
128
  signing_key:
116
129
  specification_version: 4
117
130
  summary: scrape page by page , according to url pattern