page_by_page 0.1.12 → 0.1.13

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 46a381dd7f98d7e663cc854796d1db3394b7a7b65f9a415669106f1c6be8f61d
4
- data.tar.gz: d8db8c6100555d3c7a015ec010e12fb291a67cd1fda762c04dc046419bd8ff6d
3
+ metadata.gz: a00d32af004cc151bf366b9498e18f2e7a2da73629a5db0373b7aad7992f995d
4
+ data.tar.gz: 5301c76cc3968c6f187e3349d1de5bf2f4a19a48440f1484a29e25ea35af0342
5
5
  SHA512:
6
- metadata.gz: 61a81b079f177ac91b9d4dbf17bbea9ec4f78b2f6ee17a4da240a6ed012309c2d7dd75bc6996af3f2b7ad8be947baa0c6531978317aac02f2dc5791ebeb8f2e3
7
- data.tar.gz: 85032f0d49c01bb26ec4c714aa499cdb152a67a75752cf57c60e71a65e77a0a0b83ef81d11f70a60374cdcc18915bf158f51d9646ddbe8c05bd1dfc564450e53
6
+ metadata.gz: d1da5e785a40849da75b27c9067098e700444be462791654de47914e45b7f45adb3bd5e94945d7908384cd22668ed30cfbbac6a39a73ef6386d31f378ac6dad4
7
+ data.tar.gz: 751d3f450cecbf45b2fe5cede0af225adee8948cc5352c936e279c88f765e589fe8c8cad201c320bab03dacb7bff66be70fc5b9325fbebd3e9a017474366d0ca
data/README.md CHANGED
@@ -65,3 +65,11 @@ nodes = PageByPage.fetch(
65
65
  # header: {Cookie: 'douban-fav-remind=1'}
66
66
  )
67
67
  ```
68
+
69
+ Also note that `lazy_fetch` returns an Enumerator instead of an Array, so pages are fetched lazily as you iterate:
70
+
71
+ ```ruby
72
+ nodes = PageByPage.lazy_fetch(
73
+ #...
74
+ )
75
+ ```
data/bin/console CHANGED
@@ -10,5 +10,5 @@ require "page_by_page"
10
10
  # require "pry"
11
11
  # Pry.start
12
12
 
13
- require "irb"
14
- IRB.start
13
+ require "pry"
14
+ pry
data/lib/page_by_page.rb CHANGED
@@ -9,6 +9,10 @@ module PageByPage
9
9
  Fetch.new(*args, &block).process
10
10
  end
11
11
 
12
+ def lazy_fetch(*args, &block)
13
+ Fetch.new(*args, &block).iterator
14
+ end
15
+
12
16
  def jump(*args, &block)
13
17
  Jump.new(*args, &block).process
14
18
  end
@@ -11,6 +11,7 @@ module PageByPage
11
11
  def initialize(opt = {}, &block)
12
12
  @from, @step, @to = 1, 1, Float::INFINITY
13
13
  super
14
+ @enum = (defined?(@threads) ? MutexEnum : Enum).new(enum_options)
14
15
  end
15
16
 
16
17
  def url tmpl
@@ -30,39 +31,53 @@ module PageByPage
30
31
  end
31
32
 
32
33
  def process
33
- nodes_2d =
34
- unless defined? @threads
35
- @enum = Enum.new enum_options
36
- _fetch
37
- else
38
- @enum = MutexEnum.new enum_options
39
- parallel_fetch
40
- end
34
+ nodes_2d = defined?(@threads) ? parallel_fetch : _fetch
41
35
  puts if @progress
42
36
  nodes_2d.reject(&:nil?).flatten
43
37
  end
44
38
 
39
+ def iterator
40
+ Enumerator.new do |yielder|
41
+ items_enum.each do |_, items|
42
+ items.each do |i|
43
+ yielder.yield(i)
44
+ end
45
+ end
46
+ end
47
+ end
48
+
45
49
  protected
46
50
 
47
51
  def _fetch
48
- items, pages = [nil], []
49
- catch :no_more do
50
- until items.empty?
51
- n = @enum.next
52
- break if n > limit
53
-
54
- url = @tmpl.result binding
55
- doc = parse url
56
- items = doc.css @selector
57
- pages[n] = items
58
-
59
- update_progress Thread.current, n if @progress
60
- sleep @interval if @interval
61
- end
52
+ pages = []
53
+
54
+ items_enum.each do |page_num, items|
55
+ pages[page_num] = items
62
56
  end
57
+
63
58
  pages
64
59
  end
65
60
 
61
+ def items_enum
62
+ Enumerator.new do |yielder|
63
+ items = [nil]
64
+ catch :no_more do
65
+ until items.empty?
66
+ n = @enum.next
67
+ break if n > limit
68
+
69
+ url = @tmpl.result binding
70
+ doc = parse url
71
+ items = doc.css @selector
72
+ yielder.yield(n, items)
73
+
74
+ update_progress Thread.current, n if @progress
75
+ sleep @interval if @interval
76
+ end
77
+ end
78
+ end
79
+ end
80
+
66
81
  def parallel_fetch
67
82
  ts = @threads.times.map do |n|
68
83
  Thread.new do
@@ -1,3 +1,3 @@
1
1
  module PageByPage
2
- VERSION = "0.1.12"
2
+ VERSION = "0.1.13"
3
3
  end
data/page_by_page.gemspec CHANGED
@@ -32,5 +32,6 @@ Gem::Specification.new do |spec|
32
32
  spec.add_development_dependency "bundler", "~> 1.13"
33
33
  spec.add_development_dependency "rake", "~> 10.0"
34
34
  spec.add_development_dependency "minitest", "~> 5.0"
35
+ spec.add_development_dependency "pry"
35
36
  spec.add_dependency 'nokogiri', '~> 1.6'
36
37
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: page_by_page
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.12
4
+ version: 0.1.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - ken
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-03-02 00:00:00.000000000 Z
11
+ date: 2019-11-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '5.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pry
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: nokogiri
57
71
  requirement: !ruby/object:Gem::Requirement
@@ -110,8 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
110
124
  - !ruby/object:Gem::Version
111
125
  version: '0'
112
126
  requirements: []
113
- rubyforge_project:
114
- rubygems_version: 2.7.6
127
+ rubygems_version: 3.0.3
115
128
  signing_key:
116
129
  specification_version: 4
117
130
  summary: scrape page by page , according to url pattern