page_by_page 0.1.12 → 0.1.13
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +8 -0
- data/bin/console +2 -2
- data/lib/page_by_page.rb +4 -0
- data/lib/page_by_page/fetch.rb +37 -22
- data/lib/page_by_page/version.rb +1 -1
- data/page_by_page.gemspec +1 -0
- metadata +17 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a00d32af004cc151bf366b9498e18f2e7a2da73629a5db0373b7aad7992f995d
|
4
|
+
data.tar.gz: 5301c76cc3968c6f187e3349d1de5bf2f4a19a48440f1484a29e25ea35af0342
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d1da5e785a40849da75b27c9067098e700444be462791654de47914e45b7f45adb3bd5e94945d7908384cd22668ed30cfbbac6a39a73ef6386d31f378ac6dad4
|
7
|
+
data.tar.gz: 751d3f450cecbf45b2fe5cede0af225adee8948cc5352c936e279c88f765e589fe8c8cad201c320bab03dacb7bff66be70fc5b9325fbebd3e9a017474366d0ca
|
data/README.md
CHANGED
@@ -65,3 +65,11 @@ nodes = PageByPage.fetch(
|
|
65
65
|
# header: {Cookie: 'douban-fav-remind=1'}
|
66
66
|
)
|
67
67
|
```
|
68
|
+
|
69
|
+
Also note that `lazy_fetch` returns an Enumerator instead of an Array, which gives you native lazy loading:
|
70
|
+
|
71
|
+
```ruby
|
72
|
+
nodes = PageByPage.lazy_fetch(
|
73
|
+
#...
|
74
|
+
)
|
75
|
+
```
|
data/bin/console
CHANGED
data/lib/page_by_page.rb
CHANGED
data/lib/page_by_page/fetch.rb
CHANGED
@@ -11,6 +11,7 @@ module PageByPage
|
|
11
11
|
def initialize(opt = {}, &block)
|
12
12
|
@from, @step, @to = 1, 1, Float::INFINITY
|
13
13
|
super
|
14
|
+
@enum = (defined?(@threads) ? MutexEnum : Enum).new(enum_options)
|
14
15
|
end
|
15
16
|
|
16
17
|
def url tmpl
|
@@ -30,39 +31,53 @@ module PageByPage
|
|
30
31
|
end
|
31
32
|
|
32
33
|
def process
|
33
|
-
nodes_2d =
|
34
|
-
unless defined? @threads
|
35
|
-
@enum = Enum.new enum_options
|
36
|
-
_fetch
|
37
|
-
else
|
38
|
-
@enum = MutexEnum.new enum_options
|
39
|
-
parallel_fetch
|
40
|
-
end
|
34
|
+
nodes_2d = defined?(@threads) ? parallel_fetch : _fetch
|
41
35
|
puts if @progress
|
42
36
|
nodes_2d.reject(&:nil?).flatten
|
43
37
|
end
|
44
38
|
|
39
|
+
def iterator
|
40
|
+
Enumerator.new do |yielder|
|
41
|
+
items_enum.each do |_, items|
|
42
|
+
items.each do |i|
|
43
|
+
yielder.yield(i)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
45
49
|
protected
|
46
50
|
|
47
51
|
def _fetch
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
break if n > limit
|
53
|
-
|
54
|
-
url = @tmpl.result binding
|
55
|
-
doc = parse url
|
56
|
-
items = doc.css @selector
|
57
|
-
pages[n] = items
|
58
|
-
|
59
|
-
update_progress Thread.current, n if @progress
|
60
|
-
sleep @interval if @interval
|
61
|
-
end
|
52
|
+
pages = []
|
53
|
+
|
54
|
+
items_enum.each do |page_num, items|
|
55
|
+
pages[page_num] = items
|
62
56
|
end
|
57
|
+
|
63
58
|
pages
|
64
59
|
end
|
65
60
|
|
61
|
+
def items_enum
|
62
|
+
Enumerator.new do |yielder|
|
63
|
+
items = [nil]
|
64
|
+
catch :no_more do
|
65
|
+
until items.empty?
|
66
|
+
n = @enum.next
|
67
|
+
break if n > limit
|
68
|
+
|
69
|
+
url = @tmpl.result binding
|
70
|
+
doc = parse url
|
71
|
+
items = doc.css @selector
|
72
|
+
yielder.yield(n, items)
|
73
|
+
|
74
|
+
update_progress Thread.current, n if @progress
|
75
|
+
sleep @interval if @interval
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
66
81
|
def parallel_fetch
|
67
82
|
ts = @threads.times.map do |n|
|
68
83
|
Thread.new do
|
data/lib/page_by_page/version.rb
CHANGED
data/page_by_page.gemspec
CHANGED
@@ -32,5 +32,6 @@ Gem::Specification.new do |spec|
|
|
32
32
|
spec.add_development_dependency "bundler", "~> 1.13"
|
33
33
|
spec.add_development_dependency "rake", "~> 10.0"
|
34
34
|
spec.add_development_dependency "minitest", "~> 5.0"
|
35
|
+
spec.add_development_dependency "pry"
|
35
36
|
spec.add_dependency 'nokogiri', '~> 1.6'
|
36
37
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: page_by_page
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.12
|
4
|
+
version: 0.1.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ken
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-11-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '5.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: pry
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: nokogiri
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -110,8 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
110
124
|
- !ruby/object:Gem::Version
|
111
125
|
version: '0'
|
112
126
|
requirements: []
|
113
|
-
|
114
|
-
rubygems_version: 2.7.6
|
127
|
+
rubygems_version: 3.0.3
|
115
128
|
signing_key:
|
116
129
|
specification_version: 4
|
117
130
|
summary: scrape page by page , according to url pattern
|