page_by_page 0.1.12 → 0.1.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +8 -0
- data/bin/console +2 -2
- data/lib/page_by_page.rb +4 -0
- data/lib/page_by_page/fetch.rb +37 -22
- data/lib/page_by_page/version.rb +1 -1
- data/page_by_page.gemspec +1 -0
- metadata +17 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a00d32af004cc151bf366b9498e18f2e7a2da73629a5db0373b7aad7992f995d
|
4
|
+
data.tar.gz: 5301c76cc3968c6f187e3349d1de5bf2f4a19a48440f1484a29e25ea35af0342
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d1da5e785a40849da75b27c9067098e700444be462791654de47914e45b7f45adb3bd5e94945d7908384cd22668ed30cfbbac6a39a73ef6386d31f378ac6dad4
|
7
|
+
data.tar.gz: 751d3f450cecbf45b2fe5cede0af225adee8948cc5352c936e279c88f765e589fe8c8cad201c320bab03dacb7bff66be70fc5b9325fbebd3e9a017474366d0ca
|
data/README.md
CHANGED
@@ -65,3 +65,11 @@ nodes = PageByPage.fetch(
|
|
65
65
|
# header: {Cookie: 'douban-fav-remind=1'}
|
66
66
|
)
|
67
67
|
```
|
68
|
+
|
69
|
+
Also note that `lazy_fetch` returns an Enumerator rather than an Array, which gives you native lazy loading:
|
70
|
+
|
71
|
+
```ruby
|
72
|
+
nodes = PageByPage.lazy_fetch(
|
73
|
+
#...
|
74
|
+
)
|
75
|
+
```
|
data/bin/console
CHANGED
data/lib/page_by_page.rb
CHANGED
data/lib/page_by_page/fetch.rb
CHANGED
@@ -11,6 +11,7 @@ module PageByPage
|
|
11
11
|
def initialize(opt = {}, &block)
|
12
12
|
@from, @step, @to = 1, 1, Float::INFINITY
|
13
13
|
super
|
14
|
+
@enum = (defined?(@threads) ? MutexEnum : Enum).new(enum_options)
|
14
15
|
end
|
15
16
|
|
16
17
|
def url tmpl
|
@@ -30,39 +31,53 @@ module PageByPage
|
|
30
31
|
end
|
31
32
|
|
32
33
|
def process
|
33
|
-
nodes_2d =
|
34
|
-
unless defined? @threads
|
35
|
-
@enum = Enum.new enum_options
|
36
|
-
_fetch
|
37
|
-
else
|
38
|
-
@enum = MutexEnum.new enum_options
|
39
|
-
parallel_fetch
|
40
|
-
end
|
34
|
+
nodes_2d = defined?(@threads) ? parallel_fetch : _fetch
|
41
35
|
puts if @progress
|
42
36
|
nodes_2d.reject(&:nil?).flatten
|
43
37
|
end
|
44
38
|
|
39
|
+
def iterator
|
40
|
+
Enumerator.new do |yielder|
|
41
|
+
items_enum.each do |_, items|
|
42
|
+
items.each do |i|
|
43
|
+
yielder.yield(i)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
45
49
|
protected
|
46
50
|
|
47
51
|
def _fetch
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
break if n > limit
|
53
|
-
|
54
|
-
url = @tmpl.result binding
|
55
|
-
doc = parse url
|
56
|
-
items = doc.css @selector
|
57
|
-
pages[n] = items
|
58
|
-
|
59
|
-
update_progress Thread.current, n if @progress
|
60
|
-
sleep @interval if @interval
|
61
|
-
end
|
52
|
+
pages = []
|
53
|
+
|
54
|
+
items_enum.each do |page_num, items|
|
55
|
+
pages[page_num] = items
|
62
56
|
end
|
57
|
+
|
63
58
|
pages
|
64
59
|
end
|
65
60
|
|
61
|
+
def items_enum
|
62
|
+
Enumerator.new do |yielder|
|
63
|
+
items = [nil]
|
64
|
+
catch :no_more do
|
65
|
+
until items.empty?
|
66
|
+
n = @enum.next
|
67
|
+
break if n > limit
|
68
|
+
|
69
|
+
url = @tmpl.result binding
|
70
|
+
doc = parse url
|
71
|
+
items = doc.css @selector
|
72
|
+
yielder.yield(n, items)
|
73
|
+
|
74
|
+
update_progress Thread.current, n if @progress
|
75
|
+
sleep @interval if @interval
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
66
81
|
def parallel_fetch
|
67
82
|
ts = @threads.times.map do |n|
|
68
83
|
Thread.new do
|
data/lib/page_by_page/version.rb
CHANGED
data/page_by_page.gemspec
CHANGED
@@ -32,5 +32,6 @@ Gem::Specification.new do |spec|
|
|
32
32
|
spec.add_development_dependency "bundler", "~> 1.13"
|
33
33
|
spec.add_development_dependency "rake", "~> 10.0"
|
34
34
|
spec.add_development_dependency "minitest", "~> 5.0"
|
35
|
+
spec.add_development_dependency "pry"
|
35
36
|
spec.add_dependency 'nokogiri', '~> 1.6'
|
36
37
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: page_by_page
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.12
|
4
|
+
version: 0.1.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ken
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-11-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '5.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: pry
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: nokogiri
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -110,8 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
110
124
|
- !ruby/object:Gem::Version
|
111
125
|
version: '0'
|
112
126
|
requirements: []
|
113
|
-
|
114
|
-
rubygems_version: 2.7.6
|
127
|
+
rubygems_version: 3.0.3
|
115
128
|
signing_key:
|
116
129
|
specification_version: 4
|
117
130
|
summary: scrape page by page , according to url pattern
|