page_by_page 0.1.5 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/page_by_page.rb +36 -5
- data/lib/page_by_page/enum.rb +5 -13
- data/lib/page_by_page/mutex_enum.rb +22 -0
- data/lib/page_by_page/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ac89d6a7b6b45addd50b7738d2a8b90c6609b1f9
|
4
|
+
data.tar.gz: 3ea2de6bddb75c604dea419092b50e1f353fc4ec
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7a826438ea81f09e717674bbbc955af03676303056ad576c28905bfb74eb45a935f34e3127e85c19c0e5c31d3329a9f91b8e0fbdd87fc76a65b76ffa19dd41cd
|
7
|
+
data.tar.gz: 2706c0b89de6f87b454061e9372db5f5cb1885d5cd735df4d587421a6aa464576b64e948cbc1f5d22e91f3814a3d9b3935410a96d1977d6fab1becf0652aa54c
|
data/lib/page_by_page.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
require 'page_by_page/version'
|
2
2
|
require 'page_by_page/enum'
|
3
|
+
require 'page_by_page/mutex_enum'
|
3
4
|
require 'nokogiri'
|
4
5
|
require 'open-uri'
|
6
|
+
require 'erb'
|
5
7
|
|
6
8
|
class PageByPage
|
7
9
|
|
@@ -36,23 +38,52 @@ class PageByPage
|
|
36
38
|
@to = n
|
37
39
|
end
|
38
40
|
|
41
|
+
def threads n
|
42
|
+
@threads = n
|
43
|
+
end
|
44
|
+
|
39
45
|
def fetch
|
40
|
-
|
46
|
+
nodes_2d =
|
47
|
+
unless @threads
|
48
|
+
@enum = Enum.new options
|
49
|
+
_fetch
|
50
|
+
else
|
51
|
+
@enum = MutexEnum.new options
|
52
|
+
parallel_fetch
|
53
|
+
end
|
54
|
+
nodes_2d.flatten
|
55
|
+
end
|
56
|
+
|
57
|
+
private
|
58
|
+
|
59
|
+
def _fetch
|
41
60
|
items, all_items = [nil], []
|
42
61
|
catch :no_more do
|
43
62
|
until items.empty?
|
44
|
-
n = enum.next
|
63
|
+
n = @enum.next
|
45
64
|
break if n > limit
|
46
65
|
url = @tmpl.result binding
|
47
66
|
doc = parse url
|
48
67
|
items = doc.css @selector
|
49
|
-
all_items
|
68
|
+
all_items[n] = items
|
50
69
|
end
|
51
70
|
end
|
52
|
-
all_items
|
71
|
+
all_items
|
53
72
|
end
|
54
73
|
|
55
|
-
|
74
|
+
def parallel_fetch
|
75
|
+
ts = @threads.times.map do |n|
|
76
|
+
Thread.new do
|
77
|
+
Thread.current[:sub] = _fetch
|
78
|
+
end
|
79
|
+
end
|
80
|
+
ts.each_with_object([]) do |t, pages|
|
81
|
+
t.join
|
82
|
+
t[:sub].each_with_index do |items, i|
|
83
|
+
pages[i] = items if items
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
56
87
|
|
57
88
|
def parse url
|
58
89
|
page = open(url)
|
data/lib/page_by_page/enum.rb
CHANGED
@@ -1,20 +1,12 @@
|
|
1
|
-
require 'forwardable'
|
2
|
-
require 'erb'
|
3
|
-
|
4
1
|
class PageByPage
|
5
2
|
class Enum
|
6
|
-
extend Forwardable
|
7
|
-
|
8
|
-
def_delegator :@enum, :next
|
9
3
|
|
10
4
|
def initialize from: 1, step: 1
|
11
|
-
@enum =
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
end
|
17
|
-
end
|
5
|
+
@enum = (from..Float::INFINITY).step(step).lazy.map(&:to_i).to_enum
|
6
|
+
end
|
7
|
+
|
8
|
+
def next
|
9
|
+
@enum.next
|
18
10
|
end
|
19
11
|
|
20
12
|
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'page_by_page/enum'
|
2
|
+
|
3
|
+
class PageByPage
|
4
|
+
class MutexEnum < Enum
|
5
|
+
|
6
|
+
def initialize from: 1, step: 1
|
7
|
+
super
|
8
|
+
@q = SizedQueue.new 10
|
9
|
+
Thread.new do
|
10
|
+
loop do
|
11
|
+
@q << @enum.next
|
12
|
+
sleep 0.1
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
def next
|
18
|
+
@q.deq
|
19
|
+
end
|
20
|
+
|
21
|
+
end
|
22
|
+
end
|
data/lib/page_by_page/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: page_by_page
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.5
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ken
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-01-
|
11
|
+
date: 2017-01-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -84,6 +84,7 @@ files:
|
|
84
84
|
- bin/setup
|
85
85
|
- lib/page_by_page.rb
|
86
86
|
- lib/page_by_page/enum.rb
|
87
|
+
- lib/page_by_page/mutex_enum.rb
|
87
88
|
- lib/page_by_page/version.rb
|
88
89
|
- page_by_page.gemspec
|
89
90
|
homepage: https://github.com/turnon/page_by_page
|