page_by_page 0.1.10 → 0.1.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/page_by_page/common.rb +63 -0
- data/lib/page_by_page/enum.rb +1 -1
- data/lib/page_by_page/fetch.rb +11 -7
- data/lib/page_by_page/jump.rb +12 -7
- data/lib/page_by_page/mutex_enum.rb +1 -1
- data/lib/page_by_page/version.rb +2 -2
- data/lib/page_by_page.rb +3 -61
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 46595cb8e0d8590c5b614eac4301b0423824a12f43b94e83294e325fd1c7f5c7
|
4
|
+
data.tar.gz: 11ac56d8a0061a00d2807307f285332c5548b06b915bfcd94675d5d57d7b8454
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fa81661e065393f115f355e666f43b5bf636019f50c4c40a33faf8bfe44e1134cce5f7eb3255f68a427d178b3e90a986a69b059551cfecca51ff779b3bd3c25d
|
7
|
+
data.tar.gz: 3f92a09ec86789cd0517597a5f15850bcb48303c4e1c3980efb3d57c6ba2d34c4c46fc287df060fb9328ca6efc5543f3c5abe5b2f717974b618b2a171fd0116f
|
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
module PageByPage
|
5
|
+
module Common
|
6
|
+
def initialize(opt = {}, &block)
|
7
|
+
@progress = {}
|
8
|
+
opt.each{ |name, value| send name, value }
|
9
|
+
instance_eval &block if block
|
10
|
+
end
|
11
|
+
|
12
|
+
def to n
|
13
|
+
@to = n
|
14
|
+
end
|
15
|
+
|
16
|
+
def selector sl
|
17
|
+
@selector = sl
|
18
|
+
end
|
19
|
+
|
20
|
+
def header hash
|
21
|
+
@header = hash
|
22
|
+
end
|
23
|
+
|
24
|
+
def interval second
|
25
|
+
@interval = second
|
26
|
+
end
|
27
|
+
|
28
|
+
def no_progress *arg
|
29
|
+
@progress = nil
|
30
|
+
end
|
31
|
+
|
32
|
+
protected
|
33
|
+
|
34
|
+
def parse url
|
35
|
+
page = open(url, http_header)
|
36
|
+
Nokogiri::HTML page.read
|
37
|
+
rescue OpenURI::HTTPError => e
|
38
|
+
if e.message == '404 Not Found'
|
39
|
+
throw :no_more
|
40
|
+
else
|
41
|
+
raise e
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
def http_header
|
46
|
+
@http_header ||= (
|
47
|
+
h = {}
|
48
|
+
Hash(@header).each_pair{ |k, v| h[k.to_s] = v }
|
49
|
+
h
|
50
|
+
)
|
51
|
+
end
|
52
|
+
|
53
|
+
def limit
|
54
|
+
@to ||= Float::INFINITY
|
55
|
+
end
|
56
|
+
|
57
|
+
def update_progress thread, page_num
|
58
|
+
@progress[thread] = page_num
|
59
|
+
printf "\r%s => %s", Time.now.strftime('%F %T'), @progress.values.sort
|
60
|
+
end
|
61
|
+
|
62
|
+
end
|
63
|
+
end
|
data/lib/page_by_page/enum.rb
CHANGED
data/lib/page_by_page/fetch.rb
CHANGED
@@ -1,9 +1,17 @@
|
|
1
1
|
require 'page_by_page/enum'
|
2
2
|
require 'page_by_page/mutex_enum'
|
3
|
+
require 'page_by_page/common'
|
3
4
|
require 'erb'
|
4
5
|
|
5
|
-
|
6
|
-
|
6
|
+
module PageByPage
|
7
|
+
class Fetch
|
8
|
+
|
9
|
+
include Common
|
10
|
+
|
11
|
+
def initialize(opt = {}, &block)
|
12
|
+
@from, @step, @to = 1, 1, Float::INFINITY
|
13
|
+
super
|
14
|
+
end
|
7
15
|
|
8
16
|
def url tmpl
|
9
17
|
@tmpl = ERB.new tmpl
|
@@ -21,11 +29,7 @@ class PageByPage
|
|
21
29
|
@threads = n
|
22
30
|
end
|
23
31
|
|
24
|
-
def
|
25
|
-
@progress = nil
|
26
|
-
end
|
27
|
-
|
28
|
-
def fetch
|
32
|
+
def process
|
29
33
|
nodes_2d =
|
30
34
|
unless defined? @threads
|
31
35
|
@enum = Enum.new enum_options
|
data/lib/page_by_page/jump.rb
CHANGED
@@ -1,5 +1,9 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
require 'page_by_page/common'
|
2
|
+
|
3
|
+
module PageByPage
|
4
|
+
class Jump
|
5
|
+
|
6
|
+
include Common
|
3
7
|
|
4
8
|
def start url
|
5
9
|
@start = url
|
@@ -9,26 +13,27 @@ class PageByPage
|
|
9
13
|
@iterate = selector
|
10
14
|
end
|
11
15
|
|
12
|
-
def jump
|
16
|
+
def process
|
13
17
|
url, items, page_count = @start, [], 0
|
14
18
|
|
15
19
|
while true do
|
16
20
|
doc = parse url
|
17
21
|
doc.css(@selector).each{ |item| items << item }
|
18
22
|
|
23
|
+
page_count += 1
|
24
|
+
update_progress Thread.current, page_count if @progress
|
25
|
+
break if page_count >= limit
|
26
|
+
|
19
27
|
next_url = doc.at_css(@iterate)
|
20
28
|
break unless next_url
|
21
29
|
|
22
30
|
path = next_url.attr('href')
|
23
31
|
url = concat_host path
|
24
32
|
|
25
|
-
page_count += 1
|
26
|
-
update_progress Thread.current, page_count if @progress
|
27
|
-
break if page_count >= limit
|
28
|
-
|
29
33
|
sleep @interval if @interval
|
30
34
|
end
|
31
35
|
|
36
|
+
puts if @progress
|
32
37
|
items
|
33
38
|
end
|
34
39
|
|
data/lib/page_by_page/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
|
2
|
-
VERSION = "0.1.10"
|
1
|
+
module PageByPage
|
2
|
+
VERSION = "0.1.11"
|
3
3
|
end
|
data/lib/page_by_page.rb
CHANGED
@@ -1,75 +1,17 @@
|
|
1
1
|
require 'page_by_page/version'
|
2
2
|
require 'page_by_page/fetch'
|
3
3
|
require 'page_by_page/jump'
|
4
|
-
require 'nokogiri'
|
5
|
-
require 'open-uri'
|
6
4
|
|
7
|
-
|
8
|
-
|
9
|
-
include Fetch
|
10
|
-
include Jump
|
5
|
+
module PageByPage
|
11
6
|
|
12
7
|
class << self
|
13
8
|
def fetch(*args, &block)
|
14
|
-
new(*args, &block).fetch
|
9
|
+
Fetch.new(*args, &block).process
|
15
10
|
end
|
16
11
|
|
17
12
|
def jump(*args, &block)
|
18
|
-
new(*args, &block).jump
|
13
|
+
Jump.new(*args, &block).process
|
19
14
|
end
|
20
15
|
end
|
21
16
|
|
22
|
-
def initialize(opt = {}, &block)
|
23
|
-
@from, @step, @to = 1, 1, Float::INFINITY
|
24
|
-
@progress = {}
|
25
|
-
opt.each{ |name, value| send name, value }
|
26
|
-
instance_eval &block if block
|
27
|
-
end
|
28
|
-
|
29
|
-
def to n
|
30
|
-
@to = n
|
31
|
-
end
|
32
|
-
|
33
|
-
def selector sl
|
34
|
-
@selector = sl
|
35
|
-
end
|
36
|
-
|
37
|
-
def header hash
|
38
|
-
@header = hash
|
39
|
-
end
|
40
|
-
|
41
|
-
def interval second
|
42
|
-
@interval = second
|
43
|
-
end
|
44
|
-
|
45
|
-
private
|
46
|
-
|
47
|
-
def parse url
|
48
|
-
page = open(url, http_header)
|
49
|
-
Nokogiri::HTML page.read
|
50
|
-
rescue OpenURI::HTTPError => e
|
51
|
-
if e.message == '404 Not Found'
|
52
|
-
throw :no_more
|
53
|
-
else
|
54
|
-
raise e
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
def http_header
|
59
|
-
@http_header ||= (
|
60
|
-
h = {}
|
61
|
-
Hash(@header).each_pair{ |k, v| h[k.to_s] = v }
|
62
|
-
h
|
63
|
-
)
|
64
|
-
end
|
65
|
-
|
66
|
-
def limit
|
67
|
-
@to ||= Float::INFINITY
|
68
|
-
end
|
69
|
-
|
70
|
-
def update_progress thread, page_num
|
71
|
-
@progress[thread] = page_num
|
72
|
-
printf "\r%s => %s", Time.now.strftime('%F %T'), @progress.values.sort
|
73
|
-
end
|
74
|
-
|
75
17
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: page_by_page
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.10
|
4
|
+
version: 0.1.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ken
|
@@ -83,6 +83,7 @@ files:
|
|
83
83
|
- bin/console
|
84
84
|
- bin/setup
|
85
85
|
- lib/page_by_page.rb
|
86
|
+
- lib/page_by_page/common.rb
|
86
87
|
- lib/page_by_page/enum.rb
|
87
88
|
- lib/page_by_page/fetch.rb
|
88
89
|
- lib/page_by_page/jump.rb
|