page_by_page 0.1.10 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/page_by_page/common.rb +63 -0
- data/lib/page_by_page/enum.rb +1 -1
- data/lib/page_by_page/fetch.rb +11 -7
- data/lib/page_by_page/jump.rb +12 -7
- data/lib/page_by_page/mutex_enum.rb +1 -1
- data/lib/page_by_page/version.rb +2 -2
- data/lib/page_by_page.rb +3 -61
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 46595cb8e0d8590c5b614eac4301b0423824a12f43b94e83294e325fd1c7f5c7
|
|
4
|
+
data.tar.gz: 11ac56d8a0061a00d2807307f285332c5548b06b915bfcd94675d5d57d7b8454
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: fa81661e065393f115f355e666f43b5bf636019f50c4c40a33faf8bfe44e1134cce5f7eb3255f68a427d178b3e90a986a69b059551cfecca51ff779b3bd3c25d
|
|
7
|
+
data.tar.gz: 3f92a09ec86789cd0517597a5f15850bcb48303c4e1c3980efb3d57c6ba2d34c4c46fc287df060fb9328ca6efc5543f3c5abe5b2f717974b618b2a171fd0116f
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
require 'nokogiri'
|
|
2
|
+
require 'open-uri'
|
|
3
|
+
|
|
4
|
+
module PageByPage
|
|
5
|
+
module Common
|
|
6
|
+
def initialize(opt = {}, &block)
|
|
7
|
+
@progress = {}
|
|
8
|
+
opt.each{ |name, value| send name, value }
|
|
9
|
+
instance_eval &block if block
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def to n
|
|
13
|
+
@to = n
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def selector sl
|
|
17
|
+
@selector = sl
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def header hash
|
|
21
|
+
@header = hash
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
def interval second
|
|
25
|
+
@interval = second
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def no_progress *arg
|
|
29
|
+
@progress = nil
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
protected
|
|
33
|
+
|
|
34
|
+
def parse url
|
|
35
|
+
page = open(url, http_header)
|
|
36
|
+
Nokogiri::HTML page.read
|
|
37
|
+
rescue OpenURI::HTTPError => e
|
|
38
|
+
if e.message == '404 Not Found'
|
|
39
|
+
throw :no_more
|
|
40
|
+
else
|
|
41
|
+
raise e
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
def http_header
|
|
46
|
+
@http_header ||= (
|
|
47
|
+
h = {}
|
|
48
|
+
Hash(@header).each_pair{ |k, v| h[k.to_s] = v }
|
|
49
|
+
h
|
|
50
|
+
)
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
def limit
|
|
54
|
+
@to ||= Float::INFINITY
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def update_progress thread, page_num
|
|
58
|
+
@progress[thread] = page_num
|
|
59
|
+
printf "\r%s => %s", Time.now.strftime('%F %T'), @progress.values.sort
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
end
|
|
63
|
+
end
|
data/lib/page_by_page/enum.rb
CHANGED
data/lib/page_by_page/fetch.rb
CHANGED
|
@@ -1,9 +1,17 @@
|
|
|
1
1
|
require 'page_by_page/enum'
|
|
2
2
|
require 'page_by_page/mutex_enum'
|
|
3
|
+
require 'page_by_page/common'
|
|
3
4
|
require 'erb'
|
|
4
5
|
|
|
5
|
-
|
|
6
|
-
|
|
6
|
+
module PageByPage
|
|
7
|
+
class Fetch
|
|
8
|
+
|
|
9
|
+
include Common
|
|
10
|
+
|
|
11
|
+
def initialize(opt = {}, &block)
|
|
12
|
+
@from, @step, @to = 1, 1, Float::INFINITY
|
|
13
|
+
super
|
|
14
|
+
end
|
|
7
15
|
|
|
8
16
|
def url tmpl
|
|
9
17
|
@tmpl = ERB.new tmpl
|
|
@@ -21,11 +29,7 @@ class PageByPage
|
|
|
21
29
|
@threads = n
|
|
22
30
|
end
|
|
23
31
|
|
|
24
|
-
def no_progress *arg
|
|
25
|
-
@progress = nil
|
|
26
|
-
end
|
|
27
|
-
|
|
28
|
-
def fetch
|
|
32
|
+
def process
|
|
29
33
|
nodes_2d =
|
|
30
34
|
unless defined? @threads
|
|
31
35
|
@enum = Enum.new enum_options
|
data/lib/page_by_page/jump.rb
CHANGED
|
@@ -1,5 +1,9 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
require 'page_by_page/common'
|
|
2
|
+
|
|
3
|
+
module PageByPage
|
|
4
|
+
class Jump
|
|
5
|
+
|
|
6
|
+
include Common
|
|
3
7
|
|
|
4
8
|
def start url
|
|
5
9
|
@start = url
|
|
@@ -9,26 +13,27 @@ class PageByPage
|
|
|
9
13
|
@iterate = selector
|
|
10
14
|
end
|
|
11
15
|
|
|
12
|
-
def jump
|
|
16
|
+
def process
|
|
13
17
|
url, items, page_count = @start, [], 0
|
|
14
18
|
|
|
15
19
|
while true do
|
|
16
20
|
doc = parse url
|
|
17
21
|
doc.css(@selector).each{ |item| items << item }
|
|
18
22
|
|
|
23
|
+
page_count += 1
|
|
24
|
+
update_progress Thread.current, page_count if @progress
|
|
25
|
+
break if page_count >= limit
|
|
26
|
+
|
|
19
27
|
next_url = doc.at_css(@iterate)
|
|
20
28
|
break unless next_url
|
|
21
29
|
|
|
22
30
|
path = next_url.attr('href')
|
|
23
31
|
url = concat_host path
|
|
24
32
|
|
|
25
|
-
page_count += 1
|
|
26
|
-
update_progress Thread.current, page_count if @progress
|
|
27
|
-
break if page_count >= limit
|
|
28
|
-
|
|
29
33
|
sleep @interval if @interval
|
|
30
34
|
end
|
|
31
35
|
|
|
36
|
+
puts if @progress
|
|
32
37
|
items
|
|
33
38
|
end
|
|
34
39
|
|
data/lib/page_by_page/version.rb
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
|
|
2
|
-
VERSION = "0.1.10"
|
|
1
|
+
module PageByPage
|
|
2
|
+
VERSION = "0.1.11"
|
|
3
3
|
end
|
data/lib/page_by_page.rb
CHANGED
|
@@ -1,75 +1,17 @@
|
|
|
1
1
|
require 'page_by_page/version'
|
|
2
2
|
require 'page_by_page/fetch'
|
|
3
3
|
require 'page_by_page/jump'
|
|
4
|
-
require 'nokogiri'
|
|
5
|
-
require 'open-uri'
|
|
6
4
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
include Fetch
|
|
10
|
-
include Jump
|
|
5
|
+
module PageByPage
|
|
11
6
|
|
|
12
7
|
class << self
|
|
13
8
|
def fetch(*args, &block)
|
|
14
|
-
new(*args, &block).fetch
|
|
9
|
+
Fetch.new(*args, &block).process
|
|
15
10
|
end
|
|
16
11
|
|
|
17
12
|
def jump(*args, &block)
|
|
18
|
-
new(*args, &block).jump
|
|
13
|
+
Jump.new(*args, &block).process
|
|
19
14
|
end
|
|
20
15
|
end
|
|
21
16
|
|
|
22
|
-
def initialize(opt = {}, &block)
|
|
23
|
-
@from, @step, @to = 1, 1, Float::INFINITY
|
|
24
|
-
@progress = {}
|
|
25
|
-
opt.each{ |name, value| send name, value }
|
|
26
|
-
instance_eval &block if block
|
|
27
|
-
end
|
|
28
|
-
|
|
29
|
-
def to n
|
|
30
|
-
@to = n
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
def selector sl
|
|
34
|
-
@selector = sl
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
def header hash
|
|
38
|
-
@header = hash
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
def interval second
|
|
42
|
-
@interval = second
|
|
43
|
-
end
|
|
44
|
-
|
|
45
|
-
private
|
|
46
|
-
|
|
47
|
-
def parse url
|
|
48
|
-
page = open(url, http_header)
|
|
49
|
-
Nokogiri::HTML page.read
|
|
50
|
-
rescue OpenURI::HTTPError => e
|
|
51
|
-
if e.message == '404 Not Found'
|
|
52
|
-
throw :no_more
|
|
53
|
-
else
|
|
54
|
-
raise e
|
|
55
|
-
end
|
|
56
|
-
end
|
|
57
|
-
|
|
58
|
-
def http_header
|
|
59
|
-
@http_header ||= (
|
|
60
|
-
h = {}
|
|
61
|
-
Hash(@header).each_pair{ |k, v| h[k.to_s] = v }
|
|
62
|
-
h
|
|
63
|
-
)
|
|
64
|
-
end
|
|
65
|
-
|
|
66
|
-
def limit
|
|
67
|
-
@to ||= Float::INFINITY
|
|
68
|
-
end
|
|
69
|
-
|
|
70
|
-
def update_progress thread, page_num
|
|
71
|
-
@progress[thread] = page_num
|
|
72
|
-
printf "\r%s => %s", Time.now.strftime('%F %T'), @progress.values.sort
|
|
73
|
-
end
|
|
74
|
-
|
|
75
17
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: page_by_page
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.10
|
|
4
|
+
version: 0.1.11
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- ken
|
|
@@ -83,6 +83,7 @@ files:
|
|
|
83
83
|
- bin/console
|
|
84
84
|
- bin/setup
|
|
85
85
|
- lib/page_by_page.rb
|
|
86
|
+
- lib/page_by_page/common.rb
|
|
86
87
|
- lib/page_by_page/enum.rb
|
|
87
88
|
- lib/page_by_page/fetch.rb
|
|
88
89
|
- lib/page_by_page/jump.rb
|