page_by_page 0.1.10 → 0.1.11

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 47ea21770030654ef4b0d4a7b5d3dec3c1c20d7d43b11e39c5ea7e68be86478f
4
- data.tar.gz: 7341f8e5293250b308bb4c223a14f1ec8480513269f3b6c1ef06c5aa9f825e92
3
+ metadata.gz: 46595cb8e0d8590c5b614eac4301b0423824a12f43b94e83294e325fd1c7f5c7
4
+ data.tar.gz: 11ac56d8a0061a00d2807307f285332c5548b06b915bfcd94675d5d57d7b8454
5
5
  SHA512:
6
- metadata.gz: '0912271bc28adfd2e97313bbecdeed894976eb37814c84bbbb009ca8ec10dbf04c59b25ce4716dee1701b42cbc1015e7dd83441afeb9a8d7cf7678c477328beb'
7
- data.tar.gz: 6e46ac599c44dff52b9527de7d526bac874c166191f10e212168034b1477fead6194d1dabfddad46ec74494676e4fc7260b1f25616e5d3c6a08d0bc6250e5fab
6
+ metadata.gz: fa81661e065393f115f355e666f43b5bf636019f50c4c40a33faf8bfe44e1134cce5f7eb3255f68a427d178b3e90a986a69b059551cfecca51ff779b3bd3c25d
7
+ data.tar.gz: 3f92a09ec86789cd0517597a5f15850bcb48303c4e1c3980efb3d57c6ba2d34c4c46fc287df060fb9328ca6efc5543f3c5abe5b2f717974b618b2a171fd0116f
data/lib/page_by_page/common.rb ADDED
@@ -0,0 +1,63 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+
4
+ module PageByPage
5
+ module Common
6
+ def initialize(opt = {}, &block)
7
+ @progress = {}
8
+ opt.each{ |name, value| send name, value }
9
+ instance_eval &block if block
10
+ end
11
+
12
+ def to n
13
+ @to = n
14
+ end
15
+
16
+ def selector sl
17
+ @selector = sl
18
+ end
19
+
20
+ def header hash
21
+ @header = hash
22
+ end
23
+
24
+ def interval second
25
+ @interval = second
26
+ end
27
+
28
+ def no_progress *arg
29
+ @progress = nil
30
+ end
31
+
32
+ protected
33
+
34
+ def parse url
35
+ page = open(url, http_header)
36
+ Nokogiri::HTML page.read
37
+ rescue OpenURI::HTTPError => e
38
+ if e.message == '404 Not Found'
39
+ throw :no_more
40
+ else
41
+ raise e
42
+ end
43
+ end
44
+
45
+ def http_header
46
+ @http_header ||= (
47
+ h = {}
48
+ Hash(@header).each_pair{ |k, v| h[k.to_s] = v }
49
+ h
50
+ )
51
+ end
52
+
53
+ def limit
54
+ @to ||= Float::INFINITY
55
+ end
56
+
57
+ def update_progress thread, page_num
58
+ @progress[thread] = page_num
59
+ printf "\r%s => %s", Time.now.strftime('%F %T'), @progress.values.sort
60
+ end
61
+
62
+ end
63
+ end
data/lib/page_by_page/enum.rb CHANGED
@@ -1,4 +1,4 @@
1
- class PageByPage
1
+ module PageByPage
2
2
  class Enum
3
3
 
4
4
  def initialize from: 1, step: 1
data/lib/page_by_page/fetch.rb CHANGED
@@ -1,9 +1,17 @@
1
1
  require 'page_by_page/enum'
2
2
  require 'page_by_page/mutex_enum'
3
+ require 'page_by_page/common'
3
4
  require 'erb'
4
5
 
5
- class PageByPage
6
- module Fetch
6
+ module PageByPage
7
+ class Fetch
8
+
9
+ include Common
10
+
11
+ def initialize(opt = {}, &block)
12
+ @from, @step, @to = 1, 1, Float::INFINITY
13
+ super
14
+ end
7
15
 
8
16
  def url tmpl
9
17
  @tmpl = ERB.new tmpl
@@ -21,11 +29,7 @@ class PageByPage
21
29
  @threads = n
22
30
  end
23
31
 
24
- def no_progress *arg
25
- @progress = nil
26
- end
27
-
28
- def fetch
32
+ def process
29
33
  nodes_2d =
30
34
  unless defined? @threads
31
35
  @enum = Enum.new enum_options
data/lib/page_by_page/jump.rb CHANGED
@@ -1,5 +1,9 @@
1
- class PageByPage
2
- module Jump
1
+ require 'page_by_page/common'
2
+
3
+ module PageByPage
4
+ class Jump
5
+
6
+ include Common
3
7
 
4
8
  def start url
5
9
  @start = url
@@ -9,26 +13,27 @@ class PageByPage
9
13
  @iterate = selector
10
14
  end
11
15
 
12
- def jump
16
+ def process
13
17
  url, items, page_count = @start, [], 0
14
18
 
15
19
  while true do
16
20
  doc = parse url
17
21
  doc.css(@selector).each{ |item| items << item }
18
22
 
23
+ page_count += 1
24
+ update_progress Thread.current, page_count if @progress
25
+ break if page_count >= limit
26
+
19
27
  next_url = doc.at_css(@iterate)
20
28
  break unless next_url
21
29
 
22
30
  path = next_url.attr('href')
23
31
  url = concat_host path
24
32
 
25
- page_count += 1
26
- update_progress Thread.current, page_count if @progress
27
- break if page_count >= limit
28
-
29
33
  sleep @interval if @interval
30
34
  end
31
35
 
36
+ puts if @progress
32
37
  items
33
38
  end
34
39
 
data/lib/page_by_page/mutex_enum.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  require 'page_by_page/enum'
2
2
 
3
- class PageByPage
3
+ module PageByPage
4
4
  class MutexEnum < Enum
5
5
 
6
6
  def initialize from: 1, step: 1
data/lib/page_by_page/version.rb CHANGED
@@ -1,3 +1,3 @@
1
- class PageByPage
2
- VERSION = "0.1.10"
1
+ module PageByPage
2
+ VERSION = "0.1.11"
3
3
  end
data/lib/page_by_page.rb CHANGED
@@ -1,75 +1,17 @@
1
1
  require 'page_by_page/version'
2
2
  require 'page_by_page/fetch'
3
3
  require 'page_by_page/jump'
4
- require 'nokogiri'
5
- require 'open-uri'
6
4
 
7
- class PageByPage
8
-
9
- include Fetch
10
- include Jump
5
+ module PageByPage
11
6
 
12
7
  class << self
13
8
  def fetch(*args, &block)
14
- new(*args, &block).fetch
9
+ Fetch.new(*args, &block).process
15
10
  end
16
11
 
17
12
  def jump(*args, &block)
18
- new(*args, &block).jump
13
+ Jump.new(*args, &block).process
19
14
  end
20
15
  end
21
16
 
22
- def initialize(opt = {}, &block)
23
- @from, @step, @to = 1, 1, Float::INFINITY
24
- @progress = {}
25
- opt.each{ |name, value| send name, value }
26
- instance_eval &block if block
27
- end
28
-
29
- def to n
30
- @to = n
31
- end
32
-
33
- def selector sl
34
- @selector = sl
35
- end
36
-
37
- def header hash
38
- @header = hash
39
- end
40
-
41
- def interval second
42
- @interval = second
43
- end
44
-
45
- private
46
-
47
- def parse url
48
- page = open(url, http_header)
49
- Nokogiri::HTML page.read
50
- rescue OpenURI::HTTPError => e
51
- if e.message == '404 Not Found'
52
- throw :no_more
53
- else
54
- raise e
55
- end
56
- end
57
-
58
- def http_header
59
- @http_header ||= (
60
- h = {}
61
- Hash(@header).each_pair{ |k, v| h[k.to_s] = v }
62
- h
63
- )
64
- end
65
-
66
- def limit
67
- @to ||= Float::INFINITY
68
- end
69
-
70
- def update_progress thread, page_num
71
- @progress[thread] = page_num
72
- printf "\r%s => %s", Time.now.strftime('%F %T'), @progress.values.sort
73
- end
74
-
75
17
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: page_by_page
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.10
4
+ version: 0.1.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - ken
@@ -83,6 +83,7 @@ files:
83
83
  - bin/console
84
84
  - bin/setup
85
85
  - lib/page_by_page.rb
86
+ - lib/page_by_page/common.rb
86
87
  - lib/page_by_page/enum.rb
87
88
  - lib/page_by_page/fetch.rb
88
89
  - lib/page_by_page/jump.rb