page_by_page 0.1.10 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 47ea21770030654ef4b0d4a7b5d3dec3c1c20d7d43b11e39c5ea7e68be86478f
4
- data.tar.gz: 7341f8e5293250b308bb4c223a14f1ec8480513269f3b6c1ef06c5aa9f825e92
3
+ metadata.gz: 46595cb8e0d8590c5b614eac4301b0423824a12f43b94e83294e325fd1c7f5c7
4
+ data.tar.gz: 11ac56d8a0061a00d2807307f285332c5548b06b915bfcd94675d5d57d7b8454
5
5
  SHA512:
6
- metadata.gz: '0912271bc28adfd2e97313bbecdeed894976eb37814c84bbbb009ca8ec10dbf04c59b25ce4716dee1701b42cbc1015e7dd83441afeb9a8d7cf7678c477328beb'
7
- data.tar.gz: 6e46ac599c44dff52b9527de7d526bac874c166191f10e212168034b1477fead6194d1dabfddad46ec74494676e4fc7260b1f25616e5d3c6a08d0bc6250e5fab
6
+ metadata.gz: fa81661e065393f115f355e666f43b5bf636019f50c4c40a33faf8bfe44e1134cce5f7eb3255f68a427d178b3e90a986a69b059551cfecca51ff779b3bd3c25d
7
+ data.tar.gz: 3f92a09ec86789cd0517597a5f15850bcb48303c4e1c3980efb3d57c6ba2d34c4c46fc287df060fb9328ca6efc5543f3c5abe5b2f717974b618b2a171fd0116f
data/lib/page_by_page/common.rb ADDED
@@ -0,0 +1,63 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+
4
+ module PageByPage
5
+ module Common
6
+ def initialize(opt = {}, &block)
7
+ @progress = {}
8
+ opt.each{ |name, value| send name, value }
9
+ instance_eval &block if block
10
+ end
11
+
12
+ def to n
13
+ @to = n
14
+ end
15
+
16
+ def selector sl
17
+ @selector = sl
18
+ end
19
+
20
+ def header hash
21
+ @header = hash
22
+ end
23
+
24
+ def interval second
25
+ @interval = second
26
+ end
27
+
28
+ def no_progress *arg
29
+ @progress = nil
30
+ end
31
+
32
+ protected
33
+
34
+ def parse url
35
+ page = open(url, http_header)
36
+ Nokogiri::HTML page.read
37
+ rescue OpenURI::HTTPError => e
38
+ if e.message == '404 Not Found'
39
+ throw :no_more
40
+ else
41
+ raise e
42
+ end
43
+ end
44
+
45
+ def http_header
46
+ @http_header ||= (
47
+ h = {}
48
+ Hash(@header).each_pair{ |k, v| h[k.to_s] = v }
49
+ h
50
+ )
51
+ end
52
+
53
+ def limit
54
+ @to ||= Float::INFINITY
55
+ end
56
+
57
+ def update_progress thread, page_num
58
+ @progress[thread] = page_num
59
+ printf "\r%s => %s", Time.now.strftime('%F %T'), @progress.values.sort
60
+ end
61
+
62
+ end
63
+ end
data/lib/page_by_page/enum.rb CHANGED
@@ -1,4 +1,4 @@
1
- class PageByPage
1
+ module PageByPage
2
2
  class Enum
3
3
 
4
4
  def initialize from: 1, step: 1
data/lib/page_by_page/fetch.rb CHANGED
@@ -1,9 +1,17 @@
1
1
  require 'page_by_page/enum'
2
2
  require 'page_by_page/mutex_enum'
3
+ require 'page_by_page/common'
3
4
  require 'erb'
4
5
 
5
- class PageByPage
6
- module Fetch
6
+ module PageByPage
7
+ class Fetch
8
+
9
+ include Common
10
+
11
+ def initialize(opt = {}, &block)
12
+ @from, @step, @to = 1, 1, Float::INFINITY
13
+ super
14
+ end
7
15
 
8
16
  def url tmpl
9
17
  @tmpl = ERB.new tmpl
@@ -21,11 +29,7 @@ class PageByPage
21
29
  @threads = n
22
30
  end
23
31
 
24
- def no_progress *arg
25
- @progress = nil
26
- end
27
-
28
- def fetch
32
+ def process
29
33
  nodes_2d =
30
34
  unless defined? @threads
31
35
  @enum = Enum.new enum_options
data/lib/page_by_page/jump.rb CHANGED
@@ -1,5 +1,9 @@
1
- class PageByPage
2
- module Jump
1
+ require 'page_by_page/common'
2
+
3
+ module PageByPage
4
+ class Jump
5
+
6
+ include Common
3
7
 
4
8
  def start url
5
9
  @start = url
@@ -9,26 +13,27 @@ class PageByPage
9
13
  @iterate = selector
10
14
  end
11
15
 
12
- def jump
16
+ def process
13
17
  url, items, page_count = @start, [], 0
14
18
 
15
19
  while true do
16
20
  doc = parse url
17
21
  doc.css(@selector).each{ |item| items << item }
18
22
 
23
+ page_count += 1
24
+ update_progress Thread.current, page_count if @progress
25
+ break if page_count >= limit
26
+
19
27
  next_url = doc.at_css(@iterate)
20
28
  break unless next_url
21
29
 
22
30
  path = next_url.attr('href')
23
31
  url = concat_host path
24
32
 
25
- page_count += 1
26
- update_progress Thread.current, page_count if @progress
27
- break if page_count >= limit
28
-
29
33
  sleep @interval if @interval
30
34
  end
31
35
 
36
+ puts if @progress
32
37
  items
33
38
  end
34
39
 
data/lib/page_by_page/mutex_enum.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  require 'page_by_page/enum'
2
2
 
3
- class PageByPage
3
+ module PageByPage
4
4
  class MutexEnum < Enum
5
5
 
6
6
  def initialize from: 1, step: 1
data/lib/page_by_page/version.rb CHANGED
@@ -1,3 +1,3 @@
1
- class PageByPage
2
- VERSION = "0.1.10"
1
+ module PageByPage
2
+ VERSION = "0.1.11"
3
3
  end
data/lib/page_by_page.rb CHANGED
@@ -1,75 +1,17 @@
1
1
  require 'page_by_page/version'
2
2
  require 'page_by_page/fetch'
3
3
  require 'page_by_page/jump'
4
- require 'nokogiri'
5
- require 'open-uri'
6
4
 
7
- class PageByPage
8
-
9
- include Fetch
10
- include Jump
5
+ module PageByPage
11
6
 
12
7
  class << self
13
8
  def fetch(*args, &block)
14
- new(*args, &block).fetch
9
+ Fetch.new(*args, &block).process
15
10
  end
16
11
 
17
12
  def jump(*args, &block)
18
- new(*args, &block).jump
13
+ Jump.new(*args, &block).process
19
14
  end
20
15
  end
21
16
 
22
- def initialize(opt = {}, &block)
23
- @from, @step, @to = 1, 1, Float::INFINITY
24
- @progress = {}
25
- opt.each{ |name, value| send name, value }
26
- instance_eval &block if block
27
- end
28
-
29
- def to n
30
- @to = n
31
- end
32
-
33
- def selector sl
34
- @selector = sl
35
- end
36
-
37
- def header hash
38
- @header = hash
39
- end
40
-
41
- def interval second
42
- @interval = second
43
- end
44
-
45
- private
46
-
47
- def parse url
48
- page = open(url, http_header)
49
- Nokogiri::HTML page.read
50
- rescue OpenURI::HTTPError => e
51
- if e.message == '404 Not Found'
52
- throw :no_more
53
- else
54
- raise e
55
- end
56
- end
57
-
58
- def http_header
59
- @http_header ||= (
60
- h = {}
61
- Hash(@header).each_pair{ |k, v| h[k.to_s] = v }
62
- h
63
- )
64
- end
65
-
66
- def limit
67
- @to ||= Float::INFINITY
68
- end
69
-
70
- def update_progress thread, page_num
71
- @progress[thread] = page_num
72
- printf "\r%s => %s", Time.now.strftime('%F %T'), @progress.values.sort
73
- end
74
-
75
17
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: page_by_page
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.10
4
+ version: 0.1.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - ken
@@ -83,6 +83,7 @@ files:
83
83
  - bin/console
84
84
  - bin/setup
85
85
  - lib/page_by_page.rb
86
+ - lib/page_by_page/common.rb
86
87
  - lib/page_by_page/enum.rb
87
88
  - lib/page_by_page/fetch.rb
88
89
  - lib/page_by_page/jump.rb