digger 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f3e89f179fa868ecd2879180d1fbfbf03ba0ebee3731b9c8b4741d22663ff4aa
4
- data.tar.gz: 1b27e4a1446e9835203bf5497aeebc3bc4ab58998a0fc443eeaaf7e7ec86c2c7
3
+ metadata.gz: 2f506fd615df8b9d732d6b67bedc72644df26dc3f5725cfe5dba10c1098bae0b
4
+ data.tar.gz: 0e9afcb19ba0be5ce4a90787d54c3b43d58100bac3ec45bf5ef93781a60b6eb1
5
5
  SHA512:
6
- metadata.gz: 5671e5d2484ca744e5c75f97beeb473e1970291fc094c0274714af27cc847ef47bcf3b1b81534e6f299f35648fdf3aabec9d90777e4b2fbe49dc2629c048f610
7
- data.tar.gz: 496534bb394d17792dc7173759b83c0dd62b8569a241b2a6a57e60a14ea99ddafd7e030fe16480c5782ec319f6dc1d3563919f7202026fb008522dd26cae6c01
6
+ metadata.gz: 8608a2ee8e06ddd846772d40dc3e417560229729b7b736eda8a7f50977a7d2c6fc523f86fe64480b5172c5295137eae7c85f945f7ca310502c54c8b90dd75e8d
7
+ data.tar.gz: 59fef1a13adc8f983c16428ee4d08d6ccdecea09b7583452b6ca07689727c3d6a386f5a5b767a20d117c8c4bc9775a2a6b32fd4caff1c1a5e85ee2af82e39d1b
data/lib/digger/index.rb CHANGED
@@ -8,33 +8,34 @@ module Digger
8
8
 
9
9
  def urls
10
10
  @urls ||= begin
11
- args = self.args.map{|a| (a.respond_to? :each) ? a.to_a : [a]}
12
- args.shift.product(*args).map{|arg| pattern_applied_url(arg)}
11
+ args = self.args.map { |a| a.respond_to?(:each) ? a.to_a : [a] }
12
+ args.shift.product(*args).map { |arg| pattern_applied_url(arg) }
13
13
  end
14
14
  end
15
15
 
16
16
  def pattern_applied_url(arg)
17
- pattern.gsub('*').each_with_index{|_, i| arg[i]}
17
+ pattern.gsub('*').each_with_index { |_, i| arg[i] }
18
18
  end
19
19
 
20
20
  def self.batch(entities, cocurrence = 1, &block)
21
- raise NoBlockError, "No block given" unless block
21
+ raise NoBlockError, 'No block given' unless block
22
22
 
23
23
  if cocurrence > 1
24
- results = {}
25
- entities.each_slice(cocurrence) do |group|
24
+ results = Array.new(entities.size)
25
+ entities.each_slice(cocurrence).with_index do |group, idx1|
26
26
  threads = []
27
- group.each do |entity|
27
+ group.each_with_index do |entity, idx2|
28
+ index = idx1 * cocurrence + idx2
28
29
  threads << Thread.new(entity) do |ent|
29
- results[ent] = block.call(ent)
30
+ results[index] = block.call(ent)
30
31
  end
31
32
  end
32
- threads.each{|thread| thread.join}
33
+ threads.each(&:join)
33
34
  end
34
- entities.map{|ent| results[ent]}
35
+ results
35
36
  else
36
- entities.map{|ent| block.call(ent) }
37
+ entities.map { |ent| block.call(ent) }
37
38
  end
38
39
  end
39
40
  end
40
- end
41
+ end
data/lib/digger/model.rb CHANGED
@@ -1,16 +1,19 @@
1
1
 
2
2
  module Digger
3
3
  class Model
4
- @@digger_config = {'pattern'=>{}, 'index'=>{}}
4
+ @@digger_config = {
5
+ 'pattern' => {},
6
+ 'index' => {}
7
+ }
5
8
 
6
9
  class << self
7
10
  # patterns
8
11
  def pattern_config
9
- @@digger_config['pattern'][self.name] ||= {}
12
+ @@digger_config['pattern'][name] ||= {}
10
13
  end
11
14
 
12
15
  Pattern::TYPES.each do |method|
13
- define_method method, ->(pairs, &block){
16
+ define_method method, -> (pairs, &block) {
14
17
  pairs.each_pair do |key, value|
15
18
  pattern_config[key] = Pattern.new(type: method, value: value, block: block)
16
19
  end
@@ -18,21 +21,22 @@ module Digger
18
21
  end
19
22
 
20
23
  def validate_presence(*keys)
21
- keys_all = pattern_config.keys
22
- raise "Pattern keys #{(keys - keys_all).join(', ')} should be present" unless keys.all?{|k| keys_all.include?(k) }
24
+ is_all = pattern_config.keys.all? { |k| keys.include?(k) }
25
+ raise "Pattern keys #{(keys - keys_all).join(', ')} should be present" unless is_all
23
26
  end
24
27
 
25
28
  def validate_includeness(*keys)
26
- raise "Pattern keys #{(pattern_config.keys - keys).join(', ')} should not be included" unless pattern_config.keys.all?{|k| keys.include?(k)}
29
+ is_all = pattern_config.keys.all? { |k| keys.include?(k) }
30
+ raise "Pattern keys #{(pattern_config.keys - keys).join(', ')} should not be included" if is_all
27
31
  end
28
32
 
29
33
  # index page
30
34
  def index_config
31
- @@digger_config['index'][self.name]
35
+ @@digger_config['index'][name]
32
36
  end
33
37
 
34
38
  def index_page(pattern, *args)
35
- @@digger_config['index'][self.name] = Index.new(pattern, args)
39
+ @@digger_config['index'][name] = Index.new(pattern, args)
36
40
  end
37
41
 
38
42
  def index_page?
@@ -55,13 +59,15 @@ module Digger
55
59
  end
56
60
 
57
61
  def dig_urls(urls, cocurrence = 1, opts = {})
58
- Index.batch(urls, cocurrence){|url| dig_url(url, opts) }
62
+ Index.batch(urls, cocurrence) { |url| dig_url(url, opts) }
59
63
  end
60
64
 
61
65
  def dig(cocurrence = 1)
62
66
  if self.class.index_page?
63
- self.class.index_config.process(cocurrence){|url| dig_url(url) }
67
+ self.class.index_config.process(cocurrence) do |url|
68
+ dig_url(url)
69
+ end
64
70
  end
65
71
  end
66
72
  end
67
- end
73
+ end
data/lib/digger/page.rb CHANGED
@@ -28,16 +28,12 @@ module Digger
28
28
  # OpenStruct it holds users defined data
29
29
  attr_accessor :user_data
30
30
 
31
- attr_accessor :aliases
32
-
33
- attr_accessor :domain_aliases
31
+ attr_accessor :aliases, :domain_aliases, :fetched_at
34
32
 
35
33
  # Whether the current page should be stored
36
34
  # Default: true
37
35
  attr_accessor :storable
38
36
 
39
- attr_accessor :fetched_at
40
-
41
37
  #
42
38
  # Create a new page
43
39
  #
@@ -61,7 +57,7 @@ module Digger
61
57
  end
62
58
 
63
59
  def title
64
- doc.title if doc
60
+ doc&.title
65
61
  end
66
62
 
67
63
  #
@@ -75,6 +71,7 @@ module Digger
75
71
  doc.search('//a[@href]').each do |a|
76
72
  u = a['href']
77
73
  next if u.nil? || u.empty?
74
+
78
75
  abs = to_absolute(u) rescue next
79
76
  @links << abs if abs && in_domain?(abs)
80
77
  end
@@ -101,7 +98,7 @@ module Digger
101
98
  end
102
99
 
103
100
  def jsonp
104
- @jsonp ||= JSON.parse body.match(/^[^\(]+?\((.+)\)[^\)]*$/)[1]
101
+ @jsonp ||= JSON.parse body.match(/^[^(]+?\((.+)\)[^)]*$/)[1]
105
102
  end
106
103
 
107
104
  #
@@ -163,7 +160,7 @@ module Digger
163
160
  # returns +false+ otherwise.
164
161
  #
165
162
  def not_found?
166
- 404 == @code
163
+ @code == 404
167
164
  end
168
165
 
169
166
  #
@@ -177,6 +174,7 @@ module Digger
177
174
  end unless @base
178
175
 
179
176
  return nil if @base && @base.to_s.empty?
177
+
180
178
  @base
181
179
  end
182
180
 
@@ -245,6 +243,7 @@ module Digger
245
243
 
246
244
  def expired?(ttl)
247
245
  return false if fetched_at.nil?
246
+
248
247
  (Time.now.to_i - ttl) > fetched_at
249
248
  end
250
249
 
@@ -1,111 +1,113 @@
1
1
  require 'nokogiri'
2
2
 
3
3
  module Digger
4
+ # Extractor patterns definition
4
5
  class Pattern
5
6
  attr_accessor :type, :value, :block
6
7
 
7
8
  def initialize(hash = {})
8
- hash.each_pair{|key, value| send("#{key}=", value) if %w{type value block}.include?(key.to_s)}
9
+ hash.each_pair { |key, value| send("#{key}=", value) if %w[type value block].include?(key.to_s)}
9
10
  end
10
11
 
11
- def safe_block
12
- block && begin
13
- if block.respond_to?(:call)
14
- block
15
- elsif block.strip == '' #
16
- nil
17
- else
18
- proc{ $SAFE = 2; eval block }.call
19
- end
20
- rescue StandardError
21
- nil
12
+ def safe_block(&default_block)
13
+ if block.nil? || (block.is_a?(String) && block.strip.empty?)
14
+ default_block
15
+ elsif block.respond_to?(:call)
16
+ block
17
+ else
18
+ proc {
19
+ $SAFE = 2
20
+ eval block
21
+ }.call
22
22
  end
23
23
  end
24
24
 
25
25
  def self.wrap(hash)
26
- Hash[hash.map{|key, value| [key, value.is_a?(Pattern) ? value : Pattern.new(value)]}]
26
+ hash.transform_values { |value| value.is_a?(Pattern) ? value : Pattern.new(value) }
27
27
  end
28
28
 
29
29
  MATCH_MAX = 3
30
30
 
31
- TYPES_REGEXP = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many}
32
- TYPES_CSS = %w{css_one css_many}
33
- TYPES_JSON = %w{json jsonp}
34
-
31
+ TYPES_REGEXP = 0.upto(MATCH_MAX).map { |i| "match_#{i}" } + %w[match_many]
32
+ TYPES_CSS = %w[css_one css_many].freeze
33
+ TYPES_JSON = %w[json jsonp].freeze
34
+
35
35
  TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON
36
36
 
37
- def match_page(page, &callback)
38
- blk = callback || safe_block
37
+ def match_page(page)
38
+ return unless page.success?
39
39
  if TYPES_REGEXP.include?(type) # regular expression
40
- blk ||= ->(text){ text.strip }
41
- # content is String
42
- if type == 'match_many'
43
- match = page.body.gsub(value).to_a
44
- else
45
- index = TYPES_REGEXP.index(type)
46
- matches = page.body.match(value)
47
- match = matches.nil? ? nil : matches[index]
48
- end
40
+ regexp_match(page.body)
49
41
  elsif TYPES_CSS.include?(type) # css expression
50
- blk ||= ->(node){ node.content.strip }
51
- # content is Nokogiri::HTML::Document
52
- if type == 'css_one'
53
- match = page.doc.css(value).first
54
- else
55
- match = page.doc.css(value)
56
- end
42
+ css_match(page.doc)
57
43
  elsif TYPES_JSON.include?(type)
58
- json = page.send(type)
59
- match = json_fetch(json, value)
44
+ json_match(page)
45
+ end
46
+ end
47
+
48
+ def json_match(page)
49
+ block = safe_block { |j| j }
50
+ json = page.send(type)
51
+ keys = json_index_keys(value)
52
+ match = json_fetch(json, keys)
53
+ block.call(match)
54
+ end
55
+
56
+ def css_match(doc)
57
+ block = safe_block { |node| node.content.strip }
58
+ # content is Nokogiri::HTML::Document
59
+ contents = doc.css(value)
60
+ if type == 'css_many'
61
+ contents.map { |node| block.call(node) }.uniq
62
+ else
63
+ block.call(contents.first)
60
64
  end
61
- if match.nil?
62
- nil
63
- elsif %w{css_many match_many}.include? type
64
- match.map{|node| blk.call(node) }.uniq
65
+ end
66
+
67
+ def regexp_match(body)
68
+ block = safe_block(&:strip)
69
+ # content is String
70
+ if type == 'match_many'
71
+ body.gsub(value).to_a.map { |node| block.call(node) }.uniq
65
72
  else
66
- blk.call(match)
73
+ index = TYPES_REGEXP.index(type)
74
+ matches = body.match(value)
75
+ block.call(matches[index]) unless matches.nil?
67
76
  end
68
- rescue
69
- nil
70
77
  end
71
78
 
72
79
  def json_fetch(json, keys)
73
- if keys.is_a? String
74
- # parse json keys like '$.k1.k2[0]'
75
- parts = keys.match(/^\$[\S]*$/)[0].scan(/(\.([\w]+)|\[([\d]+)\])/).map do |p|
76
- p[1].nil? ? { index: p[2].to_i } : { key: p[1] }
77
- end
78
- json_fetch(json, parts)
79
- elsif keys.is_a? Array
80
- if keys.length == 0
81
- json
82
- else
83
- pt = keys.shift
84
- json_fetch(json[pt[:index] || pt[:key]], keys)
85
- end
80
+ if keys.empty?
81
+ json
82
+ else
83
+ pt = keys.shift
84
+ json_fetch(json[pt[:index] || pt[:key]], keys)
86
85
  end
87
86
  end
88
87
 
88
+ def json_index_keys(keys)
89
+ keys.to_s.match(/^\$\S*$/)[0].scan(/(\.(\w+)|\[(\d+)\])/).map do |p|
90
+ p[1].nil? ? { index: p[2].to_i } : { key: p[1] }
91
+ end
92
+ end
93
+
94
+ private :json_index_keys, :json_fetch
95
+
96
+ # Nokogiri node methods
89
97
  class Nokogiri::XML::Node
90
- %w{one many}.each do |name|
91
- define_method "inner_#{name}" do |css, &block|
92
- callback = ->(node) do
93
- if node
94
- (block || ->(n){n.text.strip}).call(node)
95
- else
96
- nil
97
- end
98
- end
98
+ %w[one many].each do |name|
99
+ define_method "inner_#{name}" do |css, &block|
100
+ callback = ->(node) { (block || ->(n) { n.text.strip }).call(node) if node }
99
101
  if name == 'one' # inner_one
100
102
  callback.call(self.css(css).first)
101
103
  else # inner_many
102
- self.css(css).map{|node| callback.call(node)}
104
+ self.css(css).map { |node| callback.call(node) }
103
105
  end
104
106
  end
105
107
  end
106
108
  def source
107
109
  to_xml
108
110
  end
109
- end # nokogiri
111
+ end
110
112
  end
111
- end
113
+ end
@@ -1,3 +1,3 @@
1
1
  module Digger
2
- VERSION = "0.1.6"
2
+ VERSION = '0.1.7'.freeze
3
3
  end
@@ -0,0 +1,12 @@
1
+ require 'digger'
2
+
3
+ describe Digger::Index do
4
+ it 'batch digger' do
5
+ list = [1, 2, 3, 4, 5, 6, 7, 8]
6
+ pt = Digger::Index.batch(list, 3) do |num|
7
+ sleep(rand(1..3))
8
+ "##{num}"
9
+ end
10
+ expect(pt.join).to eq(list.map { |num| "##{num}" }.join)
11
+ end
12
+ end
data/spec/pattern_spec.rb CHANGED
@@ -3,12 +3,12 @@ require 'json'
3
3
 
4
4
  describe Digger::Pattern do
5
5
  it 'json fetch' do
6
- json = JSON.parse('{"a":1,"b":[1,2,3]}')
6
+ json = JSON.parse('[{"a":1,"b":[1,2,3]}]')
7
7
  pt = Digger::Pattern.new
8
- expect(pt.json_fetch(json, '$')['a']).to eq(1)
9
- expect(pt.json_fetch(json, '$.a')).to eq(1)
10
- expect(pt.json_fetch(json, '$.b').length).to eq(3)
11
- expect(pt.json_fetch(json, '$.b[2]')).to eq(3)
8
+ expect(pt.json_fetch(json, '$[0]')['a']).to eq(1)
9
+ expect(pt.json_fetch(json, '$[0].a')).to eq(1)
10
+ expect(pt.json_fetch(json, '$[0].b').length).to eq(3)
11
+ expect(pt.json_fetch(json, '$[0].b[2]')).to eq(3)
12
12
  end
13
13
 
14
14
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: digger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - binz
@@ -87,6 +87,7 @@ files:
87
87
  - lib/digger/pattern.rb
88
88
  - lib/digger/version.rb
89
89
  - spec/digger_spec.rb
90
+ - spec/index_spec.rb
90
91
  - spec/page_spec.rb
91
92
  - spec/pattern_spec.rb
92
93
  - spec/validate_spec.rb
@@ -115,6 +116,7 @@ specification_version: 4
115
116
  summary: Dig need stractual infomation from web page.
116
117
  test_files:
117
118
  - spec/digger_spec.rb
119
+ - spec/index_spec.rb
118
120
  - spec/page_spec.rb
119
121
  - spec/pattern_spec.rb
120
122
  - spec/validate_spec.rb