digger 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/digger/index.rb +13 -12
- data/lib/digger/model.rb +17 -11
- data/lib/digger/page.rb +7 -8
- data/lib/digger/pattern.rb +71 -69
- data/lib/digger/version.rb +1 -1
- data/spec/index_spec.rb +12 -0
- data/spec/pattern_spec.rb +5 -5
- metadata +3 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2f506fd615df8b9d732d6b67bedc72644df26dc3f5725cfe5dba10c1098bae0b
|
4
|
+
data.tar.gz: 0e9afcb19ba0be5ce4a90787d54c3b43d58100bac3ec45bf5ef93781a60b6eb1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8608a2ee8e06ddd846772d40dc3e417560229729b7b736eda8a7f50977a7d2c6fc523f86fe64480b5172c5295137eae7c85f945f7ca310502c54c8b90dd75e8d
|
7
|
+
data.tar.gz: 59fef1a13adc8f983c16428ee4d08d6ccdecea09b7583452b6ca07689727c3d6a386f5a5b767a20d117c8c4bc9775a2a6b32fd4caff1c1a5e85ee2af82e39d1b
|
data/lib/digger/index.rb
CHANGED
@@ -8,33 +8,34 @@ module Digger
|
|
8
8
|
|
9
9
|
def urls
|
10
10
|
@urls ||= begin
|
11
|
-
args = self.args.map{|a|
|
12
|
-
args.shift.product(*args).map{|arg| pattern_applied_url(arg)}
|
11
|
+
args = self.args.map { |a| a.respond_to?(:each) ? a.to_a : [a] }
|
12
|
+
args.shift.product(*args).map { |arg| pattern_applied_url(arg) }
|
13
13
|
end
|
14
14
|
end
|
15
15
|
|
16
16
|
def pattern_applied_url(arg)
|
17
|
-
pattern.gsub('*').each_with_index{|_, i| arg[i]}
|
17
|
+
pattern.gsub('*').each_with_index { |_, i| arg[i] }
|
18
18
|
end
|
19
19
|
|
20
20
|
def self.batch(entities, cocurrence = 1, &block)
|
21
|
-
raise NoBlockError,
|
21
|
+
raise NoBlockError, 'No block given' unless block
|
22
22
|
|
23
23
|
if cocurrence > 1
|
24
|
-
results =
|
25
|
-
entities.each_slice(cocurrence) do |group|
|
24
|
+
results = Array.new(entities.size)
|
25
|
+
entities.each_slice(cocurrence).with_index do |group, idx1|
|
26
26
|
threads = []
|
27
|
-
group.
|
27
|
+
group.each_with_index do |entity, idx2|
|
28
|
+
index = idx1 * cocurrence + idx2
|
28
29
|
threads << Thread.new(entity) do |ent|
|
29
|
-
results[
|
30
|
+
results[index] = block.call(ent)
|
30
31
|
end
|
31
32
|
end
|
32
|
-
threads.each
|
33
|
+
threads.each(&:join)
|
33
34
|
end
|
34
|
-
|
35
|
+
results
|
35
36
|
else
|
36
|
-
entities.map{|ent| block.call(ent) }
|
37
|
+
entities.map { |ent| block.call(ent) }
|
37
38
|
end
|
38
39
|
end
|
39
40
|
end
|
40
|
-
end
|
41
|
+
end
|
data/lib/digger/model.rb
CHANGED
@@ -1,16 +1,19 @@
|
|
1
1
|
|
2
2
|
module Digger
|
3
3
|
class Model
|
4
|
-
@@digger_config = {
|
4
|
+
@@digger_config = {
|
5
|
+
'pattern' => {},
|
6
|
+
'index' => {}
|
7
|
+
}
|
5
8
|
|
6
9
|
class << self
|
7
10
|
# patterns
|
8
11
|
def pattern_config
|
9
|
-
@@digger_config['pattern'][
|
12
|
+
@@digger_config['pattern'][name] ||= {}
|
10
13
|
end
|
11
14
|
|
12
15
|
Pattern::TYPES.each do |method|
|
13
|
-
define_method method, ->(pairs, &block){
|
16
|
+
define_method method, -> (pairs, &block) {
|
14
17
|
pairs.each_pair do |key, value|
|
15
18
|
pattern_config[key] = Pattern.new(type: method, value: value, block: block)
|
16
19
|
end
|
@@ -18,21 +21,22 @@ module Digger
|
|
18
21
|
end
|
19
22
|
|
20
23
|
def validate_presence(*keys)
|
21
|
-
|
22
|
-
raise "Pattern keys #{(keys - keys_all).join(', ')} should be present" unless
|
24
|
+
is_all = pattern_config.keys.all? { |k| keys.include?(k) }
|
25
|
+
raise "Pattern keys #{(keys - keys_all).join(', ')} should be present" unless is_all
|
23
26
|
end
|
24
27
|
|
25
28
|
def validate_includeness(*keys)
|
26
|
-
|
29
|
+
is_all = pattern_config.keys.all? { |k| keys.include?(k) }
|
30
|
+
raise "Pattern keys #{(pattern_config.keys - keys).join(', ')} should not be included" if is_all
|
27
31
|
end
|
28
32
|
|
29
33
|
# index page
|
30
34
|
def index_config
|
31
|
-
@@digger_config['index'][
|
35
|
+
@@digger_config['index'][name]
|
32
36
|
end
|
33
37
|
|
34
38
|
def index_page(pattern, *args)
|
35
|
-
|
39
|
+
@@digger_config['index'][name] = Index.new(pattern, args)
|
36
40
|
end
|
37
41
|
|
38
42
|
def index_page?
|
@@ -55,13 +59,15 @@ module Digger
|
|
55
59
|
end
|
56
60
|
|
57
61
|
def dig_urls(urls, cocurrence = 1, opts = {})
|
58
|
-
Index.batch(urls, cocurrence){|url| dig_url(url, opts) }
|
62
|
+
Index.batch(urls, cocurrence) { |url| dig_url(url, opts) }
|
59
63
|
end
|
60
64
|
|
61
65
|
def dig(cocurrence = 1)
|
62
66
|
if self.class.index_page?
|
63
|
-
self.class.index_config.process(cocurrence)
|
67
|
+
self.class.index_config.process(cocurrence) do |url|
|
68
|
+
dig_url(url)
|
69
|
+
end
|
64
70
|
end
|
65
71
|
end
|
66
72
|
end
|
67
|
-
end
|
73
|
+
end
|
data/lib/digger/page.rb
CHANGED
@@ -28,16 +28,12 @@ module Digger
|
|
28
28
|
# OpenStruct it holds users defined data
|
29
29
|
attr_accessor :user_data
|
30
30
|
|
31
|
-
attr_accessor :aliases
|
32
|
-
|
33
|
-
attr_accessor :domain_aliases
|
31
|
+
attr_accessor :aliases, :domain_aliases, :fetched_at
|
34
32
|
|
35
33
|
# Whether the current page should be stored
|
36
34
|
# Default: true
|
37
35
|
attr_accessor :storable
|
38
36
|
|
39
|
-
attr_accessor :fetched_at
|
40
|
-
|
41
37
|
#
|
42
38
|
# Create a new page
|
43
39
|
#
|
@@ -61,7 +57,7 @@ module Digger
|
|
61
57
|
end
|
62
58
|
|
63
59
|
def title
|
64
|
-
doc
|
60
|
+
doc&.title
|
65
61
|
end
|
66
62
|
|
67
63
|
#
|
@@ -75,6 +71,7 @@ module Digger
|
|
75
71
|
doc.search('//a[@href]').each do |a|
|
76
72
|
u = a['href']
|
77
73
|
next if u.nil? || u.empty?
|
74
|
+
|
78
75
|
abs = to_absolute(u) rescue next
|
79
76
|
@links << abs if abs && in_domain?(abs)
|
80
77
|
end
|
@@ -101,7 +98,7 @@ module Digger
|
|
101
98
|
end
|
102
99
|
|
103
100
|
def jsonp
|
104
|
-
@jsonp ||= JSON.parse body.match(/^[
|
101
|
+
@jsonp ||= JSON.parse body.match(/^[^(]+?\((.+)\)[^)]*$/)[1]
|
105
102
|
end
|
106
103
|
|
107
104
|
#
|
@@ -163,7 +160,7 @@ module Digger
|
|
163
160
|
# returns +false+ otherwise.
|
164
161
|
#
|
165
162
|
def not_found?
|
166
|
-
|
163
|
+
@code == 404
|
167
164
|
end
|
168
165
|
|
169
166
|
#
|
@@ -177,6 +174,7 @@ module Digger
|
|
177
174
|
end unless @base
|
178
175
|
|
179
176
|
return nil if @base && @base.to_s.empty?
|
177
|
+
|
180
178
|
@base
|
181
179
|
end
|
182
180
|
|
@@ -245,6 +243,7 @@ module Digger
|
|
245
243
|
|
246
244
|
def expired?(ttl)
|
247
245
|
return false if fetched_at.nil?
|
246
|
+
|
248
247
|
(Time.now.to_i - ttl) > fetched_at
|
249
248
|
end
|
250
249
|
|
data/lib/digger/pattern.rb
CHANGED
@@ -1,111 +1,113 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
|
3
3
|
module Digger
|
4
|
+
# Extractor patterns definition
|
4
5
|
class Pattern
|
5
6
|
attr_accessor :type, :value, :block
|
6
7
|
|
7
8
|
def initialize(hash = {})
|
8
|
-
hash.each_pair{|key, value| send("#{key}=", value) if %w
|
9
|
+
hash.each_pair { |key, value| send("#{key}=", value) if %w[type value block].include?(key.to_s)}
|
9
10
|
end
|
10
11
|
|
11
|
-
def safe_block
|
12
|
-
block &&
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
nil
|
12
|
+
def safe_block(&default_block)
|
13
|
+
if block.nil? || (block.is_a?(String) && block.strip.empty?)
|
14
|
+
default_block
|
15
|
+
elsif block.respond_to?(:call)
|
16
|
+
block
|
17
|
+
else
|
18
|
+
proc {
|
19
|
+
$SAFE = 2
|
20
|
+
eval block
|
21
|
+
}.call
|
22
22
|
end
|
23
23
|
end
|
24
24
|
|
25
25
|
def self.wrap(hash)
|
26
|
-
|
26
|
+
hash.transform_values { |value| value.is_a?(Pattern) ? value : Pattern.new(value) }
|
27
27
|
end
|
28
28
|
|
29
29
|
MATCH_MAX = 3
|
30
30
|
|
31
|
-
TYPES_REGEXP = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w
|
32
|
-
TYPES_CSS = %w
|
33
|
-
TYPES_JSON = %w
|
34
|
-
|
31
|
+
TYPES_REGEXP = 0.upto(MATCH_MAX).map { |i| "match_#{i}" } + %w[match_many]
|
32
|
+
TYPES_CSS = %w[css_one css_many].freeze
|
33
|
+
TYPES_JSON = %w[json jsonp].freeze
|
34
|
+
|
35
35
|
TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON
|
36
36
|
|
37
|
-
def match_page(page
|
38
|
-
|
37
|
+
def match_page(page)
|
38
|
+
return unless page.success?
|
39
39
|
if TYPES_REGEXP.include?(type) # regular expression
|
40
|
-
|
41
|
-
# content is String
|
42
|
-
if type == 'match_many'
|
43
|
-
match = page.body.gsub(value).to_a
|
44
|
-
else
|
45
|
-
index = TYPES_REGEXP.index(type)
|
46
|
-
matches = page.body.match(value)
|
47
|
-
match = matches.nil? ? nil : matches[index]
|
48
|
-
end
|
40
|
+
regexp_match(page.body)
|
49
41
|
elsif TYPES_CSS.include?(type) # css expression
|
50
|
-
|
51
|
-
# content is Nokogiri::HTML::Document
|
52
|
-
if type == 'css_one'
|
53
|
-
match = page.doc.css(value).first
|
54
|
-
else
|
55
|
-
match = page.doc.css(value)
|
56
|
-
end
|
42
|
+
css_match(page.doc)
|
57
43
|
elsif TYPES_JSON.include?(type)
|
58
|
-
|
59
|
-
|
44
|
+
json_match(page)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def json_match(page)
|
49
|
+
block = safe_block { |j| j }
|
50
|
+
json = page.send(type)
|
51
|
+
keys = json_index_keys(value)
|
52
|
+
match = json_fetch(json, keys)
|
53
|
+
block.call(match)
|
54
|
+
end
|
55
|
+
|
56
|
+
def css_match(doc)
|
57
|
+
block = safe_block { |node| node.content.strip }
|
58
|
+
# content is Nokogiri::HTML::Document
|
59
|
+
contents = doc.css(value)
|
60
|
+
if type == 'css_many'
|
61
|
+
contents.map { |node| block.call(node) }.uniq
|
62
|
+
else
|
63
|
+
block.call(contents.first)
|
60
64
|
end
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
+
end
|
66
|
+
|
67
|
+
def regexp_match(body)
|
68
|
+
block = safe_block(&:strip)
|
69
|
+
# content is String
|
70
|
+
if type == 'match_many'
|
71
|
+
body.gsub(value).to_a.map { |node| block.call(node) }.uniq
|
65
72
|
else
|
66
|
-
|
73
|
+
index = TYPES_REGEXP.index(type)
|
74
|
+
matches = body.match(value)
|
75
|
+
block.call(matches[index]) unless matches.nil?
|
67
76
|
end
|
68
|
-
rescue
|
69
|
-
nil
|
70
77
|
end
|
71
78
|
|
72
79
|
def json_fetch(json, keys)
|
73
|
-
if keys.
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
json_fetch(json, parts)
|
79
|
-
elsif keys.is_a? Array
|
80
|
-
if keys.length == 0
|
81
|
-
json
|
82
|
-
else
|
83
|
-
pt = keys.shift
|
84
|
-
json_fetch(json[pt[:index] || pt[:key]], keys)
|
85
|
-
end
|
80
|
+
if keys.empty?
|
81
|
+
json
|
82
|
+
else
|
83
|
+
pt = keys.shift
|
84
|
+
json_fetch(json[pt[:index] || pt[:key]], keys)
|
86
85
|
end
|
87
86
|
end
|
88
87
|
|
88
|
+
def json_index_keys(keys)
|
89
|
+
keys.to_s.match(/^\$\S*$/)[0].scan(/(\.(\w+)|\[(\d+)\])/).map do |p|
|
90
|
+
p[1].nil? ? { index: p[2].to_i } : { key: p[1] }
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
private :json_index_keys, :json_fetch
|
95
|
+
|
96
|
+
# Nokogiri node methods
|
89
97
|
class Nokogiri::XML::Node
|
90
|
-
%w
|
91
|
-
define_method "inner_#{name}" do |css, &block|
|
92
|
-
callback = ->(node)
|
93
|
-
if node
|
94
|
-
(block || ->(n){n.text.strip}).call(node)
|
95
|
-
else
|
96
|
-
nil
|
97
|
-
end
|
98
|
-
end
|
98
|
+
%w[one many].each do |name|
|
99
|
+
define_method "inner_#{name}" do |css, &block|
|
100
|
+
callback = ->(node) { (block || ->(n) { n.text.strip }).call(node) if node }
|
99
101
|
if name == 'one' # inner_one
|
100
102
|
callback.call(self.css(css).first)
|
101
103
|
else # inner_many
|
102
|
-
self.css(css).map{|node| callback.call(node)}
|
104
|
+
self.css(css).map { |node| callback.call(node) }
|
103
105
|
end
|
104
106
|
end
|
105
107
|
end
|
106
108
|
def source
|
107
109
|
to_xml
|
108
110
|
end
|
109
|
-
end
|
111
|
+
end
|
110
112
|
end
|
111
|
-
end
|
113
|
+
end
|
data/lib/digger/version.rb
CHANGED
data/spec/index_spec.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'digger'
|
2
|
+
|
3
|
+
describe Digger::Index do
|
4
|
+
it 'batch digger' do
|
5
|
+
list = [1, 2, 3, 4, 5, 6, 7, 8]
|
6
|
+
pt = Digger::Index.batch(list, 3) do |num|
|
7
|
+
sleep(rand(1..3))
|
8
|
+
"##{num}"
|
9
|
+
end
|
10
|
+
expect(pt.join).to eq(list.map { |num| "##{num}" }.join)
|
11
|
+
end
|
12
|
+
end
|
data/spec/pattern_spec.rb
CHANGED
@@ -3,12 +3,12 @@ require 'json'
|
|
3
3
|
|
4
4
|
describe Digger::Pattern do
|
5
5
|
it 'json fetch' do
|
6
|
-
json = JSON.parse('{"a":1,"b":[1,2,3]}')
|
6
|
+
json = JSON.parse('[{"a":1,"b":[1,2,3]}]')
|
7
7
|
pt = Digger::Pattern.new
|
8
|
-
expect(pt.json_fetch(json, '$')['a']).to eq(1)
|
9
|
-
expect(pt.json_fetch(json, '
|
10
|
-
expect(pt.json_fetch(json, '
|
11
|
-
expect(pt.json_fetch(json, '
|
8
|
+
expect(pt.json_fetch(json, '$[0]')['a']).to eq(1)
|
9
|
+
expect(pt.json_fetch(json, '$[0].a')).to eq(1)
|
10
|
+
expect(pt.json_fetch(json, '$[0].b').length).to eq(3)
|
11
|
+
expect(pt.json_fetch(json, '$[0].b[2]')).to eq(3)
|
12
12
|
end
|
13
13
|
|
14
14
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: digger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- binz
|
@@ -87,6 +87,7 @@ files:
|
|
87
87
|
- lib/digger/pattern.rb
|
88
88
|
- lib/digger/version.rb
|
89
89
|
- spec/digger_spec.rb
|
90
|
+
- spec/index_spec.rb
|
90
91
|
- spec/page_spec.rb
|
91
92
|
- spec/pattern_spec.rb
|
92
93
|
- spec/validate_spec.rb
|
@@ -115,6 +116,7 @@ specification_version: 4
|
|
115
116
|
summary: Dig need stractual infomation from web page.
|
116
117
|
test_files:
|
117
118
|
- spec/digger_spec.rb
|
119
|
+
- spec/index_spec.rb
|
118
120
|
- spec/page_spec.rb
|
119
121
|
- spec/pattern_spec.rb
|
120
122
|
- spec/validate_spec.rb
|