digger 0.1.3 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/digger.gemspec +2 -2
- data/lib/digger/http.rb +9 -2
- data/lib/digger/index.rb +13 -12
- data/lib/digger/model.rb +17 -11
- data/lib/digger/page.rb +14 -17
- data/lib/digger/pattern.rb +78 -56
- data/lib/digger/version.rb +1 -1
- data/spec/digger_spec.rb +6 -7
- data/spec/index_spec.rb +12 -0
- data/spec/page_spec.rb +27 -0
- data/spec/pattern_spec.rb +15 -0
- metadata +18 -13
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
|
-
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 2f506fd615df8b9d732d6b67bedc72644df26dc3f5725cfe5dba10c1098bae0b
|
|
4
|
+
data.tar.gz: 0e9afcb19ba0be5ce4a90787d54c3b43d58100bac3ec45bf5ef93781a60b6eb1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 8608a2ee8e06ddd846772d40dc3e417560229729b7b736eda8a7f50977a7d2c6fc523f86fe64480b5172c5295137eae7c85f945f7ca310502c54c8b90dd75e8d
|
|
7
|
+
data.tar.gz: 59fef1a13adc8f983c16428ee4d08d6ccdecea09b7583452b6ca07689727c3d6a386f5a5b767a20d117c8c4bc9775a2a6b32fd4caff1c1a5e85ee2af82e39d1b
|
data/digger.gemspec
CHANGED
|
@@ -18,8 +18,8 @@ Gem::Specification.new do |spec|
|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
|
19
19
|
spec.require_paths = ["lib"]
|
|
20
20
|
|
|
21
|
-
spec.add_development_dependency "bundler", "~>
|
|
22
|
-
spec.add_development_dependency "rake", "
|
|
21
|
+
spec.add_development_dependency "bundler", "~> 2.0"
|
|
22
|
+
spec.add_development_dependency "rake", ">= 12.3.3"
|
|
23
23
|
|
|
24
24
|
spec.add_runtime_dependency 'nokogiri', '~> 1.6'
|
|
25
25
|
spec.add_runtime_dependency 'http-cookie', '~> 1.0'
|
data/lib/digger/http.rb
CHANGED
|
@@ -49,7 +49,7 @@ module Digger
|
|
|
49
49
|
url = URI(url)
|
|
50
50
|
pages = []
|
|
51
51
|
get(url, referer) do |response, code, location, redirect_to, response_time|
|
|
52
|
-
handle_compression response
|
|
52
|
+
handle_compression response if handle_compression?
|
|
53
53
|
pages << Page.new(location, body: response.body,
|
|
54
54
|
code: code,
|
|
55
55
|
headers: response.to_hash,
|
|
@@ -70,6 +70,13 @@ module Digger
|
|
|
70
70
|
[Page.new(url, error: e, referer: referer, depth: depth)]
|
|
71
71
|
end
|
|
72
72
|
|
|
73
|
+
#
|
|
74
|
+
# Accept response compression, may bring encoding error if true
|
|
75
|
+
#
|
|
76
|
+
def handle_compression?
|
|
77
|
+
@opts[:handle_compression]
|
|
78
|
+
end
|
|
79
|
+
|
|
73
80
|
#
|
|
74
81
|
# The maximum number of redirects to follow
|
|
75
82
|
#
|
|
@@ -185,7 +192,7 @@ module Digger
|
|
|
185
192
|
opts['User-Agent'] = user_agent if user_agent
|
|
186
193
|
opts['Referer'] = referer.to_s if referer
|
|
187
194
|
opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
|
|
188
|
-
opts['Accept-Encoding'] = 'gzip,deflate'
|
|
195
|
+
opts['Accept-Encoding'] = 'gzip,deflate' if handle_compression?
|
|
189
196
|
|
|
190
197
|
retries = 0
|
|
191
198
|
begin
|
data/lib/digger/index.rb
CHANGED
|
@@ -8,33 +8,34 @@ module Digger
|
|
|
8
8
|
|
|
9
9
|
def urls
|
|
10
10
|
@urls ||= begin
|
|
11
|
-
args = self.args.map{|a|
|
|
12
|
-
args.shift.product(*args).map{|arg| pattern_applied_url(arg)}
|
|
11
|
+
args = self.args.map { |a| a.respond_to?(:each) ? a.to_a : [a] }
|
|
12
|
+
args.shift.product(*args).map { |arg| pattern_applied_url(arg) }
|
|
13
13
|
end
|
|
14
14
|
end
|
|
15
15
|
|
|
16
16
|
def pattern_applied_url(arg)
|
|
17
|
-
pattern.gsub('*').each_with_index{|_, i| arg[i]}
|
|
17
|
+
pattern.gsub('*').each_with_index { |_, i| arg[i] }
|
|
18
18
|
end
|
|
19
19
|
|
|
20
20
|
def self.batch(entities, cocurrence = 1, &block)
|
|
21
|
-
raise NoBlockError,
|
|
21
|
+
raise NoBlockError, 'No block given' unless block
|
|
22
22
|
|
|
23
23
|
if cocurrence > 1
|
|
24
|
-
results =
|
|
25
|
-
entities.each_slice(cocurrence) do |group|
|
|
24
|
+
results = Array.new(entities.size)
|
|
25
|
+
entities.each_slice(cocurrence).with_index do |group, idx1|
|
|
26
26
|
threads = []
|
|
27
|
-
group.
|
|
27
|
+
group.each_with_index do |entity, idx2|
|
|
28
|
+
index = idx1 * cocurrence + idx2
|
|
28
29
|
threads << Thread.new(entity) do |ent|
|
|
29
|
-
results[
|
|
30
|
+
results[index] = block.call(ent)
|
|
30
31
|
end
|
|
31
32
|
end
|
|
32
|
-
threads.each
|
|
33
|
+
threads.each(&:join)
|
|
33
34
|
end
|
|
34
|
-
|
|
35
|
+
results
|
|
35
36
|
else
|
|
36
|
-
entities.map{|ent| block.call(ent) }
|
|
37
|
+
entities.map { |ent| block.call(ent) }
|
|
37
38
|
end
|
|
38
39
|
end
|
|
39
40
|
end
|
|
40
|
-
end
|
|
41
|
+
end
|
data/lib/digger/model.rb
CHANGED
|
@@ -1,16 +1,19 @@
|
|
|
1
1
|
|
|
2
2
|
module Digger
|
|
3
3
|
class Model
|
|
4
|
-
@@digger_config = {
|
|
4
|
+
@@digger_config = {
|
|
5
|
+
'pattern' => {},
|
|
6
|
+
'index' => {}
|
|
7
|
+
}
|
|
5
8
|
|
|
6
9
|
class << self
|
|
7
10
|
# patterns
|
|
8
11
|
def pattern_config
|
|
9
|
-
@@digger_config['pattern'][
|
|
12
|
+
@@digger_config['pattern'][name] ||= {}
|
|
10
13
|
end
|
|
11
14
|
|
|
12
15
|
Pattern::TYPES.each do |method|
|
|
13
|
-
define_method method, ->(pairs, &block){
|
|
16
|
+
define_method method, -> (pairs, &block) {
|
|
14
17
|
pairs.each_pair do |key, value|
|
|
15
18
|
pattern_config[key] = Pattern.new(type: method, value: value, block: block)
|
|
16
19
|
end
|
|
@@ -18,21 +21,22 @@ module Digger
|
|
|
18
21
|
end
|
|
19
22
|
|
|
20
23
|
def validate_presence(*keys)
|
|
21
|
-
|
|
22
|
-
raise "Pattern keys #{(keys - keys_all).join(', ')} should be present" unless
|
|
24
|
+
is_all = pattern_config.keys.all? { |k| keys.include?(k) }
|
|
25
|
+
raise "Pattern keys #{(keys - keys_all).join(', ')} should be present" unless is_all
|
|
23
26
|
end
|
|
24
27
|
|
|
25
28
|
def validate_includeness(*keys)
|
|
26
|
-
|
|
29
|
+
is_all = pattern_config.keys.all? { |k| keys.include?(k) }
|
|
30
|
+
raise "Pattern keys #{(pattern_config.keys - keys).join(', ')} should not be included" if is_all
|
|
27
31
|
end
|
|
28
32
|
|
|
29
33
|
# index page
|
|
30
34
|
def index_config
|
|
31
|
-
@@digger_config['index'][
|
|
35
|
+
@@digger_config['index'][name]
|
|
32
36
|
end
|
|
33
37
|
|
|
34
38
|
def index_page(pattern, *args)
|
|
35
|
-
|
|
39
|
+
@@digger_config['index'][name] = Index.new(pattern, args)
|
|
36
40
|
end
|
|
37
41
|
|
|
38
42
|
def index_page?
|
|
@@ -55,13 +59,15 @@ module Digger
|
|
|
55
59
|
end
|
|
56
60
|
|
|
57
61
|
def dig_urls(urls, cocurrence = 1, opts = {})
|
|
58
|
-
Index.batch(urls, cocurrence){|url| dig_url(url, opts) }
|
|
62
|
+
Index.batch(urls, cocurrence) { |url| dig_url(url, opts) }
|
|
59
63
|
end
|
|
60
64
|
|
|
61
65
|
def dig(cocurrence = 1)
|
|
62
66
|
if self.class.index_page?
|
|
63
|
-
self.class.index_config.process(cocurrence)
|
|
67
|
+
self.class.index_config.process(cocurrence) do |url|
|
|
68
|
+
dig_url(url)
|
|
69
|
+
end
|
|
64
70
|
end
|
|
65
71
|
end
|
|
66
72
|
end
|
|
67
|
-
end
|
|
73
|
+
end
|
data/lib/digger/page.rb
CHANGED
|
@@ -3,6 +3,7 @@ require 'json'
|
|
|
3
3
|
require 'ostruct'
|
|
4
4
|
require 'set'
|
|
5
5
|
require 'kconv'
|
|
6
|
+
require 'uri'
|
|
6
7
|
|
|
7
8
|
# https://github.com/taganaka/polipus/blob/master/lib/polipus/page.rb
|
|
8
9
|
module Digger
|
|
@@ -27,16 +28,12 @@ module Digger
|
|
|
27
28
|
# OpenStruct it holds users defined data
|
|
28
29
|
attr_accessor :user_data
|
|
29
30
|
|
|
30
|
-
attr_accessor :aliases
|
|
31
|
-
|
|
32
|
-
attr_accessor :domain_aliases
|
|
31
|
+
attr_accessor :aliases, :domain_aliases, :fetched_at
|
|
33
32
|
|
|
34
33
|
# Whether the current page should be stored
|
|
35
34
|
# Default: true
|
|
36
35
|
attr_accessor :storable
|
|
37
36
|
|
|
38
|
-
attr_accessor :fetched_at
|
|
39
|
-
|
|
40
37
|
#
|
|
41
38
|
# Create a new page
|
|
42
39
|
#
|
|
@@ -60,7 +57,7 @@ module Digger
|
|
|
60
57
|
end
|
|
61
58
|
|
|
62
59
|
def title
|
|
63
|
-
doc
|
|
60
|
+
doc&.title
|
|
64
61
|
end
|
|
65
62
|
|
|
66
63
|
#
|
|
@@ -74,6 +71,7 @@ module Digger
|
|
|
74
71
|
doc.search('//a[@href]').each do |a|
|
|
75
72
|
u = a['href']
|
|
76
73
|
next if u.nil? || u.empty?
|
|
74
|
+
|
|
77
75
|
abs = to_absolute(u) rescue next
|
|
78
76
|
@links << abs if abs && in_domain?(abs)
|
|
79
77
|
end
|
|
@@ -95,7 +93,13 @@ module Digger
|
|
|
95
93
|
end
|
|
96
94
|
end
|
|
97
95
|
|
|
96
|
+
def json
|
|
97
|
+
@json ||= JSON.parse body
|
|
98
|
+
end
|
|
98
99
|
|
|
100
|
+
def jsonp
|
|
101
|
+
@jsonp ||= JSON.parse body.match(/^[^(]+?\((.+)\)[^)]*$/)[1]
|
|
102
|
+
end
|
|
99
103
|
|
|
100
104
|
#
|
|
101
105
|
# Discard links, a next call of page.links will return an empty array
|
|
@@ -156,7 +160,7 @@ module Digger
|
|
|
156
160
|
# returns +false+ otherwise.
|
|
157
161
|
#
|
|
158
162
|
def not_found?
|
|
159
|
-
|
|
163
|
+
@code == 404
|
|
160
164
|
end
|
|
161
165
|
|
|
162
166
|
#
|
|
@@ -170,6 +174,7 @@ module Digger
|
|
|
170
174
|
end unless @base
|
|
171
175
|
|
|
172
176
|
return nil if @base && @base.to_s.empty?
|
|
177
|
+
|
|
173
178
|
@base
|
|
174
179
|
end
|
|
175
180
|
|
|
@@ -180,16 +185,7 @@ module Digger
|
|
|
180
185
|
def to_absolute(link)
|
|
181
186
|
return nil if link.nil?
|
|
182
187
|
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
# remove anchor
|
|
186
|
-
link =
|
|
187
|
-
begin
|
|
188
|
-
URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')))
|
|
189
|
-
rescue URI::Error
|
|
190
|
-
return nil
|
|
191
|
-
end
|
|
192
|
-
|
|
188
|
+
link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#[\w]*$/, '')
|
|
193
189
|
relative = begin
|
|
194
190
|
URI(link)
|
|
195
191
|
rescue URI::Error
|
|
@@ -247,6 +243,7 @@ module Digger
|
|
|
247
243
|
|
|
248
244
|
def expired?(ttl)
|
|
249
245
|
return false if fetched_at.nil?
|
|
246
|
+
|
|
250
247
|
(Time.now.to_i - ttl) > fetched_at
|
|
251
248
|
end
|
|
252
249
|
|
data/lib/digger/pattern.rb
CHANGED
|
@@ -1,91 +1,113 @@
|
|
|
1
1
|
require 'nokogiri'
|
|
2
2
|
|
|
3
3
|
module Digger
|
|
4
|
+
# Extractor patterns definition
|
|
4
5
|
class Pattern
|
|
5
6
|
attr_accessor :type, :value, :block
|
|
6
7
|
|
|
7
8
|
def initialize(hash = {})
|
|
8
|
-
hash.each_pair{|key, value| send("#{key}=", value) if %w
|
|
9
|
+
hash.each_pair { |key, value| send("#{key}=", value) if %w[type value block].include?(key.to_s)}
|
|
9
10
|
end
|
|
10
11
|
|
|
11
|
-
def safe_block
|
|
12
|
-
block &&
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
nil
|
|
12
|
+
def safe_block(&default_block)
|
|
13
|
+
if block.nil? || (block.is_a?(String) && block.strip.empty?)
|
|
14
|
+
default_block
|
|
15
|
+
elsif block.respond_to?(:call)
|
|
16
|
+
block
|
|
17
|
+
else
|
|
18
|
+
proc {
|
|
19
|
+
$SAFE = 2
|
|
20
|
+
eval block
|
|
21
|
+
}.call
|
|
22
22
|
end
|
|
23
23
|
end
|
|
24
24
|
|
|
25
25
|
def self.wrap(hash)
|
|
26
|
-
|
|
26
|
+
hash.transform_values { |value| value.is_a?(Pattern) ? value : Pattern.new(value) }
|
|
27
27
|
end
|
|
28
28
|
|
|
29
29
|
MATCH_MAX = 3
|
|
30
|
-
|
|
31
|
-
TYPES = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many css_one css_many}
|
|
32
30
|
|
|
33
|
-
|
|
34
|
-
|
|
31
|
+
TYPES_REGEXP = 0.upto(MATCH_MAX).map { |i| "match_#{i}" } + %w[match_many]
|
|
32
|
+
TYPES_CSS = %w[css_one css_many].freeze
|
|
33
|
+
TYPES_JSON = %w[json jsonp].freeze
|
|
34
|
+
|
|
35
|
+
TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON
|
|
36
|
+
|
|
37
|
+
def match_page(page)
|
|
38
|
+
return unless page.success?
|
|
39
|
+
if TYPES_REGEXP.include?(type) # regular expression
|
|
40
|
+
regexp_match(page.body)
|
|
41
|
+
elsif TYPES_CSS.include?(type) # css expression
|
|
42
|
+
css_match(page.doc)
|
|
43
|
+
elsif TYPES_JSON.include?(type)
|
|
44
|
+
json_match(page)
|
|
45
|
+
end
|
|
35
46
|
end
|
|
36
47
|
|
|
37
|
-
def
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
48
|
+
def json_match(page)
|
|
49
|
+
block = safe_block { |j| j }
|
|
50
|
+
json = page.send(type)
|
|
51
|
+
keys = json_index_keys(value)
|
|
52
|
+
match = json_fetch(json, keys)
|
|
53
|
+
block.call(match)
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
def css_match(doc)
|
|
57
|
+
block = safe_block { |node| node.content.strip }
|
|
58
|
+
# content is Nokogiri::HTML::Document
|
|
59
|
+
contents = doc.css(value)
|
|
60
|
+
if type == 'css_many'
|
|
61
|
+
contents.map { |node| block.call(node) }.uniq
|
|
62
|
+
else
|
|
63
|
+
block.call(contents.first)
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
def regexp_match(body)
|
|
68
|
+
block = safe_block(&:strip)
|
|
69
|
+
# content is String
|
|
70
|
+
if type == 'match_many'
|
|
71
|
+
body.gsub(value).to_a.map { |node| block.call(node) }.uniq
|
|
72
|
+
else
|
|
73
|
+
index = TYPES_REGEXP.index(type)
|
|
74
|
+
matches = body.match(value)
|
|
75
|
+
block.call(matches[index]) unless matches.nil?
|
|
57
76
|
end
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
def json_fetch(json, keys)
|
|
80
|
+
if keys.empty?
|
|
81
|
+
json
|
|
62
82
|
else
|
|
63
|
-
|
|
83
|
+
pt = keys.shift
|
|
84
|
+
json_fetch(json[pt[:index] || pt[:key]], keys)
|
|
64
85
|
end
|
|
65
|
-
rescue
|
|
66
|
-
nil
|
|
67
86
|
end
|
|
68
87
|
|
|
88
|
+
def json_index_keys(keys)
|
|
89
|
+
keys.to_s.match(/^\$\S*$/)[0].scan(/(\.(\w+)|\[(\d+)\])/).map do |p|
|
|
90
|
+
p[1].nil? ? { index: p[2].to_i } : { key: p[1] }
|
|
91
|
+
end
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
private :json_index_keys, :json_fetch
|
|
95
|
+
|
|
96
|
+
# Nokogiri node methods
|
|
69
97
|
class Nokogiri::XML::Node
|
|
70
|
-
%w
|
|
71
|
-
define_method "inner_#{name}" do |css, &block|
|
|
72
|
-
callback = ->(node)
|
|
73
|
-
if node
|
|
74
|
-
(block || ->(n){n.text.strip}).call(node)
|
|
75
|
-
else
|
|
76
|
-
nil
|
|
77
|
-
end
|
|
78
|
-
end
|
|
98
|
+
%w[one many].each do |name|
|
|
99
|
+
define_method "inner_#{name}" do |css, &block|
|
|
100
|
+
callback = ->(node) { (block || ->(n) { n.text.strip }).call(node) if node }
|
|
79
101
|
if name == 'one' # inner_one
|
|
80
102
|
callback.call(self.css(css).first)
|
|
81
103
|
else # inner_many
|
|
82
|
-
self.css(css).map{|node| callback.call(node)}
|
|
104
|
+
self.css(css).map { |node| callback.call(node) }
|
|
83
105
|
end
|
|
84
106
|
end
|
|
85
107
|
end
|
|
86
108
|
def source
|
|
87
109
|
to_xml
|
|
88
110
|
end
|
|
89
|
-
end
|
|
111
|
+
end
|
|
90
112
|
end
|
|
91
|
-
end
|
|
113
|
+
end
|
data/lib/digger/version.rb
CHANGED
data/spec/digger_spec.rb
CHANGED
|
@@ -1,15 +1,14 @@
|
|
|
1
1
|
require 'digger'
|
|
2
2
|
|
|
3
3
|
http = Digger::HTTP.new
|
|
4
|
-
page = http.fetch_page('http://
|
|
4
|
+
page = http.fetch_page('http://www.baidu.com/')
|
|
5
5
|
|
|
6
|
-
pattern = Digger::Pattern.new({type: 'css_many', value: '
|
|
6
|
+
pattern = Digger::Pattern.new({ type: 'css_many', value: '#s-top-left>a' })
|
|
7
7
|
|
|
8
8
|
class Item < Digger::Model
|
|
9
|
-
css_many sites: '
|
|
10
|
-
css_one logo: '.logo'
|
|
9
|
+
css_many sites: '#s-top-left>a'
|
|
11
10
|
validate_presence :sites
|
|
12
|
-
validate_includeness :sites
|
|
11
|
+
validate_includeness :sites
|
|
13
12
|
end
|
|
14
13
|
|
|
15
14
|
describe Digger do
|
|
@@ -19,12 +18,12 @@ describe Digger do
|
|
|
19
18
|
|
|
20
19
|
it "pattern should match content" do
|
|
21
20
|
sites = pattern.match_page(page)
|
|
22
|
-
expect(sites.include?('
|
|
21
|
+
expect(sites.include?('新闻')).to eq(true)
|
|
23
22
|
end
|
|
24
23
|
|
|
25
24
|
it "model should dig content" do
|
|
26
25
|
item = Item.new.match_page(page)
|
|
27
|
-
expect(item[:sites].include?('
|
|
26
|
+
expect(item[:sites].include?('新闻')).to be(true)
|
|
28
27
|
end
|
|
29
28
|
|
|
30
29
|
it "validation support" do
|
data/spec/index_spec.rb
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
require 'digger'
|
|
2
|
+
|
|
3
|
+
describe Digger::Index do
|
|
4
|
+
it 'batch digger' do
|
|
5
|
+
list = [1, 2, 3, 4, 5, 6, 7, 8]
|
|
6
|
+
pt = Digger::Index.batch(list, 3) do |num|
|
|
7
|
+
sleep(rand(1..3))
|
|
8
|
+
"##{num}"
|
|
9
|
+
end
|
|
10
|
+
expect(pt.join).to eq(list.map { |num| "##{num}" }.join)
|
|
11
|
+
end
|
|
12
|
+
end
|
data/spec/page_spec.rb
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
require 'digger'
|
|
2
|
+
require 'json'
|
|
3
|
+
require 'uri'
|
|
4
|
+
|
|
5
|
+
describe Digger::Page do
|
|
6
|
+
it 'page json' do
|
|
7
|
+
json_str = '{"a":1,"b":[1,2,3]}'
|
|
8
|
+
j1 = Digger::Page.new('', body: json_str)
|
|
9
|
+
j2 = Digger::Page.new('', body: "hello(#{json_str});")
|
|
10
|
+
expect(j1.json['a']).to eq(1)
|
|
11
|
+
expect(j2.jsonp['a']).to eq(1)
|
|
12
|
+
expect(j1.json['b'][0]).to eq(1)
|
|
13
|
+
expect(j2.jsonp['b'][1]).to eq(2)
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
it 'fetch baidu' do
|
|
17
|
+
http = Digger::HTTP.new
|
|
18
|
+
page = http.fetch_page('http://www.baidu.com/')
|
|
19
|
+
expect(page.code).to eq(200)
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
it 'page uri' do
|
|
23
|
+
link ='https://www.baidu.com/s?wd=%E5%93%88%E5%93%88#hello'
|
|
24
|
+
link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#[\w]*$/, '')
|
|
25
|
+
p link
|
|
26
|
+
end
|
|
27
|
+
end
|
data/spec/pattern_spec.rb
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
require 'digger'
|
|
2
|
+
require 'json'
|
|
3
|
+
|
|
4
|
+
describe Digger::Pattern do
|
|
5
|
+
it 'json fetch' do
|
|
6
|
+
json = JSON.parse('[{"a":1,"b":[1,2,3]}]')
|
|
7
|
+
pt = Digger::Pattern.new
|
|
8
|
+
expect(pt.json_fetch(json, '$[0]')['a']).to eq(1)
|
|
9
|
+
expect(pt.json_fetch(json, '$[0].a')).to eq(1)
|
|
10
|
+
expect(pt.json_fetch(json, '$[0].b').length).to eq(3)
|
|
11
|
+
expect(pt.json_fetch(json, '$[0].b[2]')).to eq(3)
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: digger
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.
|
|
4
|
+
version: 0.1.7
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- binz
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2021-12-26 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -16,28 +16,28 @@ dependencies:
|
|
|
16
16
|
requirements:
|
|
17
17
|
- - "~>"
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
|
-
version: '
|
|
19
|
+
version: '2.0'
|
|
20
20
|
type: :development
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
24
|
- - "~>"
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
|
-
version: '
|
|
26
|
+
version: '2.0'
|
|
27
27
|
- !ruby/object:Gem::Dependency
|
|
28
28
|
name: rake
|
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
|
30
30
|
requirements:
|
|
31
|
-
- - "
|
|
31
|
+
- - ">="
|
|
32
32
|
- !ruby/object:Gem::Version
|
|
33
|
-
version:
|
|
33
|
+
version: 12.3.3
|
|
34
34
|
type: :development
|
|
35
35
|
prerelease: false
|
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
|
37
37
|
requirements:
|
|
38
|
-
- - "
|
|
38
|
+
- - ">="
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
|
-
version:
|
|
40
|
+
version: 12.3.3
|
|
41
41
|
- !ruby/object:Gem::Dependency
|
|
42
42
|
name: nokogiri
|
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -87,12 +87,15 @@ files:
|
|
|
87
87
|
- lib/digger/pattern.rb
|
|
88
88
|
- lib/digger/version.rb
|
|
89
89
|
- spec/digger_spec.rb
|
|
90
|
+
- spec/index_spec.rb
|
|
91
|
+
- spec/page_spec.rb
|
|
92
|
+
- spec/pattern_spec.rb
|
|
90
93
|
- spec/validate_spec.rb
|
|
91
94
|
homepage: ''
|
|
92
95
|
licenses:
|
|
93
96
|
- MIT
|
|
94
97
|
metadata: {}
|
|
95
|
-
post_install_message:
|
|
98
|
+
post_install_message:
|
|
96
99
|
rdoc_options: []
|
|
97
100
|
require_paths:
|
|
98
101
|
- lib
|
|
@@ -107,11 +110,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
107
110
|
- !ruby/object:Gem::Version
|
|
108
111
|
version: '0'
|
|
109
112
|
requirements: []
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
signing_key:
|
|
113
|
+
rubygems_version: 3.2.32
|
|
114
|
+
signing_key:
|
|
113
115
|
specification_version: 4
|
|
114
116
|
summary: Dig need stractual infomation from web page.
|
|
115
117
|
test_files:
|
|
116
118
|
- spec/digger_spec.rb
|
|
119
|
+
- spec/index_spec.rb
|
|
120
|
+
- spec/page_spec.rb
|
|
121
|
+
- spec/pattern_spec.rb
|
|
117
122
|
- spec/validate_spec.rb
|