digger 0.1.5 → 0.1.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/digger.gemspec +10 -9
- data/lib/digger/index.rb +42 -12
- data/lib/digger/model.rb +17 -11
- data/lib/digger/page.rb +15 -19
- data/lib/digger/pattern.rb +79 -70
- data/lib/digger/version.rb +1 -1
- data/spec/digger_spec.rb +6 -7
- data/spec/index_spec.rb +28 -0
- data/spec/page_spec.rb +14 -1
- data/spec/pattern_spec.rb +15 -10
- metadata +18 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ed96af1c5ae92569e1de4885958ac5852864c045f6c6337d5f17d91747d8ed80
|
4
|
+
data.tar.gz: 82003ae80f54cd3f9b805757e5dcb4c7894bba91c4c376cf08ebd43e6de6e80b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b7aad69fd46c7d1688026ece2e1efe14d7dea29b42f94656d794655e12a92677bd3e1034f0c776bf197bcd75c96bd49377df399433bd2e1c13507520af1addc5
|
7
|
+
data.tar.gz: 60055a69ec3ad77e80fc4f1b50bb3a6c298e2274827ae21b64d1d82dae53a1d5338ccc99bcdb09c7d2e946abd007a488ff3034a753bc54586e9b651aeb3c5ce7
|
data/digger.gemspec
CHANGED
@@ -1,26 +1,27 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
|
2
3
|
lib = File.expand_path('../lib', __FILE__)
|
3
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
5
|
require 'digger/version'
|
5
6
|
|
6
7
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name =
|
8
|
+
spec.name = 'digger'
|
8
9
|
spec.version = Digger::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = [
|
10
|
+
spec.authors = ['binz']
|
11
|
+
spec.email = ['xinkiang@gmail.com']
|
11
12
|
spec.summary = %q{Dig need stractual infomation from web page.}
|
12
13
|
spec.description = %q{}
|
13
|
-
spec.homepage =
|
14
|
-
spec.license =
|
14
|
+
spec.homepage = ''
|
15
|
+
spec.license = 'MIT'
|
15
16
|
|
16
17
|
spec.files = `git ls-files -z`.split("\x0")
|
17
18
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
19
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
-
spec.require_paths = [
|
20
|
+
spec.require_paths = ['lib']
|
20
21
|
|
21
|
-
spec.add_development_dependency
|
22
|
-
spec.add_development_dependency
|
22
|
+
spec.add_development_dependency 'rake', '>= 12.3.3'
|
23
|
+
spec.add_development_dependency 'bundler', '~> 2.0'
|
23
24
|
|
24
|
-
spec.add_runtime_dependency 'nokogiri', '~> 1.6'
|
25
25
|
spec.add_runtime_dependency 'http-cookie', '~> 1.0'
|
26
|
+
spec.add_runtime_dependency 'nokogiri', '~> 1.6'
|
26
27
|
end
|
data/lib/digger/index.rb
CHANGED
@@ -8,33 +8,63 @@ module Digger
|
|
8
8
|
|
9
9
|
def urls
|
10
10
|
@urls ||= begin
|
11
|
-
args = self.args.map{|a|
|
12
|
-
args.shift.product(*args).map{|arg| pattern_applied_url(arg)}
|
11
|
+
args = self.args.map { |a| a.respond_to?(:each) ? a.to_a : [a] }
|
12
|
+
args.shift.product(*args).map { |arg| pattern_applied_url(arg) }
|
13
13
|
end
|
14
14
|
end
|
15
15
|
|
16
16
|
def pattern_applied_url(arg)
|
17
|
-
pattern.gsub('*').each_with_index{|_, i| arg[i]}
|
17
|
+
pattern.gsub('*').each_with_index { |_, i| arg[i] }
|
18
|
+
end
|
19
|
+
|
20
|
+
def self.slow_down(entities, conf = {}, &block)
|
21
|
+
raise NoBlockError, 'No block given' unless block
|
22
|
+
|
23
|
+
config = {
|
24
|
+
sleep_range_seconds: 4...10, # 随机等待时间范围
|
25
|
+
fail_max_cnt: 10, # 最多失败次数
|
26
|
+
fail_unit_seconds: 10 * 60, # 失败等待时间
|
27
|
+
when_fail: ->(ent, e, failed_cnt) {}
|
28
|
+
}.merge(conf)
|
29
|
+
failed_cnt = 0
|
30
|
+
cursor = 0
|
31
|
+
result = []
|
32
|
+
while cursor < entities.length
|
33
|
+
begin
|
34
|
+
result << block.call(entities[cursor])
|
35
|
+
rescue StandardError => e
|
36
|
+
failed_cnt += 1
|
37
|
+
config[:when_fail].call(entities[cursor], e, failed_cnt)
|
38
|
+
break if failed_cnt >= config[:fail_max_cnt]
|
39
|
+
|
40
|
+
sleep(failed_cnt * config[:fail_unit_seconds])
|
41
|
+
else
|
42
|
+
cursor += 1
|
43
|
+
sleep(rand(config[:sleep_range_seconds]))
|
44
|
+
end
|
45
|
+
end
|
46
|
+
result
|
18
47
|
end
|
19
48
|
|
20
49
|
def self.batch(entities, cocurrence = 1, &block)
|
21
|
-
raise NoBlockError,
|
50
|
+
raise NoBlockError, 'No block given' unless block
|
22
51
|
|
23
52
|
if cocurrence > 1
|
24
|
-
results =
|
25
|
-
entities.each_slice(cocurrence) do |group|
|
53
|
+
results = Array.new(entities.size)
|
54
|
+
entities.each_slice(cocurrence).with_index do |group, idx1|
|
26
55
|
threads = []
|
27
|
-
group.
|
56
|
+
group.each_with_index do |entity, idx2|
|
57
|
+
index = idx1 * cocurrence + idx2
|
28
58
|
threads << Thread.new(entity) do |ent|
|
29
|
-
results[
|
59
|
+
results[index] = block.call(ent)
|
30
60
|
end
|
31
61
|
end
|
32
|
-
threads.each
|
62
|
+
threads.each(&:join)
|
33
63
|
end
|
34
|
-
|
64
|
+
results
|
35
65
|
else
|
36
|
-
entities.map{|ent| block.call(ent) }
|
66
|
+
entities.map { |ent| block.call(ent) }
|
37
67
|
end
|
38
68
|
end
|
39
69
|
end
|
40
|
-
end
|
70
|
+
end
|
data/lib/digger/model.rb
CHANGED
@@ -1,16 +1,19 @@
|
|
1
1
|
|
2
2
|
module Digger
|
3
3
|
class Model
|
4
|
-
@@digger_config = {
|
4
|
+
@@digger_config = {
|
5
|
+
'pattern' => {},
|
6
|
+
'index' => {}
|
7
|
+
}
|
5
8
|
|
6
9
|
class << self
|
7
10
|
# patterns
|
8
11
|
def pattern_config
|
9
|
-
@@digger_config['pattern'][
|
12
|
+
@@digger_config['pattern'][name] ||= {}
|
10
13
|
end
|
11
14
|
|
12
15
|
Pattern::TYPES.each do |method|
|
13
|
-
define_method method, ->(pairs, &block){
|
16
|
+
define_method method, -> (pairs, &block) {
|
14
17
|
pairs.each_pair do |key, value|
|
15
18
|
pattern_config[key] = Pattern.new(type: method, value: value, block: block)
|
16
19
|
end
|
@@ -18,21 +21,22 @@ module Digger
|
|
18
21
|
end
|
19
22
|
|
20
23
|
def validate_presence(*keys)
|
21
|
-
|
22
|
-
raise "Pattern keys #{(keys - keys_all).join(', ')} should be present" unless
|
24
|
+
is_all = pattern_config.keys.all? { |k| keys.include?(k) }
|
25
|
+
raise "Pattern keys #{(keys - keys_all).join(', ')} should be present" unless is_all
|
23
26
|
end
|
24
27
|
|
25
28
|
def validate_includeness(*keys)
|
26
|
-
|
29
|
+
is_all = pattern_config.keys.all? { |k| keys.include?(k) }
|
30
|
+
raise "Pattern keys #{(pattern_config.keys - keys).join(', ')} should not be included" if is_all
|
27
31
|
end
|
28
32
|
|
29
33
|
# index page
|
30
34
|
def index_config
|
31
|
-
@@digger_config['index'][
|
35
|
+
@@digger_config['index'][name]
|
32
36
|
end
|
33
37
|
|
34
38
|
def index_page(pattern, *args)
|
35
|
-
|
39
|
+
@@digger_config['index'][name] = Index.new(pattern, args)
|
36
40
|
end
|
37
41
|
|
38
42
|
def index_page?
|
@@ -55,13 +59,15 @@ module Digger
|
|
55
59
|
end
|
56
60
|
|
57
61
|
def dig_urls(urls, cocurrence = 1, opts = {})
|
58
|
-
Index.batch(urls, cocurrence){|url| dig_url(url, opts) }
|
62
|
+
Index.batch(urls, cocurrence) { |url| dig_url(url, opts) }
|
59
63
|
end
|
60
64
|
|
61
65
|
def dig(cocurrence = 1)
|
62
66
|
if self.class.index_page?
|
63
|
-
self.class.index_config.process(cocurrence)
|
67
|
+
self.class.index_config.process(cocurrence) do |url|
|
68
|
+
dig_url(url)
|
69
|
+
end
|
64
70
|
end
|
65
71
|
end
|
66
72
|
end
|
67
|
-
end
|
73
|
+
end
|
data/lib/digger/page.rb
CHANGED
@@ -3,6 +3,8 @@ require 'json'
|
|
3
3
|
require 'ostruct'
|
4
4
|
require 'set'
|
5
5
|
require 'kconv'
|
6
|
+
require 'uri'
|
7
|
+
require 'http/cookie'
|
6
8
|
|
7
9
|
# https://github.com/taganaka/polipus/blob/master/lib/polipus/page.rb
|
8
10
|
module Digger
|
@@ -27,16 +29,12 @@ module Digger
|
|
27
29
|
# OpenStruct it holds users defined data
|
28
30
|
attr_accessor :user_data
|
29
31
|
|
30
|
-
attr_accessor :aliases
|
31
|
-
|
32
|
-
attr_accessor :domain_aliases
|
32
|
+
attr_accessor :aliases, :domain_aliases, :fetched_at
|
33
33
|
|
34
34
|
# Whether the current page should be stored
|
35
35
|
# Default: true
|
36
36
|
attr_accessor :storable
|
37
37
|
|
38
|
-
attr_accessor :fetched_at
|
39
|
-
|
40
38
|
#
|
41
39
|
# Create a new page
|
42
40
|
#
|
@@ -60,7 +58,7 @@ module Digger
|
|
60
58
|
end
|
61
59
|
|
62
60
|
def title
|
63
|
-
doc
|
61
|
+
doc&.title
|
64
62
|
end
|
65
63
|
|
66
64
|
#
|
@@ -74,6 +72,7 @@ module Digger
|
|
74
72
|
doc.search('//a[@href]').each do |a|
|
75
73
|
u = a['href']
|
76
74
|
next if u.nil? || u.empty?
|
75
|
+
|
77
76
|
abs = to_absolute(u) rescue next
|
78
77
|
@links << abs if abs && in_domain?(abs)
|
79
78
|
end
|
@@ -100,7 +99,11 @@ module Digger
|
|
100
99
|
end
|
101
100
|
|
102
101
|
def jsonp
|
103
|
-
@jsonp ||= JSON.parse body.match(/^[
|
102
|
+
@jsonp ||= JSON.parse body.match(/^[^(]+?\((.+)\)[^)]*$/)[1]
|
103
|
+
end
|
104
|
+
|
105
|
+
def cookies
|
106
|
+
@cookies ||= (headers['set-cookie'] || []).flat_map { |c| ::HTTP::Cookie.parse(c, url) }
|
104
107
|
end
|
105
108
|
|
106
109
|
#
|
@@ -162,7 +165,7 @@ module Digger
|
|
162
165
|
# returns +false+ otherwise.
|
163
166
|
#
|
164
167
|
def not_found?
|
165
|
-
|
168
|
+
@code == 404
|
166
169
|
end
|
167
170
|
|
168
171
|
#
|
@@ -176,6 +179,7 @@ module Digger
|
|
176
179
|
end unless @base
|
177
180
|
|
178
181
|
return nil if @base && @base.to_s.empty?
|
182
|
+
|
179
183
|
@base
|
180
184
|
end
|
181
185
|
|
@@ -186,16 +190,7 @@ module Digger
|
|
186
190
|
def to_absolute(link)
|
187
191
|
return nil if link.nil?
|
188
192
|
|
189
|
-
|
190
|
-
|
191
|
-
# remove anchor
|
192
|
-
link =
|
193
|
-
begin
|
194
|
-
URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')))
|
195
|
-
rescue URI::Error
|
196
|
-
return nil
|
197
|
-
end
|
198
|
-
|
193
|
+
link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#[\w]*$/, '')
|
199
194
|
relative = begin
|
200
195
|
URI(link)
|
201
196
|
rescue URI::Error
|
@@ -253,6 +248,7 @@ module Digger
|
|
253
248
|
|
254
249
|
def expired?(ttl)
|
255
250
|
return false if fetched_at.nil?
|
251
|
+
|
256
252
|
(Time.now.to_i - ttl) > fetched_at
|
257
253
|
end
|
258
254
|
|
@@ -282,4 +278,4 @@ module Digger
|
|
282
278
|
from_hash hash
|
283
279
|
end
|
284
280
|
end
|
285
|
-
end
|
281
|
+
end
|
data/lib/digger/pattern.rb
CHANGED
@@ -1,111 +1,120 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
|
3
3
|
module Digger
|
4
|
+
# Extractor patterns definition
|
4
5
|
class Pattern
|
5
6
|
attr_accessor :type, :value, :block
|
6
7
|
|
7
8
|
def initialize(hash = {})
|
8
|
-
hash.each_pair{|key, value| send("#{key}=", value) if %w
|
9
|
+
hash.each_pair { |key, value| send("#{key}=", value) if %w[type value block].include?(key.to_s)}
|
9
10
|
end
|
10
11
|
|
11
|
-
def safe_block
|
12
|
-
block &&
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
nil
|
12
|
+
def safe_block(&default_block)
|
13
|
+
if block.nil? || (block.is_a?(String) && block.strip.empty?)
|
14
|
+
default_block || ->(v) { v }
|
15
|
+
elsif block.respond_to?(:call)
|
16
|
+
block
|
17
|
+
else
|
18
|
+
proc {
|
19
|
+
$SAFE = 2
|
20
|
+
eval block
|
21
|
+
}.call
|
22
22
|
end
|
23
23
|
end
|
24
24
|
|
25
25
|
def self.wrap(hash)
|
26
|
-
|
26
|
+
hash.transform_values { |value| value.is_a?(Pattern) ? value : Pattern.new(value) }
|
27
27
|
end
|
28
28
|
|
29
29
|
MATCH_MAX = 3
|
30
30
|
|
31
|
-
TYPES_REGEXP = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w
|
32
|
-
TYPES_CSS = %w
|
33
|
-
TYPES_JSON = %w
|
34
|
-
|
35
|
-
TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON
|
31
|
+
TYPES_REGEXP = 0.upto(MATCH_MAX).map { |i| "match_#{i}" } + %w[match_many]
|
32
|
+
TYPES_CSS = %w[css_one css_many].freeze
|
33
|
+
TYPES_JSON = %w[json jsonp].freeze
|
34
|
+
|
35
|
+
TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON + ['cookie']
|
36
|
+
|
37
|
+
def match_page(page)
|
38
|
+
return unless page.success?
|
36
39
|
|
37
|
-
def match_page(page, &callback)
|
38
|
-
blk = callback || safe_block
|
39
40
|
if TYPES_REGEXP.include?(type) # regular expression
|
40
|
-
|
41
|
-
# content is String
|
42
|
-
if type == 'match_many'
|
43
|
-
match = page.body.gsub(value).to_a
|
44
|
-
else
|
45
|
-
index = TYPES_REGEXP.index(type)
|
46
|
-
matches = page.body.match(value)
|
47
|
-
match = matches.nil? ? nil : matches[index]
|
48
|
-
end
|
41
|
+
regexp_match(page.body)
|
49
42
|
elsif TYPES_CSS.include?(type) # css expression
|
50
|
-
|
51
|
-
# content is Nokogiri::HTML::Document
|
52
|
-
if type == 'css_one'
|
53
|
-
match = page.doc.css(value).first
|
54
|
-
else
|
55
|
-
match = page.doc.css(value)
|
56
|
-
end
|
43
|
+
css_match(page.doc)
|
57
44
|
elsif TYPES_JSON.include?(type)
|
58
|
-
|
59
|
-
|
45
|
+
json_match(page)
|
46
|
+
else
|
47
|
+
cookie_get(page.cookies)
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def cookie_get(cookies)
|
52
|
+
cookie = cookies.find { |c| c.name == value }&.value
|
53
|
+
safe_block.call(cookie)
|
54
|
+
end
|
55
|
+
|
56
|
+
def json_match(page)
|
57
|
+
json = page.send(type)
|
58
|
+
keys = json_index_keys(value)
|
59
|
+
match = json_fetch(json, keys)
|
60
|
+
safe_block.call(match)
|
61
|
+
end
|
62
|
+
|
63
|
+
def css_match(doc)
|
64
|
+
block = safe_block { |node| node.content.strip }
|
65
|
+
# content is Nokogiri::HTML::Document
|
66
|
+
contents = doc.css(value)
|
67
|
+
if type == 'css_many'
|
68
|
+
contents.map { |node| block.call(node) }.uniq
|
69
|
+
else
|
70
|
+
block.call(contents.first)
|
60
71
|
end
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
72
|
+
end
|
73
|
+
|
74
|
+
def regexp_match(body)
|
75
|
+
block = safe_block(&:strip)
|
76
|
+
# content is String
|
77
|
+
if type == 'match_many'
|
78
|
+
body.gsub(value).to_a.map { |node| block.call(node) }.uniq
|
65
79
|
else
|
66
|
-
|
80
|
+
index = TYPES_REGEXP.index(type)
|
81
|
+
matches = body.match(value)
|
82
|
+
block.call(matches[index]) unless matches.nil?
|
67
83
|
end
|
68
|
-
rescue
|
69
|
-
nil
|
70
84
|
end
|
71
85
|
|
72
86
|
def json_fetch(json, keys)
|
73
|
-
if keys.
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
json_fetch(json, parts)
|
79
|
-
elsif keys.is_a? Array
|
80
|
-
if keys.length == 0
|
81
|
-
json
|
82
|
-
else
|
83
|
-
pt = keys.shift
|
84
|
-
json_fetch(json[pt[:index] || pt[:key]], keys)
|
85
|
-
end
|
87
|
+
if keys.empty?
|
88
|
+
json
|
89
|
+
else
|
90
|
+
pt = keys.shift
|
91
|
+
json_fetch(json[pt[:index] || pt[:key]], keys)
|
86
92
|
end
|
87
93
|
end
|
88
94
|
|
95
|
+
def json_index_keys(keys)
|
96
|
+
keys.to_s.match(/^\$\S*$/)[0].scan(/(\.(\w+)|\[(\d+)\])/).map do |p|
|
97
|
+
p[1].nil? ? { index: p[2].to_i } : { key: p[1] }
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
private :json_index_keys, :json_fetch
|
102
|
+
|
103
|
+
# Nokogiri node methods
|
89
104
|
class Nokogiri::XML::Node
|
90
|
-
%w
|
91
|
-
define_method "inner_#{name}" do |css, &block|
|
92
|
-
callback = ->(node)
|
93
|
-
if node
|
94
|
-
(block || ->(n){n.text.strip}).call(node)
|
95
|
-
else
|
96
|
-
nil
|
97
|
-
end
|
98
|
-
end
|
105
|
+
%w[one many].each do |name|
|
106
|
+
define_method "inner_#{name}" do |css, &block|
|
107
|
+
callback = ->(node) { (block || ->(n) { n.text.strip }).call(node) if node }
|
99
108
|
if name == 'one' # inner_one
|
100
109
|
callback.call(self.css(css).first)
|
101
110
|
else # inner_many
|
102
|
-
self.css(css).map{|node| callback.call(node)}
|
111
|
+
self.css(css).map { |node| callback.call(node) }
|
103
112
|
end
|
104
113
|
end
|
105
114
|
end
|
106
115
|
def source
|
107
116
|
to_xml
|
108
117
|
end
|
109
|
-
end
|
118
|
+
end
|
110
119
|
end
|
111
|
-
end
|
120
|
+
end
|
data/lib/digger/version.rb
CHANGED
data/spec/digger_spec.rb
CHANGED
@@ -1,15 +1,14 @@
|
|
1
1
|
require 'digger'
|
2
2
|
|
3
3
|
http = Digger::HTTP.new
|
4
|
-
page = http.fetch_page('http://
|
4
|
+
page = http.fetch_page('http://www.baidu.com/')
|
5
5
|
|
6
|
-
pattern = Digger::Pattern.new({type: 'css_many', value: '
|
6
|
+
pattern = Digger::Pattern.new({ type: 'css_many', value: '#s-top-left>a' })
|
7
7
|
|
8
8
|
class Item < Digger::Model
|
9
|
-
css_many sites: '
|
10
|
-
css_one logo: '.logo'
|
9
|
+
css_many sites: '#s-top-left>a'
|
11
10
|
validate_presence :sites
|
12
|
-
validate_includeness :sites
|
11
|
+
validate_includeness :sites
|
13
12
|
end
|
14
13
|
|
15
14
|
describe Digger do
|
@@ -19,12 +18,12 @@ describe Digger do
|
|
19
18
|
|
20
19
|
it "pattern should match content" do
|
21
20
|
sites = pattern.match_page(page)
|
22
|
-
expect(sites.include?('
|
21
|
+
expect(sites.include?('新闻')).to eq(true)
|
23
22
|
end
|
24
23
|
|
25
24
|
it "model should dig content" do
|
26
25
|
item = Item.new.match_page(page)
|
27
|
-
expect(item[:sites].include?('
|
26
|
+
expect(item[:sites].include?('新闻')).to be(true)
|
28
27
|
end
|
29
28
|
|
30
29
|
it "validation support" do
|
data/spec/index_spec.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'digger'
|
2
|
+
|
3
|
+
describe Digger::Index do
|
4
|
+
it 'batch digger' do
|
5
|
+
list = [1, 2, 3, 4, 5, 6, 7, 8]
|
6
|
+
pt = Digger::Index.batch(list, 3) do |num|
|
7
|
+
sleep(rand(1..3))
|
8
|
+
"##{num}"
|
9
|
+
end
|
10
|
+
expect(pt.join).to eq(list.map { |num| "##{num}" }.join)
|
11
|
+
end
|
12
|
+
|
13
|
+
it 'slow down' do
|
14
|
+
list = [1, 2, 3, 4]
|
15
|
+
conf = {
|
16
|
+
sleep_range_seconds: 1...2,
|
17
|
+
fail_unit_seconds: 1,
|
18
|
+
fail_max_cnt: 2,
|
19
|
+
when_fail: ->(_, e, nth) { puts "#{nth}: #{e.message}" }
|
20
|
+
}
|
21
|
+
pt = Digger::Index.slow_down(list, conf) do |num|
|
22
|
+
raise 'error' if num == 3
|
23
|
+
num
|
24
|
+
end
|
25
|
+
p pt
|
26
|
+
expect(pt.size).to eq(2)
|
27
|
+
end
|
28
|
+
end
|
data/spec/page_spec.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
require 'digger'
|
2
2
|
require 'json'
|
3
|
+
require 'uri'
|
4
|
+
require 'cgi'
|
3
5
|
|
4
6
|
describe Digger::Page do
|
5
7
|
it 'page json' do
|
@@ -11,4 +13,15 @@ describe Digger::Page do
|
|
11
13
|
expect(j1.json['b'][0]).to eq(1)
|
12
14
|
expect(j2.jsonp['b'][1]).to eq(2)
|
13
15
|
end
|
14
|
-
|
16
|
+
|
17
|
+
it 'fetch baidu' do
|
18
|
+
http = Digger::HTTP.new
|
19
|
+
page = http.fetch_page('http://baidu.com/')
|
20
|
+
expect(page.code).to eq(200)
|
21
|
+
end
|
22
|
+
|
23
|
+
# it 'page uri' do
|
24
|
+
# link = 'https://www.baidu.com/s?wd=%E5%93%88%E5%93%88#hello'
|
25
|
+
# link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#\w*$/, '')
|
26
|
+
# end
|
27
|
+
end
|
data/spec/pattern_spec.rb
CHANGED
@@ -2,14 +2,19 @@ require 'digger'
|
|
2
2
|
require 'json'
|
3
3
|
|
4
4
|
describe Digger::Pattern do
|
5
|
-
it 'json fetch' do
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
end
|
13
|
-
|
5
|
+
# it 'json fetch' do
|
6
|
+
# json = JSON.parse('[{"a":1,"b":[1,2,3]}]')
|
7
|
+
# pt = Digger::Pattern.new
|
8
|
+
# expect(pt.json_fetch(json, '$[0]')['a']).to eq(1)
|
9
|
+
# expect(pt.json_fetch(json, '$[0].a')).to eq(1)
|
10
|
+
# expect(pt.json_fetch(json, '$[0].b').length).to eq(3)
|
11
|
+
# expect(pt.json_fetch(json, '$[0].b[2]')).to eq(3)
|
12
|
+
# end
|
14
13
|
|
15
|
-
|
14
|
+
it 'parse cookoe' do
|
15
|
+
page = Digger::HTTP.new.fetch_page('https://xueqiu.com/')
|
16
|
+
pt = Digger::Pattern.new({ type: 'cookie', value: 'xq_a_token', block: ->(v) { "!!#{v}" } })
|
17
|
+
result = pt.match_page(page)
|
18
|
+
expect(result.length).to eq(42)
|
19
|
+
end
|
20
|
+
end
|
metadata
CHANGED
@@ -1,71 +1,71 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: digger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- binz
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-12-
|
11
|
+
date: 2021-12-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: rake
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 12.3.3
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 12.3.3
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '2.0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '2.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: http-cookie
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '1.
|
47
|
+
version: '1.0'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '1.
|
54
|
+
version: '1.0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: nokogiri
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '1.
|
61
|
+
version: '1.6'
|
62
62
|
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '1.
|
68
|
+
version: '1.6'
|
69
69
|
description: ''
|
70
70
|
email:
|
71
71
|
- xinkiang@gmail.com
|
@@ -87,6 +87,7 @@ files:
|
|
87
87
|
- lib/digger/pattern.rb
|
88
88
|
- lib/digger/version.rb
|
89
89
|
- spec/digger_spec.rb
|
90
|
+
- spec/index_spec.rb
|
90
91
|
- spec/page_spec.rb
|
91
92
|
- spec/pattern_spec.rb
|
92
93
|
- spec/validate_spec.rb
|
@@ -115,6 +116,7 @@ specification_version: 4
|
|
115
116
|
summary: Dig need stractual infomation from web page.
|
116
117
|
test_files:
|
117
118
|
- spec/digger_spec.rb
|
119
|
+
- spec/index_spec.rb
|
118
120
|
- spec/page_spec.rb
|
119
121
|
- spec/pattern_spec.rb
|
120
122
|
- spec/validate_spec.rb
|