digger 0.1.6 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/digger.gemspec +10 -9
- data/lib/digger/index.rb +42 -12
- data/lib/digger/model.rb +17 -11
- data/lib/digger/page.rb +13 -9
- data/lib/digger/pattern.rb +101 -70
- data/lib/digger/version.rb +1 -1
- data/spec/index_spec.rb +28 -0
- data/spec/page_spec.rb +7 -7
- data/spec/pattern_spec.rb +22 -10
- metadata +20 -18
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 29c3945e9377348e1152eea7f46e0f11aa2e59cc5568fad57d25ecd3d271a9df
|
|
4
|
+
data.tar.gz: 1e4862f9939aa9c62e175a39df078fe12a2f51190af1f280f6d418cbab7e6390
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 177e393de76bc35e31d6cc0eeda839d543d13fd81c40eba7d08704a131ec01396872734ea689c49fb1d09ceb4ba604fae76c3c201a2706dc9c889161038e0323
|
|
7
|
+
data.tar.gz: da76004a179aaed5cf75a96f90da3ebc739416e0ec162ff95c6aa27625590826fcc179b749156ea96937e364fb6a251515cb335a14b317001f3a47ea30330aeb
|
data/digger.gemspec
CHANGED
|
@@ -1,26 +1,27 @@
|
|
|
1
1
|
# coding: utf-8
|
|
2
|
+
|
|
2
3
|
lib = File.expand_path('../lib', __FILE__)
|
|
3
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
5
|
require 'digger/version'
|
|
5
6
|
|
|
6
7
|
Gem::Specification.new do |spec|
|
|
7
|
-
spec.name =
|
|
8
|
+
spec.name = 'digger'
|
|
8
9
|
spec.version = Digger::VERSION
|
|
9
|
-
spec.authors = [
|
|
10
|
-
spec.email = [
|
|
10
|
+
spec.authors = ['binz']
|
|
11
|
+
spec.email = ['xinkiang@gmail.com']
|
|
11
12
|
spec.summary = %q{Dig need stractual infomation from web page.}
|
|
12
13
|
spec.description = %q{}
|
|
13
|
-
spec.homepage =
|
|
14
|
-
spec.license =
|
|
14
|
+
spec.homepage = ''
|
|
15
|
+
spec.license = 'MIT'
|
|
15
16
|
|
|
16
17
|
spec.files = `git ls-files -z`.split("\x0")
|
|
17
18
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
|
18
19
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
|
19
|
-
spec.require_paths = [
|
|
20
|
+
spec.require_paths = ['lib']
|
|
20
21
|
|
|
21
|
-
spec.add_development_dependency
|
|
22
|
-
spec.add_development_dependency
|
|
22
|
+
spec.add_development_dependency 'rake', '>= 12.3.3'
|
|
23
|
+
spec.add_development_dependency 'bundler', '~> 2.0'
|
|
23
24
|
|
|
24
|
-
spec.add_runtime_dependency 'nokogiri', '~> 1.6'
|
|
25
25
|
spec.add_runtime_dependency 'http-cookie', '~> 1.0'
|
|
26
|
+
spec.add_runtime_dependency 'nokogiri', '~> 1.6'
|
|
26
27
|
end
|
data/lib/digger/index.rb
CHANGED
|
@@ -8,33 +8,63 @@ module Digger
|
|
|
8
8
|
|
|
9
9
|
def urls
|
|
10
10
|
@urls ||= begin
|
|
11
|
-
args = self.args.map{|a|
|
|
12
|
-
args.shift.product(*args).map{|arg| pattern_applied_url(arg)}
|
|
11
|
+
args = self.args.map { |a| a.respond_to?(:each) ? a.to_a : [a] }
|
|
12
|
+
args.shift.product(*args).map { |arg| pattern_applied_url(arg) }
|
|
13
13
|
end
|
|
14
14
|
end
|
|
15
15
|
|
|
16
16
|
def pattern_applied_url(arg)
|
|
17
|
-
pattern.gsub('*').each_with_index{|_, i| arg[i]}
|
|
17
|
+
pattern.gsub('*').each_with_index { |_, i| arg[i] }
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def self.slow_down(entities, conf = {}, &block)
|
|
21
|
+
raise NoBlockError, 'No block given' unless block
|
|
22
|
+
|
|
23
|
+
config = {
|
|
24
|
+
sleep_range_seconds: 4...10, # 随机等待时间范围
|
|
25
|
+
fail_max_cnt: 10, # 最多失败次数
|
|
26
|
+
fail_unit_seconds: 10 * 60, # 失败等待时间
|
|
27
|
+
when_fail: ->(ent, e, failed_cnt) {}
|
|
28
|
+
}.merge(conf)
|
|
29
|
+
failed_cnt = 0
|
|
30
|
+
cursor = 0
|
|
31
|
+
result = []
|
|
32
|
+
while cursor < entities.length
|
|
33
|
+
begin
|
|
34
|
+
result << block.call(entities[cursor])
|
|
35
|
+
rescue StandardError => e
|
|
36
|
+
failed_cnt += 1
|
|
37
|
+
config[:when_fail].call(entities[cursor], e, failed_cnt)
|
|
38
|
+
break if failed_cnt >= config[:fail_max_cnt]
|
|
39
|
+
|
|
40
|
+
sleep(failed_cnt * config[:fail_unit_seconds])
|
|
41
|
+
else
|
|
42
|
+
cursor += 1
|
|
43
|
+
sleep(rand(config[:sleep_range_seconds]))
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
result
|
|
18
47
|
end
|
|
19
48
|
|
|
20
49
|
def self.batch(entities, cocurrence = 1, &block)
|
|
21
|
-
raise NoBlockError,
|
|
50
|
+
raise NoBlockError, 'No block given' unless block
|
|
22
51
|
|
|
23
52
|
if cocurrence > 1
|
|
24
|
-
results =
|
|
25
|
-
entities.each_slice(cocurrence) do |group|
|
|
53
|
+
results = Array.new(entities.size)
|
|
54
|
+
entities.each_slice(cocurrence).with_index do |group, idx1|
|
|
26
55
|
threads = []
|
|
27
|
-
group.
|
|
56
|
+
group.each_with_index do |entity, idx2|
|
|
57
|
+
index = idx1 * cocurrence + idx2
|
|
28
58
|
threads << Thread.new(entity) do |ent|
|
|
29
|
-
results[
|
|
59
|
+
results[index] = block.call(ent)
|
|
30
60
|
end
|
|
31
61
|
end
|
|
32
|
-
threads.each
|
|
62
|
+
threads.each(&:join)
|
|
33
63
|
end
|
|
34
|
-
|
|
64
|
+
results
|
|
35
65
|
else
|
|
36
|
-
entities.map{|ent| block.call(ent) }
|
|
66
|
+
entities.map { |ent| block.call(ent) }
|
|
37
67
|
end
|
|
38
68
|
end
|
|
39
69
|
end
|
|
40
|
-
end
|
|
70
|
+
end
|
data/lib/digger/model.rb
CHANGED
|
@@ -1,16 +1,19 @@
|
|
|
1
1
|
|
|
2
2
|
module Digger
|
|
3
3
|
class Model
|
|
4
|
-
@@digger_config = {
|
|
4
|
+
@@digger_config = {
|
|
5
|
+
'pattern' => {},
|
|
6
|
+
'index' => {}
|
|
7
|
+
}
|
|
5
8
|
|
|
6
9
|
class << self
|
|
7
10
|
# patterns
|
|
8
11
|
def pattern_config
|
|
9
|
-
@@digger_config['pattern'][
|
|
12
|
+
@@digger_config['pattern'][name] ||= {}
|
|
10
13
|
end
|
|
11
14
|
|
|
12
15
|
Pattern::TYPES.each do |method|
|
|
13
|
-
define_method method, ->(pairs, &block){
|
|
16
|
+
define_method method, -> (pairs, &block) {
|
|
14
17
|
pairs.each_pair do |key, value|
|
|
15
18
|
pattern_config[key] = Pattern.new(type: method, value: value, block: block)
|
|
16
19
|
end
|
|
@@ -18,21 +21,22 @@ module Digger
|
|
|
18
21
|
end
|
|
19
22
|
|
|
20
23
|
def validate_presence(*keys)
|
|
21
|
-
|
|
22
|
-
raise "Pattern keys #{(keys - keys_all).join(', ')} should be present" unless
|
|
24
|
+
is_all = pattern_config.keys.all? { |k| keys.include?(k) }
|
|
25
|
+
raise "Pattern keys #{(keys - keys_all).join(', ')} should be present" unless is_all
|
|
23
26
|
end
|
|
24
27
|
|
|
25
28
|
def validate_includeness(*keys)
|
|
26
|
-
|
|
29
|
+
is_all = pattern_config.keys.all? { |k| keys.include?(k) }
|
|
30
|
+
raise "Pattern keys #{(pattern_config.keys - keys).join(', ')} should not be included" if is_all
|
|
27
31
|
end
|
|
28
32
|
|
|
29
33
|
# index page
|
|
30
34
|
def index_config
|
|
31
|
-
@@digger_config['index'][
|
|
35
|
+
@@digger_config['index'][name]
|
|
32
36
|
end
|
|
33
37
|
|
|
34
38
|
def index_page(pattern, *args)
|
|
35
|
-
|
|
39
|
+
@@digger_config['index'][name] = Index.new(pattern, args)
|
|
36
40
|
end
|
|
37
41
|
|
|
38
42
|
def index_page?
|
|
@@ -55,13 +59,15 @@ module Digger
|
|
|
55
59
|
end
|
|
56
60
|
|
|
57
61
|
def dig_urls(urls, cocurrence = 1, opts = {})
|
|
58
|
-
Index.batch(urls, cocurrence){|url| dig_url(url, opts) }
|
|
62
|
+
Index.batch(urls, cocurrence) { |url| dig_url(url, opts) }
|
|
59
63
|
end
|
|
60
64
|
|
|
61
65
|
def dig(cocurrence = 1)
|
|
62
66
|
if self.class.index_page?
|
|
63
|
-
self.class.index_config.process(cocurrence)
|
|
67
|
+
self.class.index_config.process(cocurrence) do |url|
|
|
68
|
+
dig_url(url)
|
|
69
|
+
end
|
|
64
70
|
end
|
|
65
71
|
end
|
|
66
72
|
end
|
|
67
|
-
end
|
|
73
|
+
end
|
data/lib/digger/page.rb
CHANGED
|
@@ -4,6 +4,7 @@ require 'ostruct'
|
|
|
4
4
|
require 'set'
|
|
5
5
|
require 'kconv'
|
|
6
6
|
require 'uri'
|
|
7
|
+
require 'http/cookie'
|
|
7
8
|
|
|
8
9
|
# https://github.com/taganaka/polipus/blob/master/lib/polipus/page.rb
|
|
9
10
|
module Digger
|
|
@@ -28,16 +29,12 @@ module Digger
|
|
|
28
29
|
# OpenStruct it holds users defined data
|
|
29
30
|
attr_accessor :user_data
|
|
30
31
|
|
|
31
|
-
attr_accessor :aliases
|
|
32
|
-
|
|
33
|
-
attr_accessor :domain_aliases
|
|
32
|
+
attr_accessor :aliases, :domain_aliases, :fetched_at
|
|
34
33
|
|
|
35
34
|
# Whether the current page should be stored
|
|
36
35
|
# Default: true
|
|
37
36
|
attr_accessor :storable
|
|
38
37
|
|
|
39
|
-
attr_accessor :fetched_at
|
|
40
|
-
|
|
41
38
|
#
|
|
42
39
|
# Create a new page
|
|
43
40
|
#
|
|
@@ -61,7 +58,7 @@ module Digger
|
|
|
61
58
|
end
|
|
62
59
|
|
|
63
60
|
def title
|
|
64
|
-
doc
|
|
61
|
+
doc&.title
|
|
65
62
|
end
|
|
66
63
|
|
|
67
64
|
#
|
|
@@ -75,6 +72,7 @@ module Digger
|
|
|
75
72
|
doc.search('//a[@href]').each do |a|
|
|
76
73
|
u = a['href']
|
|
77
74
|
next if u.nil? || u.empty?
|
|
75
|
+
|
|
78
76
|
abs = to_absolute(u) rescue next
|
|
79
77
|
@links << abs if abs && in_domain?(abs)
|
|
80
78
|
end
|
|
@@ -101,7 +99,11 @@ module Digger
|
|
|
101
99
|
end
|
|
102
100
|
|
|
103
101
|
def jsonp
|
|
104
|
-
@jsonp ||= JSON.parse body.match(/^[
|
|
102
|
+
@jsonp ||= JSON.parse body.match(/^[^(]+?\((.+)\)[^)]*$/)[1]
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
def cookies
|
|
106
|
+
@cookies ||= (headers['set-cookie'] || []).flat_map { |c| ::HTTP::Cookie.parse(c, url) }
|
|
105
107
|
end
|
|
106
108
|
|
|
107
109
|
#
|
|
@@ -163,7 +165,7 @@ module Digger
|
|
|
163
165
|
# returns +false+ otherwise.
|
|
164
166
|
#
|
|
165
167
|
def not_found?
|
|
166
|
-
|
|
168
|
+
@code == 404
|
|
167
169
|
end
|
|
168
170
|
|
|
169
171
|
#
|
|
@@ -177,6 +179,7 @@ module Digger
|
|
|
177
179
|
end unless @base
|
|
178
180
|
|
|
179
181
|
return nil if @base && @base.to_s.empty?
|
|
182
|
+
|
|
180
183
|
@base
|
|
181
184
|
end
|
|
182
185
|
|
|
@@ -245,6 +248,7 @@ module Digger
|
|
|
245
248
|
|
|
246
249
|
def expired?(ttl)
|
|
247
250
|
return false if fetched_at.nil?
|
|
251
|
+
|
|
248
252
|
(Time.now.to_i - ttl) > fetched_at
|
|
249
253
|
end
|
|
250
254
|
|
|
@@ -274,4 +278,4 @@ module Digger
|
|
|
274
278
|
from_hash hash
|
|
275
279
|
end
|
|
276
280
|
end
|
|
277
|
-
end
|
|
281
|
+
end
|
data/lib/digger/pattern.rb
CHANGED
|
@@ -1,111 +1,142 @@
|
|
|
1
1
|
require 'nokogiri'
|
|
2
2
|
|
|
3
3
|
module Digger
|
|
4
|
+
# Extractor patterns definition
|
|
4
5
|
class Pattern
|
|
5
6
|
attr_accessor :type, :value, :block
|
|
6
7
|
|
|
7
8
|
def initialize(hash = {})
|
|
8
|
-
hash.each_pair
|
|
9
|
+
hash.each_pair do |key, value|
|
|
10
|
+
send("#{key}=", value) if %w[type value block].include?(key.to_s)
|
|
11
|
+
end
|
|
9
12
|
end
|
|
10
13
|
|
|
11
|
-
def safe_block
|
|
12
|
-
block &&
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
nil
|
|
14
|
+
def safe_block(&default_block)
|
|
15
|
+
if block.nil? || (block.is_a?(String) && block.strip.empty?)
|
|
16
|
+
default_block || ->(v) { v }
|
|
17
|
+
elsif block.respond_to?(:call)
|
|
18
|
+
block
|
|
19
|
+
else
|
|
20
|
+
proc {
|
|
21
|
+
$SAFE = 2
|
|
22
|
+
eval block
|
|
23
|
+
}.call
|
|
22
24
|
end
|
|
23
25
|
end
|
|
24
26
|
|
|
25
27
|
def self.wrap(hash)
|
|
26
|
-
|
|
28
|
+
hash.transform_values { |value| value.is_a?(Pattern) ? value : Pattern.new(value) }
|
|
27
29
|
end
|
|
28
30
|
|
|
29
31
|
MATCH_MAX = 3
|
|
30
32
|
|
|
31
|
-
TYPES_REGEXP = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w
|
|
32
|
-
TYPES_CSS = %w
|
|
33
|
-
TYPES_JSON = %w
|
|
34
|
-
|
|
35
|
-
|
|
33
|
+
TYPES_REGEXP = 0.upto(MATCH_MAX).map { |i| "match_#{i}" } + %w[match_many]
|
|
34
|
+
TYPES_CSS = %w[css_one css_many].freeze
|
|
35
|
+
TYPES_JSON = %w[json jsonp].freeze
|
|
36
|
+
TYPES_OTHER = %w[cookie plain lines header body].freeze
|
|
37
|
+
|
|
38
|
+
TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON + TYPES_OTHER
|
|
39
|
+
|
|
40
|
+
def match_page(page)
|
|
41
|
+
return unless page.success?
|
|
36
42
|
|
|
37
|
-
def match_page(page, &callback)
|
|
38
|
-
blk = callback || safe_block
|
|
39
43
|
if TYPES_REGEXP.include?(type) # regular expression
|
|
40
|
-
|
|
41
|
-
# content is String
|
|
42
|
-
if type == 'match_many'
|
|
43
|
-
match = page.body.gsub(value).to_a
|
|
44
|
-
else
|
|
45
|
-
index = TYPES_REGEXP.index(type)
|
|
46
|
-
matches = page.body.match(value)
|
|
47
|
-
match = matches.nil? ? nil : matches[index]
|
|
48
|
-
end
|
|
44
|
+
regexp_match(page.body)
|
|
49
45
|
elsif TYPES_CSS.include?(type) # css expression
|
|
50
|
-
|
|
51
|
-
# content is Nokogiri::HTML::Document
|
|
52
|
-
if type == 'css_one'
|
|
53
|
-
match = page.doc.css(value).first
|
|
54
|
-
else
|
|
55
|
-
match = page.doc.css(value)
|
|
56
|
-
end
|
|
46
|
+
css_match(page.doc)
|
|
57
47
|
elsif TYPES_JSON.include?(type)
|
|
58
|
-
|
|
59
|
-
|
|
48
|
+
json_match(page)
|
|
49
|
+
elsif TYPES_OTHER.include?(type)
|
|
50
|
+
send("get_#{type}", page)
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
def get_header(page)
|
|
55
|
+
header = (page.headers[value.to_s.downcase] || []).first
|
|
56
|
+
safe_block.call(header)
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
def get_body(page)
|
|
60
|
+
safe_block.call(page.body)
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
def get_plain(page)
|
|
64
|
+
safe_block.call(page.doc.text)
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
def get_lines(page)
|
|
68
|
+
block = safe_block
|
|
69
|
+
page.body.split("\n").map(&:strip).filter { |line| !line.empty? }.map { |line| block.call(line) }
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
def get_cookie(page)
|
|
73
|
+
cookie = page.cookies.find { |c| c.name == value }&.value
|
|
74
|
+
safe_block.call(cookie)
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
def json_match(page)
|
|
78
|
+
json = page.send(type)
|
|
79
|
+
keys = json_index_keys(value)
|
|
80
|
+
match = json_fetch(json, keys)
|
|
81
|
+
safe_block.call(match)
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
def css_match(doc)
|
|
85
|
+
block = safe_block { |node| node.content.strip }
|
|
86
|
+
# content is Nokogiri::HTML::Document
|
|
87
|
+
contents = doc.css(value)
|
|
88
|
+
if type == 'css_many'
|
|
89
|
+
contents.map { |node| block.call(node) }.uniq
|
|
90
|
+
else
|
|
91
|
+
block.call(contents.first)
|
|
60
92
|
end
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
def regexp_match(body)
|
|
96
|
+
block = safe_block(&:strip)
|
|
97
|
+
# content is String
|
|
98
|
+
if type == 'match_many'
|
|
99
|
+
regexp = value.is_a?(Regexp) ? value : Regexp.new(value.to_s)
|
|
100
|
+
body.gsub(regexp).to_a.map { |node| block.call(node) }.uniq
|
|
65
101
|
else
|
|
66
|
-
|
|
102
|
+
index = TYPES_REGEXP.index(type)
|
|
103
|
+
matches = body.match(value)
|
|
104
|
+
block.call(matches[index]) unless matches.nil?
|
|
67
105
|
end
|
|
68
|
-
rescue
|
|
69
|
-
nil
|
|
70
106
|
end
|
|
71
107
|
|
|
72
108
|
def json_fetch(json, keys)
|
|
73
|
-
if keys.
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
json_fetch(json[pt[:index] || pt[:key]], keys)
|
|
85
|
-
end
|
|
109
|
+
if keys.empty?
|
|
110
|
+
json
|
|
111
|
+
else
|
|
112
|
+
pt = keys.shift
|
|
113
|
+
json_fetch(json[pt[:index] || pt[:key]], keys)
|
|
114
|
+
end
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
def json_index_keys(keys)
|
|
118
|
+
keys.to_s.match(/^\$\S*$/)[0].scan(/(\.(\w+)|\[(\d+)\])/).map do |p|
|
|
119
|
+
p[1].nil? ? { index: p[2].to_i } : { key: p[1] }
|
|
86
120
|
end
|
|
87
121
|
end
|
|
88
122
|
|
|
123
|
+
private :json_index_keys, :json_fetch
|
|
124
|
+
|
|
125
|
+
# Nokogiri node methods
|
|
89
126
|
class Nokogiri::XML::Node
|
|
90
|
-
%w
|
|
91
|
-
define_method "inner_#{name}" do |css, &block|
|
|
92
|
-
callback = ->(node)
|
|
93
|
-
if node
|
|
94
|
-
(block || ->(n){n.text.strip}).call(node)
|
|
95
|
-
else
|
|
96
|
-
nil
|
|
97
|
-
end
|
|
98
|
-
end
|
|
127
|
+
%w[one many].each do |name|
|
|
128
|
+
define_method "inner_#{name}" do |css, &block|
|
|
129
|
+
callback = ->(node) { (block || ->(n) { n.text.strip }).call(node) if node }
|
|
99
130
|
if name == 'one' # inner_one
|
|
100
131
|
callback.call(self.css(css).first)
|
|
101
132
|
else # inner_many
|
|
102
|
-
self.css(css).map{|node| callback.call(node)}
|
|
133
|
+
self.css(css).map { |node| callback.call(node) }
|
|
103
134
|
end
|
|
104
135
|
end
|
|
105
136
|
end
|
|
106
137
|
def source
|
|
107
138
|
to_xml
|
|
108
139
|
end
|
|
109
|
-
end
|
|
140
|
+
end
|
|
110
141
|
end
|
|
111
|
-
end
|
|
142
|
+
end
|
data/lib/digger/version.rb
CHANGED
data/spec/index_spec.rb
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
require 'digger'
|
|
2
|
+
|
|
3
|
+
describe Digger::Index do
|
|
4
|
+
it 'batch digger' do
|
|
5
|
+
list = [1, 2, 3, 4, 5, 6, 7, 8]
|
|
6
|
+
pt = Digger::Index.batch(list, 3) do |num|
|
|
7
|
+
sleep(rand(1..3))
|
|
8
|
+
"##{num}"
|
|
9
|
+
end
|
|
10
|
+
expect(pt.join).to eq(list.map { |num| "##{num}" }.join)
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
it 'slow down' do
|
|
14
|
+
list = [1, 2, 3, 4]
|
|
15
|
+
conf = {
|
|
16
|
+
sleep_range_seconds: 1...2,
|
|
17
|
+
fail_unit_seconds: 1,
|
|
18
|
+
fail_max_cnt: 2,
|
|
19
|
+
when_fail: ->(_, e, nth) { puts "#{nth}: #{e.message}" }
|
|
20
|
+
}
|
|
21
|
+
pt = Digger::Index.slow_down(list, conf) do |num|
|
|
22
|
+
raise 'error' if num == 3
|
|
23
|
+
num
|
|
24
|
+
end
|
|
25
|
+
p pt
|
|
26
|
+
expect(pt.size).to eq(2)
|
|
27
|
+
end
|
|
28
|
+
end
|
data/spec/page_spec.rb
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
require 'digger'
|
|
2
2
|
require 'json'
|
|
3
3
|
require 'uri'
|
|
4
|
+
require 'cgi'
|
|
4
5
|
|
|
5
6
|
describe Digger::Page do
|
|
6
7
|
it 'page json' do
|
|
@@ -15,13 +16,12 @@ describe Digger::Page do
|
|
|
15
16
|
|
|
16
17
|
it 'fetch baidu' do
|
|
17
18
|
http = Digger::HTTP.new
|
|
18
|
-
page = http.fetch_page('http://
|
|
19
|
+
page = http.fetch_page('http://baidu.com/')
|
|
19
20
|
expect(page.code).to eq(200)
|
|
20
21
|
end
|
|
21
22
|
|
|
22
|
-
it 'page uri' do
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
end
|
|
23
|
+
# it 'page uri' do
|
|
24
|
+
# link = 'https://www.baidu.com/s?wd=%E5%93%88%E5%93%88#hello'
|
|
25
|
+
# link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#\w*$/, '')
|
|
26
|
+
# end
|
|
27
|
+
end
|
data/spec/pattern_spec.rb
CHANGED
|
@@ -2,14 +2,26 @@ require 'digger'
|
|
|
2
2
|
require 'json'
|
|
3
3
|
|
|
4
4
|
describe Digger::Pattern do
|
|
5
|
-
it 'json fetch' do
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
end
|
|
13
|
-
|
|
5
|
+
# it 'json fetch' do
|
|
6
|
+
# json = JSON.parse('[{"a":1,"b":[1,2,3]}]')
|
|
7
|
+
# pt = Digger::Pattern.new
|
|
8
|
+
# expect(pt.json_fetch(json, '$[0]')['a']).to eq(1)
|
|
9
|
+
# expect(pt.json_fetch(json, '$[0].a')).to eq(1)
|
|
10
|
+
# expect(pt.json_fetch(json, '$[0].b').length).to eq(3)
|
|
11
|
+
# expect(pt.json_fetch(json, '$[0].b[2]')).to eq(3)
|
|
12
|
+
# end
|
|
14
13
|
|
|
15
|
-
|
|
14
|
+
it 'parse cookie & others' do
|
|
15
|
+
page = Digger::HTTP.new.fetch_page('https://xueqiu.com/')
|
|
16
|
+
p1 = Digger::Pattern.new({ type: 'cookie', value: 'xq_a_token', block: ->(v) { "!!#{v}" } })
|
|
17
|
+
# cookie
|
|
18
|
+
result = p1.match_page(page)
|
|
19
|
+
expect(result.length).to eq(42)
|
|
20
|
+
# header
|
|
21
|
+
p2 = Digger::Pattern.new({ type: 'header', value: 'transfer-encoding' })
|
|
22
|
+
expect(p2.match_page(page)).to eq('chunked')
|
|
23
|
+
# get_plain
|
|
24
|
+
p3 = Digger::Pattern.new({ type: 'plain' })
|
|
25
|
+
expect(p3.match_page(page).length).to be > 100
|
|
26
|
+
end
|
|
27
|
+
end
|
metadata
CHANGED
|
@@ -1,71 +1,71 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: digger
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- binz
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2021-12-
|
|
11
|
+
date: 2021-12-31 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
|
-
name:
|
|
14
|
+
name: rake
|
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
|
16
16
|
requirements:
|
|
17
|
-
- - "
|
|
17
|
+
- - ">="
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
|
-
version:
|
|
19
|
+
version: 12.3.3
|
|
20
20
|
type: :development
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
|
-
- - "
|
|
24
|
+
- - ">="
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
|
-
version:
|
|
26
|
+
version: 12.3.3
|
|
27
27
|
- !ruby/object:Gem::Dependency
|
|
28
|
-
name:
|
|
28
|
+
name: bundler
|
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
|
30
30
|
requirements:
|
|
31
|
-
- - "
|
|
31
|
+
- - "~>"
|
|
32
32
|
- !ruby/object:Gem::Version
|
|
33
|
-
version:
|
|
33
|
+
version: '2.0'
|
|
34
34
|
type: :development
|
|
35
35
|
prerelease: false
|
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
|
37
37
|
requirements:
|
|
38
|
-
- - "
|
|
38
|
+
- - "~>"
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
|
-
version:
|
|
40
|
+
version: '2.0'
|
|
41
41
|
- !ruby/object:Gem::Dependency
|
|
42
|
-
name:
|
|
42
|
+
name: http-cookie
|
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
|
44
44
|
requirements:
|
|
45
45
|
- - "~>"
|
|
46
46
|
- !ruby/object:Gem::Version
|
|
47
|
-
version: '1.
|
|
47
|
+
version: '1.0'
|
|
48
48
|
type: :runtime
|
|
49
49
|
prerelease: false
|
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
|
51
51
|
requirements:
|
|
52
52
|
- - "~>"
|
|
53
53
|
- !ruby/object:Gem::Version
|
|
54
|
-
version: '1.
|
|
54
|
+
version: '1.0'
|
|
55
55
|
- !ruby/object:Gem::Dependency
|
|
56
|
-
name:
|
|
56
|
+
name: nokogiri
|
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
|
58
58
|
requirements:
|
|
59
59
|
- - "~>"
|
|
60
60
|
- !ruby/object:Gem::Version
|
|
61
|
-
version: '1.
|
|
61
|
+
version: '1.6'
|
|
62
62
|
type: :runtime
|
|
63
63
|
prerelease: false
|
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
|
65
65
|
requirements:
|
|
66
66
|
- - "~>"
|
|
67
67
|
- !ruby/object:Gem::Version
|
|
68
|
-
version: '1.
|
|
68
|
+
version: '1.6'
|
|
69
69
|
description: ''
|
|
70
70
|
email:
|
|
71
71
|
- xinkiang@gmail.com
|
|
@@ -87,6 +87,7 @@ files:
|
|
|
87
87
|
- lib/digger/pattern.rb
|
|
88
88
|
- lib/digger/version.rb
|
|
89
89
|
- spec/digger_spec.rb
|
|
90
|
+
- spec/index_spec.rb
|
|
90
91
|
- spec/page_spec.rb
|
|
91
92
|
- spec/pattern_spec.rb
|
|
92
93
|
- spec/validate_spec.rb
|
|
@@ -115,6 +116,7 @@ specification_version: 4
|
|
|
115
116
|
summary: Dig need stractual infomation from web page.
|
|
116
117
|
test_files:
|
|
117
118
|
- spec/digger_spec.rb
|
|
119
|
+
- spec/index_spec.rb
|
|
118
120
|
- spec/page_spec.rb
|
|
119
121
|
- spec/pattern_spec.rb
|
|
120
122
|
- spec/validate_spec.rb
|