digger 0.1.4 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/digger.gemspec +10 -9
- data/lib/digger/index.rb +13 -12
- data/lib/digger/model.rb +17 -11
- data/lib/digger/page.rb +20 -18
- data/lib/digger/pattern.rb +85 -56
- data/lib/digger/version.rb +1 -1
- data/spec/digger_spec.rb +6 -7
- data/spec/index_spec.rb +12 -0
- data/spec/page_spec.rb +27 -0
- data/spec/pattern_spec.rb +20 -0
- metadata +26 -21
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 19e59bc2161a078d80d00adf538a7c33891a53f9beeb453748eec7e0810c5b65
|
4
|
+
data.tar.gz: da1d93c663b42a6e0b7be2f136bea3b0f3a86c3c36c57a72347c70e7e0538508
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d13b06c8491f9cda42f8a8fd4fa9547aa6a95d075efbf52c789760adaa66d0c2bde24f810e23e4713df4362ae82cb82b0d61996386caaad2c685eb22f1a375db
|
7
|
+
data.tar.gz: a63a5cfe70b154b446dcbd0d2937d94a33fb033277c0d17aa1500a9ccd86dc8791e3cc40e8487ecc1c609eec304b91fb4a29b75b89a74b8efe24e29c49957d4e
|
data/digger.gemspec
CHANGED
@@ -1,26 +1,27 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
|
2
3
|
lib = File.expand_path('../lib', __FILE__)
|
3
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
5
|
require 'digger/version'
|
5
6
|
|
6
7
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name =
|
8
|
+
spec.name = 'digger'
|
8
9
|
spec.version = Digger::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = [
|
10
|
+
spec.authors = ['binz']
|
11
|
+
spec.email = ['xinkiang@gmail.com']
|
11
12
|
spec.summary = %q{Dig need stractual infomation from web page.}
|
12
13
|
spec.description = %q{}
|
13
|
-
spec.homepage =
|
14
|
-
spec.license =
|
14
|
+
spec.homepage = ''
|
15
|
+
spec.license = 'MIT'
|
15
16
|
|
16
17
|
spec.files = `git ls-files -z`.split("\x0")
|
17
18
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
19
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
-
spec.require_paths = [
|
20
|
+
spec.require_paths = ['lib']
|
20
21
|
|
21
|
-
spec.add_development_dependency
|
22
|
-
spec.add_development_dependency
|
22
|
+
spec.add_development_dependency 'rake', '>= 12.3.3'
|
23
|
+
spec.add_development_dependency 'bundler', '~> 2.0'
|
23
24
|
|
24
|
-
spec.add_runtime_dependency 'nokogiri', '~> 1.6'
|
25
25
|
spec.add_runtime_dependency 'http-cookie', '~> 1.0'
|
26
|
+
spec.add_runtime_dependency 'nokogiri', '~> 1.6'
|
26
27
|
end
|
data/lib/digger/index.rb
CHANGED
@@ -8,33 +8,34 @@ module Digger
|
|
8
8
|
|
9
9
|
def urls
|
10
10
|
@urls ||= begin
|
11
|
-
args = self.args.map{|a|
|
12
|
-
args.shift.product(*args).map{|arg| pattern_applied_url(arg)}
|
11
|
+
args = self.args.map { |a| a.respond_to?(:each) ? a.to_a : [a] }
|
12
|
+
args.shift.product(*args).map { |arg| pattern_applied_url(arg) }
|
13
13
|
end
|
14
14
|
end
|
15
15
|
|
16
16
|
def pattern_applied_url(arg)
|
17
|
-
pattern.gsub('*').each_with_index{|_, i| arg[i]}
|
17
|
+
pattern.gsub('*').each_with_index { |_, i| arg[i] }
|
18
18
|
end
|
19
19
|
|
20
20
|
def self.batch(entities, cocurrence = 1, &block)
|
21
|
-
raise NoBlockError,
|
21
|
+
raise NoBlockError, 'No block given' unless block
|
22
22
|
|
23
23
|
if cocurrence > 1
|
24
|
-
results =
|
25
|
-
entities.each_slice(cocurrence) do |group|
|
24
|
+
results = Array.new(entities.size)
|
25
|
+
entities.each_slice(cocurrence).with_index do |group, idx1|
|
26
26
|
threads = []
|
27
|
-
group.
|
27
|
+
group.each_with_index do |entity, idx2|
|
28
|
+
index = idx1 * cocurrence + idx2
|
28
29
|
threads << Thread.new(entity) do |ent|
|
29
|
-
results[
|
30
|
+
results[index] = block.call(ent)
|
30
31
|
end
|
31
32
|
end
|
32
|
-
threads.each
|
33
|
+
threads.each(&:join)
|
33
34
|
end
|
34
|
-
|
35
|
+
results
|
35
36
|
else
|
36
|
-
entities.map{|ent| block.call(ent) }
|
37
|
+
entities.map { |ent| block.call(ent) }
|
37
38
|
end
|
38
39
|
end
|
39
40
|
end
|
40
|
-
end
|
41
|
+
end
|
data/lib/digger/model.rb
CHANGED
@@ -1,16 +1,19 @@
|
|
1
1
|
|
2
2
|
module Digger
|
3
3
|
class Model
|
4
|
-
@@digger_config = {
|
4
|
+
@@digger_config = {
|
5
|
+
'pattern' => {},
|
6
|
+
'index' => {}
|
7
|
+
}
|
5
8
|
|
6
9
|
class << self
|
7
10
|
# patterns
|
8
11
|
def pattern_config
|
9
|
-
@@digger_config['pattern'][
|
12
|
+
@@digger_config['pattern'][name] ||= {}
|
10
13
|
end
|
11
14
|
|
12
15
|
Pattern::TYPES.each do |method|
|
13
|
-
define_method method, ->(pairs, &block){
|
16
|
+
define_method method, -> (pairs, &block) {
|
14
17
|
pairs.each_pair do |key, value|
|
15
18
|
pattern_config[key] = Pattern.new(type: method, value: value, block: block)
|
16
19
|
end
|
@@ -18,21 +21,22 @@ module Digger
|
|
18
21
|
end
|
19
22
|
|
20
23
|
def validate_presence(*keys)
|
21
|
-
|
22
|
-
raise "Pattern keys #{(keys - keys_all).join(', ')} should be present" unless
|
24
|
+
is_all = pattern_config.keys.all? { |k| keys.include?(k) }
|
25
|
+
raise "Pattern keys #{(keys - keys_all).join(', ')} should be present" unless is_all
|
23
26
|
end
|
24
27
|
|
25
28
|
def validate_includeness(*keys)
|
26
|
-
|
29
|
+
is_all = pattern_config.keys.all? { |k| keys.include?(k) }
|
30
|
+
raise "Pattern keys #{(pattern_config.keys - keys).join(', ')} should not be included" if is_all
|
27
31
|
end
|
28
32
|
|
29
33
|
# index page
|
30
34
|
def index_config
|
31
|
-
@@digger_config['index'][
|
35
|
+
@@digger_config['index'][name]
|
32
36
|
end
|
33
37
|
|
34
38
|
def index_page(pattern, *args)
|
35
|
-
|
39
|
+
@@digger_config['index'][name] = Index.new(pattern, args)
|
36
40
|
end
|
37
41
|
|
38
42
|
def index_page?
|
@@ -55,13 +59,15 @@ module Digger
|
|
55
59
|
end
|
56
60
|
|
57
61
|
def dig_urls(urls, cocurrence = 1, opts = {})
|
58
|
-
Index.batch(urls, cocurrence){|url| dig_url(url, opts) }
|
62
|
+
Index.batch(urls, cocurrence) { |url| dig_url(url, opts) }
|
59
63
|
end
|
60
64
|
|
61
65
|
def dig(cocurrence = 1)
|
62
66
|
if self.class.index_page?
|
63
|
-
self.class.index_config.process(cocurrence)
|
67
|
+
self.class.index_config.process(cocurrence) do |url|
|
68
|
+
dig_url(url)
|
69
|
+
end
|
64
70
|
end
|
65
71
|
end
|
66
72
|
end
|
67
|
-
end
|
73
|
+
end
|
data/lib/digger/page.rb
CHANGED
@@ -3,6 +3,8 @@ require 'json'
|
|
3
3
|
require 'ostruct'
|
4
4
|
require 'set'
|
5
5
|
require 'kconv'
|
6
|
+
require 'uri'
|
7
|
+
require 'http/cookie'
|
6
8
|
|
7
9
|
# https://github.com/taganaka/polipus/blob/master/lib/polipus/page.rb
|
8
10
|
module Digger
|
@@ -27,16 +29,12 @@ module Digger
|
|
27
29
|
# OpenStruct it holds users defined data
|
28
30
|
attr_accessor :user_data
|
29
31
|
|
30
|
-
attr_accessor :aliases
|
31
|
-
|
32
|
-
attr_accessor :domain_aliases
|
32
|
+
attr_accessor :aliases, :domain_aliases, :fetched_at
|
33
33
|
|
34
34
|
# Whether the current page should be stored
|
35
35
|
# Default: true
|
36
36
|
attr_accessor :storable
|
37
37
|
|
38
|
-
attr_accessor :fetched_at
|
39
|
-
|
40
38
|
#
|
41
39
|
# Create a new page
|
42
40
|
#
|
@@ -60,7 +58,7 @@ module Digger
|
|
60
58
|
end
|
61
59
|
|
62
60
|
def title
|
63
|
-
doc
|
61
|
+
doc&.title
|
64
62
|
end
|
65
63
|
|
66
64
|
#
|
@@ -74,6 +72,7 @@ module Digger
|
|
74
72
|
doc.search('//a[@href]').each do |a|
|
75
73
|
u = a['href']
|
76
74
|
next if u.nil? || u.empty?
|
75
|
+
|
77
76
|
abs = to_absolute(u) rescue next
|
78
77
|
@links << abs if abs && in_domain?(abs)
|
79
78
|
end
|
@@ -95,7 +94,17 @@ module Digger
|
|
95
94
|
end
|
96
95
|
end
|
97
96
|
|
97
|
+
def json
|
98
|
+
@json ||= JSON.parse body
|
99
|
+
end
|
100
|
+
|
101
|
+
def jsonp
|
102
|
+
@jsonp ||= JSON.parse body.match(/^[^(]+?\((.+)\)[^)]*$/)[1]
|
103
|
+
end
|
98
104
|
|
105
|
+
def cookies
|
106
|
+
@cookies ||= (headers['set-cookie'] || []).flat_map { |c| ::HTTP::Cookie.parse(c, url) }
|
107
|
+
end
|
99
108
|
|
100
109
|
#
|
101
110
|
# Discard links, a next call of page.links will return an empty array
|
@@ -156,7 +165,7 @@ module Digger
|
|
156
165
|
# returns +false+ otherwise.
|
157
166
|
#
|
158
167
|
def not_found?
|
159
|
-
|
168
|
+
@code == 404
|
160
169
|
end
|
161
170
|
|
162
171
|
#
|
@@ -170,6 +179,7 @@ module Digger
|
|
170
179
|
end unless @base
|
171
180
|
|
172
181
|
return nil if @base && @base.to_s.empty?
|
182
|
+
|
173
183
|
@base
|
174
184
|
end
|
175
185
|
|
@@ -180,16 +190,7 @@ module Digger
|
|
180
190
|
def to_absolute(link)
|
181
191
|
return nil if link.nil?
|
182
192
|
|
183
|
-
|
184
|
-
|
185
|
-
# remove anchor
|
186
|
-
link =
|
187
|
-
begin
|
188
|
-
URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')))
|
189
|
-
rescue URI::Error
|
190
|
-
return nil
|
191
|
-
end
|
192
|
-
|
193
|
+
link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#[\w]*$/, '')
|
193
194
|
relative = begin
|
194
195
|
URI(link)
|
195
196
|
rescue URI::Error
|
@@ -247,6 +248,7 @@ module Digger
|
|
247
248
|
|
248
249
|
def expired?(ttl)
|
249
250
|
return false if fetched_at.nil?
|
251
|
+
|
250
252
|
(Time.now.to_i - ttl) > fetched_at
|
251
253
|
end
|
252
254
|
|
@@ -276,4 +278,4 @@ module Digger
|
|
276
278
|
from_hash hash
|
277
279
|
end
|
278
280
|
end
|
279
|
-
end
|
281
|
+
end
|
data/lib/digger/pattern.rb
CHANGED
@@ -1,91 +1,120 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
|
3
3
|
module Digger
|
4
|
+
# Extractor patterns definition
|
4
5
|
class Pattern
|
5
6
|
attr_accessor :type, :value, :block
|
6
7
|
|
7
8
|
def initialize(hash = {})
|
8
|
-
hash.each_pair{|key, value| send("#{key}=", value) if %w
|
9
|
+
hash.each_pair { |key, value| send("#{key}=", value) if %w[type value block].include?(key.to_s)}
|
9
10
|
end
|
10
11
|
|
11
|
-
def safe_block
|
12
|
-
block &&
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
nil
|
12
|
+
def safe_block(&default_block)
|
13
|
+
if block.nil? || (block.is_a?(String) && block.strip.empty?)
|
14
|
+
default_block || ->(v) { v }
|
15
|
+
elsif block.respond_to?(:call)
|
16
|
+
block
|
17
|
+
else
|
18
|
+
proc {
|
19
|
+
$SAFE = 2
|
20
|
+
eval block
|
21
|
+
}.call
|
22
22
|
end
|
23
23
|
end
|
24
24
|
|
25
25
|
def self.wrap(hash)
|
26
|
-
|
26
|
+
hash.transform_values { |value| value.is_a?(Pattern) ? value : Pattern.new(value) }
|
27
27
|
end
|
28
28
|
|
29
29
|
MATCH_MAX = 3
|
30
|
-
|
31
|
-
TYPES = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many css_one css_many}
|
32
30
|
|
33
|
-
|
34
|
-
|
31
|
+
TYPES_REGEXP = 0.upto(MATCH_MAX).map { |i| "match_#{i}" } + %w[match_many]
|
32
|
+
TYPES_CSS = %w[css_one css_many].freeze
|
33
|
+
TYPES_JSON = %w[json jsonp].freeze
|
34
|
+
|
35
|
+
TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON + ['cookie']
|
36
|
+
|
37
|
+
def match_page(page)
|
38
|
+
return unless page.success?
|
39
|
+
|
40
|
+
if TYPES_REGEXP.include?(type) # regular expression
|
41
|
+
regexp_match(page.body)
|
42
|
+
elsif TYPES_CSS.include?(type) # css expression
|
43
|
+
css_match(page.doc)
|
44
|
+
elsif TYPES_JSON.include?(type)
|
45
|
+
json_match(page)
|
46
|
+
else
|
47
|
+
cookie_get(page.cookies)
|
48
|
+
end
|
35
49
|
end
|
36
50
|
|
37
|
-
def
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
51
|
+
def cookie_get(cookies)
|
52
|
+
cookie = cookies.find { |c| c.name == value }&.value
|
53
|
+
safe_block.call(cookie)
|
54
|
+
end
|
55
|
+
|
56
|
+
def json_match(page)
|
57
|
+
json = page.send(type)
|
58
|
+
keys = json_index_keys(value)
|
59
|
+
match = json_fetch(json, keys)
|
60
|
+
safe_block.call(match)
|
61
|
+
end
|
62
|
+
|
63
|
+
def css_match(doc)
|
64
|
+
block = safe_block { |node| node.content.strip }
|
65
|
+
# content is Nokogiri::HTML::Document
|
66
|
+
contents = doc.css(value)
|
67
|
+
if type == 'css_many'
|
68
|
+
contents.map { |node| block.call(node) }.uniq
|
69
|
+
else
|
70
|
+
block.call(contents.first)
|
57
71
|
end
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
72
|
+
end
|
73
|
+
|
74
|
+
def regexp_match(body)
|
75
|
+
block = safe_block(&:strip)
|
76
|
+
# content is String
|
77
|
+
if type == 'match_many'
|
78
|
+
body.gsub(value).to_a.map { |node| block.call(node) }.uniq
|
79
|
+
else
|
80
|
+
index = TYPES_REGEXP.index(type)
|
81
|
+
matches = body.match(value)
|
82
|
+
block.call(matches[index]) unless matches.nil?
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
def json_fetch(json, keys)
|
87
|
+
if keys.empty?
|
88
|
+
json
|
62
89
|
else
|
63
|
-
|
90
|
+
pt = keys.shift
|
91
|
+
json_fetch(json[pt[:index] || pt[:key]], keys)
|
64
92
|
end
|
65
|
-
rescue
|
66
|
-
nil
|
67
93
|
end
|
68
94
|
|
95
|
+
def json_index_keys(keys)
|
96
|
+
keys.to_s.match(/^\$\S*$/)[0].scan(/(\.(\w+)|\[(\d+)\])/).map do |p|
|
97
|
+
p[1].nil? ? { index: p[2].to_i } : { key: p[1] }
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
private :json_index_keys, :json_fetch
|
102
|
+
|
103
|
+
# Nokogiri node methods
|
69
104
|
class Nokogiri::XML::Node
|
70
|
-
%w
|
71
|
-
define_method "inner_#{name}" do |css, &block|
|
72
|
-
callback = ->(node)
|
73
|
-
if node
|
74
|
-
(block || ->(n){n.text.strip}).call(node)
|
75
|
-
else
|
76
|
-
nil
|
77
|
-
end
|
78
|
-
end
|
105
|
+
%w[one many].each do |name|
|
106
|
+
define_method "inner_#{name}" do |css, &block|
|
107
|
+
callback = ->(node) { (block || ->(n) { n.text.strip }).call(node) if node }
|
79
108
|
if name == 'one' # inner_one
|
80
109
|
callback.call(self.css(css).first)
|
81
110
|
else # inner_many
|
82
|
-
self.css(css).map{|node| callback.call(node)}
|
111
|
+
self.css(css).map { |node| callback.call(node) }
|
83
112
|
end
|
84
113
|
end
|
85
114
|
end
|
86
115
|
def source
|
87
116
|
to_xml
|
88
117
|
end
|
89
|
-
end
|
118
|
+
end
|
90
119
|
end
|
91
|
-
end
|
120
|
+
end
|
data/lib/digger/version.rb
CHANGED
data/spec/digger_spec.rb
CHANGED
@@ -1,15 +1,14 @@
|
|
1
1
|
require 'digger'
|
2
2
|
|
3
3
|
http = Digger::HTTP.new
|
4
|
-
page = http.fetch_page('http://
|
4
|
+
page = http.fetch_page('http://www.baidu.com/')
|
5
5
|
|
6
|
-
pattern = Digger::Pattern.new({type: 'css_many', value: '
|
6
|
+
pattern = Digger::Pattern.new({ type: 'css_many', value: '#s-top-left>a' })
|
7
7
|
|
8
8
|
class Item < Digger::Model
|
9
|
-
css_many sites: '
|
10
|
-
css_one logo: '.logo'
|
9
|
+
css_many sites: '#s-top-left>a'
|
11
10
|
validate_presence :sites
|
12
|
-
validate_includeness :sites
|
11
|
+
validate_includeness :sites
|
13
12
|
end
|
14
13
|
|
15
14
|
describe Digger do
|
@@ -19,12 +18,12 @@ describe Digger do
|
|
19
18
|
|
20
19
|
it "pattern should match content" do
|
21
20
|
sites = pattern.match_page(page)
|
22
|
-
expect(sites.include?('
|
21
|
+
expect(sites.include?('新闻')).to eq(true)
|
23
22
|
end
|
24
23
|
|
25
24
|
it "model should dig content" do
|
26
25
|
item = Item.new.match_page(page)
|
27
|
-
expect(item[:sites].include?('
|
26
|
+
expect(item[:sites].include?('新闻')).to be(true)
|
28
27
|
end
|
29
28
|
|
30
29
|
it "validation support" do
|
data/spec/index_spec.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'digger'
|
2
|
+
|
3
|
+
describe Digger::Index do
|
4
|
+
it 'batch digger' do
|
5
|
+
list = [1, 2, 3, 4, 5, 6, 7, 8]
|
6
|
+
pt = Digger::Index.batch(list, 3) do |num|
|
7
|
+
sleep(rand(1..3))
|
8
|
+
"##{num}"
|
9
|
+
end
|
10
|
+
expect(pt.join).to eq(list.map { |num| "##{num}" }.join)
|
11
|
+
end
|
12
|
+
end
|
data/spec/page_spec.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'digger'
|
2
|
+
require 'json'
|
3
|
+
require 'uri'
|
4
|
+
require 'cgi'
|
5
|
+
|
6
|
+
describe Digger::Page do
|
7
|
+
it 'page json' do
|
8
|
+
json_str = '{"a":1,"b":[1,2,3]}'
|
9
|
+
j1 = Digger::Page.new('', body: json_str)
|
10
|
+
j2 = Digger::Page.new('', body: "hello(#{json_str});")
|
11
|
+
expect(j1.json['a']).to eq(1)
|
12
|
+
expect(j2.jsonp['a']).to eq(1)
|
13
|
+
expect(j1.json['b'][0]).to eq(1)
|
14
|
+
expect(j2.jsonp['b'][1]).to eq(2)
|
15
|
+
end
|
16
|
+
|
17
|
+
it 'fetch baidu' do
|
18
|
+
http = Digger::HTTP.new
|
19
|
+
page = http.fetch_page('http://baidu.com/')
|
20
|
+
expect(page.code).to eq(200)
|
21
|
+
end
|
22
|
+
|
23
|
+
# it 'page uri' do
|
24
|
+
# link = 'https://www.baidu.com/s?wd=%E5%93%88%E5%93%88#hello'
|
25
|
+
# link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#\w*$/, '')
|
26
|
+
# end
|
27
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'digger'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
describe Digger::Pattern do
|
5
|
+
# it 'json fetch' do
|
6
|
+
# json = JSON.parse('[{"a":1,"b":[1,2,3]}]')
|
7
|
+
# pt = Digger::Pattern.new
|
8
|
+
# expect(pt.json_fetch(json, '$[0]')['a']).to eq(1)
|
9
|
+
# expect(pt.json_fetch(json, '$[0].a')).to eq(1)
|
10
|
+
# expect(pt.json_fetch(json, '$[0].b').length).to eq(3)
|
11
|
+
# expect(pt.json_fetch(json, '$[0].b[2]')).to eq(3)
|
12
|
+
# end
|
13
|
+
|
14
|
+
it 'parse cookoe' do
|
15
|
+
page = Digger::HTTP.new.fetch_page('https://xueqiu.com/')
|
16
|
+
pt = Digger::Pattern.new({ type: 'cookie', value: 'xq_a_token', block: ->(v) { "!!#{v}" } })
|
17
|
+
result = pt.match_page(page)
|
18
|
+
expect(result.length).to eq(42)
|
19
|
+
end
|
20
|
+
end
|
metadata
CHANGED
@@ -1,71 +1,71 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: digger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- binz
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-12-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: rake
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 12.3.3
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 12.3.3
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '2.0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '2.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: http-cookie
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '1.
|
47
|
+
version: '1.0'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '1.
|
54
|
+
version: '1.0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: nokogiri
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '1.
|
61
|
+
version: '1.6'
|
62
62
|
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '1.
|
68
|
+
version: '1.6'
|
69
69
|
description: ''
|
70
70
|
email:
|
71
71
|
- xinkiang@gmail.com
|
@@ -87,12 +87,15 @@ files:
|
|
87
87
|
- lib/digger/pattern.rb
|
88
88
|
- lib/digger/version.rb
|
89
89
|
- spec/digger_spec.rb
|
90
|
+
- spec/index_spec.rb
|
91
|
+
- spec/page_spec.rb
|
92
|
+
- spec/pattern_spec.rb
|
90
93
|
- spec/validate_spec.rb
|
91
94
|
homepage: ''
|
92
95
|
licenses:
|
93
96
|
- MIT
|
94
97
|
metadata: {}
|
95
|
-
post_install_message:
|
98
|
+
post_install_message:
|
96
99
|
rdoc_options: []
|
97
100
|
require_paths:
|
98
101
|
- lib
|
@@ -107,11 +110,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
107
110
|
- !ruby/object:Gem::Version
|
108
111
|
version: '0'
|
109
112
|
requirements: []
|
110
|
-
|
111
|
-
|
112
|
-
signing_key:
|
113
|
+
rubygems_version: 3.2.32
|
114
|
+
signing_key:
|
113
115
|
specification_version: 4
|
114
116
|
summary: Dig need stractual infomation from web page.
|
115
117
|
test_files:
|
116
118
|
- spec/digger_spec.rb
|
119
|
+
- spec/index_spec.rb
|
120
|
+
- spec/page_spec.rb
|
121
|
+
- spec/pattern_spec.rb
|
117
122
|
- spec/validate_spec.rb
|