digger 0.1.1 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/digger.gemspec +1 -1
- data/lib/digger/http.rb +9 -2
- data/lib/digger/model.rb +13 -4
- data/lib/digger/page.rb +6 -0
- data/lib/digger/pattern.rb +31 -11
- data/lib/digger/version.rb +1 -1
- data/spec/digger_spec.rb +6 -0
- data/spec/page_spec.rb +14 -0
- data/spec/pattern_spec.rb +15 -0
- data/spec/validate_spec.rb +10 -0
- metadata +14 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 5c4a94163a25d4b53ad5b477040f69b8fccca026adc313f8f61759317c1bf198
|
4
|
+
data.tar.gz: 307b443277c16708103c172e5fb4ef4d833f7d2631a7f85779570a3cbeac8925
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9da40123fd09615d0c69ca5104d1141b82981813ef8d175bc567a0f35e8f7dd868ce235bc0f265308e2133710f63c3ce8325d6b528ae060eccb77904c12e3139
|
7
|
+
data.tar.gz: c05be67df6db25345acfdc3615690c0467029dd40ffde262863923b6f0786696a16fd547277ec4d3764ed39b399174dd28e10ca7eb31ffb55f773e08b04f2986
|
data/digger.gemspec
CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ["lib"]
|
20
20
|
|
21
|
-
spec.add_development_dependency "bundler", "~>
|
21
|
+
spec.add_development_dependency "bundler", "~> 2.0"
|
22
22
|
spec.add_development_dependency "rake", "~> 10.0"
|
23
23
|
|
24
24
|
spec.add_runtime_dependency 'nokogiri', '~> 1.6'
|
data/lib/digger/http.rb
CHANGED
@@ -49,7 +49,7 @@ module Digger
|
|
49
49
|
url = URI(url)
|
50
50
|
pages = []
|
51
51
|
get(url, referer) do |response, code, location, redirect_to, response_time|
|
52
|
-
handle_compression response
|
52
|
+
handle_compression response if handle_compression?
|
53
53
|
pages << Page.new(location, body: response.body,
|
54
54
|
code: code,
|
55
55
|
headers: response.to_hash,
|
@@ -70,6 +70,13 @@ module Digger
|
|
70
70
|
[Page.new(url, error: e, referer: referer, depth: depth)]
|
71
71
|
end
|
72
72
|
|
73
|
+
#
|
74
|
+
# Accept response compression, may bring encoding error if true
|
75
|
+
#
|
76
|
+
def handle_compression?
|
77
|
+
@opts[:handle_compression]
|
78
|
+
end
|
79
|
+
|
73
80
|
#
|
74
81
|
# The maximum number of redirects to follow
|
75
82
|
#
|
@@ -185,7 +192,7 @@ module Digger
|
|
185
192
|
opts['User-Agent'] = user_agent if user_agent
|
186
193
|
opts['Referer'] = referer.to_s if referer
|
187
194
|
opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
|
188
|
-
opts['Accept-Encoding'] = 'gzip,deflate'
|
195
|
+
opts['Accept-Encoding'] = 'gzip,deflate' if handle_compression?
|
189
196
|
|
190
197
|
retries = 0
|
191
198
|
begin
|
data/lib/digger/model.rb
CHANGED
@@ -17,6 +17,15 @@ module Digger
|
|
17
17
|
}
|
18
18
|
end
|
19
19
|
|
20
|
+
def validate_presence(*keys)
|
21
|
+
keys_all = pattern_config.keys
|
22
|
+
raise "Pattern keys #{(keys - keys_all).join(', ')} should be present" unless keys.all?{|k| keys_all.include?(k) }
|
23
|
+
end
|
24
|
+
|
25
|
+
def validate_includeness(*keys)
|
26
|
+
raise "Pattern keys #{(pattern_config.keys - keys).join(', ')} should not be included" unless pattern_config.keys.all?{|k| keys.include?(k)}
|
27
|
+
end
|
28
|
+
|
20
29
|
# index page
|
21
30
|
def index_config
|
22
31
|
@@digger_config['index'][self.name]
|
@@ -39,14 +48,14 @@ module Digger
|
|
39
48
|
result
|
40
49
|
end
|
41
50
|
|
42
|
-
def dig_url(url)
|
43
|
-
client = Digger::HTTP.new
|
51
|
+
def dig_url(url, opts = {})
|
52
|
+
client = Digger::HTTP.new(opts)
|
44
53
|
page = client.fetch_page(url)
|
45
54
|
match_page(page)
|
46
55
|
end
|
47
56
|
|
48
|
-
def dig_urls(urls, cocurrence = 1)
|
49
|
-
Index.batch(urls, cocurrence){|url| dig_url(url) }
|
57
|
+
def dig_urls(urls, cocurrence = 1, opts = {})
|
58
|
+
Index.batch(urls, cocurrence){|url| dig_url(url, opts) }
|
50
59
|
end
|
51
60
|
|
52
61
|
def dig(cocurrence = 1)
|
data/lib/digger/page.rb
CHANGED
@@ -95,7 +95,13 @@ module Digger
|
|
95
95
|
end
|
96
96
|
end
|
97
97
|
|
98
|
+
def json
|
99
|
+
@json ||= JSON.parse body
|
100
|
+
end
|
98
101
|
|
102
|
+
def jsonp
|
103
|
+
@jsonp ||= JSON.parse body.match(/^[^\(]+?\((.+)\)[^\)]*$/)[1]
|
104
|
+
end
|
99
105
|
|
100
106
|
#
|
101
107
|
# Discard links, a next call of page.links will return an empty array
|
data/lib/digger/pattern.rb
CHANGED
@@ -27,33 +27,36 @@ module Digger
|
|
27
27
|
end
|
28
28
|
|
29
29
|
MATCH_MAX = 3
|
30
|
-
|
31
|
-
TYPES = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many css_one css_many}
|
32
30
|
|
33
|
-
|
34
|
-
|
35
|
-
|
31
|
+
TYPES_REGEXP = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many}
|
32
|
+
TYPES_CSS = %w{css_one css_many}
|
33
|
+
TYPES_JSON = %w{json jsonp}
|
34
|
+
|
35
|
+
TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON
|
36
36
|
|
37
37
|
def match_page(page, &callback)
|
38
38
|
blk = callback || safe_block
|
39
|
-
if
|
40
|
-
|
41
|
-
blk ||= ->(text){text.strip}
|
39
|
+
if TYPES_REGEXP.include?(type) # regular expression
|
40
|
+
blk ||= ->(text){ text.strip }
|
42
41
|
# content is String
|
43
42
|
if type == 'match_many'
|
44
43
|
match = page.body.gsub(value).to_a
|
45
44
|
else
|
45
|
+
index = TYPES_REGEXP.index(type)
|
46
46
|
matches = page.body.match(value)
|
47
47
|
match = matches.nil? ? nil : matches[index]
|
48
48
|
end
|
49
|
-
|
50
|
-
blk ||= ->(node){node.content.strip}
|
49
|
+
elsif TYPES_CSS.include?(type) # css expression
|
50
|
+
blk ||= ->(node){ node.content.strip }
|
51
51
|
# content is Nokogiri::HTML::Document
|
52
52
|
if type == 'css_one'
|
53
53
|
match = page.doc.css(value).first
|
54
|
-
|
54
|
+
else
|
55
55
|
match = page.doc.css(value)
|
56
56
|
end
|
57
|
+
elsif TYPES_JSON.include?(type)
|
58
|
+
json = page.send(type)
|
59
|
+
match = json_fetch(json, value)
|
57
60
|
end
|
58
61
|
if match.nil?
|
59
62
|
nil
|
@@ -66,6 +69,23 @@ module Digger
|
|
66
69
|
nil
|
67
70
|
end
|
68
71
|
|
72
|
+
def json_fetch(json, keys)
|
73
|
+
if keys.is_a? String
|
74
|
+
# parse json keys like '$.k1.k2[0]'
|
75
|
+
parts = keys.match(/^\$[\S]*$/)[0].scan(/(\.([\w]+)|\[([\d]+)\])/).map do |p|
|
76
|
+
p[1].nil? ? { index: p[2].to_i } : { key: p[1] }
|
77
|
+
end
|
78
|
+
json_fetch(json, parts)
|
79
|
+
elsif keys.is_a? Array
|
80
|
+
if keys.length == 0
|
81
|
+
json
|
82
|
+
else
|
83
|
+
pt = keys.shift
|
84
|
+
json_fetch(json[pt[:index] || pt[:key]], keys)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
69
89
|
class Nokogiri::XML::Node
|
70
90
|
%w{one many}.each do |name|
|
71
91
|
define_method "inner_#{name}" do |css, &block|
|
data/lib/digger/version.rb
CHANGED
data/spec/digger_spec.rb
CHANGED
@@ -7,6 +7,9 @@ pattern = Digger::Pattern.new({type: 'css_many', value: '.sites>a>span' })
|
|
7
7
|
|
8
8
|
class Item < Digger::Model
|
9
9
|
css_many sites: '.sites>a>span'
|
10
|
+
css_one logo: '.logo'
|
11
|
+
validate_presence :sites
|
12
|
+
validate_includeness :sites, :logo
|
10
13
|
end
|
11
14
|
|
12
15
|
describe Digger do
|
@@ -24,6 +27,9 @@ describe Digger do
|
|
24
27
|
expect(item[:sites].include?('读远')).to be(true)
|
25
28
|
end
|
26
29
|
|
30
|
+
it "validation support" do
|
31
|
+
end
|
32
|
+
|
27
33
|
it "index multiple threading" do
|
28
34
|
|
29
35
|
end
|
data/spec/page_spec.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'digger'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
describe Digger::Page do
|
5
|
+
it 'page json' do
|
6
|
+
json_str = '{"a":1,"b":[1,2,3]}'
|
7
|
+
j1 = Digger::Page.new('', body: json_str)
|
8
|
+
j2 = Digger::Page.new('', body: "hello(#{json_str});")
|
9
|
+
expect(j1.json['a']).to eq(1)
|
10
|
+
expect(j2.jsonp['a']).to eq(1)
|
11
|
+
expect(j1.json['b'][0]).to eq(1)
|
12
|
+
expect(j2.jsonp['b'][1]).to eq(2)
|
13
|
+
end
|
14
|
+
end
|
data/spec/pattern_spec.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'digger'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
describe Digger::Pattern do
|
5
|
+
it 'json fetch' do
|
6
|
+
json = JSON.parse('{"a":1,"b":[1,2,3]}')
|
7
|
+
pt = Digger::Pattern.new
|
8
|
+
expect(pt.json_fetch(json, '$')['a']).to eq(1)
|
9
|
+
expect(pt.json_fetch(json, '$.a')).to eq(1)
|
10
|
+
expect(pt.json_fetch(json, '$.b').length).to eq(3)
|
11
|
+
expect(pt.json_fetch(json, '$.b[2]')).to eq(3)
|
12
|
+
end
|
13
|
+
|
14
|
+
|
15
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: digger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- binz
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-12-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '2.0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '2.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -87,11 +87,14 @@ files:
|
|
87
87
|
- lib/digger/pattern.rb
|
88
88
|
- lib/digger/version.rb
|
89
89
|
- spec/digger_spec.rb
|
90
|
+
- spec/page_spec.rb
|
91
|
+
- spec/pattern_spec.rb
|
92
|
+
- spec/validate_spec.rb
|
90
93
|
homepage: ''
|
91
94
|
licenses:
|
92
95
|
- MIT
|
93
96
|
metadata: {}
|
94
|
-
post_install_message:
|
97
|
+
post_install_message:
|
95
98
|
rdoc_options: []
|
96
99
|
require_paths:
|
97
100
|
- lib
|
@@ -106,10 +109,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
106
109
|
- !ruby/object:Gem::Version
|
107
110
|
version: '0'
|
108
111
|
requirements: []
|
109
|
-
|
110
|
-
|
111
|
-
signing_key:
|
112
|
+
rubygems_version: 3.2.32
|
113
|
+
signing_key:
|
112
114
|
specification_version: 4
|
113
115
|
summary: Dig need stractual infomation from web page.
|
114
116
|
test_files:
|
115
117
|
- spec/digger_spec.rb
|
118
|
+
- spec/page_spec.rb
|
119
|
+
- spec/pattern_spec.rb
|
120
|
+
- spec/validate_spec.rb
|