digger 0.1.1 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/digger.gemspec +1 -1
- data/lib/digger/http.rb +9 -2
- data/lib/digger/model.rb +13 -4
- data/lib/digger/page.rb +6 -0
- data/lib/digger/pattern.rb +31 -11
- data/lib/digger/version.rb +1 -1
- data/spec/digger_spec.rb +6 -0
- data/spec/page_spec.rb +14 -0
- data/spec/pattern_spec.rb +15 -0
- data/spec/validate_spec.rb +10 -0
- metadata +14 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 5c4a94163a25d4b53ad5b477040f69b8fccca026adc313f8f61759317c1bf198
|
4
|
+
data.tar.gz: 307b443277c16708103c172e5fb4ef4d833f7d2631a7f85779570a3cbeac8925
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9da40123fd09615d0c69ca5104d1141b82981813ef8d175bc567a0f35e8f7dd868ce235bc0f265308e2133710f63c3ce8325d6b528ae060eccb77904c12e3139
|
7
|
+
data.tar.gz: c05be67df6db25345acfdc3615690c0467029dd40ffde262863923b6f0786696a16fd547277ec4d3764ed39b399174dd28e10ca7eb31ffb55f773e08b04f2986
|
data/digger.gemspec
CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ["lib"]
|
20
20
|
|
21
|
-
spec.add_development_dependency "bundler", "~>
|
21
|
+
spec.add_development_dependency "bundler", "~> 2.0"
|
22
22
|
spec.add_development_dependency "rake", "~> 10.0"
|
23
23
|
|
24
24
|
spec.add_runtime_dependency 'nokogiri', '~> 1.6'
|
data/lib/digger/http.rb
CHANGED
@@ -49,7 +49,7 @@ module Digger
|
|
49
49
|
url = URI(url)
|
50
50
|
pages = []
|
51
51
|
get(url, referer) do |response, code, location, redirect_to, response_time|
|
52
|
-
handle_compression response
|
52
|
+
handle_compression response if handle_compression?
|
53
53
|
pages << Page.new(location, body: response.body,
|
54
54
|
code: code,
|
55
55
|
headers: response.to_hash,
|
@@ -70,6 +70,13 @@ module Digger
|
|
70
70
|
[Page.new(url, error: e, referer: referer, depth: depth)]
|
71
71
|
end
|
72
72
|
|
73
|
+
#
|
74
|
+
# Accept response compression, may bring encoding error if true
|
75
|
+
#
|
76
|
+
def handle_compression?
|
77
|
+
@opts[:handle_compression]
|
78
|
+
end
|
79
|
+
|
73
80
|
#
|
74
81
|
# The maximum number of redirects to follow
|
75
82
|
#
|
@@ -185,7 +192,7 @@ module Digger
|
|
185
192
|
opts['User-Agent'] = user_agent if user_agent
|
186
193
|
opts['Referer'] = referer.to_s if referer
|
187
194
|
opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
|
188
|
-
opts['Accept-Encoding'] = 'gzip,deflate'
|
195
|
+
opts['Accept-Encoding'] = 'gzip,deflate' if handle_compression?
|
189
196
|
|
190
197
|
retries = 0
|
191
198
|
begin
|
data/lib/digger/model.rb
CHANGED
@@ -17,6 +17,15 @@ module Digger
|
|
17
17
|
}
|
18
18
|
end
|
19
19
|
|
20
|
+
def validate_presence(*keys)
|
21
|
+
keys_all = pattern_config.keys
|
22
|
+
raise "Pattern keys #{(keys - keys_all).join(', ')} should be present" unless keys.all?{|k| keys_all.include?(k) }
|
23
|
+
end
|
24
|
+
|
25
|
+
def validate_includeness(*keys)
|
26
|
+
raise "Pattern keys #{(pattern_config.keys - keys).join(', ')} should not be included" unless pattern_config.keys.all?{|k| keys.include?(k)}
|
27
|
+
end
|
28
|
+
|
20
29
|
# index page
|
21
30
|
def index_config
|
22
31
|
@@digger_config['index'][self.name]
|
@@ -39,14 +48,14 @@ module Digger
|
|
39
48
|
result
|
40
49
|
end
|
41
50
|
|
42
|
-
def dig_url(url)
|
43
|
-
client = Digger::HTTP.new
|
51
|
+
def dig_url(url, opts = {})
|
52
|
+
client = Digger::HTTP.new(opts)
|
44
53
|
page = client.fetch_page(url)
|
45
54
|
match_page(page)
|
46
55
|
end
|
47
56
|
|
48
|
-
def dig_urls(urls, cocurrence = 1)
|
49
|
-
Index.batch(urls, cocurrence){|url| dig_url(url) }
|
57
|
+
def dig_urls(urls, cocurrence = 1, opts = {})
|
58
|
+
Index.batch(urls, cocurrence){|url| dig_url(url, opts) }
|
50
59
|
end
|
51
60
|
|
52
61
|
def dig(cocurrence = 1)
|
data/lib/digger/page.rb
CHANGED
@@ -95,7 +95,13 @@ module Digger
|
|
95
95
|
end
|
96
96
|
end
|
97
97
|
|
98
|
+
def json
|
99
|
+
@json ||= JSON.parse body
|
100
|
+
end
|
98
101
|
|
102
|
+
def jsonp
|
103
|
+
@jsonp ||= JSON.parse body.match(/^[^\(]+?\((.+)\)[^\)]*$/)[1]
|
104
|
+
end
|
99
105
|
|
100
106
|
#
|
101
107
|
# Discard links, a next call of page.links will return an empty array
|
data/lib/digger/pattern.rb
CHANGED
@@ -27,33 +27,36 @@ module Digger
|
|
27
27
|
end
|
28
28
|
|
29
29
|
MATCH_MAX = 3
|
30
|
-
|
31
|
-
TYPES = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many css_one css_many}
|
32
30
|
|
33
|
-
|
34
|
-
|
35
|
-
|
31
|
+
TYPES_REGEXP = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many}
|
32
|
+
TYPES_CSS = %w{css_one css_many}
|
33
|
+
TYPES_JSON = %w{json jsonp}
|
34
|
+
|
35
|
+
TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON
|
36
36
|
|
37
37
|
def match_page(page, &callback)
|
38
38
|
blk = callback || safe_block
|
39
|
-
if
|
40
|
-
|
41
|
-
blk ||= ->(text){text.strip}
|
39
|
+
if TYPES_REGEXP.include?(type) # regular expression
|
40
|
+
blk ||= ->(text){ text.strip }
|
42
41
|
# content is String
|
43
42
|
if type == 'match_many'
|
44
43
|
match = page.body.gsub(value).to_a
|
45
44
|
else
|
45
|
+
index = TYPES_REGEXP.index(type)
|
46
46
|
matches = page.body.match(value)
|
47
47
|
match = matches.nil? ? nil : matches[index]
|
48
48
|
end
|
49
|
-
|
50
|
-
blk ||= ->(node){node.content.strip}
|
49
|
+
elsif TYPES_CSS.include?(type) # css expression
|
50
|
+
blk ||= ->(node){ node.content.strip }
|
51
51
|
# content is Nokogiri::HTML::Document
|
52
52
|
if type == 'css_one'
|
53
53
|
match = page.doc.css(value).first
|
54
|
-
|
54
|
+
else
|
55
55
|
match = page.doc.css(value)
|
56
56
|
end
|
57
|
+
elsif TYPES_JSON.include?(type)
|
58
|
+
json = page.send(type)
|
59
|
+
match = json_fetch(json, value)
|
57
60
|
end
|
58
61
|
if match.nil?
|
59
62
|
nil
|
@@ -66,6 +69,23 @@ module Digger
|
|
66
69
|
nil
|
67
70
|
end
|
68
71
|
|
72
|
+
def json_fetch(json, keys)
|
73
|
+
if keys.is_a? String
|
74
|
+
# parse json keys like '$.k1.k2[0]'
|
75
|
+
parts = keys.match(/^\$[\S]*$/)[0].scan(/(\.([\w]+)|\[([\d]+)\])/).map do |p|
|
76
|
+
p[1].nil? ? { index: p[2].to_i } : { key: p[1] }
|
77
|
+
end
|
78
|
+
json_fetch(json, parts)
|
79
|
+
elsif keys.is_a? Array
|
80
|
+
if keys.length == 0
|
81
|
+
json
|
82
|
+
else
|
83
|
+
pt = keys.shift
|
84
|
+
json_fetch(json[pt[:index] || pt[:key]], keys)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
69
89
|
class Nokogiri::XML::Node
|
70
90
|
%w{one many}.each do |name|
|
71
91
|
define_method "inner_#{name}" do |css, &block|
|
data/lib/digger/version.rb
CHANGED
data/spec/digger_spec.rb
CHANGED
@@ -7,6 +7,9 @@ pattern = Digger::Pattern.new({type: 'css_many', value: '.sites>a>span' })
|
|
7
7
|
|
8
8
|
class Item < Digger::Model
|
9
9
|
css_many sites: '.sites>a>span'
|
10
|
+
css_one logo: '.logo'
|
11
|
+
validate_presence :sites
|
12
|
+
validate_includeness :sites, :logo
|
10
13
|
end
|
11
14
|
|
12
15
|
describe Digger do
|
@@ -24,6 +27,9 @@ describe Digger do
|
|
24
27
|
expect(item[:sites].include?('读远')).to be(true)
|
25
28
|
end
|
26
29
|
|
30
|
+
it "validation support" do
|
31
|
+
end
|
32
|
+
|
27
33
|
it "index multiple threading" do
|
28
34
|
|
29
35
|
end
|
data/spec/page_spec.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'digger'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
describe Digger::Page do
|
5
|
+
it 'page json' do
|
6
|
+
json_str = '{"a":1,"b":[1,2,3]}'
|
7
|
+
j1 = Digger::Page.new('', body: json_str)
|
8
|
+
j2 = Digger::Page.new('', body: "hello(#{json_str});")
|
9
|
+
expect(j1.json['a']).to eq(1)
|
10
|
+
expect(j2.jsonp['a']).to eq(1)
|
11
|
+
expect(j1.json['b'][0]).to eq(1)
|
12
|
+
expect(j2.jsonp['b'][1]).to eq(2)
|
13
|
+
end
|
14
|
+
end
|
data/spec/pattern_spec.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'digger'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
describe Digger::Pattern do
|
5
|
+
it 'json fetch' do
|
6
|
+
json = JSON.parse('{"a":1,"b":[1,2,3]}')
|
7
|
+
pt = Digger::Pattern.new
|
8
|
+
expect(pt.json_fetch(json, '$')['a']).to eq(1)
|
9
|
+
expect(pt.json_fetch(json, '$.a')).to eq(1)
|
10
|
+
expect(pt.json_fetch(json, '$.b').length).to eq(3)
|
11
|
+
expect(pt.json_fetch(json, '$.b[2]')).to eq(3)
|
12
|
+
end
|
13
|
+
|
14
|
+
|
15
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: digger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- binz
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-12-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '2.0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '2.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -87,11 +87,14 @@ files:
|
|
87
87
|
- lib/digger/pattern.rb
|
88
88
|
- lib/digger/version.rb
|
89
89
|
- spec/digger_spec.rb
|
90
|
+
- spec/page_spec.rb
|
91
|
+
- spec/pattern_spec.rb
|
92
|
+
- spec/validate_spec.rb
|
90
93
|
homepage: ''
|
91
94
|
licenses:
|
92
95
|
- MIT
|
93
96
|
metadata: {}
|
94
|
-
post_install_message:
|
97
|
+
post_install_message:
|
95
98
|
rdoc_options: []
|
96
99
|
require_paths:
|
97
100
|
- lib
|
@@ -106,10 +109,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
106
109
|
- !ruby/object:Gem::Version
|
107
110
|
version: '0'
|
108
111
|
requirements: []
|
109
|
-
|
110
|
-
|
111
|
-
signing_key:
|
112
|
+
rubygems_version: 3.2.32
|
113
|
+
signing_key:
|
112
114
|
specification_version: 4
|
113
115
|
summary: Dig need stractual infomation from web page.
|
114
116
|
test_files:
|
115
117
|
- spec/digger_spec.rb
|
118
|
+
- spec/page_spec.rb
|
119
|
+
- spec/pattern_spec.rb
|
120
|
+
- spec/validate_spec.rb
|