digger 0.1.2 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/digger.gemspec +2 -2
- data/lib/digger/http.rb +9 -2
- data/lib/digger/model.rb +9 -0
- data/lib/digger/page.rb +8 -10
- data/lib/digger/pattern.rb +31 -11
- data/lib/digger/version.rb +1 -1
- data/spec/digger_spec.rb +10 -5
- data/spec/page_spec.rb +27 -0
- data/spec/pattern_spec.rb +15 -0
- data/spec/validate_spec.rb +10 -0
- metadata +18 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: f3e89f179fa868ecd2879180d1fbfbf03ba0ebee3731b9c8b4741d22663ff4aa
|
4
|
+
data.tar.gz: 1b27e4a1446e9835203bf5497aeebc3bc4ab58998a0fc443eeaaf7e7ec86c2c7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5671e5d2484ca744e5c75f97beeb473e1970291fc094c0274714af27cc847ef47bcf3b1b81534e6f299f35648fdf3aabec9d90777e4b2fbe49dc2629c048f610
|
7
|
+
data.tar.gz: 496534bb394d17792dc7173759b83c0dd62b8569a241b2a6a57e60a14ea99ddafd7e030fe16480c5782ec319f6dc1d3563919f7202026fb008522dd26cae6c01
|
data/digger.gemspec
CHANGED
@@ -18,8 +18,8 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ["lib"]
|
20
20
|
|
21
|
-
spec.add_development_dependency "bundler", "~>
|
22
|
-
spec.add_development_dependency "rake", "
|
21
|
+
spec.add_development_dependency "bundler", "~> 2.0"
|
22
|
+
spec.add_development_dependency "rake", ">= 12.3.3"
|
23
23
|
|
24
24
|
spec.add_runtime_dependency 'nokogiri', '~> 1.6'
|
25
25
|
spec.add_runtime_dependency 'http-cookie', '~> 1.0'
|
data/lib/digger/http.rb
CHANGED
@@ -49,7 +49,7 @@ module Digger
|
|
49
49
|
url = URI(url)
|
50
50
|
pages = []
|
51
51
|
get(url, referer) do |response, code, location, redirect_to, response_time|
|
52
|
-
handle_compression response
|
52
|
+
handle_compression response if handle_compression?
|
53
53
|
pages << Page.new(location, body: response.body,
|
54
54
|
code: code,
|
55
55
|
headers: response.to_hash,
|
@@ -70,6 +70,13 @@ module Digger
|
|
70
70
|
[Page.new(url, error: e, referer: referer, depth: depth)]
|
71
71
|
end
|
72
72
|
|
73
|
+
#
|
74
|
+
# Whether to accept compressed (gzip/deflate) responses; enabling this may cause encoding errors
|
75
|
+
#
|
76
|
+
def handle_compression?
|
77
|
+
@opts[:handle_compression]
|
78
|
+
end
|
79
|
+
|
73
80
|
#
|
74
81
|
# The maximum number of redirects to follow
|
75
82
|
#
|
@@ -185,7 +192,7 @@ module Digger
|
|
185
192
|
opts['User-Agent'] = user_agent if user_agent
|
186
193
|
opts['Referer'] = referer.to_s if referer
|
187
194
|
opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
|
188
|
-
opts['Accept-Encoding'] = 'gzip,deflate'
|
195
|
+
opts['Accept-Encoding'] = 'gzip,deflate' if handle_compression?
|
189
196
|
|
190
197
|
retries = 0
|
191
198
|
begin
|
data/lib/digger/model.rb
CHANGED
@@ -17,6 +17,15 @@ module Digger
|
|
17
17
|
}
|
18
18
|
end
|
19
19
|
|
20
|
+
def validate_presence(*keys)
|
21
|
+
keys_all = pattern_config.keys
|
22
|
+
raise "Pattern keys #{(keys - keys_all).join(', ')} should be present" unless keys.all?{|k| keys_all.include?(k) }
|
23
|
+
end
|
24
|
+
|
25
|
+
def validate_includeness(*keys)
|
26
|
+
raise "Pattern keys #{(pattern_config.keys - keys).join(', ')} should not be included" unless pattern_config.keys.all?{|k| keys.include?(k)}
|
27
|
+
end
|
28
|
+
|
20
29
|
# index page
|
21
30
|
def index_config
|
22
31
|
@@digger_config['index'][self.name]
|
data/lib/digger/page.rb
CHANGED
@@ -3,6 +3,7 @@ require 'json'
|
|
3
3
|
require 'ostruct'
|
4
4
|
require 'set'
|
5
5
|
require 'kconv'
|
6
|
+
require 'uri'
|
6
7
|
|
7
8
|
# https://github.com/taganaka/polipus/blob/master/lib/polipus/page.rb
|
8
9
|
module Digger
|
@@ -95,7 +96,13 @@ module Digger
|
|
95
96
|
end
|
96
97
|
end
|
97
98
|
|
99
|
+
def json
|
100
|
+
@json ||= JSON.parse body
|
101
|
+
end
|
98
102
|
|
103
|
+
def jsonp
|
104
|
+
@jsonp ||= JSON.parse body.match(/^[^\(]+?\((.+)\)[^\)]*$/)[1]
|
105
|
+
end
|
99
106
|
|
100
107
|
#
|
101
108
|
# Discard links, a next call of page.links will return an empty array
|
@@ -180,16 +187,7 @@ module Digger
|
|
180
187
|
def to_absolute(link)
|
181
188
|
return nil if link.nil?
|
182
189
|
|
183
|
-
|
184
|
-
|
185
|
-
# remove anchor
|
186
|
-
link =
|
187
|
-
begin
|
188
|
-
URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')))
|
189
|
-
rescue URI::Error
|
190
|
-
return nil
|
191
|
-
end
|
192
|
-
|
190
|
+
link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#[\w]*$/, '')
|
193
191
|
relative = begin
|
194
192
|
URI(link)
|
195
193
|
rescue URI::Error
|
data/lib/digger/pattern.rb
CHANGED
@@ -27,33 +27,36 @@ module Digger
|
|
27
27
|
end
|
28
28
|
|
29
29
|
MATCH_MAX = 3
|
30
|
-
|
31
|
-
TYPES = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many css_one css_many}
|
32
30
|
|
33
|
-
|
34
|
-
|
35
|
-
|
31
|
+
TYPES_REGEXP = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many}
|
32
|
+
TYPES_CSS = %w{css_one css_many}
|
33
|
+
TYPES_JSON = %w{json jsonp}
|
34
|
+
|
35
|
+
TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON
|
36
36
|
|
37
37
|
def match_page(page, &callback)
|
38
38
|
blk = callback || safe_block
|
39
|
-
if
|
40
|
-
|
41
|
-
blk ||= ->(text){text.strip}
|
39
|
+
if TYPES_REGEXP.include?(type) # regular expression
|
40
|
+
blk ||= ->(text){ text.strip }
|
42
41
|
# content is String
|
43
42
|
if type == 'match_many'
|
44
43
|
match = page.body.gsub(value).to_a
|
45
44
|
else
|
45
|
+
index = TYPES_REGEXP.index(type)
|
46
46
|
matches = page.body.match(value)
|
47
47
|
match = matches.nil? ? nil : matches[index]
|
48
48
|
end
|
49
|
-
|
50
|
-
blk ||= ->(node){node.content.strip}
|
49
|
+
elsif TYPES_CSS.include?(type) # css expression
|
50
|
+
blk ||= ->(node){ node.content.strip }
|
51
51
|
# content is Nokogiri::HTML::Document
|
52
52
|
if type == 'css_one'
|
53
53
|
match = page.doc.css(value).first
|
54
|
-
|
54
|
+
else
|
55
55
|
match = page.doc.css(value)
|
56
56
|
end
|
57
|
+
elsif TYPES_JSON.include?(type)
|
58
|
+
json = page.send(type)
|
59
|
+
match = json_fetch(json, value)
|
57
60
|
end
|
58
61
|
if match.nil?
|
59
62
|
nil
|
@@ -66,6 +69,23 @@ module Digger
|
|
66
69
|
nil
|
67
70
|
end
|
68
71
|
|
72
|
+
def json_fetch(json, keys)
|
73
|
+
if keys.is_a? String
|
74
|
+
# parse json keys like '$.k1.k2[0]'
|
75
|
+
parts = keys.match(/^\$[\S]*$/)[0].scan(/(\.([\w]+)|\[([\d]+)\])/).map do |p|
|
76
|
+
p[1].nil? ? { index: p[2].to_i } : { key: p[1] }
|
77
|
+
end
|
78
|
+
json_fetch(json, parts)
|
79
|
+
elsif keys.is_a? Array
|
80
|
+
if keys.length == 0
|
81
|
+
json
|
82
|
+
else
|
83
|
+
pt = keys.shift
|
84
|
+
json_fetch(json[pt[:index] || pt[:key]], keys)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
69
89
|
class Nokogiri::XML::Node
|
70
90
|
%w{one many}.each do |name|
|
71
91
|
define_method "inner_#{name}" do |css, &block|
|
data/lib/digger/version.rb
CHANGED
data/spec/digger_spec.rb
CHANGED
@@ -1,12 +1,14 @@
|
|
1
1
|
require 'digger'
|
2
2
|
|
3
3
|
http = Digger::HTTP.new
|
4
|
-
page = http.fetch_page('http://
|
4
|
+
page = http.fetch_page('http://www.baidu.com/')
|
5
5
|
|
6
|
-
pattern = Digger::Pattern.new({type: 'css_many', value: '
|
6
|
+
pattern = Digger::Pattern.new({ type: 'css_many', value: '#s-top-left>a' })
|
7
7
|
|
8
8
|
class Item < Digger::Model
|
9
|
-
css_many sites: '
|
9
|
+
css_many sites: '#s-top-left>a'
|
10
|
+
validate_presence :sites
|
11
|
+
validate_includeness :sites
|
10
12
|
end
|
11
13
|
|
12
14
|
describe Digger do
|
@@ -16,12 +18,15 @@ describe Digger do
|
|
16
18
|
|
17
19
|
it "pattern should match content" do
|
18
20
|
sites = pattern.match_page(page)
|
19
|
-
expect(sites.include?('
|
21
|
+
expect(sites.include?('新闻')).to eq(true)
|
20
22
|
end
|
21
23
|
|
22
24
|
it "model should dig content" do
|
23
25
|
item = Item.new.match_page(page)
|
24
|
-
expect(item[:sites].include?('
|
26
|
+
expect(item[:sites].include?('新闻')).to be(true)
|
27
|
+
end
|
28
|
+
|
29
|
+
it "validation support" do
|
25
30
|
end
|
26
31
|
|
27
32
|
it "index multiple threading" do
|
data/spec/page_spec.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'digger'
|
2
|
+
require 'json'
|
3
|
+
require 'uri'
|
4
|
+
|
5
|
+
describe Digger::Page do
|
6
|
+
it 'page json' do
|
7
|
+
json_str = '{"a":1,"b":[1,2,3]}'
|
8
|
+
j1 = Digger::Page.new('', body: json_str)
|
9
|
+
j2 = Digger::Page.new('', body: "hello(#{json_str});")
|
10
|
+
expect(j1.json['a']).to eq(1)
|
11
|
+
expect(j2.jsonp['a']).to eq(1)
|
12
|
+
expect(j1.json['b'][0]).to eq(1)
|
13
|
+
expect(j2.jsonp['b'][1]).to eq(2)
|
14
|
+
end
|
15
|
+
|
16
|
+
it 'fetch baidu' do
|
17
|
+
http = Digger::HTTP.new
|
18
|
+
page = http.fetch_page('http://www.baidu.com/')
|
19
|
+
expect(page.code).to eq(200)
|
20
|
+
end
|
21
|
+
|
22
|
+
it 'page uri' do
|
23
|
+
link ='https://www.baidu.com/s?wd=%E5%93%88%E5%93%88#hello'
|
24
|
+
link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#[\w]*$/, '')
|
25
|
+
p link
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'digger'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
describe Digger::Pattern do
|
5
|
+
it 'json fetch' do
|
6
|
+
json = JSON.parse('{"a":1,"b":[1,2,3]}')
|
7
|
+
pt = Digger::Pattern.new
|
8
|
+
expect(pt.json_fetch(json, '$')['a']).to eq(1)
|
9
|
+
expect(pt.json_fetch(json, '$.a')).to eq(1)
|
10
|
+
expect(pt.json_fetch(json, '$.b').length).to eq(3)
|
11
|
+
expect(pt.json_fetch(json, '$.b[2]')).to eq(3)
|
12
|
+
end
|
13
|
+
|
14
|
+
|
15
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: digger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- binz
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-12-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -16,28 +16,28 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '2.0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '2.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: 12.3.3
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: 12.3.3
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: nokogiri
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -87,11 +87,14 @@ files:
|
|
87
87
|
- lib/digger/pattern.rb
|
88
88
|
- lib/digger/version.rb
|
89
89
|
- spec/digger_spec.rb
|
90
|
+
- spec/page_spec.rb
|
91
|
+
- spec/pattern_spec.rb
|
92
|
+
- spec/validate_spec.rb
|
90
93
|
homepage: ''
|
91
94
|
licenses:
|
92
95
|
- MIT
|
93
96
|
metadata: {}
|
94
|
-
post_install_message:
|
97
|
+
post_install_message:
|
95
98
|
rdoc_options: []
|
96
99
|
require_paths:
|
97
100
|
- lib
|
@@ -106,10 +109,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
106
109
|
- !ruby/object:Gem::Version
|
107
110
|
version: '0'
|
108
111
|
requirements: []
|
109
|
-
|
110
|
-
|
111
|
-
signing_key:
|
112
|
+
rubygems_version: 3.2.32
|
113
|
+
signing_key:
|
112
114
|
specification_version: 4
|
113
115
|
summary: Dig need stractual infomation from web page.
|
114
116
|
test_files:
|
115
117
|
- spec/digger_spec.rb
|
118
|
+
- spec/page_spec.rb
|
119
|
+
- spec/pattern_spec.rb
|
120
|
+
- spec/validate_spec.rb
|