digger 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +31 -0
- data/Rakefile +2 -0
- data/digger.gemspec +26 -0
- data/lib/digger.rb +9 -0
- data/lib/digger/http.rb +284 -0
- data/lib/digger/model.rb +41 -0
- data/lib/digger/page.rb +279 -0
- data/lib/digger/pattern.rb +91 -0
- data/lib/digger/version.rb +3 -0
- data/spec/digger_spec.rb +26 -0
- metadata +114 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 71e41cc25211835ca901f56a0514d6d4f94326d9
|
4
|
+
data.tar.gz: e7f954b64cb216e5cda6391940576fe0f188553e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 78a0717ae08e03a0325dc338411f3aae055a2f19c0849802d2aa8c0b17b6fabeacc98a9dea66142bf8f5058b08c6d4043244b65db944f28dc5f8317ccc641f4f
|
7
|
+
data.tar.gz: 186bfded593330616d7849dd519b8f47a5bd3e1ed2dce7bb33580dcbdb4e61bbb1eabb49bcf7a7c23d6cd93d56fed8e93cc41d366797914b55e83da7c0638437
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2015 binz
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# Digger
|
2
|
+
|
3
|
+
Digger fetches web pages and extracts structured information from them using regular-expression and CSS-selector patterns.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
gem 'digger'
|
11
|
+
```
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install digger
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
TODO: Write usage instructions here
|
24
|
+
|
25
|
+
## Contributing
|
26
|
+
|
27
|
+
1. Fork it ( https://github.com/[my-github-username]/digger/fork )
|
28
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
29
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
30
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
31
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/digger.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'digger/version'
|
5
|
+
|
6
|
+
# Gem packaging metadata for digger.
Gem::Specification.new do |spec|
  spec.name          = "digger"
  spec.version       = Digger::VERSION
  spec.authors       = ["binz"]
  spec.email         = ["xinkiang@gmail.com"]
  spec.summary       = %q{Dig structured information out of web pages.}
  spec.description   = %q{Digger fetches web pages and extracts structured data from them using regular-expression and CSS-selector patterns declared on model classes.}
  spec.homepage      = ""
  spec.license       = "MIT"

  # Package every file tracked by git; executables and tests are derived
  # from that list by path prefix.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"
  # spec/digger_spec.rb is written with RSpec's describe/expect syntax
  spec.add_development_dependency "rspec", "~> 3.0"

  spec.add_runtime_dependency 'nokogiri', '~> 1.6'
  spec.add_runtime_dependency 'http-cookie', '~> 1.0'
end
|
data/lib/digger.rb
ADDED
data/lib/digger/http.rb
ADDED
@@ -0,0 +1,284 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'http/cookie'
|
3
|
+
require 'zlib'
|
4
|
+
require 'digger/page'
|
5
|
+
|
6
|
+
|
7
|
+
# https://github.com/taganaka/polipus/blob/master/lib/polipus/http.rb
|
8
|
+
|
9
|
+
module Digger
|
10
|
+
# HTTP client used to download pages. It keeps one started Net::HTTP
# connection per host/port, follows same-host redirects, optionally stores
# cookies, and wraps every response in a Page.
#
# Options read from +opts+ (all optional): :redirect_limit, :user_agent
# (a String, or anything responding to #sample), :proxy_host, :proxy_port,
# :proxy_user, :proxy_pass, :proxy_host_port (plain values or callables that
# receive this client), :read_timeout, :open_timeout, :accept_cookies,
# :cookie_jar, :http_user, :http_password, :connection_max_hits, :logger,
# :verbose and :ssl_verify_mode.
class HTTP
  # Maximum number of redirects to follow on each get_response
  REDIRECT_LIMIT = 5
  # Number of attempts made for one request before the error is re-raised
  MAX_ATTEMPTS = 3
  # Errors that are retried by get_response and turned into an error Page
  # by fetch_pages
  RESCUABLE_ERRORS = [
    EOFError,
    Errno::ECONNREFUSED,
    Errno::ECONNRESET,
    Errno::EHOSTUNREACH,
    Errno::EINVAL,
    Errno::EPIPE,
    Errno::ETIMEDOUT,
    Net::HTTPBadResponse,
    Net::HTTPHeaderSyntaxError,
    Net::ProtocolError,
    SocketError,
    Timeout::Error,
    Zlib::DataError,
    Zlib::GzipFile::Error
  ]

  def initialize(opts = {})
    @connections = {}
    @connections_hits = {}
    @opts = opts
  end

  #
  # Fetch a single Page from the response of an HTTP request to *url*.
  # Just gets the final destination page.
  #
  def fetch_page(url, referer = nil, depth = nil)
    fetch_pages(url, referer, depth).last
  end

  #
  # Create new Pages from the response of an HTTP request to *url*,
  # including redirects. When one of RESCUABLE_ERRORS is raised, a single
  # Page carrying the error is returned instead.
  #
  def fetch_pages(url, referer = nil, depth = nil)
    url = URI(url)
    pages = []
    get(url, referer) do |response, code, location, redirect_to, response_time|
      handle_compression response
      pages << Page.new(location, body: response.body,
                                  code: code,
                                  headers: response.to_hash,
                                  referer: referer,
                                  depth: depth,
                                  # handed over as a String so Page can resolve it like any other link
                                  redirect_to: redirect_to && redirect_to.to_s,
                                  response_time: response_time,
                                  fetched_at: Time.now.to_i)
    end

    pages
  rescue *RESCUABLE_ERRORS => e
    if verbose?
      puts e.inspect
      puts e.backtrace
    end

    [Page.new(url, error: e, referer: referer, depth: depth)]
  end

  #
  # The maximum number of redirects to follow
  #
  def redirect_limit
    @opts[:redirect_limit] || REDIRECT_LIMIT
  end

  #
  # The user-agent string which will be sent with each request,
  # or nil if no such option is set. When the option responds to #sample
  # (e.g. an Array) one entry is picked per call.
  #
  def user_agent
    agent = @opts[:user_agent]
    agent.respond_to?(:sample) ? agent.sample : agent
  end

  #
  # The proxy address string
  #
  def proxy_host
    option_value(:proxy_host)
  end

  #
  # The proxy port
  #
  def proxy_port
    option_value(:proxy_port)
  end

  #
  # The proxy username
  #
  def proxy_user
    option_value(:proxy_user)
  end

  #
  # The proxy password
  #
  def proxy_pass
    option_value(:proxy_pass)
  end

  #
  # Shorthand to get proxy info with a single call
  # It returns an array of ['addr', port, 'user', 'pass']
  #
  def proxy_host_port
    option_value(:proxy_host_port)
  end

  #
  # HTTP read timeout in seconds
  #
  def read_timeout
    @opts[:read_timeout]
  end

  #
  # HTTP open timeout in seconds
  #
  def open_timeout
    @opts[:open_timeout]
  end

  # Does this HTTP client accept cookies from the server?
  #
  def accept_cookies?
    @opts[:accept_cookies]
  end

  # The cookie jar in use; created on first access when none was supplied.
  def cookie_jar
    @opts[:cookie_jar] ||= ::HTTP::CookieJar.new
  end

  private

  # Reads option +key+; callables are invoked with this client.
  def option_value(key)
    value = @opts[key]
    value.respond_to?(:call) ? value.call(self) : value
  end

  #
  # Retrieve HTTP responses for *url*, including redirects.
  # Yields the response object, response code, URI location, redirect
  # target (or nil) and response time for each response.
  #
  def get(url, referer = nil)
    remaining = redirect_limit
    loc = url
    previous = url
    loop do
      # a relative redirect is resolved against the location that issued it
      loc = previous.merge(loc) if loc.relative?

      response, response_time = get_response(loc, referer)
      code = Integer(response.code)
      redirect_to = redirect_target(response)
      yield response, code, loc, redirect_to, response_time
      remaining -= 1
      break unless redirect_to && allowed?(redirect_to, url) && remaining > 0

      previous = loc
      loc = redirect_to
    end
  end

  # Normalized Location of a redirection response, or nil when the response
  # is not a redirection or carries no Location header (e.g. 304).
  def redirect_target(response)
    return nil unless response.is_a?(Net::HTTPRedirection)

    location = response['location']
    return nil if location.nil? || location.strip.empty?

    URI(location).normalize
  end

  #
  # Get an HTTPResponse for *url*, sending the appropriate User-Agent string.
  # Returns [response, response_time_in_ms]. One of RESCUABLE_ERRORS is
  # retried on a fresh connection up to MAX_ATTEMPTS times, then re-raised.
  #
  def get_response(url, referer = nil)
    # Net::HTTP rejects an empty request path, so "http://host" requests "/"
    request_path = url.path.to_s.empty? ? '/' : url.path
    request_path = "#{request_path}?#{url.query}" unless url.query.nil?

    headers = {}
    agent = user_agent
    headers['User-Agent'] = agent if agent
    headers['Referer'] = referer.to_s if referer
    headers['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
    headers['Accept-Encoding'] = 'gzip,deflate'

    attempts = 0
    begin
      started = Time.now
      req = Net::HTTP::Get.new(request_path, headers)
      # HTTP Basic authentication
      req.basic_auth @opts[:http_user], @opts[:http_password] if @opts[:http_user]
      # urls auth schema has higher priority
      req.basic_auth url.user, url.password if url.user
      response = connection(url).request(req)
      response_time = ((Time.now - started) * 1000).round
      store_cookies(response, url) if accept_cookies?
      return response, response_time
    rescue *RESCUABLE_ERRORS => e
      puts e.inspect if verbose?
      refresh_connection(url)
      attempts += 1
      retry if attempts < MAX_ATTEMPTS
      raise e
    end
  end

  # Feeds every Set-Cookie header of +response+ to the cookie jar, one
  # header at a time (Net::HTTP's response[] would join them with commas).
  def store_cookies(response, url)
    Array(response.get_fields('Set-Cookie')).each do |value|
      cookie_jar.parse(value, url)
    end
  end

  # Started connection for the host/port of +url+. It is reused until
  # :connection_max_hits requests were made on it, then replaced.
  def connection(url)
    @connections[url.host] ||= {}
    @connections_hits[url.host] ||= {}

    existing = @connections[url.host][url.port]
    return refresh_connection(url) unless existing

    max_hits = @opts[:connection_max_hits]
    if max_hits && @connections_hits[url.host][url.port] >= max_hits
      @opts[:logger].debug { "Connection #{url.host}:#{url.port} is staled, refreshing" } if @opts[:logger]
      return refresh_connection(url)
    end

    @connections_hits[url.host][url.port] += 1
    existing
  end

  # Opens a new connection for +url+, closing and replacing the stored one.
  def refresh_connection(url)
    if @opts[:logger] && proxy_host && proxy_port
      @opts[:logger].debug { "Request #{url} using proxy: #{proxy_host}:#{proxy_port}" }
    end

    # Block has higher priority
    if @opts[:proxy_host_port].nil?
      p_host = proxy_host
      p_port = proxy_port
      p_user = proxy_user
      p_pass = proxy_pass
    else
      p_host, p_port, p_user, p_pass = proxy_host_port
    end

    http = Net::HTTP.new(url.host, url.port, p_host, p_port, p_user, p_pass)

    http.read_timeout = read_timeout if read_timeout
    http.open_timeout = open_timeout if open_timeout

    if url.scheme == 'https'
      http.use_ssl = true
      # NOTE(security): certificate verification stays off by default, as
      # before; pass ssl_verify_mode: OpenSSL::SSL::VERIFY_PEER to enable it.
      http.verify_mode = @opts[:ssl_verify_mode] || OpenSSL::SSL::VERIFY_NONE
    end

    hits = (@connections_hits[url.host] ||= {})
    pool = (@connections[url.host] ||= {})
    close_connection(pool[url.port])
    hits[url.port] = 1
    pool[url.port] = http.start
  end

  # Closes +http+ if it is a started connection; close errors are ignored
  # because the connection is being thrown away anyway.
  def close_connection(http)
    http.finish if http && http.started?
  rescue IOError
    nil
  end

  def verbose?
    @opts[:verbose]
  end

  #
  # Allowed to connect to the requested url?
  # Only relative targets and targets on the host of *from_url* are.
  #
  def allowed?(to_url, from_url)
    to_url.host.nil? || (to_url.host == from_url.host)
  end

  # Decompresses the response body in place according to Content-Encoding.
  # Responses without a body are left untouched.
  def handle_compression(response)
    body = response.body
    return if body.nil? || body.empty?

    case response['content-encoding'].to_s.downcase
    when 'gzip', 'x-gzip'
      body.replace Zlib::GzipReader.new(StringIO.new(body)).read
    when 'deflate'
      body.replace inflate(body)
    end
  end

  # Inflates +data+, accepting both zlib-wrapped and raw deflate streams
  # (some servers send the latter for "deflate").
  def inflate(data)
    Zlib::Inflate.inflate(data)
  rescue Zlib::DataError
    raw = Zlib::Inflate.new(-Zlib::MAX_WBITS)
    begin
      raw.inflate(data)
    ensure
      raw.close
    end
  end
end
|
284
|
+
end
|
data/lib/digger/model.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'digger/pattern'
|
2
|
+
|
3
|
+
module Digger
|
4
|
+
# Base class for scraping models. Subclasses declare named patterns with
# the class-level macros generated from Pattern::TYPES (e.g. +css_many+),
# then #match_page / #dig return a Hash of extracted values per name.
class Model
  class << self
    # Patterns declared on this class, keyed by the name given to the macro.
    # Kept in a per-class instance variable so every class (anonymous ones
    # included) has its own configuration.
    def pattern_config
      @pattern_config ||= {}
    end

    # One macro per pattern type: `css_one title: 'h1'` registers a Pattern
    # of that type under :title. An optional block is stored on the Pattern.
    Pattern::TYPES.each do |method|
      define_method(method) do |pairs, &block|
        pairs.each_pair do |key, value|
          pattern_config[key] = Pattern.new(type: method, value: value, block: block)
        end
      end
    end

    # Placeholder; no behavior yet.
    def index_page
    end

    # Placeholder; no behavior yet.
    def one_page
    end
  end

  # Applies every declared pattern to +page+.
  # Returns a Hash of pattern name => result of Pattern#match_page.
  def match_page(page)
    result = {}
    self.class.pattern_config.each_pair do |key, pattern|
      result[key] = pattern.match_page(page)
    end
    result
  end

  # Fetches +url+ with a fresh Digger::HTTP client and matches the page.
  def dig(url)
    client = Digger::HTTP.new
    page = client.fetch_page(url)
    match_page(page)
  end
end
|
41
|
+
end
|
data/lib/digger/page.rb
ADDED
@@ -0,0 +1,279 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'json'
|
3
|
+
require 'ostruct'
|
4
|
+
require 'set'
|
5
|
+
require 'kconv'
|
6
|
+
|
7
|
+
# https://github.com/taganaka/polipus/blob/master/lib/polipus/page.rb
|
8
|
+
module Digger
|
9
|
+
# A fetched (or failed) web page: URL, response metadata, raw body and
# helpers to parse the body and resolve the links it contains.
class Page
  attr_reader :url
  # The raw HTTP response body of the page
  attr_reader :body
  # Headers of the HTTP response
  attr_reader :headers
  # URL of the page this one redirected to, if any
  attr_reader :redirect_to
  # Exception object, if one was raised during HTTP#fetch_page
  attr_reader :error
  # Integer response code of the page
  attr_accessor :code
  # Depth of this page from the root of the crawl.
  attr_accessor :depth
  # URL of the page that brought us to this page
  attr_accessor :referer
  # Response time of the request for this page in milliseconds
  attr_accessor :response_time
  # OpenStruct it holds users defined data
  attr_accessor :user_data

  attr_accessor :aliases

  attr_accessor :domain_aliases

  # Whether the current page should be stored
  # Default: true
  attr_accessor :storable

  attr_accessor :fetched_at

  #
  # Create a new page
  #
  # +params+ keys read here: :code, :headers, :aka, :referer, :depth,
  # :redirect_to, :response_time, :body, :error, :domain_aliases, :fetched_at.
  #
  def initialize(url, params = {})
    @url = URI(url)
    @code = params[:code]
    @headers = params[:headers] || {}
    @headers['content-type'] ||= ['']
    @aliases = Array(params[:aka]).compact
    @referer = params[:referer]
    @depth = params[:depth] || 0
    # resolved before @body is assigned, so it is made absolute against
    # the page URL rather than against a <base> element of the body
    @redirect_to = to_absolute(params[:redirect_to])
    @response_time = params[:response_time]
    @body = params[:body]
    @error = params[:error]
    @fetched = !params[:code].nil?
    @user_data = OpenStruct.new
    @domain_aliases = params[:domain_aliases] || []
    @storable = true
    @fetched_at = params[:fetched_at]
  end

  def title
    doc.title if doc
  end

  #
  # Array of distinct A tag HREFs from the page, limited to links in the
  # page's own domain (see #in_domain?). Computed once and then cached.
  #
  def links
    return @links.to_a unless @links.nil?

    @links = Set.new
    return [] unless doc

    doc.search('//a[@href]').each do |a|
      u = a['href']
      next if u.nil? || u.empty?

      abs = begin
        to_absolute(u)
      rescue StandardError
        next # an unresolvable link is skipped
      end
      @links << abs if abs && in_domain?(abs)
    end
    @links.to_a
  end

  #
  # Nokogiri document for the HTML body, or nil when there is no body, the
  # page is not HTML, or parsing failed.
  #
  def doc
    @doc ||= begin
      Nokogiri::HTML(body) if !body.nil? && html?
    rescue StandardError
      nil
    end
  end

  #
  # Discard links, a next call of page.links will return an empty array
  #
  def discard_links!
    @links = []
  end

  #
  # Delete the Nokogiri document and response body to conserve memory
  #
  def discard_doc!
    links # force parsing of page links before we trash the document
    @doc = @body = nil
  end

  #
  # Was the page successfully fetched?
  # +true+ if the page was fetched with no error, +false+ otherwise.
  #
  def fetched?
    @fetched
  end

  #
  # The content-type returned by the HTTP request for this page
  #
  def content_type
    headers['content-type'].first
  end

  #
  # Returns +true+ if the page is a HTML document, returns +false+
  # otherwise.
  #
  def html?
    !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
  end

  #
  # Returns +true+ if the page is a HTTP redirect, returns +false+
  # otherwise.
  #
  def redirect?
    (300...400).include?(@code)
  end

  #
  # Returns +true+ if the page is a HTTP success, returns +false+
  # otherwise.
  #
  def success?
    (200..206).include?(@code)
  end

  #
  # Returns +true+ if the page was not found (returned 404 code),
  # returns +false+ otherwise.
  #
  def not_found?
    404 == @code
  end

  #
  # Base URI from the HTML doc head element
  # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
  #
  def base
    @base ||= begin
      if doc
        href = doc.search('//head/base/@href')
        URI(href.to_s) unless href.nil?
      end
    rescue StandardError
      nil
    end

    return nil if @base && @base.to_s.empty?
    @base
  end

  #
  # Converts relative URL *link* (a String or URI) into an absolute URL
  # based on the location of the page. Returns nil for nil or unparsable
  # links.
  #
  def to_absolute(link)
    return nil if link.nil?

    parser = URI::DEFAULT_PARSER
    # remove anchor
    without_anchor = link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')

    relative = begin
      # unescape then escape again so already-encoded links are not double-encoded
      URI(parser.escape(parser.unescape(without_anchor)))
    rescue URI::Error
      return nil
    end
    absolute = base ? base.merge(relative) : @url.merge(relative)

    absolute.path = '/' if absolute.path.empty?

    absolute
  end

  #
  # Returns +true+ if *uri* is in the same domain as the page, returns
  # +false+ otherwise
  #
  def in_domain?(uri)
    @domain_aliases ||= []
    uri.host == @url.host || @domain_aliases.include?(uri.host)
  end

  # Serializable Hash of the page; headers are stored as a Marshal dump.
  def to_hash
    {
      'url' => @url.to_s,
      'headers' => Marshal.dump(@headers),
      'body' => @body,
      'links' => links.map(&:to_s),
      'code' => @code,
      'depth' => @depth,
      'referer' => @referer.to_s,
      'redirect_to' => @redirect_to.to_s,
      'response_time' => @response_time,
      'fetched' => @fetched,
      'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
      'fetched_at' => @fetched_at,
      'error' => @error.to_s
    }
  end

  # JSON form of #to_hash without nil/empty values. Accepts the generator
  # arguments JSON passes along, so pages can be nested in other structures.
  def to_json(*args)
    th = to_hash.reject { |_k, v| v.nil? || (v.respond_to?(:empty?) && v.empty?) }
    th.delete('headers') if content_type.empty?
    th.to_json(*args)
  end

  #
  # Returns +true+ if page is marked as storeable
  # +false+ otherwise
  # Default is +true+
  #
  def storable?
    @storable
  end

  # +true+ when the page was fetched more than +ttl+ seconds ago;
  # +false+ when the fetch time is unknown.
  def expired?(ttl)
    return false if fetched_at.nil?
    (Time.now.to_i - ttl) > fetched_at
  end

  # Rebuilds a Page from a Hash produced by #to_hash.
  def self.from_hash(hash)
    page = new(URI(hash['url']))
    {
      # SECURITY: Marshal.load can instantiate arbitrary objects; only feed
      # this method hashes that come from a trusted store.
      '@headers' => hash['headers'] && !hash['headers'].empty? ? Marshal.load(hash['headers']) : { 'content-type' => [''] },
      '@body' => hash['body'],
      '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
      '@code' => hash['code'].to_i,
      '@depth' => hash['depth'].to_i,
      '@referer' => hash['referer'],
      '@redirect_to' => (hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
      '@response_time' => hash['response_time'].to_i,
      '@fetched' => hash['fetched'],
      '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
      '@fetched_at' => hash['fetched_at'],
      '@error' => hash['error']
    }.each do |var, value|
      page.instance_variable_set(var, value)
    end
    page
  end

  # Rebuilds a Page from the JSON produced by #to_json.
  def self.from_json(json)
    hash = JSON.parse json
    from_hash hash
  end
end
|
279
|
+
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module Digger
|
4
|
+
class Pattern
|
5
|
+
attr_accessor :type, :value, :block
|
6
|
+
|
7
|
+
# Builds a pattern from +hash+. Only the :type, :value and :block entries
# (Symbol or String keys) are applied; any other key is ignored.
def initialize(hash = {})
  hash.each_pair do |key, value|
    send("#{key}=", value) if %w{type value block}.include?(key.to_s)
  end
end
|
10
|
+
|
11
|
+
# Callable used to post-process a match, or nil.
#
# A callable +block+ is returned as is. A non-blank String is evaluated as
# Ruby source and the resulting object is returned. Nil, blank strings and
# anything that fails to evaluate yield nil.
def safe_block
  return nil unless block
  return block if block.respond_to?(:call)
  return nil if block.strip == ''

  # SECURITY: this executes arbitrary Ruby. The former `$SAFE = 2` guard was
  # dropped because it raises on Ruby 2.3-2.7 and is an ordinary global from
  # Ruby 3.0 on, so it offered no protection. Never pass untrusted strings.
  eval(block)
rescue StandardError, SyntaxError
  nil
end
|
24
|
+
|
25
|
+
# Returns a new Hash with the same keys as +hash+ in which every value that
# is not already a Pattern has been passed to Pattern.new.
def self.wrap(hash)
  hash.each_with_object({}) do |(key, value), wrapped|
    wrapped[key] = value.is_a?(Pattern) ? value : Pattern.new(value)
  end
end
|
28
|
+
|
29
|
+
MATCH_MAX = 3
|
30
|
+
|
31
|
+
TYPES = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many css_one css_many}
|
32
|
+
|
33
|
+
# +true+ when this pattern is matched with a regular expression against the
# page body (match_0..match_N and match_many), +false+ for CSS types and
# for types that are not listed in TYPES. +type+ may be a String or Symbol.
def regexp?
  position = TYPES.index(type.to_s)
  !position.nil? && position <= MATCH_MAX + 1 # match_many in addition
end
|
36
|
+
|
37
|
+
# Applies this pattern to +page+ and returns the extracted value(s).
#
# Regular-expression types run against page.body: match_N returns capture
# group N of the first match, match_many every match. CSS types run against
# page.doc: css_one uses the first matching node, css_many all of them.
#
# Each result is passed through the given block, else #safe_block, else a
# default that strips the text. The *_many types return an Array without
# duplicates. Returns nil when nothing matches, when the page has no
# body/doc, or when matching raises.
def match_page(page, &callback)
  blk = callback || safe_block
  kind = type.to_s

  if regexp? # regular expression
    blk ||= ->(text) { text.strip }
    # content is String
    content = page.body
    return nil if content.nil?

    match =
      if kind == 'match_many'
        content.gsub(value).to_a
      else
        found = content.match(value)
        found && found[TYPES.index(kind)]
      end
  else # css expression
    blk ||= ->(node) { node.content.strip }
    # content is Nokogiri::HTML::Document
    document = page.doc
    return nil if document.nil?

    match =
      case kind
      when 'css_one' then document.css(value).first
      when 'css_many' then document.css(value)
      end
  end

  return nil if match.nil?

  if %w{css_many match_many}.include?(kind)
    match.map { |node| blk.call(node) }.uniq
  else
    blk.call(match)
  end
rescue StandardError
  # best effort: a pattern that cannot be applied yields no value
  nil
end
|
68
|
+
|
69
|
+
# Convenience helpers added to every Nokogiri node so that pattern blocks
# can dig further into a matched node.
class Nokogiri::XML::Node
  # inner_one(css)  -> value for the first node matching +css+, or nil
  # inner_many(css) -> Array with one value per node matching +css+
  #
  # The value is the result of the given block called with the node, or the
  # node's stripped text when no block is given.
  %w{one many}.each do |name|
    define_method "inner_#{name}" do |css, &block|
      callback = ->(node) do
        if node
          (block || ->(n) { n.text.strip }).call(node)
        else
          nil
        end
      end
      if name == 'one' # inner_one
        callback.call(self.css(css).first)
      else # inner_many
        self.css(css).map { |node| callback.call(node) }
      end
    end
  end

  # Markup of this node, as returned by #to_xml.
  def source
    to_xml
  end
end # nokogiri
|
90
|
+
end
|
91
|
+
end
|
data/spec/digger_spec.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'digger'
|
2
|
+
|
3
|
+
# Model used by the examples below.
class Item < Digger::Model
  css_many sites: '.sites>a>span'
end

# NOTE: these examples hit the live site http://nan.so/ and therefore need
# network access.
describe Digger do
  # Fetch once for the whole group, inside RSpec, so that a network failure
  # is reported as failing examples instead of aborting the file load.
  before(:all) do
    @page = Digger::HTTP.new.fetch_page('http://nan.so/')
  end

  let(:page) { @page }
  let(:pattern) { Digger::Pattern.new({type: 'css_many', value: '.sites>a>span' }) }

  it "http should fetch a page" do
    expect(page.code).to eq(200)
  end

  it "pattern should match content" do
    sites = pattern.match_page(page)
    expect(sites).to include('百度网盘')
  end

  it "model should dig content" do
    item = Item.new.match_page(page)
    expect(item[:sites]).to include('读远')
  end
end
|
metadata
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: digger
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- binz
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-03-27 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.7'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: nokogiri
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.6'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.6'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: http-cookie
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '1.0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '1.0'
|
69
|
+
description: ''
|
70
|
+
email:
|
71
|
+
- xinkiang@gmail.com
|
72
|
+
executables: []
|
73
|
+
extensions: []
|
74
|
+
extra_rdoc_files: []
|
75
|
+
files:
|
76
|
+
- ".gitignore"
|
77
|
+
- Gemfile
|
78
|
+
- LICENSE.txt
|
79
|
+
- README.md
|
80
|
+
- Rakefile
|
81
|
+
- digger.gemspec
|
82
|
+
- lib/digger.rb
|
83
|
+
- lib/digger/http.rb
|
84
|
+
- lib/digger/model.rb
|
85
|
+
- lib/digger/page.rb
|
86
|
+
- lib/digger/pattern.rb
|
87
|
+
- lib/digger/version.rb
|
88
|
+
- spec/digger_spec.rb
|
89
|
+
homepage: ''
|
90
|
+
licenses:
|
91
|
+
- MIT
|
92
|
+
metadata: {}
|
93
|
+
post_install_message:
|
94
|
+
rdoc_options: []
|
95
|
+
require_paths:
|
96
|
+
- lib
|
97
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
98
|
+
requirements:
|
99
|
+
- - ">="
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
103
|
+
requirements:
|
104
|
+
- - ">="
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: '0'
|
107
|
+
requirements: []
|
108
|
+
rubyforge_project:
|
109
|
+
rubygems_version: 2.2.2
|
110
|
+
signing_key:
|
111
|
+
specification_version: 4
|
112
|
+
summary: Dig need stractual infomation from web page.
|
113
|
+
test_files:
|
114
|
+
- spec/digger_spec.rb
|