digger 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +31 -0
- data/Rakefile +2 -0
- data/digger.gemspec +26 -0
- data/lib/digger.rb +9 -0
- data/lib/digger/http.rb +284 -0
- data/lib/digger/model.rb +41 -0
- data/lib/digger/page.rb +279 -0
- data/lib/digger/pattern.rb +91 -0
- data/lib/digger/version.rb +3 -0
- data/spec/digger_spec.rb +26 -0
- metadata +114 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
```yaml
---
SHA1:
  metadata.gz: 71e41cc25211835ca901f56a0514d6d4f94326d9
  data.tar.gz: e7f954b64cb216e5cda6391940576fe0f188553e
SHA512:
  metadata.gz: 78a0717ae08e03a0325dc338411f3aae055a2f19c0849802d2aa8c0b17b6fabeacc98a9dea66142bf8f5058b08c6d4043244b65db944f28dc5f8317ccc641f4f
  data.tar.gz: 186bfded593330616d7849dd519b8f47a5bd3e1ed2dce7bb33580dcbdb4e61bbb1eabb49bcf7a7c23d6cd93d56fed8e93cc41d366797914b55e83da7c0638437
```
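These are the standard RubyGems release digests: SHA1 and SHA512 of the metadata.gz and data.tar.gz members inside the .gem archive. A rough sketch for checking them locally, assuming a downloaded copy named digger-0.0.1.gem (a .gem file is a plain tar whose members carry exactly those names):

```ruby
require 'digest'
require 'rubygems/package'

# Recompute the digests of the two archive members and compare them
# against checksums.yaml above.
File.open('digger-0.0.1.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io).each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
    data = entry.read
    puts entry.full_name
    puts "  SHA1:   #{Digest::SHA1.hexdigest(data)}"
    puts "  SHA512: #{Digest::SHA512.hexdigest(data)}"
  end
end
```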
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
```text
Copyright (c) 2015 binz

MIT License

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
```
data/README.md
ADDED
@@ -0,0 +1,31 @@
````markdown
# Digger

TODO: Write a gem description

## Installation

Add this line to your application's Gemfile:

```ruby
gem 'digger'
```

And then execute:

    $ bundle

Or install it yourself as:

    $ gem install digger

## Usage

TODO: Write usage instructions here

## Contributing

1. Fork it ( https://github.com/[my-github-username]/digger/fork )
2. Create your feature branch (`git checkout -b my-new-feature`)
3. Commit your changes (`git commit -am 'Add some feature'`)
4. Push to the branch (`git push origin my-new-feature`)
5. Create a new Pull Request
````
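The Usage section above is left as a TODO in this release. Judging from lib/digger/model.rb and spec/digger_spec.rb further down in this diff, a minimal usage sketch would look like the following (the class name, selectors, and URL are placeholders, not part of the gem):

```ruby
require 'digger'

# Each Pattern type (css_one, css_many, match_0..match_3, match_many)
# is exposed as a class-level macro on Digger::Model subclasses.
class Article < Digger::Model
  css_one  title: 'h1'             # first node matching the selector
  css_many links: '.sites>a>span'  # every matching node, as stripped text
end

# dig fetches the page and applies every declared pattern at once.
result = Article.new.dig('http://example.com/')
result[:title] # => text of the first <h1>
result[:links] # => array of unique span texts
```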
data/Rakefile
ADDED
data/digger.gemspec
ADDED
@@ -0,0 +1,26 @@
```ruby
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'digger/version'

Gem::Specification.new do |spec|
  spec.name          = "digger"
  spec.version       = Digger::VERSION
  spec.authors       = ["binz"]
  spec.email         = ["xinkiang@gmail.com"]
  spec.summary       = %q{Dig needed structural information from web pages.}
  spec.description   = %q{}
  spec.homepage      = ""
  spec.license       = "MIT"

  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"

  spec.add_runtime_dependency 'nokogiri', '~> 1.6'
  spec.add_runtime_dependency 'http-cookie', '~> 1.0'
end
```
data/lib/digger.rb
ADDED
data/lib/digger/http.rb
ADDED
@@ -0,0 +1,284 @@
```ruby
require 'net/http'
require 'http/cookie'
require 'zlib'
require 'digger/page'

# https://github.com/taganaka/polipus/blob/master/lib/polipus/http.rb

module Digger
  class HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECT_LIMIT = 5
    RESCUABLE_ERRORS = [
      EOFError,
      Errno::ECONNREFUSED,
      Errno::ECONNRESET,
      Errno::EHOSTUNREACH,
      Errno::EINVAL,
      Errno::EPIPE,
      Errno::ETIMEDOUT,
      Net::HTTPBadResponse,
      Net::HTTPHeaderSyntaxError,
      Net::ProtocolError,
      SocketError,
      Timeout::Error,
      Zlib::DataError,
      Zlib::GzipFile::Error
    ]

    def initialize(opts = {})
      @connections = {}
      @connections_hits = {}
      @opts = opts
    end

    #
    # Fetch a single Page from the response of an HTTP request to *url*.
    # Just gets the final destination page.
    #
    def fetch_page(url, referer = nil, depth = nil)
      fetch_pages(url, referer, depth).last
    end

    #
    # Create new Pages from the response of an HTTP request to *url*,
    # including redirects
    #
    def fetch_pages(url, referer = nil, depth = nil)
      url = URI(url)
      pages = []
      get(url, referer) do |response, code, location, redirect_to, response_time|
        handle_compression response
        pages << Page.new(location, body: response.body,
                                    code: code,
                                    headers: response.to_hash,
                                    referer: referer,
                                    depth: depth,
                                    redirect_to: redirect_to,
                                    response_time: response_time,
                                    fetched_at: Time.now.to_i)
      end

      pages
    rescue *RESCUABLE_ERRORS => e
      if verbose?
        puts e.inspect
        puts e.backtrace
      end

      [Page.new(url, error: e, referer: referer, depth: depth)]
    end

    #
    # The maximum number of redirects to follow
    #
    def redirect_limit
      @opts[:redirect_limit] || REDIRECT_LIMIT
    end

    #
    # The user-agent string which will be sent with each request,
    # or nil if no such option is set
    #
    def user_agent
      if @opts[:user_agent].respond_to?(:sample)
        @opts[:user_agent].sample
      else
        @opts[:user_agent]
      end
    end

    #
    # The proxy address string
    #
    def proxy_host
      @opts[:proxy_host].respond_to?(:call) ? @opts[:proxy_host].call(self) : @opts[:proxy_host]
    end

    #
    # The proxy port
    #
    def proxy_port
      @opts[:proxy_port].respond_to?(:call) ? @opts[:proxy_port].call(self) : @opts[:proxy_port]
    end

    #
    # The proxy username
    #
    def proxy_user
      @opts[:proxy_user].respond_to?(:call) ? @opts[:proxy_user].call(self) : @opts[:proxy_user]
    end

    #
    # The proxy password
    #
    def proxy_pass
      # return proxy_host_port[3] unless @opts[:proxy_host_port].nil?
      @opts[:proxy_pass].respond_to?(:call) ? @opts[:proxy_pass].call(self) : @opts[:proxy_pass]
    end

    #
    # Shorthand to get proxy info with a single call
    # It returns an array of ['addr', port, 'user', 'pass']
    #
    def proxy_host_port
      @opts[:proxy_host_port].respond_to?(:call) ? @opts[:proxy_host_port].call(self) : @opts[:proxy_host_port]
    end

    #
    # HTTP read timeout in seconds
    #
    def read_timeout
      @opts[:read_timeout]
    end

    #
    # HTTP open timeout in seconds
    #
    def open_timeout
      @opts[:open_timeout]
    end

    #
    # Does this HTTP client accept cookies from the server?
    #
    def accept_cookies?
      @opts[:accept_cookies]
    end

    def cookie_jar
      @opts[:cookie_jar] ||= ::HTTP::CookieJar.new
      @opts[:cookie_jar]
    end

    private

    #
    # Retrieve HTTP responses for *url*, including redirects.
    # Yields the response object, response code, and URI location
    # for each response.
    #
    def get(url, referer = nil)
      limit = redirect_limit
      loc = url
      loop do
        # if redirected to a relative url, merge it with the host of the
        # original request url
        loc = url.merge(loc) if loc.relative?

        response, response_time = get_response(loc, referer)
        code = Integer(response.code)
        redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
        yield response, code, loc, redirect_to, response_time
        limit -= 1
        break unless (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
      end
    end

    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
    #
    def get_response(url, referer = nil)
      full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"

      opts = {}
      opts['User-Agent'] = user_agent if user_agent
      opts['Referer'] = referer.to_s if referer
      opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
      opts['Accept-Encoding'] = 'gzip,deflate'

      retries = 0
      begin
        start = Time.now
        # format request
        req = Net::HTTP::Get.new(full_path, opts)
        # HTTP Basic authentication; credentials embedded in the URL,
        # when present, take priority over the :http_user option
        if @opts[:http_user]
          req.basic_auth @opts[:http_user], @opts[:http_password]
        end
        req.basic_auth url.user, url.password if url.user
        response = connection(url).request(req)
        finish = Time.now
        response_time = ((finish - start) * 1000).round
        cookie_jar.parse(response['Set-Cookie'], url) if accept_cookies? && response['Set-Cookie']
        return response, response_time
      rescue *RESCUABLE_ERRORS => e
        puts e.inspect if verbose?
        refresh_connection(url)
        retries += 1
        if retries < 3
          retry
        else
          raise e
        end
      end
    end

    def connection(url)
      @connections[url.host] ||= {}
      @connections_hits[url.host] ||= {}

      if @connections[url.host][url.port]
        if @opts[:connection_max_hits] && @connections_hits[url.host][url.port] >= @opts[:connection_max_hits]
          @opts[:logger].debug { "Connection #{url.host}:#{url.port} is stale, refreshing" } if @opts[:logger]
          return refresh_connection url
        end
        @connections_hits[url.host][url.port] += 1
        return @connections[url.host][url.port]
      end

      refresh_connection url
    end

    def refresh_connection(url)
      if @opts[:logger] && proxy_host && proxy_port
        @opts[:logger].debug { "Request #{url} using proxy: #{proxy_host}:#{proxy_port}" }
      end

      # An explicit :proxy_host_port (possibly a callable) has higher
      # priority than the individual proxy options
      if @opts[:proxy_host_port].nil?
        p_host = proxy_host
        p_port = proxy_port
        p_user = proxy_user
        p_pass = proxy_pass
      else
        p_host, p_port, p_user, p_pass = proxy_host_port
      end

      http = Net::HTTP.new(url.host, url.port, p_host, p_port, p_user, p_pass)

      http.read_timeout = read_timeout if read_timeout
      http.open_timeout = open_timeout if open_timeout

      if url.scheme == 'https'
        http.use_ssl = true
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end
      @connections_hits[url.host][url.port] = 1
      @connections[url.host][url.port] = http.start
    end

    def verbose?
      @opts[:verbose]
    end

    #
    # Allowed to connect to the requested url?
    #
    def allowed?(to_url, from_url)
      to_url.host.nil? || (to_url.host == from_url.host)
    end

    def handle_compression(response)
      case response['content-encoding']
      when 'gzip', 'x-gzip'
        body_io = StringIO.new(response.body)
        response.body.replace Zlib::GzipReader.new(body_io).read
      when 'deflate'
        response.body.replace Zlib::Inflate.inflate(response.body)
      end
    end
  end
end
```
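Every knob the accessors above read comes straight from the opts hash passed to the constructor, so configuring the client is a single hash literal. A brief sketch (the URL is a placeholder; the option keys are the ones read by the accessors above):

```ruby
require 'digger'

http = Digger::HTTP.new(
  user_agent: ['Mozilla/5.0', 'curl/7.35'],  # arrays are sampled per request
  redirect_limit: 3,                         # overrides REDIRECT_LIMIT (5)
  read_timeout: 10,                          # seconds
  open_timeout: 5,
  accept_cookies: true,                      # keeps an HTTP::CookieJar
  verbose: true                              # print rescued errors
)

# fetch_pages returns one Page per hop of a redirect chain;
# fetch_page returns only the final destination.
pages = http.fetch_pages('http://example.com/')
pages.each { |page| puts "#{page.code} #{page.url} -> #{page.redirect_to}" }
```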
data/lib/digger/model.rb
ADDED
@@ -0,0 +1,41 @@
```ruby
require 'digger/pattern'

module Digger
  class Model
    @@patterns = {}

    class << self
      def pattern_config
        @@patterns[self.name] ||= {}
      end

      Pattern::TYPES.each do |method|
        define_method method, ->(pairs, &block){
          pairs.each_pair do |key, value|
            pattern_config[key] = Pattern.new(type: method, value: value, block: block)
          end
        }
      end

      def index_page
      end

      def one_page
      end
    end

    def match_page(page)
      result = {}
      self.class.pattern_config.each_pair do |key, pattern|
        result[key] = pattern.match_page(page)
      end
      result
    end

    def dig(url)
      client = Digger::HTTP.new
      page = client.fetch_page(url)
      match_page(page)
    end
  end
end
```
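The macros generated from Pattern::TYPES also accept a block, which is stored on the Pattern and applied to the raw match before it lands in the result hash. A sketch under assumed markup (the class name, selector, and URL are illustrative only):

```ruby
class Product < Digger::Model
  # The block post-processes each matched Nokogiri node.
  css_one(price: '.price') do |node|
    node.text.strip.sub(/^\$/, '').to_f
  end
end

page = Digger::HTTP.new.fetch_page('http://example.com/item')
Product.new.match_page(page) # => e.g. { price: 19.99 } for a "$19.99" node
```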
data/lib/digger/page.rb
ADDED
@@ -0,0 +1,279 @@
```ruby
require 'nokogiri'
require 'json'
require 'ostruct'
require 'set'
require 'kconv'

# https://github.com/taganaka/polipus/blob/master/lib/polipus/page.rb
module Digger
  class Page
    attr_reader :url
    # The raw HTTP response body of the page
    attr_reader :body
    # Headers of the HTTP response
    attr_reader :headers
    # URL of the page this one redirected to, if any
    attr_reader :redirect_to
    # Exception object, if one was raised during HTTP#fetch_page
    attr_reader :error
    # Integer response code of the page
    attr_accessor :code
    # Depth of this page from the root of the crawl.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time
    # OpenStruct that holds user-defined data
    attr_accessor :user_data

    attr_accessor :aliases

    attr_accessor :domain_aliases

    # Whether the current page should be stored
    # Default: true
    attr_accessor :storable

    attr_accessor :fetched_at

    #
    # Create a new page
    #
    def initialize(url, params = {})
      @url = URI(url)
      @code = params[:code]
      @headers = params[:headers] || {}
      @headers['content-type'] ||= ['']
      @aliases = Array(params[:aka]).compact
      @referer = params[:referer]
      @depth = params[:depth] || 0
      @redirect_to = to_absolute(params[:redirect_to])
      @response_time = params[:response_time]
      @body = params[:body]
      @error = params[:error]
      @fetched = !params[:code].nil?
      @user_data = OpenStruct.new
      @domain_aliases = params[:domain_aliases] ||= []
      @storable = true
      @fetched_at = params[:fetched_at]
    end

    def title
      doc.title if doc
    end

    #
    # Array of distinct A tag HREFs from the page
    #
    def links
      if @links.nil?
        @links = Set.new
        return [] unless doc

        doc.search('//a[@href]').each do |a|
          u = a['href']
          next if u.nil? || u.empty?
          abs = to_absolute(u) rescue next
          @links << abs if abs && in_domain?(abs)
        end
      end
      @links.to_a
    end

    #
    # Nokogiri document for the HTML body
    #
    def doc
      # return @doc if @doc
      # @body ||= ''
      # @body = @body.encode('utf-8', 'binary', :invalid => :replace,
      #                      :undef => :replace, :replace => '')
      # @doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8') if @body && html?
      @doc ||= begin
        Nokogiri::HTML(body) if !body.nil? && html? rescue nil
      end
    end

    #
    # Discard links, a next call of page.links will return an empty array
    #
    def discard_links!
      @links = []
    end

    #
    # Delete the Nokogiri document and response body to conserve memory
    #
    def discard_doc!
      links # force parsing of page links before we trash the document
      @doc = @body = nil
    end

    #
    # Was the page successfully fetched?
    # +true+ if the page was fetched with no error, +false+ otherwise.
    #
    def fetched?
      @fetched
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      headers['content-type'].first
    end

    #
    # Returns +true+ if the page is an HTML document, returns +false+
    # otherwise.
    #
    def html?
      content_type =~ %r{^(text/html|application/xhtml\+xml)\b}
    end

    #
    # Returns +true+ if the page is an HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300...400).include?(@code)
    end

    #
    # Returns +true+ if the page is an HTTP success, returns +false+
    # otherwise.
    #
    def success?
      (200..206).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Base URI from the HTML doc head element
    # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
    #
    def base
      @base = if doc
        href = doc.search('//head/base/@href')
        URI(href.to_s) unless href.nil? rescue nil
      end unless @base

      return nil if @base && @base.to_s.empty?
      @base
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    def to_absolute(link)
      return nil if link.nil?

      # link = link.to_s.encode('utf-8', 'binary', :invalid => :replace, :undef => :replace, :replace => '')

      # remove anchor
      link =
        begin
          URI.encode(URI.decode(link.gsub(/#[a-zA-Z0-9_-]*$/, '')))
        rescue URI::Error
          return nil
        end

      relative = begin
        URI(link)
      rescue URI::Error
        return nil
      end
      absolute = base ? base.merge(relative) : @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      @domain_aliases ||= []
      uri.host == @url.host || @domain_aliases.include?(uri.host)
    end

    def to_hash
      {
        'url' => @url.to_s,
        'headers' => Marshal.dump(@headers),
        'body' => @body,
        'links' => links.map(&:to_s),
        'code' => @code,
        'depth' => @depth,
        'referer' => @referer.to_s,
        'redirect_to' => @redirect_to.to_s,
        'response_time' => @response_time,
        'fetched' => @fetched,
        'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
        'fetched_at' => @fetched_at,
        'error' => @error.to_s
      }
    end

    def to_json
      th = to_hash.dup
      th.each { |k, v| th.delete(k) if v.nil? || (v.respond_to?(:empty?) && v.empty?) }
      th.delete('headers') if content_type.empty?
      th.to_json
    end

    #
    # Returns +true+ if page is marked as storable,
    # +false+ otherwise.
    # Default is +true+
    #
    def storable?
      @storable
    end

    def expired?(ttl)
      return false if fetched_at.nil?
      (Time.now.to_i - ttl) > fetched_at
    end

    def self.from_hash(hash)
      page = new(URI(hash['url']))
      {
        '@headers' => hash['headers'] && !hash['headers'].empty? ? Marshal.load(hash['headers']) : { 'content-type' => [''] },
        '@body' => hash['body'],
        '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
        '@code' => hash['code'].to_i,
        '@depth' => hash['depth'].to_i,
        '@referer' => hash['referer'],
        '@redirect_to' => (hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
        '@response_time' => hash['response_time'].to_i,
        '@fetched' => hash['fetched'],
        '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
        '@fetched_at' => hash['fetched_at'],
        '@error' => hash['error']
      }.each do |var, value|
        page.instance_variable_set(var, value)
      end
      page
    end

    def self.from_json(json)
      hash = JSON.parse json
      from_hash hash
    end
  end
end
```
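Pages can also be constructed by hand, which makes it possible to exercise parsing and link extraction without any network traffic, and they round-trip through to_hash/from_hash. A small sketch with inline sample HTML (the URL is a placeholder):

```ruby
require 'digger'

html = '<html><head><title>Hi</title></head>' \
       '<body><a href="/about">About</a></body></html>'

page = Digger::Page.new('http://example.com/',
                        body: html,
                        code: 200,
                        headers: { 'content-type' => ['text/html'] })

page.success? # => true
page.title    # => "Hi"
page.links    # => [URI("http://example.com/about")], same-host links only

restored = Digger::Page.from_hash(page.to_hash)
restored.url.to_s # => "http://example.com/"
```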
data/lib/digger/pattern.rb
ADDED
@@ -0,0 +1,91 @@
```ruby
require 'nokogiri'

module Digger
  class Pattern
    attr_accessor :type, :value, :block

    def initialize(hash = {})
      hash.each_pair{|key, value| send("#{key}=", value) if %w{type value block}.include?(key.to_s)}
    end

    def safe_block
      block && begin
        if block.respond_to?(:call)
          block
        elsif block.strip == ''
          nil
        else
          proc{ $SAFE = 2; eval block }.call
        end
      rescue StandardError
        nil
      end
    end

    def self.wrap(hash)
      Hash[hash.map{|key, value| [key, value.is_a?(Pattern) ? value : Pattern.new(value)]}]
    end

    MATCH_MAX = 3

    TYPES = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many css_one css_many}

    def regexp?
      TYPES.index(type) <= MATCH_MAX + 1 # match_many in addition
    end

    def match_page(page, &callback)
      blk = callback || safe_block
      if regexp? # regular expression
        index = TYPES.index(type)
        blk ||= ->(text){text.strip}
        # content is String
        if type == 'match_many'
          match = page.body.gsub(value).to_a
        else
          matches = page.body.match(value)
          match = matches.nil? ? nil : matches[index]
        end
      else # css expression
        blk ||= ->(node){node.content.strip}
        # content is Nokogiri::HTML::Document
        if type == 'css_one'
          match = page.doc.css(value).first
        elsif type == 'css_many'
          match = page.doc.css(value)
        end
      end
      if match.nil?
        nil
      elsif %w{css_many match_many}.include? type
        match.map{|node| blk.call(node) }.uniq
      else
        blk.call(match)
      end
    rescue
      nil
    end

    class Nokogiri::XML::Node
      %w{one many}.each do |name|
        define_method "inner_#{name}" do |css, &block|
          callback = ->(node) do
            if node
              (block || ->(n){n.text.strip}).call(node)
            else
              nil
            end
          end
          if name == 'one' # inner_one
            callback.call(self.css(css).first)
          else # inner_many
            self.css(css).map{|node| callback.call(node)}
          end
        end
      end

      def source
        to_xml
      end
    end # nokogiri
  end
end
```
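Patterns work standalone as well: the match_N types treat value as a regexp and return capture group N of the first match, match_many collects every match, and css_one/css_many run Nokogiri selectors against page.doc. A standalone sketch over inline sample text (the URL and version strings are made up):

```ruby
require 'digger'

page = Digger::Page.new('http://example.com/',
                        body: 'build 1.2.3 released; previous was 1.2.2',
                        code: 200)

# match_1 => first capture group of the first match
Digger::Pattern.new(type: 'match_1', value: /build (\d+\.\d+\.\d+)/)
  .match_page(page) # => "1.2.3"

# match_many => every match, de-duplicated
Digger::Pattern.new(type: 'match_many', value: /\d+\.\d+\.\d+/)
  .match_page(page) # => ["1.2.3", "1.2.2"]
```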
data/spec/digger_spec.rb
ADDED
@@ -0,0 +1,26 @@
```ruby
require 'digger'

http = Digger::HTTP.new
page = http.fetch_page('http://nan.so/')

pattern = Digger::Pattern.new(type: 'css_many', value: '.sites>a>span')

class Item < Digger::Model
  css_many sites: '.sites>a>span'
end

describe Digger do
  it "http should fetch a page" do
    expect(page.code).to eq(200)
  end

  it "pattern should match content" do
    sites = pattern.match_page(page)
    expect(sites.include?('百度网盘')).to eq(true)
  end

  it "model should dig content" do
    item = Item.new.match_page(page)
    expect(item[:sites].include?('读远')).to be(true)
  end
end
```
metadata
ADDED
@@ -0,0 +1,114 @@
```yaml
--- !ruby/object:Gem::Specification
name: digger
version: !ruby/object:Gem::Version
  version: 0.0.1
platform: ruby
authors:
- binz
autorequire:
bindir: bin
cert_chain: []
date: 2015-03-27 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.7'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.7'
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.0'
- !ruby/object:Gem::Dependency
  name: nokogiri
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.6'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.6'
- !ruby/object:Gem::Dependency
  name: http-cookie
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.0'
description: ''
email:
- xinkiang@gmail.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- ".gitignore"
- Gemfile
- LICENSE.txt
- README.md
- Rakefile
- digger.gemspec
- lib/digger.rb
- lib/digger/http.rb
- lib/digger/model.rb
- lib/digger/page.rb
- lib/digger/pattern.rb
- lib/digger/version.rb
- spec/digger_spec.rb
homepage: ''
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.2.2
signing_key:
specification_version: 4
summary: Dig needed structural information from web pages.
test_files:
- spec/digger_spec.rb
```