digger 0.0.1

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 71e41cc25211835ca901f56a0514d6d4f94326d9
+   data.tar.gz: e7f954b64cb216e5cda6391940576fe0f188553e
+ SHA512:
+   metadata.gz: 78a0717ae08e03a0325dc338411f3aae055a2f19c0849802d2aa8c0b17b6fabeacc98a9dea66142bf8f5058b08c6d4043244b65db944f28dc5f8317ccc641f4f
+   data.tar.gz: 186bfded593330616d7849dd519b8f47a5bd3e1ed2dce7bb33580dcbdb4e61bbb1eabb49bcf7a7c23d6cd93d56fed8e93cc41d366797914b55e83da7c0638437
data/.gitignore ADDED
@@ -0,0 +1,14 @@
+ /.bundle/
+ /.yardoc
+ /Gemfile.lock
+ /_yardoc/
+ /coverage/
+ /doc/
+ /pkg/
+ /spec/reports/
+ /tmp/
+ *.bundle
+ *.so
+ *.o
+ *.a
+ mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in digger.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2015 binz
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,31 @@
+ # Digger
+
+ TODO: Write a gem description
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'digger'
+ ```
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install digger
+
+ ## Usage
+
+ TODO: Write usage instructions here
+
+ ## Contributing
+
+ 1. Fork it ( https://github.com/[my-github-username]/digger/fork )
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
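The README's Usage section is still a TODO; the sketch below is inferred from spec/digger_spec.rb further down in this package (the URL and CSS selector are the ones the spec uses, not a documented API guarantee):

```ruby
require 'digger'

# Fetch a page (follows redirects, handles gzip/deflate).
page = Digger::HTTP.new.fetch_page('http://nan.so/')

# Match a single CSS pattern against the page...
pattern = Digger::Pattern.new(type: 'css_many', value: '.sites>a>span')
names = pattern.match_page(page)  # => array of stripped text contents

# ...or declare patterns on a model and dig a URL in one call.
class Item < Digger::Model
  css_many sites: '.sites>a>span'
end
Item.new.dig('http://nan.so/')    # => { sites: [...] }
```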
data/Rakefile ADDED
@@ -0,0 +1,2 @@
+ require "bundler/gem_tasks"
+
data/digger.gemspec ADDED
@@ -0,0 +1,26 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'digger/version'
+
+ Gem::Specification.new do |spec|
+   spec.name        = "digger"
+   spec.version     = Digger::VERSION
+   spec.authors     = ["binz"]
+   spec.email       = ["xinkiang@gmail.com"]
+   spec.summary     = %q{Dig structured information from web pages.}
+   spec.description = %q{}
+   spec.homepage    = ""
+   spec.license     = "MIT"
+
+   spec.files         = `git ls-files -z`.split("\x0")
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency "bundler", "~> 1.7"
+   spec.add_development_dependency "rake", "~> 10.0"
+
+   spec.add_runtime_dependency 'nokogiri', '~> 1.6'
+   spec.add_runtime_dependency 'http-cookie', '~> 1.0'
+ end
data/lib/digger.rb ADDED
@@ -0,0 +1,9 @@
+ require 'digger/version'
+ require 'digger/page'
+ require 'digger/http'
+ require 'digger/pattern'
+ require 'digger/model'
+
+ module Digger
+   #
+ end
data/lib/digger/http.rb ADDED
@@ -0,0 +1,284 @@
+ require 'net/http'
+ require 'http/cookie'
+ require 'stringio'
+ require 'zlib'
+ require 'digger/page'
+
+ # https://github.com/taganaka/polipus/blob/master/lib/polipus/http.rb
+
+ module Digger
+   class HTTP
+     # Maximum number of redirects to follow on each get_response
+     REDIRECT_LIMIT = 5
+     RESCUABLE_ERRORS = [
+       EOFError,
+       Errno::ECONNREFUSED,
+       Errno::ECONNRESET,
+       Errno::EHOSTUNREACH,
+       Errno::EINVAL,
+       Errno::EPIPE,
+       Errno::ETIMEDOUT,
+       Net::HTTPBadResponse,
+       Net::HTTPHeaderSyntaxError,
+       Net::ProtocolError,
+       SocketError,
+       Timeout::Error,
+       Zlib::DataError,
+       Zlib::GzipFile::Error
+     ]
+
+     def initialize(opts = {})
+       @connections = {}
+       @connections_hits = {}
+       @opts = opts
+     end
+
+     #
+     # Fetch a single Page from the response of an HTTP request to *url*.
+     # Just gets the final destination page.
+     #
+     def fetch_page(url, referer = nil, depth = nil)
+       fetch_pages(url, referer, depth).last
+     end
+
+     #
+     # Create new Pages from the response of an HTTP request to *url*,
+     # including redirects.
+     #
+     def fetch_pages(url, referer = nil, depth = nil)
+       url = URI(url)
+       pages = []
+       get(url, referer) do |response, code, location, redirect_to, response_time|
+         handle_compression response
+         pages << Page.new(location, body: response.body,
+                                     code: code,
+                                     headers: response.to_hash,
+                                     referer: referer,
+                                     depth: depth,
+                                     redirect_to: redirect_to,
+                                     response_time: response_time,
+                                     fetched_at: Time.now.to_i)
+       end
+
+       pages
+     rescue *RESCUABLE_ERRORS => e
+       if verbose?
+         puts e.inspect
+         puts e.backtrace
+       end
+
+       [Page.new(url, error: e, referer: referer, depth: depth)]
+     end
+
+     #
+     # The maximum number of redirects to follow
+     #
+     def redirect_limit
+       @opts[:redirect_limit] || REDIRECT_LIMIT
+     end
+
+     #
+     # The user-agent string which will be sent with each request,
+     # or nil if no such option is set
+     #
+     def user_agent
+       if @opts[:user_agent].respond_to?(:sample)
+         @opts[:user_agent].sample
+       else
+         @opts[:user_agent]
+       end
+     end
+
+     #
+     # The proxy address string
+     #
+     def proxy_host
+       @opts[:proxy_host].respond_to?(:call) ? @opts[:proxy_host].call(self) : @opts[:proxy_host]
+     end
+
+     #
+     # The proxy port
+     #
+     def proxy_port
+       @opts[:proxy_port].respond_to?(:call) ? @opts[:proxy_port].call(self) : @opts[:proxy_port]
+     end
+
+     #
+     # The proxy username
+     #
+     def proxy_user
+       @opts[:proxy_user].respond_to?(:call) ? @opts[:proxy_user].call(self) : @opts[:proxy_user]
+     end
+
+     #
+     # The proxy password
+     #
+     def proxy_pass
+       @opts[:proxy_pass].respond_to?(:call) ? @opts[:proxy_pass].call(self) : @opts[:proxy_pass]
+     end
+
+     #
+     # Shorthand to get proxy info with a single call.
+     # Returns an array of ['addr', port, 'user', 'pass']
+     #
+     def proxy_host_port
+       @opts[:proxy_host_port].respond_to?(:call) ? @opts[:proxy_host_port].call(self) : @opts[:proxy_host_port]
+     end
+
+     #
+     # HTTP read timeout in seconds
+     #
+     def read_timeout
+       @opts[:read_timeout]
+     end
+
+     #
+     # HTTP open timeout in seconds
+     #
+     def open_timeout
+       @opts[:open_timeout]
+     end
+
+     #
+     # Does this HTTP client accept cookies from the server?
+     #
+     def accept_cookies?
+       @opts[:accept_cookies]
+     end
+
+     def cookie_jar
+       @opts[:cookie_jar] ||= ::HTTP::CookieJar.new
+       @opts[:cookie_jar]
+     end
+
+     private
+
+     #
+     # Retrieve HTTP responses for *url*, including redirects.
+     # Yields the response object, response code, and URI location
+     # for each response.
+     #
+     def get(url, referer = nil)
+       limit = redirect_limit
+       loc = url
+       loop do
+         # if redirected to a relative url, merge it with the host of the
+         # original request url
+         loc = url.merge(loc) if loc.relative?
+
+         response, response_time = get_response(loc, referer)
+         code = Integer(response.code)
+         redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
+         yield response, code, loc, redirect_to, response_time
+         limit -= 1
+         break unless (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
+       end
+     end
+
+     #
+     # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
+     #
+     def get_response(url, referer = nil)
+       full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
+
+       opts = {}
+       opts['User-Agent'] = user_agent if user_agent
+       opts['Referer'] = referer.to_s if referer
+       opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
+       opts['Accept-Encoding'] = 'gzip,deflate'
+
+       retries = 0
+       begin
+         start = Time.now
+         # format request
+         req = Net::HTTP::Get.new(full_path, opts)
+         # HTTP Basic authentication
+         if @opts[:http_user]
+           req.basic_auth @opts[:http_user], @opts[:http_password]
+         end
+         # the URL's own auth scheme has higher priority
+         req.basic_auth url.user, url.password if url.user
+         response = connection(url).request(req)
+         finish = Time.now
+         response_time = ((finish - start) * 1000).round
+         cookie_jar.parse(response['Set-Cookie'], url) if accept_cookies? && response['Set-Cookie']
+         return response, response_time
+       rescue *RESCUABLE_ERRORS => e
+         puts e.inspect if verbose?
+         refresh_connection(url)
+         retries += 1
+         if retries < 3
+           retry
+         else
+           raise e
+         end
+       end
+     end
+
+     def connection(url)
+       @connections[url.host] ||= {}
+       @connections_hits[url.host] ||= {}
+
+       if @connections[url.host][url.port]
+         if @opts[:connection_max_hits] && @connections_hits[url.host][url.port] >= @opts[:connection_max_hits]
+           @opts[:logger].debug { "Connection #{url.host}:#{url.port} is stale, refreshing" } if @opts[:logger]
+           return refresh_connection url
+         end
+         @connections_hits[url.host][url.port] += 1
+         return @connections[url.host][url.port]
+       end
+
+       refresh_connection url
+     end
+
+     def refresh_connection(url)
+       if @opts[:logger] && proxy_host && proxy_port
+         @opts[:logger].debug { "Request #{url} using proxy: #{proxy_host}:#{proxy_port}" }
+       end
+
+       # proxy_host_port (possibly a callable) has higher priority than the
+       # individual proxy options
+       if @opts[:proxy_host_port].nil?
+         p_host = proxy_host
+         p_port = proxy_port
+         p_user = proxy_user
+         p_pass = proxy_pass
+       else
+         p_host, p_port, p_user, p_pass = proxy_host_port
+       end
+
+       http = Net::HTTP.new(url.host, url.port, p_host, p_port, p_user, p_pass)
+
+       http.read_timeout = read_timeout if read_timeout
+       http.open_timeout = open_timeout if open_timeout
+
+       if url.scheme == 'https'
+         http.use_ssl = true
+         http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+       end
+       @connections_hits[url.host][url.port] = 1
+       @connections[url.host][url.port] = http.start
+     end
+
+     def verbose?
+       @opts[:verbose]
+     end
+
+     #
+     # Allowed to connect to the requested url?
+     #
+     def allowed?(to_url, from_url)
+       to_url.host.nil? || (to_url.host == from_url.host)
+     end
+
+     def handle_compression(response)
+       case response['content-encoding']
+       when 'gzip', 'x-gzip'
+         body_io = StringIO.new(response.body)
+         response.body.replace Zlib::GzipReader.new(body_io).read
+       when 'deflate'
+         response.body.replace Zlib::Inflate.inflate(response.body)
+       end
+     end
+   end
+ end
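A sketch of the options Digger::HTTP reads from its opts hash (every key below appears in the class above; the values are illustrative):

```ruby
require 'digger'

http = Digger::HTTP.new(
  user_agent: 'digger/0.0.1',  # String, or anything responding to #sample
  redirect_limit: 3,           # overrides REDIRECT_LIMIT (default 5)
  read_timeout: 10,            # seconds, passed to Net::HTTP
  open_timeout: 5,             # seconds, passed to Net::HTTP
  accept_cookies: true,        # keep cookies in an HTTP::CookieJar
  verbose: true                # print rescued network errors
)

page  = http.fetch_page('http://example.com/')   # last page after redirects
pages = http.fetch_pages('http://example.com/')  # every hop, redirects included
```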
data/lib/digger/model.rb ADDED
@@ -0,0 +1,41 @@
+ require 'digger/pattern'
+
+ module Digger
+   class Model
+     @@patterns = {}
+
+     class << self
+       def pattern_config
+         @@patterns[self.name] ||= {}
+       end
+
+       # define one class method per pattern type, e.g. css_one, match_many
+       Pattern::TYPES.each do |method|
+         define_method method, ->(pairs, &block){
+           pairs.each_pair do |key, value|
+             pattern_config[key] = Pattern.new(type: method, value: value, block: block)
+           end
+         }
+       end
+
+       def index_page
+       end
+
+       def one_page
+       end
+     end
+
+     def match_page(page)
+       result = {}
+       self.class.pattern_config.each_pair do |key, pattern|
+         result[key] = pattern.match_page(page)
+       end
+       result
+     end
+
+     def dig(url)
+       client = Digger::HTTP.new
+       page = client.fetch_page(url)
+       match_page(page)
+     end
+   end
+ end
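The generated class methods also accept a block, which Pattern stores and applies to each match in place of the default strip. A small sketch (the class name and selectors are illustrative):

```ruby
require 'digger'

class Product < Digger::Model
  css_one title: 'h1'
  # the block receives each matched Nokogiri node
  css_many prices: '.price' do |node|
    node.text.strip.sub(/^\$/, '').to_f
  end
end

# match_page works on an already-fetched page; dig fetches first:
# page = Digger::HTTP.new.fetch_page('http://example.com/')
# Product.new.match_page(page)  # => { title: "...", prices: [9.99, ...] }
```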
data/lib/digger/page.rb ADDED
@@ -0,0 +1,279 @@
+ require 'nokogiri'
+ require 'json'
+ require 'ostruct'
+ require 'set'
+ require 'kconv'
+
+ # https://github.com/taganaka/polipus/blob/master/lib/polipus/page.rb
+ module Digger
+   class Page
+     attr_reader :url
+     # The raw HTTP response body of the page
+     attr_reader :body
+     # Headers of the HTTP response
+     attr_reader :headers
+     # URL of the page this one redirected to, if any
+     attr_reader :redirect_to
+     # Exception object, if one was raised during HTTP#fetch_page
+     attr_reader :error
+     # Integer response code of the page
+     attr_accessor :code
+     # Depth of this page from the root of the crawl.
+     attr_accessor :depth
+     # URL of the page that brought us to this page
+     attr_accessor :referer
+     # Response time of the request for this page in milliseconds
+     attr_accessor :response_time
+     # OpenStruct holding user-defined data
+     attr_accessor :user_data
+
+     attr_accessor :aliases
+
+     attr_accessor :domain_aliases
+
+     # Whether the current page should be stored
+     # Default: true
+     attr_accessor :storable
+
+     attr_accessor :fetched_at
+
+     #
+     # Create a new page
+     #
+     def initialize(url, params = {})
+       @url = URI(url)
+       @code = params[:code]
+       @headers = params[:headers] || {}
+       @headers['content-type'] ||= ['']
+       @aliases = Array(params[:aka]).compact
+       @referer = params[:referer]
+       @depth = params[:depth] || 0
+       @redirect_to = to_absolute(params[:redirect_to])
+       @response_time = params[:response_time]
+       @body = params[:body]
+       @error = params[:error]
+       @fetched = !params[:code].nil?
+       @user_data = OpenStruct.new
+       @domain_aliases = params[:domain_aliases] ||= []
+       @storable = true
+       @fetched_at = params[:fetched_at]
+     end
+
+     def title
+       doc.title if doc
+     end
+
+     #
+     # Array of distinct A tag HREFs from the page
+     #
+     def links
+       if @links.nil?
+         @links = Set.new
+         return [] unless doc
+
+         doc.search('//a[@href]').each do |a|
+           u = a['href']
+           next if u.nil? || u.empty?
+           abs = to_absolute(u) rescue next
+           @links << abs if abs && in_domain?(abs)
+         end
+       end
+       @links.to_a
+     end
+
+     #
+     # Nokogiri document for the HTML body
+     #
+     def doc
+       @doc ||= begin
+         Nokogiri::HTML(body) if !body.nil? && html? rescue nil
+       end
+     end
+
+     #
+     # Discard links; the next call of page.links will return an empty array
+     #
+     def discard_links!
+       @links = []
+     end
+
+     #
+     # Delete the Nokogiri document and response body to conserve memory
+     #
+     def discard_doc!
+       links # force parsing of page links before we trash the document
+       @doc = @body = nil
+     end
+
+     #
+     # Was the page successfully fetched?
+     # +true+ if the page was fetched with no error, +false+ otherwise.
+     #
+     def fetched?
+       @fetched
+     end
+
+     #
+     # The content-type returned by the HTTP request for this page
+     #
+     def content_type
+       headers['content-type'].first
+     end
+
+     #
+     # Returns +true+ if the page is an HTML document, returns +false+
+     # otherwise.
+     #
+     def html?
+       content_type =~ %r{^(text/html|application/xhtml\+xml)\b}
+     end
+
+     #
+     # Returns +true+ if the page is an HTTP redirect, returns +false+
+     # otherwise.
+     #
+     def redirect?
+       (300...400).include?(@code)
+     end
+
+     #
+     # Returns +true+ if the page is an HTTP success, returns +false+
+     # otherwise.
+     #
+     def success?
+       (200..206).include?(@code)
+     end
+
+     #
+     # Returns +true+ if the page was not found (returned 404 code),
+     # returns +false+ otherwise.
+     #
+     def not_found?
+       404 == @code
+     end
+
+     #
+     # Base URI from the HTML doc head element
+     # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
+     #
+     def base
+       @base = if doc
+         href = doc.search('//head/base/@href')
+         URI(href.to_s) unless href.nil? rescue nil
+       end unless @base
+
+       return nil if @base && @base.to_s.empty?
+       @base
+     end
+
+     #
+     # Converts relative URL *link* into an absolute URL based on the
+     # location of the page
+     #
+     def to_absolute(link)
+       return nil if link.nil?
+
+       # remove anchor
+       link =
+         begin
+           URI.encode(URI.decode(link.gsub(/#[a-zA-Z0-9_-]*$/, '')))
+         rescue URI::Error
+           return nil
+         end
+
+       relative = begin
+         URI(link)
+       rescue URI::Error
+         return nil
+       end
+       absolute = base ? base.merge(relative) : @url.merge(relative)
+
+       absolute.path = '/' if absolute.path.empty?
+
+       absolute
+     end
+
+     #
+     # Returns +true+ if *uri* is in the same domain as the page, returns
+     # +false+ otherwise
+     #
+     def in_domain?(uri)
+       @domain_aliases ||= []
+       uri.host == @url.host || @domain_aliases.include?(uri.host)
+     end
+
+     def to_hash
+       {
+         'url' => @url.to_s,
+         'headers' => Marshal.dump(@headers),
+         'body' => @body,
+         'links' => links.map(&:to_s),
+         'code' => @code,
+         'depth' => @depth,
+         'referer' => @referer.to_s,
+         'redirect_to' => @redirect_to.to_s,
+         'response_time' => @response_time,
+         'fetched' => @fetched,
+         'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
+         'fetched_at' => @fetched_at,
+         'error' => @error.to_s
+       }
+     end
+
+     def to_json
+       th = to_hash.dup
+       th.each { |k, v| th.delete(k) if v.nil? || (v.respond_to?(:empty?) && v.empty?) }
+       th.delete('headers') if content_type.empty?
+       th.to_json
+     end
+
+     #
+     # Returns +true+ if page is marked as storable,
+     # +false+ otherwise.
+     # Default is +true+
+     #
+     def storable?
+       @storable
+     end
+
+     def expired?(ttl)
+       return false if fetched_at.nil?
+       (Time.now.to_i - ttl) > fetched_at
+     end
+
+     def self.from_hash(hash)
+       page = new(URI(hash['url']))
+       {
+         '@headers' => hash['headers'] && !hash['headers'].empty? ? Marshal.load(hash['headers']) : { 'content-type' => [''] },
+         '@body' => hash['body'],
+         '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
+         '@code' => hash['code'].to_i,
+         '@depth' => hash['depth'].to_i,
+         '@referer' => hash['referer'],
+         '@redirect_to' => (hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
+         '@response_time' => hash['response_time'].to_i,
+         '@fetched' => hash['fetched'],
+         '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
+         '@fetched_at' => hash['fetched_at'],
+         '@error' => hash['error']
+       }.each do |var, value|
+         page.instance_variable_set(var, value)
+       end
+       page
+     end
+
+     def self.from_json(json)
+       hash = JSON.parse json
+       from_hash hash
+     end
+   end
+ end
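For reference, the Page predicates and the hash round-trip in a usage sketch (not part of the package; the URL is illustrative and `page` is assumed to come from Digger::HTTP#fetch_page):

```ruby
require 'digger'

page = Digger::HTTP.new.fetch_page('http://example.com/')

page.success?   # true for 2xx response codes
page.redirect?  # true for 3xx response codes
page.html?      # checks the content-type header
page.links      # absolute, same-domain <a href> URIs as an Array
page.title      # <title> text via Nokogiri; nil for non-HTML bodies

copy = Digger::Page.from_hash(page.to_hash)  # serialize and restore
copy.url.to_s == page.url.to_s               # => true
```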
data/lib/digger/pattern.rb ADDED
@@ -0,0 +1,91 @@
+ require 'nokogiri'
+
+ module Digger
+   class Pattern
+     attr_accessor :type, :value, :block
+
+     def initialize(hash = {})
+       hash.each_pair{|key, value| send("#{key}=", value) if %w{type value block}.include?(key.to_s)}
+     end
+
+     def safe_block
+       block && begin
+         if block.respond_to?(:call)
+           block
+         elsif block.strip == ''
+           nil
+         else
+           proc{ $SAFE = 2; eval block }.call
+         end
+       rescue StandardError
+         nil
+       end
+     end
+
+     def self.wrap(hash)
+       Hash[hash.map{|key, value| [key, value.is_a?(Pattern) ? value : Pattern.new(value)]}]
+     end
+
+     MATCH_MAX = 3
+
+     TYPES = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many css_one css_many}
+
+     def regexp?
+       TYPES.index(type) <= MATCH_MAX + 1 # match_many counts as a regexp type too
+     end
+
+     def match_page(page, &callback)
+       blk = callback || safe_block
+       if regexp? # regular expression
+         index = TYPES.index(type)
+         blk ||= ->(text){ text.strip }
+         # content is a String
+         if type == 'match_many'
+           match = page.body.gsub(value).to_a
+         else
+           matches = page.body.match(value)
+           match = matches.nil? ? nil : matches[index]
+         end
+       else # CSS expression
+         blk ||= ->(node){ node.content.strip }
+         # content is a Nokogiri::HTML::Document
+         if type == 'css_one'
+           match = page.doc.css(value).first
+         elsif type == 'css_many'
+           match = page.doc.css(value)
+         end
+       end
+       if match.nil?
+         nil
+       elsif %w{css_many match_many}.include? type
+         match.map{|node| blk.call(node) }.uniq
+       else
+         blk.call(match)
+       end
+     rescue
+       nil
+     end
+
+     # reopens the top-level Nokogiri::XML::Node
+     class Nokogiri::XML::Node
+       %w{one many}.each do |name|
+         define_method "inner_#{name}" do |css, &block|
+           callback = ->(node) do
+             if node
+               (block || ->(n){ n.text.strip }).call(node)
+             else
+               nil
+             end
+           end
+           if name == 'one' # inner_one
+             callback.call(self.css(css).first)
+           else # inner_many
+             self.css(css).map{|node| callback.call(node)}
+           end
+         end
+       end
+
+       def source
+         to_xml
+       end
+     end # Nokogiri::XML::Node
+   end
+ end
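A quick tour of the pattern types defined above: match_0..match_3 return the corresponding regexp capture group, match_many collects every match, and css_one/css_many evaluate CSS selectors against the Nokogiri document. The URL and expressions below are illustrative:

```ruby
require 'digger'

page = Digger::HTTP.new.fetch_page('http://example.com/')

# match_1 yields capture group 1 of the first match (match_0 the whole match)
year = Digger::Pattern.new(type: 'match_1', value: /Copyright (\d{4})/)
year.match_page(page)   # => e.g. "2015", or nil when nothing matches

# match_many collects every whole match, de-duplicated
urls = Digger::Pattern.new(type: 'match_many', value: /https?:\/\/\S+/)
urls.match_page(page)   # => array of URL-like strings

# css_one returns the first matching node's stripped text
title = Digger::Pattern.new(type: 'css_one', value: 'title')
title.match_page(page)  # => e.g. "Example Domain"
```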
data/lib/digger/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Digger
+   VERSION = "0.0.1"
+ end
data/spec/digger_spec.rb ADDED
@@ -0,0 +1,26 @@
+ require 'digger'
+
+ http = Digger::HTTP.new
+ page = http.fetch_page('http://nan.so/')
+
+ pattern = Digger::Pattern.new(type: 'css_many', value: '.sites>a>span')
+
+ class Item < Digger::Model
+   css_many sites: '.sites>a>span'
+ end
+
+ describe Digger do
+   it "http should fetch a page" do
+     expect(page.code).to eq(200)
+   end
+
+   it "pattern should match content" do
+     sites = pattern.match_page(page)
+     expect(sites.include?('百度网盘')).to eq(true)
+   end
+
+   it "model should dig content" do
+     item = Item.new.match_page(page)
+     expect(item[:sites].include?('读远')).to be(true)
+   end
+ end
metadata ADDED
@@ -0,0 +1,114 @@
+ --- !ruby/object:Gem::Specification
+ name: digger
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - binz
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2015-03-27 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.7'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.7'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+ - !ruby/object:Gem::Dependency
+   name: http-cookie
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.0'
+ description: ''
+ email:
+ - xinkiang@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".gitignore"
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - digger.gemspec
+ - lib/digger.rb
+ - lib/digger/http.rb
+ - lib/digger/model.rb
+ - lib/digger/page.rb
+ - lib/digger/pattern.rb
+ - lib/digger/version.rb
+ - spec/digger_spec.rb
+ homepage: ''
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: Dig structured information from web pages.
+ test_files:
+ - spec/digger_spec.rb