digger 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 71e41cc25211835ca901f56a0514d6d4f94326d9
+  data.tar.gz: e7f954b64cb216e5cda6391940576fe0f188553e
+SHA512:
+  metadata.gz: 78a0717ae08e03a0325dc338411f3aae055a2f19c0849802d2aa8c0b17b6fabeacc98a9dea66142bf8f5058b08c6d4043244b65db944f28dc5f8317ccc641f4f
+  data.tar.gz: 186bfded593330616d7849dd519b8f47a5bd3e1ed2dce7bb33580dcbdb4e61bbb1eabb49bcf7a7c23d6cd93d56fed8e93cc41d366797914b55e83da7c0638437
data/.gitignore ADDED
@@ -0,0 +1,14 @@
+/.bundle/
+/.yardoc
+/Gemfile.lock
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/
+*.bundle
+*.so
+*.o
+*.a
+mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+
+# Specify your gem's dependencies in digger.gemspec
+gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2015 binz
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,31 @@
+# Digger
+
+TODO: Write a gem description
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+```ruby
+gem 'digger'
+```
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install digger
+
+## Usage
+
+TODO: Write usage instructions here
+
+## Contributing
+
+1. Fork it ( https://github.com/[my-github-username]/digger/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create a new Pull Request
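
Since the README's Usage section is still a TODO, here is a minimal quickstart sketch inferred from the gem's own spec file; the class name, selector, and URL are placeholders, not part of the package:

```ruby
require 'digger'

# Declare what to extract: css_many collects the text of every node
# matching the CSS selector and stores it under the given key.
class Catalog < Digger::Model
  css_many sites: '.sites>a>span'  # hypothetical selector
end

# dig fetches the page, runs every declared pattern, and returns a hash.
result = Catalog.new.dig('http://example.com/')  # placeholder URL
puts result[:sites].inspect
```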
data/Rakefile ADDED
@@ -0,0 +1,2 @@
+require "bundler/gem_tasks"
+
data/digger.gemspec ADDED
@@ -0,0 +1,26 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'digger/version'
+
+Gem::Specification.new do |spec|
+  spec.name          = "digger"
+  spec.version       = Digger::VERSION
+  spec.authors       = ["binz"]
+  spec.email         = ["xinkiang@gmail.com"]
+  spec.summary       = %q{Dig structural information from web pages.}
+  spec.description   = %q{}
+  spec.homepage      = ""
+  spec.license       = "MIT"
+
+  spec.files         = `git ls-files -z`.split("\x0")
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+
+  spec.add_development_dependency "bundler", "~> 1.7"
+  spec.add_development_dependency "rake", "~> 10.0"
+
+  spec.add_runtime_dependency 'nokogiri', '~> 1.6'
+  spec.add_runtime_dependency 'http-cookie', '~> 1.0'
+end
data/lib/digger.rb ADDED
@@ -0,0 +1,9 @@
+require 'digger/version'
+require 'digger/page'
+require 'digger/http'
+require 'digger/pattern'
+require 'digger/model'
+
+module Digger
+  #
+end
data/lib/digger/http.rb ADDED
@@ -0,0 +1,284 @@
+require 'net/http'
+require 'http/cookie'
+require 'zlib'
+require 'digger/page'
+
+
+# https://github.com/taganaka/polipus/blob/master/lib/polipus/http.rb
+
+module Digger
+  class HTTP
+    # Maximum number of redirects to follow on each get_response
+    REDIRECT_LIMIT = 5
+    RESCUABLE_ERRORS = [
+      EOFError,
+      Errno::ECONNREFUSED,
+      Errno::ECONNRESET,
+      Errno::EHOSTUNREACH,
+      Errno::EINVAL,
+      Errno::EPIPE,
+      Errno::ETIMEDOUT,
+      Net::HTTPBadResponse,
+      Net::HTTPHeaderSyntaxError,
+      Net::ProtocolError,
+      SocketError,
+      Timeout::Error,
+      Zlib::DataError,
+      Zlib::GzipFile::Error
+    ]
+
+    def initialize(opts = {})
+      @connections = {}
+      @connections_hits = {}
+      @opts = opts
+    end
+
+    #
+    # Fetch a single Page from the response of an HTTP request to *url*.
+    # Just gets the final destination page.
+    #
+    def fetch_page(url, referer = nil, depth = nil)
+      fetch_pages(url, referer, depth).last
+    end
+
+    #
+    # Create new Pages from the response of an HTTP request to *url*,
+    # including redirects
+    #
+    def fetch_pages(url, referer = nil, depth = nil)
+      url = URI(url)
+      pages = []
+      get(url, referer) do |response, code, location, redirect_to, response_time|
+        handle_compression response
+        pages << Page.new(location, body: response.body,
+                                    code: code,
+                                    headers: response.to_hash,
+                                    referer: referer,
+                                    depth: depth,
+                                    redirect_to: redirect_to,
+                                    response_time: response_time,
+                                    fetched_at: Time.now.to_i)
+      end
+
+      pages
+    rescue *RESCUABLE_ERRORS => e
+      if verbose?
+        puts e.inspect
+        puts e.backtrace
+      end
+
+      [Page.new(url, error: e, referer: referer, depth: depth)]
+    end
+
+    #
+    # The maximum number of redirects to follow
+    #
+    def redirect_limit
+      @opts[:redirect_limit] || REDIRECT_LIMIT
+    end
+
+    #
+    # The user-agent string which will be sent with each request,
+    # or nil if no such option is set
+    #
+    def user_agent
+      if @opts[:user_agent].respond_to?(:sample)
+        @opts[:user_agent].sample
+      else
+        @opts[:user_agent]
+      end
+    end
+
+    #
+    # The proxy address string
+    #
+    def proxy_host
+      @opts[:proxy_host].respond_to?(:call) ? @opts[:proxy_host].call(self) : @opts[:proxy_host]
+    end
+
+    #
+    # The proxy port
+    #
+    def proxy_port
+      @opts[:proxy_port].respond_to?(:call) ? @opts[:proxy_port].call(self) : @opts[:proxy_port]
+    end
+
+    #
+    # The proxy username
+    #
+    def proxy_user
+      @opts[:proxy_user].respond_to?(:call) ? @opts[:proxy_user].call(self) : @opts[:proxy_user]
+    end
+
+    #
+    # The proxy password
+    #
+    def proxy_pass
+      # return proxy_host_port[3] unless @opts[:proxy_host_port].nil?
+      @opts[:proxy_pass].respond_to?(:call) ? @opts[:proxy_pass].call(self) : @opts[:proxy_pass]
+    end
+
+    #
+    # Shorthand to get proxy info with a single call.
+    # It returns an array of ['addr', port, 'user', 'pass']
+    #
+    def proxy_host_port
+      @opts[:proxy_host_port].respond_to?(:call) ? @opts[:proxy_host_port].call(self) : @opts[:proxy_host_port]
+    end
+
+    #
+    # HTTP read timeout in seconds
+    #
+    def read_timeout
+      @opts[:read_timeout]
+    end
+
+    #
+    # HTTP open timeout in seconds
+    #
+    def open_timeout
+      @opts[:open_timeout]
+    end
+
+    #
+    # Does this HTTP client accept cookies from the server?
+    #
+    def accept_cookies?
+      @opts[:accept_cookies]
+    end
+
+    def cookie_jar
+      @opts[:cookie_jar] ||= ::HTTP::CookieJar.new
+      @opts[:cookie_jar]
+    end
+
+    private
+
+    #
+    # Retrieve HTTP responses for *url*, including redirects.
+    # Yields the response object, response code, and URI location
+    # for each response.
+    #
+    def get(url, referer = nil)
+      limit = redirect_limit
+      loc = url
+      loop do
+        # if redirected to a relative url, merge it with the host of the
+        # original request url
+        loc = url.merge(loc) if loc.relative?
+
+        response, response_time = get_response(loc, referer)
+        code = Integer(response.code)
+        redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
+        yield response, code, loc, redirect_to, response_time
+        limit -= 1
+        break unless (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
+      end
+    end
+
+    #
+    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
+    #
+    def get_response(url, referer = nil)
+      full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
+
+      opts = {}
+      opts['User-Agent'] = user_agent if user_agent
+      opts['Referer'] = referer.to_s if referer
+      opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
+      opts['Accept-Encoding'] = 'gzip,deflate'
+
+      retries = 0
+      begin
+        start = Time.now
+        # format request
+        req = Net::HTTP::Get.new(full_path, opts)
+        # HTTP Basic authentication
+        req.basic_auth url.user, url.password if url.user
+        if @opts[:http_user]
+          req.basic_auth @opts[:http_user], @opts[:http_password]
+        end
+        # the url's own auth scheme has higher priority
+        req.basic_auth url.user, url.password if url.user
+        response = connection(url).request(req)
+        finish = Time.now
+        response_time = ((finish - start) * 1000).round
+        cookie_jar.parse(response['Set-Cookie'], url) if accept_cookies? && response['Set-Cookie']
+        return response, response_time
+      rescue *RESCUABLE_ERRORS => e
+        puts e.inspect if verbose?
+        refresh_connection(url)
+        retries += 1
+        if retries < 3
+          retry
+        else
+          raise e
+        end
+      end
+    end
+
+    def connection(url)
+      @connections[url.host] ||= {}
+      @connections_hits[url.host] ||= {}
+
+      if @connections[url.host][url.port]
+        if @opts[:connection_max_hits] && @connections_hits[url.host][url.port] >= @opts[:connection_max_hits]
+          @opts[:logger].debug { "Connection #{url.host}:#{url.port} is stale, refreshing" } if @opts[:logger]
+          return refresh_connection url
+        end
+        @connections_hits[url.host][url.port] += 1
+        return @connections[url.host][url.port]
+      end
+
+      refresh_connection url
+    end
+
+    def refresh_connection(url)
+      if @opts[:logger] && proxy_host && proxy_port
+        @opts[:logger].debug { "Request #{url} using proxy: #{proxy_host}:#{proxy_port}" }
+      end
+
+      # The :proxy_host_port shorthand has higher priority
+      if @opts[:proxy_host_port].nil?
+        p_host = proxy_host
+        p_port = proxy_port
+        p_user = proxy_user
+        p_pass = proxy_pass
+      else
+        p_host, p_port, p_user, p_pass = proxy_host_port
+      end
+
+      http = Net::HTTP.new(url.host, url.port, p_host, p_port, p_user, p_pass)
+
+      http.read_timeout = read_timeout if read_timeout
+      http.open_timeout = open_timeout if open_timeout
+
+      if url.scheme == 'https'
+        http.use_ssl = true
+        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+      end
+      @connections_hits[url.host][url.port] = 1
+      @connections[url.host][url.port] = http.start
+    end
+
+    def verbose?
+      @opts[:verbose]
+    end
+
+    #
+    # Allowed to connect to the requested url?
+    #
+    def allowed?(to_url, from_url)
+      to_url.host.nil? || (to_url.host == from_url.host)
+    end
+
+    def handle_compression(response)
+      case response['content-encoding']
+      when 'gzip', 'x-gzip'
+        body_io = StringIO.new(response.body)
+        response.body.replace Zlib::GzipReader.new(body_io).read
+      when 'deflate'
+        response.body.replace Zlib::Inflate.inflate(response.body)
+      end
+    end
+  end
+end
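
For orientation, a minimal sketch of using this client directly; the options shown are the ones the class reads from `@opts`, and the URL is a placeholder:

```ruby
require 'digger'

# Each fetch follows redirects (up to :redirect_limit, default 5) and
# returns the final Digger::Page.
http = Digger::HTTP.new(
  user_agent: 'digger-example',  # an array works too; one entry is sampled per request
  read_timeout: 10,
  accept_cookies: true
)

page = http.fetch_page('http://example.com/')  # placeholder URL
puts page.code
puts page.title if page.success?
```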
data/lib/digger/model.rb ADDED
@@ -0,0 +1,41 @@
+require 'digger/pattern'
+
+module Digger
+  class Model
+    @@patterns = {}
+
+    class << self
+      def pattern_config
+        @@patterns[self.name] ||= {}
+      end
+
+      Pattern::TYPES.each do |method|
+        define_method method, ->(pairs, &block){
+          pairs.each_pair do |key, value|
+            pattern_config[key] = Pattern.new(type: method, value: value, block: block)
+          end
+        }
+      end
+
+      def index_page
+      end
+
+      def one_page
+      end
+    end
+
+    def match_page(page)
+      result = {}
+      self.class.pattern_config.each_pair do |key, pattern|
+        result[key] = pattern.match_page(page)
+      end
+      result
+    end
+
+    def dig(url)
+      client = Digger::HTTP.new
+      page = client.fetch_page(url)
+      match_page(page)
+    end
+  end
+end
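
Each generated class method (`css_one`, `css_many`, `match_0`..`match_3`, `match_many`) stores one `Pattern` per key, and an optional block post-processes every match. A hypothetical sketch (the selector and class name are illustrative only):

```ruby
require 'digger'

class Prices < Digger::Model
  css_many prices: 'span.price' do |node|  # placeholder selector
    node.text.sub(/^\$/, '').to_f          # block maps each matched node
  end
end

# dig fetches the URL and runs every stored pattern; match_page does the
# same against an already fetched Digger::Page.
# Prices.new.dig('http://example.com/')  # => { prices: [9.99, ...] }
```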
data/lib/digger/page.rb ADDED
@@ -0,0 +1,279 @@
+require 'nokogiri'
+require 'json'
+require 'ostruct'
+require 'set'
+require 'kconv'
+
+# https://github.com/taganaka/polipus/blob/master/lib/polipus/page.rb
+module Digger
+  class Page
+    attr_reader :url
+    # The raw HTTP response body of the page
+    attr_reader :body
+    # Headers of the HTTP response
+    attr_reader :headers
+    # URL of the page this one redirected to, if any
+    attr_reader :redirect_to
+    # Exception object, if one was raised during HTTP#fetch_page
+    attr_reader :error
+    # Integer response code of the page
+    attr_accessor :code
+    # Depth of this page from the root of the crawl.
+    attr_accessor :depth
+    # URL of the page that brought us to this page
+    attr_accessor :referer
+    # Response time of the request for this page in milliseconds
+    attr_accessor :response_time
+    # OpenStruct holding user-defined data
+    attr_accessor :user_data
+
+    attr_accessor :aliases
+
+    attr_accessor :domain_aliases
+
+    # Whether the current page should be stored
+    # Default: true
+    attr_accessor :storable
+
+    attr_accessor :fetched_at
+
+    #
+    # Create a new page
+    #
+    def initialize(url, params = {})
+      @url = URI(url)
+      @code = params[:code]
+      @headers = params[:headers] || {}
+      @headers['content-type'] ||= ['']
+      @aliases = Array(params[:aka]).compact
+      @referer = params[:referer]
+      @depth = params[:depth] || 0
+      @redirect_to = to_absolute(params[:redirect_to])
+      @response_time = params[:response_time]
+      @body = params[:body]
+      @error = params[:error]
+      @fetched = !params[:code].nil?
+      @user_data = OpenStruct.new
+      @domain_aliases = params[:domain_aliases] ||= []
+      @storable = true
+      @fetched_at = params[:fetched_at]
+    end
+
+    def title
+      doc.title if doc
+    end
+
+    #
+    # Array of distinct A tag HREFs from the page
+    #
+    def links
+      if @links.nil?
+        @links = Set.new
+        return [] unless doc
+
+        doc.search('//a[@href]').each do |a|
+          u = a['href']
+          next if u.nil? || u.empty?
+          abs = to_absolute(u) rescue next
+          @links << abs if abs && in_domain?(abs)
+        end
+      end
+      @links.to_a
+    end
+
+    #
+    # Nokogiri document for the HTML body
+    #
+    def doc
+      # return @doc if @doc
+      # @body ||= ''
+      # @body = @body.encode('utf-8', 'binary', :invalid => :replace,
+      #                      :undef => :replace, :replace => '')
+      # @doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8') if @body && html?
+      @doc ||= begin
+        Nokogiri::HTML(body) if !body.nil? && html? rescue nil
+      end
+    end
+
+    #
+    # Discard links; the next call of page.links will return an empty array
+    #
+    def discard_links!
+      @links = []
+    end
+
+    #
+    # Delete the Nokogiri document and response body to conserve memory
+    #
+    def discard_doc!
+      links # force parsing of page links before we trash the document
+      @doc = @body = nil
+    end
+
+    #
+    # Was the page successfully fetched?
+    # +true+ if the page was fetched with no error, +false+ otherwise.
+    #
+    def fetched?
+      @fetched
+    end
+
+    #
+    # The content-type returned by the HTTP request for this page
+    #
+    def content_type
+      headers['content-type'].first
+    end
+
+    #
+    # Returns +true+ if the page is an HTML document, returns +false+
+    # otherwise.
+    #
+    def html?
+      content_type =~ %r{^(text/html|application/xhtml\+xml)\b}
+    end
+
+    #
+    # Returns +true+ if the page is an HTTP redirect, returns +false+
+    # otherwise.
+    #
+    def redirect?
+      (300...400).include?(@code)
+    end
+
+    #
+    # Returns +true+ if the page is an HTTP success, returns +false+
+    # otherwise.
+    #
+    def success?
+      (200..206).include?(@code)
+    end
+
+    #
+    # Returns +true+ if the page was not found (returned 404 code),
+    # returns +false+ otherwise.
+    #
+    def not_found?
+      404 == @code
+    end
+
+    #
+    # Base URI from the HTML doc head element
+    # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
+    #
+    def base
+      @base = if doc
+                href = doc.search('//head/base/@href')
+                URI(href.to_s) unless href.nil? rescue nil
+              end unless @base
+
+      return nil if @base && @base.to_s.empty?
+      @base
+    end
+
+    #
+    # Converts relative URL *link* into an absolute URL based on the
+    # location of the page
+    #
+    def to_absolute(link)
+      return nil if link.nil?
+
+      # link = link.to_s.encode('utf-8', 'binary', :invalid => :replace, :undef => :replace, :replace => '')
+
+      # remove anchor
+      link =
+        begin
+          URI.encode(URI.decode(link.gsub(/#[a-zA-Z0-9_-]*$/, '')))
+        rescue URI::Error
+          return nil
+        end
+
+      relative = begin
+        URI(link)
+      rescue URI::Error
+        return nil
+      end
+      absolute = base ? base.merge(relative) : @url.merge(relative)
+
+      absolute.path = '/' if absolute.path.empty?
+
+      absolute
+    end
+
+    #
+    # Returns +true+ if *uri* is in the same domain as the page, returns
+    # +false+ otherwise
+    #
+    def in_domain?(uri)
+      @domain_aliases ||= []
+      uri.host == @url.host || @domain_aliases.include?(uri.host)
+    end
+
+    def to_hash
+      {
+        'url' => @url.to_s,
+        'headers' => Marshal.dump(@headers),
+        'body' => @body,
+        'links' => links.map(&:to_s),
+        'code' => @code,
+        'depth' => @depth,
+        'referer' => @referer.to_s,
+        'redirect_to' => @redirect_to.to_s,
+        'response_time' => @response_time,
+        'fetched' => @fetched,
+        'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
+        'fetched_at' => @fetched_at,
+        'error' => @error.to_s
+      }
+    end
+
+    def to_json
+      th = to_hash.dup
+      th.each { |k, v| th.delete(k) if v.nil? || (v.respond_to?(:empty?) && v.empty?) }
+      th.delete('headers') if content_type.empty?
+      th.to_json
+    end
+
+    #
+    # Returns +true+ if page is marked as storable,
+    # +false+ otherwise.
+    # Default is +true+
+    #
+    def storable?
+      @storable
+    end
+
+    def expired?(ttl)
+      return false if fetched_at.nil?
+      (Time.now.to_i - ttl) > fetched_at
+    end
+
+    def self.from_hash(hash)
+      page = new(URI(hash['url']))
+      {
+        '@headers' => hash['headers'] && !hash['headers'].empty? ? Marshal.load(hash['headers']) : { 'content-type' => [''] },
+        '@body' => hash['body'],
+        '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
+        '@code' => hash['code'].to_i,
+        '@depth' => hash['depth'].to_i,
+        '@referer' => hash['referer'],
+        '@redirect_to' => (hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
+        '@response_time' => hash['response_time'].to_i,
+        '@fetched' => hash['fetched'],
+        '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
+        '@fetched_at' => hash['fetched_at'],
+        '@error' => hash['error']
+      }.each do |var, value|
+        page.instance_variable_set(var, value)
+      end
+      page
+    end
+
+    def self.from_json(json)
+      hash = JSON.parse json
+      from_hash hash
+    end
+  end
+end
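
A `Page` can be serialized with `to_hash`/`from_hash`, which is how a crawler would persist fetched pages. A minimal round-trip sketch under assumed inputs (the URL and markup are placeholders):

```ruby
require 'digger'

page = Digger::Page.new('http://example.com/',
                        code: 200,
                        body: '<html><head><title>Hi</title></head></html>',
                        headers: { 'content-type' => ['text/html'] })

# to_hash marshals the headers and flattens everything else to plain values;
# from_hash restores the instance variables on a fresh Page.
copy = Digger::Page.from_hash(page.to_hash)
puts copy.success?  # => true
puts copy.title     # => "Hi"
```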
data/lib/digger/pattern.rb ADDED
@@ -0,0 +1,91 @@
+require 'nokogiri'
+
+module Digger
+  class Pattern
+    attr_accessor :type, :value, :block
+
+    def initialize(hash = {})
+      hash.each_pair{|key, value| send("#{key}=", value) if %w{type value block}.include?(key.to_s)}
+    end
+
+    def safe_block
+      block && begin
+        if block.respond_to?(:call)
+          block
+        elsif block.strip == ''
+          nil
+        else
+          proc{ $SAFE = 2; eval block }.call
+        end
+      rescue StandardError
+        nil
+      end
+    end
+
+    def self.wrap(hash)
+      Hash[hash.map{|key, value| [key, value.is_a?(Pattern) ? value : Pattern.new(value)]}]
+    end
+
+    MATCH_MAX = 3
+
+    TYPES = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many css_one css_many}
+
+    def regexp?
+      TYPES.index(type) <= MATCH_MAX + 1 # match_many in addition
+    end
+
+    def match_page(page, &callback)
+      blk = callback || safe_block
+      if regexp? # regular expression
+        index = TYPES.index(type)
+        blk ||= ->(text){text.strip}
+        # content is String
+        if type == 'match_many'
+          match = page.body.gsub(value).to_a
+        else
+          matches = page.body.match(value)
+          match = matches.nil? ? nil : matches[index]
+        end
+      else # css expression
+        blk ||= ->(node){node.content.strip}
+        # content is Nokogiri::HTML::Document
+        if type == 'css_one'
+          match = page.doc.css(value).first
+        elsif type == 'css_many'
+          match = page.doc.css(value)
+        end
+      end
+      if match.nil?
+        nil
+      elsif %w{css_many match_many}.include? type
+        match.map{|node| blk.call(node) }.uniq
+      else
+        blk.call(match)
+      end
+    rescue
+      nil
+    end
+
+    class Nokogiri::XML::Node
+      %w{one many}.each do |name|
+        define_method "inner_#{name}" do |css, &block|
+          callback = ->(node) do
+            if node
+              (block || ->(n){n.text.strip}).call(node)
+            else
+              nil
+            end
+          end
+          if name == 'one' # inner_one
+            callback.call(self.css(css).first)
+          else # inner_many
+            self.css(css).map{|node| callback.call(node)}
+          end
+        end
+      end
+      def source
+        to_xml
+      end
+    end # nokogiri
+  end
+end
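
The pattern types map onto how `value` is matched: `match_0`..`match_3` return the n-th regexp capture, `match_many` returns every regexp match, and `css_one`/`css_many` run CSS selectors against the parsed document. A self-contained sketch with hypothetical markup:

```ruby
require 'digger'

page = Digger::Page.new('http://example.com/',
                        code: 200,
                        body: '<html><head><title> Hello </title></head>' \
                              '<body><a href="/a">A</a><a href="/b">B</a></body></html>',
                        headers: { 'content-type' => ['text/html'] })

# match_1 returns the first capture group, stripped by the default block.
title = Digger::Pattern.new(type: 'match_1', value: %r{<title>(.*?)</title>}m)
puts title.match_page(page)  # => "Hello"

# css_many maps a block (or the default text-strip) over every node, uniq'd.
links = Digger::Pattern.new(type: 'css_many', value: 'a')
puts links.match_page(page) { |node| node['href'] }.inspect  # => ["/a", "/b"]
```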
data/lib/digger/version.rb ADDED
@@ -0,0 +1,3 @@
+module Digger
+  VERSION = "0.0.1"
+end
data/spec/digger_spec.rb ADDED
@@ -0,0 +1,26 @@
+require 'digger'
+
+http = Digger::HTTP.new
+page = http.fetch_page('http://nan.so/')
+
+pattern = Digger::Pattern.new(type: 'css_many', value: '.sites>a>span')
+
+class Item < Digger::Model
+  css_many sites: '.sites>a>span'
+end
+
+describe Digger do
+  it "http should fetch a page" do
+    expect(page.code).to eq(200)
+  end
+
+  it "pattern should match content" do
+    sites = pattern.match_page(page)
+    expect(sites.include?('百度网盘')).to eq(true)
+  end
+
+  it "model should dig content" do
+    item = Item.new.match_page(page)
+    expect(item[:sites].include?('读远')).to be(true)
+  end
+end
metadata ADDED
@@ -0,0 +1,114 @@
+--- !ruby/object:Gem::Specification
+name: digger
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- binz
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2015-03-27 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.7'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.7'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+- !ruby/object:Gem::Dependency
+  name: http-cookie
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+description: ''
+email:
+- xinkiang@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- digger.gemspec
+- lib/digger.rb
+- lib/digger/http.rb
+- lib/digger/model.rb
+- lib/digger/page.rb
+- lib/digger/pattern.rb
+- lib/digger/version.rb
+- spec/digger_spec.rb
+homepage: ''
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.2.2
+signing_key:
+specification_version: 4
+summary: Dig structural information from web pages.
+test_files:
+- spec/digger_spec.rb