wgit 0.0.9 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wgit/crawler.rb +13 -5
- data/lib/wgit/document.rb +40 -44
- data/lib/wgit/url.rb +84 -24
- data/lib/wgit/version.rb +1 -1
- metadata +46 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c4dc572e7d48a95d423e175ad2d6a791be52bf56f6c391e9152c075f45672ee8
|
4
|
+
data.tar.gz: fd6a9c9d1e38906f500543ae92169f7bcb9e64de1567f4f29ec24f7ca74c60d8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f4fc425aa1b25254dba343151794893ff26a3682e58dd08bdc918c180da89ecb08cf0f1013837e41527f564d36bb0784c6ecd204e53c1276cb5b32401c88ffab
|
7
|
+
data.tar.gz: 2ce1250ad7312257bc021e7414f164ec174e07e469a6956478cccaa7b05f159981a7a69ef6b713db5454233c8a03ea8487f14184c1e26dcab28ed8e81250507d
|
data/lib/wgit/crawler.rb
CHANGED
@@ -106,7 +106,8 @@ module Wgit
|
|
106
106
|
doc = crawl_url(base_url, &block)
|
107
107
|
return nil if doc.nil?
|
108
108
|
|
109
|
-
|
109
|
+
path = base_url.path.empty? ? '/' : base_url.path
|
110
|
+
crawled_urls = [path]
|
110
111
|
external_urls = doc.external_links
|
111
112
|
internal_urls = doc.internal_links
|
112
113
|
|
@@ -149,7 +150,10 @@ module Wgit
|
|
149
150
|
def fetch(url)
|
150
151
|
response = resolve(url)
|
151
152
|
response.body.empty? ? nil : response.body
|
152
|
-
rescue
|
153
|
+
rescue Exception => ex
|
154
|
+
Wgit.logger.debug(
|
155
|
+
"Wgit::Crawler#fetch('#{url}') exception: #{ex.message}"
|
156
|
+
)
|
153
157
|
nil
|
154
158
|
end
|
155
159
|
|
@@ -158,12 +162,16 @@ module Wgit
|
|
158
162
|
# an exception. Redirects can be disabled by setting `redirect_limit: 1`.
|
159
163
|
# The Net::HTTPResponse will be returned.
|
160
164
|
def resolve(url, redirect_limit: 5)
|
161
|
-
redirect_count =
|
165
|
+
redirect_count = -1
|
162
166
|
begin
|
163
167
|
raise "Too many redirects" if redirect_count >= redirect_limit
|
164
|
-
response = Net::HTTP.get_response(URI.parse(url))
|
165
|
-
url = response['location']
|
166
168
|
redirect_count += 1
|
169
|
+
|
170
|
+
response = Net::HTTP.get_response(URI(url))
|
171
|
+
location = Wgit::Url.new(response.fetch('location', ''))
|
172
|
+
if not location.empty?
|
173
|
+
url = location.is_relative? ? url.to_base.concat(location) : location
|
174
|
+
end
|
167
175
|
end while response.is_a?(Net::HTTPRedirection)
|
168
176
|
response
|
169
177
|
end
|
data/lib/wgit/document.rb
CHANGED
@@ -62,6 +62,8 @@ module Wgit
|
|
62
62
|
@html = html ||= ""
|
63
63
|
@doc = init_nokogiri
|
64
64
|
@score = 0.0
|
65
|
+
|
66
|
+
process_url_and_html
|
65
67
|
|
66
68
|
# Dynamically run the init_*_from_html methods.
|
67
69
|
Document.private_instance_methods(false).each do |method|
|
@@ -80,7 +82,9 @@ module Wgit
|
|
80
82
|
@html = obj.fetch("html", "")
|
81
83
|
@doc = init_nokogiri
|
82
84
|
@score = obj.fetch("score", 0.0)
|
83
|
-
|
85
|
+
|
86
|
+
process_url_and_html
|
87
|
+
|
84
88
|
# Dynamically run the init_*_from_object methods.
|
85
89
|
Document.private_instance_methods(false).each do |method|
|
86
90
|
if method.to_s.start_with?("init_") &&
|
@@ -174,7 +178,7 @@ module Wgit
|
|
174
178
|
# @return [Boolean] True if @html is nil/empty, false otherwise.
|
175
179
|
def empty?
|
176
180
|
return true if @html.nil?
|
177
|
-
@html.
|
181
|
+
@html.empty?
|
178
182
|
end
|
179
183
|
|
180
184
|
# Uses Nokogiri's xpath method to search the doc's html and return the
|
@@ -194,47 +198,54 @@ module Wgit
|
|
194
198
|
def css(selector)
|
195
199
|
@doc.css(selector)
|
196
200
|
end
|
197
|
-
|
198
|
-
# Get all internal links of this Document.
|
201
|
+
|
202
|
+
# Get all internal links of this Document in relative form. Internal
|
203
|
+
# meaning a link to another page on this website. Also see
|
204
|
+
# Wgit::Document#internal_full_links.
|
199
205
|
#
|
200
206
|
# @return [Array<Wgit::Url>] self's internal/relative URL's.
|
201
207
|
def internal_links
|
202
208
|
return [] if @links.empty?
|
203
|
-
|
204
|
-
|
205
|
-
|
209
|
+
|
210
|
+
links = @links.
|
211
|
+
reject do |link|
|
212
|
+
not link.relative_link?(base: @url.to_base)
|
206
213
|
rescue
|
207
214
|
true
|
208
|
-
end
|
209
|
-
|
215
|
+
end.
|
216
|
+
map(&:to_path)
|
217
|
+
|
218
|
+
process_arr(links)
|
210
219
|
end
|
211
220
|
|
212
221
|
# Get all internal links of this Document and append them to this
|
213
|
-
# Document's base URL.
|
222
|
+
# Document's base URL making them absolute. Also see
|
223
|
+
# Wgit::Document#internal_links.
|
214
224
|
#
|
215
225
|
# @return [Array<Wgit::Url>] self's internal/relative URL's in absolute
|
216
226
|
# form.
|
217
227
|
def internal_full_links
|
218
228
|
in_links = internal_links
|
219
229
|
return [] if in_links.empty?
|
220
|
-
in_links.map
|
221
|
-
link.replace("/" + link) unless link.start_with?("/")
|
222
|
-
Wgit::Url.new(@url.to_base + link)
|
223
|
-
end
|
230
|
+
in_links.map { |link| @url.to_base.concat(link) }
|
224
231
|
end
|
225
232
|
|
226
|
-
# Get all external links of this Document.
|
233
|
+
# Get all external links of this Document. External meaning a link to
|
234
|
+
# another website.
|
227
235
|
#
|
228
236
|
# @return [Array<Wgit::Url>] self's external/absolute URL's.
|
229
237
|
def external_links
|
230
238
|
return [] if @links.empty?
|
231
|
-
|
232
|
-
|
233
|
-
|
239
|
+
|
240
|
+
links = @links.
|
241
|
+
reject do |link|
|
242
|
+
link.relative_link?(base: @url.to_base)
|
234
243
|
rescue
|
235
244
|
true
|
236
|
-
end
|
237
|
-
|
245
|
+
end.
|
246
|
+
map { |link| link.end_with?('/') ? link.chop : link }
|
247
|
+
|
248
|
+
process_arr(links)
|
238
249
|
end
|
239
250
|
|
240
251
|
# Searches against the @text for the given search query.
|
@@ -253,8 +264,8 @@ module Wgit
|
|
253
264
|
# sentence.
|
254
265
|
# @return [Array<String>] Representing the search results.
|
255
266
|
def search(query, sentence_limit = 80)
|
256
|
-
raise "A search
|
257
|
-
raise "The
|
267
|
+
raise "A search query must be provided" if query.empty?
|
268
|
+
raise "The sentence_limit value must be even" if sentence_limit.odd?
|
258
269
|
|
259
270
|
results = {}
|
260
271
|
regex = Regexp.new(query, Regexp::IGNORECASE)
|
@@ -469,29 +480,16 @@ module Wgit
|
|
469
480
|
if array.is_a?(Array)
|
470
481
|
array.map! { |str| process_str(str) }
|
471
482
|
array.reject! { |str| str.is_a?(String) ? str.empty? : false }
|
483
|
+
array.compact!
|
472
484
|
array.uniq!
|
473
485
|
end
|
474
486
|
array
|
475
487
|
end
|
476
|
-
|
477
|
-
#
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
def process_internal_links(links)
|
482
|
-
links.map! do |link|
|
483
|
-
host_or_base = if link.start_with?("http")
|
484
|
-
@url.base
|
485
|
-
else
|
486
|
-
@url.host
|
487
|
-
end
|
488
|
-
if link.start_with?(host_or_base)
|
489
|
-
link.sub!(host_or_base, "")
|
490
|
-
link.replace(link[1..-1]) if link.start_with?("/")
|
491
|
-
link.strip!
|
492
|
-
end
|
493
|
-
link
|
494
|
-
end
|
488
|
+
|
489
|
+
# Ensure the @url and @html Strings are correctly encoded etc.
|
490
|
+
def process_url_and_html
|
491
|
+
@url = process_str(@url)
|
492
|
+
@html = process_str(@html)
|
495
493
|
end
|
496
494
|
|
497
495
|
### Default init_* (Document extension) methods. ###
|
@@ -547,7 +545,6 @@ module Wgit
|
|
547
545
|
xpath = "//a/@href"
|
548
546
|
result = find_in_html(xpath, singleton: false) do |links|
|
549
547
|
if links
|
550
|
-
links.reject! { |link| link == "/" }
|
551
548
|
links.map! do |link|
|
552
549
|
begin
|
553
550
|
Wgit::Url.new(link)
|
@@ -555,8 +552,7 @@ module Wgit
|
|
555
552
|
nil
|
556
553
|
end
|
557
554
|
end
|
558
|
-
links.
|
559
|
-
process_internal_links(links)
|
555
|
+
links.compact!
|
560
556
|
end
|
561
557
|
links
|
562
558
|
end
|
data/lib/wgit/url.rb
CHANGED
@@ -95,21 +95,29 @@ module Wgit
|
|
95
95
|
url
|
96
96
|
end
|
97
97
|
|
98
|
-
# Returns if link is a relative or absolute Url.
|
99
|
-
#
|
100
|
-
#
|
101
|
-
#
|
102
|
-
#
|
103
|
-
# internal link (regardless of whether it is valid or not).
|
98
|
+
# Returns if link is a relative or absolute Url.
|
99
|
+
# All external links in a page are expected to have a protocol prefix e.g.
|
100
|
+
# "http://", otherwise the link is treated as an internal link (regardless
|
101
|
+
# of whether it is valid or not). The only exception is if base is provided
|
102
|
+
# and link is a page within that site; then the link is relative.
|
104
103
|
#
|
105
104
|
# @param link [Wgit::Url, String] The url to test if relative or not.
|
105
|
+
# @param base [String] The Url base e.g. http://www.google.co.uk.
|
106
106
|
# @return [Boolean] True if relative, false if absolute.
|
107
107
|
# @raise [RuntimeError] If the link is invalid.
|
108
|
-
def self.relative_link?(link)
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
108
|
+
def self.relative_link?(link, base: nil)
|
109
|
+
if base and URI(base).host.nil?
|
110
|
+
raise "Invalid base, must contain protocol prefix: #{base}"
|
111
|
+
end
|
112
|
+
|
113
|
+
uri = URI(link)
|
114
|
+
if not uri.host.nil? and not uri.host.empty?
|
115
|
+
if base
|
116
|
+
uri.host == URI(base).host
|
117
|
+
else
|
118
|
+
false
|
119
|
+
end
|
120
|
+
elsif not uri.path.nil? and not uri.path.empty?
|
113
121
|
true
|
114
122
|
else
|
115
123
|
raise "Invalid link: #{link}"
|
@@ -128,11 +136,14 @@ module Wgit
|
|
128
136
|
Wgit::Url.new(url + "/" + link)
|
129
137
|
end
|
130
138
|
|
131
|
-
# Returns if self is a relative or absolute Url.
|
139
|
+
# Returns if self is a relative or absolute Url. If base is provided and
|
140
|
+
# self is a page within that site then the link is relative.
|
141
|
+
# See Wgit.relative_link? for more information.
|
142
|
+
#
|
132
143
|
# @return [Boolean] True if relative, false if absolute.
|
133
144
|
# @raise [RuntimeError] If the link is invalid.
|
134
|
-
def relative_link?
|
135
|
-
Wgit::Url.relative_link?(self)
|
145
|
+
def relative_link?(base: nil)
|
146
|
+
Wgit::Url.relative_link?(self, base: base)
|
136
147
|
end
|
137
148
|
|
138
149
|
# Determines if self is a valid Url or not.
|
@@ -142,7 +153,7 @@ module Wgit
|
|
142
153
|
Wgit::Url.valid?(self)
|
143
154
|
end
|
144
155
|
|
145
|
-
# Concats self
|
156
|
+
# Concats self and the link.
|
146
157
|
#
|
147
158
|
# @param link [Wgit::Url, String] The link to concat with self.
|
148
159
|
# @return [Wgit::Url] self + "/" + link
|
@@ -172,6 +183,14 @@ module Wgit
|
|
172
183
|
def to_url
|
173
184
|
self
|
174
185
|
end
|
186
|
+
|
187
|
+
# Returns a new Wgit::Url containing just the scheme/protocol of this URL
|
188
|
+
# e.g. Given http://www.google.co.uk, http is returned.
|
189
|
+
#
|
190
|
+
# @return [Wgit::Url] Containing just the scheme/protocol.
|
191
|
+
def to_scheme
|
192
|
+
Wgit::Url.new(@uri.scheme)
|
193
|
+
end
|
175
194
|
|
176
195
|
# Returns a new Wgit::Url containing just the host of this URL e.g.
|
177
196
|
# Given http://www.google.co.uk/about.html, www.google.co.uk is returned.
|
@@ -181,24 +200,54 @@ module Wgit
|
|
181
200
|
Wgit::Url.new(@uri.host)
|
182
201
|
end
|
183
202
|
|
184
|
-
# Returns the base of this URL e.g. the protocol and host combined.
|
185
|
-
# How it works:
|
186
|
-
# URI.split("http://www.google.co.uk/about.html") returns the following:
|
187
|
-
# array[0]: "http://", array[2]: "www.google.co.uk", which we use.
|
203
|
+
# Returns only the base of this URL e.g. the protocol and host combined.
|
188
204
|
#
|
189
|
-
# @return [Wgit::Url] Base of self
|
205
|
+
# @return [Wgit::Url] Base of self e.g. http://www.google.co.uk.
|
190
206
|
def to_base
|
191
207
|
if Wgit::Url.relative_link?(self)
|
192
208
|
raise "A relative link doesn't have a base URL: #{self}"
|
193
209
|
end
|
194
|
-
|
195
|
-
if url_segs[0].nil? or url_segs[2].nil? or url_segs[2].empty?
|
210
|
+
if @uri.scheme.nil? or @uri.host.nil? or @uri.host.empty?
|
196
211
|
raise "Both a protocol and host are needed: #{self}"
|
197
212
|
end
|
198
|
-
base = "#{
|
213
|
+
base = "#{@uri.scheme}://#{@uri.host}"
|
199
214
|
Wgit::Url.new(base)
|
200
215
|
end
|
201
|
-
|
216
|
+
|
217
|
+
# Returns the path of this URL e.g. the bit after the host without slashes.
|
218
|
+
# For example:
|
219
|
+
# Wgit::Url.new("http://www.google.co.uk/about.html/").to_path returns
|
220
|
+
# "about.html". See Wgit::Url#to_endpoint if you want the slashes.
|
221
|
+
#
|
222
|
+
# @return [Wgit::Url] Path of self e.g. about.html.
|
223
|
+
def to_path
|
224
|
+
path = @uri.path
|
225
|
+
return Wgit::Url.new(path) if path == '/'
|
226
|
+
path = path[1..-1] if path.start_with?('/')
|
227
|
+
path.chop! if path.end_with?('/')
|
228
|
+
Wgit::Url.new(path)
|
229
|
+
end
|
230
|
+
|
231
|
+
# Returns the endpoint of this URL e.g. the bit after the host with any
|
232
|
+
# slashes included. For example:
|
233
|
+
# Wgit::Url.new("http://www.google.co.uk/about.html/").to_endpoint returns
|
234
|
+
# "/about.html/". See Wgit::Url#to_path if you don't want the slashes.
|
235
|
+
#
|
236
|
+
# @return [Wgit::Url] Endpoint of self e.g. /about.html/.
|
237
|
+
def to_endpoint
|
238
|
+
endpoint = @uri.path
|
239
|
+
endpoint = '/' + endpoint unless endpoint.start_with?('/')
|
240
|
+
Wgit::Url.new(endpoint)
|
241
|
+
end
|
242
|
+
|
243
|
+
# Returns a new Wgit::Url containing just the query string of this URL
|
244
|
+
# e.g. Given http://google.com?q=ruby, ruby is returned.
|
245
|
+
#
|
246
|
+
# @return [Wgit::Url] Containing just the query string.
|
247
|
+
def to_query_string
|
248
|
+
Wgit::Url.new(@uri.query)
|
249
|
+
end
|
250
|
+
|
202
251
|
# Returns a Hash containing this Url's instance vars excluding @uri.
|
203
252
|
# Used when storing the URL in a Database e.g. MongoDB etc.
|
204
253
|
#
|
@@ -210,9 +259,20 @@ module Wgit
|
|
210
259
|
end
|
211
260
|
|
212
261
|
alias :to_hash :to_h
|
262
|
+
alias :uri :to_uri
|
263
|
+
alias :url :to_url
|
264
|
+
alias :scheme :to_scheme
|
265
|
+
alias :to_protocol :to_scheme
|
266
|
+
alias :protocol :to_scheme
|
213
267
|
alias :host :to_host
|
214
268
|
alias :base :to_base
|
269
|
+
alias :path :to_path
|
270
|
+
alias :endpoint :to_endpoint
|
271
|
+
alias :query_string :to_query_string
|
272
|
+
alias :query :to_query_string
|
215
273
|
alias :internal_link? :relative_link?
|
274
|
+
alias :is_relative? :relative_link?
|
275
|
+
alias :is_internal? :relative_link?
|
216
276
|
alias :crawled? :crawled
|
217
277
|
end
|
218
278
|
end
|
data/lib/wgit/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
@@ -28,16 +28,16 @@ dependencies:
|
|
28
28
|
name: yard
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: 0.9.20
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: 0.9.20
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: byebug
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -108,6 +108,34 @@ dependencies:
|
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '1.3'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: webmock
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '3.6'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '3.6'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: rack
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - "~>"
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '2.0'
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - "~>"
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '2.0'
|
111
139
|
- !ruby/object:Gem::Dependency
|
112
140
|
name: nokogiri
|
113
141
|
requirement: !ruby/object:Gem::Requirement
|
@@ -128,21 +156,21 @@ dependencies:
|
|
128
156
|
requirements:
|
129
157
|
- - "~>"
|
130
158
|
- !ruby/object:Gem::Version
|
131
|
-
version:
|
159
|
+
version: 2.8.0
|
132
160
|
type: :runtime
|
133
161
|
prerelease: false
|
134
162
|
version_requirements: !ruby/object:Gem::Requirement
|
135
163
|
requirements:
|
136
164
|
- - "~>"
|
137
165
|
- !ruby/object:Gem::Version
|
138
|
-
version:
|
139
|
-
description: Wgit is a WWW indexer/scraper which crawls URL's, retrieves
|
140
|
-
their page contents for later use. You can use Wgit to copy entire
|
141
|
-
Wgit also provides a means to search indexed documents stored
|
142
|
-
this library provides the main components of a WWW search
|
143
|
-
easily extended allowing you to pull out the parts of a
|
144
|
-
to you, the code snippets or images for example. As Wgit
|
145
|
-
|
166
|
+
version: 2.8.0
|
167
|
+
description: Fundamentally, Wgit is a WWW indexer/scraper which crawls URL's, retrieves
|
168
|
+
and serialises their page contents for later use. You can use Wgit to copy entire
|
169
|
+
websites if required. Wgit also provides a means to search indexed documents stored
|
170
|
+
in a database. Therefore, this library provides the main components of a WWW search
|
171
|
+
engine. The Wgit API is easily extended allowing you to pull out the parts of a
|
172
|
+
webpage that are important to you, the code snippets or images for example. As Wgit
|
173
|
+
is a library, it has uses in many different application types.
|
146
174
|
email: michael.telford@live.com
|
147
175
|
executables: []
|
148
176
|
extensions: []
|
@@ -166,6 +194,7 @@ licenses:
|
|
166
194
|
- MIT
|
167
195
|
metadata:
|
168
196
|
source_code_uri: https://github.com/michaeltelford/wgit
|
197
|
+
yard.run: yri
|
169
198
|
allowed_push_host: https://rubygems.org
|
170
199
|
post_install_message:
|
171
200
|
rdoc_options: []
|
@@ -183,9 +212,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
183
212
|
version: '0'
|
184
213
|
requirements: []
|
185
214
|
rubyforge_project:
|
186
|
-
rubygems_version: 2.7.
|
215
|
+
rubygems_version: 2.7.6
|
187
216
|
signing_key:
|
188
217
|
specification_version: 4
|
189
|
-
summary: Wgit is
|
190
|
-
web scraping, indexing and searching.
|
218
|
+
summary: Wgit is a Ruby gem similar in nature to GNU's `wget`. It provides an easy
|
219
|
+
to use API for programmatic web scraping, indexing and searching.
|
191
220
|
test_files: []
|