wgit 0.0.8 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: fc46154d26c11924869c5a38687ac6a855503c0669db40f43941992273d9c3d2
-  data.tar.gz: d348acb154faa3cb19d4b5e28332bec1192b8b04b526493ec08bdfde933964d8
+  metadata.gz: ae86258e3aac086f2215d1fb3e3b871cd4f4839884eb7c359ac9863148f1a307
+  data.tar.gz: 2eafa3a2b7b6d6ff99aaf00466ccd5f9214f049a9f5836b45aaf6d17bfbe226b
 SHA512:
-  metadata.gz: e0a48d554b359abb247ebdbde597b4654f56baf2a980261d70906126f9715d03221a1581010cd0d8e625ff2d624e75d55c6152804d2f27d236a8376962f6227f
-  data.tar.gz: 84be3d1deb26d4e0641db1eca73afdc0323588ea0bd283e9d2920fd96614dfa549ab4c7497317ed53d87d5deaf344490dc7258af381dee116adc0c6c44837746
+  metadata.gz: 5735051c62d3d22db75a42c7d33cd8b1f78d4500b27c0e136980382ffb0e6830ee8d355a0f9151c4d844dd0eb91dc860cd5bd1855ec68099985c34d55ba1a3aa
+  data.tar.gz: 0b5ab8f7f60e69f791fd4b51995f5dbf2f38a77954b164062b64b982847b7fd3844b016fbfaff186a7178f68d2fedc8736fd60b8c16ae82bc11c7a84a5892e42
data/lib/wgit/crawler.rb CHANGED
@@ -8,13 +8,12 @@ module Wgit
 
   # The Crawler class provides a means of crawling web based URL's, turning
   # their HTML into Wgit::Document's.
-  # Note that currently all redirects will not be followed during a crawl.
   class Crawler
     include Assertable
-
+
     # The urls to crawl.
     attr_reader :urls
-
+
     # The docs of the crawled @urls.
     attr_reader :docs
 
@@ -146,26 +145,35 @@ module Wgit
 
     # The fetch method performs a HTTP GET to obtain the HTML document.
     # Invalid urls or any HTTP response that doesn't return a HTML body will be
-    # ignored and nil will be returned. This means that redirects etc. will
-    # not be followed.
+    # ignored and nil will be returned. Otherwise, the HTML is returned.
     def fetch(url)
-      raise unless url.respond_to?(:to_uri)
-      res = Net::HTTP.get_response(url.to_uri)
-      res.body.empty? ? nil : res.body
+      response = resolve(url)
+      response.body.empty? ? nil : response.body
     rescue
       nil
     end
-
+
+    # The resolve method performs a HTTP GET to obtain the HTML document.
+    # A certain amount of redirects will be followed by default before raising
+    # an exception. Redirects can be disabled by setting `redirect_limit: 1`.
+    # The Net::HTTPResponse will be returned.
+    def resolve(url, redirect_limit: 5)
+      redirect_count = 0
+      begin
+        raise "Too many redirects" if redirect_count >= redirect_limit
+        response = Net::HTTP.get_response(URI.parse(url))
+        url = response['location']
+        redirect_count += 1
+      end while response.is_a?(Net::HTTPRedirection)
+      response
+    end
+
     # Add the url to @urls ensuring it is cast to a Wgit::Url if necessary.
     def add_url(url)
       @urls = [] if @urls.nil?
-      if url.is_a?(Wgit::Url)
-        @urls << url
-      else
-        @urls << Wgit::Url.new(url)
-      end
+      @urls << Wgit::Url.new(url)
     end
-
+
     alias :crawl :crawl_urls
     alias :crawl_r :crawl_site
   end
data/lib/wgit/version.rb CHANGED
@@ -3,5 +3,5 @@
 # @author Michael Telford
 module Wgit
   # The current gem version of Wgit.
-  VERSION = "0.0.8".freeze
+  VERSION = "0.0.9".freeze
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wgit
 version: !ruby/object:Gem::Version
-  version: 0.0.8
+  version: 0.0.9
 platform: ruby
 authors:
 - Michael Telford
@@ -137,11 +137,11 @@ dependencies:
   - !ruby/object:Gem::Version
     version: '2.6'
 description: Wgit is a WWW indexer/scraper which crawls URL's, retrieves and serialises
-  their page contents for later use. You can use Wgit to copy entire website if required.
+  their page contents for later use. You can use Wgit to copy entire websites if required.
   Wgit also provides a means to search indexed documents stored in a database. Therefore,
   this library provides the main components of a WWW search engine. The Wgit API is
-  easily extendable allowing you to pull out the parts of a webpage that are important
-  to you, the external links or keywords for example. As Wgit is an API, it's very
+  easily extended allowing you to pull out the parts of a webpage that are important
+  to you, the code snippets or images for example. As Wgit is a library, it's very
   useful in many different application types.
 email: michael.telford@live.com
 executables: []