wgit 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/wgit/crawler.rb +23 -15
- data/lib/wgit/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ae86258e3aac086f2215d1fb3e3b871cd4f4839884eb7c359ac9863148f1a307
|
4
|
+
data.tar.gz: 2eafa3a2b7b6d6ff99aaf00466ccd5f9214f049a9f5836b45aaf6d17bfbe226b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5735051c62d3d22db75a42c7d33cd8b1f78d4500b27c0e136980382ffb0e6830ee8d355a0f9151c4d844dd0eb91dc860cd5bd1855ec68099985c34d55ba1a3aa
|
7
|
+
data.tar.gz: 0b5ab8f7f60e69f791fd4b51995f5dbf2f38a77954b164062b64b982847b7fd3844b016fbfaff186a7178f68d2fedc8736fd60b8c16ae82bc11c7a84a5892e42
|
data/lib/wgit/crawler.rb
CHANGED
@@ -8,13 +8,12 @@ module Wgit
|
|
8
8
|
|
9
9
|
# The Crawler class provides a means of crawling web based URL's, turning
|
10
10
|
# their HTML into Wgit::Document's.
|
11
|
-
# Note that currently all redirects will not be followed during a crawl.
|
12
11
|
class Crawler
|
13
12
|
include Assertable
|
14
|
-
|
13
|
+
|
15
14
|
# The urls to crawl.
|
16
15
|
attr_reader :urls
|
17
|
-
|
16
|
+
|
18
17
|
# The docs of the crawled @urls.
|
19
18
|
attr_reader :docs
|
20
19
|
|
@@ -146,26 +145,35 @@ module Wgit
|
|
146
145
|
|
147
146
|
# The fetch method performs a HTTP GET to obtain the HTML document.
|
148
147
|
# Invalid urls or any HTTP response that doesn't return a HTML body will be
|
149
|
-
# ignored and nil will be returned.
|
150
|
-
# not be followed.
|
148
|
+
# ignored and nil will be returned. Otherwise, the HTML is returned.
|
151
149
|
def fetch(url)
|
152
|
-
|
153
|
-
|
154
|
-
res.body.empty? ? nil : res.body
|
150
|
+
response = resolve(url)
|
151
|
+
response.body.empty? ? nil : response.body
|
155
152
|
rescue
|
156
153
|
nil
|
157
154
|
end
|
158
|
-
|
155
|
+
|
156
|
+
# The resolve method performs a HTTP GET to obtain the HTML document.
|
157
|
+
# A certain amount of redirects will be followed by default before raising
|
158
|
+
# an exception. Redirects can be disabled by setting `redirect_limit: 1`.
|
159
|
+
# The Net::HTTPResponse will be returned.
|
160
|
+
def resolve(url, redirect_limit: 5)
|
161
|
+
redirect_count = 0
|
162
|
+
begin
|
163
|
+
raise "Too many redirects" if redirect_count >= redirect_limit
|
164
|
+
response = Net::HTTP.get_response(URI.parse(url))
|
165
|
+
url = response['location']
|
166
|
+
redirect_count += 1
|
167
|
+
end while response.is_a?(Net::HTTPRedirection)
|
168
|
+
response
|
169
|
+
end
|
170
|
+
|
159
171
|
# Add the url to @urls ensuring it is cast to a Wgit::Url if necessary.
|
160
172
|
def add_url(url)
|
161
173
|
@urls = [] if @urls.nil?
|
162
|
-
|
163
|
-
@urls << url
|
164
|
-
else
|
165
|
-
@urls << Wgit::Url.new(url)
|
166
|
-
end
|
174
|
+
@urls << Wgit::Url.new(url)
|
167
175
|
end
|
168
|
-
|
176
|
+
|
169
177
|
alias :crawl :crawl_urls
|
170
178
|
alias :crawl_r :crawl_site
|
171
179
|
end
|
data/lib/wgit/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.8
|
4
|
+
version: 0.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
@@ -137,11 +137,11 @@ dependencies:
|
|
137
137
|
- !ruby/object:Gem::Version
|
138
138
|
version: '2.6'
|
139
139
|
description: Wgit is a WWW indexer/scraper which crawls URL's, retrieves and serialises
|
140
|
-
their page contents for later use. You can use Wgit to copy entire
|
140
|
+
their page contents for later use. You can use Wgit to copy entire websites if required.
|
141
141
|
Wgit also provides a means to search indexed documents stored in a database. Therefore,
|
142
142
|
this library provides the main components of a WWW search engine. The Wgit API is
|
143
|
-
easily
|
144
|
-
to you, the
|
143
|
+
easily extended allowing you to pull out the parts of a webpage that are important
|
144
|
+
to you, the code snippets or images for example. As Wgit is a library, it's very
|
145
145
|
useful in many different application types.
|
146
146
|
email: michael.telford@live.com
|
147
147
|
executables: []
|