wgit 0.0.8 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wgit/crawler.rb +23 -15
- data/lib/wgit/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ae86258e3aac086f2215d1fb3e3b871cd4f4839884eb7c359ac9863148f1a307
|
4
|
+
data.tar.gz: 2eafa3a2b7b6d6ff99aaf00466ccd5f9214f049a9f5836b45aaf6d17bfbe226b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5735051c62d3d22db75a42c7d33cd8b1f78d4500b27c0e136980382ffb0e6830ee8d355a0f9151c4d844dd0eb91dc860cd5bd1855ec68099985c34d55ba1a3aa
|
7
|
+
data.tar.gz: 0b5ab8f7f60e69f791fd4b51995f5dbf2f38a77954b164062b64b982847b7fd3844b016fbfaff186a7178f68d2fedc8736fd60b8c16ae82bc11c7a84a5892e42
|
data/lib/wgit/crawler.rb
CHANGED
@@ -8,13 +8,12 @@ module Wgit
|
|
8
8
|
|
9
9
|
# The Crawler class provides a means of crawling web-based URLs, turning
|
10
10
|
# their HTML into Wgit::Document objects.
|
11
|
-
# Note that currently all redirects will not be followed during a crawl.
|
12
11
|
class Crawler
|
13
12
|
include Assertable
|
14
|
-
|
13
|
+
|
15
14
|
# The urls to crawl.
|
16
15
|
attr_reader :urls
|
17
|
-
|
16
|
+
|
18
17
|
# The docs of the crawled @urls.
|
19
18
|
attr_reader :docs
|
20
19
|
|
@@ -146,26 +145,35 @@ module Wgit
|
|
146
145
|
|
147
146
|
# The fetch method performs a HTTP GET to obtain the HTML document.
|
148
147
|
# Invalid urls or any HTTP response that doesn't return a HTML body will be
|
149
|
-
# ignored and nil will be returned.
|
150
|
-
# not be followed.
|
148
|
+
# ignored and nil will be returned. Otherwise, the HTML is returned.
|
151
149
|
def fetch(url)
|
152
|
-
|
153
|
-
|
154
|
-
res.body.empty? ? nil : res.body
|
150
|
+
response = resolve(url)
|
151
|
+
response.body.empty? ? nil : response.body
|
155
152
|
rescue
|
156
153
|
nil
|
157
154
|
end
|
158
|
-
|
155
|
+
|
156
|
+
# The resolve method performs a HTTP GET to obtain the HTML document.
|
157
|
+
# A certain number of redirects will be followed by default before raising
|
158
|
+
# an exception. Redirects can be disabled by setting `redirect_limit: 1`.
|
159
|
+
# The Net::HTTPResponse will be returned.
|
160
|
+
def resolve(url, redirect_limit: 5)
|
161
|
+
redirect_count = 0
|
162
|
+
begin
|
163
|
+
raise "Too many redirects" if redirect_count >= redirect_limit
|
164
|
+
response = Net::HTTP.get_response(URI.parse(url))
|
165
|
+
url = response['location']
|
166
|
+
redirect_count += 1
|
167
|
+
end while response.is_a?(Net::HTTPRedirection)
|
168
|
+
response
|
169
|
+
end
|
170
|
+
|
159
171
|
# Add the url to @urls ensuring it is cast to a Wgit::Url if necessary.
|
160
172
|
def add_url(url)
|
161
173
|
@urls = [] if @urls.nil?
|
162
|
-
|
163
|
-
@urls << url
|
164
|
-
else
|
165
|
-
@urls << Wgit::Url.new(url)
|
166
|
-
end
|
174
|
+
@urls << Wgit::Url.new(url)
|
167
175
|
end
|
168
|
-
|
176
|
+
|
169
177
|
alias :crawl :crawl_urls
|
170
178
|
alias :crawl_r :crawl_site
|
171
179
|
end
|
data/lib/wgit/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.8
|
4
|
+
version: 0.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
@@ -137,11 +137,11 @@ dependencies:
|
|
137
137
|
- !ruby/object:Gem::Version
|
138
138
|
version: '2.6'
|
139
139
|
description: Wgit is a WWW indexer/scraper which crawls URL's, retrieves and serialises
|
140
|
-
their page contents for later use. You can use Wgit to copy entire
|
140
|
+
their page contents for later use. You can use Wgit to copy entire websites if required.
|
141
141
|
Wgit also provides a means to search indexed documents stored in a database. Therefore,
|
142
142
|
this library provides the main components of a WWW search engine. The Wgit API is
|
143
|
-
easily
|
144
|
-
to you, the
|
143
|
+
easily extended allowing you to pull out the parts of a webpage that are important
|
144
|
+
to you, the code snippets or images for example. As Wgit is a library, it's very
|
145
145
|
useful in many different application types.
|
146
146
|
email: michael.telford@live.com
|
147
147
|
executables: []
|