wgit 0.0.17 → 0.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +61 -0
- data/LICENSE.txt +21 -0
- data/README.md +16 -7
- data/TODO.txt +34 -0
- data/lib/wgit.rb +3 -1
- data/lib/wgit/assertable.rb +35 -29
- data/lib/wgit/core_ext.rb +5 -3
- data/lib/wgit/crawler.rb +96 -58
- data/lib/wgit/database/connection_details.rb +4 -2
- data/lib/wgit/database/database.rb +84 -46
- data/lib/wgit/database/model.rb +12 -10
- data/lib/wgit/document.rb +100 -72
- data/lib/wgit/document_extensions.rb +11 -9
- data/lib/wgit/indexer.rb +34 -24
- data/lib/wgit/logger.rb +4 -2
- data/lib/wgit/url.rb +94 -59
- data/lib/wgit/utils.rb +13 -11
- data/lib/wgit/version.rb +3 -1
- metadata +41 -38
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 26e6a29fbf72b0ecbbc487c8aba9ec243a260b4761805c6c7923f2af82fa94f5
|
4
|
+
data.tar.gz: 9e15ad14991418fc3b4b2c0dafacac617b32197e825ad72887d91182c8ddf652
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4b17b8467abf13b186e88fb63fe8630163612bc685d7d521122fdc4c693e7d9229c59888afa1191189b3838317fa29e028c90757b880177c1e7a8f81a0a38047
|
7
|
+
data.tar.gz: 6fb7bb518ca3b9e520e1edbf25b4c265018686b5c61e134d623c38efa1bdf5073affb5205e47aee3a32a4502b56205080ded00a79bd6f138cf9178b019a2b32d
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
# Wgit Change Log
|
2
|
+
|
3
|
+
## v0.0.0 (TEMPLATE - DO NOT EDIT)
|
4
|
+
### Added
|
5
|
+
- ...
|
6
|
+
### Changed/Removed
|
7
|
+
- ...
|
8
|
+
### Fixed
|
9
|
+
- ...
|
10
|
+
---
|
11
|
+
|
12
|
+
## v0.0.18
|
13
|
+
### Added
|
14
|
+
- `Wgit::Url#to_brand` method and updated `Wgit::Url#is_relative?` to support it.
|
15
|
+
### Changed/Removed
|
16
|
+
- Updated the documentation by changing some `private` methods to `protected`. These methods are now documented (on rubydocs) as a result.
|
17
|
+
### Fixed
|
18
|
+
- ...
|
19
|
+
---
|
20
|
+
|
21
|
+
## v0.0.17
|
22
|
+
### Added
|
23
|
+
- Support for the `<base>` element in `Wgit::Document`s.
|
24
|
+
- New `Wgit::Url` methods: `without_query_string`, `is_query_string?`, `is_anchor?`, `replace` (override of `String#replace`).
|
25
|
+
### Changed/Removed
|
26
|
+
- Breaking changes: Removed `Wgit::Document#internal_links_without_anchors` method.
|
27
|
+
- Breaking changes (potentially): A `Wgit::Url` is now replaced with the Url it was redirected to during a crawl.
|
28
|
+
- Updated `Wgit::Document#base_url` to support an optional `link:` named parameter.
|
29
|
+
- Updated `Wgit::Crawler#crawl_site` to allow the initial url to redirect to another host.
|
30
|
+
- Updated `Wgit::Url#is_relative?` to support an optional `domain:` named parameter.
|
31
|
+
### Fixed
|
32
|
+
- Bug in `Wgit::Document#internal_full_links` affecting anchor and query string links including those used during `Wgit::Crawler#crawl_site`.
|
33
|
+
- Bug causing an 'Invalid URL' error for `Wgit::Crawler#crawl_site`.
|
34
|
+
---
|
35
|
+
|
36
|
+
## v0.0.16
|
37
|
+
### Added
|
38
|
+
- Added `Url.parse` class method as alias for `Url.new`.
|
39
|
+
### Changed/Removed
|
40
|
+
- Breaking changes: Removed `Wgit::Url.relative_link?` (class method). Use `Wgit::Url#is_relative?` (instance method) instead e.g. `Wgit::Url.new('/blah').is_relative?`.
|
41
|
+
### Fixed
|
42
|
+
- Several URI related bugs in `Wgit::Url` affecting crawls.
|
43
|
+
---
|
44
|
+
|
45
|
+
## v0.0.15
|
46
|
+
### Added
|
47
|
+
- Support for IRIs (non-ASCII based URLs).
|
48
|
+
### Changed/Removed
|
49
|
+
- Breaking changes: Removed the `Document#to_hash` and `Url#to_hash` aliases. Call `to_h` instead.
|
50
|
+
### Fixed
|
51
|
+
- Bug in `Crawler#crawl_site` where an internal redirect to an external site's page was being followed.
|
52
|
+
---
|
53
|
+
|
54
|
+
## v0.0.14
|
55
|
+
### Added
|
56
|
+
- `Indexer#index_this_page` method.
|
57
|
+
### Changed/Removed
|
58
|
+
- Breaking Changes: `Wgit::CONNECTION_DETAILS` now only requires `DB_CONNECTION_STRING`.
|
59
|
+
### Fixed
|
60
|
+
- Found and fixed a bug in `Document#new`.
|
61
|
+
---
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2019 Michael Telford
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
CHANGED
@@ -1,5 +1,13 @@
|
|
1
1
|
# Wgit
|
2
2
|
|
3
|
+
[](https://rubygems.org/gems/wgit)
|
4
|
+
[](https://rubygems.org/gems/wgit)
|
5
|
+
[](https://travis-ci.org/michaeltelford/wgit)
|
6
|
+
[](https://www.rubydoc.info/gems/wgit)
|
7
|
+
[](https://www.codacy.com/app/michaeltelford/wgit?utm_source=github.com&utm_medium=referral&utm_content=michaeltelford/wgit&utm_campaign=Badge_Grade)
|
8
|
+
|
9
|
+
---
|
10
|
+
|
3
11
|
Wgit is a Ruby gem similar in nature to GNU's `wget` tool. It provides an easy to use API for programmatic web scraping, indexing and searching.
|
4
12
|
|
5
13
|
Fundamentally, Wgit is a WWW indexer/scraper which crawls URL's, retrieves and serialises their page contents for later use. You can use Wgit to copy entire websites if required. Wgit also provides a means to search indexed documents stored in a database. Therefore, this library provides the main components of a WWW search engine. The Wgit API is easily extended allowing you to pull out the parts of a webpage that are important to you, the code snippets or tables for example. As Wgit is a library, it has uses in many different application types.
|
@@ -58,11 +66,12 @@ doc.stats # => {
|
|
58
66
|
# doc responds to the following methods:
|
59
67
|
Wgit::Document.instance_methods(false).sort # => [
|
60
68
|
# :==, :[], :author, :base, :base_url, :css, :date_crawled, :doc, :empty?,
|
61
|
-
# :external_links, :external_urls, :
|
62
|
-
# :
|
63
|
-
# :
|
64
|
-
# :
|
65
|
-
# :
|
69
|
+
# :external_links, :external_urls, :find_in_html, :find_in_object, :html,
|
70
|
+
# :init_nokogiri, :internal_absolute_links, :internal_full_links,
|
71
|
+
# :internal_links, :keywords, :links, :relative_absolute_links,
|
72
|
+
# :relative_absolute_urls, :relative_full_links, :relative_full_urls,
|
73
|
+
# :relative_links, :relative_urls, :score, :search, :search!, :size, :stats,
|
74
|
+
# :text, :title, :to_h, :to_json, :url, :xpath
|
66
75
|
# ]
|
67
76
|
|
68
77
|
results = doc.search "corruption"
|
@@ -72,7 +81,7 @@ results.first # => "ial materials involving war, spying and corruption.
|
|
72
81
|
|
73
82
|
## Documentation
|
74
83
|
|
75
|
-
|
84
|
+
100% of Wgit's code is documented using [YARD](https://yardoc.org/), deployed to [Rubydocs](https://www.rubydoc.info/gems/wgit). This greatly benefits developers in using Wgit in their own programs. Another good source of information (as to how the library behaves) is the test suite. Also, see the [Practical Examples](#Practical-Examples) section below for real working examples of Wgit in action.
|
76
85
|
|
77
86
|
## Practical Examples
|
78
87
|
|
@@ -347,6 +356,6 @@ For a full list of available Rake tasks, run `bundle exec rake help`. The most c
|
|
347
356
|
|
348
357
|
After checking out the repo, run `bundle exec rake setup` to install the dependencies (requires `bundler`). Then, run `bundle exec rake test` to run the tests. You can also run `bundle exec rake console` for an interactive (`pry`) REPL that will allow you to experiment with the code.
|
349
358
|
|
350
|
-
To generate code documentation run `bundle exec
|
359
|
+
To generate code documentation run `bundle exec yardoc`. To browse the generated documentation run `bundle exec yard server -r`.
|
351
360
|
|
352
361
|
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, see the *Gem Publishing Checklist* section of the `TODO.txt` file.
|
data/TODO.txt
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
|
2
|
+
Primary
|
3
|
+
-------
|
4
|
+
- Update Database#search & Document#search to have optional case sensitivity.
|
5
|
+
- Have the ability to crawl sub sections of a site only e.g. https://www.honda.co.uk/motorcycles.html as the base url and crawl any links containing this as a prefix. For example, https://www.honda.co.uk/cars.html would not be crawled but https://www.honda.co.uk/motorcycles/africa-twin.html would be.
|
6
|
+
- Create an executable based on the ./bin/console shipped as `wpry` or `wgit`.
|
7
|
+
|
8
|
+
Secondary
|
9
|
+
---------
|
10
|
+
- Think about how we handle invalid Url's on crawled documents. Setup tests and implement logic for this scenario.
|
11
|
+
- Check if Document::TEXT_ELEMENTS is expansive enough.
|
12
|
+
|
13
|
+
Refactoring
|
14
|
+
-----------
|
15
|
+
- Plan to open up the required_ruby_version range, say from 2.5 upwards e.g. `~> 2.5`. Will need CI testing for the different versions of ruby as we move on to supporting newer versions.
|
16
|
+
- Refactor the 3 main classes and their tests (where needed): Url, Document & Crawler.
|
17
|
+
- After the above refactor, move onto the rest of the code base.
|
18
|
+
- Think about reducing the amount of method aliases, pick the best for the method def and remove the aliases? Also, do the Url#to_* make sense?
|
19
|
+
- Replace method params with named parameters where applicable.
|
20
|
+
- Possibly use refine instead of core-ext?
|
21
|
+
- Think about potentially using DB._update's update_many func.
|
22
|
+
|
23
|
+
Gem Publishing Checklist
|
24
|
+
------------------------
|
25
|
+
- Ensure a clean branch of master and create a 'release' branch.
|
26
|
+
- Update standalone files (if necessary): README.md, TODO.txt, wgit.gemspec etc.
|
27
|
+
- Increment the version number (in version.rb) and update the CHANGELOG.md.
|
28
|
+
- Run 'bundle install' to update deps.
|
29
|
+
- Run 'bundle exec rake compile' and ensure acceptable warnings.
|
30
|
+
- Run 'bundle exec rake test' and ensure all tests are passing.
|
31
|
+
- Run `bundle exec rake install` to build and install the gem locally, then test it manually from outside this repo.
|
32
|
+
- Run `bundle exec yardoc` to update documentation - should be 100% coverage.
|
33
|
+
- Commit, merge to master & push any changes made from the above steps.
|
34
|
+
- Run `bundle exec rake RELEASE[origin]` to tag, build and push everything to github.com and rubygems.org.
|
data/lib/wgit.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require_relative 'wgit/version'
|
2
4
|
require_relative 'wgit/logger'
|
3
5
|
require_relative 'wgit/assertable'
|
@@ -10,4 +12,4 @@ require_relative 'wgit/database/connection_details'
|
|
10
12
|
require_relative 'wgit/database/model'
|
11
13
|
require_relative 'wgit/database/database'
|
12
14
|
require_relative 'wgit/indexer'
|
13
|
-
#require_relative 'wgit/core_ext' - Must be explicitly required.
|
15
|
+
# require_relative 'wgit/core_ext' - Must be explicitly required.
|
data/lib/wgit/assertable.rb
CHANGED
@@ -1,17 +1,18 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
|
+
module Wgit
|
3
4
|
# Module containing assert methods including type checking which can be used
|
4
5
|
# for asserting the integrity of method definitions etc.
|
5
6
|
module Assertable
|
6
7
|
# Default type fail message.
|
7
|
-
DEFAULT_TYPE_FAIL_MSG =
|
8
|
+
DEFAULT_TYPE_FAIL_MSG = 'Expected: %s, Actual: %s'
|
8
9
|
# Wrong method message.
|
9
|
-
WRONG_METHOD_MSG =
|
10
|
+
WRONG_METHOD_MSG = 'arr must be Enumerable, use a different method'
|
10
11
|
# Default duck fail message.
|
11
|
-
DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
|
12
|
+
DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
|
12
13
|
# Default required keys message.
|
13
|
-
DEFAULT_REQUIRED_KEYS_MSG =
|
14
|
-
|
14
|
+
DEFAULT_REQUIRED_KEYS_MSG = 'Some or all of the required keys are not present: %s'
|
15
|
+
|
15
16
|
# Tests if the obj is of a given type.
|
16
17
|
#
|
17
18
|
# @param obj [Object] The Object to test.
|
@@ -20,17 +21,18 @@ module Wgit
|
|
20
21
|
# @param msg [String] The raised RuntimeError message, if provided.
|
21
22
|
# @return [Object] The given obj on successful assertion.
|
22
23
|
def assert_types(obj, type_or_types, msg = nil)
|
23
|
-
msg ||= DEFAULT_TYPE_FAIL_MSG
|
24
|
-
if type_or_types.respond_to?(:any?)
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
24
|
+
msg ||= format(DEFAULT_TYPE_FAIL_MSG, type_or_types, obj.class)
|
25
|
+
match = if type_or_types.respond_to?(:any?)
|
26
|
+
type_or_types.any? { |type| obj.instance_of?(type) }
|
27
|
+
else
|
28
|
+
obj.instance_of?(type_or_types)
|
29
|
+
end
|
29
30
|
raise msg unless match
|
31
|
+
|
30
32
|
obj
|
31
33
|
end
|
32
|
-
|
33
|
-
# Each object within arr must match one of the types listed in
|
34
|
+
|
35
|
+
# Each object within arr must match one of the types listed in
|
34
36
|
# type_or_types or an exception is raised using msg, if provided.
|
35
37
|
#
|
36
38
|
# @param arr [Enumerable#each] Enumerable of objects to type check.
|
@@ -39,12 +41,13 @@ module Wgit
|
|
39
41
|
# @return [Object] The given arr on successful assertion.
|
40
42
|
def assert_arr_types(arr, type_or_types, msg = nil)
|
41
43
|
raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
|
44
|
+
|
42
45
|
arr.each do |obj|
|
43
46
|
assert_types(obj, type_or_types, msg)
|
44
47
|
end
|
45
48
|
end
|
46
49
|
|
47
|
-
# The obj_or_objs must respond_to? all of the given methods or an
|
50
|
+
# The obj_or_objs must respond_to? all of the given methods or an
|
48
51
|
# Exception is raised using msg, if provided.
|
49
52
|
#
|
50
53
|
# @param obj_or_objs [Object, Enumerable#each] The objects to duck check.
|
@@ -70,29 +73,32 @@ module Wgit
|
|
70
73
|
# @param msg [String] The raised KeyError message, if provided.
|
71
74
|
# @return [Hash] The given hash on successful assertion.
|
72
75
|
def assert_required_keys(hash, keys, msg = nil)
|
73
|
-
msg ||= DEFAULT_REQUIRED_KEYS_MSG
|
76
|
+
msg ||= format(DEFAULT_REQUIRED_KEYS_MSG, keys.join(', '))
|
74
77
|
all_present = keys.all? { |key| hash.keys.include? key }
|
75
|
-
raise KeyError
|
78
|
+
raise KeyError, msg unless all_present
|
79
|
+
|
76
80
|
hash
|
77
81
|
end
|
78
82
|
|
79
|
-
|
80
|
-
|
83
|
+
private
|
84
|
+
|
81
85
|
# obj must respond_to? all methods or an exception is raised.
|
82
86
|
def _assert_respond_to(obj, methods, msg = nil)
|
83
|
-
raise
|
84
|
-
|
87
|
+
raise 'methods must respond_to? :all?' unless methods.respond_to?(:all?)
|
88
|
+
|
89
|
+
msg ||= format(DEFAULT_DUCK_FAIL_MSG, "#{obj.class} (#{obj})", methods)
|
85
90
|
match = methods.all? { |method| obj.respond_to?(method) }
|
86
91
|
raise msg unless match
|
92
|
+
|
87
93
|
obj
|
88
94
|
end
|
89
|
-
|
90
|
-
alias
|
91
|
-
alias
|
92
|
-
alias
|
93
|
-
alias
|
94
|
-
alias
|
95
|
-
alias
|
96
|
-
alias
|
95
|
+
|
96
|
+
alias assert_type assert_types
|
97
|
+
alias type assert_types
|
98
|
+
alias types assert_types
|
99
|
+
alias assert_arr_type assert_arr_types
|
100
|
+
alias arr_type assert_arr_types
|
101
|
+
alias arr_types assert_arr_types
|
102
|
+
alias respond_to assert_respond_to
|
97
103
|
end
|
98
104
|
end
|
data/lib/wgit/core_ext.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Script which extends Ruby's core functionality when parsed.
|
2
4
|
# Needs to be required separately using `require 'wgit/core_ext'`.
|
3
5
|
|
@@ -15,7 +17,7 @@ end
|
|
15
17
|
|
16
18
|
# Extend the standard Enumerable functionality.
|
17
19
|
module Enumerable
|
18
|
-
# Converts each String instance into a Wgit::Url object and returns the new
|
20
|
+
# Converts each String instance into a Wgit::Url object and returns the new
|
19
21
|
# Array.
|
20
22
|
#
|
21
23
|
# @return [Array<Wgit::Url>] The converted URL's.
|
@@ -24,8 +26,8 @@ module Enumerable
|
|
24
26
|
process_url_element(element)
|
25
27
|
end
|
26
28
|
end
|
27
|
-
|
28
|
-
# Converts each String instance into a Wgit::Url object and returns the
|
29
|
+
|
30
|
+
# Converts each String instance into a Wgit::Url object and returns the
|
29
31
|
# updated array. Modifies the receiver.
|
30
32
|
#
|
31
33
|
# @return [Array<Wgit::Url>] Self containing the converted URL's.
|
data/lib/wgit/crawler.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require_relative 'url'
|
2
4
|
require_relative 'document'
|
3
5
|
require_relative 'utils'
|
@@ -5,7 +7,6 @@ require_relative 'assertable'
|
|
5
7
|
require 'net/http' # Requires 'uri'.
|
6
8
|
|
7
9
|
module Wgit
|
8
|
-
|
9
10
|
# The Crawler class provides a means of crawling web based Wgit::Url's, turning
|
10
11
|
# their HTML into Wgit::Document instances.
|
11
12
|
class Crawler
|
@@ -61,7 +62,7 @@ module Wgit
|
|
61
62
|
def [](*urls)
|
62
63
|
# If urls is nil then add_url (when called later) will set @urls = []
|
63
64
|
# so we do nothing here.
|
64
|
-
|
65
|
+
unless urls.nil?
|
65
66
|
# Due to *urls you can end up with [[url1,url2,url3]] etc. where the
|
66
67
|
# outer array is bogus so we use the inner one only.
|
67
68
|
if urls.is_a?(Enumerable) &&
|
@@ -97,11 +98,12 @@ module Wgit
|
|
97
98
|
# by Crawler#docs after this method returns.
|
98
99
|
# @return [Wgit::Document] The last Document crawled.
|
99
100
|
def crawl_urls(urls = @urls, &block)
|
100
|
-
raise
|
101
|
+
raise 'No urls to crawl' unless urls
|
102
|
+
|
101
103
|
@docs = []
|
102
104
|
doc = nil
|
103
105
|
Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
|
104
|
-
doc
|
106
|
+
doc || @docs.last
|
105
107
|
end
|
106
108
|
|
107
109
|
# Crawl the url returning the response Wgit::Document or nil if an error
|
@@ -121,12 +123,12 @@ module Wgit
|
|
121
123
|
# @return [Wgit::Document, nil] The crawled HTML Document or nil if the
|
122
124
|
# crawl was unsuccessful.
|
123
125
|
def crawl_url(
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
126
|
+
url = @urls.first,
|
127
|
+
follow_external_redirects: true,
|
128
|
+
host: nil
|
129
|
+
)
|
128
130
|
assert_type(url, Wgit::Url)
|
129
|
-
if !follow_external_redirects
|
131
|
+
if !follow_external_redirects && host.nil?
|
130
132
|
raise 'host cannot be nil if follow_external_redirects is false'
|
131
133
|
end
|
132
134
|
|
@@ -200,23 +202,24 @@ module Wgit
|
|
200
202
|
externals.uniq
|
201
203
|
end
|
202
204
|
|
203
|
-
|
204
|
-
|
205
|
-
# Add the document to the @docs array for later processing or let the block
|
206
|
-
# process it here and now.
|
207
|
-
def handle_crawl_block(url, &block)
|
208
|
-
if block_given?
|
209
|
-
crawl_url(url, &block)
|
210
|
-
else
|
211
|
-
@docs << crawl_url(url)
|
212
|
-
nil
|
213
|
-
end
|
214
|
-
end
|
205
|
+
protected
|
215
206
|
|
216
|
-
#
|
217
|
-
#
|
218
|
-
#
|
219
|
-
#
|
207
|
+
# This method calls Wgit::Crawler#resolve to obtain the page HTML, handling
|
208
|
+
# any errors that arise and setting the @last_response. Errors or any
|
209
|
+
# HTTP response that doesn't return a HTML body will be ignored and nil
|
210
|
+
# will be returned; otherwise, the HTML String is returned.
|
211
|
+
#
|
212
|
+
# @param url [Wgit::Url] The URL to fetch the HTML for.
|
213
|
+
# @param follow_external_redirects [Boolean] Whether or not to follow
|
214
|
+
# an external redirect. False will return nil for such a crawl. If false,
|
215
|
+
# you must also provide a `host:` parameter.
|
216
|
+
# @param host [Wgit::Url, String] Specify the host by which
|
217
|
+
# an absolute redirect is determined to be internal or not. Must be
|
218
|
+
# absolute and contain a protocol prefix. For example, a `host:` of
|
219
|
+
# 'http://www.example.com' will only allow redirects for Urls with a
|
220
|
+
# `to_host` value of 'www.example.com'.
|
221
|
+
# @return [String, nil] The crawled HTML or nil if the crawl was
|
222
|
+
# unsuccessful.
|
220
223
|
def fetch(url, follow_external_redirects: true, host: nil)
|
221
224
|
response = resolve(
|
222
225
|
url,
|
@@ -225,74 +228,109 @@ module Wgit
|
|
225
228
|
)
|
226
229
|
@last_response = response
|
227
230
|
response.body.empty? ? nil : response.body
|
228
|
-
rescue
|
231
|
+
rescue StandardError => e
|
229
232
|
Wgit.logger.debug(
|
230
|
-
"Wgit::Crawler#fetch('#{url}') exception: #{
|
233
|
+
"Wgit::Crawler#fetch('#{url}') exception: #{e.message}"
|
231
234
|
)
|
232
235
|
@last_response = nil
|
233
236
|
nil
|
234
237
|
end
|
235
238
|
|
236
|
-
# The resolve method performs a HTTP GET to obtain the HTML
|
237
|
-
#
|
238
|
-
#
|
239
|
-
#
|
240
|
-
#
|
239
|
+
# The resolve method performs a HTTP GET to obtain the HTML response. The
|
240
|
+
# Net::HTTPResponse will be returned or an error raised. Redirects can be
|
241
|
+
# disabled by setting `redirect_limit: 0`.
|
242
|
+
#
|
243
|
+
# @param url [Wgit::Url] The URL to fetch the HTML from.
|
244
|
+
# @param redirect_limit [Integer] The number of redirect hops to allow
|
245
|
+
# before raising an error.
|
246
|
+
# @param follow_external_redirects [Boolean] Whether or not to follow
|
247
|
+
# an external redirect. If false, you must also provide a `host:`
|
248
|
+
# parameter.
|
249
|
+
# @param host [Wgit::Url, String] Specify the host by which
|
250
|
+
# an absolute redirect is determined to be internal or not. Must be
|
251
|
+
# absolute and contain a protocol prefix. For example, a `host:` of
|
252
|
+
# 'http://www.example.com' will only allow redirects for Urls with a
|
253
|
+
# `to_host` value of 'www.example.com'.
|
254
|
+
# @raise [StandardError] If !url.respond_to? :to_uri or a redirect isn't
|
255
|
+
# allowed.
|
256
|
+
# @return [Net::HTTPResponse] The HTTP response of the GET request.
|
241
257
|
def resolve(
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
258
|
+
url,
|
259
|
+
redirect_limit: Wgit::Crawler.default_redirect_limit,
|
260
|
+
follow_external_redirects: true,
|
261
|
+
host: nil
|
262
|
+
)
|
247
263
|
raise 'url must respond to :to_uri' unless url.respond_to?(:to_uri)
|
264
|
+
|
248
265
|
redirect_count = 0
|
266
|
+
response = nil
|
249
267
|
|
250
|
-
|
268
|
+
loop do
|
251
269
|
response = Net::HTTP.get_response(url.to_uri)
|
252
270
|
location = Wgit::Url.new(response.fetch('location', ''))
|
253
271
|
|
272
|
+
break unless response.is_a?(Net::HTTPRedirection)
|
254
273
|
yield(url, response, location) if block_given?
|
255
274
|
|
256
|
-
|
257
|
-
if !follow_external_redirects
|
275
|
+
unless location.empty?
|
276
|
+
if !follow_external_redirects &&
|
258
277
|
!location.is_relative?(host: host)
|
259
278
|
raise "External redirect not allowed - Redirected to: \
|
260
279
|
'#{location}', which is outside of host: '#{host}'"
|
261
280
|
end
|
262
281
|
|
263
282
|
raise 'Too many redirects' if redirect_count >= redirect_limit
|
283
|
+
|
264
284
|
redirect_count += 1
|
265
285
|
|
266
286
|
location = url.to_base.concat(location) if location.is_relative?
|
267
287
|
url.replace(location)
|
268
288
|
end
|
269
|
-
end
|
289
|
+
end
|
270
290
|
|
271
291
|
response
|
272
292
|
end
|
273
293
|
|
294
|
+
# Returns a doc's internal HTML page links in absolute form; used when
|
295
|
+
# crawling a site. Override this method in a subclass to change how a site
|
296
|
+
# is crawled; not what is extracted from each page (Document extensions
|
297
|
+
# should be used for this purpose instead).
|
298
|
+
#
|
299
|
+
# @param doc [Wgit::Document] The document from which to extract its
|
300
|
+
# internal page links.
|
301
|
+
# @return [Array<Wgit::Url>] The internal page links from doc.
|
302
|
+
def get_internal_links(doc)
|
303
|
+
doc.internal_full_links
|
304
|
+
.map(&:without_anchor) # Because anchors don't change page content.
|
305
|
+
.uniq
|
306
|
+
.reject do |link|
|
307
|
+
ext = link.to_extension
|
308
|
+
ext ? !%w[htm html].include?(ext) : false
|
309
|
+
end
|
310
|
+
end
|
311
|
+
|
312
|
+
private
|
313
|
+
|
314
|
+
# Add the document to the @docs array for later processing or let the block
|
315
|
+
# process it here and now.
|
316
|
+
def handle_crawl_block(url, &block)
|
317
|
+
if block_given?
|
318
|
+
crawl_url(url, &block)
|
319
|
+
else
|
320
|
+
@docs << crawl_url(url)
|
321
|
+
nil
|
322
|
+
end
|
323
|
+
end
|
324
|
+
|
274
325
|
# Add the url to @urls ensuring it is cast to a Wgit::Url if necessary.
|
275
326
|
def add_url(url)
|
276
327
|
@urls = [] if @urls.nil?
|
277
328
|
@urls << Wgit::Url.new(url)
|
278
329
|
end
|
279
330
|
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
doc.internal_full_links.
|
285
|
-
map(&:without_anchor).
|
286
|
-
uniq.
|
287
|
-
reject do |link|
|
288
|
-
ext = link.to_extension
|
289
|
-
ext ? !['htm', 'html'].include?(ext) : false
|
290
|
-
end
|
291
|
-
end
|
292
|
-
|
293
|
-
alias :crawl :crawl_urls
|
294
|
-
alias :crawl_pages :crawl_urls
|
295
|
-
alias :crawl_page :crawl_url
|
296
|
-
alias :crawl_r :crawl_site
|
331
|
+
alias crawl crawl_urls
|
332
|
+
alias crawl_pages crawl_urls
|
333
|
+
alias crawl_page crawl_url
|
334
|
+
alias crawl_r crawl_site
|
297
335
|
end
|
298
336
|
end
|