wgit 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/README.md +53 -17
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +17 -7
- data/lib/wgit/document.rb +87 -54
- data/lib/wgit/document_extensions.rb +13 -3
- data/lib/wgit/response.rb +6 -6
- data/lib/wgit/url.rb +17 -0
- data/lib/wgit/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dc2ea1de7219c66eb70ed7b4e97bb8da7c169eb68257df7d7d1bfdaf6a5ed4d6
|
4
|
+
data.tar.gz: f88afdd7477812c3b9fdbde8e1125b950aec3fe3fabef5a20e0c16a9e26a767b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e5fe86fee44e4c494d936d747719d856c240a0129db0692afa44ad9855340929a8b90cc177d92d775bc7cc15af99093078cf6a8fc8b9bbd9bc8965866d343914
|
7
|
+
data.tar.gz: c87efb4c5dcfb8795d62ab41f7e8d2bc206e7f8407707c9269c0fc86fbdcd14b7269d04083ec756d3b77a99300639469e88c639ee125ddee0984c3957e7cfc7b
|
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,21 @@
|
|
9
9
|
- ...
|
10
10
|
---
|
11
11
|
|
12
|
+
## v0.8.0
|
13
|
+
### Added
|
14
|
+
- To the range of `Wgit::Document.text_elements`. Now all (and only) the visible page text should be extracted into `Wgit::Document#text` successfully.
|
15
|
+
- `Wgit::Document#description` default extension.
|
16
|
+
- `Wgit::Url.parse_or_nil` method.
|
17
|
+
### Changed/Removed
|
18
|
+
- Breaking change: Renamed `Document#stats[:text_snippets]` to be `:text`.
|
19
|
+
- Breaking change: `Wgit::Document.define_extension`'s block return value now becomes the `var` value, even when `nil` is returned. This allows `var` to be set to `nil`.
|
20
|
+
- Potential breaking change: Renamed `Wgit::Response#crawl_time` (alias) to be `#crawl_duration`.
|
21
|
+
- Updated `Wgit::Crawler::SUPPORTED_FILE_EXTENSIONS` to be `Wgit::Crawler.supported_file_extensions`, making it configurable. Now you can add your own URL extensions if needed.
|
22
|
+
- Updated the Wgit core extension `String#to_url` to use `Wgit::Url.parse`, allowing instances of `Wgit::Url` to be returned as is. This also affects `Enumerable#to_urls` in the same way.
|
23
|
+
### Fixed
|
24
|
+
- An issue where too much `Wgit::Document#text` was being extracted from the HTML. This was fixed by reverting the recent commit: "Document.text_elements_xpath is now `//*/text()`".
|
25
|
+
---
|
26
|
+
|
12
27
|
## v0.7.0
|
13
28
|
### Added
|
14
29
|
- `Wgit::Indexer.new` optional `crawler:` named param.
|
data/README.md
CHANGED
@@ -68,17 +68,18 @@ crawler.last_response.class # => Wgit::Response is a wrapper for Typhoeus::Respo
|
|
68
68
|
|
69
69
|
doc.class # => Wgit::Document
|
70
70
|
doc.class.public_instance_methods(false).sort # => [
|
71
|
-
# :==, :[], :author, :base, :base_url, :content, :css, :doc, :empty?,
|
72
|
-
# :external_urls, :html, :internal_absolute_links,
|
73
|
-
# :internal_links, :internal_urls, :keywords, :links, :score,
|
74
|
-
# :size, :statistics, :stats, :text, :title, :to_h, :to_json,
|
71
|
+
# :==, :[], :author, :base, :base_url, :content, :css, :description, :doc, :empty?,
|
72
|
+
# :external_links, :external_urls, :html, :internal_absolute_links,
|
73
|
+
# :internal_absolute_urls, :internal_links, :internal_urls, :keywords, :links, :score,
|
74
|
+
# :search, :search!, :size, :statistics, :stats, :text, :title, :to_h, :to_json,
|
75
|
+
# :url, :xpath
|
75
76
|
# ]
|
76
77
|
|
77
78
|
doc.url # => "https://wikileaks.org/What-is-Wikileaks.html"
|
78
79
|
doc.title # => "WikiLeaks - What is WikiLeaks"
|
79
80
|
doc.stats # => {
|
80
81
|
# :url=>44, :html=>28133, :title=>17, :keywords=>0,
|
81
|
-
# :links=>35, :
|
82
|
+
# :links=>35, :text=>67, :text_bytes=>13735
|
82
83
|
# }
|
83
84
|
doc.links # => ["#submit_help_contact", "#submit_help_tor", "#submit_help_tips", ...]
|
84
85
|
doc.text # => ["The Courage Foundation is an international organisation that <snip>", ...]
|
@@ -273,15 +274,50 @@ urls_to_crawl = db.uncrawled_urls # => Results will include top_result.external_
|
|
273
274
|
|
274
275
|
## Extending The API
|
275
276
|
|
276
|
-
Document serialising in Wgit is the means of downloading a web page and
|
277
|
+
Document serialising in Wgit is the means of downloading a web page and serialising parts of its content into accessible `Wgit::Document` attributes/methods. For example, `Wgit::Document#author` will return you the webpage's xpath value of `meta[@name='author']`.
|
277
278
|
|
278
|
-
|
279
|
+
There are two ways to extend the Document serialising behaviour of Wgit for your own means:
|
279
280
|
|
280
|
-
|
281
|
+
1. Add additional **textual** content to `Wgit::Document#text`.
|
282
|
+
2. Define `Wgit::Document` instance methods for specific HTML **elements**.
|
281
283
|
|
282
|
-
|
284
|
+
These two methods are described in more detail below.
|
283
285
|
|
284
|
-
|
286
|
+
### 1. Extending The Default Text Elements
|
287
|
+
|
288
|
+
Wgit contains a set of `Wgit::Document.text_elements` defining which HTML elements contain text on a page; which in turn are serialised. Once serialised you can process this text content via methods like `Wgit::Document#text` and `Wgit::Document#search` etc.
|
289
|
+
|
290
|
+
The below code example shows how to extract additional text from a webpage:
|
291
|
+
|
292
|
+
```ruby
|
293
|
+
require 'wgit'
|
294
|
+
|
295
|
+
# The default text_elements cover most visible page text but let's say we
|
296
|
+
# have a <table> element with text content that we want.
|
297
|
+
Wgit::Document.text_elements << :table
|
298
|
+
|
299
|
+
doc = Wgit::Document.new(
|
300
|
+
'http://some_url.com',
|
301
|
+
<<~HTML
|
302
|
+
<html>
|
303
|
+
<p>Hello world!</p>
|
304
|
+
<table>My table</table>
|
305
|
+
</html>
|
306
|
+
HTML
|
307
|
+
)
|
308
|
+
|
309
|
+
# Now every crawled Document#text will include <table> text content.
|
310
|
+
doc.text # => ["Hello world!", "My table"]
|
311
|
+
doc.search('table') # => ["My table"]
|
312
|
+
```
|
313
|
+
|
314
|
+
**Note**: This only works for *textual* page content. For more control over the serialised *elements* themselves, see below.
|
315
|
+
|
316
|
+
### 2. Serialising Specific HTML Elements (via Document Extensions)
|
317
|
+
|
318
|
+
Wgit provides some [default extensions](https://github.com/michaeltelford/wgit/blob/master/lib/wgit/document_extensions.rb) to extract a page's text, links etc. This of course is often not enough given the nature of the WWW and the differences from one webpage to the next.
|
319
|
+
|
320
|
+
Therefore, you can define a Document extension for each HTML element(s) that you want to extract and serialise into a `Wgit::Document` instance variable, equipped with a getter method. Once an extension is defined, all crawled Documents will contain your extracted content.
|
285
321
|
|
286
322
|
Here's how to add a Document extension to serialise a specific page element:
|
287
323
|
|
@@ -293,14 +329,14 @@ Wgit::Document.define_extension(
|
|
293
329
|
:tables, # Wgit::Document#tables will return the page's tables.
|
294
330
|
'//table', # The xpath to extract the tables.
|
295
331
|
singleton: false, # True returns the first table found, false returns all.
|
296
|
-
text_content_only: false, # True returns
|
297
|
-
# false returns the tables as Nokogiri objects (see below).
|
332
|
+
text_content_only: false, # True returns the table text, false returns the Nokogiri object.
|
298
333
|
) do |tables|
|
299
|
-
# Here we can manipulate the
|
334
|
+
# Here we can inspect/manipulate the tables before they're set as Wgit::Document#tables.
|
335
|
+
tables
|
300
336
|
end
|
301
337
|
|
302
|
-
# Our Document has a table which we're interested in.
|
303
|
-
#
|
338
|
+
# Our Document has a table which we're interested in. Note it doesn't matter how the Document
|
339
|
+
# is initialised e.g. manually (as below) or via Wgit::Crawler methods etc.
|
304
340
|
doc = Wgit::Document.new(
|
305
341
|
'http://some_url.com',
|
306
342
|
<<~HTML
|
@@ -323,9 +359,9 @@ tables = doc.tables
|
|
323
359
|
tables.class # => Nokogiri::XML::NodeSet
|
324
360
|
tables.first.class # => Nokogiri::XML::Element
|
325
361
|
|
326
|
-
#
|
362
|
+
# Note, the Document's stats now include our 'tables' extension.
|
327
363
|
doc.stats # => {
|
328
|
-
# :url=>19, :html=>242, :links=>0, :
|
364
|
+
# :url=>19, :html=>242, :links=>0, :text=>8, :text_bytes=>91, :tables=>1
|
329
365
|
# }
|
330
366
|
```
|
331
367
|
|
data/lib/wgit/core_ext.rb
CHANGED
data/lib/wgit/crawler.rb
CHANGED
@@ -11,18 +11,26 @@ require 'typhoeus'
|
|
11
11
|
module Wgit
|
12
12
|
# The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
|
13
13
|
# serialising their HTML into Wgit::Document instances. This is the only Wgit
|
14
|
-
# class which contains network logic e.g. request/response handling.
|
14
|
+
# class which contains network logic e.g. HTTP request/response handling.
|
15
15
|
class Crawler
|
16
16
|
include Assertable
|
17
17
|
|
18
|
-
#
|
19
|
-
|
20
|
-
# doesn't keep the crawl of the site going. All URL's without a file
|
21
|
-
# extension will be crawled, because they're assumed to be HTML.
|
22
|
-
SUPPORTED_FILE_EXTENSIONS = Set.new(
|
18
|
+
# Set of supported file extensions for Wgit::Crawler#crawl_site.
|
19
|
+
@supported_file_extensions = Set.new(
|
23
20
|
%w[asp aspx cfm cgi htm html htmlx jsp php]
|
24
21
|
)
|
25
22
|
|
23
|
+
class << self
|
24
|
+
# The URL file extensions (from `<a>` hrefs) which will be crawled by
|
25
|
+
# `#crawl_site`. The idea is to omit anything that isn't HTML and therefore
|
26
|
+
# doesn't keep the crawl of the site going. All URL's without a file
|
27
|
+
# extension will be crawled, because they're assumed to be HTML.
|
28
|
+
# The `#crawl` method will crawl anything since it's given the URL(s).
|
29
|
+
# You can add your own site's URL file extension e.g.
|
30
|
+
# `Wgit::Crawler.supported_file_extensions << 'html5'` etc.
|
31
|
+
attr_reader :supported_file_extensions
|
32
|
+
end
|
33
|
+
|
26
34
|
# The amount of allowed redirects before raising an error. Set to 0 to
|
27
35
|
# disable redirects completely; or you can pass `follow_redirects: false`
|
28
36
|
# to any Wgit::Crawler.crawl_* method.
|
@@ -313,7 +321,9 @@ module Wgit
|
|
313
321
|
.uniq
|
314
322
|
.select do |link|
|
315
323
|
ext = link.to_extension
|
316
|
-
ext ?
|
324
|
+
ext ?
|
325
|
+
Wgit::Crawler.supported_file_extensions.include?(ext.downcase) :
|
326
|
+
true # URLs without an extension are assumed HTML.
|
317
327
|
end
|
318
328
|
|
319
329
|
return links if allow_paths.nil? && disallow_paths.nil?
|
data/lib/wgit/document.rb
CHANGED
@@ -20,14 +20,26 @@ module Wgit
|
|
20
20
|
# Regex for the allowed var names when defining an extension.
|
21
21
|
REGEX_EXTENSION_NAME = /[a-z0-9_]+/.freeze
|
22
22
|
|
23
|
-
#
|
24
|
-
|
23
|
+
# Set of text elements used to build Document#text.
|
24
|
+
@text_elements = Set.new(%i[
|
25
|
+
a abbr address article aside b bdi bdo blockquote button caption cite
|
26
|
+
code data dd del details dfn div dl dt em figcaption figure footer h1 h2
|
27
|
+
h3 h4 h5 h6 header hr i input ins kbd label legend li main mark meter ol
|
28
|
+
option output p pre q rb rt ruby s samp section small span strong sub
|
29
|
+
summary sup td textarea th time u ul var wbr
|
30
|
+
])
|
25
31
|
|
26
32
|
# Set of Symbols representing the defined Document extensions.
|
27
33
|
@extensions = Set.new
|
28
34
|
|
29
35
|
class << self
|
30
|
-
#
|
36
|
+
# Set of HTML elements that make up the visible text on a page. These
|
37
|
+
# elements are used to initialize the Wgit::Document#text. See the
|
38
|
+
# README.md for how to add to this Set dynamically.
|
39
|
+
attr_reader :text_elements
|
40
|
+
|
41
|
+
# Set of Symbols representing the defined Document extensions. Is
|
42
|
+
# read-only. Use Wgit::Document.define_extension for a new extension.
|
31
43
|
attr_reader :extensions
|
32
44
|
end
|
33
45
|
|
@@ -72,6 +84,23 @@ module Wgit
|
|
72
84
|
|
73
85
|
### Document Class Methods ###
|
74
86
|
|
87
|
+
# Uses Document.text_elements to build an xpath String, used to obtain
|
88
|
+
# all of the combined visual text on a webpage.
|
89
|
+
#
|
90
|
+
# @return [String] An xpath String to obtain a webpage's text elements.
|
91
|
+
def self.text_elements_xpath
|
92
|
+
xpath = ''
|
93
|
+
return xpath if Wgit::Document.text_elements.empty?
|
94
|
+
|
95
|
+
el_xpath = '//%s/text()'
|
96
|
+
Wgit::Document.text_elements.each_with_index do |el, i|
|
97
|
+
xpath += ' | ' unless i.zero?
|
98
|
+
xpath += format(el_xpath, el)
|
99
|
+
end
|
100
|
+
|
101
|
+
xpath
|
102
|
+
end
|
103
|
+
|
75
104
|
# Defines an extension, which is a way to serialise HTML elements into
|
76
105
|
# instance variables upon Document initialization. See the default
|
77
106
|
# extensions defined in 'document_extensions.rb' as examples.
|
@@ -82,11 +111,12 @@ module Wgit
|
|
82
111
|
# the xpath or database object result(s).
|
83
112
|
#
|
84
113
|
# When initialising from HTML, a singleton value of true will only
|
85
|
-
# ever return
|
86
|
-
# Array. When initialising from a database object, the value
|
87
|
-
# is and singleton is only used to define the default empty
|
88
|
-
# If a value cannot be found (in either the HTML or database
|
89
|
-
# a default will be used. The default value is:
|
114
|
+
# ever return the first result found; otherwise all the results are
|
115
|
+
# returned in an Array. When initialising from a database object, the value
|
116
|
+
# is taken as is and singleton is only used to define the default empty
|
117
|
+
# value. If a value cannot be found (in either the HTML or database
|
118
|
+
# object), then a default will be used. The default value is:
|
119
|
+
# `singleton ? nil : []`.
|
90
120
|
#
|
91
121
|
# @param var [Symbol] The name of the variable to be initialised.
|
92
122
|
# @param xpath [String, #call] The xpath used to find the element(s)
|
@@ -105,14 +135,16 @@ module Wgit
|
|
105
135
|
# @option opts [Boolean] :text_content_only The text_content_only option
|
106
136
|
# if true will use the text content of the Nokogiri result object,
|
107
137
|
# otherwise the Nokogiri object itself is returned. Defaults to true.
|
108
|
-
# @
|
109
|
-
#
|
110
|
-
#
|
111
|
-
#
|
112
|
-
#
|
113
|
-
#
|
114
|
-
#
|
115
|
-
#
|
138
|
+
# @yield The block is executed when a Wgit::Document is initialized,
|
139
|
+
# regardless of the source. Use it (optionally) to process the result
|
140
|
+
# value.
|
141
|
+
# @yieldparam value [Object] The result value to be assigned to the new
|
142
|
+
# `var`.
|
143
|
+
# @yieldparam source [Wgit::Document, Object] The source of the `value`.
|
144
|
+
# @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
|
145
|
+
# `:object`.
|
146
|
+
# @yieldreturn [Object] The return value of the block becomes the new var's
|
147
|
+
# value. Return the block's `value` param unchanged if you simply want to inspect it.
|
116
148
|
# @raise [StandardError] If the var param isn't valid.
|
117
149
|
# @return [Symbol] The given var Symbol if successful.
|
118
150
|
def self.define_extension(var, xpath, opts = {}, &block)
|
@@ -143,12 +175,12 @@ module Wgit
|
|
143
175
|
var
|
144
176
|
end
|
145
177
|
|
146
|
-
# Removes the init_
|
147
|
-
# Therefore, this is the opposing method to Document.define_extension
|
178
|
+
# Removes the `init_*` methods created when an extension is defined.
|
179
|
+
# Therefore, this is the opposing method to `Document.define_extension`.
|
148
180
|
# Returns true if successful or false if the method(s) cannot be found.
|
149
181
|
#
|
150
182
|
# @param var [Symbol] The extension variable already defined.
|
151
|
-
# @return [Boolean] True if the extension var was found and removed;
|
183
|
+
# @return [Boolean] True if the extension `var` was found and removed;
|
152
184
|
# otherwise false.
|
153
185
|
def self.remove_extension(var)
|
154
186
|
Document.send(:remove_method, "init_#{var}_from_html")
|
@@ -173,7 +205,7 @@ module Wgit
|
|
173
205
|
(@url == other.url) && (@html == other.html)
|
174
206
|
end
|
175
207
|
|
176
|
-
#
|
208
|
+
# Shortcut for calling Document#html[range].
|
177
209
|
#
|
178
210
|
# @param range [Range] The range of @html to return.
|
179
211
|
# @return [String] The given range of @html.
|
@@ -262,8 +294,8 @@ module Wgit
|
|
262
294
|
instance_variables.each do |var|
|
263
295
|
# Add up the total bytes of text as well as the length.
|
264
296
|
if var == :@text
|
265
|
-
hash[:
|
266
|
-
hash[:text_bytes]
|
297
|
+
hash[:text] = @text.length
|
298
|
+
hash[:text_bytes] = @text.sum(&:length)
|
267
299
|
# Else take the var's #length method return value.
|
268
300
|
else
|
269
301
|
next unless instance_variable_get(var).respond_to?(:length)
|
@@ -309,8 +341,8 @@ module Wgit
|
|
309
341
|
@doc.css(selector)
|
310
342
|
end
|
311
343
|
|
312
|
-
# Returns all internal links from this Document in relative form.
|
313
|
-
# meaning a link to another document on the same host.
|
344
|
+
# Returns all unique internal links from this Document in relative form.
|
345
|
+
# Internal meaning a link to another document on the same host.
|
314
346
|
#
|
315
347
|
# This Document's host is used to determine if an absolute URL is actually
|
316
348
|
# a relative link e.g. For a Document representing
|
@@ -319,7 +351,7 @@ module Wgit
|
|
319
351
|
# as an internal link because both Documents live on the same host. Also
|
320
352
|
# see Wgit::Document#internal_absolute_links.
|
321
353
|
#
|
322
|
-
# @return [Array<Wgit::Url>] Self's internal Url's in relative form.
|
354
|
+
# @return [Array<Wgit::Url>] Self's unique internal Url's in relative form.
|
323
355
|
def internal_links
|
324
356
|
return [] if @links.empty?
|
325
357
|
|
@@ -333,19 +365,19 @@ module Wgit
|
|
333
365
|
Wgit::Utils.process_arr(links)
|
334
366
|
end
|
335
367
|
|
336
|
-
# Returns all internal links from this Document in absolute form by
|
368
|
+
# Returns all unique internal links from this Document in absolute form by
|
337
369
|
# appending them to self's #base_url. Also see
|
338
370
|
# Wgit::Document#internal_links.
|
339
371
|
#
|
340
|
-
# @return [Array<Wgit::Url>] Self's internal Url's in absolute form.
|
372
|
+
# @return [Array<Wgit::Url>] Self's unique internal Url's in absolute form.
|
341
373
|
def internal_absolute_links
|
342
374
|
internal_links.map { |link| link.prefix_base(self) }
|
343
375
|
end
|
344
376
|
|
345
|
-
# Returns all external links from this Document in absolute form.
|
346
|
-
# meaning a link to a different host.
|
377
|
+
# Returns all unique external links from this Document in absolute form.
|
378
|
+
# External meaning a link to a different host.
|
347
379
|
#
|
348
|
-
# @return [Array<Wgit::Url>] Self's external Url's in absolute form.
|
380
|
+
# @return [Array<Wgit::Url>] Self's unique external Url's in absolute form.
|
349
381
|
def external_links
|
350
382
|
return [] if @links.empty?
|
351
383
|
|
@@ -437,19 +469,16 @@ module Wgit
|
|
437
469
|
# Override this method to custom configure the Nokogiri object returned.
|
438
470
|
# Gets called from Wgit::Document.new upon initialization.
|
439
471
|
#
|
472
|
+
# @yield [config] The given block is passed to Nokogiri::HTML for initialisation.
|
440
473
|
# @raise [StandardError] If @html isn't set.
|
441
474
|
# @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
|
442
|
-
def init_nokogiri
|
475
|
+
def init_nokogiri(&block)
|
443
476
|
raise '@html must be set' unless @html
|
444
477
|
|
445
|
-
Nokogiri::HTML(@html)
|
446
|
-
# TODO: Remove #'s below when crawling in production.
|
447
|
-
# config.options = Nokogiri::XML::ParseOptions::STRICT |
|
448
|
-
# Nokogiri::XML::ParseOptions::NONET
|
449
|
-
end
|
478
|
+
Nokogiri::HTML(@html, &block)
|
450
479
|
end
|
451
480
|
|
452
|
-
#
|
481
|
+
# Extracts a value/object from this Document's @html using the given xpath
|
453
482
|
# parameter.
|
454
483
|
#
|
455
484
|
# @param xpath [String] Used to find the value/object in @html.
|
@@ -457,10 +486,15 @@ module Wgit
|
|
457
486
|
# Object) : results (Array).
|
458
487
|
# @param text_content_only [Boolean] text_content_only ? result.content
|
459
488
|
# (String) : result (Nokogiri Object).
|
460
|
-
# @yield
|
461
|
-
#
|
462
|
-
#
|
463
|
-
#
|
489
|
+
# @yield The block is executed when a Wgit::Document is initialized,
|
490
|
+
# regardless of the source. Use it (optionally) to process the result
|
491
|
+
# value.
|
492
|
+
# @yieldparam value [Object] The result value to be returned.
|
493
|
+
# @yieldparam source [Wgit::Document, Object] The source of the `value`.
|
494
|
+
# @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
|
495
|
+
# `:object`.
|
496
|
+
# @yieldreturn [Object] The return value of the block gets returned. Return
|
497
|
+
# the block's `value` param unchanged if you simply want to inspect it.
|
464
498
|
# @return [String, Object] The value found in the html or the default value
|
465
499
|
# (singleton ? nil : []).
|
466
500
|
def find_in_html(xpath, singleton: true, text_content_only: true)
|
@@ -478,23 +512,25 @@ module Wgit
|
|
478
512
|
|
479
513
|
singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
|
480
514
|
|
481
|
-
if block_given?
|
482
|
-
new_result = yield(result, self, :document)
|
483
|
-
result = new_result unless new_result.nil?
|
484
|
-
end
|
515
|
+
result = yield(result, self, :document) if block_given?
|
485
516
|
|
486
517
|
result
|
487
518
|
end
|
488
519
|
|
489
|
-
# Returns a value from the obj using the given key via obj#fetch
|
520
|
+
# Returns a value from the obj using the given key via `obj#fetch`.
|
490
521
|
#
|
491
522
|
# @param obj [#fetch] The object containing the key/value.
|
492
523
|
# @param key [String] Used to find the value in the obj.
|
493
524
|
# @param singleton [Boolean] True if a single value, false otherwise.
|
494
|
-
# @yield
|
495
|
-
#
|
496
|
-
#
|
497
|
-
#
|
525
|
+
# @yield The block is executed when a Wgit::Document is initialized,
|
526
|
+
# regardless of the source. Use it (optionally) to process the result
|
527
|
+
# value.
|
528
|
+
# @yieldparam value [Object] The result value to be returned.
|
529
|
+
# @yieldparam source [Wgit::Document, Object] The source of the `value`.
|
530
|
+
# @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
|
531
|
+
# `:object`.
|
532
|
+
# @yieldreturn [Object] The return value of the block gets returned. Return
|
533
|
+
# the block's `value` param unchanged if you simply want to inspect it.
|
498
534
|
# @return [String, Object] The value found in the obj or the default value
|
499
535
|
# (singleton ? nil : []).
|
500
536
|
def find_in_object(obj, key, singleton: true)
|
@@ -505,10 +541,7 @@ module Wgit
|
|
505
541
|
|
506
542
|
singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
|
507
543
|
|
508
|
-
if block_given?
|
509
|
-
new_result = yield(result, obj, :object)
|
510
|
-
result = new_result unless new_result.nil?
|
511
|
-
end
|
544
|
+
result = yield(result, obj, :object) if block_given?
|
512
545
|
|
513
546
|
result
|
514
547
|
end
|
@@ -9,7 +9,7 @@ Wgit::Document.define_extension(
|
|
9
9
|
singleton: true,
|
10
10
|
text_content_only: true
|
11
11
|
) do |base|
|
12
|
-
Wgit::Url.
|
12
|
+
Wgit::Url.parse_or_nil(base) if base
|
13
13
|
end
|
14
14
|
|
15
15
|
# Title.
|
@@ -20,6 +20,14 @@ Wgit::Document.define_extension(
|
|
20
20
|
text_content_only: true
|
21
21
|
)
|
22
22
|
|
23
|
+
# Description.
|
24
|
+
Wgit::Document.define_extension(
|
25
|
+
:description,
|
26
|
+
'//meta[@name="description"]/@content',
|
27
|
+
singleton: true,
|
28
|
+
text_content_only: true
|
29
|
+
)
|
30
|
+
|
23
31
|
# Author.
|
24
32
|
Wgit::Document.define_extension(
|
25
33
|
:author,
|
@@ -49,13 +57,15 @@ Wgit::Document.define_extension(
|
|
49
57
|
singleton: false,
|
50
58
|
text_content_only: true
|
51
59
|
) do |links|
|
52
|
-
links
|
60
|
+
links
|
61
|
+
.map { |link| Wgit::Url.parse_or_nil(link) }
|
62
|
+
.compact # Remove unparsable links.
|
53
63
|
end
|
54
64
|
|
55
65
|
# Text.
|
56
66
|
Wgit::Document.define_extension(
|
57
67
|
:text,
|
58
|
-
Wgit::Document
|
68
|
+
proc { Wgit::Document.text_elements_xpath },
|
59
69
|
singleton: false,
|
60
70
|
text_content_only: true
|
61
71
|
)
|
data/lib/wgit/response.rb
CHANGED
@@ -131,11 +131,11 @@ module Wgit
|
|
131
131
|
@status.positive?
|
132
132
|
end
|
133
133
|
|
134
|
-
alias code
|
135
|
-
alias content
|
136
|
-
alias
|
137
|
-
alias to_s
|
138
|
-
alias redirects
|
139
|
-
alias length
|
134
|
+
alias code status
|
135
|
+
alias content body
|
136
|
+
alias crawl_duration total_time
|
137
|
+
alias to_s body
|
138
|
+
alias redirects redirections
|
139
|
+
alias length size
|
140
140
|
end
|
141
141
|
end
|
data/lib/wgit/url.rb
CHANGED
@@ -90,6 +90,23 @@ module Wgit
|
|
90
90
|
obj.is_a?(Wgit::Url) ? obj : new(obj)
|
91
91
|
end
|
92
92
|
|
93
|
+
# Returns a Wgit::Url instance from Wgit::Url.parse, or nil if obj cannot
|
94
|
+
# be parsed successfully e.g. the String is invalid.
|
95
|
+
#
|
96
|
+
# Use this method when you can't guarantee that obj is parsable as a URL.
|
97
|
+
# See Wgit::Url.parse for more information.
|
98
|
+
#
|
99
|
+
# @param obj [Object] The object to parse, which #is_a?(String).
|
100
|
+
# @raise [StandardError] If obj.is_a?(String) is false.
|
101
|
+
# @return [Wgit::Url, nil] A Wgit::Url instance or nil (if obj is invalid).
|
102
|
+
def self.parse_or_nil(obj)
|
103
|
+
parse(obj)
|
104
|
+
rescue Addressable::URI::InvalidURIError
|
105
|
+
Wgit.logger.debug("Wgit::Url.parse_or_nil('#{obj}') exception: \
|
106
|
+
Addressable::URI::InvalidURIError")
|
107
|
+
nil
|
108
|
+
end
|
109
|
+
|
93
110
|
# Sets the @crawled instance var, also setting @date_crawled for
|
94
111
|
# convenience.
|
95
112
|
#
|
data/lib/wgit/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-01-
|
11
|
+
date: 2020-01-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|