wgit 0.10.5 → 0.10.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/lib/wgit/document.rb +34 -32
- data/lib/wgit/dsl.rb +1 -0
- data/lib/wgit/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4598dcfc047ce3915ba5a871837be5efc54201d61b4967cf53070bec2af4dd52
|
4
|
+
data.tar.gz: 604010011024af6f2d4dfcc87e6c4c1d73f8e4811938281119fccb79792818c1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 44b098e2a97191801787386e9d2060dcdeacc625c3453976679fc276a73b2bf0614713764a55f7074073018e898f2e43dc1a7f4f803339a86158052f59dcabcb
|
7
|
+
data.tar.gz: 8645c7095bb14590cf83c21905c9f5ed524e1047254e6526b8fe46a53f3989395472300d27fb65f899951a5f4b80ee9928accd23164b10e1a834975bf045db47
|
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,15 @@
|
|
9
9
|
- ...
|
10
10
|
---
|
11
11
|
|
12
|
+
## v0.10.6
|
13
|
+
### Added
|
14
|
+
- `Wgit::DSL` method `#crawl_url` (aliased to `#crawl`).
|
15
|
+
### Changed/Removed
|
16
|
+
- Added a `&block` param to `Wgit::Document#extract`, which gets passed to `#extract_from_html`.
|
17
|
+
### Fixed
|
18
|
+
- ...
|
19
|
+
---
|
20
|
+
|
12
21
|
## v0.10.5
|
13
22
|
### Added
|
14
23
|
- `Database#last_result` getter method to return the most recent raw mongo result.
|
data/lib/wgit/document.rb
CHANGED
@@ -89,24 +89,18 @@ module Wgit
|
|
89
89
|
#
|
90
90
|
# @return [String] An xpath String to obtain a webpage's text elements.
|
91
91
|
def self.text_elements_xpath
|
92
|
-
xpath
|
93
|
-
|
94
|
-
|
95
|
-
el_xpath = '//%s/text()'
|
96
|
-
Wgit::Document.text_elements.each_with_index do |el, i|
|
97
|
-
xpath += ' | ' unless i.zero?
|
98
|
-
xpath += format(el_xpath, el)
|
92
|
+
Wgit::Document.text_elements.each_with_index.reduce("") do |xpath, (el, i)|
|
93
|
+
xpath += " | " unless i.zero?
|
94
|
+
xpath += format("//%s/text()", el)
|
99
95
|
end
|
100
|
-
|
101
|
-
xpath
|
102
96
|
end
|
103
97
|
|
104
98
|
# Defines a content extractor, which extracts HTML elements/content
|
105
99
|
# into instance variables upon Document initialization. See the default
|
106
100
|
# extractors defined in 'document_extractors.rb' as examples. Defining an
|
107
101
|
# extractor means that every subsequently crawled/initialized document
|
108
|
-
# will attempt to extract the xpath's content. Use `#
|
109
|
-
# content extraction.
|
102
|
+
# will attempt to extract the xpath's content. Use `#extract` for a one off
|
103
|
+
# content extraction on any document.
|
110
104
|
#
|
111
105
|
# Note that defined extractors work for both Documents initialized from
|
112
106
|
# HTML (via Wgit::Crawler methods) and from database objects.
|
@@ -115,9 +109,9 @@ module Wgit
|
|
115
109
|
#
|
116
110
|
# When initialising from HTML, a singleton value of true will only
|
117
111
|
# ever return the first result found; otherwise all the results are
|
118
|
-
# returned in an
|
119
|
-
# is taken as is and singleton is only used to define the default
|
120
|
-
# value. If a value cannot be found (in either the HTML or database
|
112
|
+
# returned in an Enumerable. When initialising from a database object, the
|
113
|
+
# value is taken as is and singleton is only used to define the default
|
114
|
+
# empty value. If a value cannot be found (in either the HTML or database
|
121
115
|
# object), then a default will be used. The default value is:
|
122
116
|
# `singleton ? nil : []`.
|
123
117
|
#
|
@@ -134,12 +128,14 @@ module Wgit
|
|
134
128
|
# @param opts [Hash] The options to define an extractor with. The
|
135
129
|
# options are only used when initializing from HTML, not the database.
|
136
130
|
# @option opts [Boolean] :singleton The singleton option determines
|
137
|
-
# whether or not the result(s) should be in an
|
131
|
+
# whether or not the result(s) should be in an Enumerable. If multiple
|
138
132
|
# results are found and singleton is true then the first result will be
|
139
133
|
# used. Defaults to true.
|
140
134
|
# @option opts [Boolean] :text_content_only The text_content_only option
|
141
|
-
# if true will use the text content of the Nokogiri result object,
|
142
|
-
# otherwise the Nokogiri object itself is returned.
|
135
|
+
# if true will use the text #content of the Nokogiri result object,
|
136
|
+
# otherwise the Nokogiri object itself is returned. The type of Nokogiri
|
137
|
+
# object returned depends on the given xpath query. See the Nokogiri
|
138
|
+
# documentation for more information. Defaults to true.
|
143
139
|
# @yield The block is executed when a Wgit::Document is initialized,
|
144
140
|
# regardless of the source. Use it (optionally) to process the result
|
145
141
|
# value.
|
@@ -509,16 +505,24 @@ be relative"
|
|
509
505
|
# parameter.
|
510
506
|
#
|
511
507
|
# @param xpath [String, #call] Used to find the value/object in @html.
|
512
|
-
# @param singleton [Boolean] singleton ? results.first (single
|
513
|
-
#
|
508
|
+
# @param singleton [Boolean] singleton ? results.first (single Object) :
|
509
|
+
# results (Enumerable).
|
514
510
|
# @param text_content_only [Boolean] text_content_only ? result.content
|
515
511
|
# (String) : result (Nokogiri Object).
|
512
|
+
# @yield (Optionally) Pass a block to read/write the result value before
|
513
|
+
# it's returned.
|
514
|
+
# @yieldparam value [Object] The result value to be returned.
|
515
|
+
# @yieldparam source [Wgit::Document, Object] This Document instance.
|
516
|
+
# @yieldparam type [Symbol] The `source` type, which is `:document`.
|
517
|
+
# @yieldreturn [Object] The return value of the block gets returned. Return
|
518
|
+
# the block's `value` param unchanged if you simply want to inspect it.
|
516
519
|
# @return [String, Object] The value found in the html or the default value
|
517
520
|
# (singleton ? nil : []).
|
518
|
-
def extract(xpath, singleton: true, text_content_only: true)
|
521
|
+
def extract(xpath, singleton: true, text_content_only: true, &block)
|
519
522
|
send(
|
520
523
|
:extract_from_html, xpath,
|
521
|
-
singleton: singleton, text_content_only: text_content_only
|
524
|
+
singleton: singleton, text_content_only: text_content_only,
|
525
|
+
&block
|
522
526
|
)
|
523
527
|
end
|
524
528
|
|
@@ -542,27 +546,25 @@ be relative"
|
|
542
546
|
# parameter.
|
543
547
|
#
|
544
548
|
# @param xpath [String, #call] Used to find the value/object in @html.
|
545
|
-
# @param singleton [Boolean] singleton ? results.first (single
|
546
|
-
#
|
549
|
+
# @param singleton [Boolean] singleton ? results.first (single Object) :
|
550
|
+
# results (Enumerable).
|
547
551
|
# @param text_content_only [Boolean] text_content_only ? result.content
|
548
552
|
# (String) : result (Nokogiri Object).
|
549
|
-
# @yield
|
550
|
-
#
|
551
|
-
# value.
|
553
|
+
# @yield (Optionally) Pass a block to read/write the result value before
|
554
|
+
# it's returned.
|
552
555
|
# @yieldparam value [Object] The result value to be returned.
|
553
|
-
# @yieldparam source [Wgit::Document, Object]
|
554
|
-
# @yieldparam type [Symbol] The `source` type,
|
555
|
-
# `:object`.
|
556
|
+
# @yieldparam source [Wgit::Document, Object] This Document instance.
|
557
|
+
# @yieldparam type [Symbol] The `source` type, which is `:document`.
|
556
558
|
# @yieldreturn [Object] The return value of the block gets returned. Return
|
557
559
|
# the block's `value` param unchanged if you simply want to inspect it.
|
558
560
|
# @return [String, Object] The value found in the html or the default value
|
559
561
|
# (singleton ? nil : []).
|
560
562
|
def extract_from_html(xpath, singleton: true, text_content_only: true)
|
561
563
|
xpath = xpath.call if xpath.respond_to?(:call)
|
562
|
-
result = singleton ?
|
564
|
+
result = singleton ? at_xpath(xpath) : xpath(xpath)
|
563
565
|
|
564
|
-
if text_content_only
|
565
|
-
result = singleton ? result
|
566
|
+
if result && text_content_only
|
567
|
+
result = singleton ? result.content : result.map(&:content)
|
566
568
|
end
|
567
569
|
|
568
570
|
Wgit::Utils.sanitize(result)
|
data/lib/wgit/dsl.rb
CHANGED
data/lib/wgit/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.10.5
|
4
|
+
version: 0.10.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-07-
|
11
|
+
date: 2022-07-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|