wgit 0.10.5 → 0.10.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a359a5011cce717a84c0fa353cba658a4abcd11d2e8575da701b39eef35f641c
4
- data.tar.gz: 730b2eee3c88d9cd99c1d9754744aa4bae71bc10019113e840f2914b3a7909e5
3
+ metadata.gz: 4598dcfc047ce3915ba5a871837be5efc54201d61b4967cf53070bec2af4dd52
4
+ data.tar.gz: 604010011024af6f2d4dfcc87e6c4c1d73f8e4811938281119fccb79792818c1
5
5
  SHA512:
6
- metadata.gz: 4807d488cf03aa3dcf624249bd5169871bc17dd7e2de273cb841801147c040843912edc225a1f7346e427c022ecb2ce5360581323509221560ee99b31ea6a72b
7
- data.tar.gz: ea3f1237116d05bbb24b2e85fdba1b6447821377a07ff5c5e6afea859afa5fb36d4e2c3f708c6f5dc201e0802aa61f0eaa56e63a62882330b5225dbbaa08721d
6
+ metadata.gz: 44b098e2a97191801787386e9d2060dcdeacc625c3453976679fc276a73b2bf0614713764a55f7074073018e898f2e43dc1a7f4f803339a86158052f59dcabcb
7
+ data.tar.gz: 8645c7095bb14590cf83c21905c9f5ed524e1047254e6526b8fe46a53f3989395472300d27fb65f899951a5f4b80ee9928accd23164b10e1a834975bf045db47
data/CHANGELOG.md CHANGED
@@ -9,6 +9,15 @@
9
9
  - ...
10
10
  ---
11
11
 
12
+ ## v0.10.6
13
+ ### Added
14
+ - `Wgit::DSL` method `#crawl_url` (aliased to `#crawl`).
15
+ ### Changed/Removed
16
+ - Added a `&block` param to `Wgit::Document#extract`, which gets passed to `#extract_from_html`.
17
+ ### Fixed
18
+ - ...
19
+ ---
20
+
12
21
  ## v0.10.5
13
22
  ### Added
14
23
  - `Database#last_result` getter method to return the most recent raw mongo result.
data/lib/wgit/document.rb CHANGED
@@ -89,24 +89,18 @@ module Wgit
89
89
  #
90
90
  # @return [String] An xpath String to obtain a webpage's text elements.
91
91
  def self.text_elements_xpath
92
- xpath = ''
93
- return xpath if Wgit::Document.text_elements.empty?
94
-
95
- el_xpath = '//%s/text()'
96
- Wgit::Document.text_elements.each_with_index do |el, i|
97
- xpath += ' | ' unless i.zero?
98
- xpath += format(el_xpath, el)
92
+ Wgit::Document.text_elements.each_with_index.reduce("") do |xpath, (el, i)|
93
+ xpath += " | " unless i.zero?
94
+ xpath += format("//%s/text()", el)
99
95
  end
100
-
101
- xpath
102
96
  end
103
97
 
104
98
  # Defines a content extractor, which extracts HTML elements/content
105
99
  # into instance variables upon Document initialization. See the default
106
100
  # extractors defined in 'document_extractors.rb' as examples. Defining an
107
101
  # extractor means that every subsequently crawled/initialized document
108
- # will attempt to extract the xpath's content. Use `#xpath` for a one off
109
- # content extraction.
102
+ # will attempt to extract the xpath's content. Use `#extract` for a one off
103
+ # content extraction on any document.
110
104
  #
111
105
  # Note that defined extractors work for both Documents initialized from
112
106
  # HTML (via Wgit::Crawler methods) and from database objects.
@@ -115,9 +109,9 @@ module Wgit
115
109
  #
116
110
  # When initialising from HTML, a singleton value of true will only
117
111
  # ever return the first result found; otherwise all the results are
118
- # returned in an Array. When initialising from a database object, the value
119
- # is taken as is and singleton is only used to define the default empty
120
- # value. If a value cannot be found (in either the HTML or database
112
+ # returned in an Enumerable. When initialising from a database object, the
113
+ # value is taken as is and singleton is only used to define the default
114
+ # empty value. If a value cannot be found (in either the HTML or database
121
115
  # object), then a default will be used. The default value is:
122
116
  # `singleton ? nil : []`.
123
117
  #
@@ -134,12 +128,14 @@ module Wgit
134
128
  # @param opts [Hash] The options to define an extractor with. The
135
129
  # options are only used when initializing from HTML, not the database.
136
130
  # @option opts [Boolean] :singleton The singleton option determines
137
- # whether or not the result(s) should be in an Array. If multiple
131
+ # whether or not the result(s) should be in an Enumerable. If multiple
138
132
  # results are found and singleton is true then the first result will be
139
133
  # used. Defaults to true.
140
134
  # @option opts [Boolean] :text_content_only The text_content_only option
141
- # if true will use the text content of the Nokogiri result object,
142
- # otherwise the Nokogiri object itself is returned. Defaults to true.
135
+ # if true will use the text #content of the Nokogiri result object,
136
+ # otherwise the Nokogiri object itself is returned. The type of Nokogiri
137
+ # object returned depends on the given xpath query. See the Nokogiri
138
+ # documentation for more information. Defaults to true.
143
139
  # @yield The block is executed when a Wgit::Document is initialized,
144
140
  # regardless of the source. Use it (optionally) to process the result
145
141
  # value.
@@ -509,16 +505,24 @@ be relative"
509
505
  # parameter.
510
506
  #
511
507
  # @param xpath [String, #call] Used to find the value/object in @html.
512
- # @param singleton [Boolean] singleton ? results.first (single Nokogiri
513
- # Object) : results (Array).
508
+ # @param singleton [Boolean] singleton ? results.first (single Object) :
509
+ # results (Enumerable).
514
510
  # @param text_content_only [Boolean] text_content_only ? result.content
515
511
  # (String) : result (Nokogiri Object).
512
+ # @yield (Optionally) Pass a block to read/write the result value before
513
+ # it's returned.
514
+ # @yieldparam value [Object] The result value to be returned.
515
+ # @yieldparam source [Wgit::Document, Object] This Document instance.
516
+ # @yieldparam type [Symbol] The `source` type, which is `:document`.
517
+ # @yieldreturn [Object] The return value of the block gets returned. Return
518
+ # the block's `value` param unchanged if you simply want to inspect it.
516
519
  # @return [String, Object] The value found in the html or the default value
517
520
  # (singleton ? nil : []).
518
- def extract(xpath, singleton: true, text_content_only: true)
521
+ def extract(xpath, singleton: true, text_content_only: true, &block)
519
522
  send(
520
523
  :extract_from_html, xpath,
521
- singleton: singleton, text_content_only: text_content_only
524
+ singleton: singleton, text_content_only: text_content_only,
525
+ &block
522
526
  )
523
527
  end
524
528
 
@@ -542,27 +546,25 @@ be relative"
542
546
  # parameter.
543
547
  #
544
548
  # @param xpath [String, #call] Used to find the value/object in @html.
545
- # @param singleton [Boolean] singleton ? results.first (single Nokogiri
546
- # Object) : results (Array).
549
+ # @param singleton [Boolean] singleton ? results.first (single Object) :
550
+ # results (Enumerable).
547
551
  # @param text_content_only [Boolean] text_content_only ? result.content
548
552
  # (String) : result (Nokogiri Object).
549
- # @yield The block is executed when a Wgit::Document is initialized,
550
- # regardless of the source. Use it (optionally) to process the result
551
- # value.
553
+ # @yield (Optionally) Pass a block to read/write the result value before
554
+ # it's returned.
552
555
  # @yieldparam value [Object] The result value to be returned.
553
- # @yieldparam source [Wgit::Document, Object] The source of the `value`.
554
- # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
555
- # `:object`.
556
+ # @yieldparam source [Wgit::Document, Object] This Document instance.
557
+ # @yieldparam type [Symbol] The `source` type, which is `:document`.
556
558
  # @yieldreturn [Object] The return value of the block gets returned. Return
557
559
  # the block's `value` param unchanged if you simply want to inspect it.
558
560
  # @return [String, Object] The value found in the html or the default value
559
561
  # (singleton ? nil : []).
560
562
  def extract_from_html(xpath, singleton: true, text_content_only: true)
561
563
  xpath = xpath.call if xpath.respond_to?(:call)
562
- result = singleton ? @parser.at_xpath(xpath) : @parser.xpath(xpath)
564
+ result = singleton ? at_xpath(xpath) : xpath(xpath)
563
565
 
564
- if text_content_only
565
- result = singleton ? result&.content : result.map(&:content)
566
+ if result && text_content_only
567
+ result = singleton ? result.content : result.map(&:content)
566
568
  end
567
569
 
568
570
  Wgit::Utils.sanitize(result)
data/lib/wgit/dsl.rb CHANGED
@@ -317,6 +317,7 @@ the 'start' function".freeze
317
317
  db.clear_db
318
318
  end
319
319
 
320
+ alias crawl_url crawl
320
321
  alias crawl_r crawl_site
321
322
  alias index_r index_site
322
323
  alias start_urls start
data/lib/wgit/version.rb CHANGED
@@ -6,7 +6,7 @@
6
6
  # @author Michael Telford
7
7
  module Wgit
8
8
  # The current gem version of Wgit.
9
- VERSION = '0.10.5'
9
+ VERSION = '0.10.6'
10
10
 
11
11
  # Returns the current gem version of Wgit as a String.
12
12
  def self.version
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wgit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.5
4
+ version: 0.10.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-07-11 00:00:00.000000000 Z
11
+ date: 2022-07-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable