wgit 0.8.0 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +68 -2
- data/LICENSE.txt +1 -1
- data/README.md +114 -326
- data/bin/wgit +9 -5
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +39 -0
- data/lib/wgit/crawler.rb +206 -76
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +145 -95
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +11 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +66 -163
- data/lib/wgit/response.rb +5 -2
- data/lib/wgit/url.rb +177 -63
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- data/lib/wgit.rb +3 -1
- metadata +34 -19
    
        data/lib/wgit/document.rb
    CHANGED
    
    | @@ -6,19 +6,19 @@ require 'json' | |
| 6 6 | 
             
            require 'set'
         | 
| 7 7 |  | 
| 8 8 | 
             
            module Wgit
         | 
| 9 | 
            -
              # Class  | 
| 9 | 
            +
              # Class modeling/serialising a HTML web document, although other MIME types
         | 
| 10 10 | 
             
              # will work e.g. images etc. Also doubles as a search result when
         | 
| 11 11 | 
             
              # loading Documents from the database via `Wgit::Database#search`.
         | 
| 12 12 | 
             
              #
         | 
| 13 13 | 
             
              # The initialize method dynamically initializes instance variables from the
         | 
| 14 14 | 
             
              # Document HTML / Database object e.g. text. This bit is dynamic so that the
         | 
| 15 | 
            -
              # Document class can be easily extended allowing you to  | 
| 16 | 
            -
              # a webpage that are important to you. See `Wgit::Document. | 
| 15 | 
            +
              # Document class can be easily extended allowing you to extract the bits of
         | 
| 16 | 
            +
              # a webpage that are important to you. See `Wgit::Document.define_extractor`.
         | 
| 17 17 | 
             
              class Document
         | 
| 18 18 | 
             
                include Assertable
         | 
| 19 19 |  | 
| 20 | 
            -
                # Regex for the allowed var names when defining an  | 
| 21 | 
            -
                 | 
| 20 | 
            +
                # Regex for the allowed var names when defining an extractor.
         | 
| 21 | 
            +
                REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
         | 
| 22 22 |  | 
| 23 23 | 
             
                # Set of text elements used to build Document#text.
         | 
| 24 24 | 
             
                @text_elements = Set.new(%i[
         | 
| @@ -29,8 +29,8 @@ module Wgit | |
| 29 29 | 
             
                  summary sup td textarea th time u ul var wbr
         | 
| 30 30 | 
             
                ])
         | 
| 31 31 |  | 
| 32 | 
            -
                # Set of Symbols representing the defined Document  | 
| 33 | 
            -
                @ | 
| 32 | 
            +
                # Set of Symbols representing the defined Document extractors.
         | 
| 33 | 
            +
                @extractors = Set.new
         | 
| 34 34 |  | 
| 35 35 | 
             
                class << self
         | 
| 36 36 | 
             
                  # Set of HTML elements that make up the visible text on a page. These
         | 
| @@ -38,9 +38,9 @@ module Wgit | |
| 38 38 | 
             
                  # README.md for how to add to this Set dynamically.
         | 
| 39 39 | 
             
                  attr_reader :text_elements
         | 
| 40 40 |  | 
| 41 | 
            -
                  # Set of Symbols representing the defined Document  | 
| 42 | 
            -
                  # read-only. Use Wgit::Document. | 
| 43 | 
            -
                  attr_reader : | 
| 41 | 
            +
                  # Set of Symbols representing the defined Document extractors. Is
         | 
| 42 | 
            +
                  # read-only. Use Wgit::Document.define_extractor for a new extractor.
         | 
| 43 | 
            +
                  attr_reader :extractors
         | 
| 44 44 | 
             
                end
         | 
| 45 45 |  | 
| 46 46 | 
             
                # The URL of the webpage, an instance of Wgit::Url.
         | 
| @@ -50,7 +50,7 @@ module Wgit | |
| 50 50 | 
             
                attr_reader :html
         | 
| 51 51 |  | 
| 52 52 | 
             
                # The Nokogiri::HTML document object initialized from @html.
         | 
| 53 | 
            -
                attr_reader : | 
| 53 | 
            +
                attr_reader :parser
         | 
| 54 54 |  | 
| 55 55 | 
             
                # The score is only used following a `Database#search` and records matches.
         | 
| 56 56 | 
             
                attr_reader :score
         | 
| @@ -62,7 +62,7 @@ module Wgit | |
| 62 62 | 
             
                #
         | 
| 63 63 | 
             
                # During initialisation, the Document will call any private
         | 
| 64 64 | 
             
                # `init_*_from_html` and `init_*_from_object` methods it can find. See the
         | 
| 65 | 
            -
                #  | 
| 65 | 
            +
                # Wgit::Document.define_extractor method for more details.
         | 
| 66 66 | 
             
                #
         | 
| 67 67 | 
             
                # @param url_or_obj [String, Wgit::Url, #fetch] Either a String
         | 
| 68 68 | 
             
                #   representing a URL or a Hash-like object responding to :fetch. e.g. a
         | 
| @@ -101,13 +101,16 @@ module Wgit | |
| 101 101 | 
             
                  xpath
         | 
| 102 102 | 
             
                end
         | 
| 103 103 |  | 
| 104 | 
            -
                # Defines  | 
| 105 | 
            -
                # instance variables upon Document initialization. See the default
         | 
| 106 | 
            -
                #  | 
| 104 | 
            +
                # Defines a content extractor, which extracts HTML elements/content
         | 
| 105 | 
            +
                # into instance variables upon Document initialization. See the default
         | 
| 106 | 
            +
                # extractors defined in 'document_extractors.rb' as examples. Defining an
         | 
| 107 | 
            +
                # extractor means that every subsequently crawled/initialized document
         | 
| 108 | 
            +
                # will attempt to extract the xpath's content. Use `#xpath` for a one off
         | 
| 109 | 
            +
                # content extraction.
         | 
| 107 110 | 
             
                #
         | 
| 108 | 
            -
                # Note that defined  | 
| 111 | 
            +
                # Note that defined extractors work for both Documents initialized from
         | 
| 109 112 | 
             
                # HTML (via Wgit::Crawler methods) and from database objects.
         | 
| 110 | 
            -
                # An  | 
| 113 | 
            +
                # An extractor, once defined, initializes a private instance variable with
         | 
| 111 114 | 
             
                # the xpath or database object result(s).
         | 
| 112 115 | 
             
                #
         | 
| 113 116 | 
             
                # When initialising from HTML, a singleton value of true will only
         | 
| @@ -118,15 +121,17 @@ module Wgit | |
| 118 121 | 
             
                # object), then a default will be used. The default value is:
         | 
| 119 122 | 
             
                # `singleton ? nil : []`.
         | 
| 120 123 | 
             
                #
         | 
| 121 | 
            -
                # @param var [Symbol] The name of the variable to be initialised | 
| 124 | 
            +
                # @param var [Symbol] The name of the variable to be initialised, that will
         | 
| 125 | 
            +
                #   contain the extracted content. A getter and setter method is defined
         | 
| 126 | 
            +
                #   for the initialised variable.
         | 
| 122 127 | 
             
                # @param xpath [String, #call] The xpath used to find the element(s)
         | 
| 123 128 | 
             
                #   of the webpage. Only used when initializing from HTML.
         | 
| 124 129 | 
             
                #
         | 
| 125 130 | 
             
                #   Pass a callable object (proc etc.) if you want the
         | 
| 126 131 | 
             
                #   xpath value to be derived on Document initialisation (instead of when
         | 
| 127 | 
            -
                #   the  | 
| 132 | 
            +
                #   the extractor is defined). The call method must return a valid xpath
         | 
| 128 133 | 
             
                #   String.
         | 
| 129 | 
            -
                # @param opts [Hash] The options to define an  | 
| 134 | 
            +
                # @param opts [Hash] The options to define an extractor with. The
         | 
| 130 135 | 
             
                #   options are only used when initializing from HTML, not the database.
         | 
| 131 136 | 
             
                # @option opts [Boolean] :singleton The singleton option determines
         | 
| 132 137 | 
             
                #   whether or not the result(s) should be in an Array. If multiple
         | 
| @@ -147,46 +152,50 @@ module Wgit | |
| 147 152 | 
             
                #   value. Return the block's value param unchanged if you want to inspect.
         | 
| 148 153 | 
             
                # @raise [StandardError] If the var param isn't valid.
         | 
| 149 154 | 
             
                # @return [Symbol] The given var Symbol if successful.
         | 
| 150 | 
            -
                def self. | 
| 155 | 
            +
                def self.define_extractor(var, xpath, opts = {}, &block)
         | 
| 151 156 | 
             
                  var = var.to_sym
         | 
| 152 157 | 
             
                  defaults = { singleton: true, text_content_only: true }
         | 
| 153 158 | 
             
                  opts = defaults.merge(opts)
         | 
| 154 159 |  | 
| 155 | 
            -
                  raise "var must match #{ | 
| 156 | 
            -
                  var =~  | 
| 160 | 
            +
                  raise "var must match #{REGEX_EXTRACTOR_NAME}" unless \
         | 
| 161 | 
            +
                  var =~ REGEX_EXTRACTOR_NAME
         | 
| 157 162 |  | 
| 158 163 | 
             
                  # Define the private init_*_from_html method for HTML.
         | 
| 159 164 | 
             
                  # Gets the HTML's xpath value and creates a var for it.
         | 
| 160 165 | 
             
                  func_name = Document.send(:define_method, "init_#{var}_from_html") do
         | 
| 161 | 
            -
                    result =  | 
| 166 | 
            +
                    result = extract_from_html(xpath, **opts, &block)
         | 
| 162 167 | 
             
                    init_var(var, result)
         | 
| 163 168 | 
             
                  end
         | 
| 164 169 | 
             
                  Document.send(:private, func_name)
         | 
| 165 170 |  | 
| 166 171 | 
             
                  # Define the private init_*_from_object method for a Database object.
         | 
| 167 172 | 
             
                  # Gets the Object's 'key' value and creates a var for it.
         | 
| 168 | 
            -
                  func_name = Document.send( | 
| 169 | 
            -
                     | 
| 173 | 
            +
                  func_name = Document.send(
         | 
| 174 | 
            +
                    :define_method, "init_#{var}_from_object"
         | 
| 175 | 
            +
                  ) do |obj|
         | 
| 176 | 
            +
                    result = extract_from_object(
         | 
| 177 | 
            +
                      obj, var.to_s, singleton: opts[:singleton], &block
         | 
| 178 | 
            +
                    )
         | 
| 170 179 | 
             
                    init_var(var, result)
         | 
| 171 180 | 
             
                  end
         | 
| 172 181 | 
             
                  Document.send(:private, func_name)
         | 
| 173 182 |  | 
| 174 | 
            -
                  @ | 
| 183 | 
            +
                  @extractors << var
         | 
| 175 184 | 
             
                  var
         | 
| 176 185 | 
             
                end
         | 
| 177 186 |  | 
| 178 | 
            -
                # Removes the `init_*` methods created when an  | 
| 179 | 
            -
                # Therefore, this is the opposing method to `Document. | 
| 187 | 
            +
                # Removes the `init_*` methods created when an extractor is defined.
         | 
| 188 | 
            +
                # Therefore, this is the opposing method to `Document.define_extractor`.
         | 
| 180 189 | 
             
                # Returns true if successful or false if the method(s) cannot be found.
         | 
| 181 190 | 
             
                #
         | 
| 182 | 
            -
                # @param var [Symbol] The  | 
| 183 | 
            -
                # @return [Boolean] True if the  | 
| 191 | 
            +
                # @param var [Symbol] The extractor variable to remove.
         | 
| 192 | 
            +
                # @return [Boolean] True if the extractor `var` was found and removed;
         | 
| 184 193 | 
             
                #   otherwise false.
         | 
| 185 | 
            -
                def self. | 
| 194 | 
            +
                def self.remove_extractor(var)
         | 
| 186 195 | 
             
                  Document.send(:remove_method, "init_#{var}_from_html")
         | 
| 187 196 | 
             
                  Document.send(:remove_method, "init_#{var}_from_object")
         | 
| 188 197 |  | 
| 189 | 
            -
                  @ | 
| 198 | 
            +
                  @extractors.delete(var.to_sym)
         | 
| 190 199 | 
             
                  true
         | 
| 191 200 | 
             
                rescue NameError
         | 
| 192 201 | 
             
                  false
         | 
| @@ -215,9 +224,9 @@ module Wgit | |
| 215 224 |  | 
| 216 225 | 
             
                # Returns the base URL of this Wgit::Document. The base URL is either the
         | 
| 217 226 | 
             
                # <base> element's href value or @url (if @base is nil). If @base is
         | 
| 218 | 
            -
                # present and relative, then @url. | 
| 219 | 
            -
                # should be used instead of `doc.url. | 
| 220 | 
            -
                # absolute links from relative links; or use `link. | 
| 227 | 
            +
                # present and relative, then @url.to_origin + @base is returned. This method
         | 
| 228 | 
            +
                # should be used instead of `doc.url.to_origin` etc. when manually building
         | 
| 229 | 
            +
                # absolute links from relative links; or use `link.make_absolute(doc)`.
         | 
| 221 230 | 
             
                #
         | 
| 222 231 | 
             
                # Provide the `link:` parameter to get the correct base URL for that type
         | 
| 223 232 | 
             
                # of link. For example, a link of `#top` would always return @url because
         | 
| @@ -236,12 +245,16 @@ module Wgit | |
| 236 245 | 
             
                # @return [Wgit::Url] The base URL of this Document e.g.
         | 
| 237 246 | 
             
                #   'http://example.com/public'.
         | 
| 238 247 | 
             
                def base_url(link: nil)
         | 
| 239 | 
            -
                  raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
         | 
| 240 248 | 
             
                  if @url.relative? && @base.nil?
         | 
| 241 | 
            -
             | 
| 249 | 
            +
                    raise "Document @url ('#{@url}') cannot be relative if <base> is nil"
         | 
| 250 | 
            +
                  end
         | 
| 251 | 
            +
             | 
| 242 252 | 
             
                  if @url.relative? && @base&.relative?
         | 
| 253 | 
            +
                    raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't \
         | 
| 254 | 
            +
            be relative"
         | 
| 255 | 
            +
                  end
         | 
| 243 256 |  | 
| 244 | 
            -
                  get_base = -> { @base.relative? ? @url. | 
| 257 | 
            +
                  get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
         | 
| 245 258 |  | 
| 246 259 | 
             
                  if link
         | 
| 247 260 | 
             
                    link = Wgit::Url.new(link)
         | 
| @@ -253,7 +266,7 @@ module Wgit | |
| 253 266 | 
             
                    end
         | 
| 254 267 | 
             
                  end
         | 
| 255 268 |  | 
| 256 | 
            -
                  base_url = @base ? get_base.call : @url. | 
| 269 | 
            +
                  base_url = @base ? get_base.call : @url.to_origin
         | 
| 257 270 | 
             
                  base_url.omit_fragment.omit_query
         | 
| 258 271 | 
             
                end
         | 
| 259 272 |  | 
| @@ -267,7 +280,7 @@ module Wgit | |
| 267 280 | 
             
                def to_h(include_html: false, include_score: true)
         | 
| 268 281 | 
             
                  ignore = include_html ? [] : ['@html']
         | 
| 269 282 | 
             
                  ignore << '@score' unless include_score
         | 
| 270 | 
            -
                  ignore << '@ | 
| 283 | 
            +
                  ignore << '@parser' # Always ignore the Nokogiri object.
         | 
| 271 284 |  | 
| 272 285 | 
             
                  Wgit::Utils.to_h(self, ignore: ignore)
         | 
| 273 286 | 
             
                end
         | 
| @@ -284,7 +297,7 @@ module Wgit | |
| 284 297 |  | 
| 285 298 | 
             
                # Returns a Hash containing this Document's instance variables and
         | 
| 286 299 | 
             
                # their #length (if they respond to it). Works dynamically so that any
         | 
| 287 | 
            -
                # user defined  | 
| 300 | 
            +
                # user defined extractors (and their created instance vars) will appear in
         | 
| 288 301 | 
             
                # the returned Hash as well. The number of text snippets as well as total
         | 
| 289 302 | 
             
                # number of textual bytes are always included in the returned Hash.
         | 
| 290 303 | 
             
                #
         | 
| @@ -324,21 +337,39 @@ module Wgit | |
| 324 337 | 
             
                end
         | 
| 325 338 |  | 
| 326 339 | 
             
                # Uses Nokogiri's xpath method to search the doc's html and return the
         | 
| 327 | 
            -
                # results.
         | 
| 340 | 
            +
                # results. Use `#at_xpath` for returning the first result only.
         | 
| 328 341 | 
             
                #
         | 
| 329 342 | 
             
                # @param xpath [String] The xpath to search the @html with.
         | 
| 330 343 | 
             
                # @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
         | 
| 331 344 | 
             
                def xpath(xpath)
         | 
| 332 | 
            -
                  @ | 
| 345 | 
            +
                  @parser.xpath(xpath)
         | 
| 333 346 | 
             
                end
         | 
| 334 347 |  | 
| 335 | 
            -
                # Uses Nokogiri's  | 
| 336 | 
            -
                # results.
         | 
| 348 | 
            +
                # Uses Nokogiri's `at_xpath` method to search the doc's html and return the
         | 
| 349 | 
            +
                # result. Use `#xpath` for returning several results.
         | 
| 350 | 
            +
                #
         | 
| 351 | 
            +
                # @param xpath [String] The xpath to search the @html with.
         | 
| 352 | 
            +
                # @return [Nokogiri::XML::Element] The result of the xpath search.
         | 
| 353 | 
            +
                def at_xpath(xpath)
         | 
| 354 | 
            +
                  @parser.at_xpath(xpath)
         | 
| 355 | 
            +
                end
         | 
| 356 | 
            +
             | 
| 357 | 
            +
                # Uses Nokogiri's `css` method to search the doc's html and return the
         | 
| 358 | 
            +
                # results. Use `#at_css` for returning the first result only.
         | 
| 337 359 | 
             
                #
         | 
| 338 360 | 
             
                # @param selector [String] The CSS selector to search the @html with.
         | 
| 339 361 | 
             
                # @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
         | 
| 340 362 | 
             
                def css(selector)
         | 
| 341 | 
            -
                  @ | 
| 363 | 
            +
                  @parser.css(selector)
         | 
| 364 | 
            +
                end
         | 
| 365 | 
            +
             | 
| 366 | 
            +
                # Uses Nokogiri's `at_css` method to search the doc's html and return the
         | 
| 367 | 
            +
                # result. Use `#css` for returning several results.
         | 
| 368 | 
            +
                #
         | 
| 369 | 
            +
                # @param selector [String] The CSS selector to search the @html with.
         | 
| 370 | 
            +
                # @return [Nokogiri::XML::Element] The result of the CSS search.
         | 
| 371 | 
            +
                def at_css(selector)
         | 
| 372 | 
            +
                  @parser.at_css(selector)
         | 
| 342 373 | 
             
                end
         | 
| 343 374 |  | 
| 344 375 | 
             
                # Returns all unique internal links from this Document in relative form.
         | 
| @@ -356,13 +387,13 @@ module Wgit | |
| 356 387 | 
             
                  return [] if @links.empty?
         | 
| 357 388 |  | 
| 358 389 | 
             
                  links = @links
         | 
| 359 | 
            -
                          .select { |link| link.relative?(host: @url. | 
| 390 | 
            +
                          .select { |link| link.relative?(host: @url.to_origin) }
         | 
| 360 391 | 
             
                          .map(&:omit_base)
         | 
| 361 392 | 
             
                          .map do |link| # Map @url.to_host into / as it's a duplicate.
         | 
| 362 393 | 
             
                    link.to_host == @url.to_host ? Wgit::Url.new('/') : link
         | 
| 363 394 | 
             
                  end
         | 
| 364 395 |  | 
| 365 | 
            -
                  Wgit::Utils. | 
| 396 | 
            +
                  Wgit::Utils.sanitize(links)
         | 
| 366 397 | 
             
                end
         | 
| 367 398 |  | 
| 368 399 | 
             
                # Returns all unique internal links from this Document in absolute form by
         | 
| @@ -371,7 +402,7 @@ module Wgit | |
| 371 402 | 
             
                #
         | 
| 372 403 | 
             
                # @return [Array<Wgit::Url>] Self's unique internal Url's in absolute form.
         | 
| 373 404 | 
             
                def internal_absolute_links
         | 
| 374 | 
            -
                  internal_links.map { |link| link. | 
| 405 | 
            +
                  internal_links.map { |link| link.make_absolute(self) }
         | 
| 375 406 | 
             
                end
         | 
| 376 407 |  | 
| 377 408 | 
             
                # Returns all unique external links from this Document in absolute form.
         | 
| @@ -382,10 +413,17 @@ module Wgit | |
| 382 413 | 
             
                  return [] if @links.empty?
         | 
| 383 414 |  | 
| 384 415 | 
             
                  links = @links
         | 
| 385 | 
            -
                          . | 
| 416 | 
            +
                          .map do |link|
         | 
| 417 | 
            +
                            if link.scheme_relative?
         | 
| 418 | 
            +
                              link.prefix_scheme(@url.to_scheme.to_sym)
         | 
| 419 | 
            +
                            else
         | 
| 420 | 
            +
                              link
         | 
| 421 | 
            +
                            end
         | 
| 422 | 
            +
                          end
         | 
| 423 | 
            +
                          .reject { |link| link.relative?(host: @url.to_origin) }
         | 
| 386 424 | 
             
                          .map(&:omit_trailing_slash)
         | 
| 387 425 |  | 
| 388 | 
            -
                  Wgit::Utils. | 
| 426 | 
            +
                  Wgit::Utils.sanitize(links)
         | 
| 389 427 | 
             
                end
         | 
| 390 428 |  | 
| 391 429 | 
             
                # Searches the @text for the given query and returns the results.
         | 
| @@ -400,8 +438,8 @@ module Wgit | |
| 400 438 | 
             
                # original sentence, whichever is less. The algorithm obviously ensures
         | 
| 401 439 | 
             
                # that the search query is visible somewhere in the sentence.
         | 
| 402 440 | 
             
                #
         | 
| 403 | 
            -
                # @param query [ | 
| 404 | 
            -
                #   @text for.
         | 
| 441 | 
            +
                # @param query [Regexp, #to_s] The regex or text value to search the
         | 
| 442 | 
            +
                #   document's @text for.
         | 
| 405 443 | 
             
                # @param case_sensitive [Boolean] Whether character case must match.
         | 
| 406 444 | 
             
                # @param whole_sentence [Boolean] Whether multiple words should be searched
         | 
| 407 445 | 
             
                #   for separately.
         | 
| @@ -411,12 +449,16 @@ module Wgit | |
| 411 449 | 
             
                def search(
         | 
| 412 450 | 
             
                  query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
         | 
| 413 451 | 
             
                )
         | 
| 414 | 
            -
                  query = query.to_s
         | 
| 415 | 
            -
                  raise 'A search query must be provided' if query.empty?
         | 
| 416 452 | 
             
                  raise 'The sentence_limit value must be even' if sentence_limit.odd?
         | 
| 417 453 |  | 
| 418 | 
            -
                   | 
| 419 | 
            -
             | 
| 454 | 
            +
                  if query.is_a?(Regexp)
         | 
| 455 | 
            +
                    regex = query
         | 
| 456 | 
            +
                  else # respond_to? #to_s == true
         | 
| 457 | 
            +
                    query = query.to_s
         | 
| 458 | 
            +
                    query = query.gsub(' ', '|') unless whole_sentence
         | 
| 459 | 
            +
                    regex = Regexp.new(query, !case_sensitive)
         | 
| 460 | 
            +
                  end
         | 
| 461 | 
            +
             | 
| 420 462 | 
             
                  results = {}
         | 
| 421 463 |  | 
| 422 464 | 
             
                  @text.each do |sentence|
         | 
| @@ -443,8 +485,8 @@ module Wgit | |
| 443 485 | 
             
                # functionality. The original text is returned; no other reference to it
         | 
| 444 486 | 
             
                # is kept thereafter.
         | 
| 445 487 | 
             
                #
         | 
| 446 | 
            -
                # @param query [ | 
| 447 | 
            -
                #   @text for.
         | 
| 488 | 
            +
                # @param query [Regexp, #to_s] The regex or text value to search the
         | 
| 489 | 
            +
                #   document's @text for.
         | 
| 448 490 | 
             
                # @param case_sensitive [Boolean] Whether character case must match.
         | 
| 449 491 | 
             
                # @param whole_sentence [Boolean] Whether multiple words should be searched
         | 
| 450 492 | 
             
                #   for separately.
         | 
| @@ -463,13 +505,31 @@ module Wgit | |
| 463 505 | 
             
                  orig_text
         | 
| 464 506 | 
             
                end
         | 
| 465 507 |  | 
| 508 | 
            +
                # Extracts a value/object from this Document's @html using the given xpath
         | 
| 509 | 
            +
                # parameter.
         | 
| 510 | 
            +
                #
         | 
| 511 | 
            +
                # @param xpath [String, #call] Used to find the value/object in @html.
         | 
| 512 | 
            +
                # @param singleton [Boolean] singleton ? results.first (single Nokogiri
         | 
| 513 | 
            +
                #   Object) : results (Array).
         | 
| 514 | 
            +
                # @param text_content_only [Boolean] text_content_only ? result.content
         | 
| 515 | 
            +
                #   (String) : result (Nokogiri Object).
         | 
| 516 | 
            +
                # @return [String, Object] The value found in the html or the default value
         | 
| 517 | 
            +
                #   (singleton ? nil : []).
         | 
| 518 | 
            +
                def extract(xpath, singleton: true, text_content_only: true)
         | 
| 519 | 
            +
                  send(
         | 
| 520 | 
            +
                    :extract_from_html, xpath,
         | 
| 521 | 
            +
                    singleton: singleton, text_content_only: text_content_only
         | 
| 522 | 
            +
                  )
         | 
| 523 | 
            +
                end
         | 
| 524 | 
            +
             | 
| 466 525 | 
             
                protected
         | 
| 467 526 |  | 
| 468 527 | 
             
                # Initializes the nokogiri object using @html, which cannot be nil.
         | 
| 469 528 | 
             
                # Override this method to custom configure the Nokogiri object returned.
         | 
| 470 529 | 
             
                # Gets called from Wgit::Document.new upon initialization.
         | 
| 471 530 | 
             
                #
         | 
| 472 | 
            -
                # @yield [config] The given block is passed to Nokogiri::HTML for | 
| 531 | 
            +
                # @yield [config] The given block is passed to Nokogiri::HTML for
         | 
| 532 | 
            +
                #   initialisation.
         | 
| 473 533 | 
             
                # @raise [StandardError] If @html isn't set.
         | 
| 474 534 | 
             
                # @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
         | 
| 475 535 | 
             
                def init_nokogiri(&block)
         | 
| @@ -481,7 +541,7 @@ module Wgit | |
| 481 541 | 
             
                # Extracts a value/object from this Document's @html using the given xpath
         | 
| 482 542 | 
             
                # parameter.
         | 
| 483 543 | 
             
                #
         | 
| 484 | 
            -
                # @param xpath [String] Used to find the value/object in @html.
         | 
| 544 | 
            +
                # @param xpath [String, #call] Used to find the value/object in @html.
         | 
| 485 545 | 
             
                # @param singleton [Boolean] singleton ? results.first (single Nokogiri
         | 
| 486 546 | 
             
                #   Object) : results (Array).
         | 
| 487 547 | 
             
                # @param text_content_only [Boolean] text_content_only ? result.content
         | 
| @@ -497,23 +557,16 @@ module Wgit | |
| 497 557 | 
             
                #   the block's `value` param unchanged if you simply want to inspect it.
         | 
| 498 558 | 
             
                # @return [String, Object] The value found in the html or the default value
         | 
| 499 559 | 
             
                #   (singleton ? nil : []).
         | 
| 500 | 
            -
                def  | 
| 501 | 
            -
                   | 
| 502 | 
            -
                   | 
| 503 | 
            -
                  results = @doc.xpath(xpath)
         | 
| 504 | 
            -
             | 
| 505 | 
            -
                  return default if results.nil? || results.empty?
         | 
| 560 | 
            +
                def extract_from_html(xpath, singleton: true, text_content_only: true)
         | 
| 561 | 
            +
                  xpath  = xpath.call if xpath.respond_to?(:call)
         | 
| 562 | 
            +
                  result = singleton ? @parser.at_xpath(xpath) : @parser.xpath(xpath)
         | 
| 506 563 |  | 
| 507 | 
            -
                   | 
| 508 | 
            -
             | 
| 509 | 
            -
             | 
| 510 | 
            -
                             text_content_only ? results.map(&:content) : results
         | 
| 511 | 
            -
                           end
         | 
| 512 | 
            -
             | 
| 513 | 
            -
                  singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
         | 
| 564 | 
            +
                  if text_content_only
         | 
| 565 | 
            +
                    result = singleton ? result&.content : result.map(&:content)
         | 
| 566 | 
            +
                  end
         | 
| 514 567 |  | 
| 568 | 
            +
                  Wgit::Utils.sanitize(result)
         | 
| 515 569 | 
             
                  result = yield(result, self, :document) if block_given?
         | 
| 516 | 
            -
             | 
| 517 570 | 
             
                  result
         | 
| 518 571 | 
             
                end
         | 
| 519 572 |  | 
| @@ -533,16 +586,14 @@ module Wgit | |
| 533 586 | 
             
                #   the block's `value` param unchanged if you simply want to inspect it.
         | 
| 534 587 | 
             
                # @return [String, Object] The value found in the obj or the default value
         | 
| 535 588 | 
             
                #   (singleton ? nil : []).
         | 
| 536 | 
            -
                def  | 
| 589 | 
            +
                def extract_from_object(obj, key, singleton: true)
         | 
| 537 590 | 
             
                  assert_respond_to(obj, :fetch)
         | 
| 538 591 |  | 
| 539 592 | 
             
                  default = singleton ? nil : []
         | 
| 540 593 | 
             
                  result  = obj.fetch(key.to_s, default)
         | 
| 541 594 |  | 
| 542 | 
            -
                   | 
| 543 | 
            -
             | 
| 595 | 
            +
                  Wgit::Utils.sanitize(result)
         | 
| 544 596 | 
             
                  result = yield(result, obj, :object) if block_given?
         | 
| 545 | 
            -
             | 
| 546 597 | 
             
                  result
         | 
| 547 598 | 
             
                end
         | 
| 548 599 |  | 
| @@ -556,12 +607,12 @@ module Wgit | |
| 556 607 | 
             
                  url = Wgit::Url.parse(url)
         | 
| 557 608 | 
             
                  url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
         | 
| 558 609 |  | 
| 559 | 
            -
                  @url | 
| 560 | 
            -
                  @html | 
| 561 | 
            -
                  @ | 
| 562 | 
            -
                  @score | 
| 610 | 
            +
                  @url    = url
         | 
| 611 | 
            +
                  @html   = html || ''
         | 
| 612 | 
            +
                  @parser = init_nokogiri
         | 
| 613 | 
            +
                  @score  = 0.0
         | 
| 563 614 |  | 
| 564 | 
            -
                  Wgit::Utils. | 
| 615 | 
            +
                  Wgit::Utils.sanitize(@html, encode: encode)
         | 
| 565 616 |  | 
| 566 617 | 
             
                  # Dynamically run the init_*_from_html methods.
         | 
| 567 618 | 
             
                  Document.private_instance_methods(false).each do |method|
         | 
| @@ -577,12 +628,12 @@ module Wgit | |
| 577 628 | 
             
                def init_from_object(obj, encode: true)
         | 
| 578 629 | 
             
                  assert_respond_to(obj, :fetch)
         | 
| 579 630 |  | 
| 580 | 
            -
                  @url | 
| 581 | 
            -
                  @html | 
| 582 | 
            -
                  @ | 
| 583 | 
            -
                  @score | 
| 631 | 
            +
                  @url    = Wgit::Url.new(obj.fetch('url')) # Should always be present.
         | 
| 632 | 
            +
                  @html   = obj.fetch('html', '')
         | 
| 633 | 
            +
                  @parser = init_nokogiri
         | 
| 634 | 
            +
                  @score  = obj.fetch('score', 0.0)
         | 
| 584 635 |  | 
| 585 | 
            -
                  Wgit::Utils. | 
| 636 | 
            +
                  Wgit::Utils.sanitize(@html, encode: encode)
         | 
| 586 637 |  | 
| 587 638 | 
             
                  # Dynamically run the init_*_from_object methods.
         | 
| 588 639 | 
             
                  Document.private_instance_methods(false).each do |method|
         | 
| @@ -593,11 +644,11 @@ module Wgit | |
| 593 644 | 
             
                  end
         | 
| 594 645 | 
             
                end
         | 
| 595 646 |  | 
| 596 | 
            -
                # Initialises an instance variable and defines  | 
| 647 | 
            +
                # Initialises an instance variable and defines an accessor method for it.
         | 
| 597 648 | 
             
                #
         | 
| 598 649 | 
             
                # @param var [Symbol] The name of the variable to be initialized.
         | 
| 599 650 | 
             
                # @param value [Object] The newly initialized variable's value.
         | 
| 600 | 
            -
                # @return [Symbol] The name of the  | 
| 651 | 
            +
                # @return [Symbol] The name of the defined getter method.
         | 
| 601 652 | 
             
                def init_var(var, value)
         | 
| 602 653 | 
             
                  # instance_var_name starts with @, var_name doesn't.
         | 
| 603 654 | 
             
                  var = var.to_s
         | 
| @@ -605,10 +656,9 @@ module Wgit | |
| 605 656 | 
             
                  instance_var_name = "@#{var_name}".to_sym
         | 
| 606 657 |  | 
| 607 658 | 
             
                  instance_variable_set(instance_var_name, value)
         | 
| 659 | 
            +
                  Wgit::Document.attr_accessor(var_name)
         | 
| 608 660 |  | 
| 609 | 
            -
                   | 
| 610 | 
            -
                    instance_variable_get(instance_var_name)
         | 
| 611 | 
            -
                  end
         | 
| 661 | 
            +
                  var_name
         | 
| 612 662 | 
             
                end
         | 
| 613 663 |  | 
| 614 664 | 
             
                alias content                html
         | 
| @@ -1,19 +1,19 @@ | |
| 1 1 | 
             
            # frozen_string_literal: true
         | 
| 2 2 |  | 
| 3 | 
            -
            ### Default Document  | 
| 3 | 
            +
            ### Default Document Extractors ###
         | 
| 4 4 |  | 
| 5 5 | 
             
            # Base.
         | 
| 6 | 
            -
            Wgit::Document. | 
| 6 | 
            +
            Wgit::Document.define_extractor(
         | 
| 7 7 | 
             
              :base,
         | 
| 8 8 | 
             
              '//base/@href',
         | 
| 9 9 | 
             
              singleton: true,
         | 
| 10 10 | 
             
              text_content_only: true
         | 
| 11 11 | 
             
            ) do |base|
         | 
| 12 | 
            -
              Wgit::Url. | 
| 12 | 
            +
              Wgit::Url.parse?(base) if base
         | 
| 13 13 | 
             
            end
         | 
| 14 14 |  | 
| 15 15 | 
             
            # Title.
         | 
| 16 | 
            -
            Wgit::Document. | 
| 16 | 
            +
            Wgit::Document.define_extractor(
         | 
| 17 17 | 
             
              :title,
         | 
| 18 18 | 
             
              '//title',
         | 
| 19 19 | 
             
              singleton: true,
         | 
| @@ -21,7 +21,7 @@ Wgit::Document.define_extension( | |
| 21 21 | 
             
            )
         | 
| 22 22 |  | 
| 23 23 | 
             
            # Description.
         | 
| 24 | 
            -
            Wgit::Document. | 
| 24 | 
            +
            Wgit::Document.define_extractor(
         | 
| 25 25 | 
             
              :description,
         | 
| 26 26 | 
             
              '//meta[@name="description"]/@content',
         | 
| 27 27 | 
             
              singleton: true,
         | 
| @@ -29,7 +29,7 @@ Wgit::Document.define_extension( | |
| 29 29 | 
             
            )
         | 
| 30 30 |  | 
| 31 31 | 
             
            # Author.
         | 
| 32 | 
            -
            Wgit::Document. | 
| 32 | 
            +
            Wgit::Document.define_extractor(
         | 
| 33 33 | 
             
              :author,
         | 
| 34 34 | 
             
              '//meta[@name="author"]/@content',
         | 
| 35 35 | 
             
              singleton: true,
         | 
| @@ -37,7 +37,7 @@ Wgit::Document.define_extension( | |
| 37 37 | 
             
            )
         | 
| 38 38 |  | 
| 39 39 | 
             
            # Keywords.
         | 
| 40 | 
            -
            Wgit::Document. | 
| 40 | 
            +
            Wgit::Document.define_extractor(
         | 
| 41 41 | 
             
              :keywords,
         | 
| 42 42 | 
             
              '//meta[@name="keywords"]/@content',
         | 
| 43 43 | 
             
              singleton: true,
         | 
| @@ -45,25 +45,25 @@ Wgit::Document.define_extension( | |
| 45 45 | 
             
            ) do |keywords, _source, type|
         | 
| 46 46 | 
             
              if keywords && (type == :document)
         | 
| 47 47 | 
             
                keywords = keywords.split(',')
         | 
| 48 | 
            -
                Wgit::Utils. | 
| 48 | 
            +
                Wgit::Utils.sanitize(keywords)
         | 
| 49 49 | 
             
              end
         | 
| 50 50 | 
             
              keywords
         | 
| 51 51 | 
             
            end
         | 
| 52 52 |  | 
| 53 53 | 
             
            # Links.
         | 
| 54 | 
            -
            Wgit::Document. | 
| 54 | 
            +
            Wgit::Document.define_extractor(
         | 
| 55 55 | 
             
              :links,
         | 
| 56 56 | 
             
              '//a/@href',
         | 
| 57 57 | 
             
              singleton: false,
         | 
| 58 58 | 
             
              text_content_only: true
         | 
| 59 59 | 
             
            ) do |links|
         | 
| 60 60 | 
             
              links
         | 
| 61 | 
            -
                .map { |link| Wgit::Url. | 
| 61 | 
            +
                .map { |link| Wgit::Url.parse?(link) }
         | 
| 62 62 | 
             
                .compact # Remove unparsable links.
         | 
| 63 63 | 
             
            end
         | 
| 64 64 |  | 
| 65 65 | 
             
            # Text.
         | 
| 66 | 
            -
            Wgit::Document. | 
| 66 | 
            +
            Wgit::Document.define_extractor(
         | 
| 67 67 | 
             
              :text,
         | 
| 68 68 | 
             
              proc { Wgit::Document.text_elements_xpath },
         | 
| 69 69 | 
             
              singleton: false,
         |