wgit 0.10.3 → 0.10.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/lib/wgit/database/database.rb +76 -10
- data/lib/wgit/document.rb +35 -33
- data/lib/wgit/dsl.rb +1 -0
- data/lib/wgit/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4598dcfc047ce3915ba5a871837be5efc54201d61b4967cf53070bec2af4dd52
|
4
|
+
data.tar.gz: 604010011024af6f2d4dfcc87e6c4c1d73f8e4811938281119fccb79792818c1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 44b098e2a97191801787386e9d2060dcdeacc625c3453976679fc276a73b2bf0614713764a55f7074073018e898f2e43dc1a7f4f803339a86158052f59dcabcb
|
7
|
+
data.tar.gz: 8645c7095bb14590cf83c21905c9f5ed524e1047254e6526b8fe46a53f3989395472300d27fb65f899951a5f4b80ee9928accd23164b10e1a834975bf045db47
|
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,33 @@
|
|
9
9
|
- ...
|
10
10
|
---
|
11
11
|
|
12
|
+
## v0.10.6
|
13
|
+
### Added
|
14
|
+
- `Wgit::DSL` method `#crawl_url` (aliased to `#crawl`).
|
15
|
+
### Changed/Removed
|
16
|
+
- Added a `&block` param to `Wgit::Document#extract`, which gets passed to `#extract_from_html`.
|
17
|
+
### Fixed
|
18
|
+
- ...
|
19
|
+
---
|
20
|
+
|
21
|
+
## v0.10.5
|
22
|
+
### Added
|
23
|
+
- `Database#last_result` getter method to return the most recent raw mongo result.
|
24
|
+
### Changed/Removed
|
25
|
+
- ...
|
26
|
+
### Fixed
|
27
|
+
- ...
|
28
|
+
---
|
29
|
+
|
30
|
+
## v0.10.4
|
31
|
+
### Added
|
32
|
+
- `Database#search_text` method which returns a Hash of `url => text_results` instead of `Wgit::Documents` (like `#search`).
|
33
|
+
### Changed/Removed
|
34
|
+
- ...
|
35
|
+
### Fixed
|
36
|
+
- ...
|
37
|
+
---
|
38
|
+
|
12
39
|
## v0.10.3
|
13
40
|
### Added
|
14
41
|
- ...
|
@@ -45,6 +45,9 @@ module Wgit
|
|
45
45
|
# A custom setter method is also provided for changing the search logic.
|
46
46
|
attr_reader :text_index
|
47
47
|
|
48
|
+
# The raw MongoDB result of the most recent operation.
|
49
|
+
attr_reader :last_result
|
50
|
+
|
48
51
|
# Initializes a connected database client using the provided
|
49
52
|
# connection_string or ENV['WGIT_CONNECTION_STRING'].
|
50
53
|
#
|
@@ -185,6 +188,8 @@ module Wgit
|
|
185
188
|
result = @client[collection].replace_one(query, data_hash, upsert: true)
|
186
189
|
|
187
190
|
result.matched_count.zero?
|
191
|
+
ensure
|
192
|
+
@last_result = result
|
188
193
|
end
|
189
194
|
|
190
195
|
### Retrieve Data ###
|
@@ -294,16 +299,12 @@ module Wgit
|
|
294
299
|
results = retrieve(DOCUMENTS_COLLECTION, query,
|
295
300
|
sort: sort_proj, projection: sort_proj,
|
296
301
|
limit: limit, skip: skip)
|
297
|
-
return [] if results.count < 1 # respond_to? :empty? == false
|
298
302
|
|
299
|
-
|
300
|
-
results = results.map do |mongo_doc|
|
303
|
+
results.map do |mongo_doc|
|
301
304
|
doc = Wgit::Document.new(mongo_doc)
|
302
305
|
yield(doc) if block_given?
|
303
306
|
doc
|
304
307
|
end
|
305
|
-
|
306
|
-
results
|
307
308
|
end
|
308
309
|
|
309
310
|
# Searches the database's Documents for the given query and then searches
|
@@ -348,6 +349,58 @@ module Wgit
|
|
348
349
|
results
|
349
350
|
end
|
350
351
|
|
352
|
+
# Searches the database's Documents for the given query and then searches
|
353
|
+
# each result in turn using `doc.search`. Instead of an Array of Documents,
|
354
|
+
# this method returns a Hash of the docs url => search_results creating a
|
355
|
+
# search engine like result set for quick access to text matches.
|
356
|
+
#
|
357
|
+
# @param query [String] The text query to search with.
|
358
|
+
# @param case_sensitive [Boolean] Whether character case must match.
|
359
|
+
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
360
|
+
# for together as a whole sentence (true) rather than separately (false).
|
361
|
+
# @param limit [Integer] The max number of results to return.
|
362
|
+
# @param skip [Integer] The number of results to skip.
|
363
|
+
# @param sentence_limit [Integer] The max length of each search result
|
364
|
+
# sentence.
|
365
|
+
# @param top_result_only [Boolean] Whether to return all of the documents
|
366
|
+
# search results or just the top (most relevant) result.
|
367
|
+
# @yield [doc] Given each search result (Wgit::Document) returned from the
|
368
|
+
# DB.
|
369
|
+
# @return [Hash<String, String | Array<String>>] The search results obtained
|
370
|
+
# from the DB having mapped the docs url => search_results. The format of
|
371
|
+
# search_results depends on the value of `top_result_only`.
|
372
|
+
def search_text(
|
373
|
+
query, case_sensitive: false, whole_sentence: true,
|
374
|
+
limit: 10, skip: 0, sentence_limit: 80, top_result_only: false
|
375
|
+
)
|
376
|
+
results = search(
|
377
|
+
query,
|
378
|
+
case_sensitive: case_sensitive,
|
379
|
+
whole_sentence: whole_sentence,
|
380
|
+
limit: limit,
|
381
|
+
skip: skip
|
382
|
+
)
|
383
|
+
|
384
|
+
results
|
385
|
+
.map do |doc|
|
386
|
+
yield(doc) if block_given?
|
387
|
+
|
388
|
+
results = doc.search(
|
389
|
+
query,
|
390
|
+
case_sensitive: case_sensitive,
|
391
|
+
whole_sentence: whole_sentence,
|
392
|
+
sentence_limit: sentence_limit
|
393
|
+
)
|
394
|
+
|
395
|
+
# Only return result if its text has a match - compact is called below.
|
396
|
+
next nil if results.empty?
|
397
|
+
|
398
|
+
[doc.url, (top_result_only ? results.first : results)]
|
399
|
+
end
|
400
|
+
.compact
|
401
|
+
.to_h
|
402
|
+
end
|
403
|
+
|
351
404
|
# Returns statistics about the database.
|
352
405
|
#
|
353
406
|
# @return [BSON::Document#[]#fetch] Similar to a Hash instance.
|
@@ -454,21 +507,30 @@ module Wgit
|
|
454
507
|
# 0 or 1 because urls are unique.
|
455
508
|
def delete(obj)
|
456
509
|
collection, query = get_type_info(obj)
|
457
|
-
@client[collection].delete_one(query)
|
510
|
+
result = @client[collection].delete_one(query)
|
511
|
+
result.n
|
512
|
+
ensure
|
513
|
+
@last_result = result
|
458
514
|
end
|
459
515
|
|
460
516
|
# Deletes everything in the urls collection.
|
461
517
|
#
|
462
518
|
# @return [Integer] The number of deleted records.
|
463
519
|
def clear_urls
|
464
|
-
@client[URLS_COLLECTION].delete_many({})
|
520
|
+
result = @client[URLS_COLLECTION].delete_many({})
|
521
|
+
result.n
|
522
|
+
ensure
|
523
|
+
@last_result = result
|
465
524
|
end
|
466
525
|
|
467
526
|
# Deletes everything in the documents collection.
|
468
527
|
#
|
469
528
|
# @return [Integer] The number of deleted records.
|
470
529
|
def clear_docs
|
471
|
-
@client[DOCUMENTS_COLLECTION].delete_many({})
|
530
|
+
result = @client[DOCUMENTS_COLLECTION].delete_many({})
|
531
|
+
result.n
|
532
|
+
ensure
|
533
|
+
@last_result = result
|
472
534
|
end
|
473
535
|
|
474
536
|
# Deletes everything in the urls and documents collections. This will nuke
|
@@ -536,6 +598,8 @@ module Wgit
|
|
536
598
|
else
|
537
599
|
raise 'data must be a Hash or an Array of Hashes'
|
538
600
|
end
|
601
|
+
ensure
|
602
|
+
@last_result = result
|
539
603
|
end
|
540
604
|
|
541
605
|
# Return if the write to the DB succeeded or not.
|
@@ -572,8 +636,8 @@ module Wgit
|
|
572
636
|
sort: {}, projection: {},
|
573
637
|
limit: 0, skip: 0)
|
574
638
|
assert_type(query, Hash)
|
575
|
-
@client[collection.to_sym].find(query).projection(projection)
|
576
|
-
|
639
|
+
@last_result = @client[collection.to_sym].find(query).projection(projection)
|
640
|
+
.skip(skip).limit(limit).sort(sort)
|
577
641
|
end
|
578
642
|
|
579
643
|
# Mutate/update one or more Url or Document records in the DB.
|
@@ -593,6 +657,8 @@ module Wgit
|
|
593
657
|
raise 'DB write(s) (update) failed' unless write_succeeded?(result)
|
594
658
|
|
595
659
|
result.n
|
660
|
+
ensure
|
661
|
+
@last_result = result
|
596
662
|
end
|
597
663
|
|
598
664
|
alias num_objects num_records
|
data/lib/wgit/document.rb
CHANGED
@@ -89,24 +89,18 @@ module Wgit
|
|
89
89
|
#
|
90
90
|
# @return [String] An xpath String to obtain a webpage's text elements.
|
91
91
|
def self.text_elements_xpath
|
92
|
-
xpath
|
93
|
-
|
94
|
-
|
95
|
-
el_xpath = '//%s/text()'
|
96
|
-
Wgit::Document.text_elements.each_with_index do |el, i|
|
97
|
-
xpath += ' | ' unless i.zero?
|
98
|
-
xpath += format(el_xpath, el)
|
92
|
+
Wgit::Document.text_elements.each_with_index.reduce("") do |xpath, (el, i)|
|
93
|
+
xpath += " | " unless i.zero?
|
94
|
+
xpath += format("//%s/text()", el)
|
99
95
|
end
|
100
|
-
|
101
|
-
xpath
|
102
96
|
end
|
103
97
|
|
104
98
|
# Defines a content extractor, which extracts HTML elements/content
|
105
99
|
# into instance variables upon Document initialization. See the default
|
106
100
|
# extractors defined in 'document_extractors.rb' as examples. Defining an
|
107
101
|
# extractor means that every subsequently crawled/initialized document
|
108
|
-
# will attempt to extract the xpath's content. Use `#
|
109
|
-
# content extraction.
|
102
|
+
# will attempt to extract the xpath's content. Use `#extract` for a one off
|
103
|
+
# content extraction on any document.
|
110
104
|
#
|
111
105
|
# Note that defined extractors work for both Documents initialized from
|
112
106
|
# HTML (via Wgit::Crawler methods) and from database objects.
|
@@ -115,9 +109,9 @@ module Wgit
|
|
115
109
|
#
|
116
110
|
# When initialising from HTML, a singleton value of true will only
|
117
111
|
# ever return the first result found; otherwise all the results are
|
118
|
-
# returned in an
|
119
|
-
# is taken as is and singleton is only used to define the default
|
120
|
-
# value. If a value cannot be found (in either the HTML or database
|
112
|
+
# returned in an Enumerable. When initialising from a database object, the
|
113
|
+
# value is taken as is and singleton is only used to define the default
|
114
|
+
# empty value. If a value cannot be found (in either the HTML or database
|
121
115
|
# object), then a default will be used. The default value is:
|
122
116
|
# `singleton ? nil : []`.
|
123
117
|
#
|
@@ -134,12 +128,14 @@ module Wgit
|
|
134
128
|
# @param opts [Hash] The options to define an extractor with. The
|
135
129
|
# options are only used when initializing from HTML, not the database.
|
136
130
|
# @option opts [Boolean] :singleton The singleton option determines
|
137
|
-
# whether or not the result(s) should be in an
|
131
|
+
# whether or not the result(s) should be in an Enumerable. If multiple
|
138
132
|
# results are found and singleton is true then the first result will be
|
139
133
|
# used. Defaults to true.
|
140
134
|
# @option opts [Boolean] :text_content_only The text_content_only option
|
141
|
-
# if true will use the text content of the Nokogiri result object,
|
142
|
-
# otherwise the Nokogiri object itself is returned.
|
135
|
+
# if true will use the text #content of the Nokogiri result object,
|
136
|
+
# otherwise the Nokogiri object itself is returned. The type of Nokogiri
|
137
|
+
# object returned depends on the given xpath query. See the Nokogiri
|
138
|
+
# documentation for more information. Defaults to true.
|
143
139
|
# @yield The block is executed when a Wgit::Document is initialized,
|
144
140
|
# regardless of the source. Use it (optionally) to process the result
|
145
141
|
# value.
|
@@ -453,7 +449,7 @@ be relative"
|
|
453
449
|
|
454
450
|
if query.is_a?(Regexp)
|
455
451
|
regex = query
|
456
|
-
else # respond_to?
|
452
|
+
else # query.respond_to? :to_s == true
|
457
453
|
query = query.to_s
|
458
454
|
query = query.gsub(' ', '|') unless whole_sentence
|
459
455
|
regex = Regexp.new(query, !case_sensitive)
|
@@ -509,16 +505,24 @@ be relative"
|
|
509
505
|
# parameter.
|
510
506
|
#
|
511
507
|
# @param xpath [String, #call] Used to find the value/object in @html.
|
512
|
-
# @param singleton [Boolean] singleton ? results.first (single
|
513
|
-
#
|
508
|
+
# @param singleton [Boolean] singleton ? results.first (single Object) :
|
509
|
+
# results (Enumerable).
|
514
510
|
# @param text_content_only [Boolean] text_content_only ? result.content
|
515
511
|
# (String) : result (Nokogiri Object).
|
512
|
+
# @yield (Optionally) Pass a block to read/write the result value before
|
513
|
+
# it's returned.
|
514
|
+
# @yieldparam value [Object] The result value to be returned.
|
515
|
+
# @yieldparam source [Wgit::Document, Object] This Document instance.
|
516
|
+
# @yieldparam type [Symbol] The `source` type, which is `:document`.
|
517
|
+
# @yieldreturn [Object] The return value of the block gets returned. Return
|
518
|
+
# the block's `value` param unchanged if you simply want to inspect it.
|
516
519
|
# @return [String, Object] The value found in the html or the default value
|
517
520
|
# (singleton ? nil : []).
|
518
|
-
def extract(xpath, singleton: true, text_content_only: true)
|
521
|
+
def extract(xpath, singleton: true, text_content_only: true, &block)
|
519
522
|
send(
|
520
523
|
:extract_from_html, xpath,
|
521
|
-
singleton: singleton, text_content_only: text_content_only
|
524
|
+
singleton: singleton, text_content_only: text_content_only,
|
525
|
+
&block
|
522
526
|
)
|
523
527
|
end
|
524
528
|
|
@@ -542,27 +546,25 @@ be relative"
|
|
542
546
|
# parameter.
|
543
547
|
#
|
544
548
|
# @param xpath [String, #call] Used to find the value/object in @html.
|
545
|
-
# @param singleton [Boolean] singleton ? results.first (single
|
546
|
-
#
|
549
|
+
# @param singleton [Boolean] singleton ? results.first (single Object) :
|
550
|
+
# results (Enumerable).
|
547
551
|
# @param text_content_only [Boolean] text_content_only ? result.content
|
548
552
|
# (String) : result (Nokogiri Object).
|
549
|
-
# @yield
|
550
|
-
#
|
551
|
-
# value.
|
553
|
+
# @yield (Optionally) Pass a block to read/write the result value before
|
554
|
+
# it's returned.
|
552
555
|
# @yieldparam value [Object] The result value to be returned.
|
553
|
-
# @yieldparam source [Wgit::Document, Object]
|
554
|
-
# @yieldparam type [Symbol] The `source` type,
|
555
|
-
# `:object`.
|
556
|
+
# @yieldparam source [Wgit::Document, Object] This Document instance.
|
557
|
+
# @yieldparam type [Symbol] The `source` type, which is `:document`.
|
556
558
|
# @yieldreturn [Object] The return value of the block gets returned. Return
|
557
559
|
# the block's `value` param unchanged if you simply want to inspect it.
|
558
560
|
# @return [String, Object] The value found in the html or the default value
|
559
561
|
# (singleton ? nil : []).
|
560
562
|
def extract_from_html(xpath, singleton: true, text_content_only: true)
|
561
563
|
xpath = xpath.call if xpath.respond_to?(:call)
|
562
|
-
result = singleton ?
|
564
|
+
result = singleton ? at_xpath(xpath) : xpath(xpath)
|
563
565
|
|
564
|
-
if text_content_only
|
565
|
-
result = singleton ? result
|
566
|
+
if result && text_content_only
|
567
|
+
result = singleton ? result.content : result.map(&:content)
|
566
568
|
end
|
567
569
|
|
568
570
|
Wgit::Utils.sanitize(result)
|
data/lib/wgit/dsl.rb
CHANGED
data/lib/wgit/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.10.3
|
4
|
+
version: 0.10.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-07-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|