wgit 0.10.3 → 0.10.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/lib/wgit/database/database.rb +76 -10
- data/lib/wgit/document.rb +35 -33
- data/lib/wgit/dsl.rb +1 -0
- data/lib/wgit/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4598dcfc047ce3915ba5a871837be5efc54201d61b4967cf53070bec2af4dd52
|
4
|
+
data.tar.gz: 604010011024af6f2d4dfcc87e6c4c1d73f8e4811938281119fccb79792818c1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 44b098e2a97191801787386e9d2060dcdeacc625c3453976679fc276a73b2bf0614713764a55f7074073018e898f2e43dc1a7f4f803339a86158052f59dcabcb
|
7
|
+
data.tar.gz: 8645c7095bb14590cf83c21905c9f5ed524e1047254e6526b8fe46a53f3989395472300d27fb65f899951a5f4b80ee9928accd23164b10e1a834975bf045db47
|
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,33 @@
|
|
9
9
|
- ...
|
10
10
|
---
|
11
11
|
|
12
|
+
## v0.10.6
|
13
|
+
### Added
|
14
|
+
- `Wgit::DSL` method `#crawl_url` (aliased to `#crawl`).
|
15
|
+
### Changed/Removed
|
16
|
+
- Added a `&block` param to `Wgit::Document#extract`, which gets passed to `#extract_from_html`.
|
17
|
+
### Fixed
|
18
|
+
- ...
|
19
|
+
---
|
20
|
+
|
21
|
+
## v0.10.5
|
22
|
+
### Added
|
23
|
+
- `Database#last_result` getter method to return the most recent raw mongo result.
|
24
|
+
### Changed/Removed
|
25
|
+
- ...
|
26
|
+
### Fixed
|
27
|
+
- ...
|
28
|
+
---
|
29
|
+
|
30
|
+
## v0.10.4
|
31
|
+
### Added
|
32
|
+
- `Database#search_text` method which returns a Hash of `url => text_results` instead of `Wgit::Documents` (like `#search`).
|
33
|
+
### Changed/Removed
|
34
|
+
- ...
|
35
|
+
### Fixed
|
36
|
+
- ...
|
37
|
+
---
|
38
|
+
|
12
39
|
## v0.10.3
|
13
40
|
### Added
|
14
41
|
- ...
|
@@ -45,6 +45,9 @@ module Wgit
|
|
45
45
|
# A custom setter method is also provided for changing the search logic.
|
46
46
|
attr_reader :text_index
|
47
47
|
|
48
|
+
# The raw MongoDB result of the most recent operation.
|
49
|
+
attr_reader :last_result
|
50
|
+
|
48
51
|
# Initializes a connected database client using the provided
|
49
52
|
# connection_string or ENV['WGIT_CONNECTION_STRING'].
|
50
53
|
#
|
@@ -185,6 +188,8 @@ module Wgit
|
|
185
188
|
result = @client[collection].replace_one(query, data_hash, upsert: true)
|
186
189
|
|
187
190
|
result.matched_count.zero?
|
191
|
+
ensure
|
192
|
+
@last_result = result
|
188
193
|
end
|
189
194
|
|
190
195
|
### Retrieve Data ###
|
@@ -294,16 +299,12 @@ module Wgit
|
|
294
299
|
results = retrieve(DOCUMENTS_COLLECTION, query,
|
295
300
|
sort: sort_proj, projection: sort_proj,
|
296
301
|
limit: limit, skip: skip)
|
297
|
-
return [] if results.count < 1 # respond_to? :empty? == false
|
298
302
|
|
299
|
-
|
300
|
-
results = results.map do |mongo_doc|
|
303
|
+
results.map do |mongo_doc|
|
301
304
|
doc = Wgit::Document.new(mongo_doc)
|
302
305
|
yield(doc) if block_given?
|
303
306
|
doc
|
304
307
|
end
|
305
|
-
|
306
|
-
results
|
307
308
|
end
|
308
309
|
|
309
310
|
# Searches the database's Documents for the given query and then searches
|
@@ -348,6 +349,58 @@ module Wgit
|
|
348
349
|
results
|
349
350
|
end
|
350
351
|
|
352
|
+
# Searches the database's Documents for the given query and then searches
|
353
|
+
# each result in turn using `doc.search`. Instead of an Array of Documents,
|
354
|
+
# this method returns a Hash of the docs url => search_results creating a
|
355
|
+
# search engine like result set for quick access to text matches.
|
356
|
+
#
|
357
|
+
# @param query [String] The text query to search with.
|
358
|
+
# @param case_sensitive [Boolean] Whether character case must match.
|
359
|
+
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
360
|
+
# for separately.
|
361
|
+
# @param limit [Integer] The max number of results to return.
|
362
|
+
# @param skip [Integer] The number of results to skip.
|
363
|
+
# @param sentence_limit [Integer] The max length of each search result
|
364
|
+
# sentence.
|
365
|
+
# @param top_result_only [Boolean] Whether to return all of the documents
|
366
|
+
# search results or just the top (most relevant) result.
|
367
|
+
# @yield [doc] Given each search result (Wgit::Document) returned from the
|
368
|
+
# DB.
|
369
|
+
# @return [Hash<String, String | Array<String>>] The search results obtained
|
370
|
+
# from the DB having mapped the docs url => search_results. The format of
|
371
|
+
# search_results depends on the value of `top_result_only`.
|
372
|
+
def search_text(
|
373
|
+
query, case_sensitive: false, whole_sentence: true,
|
374
|
+
limit: 10, skip: 0, sentence_limit: 80, top_result_only: false
|
375
|
+
)
|
376
|
+
results = search(
|
377
|
+
query,
|
378
|
+
case_sensitive: case_sensitive,
|
379
|
+
whole_sentence: whole_sentence,
|
380
|
+
limit: limit,
|
381
|
+
skip: skip
|
382
|
+
)
|
383
|
+
|
384
|
+
results
|
385
|
+
.map do |doc|
|
386
|
+
yield(doc) if block_given?
|
387
|
+
|
388
|
+
results = doc.search(
|
389
|
+
query,
|
390
|
+
case_sensitive: case_sensitive,
|
391
|
+
whole_sentence: whole_sentence,
|
392
|
+
sentence_limit: sentence_limit
|
393
|
+
)
|
394
|
+
|
395
|
+
# Only return result if its text has a match - compact is called below.
|
396
|
+
next nil if results.empty?
|
397
|
+
|
398
|
+
[doc.url, (top_result_only ? results.first : results)]
|
399
|
+
end
|
400
|
+
.compact
|
401
|
+
.to_h
|
402
|
+
end
|
403
|
+
|
351
404
|
# Returns statistics about the database.
|
352
405
|
#
|
353
406
|
# @return [BSON::Document#[]#fetch] Similar to a Hash instance.
|
@@ -454,21 +507,30 @@ module Wgit
|
|
454
507
|
# 0 or 1 because urls are unique.
|
455
508
|
def delete(obj)
|
456
509
|
collection, query = get_type_info(obj)
|
457
|
-
@client[collection].delete_one(query)
|
510
|
+
result = @client[collection].delete_one(query)
|
511
|
+
result.n
|
512
|
+
ensure
|
513
|
+
@last_result = result
|
458
514
|
end
|
459
515
|
|
460
516
|
# Deletes everything in the urls collection.
|
461
517
|
#
|
462
518
|
# @return [Integer] The number of deleted records.
|
463
519
|
def clear_urls
|
464
|
-
@client[URLS_COLLECTION].delete_many({})
|
520
|
+
result = @client[URLS_COLLECTION].delete_many({})
|
521
|
+
result.n
|
522
|
+
ensure
|
523
|
+
@last_result = result
|
465
524
|
end
|
466
525
|
|
467
526
|
# Deletes everything in the documents collection.
|
468
527
|
#
|
469
528
|
# @return [Integer] The number of deleted records.
|
470
529
|
def clear_docs
|
471
|
-
@client[DOCUMENTS_COLLECTION].delete_many({})
|
530
|
+
result = @client[DOCUMENTS_COLLECTION].delete_many({})
|
531
|
+
result.n
|
532
|
+
ensure
|
533
|
+
@last_result = result
|
472
534
|
end
|
473
535
|
|
474
536
|
# Deletes everything in the urls and documents collections. This will nuke
|
@@ -536,6 +598,8 @@ module Wgit
|
|
536
598
|
else
|
537
599
|
raise 'data must be a Hash or an Array of Hashes'
|
538
600
|
end
|
601
|
+
ensure
|
602
|
+
@last_result = result
|
539
603
|
end
|
540
604
|
|
541
605
|
# Return if the write to the DB succeeded or not.
|
@@ -572,8 +636,8 @@ module Wgit
|
|
572
636
|
sort: {}, projection: {},
|
573
637
|
limit: 0, skip: 0)
|
574
638
|
assert_type(query, Hash)
|
575
|
-
@client[collection.to_sym].find(query).projection(projection)
|
576
|
-
|
639
|
+
@last_result = @client[collection.to_sym].find(query).projection(projection)
|
640
|
+
.skip(skip).limit(limit).sort(sort)
|
577
641
|
end
|
578
642
|
|
579
643
|
# Mutate/update one or more Url or Document records in the DB.
|
@@ -593,6 +657,8 @@ module Wgit
|
|
593
657
|
raise 'DB write(s) (update) failed' unless write_succeeded?(result)
|
594
658
|
|
595
659
|
result.n
|
660
|
+
ensure
|
661
|
+
@last_result = result
|
596
662
|
end
|
597
663
|
|
598
664
|
alias num_objects num_records
|
data/lib/wgit/document.rb
CHANGED
@@ -89,24 +89,18 @@ module Wgit
|
|
89
89
|
#
|
90
90
|
# @return [String] An xpath String to obtain a webpage's text elements.
|
91
91
|
def self.text_elements_xpath
|
92
|
-
xpath
|
93
|
-
|
94
|
-
|
95
|
-
el_xpath = '//%s/text()'
|
96
|
-
Wgit::Document.text_elements.each_with_index do |el, i|
|
97
|
-
xpath += ' | ' unless i.zero?
|
98
|
-
xpath += format(el_xpath, el)
|
92
|
+
Wgit::Document.text_elements.each_with_index.reduce("") do |xpath, (el, i)|
|
93
|
+
xpath += " | " unless i.zero?
|
94
|
+
xpath += format("//%s/text()", el)
|
99
95
|
end
|
100
|
-
|
101
|
-
xpath
|
102
96
|
end
|
103
97
|
|
104
98
|
# Defines a content extractor, which extracts HTML elements/content
|
105
99
|
# into instance variables upon Document initialization. See the default
|
106
100
|
# extractors defined in 'document_extractors.rb' as examples. Defining an
|
107
101
|
# extractor means that every subsequently crawled/initialized document
|
108
|
-
# will attempt to extract the xpath's content. Use `#
|
109
|
-
# content extraction.
|
102
|
+
# will attempt to extract the xpath's content. Use `#extract` for a one off
|
103
|
+
# content extraction on any document.
|
110
104
|
#
|
111
105
|
# Note that defined extractors work for both Documents initialized from
|
112
106
|
# HTML (via Wgit::Crawler methods) and from database objects.
|
@@ -115,9 +109,9 @@ module Wgit
|
|
115
109
|
#
|
116
110
|
# When initialising from HTML, a singleton value of true will only
|
117
111
|
# ever return the first result found; otherwise all the results are
|
118
|
-
# returned in an
|
119
|
-
# is taken as is and singleton is only used to define the default
|
120
|
-
# value. If a value cannot be found (in either the HTML or database
|
112
|
+
# returned in an Enumerable. When initialising from a database object, the
|
113
|
+
# value is taken as is and singleton is only used to define the default
|
114
|
+
# empty value. If a value cannot be found (in either the HTML or database
|
121
115
|
# object), then a default will be used. The default value is:
|
122
116
|
# `singleton ? nil : []`.
|
123
117
|
#
|
@@ -134,12 +128,14 @@ module Wgit
|
|
134
128
|
# @param opts [Hash] The options to define an extractor with. The
|
135
129
|
# options are only used when initializing from HTML, not the database.
|
136
130
|
# @option opts [Boolean] :singleton The singleton option determines
|
137
|
-
# whether or not the result(s) should be in an
|
131
|
+
# whether or not the result(s) should be in an Enumerable. If multiple
|
138
132
|
# results are found and singleton is true then the first result will be
|
139
133
|
# used. Defaults to true.
|
140
134
|
# @option opts [Boolean] :text_content_only The text_content_only option
|
141
|
-
# if true will use the text content of the Nokogiri result object,
|
142
|
-
# otherwise the Nokogiri object itself is returned.
|
135
|
+
# if true will use the text #content of the Nokogiri result object,
|
136
|
+
# otherwise the Nokogiri object itself is returned. The type of Nokogiri
|
137
|
+
# object returned depends on the given xpath query. See the Nokogiri
|
138
|
+
# documentation for more information. Defaults to true.
|
143
139
|
# @yield The block is executed when a Wgit::Document is initialized,
|
144
140
|
# regardless of the source. Use it (optionally) to process the result
|
145
141
|
# value.
|
@@ -453,7 +449,7 @@ be relative"
|
|
453
449
|
|
454
450
|
if query.is_a?(Regexp)
|
455
451
|
regex = query
|
456
|
-
else # respond_to?
|
452
|
+
else # query.respond_to? :to_s == true
|
457
453
|
query = query.to_s
|
458
454
|
query = query.gsub(' ', '|') unless whole_sentence
|
459
455
|
regex = Regexp.new(query, !case_sensitive)
|
@@ -509,16 +505,24 @@ be relative"
|
|
509
505
|
# parameter.
|
510
506
|
#
|
511
507
|
# @param xpath [String, #call] Used to find the value/object in @html.
|
512
|
-
# @param singleton [Boolean] singleton ? results.first (single
|
513
|
-
#
|
508
|
+
# @param singleton [Boolean] singleton ? results.first (single Object) :
|
509
|
+
# results (Enumerable).
|
514
510
|
# @param text_content_only [Boolean] text_content_only ? result.content
|
515
511
|
# (String) : result (Nokogiri Object).
|
512
|
+
# @yield (Optionally) Pass a block to read/write the result value before
|
513
|
+
# it's returned.
|
514
|
+
# @yieldparam value [Object] The result value to be returned.
|
515
|
+
# @yieldparam source [Wgit::Document, Object] This Document instance.
|
516
|
+
# @yieldparam type [Symbol] The `source` type, which is `:document`.
|
517
|
+
# @yieldreturn [Object] The return value of the block gets returned. Return
|
518
|
+
# the block's `value` param unchanged if you simply want to inspect it.
|
516
519
|
# @return [String, Object] The value found in the html or the default value
|
517
520
|
# (singleton ? nil : []).
|
518
|
-
def extract(xpath, singleton: true, text_content_only: true)
|
521
|
+
def extract(xpath, singleton: true, text_content_only: true, &block)
|
519
522
|
send(
|
520
523
|
:extract_from_html, xpath,
|
521
|
-
singleton: singleton, text_content_only: text_content_only
|
524
|
+
singleton: singleton, text_content_only: text_content_only,
|
525
|
+
&block
|
522
526
|
)
|
523
527
|
end
|
524
528
|
|
@@ -542,27 +546,25 @@ be relative"
|
|
542
546
|
# parameter.
|
543
547
|
#
|
544
548
|
# @param xpath [String, #call] Used to find the value/object in @html.
|
545
|
-
# @param singleton [Boolean] singleton ? results.first (single
|
546
|
-
#
|
549
|
+
# @param singleton [Boolean] singleton ? results.first (single Object) :
|
550
|
+
# results (Enumerable).
|
547
551
|
# @param text_content_only [Boolean] text_content_only ? result.content
|
548
552
|
# (String) : result (Nokogiri Object).
|
549
|
-
# @yield
|
550
|
-
#
|
551
|
-
# value.
|
553
|
+
# @yield (Optionally) Pass a block to read/write the result value before
|
554
|
+
# it's returned.
|
552
555
|
# @yieldparam value [Object] The result value to be returned.
|
553
|
-
# @yieldparam source [Wgit::Document, Object]
|
554
|
-
# @yieldparam type [Symbol] The `source` type,
|
555
|
-
# `:object`.
|
556
|
+
# @yieldparam source [Wgit::Document, Object] This Document instance.
|
557
|
+
# @yieldparam type [Symbol] The `source` type, which is `:document`.
|
556
558
|
# @yieldreturn [Object] The return value of the block gets returned. Return
|
557
559
|
# the block's `value` param unchanged if you simply want to inspect it.
|
558
560
|
# @return [String, Object] The value found in the html or the default value
|
559
561
|
# (singleton ? nil : []).
|
560
562
|
def extract_from_html(xpath, singleton: true, text_content_only: true)
|
561
563
|
xpath = xpath.call if xpath.respond_to?(:call)
|
562
|
-
result = singleton ?
|
564
|
+
result = singleton ? at_xpath(xpath) : xpath(xpath)
|
563
565
|
|
564
|
-
if text_content_only
|
565
|
-
result = singleton ? result
|
566
|
+
if result && text_content_only
|
567
|
+
result = singleton ? result.content : result.map(&:content)
|
566
568
|
end
|
567
569
|
|
568
570
|
Wgit::Utils.sanitize(result)
|
data/lib/wgit/dsl.rb
CHANGED
data/lib/wgit/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.10.3
|
4
|
+
version: 0.10.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-07-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|