wgit 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wgit/crawler.rb +22 -16
- data/lib/wgit/database/database.rb +5 -1
- data/lib/wgit/database/model.rb +13 -9
- data/lib/wgit/document.rb +16 -5
- data/lib/wgit/document_extensions.rb +1 -1
- data/lib/wgit/indexer.rb +38 -12
- data/lib/wgit/url.rb +26 -15
- data/lib/wgit/utils.rb +6 -7
- data/lib/wgit/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d9d045d2dd7f570db1811bafab1ac244103cc359033efd9279323c795a67bb9f
+  data.tar.gz: 996801763a6576ede812e2edd7d201ceb34b2135548a365b748f953e7df40db9
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e0dfe907c599c320377464aec927b24700d0e9e17d7bb37b4903715af63cbf031dc5983cd6749b1d90353cbcffc0d71e76eb2a0f8c0ba77b3b03f2d51ca9634f
+  data.tar.gz: bade693ab5b32bf8a16747233356307fe489798855133b8e16b3a907d38f8fd9ecfadadab0273d1c0767106bcf85027c64ebb6f86dc38660240200d5fef07377
data/lib/wgit/crawler.rb
CHANGED
@@ -5,6 +5,7 @@ require_relative 'document'
 require_relative 'utils'
 require_relative 'assertable'
 require 'net/http' # Requires 'uri'.
+require 'benchmark'
 
 module Wgit
   # The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
@@ -105,7 +106,7 @@ module Wgit
     # Crawl the url returning the response Wgit::Document or nil if an error
     # occurs.
     #
-    # @param url [Wgit::Url] The Url to crawl.
+    # @param url [Wgit::Url] The Url to crawl; which will likely be modified.
     # @param follow_external_redirects [Boolean] Whether or not to follow
     # an external redirect. External meaning to a different host. False will
     # return nil for such a crawl. If false, you must also provide a `host:`
@@ -123,16 +124,14 @@ module Wgit
       # A String url isn't allowed because it's passed by value not reference,
       # meaning a redirect isn't reflected; A Wgit::Url is passed by reference.
       assert_type(url, Wgit::Url)
-      if !follow_external_redirects && host.nil?
-        raise 'host cannot be nil if follow_external_redirects is false'
-      end
+      raise 'host cannot be nil if follow_external_redirects is false' \
+        if !follow_external_redirects && host.nil?
 
       html = fetch(
         url,
         follow_external_redirects: follow_external_redirects,
         host: host
       )
-      url.crawled = true
 
       doc = Wgit::Document.new(url, html)
       yield(doc) if block_given?
@@ -147,7 +146,8 @@ module Wgit
     # HTTP response that doesn't return a HTML body will be ignored and nil
     # will be returned; otherwise, the HTML String is returned.
     #
-    # @param url [Wgit::Url] The URL to fetch the HTML for.
+    # @param url [Wgit::Url] The URL to fetch the HTML for. This Url object
+    # will likely be modified as a result of the fetch/crawl.
     # @param follow_external_redirects [Boolean] Whether or not to follow
     # an external redirect. False will return nil for such a crawl. If false,
     # you must also provide a `host:` parameter.
@@ -159,19 +159,26 @@ module Wgit
     # @return [String, nil] The crawled HTML or nil if the crawl was
     # unsuccessful.
     def fetch(url, follow_external_redirects: true, host: nil)
-
-
-
-
-
-
+      crawl_duration = nil
+      response = nil
+
+      crawl_duration = Benchmark.measure do
+        response = resolve(
+          url,
+          follow_external_redirects: follow_external_redirects,
+          host: host
+        )
+      end.real
 
       response.body.empty? ? nil : response.body
     rescue StandardError => e
       Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e.message}")
-      @last_response = nil
 
       nil
+    ensure
+      url.crawled = true # Also sets date_crawled underneath.
+      url.crawl_duration = crawl_duration
+      @last_response = response
     end
 
     # The resolve method performs a HTTP GET to obtain the HTML response. The
@@ -204,10 +211,9 @@ module Wgit
 
       yield(url, response, location) if block_given?
 
-      if !follow_external_redirects && !location.is_relative?(host: host)
-        raise "External redirect not allowed - Redirected to: \
-          '#{location}', which is outside of host: '#{host}'"
-      end
+      raise "External redirect not allowed - Redirected to: \
+        '#{location}', which is outside of host: '#{host}'" \
+        if !follow_external_redirects && !location.is_relative?(host: host)
 
       raise "Too many redirects: #{redirect_count}" \
         if redirect_count >= @redirect_limit
@@ -235,7 +241,7 @@ module Wgit
         .uniq
         .reject do |link|
          ext = link.to_extension
-          ext ? !%w[htm html].include?(ext) : false
+          ext ? !%w[htm html].include?(ext.downcase) : false
         end
     end
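In practice the timing lands on the Url itself: the response is measured with Benchmark, and the new ensure block records the crawl state even when the request raises. A minimal sketch of the resulting behaviour, assuming the crawler's public crawl_url method and a reachable site (values illustrative):

  require 'wgit'

  url     = Wgit::Url.new('https://example.com')
  crawler = Wgit::Crawler.new
  doc     = crawler.crawl_url(url) # The Url is modified, per the new docs.

  url.crawled        # => true - set in the ensure block, even on error.
  url.date_crawled   # => Time stamp, set underneath by crawled=.
  url.crawl_duration # => e.g. 0.42 - seconds, via Benchmark::Tms#real.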
data/lib/wgit/database/database.rb
CHANGED
@@ -64,7 +64,9 @@ module Wgit
     # Wgit::Model.document.
     # @raise [StandardError] If data isn't valid.
     def insert(data)
+      data = data.dup # Avoid modifying by reference.
       type = data.is_a?(Enumerable) ? data.first : data
+
       case type
       when Wgit::Url
         insert_urls(data)
@@ -146,7 +148,7 @@ module Wgit
     def search(
       query, case_sensitive: false, whole_sentence: false, limit: 10, skip: 0
     )
-      query.strip
+      query = query.to_s.strip
       query.replace('"' + query + '"') if whole_sentence
 
       # Sort based on the most search hits (aka "textScore").
@@ -232,6 +234,8 @@ module Wgit
     # @param data [Wgit::Url, Wgit::Document] The data to update.
     # @raise [StandardError] If the data is not valid.
     def update(data)
+      data = data.dup # Avoid modifying by reference.
+
       case data
       when Wgit::Url
         update_url(data)
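Two fixes here are worth calling out: insert and update now dup their argument so the caller's object isn't mutated, and search normalises its query (the old bare query.strip discarded its own return value). A sketch of a search call, assuming ENV['WGIT_CONNECTION_STRING'] points at a running MongoDB:

  require 'wgit'

  db = Wgit::Database.new # A nil connection string falls back to the ENV var.

  # whole_sentence: true wraps the query in quotes, which MongoDB's $text
  # operator treats as an exact phrase rather than separate terms.
  results = db.search('ruby crawler', whole_sentence: true, limit: 5)
  results.first # => The Wgit::Document with the highest textScore, if any.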
data/lib/wgit/database/model.rb
CHANGED
@@ -3,43 +3,47 @@
 require_relative '../utils'
 
 module Wgit
-  # Module used to build the database collection objects.
+  # Module used to build the database collection objects, forming a data model.
   module Model
-    # The data model for a Wgit::Url
+    # The data model for a Wgit::Url collection object and for an embedded
+    # 'url' inside a Wgit::Document collection object.
     #
-    # @param url [Wgit::Url] The Url
+    # @param url [Wgit::Url] The Url data object.
     # @return [Hash] The URL model ready for DB insertion.
     def self.url(url)
       raise 'url must respond_to? :to_h' unless url.respond_to?(:to_h)
 
       model = url.to_h
+
       Wgit::Utils.remove_non_bson_types(model)
     end
 
-    # The data model for a Wgit::Document.
+    # The data model for a Wgit::Document collection object.
     #
-    # @param doc [Wgit::Document] The Document
+    # @param doc [Wgit::Document] The Document data object.
     # @return [Hash] The Document model ready for DB insertion.
     def self.document(doc)
       raise 'doc must respond_to? :to_h' unless doc.respond_to?(:to_h)
 
-      model = doc.to_h(include_html: false)
+      model = doc.to_h(include_html: false, include_score: false)
+      model['url'] = self.url(doc.url) # Expand Url String into full object.
+
       Wgit::Utils.remove_non_bson_types(model)
     end
 
     # Common fields when inserting a record into the DB.
     #
-    # @return [Hash]
+    # @return [Hash] Insertion fields common to all models.
     def self.common_insert_data
       {
-        date_added:
+        date_added: Wgit::Utils.time_stamp,
         date_modified: Wgit::Utils.time_stamp
       }
     end
 
     # Common fields when updating a record in the DB.
     #
-    # @return [Hash]
+    # @return [Hash] Update fields common to all models.
     def self.common_update_data
       {
         date_modified: Wgit::Utils.time_stamp
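The practical upshot: a persisted Document now embeds its url as a full object (built by Model.url) rather than a plain String, and the transient score is excluded. A hypothetical shape of the resulting Hash; the exact keys depend on Wgit::Utils.to_h and the Document's fields:

  require 'wgit'

  doc = Wgit::Document.new(Wgit::Url.new('https://example.com'), '<html></html>')

  Wgit::Model.document(doc)
  # => {
  #      'url' => {
  #        'url'            => 'https://example.com',
  #        'crawled'        => true,  # Document#initialize marks it crawled.
  #        'date_crawled'   => <Time>,
  #        'crawl_duration' => nil    # Only set when crawled via Crawler#fetch.
  #      },
  #      ...                          # Other fields, minus @html and @score.
  #    }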
data/lib/wgit/document.rb
CHANGED
@@ -87,7 +87,7 @@ module Wgit
       xpath
     end
 
-    # Defines an extension, which is a way to
+    # Defines an extension, which is a way to serialise HTML elements into
     # instance variables upon Document initialization. See the default
     # extensions defined in 'document_extensions.rb' as examples.
     #
@@ -189,13 +189,20 @@ module Wgit
       @html[range]
     end
 
-    # Returns the timestamp of when this
+    # Returns the timestamp of when this Document was crawled.
     #
-    # @return [Time] Time of when this
+    # @return [Time] Time of when this Document was crawled.
     def date_crawled
       @url.date_crawled
     end
 
+    # Returns the duration of the crawl for this Document (in seconds).
+    #
+    # @return [Float] The duration of the crawl for this Document.
+    def crawl_duration
+      @url.crawl_duration
+    end
+
     # Returns the base URL of this Wgit::Document. The base URL is either the
     # <base> element's href value or @url (if @base is nil). If @base is
     # present and relative, then @url.to_base + @base is returned. This method
@@ -240,8 +247,9 @@ module Wgit
     # @param include_html [Boolean] Whether or not to include @html in the
     # returned Hash.
     # @return [Hash] Containing self's instance vars.
-    def to_h(include_html: false)
+    def to_h(include_html: false, include_score: true)
       ignore = include_html ? [] : ['@html']
+      ignore << '@score' unless include_score
       ignore << '@doc' # Always ignore Nokogiri @doc.
 
       Wgit::Utils.to_h(self, ignore: ignore)
@@ -525,7 +533,10 @@ module Wgit
       assert_types(html, [String, NilClass])
 
       # We already know url.is_a?(String) so parse into Url unless already so.
-
+      url = Wgit::Url.parse(url)
+      url.crawled = true unless url.crawled # Avoid overriding date_crawled.
+
+      @url = url
       @html = html || ''
       @doc = init_nokogiri
       @score = 0.0
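A short sketch of the new to_h flags and the crawl_duration delegation shown above (values illustrative):

  require 'wgit'

  doc = Wgit::Document.new(Wgit::Url.new('https://example.com'), '<p>Hi</p>')

  doc.to_h                       # Omits @html (and always @doc) by default.
  doc.to_h(include_html: true)   # Adds the raw @html String.
  doc.to_h(include_score: false) # Drops @score too, as Wgit::Model.document does.
  doc.crawl_duration             # => @url.crawl_duration (nil here).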
data/lib/wgit/indexer.rb
CHANGED
@@ -12,6 +12,8 @@ module Wgit
   # external url's to be crawled later on. Logs info on the crawl
   # using Wgit.logger as it goes along.
   #
+  # @param connection_string [String] The database connection string. Set as
+  # nil to use ENV['WGIT_CONNECTION_STRING'].
   # @param max_sites [Integer] The number of separate and whole
   # websites to be crawled before the method exits. Defaults to -1 which
   # means the crawl will occur until manually stopped (Ctrl+C etc).
@@ -19,8 +21,10 @@ module Wgit
   # scraped from the web (default is 1GB). Note, that this value is used to
   # determine when to stop crawling; it's not a guarantee of the max data
   # that will be obtained.
-  def self.index_www(
-
+  def self.index_www(
+    connection_string: nil, max_sites: -1, max_data: 1_048_576_000
+  )
+    db = Wgit::Database.new(connection_string)
     indexer = Wgit::Indexer.new(db)
     indexer.index_www(max_sites: max_sites, max_data: max_data)
   end
@@ -32,14 +36,18 @@ module Wgit
   # There is no max download limit so be careful which sites you index.
   #
   # @param url [Wgit::Url, String] The base Url of the website to crawl.
+  # @param connection_string [String] The database connection string. Set as
+  # nil to use ENV['WGIT_CONNECTION_STRING'].
   # @param insert_externals [Boolean] Whether or not to insert the website's
   # external Url's into the database.
   # @yield [doc] Given the Wgit::Document of each crawled webpage, before it's
   # inserted into the database allowing for prior manipulation.
   # @return [Integer] The total number of pages crawled within the website.
-  def self.index_site(
+  def self.index_site(
+    url, connection_string: nil, insert_externals: true, &block
+  )
     url = Wgit::Url.parse(url)
-    db = Wgit::Database.new
+    db = Wgit::Database.new(connection_string)
     indexer = Wgit::Indexer.new(db)
     indexer.index_site(url, insert_externals: insert_externals, &block)
   end
@@ -51,13 +59,17 @@ module Wgit
   # There is no max download limit so be careful of large pages.
   #
   # @param url [Wgit::Url, String] The Url of the webpage to crawl.
+  # @param connection_string [String] The database connection string. Set as
+  # nil to use ENV['WGIT_CONNECTION_STRING'].
   # @param insert_externals [Boolean] Whether or not to insert the website's
   # external Url's into the database.
   # @yield [doc] Given the Wgit::Document of the crawled webpage, before it's
   # inserted into the database allowing for prior manipulation.
-  def self.index_page(
+  def self.index_page(
+    url, connection_string: nil, insert_externals: true, &block
+  )
     url = Wgit::Url.parse(url)
-    db = Wgit::Database.new
+    db = Wgit::Database.new(connection_string)
     indexer = Wgit::Indexer.new(db)
     indexer.index_page(url, insert_externals: insert_externals, &block)
   end
@@ -67,6 +79,8 @@ module Wgit
   # details of how the search works.
   #
   # @param query [String] The text query to search with.
+  # @param connection_string [String] The database connection string. Set as
+  # nil to use ENV['WGIT_CONNECTION_STRING'].
   # @param case_sensitive [Boolean] Whether character case must match.
   # @param whole_sentence [Boolean] Whether multiple words should be searched
   # for separately.
@@ -76,11 +90,20 @@ module Wgit
   # snippet.
   # @yield [doc] Given each search result (Wgit::Document) returned from the
   # database.
-  def self.indexed_search(
-
-
-
-
+  def self.indexed_search(
+    query, connection_string: nil,
+    case_sensitive: false, whole_sentence: false,
+    limit: 10, skip: 0, sentence_limit: 80, &block
+  )
+    db = Wgit::Database.new(connection_string)
+
+    results = db.search(
+      query,
+      case_sensitive: case_sensitive,
+      whole_sentence: whole_sentence,
+      limit: limit,
+      skip: skip,
+      &block
     )
 
     results.each do |doc|
@@ -88,7 +111,8 @@ module Wgit
         query,
         case_sensitive: case_sensitive,
         whole_sentence: whole_sentence,
-        sentence_limit: sentence_limit
+        sentence_limit: sentence_limit
+      )
     end
 
     Wgit::Utils.printf_search_results(results)
@@ -137,6 +161,7 @@ runs out of urls to crawl (which might be never).")
 
       if uncrawled_urls.empty?
         Wgit.logger.info('No urls to crawl, exiting.')
+
        return
       end
       Wgit.logger.info("Starting crawl loop for: #{uncrawled_urls}")
@@ -148,6 +173,7 @@ runs out of urls to crawl (which might be never).")
        unless keep_crawling?(site_count, max_sites, max_data)
           Wgit.logger.info("Reached max number of sites to crawl or \
 database capacity, exiting.")
+
           return
         end
         site_count += 1
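All four top-level indexing methods gain a connection_string: keyword that falls back to ENV['WGIT_CONNECTION_STRING'] when nil. A sketch (the MongoDB URI value is hypothetical):

  require 'wgit'

  Wgit.index_site(
    'https://example.com',
    connection_string: 'mongodb://localhost:27017/wgit', # Hypothetical URI.
    insert_externals: false
  ) do |doc|
    doc # Each Wgit::Document, yielded before its DB insertion.
  end

  # Or omit it and rely on ENV['WGIT_CONNECTION_STRING']:
  Wgit.indexed_search('ruby crawler', whole_sentence: true, limit: 5)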
data/lib/wgit/url.rb
CHANGED
@@ -15,12 +15,15 @@ module Wgit
     include Assertable
 
     # Whether or not the Url has been crawled or not. A custom crawled= method
-    # is
-
+    # is provided by this class, overridding the default one.
+    attr_accessor :crawled
 
-    # The Time
+    # The Time stamp of when this Url was crawled.
     attr_accessor :date_crawled
 
+    # The duration of the crawl for this Url (in seconds).
+    attr_accessor :crawl_duration
+
     # Initializes a new instance of Wgit::Url which represents a web based
     # HTTP URL.
     #
@@ -32,8 +35,12 @@ module Wgit
     # @param date_crawled [Time] Should only be provided if crawled is true. A
     # suitable object can be returned from Wgit::Utils.time_stamp. Only used
     # if url_or_obj is a String.
+    # @param crawl_duration [Float] Should only be provided if crawled is true.
+    # The duration of the crawl for this Url (in seconds).
     # @raise [StandardError] If url_or_obj is an Object with missing methods.
-    def initialize(
+    def initialize(
+      url_or_obj, crawled: false, date_crawled: nil, crawl_duration: nil
+    )
       # Init from a URL String.
       if url_or_obj.is_a?(String)
         url = url_or_obj.to_s
@@ -42,14 +49,16 @@ module Wgit
         obj = url_or_obj
         assert_respond_to(obj, :fetch)
 
-        url
-        crawled
-        date_crawled
+        url            = obj.fetch('url') # Should always be present.
+        crawled        = obj.fetch('crawled', false)
+        date_crawled   = obj.fetch('date_crawled', nil)
+        crawl_duration = obj.fetch('crawl_duration', nil)
       end
 
-      @uri
-      @crawled
-      @date_crawled
+      @uri            = Addressable::URI.parse(url)
+      @crawled        = crawled
+      @date_crawled   = date_crawled
+      @crawl_duration = crawl_duration
 
       super(url)
     end
@@ -77,14 +86,16 @@ module Wgit
       obj.is_a?(Wgit::Url) ? obj : new(obj)
     end
 
-    # Sets the @crawled instance var, also setting @date_crawled
-    #
+    # Sets the @crawled instance var, also setting @date_crawled for
+    # convenience.
     #
-    # @param bool [Boolean] True if
-    # @return [
+    # @param bool [Boolean] True if this Url has been crawled, false otherwise.
+    # @return [Boolean] The value of bool having been set.
     def crawled=(bool)
-      @crawled
+      @crawled = bool
       @date_crawled = bool ? Wgit::Utils.time_stamp : nil
+
+      bool
     end
 
     # Overrides String#replace setting the new_url @uri and String value.
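The initializer's object branch reads its state via #fetch, which is what lets a Url round-trip through the database. A sketch of both construction paths (values illustrative):

  require 'wgit'

  url = Wgit::Url.new('https://example.com') # From a String.
  url.crawled        # => false
  url.crawl_duration # => nil

  url = Wgit::Url.new({                        # From anything responding to
    'url'            => 'https://example.com', # #fetch, e.g. a DB record.
    'crawled'        => true,
    'date_crawled'   => Time.now,
    'crawl_duration' => 0.3
  })

  url.crawled = true # The custom writer also stamps @date_crawled.
  url.date_crawled   # => the Time of the assignment above.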
data/lib/wgit/utils.rb
CHANGED
@@ -120,20 +120,19 @@ module Wgit
     # outputted to the stream.
     # @param stream [#puts] Any object that respond_to?(:puts). It is used
     # to output text somewhere e.g. a file or STDOUT.
-    # @return [NilClass] Returns nil.
     def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
       raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
 
       results.each do |doc|
-        title
-
-        keywords = missing_keywords ? nil : doc.keywords.take(keyword_limit)
+        title    = (doc.title || '<no title>')
+        keywords = doc.keywords&.take(keyword_limit)&.join(', ')
         sentence = doc.text.first
+        url      = doc.url
 
         stream.puts title
-        stream.puts keywords
-        stream.puts sentence
-        stream.puts
+        stream.puts keywords if keywords
+        stream.puts sentence
+        stream.puts url
         stream.puts
       end
 
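The output per result now ends with the url, and the keywords line is skipped when a document has none. Hypothetical output for one result:

  Wgit::Utils.printf_search_results(results) # results from Database#search.

  # Printed to the stream, one block per Document:
  #
  #   Some Page Title
  #   ruby, crawler, gem
  #   The first text sentence of the document...
  #   https://example.com/some/page
  #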
data/lib/wgit/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wgit
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Michael Telford
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-
+date: 2019-10-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: addressable