wgit 0.10.8 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +72 -1
- data/CODE_OF_CONDUCT.md +1 -1
- data/CONTRIBUTING.md +2 -2
- data/README.md +24 -20
- data/bin/wgit +75 -19
- data/lib/wgit/assertable.rb +33 -6
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +102 -37
- data/lib/wgit/database/adapters/in_memory.rb +204 -0
- data/lib/wgit/database/adapters/mongo_db.rb +627 -0
- data/lib/wgit/database/database.rb +18 -651
- data/lib/wgit/database/database_adapter.rb +147 -0
- data/lib/wgit/document.rb +222 -98
- data/lib/wgit/document_extractors.rb +16 -10
- data/lib/wgit/dsl.rb +74 -81
- data/lib/wgit/html_to_text.rb +277 -0
- data/lib/wgit/indexer.rb +184 -71
- data/lib/wgit/logger.rb +2 -2
- data/lib/wgit/model.rb +164 -0
- data/lib/wgit/response.rb +25 -13
- data/lib/wgit/robots_parser.rb +193 -0
- data/lib/wgit/url.rb +150 -90
- data/lib/wgit/utils.rb +200 -37
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +18 -13
- metadata +56 -43
- data/lib/wgit/database/model.rb +0 -60
data/lib/wgit/indexer.rb
CHANGED
@@ -1,12 +1,23 @@
 # frozen_string_literal: true
 
+require_relative 'assertable'
 require_relative 'crawler'
-require_relative 'database/
+require_relative 'database/database_adapter'
 
 module Wgit
   # Class which crawls and saves the Documents to a database. Can be thought of
-  # as a combination of Wgit::Crawler and Wgit::Database.
+  # as a combination of Wgit::Crawler and Wgit::Database::DatabaseAdapter.
   class Indexer
+    include Assertable
+
+    # The ENV var used to omit and ignore robots.txt parsing during an index.
+    # Applies to all index_* methods if set in the ENV.
+    WGIT_IGNORE_ROBOTS_TXT = "WGIT_IGNORE_ROBOTS_TXT".freeze
+
+    # The block return value used to skip saving a crawled document to the
+    # database. Applies to all index_* methods that take a block.
+    SKIP_UPSERT = :skip.freeze
+
     # The crawler used to index the WWW.
     attr_reader :crawler
 
@@ -15,10 +26,13 @@ module Wgit
 
     # Initialize the Indexer.
     #
-    # @param database [Wgit::Database] The database instance
-    # initialized and connected) used
-    # @param crawler [Wgit::Crawler] The crawler instance used
+    # @param database [Wgit::Database::DatabaseAdapter] The database instance
+    # (already initialized and connected) used for indexing.
+    # @param crawler [Wgit::Crawler] The crawler instance used for indexing.
     def initialize(database = Wgit::Database.new, crawler = Wgit::Crawler.new)
+      assert_type(database, Wgit::Database::DatabaseAdapter)
+      assert_type(crawler, Wgit::Crawler)
+
       @db = database
       @crawler = crawler
     end
@@ -26,33 +40,38 @@ module Wgit
     # Retrieves uncrawled url's from the database and recursively crawls each
     # site storing their internal pages into the database and adding their
     # external url's to be crawled later on. Logs info on the crawl using
-    # Wgit.logger as it goes along.
+    # Wgit.logger as it goes along. This method will honour all site's
+    # robots.txt and 'noindex' requests.
     #
     # @param max_sites [Integer] The number of separate and whole
     # websites to be crawled before the method exits. Defaults to -1 which
-    # means the crawl will occur until manually stopped (Ctrl+C
+    # means the crawl will occur until manually stopped (Ctrl+C), the
+    # max_data has been reached, or it runs out of external urls to index.
     # @param max_data [Integer] The maximum amount of bytes that will be
     # scraped from the web (default is 1GB). Note, that this value is used to
     # determine when to stop crawling; it's not a guarantee of the max data
     # that will be obtained.
-
+    # @param max_urls_per_iteration [Integer] The maximum number of uncrawled
+    # urls to index for each iteration, before checking max_sites and
+    # max_data, possibly ending the crawl.
+    def index_www(max_sites: -1, max_data: 1_048_576_000, max_urls_per_iteration: 10)
       if max_sites.negative?
         Wgit.logger.info("Indexing until the database has been filled or it \
-runs out of urls to crawl (which might be never)
+runs out of urls to crawl (which might be never)")
       end
       site_count = 0
 
       while keep_crawling?(site_count, max_sites, max_data)
         Wgit.logger.info("Current database size: #{@db.size}")
 
-        uncrawled_urls = @db.uncrawled_urls(limit:
+        uncrawled_urls = @db.uncrawled_urls(limit: max_urls_per_iteration)
 
         if uncrawled_urls.empty?
-          Wgit.logger.info('No urls to crawl, exiting
+          Wgit.logger.info('No urls to crawl, exiting')
 
           return
         end
-        Wgit.logger.info("Starting
+        Wgit.logger.info("Starting indexing loop for: #{uncrawled_urls.map(&:to_s)}")
 
         docs_count = 0
         urls_count = 0
@@ -60,38 +79,48 @@ runs out of urls to crawl (which might be never).")
         uncrawled_urls.each do |url|
          unless keep_crawling?(site_count, max_sites, max_data)
             Wgit.logger.info("Reached max number of sites to crawl or \
-database capacity, exiting
+database capacity, exiting")
 
             return
           end
           site_count += 1
 
+          parser = parse_robots_txt(url)
+          if parser&.no_index?
+            upsert_url_and_redirects(url)
+
+            next
+          end
+
           site_docs_count = 0
-          ext_links = @crawler.crawl_site(
-
-
-
-
-
+          ext_links = @crawler.crawl_site(
+            url, allow_paths: parser&.allow_paths, disallow_paths: parser&.disallow_paths
+          ) do |doc|
+            next if doc.empty? || no_index?(@crawler.last_response, doc)
+
+            upsert_doc(doc)
+            docs_count += 1
+            site_docs_count += 1
           end
 
-
+          upsert_url_and_redirects(url)
 
-          urls_count +=
+          urls_count += upsert_external_urls(ext_links)
         end
 
         Wgit.logger.info("Crawled and indexed documents for #{docs_count} \
-url(s)
+url(s) during this iteration")
         Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
-
-
-      nil
+future iterations")
       end
+
+      nil
     end
 
     # Crawls a single website's pages and stores them into the database.
     # There is no max download limit so be careful which sites you index.
-    # Logs info on the crawl using Wgit.logger as it goes along.
+    # Logs info on the crawl using Wgit.logger as it goes along. This method
+    # will honour the site's robots.txt and 'noindex' requests.
     #
     # @param url [Wgit::Url] The base Url of the website to crawl.
     # @param insert_externals [Boolean] Whether or not to insert the website's
@@ -113,28 +142,29 @@ the next iteration.")
       url, insert_externals: false, follow: :default,
       allow_paths: nil, disallow_paths: nil
     )
-
-
-
-
-
+      parser = parse_robots_txt(url)
+      if parser&.no_index?
+        upsert_url_and_redirects(url)
+
+        return 0
+      end
+
+      allow_paths, disallow_paths = merge_paths(parser, allow_paths, disallow_paths)
+      crawl_opts = { follow:, allow_paths:, disallow_paths: }
      total_pages_indexed = 0
 
      ext_urls = @crawler.crawl_site(url, **crawl_opts) do |doc|
+        next if no_index?(@crawler.last_response, doc)
+
        result = block_given? ? yield(doc) : true
+        next if doc.empty? || result == SKIP_UPSERT
 
-
-
-        total_pages_indexed += 1
-      end
+        upsert_doc(doc)
+        total_pages_indexed += 1
      end
 
-
-
-      if insert_externals && ext_urls
-        num_inserted_urls = write_urls_to_db(ext_urls)
-        Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
-      end
+      upsert_url_and_redirects(url)
+      upsert_external_urls(ext_urls) if insert_externals && ext_urls
 
      Wgit.logger.info("Crawled and indexed #{total_pages_indexed} documents \
 for the site: #{url}")
@@ -145,6 +175,8 @@ for the site: #{url}")
     # Crawls one or more webpages and stores them into the database.
     # There is no max download limit so be careful of large pages.
     # Logs info on the crawl using Wgit.logger as it goes along.
+    # This method will honour the site's robots.txt and 'noindex' requests
+    # in relation to the given urls.
     #
     # @param urls [*Wgit::Url] The webpage Url's to crawl.
     # @param insert_externals [Boolean] Whether or not to insert the webpages
@@ -157,7 +189,7 @@ for the site: #{url}")
     def index_urls(*urls, insert_externals: false, &block)
       raise 'You must provide at least one Url' if urls.empty?
 
-      opts = { insert_externals:
+      opts = { insert_externals: }
       Wgit::Utils.each(urls) { |url| index_url(url, **opts, &block) }
 
       nil
@@ -166,6 +198,8 @@ for the site: #{url}")
     # Crawls a single webpage and stores it into the database.
     # There is no max download limit so be careful of large pages.
     # Logs info on the crawl using Wgit.logger as it goes along.
+    # This method will honour the site's robots.txt and 'noindex' requests
+    # in relation to the given url.
     #
     # @param url [Wgit::Url] The webpage Url to crawl.
     # @param insert_externals [Boolean] Whether or not to insert the webpages
@@ -175,18 +209,26 @@ for the site: #{url}")
     # manipulation. Return nil or false from the block to prevent the
     # document from being saved into the database.
     def index_url(url, insert_externals: false)
+      parser = parse_robots_txt(url)
+      if parser && (parser.no_index? || contains_path?(parser.disallow_paths, url))
+        upsert_url_and_redirects(url)
+
+        return
+      end
+
       document = @crawler.crawl_url(url) do |doc|
+        break if no_index?(@crawler.last_response, doc)
+
         result = block_given? ? yield(doc) : true
-
+        break if doc.empty? || result == SKIP_UPSERT
+
+        upsert_doc(doc)
       end
 
-
+      upsert_url_and_redirects(url)
 
       ext_urls = document&.external_links
-      if insert_externals && ext_urls
-        num_inserted_urls = write_urls_to_db(ext_urls)
-        Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
-      end
+      upsert_external_urls(ext_urls) if insert_externals && ext_urls
 
       nil
     end
@@ -210,10 +252,11 @@ for the site: #{url}")
     end
 
     # Write the doc to the DB. Note that the unique url index on the documents
-    # collection deliberately prevents duplicate inserts.
+    # collection deliberately prevents duplicate inserts. If the document
+    # already exists, then it will be updated in the DB.
     #
     # @param doc [Wgit::Document] The document to write to the DB.
-    def
+    def upsert_doc(doc)
       if @db.upsert(doc)
         Wgit.logger.info("Saved document for url: #{doc.url}")
       else
@@ -221,35 +264,105 @@ for the site: #{url}")
       end
     end
 
-    #
-    # collection deliberately prevents duplicate inserts.
+    # Upsert the url and its redirects, setting all to crawled = true.
     #
-    # @param
-    # @return [Integer] The number of
-    def
-
+    # @param url [Wgit::Url] The url to write to the DB.
+    # @return [Integer] The number of upserted urls (url + redirect urls).
+    def upsert_url_and_redirects(url)
+      url.crawled = true unless url.crawled?
 
-
+      # Upsert the url and any url redirects, setting them as crawled also.
+      @db.bulk_upsert(url.redirects_journey)
+    end
 
-
-
-
-
-
+    # Write the external urls to the DB. For any external url, its origin will
+    # be inserted e.g. if the external url is http://example.com/contact then
+    # http://example.com will be inserted into the database. Note that the
+    # unique url index on the urls collection deliberately prevents duplicate
+    # inserts.
+    #
+    # @param urls [Array<Wgit::Url>] The external urls to write to the DB.
+    # @return [Integer] The number of upserted urls.
+    def upsert_external_urls(urls)
+      urls = urls
+        .reject(&:invalid?)
+        .map(&:to_origin)
+        .uniq
+      return 0 if urls.empty?
+
+      count = @db.bulk_upsert(urls)
+      Wgit.logger.info("Saved #{count} external urls")
+
+      count
+    end
+
+    private
+
+    # Crawls and parses robots.txt file (if found). Returns the parser or nil.
+    def parse_robots_txt(url)
+      return nil if ENV[WGIT_IGNORE_ROBOTS_TXT]
+
+      robots_url = url.to_origin.join('/robots.txt')
 
-
-        count += 1
+      Wgit.logger.info("Crawling for robots.txt: #{robots_url}")
 
-
-
-
+      doc = @crawler.crawl_url(robots_url)
+      return nil if !@crawler.last_response.ok? || doc.empty?
+
+      parser = Wgit::RobotsParser.new(doc.content)
+
+      Wgit.logger.info("robots.txt allow paths: #{parser.allow_paths}")
+      Wgit.logger.info("robots.txt disallow paths: #{parser.disallow_paths}")
+      if parser.no_index?
+        Wgit.logger.info('robots.txt has banned wgit indexing, skipping')
       end
 
-
+      parser
+    end
+
+    # Takes the user defined allow/disallow_paths and merges robots paths
+    # into them. The allow/disallow_paths vars each can be of type nil, String,
+    # Enumerable<String>.
+    def merge_paths(parser, allow_paths, disallow_paths)
+      return allow_paths, disallow_paths unless parser&.rules?
+
+      allow = allow_paths || []
+      allow = [allow] unless allow.is_a?(Enumerable)
+
+      disallow = disallow_paths || []
+      disallow = [disallow] unless disallow.is_a?(Enumerable)
+
+      allow.concat(parser.allow_paths)
+      disallow.concat(parser.disallow_paths)
+
+      [allow, disallow]
+    end
+
+    # Returns true if url is included in the given paths.
+    def contains_path?(paths, url)
+      paths.any? { |path| Wgit::Url.new(path).to_path == url.to_path }
+    end
+
+    # Returns if the last_response or doc #no_index? is true or not.
+    def no_index?(last_response, doc)
+      return false if ENV[WGIT_IGNORE_ROBOTS_TXT]
+
+      url = last_response.url.to_s
+      if last_response.no_index?
+        Wgit.logger.info("Skipping page due to no-index response header: #{url}")
+        return true
+      end
+
+      if doc&.no_index?
+        Wgit.logger.info("Skipping page due to no-index HTML meta tag: #{url}")
+        return true
+      end
+
+      false
    end
 
-
-
-
+    alias_method :database, :db
+    alias_method :index, :index_urls
+    alias_method :index_r, :index_site
   end
 end
data/lib/wgit/logger.rb
CHANGED
@@ -2,7 +2,7 @@
 
 # FYI: The default logger is set at the bottom of this file.
 
-require
+require "logger"
 
 module Wgit
   # The Logger instance used by Wgit. Set your own custom logger after
@@ -28,7 +28,7 @@ module Wgit
   #
   # @return [Logger] The default Logger instance.
   def self.default_logger
-    logger = Logger.new(
+    logger = Logger.new($stdout, progname: "wgit", level: :info)
     logger.formatter = proc do |_severity, _datetime, progname, msg|
       "[#{progname}] #{msg}\n"
     end
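
The comment in the second hunk ("Set your own custom logger after...") implies a `Wgit.logger=` writer; a brief hedged sketch of swapping the default `$stdout` logger:

```ruby
# Hedged sketch: replace the default "[wgit] <msg>" $stdout logger.
# Assumes the Wgit.logger= writer implied by the comment in the hunk above.
require 'wgit'
require 'logger'

Wgit.logger = Logger.new('wgit.log', progname: 'wgit', level: Logger::DEBUG)
Wgit.logger.info('crawl starting') # now written to wgit.log
```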
data/lib/wgit/model.rb
ADDED
@@ -0,0 +1,164 @@
+# frozen_string_literal: true
+
+require_relative "./utils"
+
+module Wgit
+  # Module used to build the Database collection objects, forming a data model.
+  # The models produced are Hash like and therefore DB agnostic. Each model
+  # will contain a unique field used for searching and avoiding duplicates,
+  # this is typically a `url` field. Also contained in the model are the
+  # search fields used in Database and Document #search calls.
+  module Model
+    # The default search fields used in Database and Document #search calls.
+    # The number of matches for each field is multiplied by the field weight,
+    # the total is the search score, used to sort the search results.
+    # Call Wgit::Model.set_default_search_fields` to revert to default.
+    DEFAULT_SEARCH_FIELDS = {
+      title: 2,
+      description: 2,
+      keywords: 2,
+      text: 1
+    }.freeze
+
+    # The search fields used in Database and Document #search calls.
+    # The number of matches for each field is multiplied by the field weight,
+    # the total is the search score, used to sort the search results.
+    # Call Wgit::Model.set_default_search_fields` to revert to default.
+    @search_fields = DEFAULT_SEARCH_FIELDS
+
+    # Whether or not to include the Document#html in the #document model.
+    @include_doc_html = false
+
+    # Whether or not to include the Document#score in the #document model.
+    @include_doc_score = false
+
+    class << self
+      # The search fields used in Database and Document #search calls.
+      # A custom setter method is also provided for changing these fields.
+      attr_reader :search_fields
+
+      # Whether or not to include the Document#html in the #document model.
+      attr_accessor :include_doc_html
+
+      # Whether or not to include the Document#score in the #document model.
+      attr_accessor :include_doc_score
+    end
+
+    # Sets the search fields used in Database and Document #search calls.
+    #
+    # You can pass the fields as an Array of Symbols which gives each field a
+    # weight of 1 meaning all fields are considered of equal value. Or you can
+    # pass a Hash of Symbol => Int and specify the weights yourself, allowing
+    # you to customise the search rankings.
+    #
+    # Use like:
+    # ```
+    # Wgit::Model.set_search_fields [:title, :text], db
+    # => { title: 1, text: 1 }
+    # Wgit::Model.set_search_fields {title: 2, text: 1}, db
+    # => { title: 2, text: 1 }
+    # ```
+    #
+    # If the given db (database) param responds to #search_fields= then it will
+    # be called and given the fields to set. This should perform whatever the
+    # database adapter needs in order to search using the given fields e.g.
+    # creating a search index. Calling the DB enables the search_fields to be
+    # set globally within Wgit by one method call, this one.
+    #
+    # @param fields [Array<Symbol>, Hash<Symbol, Integer>] The field names or
+    #   the field names with their coresponding search weights.
+    # @param db [Wgit::Database::DatabaseAdapter] A connected db instance. If
+    #   db responds to #search_fields=, it will be called and given the fields.
+    # @raise [StandardError] If fields is of an incorrect type.
+    # @return [Hash<Symbol, Integer>] The fields and their weights.
+    def self.set_search_fields(fields, db = nil)
+      # We need a Hash of fields => weights (Symbols => Integers).
+      case fields
+      when Array # of Strings/Symbols.
+        fields = fields.map { |field| [field.to_sym, 1] }
+      when Hash # of Strings/Symbols and Integers.
+        fields = fields.map { |field, weight| [field.to_sym, weight.to_i] }
+      else
+        raise "fields must be an Array or Hash, not a #{fields.class}"
+      end
+
+      @search_fields = fields.to_h
+      db.search_fields = @search_fields if db.respond_to?(:search_fields=)
+
+      @search_fields
+    end
+
+    # Sets the search fields used in Database and Document #search calls.
+    #
+    # If the given db (database) param responds to #search_fields= then it will
+    # be called and given the fields to set. This should perform whatever the
+    # database adapter needs in order to search using the given fields e.g.
+    # creating a search index. Calling the DB enables the search_fields to be
+    # set globally within Wgit by one method call, this one.
+    #
+    # @param db [Wgit::Database::DatabaseAdapter] A connected db instance. If
+    #   db responds to #search_fields=, it will be called and given the fields.
+    # @return [Hash<Symbol, Integer>] The fields and their weights.
+    def self.set_default_search_fields(db = nil)
+      set_search_fields(DEFAULT_SEARCH_FIELDS, db)
+    end
+
+    # The data model for a Wgit::Url collection object and for an embedded
+    # 'url' inside a Wgit::Document collection object.
+    #
+    # The unique field for this model is `model['url']`.
+    #
+    # @param url [Wgit::Url] The Url data object.
+    # @return [Hash] The URL model ready for DB insertion.
+    def self.url(url)
+      raise "url must respond_to? :to_h" unless url.respond_to?(:to_h)
+
+      model = url.to_h
+      select_bson_types(model)
+    end
+
+    # The data model for a Wgit::Document collection object.
+    #
+    # The unique field for this model is `model['url']['url']`.
+    #
+    # @param doc [Wgit::Document] The Document data object.
+    # @return [Hash] The Document model ready for DB insertion.
+    def self.document(doc)
+      raise "doc must respond_to? :to_h" unless doc.respond_to?(:to_h)
+
+      model = doc.to_h(
+        include_html: @include_doc_html, include_score: @include_doc_score
+      )
+      model["url"] = url(doc.url) # Expand Url String into full object.
+
+      select_bson_types(model)
+    end
+
+    # Common fields when inserting a record into the DB.
+    #
+    # @return [Hash] Insertion fields common to all models.
+    def self.common_insert_data
+      {
+        date_added: Wgit::Utils.time_stamp,
+        date_modified: Wgit::Utils.time_stamp
+      }
+    end
+
+    # Common fields when updating a record in the DB.
+    #
+    # @return [Hash] Update fields common to all models.
+    def self.common_update_data
+      {
+        date_modified: Wgit::Utils.time_stamp
+      }
+    end
+
+    # Returns the model having removed non bson types (for use with MongoDB).
+    #
+    # @param model_hash [Hash] The model Hash to sanitize.
+    # @return [Hash] The model Hash with non bson types removed.
+    def self.select_bson_types(model_hash)
+      model_hash.select { |_k, v| v.respond_to?(:bson_type) }
+    end
+  end
+end
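
The `set_search_fields` comment block above already shows the intended call styles; for completeness, a hedged sketch of tuning the model from application code, following the methods added in this new file:

```ruby
# Hedged sketch based on the Wgit::Model API added above.
require 'wgit'

# An Array gives every field a weight of 1...
Wgit::Model.set_search_fields(%i[title text])        # => { title: 1, text: 1 }

# ...while a Hash sets custom weights. Pass a connected DatabaseAdapter as the
# second argument if it should react (via #search_fields=), e.g. to rebuild a
# search index.
Wgit::Model.set_search_fields({ title: 3, text: 1 }) # => { title: 3, text: 1 }

Wgit::Model.include_doc_html = true   # include Document#html in the document model
Wgit::Model.set_default_search_fields # revert to DEFAULT_SEARCH_FIELDS
```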
data/lib/wgit/response.rb
CHANGED
@@ -27,18 +27,25 @@ module Wgit
 
     # Defaults some values and returns a "blank" Wgit::Response object.
     def initialize
-      @body =
+      @body = ""
       @headers = {}
       @redirections = {}
       @total_time = 0.0
     end
 
+    # Overrides String#inspect to shorten the printed output of a Response.
+    #
+    # @return [String] A short textual representation of this Response.
+    def inspect
+      "#<Wgit::Response url=\"#{@url}\" status=#{status}>"
+    end
+
     # Adds time to @total_time (incrementally).
     #
     # @param time [Float] The time to add to @total_time.
     # @return [Float] @total_time's new value.
     def add_total_time(time)
-      @total_time +=
+      @total_time += time || 0.0
     end
 
     # Sets the HTML response body.
@@ -46,7 +53,7 @@ module Wgit
     # @param str [String] The new HTML body.
     # @return [String] @body's new value.
     def body=(str)
-      @body =
+      @body = str || ""
     end
 
     # Returns the HTML response body or nil (if it's empty).
@@ -74,10 +81,7 @@ module Wgit
         return
       end
 
-      @headers = headers.
-        k = k.downcase.gsub('-', '_').to_sym
-        [k, v]
-      end.to_h
+      @headers = headers.transform_keys { |k| k.downcase.gsub("-", "_").to_sym }
     end
 
     # Returns whether or not the response is 404 Not Found.
@@ -134,11 +138,19 @@ module Wgit
       @status.positive?
     end
 
-
-
-
-
-
-
+    # Returns whether or not Wgit is banned from indexing this site.
+    #
+    # @return [Boolean] True if Wgit should not index this site, false
+    #   otherwise.
+    def no_index?
+      headers.fetch(:x_robots_tag, "").downcase.strip == "noindex"
+    end
+
+    alias_method :code, :status
+    alias_method :content, :body
+    alias_method :crawl_duration, :total_time
+    alias_method :to_s, :body
+    alias_method :redirects, :redirections
+    alias_method :length, :size
   end
 end