wgit 0.10.8 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +72 -1
- data/CODE_OF_CONDUCT.md +1 -1
- data/CONTRIBUTING.md +2 -2
- data/README.md +24 -20
- data/bin/wgit +75 -19
- data/lib/wgit/assertable.rb +33 -6
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +102 -37
- data/lib/wgit/database/adapters/in_memory.rb +204 -0
- data/lib/wgit/database/adapters/mongo_db.rb +627 -0
- data/lib/wgit/database/database.rb +18 -651
- data/lib/wgit/database/database_adapter.rb +147 -0
- data/lib/wgit/document.rb +222 -98
- data/lib/wgit/document_extractors.rb +16 -10
- data/lib/wgit/dsl.rb +74 -81
- data/lib/wgit/html_to_text.rb +277 -0
- data/lib/wgit/indexer.rb +184 -71
- data/lib/wgit/logger.rb +2 -2
- data/lib/wgit/model.rb +164 -0
- data/lib/wgit/response.rb +25 -13
- data/lib/wgit/robots_parser.rb +193 -0
- data/lib/wgit/url.rb +150 -90
- data/lib/wgit/utils.rb +200 -37
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +18 -13
- metadata +56 -43
- data/lib/wgit/database/model.rb +0 -60
data/lib/wgit/utils.rb
CHANGED
@@ -18,17 +18,15 @@ module Wgit
|
|
18
18
|
# keys.
|
19
19
|
# @return [Hash] A Hash created from obj's instance vars and values.
|
20
20
|
def self.to_h(obj, ignore: [], use_strings_as_keys: true)
|
21
|
-
hash
|
21
|
+
obj.instance_variables.reduce({}) do |hash, var|
|
22
|
+
next hash if ignore.include?(var.to_s)
|
22
23
|
|
23
|
-
|
24
|
-
next if ignore.include?(var.to_s)
|
25
|
-
|
26
|
-
key = var.to_s[1..-1] # Remove the @ prefix.
|
24
|
+
key = var.to_s[1..] # Remove the @ prefix.
|
27
25
|
key = key.to_sym unless use_strings_as_keys
|
28
26
|
hash[key] = obj.instance_variable_get(var)
|
29
|
-
end
|
30
27
|
|
31
|
-
|
28
|
+
hash
|
29
|
+
end
|
32
30
|
end
|
33
31
|
|
34
32
|
# An improved :each method which supports both singleton and Enumerable
|
@@ -37,9 +35,9 @@ module Wgit
|
|
37
35
|
# @yield [el] Gives each element (Object) of obj_or_objects if it's
|
38
36
|
# Enumerable, otherwise obj_or_objs itself is given.
|
39
37
|
# @return [Object] The obj_or_objs parameter is returned.
|
40
|
-
def self.each(obj_or_objs)
|
38
|
+
def self.each(obj_or_objs, &block)
|
41
39
|
if obj_or_objs.respond_to?(:each)
|
42
|
-
obj_or_objs.each
|
40
|
+
obj_or_objs.each(&block)
|
43
41
|
else
|
44
42
|
yield(obj_or_objs)
|
45
43
|
end
|
@@ -127,96 +125,261 @@ module Wgit
|
|
127
125
|
end
|
128
126
|
|
129
127
|
# Prints out the search results in a search engine like format.
|
130
|
-
# The format for each result looks like:
|
131
|
-
#
|
132
|
-
# Title
|
133
128
|
#
|
134
|
-
#
|
129
|
+
# The given results should be matching documents from a DB and should have
|
130
|
+
# `doc.search_text!` called for each document - to turn doc.text into only
|
131
|
+
# matching text, which this method uses.
|
135
132
|
#
|
136
|
-
#
|
133
|
+
# The format for each result looks something like:
|
137
134
|
#
|
135
|
+
# ```
|
136
|
+
# Title
|
137
|
+
# Keywords (if there are some)
|
138
|
+
# Text Snippet (formatted to show the searched for query)
|
138
139
|
# URL
|
139
|
-
#
|
140
|
+
# Score (if include_score: true)
|
140
141
|
# <empty_line_separator>
|
142
|
+
# ```
|
141
143
|
#
|
142
144
|
# @param results [Array<Wgit::Document>] Array of Wgit::Document's which
|
143
145
|
# each have had #search!(query) called (to update its @text with the
|
144
146
|
# search results). The first @text sentence gets printed.
|
145
147
|
# @param keyword_limit [Integer] The max amount of keywords to be
|
146
148
|
# outputted to the stream.
|
149
|
+
# @param include_score [Boolean] Whether or not to puts the document score.
|
147
150
|
# @param stream [#puts] Any object that respond_to?(:puts). It is used
|
148
151
|
# to output text somewhere e.g. a file or STDERR.
|
149
152
|
# @return [Integer] The number of results.
|
150
|
-
def self.
|
153
|
+
def self.pprint_top_search_results(
|
154
|
+
results, keyword_limit: 5, include_score: false, stream: $stdout
|
155
|
+
)
|
151
156
|
raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
|
152
157
|
|
153
158
|
results.each do |doc|
|
154
|
-
title =
|
159
|
+
title = doc.title || '<no title>'
|
155
160
|
keywords = doc.keywords&.take(keyword_limit)&.join(', ')
|
156
161
|
sentence = doc.text.first
|
157
162
|
url = doc.url
|
163
|
+
score = doc.score
|
158
164
|
|
159
165
|
stream.puts title
|
160
166
|
stream.puts keywords if keywords
|
161
167
|
stream.puts sentence
|
162
168
|
stream.puts url
|
169
|
+
stream.puts score if include_score
|
163
170
|
stream.puts
|
164
171
|
end
|
165
172
|
|
166
173
|
results.size
|
167
174
|
end
|
168
175
|
|
176
|
+
# Prints out the search results listing all of the matching text in each
|
177
|
+
# document.
|
178
|
+
#
|
179
|
+
# The given results should be matching documents from a DB and should have
|
180
|
+
# `doc.search_text!` called for each document - to turn doc.text into only
|
181
|
+
# matching text, which this method uses.
|
182
|
+
#
|
183
|
+
# The format for each result looks something like:
|
184
|
+
#
|
185
|
+
# ```
|
186
|
+
# Title
|
187
|
+
# Keywords (if there are some)
|
188
|
+
# URL
|
189
|
+
# Score (if include_score: true)
|
190
|
+
#
|
191
|
+
# "<text_snippet_1>"
|
192
|
+
# "<text_snippet_2>"
|
193
|
+
# ...
|
194
|
+
#
|
195
|
+
# <separator>
|
196
|
+
# ```
|
197
|
+
#
|
198
|
+
# @param results [Array<Wgit::Document>] Array of Wgit::Document's which
|
199
|
+
# each have had #search!(query) called (to update its @text with the
|
200
|
+
# search results). Every @text sentence gets printed.
|
201
|
+
# @param keyword_limit [Integer] The max amount of keywords to be
|
202
|
+
# outputted to the stream.
|
203
|
+
# @param include_score [Boolean] Whether or not to puts the document score.
|
204
|
+
# @param stream [#puts] Any object that respond_to?(:puts). It is used
|
205
|
+
# to output text somewhere e.g. a file or STDERR.
|
206
|
+
# @return [Integer] The number of results.
|
207
|
+
def self.pprint_all_search_results(
|
208
|
+
results, keyword_limit: 5, include_score: false, stream: $stdout
|
209
|
+
)
|
210
|
+
raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
|
211
|
+
|
212
|
+
results.each_with_index do |doc, i|
|
213
|
+
last_result = i == (results.size-1)
|
214
|
+
|
215
|
+
title = doc.title || '<no title>'
|
216
|
+
keywords = doc.keywords&.take(keyword_limit)&.join(', ')
|
217
|
+
url = doc.url
|
218
|
+
score = doc.score
|
219
|
+
|
220
|
+
stream.puts title
|
221
|
+
stream.puts keywords if keywords
|
222
|
+
stream.puts url
|
223
|
+
stream.puts score if include_score
|
224
|
+
stream.puts
|
225
|
+
doc.text.each { |text| stream.puts text }
|
226
|
+
stream.puts
|
227
|
+
stream.puts "-----" unless last_result
|
228
|
+
stream.puts unless last_result
|
229
|
+
end
|
230
|
+
|
231
|
+
results.size
|
232
|
+
end
|
233
|
+
|
169
234
|
# Sanitises the obj to make it uniform by calling the correct sanitize_*
|
170
|
-
# method for its type e.g. if obj.is_a? String then
|
171
|
-
# not in the case statement will be ignored and returned as is.
|
235
|
+
# method for its type e.g. if obj.is_a? String then sanitize_str(obj) is called.
|
236
|
+
# Any type not in the case statement will be ignored and returned as is.
|
237
|
+
# Call this method if unsure what obj's type is.
|
172
238
|
#
|
173
239
|
# @param obj [Object] The object to be sanitized.
|
174
240
|
# @param encode [Boolean] Whether or not to encode to UTF-8 replacing
|
175
241
|
# invalid characters.
|
176
|
-
# @return [Object] The sanitized obj
|
242
|
+
# @return [Object] The sanitized obj.
|
177
243
|
def self.sanitize(obj, encode: true)
|
178
244
|
case obj
|
245
|
+
when Wgit::Url
|
246
|
+
sanitize_url(obj, encode:)
|
179
247
|
when String
|
180
|
-
sanitize_str(obj, encode:
|
248
|
+
sanitize_str(obj, encode:)
|
181
249
|
when Array
|
182
|
-
sanitize_arr(obj, encode:
|
250
|
+
sanitize_arr(obj, encode:)
|
183
251
|
else
|
184
252
|
obj
|
185
253
|
end
|
186
254
|
end
|
187
255
|
|
256
|
+
# Sanitises a Wgit::Url to make it uniform. First sanitizes the Url as a
|
257
|
+
# String before replacing the Url value with the sanitized version. This
|
258
|
+
# method therefore modifies the given url param and also returns it.
|
259
|
+
#
|
260
|
+
# @param url [Wgit::Url] The Wgit::Url to sanitize. url is modified.
|
261
|
+
# @param encode [Boolean] Whether or not to encode to UTF-8 replacing
|
262
|
+
# invalid characters.
|
263
|
+
# @return [Wgit::Url] The sanitized url, which is also modified.
|
264
|
+
def self.sanitize_url(url, encode: true)
|
265
|
+
str = sanitize_str(url.to_s, encode:)
|
266
|
+
url.replace(str)
|
267
|
+
end
|
268
|
+
|
188
269
|
# Sanitises a String to make it uniform. Strips any leading/trailing white
|
189
270
|
# space. Also applies UTF-8 encoding (replacing invalid characters) if
|
190
271
|
# `encode: true`.
|
191
272
|
#
|
192
|
-
# @param str [String] The String to sanitize. str is modified.
|
273
|
+
# @param str [String] The String to sanitize. str is not modified.
|
193
274
|
# @param encode [Boolean] Whether or not to encode to UTF-8 replacing
|
194
275
|
# invalid characters.
|
195
|
-
# @return [String] The sanitized str
|
276
|
+
# @return [String] The sanitized str.
|
196
277
|
def self.sanitize_str(str, encode: true)
|
197
|
-
|
198
|
-
str.encode!('UTF-8', undef: :replace, invalid: :replace) if encode
|
199
|
-
str.strip!
|
200
|
-
end
|
278
|
+
return str unless str.is_a?(String)
|
201
279
|
|
202
|
-
str
|
280
|
+
str = str.encode('UTF-8', undef: :replace, invalid: :replace) if encode
|
281
|
+
str.strip
|
203
282
|
end
|
204
283
|
|
205
284
|
# Sanitises an Array to make it uniform. Removes empty Strings and nils,
|
206
285
|
# processes non empty Strings using Wgit::Utils.sanitize and removes
|
207
286
|
# duplicates.
|
208
287
|
#
|
209
|
-
# @param arr [Enumerable] The Array to sanitize. arr is modified.
|
210
|
-
# @return [Enumerable] The sanitized arr
|
288
|
+
# @param arr [Enumerable] The Array to sanitize. arr is not modified.
|
289
|
+
# @return [Enumerable] The sanitized arr.
|
211
290
|
def self.sanitize_arr(arr, encode: true)
|
212
|
-
|
213
|
-
arr.map! { |str| sanitize(str, encode: encode) }
|
214
|
-
arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
|
215
|
-
arr.compact!
|
216
|
-
arr.uniq!
|
217
|
-
end
|
291
|
+
return arr unless arr.is_a?(Array)
|
218
292
|
|
219
293
|
arr
|
294
|
+
.map { |str| sanitize(str, encode:) }
|
295
|
+
.reject { |str| str.is_a?(String) && str.empty? }
|
296
|
+
.compact
|
297
|
+
.uniq
|
298
|
+
end
|
299
|
+
|
300
|
+
# Build a regular expression from a query string, for searching text with.
|
301
|
+
#
|
302
|
+
# All searches using this regex are always whole word based while whole
|
303
|
+
# sentence searches are configurable using the whole_sentence: param. For
|
304
|
+
# example:
|
305
|
+
#
|
306
|
+
# ```
|
307
|
+
# text = "hello world"
|
308
|
+
# query = "world hello", whole_sentence: true # => No match
|
309
|
+
# query = "world hello", whole_sentence: false # => Match
|
310
|
+
# query = "he" # => Never matches
|
311
|
+
# ```
|
312
|
+
#
|
313
|
+
# @param query [String, Regexp] The query string to build a regex from.
|
314
|
+
# @param case_sensitive [Boolean] Whether character case must match.
|
315
|
+
# @param whole_sentence [Boolean] Whether the query's words must match
|
316
|
+
# together, in order. If false, each word is searched for separately.
|
317
|
+
# @return [Regexp] The regex with which to search text.
|
318
|
+
def self.build_search_regex(
|
319
|
+
query, case_sensitive: false, whole_sentence: true
|
320
|
+
)
|
321
|
+
return query if query.is_a?(Regexp)
|
322
|
+
|
323
|
+
# query: "hello world", whole_sentence: false produces:
|
324
|
+
# (?<=^|\s|[^a-zA-Z0-9])hello(?=$|\s|[^a-zA-Z0-9])|(?<=^|\s|[^a-zA-Z0-9])world(?=$|\s|[^a-zA-Z0-9])
|
325
|
+
|
326
|
+
sep = whole_sentence ? " " : "|"
|
327
|
+
segs = query.split(" ").map do |word|
|
328
|
+
word = Regexp.escape(word)
|
329
|
+
"(?<=^|\\s|[^a-zA-Z0-9])#{word}(?=$|\\s|[^a-zA-Z0-9])"
|
330
|
+
end
|
331
|
+
query = segs.join(sep)
|
332
|
+
|
333
|
+
Regexp.new(query, !case_sensitive)
|
334
|
+
end
|
335
|
+
|
336
|
+
# Pretty prints a log statement, used for debugging purposes.
|
337
|
+
#
|
338
|
+
# Use like:
|
339
|
+
#
|
340
|
+
# ```
|
341
|
+
# Wgit::Utils.pprint 1, include_html: include_html, ignore: ignore_vars
|
342
|
+
# ```
|
343
|
+
#
|
344
|
+
# Which produces a log like:
|
345
|
+
#
|
346
|
+
# ```
|
347
|
+
# DEBUG_1 - include_html: true | ignore: ['@html', '@parser']
|
348
|
+
# ```
|
349
|
+
#
|
350
|
+
# @param identifier [#to_s] A log identifier e.g. "START" or 1 etc.
|
351
|
+
# @param display [Boolean] Setting as false will cause a noop, useful for
|
352
|
+
# switching off several/all pprint statements at once e.g. via ENV var.
|
353
|
+
# @param stream [#puts] Any object that respond_to? :puts and :print. It is
|
354
|
+
# used to output the log text somewhere e.g. a file or STDERR.
|
355
|
+
# @param prefix [String] The log prefix, useful for visibility/greping.
|
356
|
+
# @param new_line [Boolean] Whether or not to use a new line (\n) as the
|
357
|
+
# separator.
|
358
|
+
# @param vars [Hash<#inspect, #inspect>] The vars to inspect in the log.
|
359
|
+
def self.pprint(identifier, display: true, stream: $stdout, prefix: 'DEBUG', new_line: false, **vars)
|
360
|
+
return unless display
|
361
|
+
|
362
|
+
sep1 = new_line ? "\n" : ' - '
|
363
|
+
sep1 = '' if vars.empty?
|
364
|
+
sep2 = new_line ? "\n" : ' | '
|
365
|
+
|
366
|
+
stream.print "\n#{prefix}_#{identifier}#{sep1}"
|
367
|
+
|
368
|
+
vars.each_with_index do |arr, i|
|
369
|
+
is_last_item = (i + 1) == vars.size
|
370
|
+
sep3 = sep2
|
371
|
+
sep3 = new_line ? "\n" : '' if is_last_item
|
372
|
+
k, v = arr
|
373
|
+
|
374
|
+
stream.print "#{k}: #{v.inspect}#{sep3}"
|
375
|
+
end
|
376
|
+
|
377
|
+
stream.puts "\n"
|
378
|
+
stream.puts "\n" unless new_line
|
379
|
+
|
380
|
+
nil
|
220
381
|
end
|
382
|
+
|
383
|
+
self.singleton_class.alias_method :pprint_search_results, :pprint_top_search_results
|
221
384
|
end
|
222
385
|
end
|
data/lib/wgit/version.rb
CHANGED
data/lib/wgit.rb
CHANGED
@@ -1,16 +1,21 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require_relative
|
4
|
-
require_relative
|
5
|
-
require_relative
|
6
|
-
require_relative
|
7
|
-
require_relative
|
8
|
-
require_relative
|
9
|
-
require_relative
|
10
|
-
require_relative
|
11
|
-
require_relative
|
12
|
-
require_relative
|
13
|
-
require_relative
|
14
|
-
require_relative
|
15
|
-
require_relative
|
3
|
+
require_relative "wgit/version"
|
4
|
+
require_relative "wgit/logger"
|
5
|
+
require_relative "wgit/assertable"
|
6
|
+
require_relative "wgit/utils"
|
7
|
+
require_relative "wgit/url"
|
8
|
+
require_relative "wgit/html_to_text"
|
9
|
+
require_relative "wgit/document"
|
10
|
+
require_relative "wgit/document_extractors"
|
11
|
+
require_relative "wgit/crawler"
|
12
|
+
require_relative "wgit/model"
|
13
|
+
require_relative "wgit/database/database"
|
14
|
+
require_relative "wgit/database/database_adapter"
|
15
|
+
require_relative "wgit/database/adapters/mongo_db"
|
16
|
+
require_relative "wgit/database/adapters/in_memory"
|
17
|
+
require_relative "wgit/robots_parser"
|
18
|
+
require_relative "wgit/indexer"
|
19
|
+
require_relative "wgit/dsl"
|
20
|
+
require_relative "wgit/base"
|
16
21
|
# require_relative 'wgit/core_ext' - Must be explicitly required.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.12.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-10-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -16,188 +16,196 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '2.
|
19
|
+
version: '2.8'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '2.
|
26
|
+
version: '2.8'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: base64
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '2
|
33
|
+
version: '0.2'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '2
|
40
|
+
version: '0.2'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: ferrum
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '0.14'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
54
|
+
version: '0.14'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: mongo
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
61
|
+
version: '2.19'
|
62
62
|
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
68
|
+
version: '2.19'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: nokogiri
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
75
|
+
version: '1.15'
|
76
76
|
type: :runtime
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: '
|
82
|
+
version: '1.15'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: typhoeus
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '1.4'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '1.4'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: byebug
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
86
100
|
requirements:
|
87
101
|
- - "~>"
|
88
102
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
103
|
+
version: '11.1'
|
90
104
|
type: :development
|
91
105
|
prerelease: false
|
92
106
|
version_requirements: !ruby/object:Gem::Requirement
|
93
107
|
requirements:
|
94
108
|
- - "~>"
|
95
109
|
- !ruby/object:Gem::Version
|
96
|
-
version: '
|
110
|
+
version: '11.1'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: dotenv
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
100
114
|
requirements:
|
101
115
|
- - "~>"
|
102
116
|
- !ruby/object:Gem::Version
|
103
|
-
version: '2.
|
117
|
+
version: '2.8'
|
104
118
|
type: :development
|
105
119
|
prerelease: false
|
106
120
|
version_requirements: !ruby/object:Gem::Requirement
|
107
121
|
requirements:
|
108
122
|
- - "~>"
|
109
123
|
- !ruby/object:Gem::Version
|
110
|
-
version: '2.
|
124
|
+
version: '2.8'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: maxitest
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
114
128
|
requirements:
|
115
129
|
- - "~>"
|
116
130
|
- !ruby/object:Gem::Version
|
117
|
-
version: '
|
131
|
+
version: '5.4'
|
118
132
|
type: :development
|
119
133
|
prerelease: false
|
120
134
|
version_requirements: !ruby/object:Gem::Requirement
|
121
135
|
requirements:
|
122
136
|
- - "~>"
|
123
137
|
- !ruby/object:Gem::Version
|
124
|
-
version: '
|
138
|
+
version: '5.4'
|
125
139
|
- !ruby/object:Gem::Dependency
|
126
140
|
name: pry
|
127
141
|
requirement: !ruby/object:Gem::Requirement
|
128
142
|
requirements:
|
129
143
|
- - "~>"
|
130
144
|
- !ruby/object:Gem::Version
|
131
|
-
version: '0.
|
145
|
+
version: '0.14'
|
132
146
|
type: :development
|
133
147
|
prerelease: false
|
134
148
|
version_requirements: !ruby/object:Gem::Requirement
|
135
149
|
requirements:
|
136
150
|
- - "~>"
|
137
151
|
- !ruby/object:Gem::Version
|
138
|
-
version: '0.
|
152
|
+
version: '0.14'
|
139
153
|
- !ruby/object:Gem::Dependency
|
140
154
|
name: rubocop
|
141
155
|
requirement: !ruby/object:Gem::Requirement
|
142
156
|
requirements:
|
143
157
|
- - "~>"
|
144
158
|
- !ruby/object:Gem::Version
|
145
|
-
version: '
|
159
|
+
version: '1.57'
|
146
160
|
type: :development
|
147
161
|
prerelease: false
|
148
162
|
version_requirements: !ruby/object:Gem::Requirement
|
149
163
|
requirements:
|
150
164
|
- - "~>"
|
151
165
|
- !ruby/object:Gem::Version
|
152
|
-
version: '
|
166
|
+
version: '1.57'
|
153
167
|
- !ruby/object:Gem::Dependency
|
154
168
|
name: toys
|
155
169
|
requirement: !ruby/object:Gem::Requirement
|
156
170
|
requirements:
|
157
171
|
- - "~>"
|
158
172
|
- !ruby/object:Gem::Version
|
159
|
-
version: '0.
|
173
|
+
version: '0.15'
|
160
174
|
type: :development
|
161
175
|
prerelease: false
|
162
176
|
version_requirements: !ruby/object:Gem::Requirement
|
163
177
|
requirements:
|
164
178
|
- - "~>"
|
165
179
|
- !ruby/object:Gem::Version
|
166
|
-
version: '0.
|
180
|
+
version: '0.15'
|
167
181
|
- !ruby/object:Gem::Dependency
|
168
182
|
name: webmock
|
169
183
|
requirement: !ruby/object:Gem::Requirement
|
170
184
|
requirements:
|
171
185
|
- - "~>"
|
172
186
|
- !ruby/object:Gem::Version
|
173
|
-
version: '3.
|
187
|
+
version: '3.19'
|
174
188
|
type: :development
|
175
189
|
prerelease: false
|
176
190
|
version_requirements: !ruby/object:Gem::Requirement
|
177
191
|
requirements:
|
178
192
|
- - "~>"
|
179
193
|
- !ruby/object:Gem::Version
|
180
|
-
version: '3.
|
194
|
+
version: '3.19'
|
181
195
|
- !ruby/object:Gem::Dependency
|
182
196
|
name: yard
|
183
197
|
requirement: !ruby/object:Gem::Requirement
|
184
198
|
requirements:
|
185
|
-
- - "
|
186
|
-
- !ruby/object:Gem::Version
|
187
|
-
version: 0.9.20
|
188
|
-
- - "<"
|
199
|
+
- - "~>"
|
189
200
|
- !ruby/object:Gem::Version
|
190
|
-
version: '
|
201
|
+
version: '0.9'
|
191
202
|
type: :development
|
192
203
|
prerelease: false
|
193
204
|
version_requirements: !ruby/object:Gem::Requirement
|
194
205
|
requirements:
|
195
|
-
- - "
|
196
|
-
- !ruby/object:Gem::Version
|
197
|
-
version: 0.9.20
|
198
|
-
- - "<"
|
206
|
+
- - "~>"
|
199
207
|
- !ruby/object:Gem::Version
|
200
|
-
version: '
|
208
|
+
version: '0.9'
|
201
209
|
description: 'Wgit was primarily designed to crawl static HTML websites to index and
|
202
210
|
search their content - providing the basis of any search engine; but Wgit is suitable
|
203
211
|
for many application domains including: URL parsing, data mining and statistical
|
@@ -215,14 +223,19 @@ files:
|
|
215
223
|
- "./lib/wgit/base.rb"
|
216
224
|
- "./lib/wgit/core_ext.rb"
|
217
225
|
- "./lib/wgit/crawler.rb"
|
226
|
+
- "./lib/wgit/database/adapters/in_memory.rb"
|
227
|
+
- "./lib/wgit/database/adapters/mongo_db.rb"
|
218
228
|
- "./lib/wgit/database/database.rb"
|
219
|
-
- "./lib/wgit/database/
|
229
|
+
- "./lib/wgit/database/database_adapter.rb"
|
220
230
|
- "./lib/wgit/document.rb"
|
221
231
|
- "./lib/wgit/document_extractors.rb"
|
222
232
|
- "./lib/wgit/dsl.rb"
|
233
|
+
- "./lib/wgit/html_to_text.rb"
|
223
234
|
- "./lib/wgit/indexer.rb"
|
224
235
|
- "./lib/wgit/logger.rb"
|
236
|
+
- "./lib/wgit/model.rb"
|
225
237
|
- "./lib/wgit/response.rb"
|
238
|
+
- "./lib/wgit/robots_parser.rb"
|
226
239
|
- "./lib/wgit/url.rb"
|
227
240
|
- "./lib/wgit/utils.rb"
|
228
241
|
- "./lib/wgit/version.rb"
|
@@ -251,7 +264,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
251
264
|
requirements:
|
252
265
|
- - ">="
|
253
266
|
- !ruby/object:Gem::Version
|
254
|
-
version: '
|
267
|
+
version: '3'
|
255
268
|
- - "<"
|
256
269
|
- !ruby/object:Gem::Version
|
257
270
|
version: '4'
|
@@ -261,7 +274,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
261
274
|
- !ruby/object:Gem::Version
|
262
275
|
version: '0'
|
263
276
|
requirements: []
|
264
|
-
rubygems_version: 3.
|
277
|
+
rubygems_version: 3.5.22
|
265
278
|
signing_key:
|
266
279
|
specification_version: 4
|
267
280
|
summary: Wgit is a HTML web crawler, written in Ruby, that allows you to programmatically
|