wgit 0.10.8 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +33 -1
- data/CONTRIBUTING.md +1 -1
- data/README.md +3 -1
- data/bin/wgit +3 -1
- data/lib/wgit/assertable.rb +2 -2
- data/lib/wgit/crawler.rb +56 -34
- data/lib/wgit/database/database.rb +64 -52
- data/lib/wgit/document.rb +54 -40
- data/lib/wgit/document_extractors.rb +15 -1
- data/lib/wgit/dsl.rb +16 -20
- data/lib/wgit/indexer.rb +157 -63
- data/lib/wgit/logger.rb +1 -1
- data/lib/wgit/response.rb +21 -6
- data/lib/wgit/robots_parser.rb +193 -0
- data/lib/wgit/url.rb +115 -55
- data/lib/wgit/utils.rb +81 -28
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +1 -0
- metadata +33 -38
data/lib/wgit/url.rb
CHANGED
@@ -28,6 +28,9 @@ module Wgit
|
|
28
28
|
# The duration of the crawl for this Url (in seconds).
|
29
29
|
attr_accessor :crawl_duration
|
30
30
|
|
31
|
+
# Record the redirects from the initial Url to the final Url.
|
32
|
+
attr_reader :redirects
|
33
|
+
|
31
34
|
# Initializes a new instance of Wgit::Url which models a web based
|
32
35
|
# HTTP URL.
|
33
36
|
#
|
@@ -57,12 +60,14 @@ module Wgit
|
|
57
60
|
crawled = obj.fetch('crawled', false)
|
58
61
|
date_crawled = obj.fetch('date_crawled', nil)
|
59
62
|
crawl_duration = obj.fetch('crawl_duration', nil)
|
63
|
+
redirects = obj.fetch('redirects', {})
|
60
64
|
end
|
61
65
|
|
62
66
|
@uri = Addressable::URI.parse(url)
|
63
67
|
@crawled = crawled
|
64
68
|
@date_crawled = date_crawled
|
65
69
|
@crawl_duration = crawl_duration
|
70
|
+
@redirects = redirects || {}
|
66
71
|
|
67
72
|
super(url)
|
68
73
|
end
|
@@ -107,16 +112,6 @@ Addressable::URI::InvalidURIError")
|
|
107
112
|
nil
|
108
113
|
end
|
109
114
|
|
110
|
-
# Sets the @crawled instance var, also setting @date_crawled for
|
111
|
-
# convenience.
|
112
|
-
#
|
113
|
-
# @param bool [Boolean] True if this Url has been crawled, false otherwise.
|
114
|
-
# @return [Boolean] The value of bool having been set.
|
115
|
-
def crawled=(bool)
|
116
|
-
@crawled = bool
|
117
|
-
@date_crawled = bool ? Wgit::Utils.time_stamp : nil
|
118
|
-
end
|
119
|
-
|
120
115
|
# Overrides String#inspect to distinguish this Url from a String.
|
121
116
|
#
|
122
117
|
# @return [String] A short textual representation of this Url.
|
@@ -134,6 +129,71 @@ Addressable::URI::InvalidURIError")
|
|
134
129
|
super(new_url)
|
135
130
|
end
|
136
131
|
|
132
|
+
# Overrides String#concat which oddly returns a Wgit::Url object, and
|
133
|
+
# instead returns a String. Therefore this method works the same as if
|
134
|
+
# you call String#concat, or its alias String#+, which is desired for
|
135
|
+
# this method. If you want to join two Urls, use Wgit::Url#join method.
|
136
|
+
#
|
137
|
+
# @param other [String] The String to concat onto this one.
|
138
|
+
# @return [String] The new concatted String, not a Wgit::Url.
|
139
|
+
def concat(other)
|
140
|
+
to_s.concat(other.to_s)
|
141
|
+
end
|
142
|
+
|
143
|
+
# Sets the @crawled instance var, also setting @date_crawled for
|
144
|
+
# convenience.
|
145
|
+
#
|
146
|
+
# @param bool [Boolean] True if this Url has been crawled, false otherwise.
|
147
|
+
# @return [Boolean] The value of bool having been set.
|
148
|
+
def crawled=(bool)
|
149
|
+
@crawled = bool
|
150
|
+
@date_crawled = bool ? Wgit::Utils.time_stamp : nil
|
151
|
+
end
|
152
|
+
|
153
|
+
# Sets the @redirects instance var, mapping any Strings into Wgit::Urls.
|
154
|
+
#
|
155
|
+
# @param redirects [Hash] The redirects Hash to set for this Url.
|
156
|
+
def redirects=(redirects)
|
157
|
+
assert_type(redirects, Hash)
|
158
|
+
|
159
|
+
map_to_url = proc do |url|
|
160
|
+
Wgit::Url.new(url.to_s, crawled: @crawled, date_crawled: @date_crawled)
|
161
|
+
end
|
162
|
+
|
163
|
+
@redirects = redirects
|
164
|
+
.map { |from, to| [map_to_url.call(from), map_to_url.call(to)] }
|
165
|
+
.to_h
|
166
|
+
end
|
167
|
+
|
168
|
+
# Returns the Wgit::Url's starting with the originally requested Url to be
|
169
|
+
# crawled, followed by each redirected to Url, finishing with the final
|
170
|
+
# crawled Url e.g.
|
171
|
+
#
|
172
|
+
# Example Url redirects journey (dictated by the webserver):
|
173
|
+
#
|
174
|
+
# ```
|
175
|
+
# http://example.com => 301 to https://example.com
|
176
|
+
# https://example.com => 301 to https://example.com/
|
177
|
+
# https://example.com/ => 200 OK (no more redirects, crawl complete)
|
178
|
+
# ```
|
179
|
+
#
|
180
|
+
# Would return an Array of Wgit::Url's in the form of:
|
181
|
+
#
|
182
|
+
# ```
|
183
|
+
# %w(
|
184
|
+
# http://example.com
|
185
|
+
# https://example.com
|
186
|
+
# https://example.com/
|
187
|
+
# )
|
188
|
+
# ```
|
189
|
+
#
|
190
|
+
# @return [Array<Wgit::Url>] Each redirected to Url's finishing with the
|
191
|
+
# final (successfully) crawled Url. If no redirects took place, then just
|
192
|
+
# the originally requested Url is returned inside the Array.
|
193
|
+
def redirects_journey
|
194
|
+
[redirects.keys, self].flatten
|
195
|
+
end
|
196
|
+
|
137
197
|
# Returns true if self is a relative Url; false if absolute.
|
138
198
|
#
|
139
199
|
# An absolute URL must have a scheme prefix e.g.
|
@@ -170,7 +230,7 @@ Addressable::URI::InvalidURIError")
|
|
170
230
|
raise 'Url (self) cannot be empty' if empty?
|
171
231
|
|
172
232
|
return false if scheme_relative?
|
173
|
-
return true
|
233
|
+
return true if @uri.relative?
|
174
234
|
|
175
235
|
# Self is absolute but may be relative to the opts param e.g. host.
|
176
236
|
opts.select! { |_k, v| v }
|
@@ -226,22 +286,23 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
226
286
|
!valid?
|
227
287
|
end
|
228
288
|
|
229
|
-
#
|
230
|
-
# modified.
|
289
|
+
# Joins self and other together before returning a new Url. Self is not
|
290
|
+
# modified. Some magic occurs depending on what is being joined, see
|
291
|
+
# the source code for more information.
|
231
292
|
#
|
232
|
-
# @param other [Wgit::Url, String] The other to
|
293
|
+
# @param other [Wgit::Url, String] The other (relative) Url to join to the
|
294
|
+
# end of self.
|
233
295
|
# @return [Wgit::Url] self + separator + other, separator depends on other.
|
234
|
-
def
|
296
|
+
def join(other)
|
235
297
|
other = Wgit::Url.new(other)
|
236
298
|
raise 'other must be relative' unless other.relative?
|
237
299
|
|
238
300
|
other = other.omit_leading_slash
|
239
301
|
separator = %w[# ? .].include?(other[0]) ? '' : '/'
|
302
|
+
separator = '' if end_with?('/')
|
303
|
+
joined = self + separator + other
|
240
304
|
|
241
|
-
|
242
|
-
concatted = omit_trailing_slash.to_s + separator.to_s + other.to_s
|
243
|
-
|
244
|
-
Wgit::Url.new(concatted)
|
305
|
+
Wgit::Url.new(joined)
|
245
306
|
end
|
246
307
|
|
247
308
|
# Normalizes/escapes self and returns a new Wgit::Url. Self isn't modified.
|
@@ -257,7 +318,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
257
318
|
#
|
258
319
|
# If self is absolute then it's returned as is, making this method
|
259
320
|
# idempotent. The doc's `<base>` element is used if present, otherwise
|
260
|
-
# `doc.url` is used as the base; which is
|
321
|
+
# `doc.url` is used as the base; which is joined with self.
|
261
322
|
#
|
262
323
|
# Typically used to build an absolute link obtained from a document.
|
263
324
|
#
|
@@ -267,7 +328,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
267
328
|
#
|
268
329
|
# link.make_absolute(doc) # => "http://example.com/favicon.png"
|
269
330
|
#
|
270
|
-
# @param doc [Wgit::Document] The doc whose base Url is
|
331
|
+
# @param doc [Wgit::Document] The doc whose base Url is joined with
|
271
332
|
# self.
|
272
333
|
# @raise [StandardError] If doc isn't a Wgit::Document or if `doc.base_url`
|
273
334
|
# raises an Exception.
|
@@ -279,7 +340,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
279
340
|
|
280
341
|
return prefix_scheme(doc.url.to_scheme&.to_sym) if scheme_relative?
|
281
342
|
|
282
|
-
absolute? ? self : doc.base_url(link: self).
|
343
|
+
absolute? ? self : doc.base_url(link: self).join(self)
|
283
344
|
end
|
284
345
|
|
285
346
|
# Returns self having prefixed a scheme/protocol. Doesn't modify receiver.
|
@@ -427,7 +488,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
427
488
|
return nil if path.nil? || path.empty?
|
428
489
|
return Wgit::Url.new('/') if path == '/'
|
429
490
|
|
430
|
-
Wgit::Url.new(path).
|
491
|
+
Wgit::Url.new(path).omit_leading_slash
|
431
492
|
end
|
432
493
|
|
433
494
|
# Returns the endpoint of this URL e.g. the bit after the host with any
|
@@ -439,7 +500,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
439
500
|
# an endpoint, / is returned.
|
440
501
|
def to_endpoint
|
441
502
|
endpoint = @uri.path
|
442
|
-
endpoint =
|
503
|
+
endpoint = "/#{endpoint}" unless endpoint.start_with?('/')
|
443
504
|
Wgit::Url.new(endpoint)
|
444
505
|
end
|
445
506
|
|
@@ -484,7 +545,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
484
545
|
#
|
485
546
|
# @return [Wgit::Url, nil] Containing just the extension string or nil.
|
486
547
|
def to_extension
|
487
|
-
path = to_path
|
548
|
+
path = to_path&.omit_trailing_slash
|
488
549
|
return nil unless path
|
489
550
|
|
490
551
|
segs = path.split('.')
|
@@ -530,7 +591,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
530
591
|
#
|
531
592
|
# @return [Wgit::Url] Self without a leading slash.
|
532
593
|
def omit_leading_slash
|
533
|
-
start_with?('/') ? Wgit::Url.new(self[1
|
594
|
+
start_with?('/') ? Wgit::Url.new(self[1..]) : self
|
534
595
|
end
|
535
596
|
|
536
597
|
# Returns a new Wgit::Url containing self without a trailing slash. Is
|
@@ -564,7 +625,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
564
625
|
|
565
626
|
return self if ['', '/'].include?(omit_base)
|
566
627
|
|
567
|
-
Wgit::Url.new(omit_base).
|
628
|
+
Wgit::Url.new(omit_base).omit_leading_slash
|
568
629
|
end
|
569
630
|
|
570
631
|
# Returns a new Wgit::Url with the origin (base + port) removed e.g. Given
|
@@ -579,7 +640,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
579
640
|
|
580
641
|
return self if ['', '/'].include?(omit_origin)
|
581
642
|
|
582
|
-
Wgit::Url.new(omit_origin).
|
643
|
+
Wgit::Url.new(omit_origin).omit_leading_slash
|
583
644
|
end
|
584
645
|
|
585
646
|
# Returns a new Wgit::Url with the query string portion removed e.g. Given
|
@@ -642,32 +703,31 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
642
703
|
start_with?('//')
|
643
704
|
end
|
644
705
|
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
alias sub_domain to_sub_domain
|
706
|
+
alias_method :crawled?, :crawled
|
707
|
+
alias_method :is_relative?, :relative?
|
708
|
+
alias_method :is_absolute?, :absolute?
|
709
|
+
alias_method :is_valid?, :valid?
|
710
|
+
alias_method :is_query?, :query?
|
711
|
+
alias_method :is_fragment?, :fragment?
|
712
|
+
alias_method :is_index?, :index?
|
713
|
+
alias_method :is_scheme_relative?, :scheme_relative?
|
714
|
+
alias_method :uri, :to_uri
|
715
|
+
alias_method :url, :to_url
|
716
|
+
alias_method :scheme, :to_scheme
|
717
|
+
alias_method :host, :to_host
|
718
|
+
alias_method :port, :to_port
|
719
|
+
alias_method :domain, :to_domain
|
720
|
+
alias_method :brand, :to_brand
|
721
|
+
alias_method :base, :to_base
|
722
|
+
alias_method :origin, :to_origin
|
723
|
+
alias_method :path, :to_path
|
724
|
+
alias_method :endpoint, :to_endpoint
|
725
|
+
alias_method :query, :to_query
|
726
|
+
alias_method :query_hash, :to_query_hash
|
727
|
+
alias_method :fragment, :to_fragment
|
728
|
+
alias_method :extension, :to_extension
|
729
|
+
alias_method :user, :to_user
|
730
|
+
alias_method :password, :to_password
|
731
|
+
alias_method :sub_domain, :to_sub_domain
|
672
732
|
end
|
673
733
|
end
|
data/lib/wgit/utils.rb
CHANGED
@@ -23,7 +23,7 @@ module Wgit
|
|
23
23
|
obj.instance_variables.each do |var|
|
24
24
|
next if ignore.include?(var.to_s)
|
25
25
|
|
26
|
-
key = var.to_s[1
|
26
|
+
key = var.to_s[1..] # Remove the @ prefix.
|
27
27
|
key = key.to_sym unless use_strings_as_keys
|
28
28
|
hash[key] = obj.instance_variable_get(var)
|
29
29
|
end
|
@@ -37,9 +37,9 @@ module Wgit
|
|
37
37
|
# @yield [el] Gives each element (Object) of obj_or_objects if it's
|
38
38
|
# Enumerable, otherwise obj_or_objs itself is given.
|
39
39
|
# @return [Object] The obj_or_objs parameter is returned.
|
40
|
-
def self.each(obj_or_objs)
|
40
|
+
def self.each(obj_or_objs, &block)
|
41
41
|
if obj_or_objs.respond_to?(:each)
|
42
|
-
obj_or_objs.each
|
42
|
+
obj_or_objs.each(&block)
|
43
43
|
else
|
44
44
|
yield(obj_or_objs)
|
45
45
|
end
|
@@ -129,15 +129,13 @@ module Wgit
|
|
129
129
|
# Prints out the search results in a search engine like format.
|
130
130
|
# The format for each result looks like:
|
131
131
|
#
|
132
|
+
# ```
|
132
133
|
# Title
|
133
|
-
#
|
134
134
|
# Keywords (if there are some)
|
135
|
-
#
|
136
135
|
# Text Snippet (formatted to show the searched for query, if provided)
|
137
|
-
#
|
138
136
|
# URL
|
139
|
-
#
|
140
137
|
# <empty_line_seperator>
|
138
|
+
# ```
|
141
139
|
#
|
142
140
|
# @param results [Array<Wgit::Document>] Array of Wgit::Document's which
|
143
141
|
# each have had #search!(query) called (to update its @text with the
|
@@ -147,7 +145,7 @@ module Wgit
|
|
147
145
|
# @param stream [#puts] Any object that respond_to?(:puts). It is used
|
148
146
|
# to output text somewhere e.g. a file or STDERR.
|
149
147
|
# @return [Integer] The number of results.
|
150
|
-
def self.
|
148
|
+
def self.pprint_search_results(results, keyword_limit: 5, stream: $stdout)
|
151
149
|
raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
|
152
150
|
|
153
151
|
results.each do |doc|
|
@@ -167,56 +165,111 @@ module Wgit
|
|
167
165
|
end
|
168
166
|
|
169
167
|
# Sanitises the obj to make it uniform by calling the correct sanitize_*
|
170
|
-
# method for its type e.g. if obj.is_a? String then
|
171
|
-
# not in the case statement will be ignored and returned as is.
|
168
|
+
# method for its type e.g. if obj.is_a? String then sanitize_str(obj) is called.
|
169
|
+
# Any type not in the case statement will be ignored and returned as is.
|
170
|
+
# Call this method if unsure what obj's type is.
|
172
171
|
#
|
173
172
|
# @param obj [Object] The object to be sanitized.
|
174
173
|
# @param encode [Boolean] Whether or not to encode to UTF-8 replacing
|
175
174
|
# invalid characters.
|
176
|
-
# @return [Object] The sanitized obj
|
175
|
+
# @return [Object] The sanitized obj.
|
177
176
|
def self.sanitize(obj, encode: true)
|
178
177
|
case obj
|
178
|
+
when Wgit::Url
|
179
|
+
sanitize_url(obj, encode:)
|
179
180
|
when String
|
180
|
-
sanitize_str(obj, encode:
|
181
|
+
sanitize_str(obj, encode:)
|
181
182
|
when Array
|
182
|
-
sanitize_arr(obj, encode:
|
183
|
+
sanitize_arr(obj, encode:)
|
183
184
|
else
|
184
185
|
obj
|
185
186
|
end
|
186
187
|
end
|
187
188
|
|
189
|
+
# Sanitises a Wgit::Url to make it uniform. First sanitizes the Url as a
|
190
|
+
# String before replacing the Url value with the sanitized version. This
|
191
|
+
# method therefore modifies the given url param and also returns it.
|
192
|
+
#
|
193
|
+
# @param url [Wgit::Url] The Wgit::Url to sanitize. url is modified.
|
194
|
+
# @param encode [Boolean] Whether or not to encode to UTF-8 replacing
|
195
|
+
# invalid characters.
|
196
|
+
# @return [Wgit::Url] The sanitized url, which is also modified.
|
197
|
+
def self.sanitize_url(url, encode: true)
|
198
|
+
str = sanitize_str(url.to_s, encode:)
|
199
|
+
url.replace(str)
|
200
|
+
end
|
201
|
+
|
188
202
|
# Sanitises a String to make it uniform. Strips any leading/trailing white
|
189
203
|
# space. Also applies UTF-8 encoding (replacing invalid characters) if
|
190
204
|
# `encode: true`.
|
191
205
|
#
|
192
|
-
# @param str [String] The String to sanitize. str is modified.
|
206
|
+
# @param str [String] The String to sanitize. str is not modified.
|
193
207
|
# @param encode [Boolean] Whether or not to encode to UTF-8 replacing
|
194
208
|
# invalid characters.
|
195
|
-
# @return [String] The sanitized str
|
209
|
+
# @return [String] The sanitized str.
|
196
210
|
def self.sanitize_str(str, encode: true)
|
197
|
-
|
198
|
-
str.encode!('UTF-8', undef: :replace, invalid: :replace) if encode
|
199
|
-
str.strip!
|
200
|
-
end
|
211
|
+
return str unless str.is_a?(String)
|
201
212
|
|
202
|
-
str
|
213
|
+
str = str.encode('UTF-8', undef: :replace, invalid: :replace) if encode
|
214
|
+
str.strip
|
203
215
|
end
|
204
216
|
|
205
217
|
# Sanitises an Array to make it uniform. Removes empty Strings and nils,
|
206
218
|
# processes non empty Strings using Wgit::Utils.sanitize and removes
|
207
219
|
# duplicates.
|
208
220
|
#
|
209
|
-
# @param arr [Enumerable] The Array to sanitize. arr is modified.
|
210
|
-
# @return [Enumerable] The sanitized arr
|
221
|
+
# @param arr [Enumerable] The Array to sanitize. arr is not modified.
|
222
|
+
# @return [Enumerable] The sanitized arr.
|
211
223
|
def self.sanitize_arr(arr, encode: true)
|
212
|
-
|
213
|
-
arr.map! { |str| sanitize(str, encode: encode) }
|
214
|
-
arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
|
215
|
-
arr.compact!
|
216
|
-
arr.uniq!
|
217
|
-
end
|
224
|
+
return arr unless arr.is_a?(Array)
|
218
225
|
|
219
226
|
arr
|
227
|
+
.map { |str| sanitize(str, encode:) }
|
228
|
+
.reject { |str| str.is_a?(String) && str.empty? }
|
229
|
+
.compact
|
230
|
+
.uniq
|
231
|
+
end
|
232
|
+
|
233
|
+
# Pretty prints a log statement, used for debugging purposes.
|
234
|
+
#
|
235
|
+
# Use like:
|
236
|
+
#
|
237
|
+
# ```
|
238
|
+
# Wgit::Utils.pprint 1, include_html: include_html, ignore: ignore_vars
|
239
|
+
# ```
|
240
|
+
#
|
241
|
+
# Which produces a log like:
|
242
|
+
#
|
243
|
+
# ```
|
244
|
+
# DEBUG_1 - include_html: true | ignore: ['@html', '@parser']
|
245
|
+
# ```
|
246
|
+
#
|
247
|
+
# @param identifier [#to_s] A log identifier e.g. "START" or 1 etc.
|
248
|
+
# @param stream [#puts] Any object that respond_to? :puts and :print. It is
|
249
|
+
# used to output the log text somewhere e.g. a file or STDERR.
|
250
|
+
# @param prefix [String] The log prefix, useful for visibility/greping.
|
251
|
+
# @param new_line [Boolean] Whether or not to use a new line (\n) as the
|
252
|
+
# separator.
|
253
|
+
# @param vars [Hash<#inspect, #inspect>] The vars to inspect in the log.
|
254
|
+
def self.pprint(identifier, stream: $stdout, prefix: 'DEBUG', new_line: false, **vars)
|
255
|
+
sep1 = new_line ? "\n" : ' - '
|
256
|
+
sep2 = new_line ? "\n" : ' | '
|
257
|
+
|
258
|
+
stream.print "\n#{prefix}_#{identifier}#{sep1}"
|
259
|
+
|
260
|
+
vars.each_with_index do |arr, i|
|
261
|
+
last_item = (i + 1) == vars.size
|
262
|
+
sep3 = sep2
|
263
|
+
sep3 = new_line ? "\n" : '' if last_item
|
264
|
+
k, v = arr
|
265
|
+
|
266
|
+
stream.print "#{k}: #{v}#{sep3}"
|
267
|
+
end
|
268
|
+
|
269
|
+
stream.puts "\n"
|
270
|
+
stream.puts "\n" unless new_line
|
271
|
+
|
272
|
+
nil
|
220
273
|
end
|
221
274
|
end
|
222
275
|
end
|
data/lib/wgit/version.rb
CHANGED
data/lib/wgit.rb
CHANGED
@@ -10,6 +10,7 @@ require_relative 'wgit/document_extractors'
|
|
10
10
|
require_relative 'wgit/crawler'
|
11
11
|
require_relative 'wgit/database/model'
|
12
12
|
require_relative 'wgit/database/database'
|
13
|
+
require_relative 'wgit/robots_parser'
|
13
14
|
require_relative 'wgit/indexer'
|
14
15
|
require_relative 'wgit/dsl'
|
15
16
|
require_relative 'wgit/base'
|