wgit 0.11.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wgit/dsl.rb CHANGED
@@ -44,14 +44,14 @@ the 'start' function".freeze
    Wgit::Document.define_extractor(var, xpath, opts, &block)
  end

- # Initializes a `Wgit::Crawler`. This crawler is then used in all crawl and
- # index methods used by the DSL. See the Wgit::Crawler documentation for
- # more details.
+ # Sets and returns the Wgit::Crawler used in subsequent crawls including
+ # indexing. Defaults to `Wgit::Crawler.new` if not given a param. See the
+ # Wgit::Crawler documentation for more details.
  #
- # @yield [crawler] The created crawler; use the block to configure.
- # @return [Wgit::Crawler] The created crawler used by the DSL.
- def crawler
-   @dsl_crawler ||= Wgit::Crawler.new
+ # @yield [crawler] Given the DSL crawler; use the block to configure.
+ # @return [Wgit::Crawler] The crawler instance used by the DSL.
+ def use_crawler(crawler = nil)
+   @dsl_crawler = crawler || @dsl_crawler || Wgit::Crawler.new

    yield @dsl_crawler if block_given?
    @dsl_crawler
  end
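In practice, the rename turns the DSL's crawler accessor into an explicit setter that also accepts a pre-built instance. A minimal usage sketch (illustrative URL; assumes Wgit::Crawler's standard timeout and redirect_limit accessors):

    require 'wgit'

    include Wgit::DSL

    # Configure the DSL's crawler via the block (or pass in your own instance).
    use_crawler do |crawler|
      crawler.timeout        = 10 # Seconds before a request is aborted.
      crawler.redirect_limit = 3  # Max redirects to follow per URL.
    end

    crawl('https://example.com') { |doc| puts doc.title }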
@@ -66,7 +66,7 @@ the 'start' function".freeze
  # @yield [crawler] The crawler that'll be used in the subsequent
  # crawl/index; use the block to configure.
  def start(*urls, &block)
-   crawler(&block)
+   use_crawler(&block) if block_given?
    @dsl_start = urls
  end

@@ -101,7 +101,7 @@ the 'start' function".freeze
    raise DSL_ERROR__NO_START_URL if urls.empty?

    urls.map! { |url| Wgit::Url.parse(url) }
-   crawler.crawl_urls(*urls, follow_redirects:, &block)
+   get_crawler.crawl_urls(*urls, follow_redirects:, &block)
  end

  # Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no
@@ -138,42 +138,41 @@ the 'start' function".freeze
    opts = { follow: xpath, allow_paths:, disallow_paths: }

    urls.reduce([]) do |externals, url|
-     externals + crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
+     externals + get_crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
    end
  end

- # Returns the DSL's `crawler#last_response`.
+ # Returns the DSL's `Wgit::Crawler#last_response`.
  #
  # @return [Wgit::Response] The response from the last URL crawled.
  def last_response
-   crawler.last_response
+   get_crawler.last_response
  end

  # Nilifies the DSL instance variables.
  def reset
-   @dsl_crawler = nil
-   @dsl_start = nil
-   @dsl_follow = nil
-   @dsl_conn_str = nil
+   @dsl_crawler = nil
+   @dsl_start = nil
+   @dsl_follow = nil
+   @dsl_db = nil
  end

  ### INDEXER METHODS ###

- # Defines the connection string to the database used in subsequent `index*`
- # method calls. This method is optional as the connection string can be
- # passed to the index method instead.
+ # Defines the connected database instance used in subsequent index and DB
+ # method calls. This method is optional however, as a new instance of the
+ # Wgit::Database.adapter_class will be initialised otherwise. Therefore
+ # if not calling this method, you should ensure
+ # ENV['WGIT_CONNECTION_STRING'] is set or the connection will fail.
  #
- # @param conn_str [String] The connection string used to connect to the
- # database in subsequent `index*` method calls.
- def connection_string(conn_str)
-   @dsl_conn_str = conn_str
+ # @param db [Wgit::Database::DatabaseAdapter] The connected database
+ # instance used in subsequent `index*` method calls.
+ def use_database(db)
+   @dsl_db = db
  end

  # Indexes the World Wide Web using `Wgit::Indexer#index_www` underneath.
  #
- # @param connection_string [String] The database connection string. Set as
- # nil to use ENV['WGIT_CONNECTION_STRING'] or set using
- # `connection_string`.
  # @param max_sites [Integer] The number of separate and whole
  # websites to be crawled before the method exits. Defaults to -1 which
  # means the crawl will occur until manually stopped (Ctrl+C etc).
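With the connection_string DSL method replaced by use_database, the database is now configured in one of two ways. A hedged sketch of both (the connection string is illustrative):

    require 'wgit'

    include Wgit::DSL

    # Option 1: let the DSL build the default adapter lazily; requires the ENV var.
    ENV['WGIT_CONNECTION_STRING'] = 'mongodb://user:pass@localhost/crawler'

    # Option 2: pass an already-initialised database instance explicitly.
    use_database(Wgit::Database.new) # Uses Wgit::Database.adapter_class underneath.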
@@ -181,11 +180,8 @@ the 'start' function".freeze
  # scraped from the web (default is 1GB). Note, that this value is used to
  # determine when to stop crawling; it's not a guarantee of the max data
  # that will be obtained.
- def index_www(
-   connection_string: @dsl_conn_str, max_sites: -1, max_data: 1_048_576_000
- )
-   db = Wgit::Database.new(connection_string)
-   indexer = Wgit::Indexer.new(db, crawler)
+ def index_www(max_sites: -1, max_data: 1_048_576_000)
+   indexer = Wgit::Indexer.new(get_db, get_crawler)

    indexer.index_www(max_sites:, max_data:)
  end
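The index_www signature is now just the two limit kwargs. A sketch of a bounded crawl (the limit values are illustrative):

    include Wgit::DSL

    # Index until 5 whole sites have been crawled or ~500MB of data obtained.
    index_www(max_sites: 5, max_data: 500_000_000)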
@@ -194,9 +190,6 @@ the 'start' function".freeze
  #
  # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to
  # crawl. Can be set using `start`.
- # @param connection_string [String] The database connection string. Set as
- # nil to use ENV['WGIT_CONNECTION_STRING'] or set using
- # `connection_string`.
  # @param insert_externals [Boolean] Whether or not to insert the website's
  # external URL's into the database.
  # @param follow [String] The xpath extracting links to be followed during
@@ -213,15 +206,13 @@ the 'start' function".freeze
  # set.
  # @return [Integer] The total number of pages crawled within the website.
  def index_site(
-   *urls, connection_string: @dsl_conn_str,
-   insert_externals: false, follow: @dsl_follow,
+   *urls, insert_externals: false, follow: @dsl_follow,
    allow_paths: nil, disallow_paths: nil, &block
  )
    urls = (@dsl_start || []) if urls.empty?
    raise DSL_ERROR__NO_START_URL if urls.empty?

-   db = Wgit::Database.new(connection_string)
-   indexer = Wgit::Indexer.new(db, crawler)
+   indexer = Wgit::Indexer.new(get_db, get_crawler)
    xpath = follow || :default
    crawl_opts = {
      insert_externals:, follow: xpath, allow_paths:, disallow_paths:
@@ -236,9 +227,6 @@ the 'start' function".freeze
  #
  # @param urls [*Wgit::Url] The webpage URL's to crawl. Defaults to the
  # `start` URL(s).
- # @param connection_string [String] The database connection string. Set as
- # nil to use ENV['WGIT_CONNECTION_STRING'] or set using
- # `connection_string`.
  # @param insert_externals [Boolean] Whether or not to insert the website's
  # external URL's into the database.
  # @yield [doc] Given the Wgit::Document of the crawled webpage,
@@ -247,15 +235,11 @@ the 'start' function".freeze
  # document from being saved into the database.
  # @raise [StandardError] If no urls are provided and no `start` URL has
  # been set.
- def index(
-   *urls, connection_string: @dsl_conn_str,
-   insert_externals: false, &block
- )
+ def index(*urls, insert_externals: false, &block)
    urls = (@dsl_start || []) if urls.empty?
    raise DSL_ERROR__NO_START_URL if urls.empty?

-   db = Wgit::Database.new(connection_string)
-   indexer = Wgit::Indexer.new(db, crawler)
+   indexer = Wgit::Indexer.new(get_db, get_crawler)

    urls.map! { |url| Wgit::Url.parse(url) }
    indexer.index_urls(*urls, insert_externals:, &block)
@@ -264,13 +248,11 @@ the 'start' function".freeze
  ### DATABASE METHODS ###

  # Performs a search of the database's indexed documents and pretty prints
- # the results in a search engine-esque format. See `Wgit::Database#search!`
- # and `Wgit::Document#search!` for details of how the search works.
+ # the results in a search engine-esque format. See
+ # `Wgit::Database::DatabaseAdapter#search` and `Wgit::Document#search!`
+ # for details of how the search methods work.
  #
  # @param query [String] The text query to search with.
- # @param connection_string [String] The database connection string. Set as
- # nil to use ENV['WGIT_CONNECTION_STRING'] or set using
- # `connection_string`.
  # @param stream [nil, #puts] Any object that respond_to?(:puts). It is used
  # to output text somewhere e.g. a file or STDERR. Use nil for no output.
  # @param case_sensitive [Boolean] Whether character case must match.
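So a 0.12 index call drops the connection_string: kwarg entirely and relies on use_database or the ENV var instead; for example:

    include Wgit::DSL

    start 'https://example.com'

    index(insert_externals: false) do |doc|
      puts doc.title # Each crawled Wgit::Document is yielded before upsert.
    end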
@@ -284,38 +266,53 @@ the 'start' function".freeze
  # database containing only its matching `#text`.
  # @return [Array<Wgit::Document>] The search results with matching text.
  def search(
-   query, connection_string: @dsl_conn_str, stream: $stdout,
+   query, stream: $stdout,
+   top_result_only: true, include_score: false,
    case_sensitive: false, whole_sentence: true,
-   limit: 10, skip: 0, sentence_limit: 80, &block
+   limit: 10, skip: 0, sentence_limit: 80
  )
    stream ||= File.open(File::NULL, 'w')
-   db = Wgit::Database.new(connection_string)

-   results = db.search!(
-     query,
-     case_sensitive:, whole_sentence:,
-     limit:, skip:,
-     sentence_limit:, &block
-   )
+   results = get_db.search(
+     query, case_sensitive:, whole_sentence:, limit:, skip:)

-   Wgit::Utils.pprint_search_results(results, stream:)
+   results.each do |doc|
+     doc.search_text!(
+       query, case_sensitive:, whole_sentence:, sentence_limit:)
+     yield(doc) if block_given?
+   end
+
+   if top_result_only
+     Wgit::Utils.pprint_top_search_results(results, include_score:, stream:)
+   else
+     Wgit::Utils.pprint_all_search_results(results, include_score:, stream:)
+   end

    results
  end

  # Deletes everything in the urls and documents collections by calling
- # `Wgit::Database#clear_db` underneath. This will nuke the entire database
- # so yeah... be careful.
+ # `Wgit::Database::DatabaseAdapter#empty` underneath.
  #
  # @return [Integer] The number of deleted records.
- def clear_db!(connection_string: @dsl_conn_str)
-   db = Wgit::Database.new(connection_string)
-   db.clear_db
+ def empty_db!
+   get_db.empty
+ end
+
+ private
+
+ def get_crawler
+   @dsl_crawler ||= Wgit::Crawler.new
+ end
+
+ def get_db
+   @dsl_db ||= Wgit::Database.new
  end

  alias_method :crawl_url, :crawl
  alias_method :crawl_r, :crawl_site
  alias_method :index_r, :index_site
+ alias_method :index_url, :index
  alias_method :start_urls, :start
 end
end
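The reworked search no longer passes a block through to the database query; each result is yielded after its #text has been filtered down to the matching sentences. A sketch using the new kwargs (query string is illustrative):

    include Wgit::DSL

    # Pretty print every matching result (not just the top one), with scores.
    results = search('ruby web crawler', top_result_only: false, include_score: true)

    # Or stream the printed output somewhere other than $stdout.
    File.open('search.log', 'w') { |f| search('ruby web crawler', stream: f) }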
data/lib/wgit/html_to_text.rb ADDED
@@ -0,0 +1,277 @@
+ require_relative "utils"
+ require_relative "assertable"
+ require "nokogiri"
+
+ module Wgit
+   # Class used to extract the visible page text from a HTML string.
+   # This is in turn used to set the output of a Wgit::Document#text method.
+   class HTMLToText
+     include Assertable
+
+     # Set of text elements used to extract the visible text.
+     # The element's display (:inline or :block) is used to delimit sentences e.g.
+     # <div>foo</div><div>bar</div> will be extracted as ['foo', 'bar'] whereas
+     # <span>foo</span><span>bar</span> will be extracted as ['foobar'].
+     @text_elements = {
+       a: :inline,
+       abbr: :inline,
+       address: :block,
+       article: :block,
+       aside: :block,
+       b: :inline,
+       bdi: :inline,
+       bdo: :inline,
+       blockquote: :block,
+       br: :block,
+       button: :block, # Normally inline but Wgit treats as block.
+       caption: :block,
+       cite: :inline,
+       code: :inline,
+       data: :inline,
+       dd: :block,
+       del: :inline,
+       details: :block,
+       dfn: :inline,
+       div: :block,
+       dl: :block,
+       dt: :block,
+       em: :inline,
+       figcaption: :block,
+       figure: :block,
+       footer: :block,
+       h1: :block,
+       h2: :block,
+       h3: :block,
+       h4: :block,
+       h5: :block,
+       h6: :block,
+       header: :block,
+       hr: :block,
+       i: :inline,
+       input: :inline,
+       ins: :block,
+       kbd: :inline,
+       label: :inline,
+       legend: :block,
+       li: :block,
+       main: :block,
+       mark: :inline,
+       meter: :block,
+       ol: :block,
+       option: :block,
+       output: :block,
+       p: :block,
+       pre: :block,
+       q: :inline,
+       rb: :inline,
+       rt: :inline,
+       ruby: :inline,
+       s: :inline,
+       samp: :inline,
+       section: :block,
+       small: :inline,
+       span: :inline,
+       strong: :inline,
+       sub: :inline,
+       summary: :block,
+       sup: :inline,
+       td: :block,
+       textarea: :block,
+       th: :block,
+       time: :inline,
+       u: :inline,
+       ul: :block,
+       var: :inline,
+       wbr: :inline
+     }
+
+     class << self
+       # Set of HTML elements that make up the visible text on a page. These
+       # elements are used to initialize the Wgit::Document#text. See the
+       # README.md for how to add to this Hash dynamically.
+       attr_reader :text_elements
+     end
+
+     # The Nokogiri::HTML document object initialized from a HTML string.
+     attr_reader :parser
+
+     # Creates a new HTML to text extractor instance.
+     #
+     # @param parser [Nokogiri::HTML4::Document] The nokogiri parser object.
+     # @raise [StandardError] If the given parser is of an invalid type.
+     def initialize(parser)
+       assert_type(parser, Nokogiri::HTML4::Document)
+
+       @parser = parser
+     end
+
+     # Extracts and returns the text sentences from the @parser HTML.
+     #
+     # @return [Array<String>] An array of unique text sentences.
+     def extract_arr
+       return [] if @parser.to_s.empty?
+
+       text_str = extract_str
+
+       # Split the text_str into an Array of text sentences.
+       text_str
+         .split("\n")
+         .map(&:strip)
+         .reject(&:empty?)
+     end
+
+     # Extracts and returns a text string from the @parser HTML.
+     #
+     # @return [String] A string of text with \n delimiting sentences.
+     def extract_str
+       text_str = ""
+
+       iterate_child_nodes(@parser) do |node, display|
+         # Handle any special cases e.g. skip nodes we don't care about...
+         # <pre> nodes should have their contents displayed exactly as is.
+         if node_name(node) == :pre
+           text_str << "\n"
+           text_str << node.text
+           next
+         end
+
+         # Skip any child node of <pre> since they're handled as a special case above.
+         next if child_of?(:pre, node)
+
+         if node.text?
+           # Skip any text element that is purely whitespace.
+           next unless valid_text_content?(node.text)
+         else
+           # Skip a concrete node if it has other concrete child nodes as these
+           # will be iterated onto later.
+           #
+           # Process if node has no children or one child which is a valid text node.
+           next unless node.children.empty? || parent_of_text_node_only?(node)
+         end
+
+         # Apply display rules deciding if a new line is needed before node.text.
+         add_new_line = false
+         prev = prev_sibling_or_parent(node)
+
+         if node.text?
+           add_new_line = true unless prev && inline?(prev)
+         else
+           add_new_line = true if display == :block
+           add_new_line = true if prev && block?(prev)
+         end
+
+         text_str << "\n" if add_new_line
+         text_str << format_text(node.text)
+       end
+
+       text_str
+         .strip
+         .squeeze("\n")
+         .squeeze(" ")
+     end
+
+     private
+
+     def node_name(node)
+       node.name&.downcase&.to_sym
+     end
+
+     def display(node)
+       name = node_name(node)
+       Wgit::HTMLToText.text_elements[name]
+     end
+
+     def inline?(node)
+       display(node) == :inline
+     end
+
+     def block?(node)
+       display(node) == :block
+     end
+
+     # Returns the previous sibling of node or nil. Only valid text elements are
+     # returned i.e. non duplicates with valid text content.
+     def prev_sibling(node)
+       prev = node.previous
+
+       return nil unless prev
+       return prev unless prev.text?
+       return prev if valid_text_node?(prev) && !contains_new_line?(prev.text)
+       return prev if valid_text_node?(prev) && !format_text(prev.text).strip.empty?
+
+       prev.previous
+     end
+
+     # Returns node's previous sibling, parent or nil; in that order. Only valid
+     # text elements are returned i.e. non duplicates with valid text content.
+     def prev_sibling_or_parent(node)
+       prev = prev_sibling(node)
+       return prev if prev
+
+       node.parent
+     end
+
+     def child_of?(ancestor_name, node)
+       node.ancestors.any? { |ancestor| node_name(ancestor) == ancestor_name }
+     end
+
+     # Returns true if any of the child nodes contain a non empty :text node.
+     def parent_of_text_node?(node)
+       node.children.any? { |child| child.text? && valid_text_content?(child.text) }
+     end
+
+     def parent_of_text_node_only?(node)
+       node.children.size == 1 && parent_of_text_node?(node)
+     end
+
+     # Returns true if text is not empty having removed all new lines.
+     def valid_text_content?(text)
+       !format_text(text).empty?
+     end
+
+     # Returns true if node is a text node.
+     # Duplicate text nodes (that follow a concrete node) are omitted.
+     def valid_text_node?(node)
+       node.text? && node.text != node.parent.text
+     end
+
+     def contains_new_line?(text)
+       ["\n", '\\n'].any? { |new_line| text.include?(new_line) }
+     end
+
+     # Remove special characters including any new lines; as semantic HTML will
+     # typically use <br> and/or block elements to denote a line break.
+     def format_text(text)
+       text
+         .encode("UTF-8", undef: :replace, invalid: :replace)
+         .gsub("\n", "")
+         .gsub('\\n', "")
+         .gsub("\r", "")
+         .gsub('\\r', "")
+         .gsub("\f", "")
+         .gsub('\\f', "")
+         .gsub("\t", "")
+         .gsub('\\t', "")
+         .gsub("&zwnj;", "")
+         .gsub("&nbsp;", " ")
+         .gsub("&#160;", " ")
+         .gsub("&thinsp;", " ")
+         .gsub("&ensp;", " ")
+         .gsub("&emsp;", " ")
+         .gsub('\u00a0', " ")
+     end
+
+     # Iterate over node and it's child nodes, yielding each to &block.
+     # Only HTMLToText.text_elements or valid :text nodes will be yielded.
+     # Duplicate text nodes (that follow a concrete node) are omitted.
+     def iterate_child_nodes(node, &block)
+       display = display(node)
+       text_node = valid_text_node?(node)
+
+       yield(node, display) if display || text_node
+       node.children.each { |child| iterate_child_nodes(child, &block) }
+     end
+
+     alias_method :extract, :extract_arr
+   end
+ end
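The class can also be used standalone, outside of Wgit::Document. A usage sketch, assuming Nokogiri is installed (the expected output is approximate, based on the display rules documented above):

    require 'nokogiri'
    require 'wgit'

    html = '<html><body>' \
           '<div>foo</div><div>bar</div>' \
           '<p><span>baz</span><span>qux</span></p>' \
           '</body></html>'

    parser    = Nokogiri::HTML(html) # Returns a Nokogiri::HTML4::Document.
    extractor = Wgit::HTMLToText.new(parser)

    extractor.extract_str # => "foo\nbar\nbazqux" (roughly; block vs inline delimiting)
    extractor.extract_arr # => ["foo", "bar", "bazqux"]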
data/lib/wgit/indexer.rb CHANGED
@@ -1,12 +1,23 @@
  # frozen_string_literal: true

+ require_relative 'assertable'
  require_relative 'crawler'
- require_relative 'database/database'
+ require_relative 'database/database_adapter'

  module Wgit
    # Class which crawls and saves the Documents to a database. Can be thought of
-   # as a combination of Wgit::Crawler and Wgit::Database.
+   # as a combination of Wgit::Crawler and Wgit::Database::DatabaseAdapter.
    class Indexer
+     include Assertable
+
+     # The ENV var used to omit and ignore robots.txt parsing during an index.
+     # Applies to all index_* methods if set in the ENV.
+     WGIT_IGNORE_ROBOTS_TXT = "WGIT_IGNORE_ROBOTS_TXT".freeze
+
+     # The block return value used to skip saving a crawled document to the
+     # database. Applies to all index_* methods that take a block.
+     SKIP_UPSERT = :skip.freeze
+
      # The crawler used to index the WWW.
      attr_reader :crawler

@@ -15,10 +26,13 @@ module Wgit

    # Initialize the Indexer.
    #
-   # @param database [Wgit::Database] The database instance (already
-   # initialized and connected) used to index.
-   # @param crawler [Wgit::Crawler] The crawler instance used to index.
+   # @param database [Wgit::Database::DatabaseAdapter] The database instance
+   # (already initialized and connected) used for indexing.
+   # @param crawler [Wgit::Crawler] The crawler instance used for indexing.
    def initialize(database = Wgit::Database.new, crawler = Wgit::Crawler.new)
+     assert_type(database, Wgit::Database::DatabaseAdapter)
+     assert_type(crawler, Wgit::Crawler)
+
      @db = database
      @crawler = crawler
    end
@@ -143,11 +157,10 @@ future iterations")
        next if no_index?(@crawler.last_response, doc)

        result = block_given? ? yield(doc) : true
+       next if doc.empty? || result == SKIP_UPSERT

-       if result && !doc.empty?
-         upsert_doc(doc)
-         total_pages_indexed += 1
-       end
+       upsert_doc(doc)
+       total_pages_indexed += 1
      end

      upsert_url_and_redirects(url)
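Under the new contract, an index_* block opts a page out of persistence by returning SKIP_UPSERT, rather than any falsey value as before. A sketch against the Indexer directly (assumes a connected database via ENV['WGIT_CONNECTION_STRING']; the skip condition is illustrative):

    require 'wgit'

    indexer = Wgit::Indexer.new(Wgit::Database.new, Wgit::Crawler.new)

    indexer.index_site(Wgit::Url.new('https://example.com')) do |doc|
      # Return :skip (Wgit::Indexer::SKIP_UPSERT) to avoid saving this page.
      doc.text.empty? ? Wgit::Indexer::SKIP_UPSERT : doc
    end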
@@ -207,7 +220,9 @@ for the site: #{url}")
        break if no_index?(@crawler.last_response, doc)

        result = block_given? ? yield(doc) : true
-       upsert_doc(doc) if result && !doc.empty?
+       break if doc.empty? || result == SKIP_UPSERT
+
+       upsert_doc(doc)
      end

      upsert_url_and_redirects(url)
@@ -285,6 +300,8 @@ for the site: #{url}")

    # Crawls and parses robots.txt file (if found). Returns the parser or nil.
    def parse_robots_txt(url)
+     return nil if ENV[WGIT_IGNORE_ROBOTS_TXT]
+
      robots_url = url.to_origin.join('/robots.txt')

      Wgit.logger.info("Crawling for robots.txt: #{robots_url}")
@@ -328,6 +345,8 @@ for the site: #{url}")

    # Returns if the last_response or doc #no_index? is true or not.
    def no_index?(last_response, doc)
+     return false if ENV[WGIT_IGNORE_ROBOTS_TXT]
+
      url = last_response.url.to_s
      if last_response.no_index?
        Wgit.logger.info("Skipping page due to no-index response header: #{url}")
data/lib/wgit/logger.rb CHANGED
@@ -2,7 +2,7 @@

  # FYI: The default logger is set at the bottom of this file.

- require 'logger'
+ require "logger"

  module Wgit
    # The Logger instance used by Wgit. Set your own custom logger after
@@ -28,7 +28,7 @@ module Wgit
    #
    # @return [Logger] The default Logger instance.
    def self.default_logger
-     logger = Logger.new($stdout, progname: 'wgit', level: :info)
+     logger = Logger.new($stdout, progname: "wgit", level: :info)
      logger.formatter = proc do |_severity, _datetime, progname, msg|
        "[#{progname}] #{msg}\n"
      end