wgit 0.8.0 → 0.9.0

This diff reflects the changes between publicly released versions of this package, as published to their respective public registries, and is provided for informational purposes only.
@@ -1,19 +1,19 @@
 # frozen_string_literal: true
 
-### Default Document Extensions ###
+### Default Document Extractors ###
 
 # Base.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :base,
   '//base/@href',
   singleton: true,
   text_content_only: true
 ) do |base|
-  Wgit::Url.parse_or_nil(base) if base
+  Wgit::Url.parse?(base) if base
 end
 
 # Title.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :title,
   '//title',
   singleton: true,
@@ -21,7 +21,7 @@ Wgit::Document.define_extension(
 )
 
 # Description.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :description,
   '//meta[@name="description"]/@content',
   singleton: true,
@@ -29,7 +29,7 @@ Wgit::Document.define_extension(
 )
 
 # Author.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :author,
   '//meta[@name="author"]/@content',
   singleton: true,
@@ -37,7 +37,7 @@ Wgit::Document.define_extension(
 )
 
 # Keywords.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :keywords,
   '//meta[@name="keywords"]/@content',
   singleton: true,
@@ -45,25 +45,25 @@ Wgit::Document.define_extension(
 ) do |keywords, _source, type|
   if keywords && (type == :document)
     keywords = keywords.split(',')
-    Wgit::Utils.process_arr(keywords)
+    Wgit::Utils.sanitize(keywords)
   end
   keywords
 end
 
 # Links.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :links,
   '//a/@href',
   singleton: false,
   text_content_only: true
 ) do |links|
   links
-    .map { |link| Wgit::Url.parse_or_nil(link) }
+    .map { |link| Wgit::Url.parse?(link) }
     .compact # Remove unparsable links.
 end
 
 # Text.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :text,
   proc { Wgit::Document.text_elements_xpath },
   singleton: false,
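
The change above is a straight rename: `define_extension` becomes `define_extractor`, `Wgit::Url.parse_or_nil` becomes `Wgit::Url.parse?`, and `Wgit::Utils.process_arr` becomes `Wgit::Utils.sanitize`. A minimal sketch of defining a custom extractor against the 0.9.0 API follows; the `:h2_headings` name and the sample HTML are illustrative, not part of the gem:

```ruby
require 'wgit'

# Hypothetical extractor: collect every <h2> heading of a page.
Wgit::Document.define_extractor(
  :h2_headings,
  '//h2',
  singleton: false,        # keep all matches in an Array
  text_content_only: true  # return node text rather than Nokogiri nodes
) do |headings|
  headings&.map(&:strip)   # optional post-processing of the extracted value
end

doc = Wgit::Document.new(
  Wgit::Url.new('http://example.com'),
  '<html><h2> First </h2><h2> Second </h2></html>'
)
doc.h2_headings # => ["First", "Second"]
```
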
@@ -0,0 +1,324 @@
+module Wgit
+  # DSL methods that act as a wrapper around Wgit's underlying class methods.
+  # All instance vars/constants are prefixed to avoid conflicts when included.
+  module DSL
+    # Error message shown when there's no URL to crawl.
+    DSL_ERROR__NO_START_URL = "missing url, pass as parameter to this or \
+the 'start' function".freeze
+
+    ### CRAWLER METHODS ###
+
+    # Defines an extractor using `Wgit::Document.define_extractor` underneath.
+    #
+    # @param var [Symbol] The name of the variable to be initialised, that will
+    #   contain the extracted content.
+    # @param xpath [String, #call] The xpath used to find the element(s)
+    #   of the webpage. Only used when initializing from HTML.
+    #
+    #   Pass a callable object (proc etc.) if you want the
+    #   xpath value to be derived on Document initialisation (instead of when
+    #   the extractor is defined). The call method must return a valid xpath
+    #   String.
+    # @param opts [Hash] The options to define an extractor with. The
+    #   options are only used when intializing from HTML, not the database.
+    # @option opts [Boolean] :singleton The singleton option determines
+    #   whether or not the result(s) should be in an Array. If multiple
+    #   results are found and singleton is true then the first result will be
+    #   used. Defaults to true.
+    # @option opts [Boolean] :text_content_only The text_content_only option
+    #   if true will use the text content of the Nokogiri result object,
+    #   otherwise the Nokogiri object itself is returned. Defaults to true.
+    # @yield The block is executed when a Wgit::Document is initialized,
+    #   regardless of the source. Use it (optionally) to process the result
+    #   value.
+    # @yieldparam value [Object] The result value to be assigned to the new
+    #   `var`.
+    # @yieldparam source [Wgit::Document, Object] The source of the `value`.
+    # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
+    #   `:object`.
+    # @yieldreturn [Object] The return value of the block becomes the new var's
+    #   value. Return the block's value param unchanged if you want to inspect.
+    # @raise [StandardError] If the var param isn't valid.
+    # @return [Symbol] The given var Symbol if successful.
+    def extract(var, xpath, opts = {}, &block)
+      Wgit::Document.define_extractor(var, xpath, opts, &block)
+    end
+
+    # Initializes a `Wgit::Crawler`. This crawler is then used in all crawl and
+    # index methods used by the DSL. See the Wgit::Crawler documentation for
+    # more details.
+    #
+    # @yield [crawler] The created crawler; use the block to configure.
+    # @return [Wgit::Crawler] The created crawler used by the DSL.
+    def crawler
+      @dsl_crawler ||= Wgit::Crawler.new
+      yield @dsl_crawler if block_given?
+      @dsl_crawler
+    end
+
+    # Sets the URL to be crawled when a `crawl*` or `index*` method is
+    # subsequently called. Calling this is optional as the URL can be
+    # passed to the method instead. You can also omit the url param and just
+    # use the block to configure the crawler instead.
+    #
+    # @param urls [*String, *Wgit::Url] The URL(s) to crawl
+    #   or nil (if only using the block to configure the crawler).
+    # @yield [crawler] The crawler that'll be used in the subsequent
+    #   crawl/index; use the block to configure.
+    def start(*urls, &block)
+      crawler(&block)
+      @dsl_start = urls
+    end
+
+    # Sets the xpath to be followed when `crawl_site` or `index_site` is
+    # subsequently called. Calling this method is optional as the default is to
+    # follow all `<a>` href's that point to the site domain. You can also pass
+    # `follow:` to the crawl/index methods directly.
+    #
+    # @param xpath [String] The xpath which is followed when crawling/indexing
+    #   a site. Use `:default` to restore the default follow logic.
+    def follow(xpath)
+      @dsl_follow = xpath
+    end
+
+    # Crawls one or more individual urls using `Wgit::Crawler#crawl_url`
+    # underneath. If no urls are provided, then the `start` URL is used.
+    #
+    # @param urls [*Wgit::Url] The URL's to crawl. Defaults to the `start`
+    #   URL(s).
+    # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+    #   redirects. Pass a Symbol to limit where the redirect is allowed to go
+    #   e.g. :host only allows redirects within the same host. Choose from
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    #   This value will be used for all urls crawled.
+    # @yield [doc] Given each crawled page (Wgit::Document); this is the only
+    #   way to interact with them.
+    # @raise [StandardError] If no urls are provided and no `start` URL has
+    #   been set.
+    # @return [Wgit::Document] The last Document crawled.
+    def crawl(*urls, follow_redirects: true, &block)
+      urls = (@dsl_start || []) if urls.empty?
+      raise DSL_ERROR__NO_START_URL if urls.empty?
+
+      urls.map! { |url| Wgit::Url.parse(url) }
+      crawler.crawl_urls(*urls, follow_redirects: follow_redirects, &block)
+    end
+
+    # Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no
+    # url is provided, then the first `start` URL is used.
+    #
+    # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to be
+    #   crawled. It is recommended that this URL be the index page of the site
+    #   to give a greater chance of finding all pages within that site/host.
+    #   Defaults to the `start` URLs.
+    # @param follow [String] The xpath extracting links to be followed during
+    #   the crawl. This changes how a site is crawled. Only links pointing to
+    #   the site domain are allowed. The `:default` is any `<a>` href returning
+    #   HTML. This can also be set using `follow`.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    #   selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
+    # @yield [doc] Given each crawled page (Wgit::Document) of the site.
+    #   A block is the only way to interact with each crawled Document.
+    #   Use `doc.empty?` to determine if the page is valid.
+    # @raise [StandardError] If no url is provided and no `start` URL has been
+    #   set.
+    # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
+    #   from all of the site's pages or nil if the given url could not be
+    #   crawled successfully.
+    def crawl_site(
+      *urls, follow: @dsl_follow,
+      allow_paths: nil, disallow_paths: nil, &block
+    )
+      urls = (@dsl_start || []) if urls.empty?
+      raise DSL_ERROR__NO_START_URL if urls.empty?
+
+      xpath = follow || :default
+      opts = {
+        follow: xpath, allow_paths: allow_paths, disallow_paths: disallow_paths
+      }
+
+      urls.reduce([]) do |externals, url|
+        externals + crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
+      end
+    end
+
+    # Returns the DSL's `crawler#last_response`.
+    #
+    # @return [Wgit::Response] The response from the last URL crawled.
+    def last_response
+      crawler.last_response
+    end
+
+    # Nilifies the DSL instance variables.
+    def reset
+      @dsl_crawler = nil
+      @dsl_start = nil
+      @dsl_follow = nil
+      @dsl_conn_str = nil
+    end
+
+    ### INDEXER METHODS ###
+
+    # Defines the connection string to the database used in subsequent `index*`
+    # method calls. This method is optional as the connection string can be
+    # passed to the index method instead.
+    #
+    # @param conn_str [String] The connection string used to connect to the
+    #   database in subsequent `index*` method calls.
+    def connection_string(conn_str)
+      @dsl_conn_str = conn_str
+    end
+
+    # Indexes the World Wide Web using `Wgit::Indexer#index_www` underneath.
+    #
+    # @param connection_string [String] The database connection string. Set as
+    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+    #   `connection_string`.
+    # @param max_sites [Integer] The number of separate and whole
+    #   websites to be crawled before the method exits. Defaults to -1 which
+    #   means the crawl will occur until manually stopped (Ctrl+C etc).
+    # @param max_data [Integer] The maximum amount of bytes that will be
+    #   scraped from the web (default is 1GB). Note, that this value is used to
+    #   determine when to stop crawling; it's not a guarantee of the max data
+    #   that will be obtained.
+    def index_www(
+      connection_string: @dsl_conn_str, max_sites: -1, max_data: 1_048_576_000
+    )
+      db = Wgit::Database.new(connection_string)
+      indexer = Wgit::Indexer.new(db, crawler)
+
+      indexer.index_www(max_sites: max_sites, max_data: max_data)
+    end
+
+    # Indexes a single website using `Wgit::Indexer#index_site` underneath.
+    #
+    # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to
+    #   crawl. Can be set using `start`.
+    # @param connection_string [String] The database connection string. Set as
+    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+    #   `connection_string`.
+    # @param insert_externals [Boolean] Whether or not to insert the website's
+    #   external URL's into the database.
+    # @param follow [String] The xpath extracting links to be followed during
+    #   the crawl. This changes how a site is crawled. Only links pointing to
+    #   the site domain are allowed. The `:default` is any `<a>` href returning
+    #   HTML. This can also be set using `follow`.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    #   selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
+    # @yield [doc] Given the Wgit::Document of each crawled webpage, before it
+    #   is inserted into the database allowing for prior manipulation.
+    # @raise [StandardError] If no url is provided and no `start` URL has been
+    #   set.
+    # @return [Integer] The total number of pages crawled within the website.
+    def index_site(
+      *urls, connection_string: @dsl_conn_str,
+      insert_externals: false, follow: @dsl_follow,
+      allow_paths: nil, disallow_paths: nil, &block
+    )
+      urls = (@dsl_start || []) if urls.empty?
+      raise DSL_ERROR__NO_START_URL if urls.empty?
+
+      db = Wgit::Database.new(connection_string)
+      indexer = Wgit::Indexer.new(db, crawler)
+      xpath = follow || :default
+      crawl_opts = {
+        insert_externals: insert_externals, follow: xpath,
+        allow_paths: allow_paths, disallow_paths: disallow_paths
+      }
+
+      urls.reduce(0) do |total, url|
+        total + indexer.index_site(Wgit::Url.parse(url), **crawl_opts, &block)
+      end
+    end
+
+    # Indexes a single webpage using `Wgit::Indexer#index_url` underneath.
+    #
+    # @param urls [*Wgit::Url] The webpage URL's to crawl. Defaults to the
+    #   `start` URL(s).
+    # @param connection_string [String] The database connection string. Set as
+    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+    #   `connection_string`.
+    # @param insert_externals [Boolean] Whether or not to insert the website's
+    #   external URL's into the database.
+    # @yield [doc] Given the Wgit::Document of the crawled webpage,
+    #   before it's inserted into the database allowing for prior
+    #   manipulation. Return nil or false from the block to prevent the
+    #   document from being saved into the database.
+    # @raise [StandardError] If no urls are provided and no `start` URL has
+    #   been set.
+    def index(
+      *urls, connection_string: @dsl_conn_str,
+      insert_externals: false, &block
+    )
+      urls = (@dsl_start || []) if urls.empty?
+      raise DSL_ERROR__NO_START_URL if urls.empty?
+
+      db = Wgit::Database.new(connection_string)
+      indexer = Wgit::Indexer.new(db, crawler)
+
+      urls.map! { |url| Wgit::Url.parse(url) }
+      indexer.index_urls(*urls, insert_externals: insert_externals, &block)
+    end
+
+    # Performs a search of the database's indexed documents and pretty prints
+    # the results in a search engine-esque format. See `Wgit::Database#search!`
+    # and `Wgit::Document#search!` for details of how the search works.
+    #
+    # @param query [String] The text query to search with.
+    # @param connection_string [String] The database connection string. Set as
+    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+    #   `connection_string`.
+    # @param stream [nil, #puts] Any object that respond_to?(:puts). It is used
+    #   to output text somewhere e.g. a file or STDERR. Use nil for no output.
+    # @param case_sensitive [Boolean] Whether character case must match.
+    # @param whole_sentence [Boolean] Whether multiple words should be searched
+    #   for separately.
+    # @param limit [Integer] The max number of results to print.
+    # @param skip [Integer] The number of DB records to skip.
+    # @param sentence_limit [Integer] The max length of each result's text
+    #   snippet.
+    # @yield [doc] Given each search result (Wgit::Document) returned from the
+    #   database containing only its matching `#text`.
+    # @return [Array<Wgit::Document>] The search results with matching text.
+    def search(
+      query, connection_string: @dsl_conn_str, stream: STDOUT,
+      case_sensitive: false, whole_sentence: true,
+      limit: 10, skip: 0, sentence_limit: 80, &block
+    )
+      stream ||= File.open(File::NULL, 'w')
+      db = Wgit::Database.new(connection_string)
+
+      results = db.search!(
+        query,
+        case_sensitive: case_sensitive,
+        whole_sentence: whole_sentence,
+        limit: limit,
+        skip: skip,
+        sentence_limit: sentence_limit,
+        &block
+      )
+
+      Wgit::Utils.printf_search_results(results, stream: stream)
+
+      results
+    end
+
+    # Deletes everything in the urls and documents collections by calling
+    # `Wgit::Database#clear_db` underneath. This will nuke the entire database
+    # so yeah... be careful.
+    #
+    # @return [Integer] The number of deleted records.
+    def clear_db!(connection_string: @dsl_conn_str)
+      db = Wgit::Database.new(connection_string)
+      db.clear_db
+    end
+
+    alias crawl_r crawl_site
+    alias index_r index_site
+    alias start_urls start
+  end
+end
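
Since the new `Wgit::DSL` module is designed to be `include`d, a short usage sketch may help. This is an assumption-laden example (the site URL, xpaths and the `:quotes` extractor name are illustrative, not part of the gem), showing `extract`, `start`, `follow` and `crawl_site` working together:

```ruby
require 'wgit'

include Wgit::DSL

# Hypothetical extractor and target site, purely for illustration.
extract :quotes, '//div[@class="quote"]/span[@class="text"]', singleton: false

start  'http://quotes.toscrape.com/'
follow "//li[@class='next']/a/@href" # Only follow the 'Next' pagination link.

crawl_site do |doc|
  doc.quotes.each { |quote| puts quote } unless doc.empty?
end
```
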
@@ -4,129 +4,8 @@ require_relative 'crawler'
 require_relative 'database/database'
 
 module Wgit
-  # Convience method to index the World Wide Web using
-  # Wgit::Indexer#index_www.
-  #
-  # Retrieves uncrawled url's from the database and recursively crawls each
-  # site storing their internal pages into the database and adding their
-  # external url's to be crawled later on. Logs info on the crawl
-  # using Wgit.logger as it goes along.
-  #
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param max_sites [Integer] The number of separate and whole
-  #   websites to be crawled before the method exits. Defaults to -1 which
-  #   means the crawl will occur until manually stopped (Ctrl+C etc).
-  # @param max_data [Integer] The maximum amount of bytes that will be
-  #   scraped from the web (default is 1GB). Note, that this value is used to
-  #   determine when to stop crawling; it's not a guarantee of the max data
-  #   that will be obtained.
-  def self.index_www(
-    connection_string: nil, max_sites: -1, max_data: 1_048_576_000
-  )
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_www(max_sites: max_sites, max_data: max_data)
-  end
-
-  # Convience method to index a single website using
-  # Wgit::Indexer#index_site.
-  #
-  # Crawls a single website's pages and stores them into the database.
-  # There is no max download limit so be careful which sites you index.
-  #
-  # @param url [Wgit::Url, String] The base Url of the website to crawl.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param insert_externals [Boolean] Whether or not to insert the website's
-  #   external Url's into the database.
-  # @param allow_paths [String, Array<String>] Filters links by selecting
-  #   them if their path `File.fnmatch?` one of allow_paths.
-  # @param disallow_paths [String, Array<String>] Filters links by rejecting
-  #   them if their path `File.fnmatch?` one of disallow_paths.
-  # @yield [doc] Given the Wgit::Document of each crawled webpage, before it's
-  #   inserted into the database allowing for prior manipulation.
-  # @return [Integer] The total number of pages crawled within the website.
-  def self.index_site(
-    url, connection_string: nil, insert_externals: true,
-    allow_paths: nil, disallow_paths: nil, &block
-  )
-    url = Wgit::Url.parse(url)
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_site(
-      url, insert_externals: insert_externals,
-      allow_paths: allow_paths, disallow_paths: disallow_paths, &block
-    )
-  end
-
-  # Convience method to index a single webpage using
-  # Wgit::Indexer#index_page.
-  #
-  # Crawls a single webpage and stores it into the database.
-  # There is no max download limit so be careful of large pages.
-  #
-  # @param url [Wgit::Url, String] The Url of the webpage to crawl.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param insert_externals [Boolean] Whether or not to insert the website's
-  #   external Url's into the database.
-  # @yield [doc] Given the Wgit::Document of the crawled webpage, before it's
-  #   inserted into the database allowing for prior manipulation.
-  def self.index_page(
-    url, connection_string: nil, insert_externals: true, &block
-  )
-    url = Wgit::Url.parse(url)
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_page(url, insert_externals: insert_externals, &block)
-  end
-
-  # Performs a search of the database's indexed documents and pretty prints
-  # the results. See Wgit::Database#search and Wgit::Document#search for
-  # details of how the search works.
-  #
-  # @param query [String] The text query to search with.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param case_sensitive [Boolean] Whether character case must match.
-  # @param whole_sentence [Boolean] Whether multiple words should be searched
-  #   for separately.
-  # @param limit [Integer] The max number of results to print.
-  # @param skip [Integer] The number of DB records to skip.
-  # @param sentence_limit [Integer] The max length of each result's text
-  #   snippet.
-  # @yield [doc] Given each search result (Wgit::Document) returned from the
-  #   database.
-  def self.indexed_search(
-    query, connection_string: nil,
-    case_sensitive: false, whole_sentence: true,
-    limit: 10, skip: 0, sentence_limit: 80, &block
-  )
-    db = Wgit::Database.new(connection_string)
-
-    results = db.search(
-      query,
-      case_sensitive: case_sensitive,
-      whole_sentence: whole_sentence,
-      limit: limit,
-      skip: skip,
-      &block
-    )
-
-    results.each do |doc|
-      doc.search!(
-        query,
-        case_sensitive: case_sensitive,
-        whole_sentence: whole_sentence,
-        sentence_limit: sentence_limit
-      )
-    end
-
-    Wgit::Utils.printf_search_results(results)
-  end
-
-  # Class which crawls and saves the indexed Documents to a database.
+  # Class which crawls and saves the Documents to a database. Can be thought of
+  # as a combination of Wgit::Crawler and Wgit::Database.
   class Indexer
     # The crawler used to index the WWW.
     attr_reader :crawler
@@ -139,7 +18,7 @@ module Wgit
     # @param database [Wgit::Database] The database instance (already
     #   initialized and connected) used to index.
     # @param crawler [Wgit::Crawler] The crawler instance used to index.
-    def initialize(database, crawler = Wgit::Crawler.new)
+    def initialize(database = Wgit::Database.new, crawler = Wgit::Crawler.new)
       @db = database
       @crawler = crawler
     end
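
One practical effect of the `initialize` change: `Wgit::Indexer.new` can now be called with no arguments. A small sketch, assuming `ENV['WGIT_CONNECTION_STRING']` is set so the defaulted `Wgit::Database.new` can connect:

```ruby
require 'wgit'

# Both forms are equivalent when the env var supplies the connection string.
indexer = Wgit::Indexer.new
indexer = Wgit::Indexer.new(Wgit::Database.new, Wgit::Crawler.new)
```
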
@@ -189,7 +68,8 @@ database capacity, exiting.")
 
          site_docs_count = 0
          ext_links = @crawler.crawl_site(url) do |doc|
-            if !doc.empty? && write_doc_to_db(doc)
+            unless doc.empty?
+              write_doc_to_db(doc)
              docs_count += 1
              site_docs_count += 1
            end
@@ -198,12 +78,9 @@ database capacity, exiting.")
          raise 'Error updating url' unless @db.update(url) == 1
 
          urls_count += write_urls_to_db(ext_links)
-
-          Wgit.logger.info("Crawled and saved #{site_docs_count} docs for the \
-site: #{url}")
        end
 
-        Wgit.logger.info("Crawled and saved docs for #{docs_count} url(s) \
+        Wgit.logger.info("Crawled and indexed docs for #{docs_count} url(s) \
 overall for this iteration.")
        Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
 the next iteration.")
@@ -219,66 +96,91 @@ the next iteration.")
    # @param url [Wgit::Url] The base Url of the website to crawl.
    # @param insert_externals [Boolean] Whether or not to insert the website's
    #   external Url's into the database.
-    # @param allow_paths [String, Array<String>] Filters links by selecting
-    #   them if their path `File.fnmatch?` one of allow_paths.
-    # @param disallow_paths [String, Array<String>] Filters links by rejecting
-    #   them if their path `File.fnmatch?` one of disallow_paths.
+    # @param follow [String] The xpath extracting links to be followed during
+    #   the crawl. This changes how a site is crawled. Only links pointing to
+    #   the site domain are allowed. The `:default` is any `<a>` href returning
+    #   HTML.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    #   selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
    # @yield [doc] Given the Wgit::Document of each crawled web page before
    #   it's inserted into the database allowing for prior manipulation. Return
    #   nil or false from the block to prevent the document from being saved
    #   into the database.
    # @return [Integer] The total number of webpages/documents indexed.
    def index_site(
-      url, insert_externals: true, allow_paths: nil, disallow_paths: nil
+      url, insert_externals: false, follow: :default,
+      allow_paths: nil, disallow_paths: nil
    )
-      crawl_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+      crawl_opts = {
+        follow: follow,
+        allow_paths: allow_paths,
+        disallow_paths: disallow_paths
+      }
      total_pages_indexed = 0
 
-      ext_urls = @crawler.crawl_site(url, crawl_opts) do |doc|
-        result = true
-        result = yield(doc) if block_given?
+      ext_urls = @crawler.crawl_site(url, **crawl_opts) do |doc|
+        result = block_given? ? yield(doc) : true
 
-        if result && !doc.empty? && write_doc_to_db(doc)
+        if result && !doc.empty?
+          write_doc_to_db(doc)
          total_pages_indexed += 1
-          Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
        end
      end
 
-      @db.url?(url) ? @db.update(url) : @db.insert(url)
+      @db.upsert(url)
 
      if insert_externals && ext_urls
        num_inserted_urls = write_urls_to_db(ext_urls)
        Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
      end
 
-      Wgit.logger.info("Crawled and saved #{total_pages_indexed} docs for the \
-site: #{url}")
+      Wgit.logger.info("Crawled and indexed #{total_pages_indexed} docs for \
+the site: #{url}")
 
      total_pages_indexed
    end
 
+    # Crawls one or more webpages and stores them into the database.
+    # There is no max download limit so be careful of large pages.
+    # Logs info on the crawl using Wgit.logger as it goes along.
+    #
+    # @param urls [*Wgit::Url] The webpage Url's to crawl.
+    # @param insert_externals [Boolean] Whether or not to insert the webpages
+    #   external Url's into the database.
+    # @yield [doc] Given the Wgit::Document of the crawled webpage,
+    #   before it's inserted into the database allowing for prior
+    #   manipulation. Return nil or false from the block to prevent the
+    #   document from being saved into the database.
+    # @raise [StandardError] if no urls are provided.
+    def index_urls(*urls, insert_externals: false, &block)
+      raise 'You must provide at least one Url' if urls.empty?
+
+      opts = { insert_externals: insert_externals }
+      Wgit::Utils.each(urls) { |url| index_url(url, **opts, &block) }
+
+      nil
+    end
+
    # Crawls a single webpage and stores it into the database.
    # There is no max download limit so be careful of large pages.
    # Logs info on the crawl using Wgit.logger as it goes along.
    #
    # @param url [Wgit::Url] The webpage Url to crawl.
-    # @param insert_externals [Boolean] Whether or not to insert the webpage's
+    # @param insert_externals [Boolean] Whether or not to insert the webpages
    #   external Url's into the database.
    # @yield [doc] Given the Wgit::Document of the crawled webpage,
    #   before it's inserted into the database allowing for prior
    #   manipulation. Return nil or false from the block to prevent the
    #   document from being saved into the database.
-    def index_page(url, insert_externals: true)
+    def index_url(url, insert_externals: false)
      document = @crawler.crawl_url(url) do |doc|
-        result = true
-        result = yield(doc) if block_given?
-
-        if result && !doc.empty? && write_doc_to_db(doc)
-          Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
-        end
+        result = block_given? ? yield(doc) : true
+        write_doc_to_db(doc) if result && !doc.empty?
      end
 
-      @db.url?(url) ? @db.update(url) : @db.insert(url)
+      @db.upsert(url)
 
      ext_urls = document&.external_links
      if insert_externals && ext_urls
@@ -311,23 +213,19 @@ site: #{url}")
    # collection deliberately prevents duplicate inserts.
    #
    # @param doc [Wgit::Document] The document to write to the DB.
-    # @return [Boolean] True if the write was successful, false otherwise.
    def write_doc_to_db(doc)
-      @db.insert(doc)
-      Wgit.logger.info("Saved document for url: #{doc.url}")
-
-      true
-    rescue Mongo::Error::OperationFailure
-      Wgit.logger.info("Document already exists: #{doc.url}")
-
-      false
+      if @db.upsert(doc)
+        Wgit.logger.info("Saved document for url: #{doc.url}")
+      else
+        Wgit.logger.info("Updated document for url: #{doc.url}")
+      end
    end
 
    # Write the urls to the DB. Note that the unique url index on the urls
    # collection deliberately prevents duplicate inserts.
    #
    # @param urls [Array<Wgit::Url>] The urls to write to the DB.
-    # @return [Boolean] True if the write was successful, false otherwise.
+    # @return [Integer] The number of inserted urls.
    def write_urls_to_db(urls)
      count = 0
 
@@ -341,6 +239,7 @@ site: #{url}")
 
        @db.insert(url)
        count += 1
+
        Wgit.logger.info("Inserted external url: #{url}")
      rescue Mongo::Error::OperationFailure
        Wgit.logger.info("External url already exists: #{url}")
@@ -348,5 +247,9 @@ site: #{url}")
 
      count
    end
+
+    alias database db
+    alias index index_urls
+    alias index_r index_site
  end
end
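
To tie the indexer changes together, a small usage sketch against 0.9.0 follows, assuming `ENV['WGIT_CONNECTION_STRING']` points at a reachable MongoDB instance (the example.com/example.org URLs are placeholders):

```ruby
require 'wgit'

indexer = Wgit::Indexer.new # Database and Crawler now default.

# index_page has been renamed to index_url, and the new index_urls
# (aliased as #index) accepts several pages at once; insert_externals
# now defaults to false, and document saves are upserts rather than
# insert-or-fail.
indexer.index_url(Wgit::Url.parse('https://example.com'))
indexer.index_urls(
  Wgit::Url.parse('https://example.com/about'),
  Wgit::Url.parse('https://example.org')
) { |doc| !doc.empty? } # Return nil/false from the block to skip saving.
```
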