wgit 0.8.0 → 0.9.0

@@ -1,19 +1,19 @@
  # frozen_string_literal: true

- ### Default Document Extensions ###
+ ### Default Document Extractors ###

  # Base.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
  :base,
  '//base/@href',
  singleton: true,
  text_content_only: true
  ) do |base|
- Wgit::Url.parse_or_nil(base) if base
+ Wgit::Url.parse?(base) if base
  end

  # Title.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
  :title,
  '//title',
  singleton: true,
@@ -21,7 +21,7 @@ Wgit::Document.define_extension(
  )

  # Description.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
  :description,
  '//meta[@name="description"]/@content',
  singleton: true,
@@ -29,7 +29,7 @@ Wgit::Document.define_extension(
  )

  # Author.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
  :author,
  '//meta[@name="author"]/@content',
  singleton: true,
@@ -37,7 +37,7 @@ Wgit::Document.define_extension(
  )

  # Keywords.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
  :keywords,
  '//meta[@name="keywords"]/@content',
  singleton: true,
@@ -45,25 +45,25 @@ Wgit::Document.define_extension(
  ) do |keywords, _source, type|
  if keywords && (type == :document)
  keywords = keywords.split(',')
- Wgit::Utils.process_arr(keywords)
+ Wgit::Utils.sanitize(keywords)
  end
  keywords
  end

  # Links.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
  :links,
  '//a/@href',
  singleton: false,
  text_content_only: true
  ) do |links|
  links
- .map { |link| Wgit::Url.parse_or_nil(link) }
+ .map { |link| Wgit::Url.parse?(link) }
  .compact # Remove unparsable links.
  end

  # Text.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
  :text,
  proc { Wgit::Document.text_elements_xpath },
  singleton: false,
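
The hunks above carry the headline rename of this release: `Wgit::Document.define_extension` becomes `define_extractor`, `Wgit::Url.parse_or_nil` becomes `Wgit::Url.parse?` and `Wgit::Utils.process_arr` becomes `Wgit::Utils.sanitize`. A minimal migration sketch follows; the `:stylesheets` name and its xpath are made up for illustration, only the method names and signature come from the diff.

  require 'wgit'

  # 0.8.0: Wgit::Document.define_extension(...)
  # 0.9.0: same signature, renamed method.
  Wgit::Document.define_extractor(
    :stylesheets,                      # hypothetical extractor name
    '//link[@rel="stylesheet"]/@href', # hypothetical xpath
    singleton: false,
    text_content_only: true
  ) do |hrefs|
    # parse? returns nil for unparsable URLs, so compact the result.
    hrefs.map { |href| Wgit::Url.parse?(href) }.compact
  end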
@@ -0,0 +1,324 @@
+ module Wgit
+ # DSL methods that act as a wrapper around Wgit's underlying class methods.
+ # All instance vars/constants are prefixed to avoid conflicts when included.
+ module DSL
+ # Error message shown when there's no URL to crawl.
+ DSL_ERROR__NO_START_URL = "missing url, pass as parameter to this or \
+ the 'start' function".freeze
+
+ ### CRAWLER METHODS ###
+
+ # Defines an extractor using `Wgit::Document.define_extractor` underneath.
+ #
+ # @param var [Symbol] The name of the variable to be initialised, that will
+ # contain the extracted content.
+ # @param xpath [String, #call] The xpath used to find the element(s)
+ # of the webpage. Only used when initializing from HTML.
+ #
+ # Pass a callable object (proc etc.) if you want the
+ # xpath value to be derived on Document initialisation (instead of when
+ # the extractor is defined). The call method must return a valid xpath
+ # String.
+ # @param opts [Hash] The options to define an extractor with. The
+ # options are only used when intializing from HTML, not the database.
+ # @option opts [Boolean] :singleton The singleton option determines
+ # whether or not the result(s) should be in an Array. If multiple
+ # results are found and singleton is true then the first result will be
+ # used. Defaults to true.
+ # @option opts [Boolean] :text_content_only The text_content_only option
+ # if true will use the text content of the Nokogiri result object,
+ # otherwise the Nokogiri object itself is returned. Defaults to true.
+ # @yield The block is executed when a Wgit::Document is initialized,
+ # regardless of the source. Use it (optionally) to process the result
+ # value.
+ # @yieldparam value [Object] The result value to be assigned to the new
+ # `var`.
+ # @yieldparam source [Wgit::Document, Object] The source of the `value`.
+ # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
+ # `:object`.
+ # @yieldreturn [Object] The return value of the block becomes the new var's
+ # value. Return the block's value param unchanged if you want to inspect.
+ # @raise [StandardError] If the var param isn't valid.
+ # @return [Symbol] The given var Symbol if successful.
+ def extract(var, xpath, opts = {}, &block)
+ Wgit::Document.define_extractor(var, xpath, opts, &block)
+ end
+
+ # Initializes a `Wgit::Crawler`. This crawler is then used in all crawl and
+ # index methods used by the DSL. See the Wgit::Crawler documentation for
+ # more details.
+ #
+ # @yield [crawler] The created crawler; use the block to configure.
+ # @return [Wgit::Crawler] The created crawler used by the DSL.
+ def crawler
+ @dsl_crawler ||= Wgit::Crawler.new
+ yield @dsl_crawler if block_given?
+ @dsl_crawler
+ end
+
+ # Sets the URL to be crawled when a `crawl*` or `index*` method is
+ # subsequently called. Calling this is optional as the URL can be
+ # passed to the method instead. You can also omit the url param and just
+ # use the block to configure the crawler instead.
+ #
+ # @param urls [*String, *Wgit::Url] The URL(s) to crawl
+ # or nil (if only using the block to configure the crawler).
+ # @yield [crawler] The crawler that'll be used in the subsequent
+ # crawl/index; use the block to configure.
+ def start(*urls, &block)
+ crawler(&block)
+ @dsl_start = urls
+ end
+
+ # Sets the xpath to be followed when `crawl_site` or `index_site` is
+ # subsequently called. Calling this method is optional as the default is to
+ # follow all `<a>` href's that point to the site domain. You can also pass
+ # `follow:` to the crawl/index methods directly.
+ #
+ # @param xpath [String] The xpath which is followed when crawling/indexing
+ # a site. Use `:default` to restore the default follow logic.
+ def follow(xpath)
+ @dsl_follow = xpath
+ end
+
+ # Crawls one or more individual urls using `Wgit::Crawler#crawl_url`
+ # underneath. If no urls are provided, then the `start` URL is used.
+ #
+ # @param urls [*Wgit::Url] The URL's to crawl. Defaults to the `start`
+ # URL(s).
+ # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+ # redirects. Pass a Symbol to limit where the redirect is allowed to go
+ # e.g. :host only allows redirects within the same host. Choose from
+ # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+ # This value will be used for all urls crawled.
+ # @yield [doc] Given each crawled page (Wgit::Document); this is the only
+ # way to interact with them.
+ # @raise [StandardError] If no urls are provided and no `start` URL has
+ # been set.
+ # @return [Wgit::Document] The last Document crawled.
+ def crawl(*urls, follow_redirects: true, &block)
+ urls = (@dsl_start || []) if urls.empty?
+ raise DSL_ERROR__NO_START_URL if urls.empty?
+
+ urls.map! { |url| Wgit::Url.parse(url) }
+ crawler.crawl_urls(*urls, follow_redirects: follow_redirects, &block)
+ end
+
+ # Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no
+ # url is provided, then the first `start` URL is used.
+ #
+ # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to be
+ # crawled. It is recommended that this URL be the index page of the site
+ # to give a greater chance of finding all pages within that site/host.
+ # Defaults to the `start` URLs.
+ # @param follow [String] The xpath extracting links to be followed during
+ # the crawl. This changes how a site is crawled. Only links pointing to
+ # the site domain are allowed. The `:default` is any `<a>` href returning
+ # HTML. This can also be set using `follow`.
+ # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+ # selecting them if their path `File.fnmatch?` one of allow_paths.
+ # @param disallow_paths [String, Array<String>] Filters the `follow` links
+ # by rejecting them if their path `File.fnmatch?` one of disallow_paths.
+ # @yield [doc] Given each crawled page (Wgit::Document) of the site.
+ # A block is the only way to interact with each crawled Document.
+ # Use `doc.empty?` to determine if the page is valid.
+ # @raise [StandardError] If no url is provided and no `start` URL has been
+ # set.
+ # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
+ # from all of the site's pages or nil if the given url could not be
+ # crawled successfully.
+ def crawl_site(
+ *urls, follow: @dsl_follow,
+ allow_paths: nil, disallow_paths: nil, &block
+ )
+ urls = (@dsl_start || []) if urls.empty?
+ raise DSL_ERROR__NO_START_URL if urls.empty?
+
+ xpath = follow || :default
+ opts = {
+ follow: xpath, allow_paths: allow_paths, disallow_paths: disallow_paths
+ }
+
+ urls.reduce([]) do |externals, url|
+ externals + crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
+ end
+ end
+
+ # Returns the DSL's `crawler#last_response`.
+ #
+ # @return [Wgit::Response] The response from the last URL crawled.
+ def last_response
+ crawler.last_response
+ end
+
+ # Nilifies the DSL instance variables.
+ def reset
+ @dsl_crawler = nil
+ @dsl_start = nil
+ @dsl_follow = nil
+ @dsl_conn_str = nil
+ end
+
+ ### INDEXER METHODS ###
+
+ # Defines the connection string to the database used in subsequent `index*`
+ # method calls. This method is optional as the connection string can be
+ # passed to the index method instead.
+ #
+ # @param conn_str [String] The connection string used to connect to the
+ # database in subsequent `index*` method calls.
+ def connection_string(conn_str)
+ @dsl_conn_str = conn_str
+ end
+
+ # Indexes the World Wide Web using `Wgit::Indexer#index_www` underneath.
+ #
+ # @param connection_string [String] The database connection string. Set as
+ # nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+ # `connection_string`.
+ # @param max_sites [Integer] The number of separate and whole
+ # websites to be crawled before the method exits. Defaults to -1 which
+ # means the crawl will occur until manually stopped (Ctrl+C etc).
+ # @param max_data [Integer] The maximum amount of bytes that will be
+ # scraped from the web (default is 1GB). Note, that this value is used to
+ # determine when to stop crawling; it's not a guarantee of the max data
+ # that will be obtained.
+ def index_www(
+ connection_string: @dsl_conn_str, max_sites: -1, max_data: 1_048_576_000
+ )
+ db = Wgit::Database.new(connection_string)
+ indexer = Wgit::Indexer.new(db, crawler)
+
+ indexer.index_www(max_sites: max_sites, max_data: max_data)
+ end
+
+ # Indexes a single website using `Wgit::Indexer#index_site` underneath.
+ #
+ # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to
+ # crawl. Can be set using `start`.
+ # @param connection_string [String] The database connection string. Set as
+ # nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+ # `connection_string`.
+ # @param insert_externals [Boolean] Whether or not to insert the website's
+ # external URL's into the database.
+ # @param follow [String] The xpath extracting links to be followed during
+ # the crawl. This changes how a site is crawled. Only links pointing to
+ # the site domain are allowed. The `:default` is any `<a>` href returning
+ # HTML. This can also be set using `follow`.
+ # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+ # selecting them if their path `File.fnmatch?` one of allow_paths.
+ # @param disallow_paths [String, Array<String>] Filters the `follow` links
+ # by rejecting them if their path `File.fnmatch?` one of disallow_paths.
+ # @yield [doc] Given the Wgit::Document of each crawled webpage, before it
+ # is inserted into the database allowing for prior manipulation.
+ # @raise [StandardError] If no url is provided and no `start` URL has been
+ # set.
+ # @return [Integer] The total number of pages crawled within the website.
+ def index_site(
+ *urls, connection_string: @dsl_conn_str,
+ insert_externals: false, follow: @dsl_follow,
+ allow_paths: nil, disallow_paths: nil, &block
+ )
+ urls = (@dsl_start || []) if urls.empty?
+ raise DSL_ERROR__NO_START_URL if urls.empty?
+
+ db = Wgit::Database.new(connection_string)
+ indexer = Wgit::Indexer.new(db, crawler)
+ xpath = follow || :default
+ crawl_opts = {
+ insert_externals: insert_externals, follow: xpath,
+ allow_paths: allow_paths, disallow_paths: disallow_paths
+ }
+
+ urls.reduce(0) do |total, url|
+ total + indexer.index_site(Wgit::Url.parse(url), **crawl_opts, &block)
+ end
+ end
+
+ # Indexes a single webpage using `Wgit::Indexer#index_url` underneath.
+ #
+ # @param urls [*Wgit::Url] The webpage URL's to crawl. Defaults to the
+ # `start` URL(s).
+ # @param connection_string [String] The database connection string. Set as
+ # nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+ # `connection_string`.
+ # @param insert_externals [Boolean] Whether or not to insert the website's
+ # external URL's into the database.
+ # @yield [doc] Given the Wgit::Document of the crawled webpage,
+ # before it's inserted into the database allowing for prior
+ # manipulation. Return nil or false from the block to prevent the
+ # document from being saved into the database.
+ # @raise [StandardError] If no urls are provided and no `start` URL has
+ # been set.
+ def index(
+ *urls, connection_string: @dsl_conn_str,
+ insert_externals: false, &block
+ )
+ urls = (@dsl_start || []) if urls.empty?
+ raise DSL_ERROR__NO_START_URL if urls.empty?
+
+ db = Wgit::Database.new(connection_string)
+ indexer = Wgit::Indexer.new(db, crawler)
+
+ urls.map! { |url| Wgit::Url.parse(url) }
+ indexer.index_urls(*urls, insert_externals: insert_externals, &block)
+ end
+
+ # Performs a search of the database's indexed documents and pretty prints
+ # the results in a search engine-esque format. See `Wgit::Database#search!`
+ # and `Wgit::Document#search!` for details of how the search works.
+ #
+ # @param query [String] The text query to search with.
+ # @param connection_string [String] The database connection string. Set as
+ # nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+ # `connection_string`.
+ # @param stream [nil, #puts] Any object that respond_to?(:puts). It is used
+ # to output text somewhere e.g. a file or STDERR. Use nil for no output.
+ # @param case_sensitive [Boolean] Whether character case must match.
+ # @param whole_sentence [Boolean] Whether multiple words should be searched
+ # for separately.
+ # @param limit [Integer] The max number of results to print.
+ # @param skip [Integer] The number of DB records to skip.
+ # @param sentence_limit [Integer] The max length of each result's text
+ # snippet.
+ # @yield [doc] Given each search result (Wgit::Document) returned from the
+ # database containing only its matching `#text`.
+ # @return [Array<Wgit::Document>] The search results with matching text.
+ def search(
+ query, connection_string: @dsl_conn_str, stream: STDOUT,
+ case_sensitive: false, whole_sentence: true,
+ limit: 10, skip: 0, sentence_limit: 80, &block
+ )
+ stream ||= File.open(File::NULL, 'w')
+ db = Wgit::Database.new(connection_string)
+
+ results = db.search!(
+ query,
+ case_sensitive: case_sensitive,
+ whole_sentence: whole_sentence,
+ limit: limit,
+ skip: skip,
+ sentence_limit: sentence_limit,
+ &block
+ )
+
+ Wgit::Utils.printf_search_results(results, stream: stream)
+
+ results
+ end
+
+ # Deletes everything in the urls and documents collections by calling
+ # `Wgit::Database#clear_db` underneath. This will nuke the entire database
+ # so yeah... be careful.
+ #
+ # @return [Integer] The number of deleted records.
+ def clear_db!(connection_string: @dsl_conn_str)
+ db = Wgit::Database.new(connection_string)
+ db.clear_db
+ end
+
+ alias crawl_r crawl_site
+ alias index_r index_site
+ alias start_urls start
+ end
+ end
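
`Wgit::DSL` is a brand new module in 0.9.0 and is plain Ruby, so it can be mixed into any object (its instance variables are `@dsl_`-prefixed to avoid clashes, as the header comment notes). A rough usage sketch follows, assuming the gem is installed; the URL and the `:h1_headings` extractor are placeholders for illustration, not part of the diff.

  require 'wgit'

  extend Wgit::DSL

  # Configure once, then reuse across subsequent crawl/index calls.
  start 'https://example.com'                  # placeholder URL
  extract :h1_headings, '//h1', singleton: false

  # Crawl the whole site, following the default <a> hrefs.
  crawl_site do |doc|
    puts doc.h1_headings unless doc.empty?
  end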
@@ -4,129 +4,8 @@ require_relative 'crawler'
  require_relative 'database/database'

  module Wgit
- # Convience method to index the World Wide Web using
- # Wgit::Indexer#index_www.
- #
- # Retrieves uncrawled url's from the database and recursively crawls each
- # site storing their internal pages into the database and adding their
- # external url's to be crawled later on. Logs info on the crawl
- # using Wgit.logger as it goes along.
- #
- # @param connection_string [String] The database connection string. Set as
- # nil to use ENV['WGIT_CONNECTION_STRING'].
- # @param max_sites [Integer] The number of separate and whole
- # websites to be crawled before the method exits. Defaults to -1 which
- # means the crawl will occur until manually stopped (Ctrl+C etc).
- # @param max_data [Integer] The maximum amount of bytes that will be
- # scraped from the web (default is 1GB). Note, that this value is used to
- # determine when to stop crawling; it's not a guarantee of the max data
- # that will be obtained.
- def self.index_www(
- connection_string: nil, max_sites: -1, max_data: 1_048_576_000
- )
- db = Wgit::Database.new(connection_string)
- indexer = Wgit::Indexer.new(db)
- indexer.index_www(max_sites: max_sites, max_data: max_data)
- end
-
- # Convience method to index a single website using
- # Wgit::Indexer#index_site.
- #
- # Crawls a single website's pages and stores them into the database.
- # There is no max download limit so be careful which sites you index.
- #
- # @param url [Wgit::Url, String] The base Url of the website to crawl.
- # @param connection_string [String] The database connection string. Set as
- # nil to use ENV['WGIT_CONNECTION_STRING'].
- # @param insert_externals [Boolean] Whether or not to insert the website's
- # external Url's into the database.
- # @param allow_paths [String, Array<String>] Filters links by selecting
- # them if their path `File.fnmatch?` one of allow_paths.
- # @param disallow_paths [String, Array<String>] Filters links by rejecting
- # them if their path `File.fnmatch?` one of disallow_paths.
- # @yield [doc] Given the Wgit::Document of each crawled webpage, before it's
- # inserted into the database allowing for prior manipulation.
- # @return [Integer] The total number of pages crawled within the website.
- def self.index_site(
- url, connection_string: nil, insert_externals: true,
- allow_paths: nil, disallow_paths: nil, &block
- )
- url = Wgit::Url.parse(url)
- db = Wgit::Database.new(connection_string)
- indexer = Wgit::Indexer.new(db)
- indexer.index_site(
- url, insert_externals: insert_externals,
- allow_paths: allow_paths, disallow_paths: disallow_paths, &block
- )
- end
-
- # Convience method to index a single webpage using
- # Wgit::Indexer#index_page.
- #
- # Crawls a single webpage and stores it into the database.
- # There is no max download limit so be careful of large pages.
- #
- # @param url [Wgit::Url, String] The Url of the webpage to crawl.
- # @param connection_string [String] The database connection string. Set as
- # nil to use ENV['WGIT_CONNECTION_STRING'].
- # @param insert_externals [Boolean] Whether or not to insert the website's
- # external Url's into the database.
- # @yield [doc] Given the Wgit::Document of the crawled webpage, before it's
- # inserted into the database allowing for prior manipulation.
- def self.index_page(
- url, connection_string: nil, insert_externals: true, &block
- )
- url = Wgit::Url.parse(url)
- db = Wgit::Database.new(connection_string)
- indexer = Wgit::Indexer.new(db)
- indexer.index_page(url, insert_externals: insert_externals, &block)
- end
-
- # Performs a search of the database's indexed documents and pretty prints
- # the results. See Wgit::Database#search and Wgit::Document#search for
- # details of how the search works.
- #
- # @param query [String] The text query to search with.
- # @param connection_string [String] The database connection string. Set as
- # nil to use ENV['WGIT_CONNECTION_STRING'].
- # @param case_sensitive [Boolean] Whether character case must match.
- # @param whole_sentence [Boolean] Whether multiple words should be searched
- # for separately.
- # @param limit [Integer] The max number of results to print.
- # @param skip [Integer] The number of DB records to skip.
- # @param sentence_limit [Integer] The max length of each result's text
- # snippet.
- # @yield [doc] Given each search result (Wgit::Document) returned from the
- # database.
- def self.indexed_search(
- query, connection_string: nil,
- case_sensitive: false, whole_sentence: true,
- limit: 10, skip: 0, sentence_limit: 80, &block
- )
- db = Wgit::Database.new(connection_string)
-
- results = db.search(
- query,
- case_sensitive: case_sensitive,
- whole_sentence: whole_sentence,
- limit: limit,
- skip: skip,
- &block
- )
-
- results.each do |doc|
- doc.search!(
- query,
- case_sensitive: case_sensitive,
- whole_sentence: whole_sentence,
- sentence_limit: sentence_limit
- )
- end
-
- Wgit::Utils.printf_search_results(results)
- end
-
- # Class which crawls and saves the indexed Documents to a database.
+ # Class which crawls and saves the Documents to a database. Can be thought of
+ # as a combination of Wgit::Crawler and Wgit::Database.
  class Indexer
  # The crawler used to index the WWW.
  attr_reader :crawler
@@ -139,7 +18,7 @@ module Wgit
  # @param database [Wgit::Database] The database instance (already
  # initialized and connected) used to index.
  # @param crawler [Wgit::Crawler] The crawler instance used to index.
- def initialize(database, crawler = Wgit::Crawler.new)
+ def initialize(database = Wgit::Database.new, crawler = Wgit::Crawler.new)
  @db = database
  @crawler = crawler
  end
@@ -189,7 +68,8 @@ database capacity, exiting.")

  site_docs_count = 0
  ext_links = @crawler.crawl_site(url) do |doc|
- if !doc.empty? && write_doc_to_db(doc)
+ unless doc.empty?
+ write_doc_to_db(doc)
  docs_count += 1
  site_docs_count += 1
  end
@@ -198,12 +78,9 @@ database capacity, exiting.")
  raise 'Error updating url' unless @db.update(url) == 1

  urls_count += write_urls_to_db(ext_links)
-
- Wgit.logger.info("Crawled and saved #{site_docs_count} docs for the \
- site: #{url}")
  end

- Wgit.logger.info("Crawled and saved docs for #{docs_count} url(s) \
+ Wgit.logger.info("Crawled and indexed docs for #{docs_count} url(s) \
  overall for this iteration.")
  Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
  the next iteration.")
@@ -219,66 +96,91 @@ the next iteration.")
  # @param url [Wgit::Url] The base Url of the website to crawl.
  # @param insert_externals [Boolean] Whether or not to insert the website's
  # external Url's into the database.
- # @param allow_paths [String, Array<String>] Filters links by selecting
- # them if their path `File.fnmatch?` one of allow_paths.
- # @param disallow_paths [String, Array<String>] Filters links by rejecting
- # them if their path `File.fnmatch?` one of disallow_paths.
+ # @param follow [String] The xpath extracting links to be followed during
+ # the crawl. This changes how a site is crawled. Only links pointing to
+ # the site domain are allowed. The `:default` is any `<a>` href returning
+ # HTML.
+ # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+ # selecting them if their path `File.fnmatch?` one of allow_paths.
+ # @param disallow_paths [String, Array<String>] Filters the `follow` links
+ # by rejecting them if their path `File.fnmatch?` one of disallow_paths.
  # @yield [doc] Given the Wgit::Document of each crawled web page before
  # it's inserted into the database allowing for prior manipulation. Return
  # nil or false from the block to prevent the document from being saved
  # into the database.
  # @return [Integer] The total number of webpages/documents indexed.
  def index_site(
- url, insert_externals: true, allow_paths: nil, disallow_paths: nil
+ url, insert_externals: false, follow: :default,
+ allow_paths: nil, disallow_paths: nil
  )
- crawl_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+ crawl_opts = {
+ follow: follow,
+ allow_paths: allow_paths,
+ disallow_paths: disallow_paths
+ }
  total_pages_indexed = 0

- ext_urls = @crawler.crawl_site(url, crawl_opts) do |doc|
- result = true
- result = yield(doc) if block_given?
+ ext_urls = @crawler.crawl_site(url, **crawl_opts) do |doc|
+ result = block_given? ? yield(doc) : true

- if result && !doc.empty? && write_doc_to_db(doc)
+ if result && !doc.empty?
+ write_doc_to_db(doc)
  total_pages_indexed += 1
- Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
  end
  end

- @db.url?(url) ? @db.update(url) : @db.insert(url)
+ @db.upsert(url)

  if insert_externals && ext_urls
  num_inserted_urls = write_urls_to_db(ext_urls)
  Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
  end

- Wgit.logger.info("Crawled and saved #{total_pages_indexed} docs for the \
- site: #{url}")
+ Wgit.logger.info("Crawled and indexed #{total_pages_indexed} docs for \
+ the site: #{url}")

  total_pages_indexed
  end

+ # Crawls one or more webpages and stores them into the database.
+ # There is no max download limit so be careful of large pages.
+ # Logs info on the crawl using Wgit.logger as it goes along.
+ #
+ # @param urls [*Wgit::Url] The webpage Url's to crawl.
+ # @param insert_externals [Boolean] Whether or not to insert the webpages
+ # external Url's into the database.
+ # @yield [doc] Given the Wgit::Document of the crawled webpage,
+ # before it's inserted into the database allowing for prior
+ # manipulation. Return nil or false from the block to prevent the
+ # document from being saved into the database.
+ # @raise [StandardError] if no urls are provided.
+ def index_urls(*urls, insert_externals: false, &block)
+ raise 'You must provide at least one Url' if urls.empty?
+
+ opts = { insert_externals: insert_externals }
+ Wgit::Utils.each(urls) { |url| index_url(url, **opts, &block) }
+
+ nil
+ end
+
  # Crawls a single webpage and stores it into the database.
  # There is no max download limit so be careful of large pages.
  # Logs info on the crawl using Wgit.logger as it goes along.
  #
  # @param url [Wgit::Url] The webpage Url to crawl.
- # @param insert_externals [Boolean] Whether or not to insert the webpage's
+ # @param insert_externals [Boolean] Whether or not to insert the webpages
  # external Url's into the database.
  # @yield [doc] Given the Wgit::Document of the crawled webpage,
  # before it's inserted into the database allowing for prior
  # manipulation. Return nil or false from the block to prevent the
  # document from being saved into the database.
- def index_page(url, insert_externals: true)
+ def index_url(url, insert_externals: false)
  document = @crawler.crawl_url(url) do |doc|
- result = true
- result = yield(doc) if block_given?
-
- if result && !doc.empty? && write_doc_to_db(doc)
- Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
- end
+ result = block_given? ? yield(doc) : true
+ write_doc_to_db(doc) if result && !doc.empty?
  end

- @db.url?(url) ? @db.update(url) : @db.insert(url)
+ @db.upsert(url)

  ext_urls = document&.external_links
  if insert_externals && ext_urls
@@ -311,23 +213,19 @@ site: #{url}")
  # collection deliberately prevents duplicate inserts.
  #
  # @param doc [Wgit::Document] The document to write to the DB.
- # @return [Boolean] True if the write was successful, false otherwise.
  def write_doc_to_db(doc)
- @db.insert(doc)
- Wgit.logger.info("Saved document for url: #{doc.url}")
-
- true
- rescue Mongo::Error::OperationFailure
- Wgit.logger.info("Document already exists: #{doc.url}")
-
- false
+ if @db.upsert(doc)
+ Wgit.logger.info("Saved document for url: #{doc.url}")
+ else
+ Wgit.logger.info("Updated document for url: #{doc.url}")
+ end
  end

  # Write the urls to the DB. Note that the unique url index on the urls
  # collection deliberately prevents duplicate inserts.
  #
  # @param urls [Array<Wgit::Url>] The urls to write to the DB.
- # @return [Boolean] True if the write was successful, false otherwise.
+ # @return [Integer] The number of inserted urls.
  def write_urls_to_db(urls)
  count = 0

@@ -341,6 +239,7 @@ site: #{url}")

  @db.insert(url)
  count += 1
+
  Wgit.logger.info("Inserted external url: #{url}")
  rescue Mongo::Error::OperationFailure
  Wgit.logger.info("External url already exists: #{url}")
@@ -348,5 +247,9 @@ site: #{url}")

  count
  end
+
+ alias database db
+ alias index index_urls
+ alias index_r index_site
  end
  end
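
The Indexer's public API also shifts in this release: `index_page` becomes `index_url`, the new `index_urls` handles several pages in one call, `insert_externals` now defaults to false, and document/url writes go through `Database#upsert` instead of insert/update pairs. A rough upgrade sketch, assuming ENV['WGIT_CONNECTION_STRING'] is set; the URLs are placeholders.

  require 'wgit'

  # 0.9.0: the database param defaults to Wgit::Database.new, which falls
  # back to ENV['WGIT_CONNECTION_STRING'] when no connection string is given.
  indexer = Wgit::Indexer.new

  # 0.8.0: indexer.index_page(url, insert_externals: true)
  # 0.9.0: index_url/index_urls, with externals off by default.
  indexer.index_urls('https://example.com', 'https://example.org') do |doc|
    # Return nil/false from the block to skip saving a page.
    !doc.empty?
  end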