wgit 0.5.1 → 0.10.0

@@ -1,27 +1,35 @@
  # frozen_string_literal: true

- ### Default Document Extensions ###
+ ### Default Document Extractors ###

  # Base.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
  :base,
  '//base/@href',
  singleton: true,
  text_content_only: true
  ) do |base|
- Wgit::Url.new(base) if base
+ Wgit::Url.parse?(base) if base
  end

  # Title.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
  :title,
  '//title',
  singleton: true,
  text_content_only: true
  )

+ # Description.
+ Wgit::Document.define_extractor(
+ :description,
+ '//meta[@name="description"]/@content',
+ singleton: true,
+ text_content_only: true
+ )
+
  # Author.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
  :author,
  '//meta[@name="author"]/@content',
  singleton: true,
@@ -29,7 +37,7 @@ Wgit::Document.define_extension(
  )

  # Keywords.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
  :keywords,
  '//meta[@name="keywords"]/@content',
  singleton: true,
@@ -37,23 +45,25 @@ Wgit::Document.define_extension(
  ) do |keywords, _source, type|
  if keywords && (type == :document)
  keywords = keywords.split(',')
- Wgit::Utils.process_arr(keywords)
+ Wgit::Utils.sanitize(keywords)
  end
  keywords
  end

  # Links.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
  :links,
  '//a/@href',
  singleton: false,
  text_content_only: true
  ) do |links|
- links.map! { |link| Wgit::Url.new(link) }
+ links
+ .map { |link| Wgit::Url.parse?(link) }
+ .compact # Remove unparsable links.
  end

  # Text.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
  :text,
  proc { Wgit::Document.text_elements_xpath },
  singleton: false,
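
The renamed extractor API keeps the same shape as the old define_extension, so custom extractors only need the method name updated. A minimal sketch of defining your own extractor against the 0.10.0 API shown above; the :og_title name, xpath and sample HTML are illustrative, not part of the gem:

require 'wgit'

# Extract an Open Graph title into doc.og_title (hypothetical field).
Wgit::Document.define_extractor(
  :og_title,
  '//meta[@property="og:title"]/@content',
  singleton: true,          # first match only, not an Array
  text_content_only: true   # the attribute's text, not the Nokogiri node
) do |value|
  value&.strip # optional post-processing, mirroring the blocks above
end

doc = Wgit::Document.new(
  Wgit::Url.new('http://example.com'),
  '<html><head><meta property="og:title" content="Hello"></head></html>'
)
doc.og_title # => "Hello"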
data/lib/wgit/dsl.rb ADDED
@@ -0,0 +1,324 @@
+ module Wgit
+ # DSL methods that act as a wrapper around Wgit's underlying class methods.
+ # All instance vars/constants are prefixed to avoid conflicts when included.
+ module DSL
+ # Error message shown when there's no URL to crawl.
+ DSL_ERROR__NO_START_URL = "missing url, pass as parameter to this or \
+ the 'start' function".freeze
+
+ ### CRAWLER METHODS ###
+
+ # Defines an extractor using `Wgit::Document.define_extractor` underneath.
+ #
+ # @param var [Symbol] The name of the variable to be initialised, that will
+ # contain the extracted content.
+ # @param xpath [String, #call] The xpath used to find the element(s)
+ # of the webpage. Only used when initializing from HTML.
+ #
+ # Pass a callable object (proc etc.) if you want the
+ # xpath value to be derived on Document initialisation (instead of when
+ # the extractor is defined). The call method must return a valid xpath
+ # String.
+ # @param opts [Hash] The options to define an extractor with. The
+ # options are only used when initializing from HTML, not the database.
+ # @option opts [Boolean] :singleton The singleton option determines
+ # whether or not the result(s) should be in an Array. If multiple
+ # results are found and singleton is true then the first result will be
+ # used. Defaults to true.
+ # @option opts [Boolean] :text_content_only The text_content_only option
+ # if true will use the text content of the Nokogiri result object,
+ # otherwise the Nokogiri object itself is returned. Defaults to true.
+ # @yield The block is executed when a Wgit::Document is initialized,
+ # regardless of the source. Use it (optionally) to process the result
+ # value.
+ # @yieldparam value [Object] The result value to be assigned to the new
+ # `var`.
+ # @yieldparam source [Wgit::Document, Object] The source of the `value`.
+ # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
+ # `:object`.
+ # @yieldreturn [Object] The return value of the block becomes the new var's
+ # value. Return the block's value param unchanged if you want to inspect.
+ # @raise [StandardError] If the var param isn't valid.
+ # @return [Symbol] The given var Symbol if successful.
+ def extract(var, xpath, opts = {}, &block)
+ Wgit::Document.define_extractor(var, xpath, opts, &block)
+ end
+
+ # Initializes a `Wgit::Crawler`. This crawler is then used in all crawl and
+ # index methods used by the DSL. See the Wgit::Crawler documentation for
+ # more details.
+ #
+ # @yield [crawler] The created crawler; use the block to configure.
+ # @return [Wgit::Crawler] The created crawler used by the DSL.
+ def crawler
+ @dsl_crawler ||= Wgit::Crawler.new
+ yield @dsl_crawler if block_given?
+ @dsl_crawler
+ end
+
+ # Sets the URL to be crawled when a `crawl*` or `index*` method is
+ # subsequently called. Calling this is optional as the URL can be
+ # passed to the method instead. You can also omit the url param and just
+ # use the block to configure the crawler instead.
+ #
+ # @param urls [*String, *Wgit::Url] The URL(s) to crawl
+ # or nil (if only using the block to configure the crawler).
+ # @yield [crawler] The crawler that'll be used in the subsequent
+ # crawl/index; use the block to configure.
+ def start(*urls, &block)
+ crawler(&block)
+ @dsl_start = urls
+ end
+
+ # Sets the xpath to be followed when `crawl_site` or `index_site` is
+ # subsequently called. Calling this method is optional as the default is to
+ # follow all `<a>` href's that point to the site domain. You can also pass
+ # `follow:` to the crawl/index methods directly.
+ #
+ # @param xpath [String] The xpath which is followed when crawling/indexing
+ # a site. Use `:default` to restore the default follow logic.
+ def follow(xpath)
+ @dsl_follow = xpath
+ end
+
+ # Crawls one or more individual urls using `Wgit::Crawler#crawl_url`
+ # underneath. If no urls are provided, then the `start` URL is used.
+ #
+ # @param urls [*Wgit::Url] The URL's to crawl. Defaults to the `start`
+ # URL(s).
+ # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+ # redirects. Pass a Symbol to limit where the redirect is allowed to go
+ # e.g. :host only allows redirects within the same host. Choose from
+ # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+ # This value will be used for all urls crawled.
+ # @yield [doc] Given each crawled page (Wgit::Document); this is the only
+ # way to interact with them.
+ # @raise [StandardError] If no urls are provided and no `start` URL has
+ # been set.
+ # @return [Wgit::Document] The last Document crawled.
+ def crawl(*urls, follow_redirects: true, &block)
+ urls = (@dsl_start || []) if urls.empty?
+ raise DSL_ERROR__NO_START_URL if urls.empty?
+
+ urls.map! { |url| Wgit::Url.parse(url) }
+ crawler.crawl_urls(*urls, follow_redirects: follow_redirects, &block)
+ end
+
+ # Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no
+ # url is provided, then the first `start` URL is used.
+ #
+ # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to be
+ # crawled. It is recommended that this URL be the index page of the site
+ # to give a greater chance of finding all pages within that site/host.
+ # Defaults to the `start` URLs.
+ # @param follow [String] The xpath extracting links to be followed during
+ # the crawl. This changes how a site is crawled. Only links pointing to
+ # the site domain are allowed. The `:default` is any `<a>` href returning
+ # HTML. This can also be set using `follow`.
+ # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+ # selecting them if their path `File.fnmatch?` one of allow_paths.
+ # @param disallow_paths [String, Array<String>] Filters the `follow` links
+ # by rejecting them if their path `File.fnmatch?` one of disallow_paths.
+ # @yield [doc] Given each crawled page (Wgit::Document) of the site.
+ # A block is the only way to interact with each crawled Document.
+ # Use `doc.empty?` to determine if the page is valid.
+ # @raise [StandardError] If no url is provided and no `start` URL has been
+ # set.
+ # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
+ # from all of the site's pages or nil if the given url could not be
+ # crawled successfully.
+ def crawl_site(
+ *urls, follow: @dsl_follow,
+ allow_paths: nil, disallow_paths: nil, &block
+ )
+ urls = (@dsl_start || []) if urls.empty?
+ raise DSL_ERROR__NO_START_URL if urls.empty?
+
+ xpath = follow || :default
+ opts = {
+ follow: xpath, allow_paths: allow_paths, disallow_paths: disallow_paths
+ }
+
+ urls.reduce([]) do |externals, url|
+ externals + crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
+ end
+ end
+
+ # Returns the DSL's `crawler#last_response`.
+ #
+ # @return [Wgit::Response] The response from the last URL crawled.
+ def last_response
+ crawler.last_response
+ end
+
+ # Nilifies the DSL instance variables.
+ def reset
+ @dsl_crawler = nil
+ @dsl_start = nil
+ @dsl_follow = nil
+ @dsl_conn_str = nil
+ end
+
+ ### INDEXER METHODS ###
+
+ # Defines the connection string to the database used in subsequent `index*`
+ # method calls. This method is optional as the connection string can be
+ # passed to the index method instead.
+ #
+ # @param conn_str [String] The connection string used to connect to the
+ # database in subsequent `index*` method calls.
+ def connection_string(conn_str)
+ @dsl_conn_str = conn_str
+ end
+
+ # Indexes the World Wide Web using `Wgit::Indexer#index_www` underneath.
+ #
+ # @param connection_string [String] The database connection string. Set as
+ # nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+ # `connection_string`.
+ # @param max_sites [Integer] The number of separate and whole
+ # websites to be crawled before the method exits. Defaults to -1 which
+ # means the crawl will occur until manually stopped (Ctrl+C etc).
+ # @param max_data [Integer] The maximum amount of bytes that will be
+ # scraped from the web (default is 1GB). Note, that this value is used to
+ # determine when to stop crawling; it's not a guarantee of the max data
+ # that will be obtained.
+ def index_www(
+ connection_string: @dsl_conn_str, max_sites: -1, max_data: 1_048_576_000
+ )
+ db = Wgit::Database.new(connection_string)
+ indexer = Wgit::Indexer.new(db, crawler)
+
+ indexer.index_www(max_sites: max_sites, max_data: max_data)
+ end
+
+ # Indexes a single website using `Wgit::Indexer#index_site` underneath.
+ #
+ # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to
+ # crawl. Can be set using `start`.
+ # @param connection_string [String] The database connection string. Set as
+ # nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+ # `connection_string`.
+ # @param insert_externals [Boolean] Whether or not to insert the website's
+ # external URL's into the database.
+ # @param follow [String] The xpath extracting links to be followed during
+ # the crawl. This changes how a site is crawled. Only links pointing to
+ # the site domain are allowed. The `:default` is any `<a>` href returning
+ # HTML. This can also be set using `follow`.
+ # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+ # selecting them if their path `File.fnmatch?` one of allow_paths.
+ # @param disallow_paths [String, Array<String>] Filters the `follow` links
+ # by rejecting them if their path `File.fnmatch?` one of disallow_paths.
+ # @yield [doc] Given the Wgit::Document of each crawled webpage, before it
+ # is inserted into the database allowing for prior manipulation.
+ # @raise [StandardError] If no url is provided and no `start` URL has been
+ # set.
+ # @return [Integer] The total number of pages crawled within the website.
+ def index_site(
+ *urls, connection_string: @dsl_conn_str,
+ insert_externals: false, follow: @dsl_follow,
+ allow_paths: nil, disallow_paths: nil, &block
+ )
+ urls = (@dsl_start || []) if urls.empty?
+ raise DSL_ERROR__NO_START_URL if urls.empty?
+
+ db = Wgit::Database.new(connection_string)
+ indexer = Wgit::Indexer.new(db, crawler)
+ xpath = follow || :default
+ crawl_opts = {
+ insert_externals: insert_externals, follow: xpath,
+ allow_paths: allow_paths, disallow_paths: disallow_paths
+ }
+
+ urls.reduce(0) do |total, url|
+ total + indexer.index_site(Wgit::Url.parse(url), **crawl_opts, &block)
+ end
+ end
+
+ # Indexes a single webpage using `Wgit::Indexer#index_url` underneath.
+ #
+ # @param urls [*Wgit::Url] The webpage URL's to crawl. Defaults to the
+ # `start` URL(s).
+ # @param connection_string [String] The database connection string. Set as
+ # nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+ # `connection_string`.
+ # @param insert_externals [Boolean] Whether or not to insert the website's
+ # external URL's into the database.
+ # @yield [doc] Given the Wgit::Document of the crawled webpage,
+ # before it's inserted into the database allowing for prior
+ # manipulation. Return nil or false from the block to prevent the
+ # document from being saved into the database.
+ # @raise [StandardError] If no urls are provided and no `start` URL has
+ # been set.
+ def index(
+ *urls, connection_string: @dsl_conn_str,
+ insert_externals: false, &block
+ )
+ urls = (@dsl_start || []) if urls.empty?
+ raise DSL_ERROR__NO_START_URL if urls.empty?
+
+ db = Wgit::Database.new(connection_string)
+ indexer = Wgit::Indexer.new(db, crawler)
+
+ urls.map! { |url| Wgit::Url.parse(url) }
+ indexer.index_urls(*urls, insert_externals: insert_externals, &block)
+ end
+
+ # Performs a search of the database's indexed documents and pretty prints
+ # the results in a search engine-esque format. See `Wgit::Database#search!`
+ # and `Wgit::Document#search!` for details of how the search works.
+ #
+ # @param query [String] The text query to search with.
+ # @param connection_string [String] The database connection string. Set as
+ # nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+ # `connection_string`.
+ # @param stream [nil, #puts] Any object that respond_to?(:puts). It is used
+ # to output text somewhere e.g. a file or STDERR. Use nil for no output.
+ # @param case_sensitive [Boolean] Whether character case must match.
+ # @param whole_sentence [Boolean] Whether multiple words should be searched
+ # for separately.
+ # @param limit [Integer] The max number of results to print.
+ # @param skip [Integer] The number of DB records to skip.
+ # @param sentence_limit [Integer] The max length of each result's text
+ # snippet.
+ # @yield [doc] Given each search result (Wgit::Document) returned from the
+ # database containing only its matching `#text`.
+ # @return [Array<Wgit::Document>] The search results with matching text.
+ def search(
+ query, connection_string: @dsl_conn_str, stream: STDOUT,
+ case_sensitive: false, whole_sentence: true,
+ limit: 10, skip: 0, sentence_limit: 80, &block
+ )
+ stream ||= File.open(File::NULL, 'w')
+ db = Wgit::Database.new(connection_string)
+
+ results = db.search!(
+ query,
+ case_sensitive: case_sensitive,
+ whole_sentence: whole_sentence,
+ limit: limit,
+ skip: skip,
+ sentence_limit: sentence_limit,
+ &block
+ )
+
+ Wgit::Utils.printf_search_results(results, stream: stream)
+
+ results
+ end
+
+ # Deletes everything in the urls and documents collections by calling
+ # `Wgit::Database#clear_db` underneath. This will nuke the entire database
+ # so yeah... be careful.
+ #
+ # @return [Integer] The number of deleted records.
+ def clear_db!(connection_string: @dsl_conn_str)
+ db = Wgit::Database.new(connection_string)
+ db.clear_db
+ end
+
+ alias crawl_r crawl_site
+ alias index_r index_site
+ alias start_urls start
+ end
+ end
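
Taken together, the new DSL wraps the crawl/index/search workflow into top-level methods. A rough usage sketch, assuming the quotes.toscrape.com URL and its xpaths are example inputs rather than anything referenced by the gem:

require 'wgit'

include Wgit::DSL

start  'http://quotes.toscrape.com' # example site, not part of the gem
follow "//li[@class='next']/a/@href"

# Same as Wgit::Document.define_extractor, via the DSL's extract wrapper.
extract :quotes, "//div[@class='quote']/span[@class='text']", singleton: false

crawl_site do |doc|
  puts doc.quotes.first unless doc.empty?
end

With connection_string set, index_site and search follow the same pattern against the database instead of just printing crawled content.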
data/lib/wgit/indexer.rb CHANGED
@@ -4,125 +4,8 @@ require_relative 'crawler'
  require_relative 'database/database'

  module Wgit
- # Convience method to index the World Wide Web using
- # Wgit::Indexer#index_www.
- #
- # Retrieves uncrawled url's from the database and recursively crawls each
- # site storing their internal pages into the database and adding their
- # external url's to be crawled later on. Logs info on the crawl
- # using Wgit.logger as it goes along.
- #
- # @param connection_string [String] The database connection string. Set as
- # nil to use ENV['WGIT_CONNECTION_STRING'].
- # @param max_sites [Integer] The number of separate and whole
- # websites to be crawled before the method exits. Defaults to -1 which
- # means the crawl will occur until manually stopped (Ctrl+C etc).
- # @param max_data [Integer] The maximum amount of bytes that will be
- # scraped from the web (default is 1GB). Note, that this value is used to
- # determine when to stop crawling; it's not a guarantee of the max data
- # that will be obtained.
- def self.index_www(
- connection_string: nil, max_sites: -1, max_data: 1_048_576_000
- )
- db = Wgit::Database.new(connection_string)
- indexer = Wgit::Indexer.new(db)
- indexer.index_www(max_sites: max_sites, max_data: max_data)
- end
-
- # Convience method to index a single website using
- # Wgit::Indexer#index_site.
- #
- # Crawls a single website's pages and stores them into the database.
- # There is no max download limit so be careful which sites you index.
- #
- # @param url [Wgit::Url, String] The base Url of the website to crawl.
- # @param connection_string [String] The database connection string. Set as
- # nil to use ENV['WGIT_CONNECTION_STRING'].
- # @param insert_externals [Boolean] Whether or not to insert the website's
- # external Url's into the database.
- # @yield [doc] Given the Wgit::Document of each crawled webpage, before it's
- # inserted into the database allowing for prior manipulation.
- # @return [Integer] The total number of pages crawled within the website.
- def self.index_site(
- url, connection_string: nil, insert_externals: true,
- allow_paths: nil, disallow_paths: nil, &block
- )
- url = Wgit::Url.parse(url)
- db = Wgit::Database.new(connection_string)
- indexer = Wgit::Indexer.new(db)
- indexer.index_site(
- url, insert_externals: insert_externals,
- allow_paths: allow_paths, disallow_paths: disallow_paths, &block
- )
- end
-
- # Convience method to index a single webpage using
- # Wgit::Indexer#index_page.
- #
- # Crawls a single webpage and stores it into the database.
- # There is no max download limit so be careful of large pages.
- #
- # @param url [Wgit::Url, String] The Url of the webpage to crawl.
- # @param connection_string [String] The database connection string. Set as
- # nil to use ENV['WGIT_CONNECTION_STRING'].
- # @param insert_externals [Boolean] Whether or not to insert the website's
- # external Url's into the database.
- # @yield [doc] Given the Wgit::Document of the crawled webpage, before it's
- # inserted into the database allowing for prior manipulation.
- def self.index_page(
- url, connection_string: nil, insert_externals: true, &block
- )
- url = Wgit::Url.parse(url)
- db = Wgit::Database.new(connection_string)
- indexer = Wgit::Indexer.new(db)
- indexer.index_page(url, insert_externals: insert_externals, &block)
- end
-
- # Performs a search of the database's indexed documents and pretty prints
- # the results. See Wgit::Database#search and Wgit::Document#search for
- # details of how the search works.
- #
- # @param query [String] The text query to search with.
- # @param connection_string [String] The database connection string. Set as
- # nil to use ENV['WGIT_CONNECTION_STRING'].
- # @param case_sensitive [Boolean] Whether character case must match.
- # @param whole_sentence [Boolean] Whether multiple words should be searched
- # for separately.
- # @param limit [Integer] The max number of results to print.
- # @param skip [Integer] The number of DB records to skip.
- # @param sentence_limit [Integer] The max length of each result's text
- # snippet.
- # @yield [doc] Given each search result (Wgit::Document) returned from the
- # database.
- def self.indexed_search(
- query, connection_string: nil,
- case_sensitive: false, whole_sentence: false,
- limit: 10, skip: 0, sentence_limit: 80, &block
- )
- db = Wgit::Database.new(connection_string)
-
- results = db.search(
- query,
- case_sensitive: case_sensitive,
- whole_sentence: whole_sentence,
- limit: limit,
- skip: skip,
- &block
- )
-
- results.each do |doc|
- doc.search!(
- query,
- case_sensitive: case_sensitive,
- whole_sentence: whole_sentence,
- sentence_limit: sentence_limit
- )
- end
-
- Wgit::Utils.printf_search_results(results)
- end
-
- # Class which sets up a crawler and saves the indexed docs to a database.
+ # Class which crawls and saves the Documents to a database. Can be thought of
+ # as a combination of Wgit::Crawler and Wgit::Database.
  class Indexer
  # The crawler used to index the WWW.
  attr_reader :crawler
@@ -133,10 +16,11 @@ module Wgit
  # Initialize the Indexer.
  #
  # @param database [Wgit::Database] The database instance (already
- # initialized with the correct connection string etc).
- def initialize(database)
- @crawler = Wgit::Crawler.new
+ # initialized and connected) used to index.
+ # @param crawler [Wgit::Crawler] The crawler instance used to index.
+ def initialize(database = Wgit::Database.new, crawler = Wgit::Crawler.new)
  @db = database
+ @crawler = crawler
  end

  # Retrieves uncrawled url's from the database and recursively crawls each
@@ -184,7 +68,8 @@ database capacity, exiting.")

  site_docs_count = 0
  ext_links = @crawler.crawl_site(url) do |doc|
- if !doc.empty? && write_doc_to_db(doc)
+ unless doc.empty?
+ write_doc_to_db(doc)
  docs_count += 1
  site_docs_count += 1
  end
@@ -193,12 +78,9 @@ database capacity, exiting.")
  raise 'Error updating url' unless @db.update(url) == 1

  urls_count += write_urls_to_db(ext_links)
-
- Wgit.logger.info("Crawled and saved #{site_docs_count} docs for the \
- site: #{url}")
  end

- Wgit.logger.info("Crawled and saved docs for #{docs_count} url(s) \
+ Wgit.logger.info("Crawled and indexed docs for #{docs_count} url(s) \
  overall for this iteration.")
  Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
  the next iteration.")
@@ -214,62 +96,91 @@ the next iteration.")
  # @param url [Wgit::Url] The base Url of the website to crawl.
  # @param insert_externals [Boolean] Whether or not to insert the website's
  # external Url's into the database.
+ # @param follow [String] The xpath extracting links to be followed during
+ # the crawl. This changes how a site is crawled. Only links pointing to
+ # the site domain are allowed. The `:default` is any `<a>` href returning
+ # HTML.
+ # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+ # selecting them if their path `File.fnmatch?` one of allow_paths.
+ # @param disallow_paths [String, Array<String>] Filters the `follow` links
+ # by rejecting them if their path `File.fnmatch?` one of disallow_paths.
  # @yield [doc] Given the Wgit::Document of each crawled web page before
  # it's inserted into the database allowing for prior manipulation. Return
  # nil or false from the block to prevent the document from being saved
  # into the database.
  # @return [Integer] The total number of webpages/documents indexed.
  def index_site(
- url, insert_externals: true, allow_paths: nil, disallow_paths: nil
+ url, insert_externals: false, follow: :default,
+ allow_paths: nil, disallow_paths: nil
  )
- crawl_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+ crawl_opts = {
+ follow: follow,
+ allow_paths: allow_paths,
+ disallow_paths: disallow_paths
+ }
  total_pages_indexed = 0

- ext_urls = @crawler.crawl_site(url, crawl_opts) do |doc|
- result = true
- result = yield(doc) if block_given?
+ ext_urls = @crawler.crawl_site(url, **crawl_opts) do |doc|
+ result = block_given? ? yield(doc) : true

- if result && !doc.empty? && write_doc_to_db(doc)
+ if result && !doc.empty?
+ write_doc_to_db(doc)
  total_pages_indexed += 1
- Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
  end
  end

- @db.url?(url) ? @db.update(url) : @db.insert(url)
+ @db.upsert(url)

  if insert_externals && ext_urls
  num_inserted_urls = write_urls_to_db(ext_urls)
  Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
  end

- Wgit.logger.info("Crawled and saved #{total_pages_indexed} docs for the \
- site: #{url}")
+ Wgit.logger.info("Crawled and indexed #{total_pages_indexed} docs for \
+ the site: #{url}")

  total_pages_indexed
  end

+ # Crawls one or more webpages and stores them into the database.
+ # There is no max download limit so be careful of large pages.
+ # Logs info on the crawl using Wgit.logger as it goes along.
+ #
+ # @param urls [*Wgit::Url] The webpage Url's to crawl.
+ # @param insert_externals [Boolean] Whether or not to insert the webpages
+ # external Url's into the database.
+ # @yield [doc] Given the Wgit::Document of the crawled webpage,
+ # before it's inserted into the database allowing for prior
+ # manipulation. Return nil or false from the block to prevent the
+ # document from being saved into the database.
+ # @raise [StandardError] if no urls are provided.
+ def index_urls(*urls, insert_externals: false, &block)
+ raise 'You must provide at least one Url' if urls.empty?
+
+ opts = { insert_externals: insert_externals }
+ Wgit::Utils.each(urls) { |url| index_url(url, **opts, &block) }
+
+ nil
+ end
+
  # Crawls a single webpage and stores it into the database.
  # There is no max download limit so be careful of large pages.
  # Logs info on the crawl using Wgit.logger as it goes along.
  #
  # @param url [Wgit::Url] The webpage Url to crawl.
- # @param insert_externals [Boolean] Whether or not to insert the webpage's
+ # @param insert_externals [Boolean] Whether or not to insert the webpages
  # external Url's into the database.
  # @yield [doc] Given the Wgit::Document of the crawled webpage,
  # before it's inserted into the database allowing for prior
  # manipulation. Return nil or false from the block to prevent the
  # document from being saved into the database.
- def index_page(url, insert_externals: true)
+ def index_url(url, insert_externals: false)
  document = @crawler.crawl_url(url) do |doc|
- result = true
- result = yield(doc) if block_given?
-
- if result && !doc.empty? && write_doc_to_db(doc)
- Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
- end
+ result = block_given? ? yield(doc) : true
+ write_doc_to_db(doc) if result && !doc.empty?
  end

- @db.url?(url) ? @db.update(url) : @db.insert(url)
+ @db.upsert(url)

  ext_urls = document&.external_links
  if insert_externals && ext_urls
@@ -302,23 +213,19 @@ site: #{url}")
  # collection deliberately prevents duplicate inserts.
  #
  # @param doc [Wgit::Document] The document to write to the DB.
- # @return [Boolean] True if the write was successful, false otherwise.
  def write_doc_to_db(doc)
- @db.insert(doc)
- Wgit.logger.info("Saved document for url: #{doc.url}")
-
- true
- rescue Mongo::Error::OperationFailure
- Wgit.logger.info("Document already exists: #{doc.url}")
-
- false
+ if @db.upsert(doc)
+ Wgit.logger.info("Saved document for url: #{doc.url}")
+ else
+ Wgit.logger.info("Updated document for url: #{doc.url}")
+ end
  end

  # Write the urls to the DB. Note that the unique url index on the urls
  # collection deliberately prevents duplicate inserts.
  #
  # @param urls [Array<Wgit::Url>] The urls to write to the DB.
- # @return [Boolean] True if the write was successful, false otherwise.
+ # @return [Integer] The number of inserted urls.
  def write_urls_to_db(urls)
  count = 0

@@ -332,6 +239,7 @@ site: #{url}")

  @db.insert(url)
  count += 1
+
  Wgit.logger.info("Inserted external url: #{url}")
  rescue Mongo::Error::OperationFailure
  Wgit.logger.info("External url already exists: #{url}")
@@ -339,5 +247,9 @@ site: #{url}")

  count
  end
+
+ alias database db
+ alias index index_urls
+ alias index_r index_site
  end
  end
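
For reference, a sketch of the reworked Indexer in use. It assumes a reachable MongoDB instance whose connection string is supplied via ENV['WGIT_CONNECTION_STRING'], and the URLs below are placeholders:

require 'wgit'

# Both collaborators can now be injected (or defaulted) per the new signature.
db      = Wgit::Database.new # nil connection string falls back to the ENV var
crawler = Wgit::Crawler.new
indexer = Wgit::Indexer.new(db, crawler)

# index_url/index_urls replace index_page; externals aren't inserted by default.
indexer.index_urls(
  Wgit::Url.new('https://example.com'),
  Wgit::Url.new('https://example.org')
) do |doc|
  !doc.text.empty? # return nil/false to skip saving a document
end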