wgit 0.5.1 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +7 -0
- data/CHANGELOG.md +249 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/CONTRIBUTING.md +21 -0
- data/LICENSE.txt +21 -0
- data/README.md +232 -0
- data/bin/wgit +39 -0
- data/lib/wgit.rb +3 -1
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +304 -148
- data/lib/wgit/database/database.rb +310 -135
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +241 -169
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +20 -10
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +68 -156
- data/lib/wgit/response.rb +17 -14
- data/lib/wgit/url.rb +213 -73
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +3 -2
- metadata +38 -19
@@ -1,27 +1,35 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
### Default Document
|
3
|
+
### Default Document Extractors ###
|
4
4
|
|
5
5
|
# Base.
|
6
|
-
Wgit::Document.
|
6
|
+
Wgit::Document.define_extractor(
|
7
7
|
:base,
|
8
8
|
'//base/@href',
|
9
9
|
singleton: true,
|
10
10
|
text_content_only: true
|
11
11
|
) do |base|
|
12
|
-
Wgit::Url.
|
12
|
+
Wgit::Url.parse?(base) if base
|
13
13
|
end
|
14
14
|
|
15
15
|
# Title.
|
16
|
-
Wgit::Document.
|
16
|
+
Wgit::Document.define_extractor(
|
17
17
|
:title,
|
18
18
|
'//title',
|
19
19
|
singleton: true,
|
20
20
|
text_content_only: true
|
21
21
|
)
|
22
22
|
|
23
|
+
# Description.
|
24
|
+
Wgit::Document.define_extractor(
|
25
|
+
:description,
|
26
|
+
'//meta[@name="description"]/@content',
|
27
|
+
singleton: true,
|
28
|
+
text_content_only: true
|
29
|
+
)
|
30
|
+
|
23
31
|
# Author.
|
24
|
-
Wgit::Document.
|
32
|
+
Wgit::Document.define_extractor(
|
25
33
|
:author,
|
26
34
|
'//meta[@name="author"]/@content',
|
27
35
|
singleton: true,
|
@@ -29,7 +37,7 @@ Wgit::Document.define_extension(
|
|
29
37
|
)
|
30
38
|
|
31
39
|
# Keywords.
|
32
|
-
Wgit::Document.
|
40
|
+
Wgit::Document.define_extractor(
|
33
41
|
:keywords,
|
34
42
|
'//meta[@name="keywords"]/@content',
|
35
43
|
singleton: true,
|
@@ -37,23 +45,25 @@ Wgit::Document.define_extension(
|
|
37
45
|
) do |keywords, _source, type|
|
38
46
|
if keywords && (type == :document)
|
39
47
|
keywords = keywords.split(',')
|
40
|
-
Wgit::Utils.
|
48
|
+
Wgit::Utils.sanitize(keywords)
|
41
49
|
end
|
42
50
|
keywords
|
43
51
|
end
|
44
52
|
|
45
53
|
# Links.
|
46
|
-
Wgit::Document.
|
54
|
+
Wgit::Document.define_extractor(
|
47
55
|
:links,
|
48
56
|
'//a/@href',
|
49
57
|
singleton: false,
|
50
58
|
text_content_only: true
|
51
59
|
) do |links|
|
52
|
-
links
|
60
|
+
links
|
61
|
+
.map { |link| Wgit::Url.parse?(link) }
|
62
|
+
.compact # Remove unparsable links.
|
53
63
|
end
|
54
64
|
|
55
65
|
# Text.
|
56
|
-
Wgit::Document.
|
66
|
+
Wgit::Document.define_extractor(
|
57
67
|
:text,
|
58
68
|
proc { Wgit::Document.text_elements_xpath },
|
59
69
|
singleton: false,
|
data/lib/wgit/dsl.rb
ADDED
@@ -0,0 +1,324 @@
|
|
1
|
+
module Wgit
  # DSL methods that act as a wrapper around Wgit's underlying class methods.
  # All instance vars/constants are prefixed to avoid conflicts when included.
  module DSL
    # Error message shown when there's no URL to crawl.
    DSL_ERROR__NO_START_URL = "missing url, pass as parameter to this or \
the 'start' function".freeze

    ### CRAWLER METHODS ###

    # Defines an extractor using `Wgit::Document.define_extractor` underneath.
    #
    # @param var [Symbol] The name of the variable to be initialised, that will
    #   contain the extracted content.
    # @param xpath [String, #call] The xpath used to find the element(s)
    #   of the webpage. Only used when initializing from HTML.
    #
    #   Pass a callable object (proc etc.) if you want the
    #   xpath value to be derived on Document initialisation (instead of when
    #   the extractor is defined). The call method must return a valid xpath
    #   String.
    # @param opts [Hash] The options to define an extractor with. The
    #   options are only used when initializing from HTML, not the database.
    # @option opts [Boolean] :singleton The singleton option determines
    #   whether or not the result(s) should be in an Array. If multiple
    #   results are found and singleton is true then the first result will be
    #   used. Defaults to true.
    # @option opts [Boolean] :text_content_only The text_content_only option
    #   if true will use the text content of the Nokogiri result object,
    #   otherwise the Nokogiri object itself is returned. Defaults to true.
    # @yield The block is executed when a Wgit::Document is initialized,
    #   regardless of the source. Use it (optionally) to process the result
    #   value.
    # @yieldparam value [Object] The result value to be assigned to the new
    #   `var`.
    # @yieldparam source [Wgit::Document, Object] The source of the `value`.
    # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
    #   `:object`.
    # @yieldreturn [Object] The return value of the block becomes the new var's
    #   value. Return the block's value param unchanged if you want to inspect.
    # @raise [StandardError] If the var param isn't valid.
    # @return [Symbol] The given var Symbol if successful.
    def extract(var, xpath, opts = {}, &block)
      Wgit::Document.define_extractor(var, xpath, opts, &block)
    end

    # Initializes a `Wgit::Crawler`. This crawler is then used in all crawl and
    # index methods used by the DSL. See the Wgit::Crawler documentation for
    # more details.
    #
    # @yield [crawler] The created crawler; use the block to configure.
    # @return [Wgit::Crawler] The created crawler used by the DSL.
    def crawler
      # Memoised so every DSL call shares (and can configure) one crawler.
      @dsl_crawler ||= Wgit::Crawler.new
      yield @dsl_crawler if block_given?
      @dsl_crawler
    end

    # Sets the URL(s) to be crawled when a `crawl*` or `index*` method is
    # subsequently called. Calling this is optional as the URL(s) can be
    # passed to the method instead. You can also omit the urls and just
    # use the block to configure the crawler instead.
    #
    # @param urls [*String, *Wgit::Url] The URL(s) to crawl
    #   or none (if only using the block to configure the crawler).
    # @yield [crawler] The crawler that'll be used in the subsequent
    #   crawl/index; use the block to configure.
    # @return [Array<String, Wgit::Url>] The stored start URL(s).
    def start(*urls, &block)
      crawler(&block)
      @dsl_start = urls
    end

    # Sets the xpath to be followed when `crawl_site` or `index_site` is
    # subsequently called. Calling this method is optional as the default is to
    # follow all `<a>` href's that point to the site domain. You can also pass
    # `follow:` to the crawl/index methods directly.
    #
    # @param xpath [String] The xpath which is followed when crawling/indexing
    #   a site. Use `:default` to restore the default follow logic.
    def follow(xpath)
      @dsl_follow = xpath
    end

    # Crawls one or more individual urls using `Wgit::Crawler#crawl_urls`
    # underneath. If no urls are provided, then the `start` URL(s) are used.
    #
    # @param urls [*Wgit::Url] The URL's to crawl. Defaults to the `start`
    #   URL(s).
    # @param follow_redirects [Boolean, Symbol] Whether or not to follow
    #   redirects. Pass a Symbol to limit where the redirect is allowed to go
    #   e.g. :host only allows redirects within the same host. Choose from
    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
    #   This value will be used for all urls crawled.
    # @yield [doc] Given each crawled page (Wgit::Document); this is the only
    #   way to interact with them.
    # @raise [StandardError] If no urls are provided and no `start` URL has
    #   been set.
    # @return [Wgit::Document] The last Document crawled.
    def crawl(*urls, follow_redirects: true, &block)
      urls = (@dsl_start || []) if urls.empty?
      raise DSL_ERROR__NO_START_URL if urls.empty?

      # Non-mutating map: `urls` may be the stored @dsl_start Array itself.
      parsed_urls = urls.map { |url| Wgit::Url.parse(url) }
      crawler.crawl_urls(*parsed_urls, follow_redirects: follow_redirects, &block)
    end

    # Crawls one or more entire sites using `Wgit::Crawler#crawl_site`
    # underneath. If no url is provided, then the `start` URL(s) are used.
    #
    # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to be
    #   crawled. It is recommended that this URL be the index page of the site
    #   to give a greater chance of finding all pages within that site/host.
    #   Defaults to the `start` URLs.
    # @param follow [String] The xpath extracting links to be followed during
    #   the crawl. This changes how a site is crawled. Only links pointing to
    #   the site domain are allowed. The `:default` is any `<a>` href returning
    #   HTML. This can also be set using `follow`.
    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
    #   selecting them if their path `File.fnmatch?` one of allow_paths.
    # @param disallow_paths [String, Array<String>] Filters the `follow` links
    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
    # @yield [doc] Given each crawled page (Wgit::Document) of the site.
    #   A block is the only way to interact with each crawled Document.
    #   Use `doc.empty?` to determine if the page is valid.
    # @raise [StandardError] If no url is provided and no `start` URL has been
    #   set.
    # @return [Array<Wgit::Url>] The external urls collected from all of the
    #   crawled sites' pages. A site which yields a nil result from the crawler
    #   contributes nothing, so the Array is empty if no site produced any.
    def crawl_site(
      *urls, follow: @dsl_follow,
      allow_paths: nil, disallow_paths: nil, &block
    )
      urls = (@dsl_start || []) if urls.empty?
      raise DSL_ERROR__NO_START_URL if urls.empty?

      xpath = follow || :default
      opts = {
        follow: xpath, allow_paths: allow_paths, disallow_paths: disallow_paths
      }

      urls.reduce([]) do |externals, url|
        site_externals = crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)

        # Array() maps a nil crawl result to [], avoiding `Array + nil`.
        externals + Array(site_externals)
      end
    end

    # Returns the DSL's `crawler#last_response`.
    #
    # @return [Wgit::Response] The response from the last URL crawled.
    def last_response
      crawler.last_response
    end

    # Nilifies the DSL instance variables.
    def reset
      @dsl_crawler  = nil
      @dsl_start    = nil
      @dsl_follow   = nil
      @dsl_conn_str = nil
    end

    ### INDEXER METHODS ###

    # Defines the connection string to the database used in subsequent `index*`
    # method calls. This method is optional as the connection string can be
    # passed to the index method instead.
    #
    # @param conn_str [String] The connection string used to connect to the
    #   database in subsequent `index*` method calls.
    def connection_string(conn_str)
      @dsl_conn_str = conn_str
    end

    # Indexes the World Wide Web using `Wgit::Indexer#index_www` underneath.
    #
    # @param connection_string [String] The database connection string. Set as
    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
    #   `connection_string`.
    # @param max_sites [Integer] The number of separate and whole
    #   websites to be crawled before the method exits. Defaults to -1 which
    #   means the crawl will occur until manually stopped (Ctrl+C etc).
    # @param max_data [Integer] The maximum amount of bytes that will be
    #   scraped from the web (default is 1GB). Note, that this value is used to
    #   determine when to stop crawling; it's not a guarantee of the max data
    #   that will be obtained.
    def index_www(
      connection_string: @dsl_conn_str, max_sites: -1, max_data: 1_048_576_000
    )
      db      = Wgit::Database.new(connection_string)
      indexer = Wgit::Indexer.new(db, crawler)

      indexer.index_www(max_sites: max_sites, max_data: max_data)
    end

    # Indexes one or more websites using `Wgit::Indexer#index_site` underneath.
    #
    # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to
    #   crawl. Can be set using `start`.
    # @param connection_string [String] The database connection string. Set as
    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
    #   `connection_string`.
    # @param insert_externals [Boolean] Whether or not to insert the website's
    #   external URL's into the database.
    # @param follow [String] The xpath extracting links to be followed during
    #   the crawl. This changes how a site is crawled. Only links pointing to
    #   the site domain are allowed. The `:default` is any `<a>` href returning
    #   HTML. This can also be set using `follow`.
    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
    #   selecting them if their path `File.fnmatch?` one of allow_paths.
    # @param disallow_paths [String, Array<String>] Filters the `follow` links
    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
    # @yield [doc] Given the Wgit::Document of each crawled webpage, before it
    #   is inserted into the database allowing for prior manipulation.
    # @raise [StandardError] If no url is provided and no `start` URL has been
    #   set.
    # @return [Integer] The total number of pages crawled across the given
    #   website(s).
    def index_site(
      *urls, connection_string: @dsl_conn_str,
      insert_externals: false, follow: @dsl_follow,
      allow_paths: nil, disallow_paths: nil, &block
    )
      urls = (@dsl_start || []) if urls.empty?
      raise DSL_ERROR__NO_START_URL if urls.empty?

      db         = Wgit::Database.new(connection_string)
      indexer    = Wgit::Indexer.new(db, crawler)
      xpath      = follow || :default
      crawl_opts = {
        insert_externals: insert_externals, follow: xpath,
        allow_paths: allow_paths, disallow_paths: disallow_paths
      }

      urls.reduce(0) do |total, url|
        total + indexer.index_site(Wgit::Url.parse(url), **crawl_opts, &block)
      end
    end

    # Indexes one or more webpages using `Wgit::Indexer#index_urls` underneath.
    #
    # @param urls [*Wgit::Url] The webpage URL's to crawl. Defaults to the
    #   `start` URL(s).
    # @param connection_string [String] The database connection string. Set as
    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
    #   `connection_string`.
    # @param insert_externals [Boolean] Whether or not to insert the website's
    #   external URL's into the database.
    # @yield [doc] Given the Wgit::Document of the crawled webpage,
    #   before it's inserted into the database allowing for prior
    #   manipulation. Return nil or false from the block to prevent the
    #   document from being saved into the database.
    # @raise [StandardError] If no urls are provided and no `start` URL has
    #   been set.
    def index(
      *urls, connection_string: @dsl_conn_str,
      insert_externals: false, &block
    )
      urls = (@dsl_start || []) if urls.empty?
      raise DSL_ERROR__NO_START_URL if urls.empty?

      db      = Wgit::Database.new(connection_string)
      indexer = Wgit::Indexer.new(db, crawler)

      # Non-mutating map: `urls` may be the stored @dsl_start Array itself.
      parsed_urls = urls.map { |url| Wgit::Url.parse(url) }
      indexer.index_urls(*parsed_urls, insert_externals: insert_externals, &block)
    end

    # Performs a search of the database's indexed documents and pretty prints
    # the results in a search engine-esque format. See `Wgit::Database#search!`
    # and `Wgit::Document#search!` for details of how the search works.
    #
    # @param query [String] The text query to search with.
    # @param connection_string [String] The database connection string. Set as
    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
    #   `connection_string`.
    # @param stream [nil, #puts] Any object that respond_to?(:puts). It is used
    #   to output text somewhere e.g. a file or STDERR. Use nil for no output.
    # @param case_sensitive [Boolean] Whether character case must match.
    # @param whole_sentence [Boolean] Whether multiple words should be searched
    #   for separately.
    # @param limit [Integer] The max number of results to print.
    # @param skip [Integer] The number of DB records to skip.
    # @param sentence_limit [Integer] The max length of each result's text
    #   snippet.
    # @yield [doc] Given each search result (Wgit::Document) returned from the
    #   database containing only its matching `#text`.
    # @return [Array<Wgit::Document>] The search results with matching text.
    def search(
      query, connection_string: @dsl_conn_str, stream: STDOUT,
      case_sensitive: false, whole_sentence: true,
      limit: 10, skip: 0, sentence_limit: 80, &block
    )
      # No stream given: print into the null device. The handle is opened by
      # this method, so it's also closed here (see ensure) to avoid a leak.
      null_stream = File.open(File::NULL, 'w') unless stream
      db = Wgit::Database.new(connection_string)

      results = db.search!(
        query,
        case_sensitive: case_sensitive,
        whole_sentence: whole_sentence,
        limit: limit,
        skip: skip,
        sentence_limit: sentence_limit,
        &block
      )

      Wgit::Utils.printf_search_results(results, stream: stream || null_stream)

      results
    ensure
      # Only close what was opened here; a caller supplied stream is untouched.
      null_stream&.close
    end

    # Deletes everything in the urls and documents collections by calling
    # `Wgit::Database#clear_db` underneath. This will nuke the entire database
    # so yeah... be careful.
    #
    # @param connection_string [String] The database connection string. Set as
    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
    #   `connection_string`.
    # @return [Integer] The number of deleted records.
    def clear_db!(connection_string: @dsl_conn_str)
      db = Wgit::Database.new(connection_string)
      db.clear_db
    end

    alias crawl_r    crawl_site
    alias index_r    index_site
    alias start_urls start
  end
end
|
data/lib/wgit/indexer.rb
CHANGED
@@ -4,125 +4,8 @@ require_relative 'crawler'
|
|
4
4
|
require_relative 'database/database'
|
5
5
|
|
6
6
|
module Wgit
|
7
|
-
#
|
8
|
-
# Wgit::
|
9
|
-
#
|
10
|
-
# Retrieves uncrawled url's from the database and recursively crawls each
|
11
|
-
# site storing their internal pages into the database and adding their
|
12
|
-
# external url's to be crawled later on. Logs info on the crawl
|
13
|
-
# using Wgit.logger as it goes along.
|
14
|
-
#
|
15
|
-
# @param connection_string [String] The database connection string. Set as
|
16
|
-
# nil to use ENV['WGIT_CONNECTION_STRING'].
|
17
|
-
# @param max_sites [Integer] The number of separate and whole
|
18
|
-
# websites to be crawled before the method exits. Defaults to -1 which
|
19
|
-
# means the crawl will occur until manually stopped (Ctrl+C etc).
|
20
|
-
# @param max_data [Integer] The maximum amount of bytes that will be
|
21
|
-
# scraped from the web (default is 1GB). Note, that this value is used to
|
22
|
-
# determine when to stop crawling; it's not a guarantee of the max data
|
23
|
-
# that will be obtained.
|
24
|
-
def self.index_www(
|
25
|
-
connection_string: nil, max_sites: -1, max_data: 1_048_576_000
|
26
|
-
)
|
27
|
-
db = Wgit::Database.new(connection_string)
|
28
|
-
indexer = Wgit::Indexer.new(db)
|
29
|
-
indexer.index_www(max_sites: max_sites, max_data: max_data)
|
30
|
-
end
|
31
|
-
|
32
|
-
# Convience method to index a single website using
|
33
|
-
# Wgit::Indexer#index_site.
|
34
|
-
#
|
35
|
-
# Crawls a single website's pages and stores them into the database.
|
36
|
-
# There is no max download limit so be careful which sites you index.
|
37
|
-
#
|
38
|
-
# @param url [Wgit::Url, String] The base Url of the website to crawl.
|
39
|
-
# @param connection_string [String] The database connection string. Set as
|
40
|
-
# nil to use ENV['WGIT_CONNECTION_STRING'].
|
41
|
-
# @param insert_externals [Boolean] Whether or not to insert the website's
|
42
|
-
# external Url's into the database.
|
43
|
-
# @yield [doc] Given the Wgit::Document of each crawled webpage, before it's
|
44
|
-
# inserted into the database allowing for prior manipulation.
|
45
|
-
# @return [Integer] The total number of pages crawled within the website.
|
46
|
-
def self.index_site(
|
47
|
-
url, connection_string: nil, insert_externals: true,
|
48
|
-
allow_paths: nil, disallow_paths: nil, &block
|
49
|
-
)
|
50
|
-
url = Wgit::Url.parse(url)
|
51
|
-
db = Wgit::Database.new(connection_string)
|
52
|
-
indexer = Wgit::Indexer.new(db)
|
53
|
-
indexer.index_site(
|
54
|
-
url, insert_externals: insert_externals,
|
55
|
-
allow_paths: allow_paths, disallow_paths: disallow_paths, &block
|
56
|
-
)
|
57
|
-
end
|
58
|
-
|
59
|
-
# Convience method to index a single webpage using
|
60
|
-
# Wgit::Indexer#index_page.
|
61
|
-
#
|
62
|
-
# Crawls a single webpage and stores it into the database.
|
63
|
-
# There is no max download limit so be careful of large pages.
|
64
|
-
#
|
65
|
-
# @param url [Wgit::Url, String] The Url of the webpage to crawl.
|
66
|
-
# @param connection_string [String] The database connection string. Set as
|
67
|
-
# nil to use ENV['WGIT_CONNECTION_STRING'].
|
68
|
-
# @param insert_externals [Boolean] Whether or not to insert the website's
|
69
|
-
# external Url's into the database.
|
70
|
-
# @yield [doc] Given the Wgit::Document of the crawled webpage, before it's
|
71
|
-
# inserted into the database allowing for prior manipulation.
|
72
|
-
def self.index_page(
|
73
|
-
url, connection_string: nil, insert_externals: true, &block
|
74
|
-
)
|
75
|
-
url = Wgit::Url.parse(url)
|
76
|
-
db = Wgit::Database.new(connection_string)
|
77
|
-
indexer = Wgit::Indexer.new(db)
|
78
|
-
indexer.index_page(url, insert_externals: insert_externals, &block)
|
79
|
-
end
|
80
|
-
|
81
|
-
# Performs a search of the database's indexed documents and pretty prints
|
82
|
-
# the results. See Wgit::Database#search and Wgit::Document#search for
|
83
|
-
# details of how the search works.
|
84
|
-
#
|
85
|
-
# @param query [String] The text query to search with.
|
86
|
-
# @param connection_string [String] The database connection string. Set as
|
87
|
-
# nil to use ENV['WGIT_CONNECTION_STRING'].
|
88
|
-
# @param case_sensitive [Boolean] Whether character case must match.
|
89
|
-
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
90
|
-
# for separately.
|
91
|
-
# @param limit [Integer] The max number of results to print.
|
92
|
-
# @param skip [Integer] The number of DB records to skip.
|
93
|
-
# @param sentence_limit [Integer] The max length of each result's text
|
94
|
-
# snippet.
|
95
|
-
# @yield [doc] Given each search result (Wgit::Document) returned from the
|
96
|
-
# database.
|
97
|
-
def self.indexed_search(
|
98
|
-
query, connection_string: nil,
|
99
|
-
case_sensitive: false, whole_sentence: false,
|
100
|
-
limit: 10, skip: 0, sentence_limit: 80, &block
|
101
|
-
)
|
102
|
-
db = Wgit::Database.new(connection_string)
|
103
|
-
|
104
|
-
results = db.search(
|
105
|
-
query,
|
106
|
-
case_sensitive: case_sensitive,
|
107
|
-
whole_sentence: whole_sentence,
|
108
|
-
limit: limit,
|
109
|
-
skip: skip,
|
110
|
-
&block
|
111
|
-
)
|
112
|
-
|
113
|
-
results.each do |doc|
|
114
|
-
doc.search!(
|
115
|
-
query,
|
116
|
-
case_sensitive: case_sensitive,
|
117
|
-
whole_sentence: whole_sentence,
|
118
|
-
sentence_limit: sentence_limit
|
119
|
-
)
|
120
|
-
end
|
121
|
-
|
122
|
-
Wgit::Utils.printf_search_results(results)
|
123
|
-
end
|
124
|
-
|
125
|
-
# Class which sets up a crawler and saves the indexed docs to a database.
|
7
|
+
# Class which crawls and saves the Documents to a database. Can be thought of
|
8
|
+
# as a combination of Wgit::Crawler and Wgit::Database.
|
126
9
|
class Indexer
|
127
10
|
# The crawler used to index the WWW.
|
128
11
|
attr_reader :crawler
|
@@ -133,10 +16,11 @@ module Wgit
|
|
133
16
|
# Initialize the Indexer.
|
134
17
|
#
|
135
18
|
# @param database [Wgit::Database] The database instance (already
|
136
|
-
# initialized
|
137
|
-
|
138
|
-
|
19
|
+
# initialized and connected) used to index.
|
20
|
+
# @param crawler [Wgit::Crawler] The crawler instance used to index.
|
21
|
+
def initialize(database = Wgit::Database.new, crawler = Wgit::Crawler.new)
|
139
22
|
@db = database
|
23
|
+
@crawler = crawler
|
140
24
|
end
|
141
25
|
|
142
26
|
# Retrieves uncrawled url's from the database and recursively crawls each
|
@@ -184,7 +68,8 @@ database capacity, exiting.")
|
|
184
68
|
|
185
69
|
site_docs_count = 0
|
186
70
|
ext_links = @crawler.crawl_site(url) do |doc|
|
187
|
-
|
71
|
+
unless doc.empty?
|
72
|
+
write_doc_to_db(doc)
|
188
73
|
docs_count += 1
|
189
74
|
site_docs_count += 1
|
190
75
|
end
|
@@ -193,12 +78,9 @@ database capacity, exiting.")
|
|
193
78
|
raise 'Error updating url' unless @db.update(url) == 1
|
194
79
|
|
195
80
|
urls_count += write_urls_to_db(ext_links)
|
196
|
-
|
197
|
-
Wgit.logger.info("Crawled and saved #{site_docs_count} docs for the \
|
198
|
-
site: #{url}")
|
199
81
|
end
|
200
82
|
|
201
|
-
Wgit.logger.info("Crawled and
|
83
|
+
Wgit.logger.info("Crawled and indexed docs for #{docs_count} url(s) \
|
202
84
|
overall for this iteration.")
|
203
85
|
Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
|
204
86
|
the next iteration.")
|
@@ -214,62 +96,91 @@ the next iteration.")
|
|
214
96
|
# @param url [Wgit::Url] The base Url of the website to crawl.
|
215
97
|
# @param insert_externals [Boolean] Whether or not to insert the website's
|
216
98
|
# external Url's into the database.
|
99
|
+
# @param follow [String] The xpath extracting links to be followed during
|
100
|
+
# the crawl. This changes how a site is crawled. Only links pointing to
|
101
|
+
# the site domain are allowed. The `:default` is any `<a>` href returning
|
102
|
+
# HTML.
|
103
|
+
# @param allow_paths [String, Array<String>] Filters the `follow:` links by
|
104
|
+
# selecting them if their path `File.fnmatch?` one of allow_paths.
|
105
|
+
# @param disallow_paths [String, Array<String>] Filters the `follow` links
|
106
|
+
# by rejecting them if their path `File.fnmatch?` one of disallow_paths.
|
217
107
|
# @yield [doc] Given the Wgit::Document of each crawled web page before
|
218
108
|
# it's inserted into the database allowing for prior manipulation. Return
|
219
109
|
# nil or false from the block to prevent the document from being saved
|
220
110
|
# into the database.
|
221
111
|
# @return [Integer] The total number of webpages/documents indexed.
|
222
112
|
def index_site(
|
223
|
-
url, insert_externals:
|
113
|
+
url, insert_externals: false, follow: :default,
|
114
|
+
allow_paths: nil, disallow_paths: nil
|
224
115
|
)
|
225
|
-
crawl_opts = {
|
116
|
+
crawl_opts = {
|
117
|
+
follow: follow,
|
118
|
+
allow_paths: allow_paths,
|
119
|
+
disallow_paths: disallow_paths
|
120
|
+
}
|
226
121
|
total_pages_indexed = 0
|
227
122
|
|
228
|
-
ext_urls = @crawler.crawl_site(url, crawl_opts) do |doc|
|
229
|
-
result = true
|
230
|
-
result = yield(doc) if block_given?
|
123
|
+
ext_urls = @crawler.crawl_site(url, **crawl_opts) do |doc|
|
124
|
+
result = block_given? ? yield(doc) : true
|
231
125
|
|
232
|
-
if result && !doc.empty?
|
126
|
+
if result && !doc.empty?
|
127
|
+
write_doc_to_db(doc)
|
233
128
|
total_pages_indexed += 1
|
234
|
-
Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
|
235
129
|
end
|
236
130
|
end
|
237
131
|
|
238
|
-
@db.
|
132
|
+
@db.upsert(url)
|
239
133
|
|
240
134
|
if insert_externals && ext_urls
|
241
135
|
num_inserted_urls = write_urls_to_db(ext_urls)
|
242
136
|
Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
|
243
137
|
end
|
244
138
|
|
245
|
-
Wgit.logger.info("Crawled and
|
246
|
-
site: #{url}")
|
139
|
+
Wgit.logger.info("Crawled and indexed #{total_pages_indexed} docs for \
|
140
|
+
the site: #{url}")
|
247
141
|
|
248
142
|
total_pages_indexed
|
249
143
|
end
|
250
144
|
|
145
|
+
# Indexes every given webpage by delegating each Url to #index_url.
# There is no max download limit so be careful of large pages.
# Logs info on the crawl using Wgit.logger as it goes along.
#
# @param urls [*Wgit::Url] The webpage Url's to crawl.
# @param insert_externals [Boolean] Whether or not to insert the webpages
#   external Url's into the database.
# @yield [doc] Given the Wgit::Document of the crawled webpage,
#   before it's inserted into the database allowing for prior
#   manipulation. Return nil or false from the block to prevent the
#   document from being saved into the database.
# @raise [StandardError] if no urls are provided.
# @return [nil] Always nil.
def index_urls(*urls, insert_externals: false, &block)
  raise 'You must provide at least one Url' if urls.empty?

  Wgit::Utils.each(urls) do |page_url|
    index_url(page_url, insert_externals: insert_externals, &block)
  end

  nil
end
|
165
|
+
|
251
166
|
# Crawls a single webpage and stores it into the database.
|
252
167
|
# There is no max download limit so be careful of large pages.
|
253
168
|
# Logs info on the crawl using Wgit.logger as it goes along.
|
254
169
|
#
|
255
170
|
# @param url [Wgit::Url] The webpage Url to crawl.
|
256
|
-
# @param insert_externals [Boolean] Whether or not to insert the
|
171
|
+
# @param insert_externals [Boolean] Whether or not to insert the webpages
|
257
172
|
# external Url's into the database.
|
258
173
|
# @yield [doc] Given the Wgit::Document of the crawled webpage,
|
259
174
|
# before it's inserted into the database allowing for prior
|
260
175
|
# manipulation. Return nil or false from the block to prevent the
|
261
176
|
# document from being saved into the database.
|
262
|
-
def
|
177
|
+
def index_url(url, insert_externals: false)
|
263
178
|
document = @crawler.crawl_url(url) do |doc|
|
264
|
-
result = true
|
265
|
-
|
266
|
-
|
267
|
-
if result && !doc.empty? && write_doc_to_db(doc)
|
268
|
-
Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
|
269
|
-
end
|
179
|
+
result = block_given? ? yield(doc) : true
|
180
|
+
write_doc_to_db(doc) if result && !doc.empty?
|
270
181
|
end
|
271
182
|
|
272
|
-
@db.
|
183
|
+
@db.upsert(url)
|
273
184
|
|
274
185
|
ext_urls = document&.external_links
|
275
186
|
if insert_externals && ext_urls
|
@@ -302,23 +213,19 @@ site: #{url}")
|
|
302
213
|
# collection deliberately prevents duplicate inserts.
|
303
214
|
#
|
304
215
|
# @param doc [Wgit::Document] The document to write to the DB.
|
305
|
-
# @return [Boolean] True if the write was successful, false otherwise.
|
306
216
|
def write_doc_to_db(doc)
|
307
|
-
@db.
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
Wgit.logger.info("Document already exists: #{doc.url}")
|
313
|
-
|
314
|
-
false
|
217
|
+
if @db.upsert(doc)
|
218
|
+
Wgit.logger.info("Saved document for url: #{doc.url}")
|
219
|
+
else
|
220
|
+
Wgit.logger.info("Updated document for url: #{doc.url}")
|
221
|
+
end
|
315
222
|
end
|
316
223
|
|
317
224
|
# Write the urls to the DB. Note that the unique url index on the urls
|
318
225
|
# collection deliberately prevents duplicate inserts.
|
319
226
|
#
|
320
227
|
# @param urls [Array<Wgit::Url>] The urls to write to the DB.
|
321
|
-
# @return [
|
228
|
+
# @return [Integer] The number of inserted urls.
|
322
229
|
def write_urls_to_db(urls)
|
323
230
|
count = 0
|
324
231
|
|
@@ -332,6 +239,7 @@ site: #{url}")
|
|
332
239
|
|
333
240
|
@db.insert(url)
|
334
241
|
count += 1
|
242
|
+
|
335
243
|
Wgit.logger.info("Inserted external url: #{url}")
|
336
244
|
rescue Mongo::Error::OperationFailure
|
337
245
|
Wgit.logger.info("External url already exists: #{url}")
|
@@ -339,5 +247,9 @@ site: #{url}")
|
|
339
247
|
|
340
248
|
count
|
341
249
|
end
|
250
|
+
|
251
|
+
alias database db
|
252
|
+
alias index index_urls
|
253
|
+
alias index_r index_site
|
342
254
|
end
|
343
255
|
end
|