wgit 0.5.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +7 -0
- data/CHANGELOG.md +240 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/CONTRIBUTING.md +21 -0
- data/LICENSE.txt +21 -0
- data/README.md +239 -0
- data/bin/wgit +39 -0
- data/lib/wgit.rb +3 -1
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +304 -148
- data/lib/wgit/database/database.rb +310 -135
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +234 -169
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +20 -10
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +68 -156
- data/lib/wgit/response.rb +17 -17
- data/lib/wgit/url.rb +170 -42
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +8 -2
- metadata +54 -32
data/lib/wgit/{document_extensions.rb → document_extractors.rb}
CHANGED
@@ -1,27 +1,35 @@
 # frozen_string_literal: true

-### Default Document Extensions ###
+### Default Document Extractors ###

 # Base.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :base,
   '//base/@href',
   singleton: true,
   text_content_only: true
 ) do |base|
-  Wgit::Url.
+  Wgit::Url.parse?(base) if base
 end

 # Title.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :title,
   '//title',
   singleton: true,
   text_content_only: true
 )

+# Description.
+Wgit::Document.define_extractor(
+  :description,
+  '//meta[@name="description"]/@content',
+  singleton: true,
+  text_content_only: true
+)
+
 # Author.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :author,
   '//meta[@name="author"]/@content',
   singleton: true,
@@ -29,7 +37,7 @@ Wgit::Document.define_extension(
 )

 # Keywords.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :keywords,
   '//meta[@name="keywords"]/@content',
   singleton: true,
@@ -37,23 +45,25 @@ Wgit::Document.define_extension(
 ) do |keywords, _source, type|
   if keywords && (type == :document)
     keywords = keywords.split(',')
-    Wgit::Utils.
+    Wgit::Utils.sanitize(keywords)
   end
   keywords
 end

 # Links.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :links,
   '//a/@href',
   singleton: false,
   text_content_only: true
 ) do |links|
-  links
+  links
+    .map { |link| Wgit::Url.parse?(link) }
+    .compact # Remove unparsable links.
 end

 # Text.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :text,
   proc { Wgit::Document.text_elements_xpath },
   singleton: false,
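Editor's note: the extractor API above is the renamed `Wgit::Document.define_extension` → `define_extractor`. A minimal sketch of defining a custom extractor against the 0.9.0 signature shown in this diff; the `:price` field, its XPath and the crawled URL are hypothetical, and the generated `doc.price` accessor is an assumption based on how the default extractors behave.

require 'wgit'

# Hypothetical custom extractor: pull a product price from each crawled page.
# Same shape as the default extractors above: var, xpath, options, block.
Wgit::Document.define_extractor(
  :price,
  '//span[@class="price"]',  # hypothetical XPath for this example
  singleton: true,           # first match only, not an Array
  text_content_only: true    # return the node's text, not the Nokogiri node
) do |price|
  price&.strip               # optional post-processing of the extracted value
end

doc = Wgit::Crawler.new.crawl_url(Wgit::Url.new('https://example.com'))
puts doc.price if doc        # assumed accessor created by the extractor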
data/lib/wgit/dsl.rb
ADDED
@@ -0,0 +1,324 @@
+module Wgit
+  # DSL methods that act as a wrapper around Wgit's underlying class methods.
+  # All instance vars/constants are prefixed to avoid conflicts when included.
+  module DSL
+    # Error message shown when there's no URL to crawl.
+    DSL_ERROR__NO_START_URL = "missing url, pass as parameter to this or \
+the 'start' function".freeze
+
+    ### CRAWLER METHODS ###
+
+    # Defines an extractor using `Wgit::Document.define_extractor` underneath.
+    #
+    # @param var [Symbol] The name of the variable to be initialised, that will
+    # contain the extracted content.
+    # @param xpath [String, #call] The xpath used to find the element(s)
+    # of the webpage. Only used when initializing from HTML.
+    #
+    # Pass a callable object (proc etc.) if you want the
+    # xpath value to be derived on Document initialisation (instead of when
+    # the extractor is defined). The call method must return a valid xpath
+    # String.
+    # @param opts [Hash] The options to define an extractor with. The
+    # options are only used when intializing from HTML, not the database.
+    # @option opts [Boolean] :singleton The singleton option determines
+    # whether or not the result(s) should be in an Array. If multiple
+    # results are found and singleton is true then the first result will be
+    # used. Defaults to true.
+    # @option opts [Boolean] :text_content_only The text_content_only option
+    # if true will use the text content of the Nokogiri result object,
+    # otherwise the Nokogiri object itself is returned. Defaults to true.
+    # @yield The block is executed when a Wgit::Document is initialized,
+    # regardless of the source. Use it (optionally) to process the result
+    # value.
+    # @yieldparam value [Object] The result value to be assigned to the new
+    # `var`.
+    # @yieldparam source [Wgit::Document, Object] The source of the `value`.
+    # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
+    # `:object`.
+    # @yieldreturn [Object] The return value of the block becomes the new var's
+    # value. Return the block's value param unchanged if you want to inspect.
+    # @raise [StandardError] If the var param isn't valid.
+    # @return [Symbol] The given var Symbol if successful.
+    def extract(var, xpath, opts = {}, &block)
+      Wgit::Document.define_extractor(var, xpath, opts, &block)
+    end
+
+    # Initializes a `Wgit::Crawler`. This crawler is then used in all crawl and
+    # index methods used by the DSL. See the Wgit::Crawler documentation for
+    # more details.
+    #
+    # @yield [crawler] The created crawler; use the block to configure.
+    # @return [Wgit::Crawler] The created crawler used by the DSL.
+    def crawler
+      @dsl_crawler ||= Wgit::Crawler.new
+      yield @dsl_crawler if block_given?
+      @dsl_crawler
+    end
+
+    # Sets the URL to be crawled when a `crawl*` or `index*` method is
+    # subsequently called. Calling this is optional as the URL can be
+    # passed to the method instead. You can also omit the url param and just
+    # use the block to configure the crawler instead.
+    #
+    # @param urls [*String, *Wgit::Url] The URL(s) to crawl
+    # or nil (if only using the block to configure the crawler).
+    # @yield [crawler] The crawler that'll be used in the subsequent
+    # crawl/index; use the block to configure.
+    def start(*urls, &block)
+      crawler(&block)
+      @dsl_start = urls
+    end
+
+    # Sets the xpath to be followed when `crawl_site` or `index_site` is
+    # subsequently called. Calling this method is optional as the default is to
+    # follow all `<a>` href's that point to the site domain. You can also pass
+    # `follow:` to the crawl/index methods directly.
+    #
+    # @param xpath [String] The xpath which is followed when crawling/indexing
+    # a site. Use `:default` to restore the default follow logic.
+    def follow(xpath)
+      @dsl_follow = xpath
+    end
+
+    # Crawls one or more individual urls using `Wgit::Crawler#crawl_url`
+    # underneath. If no urls are provided, then the `start` URL is used.
+    #
+    # @param urls [*Wgit::Url] The URL's to crawl. Defaults to the `start`
+    # URL(s).
+    # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+    # redirects. Pass a Symbol to limit where the redirect is allowed to go
+    # e.g. :host only allows redirects within the same host. Choose from
+    # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    # This value will be used for all urls crawled.
+    # @yield [doc] Given each crawled page (Wgit::Document); this is the only
+    # way to interact with them.
+    # @raise [StandardError] If no urls are provided and no `start` URL has
+    # been set.
+    # @return [Wgit::Document] The last Document crawled.
+    def crawl(*urls, follow_redirects: true, &block)
+      urls = (@dsl_start || []) if urls.empty?
+      raise DSL_ERROR__NO_START_URL if urls.empty?
+
+      urls.map! { |url| Wgit::Url.parse(url) }
+      crawler.crawl_urls(*urls, follow_redirects: follow_redirects, &block)
+    end
+
+    # Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no
+    # url is provided, then the first `start` URL is used.
+    #
+    # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to be
+    # crawled. It is recommended that this URL be the index page of the site
+    # to give a greater chance of finding all pages within that site/host.
+    # Defaults to the `start` URLs.
+    # @param follow [String] The xpath extracting links to be followed during
+    # the crawl. This changes how a site is crawled. Only links pointing to
+    # the site domain are allowed. The `:default` is any `<a>` href returning
+    # HTML. This can also be set using `follow`.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    # selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    # by rejecting them if their path `File.fnmatch?` one of disallow_paths.
+    # @yield [doc] Given each crawled page (Wgit::Document) of the site.
+    # A block is the only way to interact with each crawled Document.
+    # Use `doc.empty?` to determine if the page is valid.
+    # @raise [StandardError] If no url is provided and no `start` URL has been
+    # set.
+    # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
+    # from all of the site's pages or nil if the given url could not be
+    # crawled successfully.
+    def crawl_site(
+      *urls, follow: @dsl_follow,
+      allow_paths: nil, disallow_paths: nil, &block
+    )
+      urls = (@dsl_start || []) if urls.empty?
+      raise DSL_ERROR__NO_START_URL if urls.empty?
+
+      xpath = follow || :default
+      opts = {
+        follow: xpath, allow_paths: allow_paths, disallow_paths: disallow_paths
+      }
+
+      urls.reduce([]) do |externals, url|
+        externals + crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
+      end
+    end
+
+    # Returns the DSL's `crawler#last_response`.
+    #
+    # @return [Wgit::Response] The response from the last URL crawled.
+    def last_response
+      crawler.last_response
+    end
+
+    # Nilifies the DSL instance variables.
+    def reset
+      @dsl_crawler = nil
+      @dsl_start = nil
+      @dsl_follow = nil
+      @dsl_conn_str = nil
+    end
+
+    ### INDEXER METHODS ###
+
+    # Defines the connection string to the database used in subsequent `index*`
+    # method calls. This method is optional as the connection string can be
+    # passed to the index method instead.
+    #
+    # @param conn_str [String] The connection string used to connect to the
+    # database in subsequent `index*` method calls.
+    def connection_string(conn_str)
+      @dsl_conn_str = conn_str
+    end
+
+    # Indexes the World Wide Web using `Wgit::Indexer#index_www` underneath.
+    #
+    # @param connection_string [String] The database connection string. Set as
+    # nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+    # `connection_string`.
+    # @param max_sites [Integer] The number of separate and whole
+    # websites to be crawled before the method exits. Defaults to -1 which
+    # means the crawl will occur until manually stopped (Ctrl+C etc).
+    # @param max_data [Integer] The maximum amount of bytes that will be
+    # scraped from the web (default is 1GB). Note, that this value is used to
+    # determine when to stop crawling; it's not a guarantee of the max data
+    # that will be obtained.
+    def index_www(
+      connection_string: @dsl_conn_str, max_sites: -1, max_data: 1_048_576_000
+    )
+      db = Wgit::Database.new(connection_string)
+      indexer = Wgit::Indexer.new(db, crawler)
+
+      indexer.index_www(max_sites: max_sites, max_data: max_data)
+    end
+
+    # Indexes a single website using `Wgit::Indexer#index_site` underneath.
+    #
+    # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to
+    # crawl. Can be set using `start`.
+    # @param connection_string [String] The database connection string. Set as
+    # nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+    # `connection_string`.
+    # @param insert_externals [Boolean] Whether or not to insert the website's
+    # external URL's into the database.
+    # @param follow [String] The xpath extracting links to be followed during
+    # the crawl. This changes how a site is crawled. Only links pointing to
+    # the site domain are allowed. The `:default` is any `<a>` href returning
+    # HTML. This can also be set using `follow`.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    # selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    # by rejecting them if their path `File.fnmatch?` one of disallow_paths.
+    # @yield [doc] Given the Wgit::Document of each crawled webpage, before it
+    # is inserted into the database allowing for prior manipulation.
+    # @raise [StandardError] If no url is provided and no `start` URL has been
+    # set.
+    # @return [Integer] The total number of pages crawled within the website.
+    def index_site(
+      *urls, connection_string: @dsl_conn_str,
+      insert_externals: false, follow: @dsl_follow,
+      allow_paths: nil, disallow_paths: nil, &block
+    )
+      urls = (@dsl_start || []) if urls.empty?
+      raise DSL_ERROR__NO_START_URL if urls.empty?
+
+      db = Wgit::Database.new(connection_string)
+      indexer = Wgit::Indexer.new(db, crawler)
+      xpath = follow || :default
+      crawl_opts = {
+        insert_externals: insert_externals, follow: xpath,
+        allow_paths: allow_paths, disallow_paths: disallow_paths
+      }
+
+      urls.reduce(0) do |total, url|
+        total + indexer.index_site(Wgit::Url.parse(url), **crawl_opts, &block)
+      end
+    end
+
+    # Indexes a single webpage using `Wgit::Indexer#index_url` underneath.
+    #
+    # @param urls [*Wgit::Url] The webpage URL's to crawl. Defaults to the
+    # `start` URL(s).
+    # @param connection_string [String] The database connection string. Set as
+    # nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+    # `connection_string`.
+    # @param insert_externals [Boolean] Whether or not to insert the website's
+    # external URL's into the database.
+    # @yield [doc] Given the Wgit::Document of the crawled webpage,
+    # before it's inserted into the database allowing for prior
+    # manipulation. Return nil or false from the block to prevent the
+    # document from being saved into the database.
+    # @raise [StandardError] If no urls are provided and no `start` URL has
+    # been set.
+    def index(
+      *urls, connection_string: @dsl_conn_str,
+      insert_externals: false, &block
+    )
+      urls = (@dsl_start || []) if urls.empty?
+      raise DSL_ERROR__NO_START_URL if urls.empty?
+
+      db = Wgit::Database.new(connection_string)
+      indexer = Wgit::Indexer.new(db, crawler)
+
+      urls.map! { |url| Wgit::Url.parse(url) }
+      indexer.index_urls(*urls, insert_externals: insert_externals, &block)
+    end
+
+    # Performs a search of the database's indexed documents and pretty prints
+    # the results in a search engine-esque format. See `Wgit::Database#search!`
+    # and `Wgit::Document#search!` for details of how the search works.
+    #
+    # @param query [String] The text query to search with.
+    # @param connection_string [String] The database connection string. Set as
+    # nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+    # `connection_string`.
+    # @param stream [nil, #puts] Any object that respond_to?(:puts). It is used
+    # to output text somewhere e.g. a file or STDERR. Use nil for no output.
+    # @param case_sensitive [Boolean] Whether character case must match.
+    # @param whole_sentence [Boolean] Whether multiple words should be searched
+    # for separately.
+    # @param limit [Integer] The max number of results to print.
+    # @param skip [Integer] The number of DB records to skip.
+    # @param sentence_limit [Integer] The max length of each result's text
+    # snippet.
+    # @yield [doc] Given each search result (Wgit::Document) returned from the
+    # database containing only its matching `#text`.
+    # @return [Array<Wgit::Document>] The search results with matching text.
+    def search(
+      query, connection_string: @dsl_conn_str, stream: STDOUT,
+      case_sensitive: false, whole_sentence: true,
+      limit: 10, skip: 0, sentence_limit: 80, &block
+    )
+      stream ||= File.open(File::NULL, 'w')
+      db = Wgit::Database.new(connection_string)
+
+      results = db.search!(
+        query,
+        case_sensitive: case_sensitive,
+        whole_sentence: whole_sentence,
+        limit: limit,
+        skip: skip,
+        sentence_limit: sentence_limit,
+        &block
+      )
+
+      Wgit::Utils.printf_search_results(results, stream: stream)
+
+      results
+    end
+
+    # Deletes everything in the urls and documents collections by calling
+    # `Wgit::Database#clear_db` underneath. This will nuke the entire database
+    # so yeah... be careful.
+    #
+    # @return [Integer] The number of deleted records.
+    def clear_db!(connection_string: @dsl_conn_str)
+      db = Wgit::Database.new(connection_string)
+      db.clear_db
+    end
+
+    alias crawl_r crawl_site
+    alias index_r index_site
+    alias start_urls start
+  end
+end
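Editor's note: the DSL above is designed to be mixed in (its instance variables are prefixed to avoid clashes when included). A rough usage sketch built only from the methods defined in this file; the start URL, follow xpath and `:headings` extractor are placeholders, the `doc.headings` accessor is assumed from the extractor behaviour, and a MongoDB connection string is only needed for the commented `index*`/`search` calls.

require 'wgit'

include Wgit::DSL

extract :headings, '//h1', singleton: false     # DSL wrapper around define_extractor

start 'https://example.com'                     # placeholder start URL
follow "//a/@href[not(contains(., 'login'))]"   # optional custom follow xpath

# Crawl the whole site; each page is yielded as a Wgit::Document.
crawl_site do |doc|
  puts "#{doc.url} -> #{doc.headings&.size} <h1> tag(s)"  # assumed accessor
end

# With a DB configured, indexing and searching work the same way:
# connection_string 'mongodb://user:pass@localhost/wgit'  # placeholder
# index_site
# search 'some query'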
data/lib/wgit/indexer.rb
CHANGED
@@ -4,125 +4,8 @@ require_relative 'crawler'
 require_relative 'database/database'

 module Wgit
-  #
-  # Wgit::
-  #
-  # Retrieves uncrawled url's from the database and recursively crawls each
-  # site storing their internal pages into the database and adding their
-  # external url's to be crawled later on. Logs info on the crawl
-  # using Wgit.logger as it goes along.
-  #
-  # @param connection_string [String] The database connection string. Set as
-  # nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param max_sites [Integer] The number of separate and whole
-  # websites to be crawled before the method exits. Defaults to -1 which
-  # means the crawl will occur until manually stopped (Ctrl+C etc).
-  # @param max_data [Integer] The maximum amount of bytes that will be
-  # scraped from the web (default is 1GB). Note, that this value is used to
-  # determine when to stop crawling; it's not a guarantee of the max data
-  # that will be obtained.
-  def self.index_www(
-    connection_string: nil, max_sites: -1, max_data: 1_048_576_000
-  )
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_www(max_sites: max_sites, max_data: max_data)
-  end
-
-  # Convience method to index a single website using
-  # Wgit::Indexer#index_site.
-  #
-  # Crawls a single website's pages and stores them into the database.
-  # There is no max download limit so be careful which sites you index.
-  #
-  # @param url [Wgit::Url, String] The base Url of the website to crawl.
-  # @param connection_string [String] The database connection string. Set as
-  # nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param insert_externals [Boolean] Whether or not to insert the website's
-  # external Url's into the database.
-  # @yield [doc] Given the Wgit::Document of each crawled webpage, before it's
-  # inserted into the database allowing for prior manipulation.
-  # @return [Integer] The total number of pages crawled within the website.
-  def self.index_site(
-    url, connection_string: nil, insert_externals: true,
-    allow_paths: nil, disallow_paths: nil, &block
-  )
-    url = Wgit::Url.parse(url)
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_site(
-      url, insert_externals: insert_externals,
-      allow_paths: allow_paths, disallow_paths: disallow_paths, &block
-    )
-  end
-
-  # Convience method to index a single webpage using
-  # Wgit::Indexer#index_page.
-  #
-  # Crawls a single webpage and stores it into the database.
-  # There is no max download limit so be careful of large pages.
-  #
-  # @param url [Wgit::Url, String] The Url of the webpage to crawl.
-  # @param connection_string [String] The database connection string. Set as
-  # nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param insert_externals [Boolean] Whether or not to insert the website's
-  # external Url's into the database.
-  # @yield [doc] Given the Wgit::Document of the crawled webpage, before it's
-  # inserted into the database allowing for prior manipulation.
-  def self.index_page(
-    url, connection_string: nil, insert_externals: true, &block
-  )
-    url = Wgit::Url.parse(url)
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_page(url, insert_externals: insert_externals, &block)
-  end
-
-  # Performs a search of the database's indexed documents and pretty prints
-  # the results. See Wgit::Database#search and Wgit::Document#search for
-  # details of how the search works.
-  #
-  # @param query [String] The text query to search with.
-  # @param connection_string [String] The database connection string. Set as
-  # nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param case_sensitive [Boolean] Whether character case must match.
-  # @param whole_sentence [Boolean] Whether multiple words should be searched
-  # for separately.
-  # @param limit [Integer] The max number of results to print.
-  # @param skip [Integer] The number of DB records to skip.
-  # @param sentence_limit [Integer] The max length of each result's text
-  # snippet.
-  # @yield [doc] Given each search result (Wgit::Document) returned from the
-  # database.
-  def self.indexed_search(
-    query, connection_string: nil,
-    case_sensitive: false, whole_sentence: false,
-    limit: 10, skip: 0, sentence_limit: 80, &block
-  )
-    db = Wgit::Database.new(connection_string)
-
-    results = db.search(
-      query,
-      case_sensitive: case_sensitive,
-      whole_sentence: whole_sentence,
-      limit: limit,
-      skip: skip,
-      &block
-    )
-
-    results.each do |doc|
-      doc.search!(
-        query,
-        case_sensitive: case_sensitive,
-        whole_sentence: whole_sentence,
-        sentence_limit: sentence_limit
-      )
-    end
-
-    Wgit::Utils.printf_search_results(results)
-  end
-
-  # Class which sets up a crawler and saves the indexed docs to a database.
+  # Class which crawls and saves the Documents to a database. Can be thought of
+  # as a combination of Wgit::Crawler and Wgit::Database.
   class Indexer
     # The crawler used to index the WWW.
     attr_reader :crawler
@@ -133,10 +16,11 @@ module Wgit
     # Initialize the Indexer.
     #
     # @param database [Wgit::Database] The database instance (already
-    # initialized
-
-
+    # initialized and connected) used to index.
+    # @param crawler [Wgit::Crawler] The crawler instance used to index.
+    def initialize(database = Wgit::Database.new, crawler = Wgit::Crawler.new)
       @db = database
+      @crawler = crawler
     end

     # Retrieves uncrawled url's from the database and recursively crawls each
@@ -184,7 +68,8 @@ database capacity, exiting.")

         site_docs_count = 0
         ext_links = @crawler.crawl_site(url) do |doc|
-
+          unless doc.empty?
+            write_doc_to_db(doc)
             docs_count += 1
             site_docs_count += 1
           end
@@ -193,12 +78,9 @@ database capacity, exiting.")
         raise 'Error updating url' unless @db.update(url) == 1

         urls_count += write_urls_to_db(ext_links)
-
-        Wgit.logger.info("Crawled and saved #{site_docs_count} docs for the \
-site: #{url}")
       end

-      Wgit.logger.info("Crawled and
+      Wgit.logger.info("Crawled and indexed docs for #{docs_count} url(s) \
 overall for this iteration.")
       Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
 the next iteration.")
@@ -214,62 +96,91 @@ the next iteration.")
     # @param url [Wgit::Url] The base Url of the website to crawl.
     # @param insert_externals [Boolean] Whether or not to insert the website's
     # external Url's into the database.
+    # @param follow [String] The xpath extracting links to be followed during
+    # the crawl. This changes how a site is crawled. Only links pointing to
+    # the site domain are allowed. The `:default` is any `<a>` href returning
+    # HTML.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    # selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    # by rejecting them if their path `File.fnmatch?` one of disallow_paths.
     # @yield [doc] Given the Wgit::Document of each crawled web page before
     # it's inserted into the database allowing for prior manipulation. Return
     # nil or false from the block to prevent the document from being saved
     # into the database.
     # @return [Integer] The total number of webpages/documents indexed.
     def index_site(
-      url, insert_externals:
+      url, insert_externals: false, follow: :default,
+      allow_paths: nil, disallow_paths: nil
     )
-      crawl_opts = {
+      crawl_opts = {
+        follow: follow,
+        allow_paths: allow_paths,
+        disallow_paths: disallow_paths
+      }
       total_pages_indexed = 0

-      ext_urls = @crawler.crawl_site(url, crawl_opts) do |doc|
-        result = true
-        result = yield(doc) if block_given?
+      ext_urls = @crawler.crawl_site(url, **crawl_opts) do |doc|
+        result = block_given? ? yield(doc) : true

-        if result && !doc.empty?
+        if result && !doc.empty?
+          write_doc_to_db(doc)
           total_pages_indexed += 1
-          Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
         end
       end

-      @db.
+      @db.upsert(url)

       if insert_externals && ext_urls
         num_inserted_urls = write_urls_to_db(ext_urls)
         Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
       end

-      Wgit.logger.info("Crawled and
-site: #{url}")
+      Wgit.logger.info("Crawled and indexed #{total_pages_indexed} docs for \
+the site: #{url}")

       total_pages_indexed
     end

+    # Crawls one or more webpages and stores them into the database.
+    # There is no max download limit so be careful of large pages.
+    # Logs info on the crawl using Wgit.logger as it goes along.
+    #
+    # @param urls [*Wgit::Url] The webpage Url's to crawl.
+    # @param insert_externals [Boolean] Whether or not to insert the webpages
+    # external Url's into the database.
+    # @yield [doc] Given the Wgit::Document of the crawled webpage,
+    # before it's inserted into the database allowing for prior
+    # manipulation. Return nil or false from the block to prevent the
+    # document from being saved into the database.
+    # @raise [StandardError] if no urls are provided.
+    def index_urls(*urls, insert_externals: false, &block)
+      raise 'You must provide at least one Url' if urls.empty?
+
+      opts = { insert_externals: insert_externals }
+      Wgit::Utils.each(urls) { |url| index_url(url, **opts, &block) }
+
+      nil
+    end
+
     # Crawls a single webpage and stores it into the database.
     # There is no max download limit so be careful of large pages.
     # Logs info on the crawl using Wgit.logger as it goes along.
     #
     # @param url [Wgit::Url] The webpage Url to crawl.
-    # @param insert_externals [Boolean] Whether or not to insert the
+    # @param insert_externals [Boolean] Whether or not to insert the webpages
     # external Url's into the database.
     # @yield [doc] Given the Wgit::Document of the crawled webpage,
     # before it's inserted into the database allowing for prior
     # manipulation. Return nil or false from the block to prevent the
     # document from being saved into the database.
-    def
+    def index_url(url, insert_externals: false)
       document = @crawler.crawl_url(url) do |doc|
-        result = true
-
-
-        if result && !doc.empty? && write_doc_to_db(doc)
-          Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
-        end
+        result = block_given? ? yield(doc) : true
+        write_doc_to_db(doc) if result && !doc.empty?
       end

-      @db.
+      @db.upsert(url)

       ext_urls = document&.external_links
       if insert_externals && ext_urls
@@ -302,23 +213,19 @@ site: #{url}")
     # collection deliberately prevents duplicate inserts.
     #
     # @param doc [Wgit::Document] The document to write to the DB.
-    # @return [Boolean] True if the write was successful, false otherwise.
     def write_doc_to_db(doc)
-      @db.
-
-
-
-
-      Wgit.logger.info("Document already exists: #{doc.url}")
-
-      false
+      if @db.upsert(doc)
+        Wgit.logger.info("Saved document for url: #{doc.url}")
+      else
+        Wgit.logger.info("Updated document for url: #{doc.url}")
+      end
     end

     # Write the urls to the DB. Note that the unique url index on the urls
     # collection deliberately prevents duplicate inserts.
     #
     # @param urls [Array<Wgit::Url>] The urls to write to the DB.
-    # @return [
+    # @return [Integer] The number of inserted urls.
     def write_urls_to_db(urls)
       count = 0

@@ -332,6 +239,7 @@ site: #{url}")

         @db.insert(url)
         count += 1
+
         Wgit.logger.info("Inserted external url: #{url}")
       rescue Mongo::Error::OperationFailure
         Wgit.logger.info("External url already exists: #{url}")
@@ -339,5 +247,9 @@ site: #{url}")

       count
     end
+
+    alias database db
+    alias index index_urls
+    alias index_r index_site
   end
 end
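Editor's note: for completeness, a small sketch of driving the reworked Indexer directly rather than through the DSL, using only methods visible in this diff (`Indexer.new(db, crawler)`, `index_urls`, `index_site`, block-based filtering); the connection string is a placeholder and a running MongoDB instance is assumed.

require 'wgit'

# Wgit::Database falls back to ENV['WGIT_CONNECTION_STRING'] when given nil.
db      = Wgit::Database.new('mongodb://user:pass@localhost/wgit') # placeholder
crawler = Wgit::Crawler.new

# 0.9.0's Indexer takes both collaborators (each with a default) at construction.
indexer = Wgit::Indexer.new(db, crawler)

# index_urls crawls and upserts each page; return false/nil from the block
# to skip saving that document.
indexer.index_urls(Wgit::Url.new('https://example.com')) do |doc|
  !doc.empty?
end

# index_site crawls a whole site and returns the number of pages indexed.
total = indexer.index_site(Wgit::Url.new('https://example.com'), follow: :default)
puts "Indexed #{total} page(s)"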