wgit 0.7.0 → 0.10.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +74 -2
- data/LICENSE.txt +1 -1
- data/README.md +114 -290
- data/bin/wgit +9 -5
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +219 -79
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +226 -143
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +21 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +65 -162
- data/lib/wgit/response.rb +11 -8
- data/lib/wgit/url.rb +192 -61
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- data/lib/wgit.rb +3 -1
- metadata +34 -19
@@ -1,27 +1,35 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
### Default Document
|
3
|
+
### Default Document Extractors ###
|
4
4
|
|
5
5
|
# Base.
|
6
|
-
Wgit::Document.define_extension(
|
6
|
+
Wgit::Document.define_extractor(
|
7
7
|
:base,
|
8
8
|
'//base/@href',
|
9
9
|
singleton: true,
|
10
10
|
text_content_only: true
|
11
11
|
) do |base|
|
12
|
-
Wgit::Url.
|
12
|
+
Wgit::Url.parse?(base) if base
|
13
13
|
end
|
14
14
|
|
15
15
|
# Title.
|
16
|
-
Wgit::Document.define_extension(
|
16
|
+
Wgit::Document.define_extractor(
|
17
17
|
:title,
|
18
18
|
'//title',
|
19
19
|
singleton: true,
|
20
20
|
text_content_only: true
|
21
21
|
)
|
22
22
|
|
23
|
+
# Description.
|
24
|
+
Wgit::Document.define_extractor(
|
25
|
+
:description,
|
26
|
+
'//meta[@name="description"]/@content',
|
27
|
+
singleton: true,
|
28
|
+
text_content_only: true
|
29
|
+
)
|
30
|
+
|
23
31
|
# Author.
|
24
|
-
Wgit::Document.define_extension(
|
32
|
+
Wgit::Document.define_extractor(
|
25
33
|
:author,
|
26
34
|
'//meta[@name="author"]/@content',
|
27
35
|
singleton: true,
|
@@ -29,7 +37,7 @@ Wgit::Document.define_extension(
|
|
29
37
|
)
|
30
38
|
|
31
39
|
# Keywords.
|
32
|
-
Wgit::Document.define_extension(
|
40
|
+
Wgit::Document.define_extractor(
|
33
41
|
:keywords,
|
34
42
|
'//meta[@name="keywords"]/@content',
|
35
43
|
singleton: true,
|
@@ -37,25 +45,27 @@ Wgit::Document.define_extension(
|
|
37
45
|
) do |keywords, _source, type|
|
38
46
|
if keywords && (type == :document)
|
39
47
|
keywords = keywords.split(',')
|
40
|
-
Wgit::Utils.
|
48
|
+
Wgit::Utils.sanitize(keywords)
|
41
49
|
end
|
42
50
|
keywords
|
43
51
|
end
|
44
52
|
|
45
53
|
# Links.
|
46
|
-
Wgit::Document.define_extension(
|
54
|
+
Wgit::Document.define_extractor(
|
47
55
|
:links,
|
48
56
|
'//a/@href',
|
49
57
|
singleton: false,
|
50
58
|
text_content_only: true
|
51
59
|
) do |links|
|
52
|
-
links
|
60
|
+
links
|
61
|
+
.map { |link| Wgit::Url.parse?(link) }
|
62
|
+
.compact # Remove unparsable links.
|
53
63
|
end
|
54
64
|
|
55
65
|
# Text.
|
56
|
-
Wgit::Document.define_extension(
|
66
|
+
Wgit::Document.define_extractor(
|
57
67
|
:text,
|
58
|
-
Wgit::Document
|
68
|
+
proc { Wgit::Document.text_elements_xpath },
|
59
69
|
singleton: false,
|
60
70
|
text_content_only: true
|
61
71
|
)
|
data/lib/wgit/dsl.rb
ADDED
@@ -0,0 +1,324 @@
|
|
1
|
+
module Wgit
|
2
|
+
# DSL methods that act as a wrapper around Wgit's underlying class methods.
|
3
|
+
# All instance vars/constants are prefixed to avoid conflicts when included.
|
4
|
+
module DSL
|
5
|
+
# Error message shown when there's no URL to crawl.
|
6
|
+
DSL_ERROR__NO_START_URL = "missing url, pass as parameter to this or \
|
7
|
+
the 'start' function".freeze
|
8
|
+
|
9
|
+
### CRAWLER METHODS ###
|
10
|
+
|
11
|
+
# Defines an extractor using `Wgit::Document.define_extractor` underneath.
|
12
|
+
#
|
13
|
+
# @param var [Symbol] The name of the variable to be initialised, that will
|
14
|
+
# contain the extracted content.
|
15
|
+
# @param xpath [String, #call] The xpath used to find the element(s)
|
16
|
+
# of the webpage. Only used when initializing from HTML.
|
17
|
+
#
|
18
|
+
# Pass a callable object (proc etc.) if you want the
|
19
|
+
# xpath value to be derived on Document initialisation (instead of when
|
20
|
+
# the extractor is defined). The call method must return a valid xpath
|
21
|
+
# String.
|
22
|
+
# @param opts [Hash] The options to define an extractor with. The
|
23
|
+
# options are only used when initializing from HTML, not the database.
|
24
|
+
# @option opts [Boolean] :singleton The singleton option determines
|
25
|
+
# whether or not the result(s) should be in an Array. If multiple
|
26
|
+
# results are found and singleton is true then the first result will be
|
27
|
+
# used. Defaults to true.
|
28
|
+
# @option opts [Boolean] :text_content_only The text_content_only option
|
29
|
+
# if true will use the text content of the Nokogiri result object,
|
30
|
+
# otherwise the Nokogiri object itself is returned. Defaults to true.
|
31
|
+
# @yield The block is executed when a Wgit::Document is initialized,
|
32
|
+
# regardless of the source. Use it (optionally) to process the result
|
33
|
+
# value.
|
34
|
+
# @yieldparam value [Object] The result value to be assigned to the new
|
35
|
+
# `var`.
|
36
|
+
# @yieldparam source [Wgit::Document, Object] The source of the `value`.
|
37
|
+
# @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
|
38
|
+
# `:object`.
|
39
|
+
# @yieldreturn [Object] The return value of the block becomes the new var's
|
40
|
+
# value. Return the block's value param unchanged if you want to inspect.
|
41
|
+
# @raise [StandardError] If the var param isn't valid.
|
42
|
+
# @return [Symbol] The given var Symbol if successful.
|
43
|
+
def extract(var, xpath, opts = {}, &block)
|
44
|
+
Wgit::Document.define_extractor(var, xpath, opts, &block)
|
45
|
+
end
|
46
|
+
|
47
|
+
# Initializes a `Wgit::Crawler`. This crawler is then used in all crawl and
|
48
|
+
# index methods used by the DSL. See the Wgit::Crawler documentation for
|
49
|
+
# more details.
|
50
|
+
#
|
51
|
+
# @yield [crawler] The created crawler; use the block to configure.
|
52
|
+
# @return [Wgit::Crawler] The created crawler used by the DSL.
|
53
|
+
def crawler
|
54
|
+
@dsl_crawler ||= Wgit::Crawler.new
|
55
|
+
yield @dsl_crawler if block_given?
|
56
|
+
@dsl_crawler
|
57
|
+
end
|
58
|
+
|
59
|
+
# Sets the URL to be crawled when a `crawl*` or `index*` method is
|
60
|
+
# subsequently called. Calling this is optional as the URL can be
|
61
|
+
# passed to the method instead. You can also omit the url param and just
|
62
|
+
# use the block to configure the crawler instead.
|
63
|
+
#
|
64
|
+
# @param urls [*String, *Wgit::Url] The URL(s) to crawl
|
65
|
+
# or nil (if only using the block to configure the crawler).
|
66
|
+
# @yield [crawler] The crawler that'll be used in the subsequent
|
67
|
+
# crawl/index; use the block to configure.
|
68
|
+
def start(*urls, &block)
|
69
|
+
crawler(&block)
|
70
|
+
@dsl_start = urls
|
71
|
+
end
|
72
|
+
|
73
|
+
# Sets the xpath to be followed when `crawl_site` or `index_site` is
|
74
|
+
# subsequently called. Calling this method is optional as the default is to
|
75
|
+
# follow all `<a>` href's that point to the site domain. You can also pass
|
76
|
+
# `follow:` to the crawl/index methods directly.
|
77
|
+
#
|
78
|
+
# @param xpath [String] The xpath which is followed when crawling/indexing
|
79
|
+
# a site. Use `:default` to restore the default follow logic.
|
80
|
+
def follow(xpath)
|
81
|
+
@dsl_follow = xpath
|
82
|
+
end
|
83
|
+
|
84
|
+
# Crawls one or more individual urls using `Wgit::Crawler#crawl_urls`
|
85
|
+
# underneath. If no urls are provided, then the `start` URL is used.
|
86
|
+
#
|
87
|
+
# @param urls [*Wgit::Url] The URLs to crawl. Defaults to the `start`
|
88
|
+
# URL(s).
|
89
|
+
# @param follow_redirects [Boolean, Symbol] Whether or not to follow
|
90
|
+
# redirects. Pass a Symbol to limit where the redirect is allowed to go
|
91
|
+
# e.g. :host only allows redirects within the same host. Choose from
|
92
|
+
# :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
|
93
|
+
# This value will be used for all urls crawled.
|
94
|
+
# @yield [doc] Given each crawled page (Wgit::Document); this is the only
|
95
|
+
# way to interact with them.
|
96
|
+
# @raise [StandardError] If no urls are provided and no `start` URL has
|
97
|
+
# been set.
|
98
|
+
# @return [Wgit::Document] The last Document crawled.
|
99
|
+
def crawl(*urls, follow_redirects: true, &block)
|
100
|
+
urls = (@dsl_start || []) if urls.empty?
|
101
|
+
raise DSL_ERROR__NO_START_URL if urls.empty?
|
102
|
+
|
103
|
+
urls.map! { |url| Wgit::Url.parse(url) }
|
104
|
+
crawler.crawl_urls(*urls, follow_redirects: follow_redirects, &block)
|
105
|
+
end
|
106
|
+
|
107
|
+
# Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no
|
108
|
+
# url is provided, then the `start` URL(s) are used.
|
109
|
+
#
|
110
|
+
# @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to be
|
111
|
+
# crawled. It is recommended that this URL be the index page of the site
|
112
|
+
# to give a greater chance of finding all pages within that site/host.
|
113
|
+
# Defaults to the `start` URLs.
|
114
|
+
# @param follow [String] The xpath extracting links to be followed during
|
115
|
+
# the crawl. This changes how a site is crawled. Only links pointing to
|
116
|
+
# the site domain are allowed. The `:default` is any `<a>` href returning
|
117
|
+
# HTML. This can also be set using `follow`.
|
118
|
+
# @param allow_paths [String, Array<String>] Filters the `follow:` links by
|
119
|
+
# selecting them if their path `File.fnmatch?` one of allow_paths.
|
120
|
+
# @param disallow_paths [String, Array<String>] Filters the `follow` links
|
121
|
+
# by rejecting them if their path `File.fnmatch?` one of disallow_paths.
|
122
|
+
# @yield [doc] Given each crawled page (Wgit::Document) of the site.
|
123
|
+
# A block is the only way to interact with each crawled Document.
|
124
|
+
# Use `doc.empty?` to determine if the page is valid.
|
125
|
+
# @raise [StandardError] If no url is provided and no `start` URL has been
|
126
|
+
# set.
|
127
|
+
# @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
|
128
|
+
# from all of the site's pages or nil if the given url could not be
|
129
|
+
# crawled successfully.
|
130
|
+
def crawl_site(
|
131
|
+
*urls, follow: @dsl_follow,
|
132
|
+
allow_paths: nil, disallow_paths: nil, &block
|
133
|
+
)
|
134
|
+
urls = (@dsl_start || []) if urls.empty?
|
135
|
+
raise DSL_ERROR__NO_START_URL if urls.empty?
|
136
|
+
|
137
|
+
xpath = follow || :default
|
138
|
+
opts = {
|
139
|
+
follow: xpath, allow_paths: allow_paths, disallow_paths: disallow_paths
|
140
|
+
}
|
141
|
+
|
142
|
+
urls.reduce([]) do |externals, url|
|
143
|
+
externals + crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
147
|
+
# Returns the DSL's `crawler#last_response`.
|
148
|
+
#
|
149
|
+
# @return [Wgit::Response] The response from the last URL crawled.
|
150
|
+
def last_response
|
151
|
+
crawler.last_response
|
152
|
+
end
|
153
|
+
|
154
|
+
# Nilifies the DSL instance variables.
|
155
|
+
def reset
|
156
|
+
@dsl_crawler = nil
|
157
|
+
@dsl_start = nil
|
158
|
+
@dsl_follow = nil
|
159
|
+
@dsl_conn_str = nil
|
160
|
+
end
|
161
|
+
|
162
|
+
### INDEXER METHODS ###
|
163
|
+
|
164
|
+
# Defines the connection string to the database used in subsequent `index*`
|
165
|
+
# method calls. This method is optional as the connection string can be
|
166
|
+
# passed to the index method instead.
|
167
|
+
#
|
168
|
+
# @param conn_str [String] The connection string used to connect to the
|
169
|
+
# database in subsequent `index*` method calls.
|
170
|
+
def connection_string(conn_str)
|
171
|
+
@dsl_conn_str = conn_str
|
172
|
+
end
|
173
|
+
|
174
|
+
# Indexes the World Wide Web using `Wgit::Indexer#index_www` underneath.
|
175
|
+
#
|
176
|
+
# @param connection_string [String] The database connection string. Set as
|
177
|
+
# nil to use ENV['WGIT_CONNECTION_STRING'] or set using
|
178
|
+
# `connection_string`.
|
179
|
+
# @param max_sites [Integer] The number of separate and whole
|
180
|
+
# websites to be crawled before the method exits. Defaults to -1 which
|
181
|
+
# means the crawl will occur until manually stopped (Ctrl+C etc).
|
182
|
+
# @param max_data [Integer] The maximum number of bytes that will be
|
183
|
+
# scraped from the web (default is 1GB). Note, that this value is used to
|
184
|
+
# determine when to stop crawling; it's not a guarantee of the max data
|
185
|
+
# that will be obtained.
|
186
|
+
def index_www(
|
187
|
+
connection_string: @dsl_conn_str, max_sites: -1, max_data: 1_048_576_000
|
188
|
+
)
|
189
|
+
db = Wgit::Database.new(connection_string)
|
190
|
+
indexer = Wgit::Indexer.new(db, crawler)
|
191
|
+
|
192
|
+
indexer.index_www(max_sites: max_sites, max_data: max_data)
|
193
|
+
end
|
194
|
+
|
195
|
+
# Indexes one or more websites using `Wgit::Indexer#index_site` underneath.
|
196
|
+
#
|
197
|
+
# @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to
|
198
|
+
# crawl. Can be set using `start`.
|
199
|
+
# @param connection_string [String] The database connection string. Set as
|
200
|
+
# nil to use ENV['WGIT_CONNECTION_STRING'] or set using
|
201
|
+
# `connection_string`.
|
202
|
+
# @param insert_externals [Boolean] Whether or not to insert the website's
|
203
|
+
# external URLs into the database.
|
204
|
+
# @param follow [String] The xpath extracting links to be followed during
|
205
|
+
# the crawl. This changes how a site is crawled. Only links pointing to
|
206
|
+
# the site domain are allowed. The `:default` is any `<a>` href returning
|
207
|
+
# HTML. This can also be set using `follow`.
|
208
|
+
# @param allow_paths [String, Array<String>] Filters the `follow:` links by
|
209
|
+
# selecting them if their path `File.fnmatch?` one of allow_paths.
|
210
|
+
# @param disallow_paths [String, Array<String>] Filters the `follow` links
|
211
|
+
# by rejecting them if their path `File.fnmatch?` one of disallow_paths.
|
212
|
+
# @yield [doc] Given the Wgit::Document of each crawled webpage, before it
|
213
|
+
# is inserted into the database allowing for prior manipulation.
|
214
|
+
# @raise [StandardError] If no url is provided and no `start` URL has been
|
215
|
+
# set.
|
216
|
+
# @return [Integer] The total number of pages crawled within the website.
|
217
|
+
def index_site(
|
218
|
+
*urls, connection_string: @dsl_conn_str,
|
219
|
+
insert_externals: false, follow: @dsl_follow,
|
220
|
+
allow_paths: nil, disallow_paths: nil, &block
|
221
|
+
)
|
222
|
+
urls = (@dsl_start || []) if urls.empty?
|
223
|
+
raise DSL_ERROR__NO_START_URL if urls.empty?
|
224
|
+
|
225
|
+
db = Wgit::Database.new(connection_string)
|
226
|
+
indexer = Wgit::Indexer.new(db, crawler)
|
227
|
+
xpath = follow || :default
|
228
|
+
crawl_opts = {
|
229
|
+
insert_externals: insert_externals, follow: xpath,
|
230
|
+
allow_paths: allow_paths, disallow_paths: disallow_paths
|
231
|
+
}
|
232
|
+
|
233
|
+
urls.reduce(0) do |total, url|
|
234
|
+
total + indexer.index_site(Wgit::Url.parse(url), **crawl_opts, &block)
|
235
|
+
end
|
236
|
+
end
|
237
|
+
|
238
|
+
# Indexes one or more webpages using `Wgit::Indexer#index_urls` underneath.
|
239
|
+
#
|
240
|
+
# @param urls [*Wgit::Url] The webpage URLs to crawl. Defaults to the
|
241
|
+
# `start` URL(s).
|
242
|
+
# @param connection_string [String] The database connection string. Set as
|
243
|
+
# nil to use ENV['WGIT_CONNECTION_STRING'] or set using
|
244
|
+
# `connection_string`.
|
245
|
+
# @param insert_externals [Boolean] Whether or not to insert the website's
|
246
|
+
# external URLs into the database.
|
247
|
+
# @yield [doc] Given the Wgit::Document of the crawled webpage,
|
248
|
+
# before it's inserted into the database allowing for prior
|
249
|
+
# manipulation. Return nil or false from the block to prevent the
|
250
|
+
# document from being saved into the database.
|
251
|
+
# @raise [StandardError] If no urls are provided and no `start` URL has
|
252
|
+
# been set.
|
253
|
+
def index(
|
254
|
+
*urls, connection_string: @dsl_conn_str,
|
255
|
+
insert_externals: false, &block
|
256
|
+
)
|
257
|
+
urls = (@dsl_start || []) if urls.empty?
|
258
|
+
raise DSL_ERROR__NO_START_URL if urls.empty?
|
259
|
+
|
260
|
+
db = Wgit::Database.new(connection_string)
|
261
|
+
indexer = Wgit::Indexer.new(db, crawler)
|
262
|
+
|
263
|
+
urls.map! { |url| Wgit::Url.parse(url) }
|
264
|
+
indexer.index_urls(*urls, insert_externals: insert_externals, &block)
|
265
|
+
end
|
266
|
+
|
267
|
+
# Performs a search of the database's indexed documents and pretty prints
|
268
|
+
# the results in a search engine-esque format. See `Wgit::Database#search!`
|
269
|
+
# and `Wgit::Document#search!` for details of how the search works.
|
270
|
+
#
|
271
|
+
# @param query [String] The text query to search with.
|
272
|
+
# @param connection_string [String] The database connection string. Set as
|
273
|
+
# nil to use ENV['WGIT_CONNECTION_STRING'] or set using
|
274
|
+
# `connection_string`.
|
275
|
+
# @param stream [nil, #puts] Any object that respond_to?(:puts). It is used
|
276
|
+
# to output text somewhere e.g. a file or STDERR. Use nil for no output.
|
277
|
+
# @param case_sensitive [Boolean] Whether character case must match.
|
278
|
+
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
279
|
+
# for separately.
|
280
|
+
# @param limit [Integer] The max number of results to print.
|
281
|
+
# @param skip [Integer] The number of DB records to skip.
|
282
|
+
# @param sentence_limit [Integer] The max length of each result's text
|
283
|
+
# snippet.
|
284
|
+
# @yield [doc] Given each search result (Wgit::Document) returned from the
|
285
|
+
# database containing only its matching `#text`.
|
286
|
+
# @return [Array<Wgit::Document>] The search results with matching text.
|
287
|
+
def search(
|
288
|
+
query, connection_string: @dsl_conn_str, stream: STDOUT,
|
289
|
+
case_sensitive: false, whole_sentence: true,
|
290
|
+
limit: 10, skip: 0, sentence_limit: 80, &block
|
291
|
+
)
|
292
|
+
stream ||= File.open(File::NULL, 'w')
|
293
|
+
db = Wgit::Database.new(connection_string)
|
294
|
+
|
295
|
+
results = db.search!(
|
296
|
+
query,
|
297
|
+
case_sensitive: case_sensitive,
|
298
|
+
whole_sentence: whole_sentence,
|
299
|
+
limit: limit,
|
300
|
+
skip: skip,
|
301
|
+
sentence_limit: sentence_limit,
|
302
|
+
&block
|
303
|
+
)
|
304
|
+
|
305
|
+
Wgit::Utils.printf_search_results(results, stream: stream)
|
306
|
+
|
307
|
+
results
|
308
|
+
end
|
309
|
+
|
310
|
+
# Deletes everything in the urls and documents collections by calling
|
311
|
+
# `Wgit::Database#clear_db` underneath. This will nuke the entire database
|
312
|
+
# so yeah... be careful.
|
313
|
+
#
|
314
|
+
# @return [Integer] The number of deleted records.
|
315
|
+
def clear_db!(connection_string: @dsl_conn_str)
|
316
|
+
db = Wgit::Database.new(connection_string)
|
317
|
+
db.clear_db
|
318
|
+
end
|
319
|
+
|
320
|
+
alias crawl_r crawl_site
|
321
|
+
alias index_r index_site
|
322
|
+
alias start_urls start
|
323
|
+
end
|
324
|
+
end
|