wgit 0.7.0 → 0.10.1

@@ -1,27 +1,35 @@
  # frozen_string_literal: true

- ### Default Document Extensions ###
+ ### Default Document Extractors ###

  # Base.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
    :base,
    '//base/@href',
    singleton: true,
    text_content_only: true
  ) do |base|
-   Wgit::Url.new(base) if base
+   Wgit::Url.parse?(base) if base
  end

  # Title.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
    :title,
    '//title',
    singleton: true,
    text_content_only: true
  )

+ # Description.
+ Wgit::Document.define_extractor(
+   :description,
+   '//meta[@name="description"]/@content',
+   singleton: true,
+   text_content_only: true
+ )
+
  # Author.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
    :author,
    '//meta[@name="author"]/@content',
    singleton: true,
@@ -29,7 +37,7 @@ Wgit::Document.define_extension(
  )

  # Keywords.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
    :keywords,
    '//meta[@name="keywords"]/@content',
    singleton: true,
@@ -37,25 +45,27 @@ Wgit::Document.define_extension(
  ) do |keywords, _source, type|
    if keywords && (type == :document)
      keywords = keywords.split(',')
-     Wgit::Utils.process_arr(keywords)
+     Wgit::Utils.sanitize(keywords)
    end
    keywords
  end

  # Links.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
    :links,
    '//a/@href',
    singleton: false,
    text_content_only: true
  ) do |links|
-   links.map! { |link| Wgit::Url.new(link) }
+   links
+     .map { |link| Wgit::Url.parse?(link) }
+     .compact # Remove unparsable links.
  end

  # Text.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
    :text,
-   Wgit::Document::TEXT_ELEMENTS_XPATH,
+   proc { Wgit::Document.text_elements_xpath },
    singleton: false,
    text_content_only: true
  )
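Each `define_extractor` call above adds a reader of the same name to `Wgit::Document`, so the renamed API (and the new `:description` extractor) can be exercised roughly as sketched below; the URL is a placeholder and the snippet assumes `Wgit::Crawler#crawl_url` yields the crawled document, as the DSL docs below describe:

    require 'wgit'

    crawler = Wgit::Crawler.new
    crawler.crawl_url(Wgit::Url.new('https://example.com')) do |doc|
      puts doc.title        # Singleton extractor => String (or nil).
      puts doc.description  # New in this release => String (or nil).
      puts doc.links.size   # Non-singleton extractor => Array of Wgit::Url.
    end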
data/lib/wgit/dsl.rb ADDED
@@ -0,0 +1,324 @@
+ module Wgit
+   # DSL methods that act as a wrapper around Wgit's underlying class methods.
+   # All instance vars/constants are prefixed to avoid conflicts when included.
+   module DSL
+     # Error message shown when there's no URL to crawl.
+     DSL_ERROR__NO_START_URL = "missing url, pass as parameter to this or \
+ the 'start' function".freeze
+
+     ### CRAWLER METHODS ###
+
+     # Defines an extractor using `Wgit::Document.define_extractor` underneath.
+     #
+     # @param var [Symbol] The name of the variable to be initialised, that
+     #   will contain the extracted content.
+     # @param xpath [String, #call] The xpath used to find the element(s)
+     #   of the webpage. Only used when initializing from HTML.
+     #
+     #   Pass a callable object (proc etc.) if you want the xpath value to be
+     #   derived on Document initialisation (instead of when the extractor is
+     #   defined). The call method must return a valid xpath String.
+     # @param opts [Hash] The options to define an extractor with. The
+     #   options are only used when initializing from HTML, not the database.
+     # @option opts [Boolean] :singleton The singleton option determines
+     #   whether or not the result(s) should be in an Array. If multiple
+     #   results are found and singleton is true then the first result will be
+     #   used. Defaults to true.
+     # @option opts [Boolean] :text_content_only The text_content_only option,
+     #   if true, will use the text content of the Nokogiri result object,
+     #   otherwise the Nokogiri object itself is returned. Defaults to true.
+     # @yield The block is executed when a Wgit::Document is initialized,
+     #   regardless of the source. Use it (optionally) to process the result
+     #   value.
+     # @yieldparam value [Object] The result value to be assigned to the new
+     #   `var`.
+     # @yieldparam source [Wgit::Document, Object] The source of the `value`.
+     # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
+     #   `:object`.
+     # @yieldreturn [Object] The return value of the block becomes the new
+     #   var's value. Return the block's value param unchanged if you only
+     #   want to inspect it.
+     # @raise [StandardError] If the var param isn't valid.
+     # @return [Symbol] The given var Symbol if successful.
+     def extract(var, xpath, opts = {}, &block)
+       Wgit::Document.define_extractor(var, xpath, opts, &block)
+     end
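For example, a custom extractor defined through the DSL might look like the sketch below; the `:h1_headings` name and xpath are illustrative, not part of the gem:

    require 'wgit'
    include Wgit::DSL

    # Adds an `h1_headings` reader to every Wgit::Document.
    extract :h1_headings, '//h1', singleton: false, text_content_only: true do |headings|
      headings&.map(&:strip) # Post-process the extracted values.
    end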
+
+     # Initializes a `Wgit::Crawler`. This crawler is then used in all crawl
+     # and index methods used by the DSL. See the Wgit::Crawler documentation
+     # for more details.
+     #
+     # @yield [crawler] The created crawler; use the block to configure.
+     # @return [Wgit::Crawler] The created crawler used by the DSL.
+     def crawler
+       @dsl_crawler ||= Wgit::Crawler.new
+       yield @dsl_crawler if block_given?
+       @dsl_crawler
+     end
+
+     # Sets the URL(s) to be crawled when a `crawl*` or `index*` method is
+     # subsequently called. Calling this is optional as the URL can be
+     # passed to the method instead. You can also omit the url param and just
+     # use the block to configure the crawler instead.
+     #
+     # @param urls [*String, *Wgit::Url] The URL(s) to crawl
+     #   or nil (if only using the block to configure the crawler).
+     # @yield [crawler] The crawler that'll be used in the subsequent
+     #   crawl/index; use the block to configure it.
+     def start(*urls, &block)
+       crawler(&block)
+       @dsl_start = urls
+     end
+
+     # Sets the xpath to be followed when `crawl_site` or `index_site` is
+     # subsequently called. Calling this method is optional as the default is
+     # to follow all `<a>` hrefs that point to the site domain. You can also
+     # pass `follow:` to the crawl/index methods directly.
+     #
+     # @param xpath [String] The xpath which is followed when crawling/indexing
+     #   a site. Use `:default` to restore the default follow logic.
+     def follow(xpath)
+       @dsl_follow = xpath
+     end
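Taken together, `crawler`, `start` and `follow` can be called before any crawl/index method, as in the sketch below; the URL, xpath and timeout value are placeholders, and `timeout=` is assumed to be a `Wgit::Crawler` accessor:

    require 'wgit'
    include Wgit::DSL

    start 'https://example.com' do |crawler|
      crawler.timeout = 10 # Assumed Wgit::Crawler setting.
    end

    # Only follow blog links when crawl_site/index_site is called later.
    follow "//a[starts-with(@href, '/blog')]/@href"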
+
+     # Crawls one or more individual urls using `Wgit::Crawler#crawl_url`
+     # underneath. If no urls are provided, then the `start` URL(s) are used.
+     #
+     # @param urls [*Wgit::Url] The URLs to crawl. Defaults to the `start`
+     #   URL(s).
+     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
+     #   e.g. :host only allows redirects within the same host. Choose from
+     #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+     #   This value will be used for all urls crawled.
+     # @yield [doc] Given each crawled page (Wgit::Document); this is the only
+     #   way to interact with them.
+     # @raise [StandardError] If no urls are provided and no `start` URL has
+     #   been set.
+     # @return [Wgit::Document] The last Document crawled.
+     def crawl(*urls, follow_redirects: true, &block)
+       urls = (@dsl_start || []) if urls.empty?
+       raise DSL_ERROR__NO_START_URL if urls.empty?
+
+       urls.map! { |url| Wgit::Url.parse(url) }
+       crawler.crawl_urls(*urls, follow_redirects: follow_redirects, &block)
+     end
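A usage sketch with placeholder URLs:

    require 'wgit'
    include Wgit::DSL

    crawl 'https://example.com', 'https://example.org', follow_redirects: :host do |doc|
      puts "#{doc.url} - #{doc.title}"
    end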
+
+     # Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If
+     # no urls are provided, then the `start` URL(s) are used.
+     #
+     # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to
+     #   be crawled. It is recommended that this URL be the index page of the
+     #   site to give a greater chance of finding all pages within that
+     #   site/host. Defaults to the `start` URLs.
+     # @param follow [String] The xpath extracting links to be followed during
+     #   the crawl. This changes how a site is crawled. Only links pointing to
+     #   the site domain are allowed. The `:default` is any `<a>` href
+     #   returning HTML. This can also be set using `follow`.
+     # @param allow_paths [String, Array<String>] Filters the `follow:` links
+     #   by selecting them if their path `File.fnmatch?` one of allow_paths.
+     # @param disallow_paths [String, Array<String>] Filters the `follow` links
+     #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
+     # @yield [doc] Given each crawled page (Wgit::Document) of the site.
+     #   A block is the only way to interact with each crawled Document.
+     #   Use `doc.empty?` to determine if the page is valid.
+     # @raise [StandardError] If no url is provided and no `start` URL has been
+     #   set.
+     # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
+     #   from all of the site's pages or nil if the given url could not be
+     #   crawled successfully.
+     def crawl_site(
+       *urls, follow: @dsl_follow,
+       allow_paths: nil, disallow_paths: nil, &block
+     )
+       urls = (@dsl_start || []) if urls.empty?
+       raise DSL_ERROR__NO_START_URL if urls.empty?
+
+       xpath = follow || :default
+       opts = {
+         follow: xpath, allow_paths: allow_paths, disallow_paths: disallow_paths
+       }
+
+       urls.reduce([]) do |externals, url|
+         externals + crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
+       end
+     end
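A whole-site crawl restricted by path globs might look like this (the site URL and globs are placeholders):

    require 'wgit'
    include Wgit::DSL

    start 'https://example.com'

    externals = crawl_site allow_paths: 'articles/*', disallow_paths: 'articles/drafts/*' do |doc|
      puts doc.url unless doc.empty?
    end

    puts "#{externals&.size || 0} external links found"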
+
+     # Returns the DSL's `crawler#last_response`.
+     #
+     # @return [Wgit::Response] The response from the last URL crawled.
+     def last_response
+       crawler.last_response
+     end
+
+     # Nilifies the DSL instance variables.
+     def reset
+       @dsl_crawler = nil
+       @dsl_start = nil
+       @dsl_follow = nil
+       @dsl_conn_str = nil
+     end
+
+     ### INDEXER METHODS ###
+
+     # Defines the connection string to the database used in subsequent
+     # `index*` method calls. This method is optional as the connection string
+     # can be passed to the index method instead.
+     #
+     # @param conn_str [String] The connection string used to connect to the
+     #   database in subsequent `index*` method calls.
+     def connection_string(conn_str)
+       @dsl_conn_str = conn_str
+     end
+
+     # Indexes the World Wide Web using `Wgit::Indexer#index_www` underneath.
+     #
+     # @param connection_string [String] The database connection string. Set as
+     #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+     #   `connection_string`.
+     # @param max_sites [Integer] The number of separate and whole
+     #   websites to be crawled before the method exits. Defaults to -1 which
+     #   means the crawl will occur until manually stopped (Ctrl+C etc).
+     # @param max_data [Integer] The maximum number of bytes that will be
+     #   scraped from the web (default is 1GB). Note that this value is used to
+     #   determine when to stop crawling; it's not a guarantee of the max data
+     #   that will be obtained.
+     def index_www(
+       connection_string: @dsl_conn_str, max_sites: -1, max_data: 1_048_576_000
+     )
+       db = Wgit::Database.new(connection_string)
+       indexer = Wgit::Indexer.new(db, crawler)
+
+       indexer.index_www(max_sites: max_sites, max_data: max_data)
+     end
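Used together, the two indexer setup methods above might read as follows (the connection string is a placeholder):

    require 'wgit'
    include Wgit::DSL

    connection_string 'mongodb://username:password@localhost/crawler_db' # Placeholder.

    # Crawl and store 3 whole sites, or stop early once ~500MB has been scraped.
    index_www max_sites: 3, max_data: 524_288_000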
+
+     # Indexes a website (or websites) using `Wgit::Indexer#index_site`
+     # underneath.
+     #
+     # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to
+     #   crawl. Can be set using `start`.
+     # @param connection_string [String] The database connection string. Set as
+     #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+     #   `connection_string`.
+     # @param insert_externals [Boolean] Whether or not to insert the website's
+     #   external URLs into the database.
+     # @param follow [String] The xpath extracting links to be followed during
+     #   the crawl. This changes how a site is crawled. Only links pointing to
+     #   the site domain are allowed. The `:default` is any `<a>` href
+     #   returning HTML. This can also be set using `follow`.
+     # @param allow_paths [String, Array<String>] Filters the `follow:` links
+     #   by selecting them if their path `File.fnmatch?` one of allow_paths.
+     # @param disallow_paths [String, Array<String>] Filters the `follow` links
+     #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
+     # @yield [doc] Given the Wgit::Document of each crawled webpage, before it
+     #   is inserted into the database allowing for prior manipulation.
+     # @raise [StandardError] If no url is provided and no `start` URL has been
+     #   set.
+     # @return [Integer] The total number of pages crawled within the website.
+     def index_site(
+       *urls, connection_string: @dsl_conn_str,
+       insert_externals: false, follow: @dsl_follow,
+       allow_paths: nil, disallow_paths: nil, &block
+     )
+       urls = (@dsl_start || []) if urls.empty?
+       raise DSL_ERROR__NO_START_URL if urls.empty?
+
+       db = Wgit::Database.new(connection_string)
+       indexer = Wgit::Indexer.new(db, crawler)
+       xpath = follow || :default
+       crawl_opts = {
+         insert_externals: insert_externals, follow: xpath,
+         allow_paths: allow_paths, disallow_paths: disallow_paths
+       }
+
+       urls.reduce(0) do |total, url|
+         total + indexer.index_site(Wgit::Url.parse(url), **crawl_opts, &block)
+       end
+     end
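A sketch of indexing one site (the URL, glob and connection string are placeholders):

    require 'wgit'
    include Wgit::DSL

    start 'https://example.com'
    connection_string 'mongodb://username:password@localhost/crawler_db'

    total = index_site insert_externals: true, allow_paths: 'articles/*' do |doc|
      puts "Indexing #{doc.url}"
      doc # Return the document unchanged.
    end
    puts "#{total} pages indexed"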
+
+     # Indexes one or more webpages using `Wgit::Indexer#index_urls`
+     # underneath.
+     #
+     # @param urls [*Wgit::Url] The webpage URLs to crawl. Defaults to the
+     #   `start` URL(s).
+     # @param connection_string [String] The database connection string. Set as
+     #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+     #   `connection_string`.
+     # @param insert_externals [Boolean] Whether or not to insert the website's
+     #   external URLs into the database.
+     # @yield [doc] Given the Wgit::Document of the crawled webpage,
+     #   before it's inserted into the database allowing for prior
+     #   manipulation. Return nil or false from the block to prevent the
+     #   document from being saved into the database.
+     # @raise [StandardError] If no urls are provided and no `start` URL has
+     #   been set.
+     def index(
+       *urls, connection_string: @dsl_conn_str,
+       insert_externals: false, &block
+     )
+       urls = (@dsl_start || []) if urls.empty?
+       raise DSL_ERROR__NO_START_URL if urls.empty?
+
+       db = Wgit::Database.new(connection_string)
+       indexer = Wgit::Indexer.new(db, crawler)
+
+       urls.map! { |url| Wgit::Url.parse(url) }
+       indexer.index_urls(*urls, insert_externals: insert_externals, &block)
+     end
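For individual pages, the documented nil/false return can be used to filter what gets saved (the URLs and connection string are placeholders):

    require 'wgit'
    include Wgit::DSL

    connection_string 'mongodb://username:password@localhost/crawler_db'

    index 'https://example.com/about', 'https://example.com/contact' do |doc|
      !doc.text.empty? # A falsy return skips the DB insert for that page.
    end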
+
+     # Performs a search of the database's indexed documents and pretty prints
+     # the results in a search engine-esque format. See `Wgit::Database#search!`
+     # and `Wgit::Document#search!` for details of how the search works.
+     #
+     # @param query [String] The text query to search with.
+     # @param connection_string [String] The database connection string. Set as
+     #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+     #   `connection_string`.
+     # @param stream [nil, #puts] Any object that respond_to?(:puts). It is
+     #   used to output text somewhere e.g. a file or STDERR. Use nil for no
+     #   output.
+     # @param case_sensitive [Boolean] Whether character case must match.
+     # @param whole_sentence [Boolean] Whether to search for the query as a
+     #   whole sentence (true) or for each word separately (false).
+     # @param limit [Integer] The max number of results to print.
+     # @param skip [Integer] The number of DB records to skip.
+     # @param sentence_limit [Integer] The max length of each result's text
+     #   snippet.
+     # @yield [doc] Given each search result (Wgit::Document) returned from the
+     #   database containing only its matching `#text`.
+     # @return [Array<Wgit::Document>] The search results with matching text.
+     def search(
+       query, connection_string: @dsl_conn_str, stream: STDOUT,
+       case_sensitive: false, whole_sentence: true,
+       limit: 10, skip: 0, sentence_limit: 80, &block
+     )
+       stream ||= File.open(File::NULL, 'w')
+       db = Wgit::Database.new(connection_string)
+
+       results = db.search!(
+         query,
+         case_sensitive: case_sensitive,
+         whole_sentence: whole_sentence,
+         limit: limit,
+         skip: skip,
+         sentence_limit: sentence_limit,
+         &block
+       )
+
+       Wgit::Utils.printf_search_results(results, stream: stream)
+
+       results
+     end
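A search sketch (the query and connection string are placeholders):

    require 'wgit'
    include Wgit::DSL

    connection_string 'mongodb://username:password@localhost/crawler_db'

    results = search 'ruby web crawler', whole_sentence: false, limit: 5 do |doc|
      puts doc.url # Each result contains only the text snippets that matched.
    end
    puts "#{results.size} matching documents"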
309
+
310
+ # Deletes everything in the urls and documents collections by calling
311
+ # `Wgit::Database#clear_db` underneath. This will nuke the entire database
312
+ # so yeah... be careful.
313
+ #
314
+ # @return [Integer] The number of deleted records.
315
+ def clear_db!(connection_string: @dsl_conn_str)
316
+ db = Wgit::Database.new(connection_string)
317
+ db.clear_db
318
+ end
319
+
320
+ alias crawl_r crawl_site
321
+ alias index_r index_site
322
+ alias start_urls start
323
+ end
324
+ end
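Putting the DSL together, a minimal end-to-end script might read as follows; the URL and xpath are placeholders, and `crawl_r` is simply the `crawl_site` alias defined above:

    require 'wgit'
    include Wgit::DSL

    start 'https://example.com'
    follow "//a[starts-with(@href, '/blog')]/@href"

    crawl_r do |doc|
      puts doc.title unless doc.empty?
    end

    reset # Clear the DSL's crawler, start URLs, follow xpath and connection string.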