wgit 0.7.0 → 0.10.1

This diff compares the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
@@ -1,27 +1,35 @@
 # frozen_string_literal: true
 
-### Default Document Extensions ###
+### Default Document Extractors ###
 
 # Base.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :base,
   '//base/@href',
   singleton: true,
   text_content_only: true
 ) do |base|
-  Wgit::Url.new(base) if base
+  Wgit::Url.parse?(base) if base
 end
 
 # Title.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :title,
   '//title',
   singleton: true,
   text_content_only: true
 )
 
+# Description.
+Wgit::Document.define_extractor(
+  :description,
+  '//meta[@name="description"]/@content',
+  singleton: true,
+  text_content_only: true
+)
+
 # Author.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :author,
   '//meta[@name="author"]/@content',
   singleton: true,
@@ -29,7 +37,7 @@ Wgit::Document.define_extension(
 )
 
 # Keywords.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :keywords,
   '//meta[@name="keywords"]/@content',
   singleton: true,
@@ -37,25 +45,27 @@ Wgit::Document.define_extension(
 ) do |keywords, _source, type|
   if keywords && (type == :document)
     keywords = keywords.split(',')
-    Wgit::Utils.process_arr(keywords)
+    Wgit::Utils.sanitize(keywords)
   end
   keywords
 end
 
 # Links.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :links,
   '//a/@href',
   singleton: false,
   text_content_only: true
 ) do |links|
-  links.map! { |link| Wgit::Url.new(link) }
+  links
+    .map { |link| Wgit::Url.parse?(link) }
+    .compact # Remove unparsable links.
 end
 
 # Text.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :text,
-  Wgit::Document::TEXT_ELEMENTS_XPATH,
+  proc { Wgit::Document.text_elements_xpath },
   singleton: false,
   text_content_only: true
 )
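
The rename from define_extension to define_extractor keeps the same shape: a variable name, an xpath (or a callable returning one), the singleton/text_content_only options, and an optional block receiving the value, its source and the source type. A minimal sketch of a custom extractor against 0.10.1, mirroring the built-in extractors above; the :h1 name, xpath and sample HTML are illustrative values, and the Wgit::Document.new(url, html) constructor call is an assumption rather than something shown in this diff:

require 'wgit'

# Illustrative custom extractor using the renamed API; :h1 and the xpath
# are made-up values chosen to mirror the built-in extractors above.
Wgit::Document.define_extractor(
  :h1,
  '//h1',
  singleton: true,
  text_content_only: true
) do |value, _source, type|
  # Post-process only when parsing HTML; pass DB-loaded values through.
  type == :document ? value&.strip : value
end

# Assumed constructor usage: build a Document from a URL and raw HTML.
doc = Wgit::Document.new(
  Wgit::Url.new('http://example.com'),
  '<html><h1> Hello, wgit! </h1></html>'
)
puts doc.h1 # => "Hello, wgit!"
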
data/lib/wgit/dsl.rb ADDED
@@ -0,0 +1,324 @@
+module Wgit
+  # DSL methods that act as a wrapper around Wgit's underlying class methods.
+  # All instance vars/constants are prefixed to avoid conflicts when included.
+  module DSL
+    # Error message shown when there's no URL to crawl.
+    DSL_ERROR__NO_START_URL = "missing url, pass as parameter to this or \
+the 'start' function".freeze
+
+    ### CRAWLER METHODS ###
+
+    # Defines an extractor using `Wgit::Document.define_extractor` underneath.
+    #
+    # @param var [Symbol] The name of the variable to be initialised, that will
+    #   contain the extracted content.
+    # @param xpath [String, #call] The xpath used to find the element(s)
+    #   of the webpage. Only used when initializing from HTML.
+    #
+    #   Pass a callable object (proc etc.) if you want the
+    #   xpath value to be derived on Document initialisation (instead of when
+    #   the extractor is defined). The call method must return a valid xpath
+    #   String.
+    # @param opts [Hash] The options to define an extractor with. The
+    #   options are only used when intializing from HTML, not the database.
+    # @option opts [Boolean] :singleton The singleton option determines
+    #   whether or not the result(s) should be in an Array. If multiple
+    #   results are found and singleton is true then the first result will be
+    #   used. Defaults to true.
+    # @option opts [Boolean] :text_content_only The text_content_only option
+    #   if true will use the text content of the Nokogiri result object,
+    #   otherwise the Nokogiri object itself is returned. Defaults to true.
+    # @yield The block is executed when a Wgit::Document is initialized,
+    #   regardless of the source. Use it (optionally) to process the result
+    #   value.
+    # @yieldparam value [Object] The result value to be assigned to the new
+    #   `var`.
+    # @yieldparam source [Wgit::Document, Object] The source of the `value`.
+    # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
+    #   `:object`.
+    # @yieldreturn [Object] The return value of the block becomes the new var's
+    #   value. Return the block's value param unchanged if you want to inspect.
+    # @raise [StandardError] If the var param isn't valid.
+    # @return [Symbol] The given var Symbol if successful.
+    def extract(var, xpath, opts = {}, &block)
+      Wgit::Document.define_extractor(var, xpath, opts, &block)
+    end
+
+    # Initializes a `Wgit::Crawler`. This crawler is then used in all crawl and
+    # index methods used by the DSL. See the Wgit::Crawler documentation for
+    # more details.
+    #
+    # @yield [crawler] The created crawler; use the block to configure.
+    # @return [Wgit::Crawler] The created crawler used by the DSL.
+    def crawler
+      @dsl_crawler ||= Wgit::Crawler.new
+      yield @dsl_crawler if block_given?
+      @dsl_crawler
+    end
+
+    # Sets the URL to be crawled when a `crawl*` or `index*` method is
+    # subsequently called. Calling this is optional as the URL can be
+    # passed to the method instead. You can also omit the url param and just
+    # use the block to configure the crawler instead.
+    #
+    # @param urls [*String, *Wgit::Url] The URL(s) to crawl
+    #   or nil (if only using the block to configure the crawler).
+    # @yield [crawler] The crawler that'll be used in the subsequent
+    #   crawl/index; use the block to configure.
+    def start(*urls, &block)
+      crawler(&block)
+      @dsl_start = urls
+    end
+
+    # Sets the xpath to be followed when `crawl_site` or `index_site` is
+    # subsequently called. Calling this method is optional as the default is to
+    # follow all `<a>` href's that point to the site domain. You can also pass
+    # `follow:` to the crawl/index methods directly.
+    #
+    # @param xpath [String] The xpath which is followed when crawling/indexing
+    #   a site. Use `:default` to restore the default follow logic.
+    def follow(xpath)
+      @dsl_follow = xpath
+    end
+
+    # Crawls one or more individual urls using `Wgit::Crawler#crawl_url`
+    # underneath. If no urls are provided, then the `start` URL is used.
+    #
+    # @param urls [*Wgit::Url] The URL's to crawl. Defaults to the `start`
+    #   URL(s).
+    # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+    #   redirects. Pass a Symbol to limit where the redirect is allowed to go
+    #   e.g. :host only allows redirects within the same host. Choose from
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    #   This value will be used for all urls crawled.
+    # @yield [doc] Given each crawled page (Wgit::Document); this is the only
+    #   way to interact with them.
+    # @raise [StandardError] If no urls are provided and no `start` URL has
+    #   been set.
+    # @return [Wgit::Document] The last Document crawled.
+    def crawl(*urls, follow_redirects: true, &block)
+      urls = (@dsl_start || []) if urls.empty?
+      raise DSL_ERROR__NO_START_URL if urls.empty?
+
+      urls.map! { |url| Wgit::Url.parse(url) }
+      crawler.crawl_urls(*urls, follow_redirects: follow_redirects, &block)
+    end
+
+    # Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no
+    # url is provided, then the first `start` URL is used.
+    #
+    # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to be
+    #   crawled. It is recommended that this URL be the index page of the site
+    #   to give a greater chance of finding all pages within that site/host.
+    #   Defaults to the `start` URLs.
+    # @param follow [String] The xpath extracting links to be followed during
+    #   the crawl. This changes how a site is crawled. Only links pointing to
+    #   the site domain are allowed. The `:default` is any `<a>` href returning
+    #   HTML. This can also be set using `follow`.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    #   selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
+    # @yield [doc] Given each crawled page (Wgit::Document) of the site.
+    #   A block is the only way to interact with each crawled Document.
+    #   Use `doc.empty?` to determine if the page is valid.
+    # @raise [StandardError] If no url is provided and no `start` URL has been
+    #   set.
+    # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
+    #   from all of the site's pages or nil if the given url could not be
+    #   crawled successfully.
+    def crawl_site(
+      *urls, follow: @dsl_follow,
+      allow_paths: nil, disallow_paths: nil, &block
+    )
+      urls = (@dsl_start || []) if urls.empty?
+      raise DSL_ERROR__NO_START_URL if urls.empty?
+
+      xpath = follow || :default
+      opts = {
+        follow: xpath, allow_paths: allow_paths, disallow_paths: disallow_paths
+      }
+
+      urls.reduce([]) do |externals, url|
+        externals + crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
+      end
+    end
+
+    # Returns the DSL's `crawler#last_response`.
+    #
+    # @return [Wgit::Response] The response from the last URL crawled.
+    def last_response
+      crawler.last_response
+    end
+
+    # Nilifies the DSL instance variables.
+    def reset
+      @dsl_crawler = nil
+      @dsl_start = nil
+      @dsl_follow = nil
+      @dsl_conn_str = nil
+    end
+
+    ### INDEXER METHODS ###
+
+    # Defines the connection string to the database used in subsequent `index*`
+    # method calls. This method is optional as the connection string can be
+    # passed to the index method instead.
+    #
+    # @param conn_str [String] The connection string used to connect to the
+    #   database in subsequent `index*` method calls.
+    def connection_string(conn_str)
+      @dsl_conn_str = conn_str
+    end
+
+    # Indexes the World Wide Web using `Wgit::Indexer#index_www` underneath.
+    #
+    # @param connection_string [String] The database connection string. Set as
+    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+    #   `connection_string`.
+    # @param max_sites [Integer] The number of separate and whole
+    #   websites to be crawled before the method exits. Defaults to -1 which
+    #   means the crawl will occur until manually stopped (Ctrl+C etc).
+    # @param max_data [Integer] The maximum amount of bytes that will be
+    #   scraped from the web (default is 1GB). Note, that this value is used to
+    #   determine when to stop crawling; it's not a guarantee of the max data
+    #   that will be obtained.
+    def index_www(
+      connection_string: @dsl_conn_str, max_sites: -1, max_data: 1_048_576_000
+    )
+      db = Wgit::Database.new(connection_string)
+      indexer = Wgit::Indexer.new(db, crawler)
+
+      indexer.index_www(max_sites: max_sites, max_data: max_data)
+    end
+
+    # Indexes a single website using `Wgit::Indexer#index_site` underneath.
+    #
+    # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to
+    #   crawl. Can be set using `start`.
+    # @param connection_string [String] The database connection string. Set as
+    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+    #   `connection_string`.
+    # @param insert_externals [Boolean] Whether or not to insert the website's
+    #   external URL's into the database.
+    # @param follow [String] The xpath extracting links to be followed during
+    #   the crawl. This changes how a site is crawled. Only links pointing to
+    #   the site domain are allowed. The `:default` is any `<a>` href returning
+    #   HTML. This can also be set using `follow`.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    #   selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
+    # @yield [doc] Given the Wgit::Document of each crawled webpage, before it
+    #   is inserted into the database allowing for prior manipulation.
+    # @raise [StandardError] If no url is provided and no `start` URL has been
+    #   set.
+    # @return [Integer] The total number of pages crawled within the website.
+    def index_site(
+      *urls, connection_string: @dsl_conn_str,
+      insert_externals: false, follow: @dsl_follow,
+      allow_paths: nil, disallow_paths: nil, &block
+    )
+      urls = (@dsl_start || []) if urls.empty?
+      raise DSL_ERROR__NO_START_URL if urls.empty?
+
+      db = Wgit::Database.new(connection_string)
+      indexer = Wgit::Indexer.new(db, crawler)
+      xpath = follow || :default
+      crawl_opts = {
+        insert_externals: insert_externals, follow: xpath,
+        allow_paths: allow_paths, disallow_paths: disallow_paths
+      }
+
+      urls.reduce(0) do |total, url|
+        total + indexer.index_site(Wgit::Url.parse(url), **crawl_opts, &block)
+      end
+    end
+
+    # Indexes a single webpage using `Wgit::Indexer#index_url` underneath.
+    #
+    # @param urls [*Wgit::Url] The webpage URL's to crawl. Defaults to the
+    #   `start` URL(s).
+    # @param connection_string [String] The database connection string. Set as
+    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+    #   `connection_string`.
+    # @param insert_externals [Boolean] Whether or not to insert the website's
+    #   external URL's into the database.
+    # @yield [doc] Given the Wgit::Document of the crawled webpage,
+    #   before it's inserted into the database allowing for prior
+    #   manipulation. Return nil or false from the block to prevent the
+    #   document from being saved into the database.
+    # @raise [StandardError] If no urls are provided and no `start` URL has
+    #   been set.
+    def index(
+      *urls, connection_string: @dsl_conn_str,
+      insert_externals: false, &block
+    )
+      urls = (@dsl_start || []) if urls.empty?
+      raise DSL_ERROR__NO_START_URL if urls.empty?
+
+      db = Wgit::Database.new(connection_string)
+      indexer = Wgit::Indexer.new(db, crawler)
+
+      urls.map! { |url| Wgit::Url.parse(url) }
+      indexer.index_urls(*urls, insert_externals: insert_externals, &block)
+    end
+
+    # Performs a search of the database's indexed documents and pretty prints
+    # the results in a search engine-esque format. See `Wgit::Database#search!`
+    # and `Wgit::Document#search!` for details of how the search works.
+    #
+    # @param query [String] The text query to search with.
+    # @param connection_string [String] The database connection string. Set as
+    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+    #   `connection_string`.
+    # @param stream [nil, #puts] Any object that respond_to?(:puts). It is used
+    #   to output text somewhere e.g. a file or STDERR. Use nil for no output.
+    # @param case_sensitive [Boolean] Whether character case must match.
+    # @param whole_sentence [Boolean] Whether multiple words should be searched
+    #   for separately.
+    # @param limit [Integer] The max number of results to print.
+    # @param skip [Integer] The number of DB records to skip.
+    # @param sentence_limit [Integer] The max length of each result's text
+    #   snippet.
+    # @yield [doc] Given each search result (Wgit::Document) returned from the
+    #   database containing only its matching `#text`.
+    # @return [Array<Wgit::Document>] The search results with matching text.
+    def search(
+      query, connection_string: @dsl_conn_str, stream: STDOUT,
+      case_sensitive: false, whole_sentence: true,
+      limit: 10, skip: 0, sentence_limit: 80, &block
+    )
+      stream ||= File.open(File::NULL, 'w')
+      db = Wgit::Database.new(connection_string)
+
+      results = db.search!(
+        query,
+        case_sensitive: case_sensitive,
+        whole_sentence: whole_sentence,
+        limit: limit,
+        skip: skip,
+        sentence_limit: sentence_limit,
+        &block
+      )
+
+      Wgit::Utils.printf_search_results(results, stream: stream)
+
+      results
+    end
+
+    # Deletes everything in the urls and documents collections by calling
+    # `Wgit::Database#clear_db` underneath. This will nuke the entire database
+    # so yeah... be careful.
+    #
+    # @return [Integer] The number of deleted records.
+    def clear_db!(connection_string: @dsl_conn_str)
+      db = Wgit::Database.new(connection_string)
+      db.clear_db
+    end
+
+    alias crawl_r crawl_site
+    alias index_r index_site
+    alias start_urls start
+  end
+end
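
Taken together, the new DSL wraps the crawler and indexer calls documented above into top-level helpers (extract, start, follow, crawl_site, index_site, search, etc.). A minimal sketch of how it might be used, based only on the methods added in this diff; the site URL, xpaths and the :quote extractor are illustrative values, and the MongoDB connection string is a placeholder:

require 'wgit'

include Wgit::DSL

# Illustrative extractor: :quote and its xpath are made-up values.
extract :quote, '//div[@class="quote"]/span[@class="text"]', singleton: false

start  'http://quotes.toscrape.com'  # Example site; any URL works.
follow "//li[@class='next']/a/@href" # Optional: which links crawl_site follows.

quotes = []
crawl_site do |doc|
  quotes.concat(doc.quote) if doc.quote # :quote was defined via `extract`.
end
puts "Collected #{quotes.size} quotes"

# With a database available, the same start URL can be indexed and searched
# (placeholder connection string):
#   connection_string 'mongodb://user:password@localhost/crawler'
#   index_site
#   search 'inspirational'
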