wgit 0.5.1 → 0.10.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
data/lib/wgit/utils.rb CHANGED
@@ -145,7 +145,8 @@ module Wgit
145
145
  # @param keyword_limit [Integer] The max amount of keywords to be
146
146
  # outputted to the stream.
147
147
  # @param stream [#puts] Any object that respond_to?(:puts). It is used
148
- # to output text somewhere e.g. a file or STDOUT.
148
+ # to output text somewhere e.g. a file or STDERR.
149
+ # @return [Integer] The number of results.
149
150
  def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
150
151
  raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
151
152
 
@@ -162,18 +163,37 @@ module Wgit
162
163
  stream.puts
163
164
  end
164
165
 
165
- nil
166
+ results.size
166
167
  end
167
168
 
168
- # Processes a String to make it uniform. Strips any leading/trailing white
169
+ # Sanitises the obj to make it uniform by calling the correct sanitize_*
170
+ # method for its type e.g. if obj.is_a? String then sanitize(obj). Any type
171
+ # not in the case statement will be ignored and returned as is.
172
+ #
173
+ # @param obj [Object] The object to be sanitized.
174
+ # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
175
+ # invalid characters.
176
+ # @return [Object] The sanitized obj is both modified and then returned.
177
+ def self.sanitize(obj, encode: true)
178
+ case obj
179
+ when String
180
+ sanitize_str(obj, encode: encode)
181
+ when Array
182
+ sanitize_arr(obj, encode: encode)
183
+ else
184
+ obj
185
+ end
186
+ end
187
+
188
+ # Sanitises a String to make it uniform. Strips any leading/trailing white
169
189
  # space. Also applies UTF-8 encoding (replacing invalid characters) if
170
190
  # `encode: true`.
171
191
  #
172
- # @param str [String] The String to process. str is modified.
192
+ # @param str [String] The String to sanitize. str is modified.
173
193
  # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
174
194
  # invalid characters.
175
- # @return [String] The processed str is both modified and then returned.
176
- def self.process_str(str, encode: true)
195
+ # @return [String] The sanitized str is both modified and then returned.
196
+ def self.sanitize_str(str, encode: true)
177
197
  if str.is_a?(String)
178
198
  str.encode!('UTF-8', undef: :replace, invalid: :replace) if encode
179
199
  str.strip!
@@ -182,15 +202,15 @@ module Wgit
182
202
  str
183
203
  end
184
204
 
185
- # Processes an Array to make it uniform. Removes empty Strings and nils,
186
- # processes non empty Strings using Wgit::Utils.process_str and removes
205
+ # Sanitises an Array to make it uniform. Removes empty Strings and nils,
206
+ # processes non empty Strings using Wgit::Utils.sanitize and removes
187
207
  # duplicates.
188
208
  #
189
- # @param arr [Enumerable] The Array to process. arr is modified.
190
- # @return [Enumerable] The processed arr is both modified and then returned.
191
- def self.process_arr(arr)
209
+ # @param arr [Enumerable] The Array to sanitize. arr is modified.
210
+ # @return [Enumerable] The sanitized arr is both modified and then returned.
211
+ def self.sanitize_arr(arr, encode: true)
192
212
  if arr.is_a?(Array)
193
- arr.map! { |str| process_str(str) }
213
+ arr.map! { |str| sanitize(str, encode: encode) }
194
214
  arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
195
215
  arr.compact!
196
216
  arr.uniq!
@@ -198,13 +218,5 @@ module Wgit
198
218
 
199
219
  arr
200
220
  end
201
-
202
- # Returns the model having removed non bson types (for use with MongoDB).
203
- #
204
- # @param model_hash [Hash] The model Hash to process.
205
- # @return [Hash] The model Hash with non bson types removed.
206
- def self.remove_non_bson_types(model_hash)
207
- model_hash.select { |_k, v| v.respond_to?(:bson_type) }
208
- end
209
221
  end
210
222
  end
data/lib/wgit/version.rb CHANGED
@@ -1,11 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  # Wgit is a WWW indexer/scraper which crawls URL's and retrieves their page
4
- # contents for later use by serialisation.
4
+ # contents for later use.
5
+ #
5
6
  # @author Michael Telford
6
7
  module Wgit
7
8
  # The current gem version of Wgit.
8
- VERSION = '0.5.1'
9
+ VERSION = '0.10.0'
9
10
 
10
11
  # Returns the current gem version of Wgit as a String.
11
12
  def self.version
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wgit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.10.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-11-22 00:00:00.000000000 Z
11
+ date: 2021-04-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '1.3'
69
+ - !ruby/object:Gem::Dependency
70
+ name: ferrum
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.8'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.8'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: byebug
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -184,35 +198,41 @@ dependencies:
184
198
  - - "<"
185
199
  - !ruby/object:Gem::Version
186
200
  version: '1.0'
187
- description: 'Fundamentally, Wgit is a HTTP indexer/scraper which crawls URL''s to
188
- retrieve and serialise their page contents for later use. You can use Wgit to copy
189
- entire websites if required. Wgit also provides a means to search indexed documents
190
- stored in a database. Therefore, this library provides the main components of a
191
- WWW search engine. The Wgit API is easily extended allowing you to pull out the
192
- parts of a webpage that are important to you, the code snippets or tables for example.
193
- As Wgit is a library, it supports many different use cases including data mining,
194
- analytics, web indexing and URL parsing to name a few.
201
+ description: 'Wgit was primarily designed to crawl static HTML websites to index and
202
+ search their content - providing the basis of any search engine; but Wgit is suitable
203
+ for many application domains including: URL parsing, data mining and statistical
204
+ analysis.
195
205
 
196
206
  '
197
207
  email: michael.telford@live.com
198
- executables: []
208
+ executables:
209
+ - wgit
199
210
  extensions: []
200
211
  extra_rdoc_files: []
201
212
  files:
202
213
  - "./lib/wgit.rb"
203
214
  - "./lib/wgit/assertable.rb"
215
+ - "./lib/wgit/base.rb"
204
216
  - "./lib/wgit/core_ext.rb"
205
217
  - "./lib/wgit/crawler.rb"
206
218
  - "./lib/wgit/database/database.rb"
207
219
  - "./lib/wgit/database/model.rb"
208
220
  - "./lib/wgit/document.rb"
209
- - "./lib/wgit/document_extensions.rb"
221
+ - "./lib/wgit/document_extractors.rb"
222
+ - "./lib/wgit/dsl.rb"
210
223
  - "./lib/wgit/indexer.rb"
211
224
  - "./lib/wgit/logger.rb"
212
225
  - "./lib/wgit/response.rb"
213
226
  - "./lib/wgit/url.rb"
214
227
  - "./lib/wgit/utils.rb"
215
228
  - "./lib/wgit/version.rb"
229
+ - ".yardopts"
230
+ - CHANGELOG.md
231
+ - CODE_OF_CONDUCT.md
232
+ - CONTRIBUTING.md
233
+ - LICENSE.txt
234
+ - README.md
235
+ - bin/wgit
216
236
  homepage: https://github.com/michaeltelford/wgit
217
237
  licenses:
218
238
  - MIT
@@ -223,7 +243,7 @@ metadata:
223
243
  bug_tracker_uri: https://github.com/michaeltelford/wgit/issues
224
244
  documentation_uri: https://www.rubydoc.info/gems/wgit
225
245
  allowed_push_host: https://rubygems.org
226
- post_install_message:
246
+ post_install_message: Added the 'wgit' executable to $PATH
227
247
  rdoc_options: []
228
248
  require_paths:
229
249
  - lib
@@ -238,10 +258,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
238
258
  - !ruby/object:Gem::Version
239
259
  version: '0'
240
260
  requirements: []
241
- rubyforge_project:
242
- rubygems_version: 2.7.6
243
- signing_key:
261
+ rubygems_version: 3.1.2
262
+ signing_key:
244
263
  specification_version: 4
245
- summary: Wgit is a Ruby gem similar in nature to GNU's `wget` tool. It provides an
246
- easy to use API for programmatic URL parsing, HTML indexing and searching.
264
+ summary: Wgit is a HTML web crawler, written in Ruby, that allows you to programmatically
265
+ extract the data you want from the web.
247
266
  test_files: []