wgit 0.5.1 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/wgit/utils.rb CHANGED
@@ -145,7 +145,8 @@ module Wgit
145
145
  # @param keyword_limit [Integer] The max amount of keywords to be
146
146
  # outputted to the stream.
147
147
  # @param stream [#puts] Any object that responds to #puts. It is used
148
- # to output text somewhere e.g. a file or STDOUT.
148
+ # to output text somewhere e.g. a file or STDERR.
149
+ # @return [Integer] The number of results.
149
150
  def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
150
151
  raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
151
152
 
@@ -162,18 +163,37 @@ module Wgit
162
163
  stream.puts
163
164
  end
164
165
 
165
- nil
166
+ results.size
166
167
  end
167
168
 
168
- # Processes a String to make it uniform. Strips any leading/trailing white
169
+ # Sanitises the obj to make it uniform by calling the correct sanitize_*
170
+ # method for its type e.g. if obj.is_a? String then sanitize_str(obj). Any type
171
+ # not in the case statement will be ignored and returned as is.
172
+ #
173
+ # @param obj [Object] The object to be sanitized.
174
+ # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
175
+ # invalid characters.
176
+ # @return [Object] The sanitized obj is both modified and then returned.
177
+ def self.sanitize(obj, encode: true)
178
+ case obj
179
+ when String
180
+ sanitize_str(obj, encode: encode)
181
+ when Array
182
+ sanitize_arr(obj, encode: encode)
183
+ else
184
+ obj
185
+ end
186
+ end
187
+
188
+ # Sanitises a String to make it uniform. Strips any leading/trailing white
169
189
  # space. Also applies UTF-8 encoding (replacing invalid characters) if
170
190
  # `encode: true`.
171
191
  #
172
- # @param str [String] The String to process. str is modified.
192
+ # @param str [String] The String to sanitize. str is modified.
173
193
  # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
174
194
  # invalid characters.
175
- # @return [String] The processed str is both modified and then returned.
176
- def self.process_str(str, encode: true)
195
+ # @return [String] The sanitized str is both modified and then returned.
196
+ def self.sanitize_str(str, encode: true)
177
197
  if str.is_a?(String)
178
198
  str.encode!('UTF-8', undef: :replace, invalid: :replace) if encode
179
199
  str.strip!
@@ -182,15 +202,15 @@ module Wgit
182
202
  str
183
203
  end
184
204
 
185
- # Processes an Array to make it uniform. Removes empty Strings and nils,
186
- # processes non empty Strings using Wgit::Utils.process_str and removes
205
+ # Sanitises an Array to make it uniform. Removes empty Strings and nils,
206
+ # processes non empty Strings using Wgit::Utils.sanitize and removes
187
207
  # duplicates.
188
208
  #
189
- # @param arr [Enumerable] The Array to process. arr is modified.
190
- # @return [Enumerable] The processed arr is both modified and then returned.
191
- def self.process_arr(arr)
209
+ # @param arr [Enumerable] The Array to sanitize. arr is modified.
210
+ # @return [Enumerable] The sanitized arr is both modified and then returned.
211
+ def self.sanitize_arr(arr, encode: true)
192
212
  if arr.is_a?(Array)
193
- arr.map! { |str| process_str(str) }
213
+ arr.map! { |str| sanitize(str, encode: encode) }
194
214
  arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
195
215
  arr.compact!
196
216
  arr.uniq!
@@ -198,13 +218,5 @@ module Wgit
198
218
 
199
219
  arr
200
220
  end
201
-
202
- # Returns the model having removed non bson types (for use with MongoDB).
203
- #
204
- # @param model_hash [Hash] The model Hash to process.
205
- # @return [Hash] The model Hash with non bson types removed.
206
- def self.remove_non_bson_types(model_hash)
207
- model_hash.select { |_k, v| v.respond_to?(:bson_type) }
208
- end
209
221
  end
210
222
  end
data/lib/wgit/version.rb CHANGED
@@ -1,11 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  # Wgit is a WWW indexer/scraper which crawls URLs and retrieves their page
4
- # contents for later use by serialisation.
4
+ # contents for later use.
5
+ #
5
6
  # @author Michael Telford
6
7
  module Wgit
7
8
  # The current gem version of Wgit.
8
- VERSION = '0.5.1'
9
+ VERSION = '0.10.0'
9
10
 
10
11
  # Returns the current gem version of Wgit as a String.
11
12
  def self.version
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wgit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.10.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-11-22 00:00:00.000000000 Z
11
+ date: 2021-04-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '1.3'
69
+ - !ruby/object:Gem::Dependency
70
+ name: ferrum
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.8'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.8'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: byebug
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -184,35 +198,41 @@ dependencies:
184
198
  - - "<"
185
199
  - !ruby/object:Gem::Version
186
200
  version: '1.0'
187
- description: 'Fundamentally, Wgit is a HTTP indexer/scraper which crawls URL''s to
188
- retrieve and serialise their page contents for later use. You can use Wgit to copy
189
- entire websites if required. Wgit also provides a means to search indexed documents
190
- stored in a database. Therefore, this library provides the main components of a
191
- WWW search engine. The Wgit API is easily extended allowing you to pull out the
192
- parts of a webpage that are important to you, the code snippets or tables for example.
193
- As Wgit is a library, it supports many different use cases including data mining,
194
- analytics, web indexing and URL parsing to name a few.
201
+ description: 'Wgit was primarily designed to crawl static HTML websites to index and
202
+ search their content - providing the basis of any search engine; but Wgit is suitable
203
+ for many application domains including: URL parsing, data mining and statistical
204
+ analysis.
195
205
 
196
206
  '
197
207
  email: michael.telford@live.com
198
- executables: []
208
+ executables:
209
+ - wgit
199
210
  extensions: []
200
211
  extra_rdoc_files: []
201
212
  files:
202
213
  - "./lib/wgit.rb"
203
214
  - "./lib/wgit/assertable.rb"
215
+ - "./lib/wgit/base.rb"
204
216
  - "./lib/wgit/core_ext.rb"
205
217
  - "./lib/wgit/crawler.rb"
206
218
  - "./lib/wgit/database/database.rb"
207
219
  - "./lib/wgit/database/model.rb"
208
220
  - "./lib/wgit/document.rb"
209
- - "./lib/wgit/document_extensions.rb"
221
+ - "./lib/wgit/document_extractors.rb"
222
+ - "./lib/wgit/dsl.rb"
210
223
  - "./lib/wgit/indexer.rb"
211
224
  - "./lib/wgit/logger.rb"
212
225
  - "./lib/wgit/response.rb"
213
226
  - "./lib/wgit/url.rb"
214
227
  - "./lib/wgit/utils.rb"
215
228
  - "./lib/wgit/version.rb"
229
+ - ".yardopts"
230
+ - CHANGELOG.md
231
+ - CODE_OF_CONDUCT.md
232
+ - CONTRIBUTING.md
233
+ - LICENSE.txt
234
+ - README.md
235
+ - bin/wgit
216
236
  homepage: https://github.com/michaeltelford/wgit
217
237
  licenses:
218
238
  - MIT
@@ -223,7 +243,7 @@ metadata:
223
243
  bug_tracker_uri: https://github.com/michaeltelford/wgit/issues
224
244
  documentation_uri: https://www.rubydoc.info/gems/wgit
225
245
  allowed_push_host: https://rubygems.org
226
- post_install_message:
246
+ post_install_message: Added the 'wgit' executable to $PATH
227
247
  rdoc_options: []
228
248
  require_paths:
229
249
  - lib
@@ -238,10 +258,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
238
258
  - !ruby/object:Gem::Version
239
259
  version: '0'
240
260
  requirements: []
241
- rubyforge_project:
242
- rubygems_version: 2.7.6
243
- signing_key:
261
+ rubygems_version: 3.1.2
262
+ signing_key:
244
263
  specification_version: 4
245
- summary: Wgit is a Ruby gem similar in nature to GNU's `wget` tool. It provides an
246
- easy to use API for programmatic URL parsing, HTML indexing and searching.
264
+ summary: Wgit is a HTML web crawler, written in Ruby, that allows you to programmatically
265
+ extract the data you want from the web.
247
266
  test_files: []