wgit 0.5.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -145,7 +145,8 @@ module Wgit
145
145
  # @param keyword_limit [Integer] The max amount of keywords to be
146
146
  # outputted to the stream.
147
147
  # @param stream [#puts] Any object that respond_to?(:puts). It is used
148
- # to output text somewhere e.g. a file or STDOUT.
148
+ # to output text somewhere e.g. a file or STDERR.
149
+ # @return [Integer] The number of results.
149
150
  def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
150
151
  raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
151
152
 
@@ -162,18 +163,37 @@ module Wgit
162
163
  stream.puts
163
164
  end
164
165
 
165
- nil
166
+ results.size
166
167
  end
167
168
 
168
- # Processes a String to make it uniform. Strips any leading/trailing white
169
+ # Sanitises the obj to make it uniform by calling the correct sanitize_*
170
+ # method for its type e.g. if obj.is_a? String then sanitize(obj). Any type
171
+ # not in the case statement will be ignored and returned as is.
172
+ #
173
+ # @param obj [Object] The object to be sanitized.
174
+ # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
175
+ # invalid characters.
176
+ # @return [Object] The sanitized obj is both modified and then returned.
177
+ def self.sanitize(obj, encode: true)
178
+ case obj
179
+ when String
180
+ sanitize_str(obj, encode: encode)
181
+ when Array
182
+ sanitize_arr(obj, encode: encode)
183
+ else
184
+ obj
185
+ end
186
+ end
187
+
188
+ # Sanitises a String to make it uniform. Strips any leading/trailing white
169
189
  # space. Also applies UTF-8 encoding (replacing invalid characters) if
170
190
  # `encode: true`.
171
191
  #
172
- # @param str [String] The String to process. str is modified.
192
+ # @param str [String] The String to sanitize. str is modified.
173
193
  # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
174
194
  # invalid characters.
175
- # @return [String] The processed str is both modified and then returned.
176
- def self.process_str(str, encode: true)
195
+ # @return [String] The sanitized str is both modified and then returned.
196
+ def self.sanitize_str(str, encode: true)
177
197
  if str.is_a?(String)
178
198
  str.encode!('UTF-8', undef: :replace, invalid: :replace) if encode
179
199
  str.strip!
@@ -182,15 +202,15 @@ module Wgit
182
202
  str
183
203
  end
184
204
 
185
- # Processes an Array to make it uniform. Removes empty Strings and nils,
186
- # processes non empty Strings using Wgit::Utils.process_str and removes
205
+ # Sanitises an Array to make it uniform. Removes empty Strings and nils,
206
+ # processes non empty Strings using Wgit::Utils.sanitize and removes
187
207
  # duplicates.
188
208
  #
189
- # @param arr [Enumerable] The Array to process. arr is modified.
190
- # @return [Enumerable] The processed arr is both modified and then returned.
191
- def self.process_arr(arr)
209
+ # @param arr [Enumerable] The Array to sanitize. arr is modified.
210
+ # @return [Enumerable] The sanitized arr is both modified and then returned.
211
+ def self.sanitize_arr(arr, encode: true)
192
212
  if arr.is_a?(Array)
193
- arr.map! { |str| process_str(str) }
213
+ arr.map! { |str| sanitize(str, encode: encode) }
194
214
  arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
195
215
  arr.compact!
196
216
  arr.uniq!
@@ -198,13 +218,5 @@ module Wgit
198
218
 
199
219
  arr
200
220
  end
201
-
202
- # Returns the model having removed non bson types (for use with MongoDB).
203
- #
204
- # @param model_hash [Hash] The model Hash to process.
205
- # @return [Hash] The model Hash with non bson types removed.
206
- def self.remove_non_bson_types(model_hash)
207
- model_hash.select { |_k, v| v.respond_to?(:bson_type) }
208
- end
209
221
  end
210
222
  end
@@ -1,14 +1,20 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  # Wgit is a WWW indexer/scraper which crawls URL's and retrieves their page
4
- # contents for later use by serialisation.
4
+ # contents for later use.
5
+ #
5
6
  # @author Michael Telford
6
7
  module Wgit
7
8
  # The current gem version of Wgit.
8
- VERSION = '0.5.0'
9
+ VERSION = '0.9.0'
9
10
 
10
11
  # Returns the current gem version of Wgit as a String.
11
12
  def self.version
12
13
  VERSION
13
14
  end
15
+
16
+ # Returns the current gem version in a presentation String.
17
+ def self.version_str
18
+ "wgit v#{VERSION}"
19
+ end
14
20
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wgit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-11-02 00:00:00.000000000 Z
11
+ date: 2020-07-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -16,56 +16,70 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: 2.6.0
19
+ version: '2.6'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: 2.6.0
26
+ version: '2.6'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: mongo
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 2.9.0
33
+ version: '2.9'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 2.9.0
40
+ version: '2.9'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: nokogiri
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: 1.10.3
47
+ version: '1.10'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: 1.10.3
54
+ version: '1.10'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: typhoeus
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: 1.3.1
61
+ version: '1.3'
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: 1.3.1
68
+ version: '1.3'
69
+ - !ruby/object:Gem::Dependency
70
+ name: ferrum
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.8'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.8'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: byebug
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -123,33 +137,33 @@ dependencies:
123
137
  - !ruby/object:Gem::Version
124
138
  version: '0.12'
125
139
  - !ruby/object:Gem::Dependency
126
- name: rake
140
+ name: rubocop
127
141
  requirement: !ruby/object:Gem::Requirement
128
142
  requirements:
129
143
  - - "~>"
130
144
  - !ruby/object:Gem::Version
131
- version: '12.3'
145
+ version: '0.74'
132
146
  type: :development
133
147
  prerelease: false
134
148
  version_requirements: !ruby/object:Gem::Requirement
135
149
  requirements:
136
150
  - - "~>"
137
151
  - !ruby/object:Gem::Version
138
- version: '12.3'
152
+ version: '0.74'
139
153
  - !ruby/object:Gem::Dependency
140
- name: rubocop
154
+ name: toys
141
155
  requirement: !ruby/object:Gem::Requirement
142
156
  requirements:
143
157
  - - "~>"
144
158
  - !ruby/object:Gem::Version
145
- version: '0.74'
159
+ version: '0.8'
146
160
  type: :development
147
161
  prerelease: false
148
162
  version_requirements: !ruby/object:Gem::Requirement
149
163
  requirements:
150
164
  - - "~>"
151
165
  - !ruby/object:Gem::Version
152
- version: '0.74'
166
+ version: '0.8'
153
167
  - !ruby/object:Gem::Dependency
154
168
  name: webmock
155
169
  requirement: !ruby/object:Gem::Requirement
@@ -184,43 +198,52 @@ dependencies:
184
198
  - - "<"
185
199
  - !ruby/object:Gem::Version
186
200
  version: '1.0'
187
- description: 'Fundamentally, Wgit is a HTTP indexer/scraper which crawls URL''s to
188
- retrieve and serialise their page contents for later use. You can use Wgit to copy
189
- entire websites if required. Wgit also provides a means to search indexed documents
190
- stored in a database. Therefore, this library provides the main components of a
191
- WWW search engine. The Wgit API is easily extended allowing you to pull out the
192
- parts of a webpage that are important to you, the code snippets or tables for example.
193
- As Wgit is a library, it supports many different use cases including data mining,
194
- analytics, web indexing and URL parsing to name a few.
201
+ description: 'Wgit was primarily designed to crawl static HTML websites to index and
202
+ search their content - providing the basis of any search engine; but Wgit is suitable
203
+ for many application domains including: URL parsing, data mining and statistical
204
+ analysis.
195
205
 
196
206
  '
197
207
  email: michael.telford@live.com
198
- executables: []
208
+ executables:
209
+ - wgit
199
210
  extensions: []
200
211
  extra_rdoc_files: []
201
212
  files:
202
213
  - "./lib/wgit.rb"
203
214
  - "./lib/wgit/assertable.rb"
215
+ - "./lib/wgit/base.rb"
204
216
  - "./lib/wgit/core_ext.rb"
205
217
  - "./lib/wgit/crawler.rb"
206
218
  - "./lib/wgit/database/database.rb"
207
219
  - "./lib/wgit/database/model.rb"
208
220
  - "./lib/wgit/document.rb"
209
- - "./lib/wgit/document_extensions.rb"
221
+ - "./lib/wgit/document_extractors.rb"
222
+ - "./lib/wgit/dsl.rb"
210
223
  - "./lib/wgit/indexer.rb"
211
224
  - "./lib/wgit/logger.rb"
212
225
  - "./lib/wgit/response.rb"
213
226
  - "./lib/wgit/url.rb"
214
227
  - "./lib/wgit/utils.rb"
215
228
  - "./lib/wgit/version.rb"
229
+ - ".yardopts"
230
+ - CHANGELOG.md
231
+ - CODE_OF_CONDUCT.md
232
+ - CONTRIBUTING.md
233
+ - LICENSE.txt
234
+ - README.md
235
+ - bin/wgit
216
236
  homepage: https://github.com/michaeltelford/wgit
217
237
  licenses:
218
238
  - MIT
219
239
  metadata:
220
- source_code_uri: https://github.com/michaeltelford/wgit
221
240
  yard.run: yri
241
+ source_code_uri: https://github.com/michaeltelford/wgit
242
+ changelog_uri: https://github.com/michaeltelford/wgit/blob/master/CHANGELOG.md
243
+ bug_tracker_uri: https://github.com/michaeltelford/wgit/issues
244
+ documentation_uri: https://www.rubydoc.info/github/michaeltelford/wgit/master
222
245
  allowed_push_host: https://rubygems.org
223
- post_install_message:
246
+ post_install_message: Added the 'wgit' executable to $PATH
224
247
  rdoc_options: []
225
248
  require_paths:
226
249
  - lib
@@ -235,10 +258,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
235
258
  - !ruby/object:Gem::Version
236
259
  version: '0'
237
260
  requirements: []
238
- rubyforge_project:
239
- rubygems_version: 2.7.6
261
+ rubygems_version: 3.1.2
240
262
  signing_key:
241
263
  specification_version: 4
242
- summary: Wgit is a Ruby gem similar in nature to GNU's `wget` tool. It provides an
243
- easy to use API for programmatic URL parsing, HTML indexing and searching.
264
+ summary: Wgit is a HTML web crawler, written in Ruby, that allows you to programmatically
265
+ extract the data you want from the web.
244
266
  test_files: []