wgit 0.5.1 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +7 -0
- data/CHANGELOG.md +249 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/CONTRIBUTING.md +21 -0
- data/LICENSE.txt +21 -0
- data/README.md +232 -0
- data/bin/wgit +39 -0
- data/lib/wgit.rb +3 -1
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +304 -148
- data/lib/wgit/database/database.rb +310 -135
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +241 -169
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +20 -10
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +68 -156
- data/lib/wgit/response.rb +17 -14
- data/lib/wgit/url.rb +213 -73
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +3 -2
- metadata +38 -19
data/lib/wgit/utils.rb
CHANGED
@@ -145,7 +145,8 @@ module Wgit
|
|
145
145
|
# @param keyword_limit [Integer] The max amount of keywords to be
|
146
146
|
# outputted to the stream.
|
147
147
|
# @param stream [#puts] Any object that respond_to?(:puts). It is used
|
148
|
-
# to output text somewhere e.g. a file or
|
148
|
+
# to output text somewhere e.g. a file or STDERR.
|
149
|
+
# @return [Integer] The number of results.
|
149
150
|
def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
|
150
151
|
raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
|
151
152
|
|
@@ -162,18 +163,37 @@ module Wgit
|
|
162
163
|
stream.puts
|
163
164
|
end
|
164
165
|
|
165
|
-
|
166
|
+
results.size
|
166
167
|
end
|
167
168
|
|
168
|
-
#
|
169
|
+
# Sanitises the obj to make it uniform by calling the correct sanitize_*
|
170
|
+
# method for its type e.g. if obj.is_a? String then sanitize(obj). Any type
|
171
|
+
# not in the case statement will be ignored and returned as is.
|
172
|
+
#
|
173
|
+
# @param obj [Object] The object to be sanitized.
|
174
|
+
# @param encode [Boolean] Whether or not to encode to UTF-8 replacing
|
175
|
+
# invalid characters.
|
176
|
+
# @return [Object] The sanitized obj is both modified and then returned.
|
177
|
+
def self.sanitize(obj, encode: true)
|
178
|
+
case obj
|
179
|
+
when String
|
180
|
+
sanitize_str(obj, encode: encode)
|
181
|
+
when Array
|
182
|
+
sanitize_arr(obj, encode: encode)
|
183
|
+
else
|
184
|
+
obj
|
185
|
+
end
|
186
|
+
end
|
187
|
+
|
188
|
+
# Sanitises a String to make it uniform. Strips any leading/trailing white
|
169
189
|
# space. Also applies UTF-8 encoding (replacing invalid characters) if
|
170
190
|
# `encode: true`.
|
171
191
|
#
|
172
|
-
# @param str [String] The String to
|
192
|
+
# @param str [String] The String to sanitize. str is modified.
|
173
193
|
# @param encode [Boolean] Whether or not to encode to UTF-8 replacing
|
174
194
|
# invalid characters.
|
175
|
-
# @return [String] The
|
176
|
-
def self.
|
195
|
+
# @return [String] The sanitized str is both modified and then returned.
|
196
|
+
def self.sanitize_str(str, encode: true)
|
177
197
|
if str.is_a?(String)
|
178
198
|
str.encode!('UTF-8', undef: :replace, invalid: :replace) if encode
|
179
199
|
str.strip!
|
@@ -182,15 +202,15 @@ module Wgit
|
|
182
202
|
str
|
183
203
|
end
|
184
204
|
|
185
|
-
#
|
186
|
-
# processes non empty Strings using Wgit::Utils.
|
205
|
+
# Sanitises an Array to make it uniform. Removes empty Strings and nils,
|
206
|
+
# processes non empty Strings using Wgit::Utils.sanitize and removes
|
187
207
|
# duplicates.
|
188
208
|
#
|
189
|
-
# @param arr [Enumerable] The Array to
|
190
|
-
# @return [Enumerable] The
|
191
|
-
def self.
|
209
|
+
# @param arr [Enumerable] The Array to sanitize. arr is modified.
|
210
|
+
# @return [Enumerable] The sanitized arr is both modified and then returned.
|
211
|
+
def self.sanitize_arr(arr, encode: true)
|
192
212
|
if arr.is_a?(Array)
|
193
|
-
arr.map! { |str|
|
213
|
+
arr.map! { |str| sanitize(str, encode: encode) }
|
194
214
|
arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
|
195
215
|
arr.compact!
|
196
216
|
arr.uniq!
|
@@ -198,13 +218,5 @@ module Wgit
|
|
198
218
|
|
199
219
|
arr
|
200
220
|
end
|
201
|
-
|
202
|
-
# Returns the model having removed non bson types (for use with MongoDB).
|
203
|
-
#
|
204
|
-
# @param model_hash [Hash] The model Hash to process.
|
205
|
-
# @return [Hash] The model Hash with non bson types removed.
|
206
|
-
def self.remove_non_bson_types(model_hash)
|
207
|
-
model_hash.select { |_k, v| v.respond_to?(:bson_type) }
|
208
|
-
end
|
209
221
|
end
|
210
222
|
end
|
data/lib/wgit/version.rb
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
# Wgit is a WWW indexer/scraper which crawls URL's and retrieves their page
|
4
|
-
# contents for later use
|
4
|
+
# contents for later use.
|
5
|
+
#
|
5
6
|
# @author Michael Telford
|
6
7
|
module Wgit
|
7
8
|
# The current gem version of Wgit.
|
8
|
-
VERSION = '0.5.1'
|
9
|
+
VERSION = '0.10.0'
|
9
10
|
|
10
11
|
# Returns the current gem version of Wgit as a String.
|
11
12
|
def self.version
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.1
|
4
|
+
version: 0.10.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-04-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '1.3'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: ferrum
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0.8'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0.8'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: byebug
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -184,35 +198,41 @@ dependencies:
|
|
184
198
|
- - "<"
|
185
199
|
- !ruby/object:Gem::Version
|
186
200
|
version: '1.0'
|
187
|
-
description: '
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
WWW search engine. The Wgit API is easily extended allowing you to pull out the
|
192
|
-
parts of a webpage that are important to you, the code snippets or tables for example.
|
193
|
-
As Wgit is a library, it supports many different use cases including data mining,
|
194
|
-
analytics, web indexing and URL parsing to name a few.
|
201
|
+
description: 'Wgit was primarily designed to crawl static HTML websites to index and
|
202
|
+
search their content - providing the basis of any search engine; but Wgit is suitable
|
203
|
+
for many application domains including: URL parsing, data mining and statistical
|
204
|
+
analysis.
|
195
205
|
|
196
206
|
'
|
197
207
|
email: michael.telford@live.com
|
198
|
-
executables:
|
208
|
+
executables:
|
209
|
+
- wgit
|
199
210
|
extensions: []
|
200
211
|
extra_rdoc_files: []
|
201
212
|
files:
|
202
213
|
- "./lib/wgit.rb"
|
203
214
|
- "./lib/wgit/assertable.rb"
|
215
|
+
- "./lib/wgit/base.rb"
|
204
216
|
- "./lib/wgit/core_ext.rb"
|
205
217
|
- "./lib/wgit/crawler.rb"
|
206
218
|
- "./lib/wgit/database/database.rb"
|
207
219
|
- "./lib/wgit/database/model.rb"
|
208
220
|
- "./lib/wgit/document.rb"
|
209
|
-
- "./lib/wgit/document_extensions.rb"
|
221
|
+
- "./lib/wgit/document_extractors.rb"
|
222
|
+
- "./lib/wgit/dsl.rb"
|
210
223
|
- "./lib/wgit/indexer.rb"
|
211
224
|
- "./lib/wgit/logger.rb"
|
212
225
|
- "./lib/wgit/response.rb"
|
213
226
|
- "./lib/wgit/url.rb"
|
214
227
|
- "./lib/wgit/utils.rb"
|
215
228
|
- "./lib/wgit/version.rb"
|
229
|
+
- ".yardopts"
|
230
|
+
- CHANGELOG.md
|
231
|
+
- CODE_OF_CONDUCT.md
|
232
|
+
- CONTRIBUTING.md
|
233
|
+
- LICENSE.txt
|
234
|
+
- README.md
|
235
|
+
- bin/wgit
|
216
236
|
homepage: https://github.com/michaeltelford/wgit
|
217
237
|
licenses:
|
218
238
|
- MIT
|
@@ -223,7 +243,7 @@ metadata:
|
|
223
243
|
bug_tracker_uri: https://github.com/michaeltelford/wgit/issues
|
224
244
|
documentation_uri: https://www.rubydoc.info/gems/wgit
|
225
245
|
allowed_push_host: https://rubygems.org
|
226
|
-
post_install_message:
|
246
|
+
post_install_message: Added the 'wgit' executable to $PATH
|
227
247
|
rdoc_options: []
|
228
248
|
require_paths:
|
229
249
|
- lib
|
@@ -238,10 +258,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
238
258
|
- !ruby/object:Gem::Version
|
239
259
|
version: '0'
|
240
260
|
requirements: []
|
241
|
-
|
242
|
-
|
243
|
-
signing_key:
|
261
|
+
rubygems_version: 3.1.2
|
262
|
+
signing_key:
|
244
263
|
specification_version: 4
|
245
|
-
summary: Wgit is a
|
246
|
-
|
264
|
+
summary: Wgit is a HTML web crawler, written in Ruby, that allows you to programmatically
|
265
|
+
extract the data you want from the web.
|
247
266
|
test_files: []
|