wgit 0.5.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +7 -0
- data/CHANGELOG.md +240 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/CONTRIBUTING.md +21 -0
- data/LICENSE.txt +21 -0
- data/README.md +239 -0
- data/bin/wgit +39 -0
- data/lib/wgit.rb +3 -1
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +304 -148
- data/lib/wgit/database/database.rb +310 -135
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +234 -169
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +20 -10
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +68 -156
- data/lib/wgit/response.rb +17 -17
- data/lib/wgit/url.rb +170 -42
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +8 -2
- metadata +54 -32
data/lib/wgit/utils.rb
CHANGED
@@ -145,7 +145,8 @@ module Wgit
|
|
145
145
|
# @param keyword_limit [Integer] The max amount of keywords to be
|
146
146
|
# outputted to the stream.
|
147
147
|
# @param stream [#puts] Any object that respond_to?(:puts). It is used
|
148
|
-
# to output text somewhere e.g. a file or
|
148
|
+
# to output text somewhere e.g. a file or STDERR.
|
149
|
+
# @return [Integer] The number of results.
|
149
150
|
def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
|
150
151
|
raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
|
151
152
|
|
@@ -162,18 +163,37 @@ module Wgit
|
|
162
163
|
stream.puts
|
163
164
|
end
|
164
165
|
|
165
|
-
|
166
|
+
results.size
|
166
167
|
end
|
167
168
|
|
168
|
-
#
|
169
|
+
# Sanitises the obj to make it uniform by calling the correct sanitize_*
|
170
|
+
# method for its type e.g. if obj.is_a? String then sanitize(obj). Any type
|
171
|
+
# not in the case statement will be ignored and returned as is.
|
172
|
+
#
|
173
|
+
# @param obj [Object] The object to be sanitized.
|
174
|
+
# @param encode [Boolean] Whether or not to encode to UTF-8 replacing
|
175
|
+
# invalid characters.
|
176
|
+
# @return [Object] The sanitized obj is both modified and then returned.
|
177
|
+
def self.sanitize(obj, encode: true)
|
178
|
+
case obj
|
179
|
+
when String
|
180
|
+
sanitize_str(obj, encode: encode)
|
181
|
+
when Array
|
182
|
+
sanitize_arr(obj, encode: encode)
|
183
|
+
else
|
184
|
+
obj
|
185
|
+
end
|
186
|
+
end
|
187
|
+
|
188
|
+
# Sanitises a String to make it uniform. Strips any leading/trailing white
|
169
189
|
# space. Also applies UTF-8 encoding (replacing invalid characters) if
|
170
190
|
# `encode: true`.
|
171
191
|
#
|
172
|
-
# @param str [String] The String to
|
192
|
+
# @param str [String] The String to sanitize. str is modified.
|
173
193
|
# @param encode [Boolean] Whether or not to encode to UTF-8 replacing
|
174
194
|
# invalid characters.
|
175
|
-
# @return [String] The
|
176
|
-
def self.
|
195
|
+
# @return [String] The sanitized str is both modified and then returned.
|
196
|
+
def self.sanitize_str(str, encode: true)
|
177
197
|
if str.is_a?(String)
|
178
198
|
str.encode!('UTF-8', undef: :replace, invalid: :replace) if encode
|
179
199
|
str.strip!
|
@@ -182,15 +202,15 @@ module Wgit
|
|
182
202
|
str
|
183
203
|
end
|
184
204
|
|
185
|
-
#
|
186
|
-
# processes non empty Strings using Wgit::Utils.
|
205
|
+
# Sanitises an Array to make it uniform. Removes empty Strings and nils,
|
206
|
+
# processes non empty Strings using Wgit::Utils.sanitize and removes
|
187
207
|
# duplicates.
|
188
208
|
#
|
189
|
-
# @param arr [Enumerable] The Array to
|
190
|
-
# @return [Enumerable] The
|
191
|
-
def self.
|
209
|
+
# @param arr [Enumerable] The Array to sanitize. arr is modified.
|
210
|
+
# @return [Enumerable] The sanitized arr is both modified and then returned.
|
211
|
+
def self.sanitize_arr(arr, encode: true)
|
192
212
|
if arr.is_a?(Array)
|
193
|
-
arr.map! { |str|
|
213
|
+
arr.map! { |str| sanitize(str, encode: encode) }
|
194
214
|
arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
|
195
215
|
arr.compact!
|
196
216
|
arr.uniq!
|
@@ -198,13 +218,5 @@ module Wgit
|
|
198
218
|
|
199
219
|
arr
|
200
220
|
end
|
201
|
-
|
202
|
-
# Returns the model having removed non bson types (for use with MongoDB).
|
203
|
-
#
|
204
|
-
# @param model_hash [Hash] The model Hash to process.
|
205
|
-
# @return [Hash] The model Hash with non bson types removed.
|
206
|
-
def self.remove_non_bson_types(model_hash)
|
207
|
-
model_hash.select { |_k, v| v.respond_to?(:bson_type) }
|
208
|
-
end
|
209
221
|
end
|
210
222
|
end
|
data/lib/wgit/version.rb
CHANGED
@@ -1,14 +1,20 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
# Wgit is a WWW indexer/scraper which crawls URL's and retrieves their page
|
4
|
-
# contents for later use
|
4
|
+
# contents for later use.
|
5
|
+
#
|
5
6
|
# @author Michael Telford
|
6
7
|
module Wgit
|
7
8
|
# The current gem version of Wgit.
|
8
|
-
VERSION = '0.
|
9
|
+
VERSION = '0.9.0'
|
9
10
|
|
10
11
|
# Returns the current gem version of Wgit as a String.
|
11
12
|
def self.version
|
12
13
|
VERSION
|
13
14
|
end
|
15
|
+
|
16
|
+
# Returns the current gem version in a presentation String.
|
17
|
+
def self.version_str
|
18
|
+
"wgit v#{VERSION}"
|
19
|
+
end
|
14
20
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-07-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -16,56 +16,70 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 2.6
|
19
|
+
version: '2.6'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 2.6
|
26
|
+
version: '2.6'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: mongo
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 2.9
|
33
|
+
version: '2.9'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 2.9
|
40
|
+
version: '2.9'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: nokogiri
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: 1.10
|
47
|
+
version: '1.10'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: 1.10
|
54
|
+
version: '1.10'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: typhoeus
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: 1.3
|
61
|
+
version: '1.3'
|
62
62
|
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: 1.3
|
68
|
+
version: '1.3'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: ferrum
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0.8'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0.8'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: byebug
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -123,33 +137,33 @@ dependencies:
|
|
123
137
|
- !ruby/object:Gem::Version
|
124
138
|
version: '0.12'
|
125
139
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
140
|
+
name: rubocop
|
127
141
|
requirement: !ruby/object:Gem::Requirement
|
128
142
|
requirements:
|
129
143
|
- - "~>"
|
130
144
|
- !ruby/object:Gem::Version
|
131
|
-
version: '
|
145
|
+
version: '0.74'
|
132
146
|
type: :development
|
133
147
|
prerelease: false
|
134
148
|
version_requirements: !ruby/object:Gem::Requirement
|
135
149
|
requirements:
|
136
150
|
- - "~>"
|
137
151
|
- !ruby/object:Gem::Version
|
138
|
-
version: '
|
152
|
+
version: '0.74'
|
139
153
|
- !ruby/object:Gem::Dependency
|
140
|
-
name:
|
154
|
+
name: toys
|
141
155
|
requirement: !ruby/object:Gem::Requirement
|
142
156
|
requirements:
|
143
157
|
- - "~>"
|
144
158
|
- !ruby/object:Gem::Version
|
145
|
-
version: '0.
|
159
|
+
version: '0.8'
|
146
160
|
type: :development
|
147
161
|
prerelease: false
|
148
162
|
version_requirements: !ruby/object:Gem::Requirement
|
149
163
|
requirements:
|
150
164
|
- - "~>"
|
151
165
|
- !ruby/object:Gem::Version
|
152
|
-
version: '0.
|
166
|
+
version: '0.8'
|
153
167
|
- !ruby/object:Gem::Dependency
|
154
168
|
name: webmock
|
155
169
|
requirement: !ruby/object:Gem::Requirement
|
@@ -184,43 +198,52 @@ dependencies:
|
|
184
198
|
- - "<"
|
185
199
|
- !ruby/object:Gem::Version
|
186
200
|
version: '1.0'
|
187
|
-
description: '
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
WWW search engine. The Wgit API is easily extended allowing you to pull out the
|
192
|
-
parts of a webpage that are important to you, the code snippets or tables for example.
|
193
|
-
As Wgit is a library, it supports many different use cases including data mining,
|
194
|
-
analytics, web indexing and URL parsing to name a few.
|
201
|
+
description: 'Wgit was primarily designed to crawl static HTML websites to index and
|
202
|
+
search their content - providing the basis of any search engine; but Wgit is suitable
|
203
|
+
for many application domains including: URL parsing, data mining and statistical
|
204
|
+
analysis.
|
195
205
|
|
196
206
|
'
|
197
207
|
email: michael.telford@live.com
|
198
|
-
executables:
|
208
|
+
executables:
|
209
|
+
- wgit
|
199
210
|
extensions: []
|
200
211
|
extra_rdoc_files: []
|
201
212
|
files:
|
202
213
|
- "./lib/wgit.rb"
|
203
214
|
- "./lib/wgit/assertable.rb"
|
215
|
+
- "./lib/wgit/base.rb"
|
204
216
|
- "./lib/wgit/core_ext.rb"
|
205
217
|
- "./lib/wgit/crawler.rb"
|
206
218
|
- "./lib/wgit/database/database.rb"
|
207
219
|
- "./lib/wgit/database/model.rb"
|
208
220
|
- "./lib/wgit/document.rb"
|
209
|
-
- "./lib/wgit/
|
221
|
+
- "./lib/wgit/document_extractors.rb"
|
222
|
+
- "./lib/wgit/dsl.rb"
|
210
223
|
- "./lib/wgit/indexer.rb"
|
211
224
|
- "./lib/wgit/logger.rb"
|
212
225
|
- "./lib/wgit/response.rb"
|
213
226
|
- "./lib/wgit/url.rb"
|
214
227
|
- "./lib/wgit/utils.rb"
|
215
228
|
- "./lib/wgit/version.rb"
|
229
|
+
- ".yardopts"
|
230
|
+
- CHANGELOG.md
|
231
|
+
- CODE_OF_CONDUCT.md
|
232
|
+
- CONTRIBUTING.md
|
233
|
+
- LICENSE.txt
|
234
|
+
- README.md
|
235
|
+
- bin/wgit
|
216
236
|
homepage: https://github.com/michaeltelford/wgit
|
217
237
|
licenses:
|
218
238
|
- MIT
|
219
239
|
metadata:
|
220
|
-
source_code_uri: https://github.com/michaeltelford/wgit
|
221
240
|
yard.run: yri
|
241
|
+
source_code_uri: https://github.com/michaeltelford/wgit
|
242
|
+
changelog_uri: https://github.com/michaeltelford/wgit/blob/master/CHANGELOG.md
|
243
|
+
bug_tracker_uri: https://github.com/michaeltelford/wgit/issues
|
244
|
+
documentation_uri: https://www.rubydoc.info/github/michaeltelford/wgit/master
|
222
245
|
allowed_push_host: https://rubygems.org
|
223
|
-
post_install_message:
|
246
|
+
post_install_message: Added the 'wgit' executable to $PATH
|
224
247
|
rdoc_options: []
|
225
248
|
require_paths:
|
226
249
|
- lib
|
@@ -235,10 +258,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
235
258
|
- !ruby/object:Gem::Version
|
236
259
|
version: '0'
|
237
260
|
requirements: []
|
238
|
-
|
239
|
-
rubygems_version: 2.7.6
|
261
|
+
rubygems_version: 3.1.2
|
240
262
|
signing_key:
|
241
263
|
specification_version: 4
|
242
|
-
summary: Wgit is a
|
243
|
-
|
264
|
+
summary: Wgit is a HTML web crawler, written in Ruby, that allows you to programmatically
|
265
|
+
extract the data you want from the web.
|
244
266
|
test_files: []
|