wgit 0.0.17 → 0.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +61 -0
- data/LICENSE.txt +21 -0
- data/README.md +16 -7
- data/TODO.txt +34 -0
- data/lib/wgit.rb +3 -1
- data/lib/wgit/assertable.rb +35 -29
- data/lib/wgit/core_ext.rb +5 -3
- data/lib/wgit/crawler.rb +96 -58
- data/lib/wgit/database/connection_details.rb +4 -2
- data/lib/wgit/database/database.rb +84 -46
- data/lib/wgit/database/model.rb +12 -10
- data/lib/wgit/document.rb +100 -72
- data/lib/wgit/document_extensions.rb +11 -9
- data/lib/wgit/indexer.rb +34 -24
- data/lib/wgit/logger.rb +4 -2
- data/lib/wgit/url.rb +94 -59
- data/lib/wgit/utils.rb +13 -11
- data/lib/wgit/version.rb +3 -1
- metadata +41 -38
data/lib/wgit/utils.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
|
+
module Wgit
|
3
4
|
# Utility module containing generic methods.
|
4
5
|
module Utils
|
5
|
-
|
6
6
|
# Returns the current time stamp.
|
7
7
|
#
|
8
8
|
# @return [Time] The current time stamp.
|
@@ -21,6 +21,7 @@ module Wgit
|
|
21
21
|
hash = {}
|
22
22
|
obj.instance_variables.each do |var|
|
23
23
|
next if ignore.include?(var.to_s)
|
24
|
+
|
24
25
|
key = var.to_s[1..-1]
|
25
26
|
key = key.to_sym unless use_strings_as_keys
|
26
27
|
hash[key] = obj.instance_variable_get(var)
|
@@ -33,8 +34,8 @@ module Wgit
|
|
33
34
|
# @param model_hash [Hash] The model Hash to process.
|
34
35
|
# @return [Hash] The model Hash with non bson types removed.
|
35
36
|
def self.remove_non_bson_types(model_hash)
|
36
|
-
model_hash.
|
37
|
-
|
37
|
+
model_hash.select do |_k, v|
|
38
|
+
v.respond_to? :bson_type
|
38
39
|
end
|
39
40
|
end
|
40
41
|
|
@@ -65,9 +66,9 @@ module Wgit
|
|
65
66
|
# is less. The full sentence is returned if the sentence_limit is 0.
|
66
67
|
# @return [String] The sentence once formatted.
|
67
68
|
def self.format_sentence_length(sentence, index, sentence_limit)
|
68
|
-
raise
|
69
|
-
raise
|
70
|
-
if index < 0
|
69
|
+
raise 'A sentence value must be provided' if sentence.empty?
|
70
|
+
raise 'The sentence length value must be even' if sentence_limit.odd?
|
71
|
+
if (index < 0) || (index > sentence.length)
|
71
72
|
raise "Incorrect index value: #{index}"
|
72
73
|
end
|
73
74
|
|
@@ -121,7 +122,7 @@ module Wgit
|
|
121
122
|
# @param results [Array<Wgit::Document>] An Array whose
|
122
123
|
# Wgit::Documents#text matches the query at least once.
|
123
124
|
# @param query [String] The text query to search for.
|
124
|
-
# @param
|
125
|
+
# @param _case_sensitive [Boolean] Whether or not the search should be
|
125
126
|
# case sensitive or not.
|
126
127
|
# @param sentence_length [Integer] The length of the matching text of the
|
127
128
|
# search results to be outputted to the stream.
|
@@ -130,10 +131,11 @@ module Wgit
|
|
130
131
|
# @param stream [#puts] Any object that respond_to? :puts. It is used
|
131
132
|
# to output text somewhere e.g. STDOUT (the default).
|
132
133
|
# @return [nil]
|
133
|
-
def self.printf_search_results(results, query = nil,
|
134
|
+
def self.printf_search_results(results, query = nil, _case_sensitive = false,
|
134
135
|
sentence_length = 80, keyword_count = 5,
|
135
136
|
stream = Kernel)
|
136
|
-
raise
|
137
|
+
raise 'stream must respond_to? :puts' unless stream.respond_to? :puts
|
138
|
+
|
137
139
|
keyword_count -= 1 # Because Array's are zero indexed.
|
138
140
|
|
139
141
|
results.each do |doc|
|
@@ -149,7 +151,7 @@ module Wgit
|
|
149
151
|
end
|
150
152
|
stream.puts doc.title
|
151
153
|
unless doc.keywords.nil? || doc.keywords.empty?
|
152
|
-
stream.puts doc.keywords[0..keyword_count].join(
|
154
|
+
stream.puts doc.keywords[0..keyword_count].join(', ')
|
153
155
|
end
|
154
156
|
stream.puts sentence unless sentence.nil?
|
155
157
|
stream.puts doc.url
|
data/lib/wgit/version.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Wgit is a WWW indexer/scraper which crawls URL's and retrieves their page
|
2
4
|
# contents for later use.
|
3
5
|
# @author Michael Telford
|
4
6
|
module Wgit
|
5
7
|
# The current gem version of Wgit.
|
6
|
-
VERSION =
|
8
|
+
VERSION = '0.0.18'
|
7
9
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.17
|
4
|
+
version: 0.0.18
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
@@ -11,81 +11,75 @@ cert_chain: []
|
|
11
11
|
date: 2016-03-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: byebug
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '10.0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '10.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: dotenv
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: 0.9.20
|
34
|
-
- - "<"
|
31
|
+
- - "~>"
|
35
32
|
- !ruby/object:Gem::Version
|
36
|
-
version: '
|
33
|
+
version: '2.5'
|
37
34
|
type: :development
|
38
35
|
prerelease: false
|
39
36
|
version_requirements: !ruby/object:Gem::Requirement
|
40
37
|
requirements:
|
41
|
-
- - "
|
42
|
-
- !ruby/object:Gem::Version
|
43
|
-
version: 0.9.20
|
44
|
-
- - "<"
|
38
|
+
- - "~>"
|
45
39
|
- !ruby/object:Gem::Version
|
46
|
-
version: '
|
40
|
+
version: '2.5'
|
47
41
|
- !ruby/object:Gem::Dependency
|
48
|
-
name:
|
42
|
+
name: httplog
|
49
43
|
requirement: !ruby/object:Gem::Requirement
|
50
44
|
requirements:
|
51
45
|
- - "~>"
|
52
46
|
- !ruby/object:Gem::Version
|
53
|
-
version: '
|
47
|
+
version: '1.3'
|
54
48
|
type: :development
|
55
49
|
prerelease: false
|
56
50
|
version_requirements: !ruby/object:Gem::Requirement
|
57
51
|
requirements:
|
58
52
|
- - "~>"
|
59
53
|
- !ruby/object:Gem::Version
|
60
|
-
version: '
|
54
|
+
version: '1.3'
|
61
55
|
- !ruby/object:Gem::Dependency
|
62
|
-
name:
|
56
|
+
name: minitest
|
63
57
|
requirement: !ruby/object:Gem::Requirement
|
64
58
|
requirements:
|
65
59
|
- - "~>"
|
66
60
|
- !ruby/object:Gem::Version
|
67
|
-
version: '
|
61
|
+
version: '5.11'
|
68
62
|
type: :development
|
69
63
|
prerelease: false
|
70
64
|
version_requirements: !ruby/object:Gem::Requirement
|
71
65
|
requirements:
|
72
66
|
- - "~>"
|
73
67
|
- !ruby/object:Gem::Version
|
74
|
-
version: '
|
68
|
+
version: '5.11'
|
75
69
|
- !ruby/object:Gem::Dependency
|
76
|
-
name:
|
70
|
+
name: pry
|
77
71
|
requirement: !ruby/object:Gem::Requirement
|
78
72
|
requirements:
|
79
73
|
- - "~>"
|
80
74
|
- !ruby/object:Gem::Version
|
81
|
-
version: '
|
75
|
+
version: '0.12'
|
82
76
|
type: :development
|
83
77
|
prerelease: false
|
84
78
|
version_requirements: !ruby/object:Gem::Requirement
|
85
79
|
requirements:
|
86
80
|
- - "~>"
|
87
81
|
- !ruby/object:Gem::Version
|
88
|
-
version: '
|
82
|
+
version: '0.12'
|
89
83
|
- !ruby/object:Gem::Dependency
|
90
84
|
name: rake
|
91
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -101,33 +95,39 @@ dependencies:
|
|
101
95
|
- !ruby/object:Gem::Version
|
102
96
|
version: '12.3'
|
103
97
|
- !ruby/object:Gem::Dependency
|
104
|
-
name:
|
98
|
+
name: webmock
|
105
99
|
requirement: !ruby/object:Gem::Requirement
|
106
100
|
requirements:
|
107
101
|
- - "~>"
|
108
102
|
- !ruby/object:Gem::Version
|
109
|
-
version: '
|
103
|
+
version: '3.6'
|
110
104
|
type: :development
|
111
105
|
prerelease: false
|
112
106
|
version_requirements: !ruby/object:Gem::Requirement
|
113
107
|
requirements:
|
114
108
|
- - "~>"
|
115
109
|
- !ruby/object:Gem::Version
|
116
|
-
version: '
|
110
|
+
version: '3.6'
|
117
111
|
- !ruby/object:Gem::Dependency
|
118
|
-
name:
|
112
|
+
name: yard
|
119
113
|
requirement: !ruby/object:Gem::Requirement
|
120
114
|
requirements:
|
121
|
-
- - "
|
115
|
+
- - ">="
|
122
116
|
- !ruby/object:Gem::Version
|
123
|
-
version:
|
117
|
+
version: 0.9.20
|
118
|
+
- - "<"
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: '1.0'
|
124
121
|
type: :development
|
125
122
|
prerelease: false
|
126
123
|
version_requirements: !ruby/object:Gem::Requirement
|
127
124
|
requirements:
|
128
|
-
- - "
|
125
|
+
- - ">="
|
129
126
|
- !ruby/object:Gem::Version
|
130
|
-
version:
|
127
|
+
version: 0.9.20
|
128
|
+
- - "<"
|
129
|
+
- !ruby/object:Gem::Version
|
130
|
+
version: '1.0'
|
131
131
|
- !ruby/object:Gem::Dependency
|
132
132
|
name: addressable
|
133
133
|
requirement: !ruby/object:Gem::Requirement
|
@@ -143,33 +143,33 @@ dependencies:
|
|
143
143
|
- !ruby/object:Gem::Version
|
144
144
|
version: 2.6.0
|
145
145
|
- !ruby/object:Gem::Dependency
|
146
|
-
name:
|
146
|
+
name: mongo
|
147
147
|
requirement: !ruby/object:Gem::Requirement
|
148
148
|
requirements:
|
149
149
|
- - "~>"
|
150
150
|
- !ruby/object:Gem::Version
|
151
|
-
version:
|
151
|
+
version: 2.9.0
|
152
152
|
type: :runtime
|
153
153
|
prerelease: false
|
154
154
|
version_requirements: !ruby/object:Gem::Requirement
|
155
155
|
requirements:
|
156
156
|
- - "~>"
|
157
157
|
- !ruby/object:Gem::Version
|
158
|
-
version:
|
158
|
+
version: 2.9.0
|
159
159
|
- !ruby/object:Gem::Dependency
|
160
|
-
name:
|
160
|
+
name: nokogiri
|
161
161
|
requirement: !ruby/object:Gem::Requirement
|
162
162
|
requirements:
|
163
163
|
- - "~>"
|
164
164
|
- !ruby/object:Gem::Version
|
165
|
-
version:
|
165
|
+
version: 1.10.3
|
166
166
|
type: :runtime
|
167
167
|
prerelease: false
|
168
168
|
version_requirements: !ruby/object:Gem::Requirement
|
169
169
|
requirements:
|
170
170
|
- - "~>"
|
171
171
|
- !ruby/object:Gem::Version
|
172
|
-
version:
|
172
|
+
version: 1.10.3
|
173
173
|
description: Fundamentally, Wgit is a WWW indexer/scraper which crawls URL's, retrieves
|
174
174
|
and serialises their page contents for later use. You can use Wgit to copy entire
|
175
175
|
websites if required. Wgit also provides a means to search indexed documents stored
|
@@ -196,7 +196,10 @@ files:
|
|
196
196
|
- "./lib/wgit/url.rb"
|
197
197
|
- "./lib/wgit/utils.rb"
|
198
198
|
- "./lib/wgit/version.rb"
|
199
|
+
- CHANGELOG.md
|
200
|
+
- LICENSE.txt
|
199
201
|
- README.md
|
202
|
+
- TODO.txt
|
200
203
|
homepage: https://github.com/michaeltelford/wgit
|
201
204
|
licenses:
|
202
205
|
- MIT
|