wgit 0.11.0 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +48 -0
- data/CODE_OF_CONDUCT.md +1 -1
- data/CONTRIBUTING.md +1 -1
- data/README.md +27 -24
- data/bin/wgit +72 -18
- data/lib/wgit/assertable.rb +33 -6
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +91 -20
- data/lib/wgit/database/adapters/in_memory.rb +204 -0
- data/lib/wgit/database/adapters/mongo_db.rb +627 -0
- data/lib/wgit/database/database.rb +18 -663
- data/lib/wgit/database/database_adapter.rb +147 -0
- data/lib/wgit/document.rb +187 -77
- data/lib/wgit/document_extractors.rb +15 -23
- data/lib/wgit/dsl.rb +64 -67
- data/lib/wgit/html_to_text.rb +277 -0
- data/lib/wgit/indexer.rb +29 -10
- data/lib/wgit/logger.rb +2 -2
- data/lib/wgit/model.rb +164 -0
- data/lib/wgit/response.rb +5 -8
- data/lib/wgit/robots_parser.rb +8 -8
- data/lib/wgit/url.rb +38 -38
- data/lib/wgit/utils.rb +124 -14
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +18 -14
- metadata +74 -30
- data/lib/wgit/database/model.rb +0 -60
data/lib/wgit/utils.rb
CHANGED
|
@@ -18,17 +18,15 @@ module Wgit
|
|
|
18
18
|
# keys.
|
|
19
19
|
# @return [Hash] A Hash created from obj's instance vars and values.
|
|
20
20
|
def self.to_h(obj, ignore: [], use_strings_as_keys: true)
|
|
21
|
-
hash
|
|
22
|
-
|
|
23
|
-
obj.instance_variables.each do |var|
|
|
24
|
-
next if ignore.include?(var.to_s)
|
|
21
|
+
obj.instance_variables.reduce({}) do |hash, var|
|
|
22
|
+
next hash if ignore.include?(var.to_s)
|
|
25
23
|
|
|
26
24
|
key = var.to_s[1..] # Remove the @ prefix.
|
|
27
25
|
key = key.to_sym unless use_strings_as_keys
|
|
28
26
|
hash[key] = obj.instance_variable_get(var)
|
|
29
|
-
end
|
|
30
27
|
|
|
31
|
-
|
|
28
|
+
hash
|
|
29
|
+
end
|
|
32
30
|
end
|
|
33
31
|
|
|
34
32
|
# An improved :each method which supports both singleton and Enumerable
|
|
@@ -127,13 +125,19 @@ module Wgit
|
|
|
127
125
|
end
|
|
128
126
|
|
|
129
127
|
# Prints out the search results in a search engine like format.
|
|
130
|
-
#
|
|
128
|
+
#
|
|
129
|
+
# The given results should be matching documents from a DB and should have
|
|
130
|
+
# `doc.search_text!` called for each document - to turn doc.text into only
|
|
131
|
+
# matching text, which this method uses.
|
|
132
|
+
#
|
|
133
|
+
# The format for each result looks something like:
|
|
131
134
|
#
|
|
132
135
|
# ```
|
|
133
136
|
# Title
|
|
134
137
|
# Keywords (if there are some)
|
|
135
|
-
# Text Snippet (formatted to show the searched for query
|
|
138
|
+
# Text Snippet (formatted to show the searched for query)
|
|
136
139
|
# URL
|
|
140
|
+
# Score (if include_score: true)
|
|
137
141
|
# <empty_line_separator>
|
|
138
142
|
# ```
|
|
139
143
|
#
|
|
@@ -142,28 +146,91 @@ module Wgit
|
|
|
142
146
|
# the search results). The first @text sentence gets printed.
|
|
143
147
|
# @param keyword_limit [Integer] The max amount of keywords to be
|
|
144
148
|
# outputted to the stream.
|
|
149
|
+
# @param include_score [Boolean] Whether or not to puts the document score.
|
|
145
150
|
# @param stream [#puts] Any object that respond_to?(:puts). It is used
|
|
146
151
|
# to output text somewhere e.g. a file or STDERR.
|
|
147
152
|
# @return [Integer] The number of results.
|
|
148
|
-
def self.
|
|
153
|
+
def self.pprint_top_search_results(
|
|
154
|
+
results, keyword_limit: 5, include_score: false, stream: $stdout
|
|
155
|
+
)
|
|
149
156
|
raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
|
|
150
157
|
|
|
151
158
|
results.each do |doc|
|
|
152
|
-
title =
|
|
159
|
+
title = doc.title || '<no title>'
|
|
153
160
|
keywords = doc.keywords&.take(keyword_limit)&.join(', ')
|
|
154
161
|
sentence = doc.text.first
|
|
155
162
|
url = doc.url
|
|
163
|
+
score = doc.score
|
|
156
164
|
|
|
157
165
|
stream.puts title
|
|
158
166
|
stream.puts keywords if keywords
|
|
159
167
|
stream.puts sentence
|
|
160
168
|
stream.puts url
|
|
169
|
+
stream.puts score if include_score
|
|
161
170
|
stream.puts
|
|
162
171
|
end
|
|
163
172
|
|
|
164
173
|
results.size
|
|
165
174
|
end
|
|
166
175
|
|
|
176
|
+
# Prints out the search results listing all of the matching text in each
|
|
177
|
+
# document.
|
|
178
|
+
#
|
|
179
|
+
# The given results should be matching documents from a DB and should have
|
|
180
|
+
# `doc.search_text!` called for each document - to turn doc.text into only
|
|
181
|
+
# matching text, which this method uses.
|
|
182
|
+
#
|
|
183
|
+
# The format for each result looks something like:
|
|
184
|
+
#
|
|
185
|
+
# ```
|
|
186
|
+
# Title
|
|
187
|
+
# Keywords (if there are some)
|
|
188
|
+
# URL
|
|
189
|
+
# Score (if include_score: true)
|
|
190
|
+
#
|
|
191
|
+
# "<text_snippet_1>"
|
|
192
|
+
# "<text_snippet_2>"
|
|
193
|
+
# ...
|
|
194
|
+
#
|
|
195
|
+
# <separator>
|
|
196
|
+
# ```
|
|
197
|
+
#
|
|
198
|
+
# @param results [Array<Wgit::Document>] Array of Wgit::Document's which
|
|
199
|
+
# each have had #search!(query) called (to update its @text with the
|
|
200
|
+
# search results). Every matching @text sentence gets printed.
|
|
201
|
+
# @param keyword_limit [Integer] The max amount of keywords to be
|
|
202
|
+
# outputted to the stream.
|
|
203
|
+
# @param include_score [Boolean] Whether or not to puts the document score.
|
|
204
|
+
# @param stream [#puts] Any object that respond_to?(:puts). It is used
|
|
205
|
+
# to output text somewhere e.g. a file or STDERR.
|
|
206
|
+
# @return [Integer] The number of results.
|
|
207
|
+
def self.pprint_all_search_results(
|
|
208
|
+
results, keyword_limit: 5, include_score: false, stream: $stdout
|
|
209
|
+
)
|
|
210
|
+
raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
|
|
211
|
+
|
|
212
|
+
results.each_with_index do |doc, i|
|
|
213
|
+
last_result = i == (results.size-1)
|
|
214
|
+
|
|
215
|
+
title = doc.title || '<no title>'
|
|
216
|
+
keywords = doc.keywords&.take(keyword_limit)&.join(', ')
|
|
217
|
+
url = doc.url
|
|
218
|
+
score = doc.score
|
|
219
|
+
|
|
220
|
+
stream.puts title
|
|
221
|
+
stream.puts keywords if keywords
|
|
222
|
+
stream.puts url
|
|
223
|
+
stream.puts score if include_score
|
|
224
|
+
stream.puts
|
|
225
|
+
doc.text.each { |text| stream.puts text }
|
|
226
|
+
stream.puts
|
|
227
|
+
stream.puts "-----" unless last_result
|
|
228
|
+
stream.puts unless last_result
|
|
229
|
+
end
|
|
230
|
+
|
|
231
|
+
results.size
|
|
232
|
+
end
|
|
233
|
+
|
|
167
234
|
# Sanitises the obj to make it uniform by calling the correct sanitize_*
|
|
168
235
|
# method for its type e.g. if obj.is_a? String then sanitize_str(obj) is called.
|
|
169
236
|
# Any type not in the case statement will be ignored and returned as is.
|
|
@@ -230,6 +297,42 @@ module Wgit
|
|
|
230
297
|
.uniq
|
|
231
298
|
end
|
|
232
299
|
|
|
300
|
+
# Build a regular expression from a query string, for searching text with.
|
|
301
|
+
#
|
|
302
|
+
# All searches using this regex are always whole word based while whole
|
|
303
|
+
# sentence searches are configurable using the whole_sentence: param. For
|
|
304
|
+
# example:
|
|
305
|
+
#
|
|
306
|
+
# ```
|
|
307
|
+
# text = "hello world"
|
|
308
|
+
# query = "world hello", whole_sentence: true # => No match
|
|
309
|
+
# query = "world hello", whole_sentence: false # => Match
|
|
310
|
+
# query = "he" # => Never matches
|
|
311
|
+
# ```
|
|
312
|
+
#
|
|
313
|
+
# @param query [String, Regexp] The query string to build a regex from.
|
|
314
|
+
# @param case_sensitive [Boolean] Whether character case must match.
|
|
315
|
+
# @param whole_sentence [Boolean] Whether the query's words must match
|
|
316
|
+
# together as one phrase, in order; if false, each word matches separately.
|
|
317
|
+
# @return [Regexp] The regex with which to search text.
|
|
318
|
+
def self.build_search_regex(
|
|
319
|
+
query, case_sensitive: false, whole_sentence: true
|
|
320
|
+
)
|
|
321
|
+
return query if query.is_a?(Regexp)
|
|
322
|
+
|
|
323
|
+
# query: "hello world", whole_sentence: false produces:
|
|
324
|
+
# (?<=^|\s|[^a-zA-Z0-9])hello(?=$|\s|[^a-zA-Z0-9])|(?<=^|\s|[^a-zA-Z0-9])world(?=$|\s|[^a-zA-Z0-9])
|
|
325
|
+
|
|
326
|
+
sep = whole_sentence ? " " : "|"
|
|
327
|
+
segs = query.split(" ").map do |word|
|
|
328
|
+
word = Regexp.escape(word)
|
|
329
|
+
"(?<=^|\\s|[^a-zA-Z0-9])#{word}(?=$|\\s|[^a-zA-Z0-9])"
|
|
330
|
+
end
|
|
331
|
+
query = segs.join(sep)
|
|
332
|
+
|
|
333
|
+
Regexp.new(query, !case_sensitive)
|
|
334
|
+
end
|
|
335
|
+
|
|
233
336
|
# Pretty prints a log statement, used for debugging purposes.
|
|
234
337
|
#
|
|
235
338
|
# Use like:
|
|
@@ -245,25 +348,30 @@ module Wgit
|
|
|
245
348
|
# ```
|
|
246
349
|
#
|
|
247
350
|
# @param identifier [#to_s] A log identifier e.g. "START" or 1 etc.
|
|
351
|
+
# @param display [Boolean] Setting as false will cause a noop, useful for
|
|
352
|
+
# switching off several/all pprint statements at once e.g. via ENV var.
|
|
248
353
|
# @param stream [#puts] Any object that respond_to? :puts and :print. It is
|
|
249
354
|
# used to output the log text somewhere e.g. a file or STDERR.
|
|
250
355
|
# @param prefix [String] The log prefix, useful for visibility/greping.
|
|
251
356
|
# @param new_line [Boolean] Whether or not to use a new line (\n) as the
|
|
252
357
|
# separator.
|
|
253
358
|
# @param vars [Hash<#inspect, #inspect>] The vars to inspect in the log.
|
|
254
|
-
def self.pprint(identifier, stream: $stdout, prefix: 'DEBUG', new_line: false, **vars)
|
|
359
|
+
def self.pprint(identifier, display: true, stream: $stdout, prefix: 'DEBUG', new_line: false, **vars)
|
|
360
|
+
return unless display
|
|
361
|
+
|
|
255
362
|
sep1 = new_line ? "\n" : ' - '
|
|
363
|
+
sep1 = '' if vars.empty?
|
|
256
364
|
sep2 = new_line ? "\n" : ' | '
|
|
257
365
|
|
|
258
366
|
stream.print "\n#{prefix}_#{identifier}#{sep1}"
|
|
259
367
|
|
|
260
368
|
vars.each_with_index do |arr, i|
|
|
261
|
-
|
|
369
|
+
is_last_item = (i + 1) == vars.size
|
|
262
370
|
sep3 = sep2
|
|
263
|
-
sep3 = new_line ? "\n" : '' if
|
|
371
|
+
sep3 = new_line ? "\n" : '' if is_last_item
|
|
264
372
|
k, v = arr
|
|
265
373
|
|
|
266
|
-
stream.print "#{k}: #{v}#{sep3}"
|
|
374
|
+
stream.print "#{k}: #{v.inspect}#{sep3}"
|
|
267
375
|
end
|
|
268
376
|
|
|
269
377
|
stream.puts "\n"
|
|
@@ -271,5 +379,7 @@ module Wgit
|
|
|
271
379
|
|
|
272
380
|
nil
|
|
273
381
|
end
|
|
382
|
+
|
|
383
|
+
self.singleton_class.alias_method :pprint_search_results, :pprint_top_search_results
|
|
274
384
|
end
|
|
275
385
|
end
|
data/lib/wgit/version.rb
CHANGED
data/lib/wgit.rb
CHANGED
|
@@ -1,17 +1,21 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require_relative
|
|
4
|
-
require_relative
|
|
5
|
-
require_relative
|
|
6
|
-
require_relative
|
|
7
|
-
require_relative
|
|
8
|
-
require_relative
|
|
9
|
-
require_relative
|
|
10
|
-
require_relative
|
|
11
|
-
require_relative
|
|
12
|
-
require_relative
|
|
13
|
-
require_relative
|
|
14
|
-
require_relative
|
|
15
|
-
require_relative
|
|
16
|
-
require_relative
|
|
3
|
+
require_relative "wgit/version"
|
|
4
|
+
require_relative "wgit/logger"
|
|
5
|
+
require_relative "wgit/assertable"
|
|
6
|
+
require_relative "wgit/utils"
|
|
7
|
+
require_relative "wgit/url"
|
|
8
|
+
require_relative "wgit/html_to_text"
|
|
9
|
+
require_relative "wgit/document"
|
|
10
|
+
require_relative "wgit/document_extractors"
|
|
11
|
+
require_relative "wgit/crawler"
|
|
12
|
+
require_relative "wgit/model"
|
|
13
|
+
require_relative "wgit/database/database"
|
|
14
|
+
require_relative "wgit/database/database_adapter"
|
|
15
|
+
require_relative "wgit/database/adapters/mongo_db"
|
|
16
|
+
require_relative "wgit/database/adapters/in_memory"
|
|
17
|
+
require_relative "wgit/robots_parser"
|
|
18
|
+
require_relative "wgit/indexer"
|
|
19
|
+
require_relative "wgit/dsl"
|
|
20
|
+
require_relative "wgit/base"
|
|
17
21
|
# require_relative 'wgit/core_ext' - Must be explicitly required.
|
metadata
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: wgit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.12.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Michael Telford
|
|
8
|
-
autorequire:
|
|
9
8
|
bindir: bin
|
|
10
9
|
cert_chain: []
|
|
11
|
-
date:
|
|
10
|
+
date: 2025-08-18 00:00:00.000000000 Z
|
|
12
11
|
dependencies:
|
|
13
12
|
- !ruby/object:Gem::Dependency
|
|
14
13
|
name: addressable
|
|
@@ -25,131 +24,173 @@ dependencies:
|
|
|
25
24
|
- !ruby/object:Gem::Version
|
|
26
25
|
version: '2.8'
|
|
27
26
|
- !ruby/object:Gem::Dependency
|
|
28
|
-
name:
|
|
27
|
+
name: base64
|
|
29
28
|
requirement: !ruby/object:Gem::Requirement
|
|
30
29
|
requirements:
|
|
31
30
|
- - "~>"
|
|
32
31
|
- !ruby/object:Gem::Version
|
|
33
|
-
version: '
|
|
32
|
+
version: '0.3'
|
|
34
33
|
type: :runtime
|
|
35
34
|
prerelease: false
|
|
36
35
|
version_requirements: !ruby/object:Gem::Requirement
|
|
37
36
|
requirements:
|
|
38
37
|
- - "~>"
|
|
39
38
|
- !ruby/object:Gem::Version
|
|
40
|
-
version: '
|
|
39
|
+
version: '0.3'
|
|
41
40
|
- !ruby/object:Gem::Dependency
|
|
42
|
-
name:
|
|
41
|
+
name: benchmark
|
|
43
42
|
requirement: !ruby/object:Gem::Requirement
|
|
44
43
|
requirements:
|
|
45
44
|
- - "~>"
|
|
46
45
|
- !ruby/object:Gem::Version
|
|
47
|
-
version: '
|
|
46
|
+
version: '0.4'
|
|
48
47
|
type: :runtime
|
|
49
48
|
prerelease: false
|
|
50
49
|
version_requirements: !ruby/object:Gem::Requirement
|
|
51
50
|
requirements:
|
|
52
51
|
- - "~>"
|
|
53
52
|
- !ruby/object:Gem::Version
|
|
54
|
-
version: '
|
|
53
|
+
version: '0.4'
|
|
55
54
|
- !ruby/object:Gem::Dependency
|
|
56
|
-
name:
|
|
55
|
+
name: ferrum
|
|
57
56
|
requirement: !ruby/object:Gem::Requirement
|
|
58
57
|
requirements:
|
|
59
58
|
- - "~>"
|
|
60
59
|
- !ruby/object:Gem::Version
|
|
61
|
-
version: '
|
|
60
|
+
version: '0.17'
|
|
62
61
|
type: :runtime
|
|
63
62
|
prerelease: false
|
|
64
63
|
version_requirements: !ruby/object:Gem::Requirement
|
|
65
64
|
requirements:
|
|
66
65
|
- - "~>"
|
|
67
66
|
- !ruby/object:Gem::Version
|
|
68
|
-
version: '
|
|
67
|
+
version: '0.17'
|
|
69
68
|
- !ruby/object:Gem::Dependency
|
|
70
|
-
name:
|
|
69
|
+
name: logger
|
|
71
70
|
requirement: !ruby/object:Gem::Requirement
|
|
72
71
|
requirements:
|
|
73
72
|
- - "~>"
|
|
74
73
|
- !ruby/object:Gem::Version
|
|
75
|
-
version: '
|
|
74
|
+
version: '1.7'
|
|
76
75
|
type: :runtime
|
|
77
76
|
prerelease: false
|
|
78
77
|
version_requirements: !ruby/object:Gem::Requirement
|
|
79
78
|
requirements:
|
|
80
79
|
- - "~>"
|
|
81
80
|
- !ruby/object:Gem::Version
|
|
82
|
-
version: '
|
|
81
|
+
version: '1.7'
|
|
82
|
+
- !ruby/object:Gem::Dependency
|
|
83
|
+
name: mongo
|
|
84
|
+
requirement: !ruby/object:Gem::Requirement
|
|
85
|
+
requirements:
|
|
86
|
+
- - "~>"
|
|
87
|
+
- !ruby/object:Gem::Version
|
|
88
|
+
version: '2.21'
|
|
89
|
+
type: :runtime
|
|
90
|
+
prerelease: false
|
|
91
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
92
|
+
requirements:
|
|
93
|
+
- - "~>"
|
|
94
|
+
- !ruby/object:Gem::Version
|
|
95
|
+
version: '2.21'
|
|
96
|
+
- !ruby/object:Gem::Dependency
|
|
97
|
+
name: nokogiri
|
|
98
|
+
requirement: !ruby/object:Gem::Requirement
|
|
99
|
+
requirements:
|
|
100
|
+
- - "~>"
|
|
101
|
+
- !ruby/object:Gem::Version
|
|
102
|
+
version: '1.18'
|
|
103
|
+
type: :runtime
|
|
104
|
+
prerelease: false
|
|
105
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
106
|
+
requirements:
|
|
107
|
+
- - "~>"
|
|
108
|
+
- !ruby/object:Gem::Version
|
|
109
|
+
version: '1.18'
|
|
110
|
+
- !ruby/object:Gem::Dependency
|
|
111
|
+
name: typhoeus
|
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
|
113
|
+
requirements:
|
|
114
|
+
- - "~>"
|
|
115
|
+
- !ruby/object:Gem::Version
|
|
116
|
+
version: '1.4'
|
|
117
|
+
type: :runtime
|
|
118
|
+
prerelease: false
|
|
119
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
120
|
+
requirements:
|
|
121
|
+
- - "~>"
|
|
122
|
+
- !ruby/object:Gem::Version
|
|
123
|
+
version: '1.4'
|
|
83
124
|
- !ruby/object:Gem::Dependency
|
|
84
125
|
name: byebug
|
|
85
126
|
requirement: !ruby/object:Gem::Requirement
|
|
86
127
|
requirements:
|
|
87
128
|
- - "~>"
|
|
88
129
|
- !ruby/object:Gem::Version
|
|
89
|
-
version: '
|
|
130
|
+
version: '12.0'
|
|
90
131
|
type: :development
|
|
91
132
|
prerelease: false
|
|
92
133
|
version_requirements: !ruby/object:Gem::Requirement
|
|
93
134
|
requirements:
|
|
94
135
|
- - "~>"
|
|
95
136
|
- !ruby/object:Gem::Version
|
|
96
|
-
version: '
|
|
137
|
+
version: '12.0'
|
|
97
138
|
- !ruby/object:Gem::Dependency
|
|
98
139
|
name: dotenv
|
|
99
140
|
requirement: !ruby/object:Gem::Requirement
|
|
100
141
|
requirements:
|
|
101
142
|
- - "~>"
|
|
102
143
|
- !ruby/object:Gem::Version
|
|
103
|
-
version: '
|
|
144
|
+
version: '3.1'
|
|
104
145
|
type: :development
|
|
105
146
|
prerelease: false
|
|
106
147
|
version_requirements: !ruby/object:Gem::Requirement
|
|
107
148
|
requirements:
|
|
108
149
|
- - "~>"
|
|
109
150
|
- !ruby/object:Gem::Version
|
|
110
|
-
version: '
|
|
151
|
+
version: '3.1'
|
|
111
152
|
- !ruby/object:Gem::Dependency
|
|
112
153
|
name: maxitest
|
|
113
154
|
requirement: !ruby/object:Gem::Requirement
|
|
114
155
|
requirements:
|
|
115
156
|
- - "~>"
|
|
116
157
|
- !ruby/object:Gem::Version
|
|
117
|
-
version: '
|
|
158
|
+
version: '6.0'
|
|
118
159
|
type: :development
|
|
119
160
|
prerelease: false
|
|
120
161
|
version_requirements: !ruby/object:Gem::Requirement
|
|
121
162
|
requirements:
|
|
122
163
|
- - "~>"
|
|
123
164
|
- !ruby/object:Gem::Version
|
|
124
|
-
version: '
|
|
165
|
+
version: '6.0'
|
|
125
166
|
- !ruby/object:Gem::Dependency
|
|
126
167
|
name: pry
|
|
127
168
|
requirement: !ruby/object:Gem::Requirement
|
|
128
169
|
requirements:
|
|
129
170
|
- - "~>"
|
|
130
171
|
- !ruby/object:Gem::Version
|
|
131
|
-
version: '0.
|
|
172
|
+
version: '0.15'
|
|
132
173
|
type: :development
|
|
133
174
|
prerelease: false
|
|
134
175
|
version_requirements: !ruby/object:Gem::Requirement
|
|
135
176
|
requirements:
|
|
136
177
|
- - "~>"
|
|
137
178
|
- !ruby/object:Gem::Version
|
|
138
|
-
version: '0.
|
|
179
|
+
version: '0.15'
|
|
139
180
|
- !ruby/object:Gem::Dependency
|
|
140
181
|
name: rubocop
|
|
141
182
|
requirement: !ruby/object:Gem::Requirement
|
|
142
183
|
requirements:
|
|
143
184
|
- - "~>"
|
|
144
185
|
- !ruby/object:Gem::Version
|
|
145
|
-
version: '1.
|
|
186
|
+
version: '1.79'
|
|
146
187
|
type: :development
|
|
147
188
|
prerelease: false
|
|
148
189
|
version_requirements: !ruby/object:Gem::Requirement
|
|
149
190
|
requirements:
|
|
150
191
|
- - "~>"
|
|
151
192
|
- !ruby/object:Gem::Version
|
|
152
|
-
version: '1.
|
|
193
|
+
version: '1.79'
|
|
153
194
|
- !ruby/object:Gem::Dependency
|
|
154
195
|
name: toys
|
|
155
196
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -170,14 +211,14 @@ dependencies:
|
|
|
170
211
|
requirements:
|
|
171
212
|
- - "~>"
|
|
172
213
|
- !ruby/object:Gem::Version
|
|
173
|
-
version: '3.
|
|
214
|
+
version: '3.25'
|
|
174
215
|
type: :development
|
|
175
216
|
prerelease: false
|
|
176
217
|
version_requirements: !ruby/object:Gem::Requirement
|
|
177
218
|
requirements:
|
|
178
219
|
- - "~>"
|
|
179
220
|
- !ruby/object:Gem::Version
|
|
180
|
-
version: '3.
|
|
221
|
+
version: '3.25'
|
|
181
222
|
- !ruby/object:Gem::Dependency
|
|
182
223
|
name: yard
|
|
183
224
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -209,13 +250,17 @@ files:
|
|
|
209
250
|
- "./lib/wgit/base.rb"
|
|
210
251
|
- "./lib/wgit/core_ext.rb"
|
|
211
252
|
- "./lib/wgit/crawler.rb"
|
|
253
|
+
- "./lib/wgit/database/adapters/in_memory.rb"
|
|
254
|
+
- "./lib/wgit/database/adapters/mongo_db.rb"
|
|
212
255
|
- "./lib/wgit/database/database.rb"
|
|
213
|
-
- "./lib/wgit/database/
|
|
256
|
+
- "./lib/wgit/database/database_adapter.rb"
|
|
214
257
|
- "./lib/wgit/document.rb"
|
|
215
258
|
- "./lib/wgit/document_extractors.rb"
|
|
216
259
|
- "./lib/wgit/dsl.rb"
|
|
260
|
+
- "./lib/wgit/html_to_text.rb"
|
|
217
261
|
- "./lib/wgit/indexer.rb"
|
|
218
262
|
- "./lib/wgit/logger.rb"
|
|
263
|
+
- "./lib/wgit/model.rb"
|
|
219
264
|
- "./lib/wgit/response.rb"
|
|
220
265
|
- "./lib/wgit/robots_parser.rb"
|
|
221
266
|
- "./lib/wgit/url.rb"
|
|
@@ -256,8 +301,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
256
301
|
- !ruby/object:Gem::Version
|
|
257
302
|
version: '0'
|
|
258
303
|
requirements: []
|
|
259
|
-
rubygems_version: 3.
|
|
260
|
-
signing_key:
|
|
304
|
+
rubygems_version: 3.6.7
|
|
261
305
|
specification_version: 4
|
|
262
306
|
summary: Wgit is a HTML web crawler, written in Ruby, that allows you to programmatically
|
|
263
307
|
extract the data you want from the web.
|
data/lib/wgit/database/model.rb
DELETED
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require_relative '../utils'
|
|
4
|
-
|
|
5
|
-
module Wgit
|
|
6
|
-
# Module used to build the database collection objects, forming a data model.
|
|
7
|
-
module Model
|
|
8
|
-
# The data model for a Wgit::Url collection object and for an embedded
|
|
9
|
-
# 'url' inside a Wgit::Document collection object.
|
|
10
|
-
#
|
|
11
|
-
# @param url [Wgit::Url] The Url data object.
|
|
12
|
-
# @return [Hash] The URL model ready for DB insertion.
|
|
13
|
-
def self.url(url)
|
|
14
|
-
raise 'url must respond_to? :to_h' unless url.respond_to?(:to_h)
|
|
15
|
-
|
|
16
|
-
model = url.to_h
|
|
17
|
-
select_bson_types(model)
|
|
18
|
-
end
|
|
19
|
-
|
|
20
|
-
# The data model for a Wgit::Document collection object.
|
|
21
|
-
#
|
|
22
|
-
# @param doc [Wgit::Document] The Document data object.
|
|
23
|
-
# @return [Hash] The Document model ready for DB insertion.
|
|
24
|
-
def self.document(doc)
|
|
25
|
-
raise 'doc must respond_to? :to_h' unless doc.respond_to?(:to_h)
|
|
26
|
-
|
|
27
|
-
model = doc.to_h(include_html: false, include_score: false)
|
|
28
|
-
model['url'] = url(doc.url) # Expand Url String into full object.
|
|
29
|
-
|
|
30
|
-
select_bson_types(model)
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
# Common fields when inserting a record into the DB.
|
|
34
|
-
#
|
|
35
|
-
# @return [Hash] Insertion fields common to all models.
|
|
36
|
-
def self.common_insert_data
|
|
37
|
-
{
|
|
38
|
-
date_added: Wgit::Utils.time_stamp,
|
|
39
|
-
date_modified: Wgit::Utils.time_stamp
|
|
40
|
-
}
|
|
41
|
-
end
|
|
42
|
-
|
|
43
|
-
# Common fields when updating a record in the DB.
|
|
44
|
-
#
|
|
45
|
-
# @return [Hash] Update fields common to all models.
|
|
46
|
-
def self.common_update_data
|
|
47
|
-
{
|
|
48
|
-
date_modified: Wgit::Utils.time_stamp
|
|
49
|
-
}
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
# Returns the model having removed non bson types (for use with MongoDB).
|
|
53
|
-
#
|
|
54
|
-
# @param model_hash [Hash] The model Hash to sanitize.
|
|
55
|
-
# @return [Hash] The model Hash with non bson types removed.
|
|
56
|
-
def self.select_bson_types(model_hash)
|
|
57
|
-
model_hash.select { |_k, v| v.respond_to?(:bson_type) }
|
|
58
|
-
end
|
|
59
|
-
end
|
|
60
|
-
end
|