wgit 0.11.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wgit/utils.rb CHANGED
@@ -18,17 +18,15 @@ module Wgit
18
18
  # keys.
19
19
  # @return [Hash] A Hash created from obj's instance vars and values.
20
20
  def self.to_h(obj, ignore: [], use_strings_as_keys: true)
21
- hash = {}
22
-
23
- obj.instance_variables.each do |var|
24
- next if ignore.include?(var.to_s)
21
+ obj.instance_variables.reduce({}) do |hash, var|
22
+ next hash if ignore.include?(var.to_s)
25
23
 
26
24
  key = var.to_s[1..] # Remove the @ prefix.
27
25
  key = key.to_sym unless use_strings_as_keys
28
26
  hash[key] = obj.instance_variable_get(var)
29
- end
30
27
 
31
- hash
28
+ hash
29
+ end
32
30
  end
33
31
 
34
32
  # An improved :each method which supports both singleton and Enumerable
@@ -127,13 +125,19 @@ module Wgit
127
125
  end
128
126
 
129
127
  # Prints out the search results in a search engine like format.
130
- # The format for each result looks like:
128
+ #
129
+ # The given results should be matching documents from a DB and should have
130
+ # `doc.search_text!` called for each document - to turn doc.text into only
131
+ # matching text, which this method uses.
132
+ #
133
+ # The format for each result looks something like:
131
134
  #
132
135
  # ```
133
136
  # Title
134
137
  # Keywords (if there are some)
135
- # Text Snippet (formatted to show the searched for query, if provided)
138
+ # Text Snippet (formatted to show the searched for query)
136
139
  # URL
140
+ # Score (if include_score: true)
137
141
  # <empty_line_separator>
138
142
  # ```
139
143
  #
@@ -142,28 +146,91 @@ module Wgit
142
146
  # the search results). The first @text sentence gets printed.
143
147
  # @param keyword_limit [Integer] The max amount of keywords to be
144
148
  # outputted to the stream.
149
+ # @param include_score [Boolean] Whether or not to output the document score.
145
150
  # @param stream [#puts] Any object that respond_to?(:puts). It is used
146
151
  # to output text somewhere e.g. a file or STDERR.
147
152
  # @return [Integer] The number of results.
148
- def self.pprint_search_results(results, keyword_limit: 5, stream: $stdout)
153
+ def self.pprint_top_search_results(
154
+ results, keyword_limit: 5, include_score: false, stream: $stdout
155
+ )
149
156
  raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
150
157
 
151
158
  results.each do |doc|
152
- title = (doc.title || '<no title>')
159
+ title = doc.title || '<no title>'
153
160
  keywords = doc.keywords&.take(keyword_limit)&.join(', ')
154
161
  sentence = doc.text.first
155
162
  url = doc.url
163
+ score = doc.score
156
164
 
157
165
  stream.puts title
158
166
  stream.puts keywords if keywords
159
167
  stream.puts sentence
160
168
  stream.puts url
169
+ stream.puts score if include_score
161
170
  stream.puts
162
171
  end
163
172
 
164
173
  results.size
165
174
  end
166
175
 
176
+ # Prints out the search results listing all of the matching text in each
177
+ # document.
178
+ #
179
+ # The given results should be matching documents from a DB and should have
180
+ # `doc.search_text!` called for each document - to turn doc.text into only
181
+ # matching text, which this method uses.
182
+ #
183
+ # The format for each result looks something like:
184
+ #
185
+ # ```
186
+ # Title
187
+ # Keywords (if there are some)
188
+ # URL
189
+ # Score (if include_score: true)
190
+ #
191
+ # "<text_snippet_1>"
192
+ # "<text_snippet_2>"
193
+ # ...
194
+ #
195
+ # <separator>
196
+ # ```
197
+ #
198
+ # @param results [Array<Wgit::Document>] Array of Wgit::Document's which
199
+ # each have had #search!(query) called (to update its @text with the
200
+ # search results). Every entry of @text gets printed.
201
+ # @param keyword_limit [Integer] The max amount of keywords to be
202
+ # outputted to the stream.
203
+ # @param include_score [Boolean] Whether or not to output the document score.
204
+ # @param stream [#puts] Any object that respond_to?(:puts). It is used
205
+ # to output text somewhere e.g. a file or STDERR.
206
+ # @return [Integer] The number of results.
207
+ def self.pprint_all_search_results(
208
+ results, keyword_limit: 5, include_score: false, stream: $stdout
209
+ )
210
+ raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
211
+
212
+ results.each_with_index do |doc, i|
213
+ last_result = i == (results.size-1)
214
+
215
+ title = doc.title || '<no title>'
216
+ keywords = doc.keywords&.take(keyword_limit)&.join(', ')
217
+ url = doc.url
218
+ score = doc.score
219
+
220
+ stream.puts title
221
+ stream.puts keywords if keywords
222
+ stream.puts url
223
+ stream.puts score if include_score
224
+ stream.puts
225
+ doc.text.each { |text| stream.puts text }
226
+ stream.puts
227
+ stream.puts "-----" unless last_result
228
+ stream.puts unless last_result
229
+ end
230
+
231
+ results.size
232
+ end
233
+
167
234
  # Sanitises the obj to make it uniform by calling the correct sanitize_*
168
235
  # method for its type e.g. if obj.is_a? String then sanitize_str(obj) is called.
169
236
  # Any type not in the case statement will be ignored and returned as is.
@@ -230,6 +297,42 @@ module Wgit
230
297
  .uniq
231
298
  end
232
299
 
300
+ # Build a regular expression from a query string, for searching text with.
301
+ #
302
+ # All searches using this regex are always whole word based while whole
303
+ # sentence searches are configurable using the whole_sentence: param. For
304
+ # example:
305
+ #
306
+ # ```
307
+ # text = "hello world"
308
+ # query = "world hello", whole_sentence: true # => No match
309
+ # query = "world hello", whole_sentence: false # => Match
310
+ # query = "he" # => Never matches
311
+ # ```
312
+ #
313
+ # @param query [String, Regexp] The query string to build a regex from.
314
+ # @param case_sensitive [Boolean] Whether character case must match.
315
+ # @param whole_sentence [Boolean] Whether multiple words must match together, in
316
+ # the given order (true), or are searched for separately, in any order (false).
317
+ # @return [Regexp] The regex with which to search text.
318
+ def self.build_search_regex(
319
+ query, case_sensitive: false, whole_sentence: true
320
+ )
321
+ return query if query.is_a?(Regexp)
322
+
323
+ # query: "hello world", whole_sentence: false produces:
324
+ # (?<=^|\s|[^a-zA-Z0-9])hello(?=$|\s|[^a-zA-Z0-9])|(?<=^|\s|[^a-zA-Z0-9])world(?=$|\s|[^a-zA-Z0-9])
325
+
326
+ sep = whole_sentence ? " " : "|"
327
+ segs = query.split(" ").map do |word|
328
+ word = Regexp.escape(word)
329
+ "(?<=^|\\s|[^a-zA-Z0-9])#{word}(?=$|\\s|[^a-zA-Z0-9])"
330
+ end
331
+ query = segs.join(sep)
332
+
333
+ Regexp.new(query, !case_sensitive)
334
+ end
335
+
233
336
  # Pretty prints a log statement, used for debugging purposes.
234
337
  #
235
338
  # Use like:
@@ -245,25 +348,30 @@ module Wgit
245
348
  # ```
246
349
  #
247
350
  # @param identifier [#to_s] A log identifier e.g. "START" or 1 etc.
351
+ # @param display [Boolean] Setting as false will cause a noop, useful for
352
+ # switching off several/all pprint statements at once e.g. via ENV var.
248
353
  # @param stream [#puts] Any object that respond_to? :puts and :print. It is
249
354
  # used to output the log text somewhere e.g. a file or STDERR.
250
355
  # @param prefix [String] The log prefix, useful for visibility/grepping.
251
356
  # @param new_line [Boolean] Whether or not to use a new line (\n) as the
252
357
  # separator.
253
358
  # @param vars [Hash<#inspect, #inspect>] The vars to inspect in the log.
254
- def self.pprint(identifier, stream: $stdout, prefix: 'DEBUG', new_line: false, **vars)
359
+ def self.pprint(identifier, display: true, stream: $stdout, prefix: 'DEBUG', new_line: false, **vars)
360
+ return unless display
361
+
255
362
  sep1 = new_line ? "\n" : ' - '
363
+ sep1 = '' if vars.empty?
256
364
  sep2 = new_line ? "\n" : ' | '
257
365
 
258
366
  stream.print "\n#{prefix}_#{identifier}#{sep1}"
259
367
 
260
368
  vars.each_with_index do |arr, i|
261
- last_item = (i + 1) == vars.size
369
+ is_last_item = (i + 1) == vars.size
262
370
  sep3 = sep2
263
- sep3 = new_line ? "\n" : '' if last_item
371
+ sep3 = new_line ? "\n" : '' if is_last_item
264
372
  k, v = arr
265
373
 
266
- stream.print "#{k}: #{v}#{sep3}"
374
+ stream.print "#{k}: #{v.inspect}#{sep3}"
267
375
  end
268
376
 
269
377
  stream.puts "\n"
@@ -271,5 +379,7 @@ module Wgit
271
379
 
272
380
  nil
273
381
  end
382
+
383
+ self.singleton_class.alias_method :pprint_search_results, :pprint_top_search_results
274
384
  end
275
385
  end
data/lib/wgit/version.rb CHANGED
@@ -6,7 +6,7 @@
6
6
  # @author Michael Telford
7
7
  module Wgit
8
8
  # The current gem version of Wgit.
9
- VERSION = '0.11.0'
9
+ VERSION = "0.12.1"
10
10
 
11
11
  # Returns the current gem version of Wgit as a String.
12
12
  def self.version
data/lib/wgit.rb CHANGED
@@ -1,17 +1,21 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative 'wgit/version'
4
- require_relative 'wgit/logger'
5
- require_relative 'wgit/assertable'
6
- require_relative 'wgit/utils'
7
- require_relative 'wgit/url'
8
- require_relative 'wgit/document'
9
- require_relative 'wgit/document_extractors'
10
- require_relative 'wgit/crawler'
11
- require_relative 'wgit/database/model'
12
- require_relative 'wgit/database/database'
13
- require_relative 'wgit/robots_parser'
14
- require_relative 'wgit/indexer'
15
- require_relative 'wgit/dsl'
16
- require_relative 'wgit/base'
3
+ require_relative "wgit/version"
4
+ require_relative "wgit/logger"
5
+ require_relative "wgit/assertable"
6
+ require_relative "wgit/utils"
7
+ require_relative "wgit/url"
8
+ require_relative "wgit/html_to_text"
9
+ require_relative "wgit/document"
10
+ require_relative "wgit/document_extractors"
11
+ require_relative "wgit/crawler"
12
+ require_relative "wgit/model"
13
+ require_relative "wgit/database/database"
14
+ require_relative "wgit/database/database_adapter"
15
+ require_relative "wgit/database/adapters/mongo_db"
16
+ require_relative "wgit/database/adapters/in_memory"
17
+ require_relative "wgit/robots_parser"
18
+ require_relative "wgit/indexer"
19
+ require_relative "wgit/dsl"
20
+ require_relative "wgit/base"
17
21
  # require_relative 'wgit/core_ext' - Must be explicitly required.
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wgit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.0
4
+ version: 0.12.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2024-01-19 00:00:00.000000000 Z
10
+ date: 2025-08-18 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: addressable
@@ -25,131 +24,173 @@ dependencies:
25
24
  - !ruby/object:Gem::Version
26
25
  version: '2.8'
27
26
  - !ruby/object:Gem::Dependency
28
- name: mongo
27
+ name: base64
29
28
  requirement: !ruby/object:Gem::Requirement
30
29
  requirements:
31
30
  - - "~>"
32
31
  - !ruby/object:Gem::Version
33
- version: '2.19'
32
+ version: '0.3'
34
33
  type: :runtime
35
34
  prerelease: false
36
35
  version_requirements: !ruby/object:Gem::Requirement
37
36
  requirements:
38
37
  - - "~>"
39
38
  - !ruby/object:Gem::Version
40
- version: '2.19'
39
+ version: '0.3'
41
40
  - !ruby/object:Gem::Dependency
42
- name: nokogiri
41
+ name: benchmark
43
42
  requirement: !ruby/object:Gem::Requirement
44
43
  requirements:
45
44
  - - "~>"
46
45
  - !ruby/object:Gem::Version
47
- version: '1.15'
46
+ version: '0.4'
48
47
  type: :runtime
49
48
  prerelease: false
50
49
  version_requirements: !ruby/object:Gem::Requirement
51
50
  requirements:
52
51
  - - "~>"
53
52
  - !ruby/object:Gem::Version
54
- version: '1.15'
53
+ version: '0.4'
55
54
  - !ruby/object:Gem::Dependency
56
- name: typhoeus
55
+ name: ferrum
57
56
  requirement: !ruby/object:Gem::Requirement
58
57
  requirements:
59
58
  - - "~>"
60
59
  - !ruby/object:Gem::Version
61
- version: '1.4'
60
+ version: '0.17'
62
61
  type: :runtime
63
62
  prerelease: false
64
63
  version_requirements: !ruby/object:Gem::Requirement
65
64
  requirements:
66
65
  - - "~>"
67
66
  - !ruby/object:Gem::Version
68
- version: '1.4'
67
+ version: '0.17'
69
68
  - !ruby/object:Gem::Dependency
70
- name: ferrum
69
+ name: logger
71
70
  requirement: !ruby/object:Gem::Requirement
72
71
  requirements:
73
72
  - - "~>"
74
73
  - !ruby/object:Gem::Version
75
- version: '0.14'
74
+ version: '1.7'
76
75
  type: :runtime
77
76
  prerelease: false
78
77
  version_requirements: !ruby/object:Gem::Requirement
79
78
  requirements:
80
79
  - - "~>"
81
80
  - !ruby/object:Gem::Version
82
- version: '0.14'
81
+ version: '1.7'
82
+ - !ruby/object:Gem::Dependency
83
+ name: mongo
84
+ requirement: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - "~>"
87
+ - !ruby/object:Gem::Version
88
+ version: '2.21'
89
+ type: :runtime
90
+ prerelease: false
91
+ version_requirements: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - "~>"
94
+ - !ruby/object:Gem::Version
95
+ version: '2.21'
96
+ - !ruby/object:Gem::Dependency
97
+ name: nokogiri
98
+ requirement: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - "~>"
101
+ - !ruby/object:Gem::Version
102
+ version: '1.18'
103
+ type: :runtime
104
+ prerelease: false
105
+ version_requirements: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: '1.18'
110
+ - !ruby/object:Gem::Dependency
111
+ name: typhoeus
112
+ requirement: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - "~>"
115
+ - !ruby/object:Gem::Version
116
+ version: '1.4'
117
+ type: :runtime
118
+ prerelease: false
119
+ version_requirements: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - "~>"
122
+ - !ruby/object:Gem::Version
123
+ version: '1.4'
83
124
  - !ruby/object:Gem::Dependency
84
125
  name: byebug
85
126
  requirement: !ruby/object:Gem::Requirement
86
127
  requirements:
87
128
  - - "~>"
88
129
  - !ruby/object:Gem::Version
89
- version: '11.1'
130
+ version: '12.0'
90
131
  type: :development
91
132
  prerelease: false
92
133
  version_requirements: !ruby/object:Gem::Requirement
93
134
  requirements:
94
135
  - - "~>"
95
136
  - !ruby/object:Gem::Version
96
- version: '11.1'
137
+ version: '12.0'
97
138
  - !ruby/object:Gem::Dependency
98
139
  name: dotenv
99
140
  requirement: !ruby/object:Gem::Requirement
100
141
  requirements:
101
142
  - - "~>"
102
143
  - !ruby/object:Gem::Version
103
- version: '2.8'
144
+ version: '3.1'
104
145
  type: :development
105
146
  prerelease: false
106
147
  version_requirements: !ruby/object:Gem::Requirement
107
148
  requirements:
108
149
  - - "~>"
109
150
  - !ruby/object:Gem::Version
110
- version: '2.8'
151
+ version: '3.1'
111
152
  - !ruby/object:Gem::Dependency
112
153
  name: maxitest
113
154
  requirement: !ruby/object:Gem::Requirement
114
155
  requirements:
115
156
  - - "~>"
116
157
  - !ruby/object:Gem::Version
117
- version: '5.4'
158
+ version: '6.0'
118
159
  type: :development
119
160
  prerelease: false
120
161
  version_requirements: !ruby/object:Gem::Requirement
121
162
  requirements:
122
163
  - - "~>"
123
164
  - !ruby/object:Gem::Version
124
- version: '5.4'
165
+ version: '6.0'
125
166
  - !ruby/object:Gem::Dependency
126
167
  name: pry
127
168
  requirement: !ruby/object:Gem::Requirement
128
169
  requirements:
129
170
  - - "~>"
130
171
  - !ruby/object:Gem::Version
131
- version: '0.14'
172
+ version: '0.15'
132
173
  type: :development
133
174
  prerelease: false
134
175
  version_requirements: !ruby/object:Gem::Requirement
135
176
  requirements:
136
177
  - - "~>"
137
178
  - !ruby/object:Gem::Version
138
- version: '0.14'
179
+ version: '0.15'
139
180
  - !ruby/object:Gem::Dependency
140
181
  name: rubocop
141
182
  requirement: !ruby/object:Gem::Requirement
142
183
  requirements:
143
184
  - - "~>"
144
185
  - !ruby/object:Gem::Version
145
- version: '1.57'
186
+ version: '1.79'
146
187
  type: :development
147
188
  prerelease: false
148
189
  version_requirements: !ruby/object:Gem::Requirement
149
190
  requirements:
150
191
  - - "~>"
151
192
  - !ruby/object:Gem::Version
152
- version: '1.57'
193
+ version: '1.79'
153
194
  - !ruby/object:Gem::Dependency
154
195
  name: toys
155
196
  requirement: !ruby/object:Gem::Requirement
@@ -170,14 +211,14 @@ dependencies:
170
211
  requirements:
171
212
  - - "~>"
172
213
  - !ruby/object:Gem::Version
173
- version: '3.19'
214
+ version: '3.25'
174
215
  type: :development
175
216
  prerelease: false
176
217
  version_requirements: !ruby/object:Gem::Requirement
177
218
  requirements:
178
219
  - - "~>"
179
220
  - !ruby/object:Gem::Version
180
- version: '3.19'
221
+ version: '3.25'
181
222
  - !ruby/object:Gem::Dependency
182
223
  name: yard
183
224
  requirement: !ruby/object:Gem::Requirement
@@ -209,13 +250,17 @@ files:
209
250
  - "./lib/wgit/base.rb"
210
251
  - "./lib/wgit/core_ext.rb"
211
252
  - "./lib/wgit/crawler.rb"
253
+ - "./lib/wgit/database/adapters/in_memory.rb"
254
+ - "./lib/wgit/database/adapters/mongo_db.rb"
212
255
  - "./lib/wgit/database/database.rb"
213
- - "./lib/wgit/database/model.rb"
256
+ - "./lib/wgit/database/database_adapter.rb"
214
257
  - "./lib/wgit/document.rb"
215
258
  - "./lib/wgit/document_extractors.rb"
216
259
  - "./lib/wgit/dsl.rb"
260
+ - "./lib/wgit/html_to_text.rb"
217
261
  - "./lib/wgit/indexer.rb"
218
262
  - "./lib/wgit/logger.rb"
263
+ - "./lib/wgit/model.rb"
219
264
  - "./lib/wgit/response.rb"
220
265
  - "./lib/wgit/robots_parser.rb"
221
266
  - "./lib/wgit/url.rb"
@@ -256,8 +301,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
256
301
  - !ruby/object:Gem::Version
257
302
  version: '0'
258
303
  requirements: []
259
- rubygems_version: 3.5.3
260
- signing_key:
304
+ rubygems_version: 3.6.7
261
305
  specification_version: 4
262
306
  summary: Wgit is a HTML web crawler, written in Ruby, that allows you to programmatically
263
307
  extract the data you want from the web.
@@ -1,60 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative '../utils'
4
-
5
- module Wgit
6
- # Module used to build the database collection objects, forming a data model.
7
- module Model
8
- # The data model for a Wgit::Url collection object and for an embedded
9
- # 'url' inside a Wgit::Document collection object.
10
- #
11
- # @param url [Wgit::Url] The Url data object.
12
- # @return [Hash] The URL model ready for DB insertion.
13
- def self.url(url)
14
- raise 'url must respond_to? :to_h' unless url.respond_to?(:to_h)
15
-
16
- model = url.to_h
17
- select_bson_types(model)
18
- end
19
-
20
- # The data model for a Wgit::Document collection object.
21
- #
22
- # @param doc [Wgit::Document] The Document data object.
23
- # @return [Hash] The Document model ready for DB insertion.
24
- def self.document(doc)
25
- raise 'doc must respond_to? :to_h' unless doc.respond_to?(:to_h)
26
-
27
- model = doc.to_h(include_html: false, include_score: false)
28
- model['url'] = url(doc.url) # Expand Url String into full object.
29
-
30
- select_bson_types(model)
31
- end
32
-
33
- # Common fields when inserting a record into the DB.
34
- #
35
- # @return [Hash] Insertion fields common to all models.
36
- def self.common_insert_data
37
- {
38
- date_added: Wgit::Utils.time_stamp,
39
- date_modified: Wgit::Utils.time_stamp
40
- }
41
- end
42
-
43
- # Common fields when updating a record in the DB.
44
- #
45
- # @return [Hash] Update fields common to all models.
46
- def self.common_update_data
47
- {
48
- date_modified: Wgit::Utils.time_stamp
49
- }
50
- end
51
-
52
- # Returns the model having removed non bson types (for use with MongoDB).
53
- #
54
- # @param model_hash [Hash] The model Hash to sanitize.
55
- # @return [Hash] The model Hash with non bson types removed.
56
- def self.select_bson_types(model_hash)
57
- model_hash.select { |_k, v| v.respond_to?(:bson_type) }
58
- end
59
- end
60
- end