wgit 0.11.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,204 @@
1
+ require_relative "../../utils"
2
+ require_relative "../../url"
3
+ require_relative "../../document"
4
+ require_relative "../../model"
5
+ require_relative "../database_adapter"
6
+
7
module Wgit::Database
  # Database implementer class for in-memory (RAM) storage. This DB is mainly
  # used for testing and experimenting with.
  #
  # Both collections are held in Concurrent::Array instances, which is what
  # the thread safety of this DB rests on.
  # NOTE(review): compound operations such as #upsert (an index lookup
  # followed by a separate write) are not wrapped in a lock, so they are
  # presumably not atomic across threads - confirm before relying on this
  # under heavy contention.
  class InMemory < DatabaseAdapter
    # Initializes a thread safe InMemory Database instance.
    #
    # @param connection_string [String] Not used but needed to adhere to the
    #   DatabaseAdapter interface.
    def initialize(connection_string = nil)
      # Inits @urls and @docs vars.
      initialize_store

      # Bare super forwards connection_string to DatabaseAdapter#initialize.
      super
    end

    # Overrides Object#inspect to display collection sizes.
    #
    # @return [String] A short textual representation of this object.
    def inspect
      # Two adjacent literals (rather than a backslash-newline inside a single
      # literal) so that source indentation can never leak into the output.
      "#<Wgit::Database::InMemory num_urls=#{@urls.size} " \
        "num_docs=#{@docs.size} size=#{size}>"
    end

    # The Wgit::Url's collection stored as an in-memory Concurrent::Array.
    # The raw url Hashes are passed through #map_urls along with any given
    # block.
    def urls(&block)
      map_urls(@urls, &block)
    end

    # The Wgit::Document's collection stored as an in-memory Concurrent::Array.
    # The raw doc Hashes are passed through #map_documents along with any given
    # block.
    def docs(&block)
      map_documents(@docs, &block)
    end

    # The raw url Hashes, not mapped into their corresponding Wgit objects.
    #
    # @return [Concurrent::Array<Hash>] The underlying urls collection itself
    #   (not a copy).
    def url_hashes
      @urls
    end

    # The raw doc Hashes, not mapped into their corresponding Wgit objects.
    #
    # @return [Concurrent::Array<Hash>] The underlying docs collection itself
    #   (not a copy).
    def doc_hashes
      @docs
    end

    # Returns the current size of the in-memory database, measured as the
    # number of bytes in the String representation of both collections.
    # An empty database will return a size of 4 because there are 4 bytes in
    # two empty arrays ("[]" for each of the urls and docs collections).
    #
    # @return [Integer] The current size of the in-memory DB in bytes.
    def size
      # bytesize (not size/length) so multibyte characters are counted as the
      # bytes they occupy rather than as single characters.
      @urls.to_s.bytesize + @docs.to_s.bytesize
    end

    # Searches the database's Document#text for the given query. The returned
    # Documents are sorted for relevance, starting with the most relevant. Each
    # Document's #score value will be set accordingly.
    #
    # @param query [Regexp, #to_s] The regex or text value to search each
    #   document's @text for.
    # @param case_sensitive [Boolean] Whether character case must match.
    # @param whole_sentence [Boolean] Whether multiple words should be searched
    #   for separately.
    # @param limit [Integer] The max number of results to return. A value that
    #   isn't positive (e.g. 0) returns all results.
    # @param skip [Integer] The number of results to skip.
    # @yield [doc] Given each search result (Wgit::Document) returned from the
    #   DB.
    # @return [Array<Wgit::Document>] The search results obtained from the DB.
    def search(
      query, case_sensitive: false, whole_sentence: true,
      limit: 10, skip: 0, &block
    )
      regex = Wgit::Utils.build_search_regex(
        query, case_sensitive:, whole_sentence:
      )

      # Search the Wgit::Document's, not the raw Hashes.
      results = docs.select do |doc|
        score = 0
        doc.search(regex, case_sensitive:, whole_sentence:) do |results_hash|
          score = results_hash.values.sum
        end
        # A zero score means no match, so the doc is filtered out.
        next false if score.zero?

        doc.instance_variable_set :@score, score
        true
      end

      return [] if results.empty?

      # Highest score first.
      results = results.sort_by { |doc| -doc.score }

      # Slicing past the end of the Array yields nil, meaning nothing is left.
      results = results[skip..]
      return [] unless results

      results = results[0...limit] if limit.positive?
      results.each(&block) if block_given?

      results
    end

    # Deletes everything in the urls and documents collections.
    #
    # @return [Integer] The number of deleted records.
    def empty
      previous_size = @urls.size + @docs.size
      # Swap in fresh, empty collections.
      initialize_store

      previous_size
    end

    # Returns Url records that haven't yet been crawled.
    #
    # @param limit [Integer] The max number of Url's to return. 0 returns all.
    # @param skip [Integer] Skip n amount of Url's.
    # @yield [url] Given each Url object (Wgit::Url) returned from the DB.
    # @return [Array<Wgit::Url>] The uncrawled Urls obtained from the DB.
    def uncrawled_urls(limit: 0, skip: 0, &block)
      uncrawled = @urls.reject { |url| url["crawled"] }

      # Slicing past the end of the Array yields nil, meaning nothing is left.
      uncrawled = uncrawled[skip..]
      return [] unless uncrawled

      uncrawled = uncrawled[0...limit] if limit.positive?
      map_urls(uncrawled, &block)
    end

    # Inserts or updates the object in the in-memory database. A record
    # already stored under the same url is replaced, otherwise the record is
    # appended to its collection.
    #
    # @param obj [Wgit::Url, Wgit::Document] The obj/record to insert/update.
    # @raise [StandardError] If obj isn't a Wgit::Url or Wgit::Document.
    # @return [Boolean] True if inserted, false if updated.
    def upsert(obj)
      collection, index, model = get_model_info(obj)

      if index
        collection[index] = model
        false
      else
        collection << model
        true
      end
    end

    # Bulk upserts the objects in the in-memory database collection.
    # You cannot mix collection objs types, all must be Urls or Documents.
    #
    # @param objs [Array<Wgit::Url>, Array<Wgit::Document>] The objs to be
    #   inserted/updated.
    # @return [Integer] The total number of newly inserted objects.
    def bulk_upsert(objs)
      assert_common_arr_types(objs, [Wgit::Url, Wgit::Document])

      # #upsert returns true only for a new insert, so counting the truthy
      # results gives the number of newly inserted records.
      objs.count { |obj| upsert(obj) }
    end

    private

    # Creates a new Concurrent::Array for each collection.
    def initialize_store
      @urls = Concurrent::Array.new
      @docs = Concurrent::Array.new
    end

    # Get the database's model info (collection, index, model) for obj.
    #
    # Use like:
    # ```
    # collection, index, model = get_model_info(obj)
    # ```
    #
    # Raises an error if obj isn't a Wgit::Url or Wgit::Document.
    #
    # @param obj [Wgit::Url, Wgit::Document] The obj to get semantics for.
    # @raise [StandardError] If obj isn't a Wgit::Url or Wgit::Document.
    # @return [Array] A three element Array containing: the collection
    #   (Concurrent::Array) that obj belongs to, the obj's index in that
    #   collection (nil if it isn't in the collection) and the model built
    #   from obj.
    def get_model_info(obj)
      # Work on a copy so the caller's object is left untouched.
      obj = obj.dup

      case obj
      when Wgit::Url
        key = obj.to_s
        collection = @urls
        index = @urls.index { |url| url["url"] == key }
      when Wgit::Document
        key = obj.url.to_s
        collection = @docs
        # Doc hashes nest their url record, hence the two level lookup.
        index = @docs.index { |doc| doc["url"]&.[]("url") == key }
      else
        raise "obj must be a Wgit::Url or Wgit::Document, not: #{obj.class}"
      end

      [collection, index, build_model(obj)]
    end
  end
end