wgit 0.11.0 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +48 -0
- data/CODE_OF_CONDUCT.md +1 -1
- data/CONTRIBUTING.md +1 -1
- data/README.md +27 -24
- data/bin/wgit +72 -18
- data/lib/wgit/assertable.rb +33 -6
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +91 -20
- data/lib/wgit/database/adapters/in_memory.rb +204 -0
- data/lib/wgit/database/adapters/mongo_db.rb +627 -0
- data/lib/wgit/database/database.rb +18 -663
- data/lib/wgit/database/database_adapter.rb +147 -0
- data/lib/wgit/document.rb +187 -77
- data/lib/wgit/document_extractors.rb +15 -23
- data/lib/wgit/dsl.rb +64 -67
- data/lib/wgit/html_to_text.rb +277 -0
- data/lib/wgit/indexer.rb +29 -10
- data/lib/wgit/logger.rb +2 -2
- data/lib/wgit/model.rb +164 -0
- data/lib/wgit/response.rb +5 -8
- data/lib/wgit/robots_parser.rb +8 -8
- data/lib/wgit/url.rb +38 -38
- data/lib/wgit/utils.rb +124 -14
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +18 -14
- metadata +74 -30
- data/lib/wgit/database/model.rb +0 -60
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
require_relative "../../utils"
|
|
2
|
+
require_relative "../../url"
|
|
3
|
+
require_relative "../../document"
|
|
4
|
+
require_relative "../../model"
|
|
5
|
+
require_relative "../database_adapter"
|
|
6
|
+
|
|
7
|
+
module Wgit::Database
|
|
8
|
+
# Database implementer class for in-memory (RAM) storage. This DB is mainly used
|
|
9
|
+
# for testing and experimenting with. This DB is thread safe.
|
|
10
|
+
class InMemory < DatabaseAdapter
|
|
11
|
+
# Builds a new InMemory database whose urls and docs collections start out
# empty.
#
# @param connection_string [String] Unused by this adapter; it is accepted
#   only so the signature matches the DatabaseAdapter interface.
def initialize(connection_string = nil)
  # Create @urls and @docs before handing over to the parent initializer.
  initialize_store

  super(connection_string)
end
|
|
21
|
+
|
|
22
|
+
# Overrides Object#inspect to display the collection sizes.
#
# @return [String] A short textual representation of this object, made up
#   of the number of url records, the number of doc records and #size.
def inspect
  url_count = @urls.size
  doc_count = @docs.size

  "#<Wgit::Database::InMemory num_urls=#{url_count} " \
    "num_docs=#{doc_count} size=#{size}>"
end
|
|
29
|
+
|
|
30
|
+
# The urls collection (held in memory as @urls) passed through map_urls.
#
# @yield Forwarded unchanged to map_urls.
# @return Whatever map_urls returns for the stored records - presumably
#   Wgit::Url objects; confirm against DatabaseAdapter.
def urls(&block) = map_urls(@urls, &block)
|
|
34
|
+
|
|
35
|
+
# The documents collection (held in memory as @docs) passed through
# map_documents.
#
# @yield Forwarded unchanged to map_documents.
# @return Whatever map_documents returns for the stored records - presumably
#   Wgit::Document objects; confirm against DatabaseAdapter.
def docs(&block) = map_documents(@docs, &block)
|
|
39
|
+
|
|
40
|
+
# The url records exactly as stored, i.e. not mapped into Wgit objects.
#
# @return The underlying @urls collection itself (not a copy).
def url_hashes = @urls
|
|
44
|
+
|
|
45
|
+
# The document records exactly as stored, i.e. not mapped into Wgit objects.
#
# @return The underlying @docs collection itself (not a copy).
def doc_hashes = @docs
|
|
49
|
+
|
|
50
|
+
# Returns the current size of the in-memory database, measured as the
# combined length of the String representations (#to_s) of the urls and
# docs collections.
# An empty database therefore has a size of 4: each empty collection
# renders as "[]", which is 2 characters long.
#
# @return [Integer] The current size of the in-memory DB.
def size
  [@urls, @docs].sum { |collection| collection.to_s.size }
end
|
|
58
|
+
|
|
59
|
+
# Searches every stored Document for the given query and returns the
# matches ordered by relevance (highest score first). A Document matches
# when its #search yields a results Hash whose values sum to a non-zero
# score; that sum is stored on the Document as @score.
#
# @param query [Regexp, #to_s] The regex or text value to search each
#   document's @text for.
# @param case_sensitive [Boolean] Whether character case must match.
# @param whole_sentence [Boolean] Whether multiple words should be searched
#   for separately.
# @param limit [Integer] The max number of results to return. A limit that
#   isn't positive returns all results.
# @param skip [Integer] The number of results to skip.
# @yield [doc] Given each search result (Wgit::Document) being returned.
# @return [Array<Wgit::Document>] The search results obtained from the DB.
def search(
  query, case_sensitive: false, whole_sentence: true,
  limit: 10, skip: 0, &block
)
  regex = Wgit::Utils.build_search_regex(
    query, case_sensitive:, whole_sentence:
  )

  # Work with the Wgit::Document objects rather than the raw Hashes.
  matches = docs.select do |doc|
    score = 0
    doc.search(regex, case_sensitive:, whole_sentence:) do |hits|
      score = hits.values.sum
    end

    # Only matching documents have their @score assigned.
    doc.instance_variable_set(:@score, score) unless score.zero?
    !score.zero?
  end
  return [] if matches.empty?

  # Most relevant first; the slice is nil when skip runs past the end.
  page = matches.sort_by { |doc| -doc.score }[skip..]
  return [] if page.nil?

  page = page.first(limit) if limit > 0
  page.each(&block) if block_given?

  page
end
|
|
104
|
+
|
|
105
|
+
# Clears both the urls and documents collections by replacing them with
# fresh, empty ones.
#
# @return [Integer] The number of deleted records, i.e. the combined number
#   of url and doc records held before the reset.
def empty
  # tap returns the count taken *before* the store is re-initialised.
  (@urls.size + @docs.size).tap { initialize_store }
end
|
|
114
|
+
|
|
115
|
+
# Returns the Url records that haven't yet been crawled, i.e. those whose
# "crawled" value isn't truthy.
#
# @param limit [Integer] The max number of Url's to return. 0 returns all.
# @param skip [Integer] Skip n amount of Url's.
# @yield [url] Forwarded to map_urls along with the selected records.
# @return [Array<Wgit::Url>] The uncrawled Urls obtained from the DB.
def uncrawled_urls(limit: 0, skip: 0, &block)
  # The slice is nil when skip runs past the end of the pending records.
  pending = @urls.reject { |record| record["crawled"] }[skip..]
  return [] if pending.nil?

  pending = pending.first(limit) if limit > 0
  map_urls(pending, &block)
end
|
|
129
|
+
|
|
130
|
+
# Inserts the object into the in-memory database, or replaces its existing
# record when get_model_info finds one.
#
# @param obj [Wgit::Url, Wgit::Document] The obj/record to insert/update.
# @return [Boolean] True if inserted, false if updated.
def upsert(obj)
  collection, index, model = get_model_info(obj)

  # No index means there's no existing record: append a new one.
  unless index
    collection << model
    return true
  end

  collection[index] = model
  false
end
|
|
145
|
+
|
|
146
|
+
# Bulk upserts the objects in the in-memory database collection.
# You cannot mix collection objs types, all must be Urls or Documents.
#
# @param objs [Array<Wgit::Url>, Array<Wgit::Document>] The objs to be
#   inserted/updated.
# @return [Integer] The total number of newly inserted objects.
def bulk_upsert(objs)
  assert_common_arr_types(objs, [Wgit::Url, Wgit::Document])

  # upsert returns true only for newly inserted records, so counting its
  # truthy results gives the number of inserts (updates aren't counted).
  objs.count { |obj| upsert(obj) }
end
|
|
160
|
+
|
|
161
|
+
private
|
|
162
|
+
|
|
163
|
+
# (Re)creates the store: @urls and @docs are each assigned their own new,
# empty Concurrent::Array.
def initialize_store
  @urls, @docs = Array.new(2) { Concurrent::Array.new }
end
|
|
168
|
+
|
|
169
|
+
# Get the database's model info (collection, index, model) for obj.
#
# Use like:
# ```
# collection, index, model = get_model_info(obj)
# ```
#
# Raises an error if obj isn't a Wgit::Url or Wgit::Document.
#
# @param obj [Wgit::Url, Wgit::Document] The obj to get semantics for.
# @raise [StandardError] If obj isn't a Wgit::Url or Wgit::Document.
# @return [Array] A three element Array made up of: the collection obj
#   belongs to (@urls or @docs), the Integer index of obj's existing record
#   in that collection (nil if there isn't one) and the model built for obj
#   by build_model.
def get_model_info(obj)
  # Work on a copy - presumably so build_model can't affect the caller's
  # object; TODO confirm.
  obj = obj.dup

  case obj
  when Wgit::Url
    key = obj.to_s
    collection = @urls
    index = collection.index { |url| url["url"] == key }
  when Wgit::Document
    key = obj.url.to_s
    collection = @docs
    # A doc record nests its url record, so match on doc["url"]["url"]
    # while tolerating a record without a "url" value.
    index = collection.index { |doc| doc["url"]&.[]("url") == key }
  else
    raise "obj must be a Wgit::Url or Wgit::Document, not: #{obj.class}"
  end

  [collection, index, build_model(obj)]
end
|
|
203
|
+
end
|
|
204
|
+
end
|