indexer101 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data.tar.gz.sig +0 -0
- data/lib/indexer101.rb +47 -13
- metadata +1 -1
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '059a4e8a4809e20f53e1d35100dad97936de2da80e2477650fb7efd0841c1cc2'
|
4
|
+
data.tar.gz: 215873766f61840776a8d3540556a034536dd71621593b928af8cb100861f973
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 54afb23966c1b323821dca57198fb0b45a88e5f0de514fd48361ec1fbb59c1903152d2854c30fb73b03b78f96b582d174ae6211cd1174105c465bff92665be55
|
7
|
+
data.tar.gz: 864f8fa54173d1e769e70fbc688bd48c073aefb9aaf4b076d3e327f0c5c4c686303c1b1ab2a64295868aea8eced05bae276949edda6f2a2edb57427b2185ed58
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data.tar.gz.sig
CHANGED
Binary file
|
data/lib/indexer101.rb
CHANGED
@@ -118,7 +118,9 @@ class Indexer101
|
|
118
118
|
|
119
119
|
end
|
120
120
|
|
121
|
-
|
121
|
+
# scan levels: 0 = tags only; 1 = all words in title (including tags)
|
122
|
+
#
|
123
|
+
def scan_dxindex(*locations, level: 0)
|
122
124
|
|
123
125
|
t = Time.now
|
124
126
|
threads = locations.flatten.map do |location|
|
@@ -132,18 +134,41 @@ class Indexer101
|
|
132
134
|
puts ("dxindex documents loaded in " + ("%.2f" % t2).brown \
|
133
135
|
+ " seconds").info
|
134
136
|
|
135
|
-
|
137
|
+
|
138
|
+
id = 1
|
139
|
+
|
140
|
+
a.each do |dx|
|
141
|
+
|
142
|
+
id2 = id
|
136
143
|
|
137
144
|
@indexer.uri_index.merge! Hash[dx.all.reverse.map.with_index \
|
138
|
-
|
145
|
+
{|x,i| [id+i, [Time.parse(x.created), x.title, x.url]]}]
|
139
146
|
|
140
|
-
dx.all.reverse.each
|
141
|
-
|
142
|
-
|
143
|
-
|
147
|
+
dx.all.reverse.each do |x|
|
148
|
+
|
149
|
+
case level
|
150
|
+
when 0
|
151
|
+
|
152
|
+
x.title.scan(/(\#\w+)/).flatten(1).each do |keyword|
|
153
|
+
@indexer.index[keyword.downcase.to_sym] ||= []
|
154
|
+
@indexer.index[keyword.downcase.to_sym] << id2
|
155
|
+
end
|
156
|
+
|
157
|
+
when 1
|
158
|
+
|
159
|
+
x.title.split(/[\s:"!\?\(\)£]+(?=[\w#_'-]+)/).each do |keyword|
|
160
|
+
@indexer.index[keyword.downcase.to_sym] ||= []
|
161
|
+
@indexer.index[keyword.downcase.to_sym] << id2
|
162
|
+
end
|
163
|
+
|
144
164
|
end
|
165
|
+
|
166
|
+
id2 += 1
|
167
|
+
|
145
168
|
end
|
146
169
|
|
170
|
+
id = id2
|
171
|
+
|
147
172
|
end
|
148
173
|
|
149
174
|
end
|
@@ -182,23 +207,32 @@ class Indexer101
|
|
182
207
|
|
183
208
|
t = Time.now
|
184
209
|
|
185
|
-
|
210
|
+
r = keywords.flatten(1).map do |x|
|
186
211
|
|
187
212
|
a = []
|
188
213
|
a += @indexer.index[x.to_sym].reverse if @indexer.index.has_key? x.to_sym
|
189
214
|
|
190
215
|
if x.length > 3 then
|
191
|
-
a += @indexer.index.keys.
|
192
|
-
{|y| @indexer.index[y]}
|
193
|
-
a += @indexer.index.keys.
|
194
|
-
{|y| @indexer.index[y]}
|
216
|
+
a += @indexer.index.keys.grep(/^#{x}/i).flat_map\
|
217
|
+
{|y| @indexer.index[y].reverse}
|
218
|
+
a += @indexer.index.keys.grep(/#{x}/i).flat_map\
|
219
|
+
{|y| @indexer.index[y].reverse}
|
195
220
|
end
|
196
221
|
|
197
222
|
puts ('a: ' + a.inspect).debug if @debug
|
198
|
-
a.uniq.map {|y| @indexer.uri_index[y].split(/\s+(?=https?[^\s]+$)/,2) }
|
199
223
|
|
224
|
+
a.uniq.map {|y| @indexer.uri_index[y]}
|
225
|
+
|
226
|
+
end
|
227
|
+
|
228
|
+
# group by number of results found, sort by count, then by date
|
229
|
+
a3 = r.flatten(1).group_by(&:last).to_a.sort do |x, x2|
|
230
|
+
-([x.last.length, x.last.first] <=> [x2.last.length, x2.last.first])
|
200
231
|
end
|
201
232
|
|
233
|
+
# fetch the 1st record from each group item
|
234
|
+
results = a3.map {|x| x.last.first}
|
235
|
+
|
202
236
|
t2 = Time.now - t
|
203
237
|
puts ("found %s results" % results.length).info
|
204
238
|
puts ("search took " + ("%.3f" % t2).brown + " seconds").info
|
metadata
CHANGED
metadata.gz.sig
CHANGED
Binary file
|