indexer101 0.2.0 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data.tar.gz.sig +0 -0
- data/lib/indexer101.rb +75 -16
- metadata +48 -27
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 58debc39c10c1dff7e46220800a6f063ee8ae0c2b7e47c5dc1161b1c2555d64d
|
4
|
+
data.tar.gz: e7b468fb8b030e9a6710625f7e3f6061e30b7d0ae0f371d5977c01f027ab3b32
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 480085f3742927accef9c583ef778bedaef2d08c113b87a4fb109fd910ef3b662324b3a35b7cce4c86f3a8d59043f49f03a69e3bfd5d0620a23ba354212e764d
|
7
|
+
data.tar.gz: 62f4cd3287838fb3e25d8c54ba74259b03f40d4bafbcb51198b6ad47c3d67adc0f6dcfcf72eddc771be60356253e05b644152b4b5d3b20bb0662226c4299f12e
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data.tar.gz.sig
CHANGED
Binary file
|
data/lib/indexer101.rb
CHANGED
@@ -6,6 +6,7 @@ require 'c32'
|
|
6
6
|
require 'thread'
|
7
7
|
require 'thwait'
|
8
8
|
require 'dynarex'
|
9
|
+
require 'dxlite'
|
9
10
|
|
10
11
|
|
11
12
|
class Indexer101
|
@@ -118,32 +119,81 @@ class Indexer101
|
|
118
119
|
|
119
120
|
end
|
120
121
|
|
121
|
-
|
122
|
+
# scan levels: 0 = tags only; 1 = all words in title (including tags)
|
123
|
+
#
|
124
|
+
def scan_dxindex(*locations, level: 0)
|
122
125
|
|
123
126
|
t = Time.now
|
124
127
|
threads = locations.flatten.map do |location|
|
125
|
-
|
128
|
+
|
129
|
+
Thread.new {
|
130
|
+
|
131
|
+
if location.is_a?(Dynarex) or location.is_a?(DxLite) then
|
132
|
+
|
133
|
+
Thread.current[:v] = location
|
134
|
+
|
135
|
+
elsif location.is_a? String
|
136
|
+
|
137
|
+
case File.extname(location)
|
138
|
+
when '.xml'
|
139
|
+
Thread.current[:v] = Dynarex.new location, debug: @debug
|
140
|
+
when '.json'
|
141
|
+
Thread.current[:v] = DxLite.new location, debug: @debug
|
142
|
+
end
|
143
|
+
|
144
|
+
end
|
145
|
+
}
|
126
146
|
end
|
127
147
|
|
128
148
|
ThreadsWait.all_waits(*threads)
|
129
149
|
|
130
150
|
a = threads.map {|x| x[:v]}
|
151
|
+
puts '_a: ' + a.inspect if @debug
|
131
152
|
t2 = Time.now - t
|
132
153
|
puts ("dxindex documents loaded in " + ("%.2f" % t2).brown \
|
133
154
|
+ " seconds").info
|
134
155
|
|
135
|
-
|
156
|
+
|
157
|
+
id = 1
|
158
|
+
|
159
|
+
a.each do |dx|
|
160
|
+
|
161
|
+
id2 = id
|
162
|
+
|
163
|
+
if @debug then
|
164
|
+
puts 'dx: ' + dx.class.inspect
|
165
|
+
puts 'dx.all: ' + dx.all.inspect
|
166
|
+
end
|
136
167
|
|
137
168
|
@indexer.uri_index.merge! Hash[dx.all.reverse.map.with_index \
|
138
|
-
|
169
|
+
{|x,i| [id+i, [Time.parse(x.created), x.title, x.url]]}]
|
139
170
|
|
140
|
-
dx.all.reverse.each
|
141
|
-
|
142
|
-
|
143
|
-
|
171
|
+
dx.all.reverse.each do |x|
|
172
|
+
|
173
|
+
case level
|
174
|
+
when 0
|
175
|
+
|
176
|
+
x.title.scan(/(\#\w+)/).flatten(1).each do |keyword|
|
177
|
+
@indexer.index[keyword.downcase.to_sym] ||= []
|
178
|
+
@indexer.index[keyword.downcase.to_sym] << id2
|
179
|
+
end
|
180
|
+
|
181
|
+
when 1
|
182
|
+
|
183
|
+
# \u{A3} = £ <- represented as Unicode to avoid ASCII to UTF-8 error
|
184
|
+
x.title.split(/[\s:"!\?\(\)\u{A3}]+(?=[\w#_'-]+)/).each do |keyword|
|
185
|
+
@indexer.index[keyword.downcase.to_sym] ||= []
|
186
|
+
@indexer.index[keyword.downcase.to_sym] << id2
|
187
|
+
end
|
188
|
+
|
144
189
|
end
|
190
|
+
|
191
|
+
id2 += 1
|
192
|
+
|
145
193
|
end
|
146
194
|
|
195
|
+
id = id2
|
196
|
+
|
147
197
|
end
|
148
198
|
|
149
199
|
end
|
@@ -178,27 +228,36 @@ class Indexer101
|
|
178
228
|
|
179
229
|
# enter the exact keywords to search from the index
|
180
230
|
#
|
181
|
-
def search(*keywords)
|
231
|
+
def search(*keywords, minchars: 3)
|
182
232
|
|
183
233
|
t = Time.now
|
184
234
|
|
185
|
-
|
235
|
+
r = keywords.flatten(1).map do |x|
|
186
236
|
|
187
237
|
a = []
|
188
238
|
a += @indexer.index[x.to_sym].reverse if @indexer.index.has_key? x.to_sym
|
189
239
|
|
190
|
-
if x.length
|
191
|
-
a += @indexer.index.keys.
|
192
|
-
{|y| @indexer.index[y]}
|
193
|
-
a += @indexer.index.keys.
|
194
|
-
{|y| @indexer.index[y]}
|
240
|
+
if x.length >= minchars then
|
241
|
+
a += @indexer.index.keys.grep(/^#{x}/i).flat_map\
|
242
|
+
{|y| @indexer.index[y].reverse}
|
243
|
+
a += @indexer.index.keys.grep(/#{x}/i).flat_map\
|
244
|
+
{|y| @indexer.index[y].reverse}
|
195
245
|
end
|
196
246
|
|
197
247
|
puts ('a: ' + a.inspect).debug if @debug
|
198
|
-
a.uniq.map {|y| @indexer.uri_index[y].split(/\s+(?=https?[^\s]+$)/,2) }
|
199
248
|
|
249
|
+
a.uniq.map {|y| @indexer.uri_index[y]}
|
250
|
+
|
251
|
+
end
|
252
|
+
|
253
|
+
# group by number of results found, sort by count, then by date
|
254
|
+
a3 = r.flatten(1).group_by(&:last).to_a.sort do |x, x2|
|
255
|
+
-([x.last.length, x.last.first] <=> [x2.last.length, x2.last.first])
|
200
256
|
end
|
201
257
|
|
258
|
+
# fetch the 1st record from each group item
|
259
|
+
results = a3.map {|x| x.last.first}
|
260
|
+
|
202
261
|
t2 = Time.now - t
|
203
262
|
puts ("found %s results" % results.length).info
|
204
263
|
puts ("search took " + ("%.3f" % t2).brown + " seconds").info
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indexer101
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -11,31 +11,31 @@ cert_chain:
|
|
11
11
|
- |
|
12
12
|
-----BEGIN CERTIFICATE-----
|
13
13
|
MIIEXjCCAsagAwIBAgIBATANBgkqhkiG9w0BAQsFADAsMSowKAYDVQQDDCFnZW1t
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
14
|
+
YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjEwMjIyMDAwNzQ2WhcN
|
15
|
+
MjIwMjIyMDAwNzQ2WjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
|
16
|
+
cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQDqAwJO
|
17
|
+
ET6LAnOZB9q04zgLVFB0gJTcuLvfwjf7vpH9gCa5uqshSJnfi0owkeS2Hao0OwD+
|
18
|
+
vJrRRYbPfmXHAhEV8l9bSE6Ul1uTT9A+XS7g724sgOm5tCKFkLu+rcDy25MSjqpD
|
19
|
+
q+cPG4SN3ZUGK5eR9tp//dzdrjCV2wsOaoYKPajVY698p+sRf1zsHsSMxYnJPD/8
|
20
|
+
IkeNC+3VdsJFQ7wAoSk4hSpDuIi1xknA61/elDy5O07r1M25PJMntBE6QpJZblvw
|
21
|
+
v5u8U7+nK9P82KFfUwAjqkrhizt90M+0eK6dG44PnqafnxF84K2v7Qr1W6hKMIeL
|
22
|
+
DcqKPjAop+DO8WPCtfKFcFQKGRSe+H2rej3h34eGPH/GVoh8/h+ZuoDUfQRfoWLb
|
23
|
+
zUTM4uC1XwMlRjg8W4uYgV8SWZ5eii3tpmUz7moKQ4k9DSNNpGO+/bk/IcMVbRkb
|
24
|
+
xG/LzBAa6JQyAtVw9AMq8WHZKtJeNrdthoJSczsurthCHb0nY7VUQ/pp9JsCAwEA
|
25
|
+
AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUVn1uBRaL
|
26
|
+
Zh8+3WB077Lz84bokZowJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
|
27
27
|
c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1hc3RlckBqYW1lc3JvYmVydHNvbi5ldTAN
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
28
|
+
BgkqhkiG9w0BAQsFAAOCAYEAUzwCxgrA0YiJvXoi1ZHlhhz+ROzRn6XSQZZVd0Ym
|
29
|
+
gQVkUcvb/iiMnE0PZdivLiorRjhdR5tIPAYhmuN7Mr5IscQFdb1cndyC+qUzy6zP
|
30
|
+
HJGDGqqHDtiYpWlQ3/VAD4V+mAYj67CTj8gM2Y0OfjOIzKLf4jeLzcR0XjxCS9bH
|
31
|
+
g3cF/0FdJ5ydwo6r9QW/mE5yej6yuWBD9NFjWVbV/TAY3rWWDtw9g1WG31HjZVRB
|
32
|
+
lYndPAx0WIUBse5IRDGTiQ1JuMI5vBrxYJCb1Je506nR2rktACDRVSe/DTM4sxZn
|
33
|
+
oP3LBd1hPOAhNya8tD4FmUjQg4tvuWwIKh55XorZVEkzTWGgAJSnu7XTxtPcjxFA
|
34
|
+
U/3nmRr1BTYMN96T+3L81oqJTW5CxAAlsR97O7H8eZhwnNdG9HjgAk4PwiMLOhPb
|
35
|
+
Ely2/UitUG79uLcra+83gWVYzYiqBYC2d5HR4vCpTeecqYFXjWo9E3LMrvyB5Unk
|
36
|
+
zn10wjI+T1ysW7U6t+VJft8s
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date:
|
38
|
+
date: 2021-02-22 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: dynarex
|
@@ -46,7 +46,7 @@ dependencies:
|
|
46
46
|
version: '1.8'
|
47
47
|
- - ">="
|
48
48
|
- !ruby/object:Gem::Version
|
49
|
-
version: 1.8.
|
49
|
+
version: 1.8.25
|
50
50
|
type: :runtime
|
51
51
|
prerelease: false
|
52
52
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -56,7 +56,27 @@ dependencies:
|
|
56
56
|
version: '1.8'
|
57
57
|
- - ">="
|
58
58
|
- !ruby/object:Gem::Version
|
59
|
-
version: 1.8.
|
59
|
+
version: 1.8.25
|
60
|
+
- !ruby/object:Gem::Dependency
|
61
|
+
name: dxlite
|
62
|
+
requirement: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - "~>"
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: '0.2'
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 0.2.7
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - "~>"
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0.2'
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: 0.2.7
|
60
80
|
description:
|
61
81
|
email: james@jamesrobertson.eu
|
62
82
|
executables: []
|
@@ -83,7 +103,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
83
103
|
- !ruby/object:Gem::Version
|
84
104
|
version: '0'
|
85
105
|
requirements: []
|
86
|
-
|
106
|
+
rubyforge_project:
|
107
|
+
rubygems_version: 2.7.10
|
87
108
|
signing_key:
|
88
109
|
specification_version: 4
|
89
110
|
summary: Experimental gem to search a list of words 1 character at a time. Intended
|
metadata.gz.sig
CHANGED
Binary file
|