indexer101 0.1.0 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data.tar.gz.sig +0 -0
- data/lib/indexer101.rb +138 -9
- metadata +51 -30
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 71b93d7302d137cd83ccd852e8506e349cbe3fc52723eed291c55875597606a3
|
4
|
+
data.tar.gz: 03d8a4f0800dac1ee5d7d7923742d7122898671ea72c9913b46c1d30e356dd35
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f3f8d541e4386651382c21f3936cfaa400f94cdff79729a95bfe7c10949ef628b4e3eddb1b873aecd9e1fc6ec97417ffcc79ab4d8639cec7e5b41ab93ecf2b2f
|
7
|
+
data.tar.gz: 52df9975cfbb4c4b7c1520d6d7e5efa3acc716ee70b11a15762534dbbe8acd70eee858428bb7982d718101c4395df33eb94fb53012e4c76e3839698531a0263c
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data.tar.gz.sig
CHANGED
Binary file
|
data/lib/indexer101.rb
CHANGED
@@ -5,24 +5,34 @@
|
|
5
5
|
require 'c32'
|
6
6
|
require 'thread'
|
7
7
|
require 'thwait'
|
8
|
+
require 'dynarex'
|
9
|
+
require 'dxlite'
|
8
10
|
|
9
11
|
|
10
12
|
class Indexer101
|
11
13
|
using ColouredText
|
12
14
|
|
13
15
|
class Index
|
14
|
-
|
16
|
+
|
15
17
|
attr_reader :h
|
16
|
-
attr_accessor :index
|
18
|
+
attr_accessor :uri_index, :index
|
17
19
|
|
18
20
|
def initialize()
|
21
|
+
|
22
|
+
@uri_index = {} # contains each URI along with the title
|
23
|
+
@index = {} # contains each keyword
|
24
|
+
@h = {} # nested keywords constructed from shared string keys
|
25
|
+
|
19
26
|
end
|
20
27
|
|
21
28
|
def build(a)
|
22
29
|
|
23
30
|
threads = []
|
24
|
-
|
25
|
-
|
31
|
+
|
32
|
+
if @index.empty? then
|
33
|
+
threads << Thread.new do
|
34
|
+
@index = Hash[a.map(&:to_sym).zip([''] * a.length)]
|
35
|
+
end
|
26
36
|
end
|
27
37
|
|
28
38
|
threads << Thread.new { @h = group a }
|
@@ -70,14 +80,14 @@ class Indexer101
|
|
70
80
|
|
71
81
|
end
|
72
82
|
|
73
|
-
def build(a)
|
83
|
+
def build(a=@indexer.index.keys)
|
74
84
|
|
75
85
|
t = Time.now
|
76
86
|
@indexer.build(a)
|
77
87
|
t2 = Time.now - t
|
78
88
|
|
79
89
|
puts "%d words indexed".info % a.length
|
80
|
-
puts "index built in
|
90
|
+
puts ("index built in " + ("%.3f" % t2).brown + " seconds").info
|
81
91
|
|
82
92
|
self
|
83
93
|
end
|
@@ -97,7 +107,7 @@ class Indexer101
|
|
97
107
|
t2 = Time.now - t
|
98
108
|
|
99
109
|
puts "index contains %d words".info % @indexer.index.length
|
100
|
-
puts "index read in %.2f
|
110
|
+
puts "index read in " + ("%.2f" % t2).brown + " seconds".info
|
101
111
|
|
102
112
|
end
|
103
113
|
|
@@ -108,8 +118,86 @@ class Indexer101
|
|
108
118
|
end
|
109
119
|
|
110
120
|
end
|
121
|
+
|
122
|
+
# scan levels: 0 = tags only; 1 = all words in title (including tags)
|
123
|
+
#
|
124
|
+
def scan_dxindex(*locations, level: 0)
|
125
|
+
|
126
|
+
t = Time.now
|
127
|
+
threads = locations.flatten.map do |location|
|
128
|
+
|
129
|
+
Thread.new {
|
130
|
+
|
131
|
+
Thread.current[:v] = case File.extname(location)
|
132
|
+
when '.xml'
|
133
|
+
Dynarex.new location, debug: @debug
|
134
|
+
when '.json'
|
135
|
+
DxLite.new location, debug: @debug
|
136
|
+
end
|
137
|
+
}
|
138
|
+
end
|
139
|
+
|
140
|
+
ThreadsWait.all_waits(*threads)
|
141
|
+
|
142
|
+
a = threads.map {|x| x[:v]}
|
143
|
+
puts '_a: ' + a.inspect if @debug
|
144
|
+
t2 = Time.now - t
|
145
|
+
puts ("dxindex documents loaded in " + ("%.2f" % t2).brown \
|
146
|
+
+ " seconds").info
|
147
|
+
|
148
|
+
|
149
|
+
id = 1
|
150
|
+
|
151
|
+
a.each do |dx|
|
152
|
+
|
153
|
+
id2 = id
|
154
|
+
|
155
|
+
if @debug then
|
156
|
+
puts 'dx: ' + dx.class.inspect
|
157
|
+
puts 'dx.all: ' + dx.all.inspect
|
158
|
+
end
|
159
|
+
|
160
|
+
@indexer.uri_index.merge! Hash[dx.all.reverse.map.with_index \
|
161
|
+
{|x,i| [id+i, [Time.parse(x.created), x.title, x.url]]}]
|
162
|
+
|
163
|
+
dx.all.reverse.each do |x|
|
164
|
+
|
165
|
+
case level
|
166
|
+
when 0
|
167
|
+
|
168
|
+
x.title.scan(/(\#\w+)/).flatten(1).each do |keyword|
|
169
|
+
@indexer.index[keyword.downcase.to_sym] ||= []
|
170
|
+
@indexer.index[keyword.downcase.to_sym] << id2
|
171
|
+
end
|
172
|
+
|
173
|
+
when 1
|
174
|
+
|
175
|
+
# \u{A3} = £ <- represented as Unicode to avoid ASCII to UTF-8 error
|
176
|
+
x.title.split(/[\s:"!\?\(\)\u{A3}]+(?=[\w#_'-]+)/).each do |keyword|
|
177
|
+
@indexer.index[keyword.downcase.to_sym] ||= []
|
178
|
+
@indexer.index[keyword.downcase.to_sym] << id2
|
179
|
+
end
|
180
|
+
|
181
|
+
end
|
182
|
+
|
183
|
+
id2 += 1
|
184
|
+
|
185
|
+
end
|
186
|
+
|
187
|
+
id = id2
|
188
|
+
|
189
|
+
end
|
190
|
+
|
191
|
+
end
|
192
|
+
|
193
|
+
def uri_index()
|
194
|
+
@indexer.uri_index
|
195
|
+
end
|
111
196
|
|
112
|
-
|
197
|
+
# enter a few starting characters and lookup will suggest a few keywords
|
198
|
+
# useful for an auto suggest feature
|
199
|
+
#
|
200
|
+
def lookup(s, limit: 10)
|
113
201
|
|
114
202
|
t = Time.now
|
115
203
|
a = scan_path s
|
@@ -124,7 +212,48 @@ class Indexer101
|
|
124
212
|
|
125
213
|
results = scan_leaves(r).sort_by(&:length).take(limit)
|
126
214
|
t2 = Time.now - t
|
127
|
-
puts "
|
215
|
+
puts ("lookup took " + ("%.3f" % t2).brown + " seconds").info
|
216
|
+
|
217
|
+
return results
|
218
|
+
|
219
|
+
end
|
220
|
+
|
221
|
+
# enter the exact keywords to search from the index
|
222
|
+
#
|
223
|
+
def search(*keywords, minchars: 3)
|
224
|
+
|
225
|
+
t = Time.now
|
226
|
+
|
227
|
+
r = keywords.flatten(1).map do |x|
|
228
|
+
|
229
|
+
a = []
|
230
|
+
a += @indexer.index[x.to_sym].reverse if @indexer.index.has_key? x.to_sym
|
231
|
+
|
232
|
+
if x.length >= minchars then
|
233
|
+
a += @indexer.index.keys.grep(/^#{x}/i).flat_map\
|
234
|
+
{|y| @indexer.index[y].reverse}
|
235
|
+
a += @indexer.index.keys.grep(/#{x}/i).flat_map\
|
236
|
+
{|y| @indexer.index[y].reverse}
|
237
|
+
end
|
238
|
+
|
239
|
+
puts ('a: ' + a.inspect).debug if @debug
|
240
|
+
|
241
|
+
a.uniq.map {|y| @indexer.uri_index[y]}
|
242
|
+
|
243
|
+
end
|
244
|
+
|
245
|
+
# group by number of results found, sort by count, then by date
|
246
|
+
a3 = r.flatten(1).group_by(&:last).to_a.sort do |x, x2|
|
247
|
+
-([x.last.length, x.last.first] <=> [x2.last.length, x2.last.first])
|
248
|
+
end
|
249
|
+
|
250
|
+
# fetch the 1st record from each group item
|
251
|
+
results = a3.map {|x| x.last.first}
|
252
|
+
|
253
|
+
t2 = Time.now - t
|
254
|
+
puts ("found %s results" % results.length).info
|
255
|
+
puts ("search took " + ("%.3f" % t2).brown + " seconds").info
|
256
|
+
puts
|
128
257
|
|
129
258
|
return results
|
130
259
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indexer101
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -11,52 +11,72 @@ cert_chain:
|
|
11
11
|
- |
|
12
12
|
-----BEGIN CERTIFICATE-----
|
13
13
|
MIIEXjCCAsagAwIBAgIBATANBgkqhkiG9w0BAQsFADAsMSowKAYDVQQDDCFnZW1t
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
14
|
+
YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjEwMjIyMDAwNzQ2WhcN
|
15
|
+
MjIwMjIyMDAwNzQ2WjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
|
16
|
+
cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQDqAwJO
|
17
|
+
ET6LAnOZB9q04zgLVFB0gJTcuLvfwjf7vpH9gCa5uqshSJnfi0owkeS2Hao0OwD+
|
18
|
+
vJrRRYbPfmXHAhEV8l9bSE6Ul1uTT9A+XS7g724sgOm5tCKFkLu+rcDy25MSjqpD
|
19
|
+
q+cPG4SN3ZUGK5eR9tp//dzdrjCV2wsOaoYKPajVY698p+sRf1zsHsSMxYnJPD/8
|
20
|
+
IkeNC+3VdsJFQ7wAoSk4hSpDuIi1xknA61/elDy5O07r1M25PJMntBE6QpJZblvw
|
21
|
+
v5u8U7+nK9P82KFfUwAjqkrhizt90M+0eK6dG44PnqafnxF84K2v7Qr1W6hKMIeL
|
22
|
+
DcqKPjAop+DO8WPCtfKFcFQKGRSe+H2rej3h34eGPH/GVoh8/h+ZuoDUfQRfoWLb
|
23
|
+
zUTM4uC1XwMlRjg8W4uYgV8SWZ5eii3tpmUz7moKQ4k9DSNNpGO+/bk/IcMVbRkb
|
24
|
+
xG/LzBAa6JQyAtVw9AMq8WHZKtJeNrdthoJSczsurthCHb0nY7VUQ/pp9JsCAwEA
|
25
|
+
AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUVn1uBRaL
|
26
|
+
Zh8+3WB077Lz84bokZowJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
|
27
27
|
c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1hc3RlckBqYW1lc3JvYmVydHNvbi5ldTAN
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
28
|
+
BgkqhkiG9w0BAQsFAAOCAYEAUzwCxgrA0YiJvXoi1ZHlhhz+ROzRn6XSQZZVd0Ym
|
29
|
+
gQVkUcvb/iiMnE0PZdivLiorRjhdR5tIPAYhmuN7Mr5IscQFdb1cndyC+qUzy6zP
|
30
|
+
HJGDGqqHDtiYpWlQ3/VAD4V+mAYj67CTj8gM2Y0OfjOIzKLf4jeLzcR0XjxCS9bH
|
31
|
+
g3cF/0FdJ5ydwo6r9QW/mE5yej6yuWBD9NFjWVbV/TAY3rWWDtw9g1WG31HjZVRB
|
32
|
+
lYndPAx0WIUBse5IRDGTiQ1JuMI5vBrxYJCb1Je506nR2rktACDRVSe/DTM4sxZn
|
33
|
+
oP3LBd1hPOAhNya8tD4FmUjQg4tvuWwIKh55XorZVEkzTWGgAJSnu7XTxtPcjxFA
|
34
|
+
U/3nmRr1BTYMN96T+3L81oqJTW5CxAAlsR97O7H8eZhwnNdG9HjgAk4PwiMLOhPb
|
35
|
+
Ely2/UitUG79uLcra+83gWVYzYiqBYC2d5HR4vCpTeecqYFXjWo9E3LMrvyB5Unk
|
36
|
+
zn10wjI+T1ysW7U6t+VJft8s
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date:
|
38
|
+
date: 2021-02-22 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
|
-
name:
|
41
|
+
name: dynarex
|
42
42
|
requirement: !ruby/object:Gem::Requirement
|
43
43
|
requirements:
|
44
|
+
- - "~>"
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '1.8'
|
44
47
|
- - ">="
|
45
48
|
- !ruby/object:Gem::Version
|
46
|
-
version:
|
49
|
+
version: 1.8.25
|
50
|
+
type: :runtime
|
51
|
+
prerelease: false
|
52
|
+
version_requirements: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - "~>"
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: '1.8'
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: 1.8.25
|
60
|
+
- !ruby/object:Gem::Dependency
|
61
|
+
name: dxlite
|
62
|
+
requirement: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
47
64
|
- - "~>"
|
48
65
|
- !ruby/object:Gem::Version
|
49
66
|
version: '0.2'
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 0.2.7
|
50
70
|
type: :runtime
|
51
71
|
prerelease: false
|
52
72
|
version_requirements: !ruby/object:Gem::Requirement
|
53
73
|
requirements:
|
54
|
-
- - ">="
|
55
|
-
- !ruby/object:Gem::Version
|
56
|
-
version: 0.2.0
|
57
74
|
- - "~>"
|
58
75
|
- !ruby/object:Gem::Version
|
59
76
|
version: '0.2'
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: 0.2.7
|
60
80
|
description:
|
61
81
|
email: james@jamesrobertson.eu
|
62
82
|
executables: []
|
@@ -83,7 +103,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
83
103
|
- !ruby/object:Gem::Version
|
84
104
|
version: '0'
|
85
105
|
requirements: []
|
86
|
-
|
106
|
+
rubyforge_project:
|
107
|
+
rubygems_version: 2.7.10
|
87
108
|
signing_key:
|
88
109
|
specification_version: 4
|
89
110
|
summary: Experimental gem to search a list of words 1 character at a time. Intended
|
metadata.gz.sig
CHANGED
Binary file
|