indexer101 0.1.0 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data.tar.gz.sig +0 -0
- data/lib/indexer101.rb +138 -9
- metadata +51 -30
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 71b93d7302d137cd83ccd852e8506e349cbe3fc52723eed291c55875597606a3
|
4
|
+
data.tar.gz: 03d8a4f0800dac1ee5d7d7923742d7122898671ea72c9913b46c1d30e356dd35
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f3f8d541e4386651382c21f3936cfaa400f94cdff79729a95bfe7c10949ef628b4e3eddb1b873aecd9e1fc6ec97417ffcc79ab4d8639cec7e5b41ab93ecf2b2f
|
7
|
+
data.tar.gz: 52df9975cfbb4c4b7c1520d6d7e5efa3acc716ee70b11a15762534dbbe8acd70eee858428bb7982d718101c4395df33eb94fb53012e4c76e3839698531a0263c
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data.tar.gz.sig
CHANGED
Binary file
|
data/lib/indexer101.rb
CHANGED
@@ -5,24 +5,34 @@
|
|
5
5
|
require 'c32'
|
6
6
|
require 'thread'
|
7
7
|
require 'thwait'
|
8
|
+
require 'dynarex'
|
9
|
+
require 'dxlite'
|
8
10
|
|
9
11
|
|
10
12
|
class Indexer101
|
11
13
|
using ColouredText
|
12
14
|
|
13
15
|
class Index
|
14
|
-
|
16
|
+
|
15
17
|
attr_reader :h
|
16
|
-
attr_accessor :index
|
18
|
+
attr_accessor :uri_index, :index
|
17
19
|
|
18
20
|
def initialize()
|
21
|
+
|
22
|
+
@uri_index = {} # contains each URI along with the title
|
23
|
+
@index = {} # contains each keyword
|
24
|
+
@h = {} # nested keywords constructed from shared string keys
|
25
|
+
|
19
26
|
end
|
20
27
|
|
21
28
|
def build(a)
|
22
29
|
|
23
30
|
threads = []
|
24
|
-
|
25
|
-
|
31
|
+
|
32
|
+
if @index.empty? then
|
33
|
+
threads << Thread.new do
|
34
|
+
@index = Hash[a.map(&:to_sym).zip([''] * a.length)]
|
35
|
+
end
|
26
36
|
end
|
27
37
|
|
28
38
|
threads << Thread.new { @h = group a }
|
@@ -70,14 +80,14 @@ class Indexer101
|
|
70
80
|
|
71
81
|
end
|
72
82
|
|
73
|
-
def build(a)
|
83
|
+
def build(a=@indexer.index.keys)
|
74
84
|
|
75
85
|
t = Time.now
|
76
86
|
@indexer.build(a)
|
77
87
|
t2 = Time.now - t
|
78
88
|
|
79
89
|
puts "%d words indexed".info % a.length
|
80
|
-
puts "index built in
|
90
|
+
puts ("index built in " + ("%.3f" % t2).brown + " seconds").info
|
81
91
|
|
82
92
|
self
|
83
93
|
end
|
@@ -97,7 +107,7 @@ class Indexer101
|
|
97
107
|
t2 = Time.now - t
|
98
108
|
|
99
109
|
puts "index contains %d words".info % @indexer.index.length
|
100
|
-
puts "index read in %.2f
|
110
|
+
puts "index read in " + ("%.2f" % t2).brown + " seconds".info
|
101
111
|
|
102
112
|
end
|
103
113
|
|
@@ -108,8 +118,86 @@ class Indexer101
|
|
108
118
|
end
|
109
119
|
|
110
120
|
end
|
121
|
+
|
122
|
+
# scan levels: 0 = tags only; 1 = all words in title (including tags)
|
123
|
+
#
|
124
|
+
def scan_dxindex(*locations, level: 0)
|
125
|
+
|
126
|
+
t = Time.now
|
127
|
+
threads = locations.flatten.map do |location|
|
128
|
+
|
129
|
+
Thread.new {
|
130
|
+
|
131
|
+
Thread.current[:v] = case File.extname(location)
|
132
|
+
when '.xml'
|
133
|
+
Dynarex.new location, debug: @debug
|
134
|
+
when '.json'
|
135
|
+
DxLite.new location, debug: @debug
|
136
|
+
end
|
137
|
+
}
|
138
|
+
end
|
139
|
+
|
140
|
+
ThreadsWait.all_waits(*threads)
|
141
|
+
|
142
|
+
a = threads.map {|x| x[:v]}
|
143
|
+
puts '_a: ' + a.inspect if @debug
|
144
|
+
t2 = Time.now - t
|
145
|
+
puts ("dxindex documents loaded in " + ("%.2f" % t2).brown \
|
146
|
+
+ " seconds").info
|
147
|
+
|
148
|
+
|
149
|
+
id = 1
|
150
|
+
|
151
|
+
a.each do |dx|
|
152
|
+
|
153
|
+
id2 = id
|
154
|
+
|
155
|
+
if @debug then
|
156
|
+
puts 'dx: ' + dx.class.inspect
|
157
|
+
puts 'dx.all: ' + dx.all.inspect
|
158
|
+
end
|
159
|
+
|
160
|
+
@indexer.uri_index.merge! Hash[dx.all.reverse.map.with_index \
|
161
|
+
{|x,i| [id+i, [Time.parse(x.created), x.title, x.url]]}]
|
162
|
+
|
163
|
+
dx.all.reverse.each do |x|
|
164
|
+
|
165
|
+
case level
|
166
|
+
when 0
|
167
|
+
|
168
|
+
x.title.scan(/(\#\w+)/).flatten(1).each do |keyword|
|
169
|
+
@indexer.index[keyword.downcase.to_sym] ||= []
|
170
|
+
@indexer.index[keyword.downcase.to_sym] << id2
|
171
|
+
end
|
172
|
+
|
173
|
+
when 1
|
174
|
+
|
175
|
+
# \u{A3} = £ <- represented as Unicode to avoid ASCII to UTF-8 error
|
176
|
+
x.title.split(/[\s:"!\?\(\)\u{A3}]+(?=[\w#_'-]+)/).each do |keyword|
|
177
|
+
@indexer.index[keyword.downcase.to_sym] ||= []
|
178
|
+
@indexer.index[keyword.downcase.to_sym] << id2
|
179
|
+
end
|
180
|
+
|
181
|
+
end
|
182
|
+
|
183
|
+
id2 += 1
|
184
|
+
|
185
|
+
end
|
186
|
+
|
187
|
+
id = id2
|
188
|
+
|
189
|
+
end
|
190
|
+
|
191
|
+
end
|
192
|
+
|
193
|
+
def uri_index()
|
194
|
+
@indexer.uri_index
|
195
|
+
end
|
111
196
|
|
112
|
-
|
197
|
+
# enter a few starting characters and lookup will suggest a few keywords
|
198
|
+
# useful for an auto suggest feature
|
199
|
+
#
|
200
|
+
def lookup(s, limit: 10)
|
113
201
|
|
114
202
|
t = Time.now
|
115
203
|
a = scan_path s
|
@@ -124,7 +212,48 @@ class Indexer101
|
|
124
212
|
|
125
213
|
results = scan_leaves(r).sort_by(&:length).take(limit)
|
126
214
|
t2 = Time.now - t
|
127
|
-
puts "
|
215
|
+
puts ("lookup took " + ("%.3f" % t2).brown + " seconds").info
|
216
|
+
|
217
|
+
return results
|
218
|
+
|
219
|
+
end
|
220
|
+
|
221
|
+
# enter the exact keywords to search from the index
|
222
|
+
#
|
223
|
+
def search(*keywords, minchars: 3)
|
224
|
+
|
225
|
+
t = Time.now
|
226
|
+
|
227
|
+
r = keywords.flatten(1).map do |x|
|
228
|
+
|
229
|
+
a = []
|
230
|
+
a += @indexer.index[x.to_sym].reverse if @indexer.index.has_key? x.to_sym
|
231
|
+
|
232
|
+
if x.length >= minchars then
|
233
|
+
a += @indexer.index.keys.grep(/^#{x}/i).flat_map\
|
234
|
+
{|y| @indexer.index[y].reverse}
|
235
|
+
a += @indexer.index.keys.grep(/#{x}/i).flat_map\
|
236
|
+
{|y| @indexer.index[y].reverse}
|
237
|
+
end
|
238
|
+
|
239
|
+
puts ('a: ' + a.inspect).debug if @debug
|
240
|
+
|
241
|
+
a.uniq.map {|y| @indexer.uri_index[y]}
|
242
|
+
|
243
|
+
end
|
244
|
+
|
245
|
+
# group by number of results found, sort by count, then by date
|
246
|
+
a3 = r.flatten(1).group_by(&:last).to_a.sort do |x, x2|
|
247
|
+
-([x.last.length, x.last.first] <=> [x2.last.length, x2.last.first])
|
248
|
+
end
|
249
|
+
|
250
|
+
# fetch the 1st record from each group item
|
251
|
+
results = a3.map {|x| x.last.first}
|
252
|
+
|
253
|
+
t2 = Time.now - t
|
254
|
+
puts ("found %s results" % results.length).info
|
255
|
+
puts ("search took " + ("%.3f" % t2).brown + " seconds").info
|
256
|
+
puts
|
128
257
|
|
129
258
|
return results
|
130
259
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indexer101
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -11,52 +11,72 @@ cert_chain:
|
|
11
11
|
- |
|
12
12
|
-----BEGIN CERTIFICATE-----
|
13
13
|
MIIEXjCCAsagAwIBAgIBATANBgkqhkiG9w0BAQsFADAsMSowKAYDVQQDDCFnZW1t
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
14
|
+
YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjEwMjIyMDAwNzQ2WhcN
|
15
|
+
MjIwMjIyMDAwNzQ2WjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
|
16
|
+
cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQDqAwJO
|
17
|
+
ET6LAnOZB9q04zgLVFB0gJTcuLvfwjf7vpH9gCa5uqshSJnfi0owkeS2Hao0OwD+
|
18
|
+
vJrRRYbPfmXHAhEV8l9bSE6Ul1uTT9A+XS7g724sgOm5tCKFkLu+rcDy25MSjqpD
|
19
|
+
q+cPG4SN3ZUGK5eR9tp//dzdrjCV2wsOaoYKPajVY698p+sRf1zsHsSMxYnJPD/8
|
20
|
+
IkeNC+3VdsJFQ7wAoSk4hSpDuIi1xknA61/elDy5O07r1M25PJMntBE6QpJZblvw
|
21
|
+
v5u8U7+nK9P82KFfUwAjqkrhizt90M+0eK6dG44PnqafnxF84K2v7Qr1W6hKMIeL
|
22
|
+
DcqKPjAop+DO8WPCtfKFcFQKGRSe+H2rej3h34eGPH/GVoh8/h+ZuoDUfQRfoWLb
|
23
|
+
zUTM4uC1XwMlRjg8W4uYgV8SWZ5eii3tpmUz7moKQ4k9DSNNpGO+/bk/IcMVbRkb
|
24
|
+
xG/LzBAa6JQyAtVw9AMq8WHZKtJeNrdthoJSczsurthCHb0nY7VUQ/pp9JsCAwEA
|
25
|
+
AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUVn1uBRaL
|
26
|
+
Zh8+3WB077Lz84bokZowJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
|
27
27
|
c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1hc3RlckBqYW1lc3JvYmVydHNvbi5ldTAN
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
28
|
+
BgkqhkiG9w0BAQsFAAOCAYEAUzwCxgrA0YiJvXoi1ZHlhhz+ROzRn6XSQZZVd0Ym
|
29
|
+
gQVkUcvb/iiMnE0PZdivLiorRjhdR5tIPAYhmuN7Mr5IscQFdb1cndyC+qUzy6zP
|
30
|
+
HJGDGqqHDtiYpWlQ3/VAD4V+mAYj67CTj8gM2Y0OfjOIzKLf4jeLzcR0XjxCS9bH
|
31
|
+
g3cF/0FdJ5ydwo6r9QW/mE5yej6yuWBD9NFjWVbV/TAY3rWWDtw9g1WG31HjZVRB
|
32
|
+
lYndPAx0WIUBse5IRDGTiQ1JuMI5vBrxYJCb1Je506nR2rktACDRVSe/DTM4sxZn
|
33
|
+
oP3LBd1hPOAhNya8tD4FmUjQg4tvuWwIKh55XorZVEkzTWGgAJSnu7XTxtPcjxFA
|
34
|
+
U/3nmRr1BTYMN96T+3L81oqJTW5CxAAlsR97O7H8eZhwnNdG9HjgAk4PwiMLOhPb
|
35
|
+
Ely2/UitUG79uLcra+83gWVYzYiqBYC2d5HR4vCpTeecqYFXjWo9E3LMrvyB5Unk
|
36
|
+
zn10wjI+T1ysW7U6t+VJft8s
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date:
|
38
|
+
date: 2021-02-22 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
|
-
name:
|
41
|
+
name: dynarex
|
42
42
|
requirement: !ruby/object:Gem::Requirement
|
43
43
|
requirements:
|
44
|
+
- - "~>"
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '1.8'
|
44
47
|
- - ">="
|
45
48
|
- !ruby/object:Gem::Version
|
46
|
-
version:
|
49
|
+
version: 1.8.25
|
50
|
+
type: :runtime
|
51
|
+
prerelease: false
|
52
|
+
version_requirements: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - "~>"
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: '1.8'
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: 1.8.25
|
60
|
+
- !ruby/object:Gem::Dependency
|
61
|
+
name: dxlite
|
62
|
+
requirement: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
47
64
|
- - "~>"
|
48
65
|
- !ruby/object:Gem::Version
|
49
66
|
version: '0.2'
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 0.2.7
|
50
70
|
type: :runtime
|
51
71
|
prerelease: false
|
52
72
|
version_requirements: !ruby/object:Gem::Requirement
|
53
73
|
requirements:
|
54
|
-
- - ">="
|
55
|
-
- !ruby/object:Gem::Version
|
56
|
-
version: 0.2.0
|
57
74
|
- - "~>"
|
58
75
|
- !ruby/object:Gem::Version
|
59
76
|
version: '0.2'
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: 0.2.7
|
60
80
|
description:
|
61
81
|
email: james@jamesrobertson.eu
|
62
82
|
executables: []
|
@@ -83,7 +103,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
83
103
|
- !ruby/object:Gem::Version
|
84
104
|
version: '0'
|
85
105
|
requirements: []
|
86
|
-
|
106
|
+
rubyforge_project:
|
107
|
+
rubygems_version: 2.7.10
|
87
108
|
signing_key:
|
88
109
|
specification_version: 4
|
89
110
|
summary: Experimental gem to search a list of words 1 character at a time. Intended
|
metadata.gz.sig
CHANGED
Binary file
|