indexer101 0.1.0 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f3987fe6c70a6da37dd0cbcb50bf3368601ee1b6ddc6a90da039d22dfa14f28b
4
- data.tar.gz: f7a79a0f09d97948a5576c253c8f2d2e5f58079fdc0830ee4fbeee28c208c3af
3
+ metadata.gz: 71b93d7302d137cd83ccd852e8506e349cbe3fc52723eed291c55875597606a3
4
+ data.tar.gz: 03d8a4f0800dac1ee5d7d7923742d7122898671ea72c9913b46c1d30e356dd35
5
5
  SHA512:
6
- metadata.gz: fe696d43cc8c49962e3bb44a6f55497dd178b527d31cde3e395c2b2c1ef984add180fc503ac7f97fb2f4a716e37f785535972e149b3f6ec0cbeaa5698d64b1c9
7
- data.tar.gz: e591fdfbec8eb9e579473075dabe554c04f163ffa23548746833d27053e5a55e7ba3514b2cd154acf43e8a4d2124f5cd753dd0bfcaa30697cd04d0e487bec318
6
+ metadata.gz: f3f8d541e4386651382c21f3936cfaa400f94cdff79729a95bfe7c10949ef628b4e3eddb1b873aecd9e1fc6ec97417ffcc79ab4d8639cec7e5b41ab93ecf2b2f
7
+ data.tar.gz: 52df9975cfbb4c4b7c1520d6d7e5efa3acc716ee70b11a15762534dbbe8acd70eee858428bb7982d718101c4395df33eb94fb53012e4c76e3839698531a0263c
checksums.yaml.gz.sig CHANGED
Binary file
data.tar.gz.sig CHANGED
Binary file
data/lib/indexer101.rb CHANGED
@@ -5,24 +5,34 @@
5
5
  require 'c32'
6
6
  require 'thread'
7
7
  require 'thwait'
8
+ require 'dynarex'
9
+ require 'dxlite'
8
10
 
9
11
 
10
12
  class Indexer101
11
13
  using ColouredText
12
14
 
13
15
  class Index
14
-
16
+
15
17
  attr_reader :h
16
- attr_accessor :index
18
+ attr_accessor :uri_index, :index
17
19
 
18
20
  def initialize()
21
+
22
+ @uri_index = {} # contains each URI along with the title
23
+ @index = {} # contains each keyword
24
+ @h = {} # nested keywords constructed from shared string keys
25
+
19
26
  end
20
27
 
21
28
  def build(a)
22
29
 
23
30
  threads = []
24
- threads << Thread.new do
25
- @index = Hash[a.map(&:to_sym).zip([''] * a.length)]
31
+
32
+ if @index.empty? then
33
+ threads << Thread.new do
34
+ @index = Hash[a.map(&:to_sym).zip([''] * a.length)]
35
+ end
26
36
  end
27
37
 
28
38
  threads << Thread.new { @h = group a }
@@ -70,14 +80,14 @@ class Indexer101
70
80
 
71
81
  end
72
82
 
73
- def build(a)
83
+ def build(a=@indexer.index.keys)
74
84
 
75
85
  t = Time.now
76
86
  @indexer.build(a)
77
87
  t2 = Time.now - t
78
88
 
79
89
  puts "%d words indexed".info % a.length
80
- puts "index built in %.2f seconds".info % t2
90
+ puts ("index built in " + ("%.3f" % t2).brown + " seconds").info
81
91
 
82
92
  self
83
93
  end
@@ -97,7 +107,7 @@ class Indexer101
97
107
  t2 = Time.now - t
98
108
 
99
109
  puts "index contains %d words".info % @indexer.index.length
100
- puts "index read in %.2f seconds".info % t2
110
+ puts "index read in " + ("%.2f" % t2).brown + " seconds".info
101
111
 
102
112
  end
103
113
 
@@ -108,8 +118,86 @@ class Indexer101
108
118
  end
109
119
 
110
120
  end
121
+
122
+ # scan levels: 0 = tags only; 1 = all words in title (including tags)
123
+ #
124
+ def scan_dxindex(*locations, level: 0)
125
+
126
+ t = Time.now
127
+ threads = locations.flatten.map do |location|
128
+
129
+ Thread.new {
130
+
131
+ Thread.current[:v] = case File.extname(location)
132
+ when '.xml'
133
+ Dynarex.new location, debug: @debug
134
+ when '.json'
135
+ DxLite.new location, debug: @debug
136
+ end
137
+ }
138
+ end
139
+
140
+ ThreadsWait.all_waits(*threads)
141
+
142
+ a = threads.map {|x| x[:v]}
143
+ puts '_a: ' + a.inspect if @debug
144
+ t2 = Time.now - t
145
+ puts ("dxindex documents loaded in " + ("%.2f" % t2).brown \
146
+ + " seconds").info
147
+
148
+
149
+ id = 1
150
+
151
+ a.each do |dx|
152
+
153
+ id2 = id
154
+
155
+ if @debug then
156
+ puts 'dx: ' + dx.class.inspect
157
+ puts 'dx.all: ' + dx.all.inspect
158
+ end
159
+
160
+ @indexer.uri_index.merge! Hash[dx.all.reverse.map.with_index \
161
+ {|x,i| [id+i, [Time.parse(x.created), x.title, x.url]]}]
162
+
163
+ dx.all.reverse.each do |x|
164
+
165
+ case level
166
+ when 0
167
+
168
+ x.title.scan(/(\#\w+)/).flatten(1).each do |keyword|
169
+ @indexer.index[keyword.downcase.to_sym] ||= []
170
+ @indexer.index[keyword.downcase.to_sym] << id2
171
+ end
172
+
173
+ when 1
174
+
175
+ # \u{A3} = £ <- represented as Unicode to avoid ASCII to UTF-8 error
176
+ x.title.split(/[\s:"!\?\(\)\u{A3}]+(?=[\w#_'-]+)/).each do |keyword|
177
+ @indexer.index[keyword.downcase.to_sym] ||= []
178
+ @indexer.index[keyword.downcase.to_sym] << id2
179
+ end
180
+
181
+ end
182
+
183
+ id2 += 1
184
+
185
+ end
186
+
187
+ id = id2
188
+
189
+ end
190
+
191
+ end
192
+
193
+ def uri_index()
194
+ @indexer.uri_index
195
+ end
111
196
 
112
- def search(s, limit: 10)
197
+ # enter a few starting characters and lookup will suggest a few keywords
198
+ # useful for an auto suggest feature
199
+ #
200
+ def lookup(s, limit: 10)
113
201
 
114
202
  t = Time.now
115
203
  a = scan_path s
@@ -124,7 +212,48 @@ class Indexer101
124
212
 
125
213
  results = scan_leaves(r).sort_by(&:length).take(limit)
126
214
  t2 = Time.now - t
127
- puts "search took %.2f seconds" % t2 if @debug
215
+ puts ("lookup took " + ("%.3f" % t2).brown + " seconds").info
216
+
217
+ return results
218
+
219
+ end
220
+
221
+ # enter the exact keywords to search from the index
222
+ #
223
+ def search(*keywords, minchars: 3)
224
+
225
+ t = Time.now
226
+
227
+ r = keywords.flatten(1).map do |x|
228
+
229
+ a = []
230
+ a += @indexer.index[x.to_sym].reverse if @indexer.index.has_key? x.to_sym
231
+
232
+ if x.length >= minchars then
233
+ a += @indexer.index.keys.grep(/^#{x}/i).flat_map\
234
+ {|y| @indexer.index[y].reverse}
235
+ a += @indexer.index.keys.grep(/#{x}/i).flat_map\
236
+ {|y| @indexer.index[y].reverse}
237
+ end
238
+
239
+ puts ('a: ' + a.inspect).debug if @debug
240
+
241
+ a.uniq.map {|y| @indexer.uri_index[y]}
242
+
243
+ end
244
+
245
+ # group by number of results found, sort by count, then by date
246
+ a3 = r.flatten(1).group_by(&:last).to_a.sort do |x, x2|
247
+ -([x.last.length, x.last.first] <=> [x2.last.length, x2.last.first])
248
+ end
249
+
250
+ # fetch the 1st record from each group item
251
+ results = a3.map {|x| x.last.first}
252
+
253
+ t2 = Time.now - t
254
+ puts ("found %s results" % results.length).info
255
+ puts ("search took " + ("%.3f" % t2).brown + " seconds").info
256
+ puts
128
257
 
129
258
  return results
130
259
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indexer101
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -11,52 +11,72 @@ cert_chain:
11
11
  - |
12
12
  -----BEGIN CERTIFICATE-----
13
13
  MIIEXjCCAsagAwIBAgIBATANBgkqhkiG9w0BAQsFADAsMSowKAYDVQQDDCFnZW1t
14
- YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMTkxMTExMjI1NjA5WhcN
15
- MjAxMTEwMjI1NjA5WjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
16
- cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQDEF92R
17
- JWEWjPhgU6nIiCKKXtWI9EE8DbqPtU+CnYuUN2BWN7c4dsbaYrU1tVMzxx22G+Of
18
- apTIeIGFrq/oqub6nhx+UbnkMAqpmbJS8zTgnwEhFsGHGI2CD9+4OXh5rl36SqqP
19
- IGxY7w45KDvuPWA/Htb2aC20cHclJebOjeaMNogpYDByVEjjxtZeiOmIJtJlQSf+
20
- YpUviQVTkFJqbSK0WkKsPLQZu29P1zHETkdBvIGlGGDdo13haBW2Hdj9a2INeWcz
21
- B+v6nAdnv9fTTvH0GX51XDa/EIisWNELaopHk1Hcx97pZdm92gaICQfdgUHje65s
22
- oUDjyynUKE8dq+LAPEq5B1wj3e4BsIOFI5PSvPCMWI5jpbJWBXV6owex9qln22UA
23
- lDUEP3COb9/+r6bGNiCBUvUwyy7l/RdmRXPSOYnP8jPcwD/qSVypJYObM+2q68qc
24
- 5Eg0WqCVdFgpIXzBJPVcxeDJiN6EKmFWr5vJkMwIGz9hhIcitQjjSguk4r8CAwEA
25
- AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUN0C2lCCv
26
- /ATnmYSeZRQfUNpplfwwJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
14
+ YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjEwMjIyMDAwNzQ2WhcN
15
+ MjIwMjIyMDAwNzQ2WjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
16
+ cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQDqAwJO
17
+ ET6LAnOZB9q04zgLVFB0gJTcuLvfwjf7vpH9gCa5uqshSJnfi0owkeS2Hao0OwD+
18
+ vJrRRYbPfmXHAhEV8l9bSE6Ul1uTT9A+XS7g724sgOm5tCKFkLu+rcDy25MSjqpD
19
+ q+cPG4SN3ZUGK5eR9tp//dzdrjCV2wsOaoYKPajVY698p+sRf1zsHsSMxYnJPD/8
20
+ IkeNC+3VdsJFQ7wAoSk4hSpDuIi1xknA61/elDy5O07r1M25PJMntBE6QpJZblvw
21
+ v5u8U7+nK9P82KFfUwAjqkrhizt90M+0eK6dG44PnqafnxF84K2v7Qr1W6hKMIeL
22
+ DcqKPjAop+DO8WPCtfKFcFQKGRSe+H2rej3h34eGPH/GVoh8/h+ZuoDUfQRfoWLb
23
+ zUTM4uC1XwMlRjg8W4uYgV8SWZ5eii3tpmUz7moKQ4k9DSNNpGO+/bk/IcMVbRkb
24
+ xG/LzBAa6JQyAtVw9AMq8WHZKtJeNrdthoJSczsurthCHb0nY7VUQ/pp9JsCAwEA
25
+ AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUVn1uBRaL
26
+ Zh8+3WB077Lz84bokZowJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
27
27
  c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1hc3RlckBqYW1lc3JvYmVydHNvbi5ldTAN
28
- BgkqhkiG9w0BAQsFAAOCAYEAXrKEIca0q3x//SBOsv17jkonBSlzwVLIBuXKXX4R
29
- f0q3kJw7vWBPJfIVpAcGn6mbx2ziWw3XvG/SMuwIfzitz0pKTapvwbKZFPscqy82
30
- KcDxGtJt1cjDHzl0Bm+mgN/MgY+PAj3TcT/osuCc8iTu4+Ib6UxkmOP/uy2svBLs
31
- Su2XzGoqd5SKxPpj7IwMOBSVQWrqgWN2B+gdkN0CqjUPVMmiEKuMNjz9Idu36nG/
32
- QOPsTlpqBxR+yFbPEP0DlN5X8BRRAsraQZ+LPi7W/bU1fjkvIJxXuUkQD9dMF99+
33
- me+6s7PoCJ1yMmc5XfMFmv4WYswC+VqI1EeG0EneAsxRo8MmhWZcokqRE/KUk+ym
34
- NlDFqcKPZDaMsdO5dkYvEeguet/iG3XS1u7WKAcZgfhPdiIbue7cAhz9eQpNybe7
35
- 08cN0E9zjqKINgH/PsZTot+ohuVRLwn6WmHHhb18oUrxt3a0u4/3TNcWOcMeR0F2
36
- GeYL+mKGct5bfjn8IZnAJVKY
28
+ BgkqhkiG9w0BAQsFAAOCAYEAUzwCxgrA0YiJvXoi1ZHlhhz+ROzRn6XSQZZVd0Ym
29
+ gQVkUcvb/iiMnE0PZdivLiorRjhdR5tIPAYhmuN7Mr5IscQFdb1cndyC+qUzy6zP
30
+ HJGDGqqHDtiYpWlQ3/VAD4V+mAYj67CTj8gM2Y0OfjOIzKLf4jeLzcR0XjxCS9bH
31
+ g3cF/0FdJ5ydwo6r9QW/mE5yej6yuWBD9NFjWVbV/TAY3rWWDtw9g1WG31HjZVRB
32
+ lYndPAx0WIUBse5IRDGTiQ1JuMI5vBrxYJCb1Je506nR2rktACDRVSe/DTM4sxZn
33
+ oP3LBd1hPOAhNya8tD4FmUjQg4tvuWwIKh55XorZVEkzTWGgAJSnu7XTxtPcjxFA
34
+ U/3nmRr1BTYMN96T+3L81oqJTW5CxAAlsR97O7H8eZhwnNdG9HjgAk4PwiMLOhPb
35
+ Ely2/UitUG79uLcra+83gWVYzYiqBYC2d5HR4vCpTeecqYFXjWo9E3LMrvyB5Unk
36
+ zn10wjI+T1ysW7U6t+VJft8s
37
37
  -----END CERTIFICATE-----
38
- date: 2019-11-11 00:00:00.000000000 Z
38
+ date: 2021-02-22 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
- name: c32
41
+ name: dynarex
42
42
  requirement: !ruby/object:Gem::Requirement
43
43
  requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '1.8'
44
47
  - - ">="
45
48
  - !ruby/object:Gem::Version
46
- version: 0.2.0
49
+ version: 1.8.25
50
+ type: :runtime
51
+ prerelease: false
52
+ version_requirements: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - "~>"
55
+ - !ruby/object:Gem::Version
56
+ version: '1.8'
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: 1.8.25
60
+ - !ruby/object:Gem::Dependency
61
+ name: dxlite
62
+ requirement: !ruby/object:Gem::Requirement
63
+ requirements:
47
64
  - - "~>"
48
65
  - !ruby/object:Gem::Version
49
66
  version: '0.2'
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: 0.2.7
50
70
  type: :runtime
51
71
  prerelease: false
52
72
  version_requirements: !ruby/object:Gem::Requirement
53
73
  requirements:
54
- - - ">="
55
- - !ruby/object:Gem::Version
56
- version: 0.2.0
57
74
  - - "~>"
58
75
  - !ruby/object:Gem::Version
59
76
  version: '0.2'
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: 0.2.7
60
80
  description:
61
81
  email: james@jamesrobertson.eu
62
82
  executables: []
@@ -83,7 +103,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
83
103
  - !ruby/object:Gem::Version
84
104
  version: '0'
85
105
  requirements: []
86
- rubygems_version: 3.0.3
106
+ rubyforge_project:
107
+ rubygems_version: 2.7.10
87
108
  signing_key:
88
109
  specification_version: 4
89
110
  summary: Experimental gem to search a list of words 1 character at a time. Intended
metadata.gz.sig CHANGED
Binary file