indexer101 0.1.0 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f3987fe6c70a6da37dd0cbcb50bf3368601ee1b6ddc6a90da039d22dfa14f28b
4
- data.tar.gz: f7a79a0f09d97948a5576c253c8f2d2e5f58079fdc0830ee4fbeee28c208c3af
3
+ metadata.gz: 71b93d7302d137cd83ccd852e8506e349cbe3fc52723eed291c55875597606a3
4
+ data.tar.gz: 03d8a4f0800dac1ee5d7d7923742d7122898671ea72c9913b46c1d30e356dd35
5
5
  SHA512:
6
- metadata.gz: fe696d43cc8c49962e3bb44a6f55497dd178b527d31cde3e395c2b2c1ef984add180fc503ac7f97fb2f4a716e37f785535972e149b3f6ec0cbeaa5698d64b1c9
7
- data.tar.gz: e591fdfbec8eb9e579473075dabe554c04f163ffa23548746833d27053e5a55e7ba3514b2cd154acf43e8a4d2124f5cd753dd0bfcaa30697cd04d0e487bec318
6
+ metadata.gz: f3f8d541e4386651382c21f3936cfaa400f94cdff79729a95bfe7c10949ef628b4e3eddb1b873aecd9e1fc6ec97417ffcc79ab4d8639cec7e5b41ab93ecf2b2f
7
+ data.tar.gz: 52df9975cfbb4c4b7c1520d6d7e5efa3acc716ee70b11a15762534dbbe8acd70eee858428bb7982d718101c4395df33eb94fb53012e4c76e3839698531a0263c
checksums.yaml.gz.sig CHANGED
Binary file
data.tar.gz.sig CHANGED
Binary file
data/lib/indexer101.rb CHANGED
@@ -5,24 +5,34 @@
5
5
  require 'c32'
6
6
  require 'thread'
7
7
  require 'thwait'
8
+ require 'dynarex'
9
+ require 'dxlite'
8
10
 
9
11
 
10
12
  class Indexer101
11
13
  using ColouredText
12
14
 
13
15
  class Index
14
-
16
+
15
17
  attr_reader :h
16
- attr_accessor :index
18
+ attr_accessor :uri_index, :index
17
19
 
18
20
  def initialize()
21
+
22
+ @uri_index = {} # contains each URI along with the title
23
+ @index = {} # contains each keyword
24
+ @h = {} # nested keywords constructed from shared string keys
25
+
19
26
  end
20
27
 
21
28
  def build(a)
22
29
 
23
30
  threads = []
24
- threads << Thread.new do
25
- @index = Hash[a.map(&:to_sym).zip([''] * a.length)]
31
+
32
+ if @index.empty? then
33
+ threads << Thread.new do
34
+ @index = Hash[a.map(&:to_sym).zip([''] * a.length)]
35
+ end
26
36
  end
27
37
 
28
38
  threads << Thread.new { @h = group a }
@@ -70,14 +80,14 @@ class Indexer101
70
80
 
71
81
  end
72
82
 
73
- def build(a)
83
+ def build(a=@indexer.index.keys)
74
84
 
75
85
  t = Time.now
76
86
  @indexer.build(a)
77
87
  t2 = Time.now - t
78
88
 
79
89
  puts "%d words indexed".info % a.length
80
- puts "index built in %.2f seconds".info % t2
90
+ puts ("index built in " + ("%.3f" % t2).brown + " seconds").info
81
91
 
82
92
  self
83
93
  end
@@ -97,7 +107,7 @@ class Indexer101
97
107
  t2 = Time.now - t
98
108
 
99
109
  puts "index contains %d words".info % @indexer.index.length
100
- puts "index read in %.2f seconds".info % t2
110
+ puts "index read in " + ("%.2f" % t2).brown + " seconds".info
101
111
 
102
112
  end
103
113
 
@@ -108,8 +118,86 @@ class Indexer101
108
118
  end
109
119
 
110
120
  end
121
+
122
+ # scan levels: 0 = tags only; 1 = all words in title (including tags)
123
+ #
124
+ def scan_dxindex(*locations, level: 0)
125
+
126
+ t = Time.now
127
+ threads = locations.flatten.map do |location|
128
+
129
+ Thread.new {
130
+
131
+ Thread.current[:v] = case File.extname(location)
132
+ when '.xml'
133
+ Dynarex.new location, debug: @debug
134
+ when '.json'
135
+ DxLite.new location, debug: @debug
136
+ end
137
+ }
138
+ end
139
+
140
+ ThreadsWait.all_waits(*threads)
141
+
142
+ a = threads.map {|x| x[:v]}
143
+ puts '_a: ' + a.inspect if @debug
144
+ t2 = Time.now - t
145
+ puts ("dxindex documents loaded in " + ("%.2f" % t2).brown \
146
+ + " seconds").info
147
+
148
+
149
+ id = 1
150
+
151
+ a.each do |dx|
152
+
153
+ id2 = id
154
+
155
+ if @debug then
156
+ puts 'dx: ' + dx.class.inspect
157
+ puts 'dx.all: ' + dx.all.inspect
158
+ end
159
+
160
+ @indexer.uri_index.merge! Hash[dx.all.reverse.map.with_index \
161
+ {|x,i| [id+i, [Time.parse(x.created), x.title, x.url]]}]
162
+
163
+ dx.all.reverse.each do |x|
164
+
165
+ case level
166
+ when 0
167
+
168
+ x.title.scan(/(\#\w+)/).flatten(1).each do |keyword|
169
+ @indexer.index[keyword.downcase.to_sym] ||= []
170
+ @indexer.index[keyword.downcase.to_sym] << id2
171
+ end
172
+
173
+ when 1
174
+
175
+ # \u{A3} = £ <- represented as Unicode to avoid ASCII to UTF-8 error
176
+ x.title.split(/[\s:"!\?\(\)\u{A3}]+(?=[\w#_'-]+)/).each do |keyword|
177
+ @indexer.index[keyword.downcase.to_sym] ||= []
178
+ @indexer.index[keyword.downcase.to_sym] << id2
179
+ end
180
+
181
+ end
182
+
183
+ id2 += 1
184
+
185
+ end
186
+
187
+ id = id2
188
+
189
+ end
190
+
191
+ end
192
+
193
+ def uri_index()
194
+ @indexer.uri_index
195
+ end
111
196
 
112
- def search(s, limit: 10)
197
+ # enter a few starting characters and lookup will suggest a few keywords
198
+ # useful for an auto suggest feature
199
+ #
200
+ def lookup(s, limit: 10)
113
201
 
114
202
  t = Time.now
115
203
  a = scan_path s
@@ -124,7 +212,48 @@ class Indexer101
124
212
 
125
213
  results = scan_leaves(r).sort_by(&:length).take(limit)
126
214
  t2 = Time.now - t
127
- puts "search took %.2f seconds" % t2 if @debug
215
+ puts ("lookup took " + ("%.3f" % t2).brown + " seconds").info
216
+
217
+ return results
218
+
219
+ end
220
+
221
+ # enter the exact keywords to search from the index
222
+ #
223
+ def search(*keywords, minchars: 3)
224
+
225
+ t = Time.now
226
+
227
+ r = keywords.flatten(1).map do |x|
228
+
229
+ a = []
230
+ a += @indexer.index[x.to_sym].reverse if @indexer.index.has_key? x.to_sym
231
+
232
+ if x.length >= minchars then
233
+ a += @indexer.index.keys.grep(/^#{x}/i).flat_map\
234
+ {|y| @indexer.index[y].reverse}
235
+ a += @indexer.index.keys.grep(/#{x}/i).flat_map\
236
+ {|y| @indexer.index[y].reverse}
237
+ end
238
+
239
+ puts ('a: ' + a.inspect).debug if @debug
240
+
241
+ a.uniq.map {|y| @indexer.uri_index[y]}
242
+
243
+ end
244
+
245
+ # group by number of results found, sort by count, then by date
246
+ a3 = r.flatten(1).group_by(&:last).to_a.sort do |x, x2|
247
+ -([x.last.length, x.last.first] <=> [x2.last.length, x2.last.first])
248
+ end
249
+
250
+ # fetch the 1st record from each group item
251
+ results = a3.map {|x| x.last.first}
252
+
253
+ t2 = Time.now - t
254
+ puts ("found %s results" % results.length).info
255
+ puts ("search took " + ("%.3f" % t2).brown + " seconds").info
256
+ puts
128
257
 
129
258
  return results
130
259
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indexer101
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -11,52 +11,72 @@ cert_chain:
11
11
  - |
12
12
  -----BEGIN CERTIFICATE-----
13
13
  MIIEXjCCAsagAwIBAgIBATANBgkqhkiG9w0BAQsFADAsMSowKAYDVQQDDCFnZW1t
14
- YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMTkxMTExMjI1NjA5WhcN
15
- MjAxMTEwMjI1NjA5WjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
16
- cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQDEF92R
17
- JWEWjPhgU6nIiCKKXtWI9EE8DbqPtU+CnYuUN2BWN7c4dsbaYrU1tVMzxx22G+Of
18
- apTIeIGFrq/oqub6nhx+UbnkMAqpmbJS8zTgnwEhFsGHGI2CD9+4OXh5rl36SqqP
19
- IGxY7w45KDvuPWA/Htb2aC20cHclJebOjeaMNogpYDByVEjjxtZeiOmIJtJlQSf+
20
- YpUviQVTkFJqbSK0WkKsPLQZu29P1zHETkdBvIGlGGDdo13haBW2Hdj9a2INeWcz
21
- B+v6nAdnv9fTTvH0GX51XDa/EIisWNELaopHk1Hcx97pZdm92gaICQfdgUHje65s
22
- oUDjyynUKE8dq+LAPEq5B1wj3e4BsIOFI5PSvPCMWI5jpbJWBXV6owex9qln22UA
23
- lDUEP3COb9/+r6bGNiCBUvUwyy7l/RdmRXPSOYnP8jPcwD/qSVypJYObM+2q68qc
24
- 5Eg0WqCVdFgpIXzBJPVcxeDJiN6EKmFWr5vJkMwIGz9hhIcitQjjSguk4r8CAwEA
25
- AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUN0C2lCCv
26
- /ATnmYSeZRQfUNpplfwwJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
14
+ YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjEwMjIyMDAwNzQ2WhcN
15
+ MjIwMjIyMDAwNzQ2WjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
16
+ cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQDqAwJO
17
+ ET6LAnOZB9q04zgLVFB0gJTcuLvfwjf7vpH9gCa5uqshSJnfi0owkeS2Hao0OwD+
18
+ vJrRRYbPfmXHAhEV8l9bSE6Ul1uTT9A+XS7g724sgOm5tCKFkLu+rcDy25MSjqpD
19
+ q+cPG4SN3ZUGK5eR9tp//dzdrjCV2wsOaoYKPajVY698p+sRf1zsHsSMxYnJPD/8
20
+ IkeNC+3VdsJFQ7wAoSk4hSpDuIi1xknA61/elDy5O07r1M25PJMntBE6QpJZblvw
21
+ v5u8U7+nK9P82KFfUwAjqkrhizt90M+0eK6dG44PnqafnxF84K2v7Qr1W6hKMIeL
22
+ DcqKPjAop+DO8WPCtfKFcFQKGRSe+H2rej3h34eGPH/GVoh8/h+ZuoDUfQRfoWLb
23
+ zUTM4uC1XwMlRjg8W4uYgV8SWZ5eii3tpmUz7moKQ4k9DSNNpGO+/bk/IcMVbRkb
24
+ xG/LzBAa6JQyAtVw9AMq8WHZKtJeNrdthoJSczsurthCHb0nY7VUQ/pp9JsCAwEA
25
+ AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUVn1uBRaL
26
+ Zh8+3WB077Lz84bokZowJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
27
27
  c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1hc3RlckBqYW1lc3JvYmVydHNvbi5ldTAN
28
- BgkqhkiG9w0BAQsFAAOCAYEAXrKEIca0q3x//SBOsv17jkonBSlzwVLIBuXKXX4R
29
- f0q3kJw7vWBPJfIVpAcGn6mbx2ziWw3XvG/SMuwIfzitz0pKTapvwbKZFPscqy82
30
- KcDxGtJt1cjDHzl0Bm+mgN/MgY+PAj3TcT/osuCc8iTu4+Ib6UxkmOP/uy2svBLs
31
- Su2XzGoqd5SKxPpj7IwMOBSVQWrqgWN2B+gdkN0CqjUPVMmiEKuMNjz9Idu36nG/
32
- QOPsTlpqBxR+yFbPEP0DlN5X8BRRAsraQZ+LPi7W/bU1fjkvIJxXuUkQD9dMF99+
33
- me+6s7PoCJ1yMmc5XfMFmv4WYswC+VqI1EeG0EneAsxRo8MmhWZcokqRE/KUk+ym
34
- NlDFqcKPZDaMsdO5dkYvEeguet/iG3XS1u7WKAcZgfhPdiIbue7cAhz9eQpNybe7
35
- 08cN0E9zjqKINgH/PsZTot+ohuVRLwn6WmHHhb18oUrxt3a0u4/3TNcWOcMeR0F2
36
- GeYL+mKGct5bfjn8IZnAJVKY
28
+ BgkqhkiG9w0BAQsFAAOCAYEAUzwCxgrA0YiJvXoi1ZHlhhz+ROzRn6XSQZZVd0Ym
29
+ gQVkUcvb/iiMnE0PZdivLiorRjhdR5tIPAYhmuN7Mr5IscQFdb1cndyC+qUzy6zP
30
+ HJGDGqqHDtiYpWlQ3/VAD4V+mAYj67CTj8gM2Y0OfjOIzKLf4jeLzcR0XjxCS9bH
31
+ g3cF/0FdJ5ydwo6r9QW/mE5yej6yuWBD9NFjWVbV/TAY3rWWDtw9g1WG31HjZVRB
32
+ lYndPAx0WIUBse5IRDGTiQ1JuMI5vBrxYJCb1Je506nR2rktACDRVSe/DTM4sxZn
33
+ oP3LBd1hPOAhNya8tD4FmUjQg4tvuWwIKh55XorZVEkzTWGgAJSnu7XTxtPcjxFA
34
+ U/3nmRr1BTYMN96T+3L81oqJTW5CxAAlsR97O7H8eZhwnNdG9HjgAk4PwiMLOhPb
35
+ Ely2/UitUG79uLcra+83gWVYzYiqBYC2d5HR4vCpTeecqYFXjWo9E3LMrvyB5Unk
36
+ zn10wjI+T1ysW7U6t+VJft8s
37
37
  -----END CERTIFICATE-----
38
- date: 2019-11-11 00:00:00.000000000 Z
38
+ date: 2021-02-22 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
- name: c32
41
+ name: dynarex
42
42
  requirement: !ruby/object:Gem::Requirement
43
43
  requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '1.8'
44
47
  - - ">="
45
48
  - !ruby/object:Gem::Version
46
- version: 0.2.0
49
+ version: 1.8.25
50
+ type: :runtime
51
+ prerelease: false
52
+ version_requirements: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - "~>"
55
+ - !ruby/object:Gem::Version
56
+ version: '1.8'
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: 1.8.25
60
+ - !ruby/object:Gem::Dependency
61
+ name: dxlite
62
+ requirement: !ruby/object:Gem::Requirement
63
+ requirements:
47
64
  - - "~>"
48
65
  - !ruby/object:Gem::Version
49
66
  version: '0.2'
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: 0.2.7
50
70
  type: :runtime
51
71
  prerelease: false
52
72
  version_requirements: !ruby/object:Gem::Requirement
53
73
  requirements:
54
- - - ">="
55
- - !ruby/object:Gem::Version
56
- version: 0.2.0
57
74
  - - "~>"
58
75
  - !ruby/object:Gem::Version
59
76
  version: '0.2'
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: 0.2.7
60
80
  description:
61
81
  email: james@jamesrobertson.eu
62
82
  executables: []
@@ -83,7 +103,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
83
103
  - !ruby/object:Gem::Version
84
104
  version: '0'
85
105
  requirements: []
86
- rubygems_version: 3.0.3
106
+ rubyforge_project:
107
+ rubygems_version: 2.7.10
87
108
  signing_key:
88
109
  specification_version: 4
89
110
  summary: Experimental gem to search a list of words 1 character at a time. Intended
metadata.gz.sig CHANGED
Binary file