indexer101 0.2.0 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c12f1a8d9fc5dcde5c95bd80e0116236f4d70b9b0f835899b8339d765f5771b8
4
- data.tar.gz: d3aa8b7f9146bbe35a28496bc01654ffe6d638289771fde4fb62dd6aec371682
3
+ metadata.gz: 58debc39c10c1dff7e46220800a6f063ee8ae0c2b7e47c5dc1161b1c2555d64d
4
+ data.tar.gz: e7b468fb8b030e9a6710625f7e3f6061e30b7d0ae0f371d5977c01f027ab3b32
5
5
  SHA512:
6
- metadata.gz: ad823a882a9052f38de0acb790ee88b5dae6bd5bd19abbea22aa858a665f7d795d8cc373e4b4cc6ff0c34c0d552822484258c2203df0c25cb362f9d54ba56c78
7
- data.tar.gz: d20146a83a706cac482fdfde143b99f4649e1b4f77e9bae9bd000120c4a82c533600777d762b75e3cefd9f36e011aba07cc13f0160b5fb599438cbca23296ce7
6
+ metadata.gz: 480085f3742927accef9c583ef778bedaef2d08c113b87a4fb109fd910ef3b662324b3a35b7cce4c86f3a8d59043f49f03a69e3bfd5d0620a23ba354212e764d
7
+ data.tar.gz: 62f4cd3287838fb3e25d8c54ba74259b03f40d4bafbcb51198b6ad47c3d67adc0f6dcfcf72eddc771be60356253e05b644152b4b5d3b20bb0662226c4299f12e
checksums.yaml.gz.sig CHANGED
Binary file
data.tar.gz.sig CHANGED
Binary file
data/lib/indexer101.rb CHANGED
@@ -6,6 +6,7 @@ require 'c32'
6
6
  require 'thread'
7
7
  require 'thwait'
8
8
  require 'dynarex'
9
+ require 'dxlite'
9
10
 
10
11
 
11
12
  class Indexer101
@@ -118,32 +119,81 @@ class Indexer101
118
119
 
119
120
  end
120
121
 
121
- def scan_dxindex(*locations)
122
+ # scan levels: 0 = tags only; 1 = all words in title (including tags)
123
+ #
124
+ def scan_dxindex(*locations, level: 0)
122
125
 
123
126
  t = Time.now
124
127
  threads = locations.flatten.map do |location|
125
- Thread.new {Thread.current[:v] = Dynarex.new location}
128
+
129
+ Thread.new {
130
+
131
+ if location.is_a?(Dynarex) or location.is_a?(DxLite) then
132
+
133
+ Thread.current[:v] = location
134
+
135
+ elsif location.is_a? String
136
+
137
+ case File.extname(location)
138
+ when '.xml'
139
+ Thread.current[:v] = Dynarex.new location, debug: @debug
140
+ when '.json'
141
+ Thread.current[:v] = DxLite.new location, debug: @debug
142
+ end
143
+
144
+ end
145
+ }
126
146
  end
127
147
 
128
148
  ThreadsWait.all_waits(*threads)
129
149
 
130
150
  a = threads.map {|x| x[:v]}
151
+ puts '_a: ' + a.inspect if @debug
131
152
  t2 = Time.now - t
132
153
  puts ("dxindex documents loaded in " + ("%.2f" % t2).brown \
133
154
  + " seconds").info
134
155
 
135
- a.each.with_index do |dx, i|
156
+
157
+ id = 1
158
+
159
+ a.each do |dx|
160
+
161
+ id2 = id
162
+
163
+ if @debug then
164
+ puts 'dx: ' + dx.class.inspect
165
+ puts 'dx.all: ' + dx.all.inspect
166
+ end
136
167
 
137
168
  @indexer.uri_index.merge! Hash[dx.all.reverse.map.with_index \
138
- {|x,j| [(i+1)*10000 + (j+1), [x.title, x.url].join(' ')]}]
169
+ {|x,i| [id+i, [Time.parse(x.created), x.title, x.url]]}]
139
170
 
140
- dx.all.reverse.each.with_index do |x,j|
141
- x.title.scan(/#(\w+)/).flatten(1).each do |keyword|
142
- @indexer.index[keyword.to_sym] ||= []
143
- @indexer.index[keyword.to_sym] << (i+1)*10000 + (j+1)
171
+ dx.all.reverse.each do |x|
172
+
173
+ case level
174
+ when 0
175
+
176
+ x.title.scan(/(\#\w+)/).flatten(1).each do |keyword|
177
+ @indexer.index[keyword.downcase.to_sym] ||= []
178
+ @indexer.index[keyword.downcase.to_sym] << id2
179
+ end
180
+
181
+ when 1
182
+
183
+ # \u{A3} = £ <- represented as Unicode to avoid ASCII to UTF-8 error
184
+ x.title.split(/[\s:"!\?\(\)\u{A3}]+(?=[\w#_'-]+)/).each do |keyword|
185
+ @indexer.index[keyword.downcase.to_sym] ||= []
186
+ @indexer.index[keyword.downcase.to_sym] << id2
187
+ end
188
+
144
189
  end
190
+
191
+ id2 += 1
192
+
145
193
  end
146
194
 
195
+ id = id2
196
+
147
197
  end
148
198
 
149
199
  end
@@ -178,27 +228,36 @@ class Indexer101
178
228
 
179
229
  # enter the exact keywords to search from the index
180
230
  #
181
- def search(*keywords)
231
+ def search(*keywords, minchars: 3)
182
232
 
183
233
  t = Time.now
184
234
 
185
- results = keywords.flatten(1).flat_map do |x|
235
+ r = keywords.flatten(1).map do |x|
186
236
 
187
237
  a = []
188
238
  a += @indexer.index[x.to_sym].reverse if @indexer.index.has_key? x.to_sym
189
239
 
190
- if x.length > 3 then
191
- a += @indexer.index.keys.reverse.grep(/^#{x}/).flat_map\
192
- {|y| @indexer.index[y]}
193
- a += @indexer.index.keys.reverse.grep(/#{x}/).flat_map\
194
- {|y| @indexer.index[y]}
240
+ if x.length >= minchars then
241
+ a += @indexer.index.keys.grep(/^#{x}/i).flat_map\
242
+ {|y| @indexer.index[y].reverse}
243
+ a += @indexer.index.keys.grep(/#{x}/i).flat_map\
244
+ {|y| @indexer.index[y].reverse}
195
245
  end
196
246
 
197
247
  puts ('a: ' + a.inspect).debug if @debug
198
- a.uniq.map {|y| @indexer.uri_index[y].split(/\s+(?=https?[^\s]+$)/,2) }
199
248
 
249
+ a.uniq.map {|y| @indexer.uri_index[y]}
250
+
251
+ end
252
+
253
+ # group by number of results found, sort by count, then by date
254
+ a3 = r.flatten(1).group_by(&:last).to_a.sort do |x, x2|
255
+ -([x.last.length, x.last.first] <=> [x2.last.length, x2.last.first])
200
256
  end
201
257
 
258
+ # fetch the 1st record from each group item
259
+ results = a3.map {|x| x.last.first}
260
+
202
261
  t2 = Time.now - t
203
262
  puts ("found %s results" % results.length).info
204
263
  puts ("search took " + ("%.3f" % t2).brown + " seconds").info
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indexer101
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -11,31 +11,31 @@ cert_chain:
11
11
  - |
12
12
  -----BEGIN CERTIFICATE-----
13
13
  MIIEXjCCAsagAwIBAgIBATANBgkqhkiG9w0BAQsFADAsMSowKAYDVQQDDCFnZW1t
14
- YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMTkxMTExMjI1NjA5WhcN
15
- MjAxMTEwMjI1NjA5WjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
16
- cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQDEF92R
17
- JWEWjPhgU6nIiCKKXtWI9EE8DbqPtU+CnYuUN2BWN7c4dsbaYrU1tVMzxx22G+Of
18
- apTIeIGFrq/oqub6nhx+UbnkMAqpmbJS8zTgnwEhFsGHGI2CD9+4OXh5rl36SqqP
19
- IGxY7w45KDvuPWA/Htb2aC20cHclJebOjeaMNogpYDByVEjjxtZeiOmIJtJlQSf+
20
- YpUviQVTkFJqbSK0WkKsPLQZu29P1zHETkdBvIGlGGDdo13haBW2Hdj9a2INeWcz
21
- B+v6nAdnv9fTTvH0GX51XDa/EIisWNELaopHk1Hcx97pZdm92gaICQfdgUHje65s
22
- oUDjyynUKE8dq+LAPEq5B1wj3e4BsIOFI5PSvPCMWI5jpbJWBXV6owex9qln22UA
23
- lDUEP3COb9/+r6bGNiCBUvUwyy7l/RdmRXPSOYnP8jPcwD/qSVypJYObM+2q68qc
24
- 5Eg0WqCVdFgpIXzBJPVcxeDJiN6EKmFWr5vJkMwIGz9hhIcitQjjSguk4r8CAwEA
25
- AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUN0C2lCCv
26
- /ATnmYSeZRQfUNpplfwwJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
14
+ YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjEwMjIyMDAwNzQ2WhcN
15
+ MjIwMjIyMDAwNzQ2WjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
16
+ cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQDqAwJO
17
+ ET6LAnOZB9q04zgLVFB0gJTcuLvfwjf7vpH9gCa5uqshSJnfi0owkeS2Hao0OwD+
18
+ vJrRRYbPfmXHAhEV8l9bSE6Ul1uTT9A+XS7g724sgOm5tCKFkLu+rcDy25MSjqpD
19
+ q+cPG4SN3ZUGK5eR9tp//dzdrjCV2wsOaoYKPajVY698p+sRf1zsHsSMxYnJPD/8
20
+ IkeNC+3VdsJFQ7wAoSk4hSpDuIi1xknA61/elDy5O07r1M25PJMntBE6QpJZblvw
21
+ v5u8U7+nK9P82KFfUwAjqkrhizt90M+0eK6dG44PnqafnxF84K2v7Qr1W6hKMIeL
22
+ DcqKPjAop+DO8WPCtfKFcFQKGRSe+H2rej3h34eGPH/GVoh8/h+ZuoDUfQRfoWLb
23
+ zUTM4uC1XwMlRjg8W4uYgV8SWZ5eii3tpmUz7moKQ4k9DSNNpGO+/bk/IcMVbRkb
24
+ xG/LzBAa6JQyAtVw9AMq8WHZKtJeNrdthoJSczsurthCHb0nY7VUQ/pp9JsCAwEA
25
+ AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUVn1uBRaL
26
+ Zh8+3WB077Lz84bokZowJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
27
27
  c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1hc3RlckBqYW1lc3JvYmVydHNvbi5ldTAN
28
- BgkqhkiG9w0BAQsFAAOCAYEAXrKEIca0q3x//SBOsv17jkonBSlzwVLIBuXKXX4R
29
- f0q3kJw7vWBPJfIVpAcGn6mbx2ziWw3XvG/SMuwIfzitz0pKTapvwbKZFPscqy82
30
- KcDxGtJt1cjDHzl0Bm+mgN/MgY+PAj3TcT/osuCc8iTu4+Ib6UxkmOP/uy2svBLs
31
- Su2XzGoqd5SKxPpj7IwMOBSVQWrqgWN2B+gdkN0CqjUPVMmiEKuMNjz9Idu36nG/
32
- QOPsTlpqBxR+yFbPEP0DlN5X8BRRAsraQZ+LPi7W/bU1fjkvIJxXuUkQD9dMF99+
33
- me+6s7PoCJ1yMmc5XfMFmv4WYswC+VqI1EeG0EneAsxRo8MmhWZcokqRE/KUk+ym
34
- NlDFqcKPZDaMsdO5dkYvEeguet/iG3XS1u7WKAcZgfhPdiIbue7cAhz9eQpNybe7
35
- 08cN0E9zjqKINgH/PsZTot+ohuVRLwn6WmHHhb18oUrxt3a0u4/3TNcWOcMeR0F2
36
- GeYL+mKGct5bfjn8IZnAJVKY
28
+ BgkqhkiG9w0BAQsFAAOCAYEAUzwCxgrA0YiJvXoi1ZHlhhz+ROzRn6XSQZZVd0Ym
29
+ gQVkUcvb/iiMnE0PZdivLiorRjhdR5tIPAYhmuN7Mr5IscQFdb1cndyC+qUzy6zP
30
+ HJGDGqqHDtiYpWlQ3/VAD4V+mAYj67CTj8gM2Y0OfjOIzKLf4jeLzcR0XjxCS9bH
31
+ g3cF/0FdJ5ydwo6r9QW/mE5yej6yuWBD9NFjWVbV/TAY3rWWDtw9g1WG31HjZVRB
32
+ lYndPAx0WIUBse5IRDGTiQ1JuMI5vBrxYJCb1Je506nR2rktACDRVSe/DTM4sxZn
33
+ oP3LBd1hPOAhNya8tD4FmUjQg4tvuWwIKh55XorZVEkzTWGgAJSnu7XTxtPcjxFA
34
+ U/3nmRr1BTYMN96T+3L81oqJTW5CxAAlsR97O7H8eZhwnNdG9HjgAk4PwiMLOhPb
35
+ Ely2/UitUG79uLcra+83gWVYzYiqBYC2d5HR4vCpTeecqYFXjWo9E3LMrvyB5Unk
36
+ zn10wjI+T1ysW7U6t+VJft8s
37
37
  -----END CERTIFICATE-----
38
- date: 2019-11-12 00:00:00.000000000 Z
38
+ date: 2021-02-22 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: dynarex
@@ -46,7 +46,7 @@ dependencies:
46
46
  version: '1.8'
47
47
  - - ">="
48
48
  - !ruby/object:Gem::Version
49
- version: 1.8.21
49
+ version: 1.8.25
50
50
  type: :runtime
51
51
  prerelease: false
52
52
  version_requirements: !ruby/object:Gem::Requirement
@@ -56,7 +56,27 @@ dependencies:
56
56
  version: '1.8'
57
57
  - - ">="
58
58
  - !ruby/object:Gem::Version
59
- version: 1.8.21
59
+ version: 1.8.25
60
+ - !ruby/object:Gem::Dependency
61
+ name: dxlite
62
+ requirement: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - "~>"
65
+ - !ruby/object:Gem::Version
66
+ version: '0.2'
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: 0.2.7
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - "~>"
75
+ - !ruby/object:Gem::Version
76
+ version: '0.2'
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: 0.2.7
60
80
  description:
61
81
  email: james@jamesrobertson.eu
62
82
  executables: []
@@ -83,7 +103,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
83
103
  - !ruby/object:Gem::Version
84
104
  version: '0'
85
105
  requirements: []
86
- rubygems_version: 3.0.3
106
+ rubyforge_project:
107
+ rubygems_version: 2.7.10
87
108
  signing_key:
88
109
  specification_version: 4
89
110
  summary: Experimental gem to search a list of words 1 character at a time. Intended
metadata.gz.sig CHANGED
Binary file