indexer101 0.2.0 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c12f1a8d9fc5dcde5c95bd80e0116236f4d70b9b0f835899b8339d765f5771b8
4
- data.tar.gz: d3aa8b7f9146bbe35a28496bc01654ffe6d638289771fde4fb62dd6aec371682
3
+ metadata.gz: 58debc39c10c1dff7e46220800a6f063ee8ae0c2b7e47c5dc1161b1c2555d64d
4
+ data.tar.gz: e7b468fb8b030e9a6710625f7e3f6061e30b7d0ae0f371d5977c01f027ab3b32
5
5
  SHA512:
6
- metadata.gz: ad823a882a9052f38de0acb790ee88b5dae6bd5bd19abbea22aa858a665f7d795d8cc373e4b4cc6ff0c34c0d552822484258c2203df0c25cb362f9d54ba56c78
7
- data.tar.gz: d20146a83a706cac482fdfde143b99f4649e1b4f77e9bae9bd000120c4a82c533600777d762b75e3cefd9f36e011aba07cc13f0160b5fb599438cbca23296ce7
6
+ metadata.gz: 480085f3742927accef9c583ef778bedaef2d08c113b87a4fb109fd910ef3b662324b3a35b7cce4c86f3a8d59043f49f03a69e3bfd5d0620a23ba354212e764d
7
+ data.tar.gz: 62f4cd3287838fb3e25d8c54ba74259b03f40d4bafbcb51198b6ad47c3d67adc0f6dcfcf72eddc771be60356253e05b644152b4b5d3b20bb0662226c4299f12e
checksums.yaml.gz.sig CHANGED
Binary file
data.tar.gz.sig CHANGED
Binary file
data/lib/indexer101.rb CHANGED
@@ -6,6 +6,7 @@ require 'c32'
6
6
  require 'thread'
7
7
  require 'thwait'
8
8
  require 'dynarex'
9
+ require 'dxlite'
9
10
 
10
11
 
11
12
  class Indexer101
@@ -118,32 +119,81 @@ class Indexer101
118
119
 
119
120
  end
120
121
 
121
- def scan_dxindex(*locations)
122
+ # scan levels: 0 = tags only; 1 = all words in title (including tags)
123
+ #
124
+ def scan_dxindex(*locations, level: 0)
122
125
 
123
126
  t = Time.now
124
127
  threads = locations.flatten.map do |location|
125
- Thread.new {Thread.current[:v] = Dynarex.new location}
128
+
129
+ Thread.new {
130
+
131
+ if location.is_a?(Dynarex) or location.is_a?(DxLite) then
132
+
133
+ Thread.current[:v] = location
134
+
135
+ elsif location.is_a? String
136
+
137
+ case File.extname(location)
138
+ when '.xml'
139
+ Thread.current[:v] = Dynarex.new location, debug: @debug
140
+ when '.json'
141
+ Thread.current[:v] = DxLite.new location, debug: @debug
142
+ end
143
+
144
+ end
145
+ }
126
146
  end
127
147
 
128
148
  ThreadsWait.all_waits(*threads)
129
149
 
130
150
  a = threads.map {|x| x[:v]}
151
+ puts '_a: ' + a.inspect if @debug
131
152
  t2 = Time.now - t
132
153
  puts ("dxindex documents loaded in " + ("%.2f" % t2).brown \
133
154
  + " seconds").info
134
155
 
135
- a.each.with_index do |dx, i|
156
+
157
+ id = 1
158
+
159
+ a.each do |dx|
160
+
161
+ id2 = id
162
+
163
+ if @debug then
164
+ puts 'dx: ' + dx.class.inspect
165
+ puts 'dx.all: ' + dx.all.inspect
166
+ end
136
167
 
137
168
  @indexer.uri_index.merge! Hash[dx.all.reverse.map.with_index \
138
- {|x,j| [(i+1)*10000 + (j+1), [x.title, x.url].join(' ')]}]
169
+ {|x,i| [id+i, [Time.parse(x.created), x.title, x.url]]}]
139
170
 
140
- dx.all.reverse.each.with_index do |x,j|
141
- x.title.scan(/#(\w+)/).flatten(1).each do |keyword|
142
- @indexer.index[keyword.to_sym] ||= []
143
- @indexer.index[keyword.to_sym] << (i+1)*10000 + (j+1)
171
+ dx.all.reverse.each do |x|
172
+
173
+ case level
174
+ when 0
175
+
176
+ x.title.scan(/(\#\w+)/).flatten(1).each do |keyword|
177
+ @indexer.index[keyword.downcase.to_sym] ||= []
178
+ @indexer.index[keyword.downcase.to_sym] << id2
179
+ end
180
+
181
+ when 1
182
+
183
+ # \u{A3} = £ <- represented as Unicode to avoid ASCII to UTF-8 error
184
+ x.title.split(/[\s:"!\?\(\)\u{A3}]+(?=[\w#_'-]+)/).each do |keyword|
185
+ @indexer.index[keyword.downcase.to_sym] ||= []
186
+ @indexer.index[keyword.downcase.to_sym] << id2
187
+ end
188
+
144
189
  end
190
+
191
+ id2 += 1
192
+
145
193
  end
146
194
 
195
+ id = id2
196
+
147
197
  end
148
198
 
149
199
  end
@@ -178,27 +228,36 @@ class Indexer101
178
228
 
179
229
  # enter the exact keywords to search from the index
180
230
  #
181
- def search(*keywords)
231
+ def search(*keywords, minchars: 3)
182
232
 
183
233
  t = Time.now
184
234
 
185
- results = keywords.flatten(1).flat_map do |x|
235
+ r = keywords.flatten(1).map do |x|
186
236
 
187
237
  a = []
188
238
  a += @indexer.index[x.to_sym].reverse if @indexer.index.has_key? x.to_sym
189
239
 
190
- if x.length > 3 then
191
- a += @indexer.index.keys.reverse.grep(/^#{x}/).flat_map\
192
- {|y| @indexer.index[y]}
193
- a += @indexer.index.keys.reverse.grep(/#{x}/).flat_map\
194
- {|y| @indexer.index[y]}
240
+ if x.length >= minchars then
241
+ a += @indexer.index.keys.grep(/^#{x}/i).flat_map\
242
+ {|y| @indexer.index[y].reverse}
243
+ a += @indexer.index.keys.grep(/#{x}/i).flat_map\
244
+ {|y| @indexer.index[y].reverse}
195
245
  end
196
246
 
197
247
  puts ('a: ' + a.inspect).debug if @debug
198
- a.uniq.map {|y| @indexer.uri_index[y].split(/\s+(?=https?[^\s]+$)/,2) }
199
248
 
249
+ a.uniq.map {|y| @indexer.uri_index[y]}
250
+
251
+ end
252
+
253
+ # group by number of results found, sort by count, then by date
254
+ a3 = r.flatten(1).group_by(&:last).to_a.sort do |x, x2|
255
+ -([x.last.length, x.last.first] <=> [x2.last.length, x2.last.first])
200
256
  end
201
257
 
258
+ # fetch the 1st record from each group item
259
+ results = a3.map {|x| x.last.first}
260
+
202
261
  t2 = Time.now - t
203
262
  puts ("found %s results" % results.length).info
204
263
  puts ("search took " + ("%.3f" % t2).brown + " seconds").info
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indexer101
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -11,31 +11,31 @@ cert_chain:
11
11
  - |
12
12
  -----BEGIN CERTIFICATE-----
13
13
  MIIEXjCCAsagAwIBAgIBATANBgkqhkiG9w0BAQsFADAsMSowKAYDVQQDDCFnZW1t
14
- YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMTkxMTExMjI1NjA5WhcN
15
- MjAxMTEwMjI1NjA5WjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
16
- cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQDEF92R
17
- JWEWjPhgU6nIiCKKXtWI9EE8DbqPtU+CnYuUN2BWN7c4dsbaYrU1tVMzxx22G+Of
18
- apTIeIGFrq/oqub6nhx+UbnkMAqpmbJS8zTgnwEhFsGHGI2CD9+4OXh5rl36SqqP
19
- IGxY7w45KDvuPWA/Htb2aC20cHclJebOjeaMNogpYDByVEjjxtZeiOmIJtJlQSf+
20
- YpUviQVTkFJqbSK0WkKsPLQZu29P1zHETkdBvIGlGGDdo13haBW2Hdj9a2INeWcz
21
- B+v6nAdnv9fTTvH0GX51XDa/EIisWNELaopHk1Hcx97pZdm92gaICQfdgUHje65s
22
- oUDjyynUKE8dq+LAPEq5B1wj3e4BsIOFI5PSvPCMWI5jpbJWBXV6owex9qln22UA
23
- lDUEP3COb9/+r6bGNiCBUvUwyy7l/RdmRXPSOYnP8jPcwD/qSVypJYObM+2q68qc
24
- 5Eg0WqCVdFgpIXzBJPVcxeDJiN6EKmFWr5vJkMwIGz9hhIcitQjjSguk4r8CAwEA
25
- AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUN0C2lCCv
26
- /ATnmYSeZRQfUNpplfwwJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
14
+ YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjEwMjIyMDAwNzQ2WhcN
15
+ MjIwMjIyMDAwNzQ2WjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
16
+ cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQDqAwJO
17
+ ET6LAnOZB9q04zgLVFB0gJTcuLvfwjf7vpH9gCa5uqshSJnfi0owkeS2Hao0OwD+
18
+ vJrRRYbPfmXHAhEV8l9bSE6Ul1uTT9A+XS7g724sgOm5tCKFkLu+rcDy25MSjqpD
19
+ q+cPG4SN3ZUGK5eR9tp//dzdrjCV2wsOaoYKPajVY698p+sRf1zsHsSMxYnJPD/8
20
+ IkeNC+3VdsJFQ7wAoSk4hSpDuIi1xknA61/elDy5O07r1M25PJMntBE6QpJZblvw
21
+ v5u8U7+nK9P82KFfUwAjqkrhizt90M+0eK6dG44PnqafnxF84K2v7Qr1W6hKMIeL
22
+ DcqKPjAop+DO8WPCtfKFcFQKGRSe+H2rej3h34eGPH/GVoh8/h+ZuoDUfQRfoWLb
23
+ zUTM4uC1XwMlRjg8W4uYgV8SWZ5eii3tpmUz7moKQ4k9DSNNpGO+/bk/IcMVbRkb
24
+ xG/LzBAa6JQyAtVw9AMq8WHZKtJeNrdthoJSczsurthCHb0nY7VUQ/pp9JsCAwEA
25
+ AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUVn1uBRaL
26
+ Zh8+3WB077Lz84bokZowJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
27
27
  c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1hc3RlckBqYW1lc3JvYmVydHNvbi5ldTAN
28
- BgkqhkiG9w0BAQsFAAOCAYEAXrKEIca0q3x//SBOsv17jkonBSlzwVLIBuXKXX4R
29
- f0q3kJw7vWBPJfIVpAcGn6mbx2ziWw3XvG/SMuwIfzitz0pKTapvwbKZFPscqy82
30
- KcDxGtJt1cjDHzl0Bm+mgN/MgY+PAj3TcT/osuCc8iTu4+Ib6UxkmOP/uy2svBLs
31
- Su2XzGoqd5SKxPpj7IwMOBSVQWrqgWN2B+gdkN0CqjUPVMmiEKuMNjz9Idu36nG/
32
- QOPsTlpqBxR+yFbPEP0DlN5X8BRRAsraQZ+LPi7W/bU1fjkvIJxXuUkQD9dMF99+
33
- me+6s7PoCJ1yMmc5XfMFmv4WYswC+VqI1EeG0EneAsxRo8MmhWZcokqRE/KUk+ym
34
- NlDFqcKPZDaMsdO5dkYvEeguet/iG3XS1u7WKAcZgfhPdiIbue7cAhz9eQpNybe7
35
- 08cN0E9zjqKINgH/PsZTot+ohuVRLwn6WmHHhb18oUrxt3a0u4/3TNcWOcMeR0F2
36
- GeYL+mKGct5bfjn8IZnAJVKY
28
+ BgkqhkiG9w0BAQsFAAOCAYEAUzwCxgrA0YiJvXoi1ZHlhhz+ROzRn6XSQZZVd0Ym
29
+ gQVkUcvb/iiMnE0PZdivLiorRjhdR5tIPAYhmuN7Mr5IscQFdb1cndyC+qUzy6zP
30
+ HJGDGqqHDtiYpWlQ3/VAD4V+mAYj67CTj8gM2Y0OfjOIzKLf4jeLzcR0XjxCS9bH
31
+ g3cF/0FdJ5ydwo6r9QW/mE5yej6yuWBD9NFjWVbV/TAY3rWWDtw9g1WG31HjZVRB
32
+ lYndPAx0WIUBse5IRDGTiQ1JuMI5vBrxYJCb1Je506nR2rktACDRVSe/DTM4sxZn
33
+ oP3LBd1hPOAhNya8tD4FmUjQg4tvuWwIKh55XorZVEkzTWGgAJSnu7XTxtPcjxFA
34
+ U/3nmRr1BTYMN96T+3L81oqJTW5CxAAlsR97O7H8eZhwnNdG9HjgAk4PwiMLOhPb
35
+ Ely2/UitUG79uLcra+83gWVYzYiqBYC2d5HR4vCpTeecqYFXjWo9E3LMrvyB5Unk
36
+ zn10wjI+T1ysW7U6t+VJft8s
37
37
  -----END CERTIFICATE-----
38
- date: 2019-11-12 00:00:00.000000000 Z
38
+ date: 2021-02-22 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: dynarex
@@ -46,7 +46,7 @@ dependencies:
46
46
  version: '1.8'
47
47
  - - ">="
48
48
  - !ruby/object:Gem::Version
49
- version: 1.8.21
49
+ version: 1.8.25
50
50
  type: :runtime
51
51
  prerelease: false
52
52
  version_requirements: !ruby/object:Gem::Requirement
@@ -56,7 +56,27 @@ dependencies:
56
56
  version: '1.8'
57
57
  - - ">="
58
58
  - !ruby/object:Gem::Version
59
- version: 1.8.21
59
+ version: 1.8.25
60
+ - !ruby/object:Gem::Dependency
61
+ name: dxlite
62
+ requirement: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - "~>"
65
+ - !ruby/object:Gem::Version
66
+ version: '0.2'
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: 0.2.7
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - "~>"
75
+ - !ruby/object:Gem::Version
76
+ version: '0.2'
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: 0.2.7
60
80
  description:
61
81
  email: james@jamesrobertson.eu
62
82
  executables: []
@@ -83,7 +103,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
83
103
  - !ruby/object:Gem::Version
84
104
  version: '0'
85
105
  requirements: []
86
- rubygems_version: 3.0.3
106
+ rubyforge_project:
107
+ rubygems_version: 2.7.10
87
108
  signing_key:
88
109
  specification_version: 4
89
110
  summary: Experimental gem to search a list of words 1 character at a time. Intended
metadata.gz.sig CHANGED
Binary file