indexer101 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indexer101.rb +87 -9
- data.tar.gz.sig +0 -0
- metadata +11 -11
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c12f1a8d9fc5dcde5c95bd80e0116236f4d70b9b0f835899b8339d765f5771b8
|
4
|
+
data.tar.gz: d3aa8b7f9146bbe35a28496bc01654ffe6d638289771fde4fb62dd6aec371682
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ad823a882a9052f38de0acb790ee88b5dae6bd5bd19abbea22aa858a665f7d795d8cc373e4b4cc6ff0c34c0d552822484258c2203df0c25cb362f9d54ba56c78
|
7
|
+
data.tar.gz: d20146a83a706cac482fdfde143b99f4649e1b4f77e9bae9bd000120c4a82c533600777d762b75e3cefd9f36e011aba07cc13f0160b5fb599438cbca23296ce7
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indexer101.rb
CHANGED
@@ -5,24 +5,33 @@
|
|
5
5
|
require 'c32'
|
6
6
|
require 'thread'
|
7
7
|
require 'thwait'
|
8
|
+
require 'dynarex'
|
8
9
|
|
9
10
|
|
10
11
|
class Indexer101
|
11
12
|
using ColouredText
|
12
13
|
|
13
14
|
class Index
|
14
|
-
|
15
|
+
|
15
16
|
attr_reader :h
|
16
|
-
attr_accessor :index
|
17
|
+
attr_accessor :uri_index, :index
|
17
18
|
|
18
19
|
def initialize()
|
20
|
+
|
21
|
+
@uri_index = {} # contains each URI along with the title
|
22
|
+
@index = {} # contains each keyword
|
23
|
+
@h = {} # nested keywords constructed from shared string keys
|
24
|
+
|
19
25
|
end
|
20
26
|
|
21
27
|
def build(a)
|
22
28
|
|
23
29
|
threads = []
|
24
|
-
|
25
|
-
|
30
|
+
|
31
|
+
if @index.empty? then
|
32
|
+
threads << Thread.new do
|
33
|
+
@index = Hash[a.map(&:to_sym).zip([''] * a.length)]
|
34
|
+
end
|
26
35
|
end
|
27
36
|
|
28
37
|
threads << Thread.new { @h = group a }
|
@@ -70,14 +79,14 @@ class Indexer101
|
|
70
79
|
|
71
80
|
end
|
72
81
|
|
73
|
-
def build(a)
|
82
|
+
def build(a=@indexer.index.keys)
|
74
83
|
|
75
84
|
t = Time.now
|
76
85
|
@indexer.build(a)
|
77
86
|
t2 = Time.now - t
|
78
87
|
|
79
88
|
puts "%d words indexed".info % a.length
|
80
|
-
puts "index built in
|
89
|
+
puts ("index built in " + ("%.3f" % t2).brown + " seconds").info
|
81
90
|
|
82
91
|
self
|
83
92
|
end
|
@@ -97,7 +106,7 @@ class Indexer101
|
|
97
106
|
t2 = Time.now - t
|
98
107
|
|
99
108
|
puts "index contains %d words".info % @indexer.index.length
|
100
|
-
puts "index read in %.2f
|
109
|
+
puts "index read in " + ("%.2f" % t2).brown + " seconds".info
|
101
110
|
|
102
111
|
end
|
103
112
|
|
@@ -108,8 +117,45 @@ class Indexer101
|
|
108
117
|
end
|
109
118
|
|
110
119
|
end
|
120
|
+
|
121
|
+
def scan_dxindex(*locations)
|
122
|
+
|
123
|
+
t = Time.now
|
124
|
+
threads = locations.flatten.map do |location|
|
125
|
+
Thread.new {Thread.current[:v] = Dynarex.new location}
|
126
|
+
end
|
127
|
+
|
128
|
+
ThreadsWait.all_waits(*threads)
|
129
|
+
|
130
|
+
a = threads.map {|x| x[:v]}
|
131
|
+
t2 = Time.now - t
|
132
|
+
puts ("dxindex documents loaded in " + ("%.2f" % t2).brown \
|
133
|
+
+ " seconds").info
|
134
|
+
|
135
|
+
a.each.with_index do |dx, i|
|
136
|
+
|
137
|
+
@indexer.uri_index.merge! Hash[dx.all.reverse.map.with_index \
|
138
|
+
{|x,j| [(i+1)*10000 + (j+1), [x.title, x.url].join(' ')]}]
|
139
|
+
|
140
|
+
dx.all.reverse.each.with_index do |x,j|
|
141
|
+
x.title.scan(/#(\w+)/).flatten(1).each do |keyword|
|
142
|
+
@indexer.index[keyword.to_sym] ||= []
|
143
|
+
@indexer.index[keyword.to_sym] << (i+1)*10000 + (j+1)
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
147
|
+
end
|
148
|
+
|
149
|
+
end
|
150
|
+
|
151
|
+
def uri_index()
|
152
|
+
@indexer.uri_index
|
153
|
+
end
|
111
154
|
|
112
|
-
|
155
|
+
# enter a few starting characters and lookup will suggest a few keywords
|
156
|
+
# useful for an auto suggest feature
|
157
|
+
#
|
158
|
+
def lookup(s, limit: 10)
|
113
159
|
|
114
160
|
t = Time.now
|
115
161
|
a = scan_path s
|
@@ -124,7 +170,39 @@ class Indexer101
|
|
124
170
|
|
125
171
|
results = scan_leaves(r).sort_by(&:length).take(limit)
|
126
172
|
t2 = Time.now - t
|
127
|
-
puts "
|
173
|
+
puts ("lookup took " + ("%.3f" % t2).brown + " seconds").info
|
174
|
+
|
175
|
+
return results
|
176
|
+
|
177
|
+
end
|
178
|
+
|
179
|
+
# enter the exact keywords to search from the index
|
180
|
+
#
|
181
|
+
def search(*keywords)
|
182
|
+
|
183
|
+
t = Time.now
|
184
|
+
|
185
|
+
results = keywords.flatten(1).flat_map do |x|
|
186
|
+
|
187
|
+
a = []
|
188
|
+
a += @indexer.index[x.to_sym].reverse if @indexer.index.has_key? x.to_sym
|
189
|
+
|
190
|
+
if x.length > 3 then
|
191
|
+
a += @indexer.index.keys.reverse.grep(/^#{x}/).flat_map\
|
192
|
+
{|y| @indexer.index[y]}
|
193
|
+
a += @indexer.index.keys.reverse.grep(/#{x}/).flat_map\
|
194
|
+
{|y| @indexer.index[y]}
|
195
|
+
end
|
196
|
+
|
197
|
+
puts ('a: ' + a.inspect).debug if @debug
|
198
|
+
a.uniq.map {|y| @indexer.uri_index[y].split(/\s+(?=https?[^\s]+$)/,2) }
|
199
|
+
|
200
|
+
end
|
201
|
+
|
202
|
+
t2 = Time.now - t
|
203
|
+
puts ("found %s results" % results.length).info
|
204
|
+
puts ("search took " + ("%.3f" % t2).brown + " seconds").info
|
205
|
+
puts
|
128
206
|
|
129
207
|
return results
|
130
208
|
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indexer101
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,28 +35,28 @@ cert_chain:
|
|
35
35
|
08cN0E9zjqKINgH/PsZTot+ohuVRLwn6WmHHhb18oUrxt3a0u4/3TNcWOcMeR0F2
|
36
36
|
GeYL+mKGct5bfjn8IZnAJVKY
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2019-11-
|
38
|
+
date: 2019-11-12 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
|
-
name:
|
41
|
+
name: dynarex
|
42
42
|
requirement: !ruby/object:Gem::Requirement
|
43
43
|
requirements:
|
44
|
-
- - ">="
|
45
|
-
- !ruby/object:Gem::Version
|
46
|
-
version: 0.2.0
|
47
44
|
- - "~>"
|
48
45
|
- !ruby/object:Gem::Version
|
49
|
-
version: '
|
46
|
+
version: '1.8'
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 1.8.21
|
50
50
|
type: :runtime
|
51
51
|
prerelease: false
|
52
52
|
version_requirements: !ruby/object:Gem::Requirement
|
53
53
|
requirements:
|
54
|
-
- - ">="
|
55
|
-
- !ruby/object:Gem::Version
|
56
|
-
version: 0.2.0
|
57
54
|
- - "~>"
|
58
55
|
- !ruby/object:Gem::Version
|
59
|
-
version: '
|
56
|
+
version: '1.8'
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: 1.8.21
|
60
60
|
description:
|
61
61
|
email: james@jamesrobertson.eu
|
62
62
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|