indexer101 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indexer101.rb +87 -9
- data.tar.gz.sig +0 -0
- metadata +11 -11
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c12f1a8d9fc5dcde5c95bd80e0116236f4d70b9b0f835899b8339d765f5771b8
|
4
|
+
data.tar.gz: d3aa8b7f9146bbe35a28496bc01654ffe6d638289771fde4fb62dd6aec371682
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ad823a882a9052f38de0acb790ee88b5dae6bd5bd19abbea22aa858a665f7d795d8cc373e4b4cc6ff0c34c0d552822484258c2203df0c25cb362f9d54ba56c78
|
7
|
+
data.tar.gz: d20146a83a706cac482fdfde143b99f4649e1b4f77e9bae9bd000120c4a82c533600777d762b75e3cefd9f36e011aba07cc13f0160b5fb599438cbca23296ce7
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indexer101.rb
CHANGED
@@ -5,24 +5,33 @@
|
|
5
5
|
require 'c32'
|
6
6
|
require 'thread'
|
7
7
|
require 'thwait'
|
8
|
+
require 'dynarex'
|
8
9
|
|
9
10
|
|
10
11
|
class Indexer101
|
11
12
|
using ColouredText
|
12
13
|
|
13
14
|
class Index
|
14
|
-
|
15
|
+
|
15
16
|
attr_reader :h
|
16
|
-
attr_accessor :index
|
17
|
+
attr_accessor :uri_index, :index
|
17
18
|
|
18
19
|
def initialize()
|
20
|
+
|
21
|
+
@uri_index = {} # contains each URI along with the title
|
22
|
+
@index = {} # contains each keyword
|
23
|
+
@h = {} # nested keywords constructed from shared string keys
|
24
|
+
|
19
25
|
end
|
20
26
|
|
21
27
|
def build(a)
|
22
28
|
|
23
29
|
threads = []
|
24
|
-
|
25
|
-
|
30
|
+
|
31
|
+
if @index.empty? then
|
32
|
+
threads << Thread.new do
|
33
|
+
@index = Hash[a.map(&:to_sym).zip([''] * a.length)]
|
34
|
+
end
|
26
35
|
end
|
27
36
|
|
28
37
|
threads << Thread.new { @h = group a }
|
@@ -70,14 +79,14 @@ class Indexer101
|
|
70
79
|
|
71
80
|
end
|
72
81
|
|
73
|
-
def build(a)
|
82
|
+
def build(a=@indexer.index.keys)
|
74
83
|
|
75
84
|
t = Time.now
|
76
85
|
@indexer.build(a)
|
77
86
|
t2 = Time.now - t
|
78
87
|
|
79
88
|
puts "%d words indexed".info % a.length
|
80
|
-
puts "index built in
|
89
|
+
puts ("index built in " + ("%.3f" % t2).brown + " seconds").info
|
81
90
|
|
82
91
|
self
|
83
92
|
end
|
@@ -97,7 +106,7 @@ class Indexer101
|
|
97
106
|
t2 = Time.now - t
|
98
107
|
|
99
108
|
puts "index contains %d words".info % @indexer.index.length
|
100
|
-
puts "index read in %.2f
|
109
|
+
puts "index read in " + ("%.2f" % t2).brown + " seconds".info
|
101
110
|
|
102
111
|
end
|
103
112
|
|
@@ -108,8 +117,45 @@ class Indexer101
|
|
108
117
|
end
|
109
118
|
|
110
119
|
end
|
120
|
+
|
121
|
+
def scan_dxindex(*locations)
|
122
|
+
|
123
|
+
t = Time.now
|
124
|
+
threads = locations.flatten.map do |location|
|
125
|
+
Thread.new {Thread.current[:v] = Dynarex.new location}
|
126
|
+
end
|
127
|
+
|
128
|
+
ThreadsWait.all_waits(*threads)
|
129
|
+
|
130
|
+
a = threads.map {|x| x[:v]}
|
131
|
+
t2 = Time.now - t
|
132
|
+
puts ("dxindex documents loaded in " + ("%.2f" % t2).brown \
|
133
|
+
+ " seconds").info
|
134
|
+
|
135
|
+
a.each.with_index do |dx, i|
|
136
|
+
|
137
|
+
@indexer.uri_index.merge! Hash[dx.all.reverse.map.with_index \
|
138
|
+
{|x,j| [(i+1)*10000 + (j+1), [x.title, x.url].join(' ')]}]
|
139
|
+
|
140
|
+
dx.all.reverse.each.with_index do |x,j|
|
141
|
+
x.title.scan(/#(\w+)/).flatten(1).each do |keyword|
|
142
|
+
@indexer.index[keyword.to_sym] ||= []
|
143
|
+
@indexer.index[keyword.to_sym] << (i+1)*10000 + (j+1)
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
147
|
+
end
|
148
|
+
|
149
|
+
end
|
150
|
+
|
151
|
+
def uri_index()
|
152
|
+
@indexer.uri_index
|
153
|
+
end
|
111
154
|
|
112
|
-
|
155
|
+
# enter a few starting characters and lookup will suggest a few keywords
|
156
|
+
# useful for an auto suggest feature
|
157
|
+
#
|
158
|
+
def lookup(s, limit: 10)
|
113
159
|
|
114
160
|
t = Time.now
|
115
161
|
a = scan_path s
|
@@ -124,7 +170,39 @@ class Indexer101
|
|
124
170
|
|
125
171
|
results = scan_leaves(r).sort_by(&:length).take(limit)
|
126
172
|
t2 = Time.now - t
|
127
|
-
puts "
|
173
|
+
puts ("lookup took " + ("%.3f" % t2).brown + " seconds").info
|
174
|
+
|
175
|
+
return results
|
176
|
+
|
177
|
+
end
|
178
|
+
|
179
|
+
# enter the exact keywords to search from the index
|
180
|
+
#
|
181
|
+
def search(*keywords)
|
182
|
+
|
183
|
+
t = Time.now
|
184
|
+
|
185
|
+
results = keywords.flatten(1).flat_map do |x|
|
186
|
+
|
187
|
+
a = []
|
188
|
+
a += @indexer.index[x.to_sym].reverse if @indexer.index.has_key? x.to_sym
|
189
|
+
|
190
|
+
if x.length > 3 then
|
191
|
+
a += @indexer.index.keys.reverse.grep(/^#{x}/).flat_map\
|
192
|
+
{|y| @indexer.index[y]}
|
193
|
+
a += @indexer.index.keys.reverse.grep(/#{x}/).flat_map\
|
194
|
+
{|y| @indexer.index[y]}
|
195
|
+
end
|
196
|
+
|
197
|
+
puts ('a: ' + a.inspect).debug if @debug
|
198
|
+
a.uniq.map {|y| @indexer.uri_index[y].split(/\s+(?=https?[^\s]+$)/,2) }
|
199
|
+
|
200
|
+
end
|
201
|
+
|
202
|
+
t2 = Time.now - t
|
203
|
+
puts ("found %s results" % results.length).info
|
204
|
+
puts ("search took " + ("%.3f" % t2).brown + " seconds").info
|
205
|
+
puts
|
128
206
|
|
129
207
|
return results
|
130
208
|
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indexer101
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,28 +35,28 @@ cert_chain:
|
|
35
35
|
08cN0E9zjqKINgH/PsZTot+ohuVRLwn6WmHHhb18oUrxt3a0u4/3TNcWOcMeR0F2
|
36
36
|
GeYL+mKGct5bfjn8IZnAJVKY
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2019-11-
|
38
|
+
date: 2019-11-12 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
|
-
name:
|
41
|
+
name: dynarex
|
42
42
|
requirement: !ruby/object:Gem::Requirement
|
43
43
|
requirements:
|
44
|
-
- - ">="
|
45
|
-
- !ruby/object:Gem::Version
|
46
|
-
version: 0.2.0
|
47
44
|
- - "~>"
|
48
45
|
- !ruby/object:Gem::Version
|
49
|
-
version: '
|
46
|
+
version: '1.8'
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 1.8.21
|
50
50
|
type: :runtime
|
51
51
|
prerelease: false
|
52
52
|
version_requirements: !ruby/object:Gem::Requirement
|
53
53
|
requirements:
|
54
|
-
- - ">="
|
55
|
-
- !ruby/object:Gem::Version
|
56
|
-
version: 0.2.0
|
57
54
|
- - "~>"
|
58
55
|
- !ruby/object:Gem::Version
|
59
|
-
version: '
|
56
|
+
version: '1.8'
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: 1.8.21
|
60
60
|
description:
|
61
61
|
email: james@jamesrobertson.eu
|
62
62
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|