lumix 0.0.2-java
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +18 -0
- data/bin/lumix +4 -0
- data/bin/lumix-gui +4 -0
- data/lib/lumix/base.rb +56 -0
- data/lib/lumix/charset.rb +35 -0
- data/lib/lumix/cli.rb +96 -0
- data/lib/lumix/concordancer.rb +254 -0
- data/lib/lumix/corrections.rb +84 -0
- data/lib/lumix/fast_search.rb +91 -0
- data/lib/lumix/filter.rb +89 -0
- data/lib/lumix/gui.rb +148 -0
- data/lib/lumix/lookup.rb +105 -0
- data/lib/lumix/lookup_filter.rb +43 -0
- data/lib/lumix/lookup_search.rb +95 -0
- data/lib/lumix/main.rb +7 -0
- data/lib/lumix/model/base_models.rb +35 -0
- data/lib/lumix/model/maglev_models.rb +42 -0
- data/lib/lumix/model/mock_models.rb +46 -0
- data/lib/lumix/model/sequel_models.rb +53 -0
- data/lib/lumix/proto/lookup.rb +105 -0
- data/lib/lumix/proto/lookup_filter.rb +40 -0
- data/lib/lumix/proto/lookup_search.rb +81 -0
- data/lib/lumix/result_view.rb +93 -0
- data/lib/lumix/schema/001_create_tables.rb +35 -0
- data/lib/lumix/schema/002_categories.rb +28 -0
- data/lib/lumix/schema/003_add_fulltagged.rb +15 -0
- data/lib/lumix/schema/004_create_lookup_tables.rb +44 -0
- data/lib/lumix/slow_search.rb +104 -0
- data/lib/lumix/text_snippet.rb +29 -0
- data/lib/lumix/textprocessing.rb +108 -0
- data/lib/lumix/thread_pool.rb +127 -0
- data/spec/filter_spec.rb +55 -0
- data/spec/lookup_spec.rb +70 -0
- data/spec/text_snippet_spec.rb +55 -0
- metadata +175 -0
data/COPYING
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
Copyright (c) 2010 Michael Klaus
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to
|
5
|
+
deal in the Software without restriction, including without limitation the
|
6
|
+
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
7
|
+
sell copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
16
|
+
THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
17
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
18
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/bin/lumix
ADDED
data/bin/lumix-gui
ADDED
data/lib/lumix/base.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
|
3
|
+
# Shared entry points for the lumix front ends. The methods are meant to be
# mixed into a host object (the CLI does `include Lumix` at the top level).
#
# NOTE(review): #create_concordancer calls `progress_proc`, which this module
# does not define; the including context has to provide it.
module Lumix
  # Captions printed for the progress tasks (:search, :read, :link).
  Texts = {:search => "Searching...", :read => "Importing files", :link => "Linking texts"}

  # Configuration file name, resolved against the current working directory.
  CONF = 'config.yaml'
  ConfigStruct = Struct.new(:database_uri)

  # Evaluated once, when this file is loaded: reads CONF if it exists,
  # otherwise writes a default configuration to CONF and uses that.
  CConfig = if File.exist?(CONF)
    # CONF holds a serialized ConfigStruct. Where YAML.load_file performs a
    # safe load it refuses such objects, so use the explicit unsafe loader
    # when the YAML library offers one. The file is produced by this program;
    # do not point CONF at untrusted input.
    if YAML.respond_to?(:unsafe_load_file)
      YAML.unsafe_load_file(CONF)
    else
      YAML.load_file(CONF)
    end
  else
    conf = ConfigStruct.new('jdbc:postgresql://localhost:5433/concordancer?user=concordancer&password=concordancer')
    File.open(CONF, 'w') do |f|
      f.write(conf.to_yaml)
    end
    conf
  end

  # The lazily created, memoized Concordancer.
  def conc
    @conc ||= create_concordancer
  end

  # Sets the text processor's language and reads the given paths.
  # `path` is handed to Concordancer#read as a single array argument.
  def import_files(lang, *path)
    conc.tp.lang = lang
    conc.read(path)
  end

  # Delegates to Concordancer#link!.
  def relink
    conc.link!
  end

  # Switches the concordancer to simulation (Concordancer#simulate!) and
  # then runs link!.
  def simulate_link
    conc.simulate!
    conc.link!
  end

  # Delegates to Concordancer#link.
  def link
    conc.link
  end

  # Replaces the memoized concordancer with a freshly created one.
  # `opts` is passed through to #create_concordancer.
  def reconnect(opts = {})
    @conc = create_concordancer(opts)
  end

  # Delegates to Concordancer#correct for the given ids.
  def correct(*ids)
    conc.correct(*ids)
  end

  # Derives a file name from a filter string: every whitespace run becomes
  # "_", and dots and double quotes are removed.
  def to_filename(filter)
    filter.gsub(/\s+/, "_").gsub(/[\.\"]/, '')
  end

  # Builds a Concordancer for the configured database URI. `opts` is merged
  # with :progress_proc (taken from the host's `progress_proc`).
  def create_concordancer(opts = {})
    Concordancer.new(CConfig.database_uri, opts.merge(:progress_proc => progress_proc))
  end
end
|
56
|
+
require 'lumix/concordancer'
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'ffi-icu'
|
2
|
+
require 'iconv'
|
3
|
+
require 'htmlentities'
|
4
|
+
|
5
|
+
# Charset helpers added to String: conversion to UTF-8 with a fallback to
# charset detection, followed by HTML entity decoding.
class String

  # Raised by #to_utf when neither the default charset nor any detected
  # charset converts. It derives from StandardError so that a plain `rescue`
  # in calling code catches it.
  NoMatchFound = Class.new(StandardError)

  # Returns the string converted to UTF-8 with HTML entities decoded.
  #
  # default - charset name tried first; detection is only consulted when
  #           converting from it fails.
  #
  # Raises NoMatchFound when no conversion succeeds.
  #
  # No state is stored on the receiver, so this also works on frozen strings.
  def to_utf(default = 'utf-8')
    result = icu_return(default) || find_icu
    raise NoMatchFound unless result

    HTMLEntities.new.decode(result)
  end

  # Asks the ICU charset detector for candidate charsets and returns the
  # first successful conversion, or nil when none of them converts.
  def find_icu
    ICU::CharDet::Detector.new.detect_all(self).each do |match|
      converted = icu_return(match.name)
      return converted if converted
    end
    nil
  end

  # Converts the string from charset `cs` to UTF-8.
  # Best effort: any StandardError raised by the conversion yields nil.
  def icu_return(cs)
    Iconv.conv('UTF-8', cs, self)
  rescue StandardError
    nil
  end

end
|
data/lib/lumix/cli.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
require 'lumix/base'
|
2
|
+
|
3
|
+
include Lumix
|
4
|
+
|
5
|
+
# Prints the three usage lines and terminates the process.
def help
  usage = [
    "lumix-cli import <en|ro> <path>",
    "lumix-cli [search] 'search string' ...",
    "lumix-cli relink"
  ]
  usage.each { |line| puts line }
  exit
end
|
11
|
+
|
12
|
+
# Runs one search per filter string and writes the hits of each filter to
# its own findings file (see create_findings_file). Filters whose findings
# file cannot be created are skipped. Prints a match count per filter and
# closes every opened file, even when the search raises.
def search(*filters)
  files = []
  active = []

  filters.each do |pattern|
    out = create_findings_file(pattern)
    next unless out

    files << out
    filter = conc.create_filter(pattern) do |text, tagged|
      out.puts "#{text.name}: #{text.left} | #{tagged.to_s} | #{text.right}"
    end
    active << filter unless filter.nil?
  end

  conc.find(active) unless active.empty?

  active.each do |f|
    puts "Found #{f.results == 0 ? 'no' : f.results} matches for #{f.filter}"
  end
ensure
  files.each { |handle| handle.close }
end
|
32
|
+
|
33
|
+
# Opens the findings file for a filter in write mode.
#
# filter   - the filter string; only used to derive the default file name.
# filename - target path (defaults to to_filename(filter)).
# block    - optional; passed on to File.open.
#
# Returns what File.open returns (the open File when no block is given), or
# nil after printing a notice when the file already exists. Existing files
# are never overwritten.
def create_findings_file(filter, filename = to_filename(filter), &block)
  # File.exist? replaces File.exists?, which newer Rubies no longer provide.
  if File.exist?(filename)
    puts "File #{filename} already exists! Ignoring."
  else
    File.open(filename, 'w', &block)
  end
end
|
40
|
+
|
41
|
+
# Tags a single file and prints the processed text to stdout.
# NOTE: this file defines `tag(lang, *files)` again further down; that later
# definition replaces this one when the script is loaded.
def tag(lang, file)
  processor = conc.tp
  processor.lang = lang
  puts processor.process(File.read(file))
end
|
45
|
+
|
46
|
+
# Imports the given files with link-on-import switched on for the
# concordancer.
def import!(lang, *files)
  concordancer = conc
  concordancer.link_on_import!
  import_files(*([lang] + files))
end
|
50
|
+
|
51
|
+
# Tags every file in the expanded file list on a pool of 10 workers. A file
# is skipped when its tagged counterpart already exists.
# NOTE: this definition replaces the earlier `tag(lang, file)` in this file.
def tag(lang, *files)
  pool = Pool.new(10) # named `pool` so Kernel#p is not shadowed
  conc.tp.lang = lang
  conc.tp.to_filelist(files).each do |file|
    pool.schedule do
      tagged = conc.tp.create_tagged_filename(file)
      # File.exist? replaces File.exists?, which newer Rubies no longer provide.
      conc.tp.process_file(file, tagged) unless File.exist?(tagged)
    end
  end
  pool.shutdown
end
|
62
|
+
|
63
|
+
private
|
64
|
+
# Builds the console progress callback.
#
# The returned proc takes a progress object responding to task, done and
# work. It prints the task caption (Texts[task], or the task itself) whenever
# the task changes, "Done" once done == work, and otherwise one dot for every
# two percent completed.
def progress_proc
  task = nil
  percent = 0
  proc do |p|
    if !task || p.task != task
      task = p.task
      percent = 0
      puts Texts[task] || task
    end
    if p.done == p.work
      puts "Done"
    else
      new_percent = (100 * p.done / p.work).to_i
      if new_percent > percent
        # Count dots by absolute position. Dividing the difference by two
        # rounds every 1% step down to zero dots while the remembered
        # percentage still advances, so fine-grained progress printed nothing.
        print "." * (new_percent / 2 - percent / 2)
        percent = new_percent
      end
    end
  end
end
|
84
|
+
|
85
|
+
|
86
|
+
# Command dispatch: the first argument selects the command, the rest are
# passed to it.
cmd, *args = ARGV
# Without arguments a built-in demo search runs (the `help` call is disabled).
cmd, *args = 'search', 'N "de" N' unless cmd

c = cmd.downcase.to_sym
# The usage text advertises `import`; the method implementing it is import!.
c = :import! if c == :import

# Only methods defined by this program are dispatchable: the CLI commands
# above plus the public methods of the Lumix module. This keeps arbitrary
# Kernel methods from being invoked through `send` with user input.
cli_commands = [:help, :search, :tag, :import!]

if cmd =~ /\A-{1,2}help\z/i
  c = :help
elsif !(cli_commands.include?(c) || Lumix.public_method_defined?(c))
  # "[search]" is optional in the usage text: an unknown first word is itself
  # the first search string, so it goes back onto the argument list.
  args.unshift(cmd)
  c = :search
end

send c, *args
|
@@ -0,0 +1,254 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
|
3
|
+
# TODO take care of 's problem
|
4
|
+
# TODO remove Word count line
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
require 'digest/md5'
|
8
|
+
require 'sequel'
|
9
|
+
require 'sequel/extensions/migration'
|
10
|
+
|
11
|
+
require 'lumix/model/sequel_models'
|
12
|
+
|
13
|
+
require 'lumix/thread_pool'
|
14
|
+
require 'lumix/textprocessing'
|
15
|
+
require 'lumix/lookup_search'
|
16
|
+
#require 'lumix/fast_search'
|
17
|
+
|
18
|
+
module Lumix
  # Size of the import worker pool; override with the LUMIX_WORKERS variable.
  WORKERS = (ENV['LUMIX_WORKERS'] || 20).to_i
  # When this variable is set, #read schedules a link! before importing.
  RELINK = ENV['LUMIX_RELINK']

  # Target version passed to the migrator.
  DB_VERSION = 4

  class ::String
    # Hex MD5 digest of the string's current content. It is computed on every
    # call: a value cached on the string would go stale after a mutation and
    # could not be stored on a frozen string at all.
    def digest
      Digest::MD5.hexdigest(self)
    end
  end

  # Progress report handed to the progress callback.
  Progress = Struct.new(:task, :work, :data, :done)

  # Imports texts into the database, links them through the configured
  # search strategy and runs searches.
  class Concordancer

    attr_reader :db, :tp
    attr_accessor :progress_proc
    attr_writer :link_on_import

    # db_uri  - connection URI handed to Sequel.connect.
    # options - :progress_proc => callable receiving Progress objects;
    #           :recreate      => when truthy, drops all tables and migrates
    #                             again.
    def initialize(db_uri, options = {})
      @progress_proc = options[:progress_proc]
      @db = connect(db_uri)
      if options[:recreate]
        db.tables.each { |t| db.drop_table t }
        migrate(db)
      end

      @ids = all
      @tp = TextProcessing.new
    end

    # The lazily created search strategy.
    def strategy
      @strategy ||= SearchStrategy.new(@db, @progress_proc)
    end

    # Pool used for linking: 4 workers when the strategy reports that it can
    # link concurrently, otherwise 1.
    def create_link_pool
      Pool.new(strategy.concurrent_link? ? 4 : 1)
    end

    def link_on_import?
      @link_on_import
    end

    def link_on_import!
      @link_on_import = true
    end

    # Returns the id of the stored text whose digest matches the content of
    # `file`, or nil when there is none.
    def get_id(file)
      text = File.read(file).to_utf
      saved = TaggedText[:digest => text.digest]
      saved ? saved.id : nil
    end

    # Imports the given files on a pool of WORKERS workers.
    #
    # Files listed in 'unprocessed.lst' are skipped. A file whose import
    # raises is reported and appended to that list. With link_on_import set,
    # each newly stored text is linked as well.
    #
    # NOTE(review): @unprocessed, @ids and the list file are touched from the
    # worker blocks; whether Pool serializes them is not visible here.
    def read(*files)
      files = tp.to_filelist(*files)
      prog = Progress.new(:read, files.size)
      puts "Reading #{files.size} files"
      @unprocessed = if File.exist?('unprocessed.lst')
        File.readlines('unprocessed.lst').map(&:chomp)
      else
        []
      end

      File.open('unprocessed.lst', 'a') do |up|
        link_pool = create_link_pool
        read_pool = Pool.new(WORKERS)

        link_pool.schedule { link! } if RELINK

        files.each_with_index do |file, index|
          if @unprocessed.member?(file)
            puts "Ignoring #{file}"
            next
          end
          read_pool.schedule do
            begin
              # read_file returns the stored record (or nil); linking works
              # on ids, so the record's id is passed, not the record.
              record = read_file(file)
              link_pool.schedule { link record.id } if record && link_on_import?
            rescue => e
              puts "Error on file #{file}: #{e}", e.backtrace
              @unprocessed << file
              up.puts file
            end
            progress(prog, index + 1)
          end
        end
        link_pool.schedule { link } if link_on_import? # make sure everything is linked
        read_pool.shutdown
        link_pool.shutdown
      end
    end

    # Stores `file` unless a text with the same file name and digest already
    # exists. The tagged version is read from the tagged file when that file
    # exists; otherwise it is produced by the text processor and written
    # there.
    #
    # Yields the new record when a block is given.
    # Returns the new record, or nil when the text was already stored.
    def read_file(file)
      text = File.read(file).to_utf
      digest = text.digest
      return nil if TaggedText.exists?(:filename => file, :digest => digest)

      puts "Reading file #{file}"
      # retrieve the tagged version
      tagged_file = tp.create_tagged_filename(file)
      tagged = if File.exist?(tagged_file)
        File.read(tagged_file)
      else
        fresh = tp.process(text)
        File.open(tagged_file, 'w') do |out|
          out.write fresh
        end
        fresh
      end

      retagged = retag(tagged)
      tt = TaggedText.create(:digest => digest, :text => text, :tagged => retagged, :filename => file, :tagged_filename => tagged_file)
      @ids << tt.id
      yield tt if block_given?
      tt
    end

    # Re-reads the source file of each given text (all texts when no ids are
    # given), stores the fresh text and updates the digest when it differs.
    # Ids without a stored text are skipped.
    def correct(*ids)
      ids = all if ids.empty?
      ids.flatten.each do |id|
        d = TaggedText[id.to_i]
        next unless d

        file = d.filename

        text = File.read(file).to_utf
        d.text = text

        expected = text.digest
        if d.digest != expected
          puts "Correcting text #{file}"
          d.digest = expected
        end
        d.save
      end
    end

    # Ids of all stored texts.
    def all
      TaggedText.ids
    end

    def simulate!
      strategy.simulate!
    end

    # Calls #link with a block that deletes the dataset it is given.
    # NOTE: #link does not currently yield (the code doing so is commented
    # out), so at present this behaves like #link.
    def link!(*ids)
      link(*ids) do |ds|
        ds.delete
      end
    end

    # Links the given texts (all texts when no ids are given) through the
    # search strategy on a link pool, reporting progress per text.
    def link(*ids)
      ids = all if ids.empty?
      ids = ids.flatten
      prog = Progress.new(:link, ids.size)
      progress(prog)

      pool = create_link_pool
      ids.each_with_index do |id, index|
        #ds = db[:assoc].filter(:text_id => id)
        #yield ds if block_given?

        # TODO implement force
        pool.schedule do
          strategy.link_text(id) #if ds.empty?
          progress(prog, index + 1)
        end
      end
      pool.shutdown
    end

    def create_filter(f, &block)
      strategy.create_filter(f, &block)
    end

    def find(filters)
      strategy.find(filters)
    end

    private

    # Connects to db_uri and probes the connection. When the probe fails it
    # falls back to a local sqlite database. The chosen database is migrated
    # and assigned to TaggedText.
    #
    # Returns the database in use.
    def connect(db_uri)
      db = Sequel.connect(db_uri)
      begin
        db.get(1)
      rescue SignalException, SystemExit, NoMemoryError
        # Never treat an interrupt, an exit request or memory exhaustion as a
        # database problem.
        raise
      rescue Exception => e
        # Deliberately broad: any failure of the probe triggers the fallback.
        puts 'Falling back to sqlite'
        puts e
        db = Sequel.connect('jdbc:sqlite://concordancer.db')
      end
      migrate(db)
      TaggedText.db = db
    end

    # Applies the migrations in the 'schema' directory next to this file up
    # to DB_VERSION.
    def migrate(db)
      migration_path = File.join(File.dirname(__FILE__), 'schema')
      Sequel::Migrator.apply(db, migration_path, DB_VERSION)
    end

    # Updates `prog` and hands it to the progress callback, if one is set.
    def progress(prog, done = 0, data = prog.data)
      if progress_proc
        prog.done = done
        prog.data = data
        progress_proc.call(prog)
      end
    end

    # Reduces four-field tokens to "word|tag" tokens joined by single spaces.
    #
    # Tokens are separated by spaces or newlines and their fields by "|".
    # When the first token does not have exactly four fields the text is
    # taken to be retagged already and is returned unchanged. If the third
    # and fourth field of the first token contain digits, the token is taken
    # to be in the fulltagged layout and the tag is field 1; otherwise the
    # tag is field 2.
    def retag(text)
      chunks = text.split(/[ \n]/).reject { |chunk| chunk.empty? }
      return text if chunks.empty?

      token = chunks.first.split(/\|/)
      return text if token.size != 4 # looks pre-retagged

      # looks like fulltagged when both trailing fields are numeric
      tag_position = (token[2] =~ /\d+/ && token[3] =~ /\d+/) ? 1 : 2

      chunks.map do |chunk|
        fields = chunk.split(/\|/)
        "#{fields[0]}|#{fields[tag_position]}"
      end.join(' ')
    end

  end

end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'lumix/charset'
|
2
|
+
|
3
|
+
CORRECTIONS = <<-TXT
|
4
|
+
catre | S
|
5
|
+
fetite | NPRN
|
6
|
+
in | S
|
7
|
+
si | C
|
8
|
+
circa | R
|
9
|
+
fata de| S
|
10
|
+
maxima | ASON
|
11
|
+
inainte| R
|
12
|
+
in materie de | R
|
13
|
+
tin | V3
|
14
|
+
beneficiaza | V3
|
15
|
+
: | COLON
|
16
|
+
ocupa | VN
|
17
|
+
asigurata | VPSF
|
18
|
+
mine | PPSA
|
19
|
+
batut | VPSM
|
20
|
+
insa | C
|
21
|
+
impotriva | S
|
22
|
+
americana | ASN
|
23
|
+
caruia | R
|
24
|
+
da | VN
|
25
|
+
duce| VN
|
26
|
+
primeasca | V3
|
27
|
+
daca | C
|
28
|
+
bulgara | ASN
|
29
|
+
ramina | V3
|
30
|
+
albaneza | ASN
|
31
|
+
pina | S
|
32
|
+
paraseasca | V3
|
33
|
+
publica | ASN
|
34
|
+
inceapa | V3
|
35
|
+
ecologic | ASN
|
36
|
+
internationala | ASN
|
37
|
+
ecologista | ASN
|
38
|
+
cada | V3
|
39
|
+
linga | S
|
40
|
+
adevaratele | APRY
|
41
|
+
citiva | PI
|
42
|
+
americana | ASN
|
43
|
+
Miclici| NP
|
44
|
+
fara | S
|
45
|
+
cit | PI
|
46
|
+
sugereaza | V3
|
47
|
+
incasa | VN
|
48
|
+
circa | R
|
49
|
+
ghiceste | V3
|
50
|
+
tarile |NPRY
|
51
|
+
araba | ASN
|
52
|
+
citeva | PI
|
53
|
+
schimbindu | VG
|
54
|
+
dupa | S
|
55
|
+
uleiurilor_vegetale | NPOY
|
56
|
+
botosaneana | ASN
|
57
|
+
oricarui | PI
|
58
|
+
TXT
|
59
|
+
|
60
|
+
# Parses CORRECTIONS into [pattern, replacement] pairs and memoizes them.
#
# Each non-blank line has the form "word | TAG". The pattern matches that
# word followed by "|" and any tag; the replacement is "word|TAG". Every
# rule is announced on stdout the first time the list is built.
def corrections
  @corrections ||= CORRECTIONS.split(/\n/).map do |line|
    word, tag = line.split(/\|/).map(&:strip)
    # Blank or incomplete lines carry no rule.
    next if word.nil? || word.empty? || tag.nil?

    puts "Tagging #{word} as #{tag}"
    # \b only marks a token start when the word begins with a word
    # character. For punctuation such as ":" it would instead demand a
    # preceding word character, so anchor on "not preceded by a
    # non-space" there.
    boundary = word =~ /\A\w/ ? '\b' : '(?<!\S)'
    # The word is matched literally: escape any regex metacharacters.
    [/#{boundary}#{Regexp.escape(word)}\|\S+/, "#{word}|#{tag}"]
  end.compact
end
|
67
|
+
|
68
|
+
# Applies every correction rule, in order, to the tagged text `t` and
# returns the corrected text.
def correct(t)
  fixed = t
  corrections.each do |pattern, replacement|
    fixed = fixed.gsub(pattern, replacement)
  end
  fixed
end
|
73
|
+
|
74
|
+
# Rewrites, in place, every entry under `path` whose name contains "tagged"
# with its corrected content.
def correct_all(path)
  pattern = File.join(path, '*tagged*')
  Dir.glob(pattern).each do |name|
    fixed = correct(File.read(name))
    File.open(name, 'w') { |out| out.print fixed }
  end
end
|
81
|
+
|
82
|
+
if $0 == __FILE__
|
83
|
+
correct_all ARGV[0]
|
84
|
+
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
require 'lumix/filter'
|
2
|
+
require 'lumix/text_snippet'
|
3
|
+
|
4
|
+
module Lumix

  # Search strategy that works on "fulltagged" text: tagged tokens annotated
  # with the character offsets of the matching word in the source text.
  class FastSearch

    TAGGED = /([^\s\|]+)\|(\S+)/m                         # Xxx|YYY
    ORIG = /([^\|\s]*)\|([^\|\s]*)\|([^\|\s]*)\|(\S*)/    # X|Y|Z|W

    # db       - database handle (stored only).
    # progress - callable invoked with Progress objects, or nil.
    def initialize(db, progress)
      @db = db
      @progress = progress
    end

    def concurrent_link?
      true
    end

    # Builds and stores the fulltagged form of text `id`.
    #
    # Every "word|tag" token of the tagged text is located in the source
    # text, searching forward from the end of the previous match, and emitted
    # as "word|tag|begin|end".
    #
    # Returns the fulltagged string. An already stored one is returned as is.
    # Returns nil when a word cannot be found; that case is reported on
    # STDERR and appended to 'unlinked.lst', and nothing is saved.
    def link_text(id)
      ds = TaggedText[id]
      return ds.fulltagged if ds.fulltagged
      file, text, tagged = ds.filename, ds.text, ds.tagged

      puts "Linking text #{file}"

      txt_pos = 0
      linked = ''
      tagged.scan(TAGGED) do |word, tag|
        tagged_begin = Regexp.last_match.begin(0)

        # expand "x_y_z" notation to "x y z"
        word_re = Regexp.new(Regexp.escape(word).gsub(/_/, '\s*'))
        # Search from txt_pos without copying the rest of the text; the
        # match offsets are then absolute already.
        src_match = text.match(word_re, txt_pos)
        unless src_match
          STDERR.puts "Could not find match for '#{word}' in text #{file}"
          # Clamp the start: a negative index would count from the end.
          context_start = [txt_pos - 10, 0].max
          STDERR.puts text[context_start..(txt_pos + word.size + 10)]
          # Append directly instead of going through a shell: file names and
          # words may contain quotes or other shell metacharacters.
          File.open('unlinked.lst', 'a') do |log|
            log.puts "#{file}:#{txt_pos}:#{tagged_begin} unmatched \"#{word}\""
          end
          return nil
        end

        src_begin = src_match.begin(0)
        src_end = src_match.end(0)
        txt_pos = src_end

        linked << ' ' unless linked.empty?
        linked << word << '|' << tag << '|' << src_begin.to_s << '|' << src_end.to_s
      end
      unless linked.empty?
        ds.fulltagged = linked
        ds.save
      end
      return linked
    rescue => e # TODO remove this crap
      STDERR.puts e
      STDERR.puts e.backtrace
      raise
    end

    # Creates a filter whose tokens carry the "|begin|end" offset suffix.
    def create_filter(f, &block)
      Lumix::Filter.new('\|(\d+)\|(\d+)', f, &block)
    end

    # Scans the fulltagged form of every text with each filter. For every
    # hit a snippet of the source text and one of the fulltagged text are
    # appended to the filter. Progress is reported after each text.
    def find(filters)
      prog = Progress.new(:search, TaggedText.count, "", 0)
      @progress[prog] if @progress

      TaggedText.each_with_index do |t, i|
        # matches to ranges
        filters.each do |f|
          f.scan(t.fulltagged) do |hit, t_begin, t_end, m|
            s_begin = m.captures.first.to_i
            s_end = m.captures.last.to_i

            fname = File.basename(t.filename)
            tagged_snippet = Lumix::TextSnippet.new(fname, t.fulltagged, t_begin, t_end)
            text_snippet = Lumix::TextSnippet.new(fname, t.text, s_begin, s_end)
            f << [text_snippet, tagged_snippet]
          end
        end
        # i is zero-based: i + 1 texts are finished, so the final report
        # reaches the total.
        prog.done = i + 1
        @progress[prog] if @progress
      end
    end

  end

  SearchStrategy = FastSearch
end
|