lumix 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +18 -0
- data/bin/lumix +4 -0
- data/lib/lumix/concordancer.rb +263 -0
- data/lib/lumix/filter.rb +60 -0
- data/lib/lumix/gui.rb +148 -0
- data/lib/lumix/main.rb +7 -0
- data/lib/lumix/result_view.rb +93 -0
- data/lib/lumix/schema/001_create_tables.rb +35 -0
- data/lib/lumix/schema/002_categories.rb +28 -0
- data/lib/lumix/textprocessing.rb +84 -0
- data/spec/filter_spec.rb +47 -0
- data/spec/text_snippet_spec.rb +52 -0
- metadata +108 -0
data/COPYING
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
Copyright (c) 2010 Michael Klaus
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to
|
5
|
+
deal in the Software without restriction, including without limitation the
|
6
|
+
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
7
|
+
sell copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
16
|
+
THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
17
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
18
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/bin/lumix
ADDED
@@ -0,0 +1,263 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
|
3
|
+
# TODO take care of 's problem
|
4
|
+
# TODO remove Word count line
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
require 'digest/md5'
|
8
|
+
require 'sequel'
|
9
|
+
require 'sequel/extensions/migration'
|
10
|
+
|
11
|
+
require 'lumix/textprocessing'
|
12
|
+
require 'lumix/filter'
|
13
|
+
|
14
|
+
DB_VERSION = 2
|
15
|
+
class String
  # Returns the hex-encoded MD5 digest of this string's current contents.
  #
  # The digest is recomputed on every call instead of being cached in an
  # instance variable: a cached value would go stale once the string is
  # mutated in place, and assigning an instance variable raises on frozen
  # strings.
  def digest
    Digest::MD5.hexdigest(self)
  end
end
|
23
|
+
|
24
|
+
# A hit inside a larger text: the half-open character range [begin, end) of
# +text+, labelled with +name+. Provides the hit itself plus a configurable
# number of context words on either side.
class TextSnippet
  attr_reader :name, :text, :begin, :end

  # name  - label for the snippet (callers in this file pass a file basename)
  # text  - the complete text the hit was found in
  # first - offset of the first character of the hit
  # last  - offset one past the last character of the hit
  def initialize(name, text, first, last)
    @name = name
    @text = text
    @begin = first
    @end = last
  end

  # The hit itself with every whitespace run collapsed to one space.
  def to_s
    cleanup(@text[@begin...@end])
  end

  # Up to +context+ whitespace-separated words preceding the hit, including
  # the partial word the hit starts in. Whitespace runs are collapsed.
  def left(context = 5)
    # to_s: slicing yields nil when the offsets lie outside the text
    before = @text[0...@begin].to_s
    found = before.match(/((\S+\s+){0,#{context}}\S*)\z/m)
    cleanup(found && found[1])
  end

  # Up to +context+ whitespace-separated words following the hit, including
  # the remainder of the word the hit ends in. Whitespace runs are collapsed.
  def right(context = 5)
    after = @text[@end..-1].to_s
    found = after.match(/\A(\S*(\s+\S+){0,#{context}})/m)
    cleanup(found && found[1])
  end

  # Collapses every whitespace run (spaces, tabs, newlines) into one space.
  # nil is treated as the empty string so out-of-range offsets do not raise.
  def cleanup(txt)
    txt.to_s.gsub(/\s+/, ' ')
  end
end
|
47
|
+
|
48
|
+
Progress = Struct.new(:task, :work, :data, :done)
|
49
|
+
|
50
|
+
class Concordancer
|
51
|
+
|
52
|
+
attr_reader :db, :tp
|
53
|
+
attr_accessor :progress_proc
|
54
|
+
|
55
|
+
def initialize(db_uri, options = {})
|
56
|
+
@progress_proc = options[:progress_proc]
|
57
|
+
@db = connect(db_uri) do |db|
|
58
|
+
db.tables.each{ |t| db.drop_table t } if options[:recreate]
|
59
|
+
end
|
60
|
+
@ids = db[:texts].map { |v| v[:id] }
|
61
|
+
@tp = TextProcessing.new
|
62
|
+
end
|
63
|
+
|
64
|
+
def fallback?
|
65
|
+
@fallback
|
66
|
+
end
|
67
|
+
|
68
|
+
def get_id(file)
|
69
|
+
text = File.read(file)
|
70
|
+
saved = db[:texts][:digest => text.digest]
|
71
|
+
saved ? saved[:id] : nil
|
72
|
+
end
|
73
|
+
|
74
|
+
# Imports the given files and/or directories, then links the stored texts.
#
# The arguments are expanded to a flat file list by the text processor
# (+tp.to_filelist+). Each file is imported through this object's own
# #read_file; the text processor has no such method, so it must not be
# called on +tp+. Progress is reported after every file.
#
# Returns whatever #link returns.
def read(*files)
  files = tp.to_filelist(files)
  prog = Progress.new(:read, files.size)
  puts "Reading #{files.size} files"
  files.each_with_index do |file, index|
    read_file(file)
    progress(prog, index + 1)
  end
  link
end
|
84
|
+
|
85
|
+
# Imports one raw text file into the :texts table unless a row with the same
# content digest is already stored.
#
# The tagged counterpart of the file is produced through the text processor
# (+tp+) when it does not exist on disk yet, reduced to "word|tag" pairs by
# #retag and stored next to the raw text.
#
# Returns nil when the text was already stored, otherwise the list of known
# text ids with the new id appended.
def read_file(file)
  # read the raw text
  text = File.read(file)
  digest = text.digest
  return if db[:texts][:digest => digest]

  # retrieve the tagged version; both helpers live on the text processor
  tagged_file = tp.create_tagged_filename(file)
  tp.process_file(file, tagged_file) unless File.exist?(tagged_file)

  tagged = retag(File.read(tagged_file))
  id = db[:texts].insert(:digest => digest, :text => text, :tagged => tagged, :filename => file, :tagged_filename => tagged_file)
  @ids << id
end
|
100
|
+
|
101
|
+
def all
|
102
|
+
db[:texts].select(:id).map{|v| v.values}
|
103
|
+
end
|
104
|
+
|
105
|
+
def link!(*ids)
|
106
|
+
link(*ids) do |ds|
|
107
|
+
ds.delete
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
def link(*ids)
|
112
|
+
ids = all if ids.empty?
|
113
|
+
ids.flatten!
|
114
|
+
prog = Progress.new(:link, ids.size)
|
115
|
+
progress(prog)
|
116
|
+
|
117
|
+
ids.each_with_index do |id, index|
|
118
|
+
ds = db[:assoc].filter(:text_id => id)
|
119
|
+
yield ds if block_given?
|
120
|
+
|
121
|
+
link_text(id) if ds.empty?
|
122
|
+
progress(prog, index + 1)
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
def find(filter)
|
127
|
+
texts = db[:texts]
|
128
|
+
prog = Progress.new(:search, texts.count, filter)
|
129
|
+
progress(prog)
|
130
|
+
|
131
|
+
re = Filter.to_re(filter)
|
132
|
+
|
133
|
+
index = 0
|
134
|
+
texts.inject(0) do |result, t|
|
135
|
+
t_id, text, tagged = t[:id], t[:text], t[:tagged]
|
136
|
+
|
137
|
+
# matches to ranges
|
138
|
+
results = []
|
139
|
+
tagged.scan(re) do |hit|
|
140
|
+
t_begin = $~.begin(0)
|
141
|
+
t_end = $~.end(0)
|
142
|
+
# TODO decouple database operations for performance
|
143
|
+
results << find_range(t_id, t_begin, t_end)
|
144
|
+
end
|
145
|
+
|
146
|
+
result += results.inject(0) do |result, f|
|
147
|
+
text_snippet = TextSnippet.new(File.basename(t[:filename]), text, f[:src_begin].to_i, f[:src_end].to_i)
|
148
|
+
tagged_snippet = TextSnippet.new(File.basename(t[:tagged_filename]), tagged, f[:tagged_begin].to_i, f[:tagged_end].to_i)
|
149
|
+
if block_given?
|
150
|
+
yield text_snippet, tagged_snippet
|
151
|
+
else
|
152
|
+
puts text_snippet
|
153
|
+
puts tagged_snippet
|
154
|
+
puts
|
155
|
+
end
|
156
|
+
result += 1
|
157
|
+
end
|
158
|
+
progress prog, (index += 1)
|
159
|
+
result
|
160
|
+
end
|
161
|
+
end
|
162
|
+
|
163
|
+
def find_range(t_id, t_begin, t_end)
|
164
|
+
ds = db[:assoc].filter(:text_id => t_id).filter{tagged_end >= t_begin}.filter{tagged_begin < t_end}
|
165
|
+
ds.select{[{min(:src_begin) => :src_begin},{ max(:src_end) => :src_end}, {min(:tagged_begin) => :tagged_begin}, {max(:tagged_end) => :tagged_end}]}.first
|
166
|
+
end
|
167
|
+
|
168
|
+
private
|
169
|
+
def progress(prog, done = 0, data = prog.data)
|
170
|
+
if progress_proc
|
171
|
+
prog.done = done
|
172
|
+
prog.data = data
|
173
|
+
progress_proc.call(prog)
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
def connect(db_uri)
|
178
|
+
db = Sequel.connect(db_uri)
|
179
|
+
begin
|
180
|
+
db.get(1)
|
181
|
+
@fallback = false
|
182
|
+
rescue Exception => e
|
183
|
+
puts 'Falling back to sqlite'
|
184
|
+
puts e
|
185
|
+
db = Sequel.connect('jdbc:sqlite://concordancer.db')
|
186
|
+
@fallback = true
|
187
|
+
end
|
188
|
+
yield db
|
189
|
+
migration_path = File.join(File.dirname(__FILE__), 'schema')
|
190
|
+
Sequel::Migrator.apply(db, migration_path, DB_VERSION)
|
191
|
+
return db
|
192
|
+
end
|
193
|
+
|
194
|
+
# Reduces tagger output to "word|tag" pairs.
#
# The input is split on single spaces and newlines. Each token is split on
# "|" and only its first and third fields are kept, emitted as "word|tag "
# (note the trailing space). An empty token, as produced by consecutive
# separators, becomes a newline.
#
# The pieces are collected and joined once instead of being appended with
# String#+, which copied the whole accumulated result for every token.
def retag(text)
  text.split(/[ \n]/).map do |token|
    word, _lemma, tag = token.split(/\|/)
    word ? "#{word}|#{tag} " : "\n"
  end.join
end
|
202
|
+
|
203
|
+
def link_text(id)
|
204
|
+
ds = db[:texts][:id => id]
|
205
|
+
text, tagged = ds[:text], ds[:tagged]
|
206
|
+
puts "Linking text #{ds[:filename]}"
|
207
|
+
|
208
|
+
re = /([^\s\|]+)\|(\S+)/m
|
209
|
+
src_last = 0
|
210
|
+
position = 0
|
211
|
+
assoc = []
|
212
|
+
tagged.scan(re) do |word, tag|
|
213
|
+
tagged_begin = $~.begin(0)
|
214
|
+
tagged_end = $~.end(0)
|
215
|
+
|
216
|
+
word_re = Regexp.new(Regexp.escape(word).gsub(/_/, '\s*'))
|
217
|
+
src_match = text[src_last..-1].match(word_re) # find the word
|
218
|
+
if src_match
|
219
|
+
src_begin = src_last + src_match.begin(0)
|
220
|
+
src_end = src_last + src_match.end(0)
|
221
|
+
|
222
|
+
src_last = src_end
|
223
|
+
assoc << {:text_id => id, :position => position, :src_begin => src_begin, :src_end => src_end, :tagged_begin => tagged_begin, :tagged_end => tagged_end}
|
224
|
+
else
|
225
|
+
STDERR.puts "Could not find match for '#{word}' in text #{ds[:filename]}"
|
226
|
+
end
|
227
|
+
position += 1
|
228
|
+
end
|
229
|
+
db[:assoc].multi_insert(assoc)
|
230
|
+
rescue => e
|
231
|
+
STDERR.puts e
|
232
|
+
STDERR.puts e.backtrace
|
233
|
+
raise e
|
234
|
+
end
|
235
|
+
|
236
|
+
end
|
237
|
+
|
238
|
+
if __FILE__ == $0
|
239
|
+
prog_proc = lambda do |prog|
|
240
|
+
puts "#{prog.task}#{prog.data ? "(#{prog.data})" : ""} #{prog.done}/#{prog.work}"
|
241
|
+
end
|
242
|
+
|
243
|
+
# uri = 'postgres://concordancer:concordancer@localhost:5433/concordancer'
|
244
|
+
uri = 'jdbc:postgresql://localhost:5433/concordancer?user=concordancer&password=concordancer'
|
245
|
+
c = Concordancer.new(uri, :progress_proc => prog_proc, :recreate => true)
|
246
|
+
#puts c.filter_to_re('"sunt" APN NPN')
|
247
|
+
c.read('raw')
|
248
|
+
#c.find(%q[("de")? (N*)+ "si" (N*){1,2} (AS*)?])
|
249
|
+
#c.link! #if RECREATE
|
250
|
+
#c.link c.all
|
251
|
+
#ds = db[:assoc].filter(:text_id => 1).order_by(:position).filter{tagged_end >= 150}.filter{tagged_begin < 330}
|
252
|
+
#puts ds.sql
|
253
|
+
#exit
|
254
|
+
|
255
|
+
t = Time.now
|
256
|
+
output = ""
|
257
|
+
results = c.find(%q[(*){0,3} N* N* (*){0,3}]) do |text, tagged|
|
258
|
+
output << "#{text}\n#{tagged}\n\n"
|
259
|
+
end
|
260
|
+
puts Time.now - t
|
261
|
+
puts "Results: #{ results }"
|
262
|
+
puts output
|
263
|
+
end
|
data/lib/lumix/filter.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
# Translates the search filter mini-language into a Regexp that is matched
# against text made of "word|TAG" tokens.
module Filter
  class << self

    # Rewriting steps, applied in this order by to_re.
    HANDLERS = %w[handle_wildcard handle_choice handle_literals
                  handle_dangling_tags handle_multiplicators assure_wordbounds].freeze

    # TODO refactor
    # Converts a filter expression into a Regexp by piping it through every
    # handler in HANDLERS. The intermediate expressions are only printed when
    # Ruby runs in debug mode ($DEBUG), so ordinary searches no longer write
    # a trace to stdout.
    def to_re(filter)
      re = HANDLERS.inject(filter) do |expr, handler|
        if $DEBUG
          puts expr
          puts "#{handler} -->"
        end
        send handler, expr
      end
      puts re if $DEBUG
      Regexp.new(re)
    end

    # character wildcard replacement
    # Every "*" that does not directly follow ")" becomes a lazy "[^\b]*?".
    # NOTE(review): inside a character class "\b" is a backspace, so this
    # matches any character, whitespace included - confirm that is intended.
    def handle_wildcard(re)
      re.gsub(/([^\)])\*/, '\1[^\b]*?')
    end

    # Takes (!A B C) and transforms it into a repeated group of negative
    # lookaheads, one per listed alternative, each followed by "\S".
    def handle_choice(re)
      re.gsub(/\(\!([^\)]+)\)/) do
        c = $1.split.map{ |t| '(?!' + t + '\b)' }.join
        '(?:' + c + '\S)*'
      end
    end

    # transforms literals delimited by ""
    # "some words"|TAG becomes some_words\|TAG; without an explicit tag a
    # lazy "\S+?" stands in for it.
    def handle_literals(re)
      re.gsub(/\"([^\"]*)\"(?:\|(\S+?))?/) do
        str = $1
        tag = $2 || '\S+?'
        str.gsub(/ /, '_') + '\|' + tag
      end
    end

    # add wildcard word match on tag-only search criteria
    # Space-separated parts that carry no escaped pipe yet get a word pattern
    # in front; the parts are then joined with "\s+".
    def handle_dangling_tags(re)
      re.split(/ /).map do |s|
        if s['\|']
          s
        else
          s.gsub(/(\(?)(\S+)/, '\1[^\s\|]+\|\2')
        end
      end.join('\s+')
    end

    # Handles the + * ? and {} qualifiers
    # A parenthesised group followed by a qualifier becomes a non-capturing
    # group whose content is wrapped in word boundaries.
    def handle_multiplicators(re)
      re.gsub(/\(([^\)]+)(\)((\{[^\}]+\})|\*|\+|\?)\s?)/, '(?:\b\1\b\2')
    end

    # Anchors the whole expression on word boundaries.
    def assure_wordbounds(re)
      '\b' + re + '\b'
    end

  end
end
|
data/lib/lumix/gui.rb
ADDED
@@ -0,0 +1,148 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
require 'lumix/concordancer'
|
3
|
+
require 'sweet'
|
4
|
+
require 'lumix/result_view'
|
5
|
+
#Sweet.set_debug
|
6
|
+
|
7
|
+
|
8
|
+
Texts = {:search => "Searching...", :read => "Importing files", :link => "Linking texts"}
|
9
|
+
Indicator = %w'} ) ] | [ ( {'
|
10
|
+
|
11
|
+
CONF = 'config.yaml'
|
12
|
+
ConfigStruct = Struct.new(:database_uri)
|
13
|
+
CConfig = YAML.load_file(CONF) rescue ConfigStruct.new('jdbc:postgresql://localhost:5432/concordancer?user=concordancer&password=concordancer')
|
14
|
+
def save!
|
15
|
+
File.open(CONF, 'w') do |f|
|
16
|
+
f.write(CConfig.to_yaml)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
Sweet.app :title => 'Ruby Concordancer', :width => 800, :height => 700, :layout => :grid.conf(:numColumns => 3) do
|
21
|
+
def conc
|
22
|
+
@conc ||= Concordancer.new(CConfig.database_uri, :progress_proc => @progress_proc)#, :recreate => true)
|
23
|
+
end
|
24
|
+
|
25
|
+
@progress_proc = proc do |p|
|
26
|
+
task = Texts[p.task] || p.task
|
27
|
+
perform do
|
28
|
+
if p.done == p.work
|
29
|
+
@p_status.text = 'Done!'
|
30
|
+
@p_indicator.text = ''
|
31
|
+
@p_bar.fraction = 0
|
32
|
+
else
|
33
|
+
@p_status.text = task
|
34
|
+
@p_indicator.text = Indicator[p.done % Indicator.size]
|
35
|
+
@p_bar.fraction = p.done.to_f / p.work
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
# Write the configuration file on first start.
# File.exist? replaces File.exists?, which was removed in Ruby 3.2.
save! unless File.exist?(CONF)
|
41
|
+
|
42
|
+
menubar do
|
43
|
+
submenu '&File' do
|
44
|
+
submenu '&Import...' do
|
45
|
+
item('E&nglish texts') { import_chooser('en') }
|
46
|
+
item('&Romanian texts') { import_chooser('ro') }
|
47
|
+
end
|
48
|
+
item('&Export findings...') { export_findings }
|
49
|
+
separator
|
50
|
+
item('&Relink texts') { relink }
|
51
|
+
item('&Clear the database') { reconnect :recreate => true }
|
52
|
+
separator
|
53
|
+
item('E&xit') { exit }
|
54
|
+
end
|
55
|
+
# submenu 'C&orpora' do
|
56
|
+
# @m_cat = submenu '&Category' do
|
57
|
+
# item('Cre&ate...') { create_category }
|
58
|
+
# item('&Import...') { import_chooser }
|
59
|
+
# separator
|
60
|
+
# item('&Edit...') { edit_category }
|
61
|
+
# item('&Delete') { delete_category }
|
62
|
+
# end
|
63
|
+
# @m_text = submenu '&Text' do
|
64
|
+
# item('&Reimport...') { reimport_chooser }
|
65
|
+
# item('&Delete') { delete_text }
|
66
|
+
# end
|
67
|
+
# end
|
68
|
+
# @m_stats = submenu '&Statistics' do
|
69
|
+
# item('&Editor') { script_editor }
|
70
|
+
# separator
|
71
|
+
# item('&Load Script...') { load_script }
|
72
|
+
# end
|
73
|
+
# submenu "&Help" do
|
74
|
+
# separator
|
75
|
+
# item('&About') { about }
|
76
|
+
# end
|
77
|
+
end
|
78
|
+
|
79
|
+
tree :grid_data => {:align => [:fill, :fill], :span => [1, 2], :grab => [true, true]}
|
80
|
+
|
81
|
+
@filter = edit_line 'NSN NSN', :grid_data => {:align => [:fill, :center], :grab => true}, :max_size => 40 do
|
82
|
+
perform_search
|
83
|
+
end
|
84
|
+
button 'Search' do
|
85
|
+
perform_search
|
86
|
+
end
|
87
|
+
|
88
|
+
@results = table :columns => %w[Text Left Hit Right], :sort => true, :grid_data => {:align => [:fill, :fill], :span => 2, :grab => [true, true]}, :scroll => true
|
89
|
+
|
90
|
+
@counter = label :grid_data => {:span => 2, :align => :fill}
|
91
|
+
|
92
|
+
@p_status = label(:grid_data => {:align => [:fill, :bottom], :grab => true})
|
93
|
+
@p_bar = progress(:width => 50, :grid_data => {:align => [:right, :bottom]})
|
94
|
+
@p_indicator = label(' ', :grid_data => {:align => [:right, :bottom]})
|
95
|
+
|
96
|
+
|
97
|
+
def perform_search
|
98
|
+
filter = @filter.text
|
99
|
+
@results.data.clear
|
100
|
+
Thread.new do
|
101
|
+
unless filter.empty?
|
102
|
+
puts "finding #{filter}"
|
103
|
+
found = conc.find(filter) do |text, tagged|
|
104
|
+
@results.add_hit(text.name, text.left, text.to_s, text.right)
|
105
|
+
end
|
106
|
+
end
|
107
|
+
perform do
|
108
|
+
@counter.text = "#{found} matches"
|
109
|
+
@p_status.text = "Found #{found || 'no'} matches for #{filter}"
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
def import_chooser(lang)
|
115
|
+
conc.tp.lang = lang
|
116
|
+
Thread.new(conc) do |conc|
|
117
|
+
conc.read('raw')
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
def export_findings
|
122
|
+
filename = to_filename(@filter.text) + '.findings'
|
123
|
+
@p_status.text = "Exporting to #{filename}"
|
124
|
+
File.open(filename, 'w') do |f|
|
125
|
+
@results.items.each do |item|
|
126
|
+
unless item.getChecked
|
127
|
+
left, hit, right = (0..2).map{ |i| item.text(i) }
|
128
|
+
f.puts "#{left}\t#{hit}\t#{right}"
|
129
|
+
end
|
130
|
+
end
|
131
|
+
end
|
132
|
+
@p_status.text = "Done! Exported to file #{filename}"
|
133
|
+
end
|
134
|
+
|
135
|
+
def relink
|
136
|
+
Thread.new(conc) do |conc|
|
137
|
+
conc.link!
|
138
|
+
end
|
139
|
+
end
|
140
|
+
|
141
|
+
# Derives a file name stem from a filter expression: whitespace runs become
# "_", and characters that are wildcards or are rejected in file names on
# common file systems (* . ? " | / \ : < >) are dropped. The filter syntax
# allows "word"|TAG, so the pipe in particular shows up in real filters.
def to_filename(filter)
  filter.gsub(/\s+/, "_").gsub(/[\*\.\?\"\|\/\\:<>]/, '')
end
|
144
|
+
|
145
|
+
# Replaces the current concordancer with a fresh one for the configured
# database URI. +opts+ is handed to Concordancer.new together with the
# progress callback (the "Clear the database" menu entry passes
# :recreate => true).
def reconnect(opts = {})
  # Hash#merge - the original "mergs" was a typo that raised NoMethodError
  @conc = Concordancer.new(CConfig.database_uri, opts.merge(:progress_proc => @progress_proc))
end
|
148
|
+
end
|
data/lib/lumix/main.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
class Java::OrgEclipseSwtWidgets::Table
|
2
|
+
|
3
|
+
attr_accessor :data, :tooltips
|
4
|
+
|
5
|
+
def sweeten(app, opts={}, &block)
|
6
|
+
@data = []
|
7
|
+
@tooltips = []
|
8
|
+
super
|
9
|
+
@redraw_thread = Thread.new do
|
10
|
+
while !isDisposed
|
11
|
+
if @dirty
|
12
|
+
@dirty = false
|
13
|
+
perform do
|
14
|
+
setItemCount data.size
|
15
|
+
clearAll if clear_all
|
16
|
+
end
|
17
|
+
end
|
18
|
+
sleep 1 # TODO find a better alternative
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
# TODO implement tooltips
|
23
|
+
|
24
|
+
addListener swt::SetData do |e|
|
25
|
+
item = e.item
|
26
|
+
index = indexOf(item)
|
27
|
+
item.setText(Array(data[index]).to_java(:string))
|
28
|
+
end
|
29
|
+
|
30
|
+
addListener swt::Resize do |e|
|
31
|
+
default_weight = 1.0 / columns.size
|
32
|
+
current_width = @old_width
|
33
|
+
w = width
|
34
|
+
columns[0..-2].each do |c|
|
35
|
+
weight = c.width == 0 ? default_weight : c.width.to_f / current_width
|
36
|
+
c.width = w * weight
|
37
|
+
end
|
38
|
+
columns[columns.size - 1].pack
|
39
|
+
@old_width = w
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
def columns=(*titles)
|
44
|
+
if titles
|
45
|
+
titles.each do |title|
|
46
|
+
col = widgets::TableColumn.new(self, swt::CENTER)
|
47
|
+
col.setText title
|
48
|
+
end
|
49
|
+
|
50
|
+
setHeaderVisible true
|
51
|
+
setLinesVisible true
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
def sort=(sort)
|
56
|
+
sort = Hash.new(true) if [true, :all].member?(sort)
|
57
|
+
if sort
|
58
|
+
columns.each_with_index do |col, index|
|
59
|
+
if sort[col.text]
|
60
|
+
col.addListener swt::Selection do
|
61
|
+
if data
|
62
|
+
@data = data.sort_by {|e| e[index] }
|
63
|
+
update :clear
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
::Sweet::WIDGET_DEFAULTS[:table] = {
|
73
|
+
:style => [:border, :virtual, :check]
|
74
|
+
}
|
75
|
+
::Sweet::WIDGET_HACKS[Java::OrgEclipseSwtWidgets::Table] = {
|
76
|
+
:block_handler => :set_data,
|
77
|
+
:custom_code => proc {
|
78
|
+
def update(clear_all = false)
|
79
|
+
return if isDisposed
|
80
|
+
setItemCount data.size
|
81
|
+
clearAll if clear_all
|
82
|
+
end
|
83
|
+
|
84
|
+
# Appends one row to the table data and marks the table dirty.
#
# The positional arguments form the row. A trailing Hash is treated as
# options: :data overrides the row values and :tooltips the tooltip values
# (tooltips default to the row itself).
def add_hit(*args)
  # The original test "args.last === Hash" compared the last argument with
  # the Hash class itself and was never true, so options were stored as a cell.
  opts = args.last.is_a?(Hash) ? args.pop : {}
  d = opts[:data] || args
  t = opts[:tooltips] || d
  data << d
  tooltips << t
  @dirty = true
end
|
92
|
+
}
|
93
|
+
}
|
@@ -0,0 +1,35 @@
|
|
1
|
+
class CreateTables < Sequel::Migration
|
2
|
+
|
3
|
+
def up
|
4
|
+
create_table :texts do
|
5
|
+
primary_key :id
|
6
|
+
String :digest
|
7
|
+
String :text
|
8
|
+
String :tagged
|
9
|
+
String :filename
|
10
|
+
String :tagged_filename
|
11
|
+
|
12
|
+
index :digest
|
13
|
+
end
|
14
|
+
|
15
|
+
create_table :assoc do
|
16
|
+
primary_key :id
|
17
|
+
Integer :text_id, :references => :texts
|
18
|
+
Integer :position
|
19
|
+
Integer :src_begin
|
20
|
+
Integer :src_end
|
21
|
+
Integer :tagged_begin
|
22
|
+
Integer :tagged_end
|
23
|
+
|
24
|
+
index [:text_id, :tagged_end]
|
25
|
+
index [:text_id, :tagged_begin]
|
26
|
+
index [:text_id, :position]
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def down
|
31
|
+
drop_table :assoc
|
32
|
+
drop_table :texts
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
class Categories < Sequel::Migration
|
2
|
+
|
3
|
+
def up
|
4
|
+
create_table :categories do
|
5
|
+
primary_key :id
|
6
|
+
Integer :parent_id, :references => :categories
|
7
|
+
String :name
|
8
|
+
String :key
|
9
|
+
|
10
|
+
index [:parent_id, :id]
|
11
|
+
end
|
12
|
+
|
13
|
+
alter_table :texts do
|
14
|
+
add_column :category_id, Integer, :references => :categories
|
15
|
+
|
16
|
+
add_index [:category_id, :id]
|
17
|
+
end
|
18
|
+
|
19
|
+
end
|
20
|
+
|
21
|
+
def down
|
22
|
+
alter_table :texts do
|
23
|
+
drop_column :category_id
|
24
|
+
end
|
25
|
+
drop_table :categories
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
$KCODE='UTF8'
|
2
|
+
|
3
|
+
require 'soap/wsdlDriver'
|
4
|
+
|
5
|
+
# Client for the text processing web service, plus helpers for naming and
# collecting the files that are sent to it.
class TextProcessing

  # Language code sent along with every request (defaults to 'ro').
  attr_accessor :lang

  def initialize(lang = 'ro')
    @lang = lang
  end

  # The SOAP RPC driver, built from the service WSDL on first use and then
  # reused. (The original evaluated "@rpc if @rpc" without returning, so a
  # new driver was created on every call.)
  def rpc
    @rpc ||= SOAP::WSDLDriverFactory.new('http://www.racai.ro/webservices/TextProcessing.asmx?WSDL').create_rpc_driver
  end

  # inserts "tagged" as the second to last part in the filename
  # e.g.
  #   test.txt -> test.tagged.txt
  # special case when no extension is present:
  #   README -> README.tagged
  #
  # Only the file name is examined; dots in directory components
  # ("./README", "raw.v1/README") are left alone.
  def create_tagged_filename(infile)
    dir, name = File.split(infile)
    ext = File.extname(name)
    tagged = "#{File.basename(name, ext)}.tagged#{ext}"
    # File.split reports '.' for a bare file name; only re-attach a directory
    # the caller actually wrote.
    name == infile ? tagged : File.join(dir, tagged)
  end

  # Expands the given names (nested arrays allowed) into a flat, duplicate
  # free list of files: directories are replaced by everything below them,
  # and directories as well as names containing ".tagged" are dropped.
  def to_filelist(*files)
    expanded = files.flatten.map do |filename|
      if File.directory? filename
        Dir.glob File.join(filename, '**/*') # add all files from that directory
      else
        filename
      end
    end.flatten.compact.uniq # make sure every file is only processed once
    expanded.reject { |filename| File.directory?(filename) || filename['.tagged'] } # remove remaining folders
  end

  # the core processing routine using the webservice
  # Sends +text+ with the configured language and returns the
  # processResult of the response.
  def process(text)
    response = rpc.Process(:input => text, :lang => lang)
    response.processResult
  end

  # Processes everything read from standard input and prints the result.
  def process_stdin
    puts process($stdin.read)
  end

  # takes the text from infile and outputs the result into the outfile
  def process_file(infile, outfile)
    File.open(outfile, 'w') do |out|
      out.write process(File.read(infile))
    end
  end

end
|
60
|
+
|
61
|
+
|
62
|
+
# process the args if called as main script
|
63
|
+
if __FILE__ == $0
|
64
|
+
args = ARGV
|
65
|
+
tp = if args.first == '-lang'
|
66
|
+
args.shift
|
67
|
+
TextProcessing.new(args.shift)
|
68
|
+
else
|
69
|
+
TextProcessing.new
|
70
|
+
end
|
71
|
+
|
72
|
+
if args.empty?
|
73
|
+
tp.process_stdin
|
74
|
+
else
|
75
|
+
files = tp.to_filelist(args)
|
76
|
+
|
77
|
+
puts "Processing files:"
|
78
|
+
for infile in files
|
79
|
+
outfile = tp.create_tagged_filename(infile)
|
80
|
+
puts "#{infile} -> #{outfile}"
|
81
|
+
tp.process_file(infile, outfile) unless File.exist?(outfile)
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
data/spec/filter_spec.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
# To change this template, choose Tools | Templates
|
2
|
+
# and open the template in the editor.
|
3
|
+
|
4
|
+
require 'filter'
|
5
|
+
puts RUBY_PLATFORM
|
6
|
+
TXT = "They|PPER3 have|AUXP business|NN uses|VERB3 derp|ADNE too|ADVE " +
|
7
|
+
"Apr|NN 4th|CD 2007|M have|DMKD .|PERIOD"
|
8
|
+
|
9
|
+
def search(filter)
|
10
|
+
TXT.scan(Filter.to_re(filter))
|
11
|
+
end
|
12
|
+
|
13
|
+
describe Filter do
|
14
|
+
|
15
|
+
it "should find tags" do
|
16
|
+
search('NN').should == %w[business|NN Apr|NN]
|
17
|
+
end
|
18
|
+
|
19
|
+
it "should find words" do
|
20
|
+
search('"have"').should == %w[have|AUXP have|DMKD]
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should find word and tag combinations" do
|
24
|
+
search('"have" NN "uses"').should == ['have|AUXP business|NN uses|VERB3']
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should find wildcard tags" do
|
28
|
+
search('AU*').should == %w[have|AUXP]
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should find exclusions" do
|
32
|
+
search('A(!UXP DNE)').should == %w[too|ADVE]
|
33
|
+
end
|
34
|
+
|
35
|
+
it "should find word|tag pairs" do
|
36
|
+
search('"have"|D*').should == %w[have|DMKD]
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should find unlimited repetitions" do
|
40
|
+
search('(AD*)+').should == ['derp|ADNE too|ADVE']
|
41
|
+
end
|
42
|
+
|
43
|
+
it "should find limited repetitions" do
|
44
|
+
search('(AD*){3}').should == []
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'concordancer'
|
2
|
+
|
3
|
+
describe TextSnippet do
|
4
|
+
before(:each) do
|
5
|
+
end
|
6
|
+
|
7
|
+
it "should handle umlauts properly" do
|
8
|
+
ts = create_ts('eins zwei drei vierß öfünfä ßechs sieben acht neun zehn', /öfünfä/)
|
9
|
+
ts.left(3).should == 'zwei drei vierß '
|
10
|
+
ts.to_s.should == 'öfünfä'
|
11
|
+
ts.right(3).should == ' ßechs sieben acht'
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should handle partial words and umlauts properly" do
|
15
|
+
ts = create_ts('eins zwei drei vierß öfünfä ßechs sieben acht neun zehn', /fünf/)
|
16
|
+
ts.left(3).should == 'zwei drei vierß ö'
|
17
|
+
ts.to_s.should == 'fünf'
|
18
|
+
ts.right(3).should == 'ä ßechs sieben acht'
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should have dynamic left context" do
|
22
|
+
ts = create_ts('one two three four five six seven eight nine ten', /five/)
|
23
|
+
ts.left(1).should == 'four '
|
24
|
+
ts.left(2).should == 'three four '
|
25
|
+
ts.left(10).should == 'one two three four '
|
26
|
+
end
|
27
|
+
|
28
|
+
it "should have dynamic right context" do
|
29
|
+
ts = create_ts('one two three four five six seven eight nine ten', /five/)
|
30
|
+
ts.right(1).should == ' six'
|
31
|
+
ts.right(2).should == ' six seven'
|
32
|
+
ts.right(10).should == ' six seven eight nine ten'
|
33
|
+
end
|
34
|
+
|
35
|
+
it "should work correctly with newlines" do
|
36
|
+
ts = create_ts("one two\n three four five six seven eight\n nine ten", /five/)
|
37
|
+
ts.left(1).should == 'four '
|
38
|
+
ts.right(1).should == ' six'
|
39
|
+
end
|
40
|
+
|
41
|
+
it "should replace newlines and tabs with spaces" do
|
42
|
+
ts = create_ts("one two three\n four five six\n seven eight nine ten", /five/)
|
43
|
+
ts.left(2).should == 'three four '
|
44
|
+
ts.right(2).should == ' six seven'
|
45
|
+
end
|
46
|
+
|
47
|
+
end
|
48
|
+
|
49
|
+
# Builds a TextSnippet spanning the first match of +re+ in +text+.
# TextSnippet.new takes (name, text, first, last); the snippet name is not
# what these examples verify, so it defaults to a placeholder.
def create_ts(text, re, name = 'spec')
  m = text.match(re)
  TextSnippet.new name, text, m.begin(0), m.end(0)
end
|
metadata
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: lumix
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: 0.0.1
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Michael Klaus
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-07-27 00:00:00 +02:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: sweet
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 0
|
29
|
+
version: "0"
|
30
|
+
type: :runtime
|
31
|
+
version_requirements: *id001
|
32
|
+
- !ruby/object:Gem::Dependency
|
33
|
+
name: sequel
|
34
|
+
prerelease: false
|
35
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
segments:
|
40
|
+
- 0
|
41
|
+
version: "0"
|
42
|
+
type: :runtime
|
43
|
+
version_requirements: *id002
|
44
|
+
- !ruby/object:Gem::Dependency
|
45
|
+
name: jdbc-postgres
|
46
|
+
prerelease: false
|
47
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
48
|
+
requirements:
|
49
|
+
- - ">="
|
50
|
+
- !ruby/object:Gem::Version
|
51
|
+
segments:
|
52
|
+
- 0
|
53
|
+
version: "0"
|
54
|
+
type: :runtime
|
55
|
+
version_requirements: *id003
|
56
|
+
description: A concordancer for corpus-based linguistic research.
|
57
|
+
email: Michael.Klaus@gmx.net
|
58
|
+
executables:
|
59
|
+
- lumix
|
60
|
+
extensions: []
|
61
|
+
|
62
|
+
extra_rdoc_files: []
|
63
|
+
|
64
|
+
files:
|
65
|
+
- COPYING
|
66
|
+
- bin/lumix
|
67
|
+
- spec/text_snippet_spec.rb
|
68
|
+
- spec/filter_spec.rb
|
69
|
+
- lib/lumix/filter.rb
|
70
|
+
- lib/lumix/result_view.rb
|
71
|
+
- lib/lumix/gui.rb
|
72
|
+
- lib/lumix/textprocessing.rb
|
73
|
+
- lib/lumix/main.rb
|
74
|
+
- lib/lumix/concordancer.rb
|
75
|
+
- lib/lumix/schema/001_create_tables.rb
|
76
|
+
- lib/lumix/schema/002_categories.rb
|
77
|
+
has_rdoc: true
|
78
|
+
homepage: http://github.org/QaDeS/lumix
|
79
|
+
licenses: []
|
80
|
+
|
81
|
+
post_install_message:
|
82
|
+
rdoc_options: []
|
83
|
+
|
84
|
+
require_paths:
|
85
|
+
- lib
|
86
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
87
|
+
requirements:
|
88
|
+
- - ">="
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
segments:
|
91
|
+
- 0
|
92
|
+
version: "0"
|
93
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
94
|
+
requirements:
|
95
|
+
- - ">="
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
segments:
|
98
|
+
- 0
|
99
|
+
version: "0"
|
100
|
+
requirements: []
|
101
|
+
|
102
|
+
rubyforge_project:
|
103
|
+
rubygems_version: 1.3.6
|
104
|
+
signing_key:
|
105
|
+
specification_version: 3
|
106
|
+
summary: A concordancer for corpus-based linguistic research.
|
107
|
+
test_files: []
|
108
|
+
|