lumix 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +18 -0
- data/bin/lumix +4 -0
- data/lib/lumix/concordancer.rb +263 -0
- data/lib/lumix/filter.rb +60 -0
- data/lib/lumix/gui.rb +148 -0
- data/lib/lumix/main.rb +7 -0
- data/lib/lumix/result_view.rb +93 -0
- data/lib/lumix/schema/001_create_tables.rb +35 -0
- data/lib/lumix/schema/002_categories.rb +28 -0
- data/lib/lumix/textprocessing.rb +84 -0
- data/spec/filter_spec.rb +47 -0
- data/spec/text_snippet_spec.rb +52 -0
- metadata +108 -0
data/COPYING
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
Copyright (c) 2010 Michael Klaus
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to
|
5
|
+
deal in the Software without restriction, including without limitation the
|
6
|
+
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
7
|
+
sell copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
16
|
+
THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
17
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
18
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/bin/lumix
ADDED
@@ -0,0 +1,263 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
|
3
|
+
# TODO take care of 's problem
|
4
|
+
# TODO remove Word count line
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
require 'digest/md5'
|
8
|
+
require 'sequel'
|
9
|
+
require 'sequel/extensions/migration'
|
10
|
+
|
11
|
+
require 'lumix/textprocessing'
|
12
|
+
require 'lumix/filter'
|
13
|
+
|
14
|
+
DB_VERSION = 2
|
15
|
+
class String
  # Returns the hexadecimal MD5 digest of the string's current contents.
  #
  # The digest is recomputed on every call rather than cached in an instance
  # variable: a cached value would go stale as soon as the string is mutated,
  # and setting an instance variable raises on frozen strings.
  #
  # @return [String] 32-character lowercase hex digest
  def digest
    Digest::MD5.hexdigest(self)
  end
end
|
23
|
+
|
24
|
+
# A hit inside a text, described by character offsets into that text,
# together with helpers to extract the surrounding word context.
class TextSnippet
  attr_reader :name, :text, :begin, :end

  # @param name  label for the text the snippet comes from
  # @param text  [String] the complete text
  # @param first [Integer] offset of the first character of the hit
  # @param last  [Integer] offset just past the last character of the hit
  def initialize(name, text, first, last)
    @name = name
    @text = text
    @begin = first
    @end = last
  end

  # The hit itself, with whitespace runs collapsed to single spaces.
  # Returns '' when the offsets lie outside the text.
  def to_s
    cleanup(@text[@begin...@end])
  end

  # Up to +context+ whitespace-separated words immediately before the hit
  # (plus any partial word directly attached to it).
  def left(context = 5)
    before = @text[0...@begin]
    # String#[] with a capture index avoids depending on the global $1,
    # which keeps a stale value when no match is attempted.
    cleanup(before && before[/((\S+\s+){0,#{context}}\S*)\z/m, 1])
  end

  # Up to +context+ whitespace-separated words immediately after the hit
  # (plus any partial word directly attached to it).
  def right(context = 5)
    after = @text[@end..-1] # nil when @end is past the end of the text
    cleanup(after && after[/\A(\S*(\s+\S+){0,#{context}})/m, 1])
  end

  # Collapses every whitespace run (newlines, tabs, ...) into one space.
  # nil is treated as an empty string.
  def cleanup(txt)
    txt.to_s.gsub(/\s+/, ' ')
  end
end
|
47
|
+
|
48
|
+
Progress = Struct.new(:task, :work, :data, :done)
|
49
|
+
|
50
|
+
class Concordancer
|
51
|
+
|
52
|
+
attr_reader :db, :tp
|
53
|
+
attr_accessor :progress_proc
|
54
|
+
|
55
|
+
def initialize(db_uri, options = {})
|
56
|
+
@progress_proc = options[:progress_proc]
|
57
|
+
@db = connect(db_uri) do |db|
|
58
|
+
db.tables.each{ |t| db.drop_table t } if options[:recreate]
|
59
|
+
end
|
60
|
+
@ids = db[:texts].map { |v| v[:id] }
|
61
|
+
@tp = TextProcessing.new
|
62
|
+
end
|
63
|
+
|
64
|
+
def fallback?
|
65
|
+
@fallback
|
66
|
+
end
|
67
|
+
|
68
|
+
def get_id(file)
|
69
|
+
text = File.read(file)
|
70
|
+
saved = db[:texts][:digest => text.digest]
|
71
|
+
saved ? saved[:id] : nil
|
72
|
+
end
|
73
|
+
|
74
|
+
# Imports the given files (or everything tp.to_filelist expands them to)
# into the database and then links the imported texts.
#
# Progress is reported through #progress after every file.
#
# @param files file names, directories, or arrays of them
# @return whatever #link returns
def read(*files)
  files = tp.to_filelist(files)
  prog = Progress.new(:read, files.size)
  puts "Reading #{files.size} files"
  files.each_with_index do |file, index|
    # read_file is defined on this object; the text processor (tp) has no
    # such method, so calling it there raised NoMethodError.
    read_file(file)
    progress(prog, index + 1)
  end
  link
end
|
84
|
+
|
85
|
+
# Stores a single file in the :texts table unless a text with the same
# digest is already present.
#
# The tagged counterpart of the file is produced through the text
# processor (tp) when it does not exist on disk yet, then re-tagged via
# #retag before being stored next to the raw text.
#
# @param file [String] path of the raw text file
# @return nil when the text is already stored, otherwise the list of ids
def read_file(file)
  # read the raw text
  text = File.read(file)
  digest = text.digest # compute once; used for lookup and insert
  return if db[:texts][:digest => digest]

  # retrieve the tagged version; both helpers live on the text processor
  tagged_file = tp.create_tagged_filename(file)
  # File.exist? instead of File.exists?, which was removed in Ruby 3.2
  tp.process_file(file, tagged_file) unless File.exist?(tagged_file)

  tagged = retag(File.read(tagged_file))
  id = db[:texts].insert(:digest => digest, :text => text, :tagged => tagged, :filename => file, :tagged_filename => tagged_file)
  @ids << id
end
|
100
|
+
|
101
|
+
def all
|
102
|
+
db[:texts].select(:id).map{|v| v.values}
|
103
|
+
end
|
104
|
+
|
105
|
+
def link!(*ids)
|
106
|
+
link(*ids) do |ds|
|
107
|
+
ds.delete
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
def link(*ids)
|
112
|
+
ids = all if ids.empty?
|
113
|
+
ids.flatten!
|
114
|
+
prog = Progress.new(:link, ids.size)
|
115
|
+
progress(prog)
|
116
|
+
|
117
|
+
ids.each_with_index do |id, index|
|
118
|
+
ds = db[:assoc].filter(:text_id => id)
|
119
|
+
yield ds if block_given?
|
120
|
+
|
121
|
+
link_text(id) if ds.empty?
|
122
|
+
progress(prog, index + 1)
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
def find(filter)
|
127
|
+
texts = db[:texts]
|
128
|
+
prog = Progress.new(:search, texts.count, filter)
|
129
|
+
progress(prog)
|
130
|
+
|
131
|
+
re = Filter.to_re(filter)
|
132
|
+
|
133
|
+
index = 0
|
134
|
+
texts.inject(0) do |result, t|
|
135
|
+
t_id, text, tagged = t[:id], t[:text], t[:tagged]
|
136
|
+
|
137
|
+
# matches to ranges
|
138
|
+
results = []
|
139
|
+
tagged.scan(re) do |hit|
|
140
|
+
t_begin = $~.begin(0)
|
141
|
+
t_end = $~.end(0)
|
142
|
+
# TODO decouple database operations for performance
|
143
|
+
results << find_range(t_id, t_begin, t_end)
|
144
|
+
end
|
145
|
+
|
146
|
+
result += results.inject(0) do |result, f|
|
147
|
+
text_snippet = TextSnippet.new(File.basename(t[:filename]), text, f[:src_begin].to_i, f[:src_end].to_i)
|
148
|
+
tagged_snippet = TextSnippet.new(File.basename(t[:tagged_filename]), tagged, f[:tagged_begin].to_i, f[:tagged_end].to_i)
|
149
|
+
if block_given?
|
150
|
+
yield text_snippet, tagged_snippet
|
151
|
+
else
|
152
|
+
puts text_snippet
|
153
|
+
puts tagged_snippet
|
154
|
+
puts
|
155
|
+
end
|
156
|
+
result += 1
|
157
|
+
end
|
158
|
+
progress prog, (index += 1)
|
159
|
+
result
|
160
|
+
end
|
161
|
+
end
|
162
|
+
|
163
|
+
def find_range(t_id, t_begin, t_end)
|
164
|
+
ds = db[:assoc].filter(:text_id => t_id).filter{tagged_end >= t_begin}.filter{tagged_begin < t_end}
|
165
|
+
ds.select{[{min(:src_begin) => :src_begin},{ max(:src_end) => :src_end}, {min(:tagged_begin) => :tagged_begin}, {max(:tagged_end) => :tagged_end}]}.first
|
166
|
+
end
|
167
|
+
|
168
|
+
private
|
169
|
+
def progress(prog, done = 0, data = prog.data)
|
170
|
+
if progress_proc
|
171
|
+
prog.done = done
|
172
|
+
prog.data = data
|
173
|
+
progress_proc.call(prog)
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
def connect(db_uri)
|
178
|
+
db = Sequel.connect(db_uri)
|
179
|
+
begin
|
180
|
+
db.get(1)
|
181
|
+
@fallback = false
|
182
|
+
rescue Exception => e
|
183
|
+
puts 'Falling back to sqlite'
|
184
|
+
puts e
|
185
|
+
db = Sequel.connect('jdbc:sqlite://concordancer.db')
|
186
|
+
@fallback = true
|
187
|
+
end
|
188
|
+
yield db
|
189
|
+
migration_path = File.join(File.dirname(__FILE__), 'schema')
|
190
|
+
Sequel::Migrator.apply(db, migration_path, DB_VERSION)
|
191
|
+
return db
|
192
|
+
end
|
193
|
+
|
194
|
+
def retag(text)
|
195
|
+
words = text.split(/[ \n]/).map do |word|
|
196
|
+
word.split(/\|/)
|
197
|
+
end
|
198
|
+
words.inject('') do |result, (word, lemma, tag, tag2)|
|
199
|
+
result + (word ? "#{word}|#{tag} " : "\n")
|
200
|
+
end
|
201
|
+
end
|
202
|
+
|
203
|
+
def link_text(id)
|
204
|
+
ds = db[:texts][:id => id]
|
205
|
+
text, tagged = ds[:text], ds[:tagged]
|
206
|
+
puts "Linking text #{ds[:filename]}"
|
207
|
+
|
208
|
+
re = /([^\s\|]+)\|(\S+)/m
|
209
|
+
src_last = 0
|
210
|
+
position = 0
|
211
|
+
assoc = []
|
212
|
+
tagged.scan(re) do |word, tag|
|
213
|
+
tagged_begin = $~.begin(0)
|
214
|
+
tagged_end = $~.end(0)
|
215
|
+
|
216
|
+
word_re = Regexp.new(Regexp.escape(word).gsub(/_/, '\s*'))
|
217
|
+
src_match = text[src_last..-1].match(word_re) # find the word
|
218
|
+
if src_match
|
219
|
+
src_begin = src_last + src_match.begin(0)
|
220
|
+
src_end = src_last + src_match.end(0)
|
221
|
+
|
222
|
+
src_last = src_end
|
223
|
+
assoc << {:text_id => id, :position => position, :src_begin => src_begin, :src_end => src_end, :tagged_begin => tagged_begin, :tagged_end => tagged_end}
|
224
|
+
else
|
225
|
+
STDERR.puts "Could not find match for '#{word}' in text #{ds[:filename]}"
|
226
|
+
end
|
227
|
+
position += 1
|
228
|
+
end
|
229
|
+
db[:assoc].multi_insert(assoc)
|
230
|
+
rescue => e
|
231
|
+
STDERR.puts e
|
232
|
+
STDERR.puts e.backtrace
|
233
|
+
raise e
|
234
|
+
end
|
235
|
+
|
236
|
+
end
|
237
|
+
|
238
|
+
if __FILE__ == $0
|
239
|
+
prog_proc = lambda do |prog|
|
240
|
+
puts "#{prog.task}#{prog.data ? "(#{prog.data})" : ""} #{prog.done}/#{prog.work}"
|
241
|
+
end
|
242
|
+
|
243
|
+
# uri = 'postgres://concordancer:concordancer@localhost:5433/concordancer'
|
244
|
+
uri = 'jdbc:postgresql://localhost:5433/concordancer?user=concordancer&password=concordancer'
|
245
|
+
c = Concordancer.new(uri, :progress_proc => prog_proc, :recreate => true)
|
246
|
+
#puts c.filter_to_re('"sunt" APN NPN')
|
247
|
+
c.read('raw')
|
248
|
+
#c.find(%q[("de")? (N*)+ "si" (N*){1,2} (AS*)?])
|
249
|
+
#c.link! #if RECREATE
|
250
|
+
#c.link c.all
|
251
|
+
#ds = db[:assoc].filter(:text_id => 1).order_by(:position).filter{tagged_end >= 150}.filter{tagged_begin < 330}
|
252
|
+
#puts ds.sql
|
253
|
+
#exit
|
254
|
+
|
255
|
+
t = Time.now
|
256
|
+
output = ""
|
257
|
+
results = c.find(%q[(*){0,3} N* N* (*){0,3}]) do |text, tagged|
|
258
|
+
output << "#{text}\n#{tagged}\n\n"
|
259
|
+
end
|
260
|
+
puts Time.now - t
|
261
|
+
puts "Results: #{ results }"
|
262
|
+
puts output
|
263
|
+
end
|
data/lib/lumix/filter.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
# Translates the concordancer's search filter syntax into a Regexp that is
# matched against tagged text of the form "word|TAG word|TAG ...".
module Filter
  class << self

    # Transformation steps, applied to the filter string in this order.
    HANDLERS = %w[handle_wildcard handle_choice handle_literals
      handle_dangling_tags handle_multiplicators assure_wordbounds]

    # Converts a filter expression into a Regexp by running it through
    # every handler in HANDLERS.
    #
    # Intermediate results are only traced (to stderr) when Ruby runs with
    # $DEBUG set, instead of being printed to stdout on every call.
    #
    # TODO refactor
    def to_re(filter)
      source = HANDLERS.inject(filter) do |expr, handler|
        trace expr
        trace "#{handler} -->"
        send handler, expr
      end
      trace source
      Regexp.new(source)
    end

    # character wildcard replacement: every "*" that does not directly
    # follow a ")" becomes a lazy "any characters" pattern
    def handle_wildcard(re)
      re.gsub(/([^\)])\*/, '\1[^\b]*?')
    end

    # Takes (!A B C) and transforms it into a run of non-space characters
    # guarded by one negative lookahead per listed alternative
    def handle_choice(re)
      re.gsub(/\(\!([^\)]+)\)/) do
        c = $1.split.map{ |t| '(?!' + t + '\b)' }.join
        '(?:' + c + '\S)*'
      end
    end

    # transforms literals delimited by "" into word\|tag patterns; a literal
    # without an explicit |tag suffix matches any tag
    def handle_literals(re)
      re.gsub(/\"([^\"]*)\"(?:\|(\S+?))?/) do
        str = $1
        tag = $2 || '\S+?'
        str.gsub(/ /, '_') + '\|' + tag
      end
    end

    # add wildcard word match on tag-only search criteria
    def handle_dangling_tags(re)
      re.split(/ /).map do |s|
        if s['\|']
          s
        else
          s.gsub(/(\(?)(\S+)/, '\1[^\s\|]+\|\2')
        end
      end.join('\s+')
    end

    # Handles the + * ? and {} qualifiers
    def handle_multiplicators(re)
      re.gsub(/\(([^\)]+)(\)((\{[^\}]+\})|\*|\+|\?)\s?)/, '(?:\b\1\b\2')
    end

    # anchors the whole expression on word boundaries
    def assure_wordbounds(re)
      '\b' + re + '\b'
    end

    private

    # Emits a diagnostic line on stderr, but only in $DEBUG mode.
    def trace(message)
      $stderr.puts(message) if $DEBUG
    end

  end
end
|
data/lib/lumix/gui.rb
ADDED
@@ -0,0 +1,148 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
require 'lumix/concordancer'
|
3
|
+
require 'sweet'
|
4
|
+
require 'lumix/result_view'
|
5
|
+
#Sweet.set_debug
|
6
|
+
|
7
|
+
|
8
|
+
Texts = {:search => "Searching...", :read => "Importing files", :link => "Linking texts"}
|
9
|
+
Indicator = %w'} ) ] | [ ( {'
|
10
|
+
|
11
|
+
CONF = 'config.yaml'
|
12
|
+
ConfigStruct = Struct.new(:database_uri)
|
13
|
+
CConfig = YAML.load_file(CONF) rescue ConfigStruct.new('jdbc:postgresql://localhost:5432/concordancer?user=concordancer&password=concordancer')
|
14
|
+
def save!
|
15
|
+
File.open(CONF, 'w') do |f|
|
16
|
+
f.write(CConfig.to_yaml)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
Sweet.app :title => 'Ruby Concordancer', :width => 800, :height => 700, :layout => :grid.conf(:numColumns => 3) do
|
21
|
+
def conc
|
22
|
+
@conc ||= Concordancer.new(CConfig.database_uri, :progress_proc => @progress_proc)#, :recreate => true)
|
23
|
+
end
|
24
|
+
|
25
|
+
@progress_proc = proc do |p|
|
26
|
+
task = Texts[p.task] || p.task
|
27
|
+
perform do
|
28
|
+
if p.done == p.work
|
29
|
+
@p_status.text = 'Done!'
|
30
|
+
@p_indicator.text = ''
|
31
|
+
@p_bar.fraction = 0
|
32
|
+
else
|
33
|
+
@p_status.text = task
|
34
|
+
@p_indicator.text = Indicator[p.done % Indicator.size]
|
35
|
+
@p_bar.fraction = p.done.to_f / p.work
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
save! unless File.exists?(CONF)
|
41
|
+
|
42
|
+
menubar do
|
43
|
+
submenu '&File' do
|
44
|
+
submenu '&Import...' do
|
45
|
+
item('E&nglish texts') { import_chooser('en') }
|
46
|
+
item('&Romanian texts') { import_chooser('ro') }
|
47
|
+
end
|
48
|
+
item('&Export findings...') { export_findings }
|
49
|
+
separator
|
50
|
+
item('&Relink texts') { relink }
|
51
|
+
item('&Clear the database') { reconnect :recreate => true }
|
52
|
+
separator
|
53
|
+
item('E&xit') { exit }
|
54
|
+
end
|
55
|
+
# submenu 'C&orpora' do
|
56
|
+
# @m_cat = submenu '&Category' do
|
57
|
+
# item('Cre&ate...') { create_category }
|
58
|
+
# item('&Import...') { import_chooser }
|
59
|
+
# separator
|
60
|
+
# item('&Edit...') { edit_category }
|
61
|
+
# item('&Delete') { delete_category }
|
62
|
+
# end
|
63
|
+
# @m_text = submenu '&Text' do
|
64
|
+
# item('&Reimport...') { reimport_chooser }
|
65
|
+
# item('&Delete') { delete_text }
|
66
|
+
# end
|
67
|
+
# end
|
68
|
+
# @m_stats = submenu '&Statistics' do
|
69
|
+
# item('&Editor') { script_editor }
|
70
|
+
# separator
|
71
|
+
# item('&Load Script...') { load_script }
|
72
|
+
# end
|
73
|
+
# submenu "&Help" do
|
74
|
+
# separator
|
75
|
+
# item('&About') { about }
|
76
|
+
# end
|
77
|
+
end
|
78
|
+
|
79
|
+
tree :grid_data => {:align => [:fill, :fill], :span => [1, 2], :grab => [true, true]}
|
80
|
+
|
81
|
+
@filter = edit_line 'NSN NSN', :grid_data => {:align => [:fill, :center], :grab => true}, :max_size => 40 do
|
82
|
+
perform_search
|
83
|
+
end
|
84
|
+
button 'Search' do
|
85
|
+
perform_search
|
86
|
+
end
|
87
|
+
|
88
|
+
@results = table :columns => %w[Text Left Hit Right], :sort => true, :grid_data => {:align => [:fill, :fill], :span => 2, :grab => [true, true]}, :scroll => true
|
89
|
+
|
90
|
+
@counter = label :grid_data => {:span => 2, :align => :fill}
|
91
|
+
|
92
|
+
@p_status = label(:grid_data => {:align => [:fill, :bottom], :grab => true})
|
93
|
+
@p_bar = progress(:width => 50, :grid_data => {:align => [:right, :bottom]})
|
94
|
+
@p_indicator = label(' ', :grid_data => {:align => [:right, :bottom]})
|
95
|
+
|
96
|
+
|
97
|
+
def perform_search
|
98
|
+
filter = @filter.text
|
99
|
+
@results.data.clear
|
100
|
+
Thread.new do
|
101
|
+
unless filter.empty?
|
102
|
+
puts "finding #{filter}"
|
103
|
+
found = conc.find(filter) do |text, tagged|
|
104
|
+
@results.add_hit(text.name, text.left, text.to_s, text.right)
|
105
|
+
end
|
106
|
+
end
|
107
|
+
perform do
|
108
|
+
@counter.text = "#{found} matches"
|
109
|
+
@p_status.text = "Found #{found || 'no'} matches for #{filter}"
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
def import_chooser(lang)
|
115
|
+
conc.tp.lang = lang
|
116
|
+
Thread.new(conc) do |conc|
|
117
|
+
conc.read('raw')
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
def export_findings
|
122
|
+
filename = to_filename(@filter.text) + '.findings'
|
123
|
+
@p_status.text = "Exporting to #{filename}"
|
124
|
+
File.open(filename, 'w') do |f|
|
125
|
+
@results.items.each do |item|
|
126
|
+
unless item.getChecked
|
127
|
+
left, hit, right = (0..2).map{ |i| item.text(i) }
|
128
|
+
f.puts "#{left}\t#{hit}\t#{right}"
|
129
|
+
end
|
130
|
+
end
|
131
|
+
end
|
132
|
+
@p_status.text = "Done! Exported to file #{filename}"
|
133
|
+
end
|
134
|
+
|
135
|
+
def relink
|
136
|
+
Thread.new(conc) do |conc|
|
137
|
+
conc.link!
|
138
|
+
end
|
139
|
+
end
|
140
|
+
|
141
|
+
def to_filename(filter)
|
142
|
+
filter.gsub(/\s+/, "_").gsub(/[\*\.\?\"]/, '')
|
143
|
+
end
|
144
|
+
|
145
|
+
# Replaces the current concordancer with a fresh one connected to the
# configured database URI.
#
# @param opts [Hash] extra Concordancer options (e.g. :recreate => true);
#   the GUI's progress callback is always passed along as :progress_proc
# @return the new Concordancer
def reconnect(opts = {})
  # Hash#merge (the original "mergs" was a typo raising NoMethodError)
  @conc = Concordancer.new(CConfig.database_uri, opts.merge(:progress_proc => @progress_proc))
end
|
148
|
+
end
|
data/lib/lumix/main.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
class Java::OrgEclipseSwtWidgets::Table
|
2
|
+
|
3
|
+
attr_accessor :data, :tooltips
|
4
|
+
|
5
|
+
def sweeten(app, opts={}, &block)
|
6
|
+
@data = []
|
7
|
+
@tooltips = []
|
8
|
+
super
|
9
|
+
@redraw_thread = Thread.new do
|
10
|
+
while !isDisposed
|
11
|
+
if @dirty
|
12
|
+
@dirty = false
|
13
|
+
perform do
|
14
|
+
setItemCount data.size
|
15
|
+
clearAll if clear_all
|
16
|
+
end
|
17
|
+
end
|
18
|
+
sleep 1 # TODO find a better alternative
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
# TODO implement tooltips
|
23
|
+
|
24
|
+
addListener swt::SetData do |e|
|
25
|
+
item = e.item
|
26
|
+
index = indexOf(item)
|
27
|
+
item.setText(Array(data[index]).to_java(:string))
|
28
|
+
end
|
29
|
+
|
30
|
+
addListener swt::Resize do |e|
|
31
|
+
default_weight = 1.0 / columns.size
|
32
|
+
current_width = @old_width
|
33
|
+
w = width
|
34
|
+
columns[0..-2].each do |c|
|
35
|
+
weight = c.width == 0 ? default_weight : c.width.to_f / current_width
|
36
|
+
c.width = w * weight
|
37
|
+
end
|
38
|
+
columns[columns.size - 1].pack
|
39
|
+
@old_width = w
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
def columns=(*titles)
|
44
|
+
if titles
|
45
|
+
titles.each do |title|
|
46
|
+
col = widgets::TableColumn.new(self, swt::CENTER)
|
47
|
+
col.setText title
|
48
|
+
end
|
49
|
+
|
50
|
+
setHeaderVisible true
|
51
|
+
setLinesVisible true
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
def sort=(sort)
|
56
|
+
sort = Hash.new(true) if [true, :all].member?(sort)
|
57
|
+
if sort
|
58
|
+
columns.each_with_index do |col, index|
|
59
|
+
if sort[col.text]
|
60
|
+
col.addListener swt::Selection do
|
61
|
+
if data
|
62
|
+
@data = data.sort_by {|e| e[index] }
|
63
|
+
update :clear
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
::Sweet::WIDGET_DEFAULTS[:table] = {
|
73
|
+
:style => [:border, :virtual, :check]
|
74
|
+
}
|
75
|
+
::Sweet::WIDGET_HACKS[Java::OrgEclipseSwtWidgets::Table] = {
|
76
|
+
:block_handler => :set_data,
|
77
|
+
:custom_code => proc {
|
78
|
+
def update(clear_all = false)
|
79
|
+
return if isDisposed
|
80
|
+
setItemCount data.size
|
81
|
+
clearAll if clear_all
|
82
|
+
end
|
83
|
+
|
84
|
+
# Appends one result row to the table's backing data and marks the table
# dirty so that it gets redrawn.
#
# @param args the cell values of the row, optionally followed by an
#   options hash with :data (row values overriding the positional ones)
#   and/or :tooltips (defaults to the row values)
def add_hit(*args)
  # Hash === obj tests the class; the reversed form (obj === Hash) compared
  # the value with the Hash class itself and therefore never matched.
  opts = Hash === args.last ? args.pop : {}
  d = opts[:data] || args
  t = opts[:tooltips] || d
  data << d
  tooltips << t
  @dirty = true
end
|
92
|
+
}
|
93
|
+
}
|
@@ -0,0 +1,35 @@
|
|
1
|
+
class CreateTables < Sequel::Migration
|
2
|
+
|
3
|
+
def up
|
4
|
+
create_table :texts do
|
5
|
+
primary_key :id
|
6
|
+
String :digest
|
7
|
+
String :text
|
8
|
+
String :tagged
|
9
|
+
String :filename
|
10
|
+
String :tagged_filename
|
11
|
+
|
12
|
+
index :digest
|
13
|
+
end
|
14
|
+
|
15
|
+
create_table :assoc do
|
16
|
+
primary_key :id
|
17
|
+
Integer :text_id, :references => :texts
|
18
|
+
Integer :position
|
19
|
+
Integer :src_begin
|
20
|
+
Integer :src_end
|
21
|
+
Integer :tagged_begin
|
22
|
+
Integer :tagged_end
|
23
|
+
|
24
|
+
index [:text_id, :tagged_end]
|
25
|
+
index [:text_id, :tagged_begin]
|
26
|
+
index [:text_id, :position]
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def down
|
31
|
+
drop_table :assoc
|
32
|
+
drop_table :texts
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
class Categories < Sequel::Migration
|
2
|
+
|
3
|
+
def up
|
4
|
+
create_table :categories do
|
5
|
+
primary_key :id
|
6
|
+
Integer :parent_id, :references => :categories
|
7
|
+
String :name
|
8
|
+
String :key
|
9
|
+
|
10
|
+
index [:parent_id, :id]
|
11
|
+
end
|
12
|
+
|
13
|
+
alter_table :texts do
|
14
|
+
add_column :category_id, Integer, :references => :categories
|
15
|
+
|
16
|
+
add_index [:category_id, :id]
|
17
|
+
end
|
18
|
+
|
19
|
+
end
|
20
|
+
|
21
|
+
def down
|
22
|
+
alter_table :texts do
|
23
|
+
drop_column :category_id
|
24
|
+
end
|
25
|
+
drop_table :categories
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
$KCODE='UTF8'
|
2
|
+
|
3
|
+
require 'soap/wsdlDriver'
|
4
|
+
|
5
|
+
# Thin client around the RACAI TextProcessing SOAP web service plus a few
# helpers for turning raw text files into tagged ones.
class TextProcessing

  # language code passed to the web service
  attr_accessor :lang

  def initialize(lang = 'ro')
    @lang = lang
  end

  # Lazily creates the SOAP driver and reuses it afterwards.
  # (The former "@rpc if @rpc" had no return, so a new driver was built on
  # every call.)
  def rpc
    @rpc ||= SOAP::WSDLDriverFactory.new('http://www.racai.ro/webservices/TextProcessing.asmx?WSDL').create_rpc_driver
  end

  # inserts "tagged" as the second to last part in the filename
  # e.g.
  # test.txt -> test.tagged.txt
  # special case when no extension is present:
  # README -> README.tagged
  #
  # Only the last path component is considered, so dots in directory
  # names (./README, my.dir/README) leave the directory part untouched.
  def create_tagged_filename(infile)
    base = File.basename(infile)
    dir = if infile.end_with?(base)
      infile[0, infile.length - base.length]
    else
      # e.g. trailing separator: fall back to the directory name
      File.join(File.dirname(infile), '')
    end
    ext = File.extname(base)
    stem = ext.empty? ? base : base[0, base.length - ext.length]
    "#{dir}#{stem}.tagged#{ext}"
  end

  # Expands the given names into a flat, duplicate-free list of files:
  # directories are replaced by all files below them; directories
  # themselves and anything containing ".tagged" are dropped.
  def to_filelist(*files)
    files = files.flatten.map do |filename|
      if File.directory? filename
        Dir.glob File.join(filename, '**/*') # add all files from that directory
      else
        filename
      end
    end.flatten.compact.uniq # make sure every file is only processed once
    files.delete_if { |filename| File.directory?(filename) || filename['.tagged']} # remove remaining folders
  end

  # the core processing routine using the webservice
  def process(text)
    response = rpc.Process(:input => text, :lang => lang)
    response.processResult
  end

  # processes everything read from standard input and prints the result
  def process_stdin
    puts process($stdin.read)
  end

  # takes the text from infile and outputs the result into the outfile
  def process_file(infile, outfile)
    File.open(outfile, 'w') do |out|
      out.write process(File.read(infile))
    end
  end

end
|
60
|
+
|
61
|
+
|
62
|
+
# process the args if called as main script
|
63
|
+
if __FILE__ == $0
|
64
|
+
args = ARGV
|
65
|
+
tp = if args.first == '-lang'
|
66
|
+
args.shift
|
67
|
+
TextProcessing.new(args.shift)
|
68
|
+
else
|
69
|
+
TextProcessing.new
|
70
|
+
end
|
71
|
+
|
72
|
+
if args.empty?
|
73
|
+
tp.process_stdin
|
74
|
+
else
|
75
|
+
files = tp.to_filelist(args)
|
76
|
+
|
77
|
+
puts "Processing files:"
|
78
|
+
for infile in files
|
79
|
+
outfile = tp.create_tagged_filename(infile)
|
80
|
+
puts "#{infile} -> #{outfile}"
|
81
|
+
tp.process_file(infile, outfile) unless File.exist?(outfile)
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
data/spec/filter_spec.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
# To change this template, choose Tools | Templates
|
2
|
+
# and open the template in the editor.
|
3
|
+
|
4
|
+
require 'filter'
|
5
|
+
puts RUBY_PLATFORM
|
6
|
+
TXT = "They|PPER3 have|AUXP business|NN uses|VERB3 derp|ADNE too|ADVE " +
|
7
|
+
"Apr|NN 4th|CD 2007|M have|DMKD .|PERIOD"
|
8
|
+
|
9
|
+
def search(filter)
|
10
|
+
TXT.scan(Filter.to_re(filter))
|
11
|
+
end
|
12
|
+
|
13
|
+
describe Filter do
|
14
|
+
|
15
|
+
it "should find tags" do
|
16
|
+
search('NN').should == %w[business|NN Apr|NN]
|
17
|
+
end
|
18
|
+
|
19
|
+
it "should find words" do
|
20
|
+
search('"have"').should == %w[have|AUXP have|DMKD]
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should find word and tag combinations" do
|
24
|
+
search('"have" NN "uses"').should == ['have|AUXP business|NN uses|VERB3']
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should find wildcard tags" do
|
28
|
+
search('AU*').should == %w[have|AUXP]
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should find exclusions" do
|
32
|
+
search('A(!UXP DNE)').should == %w[too|ADVE]
|
33
|
+
end
|
34
|
+
|
35
|
+
it "should find word|tag pairs" do
|
36
|
+
search('"have"|D*').should == %w[have|DMKD]
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should find unlimited repetitions" do
|
40
|
+
search('(AD*)+').should == ['derp|ADNE too|ADVE']
|
41
|
+
end
|
42
|
+
|
43
|
+
it "should find limited repetitions" do
|
44
|
+
search('(AD*){3}').should == []
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'concordancer'
|
2
|
+
|
3
|
+
describe TextSnippet do
|
4
|
+
before(:each) do
|
5
|
+
end
|
6
|
+
|
7
|
+
it "should handle umlauts properly" do
|
8
|
+
ts = create_ts('eins zwei drei vierß öfünfä ßechs sieben acht neun zehn', /öfünfä/)
|
9
|
+
ts.left(3).should == 'zwei drei vierß '
|
10
|
+
ts.to_s.should == 'öfünfä'
|
11
|
+
ts.right(3).should == ' ßechs sieben acht'
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should handle partial words and umlauts properly" do
|
15
|
+
ts = create_ts('eins zwei drei vierß öfünfä ßechs sieben acht neun zehn', /fünf/)
|
16
|
+
ts.left(3).should == 'zwei drei vierß ö'
|
17
|
+
ts.to_s.should == 'fünf'
|
18
|
+
ts.right(3).should == 'ä ßechs sieben acht'
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should have dynamic left context" do
|
22
|
+
ts = create_ts('one two three four five six seven eight nine ten', /five/)
|
23
|
+
ts.left(1).should == 'four '
|
24
|
+
ts.left(2).should == 'three four '
|
25
|
+
ts.left(10).should == 'one two three four '
|
26
|
+
end
|
27
|
+
|
28
|
+
it "should have dynamic right context" do
|
29
|
+
ts = create_ts('one two three four five six seven eight nine ten', /five/)
|
30
|
+
ts.right(1).should == ' six'
|
31
|
+
ts.right(2).should == ' six seven'
|
32
|
+
ts.right(10).should == ' six seven eight nine ten'
|
33
|
+
end
|
34
|
+
|
35
|
+
it "should work correctly with newlines" do
|
36
|
+
ts = create_ts("one two\n three four five six seven eight\n nine ten", /five/)
|
37
|
+
ts.left(1).should == 'four '
|
38
|
+
ts.right(1).should == ' six'
|
39
|
+
end
|
40
|
+
|
41
|
+
it "should replace newlines and tabs with spaces" do
|
42
|
+
ts = create_ts("one two three\n four five six\n seven eight nine ten", /five/)
|
43
|
+
ts.left(2).should == 'three four '
|
44
|
+
ts.right(2).should == ' six seven'
|
45
|
+
end
|
46
|
+
|
47
|
+
end
|
48
|
+
|
49
|
+
# Builds a TextSnippet for the first match of +re+ in +text+.
# TextSnippet.new takes (name, text, first, last); the name is irrelevant
# for these specs, so a fixed label is passed.
def create_ts(text, re)
  m = text.match(re)
  TextSnippet.new 'spec', text, m.begin(0), m.end(0)
end
|
metadata
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: lumix
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: 0.0.1
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Michael Klaus
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-07-27 00:00:00 +02:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: sweet
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 0
|
29
|
+
version: "0"
|
30
|
+
type: :runtime
|
31
|
+
version_requirements: *id001
|
32
|
+
- !ruby/object:Gem::Dependency
|
33
|
+
name: sequel
|
34
|
+
prerelease: false
|
35
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
segments:
|
40
|
+
- 0
|
41
|
+
version: "0"
|
42
|
+
type: :runtime
|
43
|
+
version_requirements: *id002
|
44
|
+
- !ruby/object:Gem::Dependency
|
45
|
+
name: jdbc-postgres
|
46
|
+
prerelease: false
|
47
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
48
|
+
requirements:
|
49
|
+
- - ">="
|
50
|
+
- !ruby/object:Gem::Version
|
51
|
+
segments:
|
52
|
+
- 0
|
53
|
+
version: "0"
|
54
|
+
type: :runtime
|
55
|
+
version_requirements: *id003
|
56
|
+
description: A concordancer for corpus-based linguistic research.
|
57
|
+
email: Michael.Klaus@gmx.net
|
58
|
+
executables:
|
59
|
+
- lumix
|
60
|
+
extensions: []
|
61
|
+
|
62
|
+
extra_rdoc_files: []
|
63
|
+
|
64
|
+
files:
|
65
|
+
- COPYING
|
66
|
+
- bin/lumix
|
67
|
+
- spec/text_snippet_spec.rb
|
68
|
+
- spec/filter_spec.rb
|
69
|
+
- lib/lumix/filter.rb
|
70
|
+
- lib/lumix/result_view.rb
|
71
|
+
- lib/lumix/gui.rb
|
72
|
+
- lib/lumix/textprocessing.rb
|
73
|
+
- lib/lumix/main.rb
|
74
|
+
- lib/lumix/concordancer.rb
|
75
|
+
- lib/lumix/schema/001_create_tables.rb
|
76
|
+
- lib/lumix/schema/002_categories.rb
|
77
|
+
has_rdoc: true
|
78
|
+
homepage: http://github.org/QaDeS/lumix
|
79
|
+
licenses: []
|
80
|
+
|
81
|
+
post_install_message:
|
82
|
+
rdoc_options: []
|
83
|
+
|
84
|
+
require_paths:
|
85
|
+
- lib
|
86
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
87
|
+
requirements:
|
88
|
+
- - ">="
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
segments:
|
91
|
+
- 0
|
92
|
+
version: "0"
|
93
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
94
|
+
requirements:
|
95
|
+
- - ">="
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
segments:
|
98
|
+
- 0
|
99
|
+
version: "0"
|
100
|
+
requirements: []
|
101
|
+
|
102
|
+
rubyforge_project:
|
103
|
+
rubygems_version: 1.3.6
|
104
|
+
signing_key:
|
105
|
+
specification_version: 3
|
106
|
+
summary: A concordancer for corpus-based linguistic research.
|
107
|
+
test_files: []
|
108
|
+
|