lumix 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/COPYING ADDED
@@ -0,0 +1,18 @@
1
+ Copyright (c) 2010 Michael Klaus
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to
5
+ deal in the Software without restriction, including without limitation the
6
+ rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7
+ sell copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16
+ THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/bin/lumix ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env jruby
2
+
3
+ require 'rubygems'
4
+ require 'lumix/gui'
@@ -0,0 +1,263 @@
1
+ #!/bin/env ruby
2
+
3
+ # TODO take care of 's problem
4
+ # TODO remove Word count line
5
+
6
+ require 'rubygems'
7
+ require 'digest/md5'
8
+ require 'sequel'
9
+ require 'sequel/extensions/migration'
10
+
11
+ require 'lumix/textprocessing'
12
+ require 'lumix/filter'
13
+
14
+ DB_VERSION = 2
15
class String
  # Returns the hex-encoded MD5 digest of the string's current content.
  #
  # The digest is computed on every call instead of being cached in an
  # instance variable: a cached value would be stale after the string is
  # mutated, and setting an instance variable fails on frozen strings.
  def digest
    Digest::MD5.hexdigest(self)
  end
end
23
+
24
# A character range (begin...end) inside a named text, together with helpers
# that return the range itself and the words surrounding it.
class TextSnippet
  attr_reader :name, :text, :begin, :end

  # name  - label of the text (callers in this file pass a file basename)
  # text  - the complete text the snippet is taken from
  # first - index of the first character of the snippet
  # last  - index one past the last character of the snippet
  def initialize(name, text, first, last)
    @name = name
    @text = text
    @begin = first
    @end = last
  end

  # The snippet itself with every whitespace run collapsed to one space.
  # Returns '' when the range lies outside the text.
  def to_s
    cleanup(@text[@begin...@end])
  end

  # Up to +context+ whitespace-separated words directly before the snippet,
  # including a partial word that touches the snippet's start.
  def left(context = 5)
    before = @text[0...@begin].to_s
    cleanup(before[/((\S+\s+){0,#{context}}\S*)\z/m, 1])
  end

  # Up to +context+ whitespace-separated words directly after the snippet,
  # including a partial word that touches the snippet's end.
  # Returns '' when the snippet ends beyond the text (the slice is nil then).
  def right(context = 5)
    after = @text[@end..-1].to_s
    cleanup(after[/\A(\S*(\s+\S+){0,#{context}})/m, 1])
  end

  # Collapses every run of whitespace (including newlines and tabs) into a
  # single space; nil is treated as the empty string.
  def cleanup(txt)
    txt.to_s.gsub(/\s+/, ' ')
  end
end
47
+
48
+ Progress = Struct.new(:task, :work, :data, :done)
49
+
50
# Stores raw texts and their tagged counterparts in a Sequel database, links
# each tagged token to its character range in the raw text (table :assoc),
# and searches the tagged texts with Filter expressions.
class Concordancer

  attr_reader :db, :tp
  attr_accessor :progress_proc

  # db_uri  - connection string handed to Sequel.connect
  # options - :progress_proc => callable that receives a Progress struct,
  #           :recreate      => when truthy, all tables are dropped before
  #                             the schema migrations are applied
  def initialize(db_uri, options = {})
    @progress_proc = options[:progress_proc]
    @db = connect(db_uri) do |db|
      db.tables.each { |t| db.drop_table t } if options[:recreate]
    end
    @ids = db[:texts].map { |v| v[:id] }
    @tp = TextProcessing.new
  end

  # True when the requested database was unreachable and the sqlite
  # fallback connection is in use.
  def fallback?
    @fallback
  end

  # Returns the id of the stored text whose digest equals that of +file+'s
  # content, or nil when the file has not been imported.
  def get_id(file)
    saved = db[:texts][:digest => File.read(file).digest]
    saved ? saved[:id] : nil
  end

  # Imports all given files/directories (expanded by tp.to_filelist) and
  # links them afterwards. Progress is reported after every file.
  def read(*files)
    files = tp.to_filelist(files)
    prog = Progress.new(:read, files.size)
    puts "Reading #{files.size} files"
    files.each_with_index do |file, index|
      # read_file is defined on this class; TextProcessing has no such method
      read_file(file)
      progress(prog, index + 1)
    end
    link
  end

  # Imports a single file unless a text with the same digest is already
  # stored. The tagged version is created through the text processor when
  # the tagged file does not exist yet.
  def read_file(file)
    # read the raw text
    text = File.read(file)
    return if db[:texts][:digest => text.digest]

    # retrieve the tagged version; both helpers live on TextProcessing
    tagged_file = tp.create_tagged_filename(file)
    tp.process_file(file, tagged_file) unless File.exist?(tagged_file)

    tagged = retag(File.read(tagged_file))
    id = db[:texts].insert(:digest => text.digest, :text => text, :tagged => tagged, :filename => file, :tagged_filename => tagged_file)
    @ids << id
  end

  # Ids of all stored texts, one single-element array per row.
  def all
    db[:texts].select(:id).map { |v| v.values }
  end

  # Like link, but deletes the existing associations of every text first so
  # that each text is linked again.
  def link!(*ids)
    link(*ids) do |ds|
      ds.delete
    end
  end

  # Links the texts with the given ids (all texts when none are given).
  # A text is only linked when it has no associations; the optional block
  # receives the text's association dataset before that check.
  def link(*ids)
    ids = all if ids.empty?
    ids.flatten!
    prog = Progress.new(:link, ids.size)
    progress(prog)

    ids.each_with_index do |id, index|
      ds = db[:assoc].filter(:text_id => id)
      yield ds if block_given?

      link_text(id) if ds.empty?
      progress(prog, index + 1)
    end
  end

  # Searches all tagged texts for +filter+ (converted by Filter.to_re).
  # For every hit a TextSnippet of the raw text and one of the tagged text
  # are yielded; without a block they are printed. Returns the hit count.
  def find(filter)
    texts = db[:texts]
    prog = Progress.new(:search, texts.count, filter)
    progress(prog)

    re = Filter.to_re(filter)

    total = 0
    texts.each_with_index do |t, index|
      t_id, text, tagged = t[:id], t[:text], t[:tagged]

      # matches to ranges
      ranges = []
      tagged.scan(re) do
        hit = Regexp.last_match
        # TODO decouple database operations for performance
        ranges << find_range(t_id, hit.begin(0), hit.end(0))
      end

      ranges.each do |f|
        text_snippet = TextSnippet.new(File.basename(t[:filename]), text, f[:src_begin].to_i, f[:src_end].to_i)
        tagged_snippet = TextSnippet.new(File.basename(t[:tagged_filename]), tagged, f[:tagged_begin].to_i, f[:tagged_end].to_i)
        if block_given?
          yield text_snippet, tagged_snippet
        else
          puts text_snippet
          puts tagged_snippet
          puts
        end
        total += 1
      end
      progress prog, index + 1
    end
    total
  end

  # Returns the smallest source/tagged range covering all associations of
  # text +t_id+ that overlap the tagged range t_begin...t_end.
  def find_range(t_id, t_begin, t_end)
    ds = db[:assoc].filter(:text_id => t_id).filter{tagged_end >= t_begin}.filter{tagged_begin < t_end}
    ds.select{[{min(:src_begin) => :src_begin},{ max(:src_end) => :src_end}, {min(:tagged_begin) => :tagged_begin}, {max(:tagged_end) => :tagged_end}]}.first
  end

  private

  # Updates +prog+ and hands it to progress_proc; does nothing without one.
  def progress(prog, done = 0, data = prog.data)
    return unless progress_proc

    prog.done = done
    prog.data = data
    progress_proc.call(prog)
  end

  # Connects to +db_uri+, falling back to a local sqlite database when a
  # test query fails. Yields the connection before the migrations from the
  # 'schema' directory next to this file are applied, then returns it.
  def connect(db_uri)
    db = Sequel.connect(db_uri)
    begin
      db.get(1)
      @fallback = false
    rescue Exception => e
      # NOTE(review): deliberately broad best-effort fallback kept as is;
      # consider narrowing to StandardError once the errors raised by the
      # JDBC adapter are confirmed.
      puts 'Falling back to sqlite'
      puts e
      db = Sequel.connect('jdbc:sqlite://concordancer.db')
      @fallback = true
    end
    yield db
    migration_path = File.join(File.dirname(__FILE__), 'schema')
    Sequel::Migrator.apply(db, migration_path, DB_VERSION)
    db
  end

  # Reduces tagger output tokens of the form word|lemma|tag[|tag2] to
  # "word|tag " and turns every empty token into a newline.
  def retag(text)
    text.split(/[ \n]/).inject('') do |result, token|
      word, _lemma, tag = token.split(/\|/)
      # append in place; repeated String#+ would copy the result each time
      result << (word ? "#{word}|#{tag} " : "\n")
    end
  end

  # Walks through the tagged tokens of text +id+, finds each word in the raw
  # text (searching forward from the previous match) and stores the
  # resulting source/tagged ranges in :assoc.
  def link_text(id)
    ds = db[:texts][:id => id]
    text, tagged = ds[:text], ds[:tagged]
    puts "Linking text #{ds[:filename]}"

    re = /([^\s\|]+)\|(\S+)/m
    src_last = 0
    position = 0
    assoc = []
    tagged.scan(re) do |word, tag|
      # capture the tagged offsets before the source lookup below replaces
      # the last match data
      token = Regexp.last_match
      tagged_begin = token.begin(0)
      tagged_end = token.end(0)

      # underscores in the tagged word may correspond to optional whitespace
      word_re = Regexp.new(Regexp.escape(word).gsub(/_/, '\s*'))
      src_match = text[src_last..-1].match(word_re) # find the word
      if src_match
        src_begin = src_last + src_match.begin(0)
        src_end = src_last + src_match.end(0)

        src_last = src_end
        assoc << {:text_id => id, :position => position, :src_begin => src_begin, :src_end => src_end, :tagged_begin => tagged_begin, :tagged_end => tagged_end}
      else
        STDERR.puts "Could not find match for '#{word}' in text #{ds[:filename]}"
      end
      position += 1
    end
    db[:assoc].multi_insert(assoc)
  rescue => e
    STDERR.puts e
    STDERR.puts e.backtrace
    raise
  end

end
237
+
238
# Development harness: recreates the database, imports the 'raw' directory
# and times a sample search, printing all hits afterwards.
if __FILE__ == $0
  report = lambda do |prog|
    puts "#{prog.task}#{prog.data ? "(#{prog.data})" : ""} #{prog.done}/#{prog.work}"
  end

  uri = 'jdbc:postgresql://localhost:5433/concordancer?user=concordancer&password=concordancer'
  # :recreate drops every table before the schema is migrated again
  concordancer = Concordancer.new(uri, :progress_proc => report, :recreate => true)
  concordancer.read('raw')

  started = Time.now
  output = ""
  hits = concordancer.find(%q[(*){0,3} N* N* (*){0,3}]) do |text, tagged|
    output << "#{text}\n#{tagged}\n\n"
  end
  puts Time.now - started
  puts "Results: #{hits}"
  puts output
end
@@ -0,0 +1,60 @@
1
# Translates the concordancer's filter expressions into a Regexp that is
# matched against tagged text, i.e. whitespace-separated word|TAG tokens.
module Filter
  class << self

    # Names of the rewriting passes, in the order to_re applies them.
    HANDLERS = %w[handle_wildcard handle_choice handle_literals
      handle_dangling_tags handle_multiplicators assure_wordbounds].freeze

    # Converts +filter+ into a Regexp by feeding it through all HANDLERS.
    # The intermediate patterns are only printed when Ruby runs in debug
    # mode ($DEBUG), so regular callers get no output on stdout.
    # TODO refactor
    def to_re(filter)
      source = HANDLERS.inject(filter) do |pattern, handler|
        trace pattern, "#{handler} -->"
        send handler, pattern
      end
      trace source
      Regexp.new(source)
    end

    # character wildcard replacement: every * that does not directly follow
    # a closing parenthesis becomes a lazy "any characters" match
    def handle_wildcard(re)
      re.gsub(/([^\)])\*/, '\1[^\b]*?')
    end

    # Takes (!A B C) and transforms it into a repetition of non-whitespace
    # characters, each guarded by one negative lookahead per listed entry
    def handle_choice(re)
      re.gsub(/\(\!([^\)]+)\)/) do
        lookaheads = Regexp.last_match(1).split.map { |t| '(?!' + t + '\b)' }.join
        '(?:' + lookaheads + '\S)*'
      end
    end

    # transforms literals delimited by "" into word\|tag; spaces inside the
    # literal become underscores and a missing |tag suffix matches any tag
    def handle_literals(re)
      re.gsub(/\"([^\"]*)\"(?:\|(\S+?))?/) do
        hit = Regexp.last_match
        str = hit[1]
        tag = hit[2] || '\S+?'
        str.gsub(/ /, '_') + '\|' + tag
      end
    end

    # add wildcard word match on tag-only search criteria and join the
    # space-separated criteria with \s+
    def handle_dangling_tags(re)
      re.split(/ /).map do |s|
        if s.include?('\|')
          s
        else
          s.gsub(/(\(?)(\S+)/, '\1[^\s\|]+\|\2')
        end
      end.join('\s+')
    end

    # Handles the + * ? and {} qualifiers: a parenthesized group followed by
    # one of them becomes a non-capturing group wrapped in word boundaries
    def handle_multiplicators(re)
      re.gsub(/\(([^\)]+)(\)((\{[^\}]+\})|\*|\+|\?)\s?)/, '(?:\b\1\b\2')
    end

    # Surrounds the whole expression with word boundaries.
    def assure_wordbounds(re)
      '\b' + re + '\b'
    end

    private

    # Prints the given lines, but only when Ruby runs in debug mode.
    def trace(*lines)
      puts(*lines) if $DEBUG
    end

  end
end
data/lib/lumix/gui.rb ADDED
@@ -0,0 +1,148 @@
1
+ require 'yaml'
2
+ require 'lumix/concordancer'
3
+ require 'sweet'
4
+ require 'lumix/result_view'
5
+ #Sweet.set_debug
6
+
7
+
8
+ Texts = {:search => "Searching...", :read => "Importing files", :link => "Linking texts"}
9
+ Indicator = %w'} ) ] | [ ( {'
10
+
11
+ CONF = 'config.yaml'
12
+ ConfigStruct = Struct.new(:database_uri)
13
+ CConfig = YAML.load_file(CONF) rescue ConfigStruct.new('jdbc:postgresql://localhost:5432/concordancer?user=concordancer&password=concordancer')
14
# Serializes CConfig to YAML and writes it to the file named by CONF,
# replacing any previous content.
def save!
  File.open(CONF, 'w') { |io| io.write(CConfig.to_yaml) }
end
19
+
20
+ Sweet.app :title => 'Ruby Concordancer', :width => 800, :height => 700, :layout => :grid.conf(:numColumns => 3) do
21
+ def conc
22
+ @conc ||= Concordancer.new(CConfig.database_uri, :progress_proc => @progress_proc)#, :recreate => true)
23
+ end
24
+
25
+ @progress_proc = proc do |p|
26
+ task = Texts[p.task] || p.task
27
+ perform do
28
+ if p.done == p.work
29
+ @p_status.text = 'Done!'
30
+ @p_indicator.text = ''
31
+ @p_bar.fraction = 0
32
+ else
33
+ @p_status.text = task
34
+ @p_indicator.text = Indicator[p.done % Indicator.size]
35
+ @p_bar.fraction = p.done.to_f / p.work
36
+ end
37
+ end
38
+ end
39
+
40
+ save! unless File.exists?(CONF)
41
+
42
+ menubar do
43
+ submenu '&File' do
44
+ submenu '&Import...' do
45
+ item('E&nglish texts') { import_chooser('en') }
46
+ item('&Romanian texts') { import_chooser('ro') }
47
+ end
48
+ item('&Export findings...') { export_findings }
49
+ separator
50
+ item('&Relink texts') { relink }
51
+ item('&Clear the database') { reconnect :recreate => true }
52
+ separator
53
+ item('E&xit') { exit }
54
+ end
55
+ # submenu 'C&orpora' do
56
+ # @m_cat = submenu '&Category' do
57
+ # item('Cre&ate...') { create_category }
58
+ # item('&Import...') { import_chooser }
59
+ # separator
60
+ # item('&Edit...') { edit_category }
61
+ # item('&Delete') { delete_category }
62
+ # end
63
+ # @m_text = submenu '&Text' do
64
+ # item('&Reimport...') { reimport_chooser }
65
+ # item('&Delete') { delete_text }
66
+ # end
67
+ # end
68
+ # @m_stats = submenu '&Statistics' do
69
+ # item('&Editor') { script_editor }
70
+ # separator
71
+ # item('&Load Script...') { load_script }
72
+ # end
73
+ # submenu "&Help" do
74
+ # separator
75
+ # item('&About') { about }
76
+ # end
77
+ end
78
+
79
+ tree :grid_data => {:align => [:fill, :fill], :span => [1, 2], :grab => [true, true]}
80
+
81
+ @filter = edit_line 'NSN NSN', :grid_data => {:align => [:fill, :center], :grab => true}, :max_size => 40 do
82
+ perform_search
83
+ end
84
+ button 'Search' do
85
+ perform_search
86
+ end
87
+
88
+ @results = table :columns => %w[Text Left Hit Right], :sort => true, :grid_data => {:align => [:fill, :fill], :span => 2, :grab => [true, true]}, :scroll => true
89
+
90
+ @counter = label :grid_data => {:span => 2, :align => :fill}
91
+
92
+ @p_status = label(:grid_data => {:align => [:fill, :bottom], :grab => true})
93
+ @p_bar = progress(:width => 50, :grid_data => {:align => [:right, :bottom]})
94
+ @p_indicator = label(' ', :grid_data => {:align => [:right, :bottom]})
95
+
96
+
97
+ def perform_search
98
+ filter = @filter.text
99
+ @results.data.clear
100
+ Thread.new do
101
+ unless filter.empty?
102
+ puts "finding #{filter}"
103
+ found = conc.find(filter) do |text, tagged|
104
+ @results.add_hit(text.name, text.left, text.to_s, text.right)
105
+ end
106
+ end
107
+ perform do
108
+ @counter.text = "#{found} matches"
109
+ @p_status.text = "Found #{found || 'no'} matches for #{filter}"
110
+ end
111
+ end
112
+ end
113
+
114
+ def import_chooser(lang)
115
+ conc.tp.lang = lang
116
+ Thread.new(conc) do |conc|
117
+ conc.read('raw')
118
+ end
119
+ end
120
+
121
# Writes all unchecked result rows to "<filter>.findings" as tab-separated
# left context, hit and right context, reporting progress in the status label.
def export_findings
  filename = to_filename(@filter.text) + '.findings'
  @p_status.text = "Exporting to #{filename}"
  File.open(filename, 'w') do |f|
    @results.items.each do |item|
      # NOTE(review): checked rows are skipped, as in the original code
      next if item.getChecked

      # The result table's columns are Text, Left, Hit, Right, so the
      # exported values are columns 1..3; column 0 holds the text name.
      left, hit, right = (1..3).map { |i| item.text(i) }
      f.puts "#{left}\t#{hit}\t#{right}"
    end
  end
  @p_status.text = "Done! Exported to file #{filename}"
end
134
+
135
+ def relink
136
+ Thread.new(conc) do |conc|
137
+ conc.link!
138
+ end
139
+ end
140
+
141
# Derives a file name stem from a filter expression: every whitespace run
# becomes a single underscore, then the characters * . ? and " are removed.
def to_filename(filter)
  underscored = filter.gsub(/\s+/, "_")
  underscored.gsub(/[\*\.\?\"]/, '')
end
144
+
145
# Replaces the current concordancer with a new one for the configured
# database. +opts+ is passed on to Concordancer.new (the File menu uses
# :recreate => true to clear the database); the progress callback is always
# set to @progress_proc.
def reconnect(opts = {})
  # Hash#merge; the original called the non-existent method `mergs`
  @conc = Concordancer.new(CConfig.database_uri, opts.merge(:progress_proc => @progress_proc))
end
148
+ end
data/lib/lumix/main.rb ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env jruby
2
+
3
+ $: << File.join(File.dirname(__FILE__), '..')
4
+ $: << File.join(File.dirname(__FILE__), '../../../Sweet/lib')
5
+
6
+ require 'rubygems'
7
+ require 'lumix/gui'
@@ -0,0 +1,93 @@
1
# Reopens the SWT Table widget so that it displays the rows held in the
# Ruby array +data+: item texts are filled on demand through the SetData
# listener and a background thread refreshes the item count after changes.
class Java::OrgEclipseSwtWidgets::Table

  attr_accessor :data, :tooltips

  # Initializes the row storage, starts the redraw thread and registers the
  # SetData and Resize listeners. The arguments are passed on unchanged to
  # the inherited implementation via the bare +super+.
  def sweeten(app, opts={}, &block)
    @data = []
    @tooltips = []
    super

    # Polls once per second until the widget is disposed and refreshes the
    # table whenever @dirty has been set.
    @redraw_thread = Thread.new do
      until isDisposed
        if @dirty
          @dirty = false
          perform do
            setItemCount data.size
            # NOTE(review): clear_all is not a local variable here; under
            # JRuby it presumably resolves to the Java clearAll method -
            # confirm the intent before changing this line.
            clearAll if clear_all
          end
        end
        sleep 1 # TODO find a better alternative
      end
    end

    # TODO implement tooltips

    # Fills an item with the row of +data+ stored at the item's index.
    addListener swt::SetData do |event|
      item = event.item
      row = indexOf(item)
      item.setText(Array(data[row]).to_java(:string))
    end

    # Keeps the relative widths of all columns but the last one when the
    # table is resized; columns of width 0 get an equal share. The last
    # column is packed.
    addListener swt::Resize do |_event|
      default_weight = 1.0 / columns.size
      previous_width = @old_width
      new_width = width
      columns[0..-2].each do |column|
        weight = column.width == 0 ? default_weight : column.width.to_f / previous_width
        column.width = new_width * weight
      end
      columns[columns.size - 1].pack
      @old_width = new_width
    end
  end

  # Creates one centered column per given title and shows header and lines.
  def columns=(*titles)
    if titles
      titles.each do |title|
        column = widgets::TableColumn.new(self, swt::CENTER)
        column.setText title
      end

      setHeaderVisible true
      setLinesVisible true
    end
  end

  # Makes columns sortable by clicking their header. +sort+ is either true
  # or :all (every column) or an object indexed by column title whose truthy
  # entries enable sorting for that column.
  def sort=(sort)
    sort = Hash.new(true) if [true, :all].member?(sort)
    if sort
      columns.each_with_index do |column, position|
        if sort[column.text]
          column.addListener swt::Selection do
            if data
              @data = data.sort_by { |row| row[position] }
              update :clear
            end
          end
        end
      end
    end
  end
end
71
+
72
+ ::Sweet::WIDGET_DEFAULTS[:table] = {
73
+ :style => [:border, :virtual, :check]
74
+ }
75
# Registers the Table-specific hooks with Sweet: the handler used for a
# block passed to the widget and extra methods defined by :custom_code.
::Sweet::WIDGET_HACKS[Java::OrgEclipseSwtWidgets::Table] = {
  :block_handler => :set_data,
  :custom_code => proc {
    # Adjusts the item count to the size of +data+; with a truthy
    # +clear_all+ all items are cleared as well. Does nothing once the
    # widget is disposed.
    def update(clear_all = false)
      return if isDisposed
      setItemCount data.size
      clearAll if clear_all
    end

    # Appends a row. The cell values are the given arguments unless a
    # trailing options hash provides :data; :tooltips defaults to the row
    # data. The table is only marked dirty here, not redrawn.
    def add_hit(*args)
      # Hash === obj tests the class; the original `args.last === Hash`
      # compared the argument with the Hash class itself and never matched.
      opts = Hash === args.last ? args.pop : {}
      d = opts[:data] || args
      t = opts[:tooltips] || d
      data << d
      tooltips << t
      @dirty = true
    end
  }
}
@@ -0,0 +1,35 @@
1
# Schema version 1: the imported texts and the per-token associations
# between raw and tagged text.
class CreateTables < Sequel::Migration

  def up
    # one row per imported text, looked up by the digest of its content
    create_table :texts do
      primary_key :id
      String :digest
      String :text
      String :tagged
      String :filename
      String :tagged_filename

      index :digest
    end

    # one row per tagged token: its position and its character ranges in
    # the raw (src_*) and the tagged (tagged_*) text
    create_table :assoc do
      primary_key :id
      Integer :text_id, :references => :texts
      Integer :position
      Integer :src_begin
      Integer :src_end
      Integer :tagged_begin
      Integer :tagged_end

      index [:text_id, :tagged_end]
      index [:text_id, :tagged_begin]
      index [:text_id, :position]
    end
  end

  # Drops :assoc first because it references :texts.
  def down
    drop_table :assoc
    drop_table :texts
  end

end
@@ -0,0 +1,28 @@
1
# Schema version 2: a self-referencing category table and a category
# reference on every text.
class Categories < Sequel::Migration

  def up
    create_table :categories do
      primary_key :id
      Integer :parent_id, :references => :categories
      String :name
      String :key

      index [:parent_id, :id]
    end

    alter_table :texts do
      add_column :category_id, Integer, :references => :categories

      add_index [:category_id, :id]
    end
  end

  # Removes the reference from :texts before dropping :categories.
  def down
    alter_table :texts do
      drop_column :category_id
    end
    drop_table :categories
  end

end
@@ -0,0 +1,84 @@
1
+ $KCODE='UTF8'
2
+
3
+ require 'soap/wsdlDriver'
4
+
5
# Client for the RACAI TextProcessing web service plus helpers for naming
# and collecting the files that are sent through it.
class TextProcessing

  # language code sent with every processing request (default 'ro')
  attr_accessor :lang

  def initialize(lang = 'ro')
    @lang = lang
  end

  # The SOAP RPC driver for the web service. It is created on first use and
  # then reused; the original built a new driver on every call because its
  # `@rpc if @rpc` line never returned.
  def rpc
    @rpc ||= SOAP::WSDLDriverFactory.new('http://www.racai.ro/webservices/TextProcessing.asmx?WSDL').create_rpc_driver
  end

  # inserts "tagged" as the second to last part in the filename
  # e.g.
  #   test.txt -> test.tagged.txt
  # special case when no extension is present:
  #   README -> README.tagged
  # Only the last path component is changed, so dots in directory names
  # (e.g. ./raw/README or corpus.v1/README) do not move the insertion point.
  def create_tagged_filename(infile)
    directory, separator, basename = infile.rpartition(File::SEPARATOR)
    components = basename.split(/\./)
    position = [1, components.size - 1].max
    components.insert position, 'tagged'
    directory + separator + components.join('.')
  end

  # Expands directories to all files below them and returns the unique list
  # of plain files, leaving out every name that contains '.tagged'.
  def to_filelist(*files)
    expanded = files.flatten.map do |filename|
      if File.directory? filename
        Dir.glob File.join(filename, '**/*') # add all files from that directory
      else
        filename
      end
    end
    candidates = expanded.flatten.compact.uniq # make sure every file is only processed once
    # remove remaining folders and already tagged files
    candidates.reject { |filename| File.directory?(filename) || filename.include?('.tagged') }
  end

  # the core processing routine using the webservice
  def process(text)
    response = rpc.Process(:input => text, :lang => lang)
    response.processResult
  end

  # Processes everything read from standard input and prints the result.
  def process_stdin
    puts process($stdin.read)
  end

  # takes the text from infile and outputs the result into the outfile
  def process_file(infile, outfile)
    File.open(outfile, 'w') do |out|
      out.write process(File.read(infile))
    end
  end

end
60
+
61
+
62
+ # process the args if called as main script
63
# Command line entry point: an optional leading "-lang CODE" selects the
# language; without further arguments standard input is processed, otherwise
# every listed file/directory is tagged unless its tagged file exists.
if __FILE__ == $0
  args = ARGV
  lang_given = args.first == '-lang'
  args.shift if lang_given
  tp = lang_given ? TextProcessing.new(args.shift) : TextProcessing.new

  if args.empty?
    tp.process_stdin
  else
    files = tp.to_filelist(args)

    puts "Processing files:"
    files.each do |infile|
      outfile = tp.create_tagged_filename(infile)
      puts "#{infile} -> #{outfile}"
      tp.process_file(infile, outfile) unless File.exist?(outfile)
    end
  end
end
@@ -0,0 +1,47 @@
1
+ # To change this template, choose Tools | Templates
2
+ # and open the template in the editor.
3
+
4
+ require 'filter'
5
+ puts RUBY_PLATFORM
6
+ TXT = "They|PPER3 have|AUXP business|NN uses|VERB3 derp|ADNE too|ADVE " +
7
+ "Apr|NN 4th|CD 2007|M have|DMKD .|PERIOD"
8
+
9
+ def search(filter)
10
+ TXT.scan(Filter.to_re(filter))
11
+ end
12
+
13
+ describe Filter do
14
+
15
+ it "should find tags" do
16
+ search('NN').should == %w[business|NN Apr|NN]
17
+ end
18
+
19
+ it "should find words" do
20
+ search('"have"').should == %w[have|AUXP have|DMKD]
21
+ end
22
+
23
+ it "should find word and tag combinations" do
24
+ search('"have" NN "uses"').should == ['have|AUXP business|NN uses|VERB3']
25
+ end
26
+
27
+ it "should find wildcard tags" do
28
+ search('AU*').should == %w[have|AUXP]
29
+ end
30
+
31
+ it "should find exclusions" do
32
+ search('A(!UXP DNE)').should == %w[too|ADVE]
33
+ end
34
+
35
+ it "should find word|tag pairs" do
36
+ search('"have"|D*').should == %w[have|DMKD]
37
+ end
38
+
39
+ it "should find unlimited repetitions" do
40
+ search('(AD*)+').should == ['derp|ADNE too|ADVE']
41
+ end
42
+
43
+ it "should find limited repetitions" do
44
+ search('(AD*){3}').should == []
45
+ end
46
+ end
47
+
@@ -0,0 +1,52 @@
1
+ require 'concordancer'
2
+
3
+ describe TextSnippet do
4
+ before(:each) do
5
+ end
6
+
7
+ it "should handle umlauts properly" do
8
+ ts = create_ts('eins zwei drei vierß öfünfä ßechs sieben acht neun zehn', /öfünfä/)
9
+ ts.left(3).should == 'zwei drei vierß '
10
+ ts.to_s.should == 'öfünfä'
11
+ ts.right(3).should == ' ßechs sieben acht'
12
+ end
13
+
14
+ it "should handle partial words and umlauts properly" do
15
+ ts = create_ts('eins zwei drei vierß öfünfä ßechs sieben acht neun zehn', /fünf/)
16
+ ts.left(3).should == 'zwei drei vierß ö'
17
+ ts.to_s.should == 'fünf'
18
+ ts.right(3).should == 'ä ßechs sieben acht'
19
+ end
20
+
21
+ it "should have dynamic left context" do
22
+ ts = create_ts('one two three four five six seven eight nine ten', /five/)
23
+ ts.left(1).should == 'four '
24
+ ts.left(2).should == 'three four '
25
+ ts.left(10).should == 'one two three four '
26
+ end
27
+
28
+ it "should have dynamic right context" do
29
+ ts = create_ts('one two three four five six seven eight nine ten', /five/)
30
+ ts.right(1).should == ' six'
31
+ ts.right(2).should == ' six seven'
32
+ ts.right(10).should == ' six seven eight nine ten'
33
+ end
34
+
35
+ it "should work correctly with newlines" do
36
+ ts = create_ts("one two\n three four five six seven eight\n nine ten", /five/)
37
+ ts.left(1).should == 'four '
38
+ ts.right(1).should == ' six'
39
+ end
40
+
41
+ it "should replace newlines and tabs with spaces" do
42
+ ts = create_ts("one two three\n four five six\n seven eight nine ten", /five/)
43
+ ts.left(2).should == 'three four '
44
+ ts.right(2).should == ' six seven'
45
+ end
46
+
47
+ end
48
+
49
# Builds a TextSnippet covering the first match of +re+ in +text+.
# TextSnippet.new takes (name, text, first, last); the original helper left
# out the name and therefore failed with an ArgumentError.
def create_ts(text, re)
  m = text.match(re)
  TextSnippet.new 'spec', text, m.begin(0), m.end(0)
end
+ end
metadata ADDED
@@ -0,0 +1,108 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: lumix
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Michael Klaus
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-07-27 00:00:00 +02:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: sweet
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ version: "0"
30
+ type: :runtime
31
+ version_requirements: *id001
32
+ - !ruby/object:Gem::Dependency
33
+ name: sequel
34
+ prerelease: false
35
+ requirement: &id002 !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ segments:
40
+ - 0
41
+ version: "0"
42
+ type: :runtime
43
+ version_requirements: *id002
44
+ - !ruby/object:Gem::Dependency
45
+ name: jdbc-postgres
46
+ prerelease: false
47
+ requirement: &id003 !ruby/object:Gem::Requirement
48
+ requirements:
49
+ - - ">="
50
+ - !ruby/object:Gem::Version
51
+ segments:
52
+ - 0
53
+ version: "0"
54
+ type: :runtime
55
+ version_requirements: *id003
56
+ description: A concordancer for corpus-based linguistic research.
57
+ email: Michael.Klaus@gmx.net
58
+ executables:
59
+ - lumix
60
+ extensions: []
61
+
62
+ extra_rdoc_files: []
63
+
64
+ files:
65
+ - COPYING
66
+ - bin/lumix
67
+ - spec/text_snippet_spec.rb
68
+ - spec/filter_spec.rb
69
+ - lib/lumix/filter.rb
70
+ - lib/lumix/result_view.rb
71
+ - lib/lumix/gui.rb
72
+ - lib/lumix/textprocessing.rb
73
+ - lib/lumix/main.rb
74
+ - lib/lumix/concordancer.rb
75
+ - lib/lumix/schema/001_create_tables.rb
76
+ - lib/lumix/schema/002_categories.rb
77
+ has_rdoc: true
78
+ homepage: http://github.org/QaDeS/lumix
79
+ licenses: []
80
+
81
+ post_install_message:
82
+ rdoc_options: []
83
+
84
+ require_paths:
85
+ - lib
86
+ required_ruby_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ segments:
91
+ - 0
92
+ version: "0"
93
+ required_rubygems_version: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ segments:
98
+ - 0
99
+ version: "0"
100
+ requirements: []
101
+
102
+ rubyforge_project:
103
+ rubygems_version: 1.3.6
104
+ signing_key:
105
+ specification_version: 3
106
+ summary: A concordancer for corpus-based linguistic research.
107
+ test_files: []
108
+