lumix 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/COPYING ADDED
@@ -0,0 +1,18 @@
1
+ Copyright (c) 2010 Michael Klaus
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to
5
+ deal in the Software without restriction, including without limitation the
6
+ rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7
+ sell copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16
+ THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/bin/lumix ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env jruby
2
+
3
+ require 'rubygems'
4
+ require 'lumix/gui'
@@ -0,0 +1,263 @@
1
+ #!/bin/env ruby
2
+
3
+ # TODO take care of 's problem
4
+ # TODO remove Word count line
5
+
6
+ require 'rubygems'
7
+ require 'digest/md5'
8
+ require 'sequel'
9
+ require 'sequel/extensions/migration'
10
+
11
+ require 'lumix/textprocessing'
12
+ require 'lumix/filter'
13
+
14
+ DB_VERSION = 2
15
# Core extension: adds an MD5 fingerprint helper to every String.
class String
  # Returns the hexadecimal MD5 digest of the string's current content.
  #
  # The value is computed on every call instead of being cached in an
  # instance variable: a cached value goes stale as soon as the string is
  # mutated, and assigning an instance variable raises on frozen strings.
  #
  # Returns a 32-character lowercase hex String.
  def digest
    Digest::MD5.hexdigest(self)
  end
end
23
+
24
# A slice of a larger text, addressed by character offsets, that can render
# itself and a few words of context on either side.
class TextSnippet
  attr_reader :name, :text, :begin, :end

  # name  - label for the text the snippet comes from
  # text  - the complete text
  # first - offset of the snippet's first character
  # last  - offset just past the snippet's last character
  def initialize(name, text, first, last)
    @name, @text = name, text
    @begin, @end = first, last
  end

  # The snippet itself, with whitespace runs collapsed.
  def to_s
    cleanup(text[@begin...@end])
  end

  # Up to +context+ whitespace-separated words directly before the snippet,
  # including a partial word that touches it.
  def left(context = 5)
    preceding = text[0...@begin]
    cleanup(preceding[/((\S+\s+){0,#{context}}\S*)\z/m, 1])
  end

  # Up to +context+ whitespace-separated words directly after the snippet,
  # including a partial word that touches it.
  def right(context = 5)
    following = text[@end..-1]
    cleanup(following[/\A(\S*(\s+\S+){0,#{context}})/m, 1])
  end

  # Collapses every run of whitespace into a single space.
  def cleanup(txt)
    txt.gsub(/\s+/, ' ')
  end
end
47
+
48
+ Progress = Struct.new(:task, :work, :data, :done)
49
+
50
# Imports raw texts and their tagged counterparts into a database, links the
# word positions of both versions and searches the tagged text with filter
# expressions.
class Concordancer

  attr_reader :db, :tp
  attr_accessor :progress_proc

  # db_uri  - Sequel connection string
  # options - :progress_proc => callable that receives a Progress struct
  #           :recreate      => truthy to drop all tables before migrating
  def initialize(db_uri, options = {})
    @progress_proc = options[:progress_proc]
    @db = connect(db_uri) do |db|
      db.tables.each { |t| db.drop_table t } if options[:recreate]
    end
    @ids = db[:texts].map { |v| v[:id] }
    @tp = TextProcessing.new
  end

  # True when the requested database was unreachable and SQLite is used.
  def fallback?
    @fallback
  end

  # Returns the id of the stored text whose digest matches the content of
  # +file+, or nil when the file has not been imported.
  def get_id(file)
    text = File.read(file)
    saved = db[:texts][:digest => text.digest]
    saved ? saved[:id] : nil
  end

  # Imports the given files/directories and links all texts afterwards.
  def read(*files)
    files = tp.to_filelist(files)
    prog = Progress.new(:read, files.size)
    puts "Reading #{files.size} files"
    files.each_with_index do |file, index|
      # read_file is defined on this class; TextProcessing has no such method
      read_file(file)
      progress(prog, index + 1)
    end
    link
  end

  # Stores +file+ (raw and tagged version) unless a text with the same
  # digest is already present. The tagged file is created through the
  # TextProcessing helper when it does not exist yet.
  def read_file(file)
    # read the raw text
    text = File.read(file)
    digest = text.digest
    return if db[:texts][:digest => digest]

    # retrieve the tagged version; both helpers live on TextProcessing
    tagged_file = tp.create_tagged_filename(file)
    tp.process_file(file, tagged_file) unless File.exist?(tagged_file)

    tagged = retag(File.read(tagged_file))
    id = db[:texts].insert(:digest => digest, :text => text, :tagged => tagged, :filename => file, :tagged_filename => tagged_file)
    @ids << id
  end

  # Ids of all stored texts, one single-element array per row.
  def all
    db[:texts].select(:id).map { |v| v.values }
  end

  # Like #link, but deletes the existing associations of each text first so
  # that every text is linked again.
  def link!(*ids)
    link(*ids) do |ds|
      ds.delete
    end
  end

  # Links the texts with the given ids (all texts when none are given).
  # Texts that already have associations are skipped. When a block is given
  # it receives the association dataset of each text before that check.
  def link(*ids)
    ids = all if ids.empty?
    ids.flatten!
    prog = Progress.new(:link, ids.size)
    progress(prog)

    ids.each_with_index do |id, index|
      ds = db[:assoc].filter(:text_id => id)
      yield ds if block_given?

      link_text(id) if ds.empty?
      progress(prog, index + 1)
    end
  end

  # Searches all tagged texts for +filter+. Each hit is yielded as a pair of
  # TextSnippets (raw, tagged), or printed when no block is given.
  # Returns the total number of hits.
  def find(filter)
    texts = db[:texts]
    prog = Progress.new(:search, texts.count, filter)
    progress(prog)

    re = Filter.to_re(filter)

    index = 0
    texts.inject(0) do |total, t|
      t_id, text, tagged = t[:id], t[:text], t[:tagged]

      # matches to ranges
      ranges = []
      tagged.scan(re) do
        hit = Regexp.last_match
        # TODO decouple database operations for performance
        ranges << find_range(t_id, hit.begin(0), hit.end(0))
      end

      ranges.each do |f|
        text_snippet = TextSnippet.new(File.basename(t[:filename]), text, f[:src_begin].to_i, f[:src_end].to_i)
        tagged_snippet = TextSnippet.new(File.basename(t[:tagged_filename]), tagged, f[:tagged_begin].to_i, f[:tagged_end].to_i)
        if block_given?
          yield text_snippet, tagged_snippet
        else
          puts text_snippet
          puts tagged_snippet
          puts
        end
      end
      progress prog, (index += 1)
      total + ranges.size
    end
  end

  # Aggregates the assoc rows of text +t_id+ that overlap the tagged range
  # t_begin...t_end into one row holding the min/max source and tagged offsets.
  def find_range(t_id, t_begin, t_end)
    ds = db[:assoc].filter(:text_id => t_id).filter{tagged_end >= t_begin}.filter{tagged_begin < t_end}
    ds.select{[{min(:src_begin) => :src_begin},{ max(:src_end) => :src_end}, {min(:tagged_begin) => :tagged_begin}, {max(:tagged_end) => :tagged_end}]}.first
  end

  private

  # Updates +prog+ and hands it to the progress callback, if one is set.
  def progress(prog, done = 0, data = prog.data)
    if progress_proc
      prog.done = done
      prog.data = data
      progress_proc.call(prog)
    end
  end

  # Connects to +db_uri+, falling back to a local SQLite file when a test
  # query fails. Yields the connection before the schema migrations run and
  # returns it afterwards.
  def connect(db_uri)
    db = Sequel.connect(db_uri)
    begin
      db.get(1)
      @fallback = false
    # NOTE(review): rescues Exception rather than StandardError, presumably so
    # that driver-level errors under JRuby are caught too - confirm before
    # narrowing.
    rescue Exception => e
      puts 'Falling back to sqlite'
      puts e
      db = Sequel.connect('jdbc:sqlite://concordancer.db')
      @fallback = true
    end
    yield db
    migration_path = File.join(File.dirname(__FILE__), 'schema')
    Sequel::Migrator.apply(db, migration_path, DB_VERSION)
    db
  end

  # Reduces "word|lemma|tag|..." tokens to "word|tag " and turns empty
  # tokens into newlines.
  def retag(text)
    text.split(/[ \n]/).map do |token|
      word, _lemma, tag = token.split(/\|/)
      word ? "#{word}|#{tag} " : "\n"
    end.join
  end

  # Walks the tagged text word by word, locates each word in the raw text
  # (searching forward from the previous hit) and stores the offset pairs in
  # the assoc table.
  def link_text(id)
    ds = db[:texts][:id => id]
    text, tagged = ds[:text], ds[:tagged]
    puts "Linking text #{ds[:filename]}"

    re = /([^\s\|]+)\|(\S+)/m
    src_last = 0
    position = 0
    assoc = []
    tagged.scan(re) do |word, tag|
      tagged_begin = $~.begin(0)
      tagged_end = $~.end(0)

      # underscores in the tagged word may stand for whitespace in the source
      word_re = Regexp.new(Regexp.escape(word).gsub(/_/, '\s*'))
      src_match = text[src_last..-1].match(word_re) # find the word
      if src_match
        src_begin = src_last + src_match.begin(0)
        src_end = src_last + src_match.end(0)

        src_last = src_end
        assoc << {:text_id => id, :position => position, :src_begin => src_begin, :src_end => src_end, :tagged_begin => tagged_begin, :tagged_end => tagged_end}
      else
        STDERR.puts "Could not find match for '#{word}' in text #{ds[:filename]}"
      end
      position += 1
    end
    db[:assoc].multi_insert(assoc)
  rescue => e
    STDERR.puts e
    STDERR.puts e.backtrace
    raise
  end

end
237
+
238
+ if __FILE__ == $0
239
+ prog_proc = lambda do |prog|
240
+ puts "#{prog.task}#{prog.data ? "(#{prog.data})" : ""} #{prog.done}/#{prog.work}"
241
+ end
242
+
243
+ # uri = 'postgres://concordancer:concordancer@localhost:5433/concordancer'
244
+ uri = 'jdbc:postgresql://localhost:5433/concordancer?user=concordancer&password=concordancer'
245
+ c = Concordancer.new(uri, :progress_proc => prog_proc, :recreate => true)
246
+ #puts c.filter_to_re('"sunt" APN NPN')
247
+ c.read('raw')
248
+ #c.find(%q[("de")? (N*)+ "si" (N*){1,2} (AS*)?])
249
+ #c.link! #if RECREATE
250
+ #c.link c.all
251
+ #ds = db[:assoc].filter(:text_id => 1).order_by(:position).filter{tagged_end >= 150}.filter{tagged_begin < 330}
252
+ #puts ds.sql
253
+ #exit
254
+
255
+ t = Time.now
256
+ output = ""
257
+ results = c.find(%q[(*){0,3} N* N* (*){0,3}]) do |text, tagged|
258
+ output << "#{text}\n#{tagged}\n\n"
259
+ end
260
+ puts Time.now - t
261
+ puts "Results: #{ results }"
262
+ puts output
263
+ end
@@ -0,0 +1,60 @@
1
# Translates filter expressions into regular expressions that are matched
# against "word|TAG" formatted text.
module Filter
  class << self

    HANDLERS = %w[handle_wildcard handle_choice handle_literals
      handle_dangling_tags handle_multiplicators assure_wordbounds]

    # TODO refactor
    # Runs the filter through every handler in HANDLERS order, printing each
    # intermediate form, and compiles the final string into a Regexp.
    def to_re(filter)
      pattern = filter
      HANDLERS.each do |step|
        puts pattern
        puts "#{step} -->"
        pattern = send(step, pattern)
      end
      puts pattern
      Regexp.new(pattern)
    end

    # character wildcard replacement
    def handle_wildcard(re)
      re.gsub(/([^\)])\*/, '\1[^\b]*?')
    end

    # Takes (!A B C) and transforms it
    def handle_choice(re)
      re.gsub(/\(\!([^\)]+)\)/) do
        excluded = Regexp.last_match(1).split
        lookaheads = excluded.map { |t| "(?!#{t}\\b)" }.join
        "(?:#{lookaheads}\\S)*"
      end
    end

    # transforms literals delimited by ""
    def handle_literals(re)
      re.gsub(/\"([^\"]*)\"(?:\|(\S+?))?/) do
        hit = Regexp.last_match
        word = hit[1].gsub(/ /, '_')
        tag = hit[2] || '\S+?'
        "#{word}\\|#{tag}"
      end
    end

    # add wildcard word match on tag-only search criteria
    def handle_dangling_tags(re)
      parts = re.split(/ /).map do |s|
        s.include?('\|') ? s : s.gsub(/(\(?)(\S+)/, '\1[^\s\|]+\|\2')
      end
      parts.join('\s+')
    end

    # Handles the + * ? and {} qualifiers
    def handle_multiplicators(re)
      re.gsub(/\(([^\)]+)(\)((\{[^\}]+\})|\*|\+|\?)\s?)/, '(?:\b\1\b\2')
    end

    # Wraps the whole expression in word boundaries.
    def assure_wordbounds(re)
      ['\b', re, '\b'].join
    end

  end
end
data/lib/lumix/gui.rb ADDED
@@ -0,0 +1,148 @@
1
+ require 'yaml'
2
+ require 'lumix/concordancer'
3
+ require 'sweet'
4
+ require 'lumix/result_view'
5
+ #Sweet.set_debug
6
+
7
+
8
+ Texts = {:search => "Searching...", :read => "Importing files", :link => "Linking texts"}
9
+ Indicator = %w'} ) ] | [ ( {'
10
+
11
+ CONF = 'config.yaml'
12
+ ConfigStruct = Struct.new(:database_uri)
13
+ CConfig = YAML.load_file(CONF) rescue ConfigStruct.new('jdbc:postgresql://localhost:5432/concordancer?user=concordancer&password=concordancer')
14
# Writes the current configuration (CConfig) as YAML into the CONF file,
# replacing any previous content.
def save!
  File.open(CONF, 'w') { |out| out.write(CConfig.to_yaml) }
end
19
+
20
# Main application window: menu bar, filter input, results table and
# progress widgets, together with the actions wired to them.
Sweet.app :title => 'Ruby Concordancer', :width => 800, :height => 700, :layout => :grid.conf(:numColumns => 3) do
  # Lazily creates the Concordancer for the configured database.
  def conc
    @conc ||= Concordancer.new(CConfig.database_uri, :progress_proc => @progress_proc)#, :recreate => true)
  end

  # Progress callback handed to the Concordancer; updates the status label,
  # the spinner character and the progress bar inside a perform block.
  @progress_proc = proc do |p|
    task = Texts[p.task] || p.task
    perform do
      if p.done == p.work
        @p_status.text = 'Done!'
        @p_indicator.text = ''
        @p_bar.fraction = 0
      else
        @p_status.text = task
        @p_indicator.text = Indicator[p.done % Indicator.size]
        @p_bar.fraction = p.done.to_f / p.work
      end
    end
  end

  # write the default configuration on first start
  save! unless File.exist?(CONF)

  menubar do
    submenu '&File' do
      submenu '&Import...' do
        item('E&nglish texts') { import_chooser('en') }
        item('&Romanian texts') { import_chooser('ro') }
      end
      item('&Export findings...') { export_findings }
      separator
      item('&Relink texts') { relink }
      item('&Clear the database') { reconnect :recreate => true }
      separator
      item('E&xit') { exit }
    end
    #    submenu 'C&orpora' do
    #      @m_cat = submenu '&Category' do
    #        item('Cre&ate...') { create_category }
    #        item('&Import...') { import_chooser }
    #        separator
    #        item('&Edit...') { edit_category }
    #        item('&Delete') { delete_category }
    #      end
    #      @m_text = submenu '&Text' do
    #        item('&Reimport...') { reimport_chooser }
    #        item('&Delete') { delete_text }
    #      end
    #    end
    #    @m_stats = submenu '&Statistics' do
    #      item('&Editor') { script_editor }
    #      separator
    #      item('&Load Script...') { load_script }
    #    end
    #    submenu "&Help" do
    #      separator
    #      item('&About') { about }
    #    end
  end

  tree :grid_data => {:align => [:fill, :fill], :span => [1, 2], :grab => [true, true]}

  @filter = edit_line 'NSN NSN', :grid_data => {:align => [:fill, :center], :grab => true}, :max_size => 40 do
    perform_search
  end
  button 'Search' do
    perform_search
  end

  @results = table :columns => %w[Text Left Hit Right], :sort => true, :grid_data => {:align => [:fill, :fill], :span => 2, :grab => [true, true]}, :scroll => true

  @counter = label :grid_data => {:span => 2, :align => :fill}

  @p_status = label(:grid_data => {:align => [:fill, :bottom], :grab => true})
  @p_bar = progress(:width => 50, :grid_data => {:align => [:right, :bottom]})
  @p_indicator = label(' ', :grid_data => {:align => [:right, :bottom]})


  # Clears the results table and runs the current filter in a background
  # thread, adding one table row per hit. The counter and status labels are
  # updated afterwards.
  def perform_search
    filter = @filter.text
    @results.data.clear
    Thread.new do
      unless filter.empty?
        puts "finding #{filter}"
        found = conc.find(filter) do |text, tagged|
          @results.add_hit(text.name, text.left, text.to_s, text.right)
        end
      end
      perform do
        @counter.text = "#{found} matches"
        @p_status.text = "Found #{found || 'no'} matches for #{filter}"
      end
    end
  end

  # Sets the processing language and imports the 'raw' directory in a
  # background thread.
  def import_chooser(lang)
    conc.tp.lang = lang
    Thread.new(conc) do |conc|
      conc.read('raw')
    end
  end

  # Writes the unchecked result rows, tab separated, into a '.findings'
  # file named after the current filter.
  def export_findings
    filename = to_filename(@filter.text) + '.findings'
    @p_status.text = "Exporting to #{filename}"
    File.open(filename, 'w') do |f|
      @results.items.each do |item|
        unless item.getChecked
          # NOTE(review): the table columns are Text, Left, Hit, Right, so
          # indexes 0..2 yield Text/Left/Hit although the variables are named
          # left/hit/right - confirm whether 1..3 was intended.
          left, hit, right = (0..2).map{ |i| item.text(i) }
          f.puts "#{left}\t#{hit}\t#{right}"
        end
      end
    end
    @p_status.text = "Done! Exported to file #{filename}"
  end

  # Rebuilds the word associations of all texts in a background thread.
  def relink
    Thread.new(conc) do |conc|
      conc.link!
    end
  end

  # Turns a filter expression into a file name: whitespace runs become
  # underscores and the characters * . ? " are removed.
  def to_filename(filter)
    filter.gsub(/\s+/, "_").gsub(/[\*\.\?\"]/, '')
  end

  # Replaces the current Concordancer with a fresh one created with +opts+
  # (e.g. :recreate => true) plus the progress callback.
  def reconnect(opts = {})
    @conc = Concordancer.new(CConfig.database_uri, opts.merge(:progress_proc => @progress_proc))
  end
end
data/lib/lumix/main.rb ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env jruby
2
+
3
+ $: << File.join(File.dirname(__FILE__), '..')
4
+ $: << File.join(File.dirname(__FILE__), '../../../Sweet/lib')
5
+
6
+ require 'rubygems'
7
+ require 'lumix/gui'
@@ -0,0 +1,93 @@
1
# Extensions for the SWT Table widget: rows are kept in a plain Ruby array
# (+data+) and handed to the widget on demand through SetData events.
class Java::OrgEclipseSwtWidgets::Table

  attr_accessor :data, :tooltips

  # Initialises the row storage, starts the polling redraw thread and
  # registers the SetData and Resize listeners.
  def sweeten(app, opts={}, &block)
    @data = []
    @tooltips = []
    super
    @redraw_thread = Thread.new do
      while !isDisposed
        if @dirty
          @dirty = false
          perform do
            setItemCount data.size
            # The original guarded this call with a local named clear_all
            # that is not defined in this scope.
            clearAll
          end
        end
        sleep 1 # TODO find a better alternative
      end
    end

    # TODO implement tooltips

    # fill an item from the matching data row when the widget asks for it
    addListener swt::SetData do |e|
      item = e.item
      index = indexOf(item)
      item.setText(Array(data[index]).to_java(:string))
    end

    # keep the relative column widths when the table is resized; columns
    # with width 0 get an equal share, the last column is packed
    addListener swt::Resize do |e|
      default_weight = 1.0 / columns.size
      current_width = @old_width
      w = width
      columns[0..-2].each do |c|
        weight = c.width == 0 ? default_weight : c.width.to_f / current_width
        c.width = w * weight
      end
      columns[columns.size - 1].pack
      @old_width = w
    end
  end

  # Creates one centered column per title and shows header and grid lines.
  # Accepts either a list of titles or a single array of titles.
  def columns=(*titles)
    titles.flatten.each do |title|
      col = widgets::TableColumn.new(self, swt::CENTER)
      col.setText title
    end

    setHeaderVisible true
    setLinesVisible true
  end

  # Makes columns sortable by clicking their header. +sort+ is true/:all for
  # every column, or a hash keyed by column title.
  def sort=(sort)
    sort = Hash.new(true) if [true, :all].member?(sort)
    if sort
      columns.each_with_index do |col, index|
        if sort[col.text]
          col.addListener swt::Selection do
            if data
              @data = data.sort_by {|e| e[index] }
              update :clear
            end
          end
        end
      end
    end
  end
end
71
+
72
# Default style flags for tables created through Sweet.
::Sweet::WIDGET_DEFAULTS[:table] = {
  :style => [:border, :virtual, :check]
}
# Table-specific hooks registered with Sweet: the block handler name and
# extra methods supplied as custom code.
::Sweet::WIDGET_HACKS[Java::OrgEclipseSwtWidgets::Table] = {
  :block_handler => :set_data,
  :custom_code => proc {
    # Synchronises the widget's item count with +data+; clears all cached
    # items as well when +clear_all+ is truthy.
    def update(clear_all = false)
      return if isDisposed
      setItemCount data.size
      clearAll if clear_all
    end

    # Appends a row. The cell values are the positional arguments unless a
    # trailing options hash supplies :data; :tooltips defaults to the row.
    # The row becomes visible on the next redraw.
    def add_hit(*args)
      # Hash === obj tests the class; the reversed form never matched
      opts = Hash === args.last ? args.pop : {}
      d = opts[:data] || args
      t = opts[:tooltips] || d
      data << d
      tooltips << t
      @dirty = true
    end
  }
}
@@ -0,0 +1,35 @@
1
# Migration 001: creates the texts table (raw and tagged content) and the
# assoc table (offset pairs linking both versions of a text).
class CreateTables < Sequel::Migration

  def up
    create_table :texts do
      primary_key :id
      [:digest, :text, :tagged, :filename, :tagged_filename].each do |name|
        column name, String
      end

      index :digest
    end

    create_table :assoc do
      primary_key :id
      column :text_id, Integer, :references => :texts
      [:position, :src_begin, :src_end, :tagged_begin, :tagged_end].each do |name|
        column name, Integer
      end

      [:tagged_end, :tagged_begin, :position].each do |name|
        index [:text_id, name]
      end
    end
  end

  # Drops assoc first because it references texts.
  def down
    [:assoc, :texts].each { |table| drop_table table }
  end

end
@@ -0,0 +1,28 @@
1
# Migration 002: adds a self-referencing categories table and a category
# reference on texts.
class Categories < Sequel::Migration

  def up
    create_table :categories do
      primary_key :id
      column :parent_id, Integer, :references => :categories
      column :name, String
      column :key, String

      index [:parent_id, :id]
    end

    alter_table :texts do
      add_column :category_id, Integer, :references => :categories

      add_index [:category_id, :id]
    end
  end

  # Removes the reference column before dropping the table it points to.
  def down
    alter_table(:texts) { drop_column :category_id }
    drop_table :categories
  end

end
@@ -0,0 +1,84 @@
1
+ $KCODE='UTF8'
2
+
3
+ require 'soap/wsdlDriver'
4
+
5
# Client for the RACAI TextProcessing web service plus helpers for turning
# raw text files into their tagged counterparts.
class TextProcessing

  attr_accessor :lang

  # lang - language code passed to the web service (default 'ro')
  def initialize(lang = 'ro')
    @lang = lang
  end

  # Returns the SOAP RPC driver, creating it on first use. The driver is
  # memoized so the WSDL is not fetched again on every call.
  def rpc
    @rpc ||= SOAP::WSDLDriverFactory.new('http://www.racai.ro/webservices/TextProcessing.asmx?WSDL').create_rpc_driver
  end

  # inserts "tagged" as the second to last part in the filename
  # e.g.
  #   test.txt -> test.tagged.txt
  # special case when no extension is present:
  #   README -> README.tagged
  # Only the file name itself is split; dots in directory names are left
  # alone (dir.v1/README -> dir.v1/README.tagged).
  def create_tagged_filename(infile)
    slash = infile.rindex(File::SEPARATOR)
    dir = slash ? infile[0..slash] : ''
    base = slash ? infile[(slash + 1)..-1] : infile

    components = base.split(/\./)
    position = [1, components.size - 1].max
    components.insert position, 'tagged'
    dir + components.join('.')
  end

  # Expands directories recursively and returns a unique list of files,
  # leaving out directories and anything whose path contains '.tagged'.
  def to_filelist(*files)
    files = files.flatten.map do |filename|
      if File.directory? filename
        Dir.glob File.join(filename, '**/*') # add all files from that directory
      else
        filename
      end
    end.flatten.compact.uniq # make sure every file is only processed once
    files.delete_if { |filename| File.directory?(filename) || filename['.tagged'] } # remove remaining folders
  end

  # the core processing routine using the webservice
  def process(text)
    response = rpc.Process(:input => text, :lang => lang)
    response.processResult
  end

  # Processes everything read from standard input and prints the result.
  def process_stdin
    puts process($stdin.read)
  end

  # takes the text from infile and outputs the result into the outfile
  def process_file(infile, outfile)
    File.open(outfile, 'w') do |out|
      out.write process(File.read(infile))
    end
  end

end
60
+
61
+
62
+ # process the args if called as main script
63
# Command line entry point: an optional leading "-lang CODE" selects the
# language; without further arguments standard input is processed, otherwise
# every given file/directory is tagged unless its output already exists.
if __FILE__ == $0
  args = ARGV
  lang_given = args.first == '-lang'
  args.shift if lang_given
  tp = lang_given ? TextProcessing.new(args.shift) : TextProcessing.new

  if args.empty?
    tp.process_stdin
  else
    files = tp.to_filelist(args)

    puts "Processing files:"
    files.each do |infile|
      outfile = tp.create_tagged_filename(infile)
      puts "#{infile} -> #{outfile}"
      tp.process_file(infile, outfile) unless File.exist?(outfile)
    end
  end
end
@@ -0,0 +1,47 @@
1
+ # To change this template, choose Tools | Templates
2
+ # and open the template in the editor.
3
+
4
+ require 'filter'
5
+ puts RUBY_PLATFORM
6
+ TXT = "They|PPER3 have|AUXP business|NN uses|VERB3 derp|ADNE too|ADVE " +
7
+ "Apr|NN 4th|CD 2007|M have|DMKD .|PERIOD"
8
+
9
+ def search(filter)
10
+ TXT.scan(Filter.to_re(filter))
11
+ end
12
+
13
+ describe Filter do
14
+
15
+ it "should find tags" do
16
+ search('NN').should == %w[business|NN Apr|NN]
17
+ end
18
+
19
+ it "should find words" do
20
+ search('"have"').should == %w[have|AUXP have|DMKD]
21
+ end
22
+
23
+ it "should find word and tag combinations" do
24
+ search('"have" NN "uses"').should == ['have|AUXP business|NN uses|VERB3']
25
+ end
26
+
27
+ it "should find wildcard tags" do
28
+ search('AU*').should == %w[have|AUXP]
29
+ end
30
+
31
+ it "should find exclusions" do
32
+ search('A(!UXP DNE)').should == %w[too|ADVE]
33
+ end
34
+
35
+ it "should find word|tag pairs" do
36
+ search('"have"|D*').should == %w[have|DMKD]
37
+ end
38
+
39
+ it "should find unlimited repetitions" do
40
+ search('(AD*)+').should == ['derp|ADNE too|ADVE']
41
+ end
42
+
43
+ it "should find limited repetitions" do
44
+ search('(AD*){3}').should == []
45
+ end
46
+ end
47
+
@@ -0,0 +1,52 @@
1
+ require 'concordancer'
2
+
3
+ describe TextSnippet do
4
+ before(:each) do
5
+ end
6
+
7
+ it "should handle umlauts properly" do
8
+ ts = create_ts('eins zwei drei vierß öfünfä ßechs sieben acht neun zehn', /öfünfä/)
9
+ ts.left(3).should == 'zwei drei vierß '
10
+ ts.to_s.should == 'öfünfä'
11
+ ts.right(3).should == ' ßechs sieben acht'
12
+ end
13
+
14
+ it "should handle partial words and umlauts properly" do
15
+ ts = create_ts('eins zwei drei vierß öfünfä ßechs sieben acht neun zehn', /fünf/)
16
+ ts.left(3).should == 'zwei drei vierß ö'
17
+ ts.to_s.should == 'fünf'
18
+ ts.right(3).should == 'ä ßechs sieben acht'
19
+ end
20
+
21
+ it "should have dynamic left context" do
22
+ ts = create_ts('one two three four five six seven eight nine ten', /five/)
23
+ ts.left(1).should == 'four '
24
+ ts.left(2).should == 'three four '
25
+ ts.left(10).should == 'one two three four '
26
+ end
27
+
28
+ it "should have dynamic right context" do
29
+ ts = create_ts('one two three four five six seven eight nine ten', /five/)
30
+ ts.right(1).should == ' six'
31
+ ts.right(2).should == ' six seven'
32
+ ts.right(10).should == ' six seven eight nine ten'
33
+ end
34
+
35
+ it "should work correctly with newlines" do
36
+ ts = create_ts("one two\n three four five six seven eight\n nine ten", /five/)
37
+ ts.left(1).should == 'four '
38
+ ts.right(1).should == ' six'
39
+ end
40
+
41
+ it "should replace newlines and tabs with spaces" do
42
+ ts = create_ts("one two three\n four five six\n seven eight nine ten", /five/)
43
+ ts.left(2).should == 'three four '
44
+ ts.right(2).should == ' six seven'
45
+ end
46
+
47
+ end
48
+
49
# Builds a TextSnippet covering the first match of +re+ in +text+.
# TextSnippet.new expects a name as its first argument; a fixed placeholder
# is passed because the specs only look at the text and the offsets.
def create_ts(text, re)
  m = text.match(re)
  TextSnippet.new 'spec', text, m.begin(0), m.end(0)
end
metadata ADDED
@@ -0,0 +1,108 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: lumix
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Michael Klaus
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-07-27 00:00:00 +02:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: sweet
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ version: "0"
30
+ type: :runtime
31
+ version_requirements: *id001
32
+ - !ruby/object:Gem::Dependency
33
+ name: sequel
34
+ prerelease: false
35
+ requirement: &id002 !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ segments:
40
+ - 0
41
+ version: "0"
42
+ type: :runtime
43
+ version_requirements: *id002
44
+ - !ruby/object:Gem::Dependency
45
+ name: jdbc-postgres
46
+ prerelease: false
47
+ requirement: &id003 !ruby/object:Gem::Requirement
48
+ requirements:
49
+ - - ">="
50
+ - !ruby/object:Gem::Version
51
+ segments:
52
+ - 0
53
+ version: "0"
54
+ type: :runtime
55
+ version_requirements: *id003
56
+ description: A concordancer for corpus-based linguistic research.
57
+ email: Michael.Klaus@gmx.net
58
+ executables:
59
+ - lumix
60
+ extensions: []
61
+
62
+ extra_rdoc_files: []
63
+
64
+ files:
65
+ - COPYING
66
+ - bin/lumix
67
+ - spec/text_snippet_spec.rb
68
+ - spec/filter_spec.rb
69
+ - lib/lumix/filter.rb
70
+ - lib/lumix/result_view.rb
71
+ - lib/lumix/gui.rb
72
+ - lib/lumix/textprocessing.rb
73
+ - lib/lumix/main.rb
74
+ - lib/lumix/concordancer.rb
75
+ - lib/lumix/schema/001_create_tables.rb
76
+ - lib/lumix/schema/002_categories.rb
77
+ has_rdoc: true
78
+ homepage: http://github.org/QaDeS/lumix
79
+ licenses: []
80
+
81
+ post_install_message:
82
+ rdoc_options: []
83
+
84
+ require_paths:
85
+ - lib
86
+ required_ruby_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ segments:
91
+ - 0
92
+ version: "0"
93
+ required_rubygems_version: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ segments:
98
+ - 0
99
+ version: "0"
100
+ requirements: []
101
+
102
+ rubyforge_project:
103
+ rubygems_version: 1.3.6
104
+ signing_key:
105
+ specification_version: 3
106
+ summary: A concordancer for corpus-based linguistic research.
107
+ test_files: []
108
+