lumix 0.0.2-java

Sign up to get free protection for your applications and to get access to all the features.
data/COPYING ADDED
@@ -0,0 +1,18 @@
1
+ Copyright (c) 2010 Michael Klaus
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to
5
+ deal in the Software without restriction, including without limitation the
6
+ rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7
+ sell copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16
+ THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/bin/lumix ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env jruby -Eutf-8:utf-8 -Ku -U -J-Xmx1024m
2
+
3
+ require 'rubygems'
4
+ require 'lumix/cli'
data/bin/lumix-gui ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env jruby
2
+
3
+ require 'rubygems'
4
+ require 'lumix/gui'
data/lib/lumix/base.rb ADDED
@@ -0,0 +1,56 @@
1
+ require 'yaml'
2
+
3
# Shared front-end helpers for the Lumix concordancer (mixed into the CLI).
#
# The including object must provide a +progress_proc+ method; it is called by
# #create_concordancer but is not defined in this module.
module Lumix
  # Labels printed for the progress tasks (:search, :read, :link).
  Texts = {:search => "Searching...", :read => "Importing files", :link => "Linking texts"}

  # Configuration file, looked up relative to the current working directory.
  CONF = 'config.yaml'
  ConfigStruct = Struct.new(:database_uri)

  # Load the configuration, or create the file with a default database URI.
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  CConfig = if File.exist?(CONF)
    # The file stores a ConfigStruct. Psych 4 made load_file safe-by-default,
    # which refuses the Struct tag, so use the explicit unsafe loader when it
    # is available. The file is local, application-written configuration.
    if YAML.respond_to?(:unsafe_load_file)
      YAML.unsafe_load_file(CONF)
    else
      YAML.load_file(CONF)
    end
  else
    conf = ConfigStruct.new('jdbc:postgresql://localhost:5433/concordancer?user=concordancer&password=concordancer')
    File.open(CONF, 'w') do |f|
      f.write(conf.to_yaml)
    end
    conf
  end

  # Lazily created concordancer shared by all helpers below.
  def conc
    @conc ||= create_concordancer
  end

  # Sets the text-processing language to +lang+ and reads the given paths.
  def import_files(lang, *path)
    conc.tp.lang = lang
    conc.read(path)
  end

  # Delegates to Concordancer#link!.
  def relink
    conc.link!
  end

  # Switches the concordancer to simulation and then runs link!.
  def simulate_link
    conc.simulate!
    conc.link!
  end

  # Delegates to Concordancer#link.
  def link
    conc.link
  end

  # Replaces the memoized concordancer with a new one built from +opts+.
  def reconnect(opts = {})
    @conc = create_concordancer(opts)
  end

  # Delegates to Concordancer#correct for the given ids.
  def correct(*ids)
    conc.correct(*ids)
  end

  # Turns a search filter into a file name: whitespace runs become "_",
  # dots and double quotes are removed.
  def to_filename(filter)
    filter.gsub(/\s+/, "_").gsub(/[\.\"]/, '')
  end

  # Builds a Concordancer on the configured database URI, passing +opts+ plus
  # the includer's progress callback.
  def create_concordancer(opts = {})
    Concordancer.new(CConfig.database_uri, opts.merge(:progress_proc => progress_proc))
  end
end
56
+ require 'lumix/concordancer'
@@ -0,0 +1,35 @@
1
+ require 'ffi-icu'
2
+ require 'iconv'
3
+ require 'htmlentities'
4
+
5
# Charset helpers: convert a string of unknown encoding to UTF-8 and decode
# HTML entities.
class String

  # Raised by #to_utf when no charset conversion succeeds. It derives from
  # StandardError so that a plain `rescue` (as used by the file importer)
  # catches it; `rescue Exception` and `rescue NoMatchFound` keep working.
  NoMatchFound = Class.new(StandardError)

  # Converts the string to UTF-8, trying +default+ first and the charsets
  # proposed by ICU afterwards, then decodes HTML entities.
  #
  # No state is stored on the receiver, so frozen strings work too.
  #
  # Raises NoMatchFound when no conversion succeeds.
  def to_utf(default = 'utf-8')
    result = icu_return(default) || find_icu
    raise NoMatchFound unless result

    HTMLEntities.new.decode(result)
  end

  # Returns the first successful conversion among the charsets ICU detects
  # for this string, or nil when none of them converts.
  def find_icu
    # A detector per call: it is not shared between strings or threads.
    detector = ICU::CharDet::Detector.new
    detector.detect_all(self).each do |match|
      converted = icu_return(match.name)
      return converted if converted
    end
    nil
  end

  # Converts the string from charset +cs+ to UTF-8 with Iconv.
  # Returns nil when the conversion raises (best-effort probe).
  def icu_return(cs)
    Iconv.conv('UTF-8', cs, self)
  rescue StandardError
    nil
  end

end
data/lib/lumix/cli.rb ADDED
@@ -0,0 +1,96 @@
1
+ require 'lumix/base'
2
+
3
+ include Lumix
4
+
5
# Prints the command-line usage summary and terminates the process.
def help
  usage = [
    "lumix-cli import <en|ro> <path>",
    "lumix-cli [search] 'search string' ...",
    "lumix-cli relink"
  ]
  usage.each { |line| puts line }
  exit
end
11
+
12
# Runs every search filter and writes its hits to a findings file named
# after the filter. Filters whose findings file cannot be created (see
# create_findings_file) are skipped. Prints a per-filter match count and
# always closes the opened files.
def search(*filters)
  files = []
  active = []
  filters.each do |filt|
    out = create_findings_file(filt)
    next unless out

    files << out
    active << conc.create_filter(filt) do |text, tagged|
      out.puts "#{text.name}: #{text.left} | #{tagged.to_s} | #{text.right}"
    end
  end
  fs = active.compact

  conc.find(fs) unless fs.empty?

  fs.each do |f|
    count = f.results == 0 ? 'no' : f.results
    puts "Found #{count} matches for #{f.filter}"
  end
ensure
  files.each { |f| f.close }
end
32
+
33
# Opens +filename+ for writing the findings of +filter+.
#
# filename defaults to the filter converted by to_filename. With a block the
# file is yielded and closed afterwards; without one the open File is
# returned and the caller must close it.
#
# Returns nil (after printing a notice) when the file already exists, so
# earlier findings are never overwritten.
def create_findings_file(filter, filename = to_filename(filter), &block)
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  if File.exist?(filename)
    puts "File #{filename} already exists! Ignoring."
    nil
  else
    File.open(filename, 'w', &block)
  end
end
40
+
41
# Imports +files+ in language +lang+, linking each text as it is imported.
def import!(lang, *files)
  conc.link_on_import!
  import_files(lang, *files)
end

# Tags +files+ (expanded through the text processor's to_filelist) in
# language +lang+ on a pool of 10 workers. A file whose tagged counterpart
# already exists on disk is skipped.
#
# This file used to define `tag` twice; the earlier single-file variant that
# printed the tagged text was silently overridden by this one and could never
# run, so only this definition is kept.
def tag(lang, *files)
  pool = Pool.new(10) # not named `p`, which would shadow Kernel#p
  conc.tp.lang = lang
  conc.tp.to_filelist(files).each do |file|
    pool.schedule do
      tagged = conc.tp.create_tagged_filename(file)
      # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
      conc.tp.process_file(file, tagged) unless File.exist?(tagged)
    end
  end
  pool.shutdown
end
62
+
63
+ private
64
# Builds the console progress callback handed to the concordancer.
#
# The returned proc takes a progress object responding to task, done and
# work. It prints the task label (from Texts, falling back to the task
# itself) whenever the task changes, one dot per 2% of completed work, and
# "Done" once done equals work.
def progress_proc
  task = nil
  dots = 0 # dots already printed for the current task
  proc do |prog|
    if !task || prog.task != task
      task = prog.task
      dots = 0
      puts Texts[task] || task
    end
    if prog.done == prog.work
      puts "Done"
    elsif prog.work != 0 # nothing sensible to draw for an empty workload
      # Track printed dots rather than the last percentage: with 1% steps the
      # difference halved was always 0, so no dot was ever printed.
      target = (50 * prog.done / prog.work).to_i
      if target > dots
        print "." * (target - dots)
        dots = target
      end
    end
  end
end
84
+
85
+
86
# Command-line dispatch.
#
# Only the names listed here are run as commands. Anything else is treated
# as a search string, so arbitrary user input is never passed to `send` as a
# method name.
# NOTE(review): the usage text advertises `import`, but only `import!` and
# `import_files` exist as methods - confirm which one `import` should map to.
LUMIX_COMMANDS = %w[help search tag import! import_files relink link simulate_link correct].freeze

cmd, *args = ARGV
if !cmd
  # NOTE(review): with no arguments a hard-coded sample search runs; the
  # original `help` call was commented out here.
  #help
  cmd, *args = 'search', 'N "de" N'
end

command = cmd.downcase
if command =~ /\A-{1,2}help\z/
  command = 'help'
elsif !LUMIX_COMMANDS.include?(command)
  # Not a command: the first argument is itself a search string. Pass it on
  # unmodified (search strings are case-sensitive) together with the rest.
  args.unshift(cmd)
  command = 'search'
end

# Dispatch the resolved command; previously the fallbacks were assigned to a
# variable that was never sent.
send command, *args
@@ -0,0 +1,254 @@
1
+ #!/bin/env ruby
2
+
3
+ # TODO take care of 's problem
4
+ # TODO remove Word count line
5
+
6
+ require 'rubygems'
7
+ require 'digest/md5'
8
+ require 'sequel'
9
+ require 'sequel/extensions/migration'
10
+
11
+ require 'lumix/model/sequel_models'
12
+
13
+ require 'lumix/thread_pool'
14
+ require 'lumix/textprocessing'
15
+ require 'lumix/lookup_search'
16
+ #require 'lumix/fast_search'
17
+
18
+ module Lumix
19
+ WORKERS = (ENV['LUMIX_WORKERS'] || 20).to_i
20
+ RELINK = ENV['LUMIX_RELINK']
21
+
22
+ DB_VERSION = 4
23
+
24
class ::String
  # Returns the hexadecimal MD5 digest of the string's current contents.
  #
  # The digest is computed on every call instead of being cached in an
  # instance variable: the cache went stale when the string was mutated and
  # could not be written on frozen strings.
  def digest
    Digest::MD5.hexdigest(self)
  end
end
32
+
33
+ Progress = Struct.new(:task, :work, :data, :done)
34
+
35
# Imports text files into the database as TaggedText rows, links them
# through the configured SearchStrategy and runs searches.
class Concordancer

  attr_reader :db, :tp
  attr_accessor :progress_proc
  attr_writer :link_on_import

  # db_uri  - database connection string (see #connect for the fallback).
  # options - :progress_proc, a callable that receives Progress updates;
  #           :recreate, when truthy drops every table and migrates again.
  def initialize(db_uri, options = {})
    @progress_proc = options[:progress_proc]
    @db = connect(db_uri)
    if options[:recreate]
      db.tables.each { |t| db.drop_table t }
      migrate(db)
    end

    @ids = all
    @tp = TextProcessing.new
  end

  # Lazily created search strategy bound to the database and progress callback.
  def strategy
    @strategy ||= SearchStrategy.new(@db, @progress_proc)
  end

  # Pool used for linking: 4 workers when the strategy allows concurrent
  # linking, otherwise 1.
  def create_link_pool
    Pool.new(strategy.concurrent_link? ? 4 : 1)
  end

  def link_on_import?
    @link_on_import
  end

  def link_on_import!
    @link_on_import = true
  end

  # Returns the id of the stored text whose digest matches the contents of
  # +file+, or nil when there is none.
  def get_id(file)
    text = File.read(file).to_utf
    saved = TaggedText[:digest => text.digest]
    saved ? saved.id : nil
  end

  # Imports the given files (expanded through tp.to_filelist) on a pool of
  # WORKERS workers.
  #
  # Files listed in 'unprocessed.lst' are skipped; a file whose import raises
  # is reported and appended to that list. When link-on-import is enabled,
  # every newly stored text is linked, followed by a final link pass.
  def read(*files)
    files = tp.to_filelist(*files)
    prog = Progress.new(:read, files.size)
    puts "Reading #{files.size} files"
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    @unprocessed = if File.exist?('unprocessed.lst')
      File.readlines('unprocessed.lst').map(&:chomp)
    else
      []
    end

    File.open('unprocessed.lst', 'a') do |up|
      linkers = create_link_pool
      workers = Pool.new(WORKERS)

      linkers.schedule { link! } if RELINK

      files.each_with_index do |file, index|
        if @unprocessed.member?(file)
          puts "Ignoring #{file}"
          next
        end
        workers.schedule do
          begin
            # read_file returns the stored record (or nil), whereas #link
            # works on ids, so pass the record's id.
            stored = read_file(file)
            linkers.schedule { link stored.id } if stored and link_on_import?
          rescue => e
            puts "Error on file #{file}: #{e}", e.backtrace
            @unprocessed << file
            up.puts file
          end
          progress(prog, index + 1)
        end
      end
      linkers.schedule { link } if link_on_import? # make sure everything is linked
      workers.shutdown
      linkers.shutdown
    end
  end

  # Stores +file+ unless a text with the same filename and digest exists.
  #
  # The tagged version is read from the tagged file when it exists; otherwise
  # it is produced by the text processor and written there. Yields the new
  # record when a block is given.
  #
  # Returns the created TaggedText, or nil when the text was already stored.
  def read_file(file)
    text = File.read(file).to_utf
    saved = TaggedText.exists?(:filename => file, :digest => text.digest)

    unless saved
      puts "Reading file #{file}"
      # retrieve the tagged version
      tagged_file = tp.create_tagged_filename(file)
      tagged = if File.exist?(tagged_file)
        File.read(tagged_file)
      else
        fresh = tp.process(text)
        File.open(tagged_file, 'w') do |out|
          out.write fresh
        end
        fresh
      end

      retagged = retag(tagged)
      tt = TaggedText.create(:digest => text.digest, :text => text, :tagged => retagged, :filename => file, :tagged_filename => tagged_file)
      @ids << tt.id
      yield tt if block_given?
      tt
    end
  end

  # Re-reads the source file of each given text (all texts when no ids are
  # given), stores the fresh text and updates the digest when it changed.
  def correct(*ids)
    ids = all if ids.empty?
    ids.flatten.each do |id|
      id = id.to_i
      d = TaggedText[id]
      next unless d

      file = d.filename

      text = File.read(file).to_utf
      d.text = text

      expected = text.digest
      if d.digest != expected
        puts "Correcting text #{file}"
        d.digest = expected
      end
      d.save
    end
  end

  # Ids of all stored texts.
  def all
    TaggedText.ids
  end

  def simulate!
    strategy.simulate!
  end

  # Same as #link, passing a block that deletes a dataset.
  # NOTE(review): #link never yields (its yield is commented out), so this
  # currently behaves exactly like #link - see the "implement force" TODO.
  def link!(*ids)
    link(*ids) do |ds|
      ds.delete
    end
  end

  # Links the given texts (all texts when no ids are given) through the
  # search strategy on the link pool, reporting progress per text.
  def link(*ids)
    ids = all if ids.empty?
    ids.flatten!
    prog = Progress.new(:link, ids.size)
    progress(prog)

    pool = create_link_pool
    ids.each_with_index do |id, index|
      #ds = db[:assoc].filter(:text_id => id)
      #yield ds if block_given?

      # TODO implement force
      pool.schedule do
        strategy.link_text(id) #if ds.empty?
        progress(prog, index + 1)
      end
    end
    pool.shutdown
  end

  def create_filter(f, &block)
    strategy.create_filter(f, &block)
  end

  def find(filters)
    strategy.find(filters)
  end

  private

  # Connects to +db_uri+, falling back to a local SQLite database when a test
  # query fails, then migrates the schema and binds TaggedText to the
  # connection.
  def connect(db_uri)
    db = Sequel.connect(db_uri)
    begin
      db.get(1)
    # NOTE(review): rescues Exception rather than StandardError - presumably
    # so driver-level errors also trigger the fallback; confirm before narrowing.
    rescue Exception => e
      puts 'Falling back to sqlite'
      puts e
      db = Sequel.connect('jdbc:sqlite://concordancer.db')
    end
    migrate(db)
    TaggedText.db = db
  end

  # Applies the migrations from the 'schema' directory next to this file, up
  # to DB_VERSION.
  def migrate(db)
    migration_path = File.join(File.dirname(__FILE__), 'schema')
    Sequel::Migrator.apply(db, migration_path, DB_VERSION)
  end

  # Updates +prog+ and hands it to the progress callback, if one is set.
  def progress(prog, done = 0, data = prog.data)
    if progress_proc
      prog.done = done
      prog.data = data
      progress_proc.call(prog)
    end
  end

  # Reduces four-field tokens ("a|b|c|d") to "word|tag" pairs joined by
  # single spaces.
  #
  # The tag is field 1 when fields 2 and 3 of the first token are numeric
  # (word|tag|begin|end), otherwise field 2. Text whose first token does not
  # have exactly four fields is returned unchanged.
  def retag(text)
    chunks = text.split(/[ \n]/)
    first = chunks.find { |chunk| !chunk.empty? }
    return text if first.nil? || (token = first.split(/\|/)).size != 4 # looks pre-retagged
    # Anchored so that tags containing digits (e.g. "V3") are not taken for offsets.
    tag_position = if token[2] =~ /\A\d+\z/ && token[3] =~ /\A\d+\z/ # looks like fulltagged
      1
    else
      2
    end

    result = ''
    chunks.each do |chunk|
      next if chunk.empty? # skip gaps left by repeated separators
      fields = chunk.split(/\|/)
      result << ' ' unless result.empty?
      result << "#{fields[0]}|#{fields[tag_position]}"
    end
    return result
  end

end
253
+
254
+ end
@@ -0,0 +1,84 @@
1
+ require 'lumix/charset'
2
+
3
+ CORRECTIONS = <<-TXT
4
+ catre | S
5
+ fetite | NPRN
6
+ in | S
7
+ si | C
8
+ circa | R
9
+ fata de| S
10
+ maxima | ASON
11
+ inainte| R
12
+ in materie de | R
13
+ tin | V3
14
+ beneficiaza | V3
15
+ : | COLON
16
+ ocupa | VN
17
+ asigurata | VPSF
18
+ mine | PPSA
19
+ batut | VPSM
20
+ insa | C
21
+ impotriva | S
22
+ americana | ASN
23
+ caruia | R
24
+ da | VN
25
+ duce| VN
26
+ primeasca | V3
27
+ daca | C
28
+ bulgara | ASN
29
+ ramina | V3
30
+ albaneza | ASN
31
+ pina | S
32
+ paraseasca | V3
33
+ publica | ASN
34
+ inceapa | V3
35
+ ecologic | ASN
36
+ internationala | ASN
37
+ ecologista | ASN
38
+ cada | V3
39
+ linga | S
40
+ adevaratele | APRY
41
+ citiva | PI
42
+ americana | ASN
43
+ Miclici| NP
44
+ fara | S
45
+ cit | PI
46
+ sugereaza | V3
47
+ incasa | VN
48
+ circa | R
49
+ ghiceste | V3
50
+ tarile |NPRY
51
+ araba | ASN
52
+ citeva | PI
53
+ schimbindu | VG
54
+ dupa | S
55
+ uleiurilor_vegetale | NPOY
56
+ botosaneana | ASN
57
+ oricarui | PI
58
+ TXT
59
+
60
+ def corrections
61
+ @corrections ||= CORRECTIONS.split(/\n/).map do |line|
62
+ word, tag = line.split(/\|/).map(&:strip)
63
+ puts "Tagging #{word} as #{tag}"
64
+ [/\b#{word}\|\S+/, "#{word}\|#{tag}"]
65
+ end
66
+ end
67
+
68
# Applies every [pattern, replacement] pair from #corrections to +t+, in
# order, and returns the corrected text.
def correct(t)
  corrected = t
  corrections.each do |pattern, replacement|
    corrected = corrected.gsub(pattern, replacement)
  end
  corrected
end
73
+
74
# Rewrites every file in +path+ whose name contains "tagged" with its
# corrected contents. Returns the list of matched file names.
def correct_all(path)
  pattern = File.join(path, '*tagged*')
  Dir.glob(pattern).each do |name|
    fixed = correct(File.read(name))
    File.open(name, 'w') do |out|
      out.print fixed
    end
  end
end
81
+
82
+ if $0 == __FILE__
83
+ correct_all ARGV[0]
84
+ end
@@ -0,0 +1,91 @@
1
+ require 'lumix/filter'
2
+ require 'lumix/text_snippet'
3
+
4
+ module Lumix
5
+
6
# Search strategy that annotates every tagged token with its character range
# in the source text ("word|tag|begin|end") and searches that annotated form.
class FastSearch

  TAGGED = /([^\s\|]+)\|(\S+)/m # Xxx|YYY
  ORIG = /([^\|\s]*)\|([^\|\s]*)\|([^\|\s]*)\|(\S*)/ # X|Y|Z|W

  # db       - database handle (stored, not used by the methods in this class).
  # progress - optional callable invoked with a Progress during #find.
  def initialize(db, progress)
    @db = db
    @progress = progress
  end

  def concurrent_link?
    true
  end

  # Links the text with the given id: every "word|tag" token of the tagged
  # text is located in the source text, in order, and annotated with its
  # begin and end offsets.
  #
  # Returns the stored annotation when the text is already linked, otherwise
  # the new one (saved unless empty). Returns nil when a token cannot be
  # found; the miss is then appended to 'unlinked.lst'.
  def link_text(id)
    ds = TaggedText[id]
    return ds.fulltagged if ds.fulltagged
    file, text, tagged = ds.filename, ds.text, ds.tagged

    puts "Linking text #{file}"

    txt_pos = 0
    linked = ''
    tagged.scan(TAGGED) do |word, tag|
      tagged_begin = Regexp.last_match.begin(0)

      # expand "x_y_z" notation to "x y z"
      word_re = Regexp.new(Regexp.escape(word).gsub(/_/, '\s*'))
      # Search from txt_pos onwards; the offsets are absolute and no
      # substring is allocated per token.
      src_match = text.match(word_re, txt_pos) # find the word
      if src_match
        src_begin = src_match.begin(0)
        src_end = src_match.end(0)
        txt_pos = src_end

        linked << ' ' unless linked.empty?
        linked << word << '|' << tag << '|' << src_begin.to_s << '|' << src_end.to_s
      else
        STDERR.puts "Could not find match for '#{word}' in text #{file}"
        # Clamp the context start: a negative index would slice from the end.
        STDERR.puts text[[txt_pos - 10, 0].max..(txt_pos + word.size + 10)]
        # Append directly instead of shelling out to `echo`, which broke on
        # words or file names containing quotes.
        File.open('unlinked.lst', 'a') do |log|
          log.puts %(#{file}:#{txt_pos}:#{tagged_begin} unmatched "#{word}")
        end
        return nil
      end
    end
    unless linked.empty?
      ds.fulltagged = linked
      ds.save
    end
    return linked
  rescue => e # TODO remove this crap
    STDERR.puts e
    STDERR.puts e.backtrace
    raise
  end

  # Builds a filter for +f+ whose tokens carry the "|begin|end" suffix.
  def create_filter(f, &block)
    Lumix::Filter.new('\|(\d+)\|(\d+)', f, &block)
  end

  # Runs every filter over every linked text and pushes a
  # [text snippet, tagged snippet] pair into the filter for each hit.
  # Progress is reported once at the start and after each text.
  def find(filters)
    prog = Progress.new(:search, TaggedText.count, "", 0)
    @progress[prog] if @progress

    TaggedText.each_with_index do |t, i|
      # Unlinked texts have no annotation to scan.
      # NOTE(review): assumes Filter#scan cannot handle nil - confirm.
      if t.fulltagged
        fname = File.basename(t.filename)
        # matches to ranges
        filters.each do |f|
          f.scan(t.fulltagged) do |_hit, t_begin, t_end, m|
            s_begin = m.captures.first.to_i
            s_end = m.captures.last.to_i

            tagged_snippet = Lumix::TextSnippet.new(fname, t.fulltagged, t_begin, t_end)
            text_snippet = Lumix::TextSnippet.new(fname, t.text, s_begin, s_end)
            f << [text_snippet, tagged_snippet]
          end
        end
      end
      # i is zero-based: report completed texts so the last update reaches work.
      prog.done = i + 1
      @progress[prog] if @progress
    end
  end

end
89
+
90
+ SearchStrategy = FastSearch
91
+ end