lumix 0.0.2-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/COPYING ADDED
@@ -0,0 +1,18 @@
1
+ Copyright (c) 2010 Michael Klaus
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to
5
+ deal in the Software without restriction, including without limitation the
6
+ rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7
+ sell copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16
+ THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/bin/lumix ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env jruby -Eutf-8:utf-8 -Ku -U -J-Xmx1024m
2
+
3
+ require 'rubygems'
4
+ require 'lumix/cli'
data/bin/lumix-gui ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env jruby
2
+
3
+ require 'rubygems'
4
+ require 'lumix/gui'
data/lib/lumix/base.rb ADDED
@@ -0,0 +1,56 @@
1
+ require 'yaml'
2
+
3
# Shared entry points used by the Lumix front ends: configuration loading and
# thin wrappers around a lazily created Concordancer.
#
# create_concordancer calls +progress_proc+, which is not defined in this
# module; whatever includes the module has to provide it.
module Lumix
  # Captions shown for progress tasks, keyed by task symbol.
  Texts = {:search => "Searching...", :read => "Importing files", :link => "Linking texts"}

  # Configuration file, looked up relative to the current working directory.
  CONF = 'config.yaml'
  ConfigStruct = Struct.new(:database_uri)

  # Evaluated once when this file is loaded. When CONF is missing, a default
  # configuration is written to disk and used.
  CConfig = if File.exist?(CONF) # File.exists? no longer exists in Ruby 3.2+
    # NOTE(review): restoring ConfigStruct needs a full (unsafe) YAML load, so
    # CONF must only ever come from a trusted location. Newer Psych versions
    # make load_file safe-by-default, hence the explicit unsafe variant.
    if YAML.respond_to?(:unsafe_load_file)
      YAML.unsafe_load_file(CONF)
    else
      YAML.load_file(CONF)
    end
  else
    conf = ConfigStruct.new('jdbc:postgresql://localhost:5433/concordancer?user=concordancer&password=concordancer')
    File.open(CONF, 'w') do |f|
      f.write(conf.to_yaml)
    end
    conf
  end

  # Returns the shared Concordancer, creating it on first use.
  def conc
    @conc ||= create_concordancer
  end

  # Sets the text processor language and reads the given paths.
  # The paths are handed to Concordancer#read as a single array.
  def import_files(lang, *path)
    conc.tp.lang = lang
    conc.read(path)
  end

  # Delegates to Concordancer#link!.
  def relink
    conc.link!
  end

  # Switches the concordancer into simulation mode, then relinks.
  def simulate_link
    conc.simulate!
    conc.link!
  end

  # Delegates to Concordancer#link.
  def link
    conc.link
  end

  # Replaces the shared Concordancer with a freshly created one.
  # +opts+ is forwarded to the Concordancer constructor.
  def reconnect(opts = {})
    @conc = create_concordancer(opts)
  end

  # Delegates to Concordancer#correct for the given ids.
  def correct(*ids)
    conc.correct(*ids)
  end

  # Derives a file name from a filter string: whitespace runs become "_",
  # dots and double quotes are removed.
  def to_filename(filter)
    filter.gsub(/\s+/, "_").gsub(/[\.\"]/, '')
  end

  # Builds a Concordancer for the configured database URI, always passing the
  # including context's progress_proc as :progress_proc.
  def create_concordancer(opts = {})
    Concordancer.new(CConfig.database_uri, opts.merge(:progress_proc => progress_proc))
  end
end
56
+ require 'lumix/concordancer'
@@ -0,0 +1,35 @@
1
+ require 'ffi-icu'
2
+ require 'iconv'
3
+ require 'htmlentities'
4
+
5
# Charset helpers added to String: conversion of arbitrary input to UTF-8.
class String

  # Raised by #to_utf when no candidate charset converts the string.
  # Derives from StandardError so that a plain `rescue` catches it.
  NoMatchFound = Class.new(StandardError)

  # Converts the string to UTF-8 and decodes HTML entities in the result.
  #
  # The +default+ charset is tried first; if that conversion fails, the
  # charsets reported by the ICU detector are tried in the order returned.
  #
  # Raises NoMatchFound when none of the candidates converts.
  def to_utf(default = 'utf-8')
    result = icu_return(default) || find_icu
    raise NoMatchFound unless result

    HTMLEntities.new.decode(result)
  end

  # Returns the UTF-8 conversion for the first detected charset that converts
  # without error, or nil if none does.
  #
  # The detector is a local object: nothing is stored on the receiver, so this
  # also works on frozen strings and when called without #to_utf.
  def find_icu
    detector = ICU::CharDet::Detector.new
    detector.detect_all(self).each do |match|
      converted = icu_return(match.name)
      return converted if converted
    end
    nil
  end

  # Converts the string from charset +cs+ to UTF-8.
  # Deliberately best-effort: any StandardError yields nil so the caller can
  # move on to the next candidate charset.
  def icu_return(cs)
    Iconv.conv('UTF-8', cs, self)
  rescue StandardError
    nil
  end

end
data/lib/lumix/cli.rb ADDED
@@ -0,0 +1,96 @@
1
+ require 'lumix/base'
2
+
3
+ include Lumix
4
+
5
# Prints the usage summary, one command form per line, and exits the process.
def help
  usage = [
    "lumix-cli import <en|ro> <path>",
    "lumix-cli [search] 'search string' ...",
    "lumix-cli relink"
  ]
  usage.each { |line| puts line }
  exit
end
11
+
12
# Runs every filter string against the concordancer and writes the hits of
# each filter to its own findings file.
#
# Filters whose findings file cannot be created (create_findings_file returned
# nil) are skipped. A summary line per filter is printed at the end, and all
# opened files are closed even when an error occurs.
#
# Returns the list of filter objects that were searched.
def search(*filters)
  files = []
  fs = []
  filters.each do |filt|
    out = create_findings_file(filt)
    next unless out
    files << out
    handler = conc.create_filter(filt) do |text, tagged|
      out.puts "#{text.name}: #{text.left} | #{tagged.to_s} | #{text.right}"
    end
    fs << handler unless handler.nil?
  end

  conc.find(fs) unless fs.empty?

  fs.each do |f|
    count = f.results == 0 ? 'no' : f.results
    puts "Found #{count} matches for #{f.filter}"
  end
ensure
  files.each { |opened| opened.close }
end
32
+
33
# Opens the findings file for +filter+ for writing.
#
# filename - target path; defaults to the name derived from the filter.
# block    - optional; forwarded to File.open, in which case the file is
#            closed when the block returns and the block's value is returned.
#
# Returns the open File (no block given), or nil after printing a notice when
# the file already exists; existing files are never overwritten.
def create_findings_file(filter, filename = to_filename(filter), &block)
  if File.exist?(filename) # File.exists? no longer exists in Ruby 3.2+
    puts "File #{filename} already exists! Ignoring."
    nil
  else
    File.open(filename, 'w', &block)
  end
end
40
+
41
# Tags a single file in the given language and prints the result.
#
# NOTE: this script defines `tag(lang, *files)` again further down; that later
# definition replaces this one once the script has been loaded.
def tag(lang, file)
  processor = conc.tp
  processor.lang = lang
  source = File.read(file)
  puts processor.process(source)
end
45
+
46
# Imports the given files with link-on-import switched on for the shared
# concordancer. Returns whatever import_files returns.
def import!(lang, *files)
  concordancer = conc
  concordancer.link_on_import!
  import_files(*([lang] + files))
end
50
+
51
# Tags every file in the expanded file list, using a pool of size 10.
#
# A file is only processed when its tagged counterpart does not exist yet, so
# re-running the command resumes where a previous run stopped.
#
# Returns the result of shutting the pool down.
def tag(lang, *files)
  pool = Pool.new(10) # named `pool` so it does not shadow Kernel#p
  conc.tp.lang = lang
  conc.tp.to_filelist(files).each do |file|
    pool.schedule do
      tagged = conc.tp.create_tagged_filename(file)
      # File.exists? no longer exists in Ruby 3.2+
      conc.tp.process_file(file, tagged) unless File.exist?(tagged)
    end
  end
  pool.shutdown
end
62
+
63
+ private
64
# Builds the console progress callback.
#
# The returned proc takes a progress record responding to +task+, +done+ and
# +work+. It prints the task caption (from Texts, falling back to the task
# itself) whenever the task changes, one dot per two percent of progress, and
# "Done" when +done+ equals +work+.
def progress_proc
  task = nil
  percent = 0
  proc do |prog|
    if !task || prog.task != task
      task = prog.task
      percent = 0
      puts Texts[task] || task
    end
    if prog.done == prog.work
      puts "Done"
    else
      new_percent = (100 * prog.done / prog.work).to_i
      if new_percent > percent
        # Difference of the halved totals rather than half of the difference:
        # halving each small step separately truncated to zero, so progress
        # reported in 1% steps never printed a single dot.
        print "." * (new_percent / 2 - percent / 2)
        percent = new_percent
      end
    end
  end
end
84
+
85
+
86
# Command-line dispatch.
#
# The first argument selects the command. Anything that is not a known command
# word is treated as the first search filter, which is what makes the "search"
# word optional in the usage text.
cmd, *args = ARGV
if !cmd
  #help
  cmd, *args = 'search', 'N "de" N'
end

# Explicit command table instead of sending arbitrary user input: private
# top-level methods are invisible to respond_to?, and Kernel methods such as
# `test` or `p` must not be reachable through a search string.
commands = {
  'help'          => :help,
  'search'        => :search,
  'tag'           => :tag,
  # The usage text advertises "import"; no method of that name is defined in
  # this script, so it is mapped to the plain import entry point.
  'import'        => :import_files,
  'import_files'  => :import_files,
  'import!'       => :import!,
  'relink'        => :relink,
  'link'          => :link,
  'simulate_link' => :simulate_link,
  'correct'       => :correct
}

name = cmd.downcase
if name =~ /\A-{1,2}help\z/
  c = :help
elsif commands.key?(name)
  c = commands[name]
else
  c = :search
  args.unshift(cmd) # the unrecognised word is itself a search filter
end

send c, *args
@@ -0,0 +1,254 @@
1
+ #!/bin/env ruby
2
+
3
+ # TODO take care of 's problem
4
+ # TODO remove Word count line
5
+
6
+ require 'rubygems'
7
+ require 'digest/md5'
8
+ require 'sequel'
9
+ require 'sequel/extensions/migration'
10
+
11
+ require 'lumix/model/sequel_models'
12
+
13
+ require 'lumix/thread_pool'
14
+ require 'lumix/textprocessing'
15
+ require 'lumix/lookup_search'
16
+ #require 'lumix/fast_search'
17
+
18
# Core of the concordancer: tuning constants, a String#digest helper, the
# Progress record and the Concordancer class.
module Lumix
  # Size of the pool used by Concordancer#read (LUMIX_WORKERS, default 20).
  WORKERS = (ENV['LUMIX_WORKERS'] || 20).to_i
  # When set, Concordancer#read schedules a full relink before importing.
  RELINK = ENV['LUMIX_RELINK']

  # Schema version the migrations are applied up to.
  DB_VERSION = 4

  class ::String
    # Returns the hex MD5 digest of the current contents.
    #
    # Computed on every call: caching the value in an instance variable gave a
    # stale digest after the string was mutated and raised on frozen strings.
    def digest
      Digest::MD5.hexdigest(self)
    end
  end

  # Progress record handed to the progress callback: the task symbol, the
  # total amount of work, an optional payload and the amount done so far.
  Progress = Struct.new(:task, :work, :data, :done)

  # Imports texts into the database, links them and runs searches through the
  # configured SearchStrategy.
  class Concordancer

    attr_reader :db, :tp
    attr_accessor :progress_proc
    attr_writer :link_on_import

    # db_uri  - connection URI handed to Sequel.
    # options - :progress_proc (callable receiving Progress records) and
    #           :recreate (drop every table and migrate again).
    def initialize(db_uri, options = {})
      @progress_proc = options[:progress_proc]
      @db = connect(db_uri)
      if options[:recreate]
        db.tables.each{ |t| db.drop_table t }
        migrate(db)
      end

      @ids = all
      @tp = TextProcessing.new
    end

    # Lazily created search strategy bound to this database and callback.
    def strategy
      @strategy ||= SearchStrategy.new(@db, @progress_proc)
    end

    # Pool used for linking: size 4 when the strategy allows concurrent
    # linking, otherwise size 1.
    def create_link_pool
      Pool.new(strategy.concurrent_link? ? 4 : 1)
    end

    # Whether freshly imported texts are linked right away.
    def link_on_import?
      @link_on_import
    end

    # Switches link-on-import on.
    def link_on_import!
      @link_on_import = true
    end

    # Returns the id of the stored text whose digest matches the content of
    # +file+, or nil when there is none.
    def get_id(file)
      text = File.read(file).to_utf
      saved = TaggedText[:digest => text.digest]
      saved ? saved.id : nil
    end

    # Imports the given files.
    #
    # Files listed in 'unprocessed.lst' are skipped; files that fail during
    # this run are appended to that list. With link-on-import enabled, each
    # newly stored text is scheduled for linking by its id.
    def read(*files)
      files = tp.to_filelist(*files)
      prog = Progress.new(:read, files.size)
      puts "Reading #{files.size} files"
      # File.exists? no longer exists in Ruby 3.2+
      @unprocessed = if File.exist?('unprocessed.lst')
        File.readlines('unprocessed.lst').map(&:chomp)
      else
        []
      end

      File.open('unprocessed.lst', 'a') do |up|
        linkers = create_link_pool
        readers = Pool.new(WORKERS)

        linkers.schedule{ link! } if RELINK

        files.each_with_index do |file, index|
          if @unprocessed.member?(file)
            puts "Ignoring #{file}"
            next
          end
          readers.schedule do
            begin
              tagged_text = read_file(file)
              # read_file returns the record (or nil); link expects ids.
              linkers.schedule { link tagged_text.id } if tagged_text and link_on_import?
            rescue
              puts "Error on file #{file}: #{$!}", $!.backtrace
              @unprocessed << file
              up.puts file
            end
            progress(prog, index + 1)
          end
        end
        # NOTE(review): this final link is scheduled before the reader pool is
        # shut down; whether it runs after all reads depends on Pool - confirm.
        linkers.schedule { link } if link_on_import? # make sure everything is linked
        readers.shutdown
        linkers.shutdown
      end
    end

    # Stores +file+ unless a record with the same file name and digest exists.
    #
    # The tagged version is read from the tagged file when present; otherwise
    # it is produced by the text processor and written there.
    #
    # Yields the new record when a block is given. Returns the new record, or
    # nil when the text was already stored.
    def read_file(file)
      text = File.read(file).to_utf
      saved = TaggedText.exists?(:filename => file, :digest => text.digest)

      unless saved
        puts "Reading file #{file}"
        # retrieve the tagged version
        tagged_file = tp.create_tagged_filename(file)
        tagged = if File.exist?(tagged_file) # File.exists? is gone in Ruby 3.2+
          File.read(tagged_file)
        else
          fresh = tp.process(text)
          File.open(tagged_file, 'w') do |out|
            out.write fresh
          end
          fresh
        end

        retagged = retag(tagged)
        tt = TaggedText.create(:digest => text.digest, :text => text, :tagged => retagged, :filename => file, :tagged_filename => tagged_file)
        @ids << tt.id
        yield tt if block_given?
        tt
      end
    end

    # Re-reads the source file of each given record (all records when no ids
    # are given) and stores its current text and digest.
    def correct(*ids)
      ids = all if ids.empty?
      ids.flatten.each do |id|
        id = id.to_i
        d = TaggedText[id]
        next unless d

        file = d.filename

        text = File.read(file).to_utf
        d.text = text

        expected = text.digest
        if d.digest != expected
          puts "Correcting text #{file}"
          d.digest = expected
        end
        d.save
      end
    end

    # Ids of all stored texts.
    def all
      TaggedText.ids
    end

    # Delegates to the strategy's simulate!.
    def simulate!
      strategy.simulate!
    end

    # Links the given ids, passing a block that deletes a dataset.
    # NOTE(review): #link currently never yields (see the commented-out lines
    # there), so this behaves exactly like #link.
    def link!(*ids)
      link(*ids) do |ds|
        ds.delete
      end
    end

    # Links the given text ids (all ids when none are given) through the
    # strategy, reporting progress after each text.
    def link(*ids)
      ids = all if ids.empty?
      ids.flatten!
      prog = Progress.new(:link, ids.size)
      progress(prog)

      pool = create_link_pool
      ids.each_with_index do |id, index|
        #ds = db[:assoc].filter(:text_id => id)
        #yield ds if block_given?

        # TODO implement force
        pool.schedule do
          strategy.link_text(id) #if ds.empty?
          progress(prog, index + 1)
        end
      end
      pool.shutdown
    end

    # Delegates to the strategy's create_filter.
    def create_filter(f, &block)
      strategy.create_filter(f, &block)
    end

    # Delegates to the strategy's find.
    def find(filters)
      strategy.find(filters)
    end

    private

    # Connects to +db_uri+, falling back to a local sqlite database when a
    # probe query fails, then migrates and binds TaggedText to the connection.
    def connect(db_uri)
      db = Sequel.connect(db_uri)
      begin
        db.get(1)
      # NOTE(review): rescuing Exception also traps interrupts and exit
      # requests; kept as-is because it is unclear which exception classes the
      # JDBC drivers raise here - confirm before narrowing.
      rescue Exception => e
        puts 'Falling back to sqlite'
        puts e
        db = Sequel.connect('jdbc:sqlite://concordancer.db')
      end
      migrate(db)
      TaggedText.db = db
    end

    # Applies the migrations in the 'schema' directory next to this file.
    def migrate(db)
      migration_path = File.join(File.dirname(__FILE__), 'schema')
      Sequel::Migrator.apply(db, migration_path, DB_VERSION)
    end

    # Updates +prog+ and passes it to the progress callback, if one is set.
    def progress(prog, done = 0, data = prog.data)
      if progress_proc
        prog.done = done
        prog.data = data
        progress_proc.call(prog)
      end
    end

    # Reduces four-field tokens to "word|tag" pairs joined by single spaces.
    #
    # Text whose first token does not have exactly four "|"-separated fields
    # is returned unchanged. When fields three and four of the first token are
    # plain numbers the tag is taken from field two, otherwise from field
    # three.
    def retag(text)
      chunks = text.split(/[ \n]/).reject { |chunk| chunk.empty? }
      return text if chunks.empty?

      token = chunks.first.split(/\|/)
      return text if token.size != 4 # looks pre-retagged

      # Anchored, so tags that merely contain a digit (e.g. "V3") do not count.
      tag_position = if token[2] =~ /\A\d+\z/ && token[3] =~ /\A\d+\z/ # looks like fulltagged
        1
      else
        2
      end

      chunks.map do |chunk|
        fields = chunk.split(/\|/)
        "#{fields[0]}|#{fields[tag_position]}"
      end.join(' ')
    end

  end

end
@@ -0,0 +1,84 @@
1
+ require 'lumix/charset'
2
+
3
+ CORRECTIONS = <<-TXT
4
+ catre | S
5
+ fetite | NPRN
6
+ in | S
7
+ si | C
8
+ circa | R
9
+ fata de| S
10
+ maxima | ASON
11
+ inainte| R
12
+ in materie de | R
13
+ tin | V3
14
+ beneficiaza | V3
15
+ : | COLON
16
+ ocupa | VN
17
+ asigurata | VPSF
18
+ mine | PPSA
19
+ batut | VPSM
20
+ insa | C
21
+ impotriva | S
22
+ americana | ASN
23
+ caruia | R
24
+ da | VN
25
+ duce| VN
26
+ primeasca | V3
27
+ daca | C
28
+ bulgara | ASN
29
+ ramina | V3
30
+ albaneza | ASN
31
+ pina | S
32
+ paraseasca | V3
33
+ publica | ASN
34
+ inceapa | V3
35
+ ecologic | ASN
36
+ internationala | ASN
37
+ ecologista | ASN
38
+ cada | V3
39
+ linga | S
40
+ adevaratele | APRY
41
+ citiva | PI
42
+ americana | ASN
43
+ Miclici| NP
44
+ fara | S
45
+ cit | PI
46
+ sugereaza | V3
47
+ incasa | VN
48
+ circa | R
49
+ ghiceste | V3
50
+ tarile |NPRY
51
+ araba | ASN
52
+ citeva | PI
53
+ schimbindu | VG
54
+ dupa | S
55
+ uleiurilor_vegetale | NPOY
56
+ botosaneana | ASN
57
+ oricarui | PI
58
+ TXT
59
+
60
+ def corrections
61
+ @corrections ||= CORRECTIONS.split(/\n/).map do |line|
62
+ word, tag = line.split(/\|/).map(&:strip)
63
+ puts "Tagging #{word} as #{tag}"
64
+ [/\b#{word}\|\S+/, "#{word}\|#{tag}"]
65
+ end
66
+ end
67
+
68
# Applies every correction pair to +t+ in table order and returns the
# corrected text; +t+ itself is not modified.
def correct(t)
  result = t
  corrections.each do |re, sub|
    result = result.gsub(re, sub)
  end
  result
end
73
+
74
# Rewrites, in place, every file directly under +path+ whose name contains
# "tagged" with its corrected content. Returns the list of files visited.
def correct_all(path)
  pattern = File.join(path, '*tagged*')
  Dir.glob(pattern).each do |fn|
    fixed = correct(File.read(fn))
    File.open(fn, 'w') { |out| out.print fixed }
  end
end
81
+
82
# When run directly as a script, correct all tagged files in the directory
# given as the first command-line argument.
correct_all(ARGV[0]) if $0 == __FILE__
@@ -0,0 +1,91 @@
1
+ require 'lumix/filter'
2
+ require 'lumix/text_snippet'
3
+
4
module Lumix

  # Search strategy that links every tagged token to its character range in
  # the source text and searches the linked ("fulltagged") representation.
  class FastSearch

    TAGGED = /([^\s\|]+)\|(\S+)/m # Xxx|YYY
    ORIG = /([^\|\s]*)\|([^\|\s]*)\|([^\|\s]*)\|(\S*)/ # X|Y|Z|W

    # db       - database handle.
    # progress - callable receiving Progress records, or nil.
    def initialize(db, progress)
      @db = db
      @progress = progress
    end

    # This strategy allows texts to be linked concurrently.
    def concurrent_link?
      true
    end

    # Links the text with the given id.
    #
    # Each "word|tag" token of the tagged text is located in the source text,
    # searching forward from the end of the previous match, and emitted as
    # "word|tag|begin|end". The result is stored on the record.
    #
    # Returns the linked string (the stored one if the text was linked
    # before), or nil when a token cannot be found; in that case a line is
    # appended to 'unlinked.lst'.
    def link_text(id)
      ds = TaggedText[id]
      return ds.fulltagged if ds.fulltagged
      file, text, tagged = ds.filename, ds.text, ds.tagged

      puts "Linking text #{file}"

      txt_pos = 0
      linked = ''
      tagged.scan(TAGGED) do |word, tag|
        tagged_begin = Regexp.last_match.begin(0)

        # expand "x_y_z" notation to "x y z"
        word_re = Regexp.new(Regexp.escape(word).gsub(/_/, '\s*'))
        src_match = text[txt_pos..-1].match(word_re) # find the word
        unless src_match
          STDERR.puts "Could not find match for '#{word}' in text #{file}"
          # Clamp the start: a negative index would count from the end of the
          # text and show unrelated context.
          context_begin = [txt_pos - 10, 0].max
          STDERR.puts text[context_begin..(txt_pos + word.size + 10)]
          log_unlinked(file, txt_pos, tagged_begin, word)
          return nil
        end

        src_begin = txt_pos + src_match.begin(0)
        src_end = txt_pos + src_match.end(0)
        txt_pos = src_end

        linked << ' ' unless linked.empty?
        linked << word << '|' << tag << '|' << src_begin.to_s << '|' << src_end.to_s
      end
      unless linked.empty?
        ds.fulltagged = linked
        ds.save
      end
      return linked
    rescue => e # TODO remove this crap
      STDERR.puts e
      STDERR.puts e.backtrace
      raise
    end

    # Creates a filter for +f+ whose matches also capture the begin/end
    # offsets appended by link_text.
    def create_filter(f, &block)
      Lumix::Filter.new('\|(\d+)\|(\d+)', f, &block)
    end

    # Runs every filter over every stored text and feeds each hit to its
    # filter as a [text snippet, tagged snippet] pair.
    def find(filters)
      prog = Progress.new(:search, TaggedText.count, "", 0)
      @progress[prog] if @progress

      TaggedText.each_with_index do |t, i|
        # matches to ranges
        filters.each do |f|
          f.scan(t.fulltagged) do |hit, t_begin, t_end, m|
            s_begin = m.captures.first.to_i
            s_end = m.captures.last.to_i

            fname = File.basename(t.filename)
            tagged_snippet = Lumix::TextSnippet.new(fname, t.fulltagged, t_begin, t_end)
            text_snippet = Lumix::TextSnippet.new(fname, t.text, s_begin, s_end)
            f << [text_snippet, tagged_snippet]
          end
        end
        # i is zero-based; count the text just finished so that done reaches
        # the total on the last one.
        prog.done = i + 1
        @progress[prog] if @progress
      end
    end

    private

    # Appends one line describing an unmatched token to 'unlinked.lst'.
    # Written directly rather than through a shell, so quotes in the file name
    # or the word cannot be interpreted as shell syntax.
    def log_unlinked(file, txt_pos, tagged_begin, word)
      File.open('unlinked.lst', 'a') do |log|
        log.puts %(#{file}:#{txt_pos}:#{tagged_begin} unmatched "#{word}")
      end
    end

  end

  SearchStrategy = FastSearch
end