lumix 0.0.2-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,89 @@
1
module Lumix
  # Compiles a textual query into a Regexp and counts the hits pushed into it.
  #
  # The query string is rewritten by each method named in HANDLERS, in order,
  # and the final string is compiled with Regexp.new. +suffix+ is a regexp
  # fragment that the handlers append after each token.
  class Filter

    # Rewrite steps applied to the query, in this order, by #to_re.
    HANDLERS = %w[handle_wildcard handle_choice handle_literals
                  handle_dangling_tags handle_multiplicators ensure_wordbounds].freeze

    attr_reader :results, :filter

    # suffix::      regexp fragment appended after every token
    # filter::      the query string
    # result_proc:: optional block invoked with each result pushed via #<<
    def initialize(suffix, filter, &result_proc)
      @suffix = suffix.gsub(/\\\|/, '[\|]') # workaround to make handle_dangling_tags play nicely
      @filter = filter
      @result_proc = result_proc

      @re = to_re(filter)
      @results = 0
    end

    # Counts one more result and forwards it (splatted) to the result block.
    def <<(result)
      @results += 1
      @result_proc[*result] if @result_proc
    end

    # Scans +text+ with the compiled expression and returns the matches.
    #
    # The text is padded with one space on each side before scanning, so the
    # reported offsets are shifted back by one to refer to +text+ itself.
    # When a block is given it receives (match, begin, end, MatchData) and its
    # return value is collected instead of the raw match.
    # Returns an empty array when +text+ is nil.
    def scan(text, &block)
      results = []
      return results unless text
      (' ' + text + ' ').scan(@re) do |s|
        match = Regexp.last_match
        t_begin = match.begin(0) - 1
        t_end = match.end(0) - 1

        results << (block ? block[s, t_begin, t_end, match] : s)
      end
      results
    end

    # Runs +filter+ through every handler and compiles the result.
    # Intermediate patterns are only reported (on stderr) when Ruby runs
    # with -d / $DEBUG, so building a filter no longer writes to stdout.
    def to_re(filter)
      source = HANDLERS.inject(filter) do |pattern, handler|
        trace pattern
        trace "#{handler} -->"
        send handler, pattern
      end
      trace source
      Regexp.new(source)
    end

    # character wildcard replacement
    def handle_wildcard(re)
      re.gsub(/([^\)])\*/, '\1[^\s\|]*')
    end

    # Takes (!A B C) and transforms it into a token guarded by one negative
    # lookahead per listed alternative.
    def handle_choice(re)
      re.gsub(/\(\!([^\)]+)\)/) do
        c = Regexp.last_match(1).split.map { |t| '(?!' + t + ')' }.join
        '(?:' + c + '[^\s\|]*' + @suffix + ')'
      end
    end

    # transforms literals delimited by "" (optionally followed by |TAG);
    # spaces inside the literal become underscores
    def handle_literals(re)
      re.gsub(/\"([^\"]*)\"(?:\|(\S+?))?/) do
        str = Regexp.last_match(1)
        tag = Regexp.last_match(2) || '[^\s\|]+'
        str.gsub(/ /, '_') + '\|' + tag
      end
    end

    # add wildcard word match on tag-only search criteria
    def handle_dangling_tags(re)
      re.split(/ /).map do |s|
        if s =~ /\|[^\]]/
          s + @suffix
        else
          s.gsub(/(\(?)([^\)]+)(\S*)/, '\1[^\s\|]+\|\2' + @suffix + '\3')
        end
      end.join('\s+')
    end

    # Handles the + * ? and {} qualifiers
    def handle_multiplicators(re)
      re.gsub(/\(([^\)]+)(\)((\{[^\}]+\})|\*|\+|\?)\s?)/, '(?:\1\s\2')
    end

    def ensure_wordbounds(re)
      re # ending wordbounds is being taken of earlier
    end

    private

    # Emits a diagnostic line on stderr, but only in debug mode.
    def trace(message)
      warn message if $DEBUG
    end

  end
end
data/lib/lumix/gui.rb ADDED
@@ -0,0 +1,148 @@
1
+ require 'lumix/base'
2
+
3
+ require 'sweet'
4
+ require 'lumix/result_view'
5
+ #Sweet.set_debug
6
+
7
+
8
# Human-readable captions for the progress tasks reported by the concordancer.
Texts = {
  :search => "Searching...",
  :read   => "Importing files",
  :link   => "Linking texts"
}
# Characters cycled through by the textual activity indicator.
Indicator = ['}', ')', ']', '|', '[', '(', '{']

# Configuration file name (relative path) and its in-memory representation.
CONF = 'config.yaml'
ConfigStruct = Struct.new(:database_uri)
# Fall back to the built-in connection string when the file cannot be loaded.
CConfig = begin
  YAML.load_file(CONF)
rescue StandardError
  ConfigStruct.new('jdbc:postgresql://localhost:5432/concordancer?user=concordancer&password=concordancer')
end
14
# Serialises CConfig as YAML and writes it to the file named by CONF,
# replacing any previous content.
def save!
  File.open(CONF, 'w') { |config_file| config_file.write(CConfig.to_yaml) }
end
19
+
20
+ Sweet.app :title => 'Ruby Concordancer', :width => 800, :height => 700, :layout => :grid.conf(:numColumns => 3) do
21
# Returns the memoised Concordancer, creating it on first use with the
# configured database URI and the progress callback.
def conc
  return @conc if @conc

  @conc = Concordancer.new(CConfig.database_uri, :progress_proc => @progress_proc) # , :recreate => true)
end
24
+
25
# Progress callback handed to the concordancer. It receives an object
# responding to #task, #done and #work and updates the three progress
# widgets inside a `perform` block.
@progress_proc = proc do |progress|
  caption = Texts[progress.task] || progress.task
  perform do
    finished = progress.done == progress.work
    @p_status.text    = finished ? 'Done!' : caption
    @p_indicator.text = finished ? '' : Indicator[progress.done % Indicator.size]
    @p_bar.fraction   = finished ? 0 : progress.done.to_f / progress.work
  end
end
39
+
40
# Write the default configuration on first start. File.exist? replaces the
# deprecated File.exists? alias, which was removed in Ruby 3.2.
save! unless File.exist?(CONF)
41
+
42
# Application menu: a single "File" menu with import, export, relink,
# database reset and exit entries.
menubar do
  submenu '&File' do
    submenu '&Import...' do
      [['E&nglish texts', 'en'], ['&Romanian texts', 'ro']].each do |caption, lang|
        item(caption) { import_chooser(lang) }
      end
    end
    item '&Export findings...' do
      export_findings
    end
    separator
    item '&Relink texts' do
      relink
    end
    item '&Clear the database' do
      reconnect :recreate => true
    end
    separator
    item 'E&xit' do
      exit
    end
  end
  # Menus below are disabled in this version and kept for reference.
  # submenu 'C&orpora' do
  # @m_cat = submenu '&Category' do
  # item('Cre&ate...') { create_category }
  # item('&Import...') { import_chooser }
  # separator
  # item('&Edit...') { edit_category }
  # item('&Delete') { delete_category }
  # end
  # @m_text = submenu '&Text' do
  # item('&Reimport...') { reimport_chooser }
  # item('&Delete') { delete_text }
  # end
  # end
  # @m_stats = submenu '&Statistics' do
  # item('&Editor') { script_editor }
  # separator
  # item('&Load Script...') { load_script }
  # end
  # submenu "&Help" do
  # separator
  # item('&About') { about }
  # end
end
78
+
79
# Main window widgets, created in layout order for the 3-column grid.
tree(:grid_data => {:align => [:fill, :fill], :span => [1, 2], :grab => [true, true]})

# Query input; its block and the Search button both start a search.
@filter = edit_line('NSN NSN',
                    :grid_data => {:align => [:fill, :center], :grab => true},
                    :max_size => 40) { perform_search }
button('Search') { perform_search }

# Result table, one row per hit.
@results = table(:columns => ['Text', 'Left', 'Hit', 'Right'],
                 :sort => true,
                 :grid_data => {:align => [:fill, :fill], :span => 2, :grab => [true, true]},
                 :scroll => true)

@counter = label(:grid_data => {:span => 2, :align => :fill})

# Status line: message, progress bar and a one-character activity indicator.
@p_status = label(:grid_data => {:align => [:fill, :bottom], :grab => true})
@p_bar = progress(:width => 50, :grid_data => {:align => [:right, :bottom]})
@p_indicator = label(' ', :grid_data => {:align => [:right, :bottom]})
95
+
96
+
97
# Clears the result table and runs the query typed into @filter on a new
# thread, adding one table row per hit. Afterwards the counter and status
# labels are updated inside a `perform` block.
#
# When the query is empty nothing is searched; the counter then shows
# "0 matches" (previously it rendered a bare " matches" because the count
# was nil), matching the nil handling already present on the status line.
#
# Returns the Thread running the search.
def perform_search
  filter = @filter.text
  @results.data.clear
  Thread.new do
    found = nil
    unless filter.empty?
      puts "finding #{filter}"
      # the second block argument (tagged snippet) is not displayed
      found = conc.find(filter) do |text, _tagged|
        @results.add_hit(text.name, text.left, text.to_s, text.right)
      end
    end
    perform do
      @counter.text = "#{found || 0} matches"
      @p_status.text = "Found #{found || 'no'} matches for #{filter}"
    end
  end
end
113
+
114
# Sets the language on the concordancer's `tp` object and starts reading the
# 'raw' directory on a new thread. Returns that Thread.
def import_chooser(lang)
  conc.tp.lang = lang
  Thread.new(conc) { |worker| worker.read('raw') }
end
120
+
121
# Writes the unchecked rows of the result table to "<query>.findings" as
# tab-separated left context, hit and right context, one row per line.
#
# The table is declared with the columns Text, Left, Hit, Right, so the
# exported values come from columns 1..3. The previous code read 0..2, which
# exported the text name as "left" and dropped the right context.
# NOTE(review): assumes item.text(i) is a 0-based column index and that
# checked rows are meant to be excluded -- confirm against result_view.
def export_findings
  filename = to_filename(@filter.text) + '.findings'
  @p_status.text = "Exporting to #{filename}"
  File.open(filename, 'w') do |f|
    @results.items.each do |item|
      next if item.getChecked

      left, hit, right = (1..3).map { |column| item.text(column) }
      f.puts "#{left}\t#{hit}\t#{right}"
    end
  end
  @p_status.text = "Done! Exported to file #{filename}"
end
134
+
135
# Calls link! on the concordancer from a new thread and returns that Thread.
def relink
  Thread.new(conc) { |worker| worker.link! }
end
140
+
141
# Derives a file name stem from a query: every run of whitespace becomes a
# single underscore and the characters * . ? " are removed.
def to_filename(filter)
  underscored = filter.gsub(/\s+/, "_")
  underscored.delete('*.?"')
end
144
+
145
# Replaces the memoised concordancer with a fresh one built from the
# configured database URI.
#
# opts:: extra options for Concordancer.new (the menu passes
#        :recreate => true); the progress callback is always added.
#
# Fixes the `mergs` typo, which raised NoMethodError on every call.
def reconnect(opts = {})
  @conc = Concordancer.new(CConfig.database_uri, opts.merge(:progress_proc => @progress_proc))
end
148
+ end
@@ -0,0 +1,105 @@
1
module Lumix

  # Token index on top of a database handle: stores one row per token in
  # db[:tokens] and resolves words and tags to ids through db[:words] and
  # db[:tags].
  class Lookup
    # Collects the token rows of one text and writes them in a single batch.
    class Document
      def initialize(lookup)
        @tokens_ds = lookup.tokens
        @words = lookup.words
        @tags = lookup.tags
        @tokens = []
      end

      # Buffers one token row; word and tag are translated to ids through
      # the lookup's word and tag collections.
      def add_token(text_id, position, word, tag, s_begin, s_end, t_begin, t_end)
        @tokens << {:text_id => text_id, :position => position, :word_id => @words[word], :tag_id => @tags[tag],
                    :src_begin => s_begin, :src_end => s_end, :tagged_begin => t_begin, :tagged_end => t_end}
      end

      # Inserts all buffered rows and empties the buffer.
      def flush
        tokens, @tokens = @tokens, [] # make sure no double-flush occurs
        @tokens_ds.multi_insert tokens
      end
    end

    # Hash from a column value to its row id. It is preloaded from the
    # dataset; a missing key is created on first access and then cached.
    class LookupCollection < Hash
      def initialize(ds, column)
        @ds = ds
        @column = column
        super() { |h, k| h[k] = create(k) }

        @ds.each do |e|
          self[e[@column]] = e[:id]
        end
      end

      # Returns the id of the row holding +value+, inserting the row if it
      # does not exist. Runs inside a serializable transaction.
      def create(value)
        @ds.db.transaction(:isolation => :serializable) do
          @ds.where(@column => value).select(:id).single_value || @ds.insert(@column => value)
        end
      end
    end

    attr_reader :tokens, :db

    def initialize(db)
      @db = db
      @tokens = db[:tokens]
    end

    def tags
      # TODO create only in the context of linking
      @tags ||= LookupCollection.new(db[:tags], :tag)
    end

    def words
      @words ||= LookupCollection.new(db[:words], :word)
    end

    # Yields a fresh Document for +text_id+ unless tokens for that text are
    # already stored (then returns true without yielding). The document is
    # flushed only when the block returns a truthy value; the block's value
    # is returned.
    def process(text_id)
      return true unless tokens.where(:text_id => text_id).empty?
      doc = Document.new(self)
      result = yield(doc) if block_given?
      doc.flush if result
      result
    end

    # Ids of all words matching +re+.
    def find_word(re)
      find_ids(db[:words], :word => re)
    end

    # Ids of all tags matching +re+.
    def find_tag(re)
      find_ids(db[:tags], :tag => re)
    end

    # kindly crafted by jeremyevans
    #
    # Finds runs of consecutive tokens (same text, positions increasing by
    # one) where the n-th token satisfies the n-th filter. Each filter
    # responds to #word and #tag; a nil value means "no constraint".
    # Yields [text_id, src_begin, src_end, tagged_begin, tagged_end] per hit.
    #
    # An empty filter list yields nothing (it used to raise NoMethodError on
    # nil). The SQL and its query plan are only printed, on stderr, when Ruby
    # runs in debug mode, so a normal search no longer issues an extra
    # EXPLAIN query.
    def find(filters)
      return if filters.empty?

      ds = db[:tokens.as(:t0)]
      first = filters[0]
      ds = ds.where(:t0__word_id=>first.word) if first.word
      ds = ds.where(:t0__tag_id=>first.tag) if first.tag
      i = 0
      filters[1..-1].each do |filter|
        as = "t#{i+=1}"
        h = {}
        h[:"#{as}__word_id"] = filter.word if filter.word
        h[:"#{as}__tag_id"] = filter.tag if filter.tag
        ds = ds.join(:tokens.as(as)){ |j, lj, js| {:text_id.qualify(j) => :text_id.qualify(lj), :position.qualify(j) => :position.qualify(lj) + 1} }.where(h)
      end
      query = ds.select(:t0__text_id.as(:text_id), :t0__src_begin.as(:src_begin), :"t#{i}__src_end".as(:src_end),
                        :t0__tagged_begin.as(:tagged_begin), :"t#{i}__tagged_end".as(:tagged_end))

      if $DEBUG
        warn query.sql
        warn query.explain
      end

      query.each do |e|
        yield [e[:text_id], e[:src_begin], e[:src_end], e[:tagged_begin], e[:tagged_end]]
      end
    end

    private
    def find_ids(tbl, opts)
      tbl.where(opts).select(:id).map{|e| e[:id]}
    end

  end
end
@@ -0,0 +1,43 @@
1
module Lumix
  # A query prepared for Lookup#find: every whitespace-separated term of the
  # query becomes a Filter(word ids, tag ids) pair. Also counts the results
  # pushed into it.
  class LookupFilter

    Filter = Struct.new(:word, :tag)

    attr_reader :results, :filter

    # lookup::      object answering find_word(re) and find_tag(re)
    # filter::      the query string
    # result_proc:: optional block invoked with each result pushed via #<<
    def initialize(lookup, filter, &result_proc)
      @filter, @result_proc = filter, result_proc

      @filters = create_filters(lookup, filter)
      @results = 0
    end

    # Counts one more result and forwards it (splatted) to the result block.
    def <<(result)
      @results += 1
      return unless @result_proc

      @result_proc[*result]
    end

    # Runs the prepared filters through lookup.find and passes every
    # non-nil range (splatted) to the block.
    def apply(lookup, &block)
      lookup.find(@filters) do |range|
        next unless block && range

        block[*range]
      end
    end

    # Splits the query into terms; a "quoted" part is the word, an unquoted
    # part is the tag. Each part is turned into a regexp and resolved to ids.
    def create_filters(lookup, filter)
      filter.scan(/(?:(?:\"([^\"]+)\")|(\S+))+/).map do |word, tag|
        word_re, tag_re = to_re(word), to_re(tag)
        Filter.new(word_re && lookup.find_word(word_re),
                   tag_re && lookup.find_tag(tag_re))
      end
    end

    # Converts a term into an anchored Regexp: whitespace becomes "_",
    # "*" becomes \S* and "?" becomes \S. Returns nil for nil or "".
    def to_re(txt)
      return nil if txt.to_s.empty?

      pattern = txt.gsub(/\s/, '_').gsub(/\*/, '\S*').gsub(/\?/, '\S')
      Regexp.new("^#{pattern}$")
    end

  end
end
@@ -0,0 +1,95 @@
1
+ require 'lumix/lookup_filter'
2
+ require 'lumix/text_snippet'
3
+ require 'lumix/lookup'
4
+
5
module Lumix

  # Search strategy backed by Lookup: links the tokens of a tagged text to
  # their offsets in the source text and answers queries from the token
  # index.
  class LookupSearch

    TAGGED = /([^\s\|]+)\|(\S+)/m # Xxx|YYY

    # File (relative to the working directory) collecting unmatched words.
    UNLINKED_LOG = 'unlinked.lst'

    def initialize(db, progress)
      @lookup = Lookup.new(db)
      @progress = progress
    end

    def concurrent_link?
      true
    end

    # After this call #link_text still walks the text but stores no tokens.
    def simulate!
      @simulate = true
    end

    # Aligns every word|TAG token of the tagged text +id+ with the source
    # text and records one token per match.
    #
    # Returns true on success and nil as soon as a word cannot be located in
    # the source text; the miss is reported on stderr and appended to
    # UNLINKED_LOG. Exceptions are printed to stderr and re-raised.
    def link_text(id)
      ds = TaggedText[id]
      @lookup.process id do |doc|
        result = true

        file, text, tagged = ds.filename, ds.text, ds.tagged

        puts "Linking text #{file}"

        txt_pos = 0
        position = 0
        tagged.scan(TAGGED) do |word, tag|
          # capture the offsets first: the matches below replace the
          # current MatchData
          tagged_match = Regexp.last_match
          tagged_begin = tagged_match.begin(0)
          tagged_end = tagged_match.end(0)

          # expand "x_y_z" notation to "x y z"
          word_re = Regexp.new(Regexp.escape(word).gsub(/\_/, '\s+'))
          src_match = text[txt_pos..-1].match(word_re) # find the word
          if src_match
            src_begin = txt_pos + src_match.begin(0)
            src_end = txt_pos + src_match.end(0)
            txt_pos = src_end

            unless @simulate
              doc.add_token(id, position, word, tag, src_begin, src_end, tagged_begin, tagged_end)
            end
          else
            report_unmatched(file, text, txt_pos, tagged_begin, word)
            result = nil
            break
          end
          position += 1
        end
        result
      end
    rescue => e # TODO remove this crap
      STDERR.puts e
      STDERR.puts e.backtrace
      raise e
    end

    def create_filter(f, &block)
      Lumix::LookupFilter.new(@lookup, f, &block)
    end

    # Applies every given filter on a Pool of size 4 and pushes a
    # [source snippet, tagged snippet] pair into the filter for each hit.
    # The TaggedText is reloaded only when the text id changes between
    # consecutive hits. Returns the value of Pool#shutdown.
    def find(*filters, &block)
      p = Pool.new(4)
      filters.flatten.each do |f|
        p.schedule do
          last_id = -1
          t = nil
          f.apply(@lookup) do |text_id, s_begin, s_end, t_begin, t_end|
            t = TaggedText[text_id] if text_id != last_id
            last_id = text_id

            fname = File.basename(t.filename)
            text_snippet = Lumix::TextSnippet.new(fname, t.text, s_begin, s_end)
            tagged_snippet = Lumix::TextSnippet.new(fname, t.tagged, t_begin, t_end)
            f << [text_snippet, tagged_snippet]
          end
        end
      end
      p.shutdown
    end

    private

    # Reports a word that could not be located in the source text.
    #
    # The log line is appended with File.open instead of shelling out to
    # `echo`, so quotes or shell metacharacters in the word or file name can
    # neither break the command nor be executed. The context slice is
    # clamped at 0; a negative start index would count from the end of the
    # text.
    def report_unmatched(file, text, txt_pos, tagged_begin, word)
      STDERR.puts "Could not find match for '#{word}' in text #{file}"
      context_start = [txt_pos - 10, 0].max
      STDERR.puts text[context_start..(txt_pos + word.size + 10)]
      File.open(UNLINKED_LOG, 'a') do |log|
        log.puts %(#{file}:#{txt_pos}:#{tagged_begin} unmatched "#{word}")
      end
    end

  end

  SearchStrategy = LookupSearch
end
data/lib/lumix/main.rb ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env jruby
2
+
3
+ $: << File.join(File.dirname(__FILE__), '..')
4
+ $: << File.join(File.dirname(__FILE__), '../../../Sweet/lib')
5
+
6
+ require 'rubygems'
7
+ require 'lumix/gui'
@@ -0,0 +1,35 @@
1
# Backend-independent part of the TaggedText model. The storage methods
# (save, save_new, table, each, ...) are not defined here; update and create
# rely on them being provided elsewhere.
class TaggedText
  # Despite its name this module is mixed in with `extend`, so its methods
  # become class-level methods of TaggedText.
  module InstanceMethods
    include Enumerable

    # Builds a record from +attrs+ and stores it via save_new.
    def create(attrs)
      new(attrs).save_new
    end

    private
    # Defines a reader and a writer per name, both backed by the @attrs hash.
    def accessor(*names)
      names.each do |name|
        define_method(name) { @attrs[name] }
        define_method("#{name}=") { |v| @attrs[name] = v }
      end
    end
  end
  extend InstanceMethods

  # attrs:: attribute hash; the :id entry (if any) becomes #id. The hash is
  #         copied, so the caller's hash is no longer modified.
  def initialize(attrs)
    @attrs = attrs.dup
    @id = @attrs.delete(:id)
  end
  attr_reader :id
  accessor :text, :tagged, :fulltagged, :filename, :tagged_filename, :digest

  # Merges +attrs+ into the record's attributes and saves it.
  # Uses merge!: the previous non-mutating merge discarded its result, so
  # the new values were never applied before saving.
  def update(attrs)
    @attrs.merge!(attrs)
    save
  end

end
@@ -0,0 +1,42 @@
1
+ require 'lumix/model/base_models'
2
+
3
# Maglev-backed storage for TaggedText. Only save, save_new and each have a
# body; the lookup methods are empty placeholders that return nil, and the
# class-level `table` used here is not defined in this file.
class TaggedText

  # Persists pending changes by committing the Maglev transaction.
  def save
    Maglev.commit_transaction
  end

  # Appends this record to the class-level table. `table` is a class method
  # (see #each below), so it has to be reached through self.class; calling
  # self.table on the instance raised NoMethodError.
  def save_new
    self.class.table << self
  end

  class << self

    # Iterates over all stored records.
    def each(&block)
      table.each(&block)
    end

    # Placeholder: no lookup is implemented yet, every key yields nil.
    def [](key)
      case key
      when Hash
        # find by values

      when Integer
        # find by id
      when String
        # find by filename
      end
    end

    # Placeholder, returns nil.
    def exists?(attrs)
    end

    # Placeholder, returns nil.
    def ids
    end

    # Placeholder, returns nil.
    def count
    end

  end

end
@@ -0,0 +1,46 @@
1
+ require 'lumix/model/base_models'
2
+
3
# In-memory storage for TaggedText: records live in one Array shared through
# a class variable. Only lookup by Integer is implemented.
class TaggedText

  # The shared record list, created on first use.
  def self.table
    @@table ||= []
  end

  # Iterates over all stored records.
  def self.each(&block)
    table.each(&block)
  end

  # Integer keys index into the table. Hash keys (find by values) and String
  # keys (find by filename) are not implemented and yield nil, as does any
  # other key type.
  def self.[](key)
    key.is_a?(Integer) ? table[key] : nil
  end

  # Not implemented, returns nil.
  def self.exists?(attrs)
  end

  # Not implemented, returns nil.
  def self.ids
  end

  # Not implemented, returns nil.
  def self.count
  end

  # Nothing to do: the record in the table is this very object.
  def save
    # data aware ;)
  end

  # Appends this record to the shared table.
  def save_new
    self.class.table << self
  end

end
@@ -0,0 +1,53 @@
1
+ require 'lumix/model/base_models'
2
+
3
# Database-backed storage for TaggedText, using the :texts table of the
# handle assigned to TaggedText.db.
class TaggedText

  class << self
    attr_accessor :db
  end

  # The dataset holding all texts.
  def self.table
    db[:texts]
  end

  # Schedules the block once per stored record on a Pool of size 4 and
  # returns the value of Pool#shutdown.
  def self.each(&block)
    workers = Pool.new(4)
    table.select(:id).each do |row|
      workers.schedule { block.call(self[row[:id]]) }
    end
    workers.shutdown
  end

  # Finds one record: a Hash is used as the condition itself, an Integer is
  # matched against :id and a String against :filename. Returns nil for any
  # other key type or when no row matches.
  def self.[](key)
    conditions =
      case key
      when Hash    then key
      when Integer then {:id => key}
      when String  then {:filename => key}
      end
    row = conditions && table[conditions]
    row ? new(row) : nil
  end

  # True when at least one row matches +attrs+.
  def self.exists?(attrs)
    !table.where(attrs).count.zero?
  end

  # Ids of all stored texts.
  def self.ids
    table.select(:id).map { |row| row[:id] }
  end

  # Number of stored texts.
  def self.count
    table.count
  end

  # Writes the current attributes to the row with this record's id.
  def save
    self.class.table.where(:id => @id).update(@attrs)
  end

  # Inserts the attributes as a new row and remembers the returned id.
  def save_new
    @id = self.class.table.insert(@attrs)
  end

end