lumix 0.0.2-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,89 @@
1
module Lumix
  # Compiles a search expression into a Regexp and scans text with it.
  #
  # The expression is rewritten by every entry of HANDLERS, in order; each
  # handler receives the expression string produced by the previous one. The
  # generated patterns match tokens of the form <word>|<tag>.
  class Filter

    # Rewrite steps applied, in this order, by #to_re.
    HANDLERS = %w[handle_wildcard handle_choice handle_literals
                  handle_dangling_tags handle_multiplicators ensure_wordbounds].freeze

    attr_reader :results, :filter

    # suffix      - regexp source appended after every token pattern
    # filter      - the search expression to compile
    # result_proc - optional block invoked by #<< for every reported result
    def initialize(suffix, filter, &result_proc)
      @suffix = suffix.gsub(/\\\|/, '[\|]') # workaround to make handle_dangling_tags play nicely
      @filter = filter
      @result_proc = result_proc

      @re = to_re(filter)
      @results = 0
    end

    # Counts one result and forwards it (splatted) to the result block, if any.
    def <<(result)
      @results += 1
      @result_proc[*result] if @result_proc
    end

    # Scans +text+ with the compiled expression.
    #
    # Returns an array with one entry per match: the scan result itself or,
    # when a block is given, the value of block[match, t_begin, t_end, match_data].
    # Returns an empty array when +text+ is nil.
    def scan(text, &block)
      hits = []
      return hits unless text
      # The text is padded with one blank on each side, hence the "- 1" below
      # to translate match offsets back into offsets of the unpadded text.
      (' ' + text + ' ').scan(@re) do |s|
        match = Regexp.last_match
        t_begin = match.begin(0) - 1
        t_end = match.end(0) - 1

        hits << (block ? block[s, t_begin, t_end, match] : s)
      end
      hits
    end

    # Runs +filter+ through all HANDLERS and compiles the result.
    # Intermediate expressions are printed only when Ruby runs with $DEBUG set.
    def to_re(filter)
      source = HANDLERS.inject(filter) do |expr, handler|
        trace expr
        trace "#{handler} -->"
        send(handler, expr)
      end
      trace source
      Regexp.new(source)
    end

    # character wildcard replacement
    def handle_wildcard(re)
      re.gsub(/([^\)])\*/, '\1[^\s\|]*')
    end

    # Takes (!A B C) and transforms it into a group of negative lookaheads,
    # one per listed alternative, followed by a generic token pattern.
    def handle_choice(re)
      re.gsub(/\(\!([^\)]+)\)/) do
        c = $1.split.map{ |t| '(?!' + t + ')' }.join
        '(?:' + c + '[^\s\|]*' + @suffix + ')'
      end
    end

    # transforms literals delimited by ""; blanks inside a literal become "_"
    # and a missing |tag part is replaced by a match-any-tag pattern
    def handle_literals(re)
      re.gsub(/\"([^\"]*)\"(?:\|(\S+?))?/) do
        str = $1
        tag = $2 || '[^\s\|]+'
        str.gsub(/ /, '_') + '\|' + tag
      end
    end

    # add wildcard word match on tag-only search criteria
    def handle_dangling_tags(re)
      re.split(/ /).map do |s|
        if s =~ /\|[^\]]/
          s + @suffix
        else
          s.gsub(/(\(?)([^\)]+)(\S*)/, '\1[^\s\|]+\|\2' + @suffix + '\3')
        end
      end.join('\s+')
    end

    # Handles the + * ? and {} qualifiers
    def handle_multiplicators(re)
      re.gsub(/\(([^\)]+)(\)((\{[^\}]+\})|\*|\+|\?)\s?)/, '(?:\1\s\2')
    end

    def ensure_wordbounds(re)
      re # ending word bounds are taken care of earlier
    end

    private

    # Debug tracing of the rewrite pipeline; silent unless $DEBUG is set.
    def trace(message)
      puts message if $DEBUG
    end

  end
end
data/lib/lumix/gui.rb ADDED
@@ -0,0 +1,148 @@
1
+ require 'lumix/base'
2
+
3
+ require 'sweet'
4
+ require 'lumix/result_view'
5
+ #Sweet.set_debug
6
+
7
+
8
# Status-line captions, keyed by the task symbol reported through the progress callback.
Texts = {:search => "Searching...", :read => "Importing files", :link => "Linking texts"}.freeze
# Characters cycled through to animate the activity indicator.
Indicator = %w'} ) ] | [ ( {'.freeze

# Configuration file, resolved relative to the current working directory.
CONF = 'config.yaml'
ConfigStruct = Struct.new(:database_uri)
# Connection string used when no usable configuration file is found.
DEFAULT_DATABASE_URI = 'jdbc:postgresql://localhost:5432/concordancer?user=concordancer&password=concordancer'
# Loaded configuration. Falls back to the default both when loading raises a
# StandardError (e.g. missing file) and when the file is empty, in which case
# YAML.load_file returns a falsy value.
CConfig = begin
  YAML.load_file(CONF) || ConfigStruct.new(DEFAULT_DATABASE_URI)
rescue StandardError
  ConfigStruct.new(DEFAULT_DATABASE_URI)
end
14
# Writes the current configuration (CConfig) as YAML to the file named by
# CONF, replacing any previous content.
def save!
  File.open(CONF, 'w') { |config_file| config_file.write(CConfig.to_yaml) }
end
19
+
20
# Main window of the concordancer: a menu bar, a tree, a filter input with a
# search button, a result table, a match counter and a progress area.
Sweet.app :title => 'Ruby Concordancer', :width => 800, :height => 700, :layout => :grid.conf(:numColumns => 3) do
  # Lazily created Concordancer for the configured database.
  def conc
    @conc ||= Concordancer.new(CConfig.database_uri, :progress_proc => @progress_proc)#, :recreate => true)
  end

  # Progress callback handed to the Concordancer; updates the status label,
  # the activity indicator and the progress bar inside `perform`.
  @progress_proc = proc do |p|
    task = Texts[p.task] || p.task
    perform do
      if p.done == p.work
        @p_status.text = 'Done!'
        @p_indicator.text = ''
        @p_bar.fraction = 0
      else
        @p_status.text = task
        @p_indicator.text = Indicator[p.done % Indicator.size]
        @p_bar.fraction = p.done.to_f / p.work
      end
    end
  end

  # Write the configuration on first start so it can be edited afterwards.
  # File.exist? is used because File.exists? no longer exists in Ruby 3.2+.
  save! unless File.exist?(CONF)

  menubar do
    submenu '&File' do
      submenu '&Import...' do
        item('E&nglish texts') { import_chooser('en') }
        item('&Romanian texts') { import_chooser('ro') }
      end
      item('&Export findings...') { export_findings }
      separator
      item('&Relink texts') { relink }
      item('&Clear the database') { reconnect :recreate => true }
      separator
      item('E&xit') { exit }
    end
    #    submenu 'C&orpora' do
    #      @m_cat = submenu '&Category' do
    #        item('Cre&ate...') { create_category }
    #        item('&Import...') { import_chooser }
    #        separator
    #        item('&Edit...') { edit_category }
    #        item('&Delete') { delete_category }
    #      end
    #      @m_text = submenu '&Text' do
    #        item('&Reimport...') { reimport_chooser }
    #        item('&Delete') { delete_text }
    #      end
    #    end
    #    @m_stats = submenu '&Statistics' do
    #      item('&Editor') { script_editor }
    #      separator
    #      item('&Load Script...') { load_script }
    #    end
    #    submenu "&Help" do
    #      separator
    #      item('&About') { about }
    #    end
  end

  tree :grid_data => {:align => [:fill, :fill], :span => [1, 2], :grab => [true, true]}

  @filter = edit_line 'NSN NSN', :grid_data => {:align => [:fill, :center], :grab => true}, :max_size => 40 do
    perform_search
  end
  button 'Search' do
    perform_search
  end

  @results = table :columns => %w[Text Left Hit Right], :sort => true, :grid_data => {:align => [:fill, :fill], :span => 2, :grab => [true, true]}, :scroll => true

  @counter = label :grid_data => {:span => 2, :align => :fill}

  @p_status = label(:grid_data => {:align => [:fill, :bottom], :grab => true})
  @p_bar = progress(:width => 50, :grid_data => {:align => [:right, :bottom]})
  @p_indicator = label(' ', :grid_data => {:align => [:right, :bottom]})


  # Clears the result table and runs the search for the current filter text
  # on a background thread, then reports the number of matches.
  def perform_search
    filter = @filter.text
    @results.data.clear
    Thread.new do
      found = nil # stays nil when the filter is empty and no search is run
      unless filter.empty?
        puts "finding #{filter}"
        found = conc.find(filter) do |text, tagged|
          @results.add_hit(text.name, text.left, text.to_s, text.right)
        end
      end
      perform do
        @counter.text = "#{found || 0} matches"
        @p_status.text = "Found #{found || 'no'} matches for #{filter}"
      end
    end
  end

  # Sets the language on the concordancer's `tp` object and reads the 'raw'
  # source on a background thread.
  def import_chooser(lang)
    conc.tp.lang = lang
    Thread.new(conc) do |conc|
      conc.read('raw')
    end
  end

  # Writes the unchecked result rows, tab-separated, to "<filter>.findings".
  def export_findings
    filename = to_filename(@filter.text) + '.findings'
    @p_status.text = "Exporting to #{filename}"
    File.open(filename, 'w') do |f|
      @results.items.each do |item|
        unless item.getChecked
          # NOTE(review): the table columns are Text, Left, Hit, Right, so
          # indexes 0..2 may select Text/Left/Hit rather than Left/Hit/Right -
          # confirm against the result view before changing.
          left, hit, right = (0..2).map{ |i| item.text(i) }
          f.puts "#{left}\t#{hit}\t#{right}"
        end
      end
    end
    @p_status.text = "Done! Exported to file #{filename}"
  end

  # Re-links all texts on a background thread.
  def relink
    Thread.new(conc) do |conc|
      conc.link!
    end
  end

  # Turns a filter expression into a file name: whitespace runs become "_",
  # the characters * . ? " are removed.
  def to_filename(filter)
    filter.gsub(/\s+/, "_").gsub(/[\*\.\?\"]/, '')
  end

  # Replaces the concordancer with a fresh one built with +opts+
  # (e.g. :recreate => true) plus the progress callback.
  def reconnect(opts = {})
    @conc = Concordancer.new(CConfig.database_uri, opts.merge(:progress_proc => @progress_proc))
  end
end
@@ -0,0 +1,105 @@
1
module Lumix

  # Token index on top of a database handle: stores the tokens of linked
  # texts in the :tokens table and finds token sequences in it. Words and
  # tags are referenced by id through the :words and :tags tables.
  class Lookup
    # Collects the token rows of one text and writes them in one batch.
    class Document
      def initialize(lookup)
        @tokens_ds = lookup.tokens
        @words = lookup.words
        @tags = lookup.tags
        @tokens = []
      end

      # Queues one token row; +word+ and +tag+ are translated to their ids
      # through the lookup's word and tag collections.
      def add_token(text_id, position, word, tag, s_begin, s_end, t_begin, t_end)
        @tokens << {:text_id => text_id, :position => position, :word_id => @words[word], :tag_id => @tags[tag],
          :src_begin => s_begin, :src_end => s_end, :tagged_begin => t_begin, :tagged_end => t_end}
      end

      # Inserts all queued rows and empties the queue.
      def flush
        pending, @tokens = @tokens, [] # make sure no double-flush occurs
        @tokens_ds.multi_insert pending
      end
    end

    # Hash from a column value to the row id. It is pre-filled from the
    # dataset; a missing key is looked up or inserted on first access.
    class LookupCollection < Hash
      def initialize(ds, column)
        @ds = ds
        @column = column
        super(){ |h,k| h[k] = create(k) }

        @ds.each do |e|
          self[e[@column]] = e[:id]
        end
      end

      # Returns the id of the row holding +value+, inserting the row first
      # when it does not exist yet.
      def create(value)
        @ds.db.transaction(:isolation => :serializable) do
          @ds.where(@column => value).select(:id).single_value || @ds.insert(@column => value)
        end
      end
    end

    attr_reader :tokens, :db

    def initialize(db)
      puts "Lookup" if $DEBUG
      @db = db
      @tokens = db[:tokens]
    end

    def tags
      # TODO create only in the context of linking
      @tags ||= LookupCollection.new(db[:tags], :tag)
    end

    def words
      @words ||= LookupCollection.new(db[:words], :word)
    end

    # Yields a fresh Document for +text_id+ and flushes it when the block
    # returns a truthy value. Returns true without yielding when tokens for
    # the text already exist; otherwise returns the block's value.
    def process(text_id)
      return true unless tokens.where(:text_id => text_id).empty?
      doc = Document.new(self)
      result = yield(doc) if block_given?
      doc.flush if result
      result
    end

    def find_word(re)
      find_ids(db[:words], :word => re)
    end

    def find_tag(re)
      find_ids(db[:tags], :tag => re)
    end

    # kindly crafted by jeremyevans
    #
    # Finds runs of consecutive tokens of one text where the n-th token
    # satisfies the n-th filter (objects responding to #word and #tag; a nil
    # value means "no constraint"). Yields
    # [text_id, src_begin, src_end, tagged_begin, tagged_end] per hit.
    # Does nothing and returns nil for an empty filter list.
    def find(filters)
      return if filters.empty?

      ds = db[:tokens.as(:t0)]
      first = filters[0]
      ds = ds.where(:t0__word_id=>first.word) if first.word
      ds = ds.where(:t0__tag_id=>first.tag) if first.tag
      i = 0
      filters[1..-1].each do |f|
        as = "t#{i+=1}"
        h = {}
        h[:"#{as}__word_id"] = f.word if f.word
        h[:"#{as}__tag_id"] = f.tag if f.tag
        # self-join: same text, position directly after the previously joined token
        ds = ds.join(:tokens.as(as)){ |j, lj, js| {:text_id.qualify(j) => :text_id.qualify(lj), :position.qualify(j) => :position.qualify(lj) + 1} }.where(h)
      end
      query = ds.select(:t0__text_id.as(:text_id), :t0__src_begin.as(:src_begin), :"t#{i}__src_end".as(:src_end),
        :t0__tagged_begin.as(:tagged_begin), :"t#{i}__tagged_end".as(:tagged_end))

      # Printing the plan issues an additional EXPLAIN query, so it is only
      # done when Ruby runs with $DEBUG set.
      if $DEBUG
        puts query.sql
        puts query.explain
      end

      query.each do |e|
        yield [e[:text_id], e[:src_begin], e[:src_end], e[:tagged_begin], e[:tagged_end]]
      end
    end

    private
    # Ids of all rows of +tbl+ matching the +opts+ conditions.
    def find_ids(tbl, opts)
      tbl.where(opts).select(:id).map{|e| e[:id]}
    end

  end
end
@@ -0,0 +1,43 @@
1
module Lumix
  # A search expression translated into per-token word/tag id constraints
  # that can be applied to a lookup.
  class LookupFilter

    attr_reader :results, :filter

    # One constraint per token: ids of matching words and/or tags (nil = any).
    Filter = Struct.new(:word, :tag)

    # lookup      - object answering #find_word and #find_tag
    # filter      - the search expression
    # result_proc - optional block called by #<< for each reported result
    def initialize(lookup, filter, &result_proc)
      @filter = filter
      @result_proc = result_proc

      @filters = create_filters(lookup, filter)
      @results = 0
    end

    # Counts a result and passes it (splatted) on to the result block, if any.
    def <<(result)
      @results += 1
      return unless @result_proc
      @result_proc[*result]
    end

    # Runs the constraints against +lookup+ and calls +block+ with the
    # elements of every non-nil range the lookup yields.
    def apply(lookup, &block)
      lookup.find(@filters) do |range|
        block[*range] if block && range
      end
    end

    # Splits the expression into tokens; a double-quoted token is treated as
    # a word pattern, any other run of non-blank characters as a tag pattern.
    def create_filters(lookup, filter)
      tokens = filter.scan(/(?:(?:\"([^\"]+)\")|(\S+))+/)
      tokens.map do |word, tag|
        word_pattern = to_re(word)
        tag_pattern = to_re(tag)
        word_ids = word_pattern ? lookup.find_word(word_pattern) : nil
        tag_ids = tag_pattern ? lookup.find_tag(tag_pattern) : nil
        Filter.new(word_ids, tag_ids)
      end
    end

    # Converts a wildcard pattern into an anchored Regexp: whitespace becomes
    # "_", "*" any run of non-blanks, "?" a single non-blank character.
    # Returns nil for nil or empty input.
    def to_re(txt)
      return nil if txt.nil? || txt.empty?
      body = txt.gsub(/\s/, '_')
      body = body.gsub(/\*/, '\S*')
      body = body.gsub(/\?/, '\S')
      Regexp.new('^' + body + '$')
    end

  end
end
@@ -0,0 +1,95 @@
1
+ require 'lumix/lookup_filter'
2
+ require 'lumix/text_snippet'
3
+ require 'lumix/lookup'
4
+
5
module Lumix

  # Search strategy backed by Lookup: links tagged texts to their source
  # texts token by token and answers searches from the resulting index.
  class LookupSearch

    TAGGED = /([^\s\|]+)\|(\S+)/m # Xxx|YYY

    # File (relative to the working directory) that collects one line per
    # tagged word that could not be located in its source text.
    UNLINKED_LOG = 'unlinked.lst'

    def initialize(db, progress)
      @lookup = Lookup.new(db)
      @progress = progress
    end

    def concurrent_link?
      true
    end

    # After this call #link_text still walks the texts but records no tokens.
    def simulate!
      @simulate = true
    end

    # Links the text with the given id: every word|tag pair of the tagged
    # version is located in the source text, in order, and recorded with its
    # offsets in both versions.
    #
    # Returns the value of Lookup#process: true on success (or when the text
    # is already linked), nil when a word could not be found, in which case
    # the failure is appended to UNLINKED_LOG and nothing is stored.
    def link_text(id)
      ds = TaggedText[id]
      @lookup.process id do |doc|
        result = true

        file, text, tagged = ds.filename, ds.text, ds.tagged

        puts "Linking text #{file}"

        txt_pos = 0
        position = 0
        tagged.scan(TAGGED) do |word, tag|
          match = Regexp.last_match
          tagged_begin = match.begin(0)
          tagged_end = match.end(0)

          # expand "x_y_z" notation to "x y z"
          word_re = Regexp.new(Regexp.escape(word).gsub(/\_/, '\s+'))
          src_match = text[txt_pos..-1].match(word_re) # find the word
          unless src_match
            report_unlinked(file, text, txt_pos, tagged_begin, word)
            result = nil
            break
          end

          src_begin = txt_pos + src_match.begin(0)
          src_end = txt_pos + src_match.end(0)
          txt_pos = src_end

          unless @simulate
            doc.add_token(id, position, word, tag, src_begin, src_end, tagged_begin, tagged_end)
          end
          position += 1
        end
        result
      end
    rescue => e # TODO remove this crap
      STDERR.puts e
      STDERR.puts e.backtrace
      raise
    end

    def create_filter(f, &block)
      Lumix::LookupFilter.new(@lookup, f, &block)
    end

    # Applies every filter on a pool of 4 workers; each hit is pushed to its
    # filter as [source snippet, tagged snippet]. Returns Pool#shutdown's value.
    def find(*filters, &block)
      pool = Pool.new(4)
      filters.flatten.each do |f|
        pool.schedule do
          last_id = -1
          t = nil
          f.apply(@lookup) do |text_id, s_begin, s_end, t_begin, t_end|
            # reload the text only when a hit belongs to a different text than the previous one
            t = TaggedText[text_id] if text_id != last_id
            last_id = text_id

            fname = File.basename(t.filename)
            text_snippet = Lumix::TextSnippet.new(fname, t.text, s_begin, s_end)
            tagged_snippet = Lumix::TextSnippet.new(fname, t.tagged, t_begin, t_end)
            f << [text_snippet, tagged_snippet]
          end
        end
      end
      pool.shutdown
    end

    private

    # Reports a word that could not be located: prints the surrounding source
    # text to STDERR and appends a line to UNLINKED_LOG. The line is written
    # with File I/O, not through a shell, so quotes or shell metacharacters
    # in the file name or word cannot break or inject a command.
    def report_unlinked(file, text, txt_pos, tagged_begin, word)
      STDERR.puts "Could not find match for '#{word}' in text #{file}"
      # clamp at 0: a negative start would index from the end of the text
      context_start = [txt_pos - 10, 0].max
      STDERR.puts text[context_start..(txt_pos + word.size + 10)]
      File.open(UNLINKED_LOG, 'a') do |log|
        log.puts "#{file}:#{txt_pos}:#{tagged_begin} unmatched \"#{word}\""
      end
    end

  end

  SearchStrategy = LookupSearch
end
data/lib/lumix/main.rb ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env jruby
2
+
3
# Make the parent directory of this file and a sibling checkout of the Sweet
# library loadable (both paths are relative to this file's directory).
%w[.. ../../../Sweet/lib].each do |relative|
  $LOAD_PATH << File.join(File.dirname(__FILE__), relative)
end
5
+
6
+ require 'rubygems'
7
+ require 'lumix/gui'
@@ -0,0 +1,35 @@
1
# Backend-independent part of the text model. A backend file reopens this
# class and supplies #save, #save_new and the class-level lookup methods.
class TaggedText
  # NOTE: despite its name, this module is mixed in with `extend` below, so
  # its methods are class-level methods of TaggedText. Enumerable relies on a
  # class-level `each`, which this file does not define.
  module InstanceMethods
    include Enumerable

    # Builds a record from +attrs+ and stores it; returns what #save_new returns.
    def create(attrs)
      new(attrs).save_new
    end

    private
    # Defines a reader and a writer backed by the @attrs hash for every name.
    def accessor(*names)
      names.each do |name|
        define_method name do
          @attrs[name]
        end
        define_method "#{name}=" do |v|
          @attrs[name] = v
        end
      end
    end
  end
  extend InstanceMethods

  # attrs - Hash of attribute values; an :id entry becomes #id and is kept
  #         out of the attribute hash. The hash is copied, so the caller's
  #         hash is left untouched.
  def initialize(attrs)
    @attrs = attrs.dup
    @id = @attrs.delete(:id)
  end
  attr_reader :id
  accessor :text, :tagged, :fulltagged, :filename, :tagged_filename, :digest

  # Merges +attrs+ into this record's attributes and saves it.
  # (merge! is required here: a plain merge would discard the new values.)
  def update(attrs)
    @attrs.merge!(attrs)
    save
  end

end
@@ -0,0 +1,42 @@
1
+ require 'lumix/model/base_models'
2
+
3
# Maglev backend for TaggedText. Work in progress: the lookup methods below
# are placeholders that return nil, and no class-level `table` is defined in
# this file.
class TaggedText

  # Persists pending changes by committing the Maglev transaction.
  def save
    Maglev.commit_transaction
  end

  # Appends this record to the class-level table (the table is held by the
  # class, not by the instance).
  def save_new
    self.class.table << self
  end

  class << self

    def each(&block)
      table.each(&block)
    end

    # TODO: not implemented yet - every branch currently returns nil.
    def [](key)
      case key
      when Hash
        # find by values

      when Integer
        # find by id
      when String
        # find by filename
      end
    end

    # TODO: not implemented yet (returns nil).
    def exists?(attrs)
    end

    # TODO: not implemented yet (returns nil).
    def ids
    end

    # TODO: not implemented yet (returns nil).
    def count
    end

  end

end
@@ -0,0 +1,46 @@
1
+ require 'lumix/model/base_models'
2
+
3
# In-memory backend for TaggedText: records live in a plain Array held by the
# class. Only lookup by Integer is implemented; the other lookups are
# placeholders that return nil.
class TaggedText

  def save
    # data aware ;)
  end

  # Appends this record to the class-level table.
  def save_new
    self.class.table << self
  end

  class << self

    # The Array holding all records. It is kept in a class-level instance
    # variable rather than a class variable, so it is not shared implicitly
    # across an inheritance tree.
    def table
      @table ||= []
    end

    def each(&block)
      table.each(&block)
    end

    # Integer keys are positions in the table; Hash and String lookups are
    # not implemented yet and return nil.
    def [](key)
      case key
      when Hash
        # find by values

      when Integer
        table[key]
      when String
        # find by filename
      end
    end

    # TODO: not implemented yet (returns nil).
    def exists?(attrs)
    end

    # TODO: not implemented yet (returns nil).
    def ids
    end

    # TODO: not implemented yet (returns nil).
    def count
    end

  end

end
@@ -0,0 +1,53 @@
1
+ require 'lumix/model/base_models'
2
+
3
# Database backend for TaggedText: records are rows of the :texts table of
# the dataset provider assigned to TaggedText.db.
class TaggedText

  # Writes the current attributes to the row with this record's id.
  def save
    self.class.table.where(:id => @id).update(@attrs)
  end

  # Inserts the attributes as a new row and remembers the returned id.
  def save_new
    @id = self.class.table.insert(@attrs)
  end

  class << self
    attr_accessor :db

    # Calls +block+ with every record. Records are loaded and passed to the
    # block through a pool of 4 workers; returns what Pool#shutdown returns.
    def each(&block)
      pool = Pool.new(4)
      table.select(:id).each do |row|
        pool.schedule{block.call self[row[:id]]}
      end
      pool.shutdown
    end

    def table
      db[:texts]
    end

    # Looks up one record by a Hash of column values, an Integer id or a
    # String filename. Returns nil when nothing matches or the key is of
    # another type.
    def [](key)
      data = case key
      when Hash
        table[key]
      when Integer
        table[:id => key]
      when String
        table[:filename => key]
      end
      new data if data
    end

    # True when at least one row matches +attrs+. Uses empty? so the
    # matching rows do not have to be counted.
    def exists?(attrs)
      !table.where(attrs).empty?
    end

    def ids
      table.select(:id).map{|v| v[:id]}
    end

    def count
      table.count
    end

  end

end