lumix 0.0.2-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,105 @@
1
+ require 'msgpack'
2
+
3
module Lumix

  # Field positions inside a token record as stored by Lookup#add_token:
  # [text_id, s_begin, s_end, t_begin, t_end]
  TEXT_ID = 0
  S_BEGIN = 1
  S_END = 2
  T_BEGIN = 3
  T_END = 4

  # In-memory index mapping words and tags to token ids, persisted as one
  # MessagePack file per collection ("tags.dat", "words.dat", ...).
  #
  # Creating an instance installs an INT trap and an at_exit hook so that
  # modified data is written back when the process terminates.
  class Lookup
    # Names of the persisted collections; each one lives in "<name>.dat".
    STORES = [:tags, :words, :texts, :tokens].freeze

    def initialize
      puts "Lookup"
      @tags = {}   # tag  => token_id[]
      @words = {}  # word => token_id[]
      @tokens = [] # :text_id, :s_begin, :s_end, :t_begin, :t_end
      @texts = []  # text_id
      Signal.trap('INT') { exit }
      at_exit { save }
      load
    end

    # Restores all collections from disk. Nothing is loaded unless every
    # data file is present, so a partial set never mixes with fresh state.
    def load
      @dirty = false
      return unless STORES.all? { |name| File.exist?(data_file(name)) }
      puts "Loading"
      STORES.each { |name| instance_variable_set("@#{name}", load_file(name)) }
    end

    # Writes all collections to disk if anything changed since the last
    # load/save. While saving, #process and #add_token are no-ops.
    def save
      return unless @dirty
      @saving = true
      puts "Saving"
      STORES.each { |name| save_file(name) }
      @dirty = false
    ensure
      # never leave the index locked when writing a file failed
      @saving = false
    end

    # Yields the named collections (e.g. with(:tags, :words) { |t, w| ... }),
    # loading a collection from disk when it is not in memory yet.
    def with(*types)
      args = types.flatten.map do |name|
        ivar = "@#{name}"
        instance_variable_get(ivar) || instance_variable_set(ivar, load_file(name))
      end
      yield(*args)
    end

    # Serializes one collection to "<name>.dat".
    def save_file(name)
      data = instance_variable_get("@#{name}")
      # MessagePack output is binary; text mode would corrupt it on Windows
      File.open(data_file(name), 'wb') do |f|
        f.write MessagePack.pack(data)
      end
    end

    # Reads and deserializes "<name>.dat".
    def load_file(name)
      MessagePack.unpack(File.open(data_file(name), 'rb') { |f| f.read })
    end

    # Registers +text_id+ as processed. Returns true when the text was
    # already known; otherwise records it and returns the block's value.
    # Returns nil while a save is in progress.
    def process(text_id)
      return if @saving
      @dirty = true
      return true if @texts.member?(text_id)
      @texts << text_id

      yield if block_given?
    end

    # Appends a token record and indexes its id under +word+ and +tag+.
    # Ignored while a save is in progress.
    def add_token(text_id, word, tag, s_begin, s_end, t_begin, t_end)
      return if @saving
      @dirty = true
      id = (@tokens << [text_id, s_begin, s_end, t_begin, t_end]).size - 1
      (@words[word] ||= []) << id
      (@tags[tag] ||= []) << id
    end

    # Token ids of all words matching +re+.
    def find_word(re)
      find_ids @words, re
    end

    # Token ids of all tags matching +re+.
    def find_tag(re)
      find_ids @tags, re
    end

    # returns the start indices of matching sequences
    #
    # The n-th id set is shifted left by n, so a start index survives the
    # intersection only if set n contains start + n for every n.
    # Returns [] when no sets are given.
    def union(*id_sets)
      return [] if id_sets.empty?
      shifted = id_sets.each_with_index.map { |ids, offset| ids.map { |id| id - offset } }
      shifted.inject(:&)
    end

    # Combines the records of the first and last token of a sequence into
    # [text_id, s_begin, s_end, t_begin, t_end]. Returns nil when either
    # token is unknown or the two belong to different texts.
    def text_range(t_begin, t_end)
      a, b = @tokens[t_begin], @tokens[t_end]
      return nil unless a && b && a[TEXT_ID] == b[TEXT_ID]
      return a[TEXT_ID], a[S_BEGIN], b[S_END], a[T_BEGIN], b[T_END]
    end

    private

    # Path of the data file backing collection +name+.
    def data_file(name)
      name.to_s + '.dat'
    end

    # Collects the id lists of every key in +arr+ that matches +re+.
    def find_ids(arr, re)
      elems = arr.keys.grep(re)
      elems.map { |e| arr[e] }.flatten
    end
  end
end
@@ -0,0 +1,40 @@
1
module Lumix
  # A search filter made of whitespace-separated terms. A term in double
  # quotes matches a word, a bare term matches a tag; "*" and "?" act as
  # wildcards for any number of / exactly one non-whitespace character.
  class LookupFilter

    # results: number of hits pushed via #<<; filter: the original string
    attr_reader :results, :filter

    # filter      - the filter expression
    # result_proc - optional block invoked with each hit pushed via #<<
    def initialize(filter, &result_proc)
      @filter = filter
      @result_proc = result_proc

      @re = create_re(filter)
      @results = 0
    end

    # Counts a hit and forwards its elements to the result block, if any.
    def <<(result)
      @results += 1
      @result_proc[*result] if @result_proc
    end

    # Looks up every term in +lookup+ and calls +block+ with the text range
    # of each position where all terms match consecutive tokens.
    def apply(lookup, &block)
      id_sets = @re.map do |(type, re)|
        lookup.send("find_#{type}", re)
      end
      # union may return nil when the filter has no terms
      starts = lookup.union(*id_sets) || []
      starts.each do |id|
        range = lookup.text_range(id, id + @re.size - 1) # TODO make more dynamic
        block[*range] if block && range
      end
    end

    # Splits the filter into [:word, regexp] / [:tag, regexp] pairs.
    def create_re(filter)
      filter.scan(/(?:(?:\"([^\"]+)\")|(\S+))+/).map do |word, tag|
        word ? [:word, to_re(word)] : [:tag, to_re(tag)]
      end
    end

    # Turns a term into a regexp matching a whole key: whitespace becomes
    # "_", "*" becomes \S* and "?" becomes \S.
    def to_re(txt)
      pattern = txt.gsub(/\s/, '_').gsub(/\*/, '\S*').gsub(/\?/, '\S')
      # \A and \z anchor the whole string; ^ and $ would match per line
      Regexp.new('\A' + pattern + '\z')
    end

  end
end
@@ -0,0 +1,81 @@
1
+ require 'lumix/lookup_filter'
2
+ require 'lumix/text_snippet'
3
+ require 'lumix/lookup'
4
+
5
module Lumix

  # Search strategy backed by the in-memory Lookup index.
  class LookupSearch

    TAGGED = /([^\s\|]+)\|(\S+)/m # Xxx|YYY

    # db is accepted for interface compatibility and not used here;
    # progress is stored but not used by the methods in this class.
    def initialize(db, progress)
      @lookup = Lookup.new
      @progress = progress
    end

    # Linking mutates the shared lookup, so texts are linked one at a time.
    def concurrent_link?
      false
    end

    # Aligns every "word|tag" token of the tagged text with its position in
    # the plain text and registers it with the lookup.
    #
    # Returns true when the text is linked (or was linked before) and nil
    # when a word cannot be found in the plain text; the failure is logged
    # to STDERR and appended to "unlinked.lst".
    def link_text(id)
      ds = TaggedText[id]
      @lookup.process id do
        file, text, tagged = ds.filename, ds.text, ds.tagged

        puts "Linking text #{file}"

        txt_pos = 0
        tagged.scan(TAGGED) do |word, tag|
          # read the offsets before the next match overwrites $~
          tagged_begin = $~.begin(0)
          tagged_end = $~.end(0)

          # expand "x_y_z" notation to "x y z"
          word_re = Regexp.new(Regexp.escape(word).gsub(/_/, '\s*'))
          src_match = text[txt_pos..-1].match(word_re) # find the word
          if src_match
            src_begin = txt_pos + src_match.begin(0)
            src_end = txt_pos + src_match.end(0)
            txt_pos = src_end

            @lookup.add_token(id, word, tag, src_begin, src_end, tagged_begin, tagged_end)
          else
            STDERR.puts "Could not find match for '#{word}' in text #{file}"
            # clamp at 0: a negative start would index from the end of the text
            STDERR.puts text[[txt_pos - 10, 0].max..(txt_pos + word.size + 10)]
            log_unlinked "#{file}:#{txt_pos}:#{tagged_begin} unmatched \"#{word}\""
            return nil
          end
        end
        return true
      end
    rescue => e # TODO remove this crap
      STDERR.puts e
      STDERR.puts e.backtrace
      @lookup.save
      raise
    end

    # Builds a filter for expression +f+; the block receives each hit.
    def create_filter(f, &block)
      Lumix::LookupFilter.new(f, &block)
    end

    # Applies every filter to the lookup and pushes a pair of snippets
    # (plain text, tagged text) into the filter for each hit.
    def find(*filters, &block)
      last_id = -1
      t = nil
      filters.flatten.each do |f|
        f.apply(@lookup) do |text_id, s_begin, s_end, t_begin, t_end|
          # consecutive hits usually share a text; fetch it only on change
          t = TaggedText[text_id] if text_id != last_id
          last_id = text_id

          fname = File.basename(t.filename)
          text_snippet = Lumix::TextSnippet.new(fname, t.text, s_begin, s_end)
          tagged_snippet = Lumix::TextSnippet.new(fname, t.tagged, t_begin, t_end)
          f << [text_snippet, tagged_snippet]
        end
      end
    end

    private

    # Appends one line to "unlinked.lst". Written directly instead of via a
    # shell command so quotes in file names or words cannot break it.
    def log_unlinked(line)
      File.open('unlinked.lst', 'a') { |f| f.puts line }
    end

  end

  SearchStrategy = LookupSearch
end
@@ -0,0 +1,93 @@
1
# Reopens the SWT Table widget to back it with a Ruby array (+data+) and
# to add column and sorting helpers.
class Java::OrgEclipseSwtWidgets::Table

  # data: one entry per row; tooltips: one entry per row (not displayed yet)
  attr_accessor :data, :tooltips

  # Initializes the row storage, starts a background thread that refreshes
  # the item count when rows were added, and installs the listeners that
  # fill rows on demand and rescale columns on resize.
  def sweeten(app, opts={}, &block)
    @data = []
    @tooltips = []
    super
    @redraw_thread = Thread.new do
      while !isDisposed
        if @dirty
          @dirty = false
          perform do
            setItemCount data.size
            # NOTE(review): clear_all is not a local variable in this scope;
            # presumably it resolves to JRuby's snake_case alias of the Java
            # clearAll method - confirm.
            clearAll if clear_all
          end
        end
        sleep 1 # TODO find a better alternative
      end
    end

    # TODO implement tooltips

    # Fills a row with the strings of the matching data entry when the
    # widget asks for it.
    addListener swt::SetData do |e|
      item = e.item
      index = indexOf(item)
      item.setText(Array(data[index]).to_java(:string))
    end

    # Keeps the relative column widths when the table is resized; the last
    # column is packed instead of scaled.
    addListener swt::Resize do |e|
      cols = columns
      if cols.size > 0 # nothing to lay out before columns exist
        default_weight = 1.0 / cols.size
        current_width = @old_width
        w = width
        cols[0..-2].each do |c|
          # without a previous table width there is no ratio to preserve
          measurable = c.width != 0 && current_width && current_width != 0
          weight = measurable ? c.width.to_f / current_width : default_weight
          c.width = w * weight
        end
        cols[cols.size - 1].pack
        @old_width = w
      end
    end
  end

  # Creates one centered column per title and shows header and grid lines.
  # Accepts a single title, several titles or an array of titles
  # (assignment syntax always delivers the titles as one array).
  def columns=(*titles)
    titles.flatten.each do |title|
      col = widgets::TableColumn.new(self, swt::CENTER)
      col.setText title
    end

    setHeaderVisible true
    setLinesVisible true
  end

  # Makes columns sortable by clicking their header.
  # sort - true or :all for every column, otherwise an object indexed by
  #        column title that tells whether the column is sortable.
  def sort=(sort)
    sort = Hash.new(true) if [true, :all].member?(sort)
    if sort
      columns.each_with_index do |col, index|
        if sort[col.text]
          col.addListener swt::Selection do
            if data
              @data = data.sort_by { |e| e[index] }
              update :clear
            end
          end
        end
      end
    end
  end
end
71
+
72
# Default style flags applied to tables.
Sweet::WIDGET_DEFAULTS[:table] = {
  :style => [:border, :virtual, :check]
}

# Table-specific hooks: the block handler to use and extra methods that
# are defined through the :custom_code proc.
Sweet::WIDGET_HACKS[Java::OrgEclipseSwtWidgets::Table] = {
  :block_handler => :set_data,
  :custom_code => proc {
    # Synchronizes the item count with +data+; with a truthy argument all
    # items are cleared as well. Does nothing on a disposed widget.
    def update(clear_all = false)
      return if isDisposed
      setItemCount data.size
      clearAll if clear_all
    end

    # Appends a row. The cell values are the positional arguments unless a
    # trailing options hash provides :data; :tooltips defaults to the row
    # data. Marks the table dirty so the rows get picked up later.
    def add_hit(*args)
      # is_a? instead of `args.last === Hash`, which compared the argument
      # with the Hash class itself and therefore never detected options
      opts = args.last.is_a?(Hash) ? args.pop : {}
      d = opts[:data] || args
      t = opts[:tooltips] || d
      data << d
      tooltips << t
      @dirty = true
    end
  }
}
@@ -0,0 +1,35 @@
1
# Initial schema: the `texts` table and the `assoc` table whose rows
# reference a text and carry source/tagged offsets.
class CreateTables < Sequel::Migration

  # Creates both tables together with their indexes.
  def up
    create_table(:texts) do
      primary_key :id
      column :digest, String
      column :text, String
      column :tagged, String
      column :filename, String
      column :tagged_filename, String

      index :digest
    end

    create_table(:assoc) do
      primary_key :id
      column :text_id, Integer, :references => :texts
      column :position, Integer
      column :src_begin, Integer
      column :src_end, Integer
      column :tagged_begin, Integer
      column :tagged_end, Integer

      index [:text_id, :tagged_end]
      index [:text_id, :tagged_begin]
      index [:text_id, :position]
    end
  end

  # Drops the tables again, the referencing table first.
  def down
    drop_table(:assoc)
    drop_table(:texts)
  end

end
@@ -0,0 +1,28 @@
1
# Adds a self-referencing `categories` table and links texts to it.
class Categories < Sequel::Migration

  # Creates `categories` and adds an indexed category_id to `texts`.
  def up
    create_table(:categories) do
      primary_key :id
      column :parent_id, Integer, :references => :categories
      column :name, String
      column :key, String

      index [:parent_id, :id]
    end

    alter_table(:texts) do
      add_column :category_id, Integer, :references => :categories

      add_index [:category_id, :id]
    end
  end

  # Removes the column from `texts` before dropping `categories`.
  def down
    alter_table(:texts) { drop_column :category_id }
    drop_table(:categories)
  end

end
@@ -0,0 +1,15 @@
1
# Adds the `fulltagged` string column to the `texts` table.
class AddFulltagged < Sequel::Migration

  # Adds the column.
  def up
    alter_table(:texts) { add_column :fulltagged, String }
  end

  # Removes the column again.
  def down
    alter_table(:texts) { drop_column :fulltagged }
  end

end
@@ -0,0 +1,44 @@
1
# Adds the `tags`, `words` and `tokens` tables; each token row references
# a text, a tag and a word.
class CreateLookupTables < Sequel::Migration

  # Creates the three tables with their indexes.
  def up
    create_table(:tags) do
      primary_key :id
      column :tag, String

      index :tag, :unique => true
    end

    create_table(:words) do
      primary_key :id
      column :word, String

      index :word, :unique => true
    end

    create_table(:tokens) do
      primary_key :id
      column :text_id, Integer, :references => :texts

      column :position, Integer
      column :tag_id, Integer, :references => :tags
      column :word_id, Integer, :references => :words

      column :src_begin, Integer
      column :src_end, Integer
      column :tagged_begin, Integer
      column :tagged_end, Integer

      index [:text_id, :position], :unique => true
      index :word_id
      index :tag_id
    end
  end

  # Drops the tables, the referencing `tokens` table first.
  def down
    drop_table(:tokens)
    drop_table(:words)
    drop_table(:tags)
  end

end
@@ -0,0 +1,104 @@
1
module Lumix

  # Search strategy that scans the tagged text of every TaggedText with a
  # regexp and resolves hits through the `assoc` table of the database.
  class SlowSearch
    TAGGED = /([^\s\|]+)\|(\S+)/m # Xxx|YYY

    # db       - database handle; db[:assoc] is used for token offsets
    # progress - callable that receives progress updates
    def initialize(db, progress)
      @db = db
      @progress = progress
    end

    # Linking only appends rows to the database, so texts may be linked
    # concurrently.
    def concurrent_link?
      true
    end

    # Aligns every "word|tag" token of the tagged text with its position in
    # the plain text and bulk-inserts the offsets into the `assoc` table.
    #
    # Returns nil without inserting anything when a word cannot be found in
    # the plain text; the failure is logged to STDERR and "unlinked.lst".
    def link_text(id)
      t = TaggedText[id]
      text = t.text
      puts "Linking text #{t.filename}"

      src_last = 0
      position = 0
      assoc = []
      t.tagged.scan(TAGGED) do |word, tag|
        # read the offsets before the next match overwrites $~
        tagged_begin = $~.begin(0)
        tagged_end = $~.end(0)

        # "x_y_z" in the tagged text stands for "x y z" in the source
        word_re = Regexp.new(Regexp.escape(word).gsub(/_/, '\s*'))
        src_match = text[src_last..-1].match(word_re) # find the word
        if src_match
          src_begin = src_last + src_match.begin(0)
          src_end = src_last + src_match.end(0)

          src_last = src_end
          assoc << {:text_id => id, :position => position, :src_begin => src_begin, :src_end => src_end, :tagged_begin => tagged_begin, :tagged_end => tagged_end}
        else
          STDERR.puts "Could not find match for '#{word}' in text #{t.filename}"
          log_unlinked "#{t.filename}:#{tagged_begin}:#{src_last} unmatched \"#{word}\""
          return nil
        end
        position += 1
      end
      @db[:assoc].multi_insert(assoc)
    rescue => e
      STDERR.puts e
      STDERR.puts e.backtrace
      raise
    end

    # Returns a memoized filter built from an empty expression.
    def create_filter
      @filter ||= Filter.new('')
    end

    # Scans all texts for +filter+, yields snippets for each hit and
    # returns the total number of hits. Progress is reported per text.
    #
    # NOTE(review): snippets are only yielded when the block takes two or
    # more parameters; the single-snippet branch below is unreachable
    # because yield_tagged implies yield_text - confirm the intent.
    def find(filter, &block)
      yield_text = block && block.arity >= 1
      yield_tagged = block && block.arity >= 2

      prog = Progress.new(:search, TaggedText.count, filter)
      @progress[prog]

      re = Filter.to_re(filter)

      index = 0
      TaggedText.inject(0) do |result, t|
        fname = File.basename(t.filename)

        # matches to ranges
        results = []
        t.tagged.scan(re) do |hit|
          t_begin = $~.begin(0)
          t_end = $~.end(0)
          # TODO decouple database operations for performance
          results << find_range(t.id, t_begin, t_end, yield_text)
        end

        result += results.inject(0) do |res, f|
          if yield_tagged
            tagged_snippet = TextSnippet.new(fname, t.tagged, f[:tagged_begin].to_i, f[:tagged_end].to_i)
            if yield_text
              text_snippet = TextSnippet.new(fname, t.text, f[:src_begin].to_i, f[:src_end].to_i)
              yield text_snippet, tagged_snippet
            else
              yield tagged_snippet
            end
          end
          res + 1
        end
        @progress[prog, (index += 1)]
        result
      end
    end

    # Widens a hit in the tagged text to whole tokens. With
    # +process_original+ the offsets come from the `assoc` rows overlapping
    # the hit (min/max of source and tagged offsets); otherwise only the
    # given tagged offsets are returned.
    def find_range(t_id, t_begin, t_end, process_original)
      if process_original
        ds = @db[:assoc].filter(:text_id => t_id).filter{tagged_end >= t_begin}.filter{tagged_begin < t_end}
        ds.select{[{min(:src_begin) => :src_begin},{ max(:src_end) => :src_end}, {min(:tagged_begin) => :tagged_begin}, {max(:tagged_end) => :tagged_end}]}.first
      else
        {:tagged_begin => t_begin, :tagged_end => t_end}
      end
    end

    private

    # Appends one line to "unlinked.lst". Written directly instead of via a
    # shell command so quotes in file names or words cannot break it.
    def log_unlinked(line)
      File.open('unlinked.lst', 'a') { |f| f.puts line }
    end

  end

  SearchStrategy = SlowSearch
end
@@ -0,0 +1,29 @@
1
module Lumix

  # A character range inside a named text, with helpers to render the
  # range itself and a few words of context on either side.
  class TextSnippet
    attr_reader :name, :text, :begin, :end

    # name  - label of the text (callers pass a file name)
    # text  - the complete text
    # first - offset of the first character of the snippet
    # last  - offset just past the snippet (the range is first...last)
    def initialize(name, text, first, last)
      @name = name
      @text = text
      @begin = first
      @end = last
    end

    # The snippet with every whitespace run collapsed to one space; an
    # empty string when the range lies outside the text.
    def to_s
      cleanup(@text[@begin...@end])
    end

    # Up to +context+ whitespace-separated tokens preceding the snippet,
    # taken from a window of context * 10 characters.
    def left(context = 5)
      ctx = [@begin - context * 10, 0].max
      # a slice starting beyond the text is nil
      (@text[ctx...@begin] || '').split(/\s+/).last(context).join(' ')
    end

    # Up to +context+ tokens following the snippet, taken from a window of
    # context * 10 characters. Whitespace directly after the snippet yields
    # an empty first token, hence a leading space in the result.
    def right(context = 5)
      ctx = [@end + context * 10, @text.size].min
      # a slice starting beyond the text is nil
      (@text[@end..ctx] || '').split(/\s+/).first(context).join(' ')
    end

    # Collapses every whitespace run to a single space; nil becomes ''.
    def cleanup(txt)
      txt.to_s.gsub(/\s+/, ' ')
    end
  end

end
@@ -0,0 +1,108 @@
1
+ $KCODE='UTF-8'
2
+
3
+ require 'cgi'
4
+ require 'soap/wsdlDriver'
5
+ #require 'curb'
6
+ #require 'savon'
7
+ require 'lumix/charset' unless RUBY_ENGINE =~ /maglev/i
8
+
9
# Client for the TextProcessing web service at www.racai.ro, plus helpers
# to run whole files through it.
class TextProcessing

  # language code sent with every request
  attr_accessor :lang

  def initialize(lang = 'ro')
    @lang = lang
  end

  # Memoized SOAP driver generated from the service's WSDL.
  def rpc
    @rpc ||= SOAP::WSDLDriverFactory.new('http://www.racai.ro/webservices/TextProcessing.asmx?WSDL').create_rpc_driver
  end

  # the core processing routine using the webservice
  #
  # Sends +text+ (converted with to_utf) together with the language and
  # returns the processResult of the response.
  def process(text)
    response = rpc.Process(:input => text.to_utf, :lang => lang)
    response.processResult
  end

  # Decodes HTML entities in the given string.
  # NOTE(review): HTMLEntities is not required by this file; it presumably
  # comes from the htmlentities gem loaded elsewhere - confirm.
  def cleanup(file)
    @entities ||= HTMLEntities.new
    @entities.decode(file)
  end

  # inserts "tagged" as the second to last part in the filename and as parent folder
  # e.g.
  #   test.txt -> tagged/test.tagged.txt
  # special case when no extension is present:
  #   README -> tagged/README.tagged
  def create_tagged_filename(infile)
    path = infile.split(/\//)

    # take care of the filename...
    components = path.pop.split(/\./)
    position = [1, components.size - 1].max
    components.insert position, 'tagged'
    path.push components.join('.')

    # ...and of the path
    path.insert(-2, 'tagged')
    path.join '/'
  end

  # Expands directories into the files they contain (recursively) and
  # returns every file once, skipping directories and any path that
  # contains ".tagged".
  def to_filelist(*files)
    files = files.flatten.map do |filename|
      if File.directory? filename
        Dir.glob File.join(filename, '**/*') # add all files from that directory
      else
        filename
      end
    end.flatten.compact.uniq # make sure every file is only processed once
    files.delete_if { |filename| File.directory?(filename) || filename['.tagged'] } # remove remaining folders
  end

  # Processes standard input and prints the result.
  def process_stdin
    puts process($stdin.read)
  end

  # takes the text from infile and outputs the result into the outfile
  #
  # The directory of +outfile+ is created when it is missing, since the
  # default name points into a "tagged" subfolder next to the input.
  def process_file(infile, outfile = create_tagged_filename(infile))
    result = process(File.read(infile).to_utf)
    out_dir = File.dirname(outfile)
    Dir.mkdir(out_dir) unless File.directory?(out_dir)
    File.open(outfile, 'w') do |out|
      out.write result
    end
  end

end
84
+
85
+
86
+ # process the args if called as main script
87
# Command line entry point:
#   [-lang LANG] [FILE_OR_DIR ...]
# Without file arguments the text is read from standard input; otherwise
# every listed file is processed unless its tagged output already exists.
if __FILE__ == $0
  args = ARGV
  tp = if args.first == '-lang'
    args.shift
    lang = args.shift
    # "-lang" without a value falls back to the default language
    lang ? TextProcessing.new(lang) : TextProcessing.new
  else
    TextProcessing.new
  end

  if args.empty?
    tp.process_stdin
  else
    files = tp.to_filelist(args)

    puts "Processing files:"
    files.each do |infile|
      outfile = tp.create_tagged_filename(infile)
      puts "#{infile} -> #{outfile}"
      tp.process_file(infile, outfile) unless File.exist?(outfile)
    end
  end
end