lumix 0.0.2-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,105 @@
1
+ require 'msgpack'
2
+
3
module Lumix

  # Positions of the fields inside one token record of Lookup's token table.
  TEXT_ID = 0
  S_BEGIN = 1
  S_END = 2
  T_BEGIN = 3
  T_END = 4

  # In-memory token index that is persisted as one MessagePack file per table.
  #
  # Tables:
  #   @tags   tag  => array of token ids
  #   @words  word => array of token ids
  #   @tokens array of [text_id, s_begin, s_end, t_begin, t_end]; the array
  #           index of a record is its token id
  #   @texts  array of text ids that were handed to #process
  class Lookup
    # Names of the persisted tables; each one lives in "<name>.dat".
    STORES = [:tags, :words, :texts, :tokens]

    # Creates empty tables, registers an INT trap and an at_exit hook that
    # saves the index, then loads previously saved data if there is any.
    def initialize
      puts "Lookup"
      @tags = {}   # tag => token_id[]
      @words = {}  # word => token_id[]
      @tokens = [] # :text_id, :s_begin, :s_end, :t_begin, :t_end
      @texts = []  # text_id
      Signal.trap('INT') { exit }
      at_exit { save }
      load
    end

    # Replaces the in-memory tables with the saved ones.
    # Nothing is loaded unless every table file is present, so a partially
    # written index never mixes with fresh empty tables.
    def load
      @dirty = false
      return unless STORES.all? { |name| File.exist?(data_file(name)) }
      puts "Loading"
      STORES.each { |name| instance_variable_set("@#{name}", load_file(name)) }
    end

    # Writes all tables to disk when something changed since the last load.
    # While saving, #process and #add_token refuse to modify the tables.
    def save
      return unless @dirty
      @saving = true
      puts "Saving"
      STORES.each { |name| save_file(name) }
      @saving = false
    end

    # Yields the tables named in +types+ (symbols or strings, nesting allowed),
    # loading a table from disk first when it is not in memory.
    def with(*types)
      args = types.flatten.map do |name|
        instance_variable_get("@#{name}") ||
          instance_variable_set("@#{name}", load_file(name))
      end
      yield(*args)
    end

    # Serializes the table +name+ into "<name>.dat".
    def save_file(name)
      data = instance_variable_get("@#{name}")
      # binary mode: MessagePack output is not text
      File.open(data_file(name), 'wb') do |f|
        f.write MessagePack.pack(data)
      end
    end

    # Reads "<name>.dat" and returns the unpacked table.
    def load_file(name)
      MessagePack.unpack(File.open(data_file(name), 'rb') { |f| f.read })
    end

    # Marks +text_id+ as known and yields once for a text seen the first time.
    # Returns nil while a save is running, true when the text is already
    # known, otherwise the value of the block (nil without a block).
    def process(text_id)
      return if @saving
      @dirty = true
      return true if @texts.member?(text_id)
      @texts << text_id

      yield if block_given?
    end

    # Appends a token record and registers its id under +word+ and +tag+.
    # Does nothing while a save is running.
    def add_token(text_id, word, tag, s_begin, s_end, t_begin, t_end)
      return if @saving
      @dirty = true
      @tokens << [text_id, s_begin, s_end, t_begin, t_end]
      id = @tokens.size - 1
      (@words[word] ||= []) << id
      (@tags[tag] ||= []) << id
    end

    # Token ids of all words matching the regular expression +re+.
    def find_word(re)
      find_ids @words, re
    end

    # Token ids of all tags matching the regular expression +re+.
    def find_tag(re)
      find_ids @tags, re
    end

    # returns the start indices of matching sequences
    #
    # The i-th id set is shifted down by i so that consecutive tokens line up
    # on the id of the first token; the intersection keeps the start ids
    # present in every set. Returns [] when no sets are given.
    def union(*id_sets)
      shifted = id_sets.each_with_index.map do |ids, offset|
        ids.map { |id| id - offset }
      end
      shifted.inject(:&) || []
    end

    # Combines the tokens +t_begin+..+t_end+ into
    # [text_id, s_begin, s_end, t_begin, t_end].
    # Returns nil when either token does not exist or when the two tokens
    # belong to different texts.
    def text_range(t_begin, t_end)
      first, last = @tokens[t_begin], @tokens[t_end]
      return nil unless first && last
      return nil unless first[TEXT_ID] == last[TEXT_ID]
      [first[TEXT_ID], first[S_BEGIN], last[S_END], first[T_BEGIN], last[T_END]]
    end

    private

    # Path of the file that stores the table +name+.
    def data_file(name)
      "#{name}.dat"
    end

    # Collects the ids stored under every key of +arr+ that matches +re+.
    def find_ids(arr, re)
      arr.keys.grep(re).map { |key| arr[key] }.flatten
    end
  end
end
@@ -0,0 +1,40 @@
1
module Lumix
  # A search filter made of one regular expression per consecutive token.
  #
  # The filter string is split on whitespace; a part in double quotes matches
  # a word, any other part matches a tag. `*` and `?` act as wildcards.
  class LookupFilter

    # results: number of hits pushed with #<<; filter: the original string.
    attr_reader :results, :filter

    # filter      - the filter string, e.g. '"the" NN'
    # result_proc - optional block that receives every hit pushed with #<<
    def initialize(filter, &result_proc)
      @filter = filter
      @result_proc = result_proc

      @re = create_re(filter)
      @results = 0
    end

    # Counts one hit and forwards its elements to the result block, if any.
    def <<(result)
      @results += 1
      @result_proc[*result] if @result_proc
    end

    # Runs the filter against +lookup+ and calls +block+ with the text range
    # of every matching token sequence.
    #
    # +lookup+ must respond to find_word / find_tag, union and text_range.
    # A filter without any token expression matches nothing and returns [].
    def apply(lookup, &block)
      return [] if @re.empty?

      id_sets = @re.map { |type, re| lookup.send("find_#{type}", re) }
      # tolerate a union that reports "no sets / no hits" as nil
      (lookup.union(*id_sets) || []).each do |id|
        range = lookup.text_range(id, id + @re.size - 1) # TODO make more dynamic
        block[*range] if block && range
      end
    end

    # Turns the filter string into a list of [:word, regexp] / [:tag, regexp]
    # pairs, one per whitespace separated part.
    def create_re(filter)
      filter.scan(/(?:(?:\"([^\"]+)\")|(\S+))+/).map do |word, tag|
        word ? [:word, to_re(word)] : [:tag, to_re(tag)]
      end
    end

    # Builds the anchored regular expression for one filter part:
    # whitespace becomes `_`, `*` becomes `\S*` and `?` becomes `\S`.
    def to_re(txt)
      Regexp.new('^' + txt.gsub(/\s/, '_').gsub(/\*/, '\S*').gsub(/\?/, '\S') + '$')
    end

  end
end
@@ -0,0 +1,81 @@
1
+ require 'lumix/lookup_filter'
2
+ require 'lumix/text_snippet'
3
+ require 'lumix/lookup'
4
+
5
module Lumix

  # Search strategy that keeps all token positions in a Lumix::Lookup index.
  class LookupSearch

    TAGGED = /([^\s\|]+)\|(\S+)/m # Xxx|YYY

    # db is accepted for interface compatibility and not used here;
    # progress is stored but not used by the methods of this class.
    def initialize(db, progress)
      @lookup = Lookup.new
      @progress = progress
    end

    # Linking writes into one shared Lookup, so it must not run concurrently.
    def concurrent_link?
      false
    end

    # Aligns every `word|tag` pair of the tagged text +id+ with its position
    # in the original text and records it in the lookup.
    #
    # Returns true when all pairs were aligned, nil as soon as one word cannot
    # be found (the miss is appended to unlinked.lst); when the lookup does
    # not run the block, the value of Lookup#process is returned.
    # Any error is printed, the lookup is saved and the error is re-raised.
    def link_text(id)
      ds = TaggedText[id]
      @lookup.process id do
        file, text, tagged = ds.filename, ds.text, ds.tagged

        puts "Linking text #{file}"

        txt_pos = 0
        tagged.scan(TAGGED) do |word, tag|
          match = Regexp.last_match
          tagged_begin = match.begin(0)
          tagged_end = match.end(0)

          # expand "x_y_z" notation to "x y z"
          word_re = Regexp.new(Regexp.escape(word).gsub(/_/, '\s*'))
          src_match = text[txt_pos..-1].match(word_re) # find the word
          if src_match
            src_begin = txt_pos + src_match.begin(0)
            src_end = txt_pos + src_match.end(0)
            txt_pos = src_end

            @lookup.add_token(id, word, tag, src_begin, src_end, tagged_begin, tagged_end)
          else
            STDERR.puts "Could not find match for '#{word}' in text #{file}"
            # clamp the window start: a negative index would count from the end
            STDERR.puts text[[txt_pos - 10, 0].max..(txt_pos + word.size + 10)]
            log_unlinked(file, txt_pos, tagged_begin, word)
            return nil
          end
        end
        return true
      end
    rescue => e # TODO remove this crap
      STDERR.puts e
      STDERR.puts e.backtrace
      @lookup.save
      raise
    end

    # Builds the filter object understood by #find.
    def create_filter(f, &block)
      Lumix::LookupFilter.new(f, &block)
    end

    # Applies every filter to the lookup and pushes a pair of snippets
    # (original text, tagged text) into the filter for each hit.
    def find(*filters, &block)
      last_id = -1
      t = nil
      filters.flatten.each do |f|
        f.apply(@lookup) do |text_id, s_begin, s_end, t_begin, t_end|
          # consecutive hits of the same text reuse the fetched record
          t = TaggedText[text_id] if text_id != last_id
          last_id = text_id

          fname = File.basename(t.filename)
          text_snippet = Lumix::TextSnippet.new(fname, t.text, s_begin, s_end)
          tagged_snippet = Lumix::TextSnippet.new(fname, t.tagged, t_begin, t_end)
          f << [text_snippet, tagged_snippet]
        end
      end
    end

    private

    # Appends one line describing an unmatched word to unlinked.lst.
    # The file is written directly instead of through a shell so that quotes
    # in the word or file name cannot break or alter the command.
    def log_unlinked(file, txt_pos, tagged_begin, word)
      File.open('unlinked.lst', 'a') do |log|
        log.puts "#{file}:#{txt_pos}:#{tagged_begin} unmatched \"#{word}\""
      end
    end

  end

  SearchStrategy = LookupSearch
end
@@ -0,0 +1,93 @@
1
# Extensions of the SWT Table widget: row data kept in Ruby arrays, a
# background thread that refreshes the item count, proportional column
# resizing and optional click-to-sort columns.
class Java::OrgEclipseSwtWidgets::Table

  # data: one entry per row; tooltips: one entry per row (not displayed yet).
  attr_accessor :data, :tooltips

  # Resets the row storage, runs the inherited setup and installs the
  # refresh thread plus the SetData and Resize listeners.
  def sweeten(app, opts={}, &block)
    @data = []
    @tooltips = []
    super
    # Polls once per second and pushes a changed row count to the widget.
    @redraw_thread = Thread.new do
      while !isDisposed
        if @dirty
          @dirty = false
          perform do
            setItemCount data.size
            # NOTE(review): `clear_all` is not defined in this block; under
            # JRuby it presumably resolves to the Java clearAll() method --
            # confirm that this is the intended condition.
            clearAll if clear_all
          end
        end
        sleep 1 # TODO find a better alternative
      end
    end

    # TODO implement tooltips

    # Fills a table item with the row stored at the item's index.
    addListener swt::SetData do |e|
      item = e.item
      index = indexOf(item)
      item.setText(Array(data[index]).to_java(:string))
    end

    # Keeps the relative column widths when the table is resized; the last
    # column is packed instead of scaled.
    addListener swt::Resize do |e|
      if columns.size > 0
        default_weight = 1.0 / columns.size
        previous_width = @old_width
        w = width
        columns[0..-2].each do |c|
          # Without a usable previous table width (first resize) there is no
          # ratio to preserve, so the column gets an equal share.
          scalable = c.width != 0 && previous_width && previous_width != 0
          weight = scalable ? c.width.to_f / previous_width : default_weight
          c.width = w * weight
        end
        columns[columns.size - 1].pack
        @old_width = w
      end
    end
  end

  # Creates one centered column per title and shows header and grid lines.
  # Accepts a single title, a list of titles or an array of titles; nil
  # entries are skipped and nothing happens when no title remains.
  def columns=(*titles)
    names = titles.flatten.compact
    unless names.empty?
      names.each do |title|
        col = widgets::TableColumn.new(self, swt::CENTER)
        col.setText title
      end

      setHeaderVisible true
      setLinesVisible true
    end
  end

  # Makes columns sortable by clicking their header.
  # sort - true or :all for every column, or a hash-like object that answers
  #        with a truthy value for the titles of the sortable columns.
  def sort=(sort)
    sort = Hash.new(true) if [true, :all].member?(sort)
    if sort
      columns.each_with_index do |col, index|
        if sort[col.text]
          col.addListener swt::Selection do
            if data
              @data = data.sort_by {|e| e[index] }
              update :clear
            end
          end
        end
      end
    end
  end
end
71
+
72
# Default style flags for tables.
Sweet::WIDGET_DEFAULTS[:table] = {
  :style => [:border, :virtual, :check]
}
# Table specific hooks: the block handler name and extra methods that are
# defined through the :custom_code proc.
Sweet::WIDGET_HACKS[Java::OrgEclipseSwtWidgets::Table] = {
  :block_handler => :set_data,
  :custom_code => proc {
    # Pushes the current row count to the widget; a truthy clear_all also
    # clears all items. Does nothing once the widget is disposed.
    def update(clear_all = false)
      return if isDisposed
      setItemCount data.size
      clearAll if clear_all
    end

    # Appends one row.
    # args - the cell values, optionally followed by an options hash with
    #        :data (row values, default: the cell values) and
    #        :tooltips (default: the row values).
    def add_hit(*args)
      # The hash has to be the receiver-side operand of the class check:
      # `args.last === Hash` compares the value with the class and is never
      # true for an options hash.
      opts = args.last.is_a?(Hash) ? args.pop : {}
      d = opts[:data] || args
      t = opts[:tooltips] || d
      data << d
      tooltips << t
      @dirty = true
    end
  }
}
@@ -0,0 +1,35 @@
1
# Initial schema: the texts table and the assoc table holding position
# records per text.
class CreateTables < Sequel::Migration

  # Creates texts and assoc together with their indexes.
  def up
    create_table :texts do
      primary_key :id
      column :digest, String
      column :text, String
      column :tagged, String
      column :filename, String
      column :tagged_filename, String

      index :digest
    end

    create_table :assoc do
      primary_key :id
      # NOTE(review): confirm that the Sequel version in use honours the
      # :references option for plain columns.
      column :text_id, Integer, :references => :texts
      column :position, Integer
      column :src_begin, Integer
      column :src_end, Integer
      column :tagged_begin, Integer
      column :tagged_end, Integer

      index [:text_id, :tagged_end]
      index [:text_id, :tagged_begin]
      index [:text_id, :position]
    end
  end

  # Drops both tables, the referencing table first.
  def down
    [:assoc, :texts].each { |table| drop_table table }
  end

end
@@ -0,0 +1,28 @@
1
# Adds a self-referencing categories table and links texts to it.
class Categories < Sequel::Migration

  # Creates categories and adds texts.category_id with its index.
  def up
    create_table :categories do
      primary_key :id
      # NOTE(review): confirm that the Sequel version in use honours the
      # :references option for plain columns.
      column :parent_id, Integer, :references => :categories
      column :name, String
      column :key, String

      index [:parent_id, :id]
    end

    alter_table(:texts) do
      add_column :category_id, Integer, :references => :categories

      add_index [:category_id, :id]
    end
  end

  # Removes the column from texts before dropping the categories table.
  def down
    alter_table(:texts) { drop_column :category_id }
    drop_table :categories
  end

end
@@ -0,0 +1,15 @@
1
# Adds the fulltagged string column to the texts table.
class AddFulltagged < Sequel::Migration

  # Adds texts.fulltagged.
  def up
    alter_table(:texts) { add_column :fulltagged, String }
  end

  # Removes texts.fulltagged again.
  def down
    alter_table(:texts) { drop_column :fulltagged }
  end

end
@@ -0,0 +1,44 @@
1
# Adds the tags, words and tokens tables.
class CreateLookupTables < Sequel::Migration

  # Creates the three tables with their indexes.
  def up
    create_table :tags do
      primary_key :id
      column :tag, String

      index :tag, :unique => true
    end

    create_table :words do
      primary_key :id
      column :word, String

      index :word, :unique => true
    end

    create_table :tokens do
      primary_key :id
      # NOTE(review): confirm that the Sequel version in use honours the
      # :references option for plain columns.
      column :text_id, Integer, :references => :texts

      column :position, Integer
      column :tag_id, Integer, :references => :tags
      column :word_id, Integer, :references => :words

      column :src_begin, Integer
      column :src_end, Integer
      column :tagged_begin, Integer
      column :tagged_end, Integer

      index [:text_id, :position], :unique => true
      index :word_id
      index :tag_id
    end
  end

  # Drops the tables, the referencing tokens table first.
  def down
    [:tokens, :words, :tags].each { |table| drop_table table }
  end

end
@@ -0,0 +1,104 @@
1
module Lumix

  # Search strategy that scans the tagged text of every record with a
  # regular expression and maps hits back through the assoc table.
  class SlowSearch
    TAGGED = /([^\s\|]+)\|(\S+)/m # Xxx|YYY

    # db       - database handle; db[:assoc] is used for the position records
    # progress - callable that receives progress updates during #find
    def initialize(db, progress)
      @db = db
      @progress = progress
    end

    # Reports that texts may be linked concurrently with this strategy.
    def concurrent_link?
      true
    end

    # Aligns every `word|tag` pair of the tagged text +id+ with its position
    # in the original text and inserts all position records in one batch.
    #
    # Returns nil without inserting anything as soon as one word cannot be
    # found (the miss is appended to unlinked.lst); otherwise returns the
    # result of the batch insert. Errors are printed and re-raised.
    def link_text(id)
      t = TaggedText[id]
      text = t.text
      puts "Linking text #{t.filename}"

      src_last = 0
      position = 0
      assoc = []
      t.tagged.scan(TAGGED) do |word, tag|
        match = Regexp.last_match
        tagged_begin = match.begin(0)
        tagged_end = match.end(0)

        # "x_y_z" in the tagged text stands for "x y z" in the original
        word_re = Regexp.new(Regexp.escape(word).gsub(/_/, '\s*'))
        src_match = text[src_last..-1].match(word_re) # find the word
        if src_match
          src_begin = src_last + src_match.begin(0)
          src_end = src_last + src_match.end(0)

          src_last = src_end
          assoc << {:text_id => id, :position => position, :src_begin => src_begin, :src_end => src_end, :tagged_begin => tagged_begin, :tagged_end => tagged_end}
        else
          STDERR.puts "Could not find match for '#{word}' in text #{t.filename}"
          log_unlinked(t.filename, tagged_begin, src_last, word)
          return nil
        end
        position += 1
      end
      @db[:assoc].multi_insert(assoc)
    rescue => e
      STDERR.puts e
      STDERR.puts e.backtrace
      raise
    end

    # Returns the shared, memoized filter object.
    def create_filter
      @filter ||= Filter.new('')
    end

    # Scans all tagged texts for +filter+ and returns the number of hits.
    #
    # Snippets are yielded only to blocks with an arity of two or more
    # (text snippet, tagged snippet).
    # NOTE(review): a block with arity 1 triggers the original-text lookup
    # but is never called, and the tagged-only yield below is unreachable
    # because arity >= 2 implies arity >= 1 -- confirm the intended contract.
    def find(filter, &block)
      yield_text = block && block.arity >= 1
      yield_tagged = block && block.arity >= 2

      prog = Progress.new(:search, TaggedText.count, filter)
      @progress[prog]

      re = Filter.to_re(filter)

      index = 0
      TaggedText.inject(0) do |result, t|
        fname = File.basename(t.filename)

        # matches to ranges
        results = []
        t.tagged.scan(re) do |hit|
          match = Regexp.last_match
          # TODO decouple database operations for performance
          results << find_range(t.id, match.begin(0), match.end(0), yield_text)
        end

        if yield_tagged
          results.each do |f|
            tagged_snippet = TextSnippet.new(fname, t.tagged, f[:tagged_begin].to_i, f[:tagged_end].to_i)
            if yield_text
              text_snippet = TextSnippet.new(fname, t.text, f[:src_begin].to_i, f[:src_end].to_i)
              yield text_snippet, tagged_snippet
            else
              yield tagged_snippet
            end
          end
        end
        @progress[prog, (index += 1)]
        result + results.size
      end
    end

    # Returns a hash with the range of one hit.
    # With process_original the assoc records overlapping the tagged range
    # are aggregated into :src_begin, :src_end, :tagged_begin, :tagged_end;
    # otherwise only the given tagged range is returned.
    def find_range(t_id, t_begin, t_end, process_original)
      if process_original
        ds = @db[:assoc].filter(:text_id => t_id).filter{tagged_end >= t_begin}.filter{tagged_begin < t_end}
        ds.select{[{min(:src_begin) => :src_begin},{ max(:src_end) => :src_end}, {min(:tagged_begin) => :tagged_begin}, {max(:tagged_end) => :tagged_end}]}.first
      else
        {:tagged_begin => t_begin, :tagged_end => t_end}
      end
    end

    private

    # Appends one line describing an unmatched word to unlinked.lst.
    # The file is written directly instead of through a shell so that quotes
    # in the word or file name cannot break or alter the command.
    def log_unlinked(filename, tagged_begin, src_last, word)
      File.open('unlinked.lst', 'a') do |log|
        log.puts "#{filename}:#{tagged_begin}:#{src_last} unmatched \"#{word}\""
      end
    end

  end

  SearchStrategy = SlowSearch
end
@@ -0,0 +1,29 @@
1
module Lumix

  # A named section of a text, given by a begin offset and an exclusive end
  # offset, with helpers that return the words around it.
  class TextSnippet
    attr_reader :name, :text, :begin, :end

    # name  - label of the text (e.g. a file name)
    # text  - the complete text
    # first - offset of the first character of the snippet
    # last  - offset just behind the snippet
    def initialize(name, text, first, last)
      @name, @text = name, text
      @begin, @end = first, last
    end

    # The snippet itself with every whitespace run collapsed to one blank.
    def to_s
      cleanup(@text[@begin...@end])
    end

    # Up to +context+ whitespace separated pieces in front of the snippet,
    # taken from a window of context * 10 characters.
    def left(context = 5)
      window_start = [0, @begin - context * 10].max
      pieces = @text[window_start...@begin].split(/\s+/)
      pieces.last(context).join(' ')
    end

    # Up to +context+ whitespace separated pieces behind the snippet, taken
    # from a window of context * 10 characters. A window that starts with
    # whitespace contributes an empty first piece.
    def right(context = 5)
      window_end = [@text.size, @end + context * 10].min
      pieces = @text[@end..window_end].split(/\s+/)
      pieces.first(context).join(' ')
    end

    # Collapses every whitespace run in +txt+ to a single blank.
    def cleanup(txt)
      txt.gsub(/\s+/, ' ')
    end
  end

end
@@ -0,0 +1,108 @@
1
+ $KCODE='UTF-8'
2
+
3
+ require 'cgi'
4
+ require 'soap/wsdlDriver'
5
+ #require 'curb'
6
+ #require 'savon'
7
+ require 'lumix/charset' unless RUBY_ENGINE =~ /maglev/i
8
+
9
# Client for the RACAI TextProcessing SOAP web service plus helpers for
# tagging whole files.
class TextProcessing

  # Language code sent with every request.
  attr_accessor :lang

  def initialize(lang = 'ro')
    @lang = lang
  end

  # Lazily created SOAP driver built from the service's WSDL.
  def rpc
    @rpc ||= SOAP::WSDLDriverFactory.new('http://www.racai.ro/webservices/TextProcessing.asmx?WSDL').create_rpc_driver
  end

  # the core processing routine using the webservice
  # Sends +text+ together with the configured language and returns the
  # processResult of the response.
  def process(text)
    response = rpc.Process(:input => text.to_utf, :lang => lang)
    response.processResult
  end

  # Decodes the HTML entities in +file+ and returns the result.
  # NOTE(review): despite its name the argument is passed to the decoder as
  # text; HTMLEntities is not required in the visible code -- confirm it is
  # loaded elsewhere.
  def cleanup(file)
    @entities ||= HTMLEntities.new
    @entities.decode(file)
  end

  # inserts "tagged" as the second to last part in the filename and as parent folder
  # e.g.
  #   test.txt -> tagged/test.tagged.txt
  # special case when no extension is present:
  #   README -> tagged/README.tagged
  def create_tagged_filename(infile)
    path = infile.split(/\//)

    # take care of the filename...
    components = path.pop.split(/\./)
    position = [1, components.size-1].max
    components.insert position, 'tagged'
    path.push components.join('.')

    # ...and of the path
    path.insert(-2, 'tagged')
    path.join '/'
  end

  # Expands directories into the files below them and returns the unique
  # list of files, without directories and without names containing
  # ".tagged".
  def to_filelist(*files)
    files = files.flatten.map do |filename|
      if File.directory? filename
        Dir.glob File.join(filename, '**/*') # add all files from that directory
      else
        filename
      end
    end.flatten.compact.uniq # make sure every file is only processed once
    files.delete_if { |filename| File.directory?(filename) || filename['.tagged']} # remove remaining folders
  end

  # Processes everything read from standard input and prints the result.
  def process_stdin
    puts process($stdin.read)
  end

  # takes the text from infile and outputs the result into the outfile
  # The directory of the outfile is created when it is missing, because the
  # default outfile lives in a "tagged" folder next to the infile.
  def process_file(infile, outfile = create_tagged_filename(infile))
    result = process(File.read(infile).to_utf)
    out_dir = File.dirname(outfile)
    Dir.mkdir(out_dir) unless File.directory?(out_dir)
    File.open(outfile, 'w') do |out|
      out.write result
    end
  end

end
84
+
85
+
86
+ # process the args if called as main script
87
if __FILE__ == $0
  args = ARGV

  # An optional leading "-lang <code>" selects the processing language.
  if args.first == '-lang'
    args.shift
    tp = TextProcessing.new(args.shift)
  else
    tp = TextProcessing.new
  end

  if args.empty?
    # no files given: process standard input
    tp.process_stdin
  else
    files = tp.to_filelist(args)

    puts "Processing files:"
    files.each do |infile|
      outfile = tp.create_tagged_filename(infile)
      puts "#{infile} -> #{outfile}"
      # files that already have a tagged counterpart are not processed again
      tp.process_file(infile, outfile) unless File.exist?(outfile)
    end
  end
end