lumix 0.0.2-java
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +18 -0
- data/bin/lumix +4 -0
- data/bin/lumix-gui +4 -0
- data/lib/lumix/base.rb +56 -0
- data/lib/lumix/charset.rb +35 -0
- data/lib/lumix/cli.rb +96 -0
- data/lib/lumix/concordancer.rb +254 -0
- data/lib/lumix/corrections.rb +84 -0
- data/lib/lumix/fast_search.rb +91 -0
- data/lib/lumix/filter.rb +89 -0
- data/lib/lumix/gui.rb +148 -0
- data/lib/lumix/lookup.rb +105 -0
- data/lib/lumix/lookup_filter.rb +43 -0
- data/lib/lumix/lookup_search.rb +95 -0
- data/lib/lumix/main.rb +7 -0
- data/lib/lumix/model/base_models.rb +35 -0
- data/lib/lumix/model/maglev_models.rb +42 -0
- data/lib/lumix/model/mock_models.rb +46 -0
- data/lib/lumix/model/sequel_models.rb +53 -0
- data/lib/lumix/proto/lookup.rb +105 -0
- data/lib/lumix/proto/lookup_filter.rb +40 -0
- data/lib/lumix/proto/lookup_search.rb +81 -0
- data/lib/lumix/result_view.rb +93 -0
- data/lib/lumix/schema/001_create_tables.rb +35 -0
- data/lib/lumix/schema/002_categories.rb +28 -0
- data/lib/lumix/schema/003_add_fulltagged.rb +15 -0
- data/lib/lumix/schema/004_create_lookup_tables.rb +44 -0
- data/lib/lumix/slow_search.rb +104 -0
- data/lib/lumix/text_snippet.rb +29 -0
- data/lib/lumix/textprocessing.rb +108 -0
- data/lib/lumix/thread_pool.rb +127 -0
- data/spec/filter_spec.rb +55 -0
- data/spec/lookup_spec.rb +70 -0
- data/spec/text_snippet_spec.rb +55 -0
- metadata +175 -0
@@ -0,0 +1,105 @@
|
|
1
|
+
require 'msgpack'
|
2
|
+
|
3
|
+
module Lumix

  # Column positions inside one token row of Lookup (see Lookup#add_token).
  TEXT_ID = 0
  S_BEGIN = 1
  S_END = 2
  T_BEGIN = 3
  T_END = 4

  # In-memory inverted index over tokens.
  #
  # Every token gets a sequential id (its position in @tokens); @words and
  # @tags map a word / tag string to the list of token ids carrying it.
  # The four tables are persisted with MessagePack, one "<name>.dat" file
  # per table in the current working directory.
  class Lookup

    # Persisted tables, in the order they are loaded and saved.
    TABLES = [:tags, :words, :texts, :tokens]

    # Sets up empty tables, arranges for the index to be saved when the
    # process exits (Ctrl-C is turned into a regular exit so the at_exit
    # hook runs), and loads whatever was stored by a previous run.
    def initialize
      puts "Lookup"
      @tags = {}   # tag => token_id[]
      @words = {}  # word => token_id[]
      @tokens = [] # [text_id, s_begin, s_end, t_begin, t_end]
      @texts = []  # text_id
      Signal.trap('INT'){exit}
      at_exit do
        save
      end
      load
    end

    # Restores every table whose data file exists and marks the index clean.
    # Tables without a data file keep their current (empty) value.
    def load
      @dirty = false
      stored = TABLES.select { |name| File.exist?(data_path(name)) }
      return if stored.empty?
      puts "Loading"
      stored.each { |name| instance_variable_set("@#{name}", load_file(name)) }
    end

    # Writes all tables to disk, but only if something changed since the
    # last load. While saving, #process and #add_token ignore their input.
    def save
      return unless @dirty
      @saving = true
      puts "Saving"
      TABLES.each { |name| save_file name }
      @saving = false
    end

    # Yields the named tables (e.g. with(:tokens, :words) { |t, w| ... }).
    # A table that is not in memory yet is read from its data file first.
    def with(*types)
      args = types.flatten.map do |name|
        ivar = "@#{name}"
        instance_variable_get(ivar) || instance_variable_set(ivar, load_file(name))
      end
      yield(*args)
    end

    # Serializes the table called +name+ into "<name>.dat".
    def save_file(name)
      data = instance_variable_get("@#{name}")
      # binary mode: MessagePack output is not text
      File.open(data_path(name), 'wb') do |f|
        f.write MessagePack.pack(data)
      end
    end

    # Reads "<name>.dat" and returns the unpacked table.
    def load_file(name)
      MessagePack.unpack(File.open(data_path(name), 'rb') { |f| f.read })
    end

    # Registers +text_id+ and runs the block that is expected to add its
    # tokens. Returns nil while a save is in progress, true when the text
    # is already known (the block is not run), otherwise the block's value.
    def process(text_id)
      return if @saving
      return true if @texts.member?(text_id)
      @dirty = true # only a newly added text changes the index
      @texts << text_id

      yield if block_given?
    end

    # Appends one token row and indexes it under its word and its tag.
    # Ignored while a save is in progress.
    def add_token(text_id, word, tag, s_begin, s_end, t_begin, t_end)
      return if @saving
      @dirty = true
      id = (@tokens << [text_id, s_begin, s_end, t_begin, t_end]).size - 1
      (@words[word] ||= []) << id
      (@tags[tag] ||= []) << id
    end

    # Token ids whose word matches +re+.
    def find_word(re)
      find_ids @words, re
    end

    # Token ids whose tag matches +re+.
    def find_tag(re)
      find_ids @tags, re
    end

    # returns the start indices of matching sequences
    #
    # The i-th id set holds candidates for the i-th position of a sequence.
    # Each set is shifted back by its position and the shifted sets are
    # intersected, so an id survives only if id, id+1, ... are all present.
    # Without any set there is nothing to match and [] is returned.
    def union(*id_sets)
      return [] if id_sets.empty?
      shifted = id_sets.each_with_index.map { |ids, offset| ids.map { |id| id - offset } }
      shifted.inject(:&)
    end

    # Combines the rows of the first and last token of a sequence into
    # [text_id, s_begin, s_end, t_begin, t_end]. Returns nil when either
    # token id is unknown or the two tokens belong to different texts.
    def text_range(t_begin, t_end)
      a, b = @tokens[t_begin], @tokens[t_end]
      return nil unless a && b && a[TEXT_ID] == b[TEXT_ID]
      [a[TEXT_ID], a[S_BEGIN], b[S_END], a[T_BEGIN], b[T_END]]
    end

    private

    # File that stores the table called +name+.
    def data_path(name)
      "#{name}.dat"
    end

    # Collects the ids of every key of +arr+ (a key => ids hash) matching +re+.
    def find_ids(arr, re)
      arr.keys.grep(re).map { |key| arr[key] }.flatten
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Lumix
  # A search filter for a Lookup index.
  #
  # The filter string is a whitespace separated sequence of terms. A term in
  # double quotes ("...") is matched against words, any other term against
  # tags. In a term, * stands for any run of non-space characters and ? for
  # a single one; the rest of the term is handed to Regexp.new unchanged.
  class LookupFilter

    attr_reader :results, :filter

    # filter      - the filter string
    # result_proc - optional block that receives every result pushed with #<<
    def initialize(filter, &result_proc)
      @filter = filter
      @result_proc = result_proc

      @re = create_re(filter)
      @results = 0
    end

    # Counts one result and forwards it (splatted) to the result block.
    def <<(result)
      @results += 1
      @result_proc[*result] if @result_proc
    end

    # Looks up every term in +lookup+, keeps the token ids where all terms
    # match consecutive tokens, and calls the block with the text range of
    # each such sequence. Sequences for which the lookup has no range
    # (text_range returned nil) are skipped. An empty filter matches nothing.
    def apply(lookup, &block)
      return [] if @re.empty?

      id_sets = @re.map do |(type, re)|
        lookup.send("find_#{type}", re)
      end
      # tolerate a lookup whose union yields nil instead of an empty list
      starts = lookup.union(*id_sets) || []
      starts.each do |id|
        range = lookup.text_range(id, id + @re.size - 1) # TODO make more dynamic
        block[*range] if block && range
      end
    end

    # Splits the filter string into [[:word | :tag, Regexp], ...].
    def create_re(filter)
      # one match per term: either a quoted word or a bare tag
      filter.scan(/"([^"]+)"|(\S+)/).map do |word, tag|
        word ? [:word, to_re(word)] : [:tag, to_re(tag)]
      end
    end

    # Turns one term into a regexp that has to match a whole key.
    # Whitespace inside a quoted term becomes "_".
    def to_re(txt)
      Regexp.new('\A' + txt.gsub(/\s/, '_').gsub(/\*/, '\S*').gsub(/\?/, '\S') + '\z')
    end

  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'lumix/lookup_filter'
|
2
|
+
require 'lumix/text_snippet'
|
3
|
+
require 'lumix/lookup'
|
4
|
+
|
5
|
+
module Lumix

  # Search strategy backed by the in-memory Lookup index.
  class LookupSearch

    TAGGED = /([^\s\|]+)\|(\S+)/m # Xxx|YYY

    # File that collects one line per word that could not be linked.
    UNLINKED_LOG = 'unlinked.lst'

    # db is accepted for interface compatibility and not used here;
    # progress is stored but not used by the methods of this class.
    def initialize(db, progress)
      @lookup = Lookup.new
      @progress = progress
    end

    # Linking has to happen sequentially with this strategy.
    def concurrent_link?
      false
    end

    # Indexes the text with the given id: every "word|tag" pair of the
    # tagged version is located in the plain text (searching forward from
    # the end of the previous word) and stored as a token.
    #
    # Returns true when all words were linked and nil as soon as a word
    # cannot be found (the miss is reported on STDERR and appended to
    # UNLINKED_LOG). If the lookup decides not to run the block, its return
    # value is passed through. On an exception the lookup is saved before
    # the exception is re-raised.
    def link_text(id)
      ds = TaggedText[id]
      @lookup.process id do
        file, text, tagged = ds.filename, ds.text, ds.tagged

        puts "Linking text #{file}"

        txt_pos = 0
        tagged.scan(TAGGED) do |word, tag|
          tagged_match = Regexp.last_match
          tagged_begin = tagged_match.begin(0)
          tagged_end = tagged_match.end(0)

          # expand "x_y_z" notation to "x y z"
          word_re = Regexp.new(Regexp.escape(word).gsub(/_/, '\s*'))
          # search in place from txt_pos; avoids copying the rest of the text per token
          src_begin = text.index(word_re, txt_pos)
          unless src_begin
            report_unlinked(file, text, word, txt_pos, tagged_begin)
            return nil
          end
          src_end = Regexp.last_match.end(0)
          txt_pos = src_end

          @lookup.add_token(id, word, tag, src_begin, src_end, tagged_begin, tagged_end)
        end
        return true
      end
    rescue => e # TODO remove this crap
      STDERR.puts e
      STDERR.puts e.backtrace
      @lookup.save
      raise
    end

    # Builds a filter for the given filter string; the block receives the results.
    def create_filter(f, &block)
      Lumix::LookupFilter.new(f, &block)
    end

    # Applies every filter to the lookup and pushes a
    # [text snippet, tagged snippet] pair into the filter for each hit.
    # The text record is only fetched again when the text id changes
    # between consecutive hits.
    def find(*filters, &block)
      last_id = -1
      t = nil
      filters.flatten.each do |f|
        f.apply(@lookup) do |text_id, s_begin, s_end, t_begin, t_end|
          t = TaggedText[text_id] if text_id != last_id
          last_id = text_id

          fname = File.basename(t.filename)
          text_snippet = Lumix::TextSnippet.new(fname, t.text, s_begin, s_end)
          tagged_snippet = Lumix::TextSnippet.new(fname, t.tagged, t_begin, t_end)
          f << [text_snippet, tagged_snippet]
        end
      end
    end

    private

    # Reports a word that could not be located in the plain text: message and
    # surrounding text go to STDERR, one line is appended to UNLINKED_LOG.
    # The log is written directly instead of through a shell so that quotes
    # in file names or words cannot break (or inject into) a command line.
    def report_unlinked(file, text, word, txt_pos, tagged_begin)
      STDERR.puts "Could not find match for '#{word}' in text #{file}"
      # clamp at 0: a negative start would index from the end of the text
      STDERR.puts text[[txt_pos - 10, 0].max..(txt_pos + word.size + 10)]
      File.open(UNLINKED_LOG, 'a') do |log|
        log.puts %(#{file}:#{txt_pos}:#{tagged_begin} unmatched "#{word}")
      end
    end

  end

  SearchStrategy = LookupSearch
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
# Extensions to the SWT table widget: rows are kept in a plain Ruby array
# (#data) and handed to the widget lazily through the SetData listener.
class Java::OrgEclipseSwtWidgets::Table

  # data     - array of rows; each row is the list of cell texts
  # tooltips - kept alongside data, one entry per row (not displayed yet)
  attr_accessor :data, :tooltips

  # Initializes the row storage, starts a thread that polls the @dirty flag
  # once per second and adjusts the item count when it is set, and installs
  # the listeners that fill items on demand and redistribute column widths
  # when the table is resized.
  def sweeten(app, opts={}, &block)
    @data = []
    @tooltips = []
    super
    @redraw_thread = Thread.new do
      while !isDisposed
        if @dirty
          @dirty = false
          # NOTE(review): perform is defined elsewhere; presumably it runs the
          # block on the UI thread - confirm.
          perform do
            # rows are only appended while dirty, so adjusting the item count
            # is enough here; clearing items is left to #update
            setItemCount data.size
          end
        end
        sleep 1 # TODO find a better alternative
      end
    end

    # TODO implement tooltips

    # fill an item with the texts of the corresponding data row
    addListener swt::SetData do |e|
      item = e.item
      index = indexOf(item)
      item.setText(Array(data[index]).to_java(:string))
    end

    # Keep the relative width of all columns but the last; the last one is
    # packed. A column without a width gets an equal share, and so does
    # every column while no previous table width is known.
    addListener swt::Resize do |e|
      default_weight = 1.0 / columns.size
      current_width = @old_width
      w = width
      columns[0..-2].each do |c|
        known = c.width != 0 && current_width && current_width != 0
        weight = known ? c.width.to_f / current_width : default_weight
        c.width = w * weight
      end
      columns[columns.size - 1].pack
      @old_width = w
    end
  end

  # Creates one centered column per title and shows header and grid lines.
  # Accepts a single title as well as a list (table.columns = 'A', 'B'
  # arrives here as one array argument, hence the flatten).
  def columns=(*titles)
    titles.flatten.compact.each do |title|
      col = widgets::TableColumn.new(self, swt::CENTER)
      col.setText title
    end

    setHeaderVisible true
    setLinesVisible true
  end

  # Makes columns sortable by clicking their header.
  # sort - true or :all for every column, or a hash of
  #        column title => true for selected columns; nil/false for none.
  def sort=(sort)
    sort = Hash.new(true) if [true, :all].member?(sort)
    return unless sort

    columns.each_with_index do |col, index|
      next unless sort[col.text]
      col.addListener swt::Selection do
        if data
          @data = data.sort_by {|e| e[index] }
          update :clear
        end
      end
    end
  end
end
|
71
|
+
|
72
|
+
# Default style flags for tables.
Sweet::WIDGET_DEFAULTS[:table] = {
  :style => [:border, :virtual, :check]
}

# Table specific hooks registered with Sweet; :custom_code defines the
# methods below.
Sweet::WIDGET_HACKS[Java::OrgEclipseSwtWidgets::Table] = {
  :block_handler => :set_data,
  :custom_code => proc {
    # Adjusts the item count to the number of data rows; with a truthy
    # clear_all all items are cleared as well. Does nothing once the
    # widget is disposed.
    def update(clear_all = false)
      return if isDisposed
      setItemCount data.size
      clearAll if clear_all
    end

    # Appends a row. The cell values are given as arguments; an optional
    # trailing hash may override them with :data and set :tooltips
    # (tooltips default to the row data). Marks the table dirty.
    def add_hit(*args)
      # Hash === x, not x === Hash: the latter is plain equality and never
      # recognised an options hash
      opts = args.last.is_a?(Hash) ? args.pop : {}
      d = opts[:data] || args
      t = opts[:tooltips] || d
      data << d
      tooltips << t
      @dirty = true
    end
  }
}
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# Initial schema: the stored texts and the table associating positions in
# the plain text with positions in the tagged text.
class CreateTables < Sequel::Migration

  def up
    create_table(:texts) do
      primary_key :id
      [:digest, :text, :tagged, :filename, :tagged_filename].each do |name|
        String(name)
      end

      index :digest
    end

    create_table(:assoc) do
      primary_key :id
      Integer :text_id, :references => :texts
      [:position, :src_begin, :src_end, :tagged_begin, :tagged_end].each do |name|
        Integer(name)
      end

      # composite indexes, each led by the owning text
      [:tagged_end, :tagged_begin, :position].each do |column|
        index [:text_id, column]
      end
    end
  end

  # Drops the tables in reverse order of creation.
  def down
    [:assoc, :texts].each { |table| drop_table table }
  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# Adds the self-referencing categories table and links texts to it.
class Categories < Sequel::Migration

  def up
    create_table(:categories) do
      primary_key :id
      Integer :parent_id, :references => :categories
      [:name, :key].each { |column| String(column) }

      index [:parent_id, :id]
    end

    alter_table(:texts) do
      add_column :category_id, Integer, :references => :categories

      add_index [:category_id, :id]
    end
  end

  # Removes the link from texts first, then the categories table.
  def down
    alter_table(:texts) { drop_column :category_id }
    drop_table :categories
  end

end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# Tables for the lookup index: distinct tags, distinct words, and one row
# per token referencing its text, tag and word.
class CreateLookupTables < Sequel::Migration

  def up
    create_table(:tags) do
      primary_key :id
      String :tag

      index :tag, :unique => true
    end

    create_table(:words) do
      primary_key :id
      String :word

      index :word, :unique => true
    end

    create_table(:tokens) do
      primary_key :id
      Integer :text_id, :references => :texts

      Integer :position
      Integer :tag_id, :references => :tags
      Integer :word_id, :references => :words

      [:src_begin, :src_end, :tagged_begin, :tagged_end].each do |column|
        Integer(column)
      end

      index [:text_id, :position], :unique => true
      [:word_id, :tag_id].each { |column| index column }
    end
  end

  # Drops the tables, tokens first since it references the other two.
  def down
    [:tokens, :words, :tags].each { |table| drop_table table }
  end

end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
module Lumix

  # Search strategy that scans the tagged version of every text with a
  # regular expression and maps hits back to the plain text through the
  # :assoc table.
  class SlowSearch
    TAGGED = /([^\s\|]+)\|(\S+)/m # Xxx|YYY

    # db       - database handle; db[:assoc] is used for the associations
    # progress - callable invoked as progress[prog] / progress[prog, count]
    def initialize(db, progress)
      @db = db
      @progress = progress
    end

    # Texts may be linked concurrently with this strategy.
    def concurrent_link?
      true
    end

    # Locates every "word|tag" pair of the tagged text in the plain text
    # (searching forward from the end of the previous word) and inserts one
    # :assoc row per pair. Returns nil without inserting anything as soon
    # as a word cannot be found; the miss is reported on STDERR and
    # appended to unlinked.lst. Exceptions are logged and re-raised.
    def link_text(id)
      t = TaggedText[id]
      text = t.text
      puts "Linking text #{t.filename}"

      src_last = 0
      assoc = []
      t.tagged.scan(TAGGED) do |word, _tag|
        tagged_match = Regexp.last_match
        tagged_begin = tagged_match.begin(0)
        tagged_end = tagged_match.end(0)

        word_re = Regexp.new(Regexp.escape(word).gsub(/_/, '\s*'))
        # search in place from src_last; avoids copying the rest of the text per token
        src_begin = text.index(word_re, src_last)
        unless src_begin
          STDERR.puts "Could not find match for '#{word}' in text #{t.filename}"
          # written directly rather than through a shell, so quotes in the
          # file name or word cannot break (or inject into) a command line
          File.open('unlinked.lst', 'a') do |log|
            log.puts %(#{t.filename}:#{tagged_begin}:#{src_last} unmatched "#{word}")
          end
          return nil
        end
        src_end = Regexp.last_match.end(0)

        src_last = src_end
        # position is the running number of the token within the text
        assoc << {:text_id => id, :position => assoc.size, :src_begin => src_begin, :src_end => src_end, :tagged_begin => tagged_begin, :tagged_end => tagged_end}
      end
      @db[:assoc].multi_insert(assoc)
    rescue => e
      STDERR.puts e
      STDERR.puts e.backtrace
      raise
    end

    # Returns the (memoized) filter object of this strategy.
    def create_filter
      @filter ||= Filter.new('')
    end

    # Scans all texts for +filter+ and returns the number of hits.
    #
    # What the block receives depends on its arity:
    #   |tagged|       - the snippet of the tagged text only; no database
    #                    lookups are needed
    #   |text, tagged| - the snippet of the plain text as well, looked up
    #                    through the :assoc table
    # Without a block (or with a block taking no arguments) hits are only
    # counted.
    def find(filter, &block)
      arity = block ? block.arity : 0
      yield_tagged = arity >= 1
      yield_text = arity >= 2

      prog = Progress.new(:search, TaggedText.count, filter)
      @progress[prog]

      re = Filter.to_re(filter)

      index = 0
      TaggedText.inject(0) do |result, t|
        fname = File.basename(t.filename)

        # matches to ranges
        results = []
        t.tagged.scan(re) do |hit|
          hit_match = Regexp.last_match
          # TODO decouple database operations for performance
          results << find_range(t.id, hit_match.begin(0), hit_match.end(0), yield_text)
        end

        if yield_tagged
          results.each do |f|
            tagged_snippet = TextSnippet.new(fname, t.tagged, f[:tagged_begin].to_i, f[:tagged_end].to_i)
            if yield_text
              text_snippet = TextSnippet.new(fname, t.text, f[:src_begin].to_i, f[:src_end].to_i)
              yield text_snippet, tagged_snippet
            else
              yield tagged_snippet
            end
          end
        end

        @progress[prog, (index += 1)]
        result + results.size
      end
    end

    # Range of a hit as a hash. With process_original the range is widened
    # to whole tokens using the :assoc rows overlapping the hit and also
    # carries :src_begin / :src_end; otherwise only the given tagged range
    # is returned.
    def find_range(t_id, t_begin, t_end, process_original)
      if process_original
        ds = @db[:assoc].filter(:text_id => t_id).filter{tagged_end >= t_begin}.filter{tagged_begin < t_end}
        ds.select{[{min(:src_begin) => :src_begin},{ max(:src_end) => :src_end}, {min(:tagged_begin) => :tagged_begin}, {max(:tagged_end) => :tagged_end}]}.first
      else
        {:tagged_begin => t_begin, :tagged_end => t_end}
      end
    end

  end

  SearchStrategy = SlowSearch
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Lumix

  # A hit inside a text: the half-open character range begin...end of
  # +text+, plus helpers for the words surrounding it.
  class TextSnippet
    attr_reader :name, :text, :begin, :end

    def initialize(name, text, first, last)
      @name, @text = name, text
      @begin, @end = first, last
    end

    # The snippet itself with whitespace runs collapsed.
    def to_s
      snippet = @text[@begin...@end]
      cleanup(snippet)
    end

    # Up to +context+ words preceding the snippet, taken from a window of
    # context * 10 characters before it.
    def left(context = 5)
      window_start = [0, @begin - context * 10].max
      words = @text[window_start...@begin].split(/\s+/)
      words.last(context).join(' ')
    end

    # Up to +context+ pieces following the snippet, taken from a window of
    # context * 10 characters after it (split on whitespace, so a leading
    # separator yields an empty first piece).
    def right(context = 5)
      window_end = [@text.size, @end + context * 10].min
      words = @text[@end..window_end].split(/\s+/)
      words.first(context).join(' ')
    end

    # Collapses every run of whitespace into a single blank.
    def cleanup(txt)
      txt.gsub(/\s+/, ' ')
    end
  end

end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
$KCODE='UTF-8'
|
2
|
+
|
3
|
+
require 'cgi'
|
4
|
+
require 'soap/wsdlDriver'
|
5
|
+
#require 'curb'
|
6
|
+
#require 'savon'
|
7
|
+
require 'lumix/charset' unless RUBY_ENGINE =~ /maglev/i
|
8
|
+
|
9
|
+
# Client for the text processing web service; tags plain text and writes
# the result next to the input files.
class TextProcessing

  # language code sent along with every request
  attr_accessor :lang

  def initialize(lang = 'ro')
    @lang = lang
  end

  # The (memoized) SOAP driver created from the service's WSDL.
  def rpc
    # Thread.current[:rpc] ||= begin
    #   wsdl = SOAP::WSDLDriverFactory.new('http://www.racai.ro/webservices/TextProcessing.asmx?WSDL')
    #   wsdl.create_rpc_driver
    #   Savon::Client.new('http://www.racai.ro/webservices/TextProcessing.asmx?WSDL')
    # end
    @rpc ||= SOAP::WSDLDriverFactory.new('http://www.racai.ro/webservices/TextProcessing.asmx?WSDL').create_rpc_driver
  end

  # the core processing routine using the webservice:
  # sends the text (converted with to_utf) and returns the service's result
  def process(text)
    response = rpc.Process(:input => text.to_utf, :lang => lang)
    response.processResult
    # response = rpc.request(:process) do
    #   soap.body = {:input => text, :lang => lang}
    # end
    # response.to_hash[:process_response][:process_result]
  end

  # Decodes the HTML entities in the given string.
  # NOTE(review): HTMLEntities is not required in this file - confirm it
  # is loaded elsewhere before relying on this method.
  def cleanup(file)
    @entities ||= HTMLEntities.new
    @entities.decode(file)
  end

  # inserts "tagged" as the second to last part in the filename and as parent folder
  # e.g.
  #   test.txt -> tagged/test.tagged.txt
  # special case when no extension is present:
  #   README -> tagged/README.tagged
  def create_tagged_filename(infile)
    path = infile.split(/\//)

    # take care of the filename...
    components = path.pop.split(/\./)
    position = [1, components.size-1].max
    components.insert position, 'tagged'
    path.push components.join('.')

    # ...and of the path
    path.insert(-2, 'tagged')
    path.join '/'
  end

  # Expands directories to the files below them and returns every file
  # once, leaving out directories and anything with ".tagged" in its path.
  def to_filelist(*files)
    files = files.flatten.map do |filename|
      if File.directory? filename
        Dir.glob File.join(filename, '**/*') # add all files from that directory
      else
        filename
      end
    end.flatten.compact.uniq # make sure every file is only processed once
    files.delete_if { |filename| File.directory?(filename) || filename['.tagged']} # remove remaining folders
  end

  # Processes the text read from standard input and prints the result.
  def process_stdin
    puts process($stdin.read)
  end

  # takes the text from infile and outputs the result into the outfile
  # (by default the name given by create_tagged_filename); the folder of
  # the outfile is created if it does not exist yet
  def process_file(infile, outfile = create_tagged_filename(infile))
    result = process(File.read(infile).to_utf)
    target_dir = File.dirname(outfile)
    Dir.mkdir(target_dir) unless File.directory?(target_dir)
    File.open(outfile, 'w') do |out|
      out.write result
    end
  end

end
|
84
|
+
|
85
|
+
|
86
|
+
# process the args if called as main script
|
87
|
+
# Command line entry point, only active when this file is run directly:
#   [-lang LANG] [FILE_OR_DIR ...]
# Without file arguments the text is read from standard input.
if __FILE__ == $0
  args = ARGV

  # an optional leading "-lang LANG" selects the language
  tp =
    if args.first == '-lang'
      args.shift
      TextProcessing.new(args.shift)
    else
      TextProcessing.new
    end

  if args.empty?
    tp.process_stdin
  else
    files = tp.to_filelist(args)

    puts "Processing files:"
    files.each do |infile|
      outfile = tp.create_tagged_filename(infile)
      puts "#{infile} -> #{outfile}"
      # files that already have a tagged counterpart are not processed again
      tp.process_file(infile, outfile) unless File.exist?(outfile)
    end
  end
end
|