lumix 0.0.2-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +18 -0
- data/bin/lumix +4 -0
- data/bin/lumix-gui +4 -0
- data/lib/lumix/base.rb +56 -0
- data/lib/lumix/charset.rb +35 -0
- data/lib/lumix/cli.rb +96 -0
- data/lib/lumix/concordancer.rb +254 -0
- data/lib/lumix/corrections.rb +84 -0
- data/lib/lumix/fast_search.rb +91 -0
- data/lib/lumix/filter.rb +89 -0
- data/lib/lumix/gui.rb +148 -0
- data/lib/lumix/lookup.rb +105 -0
- data/lib/lumix/lookup_filter.rb +43 -0
- data/lib/lumix/lookup_search.rb +95 -0
- data/lib/lumix/main.rb +7 -0
- data/lib/lumix/model/base_models.rb +35 -0
- data/lib/lumix/model/maglev_models.rb +42 -0
- data/lib/lumix/model/mock_models.rb +46 -0
- data/lib/lumix/model/sequel_models.rb +53 -0
- data/lib/lumix/proto/lookup.rb +105 -0
- data/lib/lumix/proto/lookup_filter.rb +40 -0
- data/lib/lumix/proto/lookup_search.rb +81 -0
- data/lib/lumix/result_view.rb +93 -0
- data/lib/lumix/schema/001_create_tables.rb +35 -0
- data/lib/lumix/schema/002_categories.rb +28 -0
- data/lib/lumix/schema/003_add_fulltagged.rb +15 -0
- data/lib/lumix/schema/004_create_lookup_tables.rb +44 -0
- data/lib/lumix/slow_search.rb +104 -0
- data/lib/lumix/text_snippet.rb +29 -0
- data/lib/lumix/textprocessing.rb +108 -0
- data/lib/lumix/thread_pool.rb +127 -0
- data/spec/filter_spec.rb +55 -0
- data/spec/lookup_spec.rb +70 -0
- data/spec/text_snippet_spec.rb +55 -0
- metadata +175 -0
@@ -0,0 +1,105 @@
|
|
1
|
+
require 'msgpack'
|
2
|
+
|
3
|
+
module Lumix

  # Positions of the fields inside one token record, i.e. one entry of the
  # token table kept by Lookup (see Lookup#add_token).
  TEXT_ID = 0
  S_BEGIN = 1
  S_END = 2
  T_BEGIN = 3
  T_END = 4

  # In-memory index over tagged tokens. The four tables are persisted as
  # MessagePack files named "<table>.dat" in the current working directory.
  class Lookup

    # Names of the index tables, in the order they are saved and loaded.
    TABLES = [:tags, :words, :texts, :tokens]

    # Sets up empty tables, arranges for the index to be saved when the
    # process exits and loads previously saved data, if there is any.
    def initialize
      puts "Lookup"
      @tags = {} # tag => token_id[]
      @words = {} # word => token_id[]
      @tokens = [] # :text_id, :s_begin, :s_end, :t_begin, :t_end
      @texts = [] # text_id
      # Turn an interrupt into a regular exit so the at_exit hook still runs.
      Signal.trap('INT') { exit }
      at_exit { save }
      load
    end

    # Replaces the in-memory tables with the saved ones. Nothing is loaded
    # unless all four data files exist, so a partial save is never mixed
    # with fresh tables. Always clears the dirty flag.
    def load
      @dirty = false
      return unless TABLES.all? { |name| File.exist?(data_file(name)) }
      puts "Loading"
      TABLES.each { |name| instance_variable_set("@#{name}", load_file(name)) }
    end

    # Writes all tables to disk if anything changed since the last load or
    # save. While the save is running, #process and #add_token are no-ops.
    def save
      return unless @dirty
      @saving = true
      begin
        puts "Saving"
        TABLES.each { |name| save_file(name) }
        @dirty = false
      ensure
        # never leave the index locked when writing fails
        @saving = false
      end
    end

    # Yields the requested tables, e.g. with(:words, :tokens) { |w, t| ... }.
    # A table that is not in memory is read from its data file first.
    def with(*types)
      args = types.flatten.map do |name|
        instance_variable_get("@#{name}") ||
          instance_variable_set("@#{name}", load_file(name))
      end
      yield(*args)
    end

    # Serializes the table +name+ into "<name>.dat".
    def save_file(name)
      data = instance_variable_get("@#{name}")
      # MessagePack output is binary: avoid newline/encoding translation.
      File.open(data_file(name), 'wb') do |f|
        f.write MessagePack.pack(data)
      end
    end

    # Reads and returns the table stored in "<name>.dat".
    def load_file(name)
      MessagePack.unpack(File.open(data_file(name), 'rb') { |f| f.read })
    end

    # Registers +text_id+ as indexed and runs the block, which is expected
    # to add the tokens of that text.
    #
    # Returns nil while a save is in progress, true when the text is already
    # registered (the block is not run), otherwise the value of the block.
    def process(text_id)
      return if @saving
      return true if @texts.member?(text_id)
      @dirty = true
      @texts << text_id

      yield if block_given?
    end

    # Appends a token record and indexes its id under +word+ and +tag+.
    # Ignored while a save is in progress.
    def add_token(text_id, word, tag, s_begin, s_end, t_begin, t_end)
      return if @saving
      @dirty = true
      id = (@tokens << [text_id, s_begin, s_end, t_begin, t_end]).size - 1
      (@words[word] ||= []) << id
      (@tags[tag] ||= []) << id
    end

    # Ids of all tokens whose word matches +re+.
    def find_word(re)
      find_ids @words, re
    end

    # Ids of all tokens whose tag matches +re+.
    def find_tag(re)
      find_ids @tags, re
    end

    # returns the start indices of matching sequences
    #
    # The n-th id set holds the candidates for the n-th position of the
    # sequence; every set is shifted back by its position and the shifted
    # sets are intersected. Without any id set the result is empty.
    def union(*id_sets)
      aligned = id_sets.each_with_index.map { |ids, offset| ids.map { |id| id - offset } }
      aligned.inject(:&) || []
    end

    # Returns [text_id, s_begin, s_end, t_begin, t_end] spanning the tokens
    # +t_begin+ to +t_end+, or nil when either token id is unknown or the
    # two tokens belong to different texts.
    def text_range(t_begin, t_end)
      first, last = @tokens[t_begin], @tokens[t_end]
      return nil unless first && last
      return nil unless first[TEXT_ID] == last[TEXT_ID]
      [first[TEXT_ID], first[S_BEGIN], last[S_END], first[T_BEGIN], last[T_END]]
    end

    private

    # Token ids of every key of +arr+ (a word or tag table) matching +re+.
    def find_ids(arr, re)
      elems = arr.keys.grep(re)
      elems.map { |e| arr[e] }.flatten
    end

    # File name of the data file backing the table +name+.
    def data_file(name)
      name.to_s + '.dat'
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Lumix

  # A search filter made of whitespace separated terms that have to match
  # consecutive tokens. A term in double quotes is matched against words, a
  # bare term against tags. In both, "*" stands for any run of
  # non-whitespace characters and "?" for a single one.
  class LookupFilter

    # results: number of hits pushed so far; filter: the original filter text
    attr_reader :results, :filter

    # filter::      the filter text
    # result_proc:: optional block that is called for every hit pushed via <<
    def initialize(filter, &result_proc)
      @filter = filter
      @result_proc = result_proc

      @re = create_re(filter)
      @results = 0
    end

    # Counts the hit and hands its elements to the result block, if any.
    def <<(result)
      @results += 1
      @result_proc[*result] if @result_proc
    end

    # Looks up every term in +lookup+, combines the id sets into sequence
    # start ids and calls the block with the text range of every sequence
    # that lies within a single text.
    def apply(lookup, &block)
      id_sets = @re.map do |(type, re)|
        lookup.send("find_#{type}", re)
      end
      # An empty filter has no id sets to combine; treat a nil result as
      # "no hits" instead of failing on nil.each.
      starts = lookup.union(*id_sets) || []
      starts.each do |id|
        range = lookup.text_range(id, id + @re.size - 1) # TODO make more dynamic
        block[*range] if block && range
      end
    end

    # Splits the filter text into [:word, regexp] / [:tag, regexp] pairs:
    # double-quoted terms become word patterns, everything else tag patterns.
    def create_re(filter)
      filter.scan(/(?:(?:\"([^\"]+)\")|(\S+))+/).map do |word, tag|
        word ? [:word, to_re(word)] : [:tag, to_re(tag)]
      end
    end

    # Converts one term into an anchored regexp: whitespace becomes "_",
    # "*" becomes \S* and "?" becomes \S.
    #
    # A term that does not form a valid regexp this way (e.g. a lone "(")
    # is matched literally instead of raising RegexpError; the wildcards
    # keep their meaning.
    def to_re(txt)
      joined = txt.gsub(/\s/, '_')
      begin
        Regexp.new('^' + joined.gsub(/\*/, '\S*').gsub(/\?/, '\S') + '$')
      rescue RegexpError
        literal = Regexp.escape(joined).gsub('\*', '\S*').gsub('\?', '\S')
        Regexp.new('^' + literal + '$')
      end
    end

  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'lumix/lookup_filter'
|
2
|
+
require 'lumix/text_snippet'
|
3
|
+
require 'lumix/lookup'
|
4
|
+
|
5
|
+
module Lumix

  # Search strategy that links texts into a Lookup index and answers
  # queries from it.
  class LookupSearch

    TAGGED = /([^\s\|]+)\|(\S+)/m # Xxx|YYY

    # db::       not used by this strategy
    # progress:: stored in @progress; not used by the methods shown here
    def initialize(db, progress)
      @lookup = Lookup.new
      @progress = progress
    end

    # This strategy reports that texts must not be linked concurrently.
    def concurrent_link?
      false
    end

    # Aligns every word|tag pair of the tagged text +id+ with its position
    # in the plain text and adds it to the lookup.
    #
    # Returns true when the text was linked, nil when a word could not be
    # found in the plain text (the failure is appended to unlinked.lst), or
    # whatever Lookup#process returns when it does not run the block.
    # Any error is printed, the lookup is saved and the error re-raised.
    def link_text(id)
      ds = TaggedText[id]
      @lookup.process id do
        file, text, tagged = ds.filename, ds.text, ds.tagged

        puts "Linking text #{file}"

        txt_pos = 0
        tagged.scan(TAGGED) do |word, tag|
          tagged_begin = $~.begin(0)
          tagged_end = $~.end(0)

          # expand "x_y_z" notation to "x y z"
          word_re = Regexp.new(Regexp.escape(word).gsub(/_/, '\s*'))
          # search from the end of the previous word without copying the
          # remainder of the text for every token
          src_begin = text.index(word_re, txt_pos)
          if src_begin
            src_end = Regexp.last_match.end(0)
            txt_pos = src_end

            @lookup.add_token(id, word, tag, src_begin, src_end, tagged_begin, tagged_end)
          else
            STDERR.puts "Could not find match for '#{word}' in text #{file}"
            # clamp the start: a negative index would count from the end
            STDERR.puts text[[txt_pos - 10, 0].max..(txt_pos + word.size + 10)]
            # Append directly instead of shelling out to echo: file names
            # and words come from the corpus and must not reach a shell.
            File.open('unlinked.lst', 'a') do |log|
              log.puts "#{file}:#{txt_pos}:#{tagged_begin} unmatched \"#{word}\""
            end
            return nil
          end
        end
        return true
      end
    rescue => e # TODO remove this crap
      STDERR.puts e
      STDERR.puts e.backtrace
      @lookup.save
      raise
    end

    # Builds a LookupFilter for the filter text +f+; the block receives
    # every hit pushed into the filter.
    def create_filter(f, &block)
      Lumix::LookupFilter.new(f, &block)
    end

    # Applies every filter to the lookup and pushes a
    # [text snippet, tagged snippet] pair into the filter for each hit.
    # The text record is only fetched again when the text id changes.
    def find(*filters, &block)
      last_id = -1
      t = nil
      filters.flatten.each do |f|
        f.apply(@lookup) do |text_id, s_begin, s_end, t_begin, t_end|
          t = TaggedText[text_id] if text_id != last_id
          last_id = text_id

          fname = File.basename(t.filename)
          text_snippet = Lumix::TextSnippet.new(fname, t.text, s_begin, s_end)
          tagged_snippet = Lumix::TextSnippet.new(fname, t.tagged, t_begin, t_end)
          f << [text_snippet, tagged_snippet]
        end
      end
    end

  end

  SearchStrategy = LookupSearch
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
# Extensions of the SWT table used as result view: rows are kept in +data+
# and handed to the table items on demand through the SetData listener.
class Java::OrgEclipseSwtWidgets::Table

  # data: one entry per row (the cell values); tooltips: one entry per row
  attr_accessor :data, :tooltips

  # Initializes the row storage, starts the background redraw thread and
  # registers the SetData and Resize listeners.
  def sweeten(app, opts={}, &block)
    @data = []
    @tooltips = []
    super
    # Polls once per second and adjusts the item count whenever @dirty was
    # set. NOTE(review): +perform+ is not defined in this file; presumably
    # it runs the block on the UI thread - confirm.
    @redraw_thread = Thread.new do
      while !isDisposed
        if @dirty
          @dirty = false
          perform do
            # The original also ran "clearAll if clear_all" here, but
            # +clear_all+ is not defined in this scope and raised.
            setItemCount data.size unless isDisposed
          end
        end
        sleep 1 # TODO find a better alternative
      end
    end

    # TODO implement tooltips

    # Fill an item with the cell values of its row when it is requested.
    addListener swt::SetData do |e|
      item = e.item
      index = indexOf(item)
      item.setText(Array(data[index]).to_java(:string))
    end

    # Keep the relative column widths when the table is resized; the last
    # column is packed.
    addListener swt::Resize do |e|
      unless columns.size == 0
        default_weight = 1.0 / columns.size
        current_width = @old_width
        w = width
        columns[0..-2].each do |c|
          # Without a usable previous width (first resize, or width 0)
          # there is no ratio to keep: fall back to an even split.
          proportional = c.width != 0 && current_width && current_width != 0
          weight = proportional ? c.width.to_f / current_width : default_weight
          c.width = (w * weight).to_i
        end
        columns[columns.size - 1].pack
        @old_width = w
      end
    end
  end

  # Adds one centered column per title and shows header and grid lines.
  # Accepts a single title, a list of titles or an array of titles
  # (table.columns = 'a', 'b' arrives here as one nested array); nil
  # entries are ignored.
  def columns=(*titles)
    titles.flatten.compact.each do |title|
      col = widgets::TableColumn.new(self, swt::CENTER)
      col.setText title
    end

    setHeaderVisible true
    setLinesVisible true
  end

  # Makes columns sortable by clicking their header.
  # sort:: true or :all for every column, otherwise an object that is
  #        indexed with the column title to decide per column
  def sort=(sort)
    sort = Hash.new(true) if [true, :all].member?(sort)
    return unless sort

    columns.each_with_index do |col, index|
      next unless sort[col.text]

      col.addListener swt::Selection do
        if data
          @data = data.sort_by { |e| e[index] }
          update :clear
        end
      end
    end
  end
end
|
71
|
+
|
72
|
+
# Default style of tables created through Sweet.
Sweet::WIDGET_DEFAULTS[:table] = {
  :style => [:border, :virtual, :check]
}
# Table specific hooks. NOTE(review): how Sweet evaluates :custom_code is
# not visible here; presumably the methods end up on the table widget.
Sweet::WIDGET_HACKS[Java::OrgEclipseSwtWidgets::Table] = {
  :block_handler => :set_data,
  :custom_code => proc {
    # Adjusts the item count to the number of rows; with a truthy
    # +clear_all+ all items are cleared as well. Does nothing once the
    # widget is disposed.
    def update(clear_all = false)
      return if isDisposed
      setItemCount data.size
      clearAll if clear_all
    end

    # Appends a row and marks the table for redraw.
    # The arguments are the cell values; a trailing options hash may
    # override them with :data and set :tooltips (defaults to the data).
    def add_hit(*args)
      # Hash === value, not value === Hash: the latter compares the value
      # with the class itself and never recognised the options hash.
      opts = args.last.is_a?(Hash) ? args.pop : {}
      d = opts[:data] || args
      t = opts[:tooltips] || d
      data << d
      tooltips << t
      @dirty = true
    end
  }
}
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# Migration 001: the imported texts and the per-token alignment table.
class CreateTables < Sequel::Migration

  # Creates the +texts+ and +assoc+ tables together with their indexes.
  def up
    # one row per text
    create_table(:texts) do
      primary_key :id
      column :digest, String
      column :text, String
      column :tagged, String
      column :filename, String
      column :tagged_filename, String

      index :digest
    end

    # one row per token: its offsets in the source and in the tagged text
    create_table(:assoc) do
      primary_key :id
      column :text_id, Integer, :references => :texts
      column :position, Integer
      column :src_begin, Integer
      column :src_end, Integer
      column :tagged_begin, Integer
      column :tagged_end, Integer

      index [:text_id, :tagged_end]
      index [:text_id, :tagged_begin]
      index [:text_id, :position]
    end
  end

  # Drops both tables, the referencing one first.
  def down
    drop_table(:assoc)
    drop_table(:texts)
  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# Migration 002: a category tree and the category reference on texts.
class Categories < Sequel::Migration

  # Creates +categories+ (self-referencing through parent_id) and adds the
  # indexed +category_id+ column to +texts+.
  def up
    create_table(:categories) do
      primary_key :id
      column :parent_id, Integer, :references => :categories
      column :name, String
      column :key, String

      index [:parent_id, :id]
    end

    alter_table(:texts) do
      add_column :category_id, Integer, :references => :categories

      add_index [:category_id, :id]
    end

  end

  # Removes the column from +texts+ before dropping +categories+.
  def down
    alter_table(:texts) do
      drop_column :category_id
    end
    drop_table(:categories)
  end

end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# Migration 004: normalized lookup tables for tags, words and tokens.
class CreateLookupTables < Sequel::Migration

  # Creates +tags+, +words+ and the +tokens+ table referencing both.
  def up
    # every distinct tag once
    create_table(:tags) do
      primary_key :id
      column :tag, String

      index :tag, :unique => true
    end

    # every distinct word once
    create_table(:words) do
      primary_key :id
      column :word, String

      index :word, :unique => true
    end

    # one row per token of a text, unique per (text, position)
    create_table(:tokens) do
      primary_key :id
      column :text_id, Integer, :references => :texts

      column :position, Integer
      column :tag_id, Integer, :references => :tags
      column :word_id, Integer, :references => :words

      column :src_begin, Integer
      column :src_end, Integer
      column :tagged_begin, Integer
      column :tagged_end, Integer

      index [:text_id, :position], :unique => true
      index :word_id
      index :tag_id
    end

  end

  # Drops the tables, the referencing +tokens+ table first.
  def down
    drop_table(:tokens)
    drop_table(:words)
    drop_table(:tags)
  end

end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
module Lumix

  # Search strategy that scans the tagged texts with a regexp and uses the
  # +assoc+ table to map hits back to the plain text.
  class SlowSearch
    TAGGED = /([^\s\|]+)\|(\S+)/m # Xxx|YYY

    # db::       database handle; the :assoc dataset is read and written
    # progress:: called via [] with a Progress object and, later, the
    #            number of texts searched so far
    def initialize(db, progress)
      @db = db
      @progress = progress
    end

    # This strategy reports that texts may be linked concurrently.
    def concurrent_link?
      true
    end

    # Aligns every word|tag pair of the tagged text +id+ with its position
    # in the plain text and inserts the collected rows into +assoc+.
    #
    # Returns nil (inserting nothing) when a word cannot be found in the
    # plain text; the failure is appended to unlinked.lst. Errors are
    # printed and re-raised.
    def link_text(id)
      t = TaggedText[id]
      text = t.text
      puts "Linking text #{t.filename}"

      src_last = 0
      position = 0
      assoc = []
      t.tagged.scan(TAGGED) do |word, tag|
        tagged_begin = $~.begin(0)
        tagged_end = $~.end(0)

        # "x_y_z" in the tagged text stands for "x y z" in the source
        word_re = Regexp.new(Regexp.escape(word).gsub(/_/, '\s*'))
        # search from the end of the previous word without copying the
        # remainder of the text for every token
        src_begin = text.index(word_re, src_last)
        if src_begin
          src_end = Regexp.last_match.end(0)

          src_last = src_end
          assoc << {:text_id => id, :position => position, :src_begin => src_begin, :src_end => src_end, :tagged_begin => tagged_begin, :tagged_end => tagged_end}
        else
          STDERR.puts "Could not find match for '#{word}' in text #{t.filename}"
          # Append directly instead of shelling out to echo: file names and
          # words come from the corpus and must not reach a shell.
          File.open('unlinked.lst', 'a') do |log|
            log.puts "#{t.filename}:#{tagged_begin}:#{src_last} unmatched \"#{word}\""
          end
          return nil
        end
        position += 1
      end
      @db[:assoc].multi_insert(assoc)
    rescue => e
      STDERR.puts e
      STDERR.puts e.backtrace
      raise
    end

    # Returns the shared (memoized) empty Filter.
    def create_filter
      @filter ||= Filter.new('')
    end

    # Searches all tagged texts for +filter+ and returns the number of hits.
    #
    # What the block receives depends on its arity: a one-argument block
    # gets the tagged snippet of each hit, a two-argument block gets the
    # text snippet and the tagged snippet (this needs one +assoc+ query per
    # hit). Blocks of any other arity are not called. The arity checks were
    # swapped originally, so one-argument blocks were never called.
    def find(filter, &block)
      arity = block ? block.arity : 0
      yield_tagged = arity >= 1
      yield_text = arity >= 2

      prog = Progress.new(:search, TaggedText.count, filter)
      @progress[prog]

      re = Filter.to_re(filter)

      index = 0
      TaggedText.inject(0) do |result, t|
        fname = File.basename(t.filename)

        # matches to ranges
        results = []
        t.tagged.scan(re) do |hit|
          t_begin = $~.begin(0)
          t_end = $~.end(0)
          # TODO decouple database operations for performance
          results << find_range(t.id, t_begin, t_end, yield_text)
        end

        results.each do |f|
          next unless yield_tagged
          tagged_snippet = TextSnippet.new(fname, t.tagged, f[:tagged_begin].to_i, f[:tagged_end].to_i)
          if yield_text
            text_snippet = TextSnippet.new(fname, t.text, f[:src_begin].to_i, f[:src_end].to_i)
            yield text_snippet, tagged_snippet
          else
            yield tagged_snippet
          end
        end

        @progress[prog, (index += 1)]
        result + results.size
      end
    end

    # Returns a hash describing the hit between the tagged offsets
    # +t_begin+ and +t_end+ of text +t_id+. With +process_original+ the
    # range is widened to whole tokens via +assoc+ and also carries
    # :src_begin / :src_end; otherwise only the given tagged offsets are
    # returned.
    def find_range(t_id, t_begin, t_end, process_original)
      if process_original
        ds = @db[:assoc].filter(:text_id => t_id).filter{tagged_end >= t_begin}.filter{tagged_begin < t_end}
        ds.select{[{min(:src_begin) => :src_begin},{ max(:src_end) => :src_end}, {min(:tagged_begin) => :tagged_begin}, {max(:tagged_end) => :tagged_end}]}.first
      else
        {:tagged_begin => t_begin, :tagged_end => t_end}
      end
    end

  end

  SearchStrategy = SlowSearch
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Lumix

  # A hit inside a text: the character range [begin, end) of +text+,
  # labelled with the +name+ of the text it was found in.
  class TextSnippet
    attr_reader :name, :text, :begin, :end

    # name::  label of the text (callers pass a file name)
    # text::  the complete text
    # first:: offset of the first character of the hit
    # last::  offset just behind the hit
    def initialize(name, text, first, last)
      @name, @text = name, text
      @begin, @end = first, last
    end

    # The hit itself with every run of whitespace collapsed to one blank.
    def to_s
      hit = @text[@begin...@end]
      cleanup(hit)
    end

    # Up to +context+ whitespace separated words directly in front of the
    # hit, taken from a window of at most context * 10 characters.
    def left(context = 5)
      window_start = @begin - context * 10
      window_start = 0 if window_start < 0
      words = @text[window_start...@begin].split(/\s+/)
      words.last(context).join(' ')
    end

    # Up to +context+ whitespace separated pieces directly behind the hit,
    # taken from a window reaching to offset end + context * 10 (inclusive)
    # or to the end of the text.
    def right(context = 5)
      window_end = @end + context * 10
      window_end = @text.size if window_end > @text.size
      words = @text[@end..window_end].split(/\s+/)
      words.first(context).join(' ')
    end

    # Collapses every run of whitespace in +txt+ to a single blank.
    def cleanup(txt)
      txt.gsub(/\s+/, ' ')
    end
  end

end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
$KCODE='UTF-8'
|
2
|
+
|
3
|
+
require 'cgi'
|
4
|
+
require 'soap/wsdlDriver'
|
5
|
+
#require 'curb'
|
6
|
+
#require 'savon'
|
7
|
+
require 'lumix/charset' unless RUBY_ENGINE =~ /maglev/i
|
8
|
+
|
9
|
+
# Client for the TextProcessing web service at www.racai.ro plus the file
# handling needed to run whole files and directories through it.
class TextProcessing

  # language code that is sent along with every request
  attr_accessor :lang

  def initialize(lang = 'ro')
    @lang = lang
  end

  # The SOAP driver for the web service; created from the WSDL on first use
  # and reused afterwards.
  def rpc
    @rpc ||= SOAP::WSDLDriverFactory.new('http://www.racai.ro/webservices/TextProcessing.asmx?WSDL').create_rpc_driver
  end

  # the core processing routing using the webservice
  #
  # Sends +text+ (converted with to_utf) and the language to the service
  # and returns the processResult of the response.
  def process(text)
    response = rpc.Process(:input => text.to_utf, :lang => lang)
    response.processResult
  end

  # Decodes the HTML entities in the argument and returns the result.
  # The original called decode without any argument, which cannot work.
  # NOTE(review): the parameter is named +file+ but is handed to
  # HTMLEntities#decode as text - confirm whether callers pass content or
  # a path. HTMLEntities is not required by this file.
  def cleanup(file)
    @entities ||= HTMLEntities.new
    @entities.decode(file)
  end

  # inserts "tagged" as the second to last part in the filename and as parent folder
  # e.g.
  #   test.txt -> tagged/test.tagged.txt
  # special case when no extension is present:
  #   README -> tagged/README.tagged
  def create_tagged_filename(infile)
    path = infile.split(/\//)

    # take care of the filename...
    components = path.pop.split(/\./)
    position = [1, components.size-1].max
    components.insert position, 'tagged'
    path.push components.join('.')

    # ...and of the path
    path.insert -2, 'tagged'
    path.join '/'
  end

  # Expands the given files and directories into a flat list of files.
  # Directories are searched recursively; duplicates, directories and every
  # path containing ".tagged" are left out.
  def to_filelist(*files)
    files = files.flatten.map do |filename|
      if File.directory? filename
        Dir.glob File.join(filename, '**/*') # add all files from that directory
      else
        filename
      end
    end.flatten.compact.uniq # make sure every file is only processed once
    files.delete_if { |filename| File.directory?(filename) || filename['.tagged']} # remove remaining folders
  end

  # Processes everything read from standard input and prints the result.
  def process_stdin
    puts process($stdin.read)
  end

  # takes the text from infile and outputs the result into the outfile
  #
  # The directory of +outfile+ is created when it is missing, because the
  # default target lives in a "tagged" folder next to the input file.
  def process_file(infile, outfile = create_tagged_filename(infile))
    # the original read the undefined local +file+ here
    result = process(File.read(infile).to_utf)
    target_dir = File.dirname(outfile)
    Dir.mkdir(target_dir) unless File.directory?(target_dir)
    File.open(outfile, 'w') do |out|
      out.write result
    end
  end

end
|
84
|
+
|
85
|
+
|
86
|
+
# process the args if called as main script
|
87
|
+
# Command line entry point, only active when this file is run directly:
#   [-lang LANG] [FILE_OR_DIRECTORY ...]
# Without files the text is read from standard input; otherwise every file
# that has no tagged counterpart yet is processed.
if __FILE__ == $0
  args = ARGV

  # an optional leading "-lang LANG" selects the language
  lang_given = args.first == '-lang'
  args.shift if lang_given
  tp = lang_given ? TextProcessing.new(args.shift) : TextProcessing.new

  if args.empty?
    tp.process_stdin
  else
    files = tp.to_filelist(args)

    puts "Processing files:"
    files.each do |infile|
      outfile = tp.create_tagged_filename(infile)
      puts "#{infile} -> #{outfile}"
      # existing results are never overwritten
      tp.process_file(infile, outfile) unless File.exist?(outfile)
    end
  end
end
|