rbbt-text 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rbbt/corpus/corpus.rb +15 -6
- data/lib/rbbt/corpus/document.rb +100 -127
- data/lib/rbbt/corpus/document_repo.rb +72 -51
- data/lib/rbbt/ner/NER.rb +4 -4
- data/lib/rbbt/ner/abner.rb +5 -4
- data/lib/rbbt/ner/banner.rb +3 -3
- data/lib/rbbt/ner/chemical_tagger.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +45 -10
- data/lib/rbbt/ner/oscar3.rb +3 -3
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +15 -13
- data/lib/rbbt/ner/regexpNER.rb +3 -2
- data/lib/rbbt/ner/rnorm.rb +2 -2
- data/lib/rbbt/ner/rnorm/cue_index.rb +2 -2
- data/lib/rbbt/ner/{annotations.rb → segment.rb} +161 -109
- data/lib/rbbt/ner/{annotations → segment}/named_entity.rb +3 -11
- data/lib/rbbt/ner/segment/relationship.rb +20 -0
- data/lib/rbbt/ner/segment/segmented.rb +13 -0
- data/lib/rbbt/ner/segment/token.rb +24 -0
- data/lib/rbbt/ner/{annotations → segment}/transformed.rb +10 -10
- data/lib/rbbt/ner/token_trieNER.rb +30 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
- data/lib/rbbt/nlp/nlp.rb +23 -37
- data/test/rbbt/corpus/test_document.rb +39 -37
- data/test/rbbt/ner/segment/test_named_entity.rb +29 -0
- data/test/rbbt/ner/segment/test_segmented.rb +23 -0
- data/test/rbbt/ner/{annotations → segment}/test_transformed.rb +6 -6
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +15 -1
- data/test/rbbt/ner/test_patterns.rb +11 -12
- data/test/rbbt/ner/test_regexpNER.rb +5 -4
- data/test/rbbt/ner/test_segment.rb +101 -0
- data/test/rbbt/ner/test_token_trieNER.rb +8 -9
- data/test/test_helper.rb +6 -6
- metadata +40 -22
- data/lib/rbbt/ner/annotations/annotated.rb +0 -15
- data/lib/rbbt/ner/annotations/relations.rb +0 -25
- data/lib/rbbt/ner/annotations/token.rb +0 -28
- data/test/rbbt/ner/annotations/test_named_entity.rb +0 -14
- data/test/rbbt/ner/test_annotations.rb +0 -70
data/lib/rbbt/ner/NER.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
require 'rbbt/ner/
|
2
|
-
require 'rbbt/ner/
|
3
|
-
require 'rbbt/ner/
|
1
|
+
require 'rbbt/ner/segment'
|
2
|
+
require 'rbbt/ner/segment/named_entity'
|
3
|
+
require 'rbbt/ner/segment/segmented'
|
4
4
|
|
5
5
|
class NER
|
6
6
|
def entities(text, protect = false, *args)
|
@@ -13,7 +13,7 @@ class NER
|
|
13
13
|
}
|
14
14
|
matches
|
15
15
|
end.flatten
|
16
|
-
when (
|
16
|
+
when (Segmented === text and protect)
|
17
17
|
entities(text.split_segments(true), protect, *args)
|
18
18
|
else
|
19
19
|
match(text, *args)
|
data/lib/rbbt/ner/abner.rb
CHANGED
@@ -1,13 +1,14 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rjb'
|
3
|
-
require 'rbbt/ner/
|
3
|
+
require 'rbbt/ner/segment'
|
4
|
+
require 'rbbt/resource'
|
4
5
|
require 'rbbt/ner/NER'
|
5
6
|
|
6
7
|
# Offers a Ruby interface to the Abner Named Entity Recognition Package
|
7
8
|
# in Java Abner[http://www.cs.wisc.edu/~bsettles/abner/].
|
8
9
|
class Abner < NER
|
9
10
|
|
10
|
-
Rbbt.software.opt.ABNER
|
11
|
+
Rbbt.claim Rbbt.software.opt.ABNER, :install, Rbbt.share.install.software.ABNER.find
|
11
12
|
|
12
13
|
@@JFile = Rjb::import('java.io.File')
|
13
14
|
@@Tagger = Rjb::import('abner.Tagger')
|
@@ -38,9 +39,9 @@ class Abner < NER
|
|
38
39
|
mention = mention.to_s;
|
39
40
|
offset = text.index(mention)
|
40
41
|
if offset.nil?
|
41
|
-
NamedEntity.
|
42
|
+
NamedEntity.setup(mention, nil, type.to_s)
|
42
43
|
else
|
43
|
-
NamedEntity.
|
44
|
+
NamedEntity.setup(mention, offset + global_offset, type.to_s)
|
44
45
|
text = text[offset + mention.length..-1]
|
45
46
|
global_offset += offset + mention.length
|
46
47
|
end
|
data/lib/rbbt/ner/banner.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rjb'
|
3
|
-
require 'rbbt/ner/
|
3
|
+
require 'rbbt/ner/segment'
|
4
4
|
require 'rbbt/ner/NER'
|
5
5
|
|
6
6
|
# Offers a Ruby interface to the Banner Named Entity Recognition Package
|
7
7
|
# in Java. Banner[http://banner.sourceforge.net/].
|
8
8
|
class Banner < NER
|
9
9
|
|
10
|
-
Rbbt.software.opt.BANNER
|
10
|
+
Rbbt.claim Rbbt.software.opt.BANNER, :install, Rbbt.share.install.software.BANNER.find
|
11
11
|
|
12
12
|
@@JFile = Rjb::import('java.io.File')
|
13
13
|
@@SimpleTokenizer = Rjb::import('banner.tokenization.SimpleTokenizer')
|
@@ -68,7 +68,7 @@ class Banner < NER
|
|
68
68
|
mention.sub!(/^\s*/,'')
|
69
69
|
mention.sub!(/\s*$/,'')
|
70
70
|
offset = text.index(mention)
|
71
|
-
NamedEntity.
|
71
|
+
NamedEntity.setup(mention, offset, 'GENE')
|
72
72
|
mention
|
73
73
|
}
|
74
74
|
res
|
data/lib/rbbt/ner/chemical_tagger.rb
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rjb'
|
3
|
-
require 'rbbt/ner/
|
3
|
+
require 'rbbt/ner/segment'
|
4
4
|
require 'rbbt/ner/NER'
|
5
5
|
require 'rbbt/util/log'
|
6
6
|
|
7
7
|
class ChemicalTagger < NER
|
8
|
-
Rbbt.software.opt.ChemicalTagger
|
8
|
+
Rbbt.claim Rbbt.software.opt.ChemicalTagger, :install, Rbbt.share.install.software.ChemicalTagger.find
|
9
9
|
|
10
10
|
Rjb::load(nil, jvmargs = ['-Xms128m','-Xmx2048m'])
|
11
11
|
|
@@ -24,7 +24,7 @@ class ChemicalTagger < NER
|
|
24
24
|
|
25
25
|
matches.collect do |mention|
|
26
26
|
offset = text.index mention
|
27
|
-
NamedEntity.
|
27
|
+
NamedEntity.setup mention, offset, "Chemical Mention", nil, nil
|
28
28
|
end
|
29
29
|
end
|
30
30
|
|
data/lib/rbbt/ner/ngram_prefix_dictionary.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
|
-
require 'rbbt
|
2
|
-
require 'rbbt/util/
|
3
|
-
require 'rbbt/
|
4
|
-
require 'rbbt/ner/
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/util/misc'
|
3
|
+
require 'rbbt/tsv'
|
4
|
+
require 'rbbt/ner/segment'
|
5
|
+
require 'rbbt/ner/segment/token'
|
5
6
|
require 'rbbt/ner/NER'
|
6
7
|
require 'inline'
|
7
8
|
|
@@ -41,9 +42,28 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
|
|
41
42
|
end
|
42
43
|
end
|
43
44
|
|
44
|
-
def self.
|
45
|
+
def self.process_stream(stream)
|
45
46
|
index = {}
|
46
|
-
|
47
|
+
while line = stream.gets
|
48
|
+
names = line.split(/\t|\|/).select{|n| not n.empty?}.compact
|
49
|
+
code = names.shift
|
50
|
+
|
51
|
+
names.each do |name|
|
52
|
+
ngram = name[0..2].strip
|
53
|
+
index[ngram] ||= []
|
54
|
+
index[ngram] << [name, code]
|
55
|
+
end
|
56
|
+
end
|
57
|
+
index
|
58
|
+
|
59
|
+
end
|
60
|
+
|
61
|
+
def self.process_hash(hash)
|
62
|
+
index = {}
|
63
|
+
hash.monitor = true if hash.respond_to? :monitor
|
64
|
+
hash.unnamed = true if hash.respond_to? :unnamed
|
65
|
+
method = hash.respond_to?(:through)? :through : :each
|
66
|
+
hash.send(method) do |code, names|
|
47
67
|
names.each do |name|
|
48
68
|
ngram = name[0..2].strip
|
49
69
|
index[ngram] ||= []
|
@@ -94,15 +114,30 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
|
|
94
114
|
|
95
115
|
attr_accessor :index, :type
|
96
116
|
def initialize(file, type = nil)
|
97
|
-
tsv = TSV.new(file, :flat)
|
98
117
|
@type = type
|
99
|
-
|
100
|
-
|
118
|
+
case
|
119
|
+
when (TSV === file or Hash === file)
|
120
|
+
Log.debug("Ngram Prefix Dictionary. Loading of lexicon hash started.")
|
121
|
+
@index = NGramPrefixDictionary.process_hash(file)
|
122
|
+
when Path === file
|
123
|
+
Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
|
124
|
+
@index = NGramPrefixDictionary.process_stream(file.open)
|
125
|
+
when Misc.is_filename?(file)
|
126
|
+
Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
|
127
|
+
@index = NGramPrefixDictionary.process_stream(Open.open(file))
|
128
|
+
when StreamIO === file
|
129
|
+
Log.debug("Ngram Prefix Dictionary. Loading of lexicon stream started.")
|
130
|
+
@index = NGramPrefixDictionary.process_stream(file)
|
131
|
+
else
|
132
|
+
raise "Format of lexicon not understood: #{file.inspect}"
|
133
|
+
end
|
134
|
+
|
135
|
+
Log.debug("Ngram Prefix Dictionary. Loading done.")
|
101
136
|
end
|
102
137
|
|
103
138
|
def match(text)
|
104
139
|
NGramPrefixDictionary.match(index, text).collect{|name, code, offset|
|
105
|
-
NamedEntity.
|
140
|
+
NamedEntity.setup(name, offset, type, code)
|
106
141
|
}
|
107
142
|
end
|
108
143
|
end
|
data/lib/rbbt/ner/oscar3.rb
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rjb'
|
3
3
|
require 'libxml'
|
4
|
-
require 'rbbt/ner/
|
4
|
+
require 'rbbt/ner/segment'
|
5
5
|
require 'rbbt/ner/NER'
|
6
6
|
require 'rbbt/util/log'
|
7
7
|
|
8
8
|
class OSCAR3 < NER
|
9
|
-
Rbbt.software.opt.OSCAR3
|
9
|
+
Rbbt.claim Rbbt.software.opt.OSCAR3, :install, Rbbt.share.install.software.OSCAR3.find
|
10
10
|
|
11
11
|
@@TextToSciXML = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
|
12
12
|
@@ProcessingDocumentFactory = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
|
@@ -50,7 +50,7 @@ class OSCAR3 < NER
|
|
50
50
|
next unless type.nil? or type.include? mention_type
|
51
51
|
score = memm ? entities.get(key).to_string.to_f : nil
|
52
52
|
|
53
|
-
NamedEntity.
|
53
|
+
NamedEntity.setup mention, rstart.to_i + offset, mention_type, nil, score
|
54
54
|
|
55
55
|
mentions << mention unless mentions.collect{|m| m.to_s}.include? mention.to_s
|
56
56
|
end
|
data/lib/rbbt/ner/oscar4.rb
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rjb'
|
3
3
|
require 'libxml'
|
4
|
-
require 'rbbt/ner/
|
4
|
+
require 'rbbt/ner/segment'
|
5
5
|
require 'rbbt/ner/NER'
|
6
6
|
require 'rbbt/util/log'
|
7
7
|
|
8
8
|
class OSCAR4 < NER
|
9
|
-
Rbbt.software.opt.OSCAR4
|
9
|
+
Rbbt.claim Rbbt.software.opt.OSCAR4, :install, Rbbt.share.install.software.OSCAR4.find
|
10
10
|
|
11
11
|
Rjb::load(nil, jvmargs = ['-Xms128m','-Xmx2048m'])
|
12
12
|
@@OSCAR = Rjb::import('uk.ac.cam.ch.wwmm.oscar.Oscar')
|
@@ -26,7 +26,7 @@ class OSCAR4 < NER
|
|
26
26
|
mention = entity.getSurface
|
27
27
|
result << mention
|
28
28
|
|
29
|
-
NamedEntity.
|
29
|
+
NamedEntity.setup mention, entity.getStart, entity.getType, nil, entity.getNamedEntity.getConfidence
|
30
30
|
end
|
31
31
|
|
32
32
|
result
|
data/lib/rbbt/ner/patterns.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
require 'rbbt/ner/
|
2
|
-
require 'rbbt/ner/
|
3
|
-
require 'rbbt/ner/
|
4
|
-
require 'rbbt/ner/
|
1
|
+
require 'rbbt/ner/segment/named_entity'
|
2
|
+
require 'rbbt/ner/segment/segmented'
|
3
|
+
require 'rbbt/ner/segment/transformed'
|
4
|
+
require 'rbbt/ner/segment/relationship'
|
5
5
|
require 'rbbt/ner/regexpNER'
|
6
6
|
require 'rbbt/ner/token_trieNER'
|
7
7
|
require 'rbbt/nlp/nlp'
|
@@ -12,7 +12,9 @@ class PatternRelExt
|
|
12
12
|
patterns = Array === patterns ? patterns : [patterns]
|
13
13
|
type ||= "Simple Pattern"
|
14
14
|
regexpNER = RegExpNER.new type => patterns.collect{|p| /#{p}/}
|
15
|
-
|
15
|
+
segments = sentence.segments
|
16
|
+
segments = segments.values.flatten if Hash === segments
|
17
|
+
Transformed.with_transform(sentence, segments, Proc.new{|s| s.type.to_s.upcase}) do |sentence|
|
16
18
|
regexpNER.entities(sentence)
|
17
19
|
end
|
18
20
|
end
|
@@ -23,23 +25,23 @@ class PatternRelExt
|
|
23
25
|
when key =~ /(.*)\[entity:(.*)\]/
|
24
26
|
chunk_type, chunk_value = $1, $2
|
25
27
|
annotation_types = chunk_value.split(",")
|
26
|
-
Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
|
27
|
-
((Hash === chunk.
|
28
|
+
Proc.new{|chunk| (chunk_type == "all" or (Array === chunk.type ? chunk.type.include?(chunk_type) : chunk.type == chunk_type)) and
|
29
|
+
((Hash === chunk.segments ? chunk.segments.values.flatten : chunk.segments).flatten.select{|a| NamedEntity === a}.collect{|a| a.type.to_s}.flatten & annotation_types).any? }
|
28
30
|
|
29
31
|
when key =~ /(.*)\[code:(.*)\]/
|
30
32
|
chunk_type, chunk_value = $1, $2
|
31
33
|
annotation_codes = chunk_value.split(",")
|
32
|
-
Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
|
33
|
-
((Hash === chunk.
|
34
|
+
Proc.new{|chunk| (chunk_type == "all" or (Array === chunk.type ? chunk.type.include?(chunk_type) : chunk.type == chunk_type)) and
|
35
|
+
((Hash === chunk.segments ? chunk.segments.values.flatten : chunk.segments).select{|a| NamedEntity === a}.collect{|a| a.code}.flatten & annotation_codes).any? }
|
34
36
|
|
35
37
|
when key =~ /(.*)\[stem:(.*)\]/
|
36
38
|
chunk_type, chunk_value = $1, $2
|
37
|
-
Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
|
39
|
+
Proc.new{|chunk| (chunk_type == "all" or (Array === chunk.type ? chunk.type.include?(chunk_type) : chunk.type == chunk_type)) and
|
38
40
|
chunk.split(/\s+/).select{|w| w.stem == chunk_value.stem}.any?}
|
39
41
|
|
40
42
|
when key =~ /(.*)\[(.*)\]/
|
41
43
|
chunk_type, chunk_value = $1, $2
|
42
|
-
Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
|
44
|
+
Proc.new{|chunk| (chunk_type == "all" or (Array === chunk.type ? chunk.type.include?(chunk_type) : chunk.type == chunk_type)) and
|
43
45
|
chunk.parts.values.select{|a| a == chunk_value}.any?}
|
44
46
|
|
45
47
|
else
|
@@ -120,9 +122,9 @@ class PatternRelExt
|
|
120
122
|
sentence_chunks = NLP.gdep_chunk_sentences(sentences)
|
121
123
|
|
122
124
|
sentences.zip(sentence_chunks).collect do |sentence, chunks|
|
123
|
-
annotation_index = Segment.index(sentence.
|
125
|
+
annotation_index = Segment.index(sentence.segments)
|
124
126
|
chunks.each do |chunk|
|
125
|
-
|
127
|
+
Segmented.setup(chunk, annotation_index[chunk.range])
|
126
128
|
end
|
127
129
|
|
128
130
|
match_chunks(chunks)
|
data/lib/rbbt/ner/regexpNER.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require 'rbbt/ner/
|
1
|
+
require 'rbbt/ner/segment'
|
2
2
|
require 'rbbt/ner/NER'
|
3
3
|
require 'rbbt/util/simpleDSL'
|
4
4
|
|
@@ -23,7 +23,7 @@ class RegExpNER < NER
|
|
23
23
|
end
|
24
24
|
|
25
25
|
if match and not match.empty?
|
26
|
-
NamedEntity.
|
26
|
+
NamedEntity.setup(match, start + pre.length, type)
|
27
27
|
matches << match
|
28
28
|
end
|
29
29
|
|
@@ -86,6 +86,7 @@ class RegExpNER < NER
|
|
86
86
|
|
87
87
|
def match(text)
|
88
88
|
matches = RegExpNER.match_regexp_hash(text, @regexps)
|
89
|
+
matches
|
89
90
|
end
|
90
91
|
|
91
92
|
end
|
data/lib/rbbt/ner/rnorm.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'rbbt/ner/rnorm/cue_index'
|
2
2
|
require 'rbbt/ner/rnorm/tokens'
|
3
3
|
require 'rbbt/util/open'
|
4
|
-
require 'rbbt/
|
4
|
+
require 'rbbt/tsv'
|
5
5
|
require 'rbbt/sources/entrez'
|
6
6
|
require 'rbbt/bow/bow.rb'
|
7
7
|
|
@@ -89,7 +89,7 @@ class Normalizer
|
|
89
89
|
|
90
90
|
|
91
91
|
def initialize(lexicon, options = {})
|
92
|
-
@synonyms = TSV.
|
92
|
+
@synonyms = TSV.open(lexicon, :flat)
|
93
93
|
|
94
94
|
@index = CueIndex.new
|
95
95
|
@index.load(lexicon, options[:max_candidates])
|
data/lib/rbbt/ner/rnorm/cue_index.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require 'rbbt
|
1
|
+
require 'rbbt'
|
2
2
|
require 'rbbt/util/misc'
|
3
3
|
require 'rbbt/util/simpleDSL'
|
4
4
|
|
@@ -47,7 +47,7 @@ class CueIndex
|
|
47
47
|
|
48
48
|
def load(file, max_candidates = 50)
|
49
49
|
@indexes = Array.new(@rules.size){Hash.new}
|
50
|
-
data = TSV.
|
50
|
+
data = TSV.open(file, :flat)
|
51
51
|
data.each{|code, values|
|
52
52
|
values.each{|value|
|
53
53
|
cues(value).each_with_index{|cue_list,i|
|
data/lib/rbbt/ner/{annotations.rb → segment.rb}
CHANGED
@@ -1,63 +1,59 @@
|
|
1
|
+
require 'rbbt/annotations'
|
2
|
+
require 'rbbt/fix_width_table'
|
3
|
+
|
1
4
|
module Segment
|
2
|
-
|
3
|
-
|
4
|
-
def self.included(base)
|
5
|
-
if base.instance_methods.include? "segment_types"
|
6
|
-
class << base
|
7
|
-
self.module_eval do
|
8
|
-
define_method "extended" do |object|
|
9
|
-
object.segment_types ||= []
|
10
|
-
object.segment_types << self.to_s unless object.segment_types.include? self.to_s
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
5
|
+
extend Annotation
|
6
|
+
self.annotation :offset
|
16
7
|
|
17
|
-
|
18
|
-
string.extend Segment
|
19
|
-
string.offset = offset
|
20
|
-
string.docid = docid
|
21
|
-
string
|
22
|
-
end
|
8
|
+
#{{{ Ranges
|
23
9
|
|
24
|
-
def
|
25
|
-
|
26
|
-
|
10
|
+
def end
|
11
|
+
return nil if offset.nil?
|
12
|
+
offset + length - 1
|
27
13
|
end
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
info = {}
|
33
|
-
singleton_methods.select{|method| method[-1] == equal_ascii}.
|
34
|
-
collect{|m| m[(0..-2)]}.each{|m| info[m] = self.send(m) if self.respond_to?(m) and not SKIP.include? m.to_s}
|
35
|
-
info
|
36
|
-
info.delete_if{|k,v| v.nil?}
|
37
|
-
info
|
14
|
+
|
15
|
+
def range
|
16
|
+
raise "No offset specified" if offset.nil?
|
17
|
+
(offset..self.end)
|
38
18
|
end
|
39
19
|
|
40
|
-
def
|
41
|
-
|
42
|
-
|
43
|
-
|
20
|
+
def pull(offset)
|
21
|
+
if self.offset.nil? or offset.nil?
|
22
|
+
self.offset = nil
|
23
|
+
else
|
24
|
+
self.offset += offset
|
25
|
+
end
|
44
26
|
|
45
|
-
|
46
|
-
|
47
|
-
types.each do |type| string.extend Misc.string2const(type) end
|
27
|
+
self
|
28
|
+
end
|
48
29
|
|
49
|
-
|
50
|
-
|
51
|
-
|
30
|
+
def push(offset)
|
31
|
+
if self.offset.nil? or offset.nil?
|
32
|
+
self.offset = nil
|
33
|
+
else
|
34
|
+
self.offset -= offset
|
52
35
|
end
|
53
36
|
|
54
|
-
|
55
|
-
|
37
|
+
self
|
38
|
+
end
|
56
39
|
|
57
|
-
|
40
|
+
def make_relative(segments)
|
41
|
+
segments.collect{|s| s.push offset}
|
58
42
|
end
|
59
43
|
|
60
|
-
|
44
|
+
def range_in(container = nil)
|
45
|
+
raise "No offset specified" if offset.nil?
|
46
|
+
case
|
47
|
+
when (Segment === container and not container.offset.nil?)
|
48
|
+
((offset - container.offset)..(self.end - container.offset))
|
49
|
+
when Integer === container
|
50
|
+
((offset - container)..(self.end - container))
|
51
|
+
else
|
52
|
+
range
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
#{{{ Sorting
|
61
57
|
|
62
58
|
def self.sort(segments, inline = true)
|
63
59
|
if inline
|
@@ -102,13 +98,15 @@ module Segment
|
|
102
98
|
sorted
|
103
99
|
end
|
104
100
|
|
101
|
+
#{{{ Splitting
|
102
|
+
|
105
103
|
def self.split(text, segments, skip_segments = false)
|
106
104
|
sorted_segments = clean_sort segments
|
107
105
|
|
108
106
|
chunks = []
|
109
107
|
segment_end = 0
|
110
108
|
text_offset = 0
|
111
|
-
sorted_segments.
|
109
|
+
sorted_segments.each do |segment|
|
112
110
|
return chunks if text.nil? or text.empty?
|
113
111
|
next if segment.offset.nil?
|
114
112
|
offset = segment.offset - text_offset
|
@@ -119,7 +117,7 @@ module Segment
|
|
119
117
|
next
|
120
118
|
when offset > 0 # Save pre
|
121
119
|
chunk = text[0..offset - 1]
|
122
|
-
Segment.
|
120
|
+
Segment.setup(chunk, text_offset)
|
123
121
|
chunks << chunk
|
124
122
|
end
|
125
123
|
|
@@ -127,7 +125,7 @@ module Segment
|
|
127
125
|
|
128
126
|
if not skip_segments
|
129
127
|
chunk = text[offset..segment_end]
|
130
|
-
Segment.
|
128
|
+
Segment.setup(chunk, text_offset + offset)
|
131
129
|
chunks << chunk
|
132
130
|
end
|
133
131
|
|
@@ -138,72 +136,29 @@ module Segment
|
|
138
136
|
|
139
137
|
if not text.nil? and text.any?
|
140
138
|
chunk = text.dup
|
141
|
-
Segment.
|
139
|
+
Segment.setup(chunk, text_offset)
|
142
140
|
chunks << chunk
|
143
141
|
end
|
144
142
|
|
145
143
|
chunks
|
146
144
|
end
|
147
145
|
|
148
|
-
# {{{ Ranges and manipulation
|
149
146
|
|
150
|
-
|
151
|
-
if self.offset.nil? or offset.nil?
|
152
|
-
self.offset = nil
|
153
|
-
else
|
154
|
-
self.offset += offset
|
155
|
-
end
|
156
|
-
|
157
|
-
self
|
158
|
-
end
|
159
|
-
|
160
|
-
def push(offset)
|
161
|
-
if self.offset.nil? or offset.nil?
|
162
|
-
self.offset = nil
|
163
|
-
else
|
164
|
-
self.offset -= offset
|
165
|
-
end
|
166
|
-
|
167
|
-
self
|
168
|
-
end
|
169
|
-
|
170
|
-
def make_relative(segments)
|
171
|
-
segments.collect{|s| s.push offset}
|
172
|
-
end
|
173
|
-
|
174
|
-
def end
|
175
|
-
return nil if offset.nil?
|
176
|
-
offset + length - 1
|
177
|
-
end
|
178
|
-
|
179
|
-
def range
|
180
|
-
raise "No offset specified" if offset.nil?
|
181
|
-
(offset..self.end)
|
182
|
-
end
|
183
|
-
|
184
|
-
def range_in(container = nil)
|
185
|
-
raise "No offset specified" if offset.nil?
|
186
|
-
case
|
187
|
-
when (Segment === container and not container.offset.nil?)
|
188
|
-
((offset - container.offset)..(self.end - container.offset))
|
189
|
-
when Integer === container
|
190
|
-
((offset - container)..(self.end - container))
|
191
|
-
else
|
192
|
-
range
|
193
|
-
end
|
194
|
-
end
|
147
|
+
#{{{ Align
|
195
148
|
|
196
149
|
def self.align(text, parts)
|
197
150
|
pre_offset = 0
|
198
151
|
parts.each do |part|
|
199
152
|
offset = text.index part
|
200
153
|
next if offset.nil?
|
201
|
-
Segment.
|
154
|
+
Segment.setup(part, pre_offset + offset)
|
202
155
|
pre_offset += offset + part.length - 1
|
203
156
|
text = text[(offset + part.length - 1)..-1]
|
204
157
|
end
|
205
158
|
end
|
206
159
|
|
160
|
+
#{{{ Index
|
161
|
+
|
207
162
|
class Index
|
208
163
|
attr_accessor :index, :data
|
209
164
|
def initialize(index, data)
|
@@ -216,12 +171,11 @@ module Segment
|
|
216
171
|
end
|
217
172
|
end
|
218
173
|
|
219
|
-
def self.index(segments,
|
220
|
-
|
174
|
+
def self.index(segments, persist_file = :memory)
|
221
175
|
segments = segments.values.flatten if Hash === segments
|
222
176
|
|
223
177
|
annotation_index =
|
224
|
-
|
178
|
+
Persist.persist("Segment_index", :fwt, :persist => (! (persist_file.nil? or persist_file == :memory)), :file => persist_file) do
|
225
179
|
|
226
180
|
value_size = 0
|
227
181
|
index_data = segments.collect{|segment|
|
@@ -233,6 +187,7 @@ module Segment
|
|
233
187
|
|
234
188
|
fwt = FixWidthTable.get :memory, value_size, true
|
235
189
|
fwt.add_range index_data
|
190
|
+
|
236
191
|
fwt
|
237
192
|
end
|
238
193
|
|
@@ -241,14 +196,111 @@ module Segment
|
|
241
196
|
Index.new annotation_index, data
|
242
197
|
end
|
243
198
|
|
244
|
-
|
199
|
+
#{{{ Save and load
|
200
|
+
|
201
|
+
def self.tsv_values_for_segment(segment, fields)
|
202
|
+
info = segment.info
|
203
|
+
values = []
|
204
|
+
|
205
|
+
fields.each do |field|
|
206
|
+
values << case
|
207
|
+
when field == "JSON"
|
208
|
+
info.to_json
|
209
|
+
when field == "literal"
|
210
|
+
segment.gsub(/\n|\t/, ' ')
|
211
|
+
when field == "Start"
|
212
|
+
segment.offset
|
213
|
+
when field == "End"
|
214
|
+
segment.end
|
215
|
+
else
|
216
|
+
info.delete(field.to_sym)
|
217
|
+
end
|
218
|
+
end
|
219
|
+
|
220
|
+
values
|
221
|
+
end
|
222
|
+
|
223
|
+
def self.load_tsv_values(text, values, fields)
|
224
|
+
info = {}
|
225
|
+
literal_pos = fields.index "literal"
|
226
|
+
|
227
|
+
object = if literal_pos.nil?
|
228
|
+
""
|
229
|
+
else
|
230
|
+
v = values[literal_pos]
|
231
|
+
v = v.first if Array === v
|
232
|
+
v
|
233
|
+
end
|
234
|
+
|
235
|
+
fields.each_with_index do |field, i|
|
236
|
+
if field == "JSON"
|
237
|
+
JSON.parse(values[i]).each do |key, value|
|
238
|
+
info[key.to_sym] = value
|
239
|
+
end
|
240
|
+
else
|
241
|
+
info[field.to_sym] = values[i]
|
242
|
+
end
|
243
|
+
end
|
244
|
+
|
245
|
+
start = info.delete(:Start)
|
246
|
+
if not (start.nil? or ((Array === start or String === start) and start.empty?))
|
247
|
+
if Array === start
|
248
|
+
start = start.first
|
249
|
+
end
|
250
|
+
start = start.to_i
|
251
|
+
info[:offset] = start
|
252
|
+
|
253
|
+
eend = info.delete(:End)
|
254
|
+
if Array === eend
|
255
|
+
eend = eend.first
|
256
|
+
end
|
257
|
+
eend = eend.to_i
|
258
|
+
|
259
|
+
if object.empty?
|
260
|
+
object.replace text[start..eend]
|
261
|
+
end
|
262
|
+
end
|
263
|
+
|
264
|
+
info[:annotation_types] = [Segment] unless info.include? :annotation_types
|
245
265
|
|
246
|
-
|
247
|
-
include Segment
|
248
|
-
attr_accessor :comment
|
249
|
-
def self.annotate(text, comment = nil)
|
250
|
-
text.extend Comment
|
251
|
-
text.comment = (comment.nil? ? text : comment)
|
252
|
-
text
|
266
|
+
Annotated.load(object, info)
|
253
267
|
end
|
268
|
+
|
269
|
+
def self.set_tsv_fields(fields, segments)
|
270
|
+
tsv_fields = []
|
271
|
+
add_types = ! (fields.delete(:no_types) || fields.delete("no_types") || fields.include?(:JSON) || fields.include?("JSON"))
|
272
|
+
literal = (fields.delete(:literal) || fields.delete("literal"))
|
273
|
+
tsv_fields << "Start" << "End"
|
274
|
+
tsv_fields << :annotation_types if add_types
|
275
|
+
tsv_fields << :literal if literal
|
276
|
+
|
277
|
+
if fields.any? and not (fields == [:all] or fields == ["all"])
|
278
|
+
tsv_fields.concat fields
|
279
|
+
else
|
280
|
+
tsv_fields.concat segments.first.annotations if segments.any?
|
281
|
+
end
|
282
|
+
tsv_fields
|
283
|
+
tsv_fields.collect!{|f| f.to_s}
|
284
|
+
tsv_fields.delete "offset"
|
285
|
+
tsv_fields
|
286
|
+
end
|
287
|
+
|
288
|
+
def self.tsv(segments, *fields)
|
289
|
+
fields = set_tsv_fields fields, segments
|
290
|
+
tsv = TSV.setup({}, :key_field => "ID", :fields => fields, :type => :double)
|
291
|
+
|
292
|
+
segments.each do |segment|
|
293
|
+
tsv[segment.id] = self.tsv_values_for_segment(segment, fields)
|
294
|
+
end
|
295
|
+
|
296
|
+
tsv
|
297
|
+
end
|
298
|
+
|
299
|
+
def self.load_tsv(tsv)
|
300
|
+
tsv.collect do |id, values|
|
301
|
+
Annotated.load_tsv_values(id, values, tsv.fields)
|
302
|
+
end
|
303
|
+
end
|
304
|
+
|
254
305
|
end
|
306
|
+
|