rbbt-text 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt/corpus/corpus.rb +15 -6
- data/lib/rbbt/corpus/document.rb +100 -127
- data/lib/rbbt/corpus/document_repo.rb +72 -51
- data/lib/rbbt/ner/NER.rb +4 -4
- data/lib/rbbt/ner/abner.rb +5 -4
- data/lib/rbbt/ner/banner.rb +3 -3
- data/lib/rbbt/ner/chemical_tagger.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +45 -10
- data/lib/rbbt/ner/oscar3.rb +3 -3
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +15 -13
- data/lib/rbbt/ner/regexpNER.rb +3 -2
- data/lib/rbbt/ner/rnorm.rb +2 -2
- data/lib/rbbt/ner/rnorm/cue_index.rb +2 -2
- data/lib/rbbt/ner/{annotations.rb → segment.rb} +161 -109
- data/lib/rbbt/ner/{annotations → segment}/named_entity.rb +3 -11
- data/lib/rbbt/ner/segment/relationship.rb +20 -0
- data/lib/rbbt/ner/segment/segmented.rb +13 -0
- data/lib/rbbt/ner/segment/token.rb +24 -0
- data/lib/rbbt/ner/{annotations → segment}/transformed.rb +10 -10
- data/lib/rbbt/ner/token_trieNER.rb +30 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
- data/lib/rbbt/nlp/nlp.rb +23 -37
- data/test/rbbt/corpus/test_document.rb +39 -37
- data/test/rbbt/ner/segment/test_named_entity.rb +29 -0
- data/test/rbbt/ner/segment/test_segmented.rb +23 -0
- data/test/rbbt/ner/{annotations → segment}/test_transformed.rb +6 -6
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +15 -1
- data/test/rbbt/ner/test_patterns.rb +11 -12
- data/test/rbbt/ner/test_regexpNER.rb +5 -4
- data/test/rbbt/ner/test_segment.rb +101 -0
- data/test/rbbt/ner/test_token_trieNER.rb +8 -9
- data/test/test_helper.rb +6 -6
- metadata +40 -22
- data/lib/rbbt/ner/annotations/annotated.rb +0 -15
- data/lib/rbbt/ner/annotations/relations.rb +0 -25
- data/lib/rbbt/ner/annotations/token.rb +0 -28
- data/test/rbbt/ner/annotations/test_named_entity.rb +0 -14
- data/test/rbbt/ner/test_annotations.rb +0 -70
data/lib/rbbt/ner/NER.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
require 'rbbt/ner/
|
2
|
-
require 'rbbt/ner/
|
3
|
-
require 'rbbt/ner/
|
1
|
+
require 'rbbt/ner/segment'
|
2
|
+
require 'rbbt/ner/segment/named_entity'
|
3
|
+
require 'rbbt/ner/segment/segmented'
|
4
4
|
|
5
5
|
class NER
|
6
6
|
def entities(text, protect = false, *args)
|
@@ -13,7 +13,7 @@ class NER
|
|
13
13
|
}
|
14
14
|
matches
|
15
15
|
end.flatten
|
16
|
-
when (
|
16
|
+
when (Segmented === text and protect)
|
17
17
|
entities(text.split_segments(true), protect, *args)
|
18
18
|
else
|
19
19
|
match(text, *args)
|
data/lib/rbbt/ner/abner.rb
CHANGED
@@ -1,13 +1,14 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rjb'
|
3
|
-
require 'rbbt/ner/
|
3
|
+
require 'rbbt/ner/segment'
|
4
|
+
require 'rbbt/resource'
|
4
5
|
require 'rbbt/ner/NER'
|
5
6
|
|
6
7
|
# Offers a Ruby interface to the Abner Named Entity Recognition Package
|
7
8
|
# in Java Abner[http://www.cs.wisc.edu/~bsettles/abner/].
|
8
9
|
class Abner < NER
|
9
10
|
|
10
|
-
Rbbt.software.opt.ABNER
|
11
|
+
Rbbt.claim Rbbt.software.opt.ABNER, :install, Rbbt.share.install.software.ABNER.find
|
11
12
|
|
12
13
|
@@JFile = Rjb::import('java.io.File')
|
13
14
|
@@Tagger = Rjb::import('abner.Tagger')
|
@@ -38,9 +39,9 @@ class Abner < NER
|
|
38
39
|
mention = mention.to_s;
|
39
40
|
offset = text.index(mention)
|
40
41
|
if offset.nil?
|
41
|
-
NamedEntity.
|
42
|
+
NamedEntity.setup(mention, nil, type.to_s)
|
42
43
|
else
|
43
|
-
NamedEntity.
|
44
|
+
NamedEntity.setup(mention, offset + global_offset, type.to_s)
|
44
45
|
text = text[offset + mention.length..-1]
|
45
46
|
global_offset += offset + mention.length
|
46
47
|
end
|
data/lib/rbbt/ner/banner.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rjb'
|
3
|
-
require 'rbbt/ner/
|
3
|
+
require 'rbbt/ner/segment'
|
4
4
|
require 'rbbt/ner/NER'
|
5
5
|
|
6
6
|
# Offers a Ruby interface to the Banner Named Entity Recognition Package
|
7
7
|
# in Java. Banner[http://banner.sourceforge.net/].
|
8
8
|
class Banner < NER
|
9
9
|
|
10
|
-
Rbbt.software.opt.BANNER
|
10
|
+
Rbbt.claim Rbbt.software.opt.BANNER, :install, Rbbt.share.install.software.BANNER.find
|
11
11
|
|
12
12
|
@@JFile = Rjb::import('java.io.File')
|
13
13
|
@@SimpleTokenizer = Rjb::import('banner.tokenization.SimpleTokenizer')
|
@@ -68,7 +68,7 @@ class Banner < NER
|
|
68
68
|
mention.sub!(/^\s*/,'')
|
69
69
|
mention.sub!(/\s*$/,'')
|
70
70
|
offset = text.index(mention)
|
71
|
-
NamedEntity.
|
71
|
+
NamedEntity.setup(mention, offset, 'GENE')
|
72
72
|
mention
|
73
73
|
}
|
74
74
|
res
|
@@ -1,11 +1,11 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rjb'
|
3
|
-
require 'rbbt/ner/
|
3
|
+
require 'rbbt/ner/segment'
|
4
4
|
require 'rbbt/ner/NER'
|
5
5
|
require 'rbbt/util/log'
|
6
6
|
|
7
7
|
class ChemicalTagger < NER
|
8
|
-
Rbbt.software.opt.ChemicalTagger
|
8
|
+
Rbbt.claim Rbbt.software.opt.ChemicalTagger, :install, Rbbt.share.install.software.ChemicalTagger.find
|
9
9
|
|
10
10
|
Rjb::load(nil, jvmargs = ['-Xms128m','-Xmx2048m'])
|
11
11
|
|
@@ -24,7 +24,7 @@ class ChemicalTagger < NER
|
|
24
24
|
|
25
25
|
matches.collect do |mention|
|
26
26
|
offset = text.index mention
|
27
|
-
NamedEntity.
|
27
|
+
NamedEntity.setup mention, offset, "Chemical Mention", nil, nil
|
28
28
|
end
|
29
29
|
end
|
30
30
|
|
@@ -1,7 +1,8 @@
|
|
1
|
-
require 'rbbt
|
2
|
-
require 'rbbt/util/
|
3
|
-
require 'rbbt/
|
4
|
-
require 'rbbt/ner/
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/util/misc'
|
3
|
+
require 'rbbt/tsv'
|
4
|
+
require 'rbbt/ner/segment'
|
5
|
+
require 'rbbt/ner/segment/token'
|
5
6
|
require 'rbbt/ner/NER'
|
6
7
|
require 'inline'
|
7
8
|
|
@@ -41,9 +42,28 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
|
|
41
42
|
end
|
42
43
|
end
|
43
44
|
|
44
|
-
def self.
|
45
|
+
def self.process_stream(stream)
|
45
46
|
index = {}
|
46
|
-
|
47
|
+
while line = stream.gets
|
48
|
+
names = line.split(/\t|\|/).select{|n| not n.empty?}.compact
|
49
|
+
code = names.shift
|
50
|
+
|
51
|
+
names.each do |name|
|
52
|
+
ngram = name[0..2].strip
|
53
|
+
index[ngram] ||= []
|
54
|
+
index[ngram] << [name, code]
|
55
|
+
end
|
56
|
+
end
|
57
|
+
index
|
58
|
+
|
59
|
+
end
|
60
|
+
|
61
|
+
def self.process_hash(hash)
|
62
|
+
index = {}
|
63
|
+
hash.monitor = true if hash.respond_to? :monitor
|
64
|
+
hash.unnamed = true if hash.respond_to? :unnamed
|
65
|
+
method = hash.respond_to?(:through)? :through : :each
|
66
|
+
hash.send(method) do |code, names|
|
47
67
|
names.each do |name|
|
48
68
|
ngram = name[0..2].strip
|
49
69
|
index[ngram] ||= []
|
@@ -94,15 +114,30 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
|
|
94
114
|
|
95
115
|
attr_accessor :index, :type
|
96
116
|
def initialize(file, type = nil)
|
97
|
-
tsv = TSV.new(file, :flat)
|
98
117
|
@type = type
|
99
|
-
|
100
|
-
|
118
|
+
case
|
119
|
+
when (TSV === file or Hash === file)
|
120
|
+
Log.debug("Ngram Prefix Dictionary. Loading of lexicon hash started.")
|
121
|
+
@index = NGramPrefixDictionary.process_hash(file)
|
122
|
+
when Path === file
|
123
|
+
Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
|
124
|
+
@index = NGramPrefixDictionary.process_stream(file.open)
|
125
|
+
when Misc.is_filename?(file)
|
126
|
+
Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
|
127
|
+
@index = NGramPrefixDictionary.process_stream(Open.open(file))
|
128
|
+
when StreamIO === file
|
129
|
+
Log.debug("Ngram Prefix Dictionary. Loading of lexicon stream started.")
|
130
|
+
@index = NGramPrefixDictionary.process_stream(file)
|
131
|
+
else
|
132
|
+
raise "Format of lexicon not understood: #{file.inspect}"
|
133
|
+
end
|
134
|
+
|
135
|
+
Log.debug("Ngram Prefix Dictionary. Loading done.")
|
101
136
|
end
|
102
137
|
|
103
138
|
def match(text)
|
104
139
|
NGramPrefixDictionary.match(index, text).collect{|name, code, offset|
|
105
|
-
NamedEntity.
|
140
|
+
NamedEntity.setup(name, offset, type, code)
|
106
141
|
}
|
107
142
|
end
|
108
143
|
end
|
data/lib/rbbt/ner/oscar3.rb
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rjb'
|
3
3
|
require 'libxml'
|
4
|
-
require 'rbbt/ner/
|
4
|
+
require 'rbbt/ner/segment'
|
5
5
|
require 'rbbt/ner/NER'
|
6
6
|
require 'rbbt/util/log'
|
7
7
|
|
8
8
|
class OSCAR3 < NER
|
9
|
-
Rbbt.software.opt.OSCAR3
|
9
|
+
Rbbt.claim Rbbt.software.opt.OSCAR3, :install, Rbbt.share.install.software.OSCAR3.find
|
10
10
|
|
11
11
|
@@TextToSciXML = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
|
12
12
|
@@ProcessingDocumentFactory = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
|
@@ -50,7 +50,7 @@ class OSCAR3 < NER
|
|
50
50
|
next unless type.nil? or type.include? mention_type
|
51
51
|
score = memm ? entities.get(key).to_string.to_f : nil
|
52
52
|
|
53
|
-
NamedEntity.
|
53
|
+
NamedEntity.setup mention, rstart.to_i + offset, mention_type, nil, score
|
54
54
|
|
55
55
|
mentions << mention unless mentions.collect{|m| m.to_s}.include? mention.to_s
|
56
56
|
end
|
data/lib/rbbt/ner/oscar4.rb
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rjb'
|
3
3
|
require 'libxml'
|
4
|
-
require 'rbbt/ner/
|
4
|
+
require 'rbbt/ner/segment'
|
5
5
|
require 'rbbt/ner/NER'
|
6
6
|
require 'rbbt/util/log'
|
7
7
|
|
8
8
|
class OSCAR4 < NER
|
9
|
-
Rbbt.software.opt.OSCAR4
|
9
|
+
Rbbt.claim Rbbt.software.opt.OSCAR4, :install, Rbbt.share.install.software.OSCAR4.find
|
10
10
|
|
11
11
|
Rjb::load(nil, jvmargs = ['-Xms128m','-Xmx2048m'])
|
12
12
|
@@OSCAR = Rjb::import('uk.ac.cam.ch.wwmm.oscar.Oscar')
|
@@ -26,7 +26,7 @@ class OSCAR4 < NER
|
|
26
26
|
mention = entity.getSurface
|
27
27
|
result << mention
|
28
28
|
|
29
|
-
NamedEntity.
|
29
|
+
NamedEntity.setup mention, entity.getStart, entity.getType, nil, entity.getNamedEntity.getConfidence
|
30
30
|
end
|
31
31
|
|
32
32
|
result
|
data/lib/rbbt/ner/patterns.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
require 'rbbt/ner/
|
2
|
-
require 'rbbt/ner/
|
3
|
-
require 'rbbt/ner/
|
4
|
-
require 'rbbt/ner/
|
1
|
+
require 'rbbt/ner/segment/named_entity'
|
2
|
+
require 'rbbt/ner/segment/segmented'
|
3
|
+
require 'rbbt/ner/segment/transformed'
|
4
|
+
require 'rbbt/ner/segment/relationship'
|
5
5
|
require 'rbbt/ner/regexpNER'
|
6
6
|
require 'rbbt/ner/token_trieNER'
|
7
7
|
require 'rbbt/nlp/nlp'
|
@@ -12,7 +12,9 @@ class PatternRelExt
|
|
12
12
|
patterns = Array === patterns ? patterns : [patterns]
|
13
13
|
type ||= "Simple Pattern"
|
14
14
|
regexpNER = RegExpNER.new type => patterns.collect{|p| /#{p}/}
|
15
|
-
|
15
|
+
segments = sentence.segments
|
16
|
+
segments = segments.values.flatten if Hash === segments
|
17
|
+
Transformed.with_transform(sentence, segments, Proc.new{|s| s.type.to_s.upcase}) do |sentence|
|
16
18
|
regexpNER.entities(sentence)
|
17
19
|
end
|
18
20
|
end
|
@@ -23,23 +25,23 @@ class PatternRelExt
|
|
23
25
|
when key =~ /(.*)\[entity:(.*)\]/
|
24
26
|
chunk_type, chunk_value = $1, $2
|
25
27
|
annotation_types = chunk_value.split(",")
|
26
|
-
Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
|
27
|
-
((Hash === chunk.
|
28
|
+
Proc.new{|chunk| (chunk_type == "all" or (Array === chunk.type ? chunk.type.include?(chunk_type) : chunk.type == chunk_type)) and
|
29
|
+
((Hash === chunk.segments ? chunk.segments.values.flatten : chunk.segments).flatten.select{|a| NamedEntity === a}.collect{|a| a.type.to_s}.flatten & annotation_types).any? }
|
28
30
|
|
29
31
|
when key =~ /(.*)\[code:(.*)\]/
|
30
32
|
chunk_type, chunk_value = $1, $2
|
31
33
|
annotation_codes = chunk_value.split(",")
|
32
|
-
Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
|
33
|
-
((Hash === chunk.
|
34
|
+
Proc.new{|chunk| (chunk_type == "all" or (Array === chunk.type ? chunk.type.include?(chunk_type) : chunk.type == chunk_type)) and
|
35
|
+
((Hash === chunk.segments ? chunk.segments.values.flatten : chunk.segments).select{|a| NamedEntity === a}.collect{|a| a.code}.flatten & annotation_codes).any? }
|
34
36
|
|
35
37
|
when key =~ /(.*)\[stem:(.*)\]/
|
36
38
|
chunk_type, chunk_value = $1, $2
|
37
|
-
Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
|
39
|
+
Proc.new{|chunk| (chunk_type == "all" or (Array === chunk.type ? chunk.type.include?(chunk_type) : chunk.type == chunk_type)) and
|
38
40
|
chunk.split(/\s+/).select{|w| w.stem == chunk_value.stem}.any?}
|
39
41
|
|
40
42
|
when key =~ /(.*)\[(.*)\]/
|
41
43
|
chunk_type, chunk_value = $1, $2
|
42
|
-
Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
|
44
|
+
Proc.new{|chunk| (chunk_type == "all" or (Array === chunk.type ? chunk.type.include?(chunk_type) : chunk.type == chunk_type)) and
|
43
45
|
chunk.parts.values.select{|a| a == chunk_value}.any?}
|
44
46
|
|
45
47
|
else
|
@@ -120,9 +122,9 @@ class PatternRelExt
|
|
120
122
|
sentence_chunks = NLP.gdep_chunk_sentences(sentences)
|
121
123
|
|
122
124
|
sentences.zip(sentence_chunks).collect do |sentence, chunks|
|
123
|
-
annotation_index = Segment.index(sentence.
|
125
|
+
annotation_index = Segment.index(sentence.segments)
|
124
126
|
chunks.each do |chunk|
|
125
|
-
|
127
|
+
Segmented.setup(chunk, annotation_index[chunk.range])
|
126
128
|
end
|
127
129
|
|
128
130
|
match_chunks(chunks)
|
data/lib/rbbt/ner/regexpNER.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require 'rbbt/ner/
|
1
|
+
require 'rbbt/ner/segment'
|
2
2
|
require 'rbbt/ner/NER'
|
3
3
|
require 'rbbt/util/simpleDSL'
|
4
4
|
|
@@ -23,7 +23,7 @@ class RegExpNER < NER
|
|
23
23
|
end
|
24
24
|
|
25
25
|
if match and not match.empty?
|
26
|
-
NamedEntity.
|
26
|
+
NamedEntity.setup(match, start + pre.length, type)
|
27
27
|
matches << match
|
28
28
|
end
|
29
29
|
|
@@ -86,6 +86,7 @@ class RegExpNER < NER
|
|
86
86
|
|
87
87
|
def match(text)
|
88
88
|
matches = RegExpNER.match_regexp_hash(text, @regexps)
|
89
|
+
matches
|
89
90
|
end
|
90
91
|
|
91
92
|
end
|
data/lib/rbbt/ner/rnorm.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'rbbt/ner/rnorm/cue_index'
|
2
2
|
require 'rbbt/ner/rnorm/tokens'
|
3
3
|
require 'rbbt/util/open'
|
4
|
-
require 'rbbt/
|
4
|
+
require 'rbbt/tsv'
|
5
5
|
require 'rbbt/sources/entrez'
|
6
6
|
require 'rbbt/bow/bow.rb'
|
7
7
|
|
@@ -89,7 +89,7 @@ class Normalizer
|
|
89
89
|
|
90
90
|
|
91
91
|
def initialize(lexicon, options = {})
|
92
|
-
@synonyms = TSV.
|
92
|
+
@synonyms = TSV.open(lexicon, :flat)
|
93
93
|
|
94
94
|
@index = CueIndex.new
|
95
95
|
@index.load(lexicon, options[:max_candidates])
|
@@ -1,4 +1,4 @@
|
|
1
|
-
require 'rbbt
|
1
|
+
require 'rbbt'
|
2
2
|
require 'rbbt/util/misc'
|
3
3
|
require 'rbbt/util/simpleDSL'
|
4
4
|
|
@@ -47,7 +47,7 @@ class CueIndex
|
|
47
47
|
|
48
48
|
def load(file, max_candidates = 50)
|
49
49
|
@indexes = Array.new(@rules.size){Hash.new}
|
50
|
-
data = TSV.
|
50
|
+
data = TSV.open(file, :flat)
|
51
51
|
data.each{|code, values|
|
52
52
|
values.each{|value|
|
53
53
|
cues(value).each_with_index{|cue_list,i|
|
@@ -1,63 +1,59 @@
|
|
1
|
+
require 'rbbt/annotations'
|
2
|
+
require 'rbbt/fix_width_table'
|
3
|
+
|
1
4
|
module Segment
|
2
|
-
|
3
|
-
|
4
|
-
def self.included(base)
|
5
|
-
if base.instance_methods.include? "segment_types"
|
6
|
-
class << base
|
7
|
-
self.module_eval do
|
8
|
-
define_method "extended" do |object|
|
9
|
-
object.segment_types ||= []
|
10
|
-
object.segment_types << self.to_s unless object.segment_types.include? self.to_s
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
5
|
+
extend Annotation
|
6
|
+
self.annotation :offset
|
16
7
|
|
17
|
-
|
18
|
-
string.extend Segment
|
19
|
-
string.offset = offset
|
20
|
-
string.docid = docid
|
21
|
-
string
|
22
|
-
end
|
8
|
+
#{{{ Ranges
|
23
9
|
|
24
|
-
def
|
25
|
-
|
26
|
-
|
10
|
+
def end
|
11
|
+
return nil if offset.nil?
|
12
|
+
offset + length - 1
|
27
13
|
end
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
info = {}
|
33
|
-
singleton_methods.select{|method| method[-1] == equal_ascii}.
|
34
|
-
collect{|m| m[(0..-2)]}.each{|m| info[m] = self.send(m) if self.respond_to?(m) and not SKIP.include? m.to_s}
|
35
|
-
info
|
36
|
-
info.delete_if{|k,v| v.nil?}
|
37
|
-
info
|
14
|
+
|
15
|
+
def range
|
16
|
+
raise "No offset specified" if offset.nil?
|
17
|
+
(offset..self.end)
|
38
18
|
end
|
39
19
|
|
40
|
-
def
|
41
|
-
|
42
|
-
|
43
|
-
|
20
|
+
def pull(offset)
|
21
|
+
if self.offset.nil? or offset.nil?
|
22
|
+
self.offset = nil
|
23
|
+
else
|
24
|
+
self.offset += offset
|
25
|
+
end
|
44
26
|
|
45
|
-
|
46
|
-
|
47
|
-
types.each do |type| string.extend Misc.string2const(type) end
|
27
|
+
self
|
28
|
+
end
|
48
29
|
|
49
|
-
|
50
|
-
|
51
|
-
|
30
|
+
def push(offset)
|
31
|
+
if self.offset.nil? or offset.nil?
|
32
|
+
self.offset = nil
|
33
|
+
else
|
34
|
+
self.offset -= offset
|
52
35
|
end
|
53
36
|
|
54
|
-
|
55
|
-
|
37
|
+
self
|
38
|
+
end
|
56
39
|
|
57
|
-
|
40
|
+
def make_relative(segments)
|
41
|
+
segments.collect{|s| s.push offset}
|
58
42
|
end
|
59
43
|
|
60
|
-
|
44
|
+
def range_in(container = nil)
|
45
|
+
raise "No offset specified" if offset.nil?
|
46
|
+
case
|
47
|
+
when (Segment === container and not container.offset.nil?)
|
48
|
+
((offset - container.offset)..(self.end - container.offset))
|
49
|
+
when Integer === container
|
50
|
+
((offset - container)..(self.end - container))
|
51
|
+
else
|
52
|
+
range
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
#{{{ Sorting
|
61
57
|
|
62
58
|
def self.sort(segments, inline = true)
|
63
59
|
if inline
|
@@ -102,13 +98,15 @@ module Segment
|
|
102
98
|
sorted
|
103
99
|
end
|
104
100
|
|
101
|
+
#{{{ Splitting
|
102
|
+
|
105
103
|
def self.split(text, segments, skip_segments = false)
|
106
104
|
sorted_segments = clean_sort segments
|
107
105
|
|
108
106
|
chunks = []
|
109
107
|
segment_end = 0
|
110
108
|
text_offset = 0
|
111
|
-
sorted_segments.
|
109
|
+
sorted_segments.each do |segment|
|
112
110
|
return chunks if text.nil? or text.empty?
|
113
111
|
next if segment.offset.nil?
|
114
112
|
offset = segment.offset - text_offset
|
@@ -119,7 +117,7 @@ module Segment
|
|
119
117
|
next
|
120
118
|
when offset > 0 # Save pre
|
121
119
|
chunk = text[0..offset - 1]
|
122
|
-
Segment.
|
120
|
+
Segment.setup(chunk, text_offset)
|
123
121
|
chunks << chunk
|
124
122
|
end
|
125
123
|
|
@@ -127,7 +125,7 @@ module Segment
|
|
127
125
|
|
128
126
|
if not skip_segments
|
129
127
|
chunk = text[offset..segment_end]
|
130
|
-
Segment.
|
128
|
+
Segment.setup(chunk, text_offset + offset)
|
131
129
|
chunks << chunk
|
132
130
|
end
|
133
131
|
|
@@ -138,72 +136,29 @@ module Segment
|
|
138
136
|
|
139
137
|
if not text.nil? and text.any?
|
140
138
|
chunk = text.dup
|
141
|
-
Segment.
|
139
|
+
Segment.setup(chunk, text_offset)
|
142
140
|
chunks << chunk
|
143
141
|
end
|
144
142
|
|
145
143
|
chunks
|
146
144
|
end
|
147
145
|
|
148
|
-
# {{{ Ranges and manipulation
|
149
146
|
|
150
|
-
|
151
|
-
if self.offset.nil? or offset.nil?
|
152
|
-
self.offset = nil
|
153
|
-
else
|
154
|
-
self.offset += offset
|
155
|
-
end
|
156
|
-
|
157
|
-
self
|
158
|
-
end
|
159
|
-
|
160
|
-
def push(offset)
|
161
|
-
if self.offset.nil? or offset.nil?
|
162
|
-
self.offset = nil
|
163
|
-
else
|
164
|
-
self.offset -= offset
|
165
|
-
end
|
166
|
-
|
167
|
-
self
|
168
|
-
end
|
169
|
-
|
170
|
-
def make_relative(segments)
|
171
|
-
segments.collect{|s| s.push offset}
|
172
|
-
end
|
173
|
-
|
174
|
-
def end
|
175
|
-
return nil if offset.nil?
|
176
|
-
offset + length - 1
|
177
|
-
end
|
178
|
-
|
179
|
-
def range
|
180
|
-
raise "No offset specified" if offset.nil?
|
181
|
-
(offset..self.end)
|
182
|
-
end
|
183
|
-
|
184
|
-
def range_in(container = nil)
|
185
|
-
raise "No offset specified" if offset.nil?
|
186
|
-
case
|
187
|
-
when (Segment === container and not container.offset.nil?)
|
188
|
-
((offset - container.offset)..(self.end - container.offset))
|
189
|
-
when Integer === container
|
190
|
-
((offset - container)..(self.end - container))
|
191
|
-
else
|
192
|
-
range
|
193
|
-
end
|
194
|
-
end
|
147
|
+
#{{{ Align
|
195
148
|
|
196
149
|
def self.align(text, parts)
|
197
150
|
pre_offset = 0
|
198
151
|
parts.each do |part|
|
199
152
|
offset = text.index part
|
200
153
|
next if offset.nil?
|
201
|
-
Segment.
|
154
|
+
Segment.setup(part, pre_offset + offset)
|
202
155
|
pre_offset += offset + part.length - 1
|
203
156
|
text = text[(offset + part.length - 1)..-1]
|
204
157
|
end
|
205
158
|
end
|
206
159
|
|
160
|
+
#{{{ Index
|
161
|
+
|
207
162
|
class Index
|
208
163
|
attr_accessor :index, :data
|
209
164
|
def initialize(index, data)
|
@@ -216,12 +171,11 @@ module Segment
|
|
216
171
|
end
|
217
172
|
end
|
218
173
|
|
219
|
-
def self.index(segments,
|
220
|
-
|
174
|
+
def self.index(segments, persist_file = :memory)
|
221
175
|
segments = segments.values.flatten if Hash === segments
|
222
176
|
|
223
177
|
annotation_index =
|
224
|
-
|
178
|
+
Persist.persist("Segment_index", :fwt, :persist => (! (persist_file.nil? or persist_file == :memory)), :file => persist_file) do
|
225
179
|
|
226
180
|
value_size = 0
|
227
181
|
index_data = segments.collect{|segment|
|
@@ -233,6 +187,7 @@ module Segment
|
|
233
187
|
|
234
188
|
fwt = FixWidthTable.get :memory, value_size, true
|
235
189
|
fwt.add_range index_data
|
190
|
+
|
236
191
|
fwt
|
237
192
|
end
|
238
193
|
|
@@ -241,14 +196,111 @@ module Segment
|
|
241
196
|
Index.new annotation_index, data
|
242
197
|
end
|
243
198
|
|
244
|
-
|
199
|
+
#{{{ Save and load
|
200
|
+
|
201
|
+
def self.tsv_values_for_segment(segment, fields)
|
202
|
+
info = segment.info
|
203
|
+
values = []
|
204
|
+
|
205
|
+
fields.each do |field|
|
206
|
+
values << case
|
207
|
+
when field == "JSON"
|
208
|
+
info.to_json
|
209
|
+
when field == "literal"
|
210
|
+
segment.gsub(/\n|\t/, ' ')
|
211
|
+
when field == "Start"
|
212
|
+
segment.offset
|
213
|
+
when field == "End"
|
214
|
+
segment.end
|
215
|
+
else
|
216
|
+
info.delete(field.to_sym)
|
217
|
+
end
|
218
|
+
end
|
219
|
+
|
220
|
+
values
|
221
|
+
end
|
222
|
+
|
223
|
+
def self.load_tsv_values(text, values, fields)
|
224
|
+
info = {}
|
225
|
+
literal_pos = fields.index "literal"
|
226
|
+
|
227
|
+
object = if literal_pos.nil?
|
228
|
+
""
|
229
|
+
else
|
230
|
+
v = values[literal_pos]
|
231
|
+
v = v.first if Array === v
|
232
|
+
v
|
233
|
+
end
|
234
|
+
|
235
|
+
fields.each_with_index do |field, i|
|
236
|
+
if field == "JSON"
|
237
|
+
JSON.parse(values[i]).each do |key, value|
|
238
|
+
info[key.to_sym] = value
|
239
|
+
end
|
240
|
+
else
|
241
|
+
info[field.to_sym] = values[i]
|
242
|
+
end
|
243
|
+
end
|
244
|
+
|
245
|
+
start = info.delete(:Start)
|
246
|
+
if not (start.nil? or ((Array === start or String === start) and start.empty?))
|
247
|
+
if Array === start
|
248
|
+
start = start.first
|
249
|
+
end
|
250
|
+
start = start.to_i
|
251
|
+
info[:offset] = start
|
252
|
+
|
253
|
+
eend = info.delete(:End)
|
254
|
+
if Array === eend
|
255
|
+
eend = eend.first
|
256
|
+
end
|
257
|
+
eend = eend.to_i
|
258
|
+
|
259
|
+
if object.empty?
|
260
|
+
object.replace text[start..eend]
|
261
|
+
end
|
262
|
+
end
|
263
|
+
|
264
|
+
info[:annotation_types] = [Segment] unless info.include? :annotation_types
|
245
265
|
|
246
|
-
|
247
|
-
include Segment
|
248
|
-
attr_accessor :comment
|
249
|
-
def self.annotate(text, comment = nil)
|
250
|
-
text.extend Comment
|
251
|
-
text.comment = (comment.nil? ? text : comment)
|
252
|
-
text
|
266
|
+
Annotated.load(object, info)
|
253
267
|
end
|
268
|
+
|
269
|
+
def self.set_tsv_fields(fields, segments)
|
270
|
+
tsv_fields = []
|
271
|
+
add_types = ! (fields.delete(:no_types) || fields.delete("no_types") || fields.include?(:JSON) || fields.include?("JSON"))
|
272
|
+
literal = (fields.delete(:literal) || fields.delete("literal"))
|
273
|
+
tsv_fields << "Start" << "End"
|
274
|
+
tsv_fields << :annotation_types if add_types
|
275
|
+
tsv_fields << :literal if literal
|
276
|
+
|
277
|
+
if fields.any? and not (fields == [:all] or fields == ["all"])
|
278
|
+
tsv_fields.concat fields
|
279
|
+
else
|
280
|
+
tsv_fields.concat segments.first.annotations if segments.any?
|
281
|
+
end
|
282
|
+
tsv_fields
|
283
|
+
tsv_fields.collect!{|f| f.to_s}
|
284
|
+
tsv_fields.delete "offset"
|
285
|
+
tsv_fields
|
286
|
+
end
|
287
|
+
|
288
|
+
def self.tsv(segments, *fields)
|
289
|
+
fields = set_tsv_fields fields, segments
|
290
|
+
tsv = TSV.setup({}, :key_field => "ID", :fields => fields, :type => :double)
|
291
|
+
|
292
|
+
segments.each do |segment|
|
293
|
+
tsv[segment.id] = self.tsv_values_for_segment(segment, fields)
|
294
|
+
end
|
295
|
+
|
296
|
+
tsv
|
297
|
+
end
|
298
|
+
|
299
|
+
def self.load_tsv(tsv)
|
300
|
+
tsv.collect do |id, values|
|
301
|
+
Annotated.load_tsv_values(id, values, tsv.fields)
|
302
|
+
end
|
303
|
+
end
|
304
|
+
|
254
305
|
end
|
306
|
+
|