rbbt-text 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. data/lib/rbbt/corpus/corpus.rb +15 -6
  2. data/lib/rbbt/corpus/document.rb +100 -127
  3. data/lib/rbbt/corpus/document_repo.rb +72 -51
  4. data/lib/rbbt/ner/NER.rb +4 -4
  5. data/lib/rbbt/ner/abner.rb +5 -4
  6. data/lib/rbbt/ner/banner.rb +3 -3
  7. data/lib/rbbt/ner/chemical_tagger.rb +3 -3
  8. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +45 -10
  9. data/lib/rbbt/ner/oscar3.rb +3 -3
  10. data/lib/rbbt/ner/oscar4.rb +3 -3
  11. data/lib/rbbt/ner/patterns.rb +15 -13
  12. data/lib/rbbt/ner/regexpNER.rb +3 -2
  13. data/lib/rbbt/ner/rnorm.rb +2 -2
  14. data/lib/rbbt/ner/rnorm/cue_index.rb +2 -2
  15. data/lib/rbbt/ner/{annotations.rb → segment.rb} +161 -109
  16. data/lib/rbbt/ner/{annotations → segment}/named_entity.rb +3 -11
  17. data/lib/rbbt/ner/segment/relationship.rb +20 -0
  18. data/lib/rbbt/ner/segment/segmented.rb +13 -0
  19. data/lib/rbbt/ner/segment/token.rb +24 -0
  20. data/lib/rbbt/ner/{annotations → segment}/transformed.rb +10 -10
  21. data/lib/rbbt/ner/token_trieNER.rb +30 -22
  22. data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
  23. data/lib/rbbt/nlp/nlp.rb +23 -37
  24. data/test/rbbt/corpus/test_document.rb +39 -37
  25. data/test/rbbt/ner/segment/test_named_entity.rb +29 -0
  26. data/test/rbbt/ner/segment/test_segmented.rb +23 -0
  27. data/test/rbbt/ner/{annotations → segment}/test_transformed.rb +6 -6
  28. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +15 -1
  29. data/test/rbbt/ner/test_patterns.rb +11 -12
  30. data/test/rbbt/ner/test_regexpNER.rb +5 -4
  31. data/test/rbbt/ner/test_segment.rb +101 -0
  32. data/test/rbbt/ner/test_token_trieNER.rb +8 -9
  33. data/test/test_helper.rb +6 -6
  34. metadata +40 -22
  35. data/lib/rbbt/ner/annotations/annotated.rb +0 -15
  36. data/lib/rbbt/ner/annotations/relations.rb +0 -25
  37. data/lib/rbbt/ner/annotations/token.rb +0 -28
  38. data/test/rbbt/ner/annotations/test_named_entity.rb +0 -14
  39. data/test/rbbt/ner/test_annotations.rb +0 -70
data/lib/rbbt/ner/NER.rb CHANGED
@@ -1,6 +1,6 @@
1
- require 'rbbt/ner/annotations'
2
- require 'rbbt/ner/annotations/named_entity'
3
- require 'rbbt/ner/annotations/annotated'
1
+ require 'rbbt/ner/segment'
2
+ require 'rbbt/ner/segment/named_entity'
3
+ require 'rbbt/ner/segment/segmented'
4
4
 
5
5
  class NER
6
6
  def entities(text, protect = false, *args)
@@ -13,7 +13,7 @@ class NER
13
13
  }
14
14
  matches
15
15
  end.flatten
16
- when (Annotated === text and protect)
16
+ when (Segmented === text and protect)
17
17
  entities(text.split_segments(true), protect, *args)
18
18
  else
19
19
  match(text, *args)
@@ -1,13 +1,14 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/ner/annotations'
3
+ require 'rbbt/ner/segment'
4
+ require 'rbbt/resource'
4
5
  require 'rbbt/ner/NER'
5
6
 
6
7
  # Offers a Ruby interface to the Abner Named Entity Recognition Package
7
8
  # in Java Abner[http://www.cs.wisc.edu/~bsettles/abner/].
8
9
  class Abner < NER
9
10
 
10
- Rbbt.software.opt.ABNER.define_as_install Rbbt.share.install.software.ABNER.find
11
+ Rbbt.claim Rbbt.software.opt.ABNER, :install, Rbbt.share.install.software.ABNER.find
11
12
 
12
13
  @@JFile = Rjb::import('java.io.File')
13
14
  @@Tagger = Rjb::import('abner.Tagger')
@@ -38,9 +39,9 @@ class Abner < NER
38
39
  mention = mention.to_s;
39
40
  offset = text.index(mention)
40
41
  if offset.nil?
41
- NamedEntity.annotate(mention, nil, type.to_s)
42
+ NamedEntity.setup(mention, nil, type.to_s)
42
43
  else
43
- NamedEntity.annotate(mention, offset + global_offset, type.to_s)
44
+ NamedEntity.setup(mention, offset + global_offset, type.to_s)
44
45
  text = text[offset + mention.length..-1]
45
46
  global_offset += offset + mention.length
46
47
  end
@@ -1,13 +1,13 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/ner/annotations'
3
+ require 'rbbt/ner/segment'
4
4
  require 'rbbt/ner/NER'
5
5
 
6
6
  # Offers a Ruby interface to the Banner Named Entity Recognition Package
7
7
  # in Java. Banner[http://banner.sourceforge.net/].
8
8
  class Banner < NER
9
9
 
10
- Rbbt.software.opt.BANNER.define_as_install Rbbt.share.install.software.BANNER.find
10
+ Rbbt.claim Rbbt.software.opt.BANNER, :install, Rbbt.share.install.software.BANNER.find
11
11
 
12
12
  @@JFile = Rjb::import('java.io.File')
13
13
  @@SimpleTokenizer = Rjb::import('banner.tokenization.SimpleTokenizer')
@@ -68,7 +68,7 @@ class Banner < NER
68
68
  mention.sub!(/^\s*/,'')
69
69
  mention.sub!(/\s*$/,'')
70
70
  offset = text.index(mention)
71
- NamedEntity.annotate(mention, offset, 'GENE')
71
+ NamedEntity.setup(mention, offset, 'GENE')
72
72
  mention
73
73
  }
74
74
  res
@@ -1,11 +1,11 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/ner/annotations'
3
+ require 'rbbt/ner/segment'
4
4
  require 'rbbt/ner/NER'
5
5
  require 'rbbt/util/log'
6
6
 
7
7
  class ChemicalTagger < NER
8
- Rbbt.software.opt.ChemicalTagger.define_as_install Rbbt.share.install.software.ChemicalTagger.find
8
+ Rbbt.claim Rbbt.software.opt.ChemicalTagger, :install, Rbbt.share.install.software.ChemicalTagger.find
9
9
 
10
10
  Rjb::load(nil, jvmargs = ['-Xms128m','-Xmx2048m'])
11
11
 
@@ -24,7 +24,7 @@ class ChemicalTagger < NER
24
24
 
25
25
  matches.collect do |mention|
26
26
  offset = text.index mention
27
- NamedEntity.annotate mention, offset, "Chemical Mention", nil, nil
27
+ NamedEntity.setup mention, offset, "Chemical Mention", nil, nil
28
28
  end
29
29
  end
30
30
 
@@ -1,7 +1,8 @@
1
- require 'rbbt-util'
2
- require 'rbbt/util/tsv'
3
- require 'rbbt/ner/annotations'
4
- require 'rbbt/ner/annotations/token'
1
+ require 'rbbt'
2
+ require 'rbbt/util/misc'
3
+ require 'rbbt/tsv'
4
+ require 'rbbt/ner/segment'
5
+ require 'rbbt/ner/segment/token'
5
6
  require 'rbbt/ner/NER'
6
7
  require 'inline'
7
8
 
@@ -41,9 +42,28 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
41
42
  end
42
43
  end
43
44
 
44
- def self.process(hash)
45
+ def self.process_stream(stream)
45
46
  index = {}
46
- hash.each do |code, names|
47
+ while line = stream.gets
48
+ names = line.split(/\t|\|/).select{|n| not n.empty?}.compact
49
+ code = names.shift
50
+
51
+ names.each do |name|
52
+ ngram = name[0..2].strip
53
+ index[ngram] ||= []
54
+ index[ngram] << [name, code]
55
+ end
56
+ end
57
+ index
58
+
59
+ end
60
+
61
+ def self.process_hash(hash)
62
+ index = {}
63
+ hash.monitor = true if hash.respond_to? :monitor
64
+ hash.unnamed = true if hash.respond_to? :unnamed
65
+ method = hash.respond_to?(:through)? :through : :each
66
+ hash.send(method) do |code, names|
47
67
  names.each do |name|
48
68
  ngram = name[0..2].strip
49
69
  index[ngram] ||= []
@@ -94,15 +114,30 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
94
114
 
95
115
  attr_accessor :index, :type
96
116
  def initialize(file, type = nil)
97
- tsv = TSV.new(file, :flat)
98
117
  @type = type
99
- tsv.unnamed = true
100
- @index = NGramPrefixDictionary.process(tsv)
118
+ case
119
+ when (TSV === file or Hash === file)
120
+ Log.debug("Ngram Prefix Dictionary. Loading of lexicon hash started.")
121
+ @index = NGramPrefixDictionary.process_hash(file)
122
+ when Path === file
123
+ Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
124
+ @index = NGramPrefixDictionary.process_stream(file.open)
125
+ when Misc.is_filename?(file)
126
+ Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
127
+ @index = NGramPrefixDictionary.process_stream(Open.open(file))
128
+ when StreamIO === file
129
+ Log.debug("Ngram Prefix Dictionary. Loading of lexicon stream started.")
130
+ @index = NGramPrefixDictionary.process_stream(file)
131
+ else
132
+ raise "Format of lexicon not understood: #{file.inspect}"
133
+ end
134
+
135
+ Log.debug("Ngram Prefix Dictionary. Loading done.")
101
136
  end
102
137
 
103
138
  def match(text)
104
139
  NGramPrefixDictionary.match(index, text).collect{|name, code, offset|
105
- NamedEntity.annotate(name, offset, type, code)
140
+ NamedEntity.setup(name, offset, type, code)
106
141
  }
107
142
  end
108
143
  end
@@ -1,12 +1,12 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'libxml'
4
- require 'rbbt/ner/annotations'
4
+ require 'rbbt/ner/segment'
5
5
  require 'rbbt/ner/NER'
6
6
  require 'rbbt/util/log'
7
7
 
8
8
  class OSCAR3 < NER
9
- Rbbt.software.opt.OSCAR3.define_as_install Rbbt.share.install.software.OSCAR3.find
9
+ Rbbt.claim Rbbt.software.opt.OSCAR3, :install, Rbbt.share.install.software.OSCAR3.find
10
10
 
11
11
  @@TextToSciXML = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
12
12
  @@ProcessingDocumentFactory = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
@@ -50,7 +50,7 @@ class OSCAR3 < NER
50
50
  next unless type.nil? or type.include? mention_type
51
51
  score = memm ? entities.get(key).to_string.to_f : nil
52
52
 
53
- NamedEntity.annotate mention, rstart.to_i + offset, mention_type, nil, score
53
+ NamedEntity.setup mention, rstart.to_i + offset, mention_type, nil, score
54
54
 
55
55
  mentions << mention unless mentions.collect{|m| m.to_s}.include? mention.to_s
56
56
  end
@@ -1,12 +1,12 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'libxml'
4
- require 'rbbt/ner/annotations'
4
+ require 'rbbt/ner/segment'
5
5
  require 'rbbt/ner/NER'
6
6
  require 'rbbt/util/log'
7
7
 
8
8
  class OSCAR4 < NER
9
- Rbbt.software.opt.OSCAR4.define_as_install Rbbt.share.install.software.OSCAR4.find
9
+ Rbbt.claim Rbbt.software.opt.OSCAR4, :install, Rbbt.share.install.software.OSCAR4.find
10
10
 
11
11
  Rjb::load(nil, jvmargs = ['-Xms128m','-Xmx2048m'])
12
12
  @@OSCAR = Rjb::import('uk.ac.cam.ch.wwmm.oscar.Oscar')
@@ -26,7 +26,7 @@ class OSCAR4 < NER
26
26
  mention = entity.getSurface
27
27
  result << mention
28
28
 
29
- NamedEntity.annotate mention, entity.getStart, entity.getType, nil, entity.getNamedEntity.getConfidence
29
+ NamedEntity.setup mention, entity.getStart, entity.getType, nil, entity.getNamedEntity.getConfidence
30
30
  end
31
31
 
32
32
  result
@@ -1,7 +1,7 @@
1
- require 'rbbt/ner/annotations/named_entity'
2
- require 'rbbt/ner/annotations/annotated'
3
- require 'rbbt/ner/annotations/transformed'
4
- require 'rbbt/ner/annotations/relations'
1
+ require 'rbbt/ner/segment/named_entity'
2
+ require 'rbbt/ner/segment/segmented'
3
+ require 'rbbt/ner/segment/transformed'
4
+ require 'rbbt/ner/segment/relationship'
5
5
  require 'rbbt/ner/regexpNER'
6
6
  require 'rbbt/ner/token_trieNER'
7
7
  require 'rbbt/nlp/nlp'
@@ -12,7 +12,9 @@ class PatternRelExt
12
12
  patterns = Array === patterns ? patterns : [patterns]
13
13
  type ||= "Simple Pattern"
14
14
  regexpNER = RegExpNER.new type => patterns.collect{|p| /#{p}/}
15
- Transformed.with_transform(sentence, sentence.annotations, Proc.new{|s| s.type.to_s.upcase}) do |sentence|
15
+ segments = sentence.segments
16
+ segments = segments.values.flatten if Hash === segments
17
+ Transformed.with_transform(sentence, segments, Proc.new{|s| s.type.to_s.upcase}) do |sentence|
16
18
  regexpNER.entities(sentence)
17
19
  end
18
20
  end
@@ -23,23 +25,23 @@ class PatternRelExt
23
25
  when key =~ /(.*)\[entity:(.*)\]/
24
26
  chunk_type, chunk_value = $1, $2
25
27
  annotation_types = chunk_value.split(",")
26
- Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
27
- ((Hash === chunk.annotations ? chunk.annotations.values.flatten : chunk.annotations).flatten.select{|a| NamedEntity === a}.collect{|a| a.type.to_s}.flatten & annotation_types).any? }
28
+ Proc.new{|chunk| (chunk_type == "all" or (Array === chunk.type ? chunk.type.include?(chunk_type) : chunk.type == chunk_type)) and
29
+ ((Hash === chunk.segments ? chunk.segments.values.flatten : chunk.segments).flatten.select{|a| NamedEntity === a}.collect{|a| a.type.to_s}.flatten & annotation_types).any? }
28
30
 
29
31
  when key =~ /(.*)\[code:(.*)\]/
30
32
  chunk_type, chunk_value = $1, $2
31
33
  annotation_codes = chunk_value.split(",")
32
- Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
33
- ((Hash === chunk.annotations ? chunk.annotations.values.flatten : chunk.annotations).select{|a| NamedEntity === a}.collect{|a| a.code}.flatten & annotation_codes).any? }
34
+ Proc.new{|chunk| (chunk_type == "all" or (Array === chunk.type ? chunk.type.include?(chunk_type) : chunk.type == chunk_type)) and
35
+ ((Hash === chunk.segments ? chunk.segments.values.flatten : chunk.segments).select{|a| NamedEntity === a}.collect{|a| a.code}.flatten & annotation_codes).any? }
34
36
 
35
37
  when key =~ /(.*)\[stem:(.*)\]/
36
38
  chunk_type, chunk_value = $1, $2
37
- Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
39
+ Proc.new{|chunk| (chunk_type == "all" or (Array === chunk.type ? chunk.type.include?(chunk_type) : chunk.type == chunk_type)) and
38
40
  chunk.split(/\s+/).select{|w| w.stem == chunk_value.stem}.any?}
39
41
 
40
42
  when key =~ /(.*)\[(.*)\]/
41
43
  chunk_type, chunk_value = $1, $2
42
- Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
44
+ Proc.new{|chunk| (chunk_type == "all" or (Array === chunk.type ? chunk.type.include?(chunk_type) : chunk.type == chunk_type)) and
43
45
  chunk.parts.values.select{|a| a == chunk_value}.any?}
44
46
 
45
47
  else
@@ -120,9 +122,9 @@ class PatternRelExt
120
122
  sentence_chunks = NLP.gdep_chunk_sentences(sentences)
121
123
 
122
124
  sentences.zip(sentence_chunks).collect do |sentence, chunks|
123
- annotation_index = Segment.index(sentence.annotations)
125
+ annotation_index = Segment.index(sentence.segments)
124
126
  chunks.each do |chunk|
125
- Annotated.annotate(chunk, annotation_index[chunk.range])
127
+ Segmented.setup(chunk, annotation_index[chunk.range])
126
128
  end
127
129
 
128
130
  match_chunks(chunks)
@@ -1,4 +1,4 @@
1
- require 'rbbt/ner/annotations'
1
+ require 'rbbt/ner/segment'
2
2
  require 'rbbt/ner/NER'
3
3
  require 'rbbt/util/simpleDSL'
4
4
 
@@ -23,7 +23,7 @@ class RegExpNER < NER
23
23
  end
24
24
 
25
25
  if match and not match.empty?
26
- NamedEntity.annotate(match, start + pre.length, type)
26
+ NamedEntity.setup(match, start + pre.length, type)
27
27
  matches << match
28
28
  end
29
29
 
@@ -86,6 +86,7 @@ class RegExpNER < NER
86
86
 
87
87
  def match(text)
88
88
  matches = RegExpNER.match_regexp_hash(text, @regexps)
89
+ matches
89
90
  end
90
91
 
91
92
  end
@@ -1,7 +1,7 @@
1
1
  require 'rbbt/ner/rnorm/cue_index'
2
2
  require 'rbbt/ner/rnorm/tokens'
3
3
  require 'rbbt/util/open'
4
- require 'rbbt/util/tsv'
4
+ require 'rbbt/tsv'
5
5
  require 'rbbt/sources/entrez'
6
6
  require 'rbbt/bow/bow.rb'
7
7
 
@@ -89,7 +89,7 @@ class Normalizer
89
89
 
90
90
 
91
91
  def initialize(lexicon, options = {})
92
- @synonyms = TSV.new(lexicon, :flat)
92
+ @synonyms = TSV.open(lexicon, :flat)
93
93
 
94
94
  @index = CueIndex.new
95
95
  @index.load(lexicon, options[:max_candidates])
@@ -1,4 +1,4 @@
1
- require 'rbbt-util'
1
+ require 'rbbt'
2
2
  require 'rbbt/util/misc'
3
3
  require 'rbbt/util/simpleDSL'
4
4
 
@@ -47,7 +47,7 @@ class CueIndex
47
47
 
48
48
  def load(file, max_candidates = 50)
49
49
  @indexes = Array.new(@rules.size){Hash.new}
50
- data = TSV.new(file, :flat)
50
+ data = TSV.open(file, :flat)
51
51
  data.each{|code, values|
52
52
  values.each{|value|
53
53
  cues(value).each_with_index{|cue_list,i|
@@ -1,63 +1,59 @@
1
+ require 'rbbt/annotations'
2
+ require 'rbbt/fix_width_table'
3
+
1
4
  module Segment
2
- attr_accessor :offset, :docid
3
-
4
- def self.included(base)
5
- if base.instance_methods.include? "segment_types"
6
- class << base
7
- self.module_eval do
8
- define_method "extended" do |object|
9
- object.segment_types ||= []
10
- object.segment_types << self.to_s unless object.segment_types.include? self.to_s
11
- end
12
- end
13
- end
14
- end
15
- end
5
+ extend Annotation
6
+ self.annotation :offset
16
7
 
17
- def self.annotate(string, offset = nil, docid = nil)
18
- string.extend Segment
19
- string.offset = offset
20
- string.docid = docid
21
- string
22
- end
8
+ #{{{ Ranges
23
9
 
24
- def id
25
- new = info.dup
26
- Digest::MD5.hexdigest(Misc.hash2string(new) << self << (offset || 0).to_s)
10
+ def end
11
+ return nil if offset.nil?
12
+ offset + length - 1
27
13
  end
28
-
29
- SKIP = %w(docid offset)
30
- def info
31
- equal_ascii = "="[0]
32
- info = {}
33
- singleton_methods.select{|method| method[-1] == equal_ascii}.
34
- collect{|m| m[(0..-2)]}.each{|m| info[m] = self.send(m) if self.respond_to?(m) and not SKIP.include? m.to_s}
35
- info
36
- info.delete_if{|k,v| v.nil?}
37
- info
14
+
15
+ def range
16
+ raise "No offset specified" if offset.nil?
17
+ (offset..self.end)
38
18
  end
39
19
 
40
- def self.load(text, start, eend, info, docid = nil)
41
- string = text[start.to_i..eend.to_i] if start and eend
42
- string ||= info[:literal]
43
- string.extend Segment
20
+ def pull(offset)
21
+ if self.offset.nil? or offset.nil?
22
+ self.offset = nil
23
+ else
24
+ self.offset += offset
25
+ end
44
26
 
45
- # add types
46
- types = info.delete("segment_types")|| info.delete(:segment_types) || []
47
- types.each do |type| string.extend Misc.string2const(type) end
27
+ self
28
+ end
48
29
 
49
- # set info data
50
- info.each do |key,value|
51
- string.send key + '=', value if string.respond_to? key.to_sym
30
+ def push(offset)
31
+ if self.offset.nil? or offset.nil?
32
+ self.offset = nil
33
+ else
34
+ self.offset -= offset
52
35
  end
53
36
 
54
- string.docid = docid
55
- string.offset = start.to_i
37
+ self
38
+ end
56
39
 
57
- string
40
+ def make_relative(segments)
41
+ segments.collect{|s| s.push offset}
58
42
  end
59
43
 
60
- # {{{ Sorting and splitting
44
+ def range_in(container = nil)
45
+ raise "No offset specified" if offset.nil?
46
+ case
47
+ when (Segment === container and not container.offset.nil?)
48
+ ((offset - container.offset)..(self.end - container.offset))
49
+ when Integer === container
50
+ ((offset - container)..(self.end - container))
51
+ else
52
+ range
53
+ end
54
+ end
55
+
56
+ #{{{ Sorting
61
57
 
62
58
  def self.sort(segments, inline = true)
63
59
  if inline
@@ -102,13 +98,15 @@ module Segment
102
98
  sorted
103
99
  end
104
100
 
101
+ #{{{ Splitting
102
+
105
103
  def self.split(text, segments, skip_segments = false)
106
104
  sorted_segments = clean_sort segments
107
105
 
108
106
  chunks = []
109
107
  segment_end = 0
110
108
  text_offset = 0
111
- sorted_segments.reverse.each do |segment|
109
+ sorted_segments.each do |segment|
112
110
  return chunks if text.nil? or text.empty?
113
111
  next if segment.offset.nil?
114
112
  offset = segment.offset - text_offset
@@ -119,7 +117,7 @@ module Segment
119
117
  next
120
118
  when offset > 0 # Save pre
121
119
  chunk = text[0..offset - 1]
122
- Segment.annotate(chunk, text_offset)
120
+ Segment.setup(chunk, text_offset)
123
121
  chunks << chunk
124
122
  end
125
123
 
@@ -127,7 +125,7 @@ module Segment
127
125
 
128
126
  if not skip_segments
129
127
  chunk = text[offset..segment_end]
130
- Segment.annotate(chunk, text_offset + offset)
128
+ Segment.setup(chunk, text_offset + offset)
131
129
  chunks << chunk
132
130
  end
133
131
 
@@ -138,72 +136,29 @@ module Segment
138
136
 
139
137
  if not text.nil? and text.any?
140
138
  chunk = text.dup
141
- Segment.annotate(chunk, text_offset)
139
+ Segment.setup(chunk, text_offset)
142
140
  chunks << chunk
143
141
  end
144
142
 
145
143
  chunks
146
144
  end
147
145
 
148
- # {{{ Ranges and manipulation
149
146
 
150
- def pull(offset)
151
- if self.offset.nil? or offset.nil?
152
- self.offset = nil
153
- else
154
- self.offset += offset
155
- end
156
-
157
- self
158
- end
159
-
160
- def push(offset)
161
- if self.offset.nil? or offset.nil?
162
- self.offset = nil
163
- else
164
- self.offset -= offset
165
- end
166
-
167
- self
168
- end
169
-
170
- def make_relative(segments)
171
- segments.collect{|s| s.push offset}
172
- end
173
-
174
- def end
175
- return nil if offset.nil?
176
- offset + length - 1
177
- end
178
-
179
- def range
180
- raise "No offset specified" if offset.nil?
181
- (offset..self.end)
182
- end
183
-
184
- def range_in(container = nil)
185
- raise "No offset specified" if offset.nil?
186
- case
187
- when (Segment === container and not container.offset.nil?)
188
- ((offset - container.offset)..(self.end - container.offset))
189
- when Integer === container
190
- ((offset - container)..(self.end - container))
191
- else
192
- range
193
- end
194
- end
147
+ #{{{ Align
195
148
 
196
149
  def self.align(text, parts)
197
150
  pre_offset = 0
198
151
  parts.each do |part|
199
152
  offset = text.index part
200
153
  next if offset.nil?
201
- Segment.annotate(part, pre_offset + offset)
154
+ Segment.setup(part, pre_offset + offset)
202
155
  pre_offset += offset + part.length - 1
203
156
  text = text[(offset + part.length - 1)..-1]
204
157
  end
205
158
  end
206
159
 
160
+ #{{{ Index
161
+
207
162
  class Index
208
163
  attr_accessor :index, :data
209
164
  def initialize(index, data)
@@ -216,12 +171,11 @@ module Segment
216
171
  end
217
172
  end
218
173
 
219
- def self.index(segments, persistence_file = :memory)
220
-
174
+ def self.index(segments, persist_file = :memory)
221
175
  segments = segments.values.flatten if Hash === segments
222
176
 
223
177
  annotation_index =
224
- Persistence.persist("Index", :Index, :fwt, :persistence => (! (persistence_file.nil? or persistence_file == :memory)), :persistence_file => persistence_file, :range => true) do
178
+ Persist.persist("Segment_index", :fwt, :persist => (! (persist_file.nil? or persist_file == :memory)), :file => persist_file) do
225
179
 
226
180
  value_size = 0
227
181
  index_data = segments.collect{|segment|
@@ -233,6 +187,7 @@ module Segment
233
187
 
234
188
  fwt = FixWidthTable.get :memory, value_size, true
235
189
  fwt.add_range index_data
190
+
236
191
  fwt
237
192
  end
238
193
 
@@ -241,14 +196,111 @@ module Segment
241
196
  Index.new annotation_index, data
242
197
  end
243
198
 
244
- end
199
+ #{{{ Save and load
200
+
201
+ def self.tsv_values_for_segment(segment, fields)
202
+ info = segment.info
203
+ values = []
204
+
205
+ fields.each do |field|
206
+ values << case
207
+ when field == "JSON"
208
+ info.to_json
209
+ when field == "literal"
210
+ segment.gsub(/\n|\t/, ' ')
211
+ when field == "Start"
212
+ segment.offset
213
+ when field == "End"
214
+ segment.end
215
+ else
216
+ info.delete(field.to_sym)
217
+ end
218
+ end
219
+
220
+ values
221
+ end
222
+
223
+ def self.load_tsv_values(text, values, fields)
224
+ info = {}
225
+ literal_pos = fields.index "literal"
226
+
227
+ object = if literal_pos.nil?
228
+ ""
229
+ else
230
+ v = values[literal_pos]
231
+ v = v.first if Array === v
232
+ v
233
+ end
234
+
235
+ fields.each_with_index do |field, i|
236
+ if field == "JSON"
237
+ JSON.parse(values[i]).each do |key, value|
238
+ info[key.to_sym] = value
239
+ end
240
+ else
241
+ info[field.to_sym] = values[i]
242
+ end
243
+ end
244
+
245
+ start = info.delete(:Start)
246
+ if not (start.nil? or ((Array === start or String === start) and start.empty?))
247
+ if Array === start
248
+ start = start.first
249
+ end
250
+ start = start.to_i
251
+ info[:offset] = start
252
+
253
+ eend = info.delete(:End)
254
+ if Array === eend
255
+ eend = eend.first
256
+ end
257
+ eend = eend.to_i
258
+
259
+ if object.empty?
260
+ object.replace text[start..eend]
261
+ end
262
+ end
263
+
264
+ info[:annotation_types] = [Segment] unless info.include? :annotation_types
245
265
 
246
- module Comment
247
- include Segment
248
- attr_accessor :comment
249
- def self.annotate(text, comment = nil)
250
- text.extend Comment
251
- text.comment = (comment.nil? ? text : comment)
252
- text
266
+ Annotated.load(object, info)
253
267
  end
268
+
269
+ def self.set_tsv_fields(fields, segments)
270
+ tsv_fields = []
271
+ add_types = ! (fields.delete(:no_types) || fields.delete("no_types") || fields.include?(:JSON) || fields.include?("JSON"))
272
+ literal = (fields.delete(:literal) || fields.delete("literal"))
273
+ tsv_fields << "Start" << "End"
274
+ tsv_fields << :annotation_types if add_types
275
+ tsv_fields << :literal if literal
276
+
277
+ if fields.any? and not (fields == [:all] or fields == ["all"])
278
+ tsv_fields.concat fields
279
+ else
280
+ tsv_fields.concat segments.first.annotations if segments.any?
281
+ end
282
+ tsv_fields
283
+ tsv_fields.collect!{|f| f.to_s}
284
+ tsv_fields.delete "offset"
285
+ tsv_fields
286
+ end
287
+
288
+ def self.tsv(segments, *fields)
289
+ fields = set_tsv_fields fields, segments
290
+ tsv = TSV.setup({}, :key_field => "ID", :fields => fields, :type => :double)
291
+
292
+ segments.each do |segment|
293
+ tsv[segment.id] = self.tsv_values_for_segment(segment, fields)
294
+ end
295
+
296
+ tsv
297
+ end
298
+
299
+ def self.load_tsv(tsv)
300
+ tsv.collect do |id, values|
301
+ Annotated.load_tsv_values(id, values, tsv.fields)
302
+ end
303
+ end
304
+
254
305
  end
306
+