rbbt-text 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. data/lib/rbbt/corpus/corpus.rb +15 -6
  2. data/lib/rbbt/corpus/document.rb +100 -127
  3. data/lib/rbbt/corpus/document_repo.rb +72 -51
  4. data/lib/rbbt/ner/NER.rb +4 -4
  5. data/lib/rbbt/ner/abner.rb +5 -4
  6. data/lib/rbbt/ner/banner.rb +3 -3
  7. data/lib/rbbt/ner/chemical_tagger.rb +3 -3
  8. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +45 -10
  9. data/lib/rbbt/ner/oscar3.rb +3 -3
  10. data/lib/rbbt/ner/oscar4.rb +3 -3
  11. data/lib/rbbt/ner/patterns.rb +15 -13
  12. data/lib/rbbt/ner/regexpNER.rb +3 -2
  13. data/lib/rbbt/ner/rnorm.rb +2 -2
  14. data/lib/rbbt/ner/rnorm/cue_index.rb +2 -2
  15. data/lib/rbbt/ner/{annotations.rb → segment.rb} +161 -109
  16. data/lib/rbbt/ner/{annotations → segment}/named_entity.rb +3 -11
  17. data/lib/rbbt/ner/segment/relationship.rb +20 -0
  18. data/lib/rbbt/ner/segment/segmented.rb +13 -0
  19. data/lib/rbbt/ner/segment/token.rb +24 -0
  20. data/lib/rbbt/ner/{annotations → segment}/transformed.rb +10 -10
  21. data/lib/rbbt/ner/token_trieNER.rb +30 -22
  22. data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
  23. data/lib/rbbt/nlp/nlp.rb +23 -37
  24. data/test/rbbt/corpus/test_document.rb +39 -37
  25. data/test/rbbt/ner/segment/test_named_entity.rb +29 -0
  26. data/test/rbbt/ner/segment/test_segmented.rb +23 -0
  27. data/test/rbbt/ner/{annotations → segment}/test_transformed.rb +6 -6
  28. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +15 -1
  29. data/test/rbbt/ner/test_patterns.rb +11 -12
  30. data/test/rbbt/ner/test_regexpNER.rb +5 -4
  31. data/test/rbbt/ner/test_segment.rb +101 -0
  32. data/test/rbbt/ner/test_token_trieNER.rb +8 -9
  33. data/test/test_helper.rb +6 -6
  34. metadata +40 -22
  35. data/lib/rbbt/ner/annotations/annotated.rb +0 -15
  36. data/lib/rbbt/ner/annotations/relations.rb +0 -25
  37. data/lib/rbbt/ner/annotations/token.rb +0 -28
  38. data/test/rbbt/ner/annotations/test_named_entity.rb +0 -14
  39. data/test/rbbt/ner/test_annotations.rb +0 -70
data/lib/rbbt/ner/NER.rb CHANGED
@@ -1,6 +1,6 @@
1
- require 'rbbt/ner/annotations'
2
- require 'rbbt/ner/annotations/named_entity'
3
- require 'rbbt/ner/annotations/annotated'
1
+ require 'rbbt/ner/segment'
2
+ require 'rbbt/ner/segment/named_entity'
3
+ require 'rbbt/ner/segment/segmented'
4
4
 
5
5
  class NER
6
6
  def entities(text, protect = false, *args)
@@ -13,7 +13,7 @@ class NER
13
13
  }
14
14
  matches
15
15
  end.flatten
16
- when (Annotated === text and protect)
16
+ when (Segmented === text and protect)
17
17
  entities(text.split_segments(true), protect, *args)
18
18
  else
19
19
  match(text, *args)
@@ -1,13 +1,14 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/ner/annotations'
3
+ require 'rbbt/ner/segment'
4
+ require 'rbbt/resource'
4
5
  require 'rbbt/ner/NER'
5
6
 
6
7
  # Offers a Ruby interface to the Abner Named Entity Recognition Package
7
8
  # in Java Abner[http://www.cs.wisc.edu/~bsettles/abner/].
8
9
  class Abner < NER
9
10
 
10
- Rbbt.software.opt.ABNER.define_as_install Rbbt.share.install.software.ABNER.find
11
+ Rbbt.claim Rbbt.software.opt.ABNER, :install, Rbbt.share.install.software.ABNER.find
11
12
 
12
13
  @@JFile = Rjb::import('java.io.File')
13
14
  @@Tagger = Rjb::import('abner.Tagger')
@@ -38,9 +39,9 @@ class Abner < NER
38
39
  mention = mention.to_s;
39
40
  offset = text.index(mention)
40
41
  if offset.nil?
41
- NamedEntity.annotate(mention, nil, type.to_s)
42
+ NamedEntity.setup(mention, nil, type.to_s)
42
43
  else
43
- NamedEntity.annotate(mention, offset + global_offset, type.to_s)
44
+ NamedEntity.setup(mention, offset + global_offset, type.to_s)
44
45
  text = text[offset + mention.length..-1]
45
46
  global_offset += offset + mention.length
46
47
  end
@@ -1,13 +1,13 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/ner/annotations'
3
+ require 'rbbt/ner/segment'
4
4
  require 'rbbt/ner/NER'
5
5
 
6
6
  # Offers a Ruby interface to the Banner Named Entity Recognition Package
7
7
  # in Java. Banner[http://banner.sourceforge.net/].
8
8
  class Banner < NER
9
9
 
10
- Rbbt.software.opt.BANNER.define_as_install Rbbt.share.install.software.BANNER.find
10
+ Rbbt.claim Rbbt.software.opt.BANNER, :install, Rbbt.share.install.software.BANNER.find
11
11
 
12
12
  @@JFile = Rjb::import('java.io.File')
13
13
  @@SimpleTokenizer = Rjb::import('banner.tokenization.SimpleTokenizer')
@@ -68,7 +68,7 @@ class Banner < NER
68
68
  mention.sub!(/^\s*/,'')
69
69
  mention.sub!(/\s*$/,'')
70
70
  offset = text.index(mention)
71
- NamedEntity.annotate(mention, offset, 'GENE')
71
+ NamedEntity.setup(mention, offset, 'GENE')
72
72
  mention
73
73
  }
74
74
  res
@@ -1,11 +1,11 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/ner/annotations'
3
+ require 'rbbt/ner/segment'
4
4
  require 'rbbt/ner/NER'
5
5
  require 'rbbt/util/log'
6
6
 
7
7
  class ChemicalTagger < NER
8
- Rbbt.software.opt.ChemicalTagger.define_as_install Rbbt.share.install.software.ChemicalTagger.find
8
+ Rbbt.claim Rbbt.software.opt.ChemicalTagger, :install, Rbbt.share.install.software.ChemicalTagger.find
9
9
 
10
10
  Rjb::load(nil, jvmargs = ['-Xms128m','-Xmx2048m'])
11
11
 
@@ -24,7 +24,7 @@ class ChemicalTagger < NER
24
24
 
25
25
  matches.collect do |mention|
26
26
  offset = text.index mention
27
- NamedEntity.annotate mention, offset, "Chemical Mention", nil, nil
27
+ NamedEntity.setup mention, offset, "Chemical Mention", nil, nil
28
28
  end
29
29
  end
30
30
 
@@ -1,7 +1,8 @@
1
- require 'rbbt-util'
2
- require 'rbbt/util/tsv'
3
- require 'rbbt/ner/annotations'
4
- require 'rbbt/ner/annotations/token'
1
+ require 'rbbt'
2
+ require 'rbbt/util/misc'
3
+ require 'rbbt/tsv'
4
+ require 'rbbt/ner/segment'
5
+ require 'rbbt/ner/segment/token'
5
6
  require 'rbbt/ner/NER'
6
7
  require 'inline'
7
8
 
@@ -41,9 +42,28 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
41
42
  end
42
43
  end
43
44
 
44
- def self.process(hash)
45
+ def self.process_stream(stream)
45
46
  index = {}
46
- hash.each do |code, names|
47
+ while line = stream.gets
48
+ names = line.split(/\t|\|/).select{|n| not n.empty?}.compact
49
+ code = names.shift
50
+
51
+ names.each do |name|
52
+ ngram = name[0..2].strip
53
+ index[ngram] ||= []
54
+ index[ngram] << [name, code]
55
+ end
56
+ end
57
+ index
58
+
59
+ end
60
+
61
+ def self.process_hash(hash)
62
+ index = {}
63
+ hash.monitor = true if hash.respond_to? :monitor
64
+ hash.unnamed = true if hash.respond_to? :unnamed
65
+ method = hash.respond_to?(:through)? :through : :each
66
+ hash.send(method) do |code, names|
47
67
  names.each do |name|
48
68
  ngram = name[0..2].strip
49
69
  index[ngram] ||= []
@@ -94,15 +114,30 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
94
114
 
95
115
  attr_accessor :index, :type
96
116
  def initialize(file, type = nil)
97
- tsv = TSV.new(file, :flat)
98
117
  @type = type
99
- tsv.unnamed = true
100
- @index = NGramPrefixDictionary.process(tsv)
118
+ case
119
+ when (TSV === file or Hash === file)
120
+ Log.debug("Ngram Prefix Dictionary. Loading of lexicon hash started.")
121
+ @index = NGramPrefixDictionary.process_hash(file)
122
+ when Path === file
123
+ Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
124
+ @index = NGramPrefixDictionary.process_stream(file.open)
125
+ when Misc.is_filename?(file)
126
+ Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
127
+ @index = NGramPrefixDictionary.process_stream(Open.open(file))
128
+ when StreamIO === file
129
+ Log.debug("Ngram Prefix Dictionary. Loading of lexicon stream started.")
130
+ @index = NGramPrefixDictionary.process_stream(file)
131
+ else
132
+ raise "Format of lexicon not understood: #{file.inspect}"
133
+ end
134
+
135
+ Log.debug("Ngram Prefix Dictionary. Loading done.")
101
136
  end
102
137
 
103
138
  def match(text)
104
139
  NGramPrefixDictionary.match(index, text).collect{|name, code, offset|
105
- NamedEntity.annotate(name, offset, type, code)
140
+ NamedEntity.setup(name, offset, type, code)
106
141
  }
107
142
  end
108
143
  end
@@ -1,12 +1,12 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'libxml'
4
- require 'rbbt/ner/annotations'
4
+ require 'rbbt/ner/segment'
5
5
  require 'rbbt/ner/NER'
6
6
  require 'rbbt/util/log'
7
7
 
8
8
  class OSCAR3 < NER
9
- Rbbt.software.opt.OSCAR3.define_as_install Rbbt.share.install.software.OSCAR3.find
9
+ Rbbt.claim Rbbt.software.opt.OSCAR3, :install, Rbbt.share.install.software.OSCAR3.find
10
10
 
11
11
  @@TextToSciXML = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
12
12
  @@ProcessingDocumentFactory = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
@@ -50,7 +50,7 @@ class OSCAR3 < NER
50
50
  next unless type.nil? or type.include? mention_type
51
51
  score = memm ? entities.get(key).to_string.to_f : nil
52
52
 
53
- NamedEntity.annotate mention, rstart.to_i + offset, mention_type, nil, score
53
+ NamedEntity.setup mention, rstart.to_i + offset, mention_type, nil, score
54
54
 
55
55
  mentions << mention unless mentions.collect{|m| m.to_s}.include? mention.to_s
56
56
  end
@@ -1,12 +1,12 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'libxml'
4
- require 'rbbt/ner/annotations'
4
+ require 'rbbt/ner/segment'
5
5
  require 'rbbt/ner/NER'
6
6
  require 'rbbt/util/log'
7
7
 
8
8
  class OSCAR4 < NER
9
- Rbbt.software.opt.OSCAR4.define_as_install Rbbt.share.install.software.OSCAR4.find
9
+ Rbbt.claim Rbbt.software.opt.OSCAR4, :install, Rbbt.share.install.software.OSCAR4.find
10
10
 
11
11
  Rjb::load(nil, jvmargs = ['-Xms128m','-Xmx2048m'])
12
12
  @@OSCAR = Rjb::import('uk.ac.cam.ch.wwmm.oscar.Oscar')
@@ -26,7 +26,7 @@ class OSCAR4 < NER
26
26
  mention = entity.getSurface
27
27
  result << mention
28
28
 
29
- NamedEntity.annotate mention, entity.getStart, entity.getType, nil, entity.getNamedEntity.getConfidence
29
+ NamedEntity.setup mention, entity.getStart, entity.getType, nil, entity.getNamedEntity.getConfidence
30
30
  end
31
31
 
32
32
  result
@@ -1,7 +1,7 @@
1
- require 'rbbt/ner/annotations/named_entity'
2
- require 'rbbt/ner/annotations/annotated'
3
- require 'rbbt/ner/annotations/transformed'
4
- require 'rbbt/ner/annotations/relations'
1
+ require 'rbbt/ner/segment/named_entity'
2
+ require 'rbbt/ner/segment/segmented'
3
+ require 'rbbt/ner/segment/transformed'
4
+ require 'rbbt/ner/segment/relationship'
5
5
  require 'rbbt/ner/regexpNER'
6
6
  require 'rbbt/ner/token_trieNER'
7
7
  require 'rbbt/nlp/nlp'
@@ -12,7 +12,9 @@ class PatternRelExt
12
12
  patterns = Array === patterns ? patterns : [patterns]
13
13
  type ||= "Simple Pattern"
14
14
  regexpNER = RegExpNER.new type => patterns.collect{|p| /#{p}/}
15
- Transformed.with_transform(sentence, sentence.annotations, Proc.new{|s| s.type.to_s.upcase}) do |sentence|
15
+ segments = sentence.segments
16
+ segments = segments.values.flatten if Hash === segments
17
+ Transformed.with_transform(sentence, segments, Proc.new{|s| s.type.to_s.upcase}) do |sentence|
16
18
  regexpNER.entities(sentence)
17
19
  end
18
20
  end
@@ -23,23 +25,23 @@ class PatternRelExt
23
25
  when key =~ /(.*)\[entity:(.*)\]/
24
26
  chunk_type, chunk_value = $1, $2
25
27
  annotation_types = chunk_value.split(",")
26
- Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
27
- ((Hash === chunk.annotations ? chunk.annotations.values.flatten : chunk.annotations).flatten.select{|a| NamedEntity === a}.collect{|a| a.type.to_s}.flatten & annotation_types).any? }
28
+ Proc.new{|chunk| (chunk_type == "all" or (Array === chunk.type ? chunk.type.include?(chunk_type) : chunk.type == chunk_type)) and
29
+ ((Hash === chunk.segments ? chunk.segments.values.flatten : chunk.segments).flatten.select{|a| NamedEntity === a}.collect{|a| a.type.to_s}.flatten & annotation_types).any? }
28
30
 
29
31
  when key =~ /(.*)\[code:(.*)\]/
30
32
  chunk_type, chunk_value = $1, $2
31
33
  annotation_codes = chunk_value.split(",")
32
- Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
33
- ((Hash === chunk.annotations ? chunk.annotations.values.flatten : chunk.annotations).select{|a| NamedEntity === a}.collect{|a| a.code}.flatten & annotation_codes).any? }
34
+ Proc.new{|chunk| (chunk_type == "all" or (Array === chunk.type ? chunk.type.include?(chunk_type) : chunk.type == chunk_type)) and
35
+ ((Hash === chunk.segments ? chunk.segments.values.flatten : chunk.segments).select{|a| NamedEntity === a}.collect{|a| a.code}.flatten & annotation_codes).any? }
34
36
 
35
37
  when key =~ /(.*)\[stem:(.*)\]/
36
38
  chunk_type, chunk_value = $1, $2
37
- Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
39
+ Proc.new{|chunk| (chunk_type == "all" or (Array === chunk.type ? chunk.type.include?(chunk_type) : chunk.type == chunk_type)) and
38
40
  chunk.split(/\s+/).select{|w| w.stem == chunk_value.stem}.any?}
39
41
 
40
42
  when key =~ /(.*)\[(.*)\]/
41
43
  chunk_type, chunk_value = $1, $2
42
- Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
44
+ Proc.new{|chunk| (chunk_type == "all" or (Array === chunk.type ? chunk.type.include?(chunk_type) : chunk.type == chunk_type)) and
43
45
  chunk.parts.values.select{|a| a == chunk_value}.any?}
44
46
 
45
47
  else
@@ -120,9 +122,9 @@ class PatternRelExt
120
122
  sentence_chunks = NLP.gdep_chunk_sentences(sentences)
121
123
 
122
124
  sentences.zip(sentence_chunks).collect do |sentence, chunks|
123
- annotation_index = Segment.index(sentence.annotations)
125
+ annotation_index = Segment.index(sentence.segments)
124
126
  chunks.each do |chunk|
125
- Annotated.annotate(chunk, annotation_index[chunk.range])
127
+ Segmented.setup(chunk, annotation_index[chunk.range])
126
128
  end
127
129
 
128
130
  match_chunks(chunks)
@@ -1,4 +1,4 @@
1
- require 'rbbt/ner/annotations'
1
+ require 'rbbt/ner/segment'
2
2
  require 'rbbt/ner/NER'
3
3
  require 'rbbt/util/simpleDSL'
4
4
 
@@ -23,7 +23,7 @@ class RegExpNER < NER
23
23
  end
24
24
 
25
25
  if match and not match.empty?
26
- NamedEntity.annotate(match, start + pre.length, type)
26
+ NamedEntity.setup(match, start + pre.length, type)
27
27
  matches << match
28
28
  end
29
29
 
@@ -86,6 +86,7 @@ class RegExpNER < NER
86
86
 
87
87
  def match(text)
88
88
  matches = RegExpNER.match_regexp_hash(text, @regexps)
89
+ matches
89
90
  end
90
91
 
91
92
  end
@@ -1,7 +1,7 @@
1
1
  require 'rbbt/ner/rnorm/cue_index'
2
2
  require 'rbbt/ner/rnorm/tokens'
3
3
  require 'rbbt/util/open'
4
- require 'rbbt/util/tsv'
4
+ require 'rbbt/tsv'
5
5
  require 'rbbt/sources/entrez'
6
6
  require 'rbbt/bow/bow.rb'
7
7
 
@@ -89,7 +89,7 @@ class Normalizer
89
89
 
90
90
 
91
91
  def initialize(lexicon, options = {})
92
- @synonyms = TSV.new(lexicon, :flat)
92
+ @synonyms = TSV.open(lexicon, :flat)
93
93
 
94
94
  @index = CueIndex.new
95
95
  @index.load(lexicon, options[:max_candidates])
@@ -1,4 +1,4 @@
1
- require 'rbbt-util'
1
+ require 'rbbt'
2
2
  require 'rbbt/util/misc'
3
3
  require 'rbbt/util/simpleDSL'
4
4
 
@@ -47,7 +47,7 @@ class CueIndex
47
47
 
48
48
  def load(file, max_candidates = 50)
49
49
  @indexes = Array.new(@rules.size){Hash.new}
50
- data = TSV.new(file, :flat)
50
+ data = TSV.open(file, :flat)
51
51
  data.each{|code, values|
52
52
  values.each{|value|
53
53
  cues(value).each_with_index{|cue_list,i|
@@ -1,63 +1,59 @@
1
+ require 'rbbt/annotations'
2
+ require 'rbbt/fix_width_table'
3
+
1
4
  module Segment
2
- attr_accessor :offset, :docid
3
-
4
- def self.included(base)
5
- if base.instance_methods.include? "segment_types"
6
- class << base
7
- self.module_eval do
8
- define_method "extended" do |object|
9
- object.segment_types ||= []
10
- object.segment_types << self.to_s unless object.segment_types.include? self.to_s
11
- end
12
- end
13
- end
14
- end
15
- end
5
+ extend Annotation
6
+ self.annotation :offset
16
7
 
17
- def self.annotate(string, offset = nil, docid = nil)
18
- string.extend Segment
19
- string.offset = offset
20
- string.docid = docid
21
- string
22
- end
8
+ #{{{ Ranges
23
9
 
24
- def id
25
- new = info.dup
26
- Digest::MD5.hexdigest(Misc.hash2string(new) << self << (offset || 0).to_s)
10
+ def end
11
+ return nil if offset.nil?
12
+ offset + length - 1
27
13
  end
28
-
29
- SKIP = %w(docid offset)
30
- def info
31
- equal_ascii = "="[0]
32
- info = {}
33
- singleton_methods.select{|method| method[-1] == equal_ascii}.
34
- collect{|m| m[(0..-2)]}.each{|m| info[m] = self.send(m) if self.respond_to?(m) and not SKIP.include? m.to_s}
35
- info
36
- info.delete_if{|k,v| v.nil?}
37
- info
14
+
15
+ def range
16
+ raise "No offset specified" if offset.nil?
17
+ (offset..self.end)
38
18
  end
39
19
 
40
- def self.load(text, start, eend, info, docid = nil)
41
- string = text[start.to_i..eend.to_i] if start and eend
42
- string ||= info[:literal]
43
- string.extend Segment
20
+ def pull(offset)
21
+ if self.offset.nil? or offset.nil?
22
+ self.offset = nil
23
+ else
24
+ self.offset += offset
25
+ end
44
26
 
45
- # add types
46
- types = info.delete("segment_types")|| info.delete(:segment_types) || []
47
- types.each do |type| string.extend Misc.string2const(type) end
27
+ self
28
+ end
48
29
 
49
- # set info data
50
- info.each do |key,value|
51
- string.send key + '=', value if string.respond_to? key.to_sym
30
+ def push(offset)
31
+ if self.offset.nil? or offset.nil?
32
+ self.offset = nil
33
+ else
34
+ self.offset -= offset
52
35
  end
53
36
 
54
- string.docid = docid
55
- string.offset = start.to_i
37
+ self
38
+ end
56
39
 
57
- string
40
+ def make_relative(segments)
41
+ segments.collect{|s| s.push offset}
58
42
  end
59
43
 
60
- # {{{ Sorting and splitting
44
+ def range_in(container = nil)
45
+ raise "No offset specified" if offset.nil?
46
+ case
47
+ when (Segment === container and not container.offset.nil?)
48
+ ((offset - container.offset)..(self.end - container.offset))
49
+ when Integer === container
50
+ ((offset - container)..(self.end - container))
51
+ else
52
+ range
53
+ end
54
+ end
55
+
56
+ #{{{ Sorting
61
57
 
62
58
  def self.sort(segments, inline = true)
63
59
  if inline
@@ -102,13 +98,15 @@ module Segment
102
98
  sorted
103
99
  end
104
100
 
101
+ #{{{ Splitting
102
+
105
103
  def self.split(text, segments, skip_segments = false)
106
104
  sorted_segments = clean_sort segments
107
105
 
108
106
  chunks = []
109
107
  segment_end = 0
110
108
  text_offset = 0
111
- sorted_segments.reverse.each do |segment|
109
+ sorted_segments.each do |segment|
112
110
  return chunks if text.nil? or text.empty?
113
111
  next if segment.offset.nil?
114
112
  offset = segment.offset - text_offset
@@ -119,7 +117,7 @@ module Segment
119
117
  next
120
118
  when offset > 0 # Save pre
121
119
  chunk = text[0..offset - 1]
122
- Segment.annotate(chunk, text_offset)
120
+ Segment.setup(chunk, text_offset)
123
121
  chunks << chunk
124
122
  end
125
123
 
@@ -127,7 +125,7 @@ module Segment
127
125
 
128
126
  if not skip_segments
129
127
  chunk = text[offset..segment_end]
130
- Segment.annotate(chunk, text_offset + offset)
128
+ Segment.setup(chunk, text_offset + offset)
131
129
  chunks << chunk
132
130
  end
133
131
 
@@ -138,72 +136,29 @@ module Segment
138
136
 
139
137
  if not text.nil? and text.any?
140
138
  chunk = text.dup
141
- Segment.annotate(chunk, text_offset)
139
+ Segment.setup(chunk, text_offset)
142
140
  chunks << chunk
143
141
  end
144
142
 
145
143
  chunks
146
144
  end
147
145
 
148
- # {{{ Ranges and manipulation
149
146
 
150
- def pull(offset)
151
- if self.offset.nil? or offset.nil?
152
- self.offset = nil
153
- else
154
- self.offset += offset
155
- end
156
-
157
- self
158
- end
159
-
160
- def push(offset)
161
- if self.offset.nil? or offset.nil?
162
- self.offset = nil
163
- else
164
- self.offset -= offset
165
- end
166
-
167
- self
168
- end
169
-
170
- def make_relative(segments)
171
- segments.collect{|s| s.push offset}
172
- end
173
-
174
- def end
175
- return nil if offset.nil?
176
- offset + length - 1
177
- end
178
-
179
- def range
180
- raise "No offset specified" if offset.nil?
181
- (offset..self.end)
182
- end
183
-
184
- def range_in(container = nil)
185
- raise "No offset specified" if offset.nil?
186
- case
187
- when (Segment === container and not container.offset.nil?)
188
- ((offset - container.offset)..(self.end - container.offset))
189
- when Integer === container
190
- ((offset - container)..(self.end - container))
191
- else
192
- range
193
- end
194
- end
147
+ #{{{ Align
195
148
 
196
149
  def self.align(text, parts)
197
150
  pre_offset = 0
198
151
  parts.each do |part|
199
152
  offset = text.index part
200
153
  next if offset.nil?
201
- Segment.annotate(part, pre_offset + offset)
154
+ Segment.setup(part, pre_offset + offset)
202
155
  pre_offset += offset + part.length - 1
203
156
  text = text[(offset + part.length - 1)..-1]
204
157
  end
205
158
  end
206
159
 
160
+ #{{{ Index
161
+
207
162
  class Index
208
163
  attr_accessor :index, :data
209
164
  def initialize(index, data)
@@ -216,12 +171,11 @@ module Segment
216
171
  end
217
172
  end
218
173
 
219
- def self.index(segments, persistence_file = :memory)
220
-
174
+ def self.index(segments, persist_file = :memory)
221
175
  segments = segments.values.flatten if Hash === segments
222
176
 
223
177
  annotation_index =
224
- Persistence.persist("Index", :Index, :fwt, :persistence => (! (persistence_file.nil? or persistence_file == :memory)), :persistence_file => persistence_file, :range => true) do
178
+ Persist.persist("Segment_index", :fwt, :persist => (! (persist_file.nil? or persist_file == :memory)), :file => persist_file) do
225
179
 
226
180
  value_size = 0
227
181
  index_data = segments.collect{|segment|
@@ -233,6 +187,7 @@ module Segment
233
187
 
234
188
  fwt = FixWidthTable.get :memory, value_size, true
235
189
  fwt.add_range index_data
190
+
236
191
  fwt
237
192
  end
238
193
 
@@ -241,14 +196,111 @@ module Segment
241
196
  Index.new annotation_index, data
242
197
  end
243
198
 
244
- end
199
+ #{{{ Save and load
200
+
201
+ def self.tsv_values_for_segment(segment, fields)
202
+ info = segment.info
203
+ values = []
204
+
205
+ fields.each do |field|
206
+ values << case
207
+ when field == "JSON"
208
+ info.to_json
209
+ when field == "literal"
210
+ segment.gsub(/\n|\t/, ' ')
211
+ when field == "Start"
212
+ segment.offset
213
+ when field == "End"
214
+ segment.end
215
+ else
216
+ info.delete(field.to_sym)
217
+ end
218
+ end
219
+
220
+ values
221
+ end
222
+
223
+ def self.load_tsv_values(text, values, fields)
224
+ info = {}
225
+ literal_pos = fields.index "literal"
226
+
227
+ object = if literal_pos.nil?
228
+ ""
229
+ else
230
+ v = values[literal_pos]
231
+ v = v.first if Array === v
232
+ v
233
+ end
234
+
235
+ fields.each_with_index do |field, i|
236
+ if field == "JSON"
237
+ JSON.parse(values[i]).each do |key, value|
238
+ info[key.to_sym] = value
239
+ end
240
+ else
241
+ info[field.to_sym] = values[i]
242
+ end
243
+ end
244
+
245
+ start = info.delete(:Start)
246
+ if not (start.nil? or ((Array === start or String === start) and start.empty?))
247
+ if Array === start
248
+ start = start.first
249
+ end
250
+ start = start.to_i
251
+ info[:offset] = start
252
+
253
+ eend = info.delete(:End)
254
+ if Array === eend
255
+ eend = eend.first
256
+ end
257
+ eend = eend.to_i
258
+
259
+ if object.empty?
260
+ object.replace text[start..eend]
261
+ end
262
+ end
263
+
264
+ info[:annotation_types] = [Segment] unless info.include? :annotation_types
245
265
 
246
- module Comment
247
- include Segment
248
- attr_accessor :comment
249
- def self.annotate(text, comment = nil)
250
- text.extend Comment
251
- text.comment = (comment.nil? ? text : comment)
252
- text
266
+ Annotated.load(object, info)
253
267
  end
268
+
269
+ def self.set_tsv_fields(fields, segments)
270
+ tsv_fields = []
271
+ add_types = ! (fields.delete(:no_types) || fields.delete("no_types") || fields.include?(:JSON) || fields.include?("JSON"))
272
+ literal = (fields.delete(:literal) || fields.delete("literal"))
273
+ tsv_fields << "Start" << "End"
274
+ tsv_fields << :annotation_types if add_types
275
+ tsv_fields << :literal if literal
276
+
277
+ if fields.any? and not (fields == [:all] or fields == ["all"])
278
+ tsv_fields.concat fields
279
+ else
280
+ tsv_fields.concat segments.first.annotations if segments.any?
281
+ end
282
+ tsv_fields
283
+ tsv_fields.collect!{|f| f.to_s}
284
+ tsv_fields.delete "offset"
285
+ tsv_fields
286
+ end
287
+
288
+ def self.tsv(segments, *fields)
289
+ fields = set_tsv_fields fields, segments
290
+ tsv = TSV.setup({}, :key_field => "ID", :fields => fields, :type => :double)
291
+
292
+ segments.each do |segment|
293
+ tsv[segment.id] = self.tsv_values_for_segment(segment, fields)
294
+ end
295
+
296
+ tsv
297
+ end
298
+
299
+ def self.load_tsv(tsv)
300
+ tsv.collect do |id, values|
301
+ Annotated.load_tsv_values(id, values, tsv.fields)
302
+ end
303
+ end
304
+
254
305
  end
306
+