rbbt-text 1.2.0 → 1.3.4

This diff reflects the changes between two publicly released versions of the package, as published to its public registry, and is provided for informational purposes only.
Files changed (76)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +55 -0
  5. data/lib/rbbt/document/annotation.rb +45 -0
  6. data/lib/rbbt/document/corpus.rb +63 -0
  7. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  8. data/lib/rbbt/ner/NER.rb +3 -3
  9. data/lib/rbbt/ner/abner.rb +1 -1
  10. data/lib/rbbt/ner/banner.rb +1 -1
  11. data/lib/rbbt/ner/brat.rb +1 -1
  12. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  13. data/lib/rbbt/ner/g_norm_plus.rb +26 -3
  14. data/lib/rbbt/ner/linnaeus.rb +3 -3
  15. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  16. data/lib/rbbt/ner/oscar3.rb +1 -2
  17. data/lib/rbbt/ner/oscar4.rb +3 -3
  18. data/lib/rbbt/ner/patterns.rb +5 -5
  19. data/lib/rbbt/ner/regexpNER.rb +1 -2
  20. data/lib/rbbt/ner/token_trieNER.rb +35 -22
  21. data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
  22. data/lib/rbbt/nlp/nlp.rb +5 -5
  23. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  24. data/lib/rbbt/nlp/spaCy.rb +52 -0
  25. data/lib/rbbt/segment.rb +179 -0
  26. data/lib/rbbt/segment/annotation.rb +58 -0
  27. data/lib/rbbt/segment/encoding.rb +18 -0
  28. data/lib/rbbt/{text/segment → segment}/named_entity.rb +14 -11
  29. data/lib/rbbt/segment/overlaps.rb +63 -0
  30. data/lib/rbbt/segment/range_index.rb +35 -0
  31. data/lib/rbbt/segment/relationship.rb +7 -0
  32. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  33. data/lib/rbbt/segment/token.rb +23 -0
  34. data/lib/rbbt/{text/segment → segment}/transformed.rb +12 -10
  35. data/lib/rbbt/segment/tsv.rb +41 -0
  36. data/share/install/software/Linnaeus +1 -1
  37. data/share/install/software/OpenNLP +1 -1
  38. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  39. data/test/rbbt/document/test_annotation.rb +140 -0
  40. data/test/rbbt/document/test_corpus.rb +33 -0
  41. data/test/rbbt/ner/test_finder.rb +3 -3
  42. data/test/rbbt/ner/test_g_norm_plus.rb +20 -2
  43. data/test/rbbt/ner/test_patterns.rb +9 -9
  44. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  45. data/test/rbbt/ner/test_rnorm.rb +3 -4
  46. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  47. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
  48. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  49. data/test/rbbt/segment/test_annotation.rb +39 -0
  50. data/test/rbbt/segment/test_corpus.rb +36 -0
  51. data/test/rbbt/segment/test_encoding.rb +24 -0
  52. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
  53. data/test/rbbt/segment/test_overlaps.rb +69 -0
  54. data/test/rbbt/segment/test_range_index.rb +42 -0
  55. data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
  56. data/test/rbbt/test_document.rb +14 -0
  57. data/test/rbbt/test_segment.rb +182 -0
  58. data/test/test_helper.rb +5 -3
  59. data/test/test_spaCy.rb +32 -0
  60. metadata +44 -32
  61. data/lib/rbbt/text/corpus.rb +0 -106
  62. data/lib/rbbt/text/corpus/document.rb +0 -383
  63. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  64. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  65. data/lib/rbbt/text/document.rb +0 -39
  66. data/lib/rbbt/text/segment.rb +0 -363
  67. data/lib/rbbt/text/segment/docid.rb +0 -46
  68. data/lib/rbbt/text/segment/relationship.rb +0 -24
  69. data/lib/rbbt/text/segment/token.rb +0 -49
  70. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  71. data/test/rbbt/text/corpus/test_document.rb +0 -82
  72. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  73. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  74. data/test/rbbt/text/test_corpus.rb +0 -34
  75. data/test/rbbt/text/test_document.rb +0 -58
  76. data/test/rbbt/text/test_segment.rb +0 -100

data/lib/rbbt/ner/oscar3.rb
@@ -1,7 +1,6 @@
  require 'rbbt'
  require 'rjb'
  require 'libxml'
- require 'rbbt/text/segment'
  require 'rbbt/ner/NER'
  require 'rbbt/util/log'

@@ -53,7 +52,7 @@ class OSCAR3 < NER
  next unless type.nil? or type.include? mention_type
  score = memm ? entities.get(key).to_string.to_f : nil

- NamedEntity.setup mention, rstart.to_i + offset, mention_type, nil, score
+ NamedEntity.setup mention, :offset => rstart.to_i + offset, :entity_type => mention_type, :score => score

  mentions << mention unless mentions.collect{|m| m.to_s}.include? mention.to_s
  end
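
Across the NER backends in this release (OSCAR3, OSCAR4, RegExpNER, TokenTrieNER), NamedEntity.setup switches from positional arguments to an options hash. A minimal sketch of the new call style, not taken from the package tests; the mention text and values are illustrative:

    require 'rbbt/segment/named_entity'

    mention = "BRCA1"
    NamedEntity.setup(mention,
                      :offset      => 117,      # character offset of the mention in its document
                      :entity_type => "GENE",   # replaces the old positional type argument
                      :score       => 0.87)     # optional confidence score

    mention.offset       #=> 117
    mention.entity_type  #=> "GENE"
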

data/lib/rbbt/ner/oscar4.rb
@@ -1,7 +1,7 @@
  require 'rbbt'
  require 'rjb'
  require 'libxml'
- require 'rbbt/text/segment'
+ require 'rbbt/segment'
  require 'rbbt/ner/NER'
  require 'rbbt/util/log'

@@ -25,7 +25,7 @@ class OSCAR4 < NER
  @@tagger ||= @@OSCAR.new()
  end

- def self.match(text, type = nil)
+ def self.match(text, protect = false, type = nil)
  self.init

  return [] if text.nil? or text.strip.empty?
@@ -46,7 +46,7 @@ class OSCAR4 < NER

  next unless entity.getType.toString == type unless type.nil?

- NamedEntity.setup mention, entity.getStart, entity.getType, inchi, entity.getConfidence
+ NamedEntity.setup mention, :offset => entity.getStart, :entity_type => entity.getType, :code => inchi, :score => entity.getConfidence

  result << mention
  end

data/lib/rbbt/ner/patterns.rb
@@ -1,7 +1,7 @@
- require 'rbbt/text/segment/named_entity'
- require 'rbbt/text/segment/segmented'
- require 'rbbt/text/segment/transformed'
- require 'rbbt/text/segment/relationship'
+ require 'rbbt/segment/named_entity'
+ require 'rbbt/segment/segmented'
+ require 'rbbt/segment/transformed'
+ #require 'rbbt/segment/relationship'
  require 'rbbt/ner/regexpNER'
  require 'rbbt/ner/token_trieNER'
  require 'rbbt/nlp/nlp'
@@ -14,7 +14,7 @@ class PatternRelExt
  regexpNER = RegExpNER.new type => patterns.collect{|p| /#{p}/}
  segments = sentence.segments
  segments = segments.values.flatten if Hash === segments
- Transformed.with_transform(sentence, segments, Proc.new{|s| s.type.to_s.upcase}) do |sentence|
+ Transformed.with_transform(sentence, segments, Proc.new{|s| s.entity_type.to_s.upcase}) do |sentence|
  regexpNER.entities(sentence)
  end
  end
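
PatternRelExt builds a RegExpNER from a hash of entity types to regular expressions and then calls entities on the transformed sentence. A minimal stand-alone sketch of that same interface, with an illustrative pattern and label; whether entities returns a flat list or a hash keyed by type is handled the same way the hunk above handles sentence.segments:

    require 'rbbt/ner/regexpNER'

    ner = RegExpNER.new "disease" => [/\w+ cancer/]

    matches = ner.entities("Risk factors for lung cancer and breast cancer")
    matches = matches.values.flatten if Hash === matches

    matches.each do |m|
      puts [m, m.offset, m.entity_type] * "\t"   # each match is a NamedEntity segment
    end
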

data/lib/rbbt/ner/regexpNER.rb
@@ -1,4 +1,3 @@
- require 'rbbt/text/segment'
  require 'rbbt/ner/NER'
  require 'rbbt/util/simpleDSL'

@@ -23,7 +22,7 @@ class RegExpNER < NER
  end

  if match and not match.empty?
- NamedEntity.setup(match, start + pre.length, type)
+ NamedEntity.setup(match, :offset => start + pre.length, :entity_type => type)
  matches << match
  end


data/lib/rbbt/ner/token_trieNER.rb
@@ -1,46 +1,58 @@
  require 'rbbt'
  require 'rbbt/tsv'
- require 'rbbt/text/segment'
- require 'rbbt/text/segment/token'
+ require 'rbbt/segment'
  require 'rbbt/ner/NER'
+ require 'rbbt/segment/token'

  class TokenTrieNER < NER
- def self.clean(token)
+ def self.clean(token, stem = false)
  if token.length > 3
- token.downcase.sub(/-/,'')
+ upcase = token !~ /[a-z]/
+ token = token.downcase.sub(/-/,'')
+
+ if stem && ! upcase
+ require 'stemmer'
+ if stem == :double
+ token = token.stem.stem
+ else
+ token = token.stem
+ end
+ end
+
+ token
  else
  token
  end
  end

- def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
+ def self.prepare_token(token, start, extend_to_token = true, no_clean = false, stem = false)
  if no_clean
  if extend_to_token
- Token.setup(clean(token), start, token)
+ Token.setup(token, :offset => start, :original => token)
  else
  token
  end
  else
  if extend_to_token
- Token.setup(clean(token), start, token)
+ Token.setup(clean(token, stem), :offset => start, :original => token)
  else
- clean(token)
+ clean(token, stem)
  end
  end
  end

- def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, start = 0)
- split_at = /\s|(\(|\)|[-."':,])/ if split_at.nil?
+ def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, stem = false, start = 0)
+ split_at = /\s|(\(|\)|[-."':,;])/ if split_at.nil?

  tokens = []
  while matchdata = text.match(split_at)
- tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean) unless matchdata.pre_match.empty?
- tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean) if matchdata.captures.any? and not matchdata.captures.first.empty?
+ tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean, stem) unless matchdata.pre_match.empty?
+ tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean, stem) if matchdata.captures.any? and not matchdata.captures.first.empty?
  start += matchdata.end(0)
  text = matchdata.post_match
  end

- tokens << prepare_token(text, start, extend_to_token) unless text.empty?
+ tokens << prepare_token(text, start, extend_to_token, no_clean, stem) unless text.empty?

  tokens
  end
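
clean and tokenize now take a stem flag: tokens longer than three characters that are not all uppercase are run through the stemmer gem (twice when stem is :double) before they enter the trie. A minimal sketch of the new tokenize signature, assuming the stemmer gem is available; the sentence is illustrative:

    require 'rbbt/ner/token_trieNER'

    text = "Protein kinases phosphorylate other proteins"

    # tokenize(text, extend_to_token, split_at, no_clean, stem, start)
    tokens = TokenTrieNER.tokenize(text, true, nil, false, true)

    tokens.first.offset    #=> 0
    tokens.first.original  #=> "Protein"   # Token keeps the original surface form
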
@@ -130,14 +142,14 @@ class TokenTrieNER < NER
  index1
  end

- def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false)
+ def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false, stem = false)

  chunk_size = hash.size / 100
  items_in_chunk = 0
  tmp_index = {}
  hash.send(hash.respond_to?(:through)? :through : :each) do |code, names|
  names = Array === names ? names : [names]
- names.flatten! if Array === names.first and not Token === names.first.first
+ names.flatten! if Array === names.first and not Segment === names.first.first

  if names.empty?
  names.unshift code unless TSV === hash and not (hash.fields.nil? or hash.fields.empty?)
@@ -146,7 +158,7 @@

  names.each do |name|
  next if name.empty? or (String === name and name.length < 2)

- tokens = Array === name ? name : tokenize(name, false, split_at, no_clean)
+ tokens = Array === name ? name : tokenize(name, false, split_at, no_clean, stem)
  tokens.extend EnumeratedArray

  token_index = index_for_tokens(tokens, code, type, slack)
@@ -237,10 +249,10 @@
  match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
  }

- NamedEntity.setup(match, match_tokens.first.offset, type, codes)
+ NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
  end

- attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
+ attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem
  def initialize(type = nil, file = nil, options = {})
  options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
  :persist => false
@@ -248,6 +260,7 @@ class TokenTrieNER < NER
  @longest_match = options.delete :longest_match
  @split_at = options.delete :split_at
  @no_clean = options.delete :no_clean
+ @stem = options.delete :stem

  file = [] if file.nil?
  file = [file] unless Array === file
@@ -273,7 +286,7 @@ class TokenTrieNER < NER
  Log.debug "TokenTrieNER merging TSV"
  new.with_unnamed do
  new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
- TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+ TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
  end
  end
  when Hash === new
@@ -284,18 +297,18 @@ class TokenTrieNER < NER
  new = TSV.open(new, :flat)
  new.with_unnamed do
  new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
- TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+ TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
  end
  end
  end
  end

  def match(text)
- tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean)
+ tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean, stem)
 
  tokens.extend EnumeratedArray
  tokens.pos = 0
-
+
  matches = []
  while tokens.left?
  new_matches = TokenTrieNER.find(@index, tokens, longest_match, slack)
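
The new :stem option is threaded from the constructor through process, tokenize and match, so dictionary names and query text are stemmed consistently. A sketch of how the option would be used, assuming the dictionary can be fed as a plain Hash of codes to names (the Hash branch shown above); the identifiers are illustrative and the merge call is an assumption, not taken from the visible hunks:

    require 'rbbt/ner/token_trieNER'

    ner = TokenTrieNER.new "Gene", nil, :stem => true
    ner.merge "TP53" => ["p53 tumor suppressor"]

    # "suppressors" stems to the same token as "suppressor", so the entry still matches
    ner.match("Mutations affecting the p53 tumor suppressors are common.").each do |m|
      puts [m, m.offset, m.entity_type, m.code.inspect] * "\t"
    end
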

data/lib/rbbt/nlp/genia/sentence_splitter.rb
@@ -1,5 +1,5 @@
  require 'rbbt/nlp/nlp'
- require 'rbbt/text/segment'
+ require 'rbbt/segment'
  module NLP
  Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find

@@ -239,6 +239,7 @@ module NLP
  end

  def self.geniass_sentence_splitter(text)
+ Rbbt.software.opt.Geniass.produce
  offsets = []

  cleaned = text.gsub("\n",NEW_LINE_MASK)
@@ -294,7 +295,7 @@ module NLP
  offsets.collect do |s,e|
  sentence = text[s..e]
  next if sentence.nil?
- #sentence.gsub!(NEW_LINE_MASK, "\n")
+ sentence.gsub!(NEW_LINE_MASK, "\n")
  Segment.setup sentence, s
  sentence
  end

data/lib/rbbt/nlp/nlp.rb
@@ -2,8 +2,8 @@ require 'rbbt'
  require 'rbbt/util/tmpfile'
  require 'rbbt/persist'
  require 'rbbt/resource'
- require 'rbbt/text/segment'
- require 'rbbt/text/segment/segmented'
+ require 'rbbt/segment'
+ require 'rbbt/segment/segmented'
  require 'rbbt/nlp/genia/sentence_splitter'
  require 'digest/md5'

@@ -101,7 +101,7 @@ module NLP
  input = sentences.collect{|sentence| sentence.gsub(/\n/, NEW_LINE_MASK)} * "\n"
  sentence_tokens = TmpFile.with_file(input) do |fin|
  out = local_persist(Digest::MD5.hexdigest(input), :Chunks, :string) do
- CMD.cmd("cd #{Rbbt.software.opt.Gdep.find}; ./gdep #{ fin }").read
+ CMD.cmd("cd #{Rbbt.software.opt.Gdep.produce.find}; ./gdep #{ fin }").read
  end

  out.split(/^$/).collect do |sentence|
@@ -120,10 +120,10 @@


  def self.gdep_parse_sentences_extension(sentences)
- require Rbbt.software.opt.Gdep.ruby["Gdep.so"].find
+ require Rbbt.software.opt.Gdep.produce.ruby["Gdep.so"].find
  gdep = Gdep.new
  if not gdep.gdep_is_loaded
- Misc.in_dir Rbbt.software.opt.Gdep.find do
+ Misc.in_dir Rbbt.software.opt.Gdep.produce.find do
  gdep.load_gdep
  end
  end

data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb
@@ -1,6 +1,6 @@
  require 'rbbt'
  require 'rjb'
- require 'rbbt/text/segment'
+ require 'rbbt/segment'
  require 'rbbt/resource'

  module OpenNLP
@@ -33,48 +33,49 @@ module OpenNLP
  def self.sentence_splitter(text)
  return [] if text.nil? or text.empty?

- text = Misc.to_utf8(text)
- last = 0
- begin
- sentence_split_detector = self.sentence_split_detector
-
- sentences = nil
- TmpFile.with_file do |tmpfile|
- start_time = Time.now
-
- begin
- pid = Process.fork do
- sent = sentence_split_detector.sentDetect(text)
- Open.write(tmpfile, sent * "#OpenNLP:SENTENCE#")
- end
+ Segment.ascii(text, "?") do
+ last = 0
+ begin
+ sentence_split_detector = self.sentence_split_detector

- while not Process.waitpid(pid)
- if Time.now - start_time > MAX
- Process.kill(9, pid)
- raise "Taking to long (> #{MAX} seconds)"
- end
- sleep 0.1
- end
+ sentences = nil
+ TmpFile.with_file do |tmpfile|
+ start_time = Time.now

  begin
- Process.waitpid(pid)
+ pid = Process.fork do
+ sent = sentence_split_detector.sentDetect(text)
+ Open.write(tmpfile, sent * "#OpenNLP:SENTENCE#")
+ end
+
+ while not Process.waitpid(pid)
+ if Time.now - start_time > MAX
+ Process.kill(9, pid)
+ raise "Taking to long (> #{MAX} seconds)"
+ end
+ sleep 0.1
+ end
+
+ begin
+ Process.waitpid(pid)
+ end
+ rescue Errno::ECHILD
  end
- rescue Errno::ECHILD
+
+ sentences = Open.read(tmpfile).split("#OpenNLP:SENTENCE#")
  end

- sentences = Open.read(tmpfile).split("#OpenNLP:SENTENCE#")
+ sentences.collect{|sentence|
+ sentence = Misc.to_utf8(sentence)
+ start = text.index(sentence, last)
+ Segment.setup sentence, start
+ last = start + sentence.length - 1
+ sentence
+ }
+ rescue Exception
+ raise $!
+ raise "Sentence splitter raised exception: #{$!.message}"
  end
-
- sentences.collect{|sentence|
- sentence = Misc.to_utf8(sentence)
- start = text.index(sentence, last)
- Segment.setup sentence, start
- last = start + sentence.length - 1
- sentence
- }
- rescue Exception
- raise $!
- raise "Sentence splitter raised exception: #{$!.message}"
  end
  end
  end
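
The splitter still returns Segment-annotated sentences whose offsets point back into the original text; only the ASCII masking wrapper and the forked-process bookkeeping moved. A minimal usage sketch, assuming the OpenNLP software has been produced and that the wrapped block's sentence array is returned as before; the text is illustrative:

    require 'rbbt/nlp/open_nlp/sentence_splitter'

    text = "This is one sentence. This is another sentence."

    OpenNLP.sentence_splitter(text).each do |sentence|
      puts "#{sentence.offset}: #{sentence}"   # each sentence is a Segment with its offset
    end
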

data/lib/rbbt/nlp/spaCy.rb
@@ -0,0 +1,52 @@
+ require 'rbbt/segment'
+ require 'rbbt/document'
+ require 'rbbt/segment/annotation'
+ require 'rbbt/util/python'
+
+ module SpaCy
+
+ PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
+
+ def self.tokens(text, lang = 'en')
+
+ tokens = []
+ RbbtPython.run 'spacy' do
+ nlp = spacy.load(lang)
+ doc = nlp.call(text)
+ doc.__len__.times do |i|
+ tokens << doc.__getitem__(i)
+ end
+ end
+ tokens
+ end
+
+ def self.segments(text, lang = 'en')
+ docid = text.docid if Document === text
+ corpus = text.corpus if Document === text
+ tokens = self.tokens(text, lang).collect do |token|
+ info = {}
+ PROPERTIES.each do |p|
+ info[p] = token.instance_eval(p.to_s)
+ end
+ info[:type] = "SpaCy"
+ info[:offset] = token.idx
+ info[:dep] = token.dep_ + "->" + token.head.idx.to_s
+ info[:docid] = docid if docid
+ info[:corpus] = corpus if corpus
+ SpaCyToken.setup(token.text, info)
+ end
+ SpaCyToken.setup(tokens, :corpus => corpus)
+ end
+ end
+
+ module SpaCyToken
+ extend Entity
+ include SegmentAnnotation
+
+ self.annotation *SpaCy::PROPERTIES
+ self.annotation :dep
+ end
+
+ if __FILE__ == $0
+ ppp Annotated.tsv(SpaCy.segments("I tell a story"), :all)
+ end
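
SpaCy.segments wraps each spaCy token as a SpaCyToken segment carrying the PROPERTIES listed above plus dep and offset. A minimal sketch, assuming a working RbbtPython bridge, an installed spaCy 'en' model, and the annotation accessors named after SpaCy::PROPERTIES; the sentence comes from the file's own __FILE__ == $0 block:

    require 'rbbt/nlp/spaCy'

    SpaCy.segments("I tell a story").each do |token|
      puts [token, token.offset, token.pos_, token.dep] * "\t"
    end
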

data/lib/rbbt/segment.rb
@@ -0,0 +1,179 @@
+ require 'rbbt-util'
+ require 'rbbt/entity'
+ require 'rbbt/document'
+
+ module SegID
+ extend Entity
+ self.annotation :corpus
+
+ def _parts
+ @parts ||= self.split(":")
+ end
+
+ def range
+ @range ||= Range.new(*_parts[4].split("..").map(&:to_i))
+ end
+
+ def docid
+ @docid ||= DocID.setup(_parts[0..3] * ":")
+ end
+
+ def offset
+ range.begin
+ end
+
+ def segment_length
+ range.end - range.begin + 1
+ end
+
+ property :segment => :single do
+ docid = self.docid
+ document = DocID.setup(docid, :corpus => corpus).document
+
+ text = document[range]
+
+ Segment.setup(text, :docid => docid, :offset => offset)
+ end
+
+ property :segid do
+ self
+ end
+
+ end
+
+ module Segment
+ extend Entity
+ self.annotation :offset, :docid
+
+ def segment_length
+ length
+ end
+
+ def eend
+ offset.to_i + length - 1
+ end
+
+ def range
+ (offset.to_i..eend)
+ end
+
+ property :segid do |corpus=nil|
+ SegID.setup([docid, range] * ":", :corpus => corpus)
+ end
+
+ alias id segid
+
+ property :segment do
+ self
+ end
+
+ def self.sort(segments, inline = true)
+ if inline
+ segments.sort do |a,b|
+ case
+ when ((a.nil? and b.nil?) or (a.offset.nil? and b.offset.nil?))
+ 0
+ when (a.nil? or a.offset.nil?)
+ -1
+ when (b.nil? or b.offset.nil?)
+ +1
+ when (not a.range.include? b.offset.to_i and not b.range.include? a.offset.to_i)
+ a.offset.to_i <=> b.offset.to_i
+ else
+ a.segment_length <=> b.segment_length
+ end
+ end
+ else
+ segments.sort_by do |segment| segment.offset.to_i || 0 end.reverse
+ end
+ end
+
+ def self.overlaps(sorted_segments)
+ last = nil
+ overlaped = []
+
+ sorted_segments.reverse.each do |segment|
+ overlaped << segment if (not last.nil?) and segment.range.end > last
+ last = segment.range.begin
+ end
+
+ overlaped
+ end
+
+ def self.clean_sort(segments)
+ sorted = sort(segments).reject{|s| s.offset.nil?}
+ overlaps = overlaps(sorted)
+ overlaps.each do |s|
+ sorted.delete s
+ end
+
+ sorted
+ end
+
+ def self.split(text, segments, skip_segments = false)
+ sorted_segments = clean_sort segments
+
+ chunks = []
+ segment_end = 0
+ text_offset = 0
+ sorted_segments.each do |segment|
+ return chunks if text.nil? or text.empty?
+ next if segment.offset.nil?
+ offset = segment.offset - text_offset
+
+ # Consider segment offset. Save pre, or skip if overlap
+ case
+ when offset < 0 # Overlap, skip
+ next
+ when offset > 0 # Save pre
+ chunk = text[0..offset - 1]
+ Segment.setup(chunk, text_offset)
+ chunks << chunk
+ end
+
+ segment_end = offset + segment.segment_length - 1
+
+ if not skip_segments
+ chunk = text[offset..segment_end]
+ Segment.setup(chunk, text_offset + offset)
+ chunks << chunk
+ end
+
+ text_offset += segment_end + 1
+ text = text[segment_end + 1..-1]
+
+ end
+
+ if not text.nil? and not text.empty?
+ chunk = text.dup
+ Segment.setup(chunk, text_offset)
+ chunks << chunk
+ end
+
+ chunks
+ end
+
+ def self.align(text, parts)
+ pre_offset = 0
+ docid = text.respond_to?(:docid) ? text.docid : nil
+ parts.each do |part|
+ offset = text.index part
+ next if offset.nil?
+ Segment.setup(part, pre_offset + offset, docid)
+ pre_offset += offset + part.segment_length - 1
+ text = text[(offset + part.segment_length - 1)..-1]
+ end
+ end
+
+ def self.index(*args)
+ Segment::RangeIndex.index(*args)
+ end
+
+ end
+
+ require 'rbbt/segment/range_index'
+ require 'rbbt/segment/overlaps'
+ require 'rbbt/segment/transformed'
+ require 'rbbt/segment/segmented'
+ require 'rbbt/segment/encoding'
+
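
The new Segment entity annotates plain strings with :offset and :docid and derives ranges and segment ids from them. A minimal sketch of the accessors defined above; the text, offsets and DocID format are illustrative:

    require 'rbbt/segment'

    text  = "This sentence mentions TP53 and BRCA1."
    docid = "TEST:test:simple:1"

    gene = Segment.setup("TP53", :offset => text.index("TP53"), :docid => docid)

    gene.range   #=> 23..26
    gene.eend    #=> 26
    gene.segid   #=> "TEST:test:simple:1:23..26"

    # Split the text around a set of segments, keeping the segments as chunks
    chunks = Segment.split(text, [gene])
    chunks.collect{|chunk| [chunk.offset, chunk] }
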