rbbt-text 1.2.0 → 1.3.4

Files changed (76)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +55 -0
  5. data/lib/rbbt/document/annotation.rb +45 -0
  6. data/lib/rbbt/document/corpus.rb +63 -0
  7. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  8. data/lib/rbbt/ner/NER.rb +3 -3
  9. data/lib/rbbt/ner/abner.rb +1 -1
  10. data/lib/rbbt/ner/banner.rb +1 -1
  11. data/lib/rbbt/ner/brat.rb +1 -1
  12. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  13. data/lib/rbbt/ner/g_norm_plus.rb +26 -3
  14. data/lib/rbbt/ner/linnaeus.rb +3 -3
  15. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  16. data/lib/rbbt/ner/oscar3.rb +1 -2
  17. data/lib/rbbt/ner/oscar4.rb +3 -3
  18. data/lib/rbbt/ner/patterns.rb +5 -5
  19. data/lib/rbbt/ner/regexpNER.rb +1 -2
  20. data/lib/rbbt/ner/token_trieNER.rb +35 -22
  21. data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
  22. data/lib/rbbt/nlp/nlp.rb +5 -5
  23. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  24. data/lib/rbbt/nlp/spaCy.rb +52 -0
  25. data/lib/rbbt/segment.rb +179 -0
  26. data/lib/rbbt/segment/annotation.rb +58 -0
  27. data/lib/rbbt/segment/encoding.rb +18 -0
  28. data/lib/rbbt/{text/segment → segment}/named_entity.rb +14 -11
  29. data/lib/rbbt/segment/overlaps.rb +63 -0
  30. data/lib/rbbt/segment/range_index.rb +35 -0
  31. data/lib/rbbt/segment/relationship.rb +7 -0
  32. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  33. data/lib/rbbt/segment/token.rb +23 -0
  34. data/lib/rbbt/{text/segment → segment}/transformed.rb +12 -10
  35. data/lib/rbbt/segment/tsv.rb +41 -0
  36. data/share/install/software/Linnaeus +1 -1
  37. data/share/install/software/OpenNLP +1 -1
  38. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  39. data/test/rbbt/document/test_annotation.rb +140 -0
  40. data/test/rbbt/document/test_corpus.rb +33 -0
  41. data/test/rbbt/ner/test_finder.rb +3 -3
  42. data/test/rbbt/ner/test_g_norm_plus.rb +20 -2
  43. data/test/rbbt/ner/test_patterns.rb +9 -9
  44. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  45. data/test/rbbt/ner/test_rnorm.rb +3 -4
  46. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  47. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
  48. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  49. data/test/rbbt/segment/test_annotation.rb +39 -0
  50. data/test/rbbt/segment/test_corpus.rb +36 -0
  51. data/test/rbbt/segment/test_encoding.rb +24 -0
  52. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
  53. data/test/rbbt/segment/test_overlaps.rb +69 -0
  54. data/test/rbbt/segment/test_range_index.rb +42 -0
  55. data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
  56. data/test/rbbt/test_document.rb +14 -0
  57. data/test/rbbt/test_segment.rb +182 -0
  58. data/test/test_helper.rb +5 -3
  59. data/test/test_spaCy.rb +32 -0
  60. metadata +44 -32
  61. data/lib/rbbt/text/corpus.rb +0 -106
  62. data/lib/rbbt/text/corpus/document.rb +0 -383
  63. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  64. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  65. data/lib/rbbt/text/document.rb +0 -39
  66. data/lib/rbbt/text/segment.rb +0 -363
  67. data/lib/rbbt/text/segment/docid.rb +0 -46
  68. data/lib/rbbt/text/segment/relationship.rb +0 -24
  69. data/lib/rbbt/text/segment/token.rb +0 -49
  70. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  71. data/test/rbbt/text/corpus/test_document.rb +0 -82
  72. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  73. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  74. data/test/rbbt/text/test_corpus.rb +0 -34
  75. data/test/rbbt/text/test_document.rb +0 -58
  76. data/test/rbbt/text/test_segment.rb +0 -100
data/lib/rbbt/ner/oscar3.rb
@@ -1,7 +1,6 @@
 require 'rbbt'
 require 'rjb'
 require 'libxml'
-require 'rbbt/text/segment'
 require 'rbbt/ner/NER'
 require 'rbbt/util/log'
 
@@ -53,7 +52,7 @@ class OSCAR3 < NER
         next unless type.nil? or type.include? mention_type
         score = memm ? entities.get(key).to_string.to_f : nil
 
-        NamedEntity.setup mention, rstart.to_i + offset, mention_type, nil, score
+        NamedEntity.setup mention, :offset => rstart.to_i + offset, :entity_type => mention_type, :score => score
 
         mentions << mention unless mentions.collect{|m| m.to_s}.include? mention.to_s
       end
data/lib/rbbt/ner/oscar4.rb
@@ -1,7 +1,7 @@
 require 'rbbt'
 require 'rjb'
 require 'libxml'
-require 'rbbt/text/segment'
+require 'rbbt/segment'
 require 'rbbt/ner/NER'
 require 'rbbt/util/log'
 
@@ -25,7 +25,7 @@ class OSCAR4 < NER
     @@tagger ||= @@OSCAR.new()
   end
 
-  def self.match(text, type = nil)
+  def self.match(text, protect = false, type = nil)
     self.init
 
     return [] if text.nil? or text.strip.empty?
@@ -46,7 +46,7 @@ class OSCAR4 < NER
 
       next unless entity.getType.toString == type unless type.nil?
 
-      NamedEntity.setup mention, entity.getStart, entity.getType, inchi, entity.getConfidence
+      NamedEntity.setup mention, :offset => entity.getStart, :entity_type => entity.getType, :code => inchi, :score => entity.getConfidence
 
       result << mention
     end
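
Across the NER backends (OSCAR3 and OSCAR4 above, RegExpNER and TokenTrieNER below) this release switches NamedEntity.setup from positional arguments to a keyword hash. A minimal sketch of the new call style; the mention and annotation values are illustrative, only the keys (:offset, :entity_type, :code, :score) come from the diffs:

    require 'rbbt/segment/named_entity'

    mention = "TP53"
    NamedEntity.setup(mention,
                      :offset      => 117,          # character offset in the source text
                      :entity_type => "GENE",       # replaces the old positional `type`
                      :code        => "HGNC:11998", # optional identifier
                      :score       => 0.9)          # optional confidence

    mention.entity_type #=> "GENE"
    mention.range       #=> 117..120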
data/lib/rbbt/ner/patterns.rb
@@ -1,7 +1,7 @@
-require 'rbbt/text/segment/named_entity'
-require 'rbbt/text/segment/segmented'
-require 'rbbt/text/segment/transformed'
-require 'rbbt/text/segment/relationship'
+require 'rbbt/segment/named_entity'
+require 'rbbt/segment/segmented'
+require 'rbbt/segment/transformed'
+#require 'rbbt/segment/relationship'
 require 'rbbt/ner/regexpNER'
 require 'rbbt/ner/token_trieNER'
 require 'rbbt/nlp/nlp'
@@ -14,7 +14,7 @@ class PatternRelExt
     regexpNER = RegExpNER.new type => patterns.collect{|p| /#{p}/}
     segments = sentence.segments
     segments = segments.values.flatten if Hash === segments
-    Transformed.with_transform(sentence, segments, Proc.new{|s| s.type.to_s.upcase}) do |sentence|
+    Transformed.with_transform(sentence, segments, Proc.new{|s| s.entity_type.to_s.upcase}) do |sentence|
       regexpNER.entities(sentence)
     end
   end
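
Note the accessor rename that accompanies the new NamedEntity annotations: segments expose entity_type rather than type. Transformed.with_transform, as used above, temporarily replaces each segment's text with the proc's output while the block runs. A hedged sketch of what that enables, assuming the masking semantics implied by the diff:

    require 'rbbt/segment/named_entity'
    require 'rbbt/segment/transformed'

    sentence = "TP53 phosphorylates MDM2"
    gene = "TP53"
    NamedEntity.setup(gene, :offset => 0, :entity_type => "Gene")

    # While the block runs, the sentence reads "GENE phosphorylates MDM2",
    # so patterns can match entity types instead of surface forms.
    Transformed.with_transform(sentence, [gene], Proc.new{|s| s.entity_type.to_s.upcase}) do |masked|
      masked =~ /GENE phosphorylates/
    end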
data/lib/rbbt/ner/regexpNER.rb
@@ -1,4 +1,3 @@
-require 'rbbt/text/segment'
 require 'rbbt/ner/NER'
 require 'rbbt/util/simpleDSL'
 
@@ -23,7 +22,7 @@ class RegExpNER < NER
     end
 
     if match and not match.empty?
-      NamedEntity.setup(match, start + pre.length, type)
+      NamedEntity.setup(match, :offset => start + pre.length, :entity_type => type)
       matches << match
     end
 
data/lib/rbbt/ner/token_trieNER.rb
@@ -1,46 +1,58 @@
 require 'rbbt'
 require 'rbbt/tsv'
-require 'rbbt/text/segment'
-require 'rbbt/text/segment/token'
+require 'rbbt/segment'
 require 'rbbt/ner/NER'
+require 'rbbt/segment/token'
 
 class TokenTrieNER < NER
-  def self.clean(token)
+  def self.clean(token, stem = false)
     if token.length > 3
-      token.downcase.sub(/-/,'')
+      upcase = token !~ /[a-z]/
+      token = token.downcase.sub(/-/,'')
+
+      if stem && ! upcase
+        require 'stemmer'
+        if stem == :double
+          token = token.stem.stem
+        else
+          token = token.stem
+        end
+      end
+
+      token
     else
       token
     end
   end
 
-  def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
+  def self.prepare_token(token, start, extend_to_token = true, no_clean = false, stem = false)
     if no_clean
       if extend_to_token
-        Token.setup(clean(token), start, token)
+        Token.setup(token, :offset => start, :original => token)
       else
         token
       end
     else
       if extend_to_token
-        Token.setup(clean(token), start, token)
+        Token.setup(clean(token, stem), :offset => start, :original => token)
       else
-        clean(token)
+        clean(token, stem)
      end
    end
  end
 
-  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, start = 0)
-    split_at = /\s|(\(|\)|[-."':,])/ if split_at.nil?
+  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, stem = false, start = 0)
+    split_at = /\s|(\(|\)|[-."':,;])/ if split_at.nil?
 
     tokens = []
     while matchdata = text.match(split_at)
-      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean) unless matchdata.pre_match.empty?
-      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean) if matchdata.captures.any? and not matchdata.captures.first.empty?
+      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean, stem) unless matchdata.pre_match.empty?
+      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean, stem) if matchdata.captures.any? and not matchdata.captures.first.empty?
       start += matchdata.end(0)
       text = matchdata.post_match
     end
 
-    tokens << prepare_token(text, start, extend_to_token) unless text.empty?
+    tokens << prepare_token(text, start, extend_to_token, no_clean, stem) unless text.empty?
 
     tokens
   end
@@ -130,14 +142,14 @@ class TokenTrieNER < NER
     index1
   end
 
-  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false)
+  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false, stem = false)
 
     chunk_size = hash.size / 100
     items_in_chunk = 0
     tmp_index = {}
     hash.send(hash.respond_to?(:through)? :through : :each) do |code, names|
       names = Array === names ? names : [names]
-      names.flatten! if Array === names.first and not Token === names.first.first
+      names.flatten! if Array === names.first and not Segment === names.first.first
 
       if names.empty?
         names.unshift code unless TSV === hash and not (hash.fields.nil? or hash.fields.empty?)
@@ -146,7 +158,7 @@ class TokenTrieNER < NER
       names.each do |name|
         next if name.empty? or (String === name and name.length < 2)
 
-        tokens = Array === name ? name : tokenize(name, false, split_at, no_clean)
+        tokens = Array === name ? name : tokenize(name, false, split_at, no_clean, stem)
         tokens.extend EnumeratedArray
 
         token_index = index_for_tokens(tokens, code, type, slack)
@@ -237,10 +249,10 @@ class TokenTrieNER < NER
       match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
     }
 
-    NamedEntity.setup(match, match_tokens.first.offset, type, codes)
+    NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
   end
 
-  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
+  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem
   def initialize(type = nil, file = nil, options = {})
     options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
       :persist => false
@@ -248,6 +260,7 @@ class TokenTrieNER < NER
     @longest_match = options.delete :longest_match
     @split_at = options.delete :split_at
     @no_clean = options.delete :no_clean
+    @stem = options.delete :stem
 
     file = [] if file.nil?
     file = [file] unless Array === file
@@ -273,7 +286,7 @@ class TokenTrieNER < NER
         Log.debug "TokenTrieNER merging TSV"
         new.with_unnamed do
           new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
-            TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+            TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
           end
         end
       when Hash === new
@@ -284,18 +297,18 @@ class TokenTrieNER < NER
         new = TSV.open(new, :flat)
         new.with_unnamed do
           new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
-            TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+            TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
           end
         end
       end
    end
 
   def match(text)
-    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean)
+    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean, stem)
 
     tokens.extend EnumeratedArray
     tokens.pos = 0
-
+
     matches = []
     while tokens.left?
      new_matches = TokenTrieNER.find(@index, tokens, longest_match, slack)
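
The new :stem option is threaded through clean, prepare_token, tokenize, process and match, so dictionary names and query text are stemmed consistently (all-uppercase tokens such as acronyms are left alone). A small sketch using the class-level tokenizer; it assumes the `stemmer` gem is installed, and the exact stems depend on that gem:

    require 'rbbt/ner/token_trieNER'

    # tokenize(text, extend_to_token, split_at, no_clean, stem, start)
    tokens = TokenTrieNER.tokenize("Transcription factors bind DNA", true, nil, false, true)

    tokens.first.original #=> "Transcription"
    tokens.first          #=> its downcased, stemmed form (e.g. "transcript")
    tokens.first.offset   #=> 0

An index built with TokenTrieNER.new("Gene", file, :stem => true) applies the same normalization to dictionary entries, so singular and plural variants collapse onto one trie path.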
data/lib/rbbt/nlp/genia/sentence_splitter.rb
@@ -1,5 +1,5 @@
 require 'rbbt/nlp/nlp'
-require 'rbbt/text/segment'
+require 'rbbt/segment'
 module NLP
   Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
 
@@ -239,6 +239,7 @@ module NLP
   end
 
   def self.geniass_sentence_splitter(text)
+    Rbbt.software.opt.Geniass.produce
     offsets = []
 
     cleaned = text.gsub("\n",NEW_LINE_MASK)
@@ -294,7 +295,7 @@ module NLP
     offsets.collect do |s,e|
       sentence = text[s..e]
       next if sentence.nil?
-      #sentence.gsub!(NEW_LINE_MASK, "\n")
+      sentence.gsub!(NEW_LINE_MASK, "\n")
       Segment.setup sentence, s
       sentence
     end
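
The added .produce calls (here for Geniass, and below for Gdep in nlp.rb) make these methods install the claimed software on first use instead of failing when the path does not exist yet. The pattern, sketched:

    require 'rbbt-util'

    # `produce` runs the resource's claimed :install step if the path is not
    # present yet; chaining `.find` then resolves to an existing location.
    gdep_dir = Rbbt.software.opt.Gdep.produce.find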
data/lib/rbbt/nlp/nlp.rb
@@ -2,8 +2,8 @@ require 'rbbt'
 require 'rbbt/util/tmpfile'
 require 'rbbt/persist'
 require 'rbbt/resource'
-require 'rbbt/text/segment'
-require 'rbbt/text/segment/segmented'
+require 'rbbt/segment'
+require 'rbbt/segment/segmented'
 require 'rbbt/nlp/genia/sentence_splitter'
 require 'digest/md5'
 
@@ -101,7 +101,7 @@ module NLP
     input = sentences.collect{|sentence| sentence.gsub(/\n/, NEW_LINE_MASK)} * "\n"
     sentence_tokens = TmpFile.with_file(input) do |fin|
       out = local_persist(Digest::MD5.hexdigest(input), :Chunks, :string) do
-        CMD.cmd("cd #{Rbbt.software.opt.Gdep.find}; ./gdep #{ fin }").read
+        CMD.cmd("cd #{Rbbt.software.opt.Gdep.produce.find}; ./gdep #{ fin }").read
       end
 
       out.split(/^$/).collect do |sentence|
@@ -120,10 +120,10 @@ module NLP
 
 
   def self.gdep_parse_sentences_extension(sentences)
-    require Rbbt.software.opt.Gdep.ruby["Gdep.so"].find
+    require Rbbt.software.opt.Gdep.produce.ruby["Gdep.so"].find
     gdep = Gdep.new
     if not gdep.gdep_is_loaded
-      Misc.in_dir Rbbt.software.opt.Gdep.find do
+      Misc.in_dir Rbbt.software.opt.Gdep.produce.find do
        gdep.load_gdep
      end
    end
data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb
@@ -1,6 +1,6 @@
 require 'rbbt'
 require 'rjb'
-require 'rbbt/text/segment'
+require 'rbbt/segment'
 require 'rbbt/resource'
 
 module OpenNLP
@@ -33,48 +33,49 @@ module OpenNLP
   def self.sentence_splitter(text)
     return [] if text.nil? or text.empty?
 
-    text = Misc.to_utf8(text)
-    last = 0
-    begin
-      sentence_split_detector = self.sentence_split_detector
-
-      sentences = nil
-      TmpFile.with_file do |tmpfile|
-        start_time = Time.now
-
-        begin
-          pid = Process.fork do
-            sent = sentence_split_detector.sentDetect(text)
-            Open.write(tmpfile, sent * "#OpenNLP:SENTENCE#")
-          end
+    Segment.ascii(text, "?") do
+      last = 0
+      begin
+        sentence_split_detector = self.sentence_split_detector
 
-        while not Process.waitpid(pid)
-          if Time.now - start_time > MAX
-            Process.kill(9, pid)
-            raise "Taking to long (> #{MAX} seconds)"
-          end
-          sleep 0.1
-        end
+        sentences = nil
+        TmpFile.with_file do |tmpfile|
+          start_time = Time.now
 
          begin
-            Process.waitpid(pid)
+            pid = Process.fork do
+              sent = sentence_split_detector.sentDetect(text)
+              Open.write(tmpfile, sent * "#OpenNLP:SENTENCE#")
+            end
+
+            while not Process.waitpid(pid)
+              if Time.now - start_time > MAX
+                Process.kill(9, pid)
+                raise "Taking to long (> #{MAX} seconds)"
+              end
+              sleep 0.1
+            end
+
+            begin
+              Process.waitpid(pid)
+            end
+          rescue Errno::ECHILD
          end
-        rescue Errno::ECHILD
+
+          sentences = Open.read(tmpfile).split("#OpenNLP:SENTENCE#")
        end
 
-      sentences = Open.read(tmpfile).split("#OpenNLP:SENTENCE#")
+        sentences.collect{|sentence|
+          sentence = Misc.to_utf8(sentence)
+          start = text.index(sentence, last)
+          Segment.setup sentence, start
+          last = start + sentence.length - 1
+          sentence
+        }
+      rescue Exception
+        raise $!
+        raise "Sentence splitter raised exception: #{$!.message}"
      end
-
-      sentences.collect{|sentence|
-        sentence = Misc.to_utf8(sentence)
-        start = text.index(sentence, last)
-        Segment.setup sentence, start
-        last = start + sentence.length - 1
-        sentence
-      }
-    rescue Exception
-      raise $!
-      raise "Sentence splitter raised exception: #{$!.message}"
     end
   end
 end
data/lib/rbbt/nlp/spaCy.rb (new file)
@@ -0,0 +1,52 @@
+require 'rbbt/segment'
+require 'rbbt/document'
+require 'rbbt/segment/annotation'
+require 'rbbt/util/python'
+
+module SpaCy
+
+  PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
+
+  def self.tokens(text, lang = 'en')
+
+    tokens = []
+    RbbtPython.run 'spacy' do
+      nlp = spacy.load(lang)
+      doc = nlp.call(text)
+      doc.__len__.times do |i|
+        tokens << doc.__getitem__(i)
+      end
+    end
+    tokens
+  end
+
+  def self.segments(text, lang = 'en')
+    docid = text.docid if Document === text
+    corpus = text.corpus if Document === text
+    tokens = self.tokens(text, lang).collect do |token|
+      info = {}
+      PROPERTIES.each do |p|
+        info[p] = token.instance_eval(p.to_s)
+      end
+      info[:type] = "SpaCy"
+      info[:offset] = token.idx
+      info[:dep] = token.dep_ + "->" + token.head.idx.to_s
+      info[:docid] = docid if docid
+      info[:corpus] = corpus if corpus
+      SpaCyToken.setup(token.text, info)
+    end
+    SpaCyToken.setup(tokens, :corpus => corpus)
+  end
+end
+
+module SpaCyToken
+  extend Entity
+  include SegmentAnnotation
+
+  self.annotation *SpaCy::PROPERTIES
+  self.annotation :dep
+end
+
+if __FILE__ == $0
+  ppp Annotated.tsv(SpaCy.segments("I tell a story"), :all)
+end
data/lib/rbbt/segment.rb (new file)
@@ -0,0 +1,179 @@
+require 'rbbt-util'
+require 'rbbt/entity'
+require 'rbbt/document'
+
+module SegID
+  extend Entity
+  self.annotation :corpus
+
+  def _parts
+    @parts ||= self.split(":")
+  end
+
+  def range
+    @range ||= Range.new(*_parts[4].split("..").map(&:to_i))
+  end
+
+  def docid
+    @docid ||= DocID.setup(_parts[0..3] * ":")
+  end
+
+  def offset
+    range.begin
+  end
+
+  def segment_length
+    range.end - range.begin + 1
+  end
+
+  property :segment => :single do
+    docid = self.docid
+    document = DocID.setup(docid, :corpus => corpus).document
+
+    text = document[range]
+
+    Segment.setup(text, :docid => docid, :offset => offset)
+  end
+
+  property :segid do
+    self
+  end
+
+end
+
+module Segment
+  extend Entity
+  self.annotation :offset, :docid
+
+  def segment_length
+    length
+  end
+
+  def eend
+    offset.to_i + length - 1
+  end
+
+  def range
+    (offset.to_i..eend)
+  end
+
+  property :segid do |corpus=nil|
+    SegID.setup([docid, range] * ":", :corpus => corpus)
+  end
+
+  alias id segid
+
+  property :segment do
+    self
+  end
+
+  def self.sort(segments, inline = true)
+    if inline
+      segments.sort do |a,b|
+        case
+        when ((a.nil? and b.nil?) or (a.offset.nil? and b.offset.nil?))
+          0
+        when (a.nil? or a.offset.nil?)
+          -1
+        when (b.nil? or b.offset.nil?)
+          +1
+        when (not a.range.include? b.offset.to_i and not b.range.include? a.offset.to_i)
+          a.offset.to_i <=> b.offset.to_i
+        else
+          a.segment_length <=> b.segment_length
+        end
+      end
+    else
+      segments.sort_by do |segment| segment.offset.to_i || 0 end.reverse
+    end
+  end
+
+  def self.overlaps(sorted_segments)
+    last = nil
+    overlaped = []
+
+    sorted_segments.reverse.each do |segment|
+      overlaped << segment if (not last.nil?) and segment.range.end > last
+      last = segment.range.begin
+    end
+
+    overlaped
+  end
+
+  def self.clean_sort(segments)
+    sorted = sort(segments).reject{|s| s.offset.nil?}
+    overlaps = overlaps(sorted)
+    overlaps.each do |s|
+      sorted.delete s
+    end
+
+    sorted
+  end
+
+  def self.split(text, segments, skip_segments = false)
+    sorted_segments = clean_sort segments
+
+    chunks = []
+    segment_end = 0
+    text_offset = 0
+    sorted_segments.each do |segment|
+      return chunks if text.nil? or text.empty?
+      next if segment.offset.nil?
+      offset = segment.offset - text_offset
+
+      # Consider segment offset. Save pre, or skip if overlap
+      case
+      when offset < 0 # Overlap, skip
+        next
+      when offset > 0 # Save pre
+        chunk = text[0..offset - 1]
+        Segment.setup(chunk, text_offset)
+        chunks << chunk
+      end
+
+      segment_end = offset + segment.segment_length - 1
+
+      if not skip_segments
+        chunk = text[offset..segment_end]
+        Segment.setup(chunk, text_offset + offset)
+        chunks << chunk
+      end
+
+      text_offset += segment_end + 1
+      text = text[segment_end + 1..-1]
+
+    end
+
+    if not text.nil? and not text.empty?
+      chunk = text.dup
+      Segment.setup(chunk, text_offset)
+      chunks << chunk
+    end
+
+    chunks
+  end
+
+  def self.align(text, parts)
+    pre_offset = 0
+    docid = text.respond_to?(:docid) ? text.docid : nil
+    parts.each do |part|
+      offset = text.index part
+      next if offset.nil?
+      Segment.setup(part, pre_offset + offset, docid)
+      pre_offset += offset + part.segment_length - 1
+      text = text[(offset + part.segment_length - 1)..-1]
+    end
+  end
+
+  def self.index(*args)
+    Segment::RangeIndex.index(*args)
+  end
+
+end
+
+require 'rbbt/segment/range_index'
+require 'rbbt/segment/overlaps'
+require 'rbbt/segment/transformed'
+require 'rbbt/segment/segmented'
+require 'rbbt/segment/encoding'