rbbt-text 1.2.0 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/document.rb +46 -0
  3. data/lib/rbbt/document/annotation.rb +42 -0
  4. data/lib/rbbt/document/corpus.rb +38 -0
  5. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  6. data/lib/rbbt/ner/NER.rb +3 -3
  7. data/lib/rbbt/ner/abner.rb +1 -1
  8. data/lib/rbbt/ner/banner.rb +1 -1
  9. data/lib/rbbt/ner/brat.rb +1 -1
  10. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  11. data/lib/rbbt/ner/g_norm_plus.rb +19 -2
  12. data/lib/rbbt/ner/linnaeus.rb +3 -3
  13. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  14. data/lib/rbbt/ner/oscar3.rb +1 -2
  15. data/lib/rbbt/ner/oscar4.rb +3 -3
  16. data/lib/rbbt/ner/patterns.rb +6 -5
  17. data/lib/rbbt/ner/regexpNER.rb +1 -2
  18. data/lib/rbbt/ner/token_trieNER.rb +6 -6
  19. data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
  20. data/lib/rbbt/nlp/nlp.rb +5 -5
  21. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  22. data/lib/rbbt/segment.rb +177 -0
  23. data/lib/rbbt/segment/annotation.rb +58 -0
  24. data/lib/rbbt/segment/encoding.rb +18 -0
  25. data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -11
  26. data/lib/rbbt/segment/overlaps.rb +63 -0
  27. data/lib/rbbt/segment/range_index.rb +35 -0
  28. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  29. data/lib/rbbt/segment/token.rb +23 -0
  30. data/lib/rbbt/{text/segment → segment}/transformed.rb +7 -9
  31. data/lib/rbbt/segment/tsv.rb +41 -0
  32. data/share/install/software/Linnaeus +1 -1
  33. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  34. data/test/rbbt/document/test_annotation.rb +140 -0
  35. data/test/rbbt/document/test_corpus.rb +33 -0
  36. data/test/rbbt/ner/test_finder.rb +3 -3
  37. data/test/rbbt/ner/test_g_norm_plus.rb +11 -1
  38. data/test/rbbt/ner/test_patterns.rb +9 -9
  39. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  40. data/test/rbbt/ner/test_rnorm.rb +3 -4
  41. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  42. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +13 -3
  43. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  44. data/test/rbbt/segment/test_annotation.rb +40 -0
  45. data/test/rbbt/segment/test_corpus.rb +36 -0
  46. data/test/rbbt/segment/test_encoding.rb +24 -0
  47. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +12 -9
  48. data/test/rbbt/segment/test_overlaps.rb +69 -0
  49. data/test/rbbt/segment/test_range_index.rb +43 -0
  50. data/test/rbbt/{text/segment → segment}/test_transformed.rb +76 -51
  51. data/test/rbbt/test_document.rb +14 -0
  52. data/test/rbbt/test_segment.rb +187 -0
  53. data/test/test_helper.rb +5 -3
  54. metadata +40 -32
  55. data/lib/rbbt/text/corpus.rb +0 -106
  56. data/lib/rbbt/text/corpus/document.rb +0 -383
  57. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  58. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  59. data/lib/rbbt/text/document.rb +0 -39
  60. data/lib/rbbt/text/segment.rb +0 -363
  61. data/lib/rbbt/text/segment/docid.rb +0 -46
  62. data/lib/rbbt/text/segment/relationship.rb +0 -24
  63. data/lib/rbbt/text/segment/token.rb +0 -49
  64. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  65. data/test/rbbt/text/corpus/test_document.rb +0 -82
  66. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  67. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  68. data/test/rbbt/text/test_corpus.rb +0 -34
  69. data/test/rbbt/text/test_document.rb +0 -58
  70. data/test/rbbt/text/test_segment.rb +0 -100
@@ -1,5 +1,5 @@
1
1
  require 'rbbt/nlp/nlp'
2
- require 'rbbt/text/segment'
2
+ require 'rbbt/segment'
3
3
  module NLP
4
4
  Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
5
5
 
@@ -2,8 +2,8 @@ require 'rbbt'
2
2
  require 'rbbt/util/tmpfile'
3
3
  require 'rbbt/persist'
4
4
  require 'rbbt/resource'
5
- require 'rbbt/text/segment'
6
- require 'rbbt/text/segment/segmented'
5
+ require 'rbbt/segment'
6
+ require 'rbbt/segment/segmented'
7
7
  require 'rbbt/nlp/genia/sentence_splitter'
8
8
  require 'digest/md5'
9
9
 
@@ -101,7 +101,7 @@ module NLP
101
101
  input = sentences.collect{|sentence| sentence.gsub(/\n/, NEW_LINE_MASK)} * "\n"
102
102
  sentence_tokens = TmpFile.with_file(input) do |fin|
103
103
  out = local_persist(Digest::MD5.hexdigest(input), :Chunks, :string) do
104
- CMD.cmd("cd #{Rbbt.software.opt.Gdep.find}; ./gdep #{ fin }").read
104
+ CMD.cmd("cd #{Rbbt.software.opt.Gdep.produce.find}; ./gdep #{ fin }").read
105
105
  end
106
106
 
107
107
  out.split(/^$/).collect do |sentence|
@@ -120,10 +120,10 @@ module NLP
120
120
 
121
121
 
122
122
  def self.gdep_parse_sentences_extension(sentences)
123
- require Rbbt.software.opt.Gdep.ruby["Gdep.so"].find
123
+ require Rbbt.software.opt.Gdep.produce.ruby["Gdep.so"].find
124
124
  gdep = Gdep.new
125
125
  if not gdep.gdep_is_loaded
126
- Misc.in_dir Rbbt.software.opt.Gdep.find do
126
+ Misc.in_dir Rbbt.software.opt.Gdep.produce.find do
127
127
  gdep.load_gdep
128
128
  end
129
129
  end
@@ -1,6 +1,6 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/text/segment'
3
+ require 'rbbt/segment'
4
4
  require 'rbbt/resource'
5
5
 
6
6
  module OpenNLP
@@ -33,48 +33,49 @@ module OpenNLP
33
33
  def self.sentence_splitter(text)
34
34
  return [] if text.nil? or text.empty?
35
35
 
36
- text = Misc.to_utf8(text)
37
- last = 0
38
- begin
39
- sentence_split_detector = self.sentence_split_detector
40
-
41
- sentences = nil
42
- TmpFile.with_file do |tmpfile|
43
- start_time = Time.now
44
-
45
- begin
46
- pid = Process.fork do
47
- sent = sentence_split_detector.sentDetect(text)
48
- Open.write(tmpfile, sent * "#OpenNLP:SENTENCE#")
49
- end
36
+ Segment.ascii(text, "?") do
37
+ last = 0
38
+ begin
39
+ sentence_split_detector = self.sentence_split_detector
50
40
 
51
- while not Process.waitpid(pid)
52
- if Time.now - start_time > MAX
53
- Process.kill(9, pid)
54
- raise "Taking to long (> #{MAX} seconds)"
55
- end
56
- sleep 0.1
57
- end
41
+ sentences = nil
42
+ TmpFile.with_file do |tmpfile|
43
+ start_time = Time.now
58
44
 
59
45
  begin
60
- Process.waitpid(pid)
46
+ pid = Process.fork do
47
+ sent = sentence_split_detector.sentDetect(text)
48
+ Open.write(tmpfile, sent * "#OpenNLP:SENTENCE#")
49
+ end
50
+
51
+ while not Process.waitpid(pid)
52
+ if Time.now - start_time > MAX
53
+ Process.kill(9, pid)
54
+ raise "Taking to long (> #{MAX} seconds)"
55
+ end
56
+ sleep 0.1
57
+ end
58
+
59
+ begin
60
+ Process.waitpid(pid)
61
+ end
62
+ rescue Errno::ECHILD
61
63
  end
62
- rescue Errno::ECHILD
64
+
65
+ sentences = Open.read(tmpfile).split("#OpenNLP:SENTENCE#")
63
66
  end
64
67
 
65
- sentences = Open.read(tmpfile).split("#OpenNLP:SENTENCE#")
68
+ sentences.collect{|sentence|
69
+ sentence = Misc.to_utf8(sentence)
70
+ start = text.index(sentence, last)
71
+ Segment.setup sentence, start
72
+ last = start + sentence.length - 1
73
+ sentence
74
+ }
75
+ rescue Exception
76
+ raise $!
77
+ raise "Sentence splitter raised exception: #{$!.message}"
66
78
  end
67
-
68
- sentences.collect{|sentence|
69
- sentence = Misc.to_utf8(sentence)
70
- start = text.index(sentence, last)
71
- Segment.setup sentence, start
72
- last = start + sentence.length - 1
73
- sentence
74
- }
75
- rescue Exception
76
- raise $!
77
- raise "Sentence splitter raised exception: #{$!.message}"
78
79
  end
79
80
  end
80
81
  end
@@ -0,0 +1,177 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/entity'
3
+
4
# Identifier of a text segment. It is a colon-separated String whose first
# four fields form the document identifier and whose last field is an
# inclusive character range written as "start..end".
module SegID
  extend Entity
  self.annotation :corpus

  # Colon-separated fields of the identifier (memoized).
  def _parts
    @parts ||= self.split(":")
  end

  # Inclusive character Range parsed from the last field.
  def range
    @range ||= Range.new(*_parts.last.split("..").map(&:to_i))
  end

  # Document identifier: the first four fields joined back with ":".
  def docid
    @docid ||= _parts[0..3] * ":"
  end

  # Position of the first character of the segment.
  def offset
    range.begin
  end

  # Number of characters covered by the (inclusive) range.
  def segment_length
    range.end - range.begin + 1
  end

  # Loads the document from the corpus and returns the referenced text set up
  # as a Segment.
  property :segment do
    document = DocID.setup(docid, :corpus => corpus).document

    text = document[range]

    # Segment's annotations are declared as (:offset, :docid), so a positional
    # docid would be stored as the offset; name both annotations explicitly.
    Segment.setup(text, :offset => offset, :docid => docid)
  end

  # A SegID is its own segment id.
  property :segid do
    self
  end

end
41
+
42
# A String annotated with its position (+offset+) and, optionally, the
# identifier of the document it comes from (+docid+). Ranges are inclusive
# character ranges.
module Segment
  extend Entity
  self.annotation :offset, :docid

  # Number of characters in the segment.
  def segment_length
    length
  end

  # Position of the last character (inclusive). A nil offset counts as 0.
  def eend
    offset.to_i + length - 1
  end

  # Inclusive range of positions covered by the segment.
  def range
    (offset.to_i..eend)
  end

  # Builds the SegID ("<docid>:<start>..<end>") for this segment.
  property :segid do |corpus=nil|
    SegID.setup([docid, range] * ":", :corpus => corpus)
  end

  alias id segid

  # A Segment is its own segment.
  property :segment do
    self
  end

  # Sorts segments.
  #
  # segments - Array of segments; nil entries and segments without offset are
  #            tolerated when +inline+ is true.
  # inline   - when true (default), entries without a position come first,
  #            disjoint segments are ordered by offset and segments where one
  #            starts inside the other are ordered by length (shorter first).
  #            When false, segments are ordered by descending offset.
  #
  # Returns a new sorted Array.
  def self.sort(segments, inline = true)
    return segments.sort_by { |segment| segment.offset.to_i }.reverse unless inline

    segments.sort do |a, b|
      # Evaluate nil-ness before touching #offset so a single nil entry does
      # not raise NoMethodError.
      a_unplaced = a.nil? || a.offset.nil?
      b_unplaced = b.nil? || b.offset.nil?

      if a_unplaced && b_unplaced
        0
      elsif a_unplaced
        -1
      elsif b_unplaced
        1
      elsif !a.range.include?(b.offset.to_i) && !b.range.include?(a.offset.to_i)
        a.offset.to_i <=> b.offset.to_i
      else
        a.segment_length <=> b.segment_length
      end
    end
  end

  # Walks +sorted_segments+ from last to first and collects every segment
  # whose range ends after the start of the segment that follows it.
  #
  # Returns the Array of such segments (in reverse input order).
  def self.overlaps(sorted_segments)
    last = nil
    overlaped = []

    sorted_segments.reverse_each do |segment|
      overlaped << segment if (not last.nil?) and segment.range.end > last
      last = segment.range.begin
    end

    overlaped
  end

  # Sorts +segments+, drops entries without a position and removes the
  # segments reported by Segment.overlaps.
  #
  # Returns a new Array.
  def self.clean_sort(segments)
    sorted = sort(segments).reject { |s| s.nil? || s.offset.nil? }
    overlapped = overlaps(sorted)

    # Remove by identity: segments are Strings, so Array#delete (which uses
    # ==) would also drop unrelated segments that merely share the same text.
    sorted.reject { |s| overlapped.any? { |o| o.equal?(s) } }
  end

  # Cuts +text+ into consecutive chunks following the given segments.
  #
  # text          - String the segment offsets refer to.
  # segments      - segments marking the regions of interest.
  # skip_segments - when true, only the text between segments is returned.
  #
  # Returns an Array of chunks, each set up as a Segment with its offset.
  def self.split(text, segments, skip_segments = false)
    sorted_segments = clean_sort segments

    chunks = []
    text_offset = 0 # position in the original text of the start of +text+
    sorted_segments.each do |segment|
      return chunks if text.nil? or text.empty?
      next if segment.offset.nil?
      offset = segment.offset - text_offset

      # Consider segment offset. Save pre, or skip if overlap
      case
      when offset < 0 # Overlap, skip
        next
      when offset > 0 # Save pre
        chunk = text[0..offset - 1]
        Segment.setup(chunk, text_offset)
        chunks << chunk
      end

      segment_end = offset + segment.segment_length - 1

      if not skip_segments
        chunk = text[offset..segment_end]
        Segment.setup(chunk, text_offset + offset)
        chunks << chunk
      end

      # Consume everything up to and including the segment.
      text_offset += segment_end + 1
      text = text[segment_end + 1..-1]

    end

    if not text.nil? and not text.empty?
      chunk = text.dup
      Segment.setup(chunk, text_offset)
      chunks << chunk
    end

    chunks
  end

  # Locates each part in +text+, in order, and sets it up as a Segment with
  # the offset where it was found (and the docid of +text+ when it has one).
  # Parts that cannot be found are left untouched.
  #
  # Returns +parts+.
  def self.align(text, parts)
    pre_offset = 0
    docid = text.respond_to?(:docid) ? text.docid : nil
    parts.each do |part|
      offset = text.index part
      next if offset.nil?
      Segment.setup(part, pre_offset + offset, docid)
      # The search resumes at the last character of the part just aligned.
      pre_offset += offset + part.segment_length - 1
      text = text[(offset + part.segment_length - 1)..-1]
    end
  end

  # Delegates to Segment::RangeIndex.index.
  def self.index(*args)
    Segment::RangeIndex.index(*args)
  end

end
171
+
172
+ require 'rbbt/segment/range_index'
173
+ require 'rbbt/segment/overlaps'
174
+ require 'rbbt/segment/transformed'
175
+ require 'rbbt/segment/segmented'
176
+ require 'rbbt/segment/encoding'
177
+
@@ -0,0 +1,58 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/entity'
3
+ require 'rbbt/segment'
4
+
5
# Identifier of a segment annotation: a colon-separated String whose first
# five fields are the segment id and whose sixth field is the annotation type.
module AnnotID
  extend Entity
  self.annotation :corpus

  # Fields of the identifier, split on ":" (memoized).
  def _parts
    @parts ||= split(":")
  end

  # Segment id: the first five fields joined with ":".
  def segid
    @segid ||= _parts[0..4].join(":")
  end

  # Annotation type: the sixth field.
  def type
    @type ||= _parts[5]
  end

  # Resolves the segment through its SegID and sets it up as a
  # SegmentAnnotation of this type.
  property :annotation do
    resolved = SegID.setup(segid, :corpus => corpus).segment

    SegmentAnnotation.setup(resolved, :type => type)
  end

  # An AnnotID is its own annotation id.
  property :annotid do
    self
  end

end
32
+
33
# A Segment that additionally carries an annotation +type+.
module SegmentAnnotation
  extend Entity
  include Segment
  self.annotation :type

  # Segment id of the annotated object: the object itself when it already is
  # a SegID, otherwise the id computed by Segment.
  property :segid do
    if SegID === self
      self
    elsif Segment === self
      super()
    else
      raise "Unknown object: #{self}"
    end
  end

  # Builds the AnnotID ("<segid>:<type>") for this annotation.
  property :annotid do |corpus=nil|
    identifier = [segid, type].join(":")
    AnnotID.setup(identifier, :corpus => corpus)
  end

  alias id annotid

  # A SegmentAnnotation is its own annotation.
  property :annotation do
    self
  end
end
@@ -0,0 +1,18 @@
1
+ require 'rbbt/segment'
2
module Segment
  # Finds the non-ASCII characters of +text+.
  #
  # Returns an Array with one entry per non-ASCII character, each passed
  # through Segment.setup with its character index as :offset.
  def self.bad_chars(text)
    non_ascii = text.chars.each_with_index.reject { |char, _| char.ascii_only? }

    non_ascii.map do |char, position|
      Segment.setup(char, :offset => position)
    end
  end

  # Hands +text+, its non-ASCII characters and the replacement string to
  # Transformed.with_transform together with the given block.
  #
  # replace - replacement String; defaults to "?" when nil.
  #
  # Returns whatever Transformed.with_transform returns.
  def self.ascii(text, replace = nil, &block)
    substitute = replace.nil? ? "?" : replace
    Transformed.with_transform(text, bad_chars(text), substitute, &block)
  end
end
@@ -1,17 +1,18 @@
1
- require 'rbbt/text/segment'
2
- require 'rbbt/entity'
1
+ require 'rbbt/segment'
2
+ require 'rbbt/segment/annotation'
3
3
 
4
- module NamedEntity
4
+ module NamedEntity
5
5
  extend Entity
6
6
  include Segment
7
+ include SegmentAnnotation
7
8
 
8
- self.annotation :type, :code, :score
9
+ self.annotation :entity_type, :code, :score
9
10
 
10
11
  def report
11
12
  <<-EOF
12
13
  String: #{ self }
13
14
  Offset: #{ offset.inspect }
14
- Type: #{type.inspect}
15
+ Type: #{entity_type.inspect}
15
16
  Code: #{code.inspect}
16
17
  Score: #{score.inspect}
17
18
  EOF
@@ -20,7 +21,7 @@ Score: #{score.inspect}
20
21
  def html
21
22
  text = <<-EOF
22
23
  <span class='Entity'\
23
- #{type.nil? ? "" : " attr-entity-type='#{Array === type ? type * " " : type}'"}\
24
+ #{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
24
25
  #{code.nil? ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
25
26
  #{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
26
27
  >#{ self }</span>
@@ -29,14 +30,14 @@ Score: #{score.inspect}
29
30
  end
30
31
 
31
32
  def entity(params = nil)
32
- code = self.dup
33
+ code = self.code || self.dup
33
34
  format, entity = code.split(":")
34
35
  entity, format = format, nil if entity.nil?
35
-
36
- if defined?(Entity) && Entity.formats.include?(type) or Entity.formats.include?(format)
36
+
37
+ if defined?(Entity) && Entity.formats.include?(entity_type) or Entity.formats.include?(format)
37
38
  params ||= {}
38
39
  params[:format] = format if format and params[:format].nil?
39
- mod = (Entity.formats[type] || Entity.format[entity])
40
+ mod = (Entity.formats[entity_type] || Entity.format[entity])
40
41
  mod.setup(entity, params)
41
42
  end
42
43
 
@@ -44,4 +45,3 @@ Score: #{score.inspect}
44
45
  end
45
46
 
46
47
  end
47
-
@@ -0,0 +1,63 @@
1
# Offset arithmetic and overlap predicates for segments.
module Segment
  # Adds +offset+ to this segment's offset. When either value is nil the
  # segment's offset becomes nil.
  #
  # Returns self.
  def pull(offset)
    if self.offset.nil? || offset.nil?
      self.offset = nil
    else
      self.offset += offset
    end

    self
  end

  # Subtracts +offset+ from this segment's offset. When either value is nil
  # the segment's offset becomes nil.
  #
  # Returns self.
  def push(offset)
    if self.offset.nil? || offset.nil?
      self.offset = nil
    else
      self.offset -= offset
    end

    self
  end

  # Makes the offsets of +segments+ relative to this segment by subtracting
  # this segment's offset from each of them.
  #
  # Without a block the shift is permanent. With a block the shifted segments
  # are yielded and the shift is undone afterwards, even when the block
  # raises.
  #
  # Returns +segments+.
  def make_relative(segments, &block)
    segments.each { |s| s.push offset }
    return segments unless block_given?

    begin
      yield(segments)
    ensure
      # Always undo the shift so an exception in the block does not leave the
      # caller's segments with relative offsets.
      segments.each { |s| s.pull offset }
    end

    segments
  end

  # Range of this segment relative to +container+.
  #
  # container - a Segment with an offset, or an Integer offset; anything else
  #             (including nil) yields the segment's own range.
  #
  # Raises RuntimeError when this segment has no offset.
  def range_in(container = nil)
    raise "No offset specified" if offset.nil?
    case
    when (Segment === container and not container.offset.nil?)
      ((offset - container.offset)..(self.eend - container.offset))
    when Integer === container
      ((offset - container)..(self.eend - container))
    else
      range
    end
  end

  # True when +segment+ lies entirely within this segment.
  def includes?(segment)
    (segment.offset.to_i >= self.offset.to_i) and
      (segment.offset.to_i + segment.segment_length.to_i <= self.offset.to_i + self.segment_length.to_i)
  end

  # True when either segment starts inside the other.
  def overlaps?(segment)
    segment.offset.to_i >= self.offset.to_i && segment.offset.to_i <= self.eend ||
      self.offset.to_i >= segment.offset.to_i && self.offset.to_i <= segment.eend
  end

  # Subset of +segments+ that overlap this segment.
  def overlaps(segments)
    segments.select { |s| self.overlaps?(s) }
  end

  # Subset of +secondary+ segments overlapped by at least one +main+ segment.
  def self.collisions(main, secondary)
    secondary.select do |ss|
      main.any? { |ms| ms.overlaps? ss }
    end
  end
end