rbbt-text 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/document.rb +46 -0
  3. data/lib/rbbt/document/annotation.rb +42 -0
  4. data/lib/rbbt/document/corpus.rb +38 -0
  5. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  6. data/lib/rbbt/ner/NER.rb +3 -3
  7. data/lib/rbbt/ner/abner.rb +1 -1
  8. data/lib/rbbt/ner/banner.rb +1 -1
  9. data/lib/rbbt/ner/brat.rb +1 -1
  10. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  11. data/lib/rbbt/ner/g_norm_plus.rb +19 -2
  12. data/lib/rbbt/ner/linnaeus.rb +3 -3
  13. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  14. data/lib/rbbt/ner/oscar3.rb +1 -2
  15. data/lib/rbbt/ner/oscar4.rb +3 -3
  16. data/lib/rbbt/ner/patterns.rb +6 -5
  17. data/lib/rbbt/ner/regexpNER.rb +1 -2
  18. data/lib/rbbt/ner/token_trieNER.rb +6 -6
  19. data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
  20. data/lib/rbbt/nlp/nlp.rb +5 -5
  21. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  22. data/lib/rbbt/segment.rb +177 -0
  23. data/lib/rbbt/segment/annotation.rb +58 -0
  24. data/lib/rbbt/segment/encoding.rb +18 -0
  25. data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -11
  26. data/lib/rbbt/segment/overlaps.rb +63 -0
  27. data/lib/rbbt/segment/range_index.rb +35 -0
  28. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  29. data/lib/rbbt/segment/token.rb +23 -0
  30. data/lib/rbbt/{text/segment → segment}/transformed.rb +7 -9
  31. data/lib/rbbt/segment/tsv.rb +41 -0
  32. data/share/install/software/Linnaeus +1 -1
  33. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  34. data/test/rbbt/document/test_annotation.rb +140 -0
  35. data/test/rbbt/document/test_corpus.rb +33 -0
  36. data/test/rbbt/ner/test_finder.rb +3 -3
  37. data/test/rbbt/ner/test_g_norm_plus.rb +11 -1
  38. data/test/rbbt/ner/test_patterns.rb +9 -9
  39. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  40. data/test/rbbt/ner/test_rnorm.rb +3 -4
  41. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  42. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +13 -3
  43. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  44. data/test/rbbt/segment/test_annotation.rb +40 -0
  45. data/test/rbbt/segment/test_corpus.rb +36 -0
  46. data/test/rbbt/segment/test_encoding.rb +24 -0
  47. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +12 -9
  48. data/test/rbbt/segment/test_overlaps.rb +69 -0
  49. data/test/rbbt/segment/test_range_index.rb +43 -0
  50. data/test/rbbt/{text/segment → segment}/test_transformed.rb +76 -51
  51. data/test/rbbt/test_document.rb +14 -0
  52. data/test/rbbt/test_segment.rb +187 -0
  53. data/test/test_helper.rb +5 -3
  54. metadata +40 -32
  55. data/lib/rbbt/text/corpus.rb +0 -106
  56. data/lib/rbbt/text/corpus/document.rb +0 -383
  57. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  58. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  59. data/lib/rbbt/text/document.rb +0 -39
  60. data/lib/rbbt/text/segment.rb +0 -363
  61. data/lib/rbbt/text/segment/docid.rb +0 -46
  62. data/lib/rbbt/text/segment/relationship.rb +0 -24
  63. data/lib/rbbt/text/segment/token.rb +0 -49
  64. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  65. data/test/rbbt/text/corpus/test_document.rb +0 -82
  66. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  67. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  68. data/test/rbbt/text/test_corpus.rb +0 -34
  69. data/test/rbbt/text/test_document.rb +0 -58
  70. data/test/rbbt/text/test_segment.rb +0 -100
@@ -1,5 +1,5 @@
1
1
  require 'rbbt/nlp/nlp'
2
- require 'rbbt/text/segment'
2
+ require 'rbbt/segment'
3
3
  module NLP
4
4
  Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
5
5
 
@@ -2,8 +2,8 @@ require 'rbbt'
2
2
  require 'rbbt/util/tmpfile'
3
3
  require 'rbbt/persist'
4
4
  require 'rbbt/resource'
5
- require 'rbbt/text/segment'
6
- require 'rbbt/text/segment/segmented'
5
+ require 'rbbt/segment'
6
+ require 'rbbt/segment/segmented'
7
7
  require 'rbbt/nlp/genia/sentence_splitter'
8
8
  require 'digest/md5'
9
9
 
@@ -101,7 +101,7 @@ module NLP
101
101
  input = sentences.collect{|sentence| sentence.gsub(/\n/, NEW_LINE_MASK)} * "\n"
102
102
  sentence_tokens = TmpFile.with_file(input) do |fin|
103
103
  out = local_persist(Digest::MD5.hexdigest(input), :Chunks, :string) do
104
- CMD.cmd("cd #{Rbbt.software.opt.Gdep.find}; ./gdep #{ fin }").read
104
+ CMD.cmd("cd #{Rbbt.software.opt.Gdep.produce.find}; ./gdep #{ fin }").read
105
105
  end
106
106
 
107
107
  out.split(/^$/).collect do |sentence|
@@ -120,10 +120,10 @@ module NLP
120
120
 
121
121
 
122
122
  def self.gdep_parse_sentences_extension(sentences)
123
- require Rbbt.software.opt.Gdep.ruby["Gdep.so"].find
123
+ require Rbbt.software.opt.Gdep.produce.ruby["Gdep.so"].find
124
124
  gdep = Gdep.new
125
125
  if not gdep.gdep_is_loaded
126
- Misc.in_dir Rbbt.software.opt.Gdep.find do
126
+ Misc.in_dir Rbbt.software.opt.Gdep.produce.find do
127
127
  gdep.load_gdep
128
128
  end
129
129
  end
@@ -1,6 +1,6 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/text/segment'
3
+ require 'rbbt/segment'
4
4
  require 'rbbt/resource'
5
5
 
6
6
  module OpenNLP
@@ -33,48 +33,49 @@ module OpenNLP
33
33
  def self.sentence_splitter(text)
34
34
  return [] if text.nil? or text.empty?
35
35
 
36
- text = Misc.to_utf8(text)
37
- last = 0
38
- begin
39
- sentence_split_detector = self.sentence_split_detector
40
-
41
- sentences = nil
42
- TmpFile.with_file do |tmpfile|
43
- start_time = Time.now
44
-
45
- begin
46
- pid = Process.fork do
47
- sent = sentence_split_detector.sentDetect(text)
48
- Open.write(tmpfile, sent * "#OpenNLP:SENTENCE#")
49
- end
36
+ Segment.ascii(text, "?") do
37
+ last = 0
38
+ begin
39
+ sentence_split_detector = self.sentence_split_detector
50
40
 
51
- while not Process.waitpid(pid)
52
- if Time.now - start_time > MAX
53
- Process.kill(9, pid)
54
- raise "Taking to long (> #{MAX} seconds)"
55
- end
56
- sleep 0.1
57
- end
41
+ sentences = nil
42
+ TmpFile.with_file do |tmpfile|
43
+ start_time = Time.now
58
44
 
59
45
  begin
60
- Process.waitpid(pid)
46
+ pid = Process.fork do
47
+ sent = sentence_split_detector.sentDetect(text)
48
+ Open.write(tmpfile, sent * "#OpenNLP:SENTENCE#")
49
+ end
50
+
51
+ while not Process.waitpid(pid)
52
+ if Time.now - start_time > MAX
53
+ Process.kill(9, pid)
54
+ raise "Taking to long (> #{MAX} seconds)"
55
+ end
56
+ sleep 0.1
57
+ end
58
+
59
+ begin
60
+ Process.waitpid(pid)
61
+ end
62
+ rescue Errno::ECHILD
61
63
  end
62
- rescue Errno::ECHILD
64
+
65
+ sentences = Open.read(tmpfile).split("#OpenNLP:SENTENCE#")
63
66
  end
64
67
 
65
- sentences = Open.read(tmpfile).split("#OpenNLP:SENTENCE#")
68
+ sentences.collect{|sentence|
69
+ sentence = Misc.to_utf8(sentence)
70
+ start = text.index(sentence, last)
71
+ Segment.setup sentence, start
72
+ last = start + sentence.length - 1
73
+ sentence
74
+ }
75
+ rescue Exception
76
+ raise $!
77
+ raise "Sentence splitter raised exception: #{$!.message}"
66
78
  end
67
-
68
- sentences.collect{|sentence|
69
- sentence = Misc.to_utf8(sentence)
70
- start = text.index(sentence, last)
71
- Segment.setup sentence, start
72
- last = start + sentence.length - 1
73
- sentence
74
- }
75
- rescue Exception
76
- raise $!
77
- raise "Sentence splitter raised exception: #{$!.message}"
78
79
  end
79
80
  end
80
81
  end
@@ -0,0 +1,177 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/entity'
3
+
4
+ module SegID
5
+ extend Entity
6
+ self.annotation :corpus
7
+
8
+ def _parts
9
+ @parts ||= self.split(":")
10
+ end
11
+
12
+ def range
13
+ @range ||= Range.new(*_parts.last.split("..").map(&:to_i))
14
+ end
15
+
16
+ def docid
17
+ @docid ||= _parts[0..3] * ":"
18
+ end
19
+
20
+ def offset
21
+ range.begin
22
+ end
23
+
24
+ def segment_length
25
+ range.end - range.begin + 1
26
+ end
27
+
28
+ property :segment do
29
+ document = DocID.setup(docid, :corpus => corpus).document
30
+
31
+ text = document[range]
32
+
33
+ Segment.setup(text, docid)
34
+ end
35
+
36
+ property :segid do
37
+ self
38
+ end
39
+
40
+ end
41
+
42
+ module Segment
43
+ extend Entity
44
+ self.annotation :offset, :docid
45
+
46
+ def segment_length
47
+ length
48
+ end
49
+
50
+ def eend
51
+ offset.to_i + length - 1
52
+ end
53
+
54
+ def range
55
+ (offset.to_i..eend)
56
+ end
57
+
58
+ property :segid do |corpus=nil|
59
+ SegID.setup([docid, range] * ":", :corpus => corpus)
60
+ end
61
+
62
+ alias id segid
63
+
64
+ property :segment do
65
+ self
66
+ end
67
+
68
+ def self.sort(segments, inline = true)
69
+ if inline
70
+ segments.sort do |a,b|
71
+ case
72
+ when ((a.nil? and b.nil?) or (a.offset.nil? and b.offset.nil?))
73
+ 0
74
+ when (a.nil? or a.offset.nil?)
75
+ -1
76
+ when (b.nil? or b.offset.nil?)
77
+ +1
78
+ when (not a.range.include? b.offset.to_i and not b.range.include? a.offset.to_i)
79
+ a.offset.to_i <=> b.offset.to_i
80
+ else
81
+ a.segment_length <=> b.segment_length
82
+ end
83
+ end
84
+ else
85
+ segments.sort_by do |segment| segment.offset.to_i || 0 end.reverse
86
+ end
87
+ end
88
+
89
+ def self.overlaps(sorted_segments)
90
+ last = nil
91
+ overlaped = []
92
+
93
+ sorted_segments.reverse.each do |segment|
94
+ overlaped << segment if (not last.nil?) and segment.range.end > last
95
+ last = segment.range.begin
96
+ end
97
+
98
+ overlaped
99
+ end
100
+
101
+ def self.clean_sort(segments)
102
+ sorted = sort(segments).reject{|s| s.offset.nil?}
103
+ overlaps = overlaps(sorted)
104
+ overlaps.each do |s|
105
+ sorted.delete s
106
+ end
107
+
108
+ sorted
109
+ end
110
+
111
+ def self.split(text, segments, skip_segments = false)
112
+ sorted_segments = clean_sort segments
113
+
114
+ chunks = []
115
+ segment_end = 0
116
+ text_offset = 0
117
+ sorted_segments.each do |segment|
118
+ return chunks if text.nil? or text.empty?
119
+ next if segment.offset.nil?
120
+ offset = segment.offset - text_offset
121
+
122
+ # Consider segment offset. Save pre, or skip if overlap
123
+ case
124
+ when offset < 0 # Overlap, skip
125
+ next
126
+ when offset > 0 # Save pre
127
+ chunk = text[0..offset - 1]
128
+ Segment.setup(chunk, text_offset)
129
+ chunks << chunk
130
+ end
131
+
132
+ segment_end = offset + segment.segment_length - 1
133
+
134
+ if not skip_segments
135
+ chunk = text[offset..segment_end]
136
+ Segment.setup(chunk, text_offset + offset)
137
+ chunks << chunk
138
+ end
139
+
140
+ text_offset += segment_end + 1
141
+ text = text[segment_end + 1..-1]
142
+
143
+ end
144
+
145
+ if not text.nil? and not text.empty?
146
+ chunk = text.dup
147
+ Segment.setup(chunk, text_offset)
148
+ chunks << chunk
149
+ end
150
+
151
+ chunks
152
+ end
153
+
154
+ def self.align(text, parts)
155
+ pre_offset = 0
156
+ docid = text.respond_to?(:docid) ? text.docid : nil
157
+ parts.each do |part|
158
+ offset = text.index part
159
+ next if offset.nil?
160
+ Segment.setup(part, pre_offset + offset, docid)
161
+ pre_offset += offset + part.segment_length - 1
162
+ text = text[(offset + part.segment_length - 1)..-1]
163
+ end
164
+ end
165
+
166
+ def self.index(*args)
167
+ Segment::RangeIndex.index(*args)
168
+ end
169
+
170
+ end
171
+
172
+ require 'rbbt/segment/range_index'
173
+ require 'rbbt/segment/overlaps'
174
+ require 'rbbt/segment/transformed'
175
+ require 'rbbt/segment/segmented'
176
+ require 'rbbt/segment/encoding'
177
+
@@ -0,0 +1,58 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/entity'
3
+ require 'rbbt/segment'
4
+
5
+ module AnnotID
6
+ extend Entity
7
+ self.annotation :corpus
8
+
9
+ def _parts
10
+ @parts ||= self.split(":")
11
+ end
12
+
13
+ def segid
14
+ @segid ||= _parts[0..4] * ":"
15
+ end
16
+
17
+ def type
18
+ @type ||= _parts[5]
19
+ end
20
+
21
+ property :annotation do
22
+ segment = SegID.setup(segid, :corpus => corpus).segment
23
+
24
+ SegmentAnnotation.setup(segment, :type => type)
25
+ end
26
+
27
+ property :annotid do
28
+ self
29
+ end
30
+
31
+ end
32
+
33
+ module SegmentAnnotation
34
+ extend Entity
35
+ include Segment
36
+ self.annotation :type
37
+
38
+ property :segid do
39
+ case self
40
+ when SegID
41
+ self
42
+ when Segment
43
+ super()
44
+ else
45
+ raise "Unknown object: #{self}"
46
+ end
47
+ end
48
+
49
+ property :annotid do |corpus=nil|
50
+ AnnotID.setup([segid, type] * ":", :corpus => corpus)
51
+ end
52
+
53
+ alias id annotid
54
+
55
+ property :annotation do
56
+ self
57
+ end
58
+ end
@@ -0,0 +1,18 @@
1
+ require 'rbbt/segment'
2
+ module Segment
3
+ def self.bad_chars(text)
4
+ segments = []
5
+ text.chars.each_with_index do |c,i|
6
+ if ! c.ascii_only?
7
+ segments << Segment.setup(c, :offset => i)
8
+ end
9
+ end
10
+ segments
11
+ end
12
+
13
+ def self.ascii(text, replace = nil, &block)
14
+ bad = bad_chars(text)
15
+ replace = "?" if replace.nil?
16
+ Transformed.with_transform(text, bad, replace, &block)
17
+ end
18
+ end
@@ -1,17 +1,18 @@
1
- require 'rbbt/text/segment'
2
- require 'rbbt/entity'
1
+ require 'rbbt/segment'
2
+ require 'rbbt/segment/annotation'
3
3
 
4
- module NamedEntity
4
+ module NamedEntity
5
5
  extend Entity
6
6
  include Segment
7
+ include SegmentAnnotation
7
8
 
8
- self.annotation :type, :code, :score
9
+ self.annotation :entity_type, :code, :score
9
10
 
10
11
  def report
11
12
  <<-EOF
12
13
  String: #{ self }
13
14
  Offset: #{ offset.inspect }
14
- Type: #{type.inspect}
15
+ Type: #{entity_type.inspect}
15
16
  Code: #{code.inspect}
16
17
  Score: #{score.inspect}
17
18
  EOF
@@ -20,7 +21,7 @@ Score: #{score.inspect}
20
21
  def html
21
22
  text = <<-EOF
22
23
  <span class='Entity'\
23
- #{type.nil? ? "" : " attr-entity-type='#{Array === type ? type * " " : type}'"}\
24
+ #{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
24
25
  #{code.nil? ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
25
26
  #{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
26
27
  >#{ self }</span>
@@ -29,14 +30,14 @@ Score: #{score.inspect}
29
30
  end
30
31
 
31
32
  def entity(params = nil)
32
- code = self.dup
33
+ code = self.code || self.dup
33
34
  format, entity = code.split(":")
34
35
  entity, format = format, nil if entity.nil?
35
-
36
- if defined?(Entity) && Entity.formats.include?(type) or Entity.formats.include?(format)
36
+
37
+ if defined?(Entity) && Entity.formats.include?(entity_type) or Entity.formats.include?(format)
37
38
  params ||= {}
38
39
  params[:format] = format if format and params[:format].nil?
39
- mod = (Entity.formats[type] || Entity.format[entity])
40
+ mod = (Entity.formats[entity_type] || Entity.format[entity])
40
41
  mod.setup(entity, params)
41
42
  end
42
43
 
@@ -44,4 +45,3 @@ Score: #{score.inspect}
44
45
  end
45
46
 
46
47
  end
47
-
@@ -0,0 +1,63 @@
1
+ module Segment
2
+ def pull(offset)
3
+ if self.offset.nil? or offset.nil?
4
+ self.offset = nil
5
+ else
6
+ self.offset += offset
7
+ end
8
+
9
+ self
10
+ end
11
+
12
+ def push(offset)
13
+ if self.offset.nil? or offset.nil?
14
+ self.offset = nil
15
+ else
16
+ self.offset -= offset
17
+ end
18
+
19
+ self
20
+ end
21
+
22
+ def make_relative(segments, &block)
23
+ if block_given?
24
+ segments.each{|s| s.push offset}
25
+ yield(segments)
26
+ segments.each{|s| s.pull offset}
27
+ else
28
+ segments.each{|s| s.push offset}
29
+ end
30
+ end
31
+
32
+ def range_in(container = nil)
33
+ raise "No offset specified" if offset.nil?
34
+ case
35
+ when (Segment === container and not container.offset.nil?)
36
+ ((offset - container.offset)..(self.eend - container.offset))
37
+ when Integer === container
38
+ ((offset - container)..(self.eend - container))
39
+ else
40
+ range
41
+ end
42
+ end
43
+
44
+ def includes?(segment)
45
+ (segment.offset.to_i >= self.offset.to_i) and
46
+ (segment.offset.to_i + segment.segment_length.to_i <= self.offset.to_i + self.segment_length.to_i)
47
+ end
48
+
49
+ def overlaps?(segment)
50
+ segment.offset.to_i >= self.offset.to_i && segment.offset.to_i <= self.eend ||
51
+ self.offset.to_i >= segment.offset.to_i && self.offset.to_i <= segment.eend
52
+ end
53
+
54
+ def overlaps(segments)
55
+ segments.select{|s| self.overlaps?(s) }
56
+ end
57
+
58
+ def self.collisions(main, secondary)
59
+ secondary.select do |ss|
60
+ main.select{|ms| ms.overlaps? ss }.any?
61
+ end
62
+ end
63
+ end