rbbt-text 1.1.9 → 1.3.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +56 -0
  5. data/lib/rbbt/document/annotation.rb +45 -0
  6. data/lib/rbbt/document/corpus.rb +61 -0
  7. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  8. data/lib/rbbt/ner/NER.rb +3 -3
  9. data/lib/rbbt/ner/abner.rb +1 -1
  10. data/lib/rbbt/ner/banner.rb +1 -1
  11. data/lib/rbbt/ner/brat.rb +1 -1
  12. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  13. data/lib/rbbt/ner/g_norm_plus.rb +42 -12
  14. data/lib/rbbt/ner/linnaeus.rb +3 -3
  15. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  16. data/lib/rbbt/ner/oscar3.rb +1 -2
  17. data/lib/rbbt/ner/oscar4.rb +3 -3
  18. data/lib/rbbt/ner/patterns.rb +5 -5
  19. data/lib/rbbt/ner/regexpNER.rb +1 -2
  20. data/lib/rbbt/ner/token_trieNER.rb +35 -22
  21. data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
  22. data/lib/rbbt/nlp/nlp.rb +5 -5
  23. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  24. data/lib/rbbt/nlp/spaCy.rb +52 -0
  25. data/lib/rbbt/segment.rb +179 -0
  26. data/lib/rbbt/segment/annotation.rb +58 -0
  27. data/lib/rbbt/segment/encoding.rb +18 -0
  28. data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -10
  29. data/lib/rbbt/segment/overlaps.rb +63 -0
  30. data/lib/rbbt/segment/range_index.rb +35 -0
  31. data/lib/rbbt/segment/relationship.rb +7 -0
  32. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  33. data/lib/rbbt/segment/token.rb +23 -0
  34. data/lib/rbbt/{text/segment → segment}/transformed.rb +10 -8
  35. data/lib/rbbt/segment/tsv.rb +41 -0
  36. data/share/install/software/Linnaeus +1 -1
  37. data/share/install/software/OpenNLP +1 -1
  38. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  39. data/test/rbbt/document/test_annotation.rb +140 -0
  40. data/test/rbbt/document/test_corpus.rb +33 -0
  41. data/test/rbbt/ner/test_finder.rb +3 -3
  42. data/test/rbbt/ner/test_g_norm_plus.rb +20 -3
  43. data/test/rbbt/ner/test_patterns.rb +9 -9
  44. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  45. data/test/rbbt/ner/test_rnorm.rb +3 -4
  46. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  47. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
  48. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  49. data/test/rbbt/segment/test_annotation.rb +39 -0
  50. data/test/rbbt/segment/test_corpus.rb +36 -0
  51. data/test/rbbt/segment/test_encoding.rb +24 -0
  52. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
  53. data/test/rbbt/segment/test_overlaps.rb +69 -0
  54. data/test/rbbt/segment/test_range_index.rb +42 -0
  55. data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
  56. data/test/rbbt/test_document.rb +14 -0
  57. data/test/rbbt/test_segment.rb +182 -0
  58. data/test/test_helper.rb +5 -3
  59. data/test/test_spaCy.rb +32 -0
  60. metadata +44 -32
  61. data/lib/rbbt/text/corpus.rb +0 -106
  62. data/lib/rbbt/text/corpus/document.rb +0 -361
  63. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  64. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  65. data/lib/rbbt/text/document.rb +0 -39
  66. data/lib/rbbt/text/segment.rb +0 -355
  67. data/lib/rbbt/text/segment/docid.rb +0 -46
  68. data/lib/rbbt/text/segment/relationship.rb +0 -24
  69. data/lib/rbbt/text/segment/token.rb +0 -49
  70. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  71. data/test/rbbt/text/corpus/test_document.rb +0 -52
  72. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  73. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  74. data/test/rbbt/text/test_corpus.rb +0 -34
  75. data/test/rbbt/text/test_document.rb +0 -58
  76. data/test/rbbt/text/test_segment.rb +0 -100
@@ -0,0 +1,179 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/entity'
3
+ require 'rbbt/document'
4
+
5
# Identifier of a text segment, stored as a colon-separated string.
#
# The first four fields are joined back together to form the document id and
# the fifth field holds the character range written as "begin..end".
# The +corpus+ annotation is handed to DocID when the segment is resolved.
module SegID
  extend Entity
  self.annotation :corpus

  # Colon-separated fields of this identifier (memoized).
  def _parts
    @parts ||= split(":")
  end

  # Character range encoded in the fifth field (memoized).
  def range
    @range ||= begin
      bounds = _parts[4].split("..").map { |bound| bound.to_i }
      Range.new(*bounds)
    end
  end

  # Document id rebuilt from the first four fields, set up as a DocID (memoized).
  def docid
    @docid ||= DocID.setup(_parts[0..3].join(":"))
  end

  # Position of the first character of the segment.
  def offset
    range.begin
  end

  # Number of positions covered by the (inclusive) range.
  def segment_length
    first = range.begin
    last = range.end
    last - first + 1
  end

  # Resolves the identifier: loads the document through DocID, slices the
  # range out of it and sets the text up as a Segment.
  property :segment => :single do
    doc_id = self.docid
    source = DocID.setup(doc_id, :corpus => corpus).document

    Segment.setup(source[range], :docid => doc_id, :offset => offset)
  end

  # A SegID is its own segment id.
  property :segid do
    self
  end

end
43
+
44
# A piece of text located inside a document.
#
# Objects set up as Segment carry two annotations: +offset+ (position of the
# first character) and +docid+ (the document the text belongs to). Length
# related helpers rely on the +length+ of the annotated object.
module Segment
  extend Entity
  self.annotation :offset, :docid

  # Number of characters covered by the segment.
  def segment_length
    length
  end

  # Position of the last character of the segment (inclusive).
  def eend
    offset.to_i + length - 1
  end

  # Inclusive range of positions covered by the segment.
  def range
    (offset.to_i..eend)
  end

  # Builds the SegID for this segment from its docid and range.
  property :segid do |corpus=nil|
    SegID.setup([docid, range] * ":", :corpus => corpus)
  end

  alias id segid

  # A Segment is its own segment.
  property :segment do
    self
  end

  # Sorts +segments+.
  #
  # With +inline+ (the default) segments that are nil or have no offset come
  # first; segments that do not contain each other's start are ordered by
  # offset, and the rest are ordered by length. Without +inline+ the
  # segments are returned by descending offset.
  def self.sort(segments, inline = true)
    return segments.sort_by { |segment| segment.offset.to_i }.reverse unless inline

    segments.sort do |a, b|
      # Decide "missing" per operand first, so a nil element is never asked
      # for its offset.
      a_missing = a.nil? || a.offset.nil?
      b_missing = b.nil? || b.offset.nil?

      if a_missing && b_missing
        0
      elsif a_missing
        -1
      elsif b_missing
        +1
      elsif !a.range.include?(b.offset.to_i) && !b.range.include?(a.offset.to_i)
        a.offset.to_i <=> b.offset.to_i
      else
        a.segment_length <=> b.segment_length
      end
    end
  end

  # Walks +sorted_segments+ from last to first and returns the segments whose
  # range ends after the start of the segment visited just before them.
  def self.overlaps(sorted_segments)
    last = nil
    overlaped = []

    sorted_segments.reverse_each do |segment|
      overlaped << segment if !last.nil? && segment.range.end > last
      last = segment.range.begin
    end

    overlaped
  end

  # Sorts +segments+, discards those without an offset and removes the ones
  # reported by +overlaps+.
  def self.clean_sort(segments)
    sorted = sort(segments).reject { |s| s.nil? || s.offset.nil? }
    overlapping = overlaps(sorted)

    # Remove by identity: Array#delete compares with ==, which would also
    # drop unrelated segments that merely have the same text.
    sorted.reject { |s| overlapping.any? { |o| o.equal?(s) } }
  end

  # Cuts +text+ into consecutive chunks following +segments+.
  #
  # Each chunk is set up as a Segment with its offset in +text+. Text found
  # before, between and after the segments is always included; the text of
  # the segments themselves is left out when +skip_segments+ is true.
  # Segments starting before the current position are skipped.
  def self.split(text, segments, skip_segments = false)
    sorted_segments = clean_sort segments

    chunks = []
    text_offset = 0 # position in the original text where +text+ now starts
    sorted_segments.each do |segment|
      return chunks if text.nil? or text.empty?
      next if segment.offset.nil?
      offset = segment.offset.to_i - text_offset

      # Consider segment offset. Save pre, or skip if overlap
      if offset < 0 # Overlap, skip
        next
      elsif offset > 0 # Save pre
        chunk = text[0, offset]
        Segment.setup(chunk, text_offset)
        chunks << chunk
      end

      segment_end = offset + segment.segment_length - 1

      unless skip_segments
        chunk = text[offset..segment_end]
        Segment.setup(chunk, text_offset + offset)
        chunks << chunk
      end

      text_offset += segment_end + 1
      text = text[segment_end + 1..-1]
    end

    if not text.nil? and not text.empty?
      chunk = text.dup
      Segment.setup(chunk, text_offset)
      chunks << chunk
    end

    chunks
  end

  # Locates each of +parts+ in +text+, in order, and sets it up as a Segment
  # with the offset where it was found (and the docid of +text+, when it has
  # one). Parts that cannot be found are left untouched. Returns +parts+.
  def self.align(text, parts)
    pre_offset = 0
    docid = text.respond_to?(:docid) ? text.docid : nil
    parts.each do |part|
      offset = text.index part
      next if offset.nil?
      Segment.setup(part, pre_offset + offset, docid)

      # Continue searching right after the matched part, so that repeated
      # (even single-character) parts align to successive occurrences.
      consumed = offset + part.segment_length
      pre_offset += consumed
      text = text[consumed..-1]
    end
  end

  # Delegates to Segment::RangeIndex.index.
  def self.index(*args)
    Segment::RangeIndex.index(*args)
  end

end
173
+
174
+ require 'rbbt/segment/range_index'
175
+ require 'rbbt/segment/overlaps'
176
+ require 'rbbt/segment/transformed'
177
+ require 'rbbt/segment/segmented'
178
+ require 'rbbt/segment/encoding'
179
+
@@ -0,0 +1,58 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/segment'
3
+ require 'rbbt/entity'
4
+
5
# Identifier of a segment annotation, stored as a colon-separated string.
#
# The first five fields form the segment id and the sixth field is the
# annotation type. The +corpus+ annotation is passed on when the segment id
# is resolved.
module AnnotID
  extend Entity
  self.annotation :corpus

  # Colon-separated fields of this identifier (memoized).
  def _parts
    @parts ||= split(":")
  end

  # Segment id portion: the first five fields joined with ":" (memoized).
  def segid
    @segid ||= _parts[0..4].join(":")
  end

  # Annotation type: the sixth field (memoized).
  def type
    @type ||= _parts[5]
  end

  # Resolves the identifier into its segment and sets it up as a
  # SegmentAnnotation of this type.
  property :annotation do
    resolved = SegID.setup(segid, :corpus => corpus).segment

    SegmentAnnotation.setup(resolved, :type => type)
  end

  # An AnnotID is its own annotation id.
  property :annotid do
    self
  end

end
32
+
33
# A Segment that additionally carries a +type+ annotation.
module SegmentAnnotation
  extend Entity
  include Object::Segment
  self.annotation :type

  # Segment id of the annotated object: the object itself when it already is
  # a SegID, otherwise whatever the Segment implementation provides.
  property :segid do
    if SegID === self
      self
    elsif Segment === self
      super()
    else
      raise "Unknown object: #{self}"
    end
  end

  # Builds the AnnotID from the segment id, the type and a digest of the
  # object's +info+.
  property :annotid do |corpus=nil|
    fields = [segid, type, Misc.obj2digest(self.info)]
    AnnotID.setup(fields.join(":"), :corpus => corpus)
  end

  alias id annotid

  # A SegmentAnnotation is its own annotation.
  property :annotation do
    self
  end
end
@@ -0,0 +1,18 @@
1
+ require 'rbbt/segment'
2
# Helpers to locate and mask non-ASCII characters in a text.
module Segment
  # Returns one Segment per non-ASCII character of +text+, each set up with
  # the character index at which it occurs.
  def self.bad_chars(text)
    # Iterate lazily over the characters instead of materializing them in an
    # array and accumulating by hand.
    text.each_char.with_index
        .select { |char, _index| !char.ascii_only? }
        .map { |char, index| Segment.setup(char, :offset => index) }
  end

  # Runs Transformed.with_transform over +text+, replacing every non-ASCII
  # character with +replace+ ("?" when nil) and forwarding the block.
  def self.ascii(text, replace = nil, &block)
    bad = bad_chars(text)
    replace = "?" if replace.nil?
    Transformed.with_transform(text, bad, replace, &block)
  end
end
@@ -1,17 +1,18 @@
1
- require 'rbbt/text/segment'
2
- require 'rbbt/entity'
1
+ require 'rbbt/segment'
2
+ require 'rbbt/segment/annotation'
3
3
 
4
- module NamedEntity
4
+ module NamedEntity
5
5
  extend Entity
6
6
  include Segment
7
+ include SegmentAnnotation
7
8
 
8
- self.annotation :type, :code, :score
9
+ self.annotation :entity_type, :code, :score
9
10
 
10
11
  def report
11
12
  <<-EOF
12
13
  String: #{ self }
13
14
  Offset: #{ offset.inspect }
14
- Type: #{type.inspect}
15
+ Type: #{entity_type.inspect}
15
16
  Code: #{code.inspect}
16
17
  Score: #{score.inspect}
17
18
  EOF
@@ -20,7 +21,7 @@ Score: #{score.inspect}
20
21
  def html
21
22
  text = <<-EOF
22
23
  <span class='Entity'\
23
- #{type.nil? ? "" : " attr-entity-type='#{Array === type ? type * " " : type}'"}\
24
+ #{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
24
25
  #{code.nil? ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
25
26
  #{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
26
27
  >#{ self }</span>
@@ -29,13 +30,14 @@ Score: #{score.inspect}
29
30
  end
30
31
 
31
32
  def entity(params = nil)
33
+ code = self.code || self.dup
32
34
  format, entity = code.split(":")
33
35
  entity, format = format, nil if entity.nil?
34
-
35
- if defined?(Entity) && Entity.formats.include?(type) or Entity.formats.include?(format)
36
+
37
+ if defined?(Entity) && Entity.formats.include?(entity_type) or Entity.formats.include?(format)
36
38
  params ||= {}
37
39
  params[:format] = format if format and params[:format].nil?
38
- mod = (Entity.formats[type] || Entity.format[entity])
40
+ mod = (Entity.formats[entity_type] || Entity.format[entity])
39
41
  mod.setup(entity, params)
40
42
  end
41
43
 
@@ -43,4 +45,3 @@ Score: #{score.inspect}
43
45
  end
44
46
 
45
47
  end
46
-
@@ -0,0 +1,63 @@
1
# Offset arithmetic and overlap predicates for Segment objects.
module Segment
  # Adds +offset+ to this segment's offset. The offset becomes nil when
  # either value is nil. Returns self.
  def pull(offset)
    if self.offset.nil? || offset.nil?
      self.offset = nil
    else
      self.offset += offset
    end

    self
  end

  # Subtracts +offset+ from this segment's offset. The offset becomes nil
  # when either value is nil. Returns self.
  def push(offset)
    if self.offset.nil? || offset.nil?
      self.offset = nil
    else
      self.offset -= offset
    end

    self
  end

  # Makes the offsets of +segments+ relative to this segment by subtracting
  # this segment's offset from each of them.
  #
  # With a block, the relative segments are yielded and their offsets are
  # restored afterwards, even when the block raises. Without a block the
  # segments are left relative. Returns +segments+.
  def make_relative(segments, &block)
    segments.each { |s| s.push offset }
    return segments unless block_given?

    begin
      yield(segments)
    ensure
      # Always undo the shift so a failing block does not leave the
      # caller's segments with relative offsets.
      segments.each { |s| s.pull offset }
    end

    segments
  end

  # Range of this segment relative to +container+, which may be a Segment
  # with an offset or an Integer position; otherwise the plain range.
  # Raises when this segment has no offset.
  def range_in(container = nil)
    raise "No offset specified" if offset.nil?
    case
    when (Segment === container and not container.offset.nil?)
      ((offset - container.offset)..(self.eend - container.offset))
    when Integer === container
      ((offset - container)..(self.eend - container))
    else
      range
    end
  end

  # True when +segment+ starts and ends within this segment.
  def includes?(segment)
    (segment.offset.to_i >= self.offset.to_i) and
      (segment.offset.to_i + segment.segment_length.to_i <= self.offset.to_i + self.segment_length.to_i)
  end

  # True when the start of either segment falls inside the other one.
  def overlaps?(segment)
    segment.offset.to_i >= self.offset.to_i && segment.offset.to_i <= self.eend ||
      self.offset.to_i >= segment.offset.to_i && self.offset.to_i <= segment.eend
  end

  # Subset of +segments+ that overlap this segment.
  def overlaps(segments)
    segments.select { |s| self.overlaps?(s) }
  end

  # Subset of +secondary+ segments that overlap at least one segment in
  # +main+.
  def self.collisions(main, secondary)
    secondary.select do |ss|
      # any? stops at the first hit instead of building an intermediate array.
      main.any? { |ms| ms.overlaps? ss }
    end
  end
end
@@ -0,0 +1,35 @@
1
# Range index over segments: extends a table so that lookups return SegIDs
# bound to a corpus.
module Segment::RangeIndex
  attr_accessor :corpus

  # Performs the underlying lookup and sets the result up as SegID for the
  # index's corpus.
  def [](*args)
    found = super(*args)
    SegID.setup(found, :corpus => corpus)
  end

  # Builds a range index for +segments+ (a list, or a Hash whose values are
  # flattened into one). Segments without an offset are left out.
  # Persistence is only requested when +persist_file+ is neither nil nor
  # :memory. The resulting index is extended with this module and bound to
  # +corpus+.
  def self.index(segments, corpus, persist_file = :memory)
    segments = segments.values.flatten if segments.is_a?(Hash)

    in_memory = (persist_file.nil? or persist_file == :memory)

    annotation_index =
      Persist.persist("Segment_index", :fwt, :persist => !in_memory, :file => persist_file) do
        # Widest segment id seen; used as the value size of the table.
        value_size = 0
        index_data = []

        segments.each do |segment|
          next if segment.offset.nil?
          span = segment.range
          value_size = [segment.segid.length, value_size].max
          index_data << [segment.segid, [span.begin, span.end]]
        end

        fwt = FixWidthTable.get(:memory, value_size, true)
        fwt.add_range(index_data)

        fwt
      end

    annotation_index.extend Segment::RangeIndex
    annotation_index.corpus = corpus
    annotation_index
  end

end
35
+
@@ -0,0 +1,7 @@
1
# Entity carrying two annotations: the +segments+ involved and a +type+.
module Relationship
  extend Entity

  self.annotation :segments, :type
end
@@ -1,5 +1,5 @@
1
1
  require 'rbbt/annotations'
2
- require 'rbbt/text/segment'
2
+ require 'rbbt/segment'
3
3
 
4
4
  module Segmented
5
5
  extend Annotation
@@ -0,0 +1,23 @@
1
+ require 'rbbt/segment'
2
+
3
# A Segment produced by tokenization; carries an extra +original+ annotation.
module Token
  extend Entity
  include Segment

  self.annotation :original

  # Splits +text+ at every match of +split_at+ and returns the pieces as
  # Tokens with their offsets (counted from +start+).
  #
  # The text before each match becomes a token when not empty. When the
  # pattern's first capture group matched something (punctuation in the
  # default pattern) that text becomes a token too; whitespace is dropped.
  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
    tokens = []

    while (found = text.match(split_at))
      leading = found.pre_match
      tokens << Token.setup(leading, :offset => start) unless leading.empty?

      captured = found.captures
      if captured.any? && !captured.first.empty?
        tokens << Token.setup(captured.first, :offset => start + found.begin(1))
      end

      # Continue with whatever follows the separator.
      start += found.end(0)
      text = found.post_match
    end

    tokens << Token.setup(text, :offset => start) unless text.empty?

    tokens
  end
end