rbbt-text 1.1.9 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +56 -0
  5. data/lib/rbbt/document/annotation.rb +45 -0
  6. data/lib/rbbt/document/corpus.rb +61 -0
  7. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  8. data/lib/rbbt/ner/NER.rb +3 -3
  9. data/lib/rbbt/ner/abner.rb +1 -1
  10. data/lib/rbbt/ner/banner.rb +1 -1
  11. data/lib/rbbt/ner/brat.rb +1 -1
  12. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  13. data/lib/rbbt/ner/g_norm_plus.rb +42 -12
  14. data/lib/rbbt/ner/linnaeus.rb +3 -3
  15. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  16. data/lib/rbbt/ner/oscar3.rb +1 -2
  17. data/lib/rbbt/ner/oscar4.rb +3 -3
  18. data/lib/rbbt/ner/patterns.rb +5 -5
  19. data/lib/rbbt/ner/regexpNER.rb +1 -2
  20. data/lib/rbbt/ner/token_trieNER.rb +35 -22
  21. data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
  22. data/lib/rbbt/nlp/nlp.rb +5 -5
  23. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  24. data/lib/rbbt/nlp/spaCy.rb +52 -0
  25. data/lib/rbbt/segment.rb +179 -0
  26. data/lib/rbbt/segment/annotation.rb +58 -0
  27. data/lib/rbbt/segment/encoding.rb +18 -0
  28. data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -10
  29. data/lib/rbbt/segment/overlaps.rb +63 -0
  30. data/lib/rbbt/segment/range_index.rb +35 -0
  31. data/lib/rbbt/segment/relationship.rb +7 -0
  32. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  33. data/lib/rbbt/segment/token.rb +23 -0
  34. data/lib/rbbt/{text/segment → segment}/transformed.rb +10 -8
  35. data/lib/rbbt/segment/tsv.rb +41 -0
  36. data/share/install/software/Linnaeus +1 -1
  37. data/share/install/software/OpenNLP +1 -1
  38. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  39. data/test/rbbt/document/test_annotation.rb +140 -0
  40. data/test/rbbt/document/test_corpus.rb +33 -0
  41. data/test/rbbt/ner/test_finder.rb +3 -3
  42. data/test/rbbt/ner/test_g_norm_plus.rb +20 -3
  43. data/test/rbbt/ner/test_patterns.rb +9 -9
  44. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  45. data/test/rbbt/ner/test_rnorm.rb +3 -4
  46. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  47. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
  48. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  49. data/test/rbbt/segment/test_annotation.rb +39 -0
  50. data/test/rbbt/segment/test_corpus.rb +36 -0
  51. data/test/rbbt/segment/test_encoding.rb +24 -0
  52. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
  53. data/test/rbbt/segment/test_overlaps.rb +69 -0
  54. data/test/rbbt/segment/test_range_index.rb +42 -0
  55. data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
  56. data/test/rbbt/test_document.rb +14 -0
  57. data/test/rbbt/test_segment.rb +182 -0
  58. data/test/test_helper.rb +5 -3
  59. data/test/test_spaCy.rb +32 -0
  60. metadata +44 -32
  61. data/lib/rbbt/text/corpus.rb +0 -106
  62. data/lib/rbbt/text/corpus/document.rb +0 -361
  63. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  64. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  65. data/lib/rbbt/text/document.rb +0 -39
  66. data/lib/rbbt/text/segment.rb +0 -355
  67. data/lib/rbbt/text/segment/docid.rb +0 -46
  68. data/lib/rbbt/text/segment/relationship.rb +0 -24
  69. data/lib/rbbt/text/segment/token.rb +0 -49
  70. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  71. data/test/rbbt/text/corpus/test_document.rb +0 -52
  72. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  73. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  74. data/test/rbbt/text/test_corpus.rb +0 -34
  75. data/test/rbbt/text/test_document.rb +0 -58
  76. data/test/rbbt/text/test_segment.rb +0 -100
@@ -0,0 +1,179 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/entity'
3
+ require 'rbbt/document'
4
+
5
# String identifier of a text segment, annotated with a +corpus+.
#
# The identifier is read as colon-separated fields: fields 0..3 are joined
# back into the document id and field 4 holds the character range written
# as "first..last".
module SegID
  extend Entity
  self.annotation :corpus

  # Colon-separated fields of the identifier (memoized).
  def _parts
    @parts ||= split(":")
  end

  # Character range parsed from field 4 of the identifier (memoized).
  def range
    @range ||= begin
      bounds = _parts[4].split("..").map { |bound| bound.to_i }
      Range.new(*bounds)
    end
  end

  # DocID built from the first four fields of the identifier (memoized).
  def docid
    @docid ||= DocID.setup(_parts.first(4).join(":"))
  end

  # Position at which the segment starts.
  def offset
    range.begin
  end

  # Number of positions covered by the (inclusive) range.
  def segment_length
    1 + range.end - range.begin
  end

  # Loads the document through the corpus, cuts out the text covered by the
  # range and returns it set up as a Segment.
  property :segment => :single do
    doc_id = self.docid
    source = DocID.setup(doc_id, :corpus => corpus).document

    Segment.setup(source[range], :docid => doc_id, :offset => offset)
  end

  # A SegID is its own segment id.
  property :segid do
    self
  end
end
43
+
44
# A piece of text (the receiver) annotated with the +offset+ at which it
# starts and with the +docid+ of the document it belongs to.
module Segment
  extend Entity
  self.annotation :offset, :docid

  # Number of characters in the segment.
  def segment_length
    length
  end

  # Position of the last character of the segment (inclusive).
  def eend
    offset.to_i + length - 1
  end

  # Inclusive range of positions covered by the segment.
  def range
    (offset.to_i..eend)
  end

  # SegID made of the document id and the range, optionally tied to +corpus+.
  property :segid do |corpus=nil|
    SegID.setup([docid, range] * ":", :corpus => corpus)
  end

  alias id segid

  # A Segment is its own segment.
  property :segment do
    self
  end

  # Sorts +segments+.
  #
  # With +inline+ (the default) entries that are nil or have no offset come
  # first, segments that do not contain each other's start are ordered by
  # offset, and the remaining (overlapping) ones by length. Without +inline+
  # the segments are returned by descending offset.
  def self.sort(segments, inline = true)
    if inline
      segments.sort do |a, b|
        # Read each offset once and never call it on a nil entry: the
        # comparator used to raise NoMethodError when only one entry was nil.
        a_offset = a.nil? ? nil : a.offset
        b_offset = b.nil? ? nil : b.offset

        if a_offset.nil? && b_offset.nil?
          0
        elsif a_offset.nil?
          -1
        elsif b_offset.nil?
          +1
        elsif !a.range.include?(b_offset.to_i) && !b.range.include?(a_offset.to_i)
          a_offset.to_i <=> b_offset.to_i
        else
          a.segment_length <=> b.segment_length
        end
      end
    else
      segments.sort_by { |segment| segment.offset.to_i }.reverse
    end
  end

  # Walks +sorted_segments+ from last to first and returns the segments whose
  # range ends after the start of the segment visited just before them.
  def self.overlaps(sorted_segments)
    last = nil
    overlaped = []

    sorted_segments.reverse_each do |segment|
      overlaped << segment if !last.nil? && segment.range.end > last
      last = segment.range.begin
    end

    overlaped
  end

  # Sorts +segments+, drops entries without an offset and drops the segments
  # reported by Segment.overlaps.
  def self.clean_sort(segments)
    sorted = sort(segments).reject { |s| s.nil? || s.offset.nil? }
    overlapped = overlaps(sorted)

    # Remove by identity. Array#delete compares with ==, and segments are
    # strings, so it also removed every other segment with the same text.
    sorted.reject { |s| overlapped.any? { |o| o.equal?(s) } }
  end

  # Cuts +text+ into consecutive chunks around +segments+.
  #
  # Each chunk is set up as a Segment carrying its offset in +text+. The text
  # before, between and after the segments is always returned; the text of
  # the segments themselves is left out when +skip_segments+ is true.
  # Segments starting before the current position are skipped.
  def self.split(text, segments, skip_segments = false)
    sorted_segments = clean_sort segments

    chunks = []
    text_offset = 0 # position in the original text where +text+ now starts
    sorted_segments.each do |segment|
      return chunks if text.nil? || text.empty?
      next if segment.offset.nil?
      offset = segment.offset.to_i - text_offset

      # Consider segment offset. Save pre, or skip if overlap
      if offset < 0 # Overlap, skip
        next
      elsif offset > 0 # Save pre
        chunk = text[0..offset - 1]
        Segment.setup(chunk, text_offset)
        chunks << chunk
      end

      segment_end = offset + segment.segment_length - 1

      unless skip_segments
        chunk = text[offset..segment_end]
        Segment.setup(chunk, text_offset + offset)
        chunks << chunk
      end

      text_offset += segment_end + 1
      text = text[segment_end + 1..-1]
    end

    unless text.nil? || text.empty?
      chunk = text.dup
      Segment.setup(chunk, text_offset)
      chunks << chunk
    end

    chunks
  end

  # Locates each of +parts+ in +text+, in order, and sets it up as a Segment
  # with the offset where it was found (and the docid of +text+ when +text+
  # has one). Parts that cannot be found are left untouched. Returns +parts+.
  def self.align(text, parts)
    pre_offset = 0
    docid = text.respond_to?(:docid) ? text.docid : nil
    parts.each do |part|
      offset = text.index part
      next if offset.nil?
      Segment.setup(part, pre_offset + offset, docid)

      # Continue right after the part just aligned. Advancing by length - 1
      # left its last character in the search window, so a one-character
      # part did not advance at all and a repeated part got the same offset.
      consumed = offset + part.segment_length
      pre_offset += consumed
      text = text[consumed..-1]
    end
  end

  # Delegates to Segment::RangeIndex.index.
  def self.index(*args)
    Segment::RangeIndex.index(*args)
  end
end
173
+
174
+ require 'rbbt/segment/range_index'
175
+ require 'rbbt/segment/overlaps'
176
+ require 'rbbt/segment/transformed'
177
+ require 'rbbt/segment/segmented'
178
+ require 'rbbt/segment/encoding'
179
+
@@ -0,0 +1,58 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/segment'
3
+ require 'rbbt/entity'
4
+
5
# String identifier of an annotation, annotated with a +corpus+.
#
# The identifier is read as colon-separated fields: fields 0..4 are joined
# back into the segment id and field 5 is the annotation type.
module AnnotID
  extend Entity
  self.annotation :corpus

  # Colon-separated fields of the identifier (memoized).
  def _parts
    @parts ||= split(":")
  end

  # Segment id: the first five fields joined with ":" (memoized).
  def segid
    @segid ||= _parts.first(5).join(":")
  end

  # Annotation type: the sixth field (memoized).
  def type
    @type ||= _parts.at(5)
  end

  # Resolves the segment through the corpus and returns it set up as a
  # SegmentAnnotation of this type.
  property :annotation do
    seg_id = SegID.setup(segid, :corpus => corpus)

    SegmentAnnotation.setup(seg_id.segment, :type => type)
  end

  # An AnnotID is its own annotation id.
  property :annotid do
    self
  end
end
32
+
33
# A Segment that additionally carries an annotation +type+.
module SegmentAnnotation
  extend Entity
  include Object::Segment
  self.annotation :type

  # Segment id of the receiver: the receiver itself when it already is a
  # SegID, the inherited value when it is a Segment; anything else raises.
  property :segid do
    if SegID === self
      self
    elsif Segment === self
      super()
    else
      raise "Unknown object: #{self}"
    end
  end

  # AnnotID made of the segment id, the type and a digest of the receiver's
  # info, joined with ":", optionally tied to +corpus+.
  property :annotid do |corpus=nil|
    fields = [segid, type, Misc.obj2digest(self.info)]
    AnnotID.setup(fields.join(":"), :corpus => corpus)
  end

  alias id annotid

  # A SegmentAnnotation is its own annotation.
  property :annotation do
    self
  end
end
@@ -0,0 +1,18 @@
1
+ require 'rbbt/segment'
2
module Segment
  # Returns one Segment per non-ASCII character of +text+, each set up with
  # the index of that character as its offset.
  def self.bad_chars(text)
    found = []
    text.each_char.with_index do |char, position|
      next if char.ascii_only?
      found << Segment.setup(char, :offset => position)
    end
    found
  end

  # Hands +text+, its non-ASCII characters and the replacement string
  # (+replace+, or "?" when it is nil) to Transformed.with_transform,
  # forwarding the given block.
  def self.ascii(text, replace = nil, &block)
    non_ascii = bad_chars(text)
    substitute = replace.nil? ? "?" : replace
    Transformed.with_transform(text, non_ascii, substitute, &block)
  end
end
@@ -1,17 +1,18 @@
1
- require 'rbbt/text/segment'
2
- require 'rbbt/entity'
1
+ require 'rbbt/segment'
2
+ require 'rbbt/segment/annotation'
3
3
 
4
- module NamedEntity
4
+ module NamedEntity
5
5
  extend Entity
6
6
  include Segment
7
+ include SegmentAnnotation
7
8
 
8
- self.annotation :type, :code, :score
9
+ self.annotation :entity_type, :code, :score
9
10
 
10
11
  def report
11
12
  <<-EOF
12
13
  String: #{ self }
13
14
  Offset: #{ offset.inspect }
14
- Type: #{type.inspect}
15
+ Type: #{entity_type.inspect}
15
16
  Code: #{code.inspect}
16
17
  Score: #{score.inspect}
17
18
  EOF
@@ -20,7 +21,7 @@ Score: #{score.inspect}
20
21
  def html
21
22
  text = <<-EOF
22
23
  <span class='Entity'\
23
- #{type.nil? ? "" : " attr-entity-type='#{Array === type ? type * " " : type}'"}\
24
+ #{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
24
25
  #{code.nil? ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
25
26
  #{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
26
27
  >#{ self }</span>
@@ -29,13 +30,14 @@ Score: #{score.inspect}
29
30
  end
30
31
 
31
32
  def entity(params = nil)
33
+ code = self.code || self.dup
32
34
  format, entity = code.split(":")
33
35
  entity, format = format, nil if entity.nil?
34
-
35
- if defined?(Entity) && Entity.formats.include?(type) or Entity.formats.include?(format)
36
+
37
+ if defined?(Entity) && Entity.formats.include?(entity_type) or Entity.formats.include?(format)
36
38
  params ||= {}
37
39
  params[:format] = format if format and params[:format].nil?
38
- mod = (Entity.formats[type] || Entity.format[entity])
40
+ mod = (Entity.formats[entity_type] || Entity.format[entity])
39
41
  mod.setup(entity, params)
40
42
  end
41
43
 
@@ -43,4 +45,3 @@ Score: #{score.inspect}
43
45
  end
44
46
 
45
47
  end
46
-
@@ -0,0 +1,63 @@
1
# Helpers to shift segments and to compare the ranges they cover.
module Segment
  # Adds +offset+ to the segment's offset. The offset becomes nil when either
  # value is nil. Returns the segment.
  def pull(offset)
    if self.offset.nil? || offset.nil?
      self.offset = nil
    else
      self.offset += offset
    end

    self
  end

  # Subtracts +offset+ from the segment's offset. The offset becomes nil when
  # either value is nil. Returns the segment.
  def push(offset)
    if self.offset.nil? || offset.nil?
      self.offset = nil
    else
      self.offset -= offset
    end

    self
  end

  # Shifts +segments+ so that their offsets are relative to this segment.
  #
  # Without a block the segments stay shifted. With a block the shifted
  # segments are yielded and their offsets are restored afterwards, even if
  # the block raises. Returns +segments+.
  def make_relative(segments, &block)
    # Read the shift once: when the receiver is itself one of +segments+ its
    # offset changes while iterating, which made later shifts use 0 and left
    # nothing to undo afterwards.
    shift = offset

    segments.each { |s| s.push shift }
    return segments unless block_given?

    begin
      yield(segments)
    ensure
      segments.each { |s| s.pull shift }
    end

    segments
  end

  # Range of the segment relative to +container+, which may be a Segment
  # with an offset or an Integer position; otherwise the plain range.
  # Raises when the segment has no offset.
  def range_in(container = nil)
    raise "No offset specified" if offset.nil?

    if Segment === container && !container.offset.nil?
      ((offset - container.offset)..(self.eend - container.offset))
    elsif Integer === container
      ((offset - container)..(self.eend - container))
    else
      range
    end
  end

  # True when +segment+ starts at or after this segment and ends at or
  # before it.
  def includes?(segment)
    (segment.offset.to_i >= self.offset.to_i) &&
      (segment.offset.to_i + segment.segment_length.to_i <= self.offset.to_i + self.segment_length.to_i)
  end

  # True when either segment starts inside the other.
  def overlaps?(segment)
    segment.offset.to_i >= self.offset.to_i && segment.offset.to_i <= self.eend ||
      self.offset.to_i >= segment.offset.to_i && self.offset.to_i <= segment.eend
  end

  # The members of +segments+ that overlap this segment.
  def overlaps(segments)
    segments.select { |s| self.overlaps?(s) }
  end

  # The members of +secondary+ overlapped by at least one member of +main+.
  def self.collisions(main, secondary)
    secondary.select do |ss|
      main.any? { |ms| ms.overlaps? ss }
    end
  end
end
@@ -0,0 +1,35 @@
1
# Extension for a range index of segments: lookups come back set up as SegID
# values tied to the index's +corpus+.
module Segment::RangeIndex
  attr_accessor :corpus

  # Performs the underlying lookup and sets the result up as SegID.
  def [](*args)
    SegID.setup(super(*args), :corpus => corpus)
  end

  # Builds a range index over +segments+ (an enumerable of segments, or a
  # Hash whose values are flattened into one) and returns it extended with
  # Segment::RangeIndex and tied to +corpus+. Segments without an offset
  # are left out. Persistence is requested from Persist unless
  # +persist_file+ is nil or :memory.
  def self.index(segments, corpus, persist_file = :memory)
    list = Hash === segments ? segments.values.flatten : segments
    in_memory = persist_file.nil? || persist_file == :memory

    annotation_index =
      Persist.persist("Segment_index", :fwt, :persist => !in_memory, :file => persist_file) do
        located = list.reject { |segment| segment.offset.nil? }

        # One [segid, [first, last]] entry per segment.
        index_data = located.map do |segment|
          span = segment.range
          [segment.segid, [span.begin, span.end]]
        end

        # The table is created with the length of the longest segid.
        value_size = index_data.map { |segid, _| segid.length }.max || 0

        table = FixWidthTable.get :memory, value_size, true
        table.add_range index_data
        table
      end

    annotation_index.extend Segment::RangeIndex
    annotation_index.corpus = corpus
    annotation_index
  end
end
35
+
@@ -0,0 +1,7 @@
1
# Entity annotated with the +segments+ it involves and with a +type+.
module Relationship
  extend Entity

  self.annotation :segments, :type
end
@@ -1,5 +1,5 @@
1
1
  require 'rbbt/annotations'
2
- require 'rbbt/text/segment'
2
+ require 'rbbt/segment'
3
3
 
4
4
  module Segmented
5
5
  extend Annotation
@@ -0,0 +1,23 @@
1
+ require 'rbbt/segment'
2
+
3
# A Segment produced by tokenization, annotated with +original+.
module Token
  extend Entity
  include Segment

  self.annotation :original

  # Splits +text+ at every match of +split_at+ and returns the pieces as
  # Tokens carrying their offsets, counted from +start+.
  #
  # The text before each match becomes a token when it is not empty. The
  # first capture group of the match also becomes a token when it matched
  # something, so with the default pattern punctuation is kept and
  # whitespace is dropped.
  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
    tokens = []
    rest = text
    position = start

    until (hit = rest.match(split_at)).nil?
      before = hit.pre_match
      tokens << Token.setup(before, :offset => position) unless before.empty?

      if hit.captures.any? and not hit.captures.first.empty?
        tokens << Token.setup(hit.captures.first, :offset => position + hit.begin(1))
      end

      # Continue after the match, keeping +position+ in step with +rest+.
      position += hit.end(0)
      rest = hit.post_match
    end

    tokens << Token.setup(rest, :offset => position) unless rest.empty?

    tokens
  end
end
+ end