rbbt-text 0.2.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. data/bin/get_ppis.rb +52 -0
  2. data/lib/rbbt/bow/dictionary.rb +9 -9
  3. data/lib/rbbt/bow/misc.rb +86 -2
  4. data/lib/rbbt/corpus/corpus.rb +55 -0
  5. data/lib/rbbt/corpus/document.rb +289 -0
  6. data/lib/rbbt/corpus/document_repo.rb +115 -0
  7. data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
  8. data/lib/rbbt/ner/NER.rb +7 -5
  9. data/lib/rbbt/ner/abner.rb +13 -2
  10. data/lib/rbbt/ner/annotations.rb +182 -51
  11. data/lib/rbbt/ner/annotations/annotated.rb +15 -0
  12. data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
  13. data/lib/rbbt/ner/annotations/relations.rb +25 -0
  14. data/lib/rbbt/ner/annotations/token.rb +28 -0
  15. data/lib/rbbt/ner/annotations/transformed.rb +170 -0
  16. data/lib/rbbt/ner/banner.rb +8 -5
  17. data/lib/rbbt/ner/chemical_tagger.rb +34 -0
  18. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
  19. data/lib/rbbt/ner/oscar3.rb +1 -1
  20. data/lib/rbbt/ner/oscar4.rb +41 -0
  21. data/lib/rbbt/ner/patterns.rb +132 -0
  22. data/lib/rbbt/ner/rnorm.rb +141 -0
  23. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  24. data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
  25. data/lib/rbbt/ner/token_trieNER.rb +185 -51
  26. data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
  27. data/lib/rbbt/nlp/nlp.rb +235 -0
  28. data/share/install/software/ABNER +0 -4
  29. data/share/install/software/ChemicalTagger +81 -0
  30. data/share/install/software/Gdep +115 -0
  31. data/share/install/software/Geniass +118 -0
  32. data/share/install/software/OSCAR4 +16 -0
  33. data/share/install/software/StanfordParser +15 -0
  34. data/share/patterns/drug_induce_disease +22 -0
  35. data/share/rnorm/cue_default +10 -0
  36. data/share/rnorm/tokens_default +86 -0
  37. data/share/{stopwords → wordlists/stopwords} +0 -0
  38. data/test/rbbt/bow/test_bow.rb +1 -1
  39. data/test/rbbt/bow/test_dictionary.rb +1 -1
  40. data/test/rbbt/bow/test_misc.rb +1 -1
  41. data/test/rbbt/corpus/test_corpus.rb +99 -0
  42. data/test/rbbt/corpus/test_document.rb +222 -0
  43. data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
  44. data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
  45. data/test/rbbt/ner/test_abner.rb +1 -1
  46. data/test/rbbt/ner/test_annotations.rb +64 -2
  47. data/test/rbbt/ner/test_banner.rb +1 -1
  48. data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
  49. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
  50. data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
  51. data/test/rbbt/ner/test_patterns.rb +66 -0
  52. data/test/rbbt/ner/test_regexpNER.rb +1 -1
  53. data/test/rbbt/ner/test_rnorm.rb +47 -0
  54. data/test/rbbt/ner/test_token_trieNER.rb +60 -35
  55. data/test/rbbt/nlp/test_nlp.rb +88 -0
  56. data/test/test_helper.rb +20 -0
  57. metadata +93 -20
@@ -0,0 +1,15 @@
1
+ require 'rbbt/ner/annotations'
2
# Mixin that lets a text object carry the list of annotations found in it.
module Annotated
  # Annotations attached to the text. Annotated.annotate always leaves this
  # set (to an empty Array when no annotations were given).
  attr_accessor :annotations

  # Extends +string+ in place with Annotated and attaches +annotations+ to
  # it, defaulting to an empty Array.
  #
  # Returns the very same +string+ object, not a copy.
  def self.annotate(string, annotations = nil)
    string.extend Annotated
    string.annotations = annotations || []
    string
  end

  # Hands this text, its annotations and +skip_segments+ over to
  # Segment.split and returns whatever that call returns.
  def split_segments(skip_segments = false)
    Segment.split(self, @annotations, skip_segments)
  end
end
14
+
15
+
@@ -0,0 +1,37 @@
1
+ require 'rbbt/ner/annotations'
2
+
3
# Segment of text recognised as an entity mention. On top of what Segment
# provides, it carries the entity type, a code, and a score.
module NamedEntity
  attr_accessor :type, :code, :score, :segment_types
  include Segment

  # Extends +string+ in place with NamedEntity and sets every attribute for
  # which a non-nil value is given; attributes passed as nil are left
  # untouched.
  #
  # Returns the same +string+ object.
  def self.annotate(string, offset = nil, type = nil, code = nil, score = nil)
    string.extend NamedEntity
    string.offset = offset unless offset.nil?
    string.type = type unless type.nil?
    string.code = code unless code.nil?
    string.score = score unless score.nil?
    string
  end

  # Multi-line, human readable summary of the entity (one attribute per
  # line, newline terminated).
  def report
    <<-EOF
String: #{ self }
Offset: #{ offset.inspect }
Type: #{type.inspect}
Code: #{code.inspect}
Score: #{score.inspect}
    EOF
  end

  # Renders the entity as a <span class='Entity'> element. Each of type,
  # code and score that is not nil becomes an attr-entity-* attribute, in
  # that order; Array values are joined with single spaces.
  #
  # NOTE(review): neither the text nor the attribute values are HTML-escaped.
  def html
    attributes = [['type', type], ['code', code], ['score', score]].collect do |name, value|
      next "" if value.nil?
      " attr-entity-#{name}='#{Array === value ? value * " " : value}'"
    end

    "<span class='Entity'#{attributes * ""}>#{ self }</span>"
  end
end
37
+
@@ -0,0 +1,25 @@
1
+ require 'rbbt/ner/annotations'
2
+
3
# Segment of text that describes a relationship between several terms.
module Relationship
  attr_accessor :terms, :segment_types
  include Segment

  # Extends +string+ in place with Relationship and sets +offset+ and
  # +terms+ when they are not nil.
  #
  # The string used to be extended with a module called PPI, which this file
  # does not define; every sibling annotation module extends the string with
  # itself, and so does this one now.
  #
  # Returns the same +string+ object.
  def self.annotate(string, offset = nil, terms = nil)
    string.extend Relationship
    string.offset = offset unless offset.nil?
    string.terms = terms unless terms.nil?
    string
  end

  # Renders the relationship text wrapped in a <span class='Relationship'>.
  #
  # NOTE(review): the text is not HTML-escaped.
  def html
    "<span class='Relationship'>#{ self }</span>"
  end

  # Unfinished: walks the annotation lists selected by +types+ without doing
  # anything with them.
  #
  # NOTE(review): relies on an +annotations+ method this module does not
  # define -- presumably the text is expected to be Annotated too; confirm.
  def html_with_entities(*types)
    annotations.values_at(*types).each do |segments|
    end
  end
end
@@ -0,0 +1,28 @@
1
+ require 'rbbt/ner/annotations'
2
+
3
# Segment of text produced by tokenization; it remembers the original form
# of the token.
module Token
  include Segment
  attr_accessor :original

  # Extends +string+ in place with Token, sets its +offset+ when given and
  # records +original+ (a copy of the string itself when not given).
  #
  # Returns the same +string+ object.
  def self.annotate(string, offset = nil, original = nil)
    string.extend Token
    string.offset = offset unless offset.nil?
    string.original = original || string.dup
    string
  end

  # Splits +text+ into Tokens.
  #
  # text     - text to tokenize.
  # split_at - Regexp marking token boundaries. Whatever its first capture
  #            group matches is kept as a token of its own (punctuation in
  #            the default pattern); the rest of the match is dropped.
  # start    - offset assigned to the first character of +text+.
  #
  # Returns an Array of Tokens whose offsets are relative to +start+.
  # Raises ArgumentError if +split_at+ matches the empty string, since the
  # scan could never advance past such a match.
  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
    tokens = []

    while matchdata = text.match(split_at)
      raise ArgumentError, "split_at must not match the empty string" if matchdata[0].empty?

      before = matchdata.pre_match
      tokens << Token.annotate(before, start) unless before.empty?

      # nil when the pattern has no group or the group did not take part in
      # the match (e.g. plain whitespace with the default pattern)
      separator = matchdata[1]
      tokens << Token.annotate(separator, start + matchdata.begin(1)) if separator && !separator.empty?

      start += matchdata.end(0)
      text = matchdata.post_match
    end

    tokens << Token.annotate(text, start) unless text.empty?

    tokens
  end
end
28
+
@@ -0,0 +1,170 @@
1
+ require 'rbbt/ner/annotations'
2
# Mixin for texts in which some segments are temporarily replaced by other
# text (for instance to mask entities before running another tool) and
# restored afterwards.
#
# Every call to #replace pushes one entry on two parallel stacks:
#
# transformation_offset_differences - one Array per call, holding
#   [offset, diff, original_length, replacement_length] for each replaced
#   segment, where +offset+ is the segment position before that call and
#   +diff+ the number of characters the text shrank by.
# transformation_original - one Array per call with the replaced texts.
#
# Note that #replace shadows the method of the same name of the object the
# module is extended onto (String#replace for String texts).
module Transformed
  attr_accessor :transformation_offset_differences, :transformation_original

  # Replaces +segments+ in +text+ by +replacement+, yields the transformed
  # text and then undoes that one transformation.
  #
  # If the block returns an Array it is taken to be a list of segments found
  # in the transformed text, and they are relocated to the restored text.
  #
  # Returns what #restore returns: that Array, or nil.
  def self.with_transform(text, segments, replacement)
    require 'rbbt/util/misc'

    text.extend Transformed
    text.replace(segments, replacement)

    segments = yield text

    segments = nil unless Array === segments

    text.restore(segments, true)
  end

  # Replaces +segments+ in +text+ by +replacement+ (a String, a Proc, or the
  # given block) and leaves the transformation in place.
  #
  # Returns +text+.
  def self.transform(text, segments, replacement = nil, &block)
    require 'rbbt/util/misc'

    text.extend Transformed
    text.replace(segments, replacement, &block)

    text
  end

  # Translates position +pos+ of the untransformed text into the matching
  # position of the current text. Returns +pos+ untouched when no
  # transformation was ever recorded.
  def transform_pos(pos)
    return pos if transformation_offset_differences.nil?

    # transformation_offset_differences are assumed to be sorted in reverse
    # order
    transformation_offset_differences.reverse.each do |trans_diff|
      acc = 0
      trans_diff.reverse.each do |offset, diff, orig_length, trans_length|
        break if offset >= pos
        acc += diff
      end
      pos = pos - acc
    end

    pos
  end

  # Translates both ends of +range+ with #transform_pos. The result is always
  # an inclusive Range.
  def transform_range(range)
    (transform_pos(range.begin)..transform_pos(range.end))
  end

  # Assigns +value+ at +pos+ (Integer or Range), given in untransformed
  # coordinates. Raises RuntimeError for any other kind of position.
  def transformed_set(pos, value)
    self[transformed_position(pos)] = value
  end

  # Reads the text at +pos+ (Integer or Range), given in untransformed
  # coordinates. Raises RuntimeError for any other kind of position.
  def transformed_get(pos)
    self[transformed_position(pos)]
  end

  # Tells whether +segment_range+ starts or ends strictly inside one of the
  # replacements recorded by the latest transformation.
  #
  # The checks use && and || on purpose: +and+ and +or+ share a single
  # precedence level, so "a and b or c and d" is read as
  # "((a and b) or c) and d" and overlaps at the start of the range were
  # missed.
  def conflict?(segment_range)
    return false if @transformation_offset_differences.nil? || @transformation_offset_differences.empty?

    @transformation_offset_differences.last.each do |offset, diff, orig_length, trans_length|
      replaced_end = offset + trans_length
      begins_inside = segment_range.begin > offset && segment_range.begin < replaced_end
      ends_inside = segment_range.end > offset && segment_range.end < replaced_end
      return true if begins_inside || ends_inside
    end

    false
  end

  # Replaces every segment of +segments+ in this text.
  #
  # replacement - String used for all segments, or a Proc (or the block)
  #               called with each segment and returning its replacement.
  #               While the Proc runs, the segment temporarily holds the
  #               text and offset it has in the current text.
  #
  # Segments for which #conflict? is true are skipped. The bookkeeping
  # needed by #restore is pushed on the transformation stacks.
  #
  # Raises RuntimeError when no replacement is given or when it is neither a
  # String nor a Proc.
  def replace(segments, replacement = nil, &block)
    replacement ||= block
    raise "No replacement given" if replacement.nil?

    offset_differences = []
    originals = []

    # Work backwards so that a replacement never shifts the segments still
    # to be processed
    Segment.clean_sort(segments).reverse.each do |segment|
      untransformed_range = segment.range_in(self)
      transformed_range = transform_range(untransformed_range)
      next if conflict?(transformed_range)

      text_before_transform = self[transformed_range]

      case replacement
      when String
        transformed_text = replacement
      when Proc
        saved_text = segment.dup
        saved_offset = segment.offset
        begin
          # Show the Proc the segment as it stands in the current text
          segment.replace text_before_transform
          segment.offset = transformed_range.begin

          transformed_text = replacement.call segment
        ensure
          # Give the segment its own text and offset back, even if the Proc
          # raised
          segment.replace saved_text
          segment.offset = saved_offset
        end
      else
        raise "Replacement not String nor Proc"
      end

      diff = segment.length - transformed_text.length
      self[transformed_range] = transformed_text

      offset_differences << [untransformed_range.begin, diff, text_before_transform.length, transformed_text.length]
      originals << text_before_transform
    end

    @transformation_offset_differences ||= []
    @transformation_offset_differences << offset_differences
    @transformation_original ||= []
    @transformation_original << originals
  end

  # Undoes the recorded transformations, latest first.
  #
  # segments   - optional Array of segments located in the transformed text;
  #              their offset and content are updated to match the restored
  #              text.
  # first_only - when true only the latest transformation is undone.
  #
  # A text that was never transformed is left alone. Returns +segments+.
  def restore(segments = nil, first_only = false)
    stop = false

    while !stop && transformation_offset_differences && transformation_offset_differences.any?
      offset_differences = self.transformation_offset_differences.pop
      originals = self.transformation_original.pop

      ranges = offset_differences.collect do |offset, diff, orig_length, rep_length|
        (offset..(offset + rep_length - 1))
      end

      # Entries were recorded from the end of the text backwards; put the
      # originals back front to back, so each recorded offset is still valid
      # when its turn comes
      ranges.zip(originals).reverse.each do |range, text|
        transformed_set(range, text)
      end

      stop = true if first_only

      next if segments.nil?

      segments.each do |segment|
        r = segment.range

        s = r.begin
        e = r.end
        sdiff = 0
        ediff = 0
        offset_differences.reverse.each do |offset, diff, orig_length, rep_length|
          sdiff += diff if offset < s
          ediff += diff if offset + rep_length - 1 < e
        end

        segment.offset = s + sdiff
        segment.replace self[(s + sdiff)..(e + ediff)]
      end
    end

    segments
  end

  private

  # Translates an Integer or Range position given in untransformed
  # coordinates; raises RuntimeError for anything else.
  def transformed_position(pos)
    case pos
    when Range
      transform_range(pos)
    when Integer
      transform_pos(pos)
    else
      raise "Text position not understood '#{pos.inspect}'. Not Range or Integer"
    end
  end
end
169
+
170
+
@@ -7,7 +7,7 @@ require 'rbbt/ner/NER'
7
7
  # in Java. Banner[http://banner.sourceforge.net/].
8
8
  class Banner < NER
9
9
 
10
- Rbbt.add_software "BANNER" => ['','']
10
+ Rbbt.software.opt.BANNER.define_as_install Rbbt.share.install.software.BANNER.find
11
11
 
12
12
  @@JFile = Rjb::import('java.io.File')
13
13
  @@SimpleTokenizer = Rjb::import('banner.tokenization.SimpleTokenizer')
@@ -22,9 +22,9 @@ class Banner < NER
22
22
  # The parameters are set to default values, the only one that one
23
23
  # might want to change is the modelfile to point to a custom trained
24
24
  # one.
25
- def initialize(modelfile = File.join(Rbbt.find_software('BANNER'), 'gene_model.bin'),
26
- lemmadir = File.join(Rbbt.find_software('BANNER'), 'nlpdata/lemmatiser'),
27
- taggerdir = File.join(Rbbt.find_software('BANNER'), 'nlpdata/tagger')
25
+ def initialize(modelfile = Rbbt.software.opt.BANNER["gene_model.bin"].find,
26
+ lemmadir = Rbbt.software.opt.BANNER.nlpdata.lemmatiser.find,
27
+ taggerdir = Rbbt.software.opt.BANNER.nlpdata.tagger.find
28
28
  )
29
29
 
30
30
  @tokenizer = @@SimpleTokenizer.new
@@ -50,8 +50,10 @@ class Banner < NER
50
50
  # Returns an array with the mention found in the provided piece of
51
51
  # text.
52
52
  def match(text)
53
+ return [] if text.nil?
53
54
  text.gsub!(/\n/,' ')
54
55
  text.gsub!(/\|/,'/') # Character | gives an error
56
+ return [] if text.strip.empty?
55
57
  sentence = @@Sentence.new(text)
56
58
 
57
59
  @tokenizer.tokenize(sentence)
@@ -65,7 +67,8 @@ class Banner < NER
65
67
  mention = $1
66
68
  mention.sub!(/^\s*/,'')
67
69
  mention.sub!(/\s*$/,'')
68
- NamedEntity.annotate mention, nil, 'GENE'
70
+ offset = text.index(mention)
71
+ NamedEntity.annotate(mention, offset, 'GENE')
69
72
  mention
70
73
  }
71
74
  res
@@ -0,0 +1,34 @@
1
+ require 'rbbt'
2
+ require 'rjb'
3
+ require 'rbbt/ner/annotations'
4
+ require 'rbbt/ner/NER'
5
+ require 'rbbt/util/log'
6
+
7
# NER that finds chemical mentions through the Java class RbbtChemicalTagger,
# reached with Rjb.
class ChemicalTagger < NER
  Rbbt.software.opt.ChemicalTagger.define_as_install Rbbt.share.install.software.ChemicalTagger.find

  # JVM arguments: initial and maximum heap size
  Rjb::load(nil, ['-Xms128m','-Xmx2048m'])

  @@RbbtChemicalTagger = Rjb::import('RbbtChemicalTagger')

  # Finds the chemical mentions of +text+.
  #
  # text - text to analyse; nil or blank text yields no mentions.
  # type - not used.
  # memm - not used.
  #
  # Returns an Array of NamedEntity strings of type "Chemical Mention". An
  # error raised by the Java tagger is logged at debug level and gives an
  # empty Array.
  def self.match(text, type = nil, memm = false)
    return [] if text.nil? || text.strip.empty?

    begin
      matches = @@RbbtChemicalTagger.match(text)
    rescue StandardError => e
      Log.debug "ChemicalTagger Error: #{e.message}"
      return []
    end

    # Where to resume the search for each mention text, so that a mention
    # reported several times is not always located at its first occurrence
    search_from = Hash.new(0)

    matches.collect do |mention|
      offset = text.index(mention, search_from[mention]) || text.index(mention)
      search_from[mention] = offset + mention.length unless offset.nil?
      NamedEntity.annotate mention, offset, "Chemical Mention", nil, nil
    end
  end

  # Instance level shortcut for ChemicalTagger.match.
  def match(*args)
    ChemicalTagger.match(*args)
  end
end
@@ -0,0 +1,136 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/util/tsv'
3
+ require 'rbbt/ner/annotations'
4
+ require 'rbbt/ner/annotations/token'
5
+ require 'rbbt/ner/NER'
6
+ require 'inline'
7
+
8
+ # This code was adapted from Ashish Tendulkar (ASK MARTIN)
9
# Dictionary based NER. Names are indexed by their first three characters
# and candidate names are compared against the text with a small C helper.
class NGramPrefixDictionary < NER
  # Characters that may delimit a mention. The quote is written \' because
  # the entries are pasted inside C character literals below.
  STOP_LETTERS = %w(\' " ( ) { } [ ] - ? ! < ; : > . ,)

  # Same characters in the form String#[] returns them. The last character
  # of each entry is used so that the \' entry stands for the quote and not
  # for the backslash.
  STOP_LETTER_CHAR_VALUES = STOP_LETTERS.collect{|l| l[-1]}

  class << self
    inline do |builder|

      # is_stop_letter(letter): 1 for a space or any of STOP_LETTERS
      builder.c_raw <<-EOC
int is_stop_letter(char letter)
{

  if( letter == ' ' || #{STOP_LETTERS.collect{|l| "letter == '#{l}' "} * "||"} ){ return 1;}

  return 0;
}
      EOC

      # fast_start_with(str, cmp, offset): true when cmp is found in str at
      # offset and is followed by the end of str or by a stop letter. It
      # refuses to compare when cmp would run past the end of str.
      builder.c <<-EOC
VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
{
  long length_cmp = RSTRING_LEN(cmp);
  long length_str = RSTRING_LEN(str);

  if (offset < 0 || offset + length_cmp > length_str)
    return Qfalse;

  if (memcmp(RSTRING_PTR(str) + offset, RSTRING_PTR(cmp), length_cmp) == 0){
    if (offset + length_cmp == length_str || is_stop_letter(RSTRING_PTR(str)[offset + length_cmp]))
      return Qtrue;
  }

  return Qfalse;
}
      EOC
    end
  end

  # Builds the index from +hash+, which yields a code and its list of names.
  #
  # Returns a Hash from the stripped first three characters of every name to
  # the Array of [name, code] pairs that start with them.
  def self.process(hash)
    index = {}
    hash.each do |code, names|
      names.each do |name|
        ngram = name[0..2].strip
        index[ngram] ||= []
        index[ngram] << [name, code]
      end
    end
    index
  end

  # Scans +text+ for the names of +index+ (as built by process).
  #
  # Returns an Array of [name, code, offset]; +name+ is the very String
  # stored in the index.
  #
  # NOTE(review): fast_start_with treats the offset as a byte position while
  # the positions used here come from String#length and String#index --
  # confirm the behaviour on multi-byte text.
  def self.match(index, text)
    matches = []

    text_offset = 0
    text_length = text.length
    while text_offset && text_offset < text_length
      # Step over one leading stop letter, such as an opening parenthesis
      text_offset += 1 if STOP_LETTER_CHAR_VALUES.include? text[text_offset]
      ngram = text[text_offset..text_offset + 2].strip

      found = nil
      if index.include? ngram
        remaining = text_length - text_offset

        index[ngram].each do |name, code|
          # A name may reach the very end of the text, but not go beyond it
          next if name.length > remaining

          if fast_start_with(text, name, text_offset)
            found = [name, code, text_offset]
            break
          end
        end
      end

      if found.nil?
        # No name starts here: jump to the character after the next space
        text_offset = text.index(" ", text_offset)
        text_offset += 1 unless text_offset.nil?
      else
        matches << found
        text_offset += found.first.length
      end
    end

    matches
  end

  attr_accessor :index, :type

  # file - source handed to TSV.new with the :flat type.
  # type - entity type given to every mention found.
  def initialize(file, type = nil)
    tsv = TSV.new(file, :flat)
    @type = type
    tsv.unnamed = true
    @index = NGramPrefixDictionary.process(tsv)
  end

  # Finds the dictionary names mentioned in +text+.
  #
  # Returns an Array of NamedEntity strings. Each one is a copy of the
  # dictionary name: annotating the indexed String itself would make every
  # mention of a name share one object, and therefore one offset.
  def match(text)
    NGramPrefixDictionary.match(index, text).collect{|name, code, offset|
      NamedEntity.annotate(name.dup, offset, type, code)
    }
  end
end
109
+
110
# Benchmark, run only when this file is executed directly: matches a JoChem
# based dictionary against the text of PubMed articles.
if __FILE__ == $0
  require 'rbbt/sources/jochem'
  require 'rbbt/sources/pubmed'

  texts = PubMed.get_article(PubMed.query("GB-1a", 100)).collect do |pmid, article|
    article.text
  end

  # Repeat the texts to get to roughly 150 of them; an empty result is left
  # alone instead of dividing by zero
  texts *= 150/texts.length unless texts.empty?

  tsv = Rbbt.share.databases.JoChem.lexicon.tsv :flat, :persistence => false, :grep => "GB"
  #tsv = Rbbt.share.databases.JoChem.lexicon.tsv :flat, :persistence => true

  tsv.unnamed = true
  ner = NGramPrefixDictionary.new(tsv)

  Misc.benchmark do
    texts.each do |text|
      ner.match(text)
    end
  end
end