rbbt-text 0.2.1 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. data/bin/get_ppis.rb +52 -0
  2. data/lib/rbbt/bow/dictionary.rb +9 -9
  3. data/lib/rbbt/bow/misc.rb +86 -2
  4. data/lib/rbbt/corpus/corpus.rb +55 -0
  5. data/lib/rbbt/corpus/document.rb +289 -0
  6. data/lib/rbbt/corpus/document_repo.rb +115 -0
  7. data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
  8. data/lib/rbbt/ner/NER.rb +7 -5
  9. data/lib/rbbt/ner/abner.rb +13 -2
  10. data/lib/rbbt/ner/annotations.rb +182 -51
  11. data/lib/rbbt/ner/annotations/annotated.rb +15 -0
  12. data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
  13. data/lib/rbbt/ner/annotations/relations.rb +25 -0
  14. data/lib/rbbt/ner/annotations/token.rb +28 -0
  15. data/lib/rbbt/ner/annotations/transformed.rb +170 -0
  16. data/lib/rbbt/ner/banner.rb +8 -5
  17. data/lib/rbbt/ner/chemical_tagger.rb +34 -0
  18. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
  19. data/lib/rbbt/ner/oscar3.rb +1 -1
  20. data/lib/rbbt/ner/oscar4.rb +41 -0
  21. data/lib/rbbt/ner/patterns.rb +132 -0
  22. data/lib/rbbt/ner/rnorm.rb +141 -0
  23. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  24. data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
  25. data/lib/rbbt/ner/token_trieNER.rb +185 -51
  26. data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
  27. data/lib/rbbt/nlp/nlp.rb +235 -0
  28. data/share/install/software/ABNER +0 -4
  29. data/share/install/software/ChemicalTagger +81 -0
  30. data/share/install/software/Gdep +115 -0
  31. data/share/install/software/Geniass +118 -0
  32. data/share/install/software/OSCAR4 +16 -0
  33. data/share/install/software/StanfordParser +15 -0
  34. data/share/patterns/drug_induce_disease +22 -0
  35. data/share/rnorm/cue_default +10 -0
  36. data/share/rnorm/tokens_default +86 -0
  37. data/share/{stopwords → wordlists/stopwords} +0 -0
  38. data/test/rbbt/bow/test_bow.rb +1 -1
  39. data/test/rbbt/bow/test_dictionary.rb +1 -1
  40. data/test/rbbt/bow/test_misc.rb +1 -1
  41. data/test/rbbt/corpus/test_corpus.rb +99 -0
  42. data/test/rbbt/corpus/test_document.rb +222 -0
  43. data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
  44. data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
  45. data/test/rbbt/ner/test_abner.rb +1 -1
  46. data/test/rbbt/ner/test_annotations.rb +64 -2
  47. data/test/rbbt/ner/test_banner.rb +1 -1
  48. data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
  49. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
  50. data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
  51. data/test/rbbt/ner/test_patterns.rb +66 -0
  52. data/test/rbbt/ner/test_regexpNER.rb +1 -1
  53. data/test/rbbt/ner/test_rnorm.rb +47 -0
  54. data/test/rbbt/ner/test_token_trieNER.rb +60 -35
  55. data/test/rbbt/nlp/test_nlp.rb +88 -0
  56. data/test/test_helper.rb +20 -0
  57. metadata +93 -20
@@ -0,0 +1,15 @@
1
+ require 'rbbt/ner/annotations'
2
# Mixin that lets a String carry the list of annotations attached to it.
module Annotated
  attr_accessor :annotations

  # Extends +string+ with Annotated and stores +annotations+ on it, falling
  # back to a new empty Array when a falsy value (nil/false) is given.
  #
  # Returns the very same +string+ object that was passed in.
  def self.annotate(string, annotations = nil)
    annotated = string.extend(Annotated)
    annotated.annotations = annotations ? annotations : []
    annotated
  end

  # Delegates to Segment.split, handing it this string, its annotations and
  # the +skip_segments+ flag; whatever that call returns is returned as is.
  def split_segments(skip_segments = false)
    Segment.split(self, @annotations, skip_segments)
  end
end
14
+
15
+
@@ -0,0 +1,37 @@
1
+ require 'rbbt/ner/annotations'
2
+
3
# Segment mixin for Strings that are entity mentions. On top of what Segment
# provides it stores a type, a code and a score for the mention.
module NamedEntity
  attr_accessor :type, :code, :score, :segment_types
  include Segment

  # Extends +string+ with NamedEntity and assigns every attribute whose
  # argument is not nil (attributes given as nil are left untouched).
  #
  # Returns the same +string+ object.
  def self.annotate(string, offset = nil, type = nil, code = nil, score = nil)
    string.extend NamedEntity
    [[:offset=, offset], [:type=, type], [:code=, code], [:score=, score]].each do |setter, value|
      string.send(setter, value) unless value.nil?
    end
    string
  end

  # Multi-line, newline-terminated summary of the mention: the text itself
  # followed by the inspected offset, type, code and score.
  def report
    [
      "String: #{ self }",
      "Offset: #{ offset.inspect }",
      "Type: #{type.inspect}",
      "Code: #{code.inspect}",
      "Score: #{score.inspect}"
    ].collect { |line| line + "\n" }.join
  end

  # Renders the mention as a <span class='Entity'> element. An
  # attr-entity-* attribute is emitted for each of type, code and score that
  # is not nil; Array values are joined with single spaces. Neither the text
  # nor the attribute values are escaped.
  def html
    attribute = lambda do |name, value|
      if value.nil?
        ""
      else
        " attr-entity-#{name}='#{Array === value ? value * " " : value}'"
      end
    end

    "<span class='Entity'" +
      attribute.call("type", type) +
      attribute.call("code", code) +
      attribute.call("score", score) +
      ">#{ self }</span>"
  end
end
37
+
@@ -0,0 +1,25 @@
1
+ require 'rbbt/ner/annotations'
2
+
3
# Segment mixin for Strings that describe a relationship between terms.
module Relationship
  attr_accessor :terms, :segment_types
  include Segment

  # Extends +string+ with Relationship and assigns +offset+ and +terms+ when
  # they are not nil.
  #
  # The string must be extended with this very module (the original code
  # extended it with an unrelated constant, PPI): the +terms+ accessor
  # assigned below is the one declared here.
  #
  # Returns the same +string+ object.
  def self.annotate(string, offset = nil, terms = nil)
    string.extend Relationship
    string.offset = offset unless offset.nil?
    string.terms = terms unless terms.nil?
    string
  end

  # Renders the relationship as a <span class='Relationship'> element
  # wrapping the (unescaped) text. The trailing newline of the heredoc is
  # removed with chomp.
  def html
    text = <<-EOF
<span class='Relationship'\
>#{ self }</span>
    EOF
    text.chomp
  end

  # NOTE(review): unfinished. It iterates over the annotation groups selected
  # by +types+ but does nothing with them, and it relies on an +annotations+
  # method that this module does not define itself -- confirm where it is
  # expected to come from before building on this.
  def html_with_entities(*types)
    annotations.values_at(*types).each do |segments|
    end
  end
end
@@ -0,0 +1,28 @@
1
+ require 'rbbt/ner/annotations'
2
+
3
# Segment mixin for Strings that are tokens of a larger text. Each token
# remembers the text it originally had in +original+.
module Token
  include Segment
  attr_accessor :original

  # Extends +string+ with Token, sets its +offset+ when one is given and
  # records +original+ (a copy of the string itself when none is supplied).
  #
  # Returns the same +string+ object.
  def self.annotate(string, offset = nil, original = nil)
    string.extend Token
    string.offset = offset unless offset.nil?
    string.original = original || string.dup
    string
  end

  # Splits +text+ into Token-annotated strings.
  #
  # +split_at+ is matched repeatedly against what is left of the text. The
  # text before each match becomes a token (unless empty) and, when the
  # pattern's first capture group matched something non-empty, so does the
  # captured text. With the default pattern this means whitespace is dropped
  # while parentheses and the listed punctuation characters become tokens of
  # their own. +start+ is the offset assigned to the beginning of +text+.
  #
  # Returns the Array of tokens in order of appearance.
  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
    tokens = []
    remaining = text
    position = start

    loop do
      hit = remaining.match(split_at)
      break unless hit

      leading = hit.pre_match
      tokens << Token.annotate(leading, position) unless leading.empty?

      separator = hit.captures.first
      if hit.captures.any? && !separator.empty?
        tokens << Token.annotate(separator, position + hit.begin(1))
      end

      # Continue right after the match, keeping track of the absolute offset
      position += hit.end(0)
      remaining = hit.post_match
    end

    tokens << Token.annotate(remaining, position) unless remaining.empty?

    tokens
  end
end
28
+
@@ -0,0 +1,170 @@
1
+ require 'rbbt/ner/annotations'
2
# Mixin for Strings in which some segments get replaced by other text, keeping
# enough bookkeeping to map positions of the untransformed text onto the
# transformed one and to undo the replacements later.
#
# Every call to #replace is one "round". For each round two parallel lists are
# pushed:
#
# * transformation_offset_differences: one entry per replaced segment,
#   [offset, diff, orig_length, trans_length], where +offset+ is the start of
#   the segment as reported by segment.range_in, +diff+ is the segment length
#   minus the replacement length, and the two lengths are those of the text
#   before and after the replacement.
# * transformation_original: the text that was replaced, in the same order.
#
# NOTE: #replace shadows String#replace on the strings extended with this
# module.
module Transformed
  attr_accessor :transformation_offset_differences, :transformation_original

  # Replaces +segments+ in +text+ with +replacement+, yields the transformed
  # text and then undoes that last round of replacements.
  #
  # When the block returns an Array it is taken to be a list of segments
  # found in the transformed text; they are handed to #restore so their
  # offset and content get updated. Any other block result is ignored.
  #
  # Returns what #restore returns (those segments, or nil).
  def self.with_transform(text, segments, replacement)
    require 'rbbt/util/misc'

    text.extend Transformed
    text.replace(segments, replacement)

    segments = yield text

    segments = nil unless Array === segments

    text.restore(segments, true)
  end

  # Extends +text+ with Transformed and replaces +segments+ in place using
  # +replacement+ (a String or a Proc) or the given block.
  #
  # Returns +text+.
  def self.transform(text, segments, replacement = nil, &block)
    require 'rbbt/util/misc'

    text.extend Transformed
    text.replace(segments, replacement, &block)

    text
  end

  # Maps the position +pos+ onto the transformed text.
  #
  # Rounds are visited from the most recent to the oldest. Within a round the
  # entries are visited in reverse of the order they were recorded, adding up
  # their +diff+ until an entry whose offset is not below the current
  # position is found; the sum is then subtracted from the position.
  #
  # Returns +pos+ untouched when nothing has been transformed.
  def transform_pos(pos)
    return pos if transformation_offset_differences.nil?

    # transformation_offset_differences are assumed to be sorted in reverse
    # order
    transformation_offset_differences.reverse.each do |round|
      shift = 0
      round.reverse.each do |offset, diff, orig_length, trans_length|
        break if offset >= pos
        shift += diff
      end
      pos = pos - shift
    end

    pos
  end

  # Maps both ends of +range+ with #transform_pos. The result is always an
  # inclusive range.
  def transform_range(range)
    (transform_pos(range.begin)..transform_pos(range.end))
  end

  # Assigns +value+ at +pos+ (an Integer or a Range), after mapping +pos+ onto
  # the transformed text.
  #
  # Raises a RuntimeError for any other kind of position.
  def transformed_set(pos, value)
    transformed_pos = case pos
                      when Range
                        transform_range(pos)
                      when Integer
                        transform_pos(pos)
                      else
                        raise "Text position not understood '#{pos.inspect}'. Not Range or Integer"
                      end

    self[transformed_pos] = value
  end

  # Reads the text at +pos+ (an Integer or a Range), after mapping +pos+ onto
  # the transformed text.
  #
  # Raises a RuntimeError for any other kind of position.
  def transformed_get(pos)
    transformed_pos = case pos
                      when Range
                        transform_range(pos)
                      when Integer
                        transform_pos(pos)
                      else
                        raise "Text position not understood '#{pos.inspect}'. Not Range or Integer"
                      end

    self[transformed_pos]
  end

  # Tells whether +segment_range+ begins or ends strictly inside one of the
  # regions replaced in the most recent round.
  #
  # The two tests are grouped explicitly. Ruby's +and+ and +or+ share the
  # same precedence and associate to the left, so the unparenthesised
  # "a and b or c and d" used previously was evaluated as
  # "((a and b) or c) and d" and missed ranges that start inside a replaced
  # region but end beyond it.
  #
  # Returns false when there are no recorded rounds.
  def conflict?(segment_range)
    return false if @transformation_offset_differences.nil? || @transformation_offset_differences.empty?

    last_round = @transformation_offset_differences.last

    last_round.any? do |offset, diff, orig_length, trans_length|
      region_end = offset + trans_length
      begins_inside = segment_range.begin > offset && segment_range.begin < region_end
      ends_inside = segment_range.end > offset && segment_range.end < region_end
      begins_inside || ends_inside
    end
  end

  # Replaces every segment in +segments+ with +replacement+ and records the
  # round so that it can be undone with #restore.
  #
  # +replacement+ is either a String, used verbatim, or a Proc; the block is
  # used when no +replacement+ is given. The Proc is called with the segment,
  # which for the duration of the call holds the text currently at its
  # position and the offset of that position in the transformed text; its
  # result is the replacement text.
  #
  # Segments are processed in reverse of the order returned by
  # Segment.clean_sort, and those for which #conflict? is true are skipped.
  #
  # Raises a RuntimeError when there is no replacement or when it is neither
  # a String nor a Proc.
  def replace(segments, replacement = nil, &block)
    replacement ||= block
    raise "No replacement given" if replacement.nil?

    round_differences = []
    round_originals = []

    Segment.clean_sort(segments).reverse.each do |segment|
      untransformed_segment_range_here = segment.range_in(self)
      transformed_segment_range = self.transform_range(untransformed_segment_range_here)
      next if conflict?(transformed_segment_range)

      text_before_transform = self[transformed_segment_range]

      case replacement
      when String
        transformed_text = replacement
      when Proc
        # Prepare segment with new text
        save_segment_text = segment.dup
        save_offset = segment.offset
        segment.replace text_before_transform
        segment.offset = transformed_segment_range.begin

        transformed_text = replacement.call segment

        # Restore segment with original text
        segment.replace save_segment_text
        segment.offset = save_offset
      else
        raise "Replacemente not String nor Proc"
      end

      diff = segment.length - transformed_text.length
      self[transformed_segment_range] = transformed_text

      round_differences << [untransformed_segment_range_here.begin, diff, text_before_transform.length, transformed_text.length]
      round_originals << text_before_transform
    end

    @transformation_offset_differences ||= []
    @transformation_offset_differences << round_differences
    @transformation_original ||= []
    @transformation_original << round_originals
  end

  # Undoes the recorded rounds, most recent first, putting the original text
  # back in place. Only the most recent round is undone when +first_only+ is
  # true.
  #
  # When +segments+ is given, the offset and content of each of them is
  # adjusted after every undone round using the differences of that round,
  # and their text is re-read from the restored string.
  #
  # Does nothing on a string that has never been transformed.
  #
  # Returns +segments+.
  def restore(segments = nil, first_only = false)
    return segments if transformation_offset_differences.nil?

    stop = false
    while transformation_offset_differences.any? and not stop
      round_differences = transformation_offset_differences.pop
      round_originals = transformation_original.pop

      ranges = round_differences.collect do |offset, diff, orig_length, rep_length|
        (offset..(offset + rep_length - 1))
      end

      # Entries are walked in reverse of the order they were recorded in
      ranges.zip(round_originals).reverse.each do |range, text|
        self.transformed_set(range, text)
      end

      stop = true if first_only

      next if segments.nil?

      segments.each do |segment|
        r = segment.range

        s = r.begin
        e = r.end
        sdiff = 0
        ediff = 0
        round_differences.reverse.each do |offset, diff, orig_length, rep_length|
          sdiff += diff if offset < s
          ediff += diff if offset + rep_length - 1 < e
        end

        segment.offset = s + sdiff
        segment.replace self[(s + sdiff)..(e + ediff)]
      end
    end

    segments
  end
end
169
+
170
+
@@ -7,7 +7,7 @@ require 'rbbt/ner/NER'
7
7
  # in Java. Banner[http://banner.sourceforge.net/].
8
8
  class Banner < NER
9
9
 
10
- Rbbt.add_software "BANNER" => ['','']
10
+ Rbbt.software.opt.BANNER.define_as_install Rbbt.share.install.software.BANNER.find
11
11
 
12
12
  @@JFile = Rjb::import('java.io.File')
13
13
  @@SimpleTokenizer = Rjb::import('banner.tokenization.SimpleTokenizer')
@@ -22,9 +22,9 @@ class Banner < NER
22
22
  # The parameters are set to default values, the only one that one
23
23
  # might want to change is the modelfile to point to a custom trained
24
24
  # one.
25
- def initialize(modelfile = File.join(Rbbt.find_software('BANNER'), 'gene_model.bin'),
26
- lemmadir = File.join(Rbbt.find_software('BANNER'), 'nlpdata/lemmatiser'),
27
- taggerdir = File.join(Rbbt.find_software('BANNER'), 'nlpdata/tagger')
25
+ def initialize(modelfile = Rbbt.software.opt.BANNER["gene_model.bin"].find,
26
+ lemmadir = Rbbt.software.opt.BANNER.nlpdata.lemmatiser.find,
27
+ taggerdir = Rbbt.software.opt.BANNER.nlpdata.tagger.find
28
28
  )
29
29
 
30
30
  @tokenizer = @@SimpleTokenizer.new
@@ -50,8 +50,10 @@ class Banner < NER
50
50
  # Returns an array with the mention found in the provided piece of
51
51
  # text.
52
52
  def match(text)
53
+ return [] if text.nil?
53
54
  text.gsub!(/\n/,' ')
54
55
  text.gsub!(/\|/,'/') # Character | gives an error
56
+ return [] if text.strip.empty?
55
57
  sentence = @@Sentence.new(text)
56
58
 
57
59
  @tokenizer.tokenize(sentence)
@@ -65,7 +67,8 @@ class Banner < NER
65
67
  mention = $1
66
68
  mention.sub!(/^\s*/,'')
67
69
  mention.sub!(/\s*$/,'')
68
- NamedEntity.annotate mention, nil, 'GENE'
70
+ offset = text.index(mention)
71
+ NamedEntity.annotate(mention, offset, 'GENE')
69
72
  mention
70
73
  }
71
74
  res
@@ -0,0 +1,34 @@
1
+ require 'rbbt'
2
+ require 'rjb'
3
+ require 'rbbt/ner/annotations'
4
+ require 'rbbt/ner/NER'
5
+ require 'rbbt/util/log'
6
+
7
# NER that finds chemical mentions through the Java class RbbtChemicalTagger,
# reached over Rjb.
class ChemicalTagger < NER
  Rbbt.software.opt.ChemicalTagger.define_as_install Rbbt.share.install.software.ChemicalTagger.find

  # Start the JVM with explicit heap limits before importing any Java class
  jvm_options = ['-Xms128m','-Xmx2048m']
  Rjb::load(nil, jvm_options)

  @@RbbtChemicalTagger = Rjb::import('RbbtChemicalTagger')

  # Finds the chemical mentions in +text+.
  #
  # +type+ and +memm+ are accepted but not used. Returns an empty Array when
  # +text+ is nil or blank, and also when the tagger raises, in which case
  # the error message is logged at debug level.
  #
  # Each mention is annotated as a NamedEntity of type "Chemical Mention"
  # whose offset is the position of its first occurrence in +text+.
  def self.match(text, type = nil, memm = false)
    return [] if text.nil? or text.strip.empty?

    mentions = begin
                 @@RbbtChemicalTagger.match(text)
               rescue
                 Log.debug "ChemicalTagger Error: #{$!.message}"
                 return []
               end

    mentions.collect do |mention|
      NamedEntity.annotate(mention, text.index(mention), "Chemical Mention", nil, nil)
    end
  end

  # Instance-level convenience that forwards to ChemicalTagger.match.
  def match(*args)
    ChemicalTagger.match(*args)
  end
end
@@ -0,0 +1,136 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/util/tsv'
3
+ require 'rbbt/ner/annotations'
4
+ require 'rbbt/ner/annotations/token'
5
+ require 'rbbt/ner/NER'
6
+ require 'inline'
7
+
8
+ # This code was adapted from Ashish Tendulkar (ASK MARTIN)
9
# Dictionary based NER. Names are indexed by their first three characters
# (stripped) and the text is scanned looking, at each position, for a name that
# starts there and is followed by a stop letter or by the end of the text.
#
# The prefix comparison is done in C through RubyInline.
class NGramPrefixDictionary < NER
  # Characters that may follow a name. The first entry is written with a
  # backslash so that it can be interpolated into a C character literal.
  # NOTE(review): because of that backslash the first value in
  # STOP_LETTER_CHAR_VALUES is presumably the backslash rather than the
  # apostrophe -- confirm whether that is intended.
  STOP_LETTERS = %w(\' " ( ) { } [ ] - ? ! < ; : > . ,)
  STOP_LETTER_CHAR_VALUES = STOP_LETTERS.collect{|l| l[0]}

  class << self
    inline do |builder|

      # 1 when +letter+ is a blank or one of STOP_LETTERS, 0 otherwise
      builder.c_raw <<-EOC
        int is_stop_letter(char letter)
        {

          if( letter == ' ' || #{STOP_LETTERS.collect{|l| "letter == '#{l}' "} * "||"} ){ return 1;}

          return 0;
        }
      EOC

      # Qtrue when +cmp+ is found in +str+ at +offset+ and is followed by a
      # stop letter or by the end of +str+. Lengths and offset are in bytes.
      builder.c <<-EOC
        VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
        {
          int length_cmp = RSTRING_LEN(cmp);
          int length_str = RSTRING_LEN(str);

          /* Never compare past the end of str */
          if (offset < 0 || offset + length_cmp > length_str)
            return Qfalse;

          if (memcmp(RSTRING_PTR(str)+ offset, RSTRING_PTR(cmp), length_cmp) == 0){
            /* The name must end where the text ends (offset + length_cmp,
               not length_cmp - offset) or be followed by a stop letter */
            if (offset + length_cmp == length_str || is_stop_letter(RSTRING_PTR(str)[offset + length_cmp]))
              return Qtrue;
            else
              return Qfalse;
          }

          return Qfalse;
        }
      EOC
    end
  end

  # Builds the prefix index out of +hash+, which maps each code to the list
  # of its names.
  #
  # Returns a Hash from the stripped first three characters of a name to the
  # list of [name, code] pairs that share that prefix.
  def self.process(hash)
    index = {}
    hash.each do |code, names|
      names.each do |name|
        ngram = name[0..2].strip
        index[ngram] ||= []
        index[ngram] << [name, code]
      end
    end
    index
  end

  # Scans +text+ for the names in +index+ (as built by ::process).
  #
  # At each position the first indexed name that matches is taken and the
  # scan continues right after it; when nothing matches the scan jumps to
  # the character after the next blank.
  #
  # Names that end exactly where the text ends are candidates too: the
  # length test uses <=, not <, and the C helper does not read past the end
  # of the text.
  #
  # NOTE(review): positions are computed with String#length here while the
  # C helper works with byte lengths; presumably the text is expected to be
  # single-byte encoded -- confirm.
  #
  # Returns an Array of [name, code, offset] triplets.
  def self.match(index, text)
    matches = []

    text_offset = 0
    text_length = text.length
    while (not text_offset.nil?) and text_offset < text_length
      text_offset += 1 if STOP_LETTER_CHAR_VALUES.include? text[text_offset]
      ngram = text[text_offset..text_offset + 2].strip

      found = nil
      if index.include? ngram

        diff = text_length - text_offset
        # Match with entries
        index[ngram].each do |name, code|
          next unless name.length <= diff

          if fast_start_with(text, name, text_offset)
            found = [name, code, text_offset]
            break
          end
        end
      end

      if found.nil?
        text_offset = text.index(" ", text_offset)
        text_offset += 1 unless text_offset.nil?
      else
        matches << found
        text_offset += found.first.length
      end
    end

    matches
  end

  attr_accessor :index, :type

  # Loads +file+ as a flat TSV and indexes it with ::process. +type+ is the
  # type given to the entities returned by #match.
  def initialize(file, type = nil)
    tsv = TSV.new(file, :flat)
    @type = type
    tsv.unnamed = true
    @index = NGramPrefixDictionary.process(tsv)
  end

  # Returns the names found in +text+ as NamedEntity annotated strings,
  # carrying their offset, the dictionary type and their code.
  def match(text)
    NGramPrefixDictionary.match(index, text).collect{|name, code, offset|
      NamedEntity.annotate(name, offset, type, code)
    }
  end
end
109
+
110
# Benchmark, run only when this file is executed directly: matches the JoChem
# lexicon entries containing "GB" against the text of PubMed articles.
if __FILE__ == $0
  require 'rbbt/sources/jochem'
  require 'rbbt/sources/pubmed'

  articles = PubMed.get_article(PubMed.query("GB-1a", 100))
  texts = articles.collect { |pmid, article| article.text }

  # Repeat the list of texts (integer division) to get to about 150 of them
  texts *= 150/texts.length

  tsv = Rbbt.share.databases.JoChem.lexicon.tsv :flat, :persistence => false, :grep => "GB"
  #tsv = Rbbt.share.databases.JoChem.lexicon.tsv :flat, :persistence => true

  tsv.unnamed = true
  ner = NGramPrefixDictionary.new(tsv)

  Misc.benchmark do
    texts.each { |text| ner.match(text) }
  end
end