rbbt-text 0.6.3 → 1.0.0

@@ -254,6 +254,7 @@ class Document
     name = name.to_s
     index = segment_index(name, persist_dir)
     annotations = index[segment.range]
+    segment.segments[name] ||= {}
     segment.segments[name] = annotations
     class << segment
       self
@@ -1,4 +1,5 @@
 require 'rbbt/entity'
+require 'rbbt/ner/segment/docid'
 
 module Document
   extend Entity
@@ -7,34 +8,77 @@ module Document
     attr_accessor :corpus
   end
 
-  property :text => :array2single do
+  attr_accessor :docid
+
+  property :docid => :single2array do |*args|
+    @docid ||= if self =~ /^text:/
+                 self
+               else
+                 ["text", Misc.digest(self.inspect)] * ":"
+               end
+    @docid
+  end
+
+  #property :annotation_id => :single2array do |*args|
+  #  docid(*args)
+  #end
+
+  property :annotation_id => :both do |*args|
+    if Array === self
+      Misc.hash2md5(info.merge(:self => self))
+    else
+      docid(*args)
+    end
+  end
+
+  property :_get_text => :single do
+    self
+  end
+
+  property :text => :array2single do |*args|
     article_text = {}
     missing = []
 
-    self.each do |doc|
+    if Document.corpus.nil?
+      self._get_text(*args)
+    else
+
       Document.corpus.read if Document.corpus.respond_to? :read
-      if Document.corpus.include?(doc)
-        article_text[doc] = Document.corpus[doc]
-      else
-        missing << doc
+      self.each do |doc|
+
+        case
+        when Document.corpus.include?(doc)
+          article_text[doc] = Document.corpus[doc]
+        when Document.corpus.include?(doc.docid(*args))
+          article_text[doc] = Document.corpus[doc.docid(*args)]
+        else
+          missing << doc
+        end
+
       end
-    end
+      Document.corpus.close if Document.corpus.respond_to? :close
+
+      if missing.any?
+        missing.first.annotate missing
+        missing_text = Misc.process_to_hash(missing){|list| list._get_text(*args)}
 
-    if missing.any?
-      missing.first.annotate missing
-      missing_text = Misc.process_to_hash(missing){|list| list._get_text}
+        Misc.lock(Document.corpus.respond_to?(:persistence_path) ? Document.corpus.persistence_path : nil) do
+          Document.corpus.write if Document.corpus.respond_to? :write and not Document.corpus.write?
 
-      Misc.lock Document.corpus.persistence_path do
-        Document.corpus.write if Document.corpus.respond_to? :write
-        missing_text.each do |doc, text|
-          article_text[doc] = text
-          Document.corpus[doc] = text
+          missing_text.each do |doc, doc_text|
+            doc = missing.first.annotate doc.dup
+            Document.corpus[doc.docid(*args)] = doc_text
+            article_text[doc] = doc_text
+          end
+
+          Document.corpus.close if Document.corpus.respond_to? :close
         end
-        Document.corpus.read if Document.corpus.respond_to? :read
+
       end
-    end
 
-    article_text.values_at *self
+      article_text.values_at *self
+    end
   end
 
 end
+
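Note: the net effect of this hunk is that corpus entries are now keyed by a stable document id ("text:" plus a digest of the string) instead of by the raw text, and the corpus is only consulted when one is configured. A minimal round-trip sketch, assuming a Hash-like corpus and the Entity setup API from rbbt-util (the require path is illustrative):

    require 'rbbt/text/document'   # illustrative load path

    Document.corpus = {}           # any Hash-like store; persisted stores also work

    docs = ["TP53 is a tumor suppressor."]
    Document.setup docs

    docs.first.docid               # => "text:<digest>", memoized in @docid
    docs.text                      # stores missing texts under their docid, then returns them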
@@ -10,13 +10,16 @@ class Abner < NER
 
   Rbbt.claim Rbbt.software.opt.ABNER, :install, Rbbt.share.install.software.ABNER.find
 
-  @@JFile = Rjb::import('java.io.File')
-  @@Tagger = Rjb::import('abner.Tagger')
-  @@Trainer = Rjb::import('abner.Trainer')
+  def self.init
+    @@JFile ||= Rjb::import('java.io.File')
+    @@Tagger ||= Rjb::import('abner.Tagger')
+    @@Trainer ||= Rjb::import('abner.Trainer')
+  end
 
   # If modelfile is present a custom trained model can be used,
   # otherwise, the default BioCreative model is used.
   def initialize(modelfile=nil)
+    Abner.init
     if modelfile == nil
       @tagger = @@Tagger.new(@@Tagger.BIOCREATIVE)
     else
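Note: moving the Rjb imports out of the class body and into a memoized self.init means simply requiring the file no longer boots the JVM; the bridge starts on first instantiation. The same pattern is applied to Banner and ChemicalTagger below. A generic sketch of the idiom (LazyWrapper is a placeholder class):

    require 'rjb'

    class LazyWrapper
      def self.init
        # ||= makes init idempotent, so repeated calls are cheap
        @@JFile ||= Rjb::import('java.io.File')
      end

      def initialize
        LazyWrapper.init   # JVM starts here, on first use
      end
    end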
@@ -9,13 +9,15 @@ class Banner < NER
 
   Rbbt.claim Rbbt.software.opt.BANNER, :install, Rbbt.share.install.software.BANNER.find
 
-  @@JFile = Rjb::import('java.io.File')
-  @@SimpleTokenizer = Rjb::import('banner.tokenization.SimpleTokenizer')
-  @@CRFTagger = Rjb::import('banner.tagging.CRFTagger')
-  @@ParenthesisPostProcessor = Rjb::import('banner.processing.ParenthesisPostProcessor')
-  @@HeppleTagger = Rjb::import('dragon.nlp.tool.HeppleTagger')
-  @@Sentence = Rjb::import('banner.Sentence')
-  @@EngLemmatiser = Rjb::import('dragon.nlp.tool.lemmatiser.EngLemmatiser')
+  def self.init
+    @@JFile ||= Rjb::import('java.io.File')
+    @@SimpleTokenizer ||= Rjb::import('banner.tokenization.SimpleTokenizer')
+    @@CRFTagger ||= Rjb::import('banner.tagging.CRFTagger')
+    @@ParenthesisPostProcessor ||= Rjb::import('banner.processing.ParenthesisPostProcessor')
+    @@HeppleTagger ||= Rjb::import('dragon.nlp.tool.HeppleTagger')
+    @@Sentence ||= Rjb::import('banner.Sentence')
+    @@EngLemmatiser ||= Rjb::import('dragon.nlp.tool.lemmatiser.EngLemmatiser')
+  end
 
 
 
@@ -26,6 +28,7 @@ class Banner < NER
               lemmadir = Rbbt.software.opt.BANNER.nlpdata.lemmatiser.find,
               taggerdir = Rbbt.software.opt.BANNER.nlpdata.tagger.find
              )
+    Banner.init
 
     @tokenizer = @@SimpleTokenizer.new
 
@@ -7,11 +7,13 @@ require 'rbbt/util/log'
 class ChemicalTagger < NER
   Rbbt.claim Rbbt.software.opt.ChemicalTagger, :install, Rbbt.share.install.software.ChemicalTagger.find
 
-  Rjb::load(nil, jvmargs = ['-Xms128m','-Xmx2048m'])
-
-  @@RbbtChemicalTagger = Rjb::import('RbbtChemicalTagger')
+  def self.init
+    Rjb::load(nil, jvmargs = ['-Xms1G','-Xmx2G']) unless Rjb.loaded?
+    @@RbbtChemicalTagger ||= Rjb::import('RbbtChemicalTagger')
+  end
 
   def self.match(text, type = nil, memm = false)
+    self.init
 
     return [] if text.nil? or text.strip.empty?
 
@@ -0,0 +1,60 @@
+require 'rbbt/ner/rnorm'
+
+class Finder
+
+  if defined? Entity
+    module Match
+      extend Entity
+
+      self.annotation :format
+      self.annotation :namespace
+      self.annotation :score
+    end
+  end
+
+  class Instance
+    attr_accessor :namespace, :format, :normalizer
+    def initialize(path, open_options = {})
+      if TSV === path
+        @namespace = path.namespace
+        @format = path.key_field
+        @normalizer = Normalizer.new(path)
+      else
+        open_options = Misc.add_defaults open_options, :type => :flat
+        parser = TSV::Parser.new(Open.open(Path === path ? path.find : path), open_options)
+        @namespace = parser.namespace
+        @format = parser.key_field
+        @normalizer = Normalizer.new(Path === path ? path.tsv(open_options) : TSV.open(path, open_options))
+      end
+    end
+
+    def find(name)
+      candidates = @normalizer.match(name)
+      if defined? Finder::Match
+        candidates.collect{|c|
+          Finder::Match.setup(c.dup, @format, @namespace, @normalizer.token_score(c, name))
+        }
+      else
+        candidates
+      end
+    end
+  end
+
+  attr_accessor :instances
+  def initialize(path = nil, open_options = {})
+    @instances ||= []
+    @instances << Finder::Instance.new(path, open_options) unless path.nil?
+  end
+
+  def add_instance(path, open_options = {})
+    @instances << Finder::Instance.new(path, open_options)
+  end
+
+  def find(name)
+    @instances.inject([]) do |acc,instance|
+      acc += instance.find(name)
+    end
+  end
+
+end
+
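Note: this new Finder class wraps one rnorm Normalizer per lexicon and concatenates candidate matches across all registered instances. A minimal usage sketch, assuming flat TSV lexicons whose key field is the entity code (load path and file names are illustrative):

    require 'rbbt/ner/finder'   # illustrative load path

    finder = Finder.new('lexicon.tsv')
    finder.add_instance('synonyms.tsv')

    finder.find("p53").each do |match|
      # when Entity is loaded, each hit is a Finder::Match annotated
      # with :format, :namespace and :score
      puts [match, match.format, match.score] * "\t"
    end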
@@ -0,0 +1,38 @@
+require 'rjb'
+require 'rbbt'
+require 'rbbt/ner/segment/named_entity'
+module Linnaeus
+
+  Rbbt.claim Rbbt.software.opt.Linnaeus, :install, Rbbt.share.install.software.Linnaeus.find
+
+
+
+  ARGS = ["--properties", Rbbt.software.opt.Linnaeus["species-proxy/properties.conf"].find]
+
+  def self.init
+    begin
+      Rjb::load(nil, jvmargs = ['-Xms2G','-Xmx4G']) unless Rjb.loaded?
+      @@ArgParser = Rjb::import('martin.common.ArgParser')
+      @@Args = @@ArgParser.new(ARGS)
+      @@Loggers = Rjb::import('martin.common.Loggers')
+      @@Logger = @@Loggers.getDefaultLogger(@@Args)
+      @@EntityTagger = Rjb::import('uk.ac.man.entitytagger.EntityTagger')
+      @@Matcher = @@EntityTagger.getMatcher(@@Args, @@Logger)
+    rescue
+      if $!.message =~ /heap space/i
+        Log.warn "Heap Space seems too low. Make sure Linnaeus is loaded before other Java wrappers so that it has the chance to init the Java Bridge with sufficient heap space"
+      end
+      raise $!
+    end
+  end
+
+  def self.match(text)
+
+    init unless defined? @@Matcher
+
+    @@Matcher.match(text).toArray().collect do |mention|
+      NamedEntity.setup(mention.text(), mention.start(), "Organism", mention.ids(), mention.probabilities())
+    end
+  end
+end
+
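Note: Linnaeus lazily boots the JVM with a 2-4 GB heap on first match. Since Rjb can only start one JVM per process and later Rjb::load calls are skipped once the bridge is up, whichever wrapper loads first fixes the heap size for all of them; that is what the Log.warn above is about. A usage sketch, assuming the software claim has been installed (load path is illustrative):

    require 'rbbt/ner/linnaeus'   # illustrative load path

    mentions = Linnaeus.match("Homo sapiens and Mus musculus share many genes.")
    mentions.each do |mention|
      # each mention is a NamedEntity carrying text, offset, type and code
      puts [mention, mention.offset, mention.type, mention.code] * "\t"
    end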
@@ -6,24 +6,26 @@ require 'rbbt/ner/segment/token'
 require 'rbbt/ner/NER'
 require 'inline'
 
+
 # This code was adapted from Ashish Tendulkar (ASK MARTIN)
 class NGramPrefixDictionary < NER
   STOP_LETTERS = %w(\' " ( ) { } [ ] - ? ! < ; : > . ,)
-  STOP_LETTER_CHAR_VALUES = STOP_LETTERS.collect{|l| l[0]}
-  class << self
-    inline do |builder|
+  STOP_LETTER_CHAR_VALUES = STOP_LETTERS.collect{|l| l[0]} + ["\n", "\r", " "].collect{|l| l[0]}
+  LETTER_REGEXP = Regexp.compile(/[#{Regexp.quote((STOP_LETTERS + ["\n", "\r", " "]) * "")}]/)
+
+  inline do |builder|
 
-    builder.c_raw <<-EOC
+    builder.c_raw_singleton <<-EOC
 int is_stop_letter(char letter)
 {
 
-  if( letter == ' ' || #{STOP_LETTERS.collect{|l| "letter == '#{l}' "} * "||"} ){ return 1;}
+  if( letter == ' ' || letter == '\\n' || letter == '\\r' || #{STOP_LETTERS.collect{|l| "letter == '#{l}' "} * "||"} ){ return 1;}
 
   return 0;
 }
-EOC
+    EOC
 
-    builder.c <<-EOC
+    builder.c_singleton <<-EOC
 VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
 {
   int length_cmp = RSTRING_LEN(cmp);
@@ -38,48 +40,59 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
 
   return Qfalse;
 }
-EOC
-    end
+    EOC
   end
 
-  def self.process_stream(stream)
+  def self.process_stream(stream, case_insensitive = false)
    index = {}
+
     while line = stream.gets
       names = line.split(/\t|\|/).select{|n| not n.empty?}.compact
       code = names.shift
 
       names.each do |name|
+        name = name.downcase if case_insensitive
         ngram = name[0..2].strip
         index[ngram] ||= []
         index[ngram] << [name, code]
       end
     end
+
     index
-
   end
 
-  def self.process_hash(hash)
+  def self.process_hash(hash, case_insensitive = false)
     index = {}
+
     hash.monitor = true if hash.respond_to? :monitor
     hash.unnamed = true if hash.respond_to? :unnamed
     method = hash.respond_to?(:through)? :through : :each
+
     hash.send(method) do |code, names|
       names.each do |name|
+        name = name.downcase if case_insensitive
         ngram = name[0..2].strip
         index[ngram] ||= []
        index[ngram] << [name, code]
       end
     end
+
     index
   end
 
+
   def self.match(index, text)
+    return [] if text.nil? or text.empty?
+
     matches = []
 
     text_offset = 0
     text_length = text.length
     while (not text_offset.nil?) and text_offset < text_length
-      text_offset += 1 if STOP_LETTER_CHAR_VALUES.include? text[text_offset]
+      if STOP_LETTER_CHAR_VALUES.include? text[text_offset]
+        text_offset += 1
+        next
+      end
       ngram = text[text_offset..text_offset + 2].strip
 
       found = nil
@@ -88,12 +101,12 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
       diff = text_length - text_offset
       # Match with entries
       index[ngram].each do |name, code|
-        if name.length < diff
+        if name.length <= diff
           #if piece.start_with? name and
           #  (text_offset + name.length == text_length or piece[name.length] == " "[0])
 
           if fast_start_with(text, name, text_offset)
-            found = [name, code, text_offset]
+            found = [name.dup, code, text_offset]
             break
           end
         end
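Note: the change from < to <= fixes an off-by-one that skipped mentions flush with the end of the text: when a lexicon name exactly fills the remaining characters, name.length equals diff and the old comparison rejected it. For example:

    text = "induced by ADP"
    name = "ADP"
    diff = text.length - text.index(name)   # => 3 == name.length, now accepted

The name.dup lets the match be annotated (or case-restored) later without mutating the shared lexicon string.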
@@ -101,7 +114,7 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
       end
 
       if found.nil?
-        text_offset = text.index(" ", text_offset)
+        text_offset = text.index(LETTER_REGEXP, text_offset)
         text_offset += 1 unless text_offset.nil?
       else
         matches << found
@@ -112,22 +125,24 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
     matches
   end
 
-  attr_accessor :index, :type
-  def initialize(file, type = nil)
+
+  attr_accessor :index, :type, :case_insensitive
+  def initialize(file, type = nil, case_insensitive = false)
     @type = type
+    @case_insensitive = case_insensitive
     case
     when (TSV === file or Hash === file)
       Log.debug("Ngram Prefix Dictionary. Loading of lexicon hash started.")
-      @index = NGramPrefixDictionary.process_hash(file)
+      @index = NGramPrefixDictionary.process_hash(file, case_insensitive)
     when Path === file
       Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
-      @index = NGramPrefixDictionary.process_stream(file.open)
+      @index = NGramPrefixDictionary.process_stream(file.open, case_insensitive)
     when Misc.is_filename?(file)
       Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
       @index = NGramPrefixDictionary.process_stream(Open.open(file))
     when StreamIO === file
       Log.debug("Ngram Prefix Dictionary. Loading of lexicon stream started.")
-      @index = NGramPrefixDictionary.process_stream(file)
+      @index = NGramPrefixDictionary.process_stream(file, case_insensitive)
     else
       raise "Format of lexicon not understood: #{file.inspect}"
     end
@@ -136,36 +151,15 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
   end
 
   def match(text)
-    NGramPrefixDictionary.match(index, text).collect{|name, code, offset|
+    matches = NGramPrefixDictionary.match(index, (case_insensitive ? text.downcase : text)).collect{|name, code, offset|
       NamedEntity.setup(name, offset, type, code)
     }
-  end
-end
-
-if __FILE__ == $0
-  require 'rbbt/sources/jochem'
-  require 'rbbt/sources/pubmed'
-
-  texts = []
-  index = {}
-
-  texts = PubMed.get_article(PubMed.query("GB-1a", 100)).collect do |pmid, article|
-    article.text
-  end
 
-  texts *= 150/texts.length
-
-  tsv = Rbbt.share.databases.JoChem.lexicon.tsv :flat, :persistence => false, :grep => "GB"
-  #tsv = Rbbt.share.databases.JoChem.lexicon.tsv :flat, :persistence => true
-
-  tsv.unnamed = true
-  ner = NGramPrefixDictionary.new(tsv)
-
-  Misc.benchmark do
-    texts.each do |text|
-      ner.match(text)
+    if case_insensitive
+      matches.each{|m| m.replace(text[m.range])}
+      matches
+    else
+      matches
     end
   end
-
-
 end
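Note: with case_insensitive set, both the lexicon and the query text are downcased for matching, and each hit is then replace-d with the original-cased slice of the input via its range, so reported mentions keep the source capitalization. A sketch using a plain Hash lexicon (the code and names are illustrative):

    lexicon = { "CHEBI:16761" => ["ADP", "adenosine diphosphate"] }
    ner = NGramPrefixDictionary.new(lexicon, "Chemical", true)

    ner.match("Levels of adp rose sharply.").each do |m|
      puts [m, m.offset, m.type, m.code] * "\t"   # mention keeps its original case
    end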