rbbt-text 1.1.7 → 1.1.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 8373a3408f7b9cbc481ef108d899de4283513115
4
- data.tar.gz: e431a61729bd0f67a4129384a06a5b3a81824afc
2
+ SHA256:
3
+ metadata.gz: ea1646b5f32644bb5872f57422534b49955f988df26df4a65c8dda592515eac3
4
+ data.tar.gz: 3f6bc60546b79c76b6b35840712453616c377fcc088f321e95847f116776bef1
5
5
  SHA512:
6
- metadata.gz: 575313a7d598cbec0ec05827bebee1f4ccc8b56ccdac89478beb4993b5188356384172cf762e6ca58f0c0953fae0f29fcd46423d1291391d9958b66ce62230d0
7
- data.tar.gz: 4609a5ce9448f0a0a3ad480bc9f0cc6f3dfc728eacb0c6bf291d8b29b23d084f56a76c8776d421d935e50ef097fb10009d41121ef046b95c85177060335a1629
6
+ metadata.gz: 9376c68bad67733b5771b57ead7c962d45ff29c44362d1c51bf3480d3c3bf9f1f75284e40044fc4ed95bd94a03ab0759b3b7320bf1e3da00a0cdd82255c9395c
7
+ data.tar.gz: cd25a9cd91fde366be195801d45238d555edfc94f2b06391db7db2d9f4781b34dd599514385782d6c7e22af2841c5f3322ba74bf0a3a9c1fdbe308a255f00098
@@ -1,13 +1,13 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'rbbt-util'
4
- require 'rbbt/annotations/corpus'
5
- require 'rbbt/annotations/corpus/pubmed'
6
- require 'rbbt/annotations/relationships/ppi'
4
+ require 'rbbt/corpus/corpus'
5
+ require 'rbbt/corpus/sources/pubmed'
6
+ #require 'rbbt/annotations/relationships/ppi'
7
7
  require 'rbbt/sources/pubmed'
8
- require 'rbbt/ner/annotations'
8
+ #require 'rbbt/ner/annotations'
9
9
  require 'rbbt/ner/token_trieNER'
10
- require 'rbbt/ner/annotations/transformed'
10
+ #require 'rbbt/ner/annotations/transformed'
11
11
  require 'rbbt/ner/chemical_tagger'
12
12
 
13
13
  Corpus.define_entity_ner "Compounds", false do |doc|
@@ -182,7 +182,4 @@ class Dictionary::KL
182
182
  def weights(options = {})
183
183
  best(options)
184
184
  end
185
-
186
-
187
-
188
185
  end
@@ -8,10 +8,10 @@ require 'json'
8
8
 
9
9
  class Document
10
10
 
11
- attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indeces, :persist_dir, :global_persistence
11
+ attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indices, :persist_dir, :global_persistence
12
12
  def initialize(persist_dir = nil, docid = nil, text = nil, global_persistence = nil)
13
13
  @segments = {}
14
- @segment_indeces = {}
14
+ @segment_indices = {}
15
15
 
16
16
  if not persist_dir.nil?
17
17
  @persist_dir = persist_dir
@@ -236,7 +236,7 @@ class Document
236
236
  end
237
237
 
238
238
  def segment_index(name, persist_dir = nil)
239
- @segment_indeces[name] ||= Segment.index(self.send(name), persist_dir.nil? ? :memory : File.join(persist_dir, name + '.range'))
239
+ @segment_indices[name] ||= Segment.index(self.send(name), persist_dir.nil? ? :memory : File.join(persist_dir, name + '.range'))
240
240
  end
241
241
 
242
242
  def load_into(segment, *annotations)
@@ -10,8 +10,9 @@ class Corpus
10
10
  type = nil if String === type and type.empty?
11
11
 
12
12
  PubMed.get_article(pmids).collect do |pmid, article|
13
+ add_document(article.title, :pubmed, pmid, :title)
13
14
  if (type.nil? and article.pdf_url.nil?) or (not type.nil? and type.to_sym === :abstract)
14
- add_document(article.text, :pubmed, pmid, :abstract)
15
+ add_document(article.abstract || "", :pubmed, pmid, :abstract)
15
16
  else
16
17
  raise "No FullText available for #{ pmid }" if article.pdf_url.nil?
17
18
  add_document(article.full_text, :pubmed, pmid, :fulltext)
@@ -11,6 +11,7 @@ class Abner < NER
11
11
  Rbbt.claim Rbbt.software.opt.ABNER, :install, Rbbt.share.install.software.ABNER.find
12
12
 
13
13
  def self.init
14
+ Rbbt.software.opt.ABNER.produce
14
15
  @@JFile ||= Rjb::import('java.io.File')
15
16
  @@Tagger ||= Rjb::import('abner.Tagger')
16
17
  @@Trainer ||= Rjb::import('abner.Trainer')
@@ -10,6 +10,7 @@ class Banner < NER
10
10
  Rbbt.claim Rbbt.software.opt.BANNER, :install, Rbbt.share.install.software.BANNER.find
11
11
 
12
12
  def self.init
13
+ Rbbt.software.opt.BANNER.produce
13
14
  @@JFile ||= Rjb::import('java.io.File')
14
15
  @@SimpleTokenizer ||= Rjb::import('banner.tokenization.SimpleTokenizer')
15
16
  @@CRFTagger ||= Rjb::import('banner.tagging.CRFTagger')
@@ -0,0 +1,30 @@
1
+ require 'rbbt/ner/segment/named_entity'
2
+ require 'rbbt/ner/segment/relationship'
3
+ module Brat
4
+ Rbbt.claim Rbbt.software.opt.Brat, :install, "https://github.com/nlplab/brat.git"
5
+
6
+ def self.load(file)
7
+ entities = {}
8
+ relationships = {}
9
+ entity_ids = {}
10
+ TSV.traverse file, :type => :array do |line|
11
+ id, info, literal = line.split("\t")
12
+ case id[0]
13
+ when "T"
14
+ type, start, eend = info.split(" ")
15
+ entities[id] = NamedEntity.setup(literal, :offset => start.to_i, :type => type)
16
+ when "#"
17
+ type, id = info.split(" ")
18
+ entities[id].code = literal unless entities[id].nil?
19
+ when "R"
20
+ type, *args = info.split(" ")
21
+ tf, tg = args.collect{|e| e.split(":").last }
22
+ tf = entities[tf]
23
+ tg = entities[tg]
24
+ relationship = Relationship.setup([tf,tg] * "~" + "#" + type, :terms => [tf,tg], :type => type)
25
+ relationships[id] = relationship
26
+ end
27
+ end
28
+ [entities.values, relationships.values]
29
+ end
30
+ end
@@ -0,0 +1,80 @@
1
+ require 'rbbt-util'
2
+ module GNormPlus
3
+
4
+ Rbbt.claim Rbbt.software.opt.GNormPlus, :install do
5
+ url = "https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/download/GNormPlus/GNormPlusJava.zip"
6
+ script =<<-EOF
7
+ (cd $(opt_dir $name); sh Installation.sh; chmod +x Ab3P identify_abbr)
8
+ EOF
9
+ {:src => url, :commands => script}
10
+ end
11
+
12
+ CONFIG =<<-EOF
13
+
14
+ #===Annotation
15
+ #Attribution setting:
16
+ #FocusSpecies = Taxonomy ID
17
+ # All: All species
18
+ # 9606: Human
19
+ # 4932: yeast
20
+ # 7227: Fly
21
+ # 10090: Mouse
22
+ # 10116: Rat
23
+ # 7955: Zebrafish
24
+ # 3702: Arabidopsis thaliana
25
+ #open: True
26
+ #close: False
27
+
28
+ [Focus Species]
29
+ FocusSpecies = All
30
+ [Dictionary & Model]
31
+ DictionaryFolder = ./Dictionary
32
+ GNRModel = ./Dictionary/GNR.Model
33
+ SCModel = ./Dictionary/SimConcept.Model
34
+ GeneIDMatch = True
35
+ Normalization2Protein = False
36
+ DeleteTmp = True
37
+ EOF
38
+
39
+ def self.process(texts)
40
+ TmpFile.with_file do |tmpdir|
41
+ Open.mkdir tmpdir
42
+ Misc.in_dir tmpdir do
43
+ Open.ln_s Rbbt.software.opt.GNormPlus.Dictionary.find, '.'
44
+ Open.ln_s Rbbt.software.opt.GNormPlus["BioC.dtd"].find, '.'
45
+ Open.ln_s Rbbt.software.opt.GNormPlus["Ab3P"].find, '.'
46
+ Open.ln_s Rbbt.software.opt.GNormPlus["CRF"].find, '.'
47
+ Open.mkdir 'input'
48
+ Open.mkdir 'output'
49
+ Open.mkdir 'tmp'
50
+
51
+ texts.each do |name,text|
52
+ Open.write("input/#{name}.txt") do |f|
53
+ f.puts "#{name}|a|" << text
54
+ f.puts
55
+ end
56
+ end
57
+ Open.write('config', CONFIG)
58
+ CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.find}/GNormPlus.jar' 'input' 'output' 'config'")
59
+
60
+ if texts.respond_to? :key_field
61
+ key_field = texts.key_field
62
+ else
63
+ key_field = "ID"
64
+ end
65
+ tsv = TSV.setup({}, :key_field => key_field, :fields => ["Entities"], :type => :flat)
66
+ Dir.glob("output/*.txt").each do |file|
67
+ name = File.basename(file).sub(".txt",'')
68
+ entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '.').split("\t")[1..-1] * ":"}
69
+ tsv[name] = entities
70
+ end
71
+ tsv
72
+ end
73
+ end
74
+ end
75
+ end
76
+
77
+ if __FILE__ == $0
78
+ Log.severity = 0
79
+ Rbbt.software.opt.GNormPlus.produce
80
+ end
@@ -8,8 +8,8 @@ module Linnaeus
8
8
 
9
9
  ARGS = ["--properties", Rbbt.software.opt.Linnaeus["species-proxy/properties.conf"].find]
10
10
 
11
- Rjb::load(nil, jvmargs = ['-Xms2G','-Xmx2G']) unless Rjb.loaded?
12
11
 
12
+ Rjb::load(nil, jvmargs = ['-Xms2G','-Xmx2G']) unless Rjb.loaded?
13
13
  def self.init
14
14
  begin
15
15
  @@ArgParser = Rjb::import('martin.common.ArgParser')
@@ -72,6 +72,17 @@ module Segment
72
72
  (segment.offset.to_i + segment.segment_length.to_i <= self.offset.to_i + self.segment_length.to_i)
73
73
  end
74
74
 
75
+ def overlaps?(segment)
76
+ segment.offset.to_i >= self.offset.to_i && segment.offset.to_i <= self.end ||
77
+ self.offset.to_i >= segment.offset.to_i && self.offset.to_i <= segment.end
78
+ end
79
+
80
+ def self.collisions(main, secondary)
81
+ collisions = secondary.select do |ss|
82
+ collisions = main.select{|ms| ms.overlaps? ss }.any?
83
+ end
84
+ end
85
+
75
86
  #{{{ Sorting
76
87
 
77
88
  def self.sort(segments, inline = true)
@@ -84,14 +95,14 @@ module Segment
84
95
  -1
85
96
  when (b.nil? or b.offset.nil?)
86
97
  +1
87
- when (not a.range.include? b.offset and not b.range.include? a.offset)
88
- a.offset <=> b.offset
98
+ when (not a.range.include? b.offset.to_i and not b.range.include? a.offset.to_i)
99
+ a.offset.to_i <=> b.offset.to_i
89
100
  else
90
101
  a.segment_length <=> b.segment_length
91
102
  end
92
103
  end
93
104
  else
94
- segments.sort_by do |segment| segment.offset || 0 end.reverse
105
+ segments.sort_by do |segment| segment.offset.to_i || 0 end.reverse
95
106
  end
96
107
  end
97
108
 
@@ -282,7 +293,7 @@ module Segment
282
293
 
283
294
  info[:annotation_types] = [Segment] unless info.include? :annotation_types
284
295
 
285
- Annotated.load(object, info)
296
+ Annotated.load_entity(object, info)
286
297
  end
287
298
 
288
299
  def self.set_tsv_fields(fields, segments)
@@ -324,5 +335,16 @@ module Segment
324
335
  end
325
336
  end
326
337
 
338
+ def ansi(color)
339
+ Log.color color, self
340
+ end
341
+
342
+ def locus
343
+ [offset, self.end] * ".."
344
+ end
345
+
346
+ def ==(other)
347
+ self.id == other.id
348
+ end
327
349
  end
328
350
 
@@ -41,5 +41,6 @@ Score: #{score.inspect}
41
41
 
42
42
  entity
43
43
  end
44
+
44
45
  end
45
46
 
@@ -2,19 +2,23 @@ require 'rbbt/ner/segment'
2
2
 
3
3
  module Relationship
4
4
  extend Annotation
5
- include Segment
5
+ self.annotation :segment
6
6
  self.annotation :terms
7
+ self.annotation :type
8
+
9
+ def text
10
+ if segment
11
+ segment
12
+ else
13
+ type + ": " + terms * ", "
14
+ end
15
+ end
7
16
 
8
17
  def html
9
18
  text = <<-EOF
10
19
  <span class='Relationship'\
11
- >#{ self }</span>
20
+ >#{ self.text }</span>
12
21
  EOF
13
22
  text.chomp
14
23
  end
15
-
16
- def html_with_entities(*types)
17
- annotations.values_at(*types).each do |segments|
18
- end
19
- end
20
24
  end
@@ -6,7 +6,7 @@ module Transformed
6
6
  def self.transform(text, segments, replacement = nil, &block)
7
7
 
8
8
  text.extend Transformed
9
- text.replace(segments, replacement, &block)
9
+ text.replace_segments(segments, replacement, &block)
10
10
 
11
11
  text
12
12
  end
@@ -14,11 +14,11 @@ module Transformed
14
14
  def self.with_transform(text, segments, replacement = nil)
15
15
 
16
16
  text.extend Transformed
17
- text.replace(segments, replacement)
17
+ text.replace_segments(segments, replacement)
18
18
 
19
19
  segments = yield text
20
20
 
21
- segments = nil unless Array === segments
21
+ segments = nil unless Array === segments && Segment === segments.first
22
22
 
23
23
  text.restore(segments, true)
24
24
  end
@@ -59,39 +59,41 @@ module Transformed
59
59
  [begin_shift, end_shift]
60
60
  end
61
61
 
62
- def self.sort(segments)
63
- segments.compact.sort do |a,b|
64
- case
65
- when ((a.nil? and b.nil?) or (a.offset.nil? and b.offset.nil?))
66
- 0
67
- when (a.nil? or a.offset.nil?)
68
- -1
69
- when (b.nil? or b.offset.nil?)
70
- +1
71
- # Non-overlap
72
- when (a.end < b.offset.to_i or b.end < a.offset.to_i)
73
- b.offset <=> a.offset
74
- # b includes a
75
- when (a.offset.to_i >= b.offset.to_i and a.end <= b.end)
76
- -1
77
- # b includes a
78
- when (b.offset.to_i >= a.offset.to_i and b.end <= a.end)
79
- +1
80
- # Overlap
81
- when (a.offset.to_i > b.offset.to_i and a.end > b.end or b.offset.to_i < a.offset.to_i and b.end > a.end)
82
- a.length <=> b.length
83
- else
84
- raise "Unexpected case in sort: #{a.range} - #{b.range}"
85
- end
86
- end
87
- end
88
-
89
- def replace(segments, replacement = nil, &block)
62
+ #def self.sort(segments)
63
+ # segments.compact.sort do |a,b|
64
+ # case
65
+ # when ((a.nil? && b.nil?) || (a.offset.nil? && b.offset.nil?))
66
+ # 0
67
+ # when (a.nil? || a.offset.nil?)
68
+ # -1
69
+ # when (b.nil? || b.offset.nil?)
70
+ # +1
71
+ # # Non-overlap
72
+ # when (a.end < b.offset.to_i || b.end < a.offset.to_i)
73
+ # b.offset <=> a.offset
74
+ # # b includes a
75
+ # when (a.offset.to_i >= b.offset.to_i && a.end <= b.end)
76
+ # -1
77
+ # # b includes a
78
+ # when (b.offset.to_i >= a.offset.to_i && b.end <= a.end)
79
+ # +1
80
+ # # Overlap
81
+ # when (a.offset.to_i > b.offset.to_i && a.end > b.end || b.offset.to_i > a.offset.to_i && b.end > a.end)
82
+ # b.length <=> a.length
83
+ # else
84
+ # raise "Unexpected case in sort: #{a.range} - #{b.range}"
85
+ # end
86
+ # end
87
+ #end
88
+
89
+ def replace_segments(segments, replacement = nil, &block)
90
90
  @transformed_segments ||= {}
91
91
  @transformation_stack ||= []
92
92
  stack = []
93
93
 
94
- Transformed.sort(segments).each do |segment|
94
+ segments = [segments] unless Array === segments
95
+ orig_length = self.length
96
+ Segment.sort(segments).each do |segment|
95
97
  next if segment.offset.nil?
96
98
  shift = shift segment.range
97
99
 
@@ -106,6 +108,10 @@ module Transformed
106
108
  updated_range = (updated_begin..updated_end)
107
109
 
108
110
  updated_text = self[updated_begin..updated_end]
111
+ if updated_text.nil?
112
+ Log.warn "Range outside of segment: #{self.length} #{segment.locus} (#{updated_range})"
113
+ next
114
+ end
109
115
 
110
116
  original_text = segment.dup
111
117
  segment.replace updated_text
@@ -137,7 +143,7 @@ module Transformed
137
143
  when segment.end < range.begin
138
144
  # After
139
145
  when segment.offset.to_i > range.end + diff
140
- segment.offset.to_i -= diff
146
+ segment.offset = segment.offset.to_i - diff
141
147
  # Includes
142
148
  when (segment.offset.to_i <= range.begin and segment.end >= range.end + diff)
143
149
  segment.replace self[segment.offset.to_i..segment.end - diff]
@@ -170,4 +176,9 @@ module Transformed
170
176
  segments
171
177
  end
172
178
  end
179
+
180
+ def self.ansi(text, entities, colors = nil)
181
+
182
+
183
+ end
173
184
  end
@@ -1,6 +1,8 @@
1
1
  require 'rbbt/nlp/nlp'
2
2
  require 'rbbt/ner/segment'
3
3
  module NLP
4
+ Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
5
+
4
6
  def self.returnFeatures(prevWord, delimiter, nextWord)
5
7
  if nextWord.match(/__ss__/)
6
8
  nw = nextWord.sub(/__ss__/, "")
@@ -235,4 +237,67 @@ module NLP
235
237
  end
236
238
 
237
239
  end
240
+
241
+ def self.geniass_sentence_splitter(text)
242
+ offsets = []
243
+
244
+ cleaned = text.gsub("\n",NEW_LINE_MASK)
245
+ TmpFile.with_file(cleaned) do |fin|
246
+ TmpFile.with_file do |fout|
247
+ CMD.cmd("cd #{Rbbt.software.opt.Geniass.find}; ./geniass #{ fin } #{ fout }")
248
+
249
+
250
+ Open.write(fin, Open.read(fin).gsub(NEW_LINE_MASK, "\n"))
251
+ Open.write(fout, Open.read(fout).gsub("\n", '|').gsub(NEW_LINE_MASK, "\n"))
252
+ # Addapted from sentence2standOff.rb in Geniass package
253
+
254
+ inTxtStrict = Open.open(fin)
255
+ inTxtNew = Open.open(fout)
256
+
257
+ marker = "|"[0]
258
+ position = 0
259
+ sentenceCount = 1
260
+ target = ''
261
+ targetNew = ''
262
+ start = 0
263
+ finish = 0
264
+
265
+ while(!inTxtNew.eof?) do
266
+ targetNew = inTxtNew.getc
267
+ target = inTxtStrict.getc
268
+ position += 1
269
+ if targetNew == marker
270
+ sentenceCount += 1
271
+ finish = position - 1
272
+ offsets << [start, finish] if finish - start > 10
273
+ if targetNew == target
274
+ start = position
275
+ else
276
+ targetNew = inTxtNew.getc
277
+ while targetNew != target do
278
+ target = inTxtStrict.getc
279
+ position += 1
280
+ end
281
+ start = position - 1
282
+ end
283
+ end
284
+ end
285
+
286
+ finish = position - 1
287
+ offsets << [start, finish] if finish > start
288
+
289
+ inTxtStrict.close
290
+ inTxtNew.close
291
+ end
292
+ end
293
+
294
+ offsets.collect do |s,e|
295
+ sentence = text[s..e]
296
+ next if sentence.nil?
297
+ #sentence.gsub!(NEW_LINE_MASK, "\n")
298
+ Segment.setup sentence, s
299
+ sentence
300
+ end
301
+ end
302
+
238
303
  end
@@ -16,76 +16,10 @@ module NLP
16
16
  #Rbbt.software.opt.StanfordParser.define_as_install Rbbt.share.install.software.StanfordParser.find
17
17
  #Rbbt.software.opt.StanfordParser.produce
18
18
 
19
- Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
20
- Rbbt.software.opt.Geniass.produce
21
-
22
19
  Rbbt.claim Rbbt.software.opt.Gdep, :install, Rbbt.share.install.software.Gdep.find
23
- Rbbt.software.opt.Gdep.produce
24
20
 
25
21
  NEW_LINE_MASK = "\t\t \t \t"
26
22
 
27
- def self.geniass_sentence_splitter(text)
28
- offsets = []
29
-
30
- cleaned = text.gsub("\n",NEW_LINE_MASK)
31
- TmpFile.with_file(cleaned) do |fin|
32
- TmpFile.with_file do |fout|
33
- CMD.cmd("cd #{Rbbt.software.opt.Geniass.find}; ./geniass #{ fin } #{ fout }")
34
-
35
-
36
- Open.write(fin, Open.read(fin).gsub(NEW_LINE_MASK, "\n"))
37
- Open.write(fout, Open.read(fout).gsub("\n", '|').gsub(NEW_LINE_MASK, "\n"))
38
- # Addapted from sentence2standOff.rb in Geniass package
39
-
40
- inTxtStrict = Open.open(fin)
41
- inTxtNew = Open.open(fout)
42
-
43
- marker = "|"[0]
44
- position = 0
45
- sentenceCount = 1
46
- target = ''
47
- targetNew = ''
48
- start = 0
49
- finish = 0
50
-
51
- while(!inTxtNew.eof?) do
52
- targetNew = inTxtNew.getc
53
- target = inTxtStrict.getc
54
- position += 1
55
- if targetNew == marker
56
- sentenceCount += 1
57
- finish = position - 1
58
- offsets << [start, finish] if finish - start > 10
59
- if targetNew == target
60
- start = position
61
- else
62
- targetNew = inTxtNew.getc
63
- while targetNew != target do
64
- target = inTxtStrict.getc
65
- position += 1
66
- end
67
- start = position - 1
68
- end
69
- end
70
- end
71
-
72
- finish = position - 1
73
- offsets << [start, finish] if finish > start
74
-
75
- inTxtStrict.close
76
- inTxtNew.close
77
- end
78
- end
79
-
80
- offsets.collect do |s,e|
81
- sentence = text[s..e]
82
- next if sentence.nil?
83
- #sentence.gsub!(NEW_LINE_MASK, "\n")
84
- Segment.setup sentence, s
85
- sentence
86
- end
87
- end
88
-
89
23
  module GdepToken
90
24
  extend Annotation
91
25
  include Segment
@@ -219,3 +153,8 @@ module NLP
219
153
  end
220
154
  end
221
155
  end
156
+
157
+ if __FILE__ == $0
158
+ Log.severity = 0
159
+ Rbbt.software.opt.Gdep.produce
160
+ end
@@ -6,16 +6,20 @@ require 'rbbt/resource'
6
6
  module OpenNLP
7
7
  Rbbt.claim Rbbt.software.opt.OpenNLP, :install, Rbbt.share.install.software.OpenNLP.find
8
8
 
9
+
9
10
  Rbbt.claim Rbbt.software.opt.OpenNLP.models["da-sent.bin"], :url, "http://opennlp.sourceforge.net/models-1.5/de-sent.bin"
10
11
 
11
12
  MAX = 5
12
13
 
13
- @@FileInputStream = Rjb::import('java.io.FileInputStream')
14
- @@SentenceModel = Rjb::import('opennlp.tools.sentdetect.SentenceModel')
15
- @@SentenceDetectorME = Rjb::import('opennlp.tools.sentdetect.SentenceDetectorME')
16
-
17
14
  def self.sentence_split_detector
18
15
  @@sentence_split_detector ||= begin
16
+ Rbbt.software.opt.OpenNLP.produce
17
+ Rbbt.software.opt.OpenNLP.models["da-sent.bin"].produce
18
+
19
+ @@FileInputStream = Rjb::import('java.io.FileInputStream')
20
+ @@SentenceModel = Rjb::import('opennlp.tools.sentdetect.SentenceModel')
21
+ @@SentenceDetectorME = Rjb::import('opennlp.tools.sentdetect.SentenceDetectorME')
22
+
19
23
  modelIn = @@FileInputStream.new(Rbbt.software.opt.OpenNLP.models["da-sent.bin"].produce.find);
20
24
 
21
25
  model = @@SentenceModel.new(modelIn);
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+
3
+ name='GNormPlus'
4
+ url="https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/download/GNormPlus/GNormPlusJava.zip"
5
+
6
+ prepare_pkg "$name" "$url"
7
+
@@ -5,7 +5,7 @@ RBBT_SOFTWARE_DIR="$2"
5
5
  source "$INSTALL_HELPER_FILE"
6
6
 
7
7
  name='Gdep'
8
- url="http://people.ict.usc.edu/~sagae/parser/gdep/gdep-beta2.tgz"
8
+ url="http://www.sagae.org/gdep/gdep-beta2.tgz"
9
9
 
10
10
 
11
11
  get_pkg "$name" "$url"
@@ -1,7 +1,7 @@
1
1
  #!/bin/bash
2
2
 
3
3
  name='OpenNLP'
4
- url="http://apache.rediris.es//opennlp/opennlp-1.5.3/apache-opennlp-1.5.3-bin.tar.gz"
4
+ url="http://apache.rediris.es/opennlp/opennlp-1.9.1/apache-opennlp-1.9.1-bin.tar.gz"
5
5
 
6
6
  get_src "$name" "$url"
7
7
  move_opt "$name"
@@ -6,7 +6,7 @@ class TestClass < Test::Unit::TestCase
6
6
  def test_info
7
7
  a = ["test"]
8
8
  NamedEntity.setup a
9
- assert(! a.info.keys.include?(:code))
9
+ assert(a.info[:code].nil?)
10
10
  a.code = 10
11
11
  a.offset = 100
12
12
  assert a.info.include? :code
@@ -26,4 +26,27 @@ class TestClass < Test::Unit::TestCase
26
26
  assert Segment.tsv([a], nil).fields.include? "code"
27
27
  assert Segment.tsv([a], "literal").fields.include? "code"
28
28
  end
29
+
30
+ def test_segment_brat
31
+ a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
32
+
33
+ gene1 = "TP53"
34
+ gene1.extend NamedEntity
35
+ gene1.offset = a.index gene1
36
+ gene1.type = "Gene"
37
+
38
+ gene2 = "CDK5R1"
39
+ gene2.extend NamedEntity
40
+ gene2.offset = a.index gene2
41
+ gene2.type = "Gene"
42
+
43
+ gene3 = "TP53 gene"
44
+ gene3.extend NamedEntity
45
+ gene3.offset = a.index gene3
46
+ gene3.type = "Gene"
47
+
48
+ segments = [gene1, gene2, gene3]
49
+ assert segments.collect{|s| s.to_brat}.include? "Gene 27 35"
50
+
51
+ end
29
52
  end
@@ -2,10 +2,23 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_he
2
2
  require 'rbbt/ner/segment/transformed'
3
3
  require 'rbbt/ner/segment/named_entity'
4
4
  require 'rexml/document'
5
- require 'rand'
6
5
 
7
6
  class TestClass < Test::Unit::TestCase
8
- def tttest_transform
7
+ def test_sort
8
+ text = <<-EOF
9
+ More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
10
+ EOF
11
+
12
+ entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].reverse.collect do |literal|
13
+ NamedEntity.setup(literal, :offset => text.index(literal))
14
+ end
15
+
16
+ Transformed.with_transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" }) do
17
+ assert text.include? "such as [IL-2]"
18
+ end
19
+ end
20
+
21
+ def ___test_transform
9
22
  a = "This sentence mentions the TP53 gene and the CDK5 protein"
10
23
  original = a.dup
11
24
 
@@ -27,6 +40,8 @@ class TestClass < Test::Unit::TestCase
27
40
  c[gene1.range] = "GN"
28
41
  assert_equal c, Transformed.transform(a,[gene1], "GN")
29
42
 
43
+ iii a.transformation_offset_differences
44
+ raise
30
45
  assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
31
46
  assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
32
47
 
@@ -216,5 +231,60 @@ class TestClass < Test::Unit::TestCase
216
231
  end
217
232
  end
218
233
  end
234
+
235
+ def test_nested_transform
236
+ a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
237
+
238
+ gene1 = "TP53"
239
+ gene1.extend NamedEntity
240
+ gene1.offset = a.index gene1
241
+ gene1.type = "Gene"
242
+
243
+ gene2 = "CDK5R1"
244
+ gene2.extend NamedEntity
245
+ gene2.offset = a.index gene2
246
+ gene2.type = "Protein"
247
+
248
+ Transformed.with_transform(a, [gene1,gene2], "[G]") do
249
+ assert_equal "This sentence mentions the [G] gene and the [G] protein", a
250
+ end
251
+ Transformed.with_transform(a, [gene1], "[G1]") do
252
+ Transformed.with_transform(a, [gene2], "[G2]") do
253
+ assert_equal "This sentence mentions the [G1] gene and the [G2] protein", a
254
+ end
255
+ end
256
+ Transformed.with_transform(a, [gene2], "[G2]") do
257
+ Transformed.with_transform(a, [gene1], "[G1]") do
258
+ assert_equal "This sentence mentions the [G1] gene and the [G2] protein", a
259
+ end
260
+ end
261
+ end
262
+
263
+ def test_offset_transform
264
+ a = "ILF can bind to purine-rich regulatory motifs such as the human T-cell leukemia virus-long terminal region and the interleukin-2 promoter."
265
+
266
+ gene1 = "ILF"
267
+ gene1.extend NamedEntity
268
+ gene1.offset = a.index gene1
269
+ gene1.type = "Gene"
270
+
271
+ gene2 = "interleukin-2"
272
+ gene2.extend NamedEntity
273
+ gene2.offset = a.index gene2
274
+ gene2.type = "Protein"
275
+
276
+ Transformed.with_transform(a, [gene1,gene2], "[G]") do
277
+ assert_equal "[G] can bind to purine-rich regulatory motifs such as the human T-cell leukemia virus-long terminal region and the [G] promoter.", a
278
+ end
279
+
280
+ offset = 100
281
+ a = Segment.setup(a, :offset => offset)
282
+ gene1.offset += offset
283
+ gene2.offset += offset
284
+ Transformed.with_transform(a, [gene1,gene2], "[G]") do
285
+ assert_equal "[G] can bind to purine-rich regulatory motifs such as the human T-cell leukemia virus-long terminal region and the [G] promoter.", a
286
+ end
287
+
288
+ end
219
289
  end
220
290
 
@@ -0,0 +1,64 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/ner/brat'
3
+
4
+ class TestBrat < Test::Unit::TestCase
5
+ def test_load
6
+ text =<<-EOF
7
+ T2 DBTF 52 55 Nrl
8
+ #2 AnnotatorNotes T2 4901
9
+ T3 NONDBTF 80 89 rhodopsin
10
+ #3 AnnotatorNotes T3 6010
11
+ T4 BIOLOGICALPROCESS 90 105 gene expression
12
+ #4 AnnotatorNotes T4 -
13
+ T5 DBTF 127 130 Nrl
14
+ #5 AnnotatorNotes T5 4901
15
+ T7 MOLECULARFUNCTION 197 204 binding
16
+ #7 AnnotatorNotes T7 -
17
+ T8 PHENOTYPE 241 252 extended AP
18
+ #8 AnnotatorNotes T8 -
19
+ T10 DBTF 331 334 Nrl
20
+ #10 AnnotatorNotes T10 4901
21
+ T11 TISSUE 381 399 photoreceptor cell
22
+ #11 AnnotatorNotes T11 -
23
+ T12 NONDBTF 414 423 rhodopsin
24
+ #12 AnnotatorNotes T12 6010
25
+ T13 CELLULARCOMPONENT 494 501 nuclear
26
+ #13 AnnotatorNotes T13 -
27
+ T14 TISSUE 548 572 retinoblastoma cell line
28
+ #14 AnnotatorNotes T14 -
29
+ T17 NONDBTF 660 669 rhodopsin
30
+ #17 AnnotatorNotes T17 6010
31
+ T18 DBTF 676 679 Nrl
32
+ #18 AnnotatorNotes T18 4901
33
+ T19 CELLULARCOMPONENT 749 764 protein complex
34
+ #19 AnnotatorNotes T19 -
35
+ T20 DBTF 797 800 Nrl
36
+ #20 AnnotatorNotes T20 4901
37
+ T21 DBTF 853 856 Nrl
38
+ #21 AnnotatorNotes T21 4901
39
+ T22 MOLECULARFUNCTION 882 892 luciferase
40
+ #22 AnnotatorNotes T22 -
41
+ T23 DBTF 943 946 Nrl
42
+ #23 AnnotatorNotes T23 4901
43
+ T24 NONDBTF 989 998 rhodopsin
44
+ #24 AnnotatorNotes T24 6010
45
+ T26 DBTF 1110 1113 Nrl
46
+ #26 AnnotatorNotes T26 4901
47
+ T27 DBTF 1224 1227 Nrl
48
+ #27 AnnotatorNotes T27 4901
49
+ T28 DBTF 1271 1274 Nrl
50
+ #28 AnnotatorNotes T28 4901
51
+ T30 DBTF 1385 1388 Nrl
52
+ #30 AnnotatorNotes T30 4901
53
+ R1 ACTIVATION Arg1:T2 Arg2:T3
54
+ R2 ACTIVATION Arg1:T10 Arg2:T12
55
+ R3 ACTIVATION Arg1:T23 Arg2:T24
56
+ T1 DBTF 250 254 AP-1
57
+ EOF
58
+
59
+ io = StringIO.new text
60
+ iii Brat.load io
61
+
62
+ end
63
+ end
64
+
@@ -0,0 +1,16 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/ner/g_norm_plus'
3
+
4
+ Log.severity = 0
5
+ class TestGNormPlus < Test::Unit::TestCase
6
+ def test_match
7
+ text =<<-EOF
8
+ We found that TP53 is regulated by MDM2 in Homo sapiens
9
+ EOF
10
+
11
+
12
+ mentions = GNormPlus.process({:file => text})
13
+ Log.tsv mentions
14
+ end
15
+ end
16
+
@@ -96,6 +96,5 @@ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of
96
96
  assert_equal %w(CDK5R1), index[gene2.offset + 1]
97
97
  end
98
98
  end
99
-
100
99
  end
101
100
 
@@ -0,0 +1,9 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
+ require 'rbbt/nlp/genia/sentence_splitter'
3
+
4
+ class TestClass < Test::Unit::TestCase
5
+ def test_true
6
+ assert true
7
+ end
8
+ end
9
+
@@ -29,11 +29,14 @@ sentence. This is
29
29
  another sentence.
30
30
  EOF
31
31
 
32
+ iii OpenNLP.sentence_split_detector.sentDetect(text)
33
+ assert_equal 5, OpenNLP.sentence_split_detector.sentDetect(text).length
34
+
32
35
  assert_equal 5, OpenNLP.sentence_splitter(text).length
33
36
  assert_equal "This is a \nsentence.", OpenNLP.sentence_splitter(text)[3]
34
37
  end
35
38
 
36
- def test_text_sentences
39
+ def _test_text_sentences
37
40
  Misc.benchmark(100) do
38
41
  OpenNLP.sentence_splitter($text).include? "Our
39
42
  findings highlight the role of SMARCA4 in the pathogenesis of SMARCB1-positive
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.7
4
+ version: 1.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-05-04 00:00:00.000000000 Z
11
+ date: 2020-01-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -66,20 +66,6 @@ dependencies:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
- - !ruby/object:Gem::Dependency
70
- name: rjb
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - ">="
74
- - !ruby/object:Gem::Version
75
- version: '0'
76
- type: :runtime
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - ">="
81
- - !ruby/object:Gem::Version
82
- version: '0'
83
69
  description: 'Text mining tools: named entity recognition and normalization, document
84
70
  classification, bag-of-words, dictionaries, etc'
85
71
  email: miguel.vazquez@fdi.ucm.es
@@ -100,8 +86,10 @@ files:
100
86
  - lib/rbbt/ner/NER.rb
101
87
  - lib/rbbt/ner/abner.rb
102
88
  - lib/rbbt/ner/banner.rb
89
+ - lib/rbbt/ner/brat.rb
103
90
  - lib/rbbt/ner/chemical_tagger.rb
104
91
  - lib/rbbt/ner/finder.rb
92
+ - lib/rbbt/ner/g_norm_plus.rb
105
93
  - lib/rbbt/ner/linnaeus.rb
106
94
  - lib/rbbt/ner/ngram_prefix_dictionary.rb
107
95
  - lib/rbbt/ner/oscar3.rb
@@ -125,6 +113,7 @@ files:
125
113
  - share/install/software/ABNER
126
114
  - share/install/software/BANNER
127
115
  - share/install/software/ChemicalTagger
116
+ - share/install/software/GNormPlus
128
117
  - share/install/software/Gdep
129
118
  - share/install/software/Geniass
130
119
  - share/install/software/Linnaeus
@@ -141,13 +130,16 @@ files:
141
130
  - test/rbbt/bow/test_misc.rb
142
131
  - test/rbbt/entity/test_document.rb
143
132
  - test/rbbt/ner/segment/test_named_entity.rb
133
+ - test/rbbt/ner/segment/test_relationship.rb
144
134
  - test/rbbt/ner/segment/test_segmented.rb
145
135
  - test/rbbt/ner/segment/test_transformed.rb
146
136
  - test/rbbt/ner/test_NER.rb
147
137
  - test/rbbt/ner/test_abner.rb
148
138
  - test/rbbt/ner/test_banner.rb
139
+ - test/rbbt/ner/test_brat.rb
149
140
  - test/rbbt/ner/test_chemical_tagger.rb
150
141
  - test/rbbt/ner/test_finder.rb
142
+ - test/rbbt/ner/test_g_norm_plus.rb
151
143
  - test/rbbt/ner/test_linnaeus.rb
152
144
  - test/rbbt/ner/test_ngram_prefix_dictionary.rb
153
145
  - test/rbbt/ner/test_oscar4.rb
@@ -156,6 +148,7 @@ files:
156
148
  - test/rbbt/ner/test_rnorm.rb
157
149
  - test/rbbt/ner/test_segment.rb
158
150
  - test/rbbt/ner/test_token_trieNER.rb
151
+ - test/rbbt/nlp/genia/test_sentence_splitter.rb
159
152
  - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
160
153
  - test/rbbt/nlp/test_nlp.rb
161
154
  - test/test_helper.rb
@@ -177,14 +170,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
177
170
  - !ruby/object:Gem::Version
178
171
  version: '0'
179
172
  requirements: []
180
- rubyforge_project:
181
- rubygems_version: 2.6.13
173
+ rubygems_version: 3.0.6
182
174
  signing_key:
183
175
  specification_version: 4
184
176
  summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
185
177
  test_files:
186
178
  - test/rbbt/nlp/test_nlp.rb
187
179
  - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
180
+ - test/rbbt/nlp/genia/test_sentence_splitter.rb
188
181
  - test/rbbt/bow/test_bow.rb
189
182
  - test/rbbt/bow/test_misc.rb
190
183
  - test/rbbt/bow/test_dictionary.rb
@@ -195,6 +188,8 @@ test_files:
195
188
  - test/rbbt/ner/test_rnorm.rb
196
189
  - test/rbbt/ner/test_regexpNER.rb
197
190
  - test/rbbt/ner/test_ngram_prefix_dictionary.rb
191
+ - test/rbbt/ner/test_brat.rb
192
+ - test/rbbt/ner/test_g_norm_plus.rb
198
193
  - test/rbbt/ner/test_chemical_tagger.rb
199
194
  - test/rbbt/ner/test_banner.rb
200
195
  - test/rbbt/ner/test_token_trieNER.rb
@@ -202,6 +197,7 @@ test_files:
202
197
  - test/rbbt/ner/test_segment.rb
203
198
  - test/rbbt/ner/test_linnaeus.rb
204
199
  - test/rbbt/ner/segment/test_transformed.rb
200
+ - test/rbbt/ner/segment/test_relationship.rb
205
201
  - test/rbbt/ner/segment/test_named_entity.rb
206
202
  - test/rbbt/ner/segment/test_segmented.rb
207
203
  - test/rbbt/ner/test_oscar4.rb