rbbt-text 1.1.7 → 1.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 8373a3408f7b9cbc481ef108d899de4283513115
4
- data.tar.gz: e431a61729bd0f67a4129384a06a5b3a81824afc
2
+ SHA256:
3
+ metadata.gz: ea1646b5f32644bb5872f57422534b49955f988df26df4a65c8dda592515eac3
4
+ data.tar.gz: 3f6bc60546b79c76b6b35840712453616c377fcc088f321e95847f116776bef1
5
5
  SHA512:
6
- metadata.gz: 575313a7d598cbec0ec05827bebee1f4ccc8b56ccdac89478beb4993b5188356384172cf762e6ca58f0c0953fae0f29fcd46423d1291391d9958b66ce62230d0
7
- data.tar.gz: 4609a5ce9448f0a0a3ad480bc9f0cc6f3dfc728eacb0c6bf291d8b29b23d084f56a76c8776d421d935e50ef097fb10009d41121ef046b95c85177060335a1629
6
+ metadata.gz: 9376c68bad67733b5771b57ead7c962d45ff29c44362d1c51bf3480d3c3bf9f1f75284e40044fc4ed95bd94a03ab0759b3b7320bf1e3da00a0cdd82255c9395c
7
+ data.tar.gz: cd25a9cd91fde366be195801d45238d555edfc94f2b06391db7db2d9f4781b34dd599514385782d6c7e22af2841c5f3322ba74bf0a3a9c1fdbe308a255f00098
@@ -1,13 +1,13 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'rbbt-util'
4
- require 'rbbt/annotations/corpus'
5
- require 'rbbt/annotations/corpus/pubmed'
6
- require 'rbbt/annotations/relationships/ppi'
4
+ require 'rbbt/corpus/corpus'
5
+ require 'rbbt/corpus/sources/pubmed'
6
+ #require 'rbbt/annotations/relationships/ppi'
7
7
  require 'rbbt/sources/pubmed'
8
- require 'rbbt/ner/annotations'
8
+ #require 'rbbt/ner/annotations'
9
9
  require 'rbbt/ner/token_trieNER'
10
- require 'rbbt/ner/annotations/transformed'
10
+ #require 'rbbt/ner/annotations/transformed'
11
11
  require 'rbbt/ner/chemical_tagger'
12
12
 
13
13
  Corpus.define_entity_ner "Compounds", false do |doc|
@@ -182,7 +182,4 @@ class Dictionary::KL
182
182
  def weights(options = {})
183
183
  best(options)
184
184
  end
185
-
186
-
187
-
188
185
  end
@@ -8,10 +8,10 @@ require 'json'
8
8
 
9
9
  class Document
10
10
 
11
- attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indeces, :persist_dir, :global_persistence
11
+ attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indices, :persist_dir, :global_persistence
12
12
  def initialize(persist_dir = nil, docid = nil, text = nil, global_persistence = nil)
13
13
  @segments = {}
14
- @segment_indeces = {}
14
+ @segment_indices = {}
15
15
 
16
16
  if not persist_dir.nil?
17
17
  @persist_dir = persist_dir
@@ -236,7 +236,7 @@ class Document
236
236
  end
237
237
 
238
238
  def segment_index(name, persist_dir = nil)
239
- @segment_indeces[name] ||= Segment.index(self.send(name), persist_dir.nil? ? :memory : File.join(persist_dir, name + '.range'))
239
+ @segment_indices[name] ||= Segment.index(self.send(name), persist_dir.nil? ? :memory : File.join(persist_dir, name + '.range'))
240
240
  end
241
241
 
242
242
  def load_into(segment, *annotations)
@@ -10,8 +10,9 @@ class Corpus
10
10
  type = nil if String === type and type.empty?
11
11
 
12
12
  PubMed.get_article(pmids).collect do |pmid, article|
13
+ add_document(article.title, :pubmed, pmid, :title)
13
14
  if (type.nil? and article.pdf_url.nil?) or (not type.nil? and type.to_sym === :abstract)
14
- add_document(article.text, :pubmed, pmid, :abstract)
15
+ add_document(article.abstract || "", :pubmed, pmid, :abstract)
15
16
  else
16
17
  raise "No FullText available for #{ pmid }" if article.pdf_url.nil?
17
18
  add_document(article.full_text, :pubmed, pmid, :fulltext)
@@ -11,6 +11,7 @@ class Abner < NER
11
11
  Rbbt.claim Rbbt.software.opt.ABNER, :install, Rbbt.share.install.software.ABNER.find
12
12
 
13
13
  def self.init
14
+ Rbbt.software.opt.ABNER.produce
14
15
  @@JFile ||= Rjb::import('java.io.File')
15
16
  @@Tagger ||= Rjb::import('abner.Tagger')
16
17
  @@Trainer ||= Rjb::import('abner.Trainer')
@@ -10,6 +10,7 @@ class Banner < NER
10
10
  Rbbt.claim Rbbt.software.opt.BANNER, :install, Rbbt.share.install.software.BANNER.find
11
11
 
12
12
  def self.init
13
+ Rbbt.software.opt.BANNER.produce
13
14
  @@JFile ||= Rjb::import('java.io.File')
14
15
  @@SimpleTokenizer ||= Rjb::import('banner.tokenization.SimpleTokenizer')
15
16
  @@CRFTagger ||= Rjb::import('banner.tagging.CRFTagger')
@@ -0,0 +1,30 @@
1
+ require 'rbbt/ner/segment/named_entity'
2
+ require 'rbbt/ner/segment/relationship'
3
+ module Brat
4
+ Rbbt.claim Rbbt.software.opt.Brat, :install, "https://github.com/nlplab/brat.git"
5
+
6
+ def self.load(file)
7
+ entities = {}
8
+ relationships = {}
9
+ entity_ids = {}
10
+ TSV.traverse file, :type => :array do |line|
11
+ id, info, literal = line.split("\t")
12
+ case id[0]
13
+ when "T"
14
+ type, start, eend = info.split(" ")
15
+ entities[id] = NamedEntity.setup(literal, :offset => start.to_i, :type => type)
16
+ when "#"
17
+ type, id = info.split(" ")
18
+ entities[id].code = literal unless entities[id].nil?
19
+ when "R"
20
+ type, *args = info.split(" ")
21
+ tf, tg = args.collect{|e| e.split(":").last }
22
+ tf = entities[tf]
23
+ tg = entities[tg]
24
+ relationship = Relationship.setup([tf,tg] * "~" + "#" + type, :terms => [tf,tg], :type => type)
25
+ relationships[id] = relationship
26
+ end
27
+ end
28
+ [entities.values, relationships.values]
29
+ end
30
+ end
@@ -0,0 +1,80 @@
1
+ require 'rbbt-util'
2
+ module GNormPlus
3
+
4
+ Rbbt.claim Rbbt.software.opt.GNormPlus, :install do
5
+ url = "https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/download/GNormPlus/GNormPlusJava.zip"
6
+ script =<<-EOF
7
+ (cd $(opt_dir $name); sh Installation.sh; chmod +x Ab3P identify_abbr)
8
+ EOF
9
+ {:src => url, :commands => script}
10
+ end
11
+
12
+ CONFIG =<<-EOF
13
+
14
+ #===Annotation
15
+ #Attribution setting:
16
+ #FocusSpecies = Taxonomy ID
17
+ # All: All species
18
+ # 9606: Human
19
+ # 4932: yeast
20
+ # 7227: Fly
21
+ # 10090: Mouse
22
+ # 10116: Rat
23
+ # 7955: Zebrafish
24
+ # 3702: Arabidopsis thaliana
25
+ #open: True
26
+ #close: False
27
+
28
+ [Focus Species]
29
+ FocusSpecies = All
30
+ [Dictionary & Model]
31
+ DictionaryFolder = ./Dictionary
32
+ GNRModel = ./Dictionary/GNR.Model
33
+ SCModel = ./Dictionary/SimConcept.Model
34
+ GeneIDMatch = True
35
+ Normalization2Protein = False
36
+ DeleteTmp = True
37
+ EOF
38
+
39
+ def self.process(texts)
40
+ TmpFile.with_file do |tmpdir|
41
+ Open.mkdir tmpdir
42
+ Misc.in_dir tmpdir do
43
+ Open.ln_s Rbbt.software.opt.GNormPlus.Dictionary.find, '.'
44
+ Open.ln_s Rbbt.software.opt.GNormPlus["BioC.dtd"].find, '.'
45
+ Open.ln_s Rbbt.software.opt.GNormPlus["Ab3P"].find, '.'
46
+ Open.ln_s Rbbt.software.opt.GNormPlus["CRF"].find, '.'
47
+ Open.mkdir 'input'
48
+ Open.mkdir 'output'
49
+ Open.mkdir 'tmp'
50
+
51
+ texts.each do |name,text|
52
+ Open.write("input/#{name}.txt") do |f|
53
+ f.puts "#{name}|a|" << text
54
+ f.puts
55
+ end
56
+ end
57
+ Open.write('config', CONFIG)
58
+ CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.find}/GNormPlus.jar' 'input' 'output' 'config'")
59
+
60
+ if texts.respond_to? :key_field
61
+ key_field = texts.key_field
62
+ else
63
+ key_field = "ID"
64
+ end
65
+ tsv = TSV.setup({}, :key_field => key_field, :fields => ["Entities"], :type => :flat)
66
+ Dir.glob("output/*.txt").each do |file|
67
+ name = File.basename(file).sub(".txt",'')
68
+ entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '.').split("\t")[1..-1] * ":"}
69
+ tsv[name] = entities
70
+ end
71
+ tsv
72
+ end
73
+ end
74
+ end
75
+ end
76
+
77
+ if __FILE__ == $0
78
+ Log.severity = 0
79
+ Rbbt.software.opt.GNormPlus.produce
80
+ end
@@ -8,8 +8,8 @@ module Linnaeus
8
8
 
9
9
  ARGS = ["--properties", Rbbt.software.opt.Linnaeus["species-proxy/properties.conf"].find]
10
10
 
11
- Rjb::load(nil, jvmargs = ['-Xms2G','-Xmx2G']) unless Rjb.loaded?
12
11
 
12
+ Rjb::load(nil, jvmargs = ['-Xms2G','-Xmx2G']) unless Rjb.loaded?
13
13
  def self.init
14
14
  begin
15
15
  @@ArgParser = Rjb::import('martin.common.ArgParser')
@@ -72,6 +72,17 @@ module Segment
72
72
  (segment.offset.to_i + segment.segment_length.to_i <= self.offset.to_i + self.segment_length.to_i)
73
73
  end
74
74
 
75
+ def overlaps?(segment)
76
+ segment.offset.to_i >= self.offset.to_i && segment.offset.to_i <= self.end ||
77
+ self.offset.to_i >= segment.offset.to_i && self.offset.to_i <= segment.end
78
+ end
79
+
80
+ def self.collisions(main, secondary)
81
+ collisions = secondary.select do |ss|
82
+ collisions = main.select{|ms| ms.overlaps? ss }.any?
83
+ end
84
+ end
85
+
75
86
  #{{{ Sorting
76
87
 
77
88
  def self.sort(segments, inline = true)
@@ -84,14 +95,14 @@ module Segment
84
95
  -1
85
96
  when (b.nil? or b.offset.nil?)
86
97
  +1
87
- when (not a.range.include? b.offset and not b.range.include? a.offset)
88
- a.offset <=> b.offset
98
+ when (not a.range.include? b.offset.to_i and not b.range.include? a.offset.to_i)
99
+ a.offset.to_i <=> b.offset.to_i
89
100
  else
90
101
  a.segment_length <=> b.segment_length
91
102
  end
92
103
  end
93
104
  else
94
- segments.sort_by do |segment| segment.offset || 0 end.reverse
105
+ segments.sort_by do |segment| segment.offset.to_i || 0 end.reverse
95
106
  end
96
107
  end
97
108
 
@@ -282,7 +293,7 @@ module Segment
282
293
 
283
294
  info[:annotation_types] = [Segment] unless info.include? :annotation_types
284
295
 
285
- Annotated.load(object, info)
296
+ Annotated.load_entity(object, info)
286
297
  end
287
298
 
288
299
  def self.set_tsv_fields(fields, segments)
@@ -324,5 +335,16 @@ module Segment
324
335
  end
325
336
  end
326
337
 
338
+ def ansi(color)
339
+ Log.color color, self
340
+ end
341
+
342
+ def locus
343
+ [offset, self.end] * ".."
344
+ end
345
+
346
+ def ==(other)
347
+ self.id == other.id
348
+ end
327
349
  end
328
350
 
@@ -41,5 +41,6 @@ Score: #{score.inspect}
41
41
 
42
42
  entity
43
43
  end
44
+
44
45
  end
45
46
 
@@ -2,19 +2,23 @@ require 'rbbt/ner/segment'
2
2
 
3
3
  module Relationship
4
4
  extend Annotation
5
- include Segment
5
+ self.annotation :segment
6
6
  self.annotation :terms
7
+ self.annotation :type
8
+
9
+ def text
10
+ if segment
11
+ segment
12
+ else
13
+ type + ": " + terms * ", "
14
+ end
15
+ end
7
16
 
8
17
  def html
9
18
  text = <<-EOF
10
19
  <span class='Relationship'\
11
- >#{ self }</span>
20
+ >#{ self.text }</span>
12
21
  EOF
13
22
  text.chomp
14
23
  end
15
-
16
- def html_with_entities(*types)
17
- annotations.values_at(*types).each do |segments|
18
- end
19
- end
20
24
  end
@@ -6,7 +6,7 @@ module Transformed
6
6
  def self.transform(text, segments, replacement = nil, &block)
7
7
 
8
8
  text.extend Transformed
9
- text.replace(segments, replacement, &block)
9
+ text.replace_segments(segments, replacement, &block)
10
10
 
11
11
  text
12
12
  end
@@ -14,11 +14,11 @@ module Transformed
14
14
  def self.with_transform(text, segments, replacement = nil)
15
15
 
16
16
  text.extend Transformed
17
- text.replace(segments, replacement)
17
+ text.replace_segments(segments, replacement)
18
18
 
19
19
  segments = yield text
20
20
 
21
- segments = nil unless Array === segments
21
+ segments = nil unless Array === segments && Segment === segments.first
22
22
 
23
23
  text.restore(segments, true)
24
24
  end
@@ -59,39 +59,41 @@ module Transformed
59
59
  [begin_shift, end_shift]
60
60
  end
61
61
 
62
- def self.sort(segments)
63
- segments.compact.sort do |a,b|
64
- case
65
- when ((a.nil? and b.nil?) or (a.offset.nil? and b.offset.nil?))
66
- 0
67
- when (a.nil? or a.offset.nil?)
68
- -1
69
- when (b.nil? or b.offset.nil?)
70
- +1
71
- # Non-overlap
72
- when (a.end < b.offset.to_i or b.end < a.offset.to_i)
73
- b.offset <=> a.offset
74
- # b includes a
75
- when (a.offset.to_i >= b.offset.to_i and a.end <= b.end)
76
- -1
77
- # b includes a
78
- when (b.offset.to_i >= a.offset.to_i and b.end <= a.end)
79
- +1
80
- # Overlap
81
- when (a.offset.to_i > b.offset.to_i and a.end > b.end or b.offset.to_i < a.offset.to_i and b.end > a.end)
82
- a.length <=> b.length
83
- else
84
- raise "Unexpected case in sort: #{a.range} - #{b.range}"
85
- end
86
- end
87
- end
88
-
89
- def replace(segments, replacement = nil, &block)
62
+ #def self.sort(segments)
63
+ # segments.compact.sort do |a,b|
64
+ # case
65
+ # when ((a.nil? && b.nil?) || (a.offset.nil? && b.offset.nil?))
66
+ # 0
67
+ # when (a.nil? || a.offset.nil?)
68
+ # -1
69
+ # when (b.nil? || b.offset.nil?)
70
+ # +1
71
+ # # Non-overlap
72
+ # when (a.end < b.offset.to_i || b.end < a.offset.to_i)
73
+ # b.offset <=> a.offset
74
+ # # b includes a
75
+ # when (a.offset.to_i >= b.offset.to_i && a.end <= b.end)
76
+ # -1
77
+ # # b includes a
78
+ # when (b.offset.to_i >= a.offset.to_i && b.end <= a.end)
79
+ # +1
80
+ # # Overlap
81
+ # when (a.offset.to_i > b.offset.to_i && a.end > b.end || b.offset.to_i > a.offset.to_i && b.end > a.end)
82
+ # b.length <=> a.length
83
+ # else
84
+ # raise "Unexpected case in sort: #{a.range} - #{b.range}"
85
+ # end
86
+ # end
87
+ #end
88
+
89
+ def replace_segments(segments, replacement = nil, &block)
90
90
  @transformed_segments ||= {}
91
91
  @transformation_stack ||= []
92
92
  stack = []
93
93
 
94
- Transformed.sort(segments).each do |segment|
94
+ segments = [segments] unless Array === segments
95
+ orig_length = self.length
96
+ Segment.sort(segments).each do |segment|
95
97
  next if segment.offset.nil?
96
98
  shift = shift segment.range
97
99
 
@@ -106,6 +108,10 @@ module Transformed
106
108
  updated_range = (updated_begin..updated_end)
107
109
 
108
110
  updated_text = self[updated_begin..updated_end]
111
+ if updated_text.nil?
112
+ Log.warn "Range outside of segment: #{self.length} #{segment.locus} (#{updated_range})"
113
+ next
114
+ end
109
115
 
110
116
  original_text = segment.dup
111
117
  segment.replace updated_text
@@ -137,7 +143,7 @@ module Transformed
137
143
  when segment.end < range.begin
138
144
  # After
139
145
  when segment.offset.to_i > range.end + diff
140
- segment.offset.to_i -= diff
146
+ segment.offset = segment.offset.to_i - diff
141
147
  # Includes
142
148
  when (segment.offset.to_i <= range.begin and segment.end >= range.end + diff)
143
149
  segment.replace self[segment.offset.to_i..segment.end - diff]
@@ -170,4 +176,9 @@ module Transformed
170
176
  segments
171
177
  end
172
178
  end
179
+
180
+ def self.ansi(text, entities, colors = nil)
181
+
182
+
183
+ end
173
184
  end
@@ -1,6 +1,8 @@
1
1
  require 'rbbt/nlp/nlp'
2
2
  require 'rbbt/ner/segment'
3
3
  module NLP
4
+ Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
5
+
4
6
  def self.returnFeatures(prevWord, delimiter, nextWord)
5
7
  if nextWord.match(/__ss__/)
6
8
  nw = nextWord.sub(/__ss__/, "")
@@ -235,4 +237,67 @@ module NLP
235
237
  end
236
238
 
237
239
  end
240
+
241
+ def self.geniass_sentence_splitter(text)
242
+ offsets = []
243
+
244
+ cleaned = text.gsub("\n",NEW_LINE_MASK)
245
+ TmpFile.with_file(cleaned) do |fin|
246
+ TmpFile.with_file do |fout|
247
+ CMD.cmd("cd #{Rbbt.software.opt.Geniass.find}; ./geniass #{ fin } #{ fout }")
248
+
249
+
250
+ Open.write(fin, Open.read(fin).gsub(NEW_LINE_MASK, "\n"))
251
+ Open.write(fout, Open.read(fout).gsub("\n", '|').gsub(NEW_LINE_MASK, "\n"))
252
+ # Addapted from sentence2standOff.rb in Geniass package
253
+
254
+ inTxtStrict = Open.open(fin)
255
+ inTxtNew = Open.open(fout)
256
+
257
+ marker = "|"[0]
258
+ position = 0
259
+ sentenceCount = 1
260
+ target = ''
261
+ targetNew = ''
262
+ start = 0
263
+ finish = 0
264
+
265
+ while(!inTxtNew.eof?) do
266
+ targetNew = inTxtNew.getc
267
+ target = inTxtStrict.getc
268
+ position += 1
269
+ if targetNew == marker
270
+ sentenceCount += 1
271
+ finish = position - 1
272
+ offsets << [start, finish] if finish - start > 10
273
+ if targetNew == target
274
+ start = position
275
+ else
276
+ targetNew = inTxtNew.getc
277
+ while targetNew != target do
278
+ target = inTxtStrict.getc
279
+ position += 1
280
+ end
281
+ start = position - 1
282
+ end
283
+ end
284
+ end
285
+
286
+ finish = position - 1
287
+ offsets << [start, finish] if finish > start
288
+
289
+ inTxtStrict.close
290
+ inTxtNew.close
291
+ end
292
+ end
293
+
294
+ offsets.collect do |s,e|
295
+ sentence = text[s..e]
296
+ next if sentence.nil?
297
+ #sentence.gsub!(NEW_LINE_MASK, "\n")
298
+ Segment.setup sentence, s
299
+ sentence
300
+ end
301
+ end
302
+
238
303
  end
@@ -16,76 +16,10 @@ module NLP
16
16
  #Rbbt.software.opt.StanfordParser.define_as_install Rbbt.share.install.software.StanfordParser.find
17
17
  #Rbbt.software.opt.StanfordParser.produce
18
18
 
19
- Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
20
- Rbbt.software.opt.Geniass.produce
21
-
22
19
  Rbbt.claim Rbbt.software.opt.Gdep, :install, Rbbt.share.install.software.Gdep.find
23
- Rbbt.software.opt.Gdep.produce
24
20
 
25
21
  NEW_LINE_MASK = "\t\t \t \t"
26
22
 
27
- def self.geniass_sentence_splitter(text)
28
- offsets = []
29
-
30
- cleaned = text.gsub("\n",NEW_LINE_MASK)
31
- TmpFile.with_file(cleaned) do |fin|
32
- TmpFile.with_file do |fout|
33
- CMD.cmd("cd #{Rbbt.software.opt.Geniass.find}; ./geniass #{ fin } #{ fout }")
34
-
35
-
36
- Open.write(fin, Open.read(fin).gsub(NEW_LINE_MASK, "\n"))
37
- Open.write(fout, Open.read(fout).gsub("\n", '|').gsub(NEW_LINE_MASK, "\n"))
38
- # Addapted from sentence2standOff.rb in Geniass package
39
-
40
- inTxtStrict = Open.open(fin)
41
- inTxtNew = Open.open(fout)
42
-
43
- marker = "|"[0]
44
- position = 0
45
- sentenceCount = 1
46
- target = ''
47
- targetNew = ''
48
- start = 0
49
- finish = 0
50
-
51
- while(!inTxtNew.eof?) do
52
- targetNew = inTxtNew.getc
53
- target = inTxtStrict.getc
54
- position += 1
55
- if targetNew == marker
56
- sentenceCount += 1
57
- finish = position - 1
58
- offsets << [start, finish] if finish - start > 10
59
- if targetNew == target
60
- start = position
61
- else
62
- targetNew = inTxtNew.getc
63
- while targetNew != target do
64
- target = inTxtStrict.getc
65
- position += 1
66
- end
67
- start = position - 1
68
- end
69
- end
70
- end
71
-
72
- finish = position - 1
73
- offsets << [start, finish] if finish > start
74
-
75
- inTxtStrict.close
76
- inTxtNew.close
77
- end
78
- end
79
-
80
- offsets.collect do |s,e|
81
- sentence = text[s..e]
82
- next if sentence.nil?
83
- #sentence.gsub!(NEW_LINE_MASK, "\n")
84
- Segment.setup sentence, s
85
- sentence
86
- end
87
- end
88
-
89
23
  module GdepToken
90
24
  extend Annotation
91
25
  include Segment
@@ -219,3 +153,8 @@ module NLP
219
153
  end
220
154
  end
221
155
  end
156
+
157
+ if __FILE__ == $0
158
+ Log.severity = 0
159
+ Rbbt.software.opt.Gdep.produce
160
+ end
@@ -6,16 +6,20 @@ require 'rbbt/resource'
6
6
  module OpenNLP
7
7
  Rbbt.claim Rbbt.software.opt.OpenNLP, :install, Rbbt.share.install.software.OpenNLP.find
8
8
 
9
+
9
10
  Rbbt.claim Rbbt.software.opt.OpenNLP.models["da-sent.bin"], :url, "http://opennlp.sourceforge.net/models-1.5/de-sent.bin"
10
11
 
11
12
  MAX = 5
12
13
 
13
- @@FileInputStream = Rjb::import('java.io.FileInputStream')
14
- @@SentenceModel = Rjb::import('opennlp.tools.sentdetect.SentenceModel')
15
- @@SentenceDetectorME = Rjb::import('opennlp.tools.sentdetect.SentenceDetectorME')
16
-
17
14
  def self.sentence_split_detector
18
15
  @@sentence_split_detector ||= begin
16
+ Rbbt.software.opt.OpenNLP.produce
17
+ Rbbt.software.opt.OpenNLP.models["da-sent.bin"].produce
18
+
19
+ @@FileInputStream = Rjb::import('java.io.FileInputStream')
20
+ @@SentenceModel = Rjb::import('opennlp.tools.sentdetect.SentenceModel')
21
+ @@SentenceDetectorME = Rjb::import('opennlp.tools.sentdetect.SentenceDetectorME')
22
+
19
23
  modelIn = @@FileInputStream.new(Rbbt.software.opt.OpenNLP.models["da-sent.bin"].produce.find);
20
24
 
21
25
  model = @@SentenceModel.new(modelIn);
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+
3
+ name='GNormPlus'
4
+ url="https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/download/GNormPlus/GNormPlusJava.zip"
5
+
6
+ prepare_pkg "$name" "$url"
7
+
@@ -5,7 +5,7 @@ RBBT_SOFTWARE_DIR="$2"
5
5
  source "$INSTALL_HELPER_FILE"
6
6
 
7
7
  name='Gdep'
8
- url="http://people.ict.usc.edu/~sagae/parser/gdep/gdep-beta2.tgz"
8
+ url="http://www.sagae.org/gdep/gdep-beta2.tgz"
9
9
 
10
10
 
11
11
  get_pkg "$name" "$url"
@@ -1,7 +1,7 @@
1
1
  #!/bin/bash
2
2
 
3
3
  name='OpenNLP'
4
- url="http://apache.rediris.es//opennlp/opennlp-1.5.3/apache-opennlp-1.5.3-bin.tar.gz"
4
+ url="http://apache.rediris.es/opennlp/opennlp-1.9.1/apache-opennlp-1.9.1-bin.tar.gz"
5
5
 
6
6
  get_src "$name" "$url"
7
7
  move_opt "$name"
@@ -6,7 +6,7 @@ class TestClass < Test::Unit::TestCase
6
6
  def test_info
7
7
  a = ["test"]
8
8
  NamedEntity.setup a
9
- assert(! a.info.keys.include?(:code))
9
+ assert(a.info[:code].nil?)
10
10
  a.code = 10
11
11
  a.offset = 100
12
12
  assert a.info.include? :code
@@ -26,4 +26,27 @@ class TestClass < Test::Unit::TestCase
26
26
  assert Segment.tsv([a], nil).fields.include? "code"
27
27
  assert Segment.tsv([a], "literal").fields.include? "code"
28
28
  end
29
+
30
+ def test_segment_brat
31
+ a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
32
+
33
+ gene1 = "TP53"
34
+ gene1.extend NamedEntity
35
+ gene1.offset = a.index gene1
36
+ gene1.type = "Gene"
37
+
38
+ gene2 = "CDK5R1"
39
+ gene2.extend NamedEntity
40
+ gene2.offset = a.index gene2
41
+ gene2.type = "Gene"
42
+
43
+ gene3 = "TP53 gene"
44
+ gene3.extend NamedEntity
45
+ gene3.offset = a.index gene3
46
+ gene3.type = "Gene"
47
+
48
+ segments = [gene1, gene2, gene3]
49
+ assert segments.collect{|s| s.to_brat}.include? "Gene 27 35"
50
+
51
+ end
29
52
  end
@@ -2,10 +2,23 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_he
2
2
  require 'rbbt/ner/segment/transformed'
3
3
  require 'rbbt/ner/segment/named_entity'
4
4
  require 'rexml/document'
5
- require 'rand'
6
5
 
7
6
  class TestClass < Test::Unit::TestCase
8
- def tttest_transform
7
+ def test_sort
8
+ text = <<-EOF
9
+ More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
10
+ EOF
11
+
12
+ entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].reverse.collect do |literal|
13
+ NamedEntity.setup(literal, :offset => text.index(literal))
14
+ end
15
+
16
+ Transformed.with_transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" }) do
17
+ assert text.include? "such as [IL-2]"
18
+ end
19
+ end
20
+
21
+ def ___test_transform
9
22
  a = "This sentence mentions the TP53 gene and the CDK5 protein"
10
23
  original = a.dup
11
24
 
@@ -27,6 +40,8 @@ class TestClass < Test::Unit::TestCase
27
40
  c[gene1.range] = "GN"
28
41
  assert_equal c, Transformed.transform(a,[gene1], "GN")
29
42
 
43
+ iii a.transformation_offset_differences
44
+ raise
30
45
  assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
31
46
  assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
32
47
 
@@ -216,5 +231,60 @@ class TestClass < Test::Unit::TestCase
216
231
  end
217
232
  end
218
233
  end
234
+
235
+ def test_nested_transform
236
+ a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
237
+
238
+ gene1 = "TP53"
239
+ gene1.extend NamedEntity
240
+ gene1.offset = a.index gene1
241
+ gene1.type = "Gene"
242
+
243
+ gene2 = "CDK5R1"
244
+ gene2.extend NamedEntity
245
+ gene2.offset = a.index gene2
246
+ gene2.type = "Protein"
247
+
248
+ Transformed.with_transform(a, [gene1,gene2], "[G]") do
249
+ assert_equal "This sentence mentions the [G] gene and the [G] protein", a
250
+ end
251
+ Transformed.with_transform(a, [gene1], "[G1]") do
252
+ Transformed.with_transform(a, [gene2], "[G2]") do
253
+ assert_equal "This sentence mentions the [G1] gene and the [G2] protein", a
254
+ end
255
+ end
256
+ Transformed.with_transform(a, [gene2], "[G2]") do
257
+ Transformed.with_transform(a, [gene1], "[G1]") do
258
+ assert_equal "This sentence mentions the [G1] gene and the [G2] protein", a
259
+ end
260
+ end
261
+ end
262
+
263
+ def test_offset_transform
264
+ a = "ILF can bind to purine-rich regulatory motifs such as the human T-cell leukemia virus-long terminal region and the interleukin-2 promoter."
265
+
266
+ gene1 = "ILF"
267
+ gene1.extend NamedEntity
268
+ gene1.offset = a.index gene1
269
+ gene1.type = "Gene"
270
+
271
+ gene2 = "interleukin-2"
272
+ gene2.extend NamedEntity
273
+ gene2.offset = a.index gene2
274
+ gene2.type = "Protein"
275
+
276
+ Transformed.with_transform(a, [gene1,gene2], "[G]") do
277
+ assert_equal "[G] can bind to purine-rich regulatory motifs such as the human T-cell leukemia virus-long terminal region and the [G] promoter.", a
278
+ end
279
+
280
+ offset = 100
281
+ a = Segment.setup(a, :offset => offset)
282
+ gene1.offset += offset
283
+ gene2.offset += offset
284
+ Transformed.with_transform(a, [gene1,gene2], "[G]") do
285
+ assert_equal "[G] can bind to purine-rich regulatory motifs such as the human T-cell leukemia virus-long terminal region and the [G] promoter.", a
286
+ end
287
+
288
+ end
219
289
  end
220
290
 
@@ -0,0 +1,64 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/ner/brat'
3
+
4
+ class TestBrat < Test::Unit::TestCase
5
+ def test_load
6
+ text =<<-EOF
7
+ T2 DBTF 52 55 Nrl
8
+ #2 AnnotatorNotes T2 4901
9
+ T3 NONDBTF 80 89 rhodopsin
10
+ #3 AnnotatorNotes T3 6010
11
+ T4 BIOLOGICALPROCESS 90 105 gene expression
12
+ #4 AnnotatorNotes T4 -
13
+ T5 DBTF 127 130 Nrl
14
+ #5 AnnotatorNotes T5 4901
15
+ T7 MOLECULARFUNCTION 197 204 binding
16
+ #7 AnnotatorNotes T7 -
17
+ T8 PHENOTYPE 241 252 extended AP
18
+ #8 AnnotatorNotes T8 -
19
+ T10 DBTF 331 334 Nrl
20
+ #10 AnnotatorNotes T10 4901
21
+ T11 TISSUE 381 399 photoreceptor cell
22
+ #11 AnnotatorNotes T11 -
23
+ T12 NONDBTF 414 423 rhodopsin
24
+ #12 AnnotatorNotes T12 6010
25
+ T13 CELLULARCOMPONENT 494 501 nuclear
26
+ #13 AnnotatorNotes T13 -
27
+ T14 TISSUE 548 572 retinoblastoma cell line
28
+ #14 AnnotatorNotes T14 -
29
+ T17 NONDBTF 660 669 rhodopsin
30
+ #17 AnnotatorNotes T17 6010
31
+ T18 DBTF 676 679 Nrl
32
+ #18 AnnotatorNotes T18 4901
33
+ T19 CELLULARCOMPONENT 749 764 protein complex
34
+ #19 AnnotatorNotes T19 -
35
+ T20 DBTF 797 800 Nrl
36
+ #20 AnnotatorNotes T20 4901
37
+ T21 DBTF 853 856 Nrl
38
+ #21 AnnotatorNotes T21 4901
39
+ T22 MOLECULARFUNCTION 882 892 luciferase
40
+ #22 AnnotatorNotes T22 -
41
+ T23 DBTF 943 946 Nrl
42
+ #23 AnnotatorNotes T23 4901
43
+ T24 NONDBTF 989 998 rhodopsin
44
+ #24 AnnotatorNotes T24 6010
45
+ T26 DBTF 1110 1113 Nrl
46
+ #26 AnnotatorNotes T26 4901
47
+ T27 DBTF 1224 1227 Nrl
48
+ #27 AnnotatorNotes T27 4901
49
+ T28 DBTF 1271 1274 Nrl
50
+ #28 AnnotatorNotes T28 4901
51
+ T30 DBTF 1385 1388 Nrl
52
+ #30 AnnotatorNotes T30 4901
53
+ R1 ACTIVATION Arg1:T2 Arg2:T3
54
+ R2 ACTIVATION Arg1:T10 Arg2:T12
55
+ R3 ACTIVATION Arg1:T23 Arg2:T24
56
+ T1 DBTF 250 254 AP-1
57
+ EOF
58
+
59
+ io = StringIO.new text
60
+ iii Brat.load io
61
+
62
+ end
63
+ end
64
+
@@ -0,0 +1,16 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/ner/g_norm_plus'
3
+
4
+ Log.severity = 0
5
+ class TestGNormPlus < Test::Unit::TestCase
6
+ def test_match
7
+ text =<<-EOF
8
+ We found that TP53 is regulated by MDM2 in Homo sapiens
9
+ EOF
10
+
11
+
12
+ mentions = GNormPlus.process({:file => text})
13
+ Log.tsv mentions
14
+ end
15
+ end
16
+
@@ -96,6 +96,5 @@ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of
96
96
  assert_equal %w(CDK5R1), index[gene2.offset + 1]
97
97
  end
98
98
  end
99
-
100
99
  end
101
100
 
@@ -0,0 +1,9 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
+ require 'rbbt/nlp/genia/sentence_splitter'
3
+
4
+ class TestClass < Test::Unit::TestCase
5
+ def test_true
6
+ assert true
7
+ end
8
+ end
9
+
@@ -29,11 +29,14 @@ sentence. This is
29
29
  another sentence.
30
30
  EOF
31
31
 
32
+ iii OpenNLP.sentence_split_detector.sentDetect(text)
33
+ assert_equal 5, OpenNLP.sentence_split_detector.sentDetect(text).length
34
+
32
35
  assert_equal 5, OpenNLP.sentence_splitter(text).length
33
36
  assert_equal "This is a \nsentence.", OpenNLP.sentence_splitter(text)[3]
34
37
  end
35
38
 
36
- def test_text_sentences
39
+ def _test_text_sentences
37
40
  Misc.benchmark(100) do
38
41
  OpenNLP.sentence_splitter($text).include? "Our
39
42
  findings highlight the role of SMARCA4 in the pathogenesis of SMARCB1-positive
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.7
4
+ version: 1.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-05-04 00:00:00.000000000 Z
11
+ date: 2020-01-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -66,20 +66,6 @@ dependencies:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
- - !ruby/object:Gem::Dependency
70
- name: rjb
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - ">="
74
- - !ruby/object:Gem::Version
75
- version: '0'
76
- type: :runtime
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - ">="
81
- - !ruby/object:Gem::Version
82
- version: '0'
83
69
  description: 'Text mining tools: named entity recognition and normalization, document
84
70
  classification, bag-of-words, dictionaries, etc'
85
71
  email: miguel.vazquez@fdi.ucm.es
@@ -100,8 +86,10 @@ files:
100
86
  - lib/rbbt/ner/NER.rb
101
87
  - lib/rbbt/ner/abner.rb
102
88
  - lib/rbbt/ner/banner.rb
89
+ - lib/rbbt/ner/brat.rb
103
90
  - lib/rbbt/ner/chemical_tagger.rb
104
91
  - lib/rbbt/ner/finder.rb
92
+ - lib/rbbt/ner/g_norm_plus.rb
105
93
  - lib/rbbt/ner/linnaeus.rb
106
94
  - lib/rbbt/ner/ngram_prefix_dictionary.rb
107
95
  - lib/rbbt/ner/oscar3.rb
@@ -125,6 +113,7 @@ files:
125
113
  - share/install/software/ABNER
126
114
  - share/install/software/BANNER
127
115
  - share/install/software/ChemicalTagger
116
+ - share/install/software/GNormPlus
128
117
  - share/install/software/Gdep
129
118
  - share/install/software/Geniass
130
119
  - share/install/software/Linnaeus
@@ -141,13 +130,16 @@ files:
141
130
  - test/rbbt/bow/test_misc.rb
142
131
  - test/rbbt/entity/test_document.rb
143
132
  - test/rbbt/ner/segment/test_named_entity.rb
133
+ - test/rbbt/ner/segment/test_relationship.rb
144
134
  - test/rbbt/ner/segment/test_segmented.rb
145
135
  - test/rbbt/ner/segment/test_transformed.rb
146
136
  - test/rbbt/ner/test_NER.rb
147
137
  - test/rbbt/ner/test_abner.rb
148
138
  - test/rbbt/ner/test_banner.rb
139
+ - test/rbbt/ner/test_brat.rb
149
140
  - test/rbbt/ner/test_chemical_tagger.rb
150
141
  - test/rbbt/ner/test_finder.rb
142
+ - test/rbbt/ner/test_g_norm_plus.rb
151
143
  - test/rbbt/ner/test_linnaeus.rb
152
144
  - test/rbbt/ner/test_ngram_prefix_dictionary.rb
153
145
  - test/rbbt/ner/test_oscar4.rb
@@ -156,6 +148,7 @@ files:
156
148
  - test/rbbt/ner/test_rnorm.rb
157
149
  - test/rbbt/ner/test_segment.rb
158
150
  - test/rbbt/ner/test_token_trieNER.rb
151
+ - test/rbbt/nlp/genia/test_sentence_splitter.rb
159
152
  - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
160
153
  - test/rbbt/nlp/test_nlp.rb
161
154
  - test/test_helper.rb
@@ -177,14 +170,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
177
170
  - !ruby/object:Gem::Version
178
171
  version: '0'
179
172
  requirements: []
180
- rubyforge_project:
181
- rubygems_version: 2.6.13
173
+ rubygems_version: 3.0.6
182
174
  signing_key:
183
175
  specification_version: 4
184
176
  summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
185
177
  test_files:
186
178
  - test/rbbt/nlp/test_nlp.rb
187
179
  - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
180
+ - test/rbbt/nlp/genia/test_sentence_splitter.rb
188
181
  - test/rbbt/bow/test_bow.rb
189
182
  - test/rbbt/bow/test_misc.rb
190
183
  - test/rbbt/bow/test_dictionary.rb
@@ -195,6 +188,8 @@ test_files:
195
188
  - test/rbbt/ner/test_rnorm.rb
196
189
  - test/rbbt/ner/test_regexpNER.rb
197
190
  - test/rbbt/ner/test_ngram_prefix_dictionary.rb
191
+ - test/rbbt/ner/test_brat.rb
192
+ - test/rbbt/ner/test_g_norm_plus.rb
198
193
  - test/rbbt/ner/test_chemical_tagger.rb
199
194
  - test/rbbt/ner/test_banner.rb
200
195
  - test/rbbt/ner/test_token_trieNER.rb
@@ -202,6 +197,7 @@ test_files:
202
197
  - test/rbbt/ner/test_segment.rb
203
198
  - test/rbbt/ner/test_linnaeus.rb
204
199
  - test/rbbt/ner/segment/test_transformed.rb
200
+ - test/rbbt/ner/segment/test_relationship.rb
205
201
  - test/rbbt/ner/segment/test_named_entity.rb
206
202
  - test/rbbt/ner/segment/test_segmented.rb
207
203
  - test/rbbt/ner/test_oscar4.rb