rbbt-text 1.3.11 → 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 587b7971fd0f8fec2602323ea480521445ca417c3a407d057cf95df1f3a36216
4
- data.tar.gz: 4f66306f80e838a0a27299705d79e5856b38dd936005d2b18004539bbb192431
3
+ metadata.gz: d969cb752d1e7bc80458663d989ca4f58a2e134b7f708748dcf8383ca44f01d3
4
+ data.tar.gz: b0df4c7e9bb43f47b6031965b70a21ea9e3c7a12e012747d38776ad785f580e3
5
5
  SHA512:
6
- metadata.gz: 4aa191aa4e5cb5e3f7d4a49b30beb1eb7259a34074a7521a10b9951cd1cc7a097a06ba6a97d9f4f4e100b2058de3f94f5199cb069a030f93a3f69bf1ecec09ff
7
- data.tar.gz: f3d5eb11d12f8a9d951d1073abd7e6cb5ace99bd075e7dc897f0aa715ae1552271019b4eb6849a172529d830f30bbc09ce40dd351fd81f9c06f338b075523e36
6
+ metadata.gz: c7436bae2f407303bb81b812b586ec09bb97a70b2272f386f6bd574b257cde6a22789b362a22dd89546761c38147214423e2e53e58ff73ae6553630e34e2f6d2
7
+ data.tar.gz: f60cfc48e60112b2639d182eda684bed33f9249e1aeabdef220b77af05f4b77eb07f6f55d9e980c8372f2030e126847369c5a80dded711217b3fd78520a00db6
@@ -20,13 +20,25 @@ module Document::Corpus
20
20
  elsif type.to_sym == :title
21
21
  Document.setup(article.title || "", PUBMED_NAMESPACE, pmid, type.to_sym, self)
22
22
  elsif type.to_sym == :title_and_abstract
23
- Document.setup((article.title || "") + " " + (article.abstract || ""), PUBMED_NAMESPACE, pmid, type.to_sym, self)
23
+ title = article.title
24
+ abstract = article.abstract
25
+
26
+ if title.nil? || title == ""
27
+ text = article.abstract
28
+ text = "" if text.nil?
29
+ else
30
+ title = title + "." unless title.end_with?(".")
31
+
32
+ text = title + " " + abstract if abstract && ! abstract.empty?
33
+ end
34
+
35
+ Document.setup(text, PUBMED_NAMESPACE, pmid, type.to_sym, self)
24
36
  else
25
37
  raise "No FullText available for #{ pmid }" if article.full_text.nil?
26
38
  Document.setup(article.full_text, PUBMED_NAMESPACE, pmid, :fulltext, self, :corpus => self)
27
39
  end
28
40
  Log.debug "Loading pmid #{pmid}"
29
- add_document(document)
41
+ add_document(document) if document
30
42
  document
31
43
  end
32
44
 
@@ -66,7 +66,7 @@ EOF
66
66
  end
67
67
 
68
68
  Open.write('config', CONFIG)
69
- mem = Rbbt::Config.get(:java_mem, :GNormPlus, :g_norm_plus, :gnormplus, :gnp, :default => "2G")
69
+ mem = Rbbt::Config.get(:java_mem, :GNormPlus, :g_norm_plus, :gnormplus, :gnp, :default => "8G")
70
70
  CMD.cmd_log("java -Xmx#{mem} -Xms#{mem} -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
71
71
 
72
72
  if texts.respond_to? :key_field
data/lib/rbbt/segment.rb CHANGED
@@ -172,10 +172,24 @@ module Segment
172
172
  end
173
173
  end
174
174
 
175
+ def self.relocate(segment, original, target, pad = 20)
176
+ if segment != target[segment.range]
177
+ start_pad = [pad, segment.offset].min
178
+ end_pad = [pad, original.length - segment.end].min
179
+ start = segment.offset - start_pad
180
+ eend = segment.end + end_pad
181
+
182
+ context = original[start..eend].gsub(/\s/,' ')
183
+ target = target.gsub(/\s/, ' ')
184
+ i = target.index context
185
+ raise "Context not found in original text" if i.nil?
186
+ segment.offset = i + start_pad
187
+ end
188
+ end
189
+
175
190
  def self.index(*args)
176
191
  Segment::RangeIndex.index(*args)
177
192
  end
178
-
179
193
  end
180
194
 
181
195
  require 'rbbt/segment/range_index'
@@ -1,7 +1,7 @@
1
1
  #!/bin/bash
2
2
 
3
3
  name='OpenNLP'
4
- url="http://apache.rediris.es/opennlp/opennlp-1.9.4/apache-opennlp-1.9.4-bin.tar.gz"
4
+ url="http://apache.rediris.es/opennlp/opennlp-2.3.0/apache-opennlp-2.3.0-bin.tar.gz"
5
5
 
6
6
  install_src $name $url
7
7
  (cd $OPT_DIR/jars; ln -s $OPT_DIR/$name/lib/*.jar .)
@@ -3,7 +3,7 @@ require 'rbbt/ner/g_norm_plus'
3
3
 
4
4
  Log.severity = 0
5
5
  class TestGNormPlus < Test::Unit::TestCase
6
- def test_match
6
+ def _test_match
7
7
  text =<<-EOF
8
8
 
9
9
  Introduction
@@ -12,7 +12,7 @@ We found that TP53 is regulated by MDM2 in Homo
12
12
  sapiens
13
13
  EOF
14
14
 
15
- Rbbt::Config.add_entry :java_mem, "2G", :gnp
15
+ Rbbt::Config.add_entry :java_mem, "16G", :gnp
16
16
  mentions = GNormPlus.process({:file => text})
17
17
 
18
18
  assert_equal 1, mentions.length
@@ -24,7 +24,7 @@ sapiens
24
24
  We found that TP53 is regulated by MDM2 in Homo sapiens
25
25
  EOF
26
26
 
27
- Rbbt::Config.add_entry :java_mem, "2G", :gnp
27
+ Rbbt::Config.add_entry :java_mem, "16G", :gnp
28
28
  mentions = GNormPlus.entities({:file => text})
29
29
  assert mentions["file"].include?("TP53")
30
30
  mentions["file"].each do |mention|
@@ -134,6 +134,25 @@ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of
134
134
  assert_equal parts.first.docid, text.docid
135
135
  end
136
136
 
137
+ def test_relocate
138
+ original =<<-EOF
139
+ This sentences contains
140
+ a mention to gene TP53
141
+ This is a followup sentence
142
+ EOF
143
+
144
+ target = <<-EOF
145
+ This sentence is added before
146
+ This sentences contains a mention to gene TP53
147
+ This is a followup sentence
148
+ EOF
149
+
150
+ segment = Segment.setup("TP53")
151
+ Segment.align(original, [segment])
152
+ Segment.relocate(segment, original, target)
153
+ assert_equal segment, target[segment.range]
154
+ end
155
+
137
156
  def test_segment_index
138
157
  text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
139
158
  Document.setup(text, "TEST", "test_doc1", nil)
data/test/test_helper.rb CHANGED
@@ -15,7 +15,11 @@ class Test::Unit::TestCase
15
15
 
16
16
  def setup
17
17
  FileUtils.mkdir_p Rbbt.tmp.test.persistence.find(:user)
18
- Persist.cachedir = Rbbt.tmp.test.persistence.find :user
18
+ begin
19
+ Persist.cachedir = Rbbt.tmp.test.persistence.find :user
20
+ rescue
21
+ Persist.cache_dir = Rbbt.tmp.test.persistence.find :user
22
+ end
19
23
  end
20
24
 
21
25
  def teardown
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.11
4
+ version: 1.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-04-11 00:00:00.000000000 Z
11
+ date: 2024-02-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -175,45 +175,45 @@ required_rubygems_version: !ruby/object:Gem::Requirement
175
175
  - !ruby/object:Gem::Version
176
176
  version: '0'
177
177
  requirements: []
178
- rubygems_version: 3.4.8
178
+ rubygems_version: 3.5.0.dev
179
179
  signing_key:
180
180
  specification_version: 4
181
181
  summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
182
182
  test_files:
183
- - test/rbbt/nlp/test_nlp.rb
184
- - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
185
- - test/rbbt/nlp/genia/test_sentence_splitter.rb
186
183
  - test/rbbt/bow/test_bow.rb
187
- - test/rbbt/bow/test_misc.rb
188
184
  - test/rbbt/bow/test_dictionary.rb
189
- - test/rbbt/test_document.rb
190
- - test/rbbt/document/test_annotation.rb
185
+ - test/rbbt/bow/test_misc.rb
191
186
  - test/rbbt/document/corpus/test_pubmed.rb
187
+ - test/rbbt/document/test_annotation.rb
192
188
  - test/rbbt/document/test_corpus.rb
193
189
  - test/rbbt/entity/test_document.rb
194
- - test/rbbt/ner/test_patterns.rb
190
+ - test/rbbt/ner/rnorm/test_tokens.rb
195
191
  - test/rbbt/ner/test_NER.rb
196
192
  - test/rbbt/ner/test_abner.rb
197
- - test/rbbt/ner/rnorm/test_tokens.rb
198
- - test/rbbt/ner/test_rnorm.rb
199
- - test/rbbt/ner/test_regexpNER.rb
200
- - test/rbbt/ner/test_ngram_prefix_dictionary.rb
193
+ - test/rbbt/ner/test_banner.rb
201
194
  - test/rbbt/ner/test_brat.rb
202
- - test/rbbt/ner/test_g_norm_plus.rb
203
195
  - test/rbbt/ner/test_chemical_tagger.rb
204
- - test/rbbt/ner/test_banner.rb
205
- - test/rbbt/ner/test_token_trieNER.rb
206
196
  - test/rbbt/ner/test_finder.rb
207
- - test/rbbt/ner/test_rner.rb
197
+ - test/rbbt/ner/test_g_norm_plus.rb
208
198
  - test/rbbt/ner/test_linnaeus.rb
199
+ - test/rbbt/ner/test_ngram_prefix_dictionary.rb
209
200
  - test/rbbt/ner/test_oscar4.rb
210
- - test/rbbt/test_segment.rb
211
- - test/rbbt/segment/test_transformed.rb
212
- - test/rbbt/segment/test_overlaps.rb
201
+ - test/rbbt/ner/test_patterns.rb
202
+ - test/rbbt/ner/test_regexpNER.rb
203
+ - test/rbbt/ner/test_rner.rb
204
+ - test/rbbt/ner/test_rnorm.rb
205
+ - test/rbbt/ner/test_token_trieNER.rb
206
+ - test/rbbt/nlp/genia/test_sentence_splitter.rb
207
+ - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
208
+ - test/rbbt/nlp/test_nlp.rb
213
209
  - test/rbbt/segment/test_annotation.rb
214
- - test/rbbt/segment/test_named_entity.rb
210
+ - test/rbbt/segment/test_corpus.rb
215
211
  - test/rbbt/segment/test_encoding.rb
212
+ - test/rbbt/segment/test_named_entity.rb
213
+ - test/rbbt/segment/test_overlaps.rb
216
214
  - test/rbbt/segment/test_range_index.rb
217
- - test/rbbt/segment/test_corpus.rb
218
- - test/test_spaCy.rb
215
+ - test/rbbt/segment/test_transformed.rb
216
+ - test/rbbt/test_document.rb
217
+ - test/rbbt/test_segment.rb
219
218
  - test/test_helper.rb
219
+ - test/test_spaCy.rb