rbbt-text 1.3.6 → 1.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 110d05d205305e48b17ca63a91f8fef66ba6cb0d24639031614521c463dfde8f
4
- data.tar.gz: 97df4097eb1a0cb645f4f6a0eeec9138f296760e09fcdc85a5ebebe6996640d0
3
+ metadata.gz: e1b07b2646ecdc51599e2a2356fd18708e88d819944910a07930f67ec3fc012d
4
+ data.tar.gz: 03bcbe61f41d830668b50fcfc253fa2b43285774040f61fb3fb0a58f80e9dfd3
5
5
  SHA512:
6
- metadata.gz: bc3c92ae7b5268f02b232f04d28a2b6491740a618c9a9f3d78a099d51ed03dee489ebb805485c82552fc38d8164a4eb75a8e19a6d1df53aaac536e99287009fb
7
- data.tar.gz: 1bb1594b9cd5831455e1a7e2454932b57fdd7da4401bc2596146dc1669fa6783931df3042c9deed6a3cf032c59d8697500a470a8aeba859061868cd8ab8b4ab8
6
+ metadata.gz: ae6de2dd809642ca38276ff82e243efeb193cf432bc78aea92e772ab21ff489f23224b9e93de726dcacdb06910716f1107171433cc39e7b022ba14ee4ed284f6
7
+ data.tar.gz: 82768060a28248d459031030b6ba49b500b63a9d3ae2199ccdf1417fd3b1f66ce0d962db17875615ee36bb3b5879d8ccbbdec892942f544fa08481b4551a1003
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010-2022 Miguel Vázquez García
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -4,10 +4,11 @@ module Document::Corpus
4
4
  PUBMED_NAMESPACE="PMID"
5
5
  def add_pmid(pmid, type = nil, update = false)
6
6
  type = :abstract if type.nil?
7
- if update == false
7
+
8
+ if ! (update || Array === pmid)
8
9
  id = [PUBMED_NAMESPACE, pmid, type].collect{|e| e.to_s}*":"
9
10
  documents = self.documents(id)
10
- return documents if documents.any?
11
+ return documents.first if documents.any?
11
12
  end
12
13
 
13
14
  pmids = Array === pmid ? pmid : [pmid]
@@ -27,7 +28,14 @@ module Document::Corpus
27
28
  document
28
29
  end
29
30
 
30
- Document.setup(res)
31
+ if Array === pmid
32
+ corpus = res.first.corpus if res.first
33
+ Document.setup(res, :corpus => corpus)
34
+ else
35
+ res = res.first
36
+ end
37
+
38
+ res
31
39
  end
32
40
 
33
41
  def add_pubmed_query(query, max = 3000, type = nil)
@@ -35,8 +43,8 @@ module Document::Corpus
35
43
  add_pmid(pmids, type)
36
44
  end
37
45
 
38
- self.claim "PMID" do |id, type|
46
+ self.claim "PMID" do |id,type,update|
39
47
  Log.debug "Claiming #{id}"
40
- self.add_pmid(id, type).first
48
+ self.add_pmid(id, type,update)
41
49
  end
42
50
  end
@@ -20,9 +20,9 @@ module Document::Corpus
20
20
 
21
21
  def docids(*prefix)
22
22
  prefix = prefix * ":"
23
- prefix += ":" unless prefix == :all || prefix[-1] == ":"
23
+ prefix += ":" unless prefix == :all || prefix == "all" || prefix[-1] == ":"
24
24
  docids = self.read_and_close do
25
- prefix == :all ? self.keys : self.prefix(prefix)
25
+ prefix == "all" ? self.keys : self.prefix(prefix)
26
26
  end
27
27
  DocID.setup(docids, :corpus => self)
28
28
  end
@@ -34,7 +34,7 @@ module Document::Corpus
34
34
  def [](*args)
35
35
  docid, *rest = args
36
36
 
37
- res = self.read_and_close do
37
+ res = self.with_read do
38
38
  super(*args)
39
39
  end
40
40
 
@@ -44,7 +44,7 @@ module Document::Corpus
44
44
  namespace, id, type = docid.split(":")
45
45
 
46
46
  if res.nil?
47
- if Document::Corpus.claims.include?(namespace.to_s)
47
+ if Document::Corpus.claims && Document::Corpus.claims.include?(namespace.to_s)
48
48
  res = self.instance_exec(id, type, &Document::Corpus.claims[namespace.to_s])
49
49
  end
50
50
  end
data/lib/rbbt/document.rb CHANGED
@@ -22,9 +22,7 @@ module DocID
22
22
  if Array === self
23
23
  namespace, id, type = nil, nil, nil
24
24
  docs = self.collect do |docid|
25
- text = self.corpus[docid]
26
- namespace, id, type = docid.split(":")
27
- text
25
+ self.corpus[docid]
28
26
  end
29
27
  Document.setup(docs, :corpus => corpus)
30
28
  else
@@ -1,6 +1,5 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'libxml'
4
3
  require 'rbbt/ner/NER'
5
4
  require 'rbbt/util/log'
6
5
 
@@ -1,6 +1,5 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'libxml'
4
3
  require 'rbbt/segment'
5
4
  require 'rbbt/ner/NER'
6
5
  require 'rbbt/util/log'
@@ -172,6 +172,7 @@ class Tokenizer
172
172
 
173
173
  #{{{ Token Types
174
174
  GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"
175
+ GREEK_LETTER_RE = "(?:" + $inverse_greek.keys.select{|w| w.length == 1}.collect{|w| w.upcase}.join("|") + ")"
175
176
  def tokenize(word)
176
177
  return word.
177
178
  gsub(/([^IVX])I$/,'\1|I|'). # Separate last roman number
@@ -180,6 +181,7 @@ class Tokenizer
180
181
  gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
181
182
  gsub(/^(#{GREEK_RE})/,'\1-').
182
183
  gsub(/(#{GREEK_RE})$/,'-\1').
184
+ gsub(/(#{GREEK_LETTER_RE})$/,'-\1').
183
185
  split( /[^\w.]+/). # Split by separator char
184
186
  select{|t| !t.empty? }
185
187
  end
@@ -204,7 +206,7 @@ class Tokenizer
204
206
  end
205
207
 
206
208
  #{{{ Comparisons
207
-
209
+
208
210
  def evaluate_tokens(list1, list2)
209
211
  @operations.inject(0){|acc, o|
210
212
  acc + o.eval(list1, list2)
@@ -18,6 +18,10 @@ class Normalizer
18
18
  values.select{|p| p[1] == best}
19
19
  end
20
20
 
21
+ def token_evaluate(mention, name)
22
+ @tokens.evaluate(mention, name)
23
+ end
24
+
21
25
  # Compares the tokens and gives each candidate a score based on the
22
26
  # commonalities and differences amongst the tokens.
23
27
  def token_score(code, mention)
@@ -31,7 +35,7 @@ class Normalizer
31
35
  when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
32
36
  80
33
37
  else
34
- @tokens.evaluate(mention, name)
38
+ token_evaluate(mention, name)
35
39
  end
36
40
  [value, name]
37
41
  }.sort_by{|value, name| value }.last
@@ -153,8 +153,8 @@ module SpaCy
153
153
 
154
154
  chunk_index = Segment.index(SpaCy.chunk_segments(text, lang))
155
155
 
156
- source_id = chunk_index[source.offset].first || source.segid
157
- target_id = chunk_index[target.offset].first || target.segid
156
+ source_id = chunk_index[source.offset.to_i].first || source.segid
157
+ target_id = chunk_index[target.offset.to_i].first || target.segid
158
158
 
159
159
  path = Paths.dijkstra(graph, source_id, [target_id])
160
160
 
@@ -70,7 +70,15 @@ module Transformed
70
70
  orig_length = self.length
71
71
 
72
72
  offset = self.respond_to?(:offset) ? self.offset.to_i : 0
73
- segments = segments.select{|s| s.offset.to_i >= offset && s.offset.to_i <= offset + self.length - 1 }
73
+
74
+ segments = segments.select do |s|
75
+ shift = shift s.range
76
+ s_offset = s.offset.to_i
77
+ s_offset += shift.first if shift
78
+
79
+ s_offset >= offset &&
80
+ s_offset <= offset + self.length - 1
81
+ end
74
82
 
75
83
  Segment.clean_sort(segments).each do |segment|
76
84
  next if segment.offset.nil?
@@ -1,6 +1,7 @@
1
1
  #!/bin/bash
2
2
 
3
3
  name='OpenNLP'
4
- url="http://apache.rediris.es/opennlp/opennlp-1.9.3/apache-opennlp-1.9.3-bin.tar.gz"
4
+ url="http://apache.rediris.es/opennlp/opennlp-1.9.4/apache-opennlp-1.9.4-bin.tar.gz"
5
5
 
6
- install_jar $name $url
6
+ install_src $name $url
7
+ (cd $OPT_DIR/jars; ln -s $OPT_DIR/$name/lib/*.jar .)
@@ -7,7 +7,7 @@ class TestCorpusPubmed < Test::Unit::TestCase
7
7
  def test_add_pmid
8
8
  corpus = Document::Corpus.setup({})
9
9
 
10
- document = corpus.add_pmid("33359141", :abstract).first
10
+ document = corpus.add_pmid("33359141", :abstract, true)
11
11
  title = document.to(:title)
12
12
  assert title.include?("COVID-19")
13
13
  end
@@ -0,0 +1,11 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
+ require 'rbbt/ner/rnorm'
3
+
4
+ class TestRNorm < Test::Unit::TestCase
5
+ def test_evaluate
6
+ t = Tokenizer.new
7
+ assert t.evaluate("PDGFRA","PDGFRalpha") > 0
8
+ iii t.evaluate("JUNB","JunB")
9
+ end
10
+ end
11
+
@@ -43,4 +43,9 @@ S000000376 AAA GENE1 DDD
43
43
  def test_order
44
44
  assert_equal(["S000000375"], @norm.resolve("GENE1"))
45
45
  end
46
+
47
+ def test_token_evaluate
48
+ iii @norm.token_evaluate("PDGFRA","PDGFRalpha")
49
+ end
50
+
46
51
  end
@@ -393,43 +393,26 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
393
393
  end
394
394
  end
395
395
 
396
- def ___test_transform
397
- a = "This sentence mentions the TP53 gene and the CDK5 protein"
396
+ def test_transform_sorter_end
397
+ a = "The transcription factors farnesoid X receptor, small heterodimer partner, liver receptor homolog-1, and liver X receptor comprise the signaling cascade network that regulates the expression and secretion of apoM."
398
398
  original = a.dup
399
399
 
400
- gene1 = "TP53"
400
+ gene1 = "liver receptor homolog-1"
401
401
  gene1.extend Segment
402
402
  gene1.offset = a.index gene1
403
403
 
404
- gene2 = "CDK5"
404
+ gene2 = "apoM"
405
405
  gene2.extend Segment
406
406
  gene2.offset = a.index gene2
407
407
 
408
408
  assert_equal gene1, a[gene1.range]
409
409
  assert_equal gene2, a[gene2.range]
410
410
 
411
- c = a.dup
412
-
413
- c[gene2.range] = "GN"
414
- assert_equal c, Transformed.transform(a,[gene2], "GN")
415
- c[gene1.range] = "GN"
416
- assert_equal c, Transformed.transform(a,[gene1], "GN")
417
-
418
- iii a.transformation_offset_differences
419
- raise
420
- assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
421
- assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
422
-
423
-
424
- gene3 = "GN gene"
425
- gene3.extend Segment
426
- gene3.offset = a.index gene3
427
-
428
- assert_equal gene3, a[gene3.range]
429
-
430
- a.restore([gene3])
431
- assert_equal original, a
432
- assert_equal "TP53 gene", a[gene3.range]
411
+ Transformed.with_transform(a, [gene1], "[TF]") do
412
+ Transformed.with_transform(a, [gene2], "[TG]") do
413
+ assert_equal "The transcription factors farnesoid X receptor, small heterodimer partner, [TF], and liver X receptor comprise the signaling cascade network that regulates the expression and secretion of [TG].", a
414
+ end
415
+ end
433
416
 
434
417
  end
435
418
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.6
4
+ version: 1.3.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-06-25 00:00:00.000000000 Z
11
+ date: 2022-11-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -38,20 +38,6 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: libxml-ruby
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - ">="
46
- - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :runtime
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- version: '0'
55
41
  - !ruby/object:Gem::Dependency
56
42
  name: json
57
43
  requirement: !ruby/object:Gem::Requirement
@@ -72,8 +58,10 @@ email: miguel.vazquez@fdi.ucm.es
72
58
  executables:
73
59
  - get_ppis.rb
74
60
  extensions: []
75
- extra_rdoc_files: []
61
+ extra_rdoc_files:
62
+ - LICENSE
76
63
  files:
64
+ - LICENSE
77
65
  - bin/get_ppis.rb
78
66
  - lib/rbbt/bow/bow.rb
79
67
  - lib/rbbt/bow/dictionary.rb
@@ -139,6 +127,7 @@ files:
139
127
  - test/rbbt/document/test_annotation.rb
140
128
  - test/rbbt/document/test_corpus.rb
141
129
  - test/rbbt/entity/test_document.rb
130
+ - test/rbbt/ner/rnorm/test_tokens.rb
142
131
  - test/rbbt/ner/test_NER.rb
143
132
  - test/rbbt/ner/test_abner.rb
144
133
  - test/rbbt/ner/test_banner.rb
@@ -205,6 +194,7 @@ test_files:
205
194
  - test/rbbt/ner/test_patterns.rb
206
195
  - test/rbbt/ner/test_NER.rb
207
196
  - test/rbbt/ner/test_abner.rb
197
+ - test/rbbt/ner/rnorm/test_tokens.rb
208
198
  - test/rbbt/ner/test_rnorm.rb
209
199
  - test/rbbt/ner/test_regexpNER.rb
210
200
  - test/rbbt/ner/test_ngram_prefix_dictionary.rb