rbbt-text 1.3.6 → 1.3.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 110d05d205305e48b17ca63a91f8fef66ba6cb0d24639031614521c463dfde8f
4
- data.tar.gz: 97df4097eb1a0cb645f4f6a0eeec9138f296760e09fcdc85a5ebebe6996640d0
3
+ metadata.gz: e1b07b2646ecdc51599e2a2356fd18708e88d819944910a07930f67ec3fc012d
4
+ data.tar.gz: 03bcbe61f41d830668b50fcfc253fa2b43285774040f61fb3fb0a58f80e9dfd3
5
5
  SHA512:
6
- metadata.gz: bc3c92ae7b5268f02b232f04d28a2b6491740a618c9a9f3d78a099d51ed03dee489ebb805485c82552fc38d8164a4eb75a8e19a6d1df53aaac536e99287009fb
7
- data.tar.gz: 1bb1594b9cd5831455e1a7e2454932b57fdd7da4401bc2596146dc1669fa6783931df3042c9deed6a3cf032c59d8697500a470a8aeba859061868cd8ab8b4ab8
6
+ metadata.gz: ae6de2dd809642ca38276ff82e243efeb193cf432bc78aea92e772ab21ff489f23224b9e93de726dcacdb06910716f1107171433cc39e7b022ba14ee4ed284f6
7
+ data.tar.gz: 82768060a28248d459031030b6ba49b500b63a9d3ae2199ccdf1417fd3b1f66ce0d962db17875615ee36bb3b5879d8ccbbdec892942f544fa08481b4551a1003
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010-2022 Miguel Vázquez García
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -4,10 +4,11 @@ module Document::Corpus
4
4
  PUBMED_NAMESPACE="PMID"
5
5
  def add_pmid(pmid, type = nil, update = false)
6
6
  type = :abstract if type.nil?
7
- if update == false
7
+
8
+ if ! (update || Array === pmid)
8
9
  id = [PUBMED_NAMESPACE, pmid, type].collect{|e| e.to_s}*":"
9
10
  documents = self.documents(id)
10
- return documents if documents.any?
11
+ return documents.first if documents.any?
11
12
  end
12
13
 
13
14
  pmids = Array === pmid ? pmid : [pmid]
@@ -27,7 +28,14 @@ module Document::Corpus
27
28
  document
28
29
  end
29
30
 
30
- Document.setup(res)
31
+ if Array === pmid
32
+ corpus = res.first.corpus if res.first
33
+ Document.setup(res, :corpus => corpus)
34
+ else
35
+ res = res.first
36
+ end
37
+
38
+ res
31
39
  end
32
40
 
33
41
  def add_pubmed_query(query, max = 3000, type = nil)
@@ -35,8 +43,8 @@ module Document::Corpus
35
43
  add_pmid(pmids, type)
36
44
  end
37
45
 
38
- self.claim "PMID" do |id, type|
46
+ self.claim "PMID" do |id,type,update|
39
47
  Log.debug "Claiming #{id}"
40
- self.add_pmid(id, type).first
48
+ self.add_pmid(id, type,update)
41
49
  end
42
50
  end
@@ -20,9 +20,9 @@ module Document::Corpus
20
20
 
21
21
  def docids(*prefix)
22
22
  prefix = prefix * ":"
23
- prefix += ":" unless prefix == :all || prefix[-1] == ":"
23
+ prefix += ":" unless prefix == :all || prefix == "all" || prefix[-1] == ":"
24
24
  docids = self.read_and_close do
25
- prefix == :all ? self.keys : self.prefix(prefix)
25
+ prefix == "all" ? self.keys : self.prefix(prefix)
26
26
  end
27
27
  DocID.setup(docids, :corpus => self)
28
28
  end
@@ -34,7 +34,7 @@ module Document::Corpus
34
34
  def [](*args)
35
35
  docid, *rest = args
36
36
 
37
- res = self.read_and_close do
37
+ res = self.with_read do
38
38
  super(*args)
39
39
  end
40
40
 
@@ -44,7 +44,7 @@ module Document::Corpus
44
44
  namespace, id, type = docid.split(":")
45
45
 
46
46
  if res.nil?
47
- if Document::Corpus.claims.include?(namespace.to_s)
47
+ if Document::Corpus.claims && Document::Corpus.claims.include?(namespace.to_s)
48
48
  res = self.instance_exec(id, type, &Document::Corpus.claims[namespace.to_s])
49
49
  end
50
50
  end
data/lib/rbbt/document.rb CHANGED
@@ -22,9 +22,7 @@ module DocID
22
22
  if Array === self
23
23
  namespace, id, type = nil, nil, nil
24
24
  docs = self.collect do |docid|
25
- text = self.corpus[docid]
26
- namespace, id, type = docid.split(":")
27
- text
25
+ self.corpus[docid]
28
26
  end
29
27
  Document.setup(docs, :corpus => corpus)
30
28
  else
@@ -1,6 +1,5 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'libxml'
4
3
  require 'rbbt/ner/NER'
5
4
  require 'rbbt/util/log'
6
5
 
@@ -1,6 +1,5 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'libxml'
4
3
  require 'rbbt/segment'
5
4
  require 'rbbt/ner/NER'
6
5
  require 'rbbt/util/log'
@@ -172,6 +172,7 @@ class Tokenizer
172
172
 
173
173
  #{{{ Token Types
174
174
  GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"
175
+ GREEK_LETTER_RE = "(?:" + $inverse_greek.keys.select{|w| w.length == 1}.collect{|w| w.upcase}.join("|") + ")"
175
176
  def tokenize(word)
176
177
  return word.
177
178
  gsub(/([^IVX])I$/,'\1|I|'). # Separate last roman number
@@ -180,6 +181,7 @@ class Tokenizer
180
181
  gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
181
182
  gsub(/^(#{GREEK_RE})/,'\1-').
182
183
  gsub(/(#{GREEK_RE})$/,'-\1').
184
+ gsub(/(#{GREEK_LETTER_RE})$/,'-\1').
183
185
  split( /[^\w.]+/). # Split by separator char
184
186
  select{|t| !t.empty? }
185
187
  end
@@ -204,7 +206,7 @@ class Tokenizer
204
206
  end
205
207
 
206
208
  #{{{ Comparisons
207
-
209
+
208
210
  def evaluate_tokens(list1, list2)
209
211
  @operations.inject(0){|acc, o|
210
212
  acc + o.eval(list1, list2)
@@ -18,6 +18,10 @@ class Normalizer
18
18
  values.select{|p| p[1] == best}
19
19
  end
20
20
 
21
+ def token_evaluate(mention, name)
22
+ @tokens.evaluate(mention, name)
23
+ end
24
+
21
25
  # Compares the tokens and gives each candidate a score based on the
22
26
  # commonalities and differences amongst the tokens.
23
27
  def token_score(code, mention)
@@ -31,7 +35,7 @@ class Normalizer
31
35
  when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
32
36
  80
33
37
  else
34
- @tokens.evaluate(mention, name)
38
+ token_evaluate(mention, name)
35
39
  end
36
40
  [value, name]
37
41
  }.sort_by{|value, name| value }.last
@@ -153,8 +153,8 @@ module SpaCy
153
153
 
154
154
  chunk_index = Segment.index(SpaCy.chunk_segments(text, lang))
155
155
 
156
- source_id = chunk_index[source.offset].first || source.segid
157
- target_id = chunk_index[target.offset].first || target.segid
156
+ source_id = chunk_index[source.offset.to_i].first || source.segid
157
+ target_id = chunk_index[target.offset.to_i].first || target.segid
158
158
 
159
159
  path = Paths.dijkstra(graph, source_id, [target_id])
160
160
 
@@ -70,7 +70,15 @@ module Transformed
70
70
  orig_length = self.length
71
71
 
72
72
  offset = self.respond_to?(:offset) ? self.offset.to_i : 0
73
- segments = segments.select{|s| s.offset.to_i >= offset && s.offset.to_i <= offset + self.length - 1 }
73
+
74
+ segments = segments.select do |s|
75
+ shift = shift s.range
76
+ s_offset = s.offset.to_i
77
+ s_offset += shift.first if shift
78
+
79
+ s_offset >= offset &&
80
+ s_offset <= offset + self.length - 1
81
+ end
74
82
 
75
83
  Segment.clean_sort(segments).each do |segment|
76
84
  next if segment.offset.nil?
@@ -1,6 +1,7 @@
1
1
  #!/bin/bash
2
2
 
3
3
  name='OpenNLP'
4
- url="http://apache.rediris.es/opennlp/opennlp-1.9.3/apache-opennlp-1.9.3-bin.tar.gz"
4
+ url="http://apache.rediris.es/opennlp/opennlp-1.9.4/apache-opennlp-1.9.4-bin.tar.gz"
5
5
 
6
- install_jar $name $url
6
+ install_src $name $url
7
+ (cd $OPT_DIR/jars; ln -s $OPT_DIR/$name/lib/*.jar .)
@@ -7,7 +7,7 @@ class TestCorpusPubmed < Test::Unit::TestCase
7
7
  def test_add_pmid
8
8
  corpus = Document::Corpus.setup({})
9
9
 
10
- document = corpus.add_pmid("33359141", :abstract).first
10
+ document = corpus.add_pmid("33359141", :abstract, true)
11
11
  title = document.to(:title)
12
12
  assert title.include?("COVID-19")
13
13
  end
@@ -0,0 +1,11 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
+ require 'rbbt/ner/rnorm'
3
+
4
+ class TestRNorm < Test::Unit::TestCase
5
+ def test_evaluate
6
+ t = Tokenizer.new
7
+ assert t.evaluate("PDGFRA","PDGFRalpha") > 0
8
+ iii t.evaluate("JUNB","JunB")
9
+ end
10
+ end
11
+
@@ -43,4 +43,9 @@ S000000376 AAA GENE1 DDD
43
43
  def test_order
44
44
  assert_equal(["S000000375"], @norm.resolve("GENE1"))
45
45
  end
46
+
47
+ def test_token_evaluate
48
+ iii @norm.token_evaluate("PDGFRA","PDGFRalpha")
49
+ end
50
+
46
51
  end
@@ -393,43 +393,26 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
393
393
  end
394
394
  end
395
395
 
396
- def ___test_transform
397
- a = "This sentence mentions the TP53 gene and the CDK5 protein"
396
+ def test_transform_sorter_end
397
+ a = "The transcription factors farnesoid X receptor, small heterodimer partner, liver receptor homolog-1, and liver X receptor comprise the signaling cascade network that regulates the expression and secretion of apoM."
398
398
  original = a.dup
399
399
 
400
- gene1 = "TP53"
400
+ gene1 = "liver receptor homolog-1"
401
401
  gene1.extend Segment
402
402
  gene1.offset = a.index gene1
403
403
 
404
- gene2 = "CDK5"
404
+ gene2 = "apoM"
405
405
  gene2.extend Segment
406
406
  gene2.offset = a.index gene2
407
407
 
408
408
  assert_equal gene1, a[gene1.range]
409
409
  assert_equal gene2, a[gene2.range]
410
410
 
411
- c = a.dup
412
-
413
- c[gene2.range] = "GN"
414
- assert_equal c, Transformed.transform(a,[gene2], "GN")
415
- c[gene1.range] = "GN"
416
- assert_equal c, Transformed.transform(a,[gene1], "GN")
417
-
418
- iii a.transformation_offset_differences
419
- raise
420
- assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
421
- assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
422
-
423
-
424
- gene3 = "GN gene"
425
- gene3.extend Segment
426
- gene3.offset = a.index gene3
427
-
428
- assert_equal gene3, a[gene3.range]
429
-
430
- a.restore([gene3])
431
- assert_equal original, a
432
- assert_equal "TP53 gene", a[gene3.range]
411
+ Transformed.with_transform(a, [gene1], "[TF]") do
412
+ Transformed.with_transform(a, [gene2], "[TG]") do
413
+ assert_equal "The transcription factors farnesoid X receptor, small heterodimer partner, [TF], and liver X receptor comprise the signaling cascade network that regulates the expression and secretion of [TG].", a
414
+ end
415
+ end
433
416
 
434
417
  end
435
418
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.6
4
+ version: 1.3.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-06-25 00:00:00.000000000 Z
11
+ date: 2022-11-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -38,20 +38,6 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: libxml-ruby
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - ">="
46
- - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :runtime
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- version: '0'
55
41
  - !ruby/object:Gem::Dependency
56
42
  name: json
57
43
  requirement: !ruby/object:Gem::Requirement
@@ -72,8 +58,10 @@ email: miguel.vazquez@fdi.ucm.es
72
58
  executables:
73
59
  - get_ppis.rb
74
60
  extensions: []
75
- extra_rdoc_files: []
61
+ extra_rdoc_files:
62
+ - LICENSE
76
63
  files:
64
+ - LICENSE
77
65
  - bin/get_ppis.rb
78
66
  - lib/rbbt/bow/bow.rb
79
67
  - lib/rbbt/bow/dictionary.rb
@@ -139,6 +127,7 @@ files:
139
127
  - test/rbbt/document/test_annotation.rb
140
128
  - test/rbbt/document/test_corpus.rb
141
129
  - test/rbbt/entity/test_document.rb
130
+ - test/rbbt/ner/rnorm/test_tokens.rb
142
131
  - test/rbbt/ner/test_NER.rb
143
132
  - test/rbbt/ner/test_abner.rb
144
133
  - test/rbbt/ner/test_banner.rb
@@ -205,6 +194,7 @@ test_files:
205
194
  - test/rbbt/ner/test_patterns.rb
206
195
  - test/rbbt/ner/test_NER.rb
207
196
  - test/rbbt/ner/test_abner.rb
197
+ - test/rbbt/ner/rnorm/test_tokens.rb
208
198
  - test/rbbt/ner/test_rnorm.rb
209
199
  - test/rbbt/ner/test_regexpNER.rb
210
200
  - test/rbbt/ner/test_ngram_prefix_dictionary.rb