rbbt-text 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/document.rb +46 -0
- data/lib/rbbt/document/annotation.rb +42 -0
- data/lib/rbbt/document/corpus.rb +38 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +19 -2
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +6 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +6 -6
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/segment.rb +177 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -11
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +7 -9
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +11 -1
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +13 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +40 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +12 -9
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +43 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +76 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +187 -0
- data/test/test_helper.rb +5 -3
- metadata +40 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -383
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -363
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -82
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
@@ -1,24 +0,0 @@
|
|
1
|
-
require 'rbbt/text/segment'
|
2
|
-
|
3
|
-
module Relationship
|
4
|
-
extend Annotation
|
5
|
-
self.annotation :segment
|
6
|
-
self.annotation :terms
|
7
|
-
self.annotation :type
|
8
|
-
|
9
|
-
def text
|
10
|
-
if segment
|
11
|
-
segment
|
12
|
-
else
|
13
|
-
type + ": " + terms * ", "
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
def html
|
18
|
-
text = <<-EOF
|
19
|
-
<span class='Relationship'\
|
20
|
-
>#{ self.text }</span>
|
21
|
-
EOF
|
22
|
-
text.chomp
|
23
|
-
end
|
24
|
-
end
|
@@ -1,49 +0,0 @@
|
|
1
|
-
require 'rbbt/annotations'
|
2
|
-
require 'rbbt/text/segment'
|
3
|
-
|
4
|
-
module Token
|
5
|
-
attr_accessor :offset, :original
|
6
|
-
|
7
|
-
def self.all_annotations
|
8
|
-
[:offset, :original]
|
9
|
-
end
|
10
|
-
|
11
|
-
def self.setup(text, start, original = nil)
|
12
|
-
text.extend Token
|
13
|
-
text.offset = start
|
14
|
-
text.original = original
|
15
|
-
text
|
16
|
-
end
|
17
|
-
|
18
|
-
def info
|
19
|
-
{:original => original, :offset => offset}
|
20
|
-
end
|
21
|
-
|
22
|
-
def id
|
23
|
-
Misc.hash2md5 info.merge :self => self
|
24
|
-
end
|
25
|
-
|
26
|
-
def end
|
27
|
-
offset + self.length - 1
|
28
|
-
end
|
29
|
-
|
30
|
-
def range
|
31
|
-
(offset..self.end)
|
32
|
-
end
|
33
|
-
|
34
|
-
def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
|
35
|
-
|
36
|
-
tokens = []
|
37
|
-
while matchdata = text.match(split_at)
|
38
|
-
tokens << Token.setup(matchdata.pre_match, start) unless matchdata.pre_match.empty?
|
39
|
-
tokens << Token.setup(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
|
40
|
-
start += matchdata.end(0)
|
41
|
-
text = matchdata.post_match
|
42
|
-
end
|
43
|
-
|
44
|
-
tokens << Token.setup(text, start) unless text.empty?
|
45
|
-
|
46
|
-
tokens
|
47
|
-
end
|
48
|
-
end
|
49
|
-
|
@@ -1,33 +0,0 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '../../../..', 'test_helper.rb')
|
2
|
-
require 'rbbt/text/document'
|
3
|
-
require 'rbbt/text/corpus'
|
4
|
-
require 'rbbt/text/corpus/sources/pmid'
|
5
|
-
|
6
|
-
class TestCorpusPMID < Test::Unit::TestCase
|
7
|
-
def setup
|
8
|
-
Log.severity = 0
|
9
|
-
Document.corpus = Corpus.new Rbbt.tmp.test.document_corpus
|
10
|
-
|
11
|
-
Corpus::Document.define :words do
|
12
|
-
words = self.text.split(" ")
|
13
|
-
Segment.align(self.text, words)
|
14
|
-
end
|
15
|
-
|
16
|
-
Corpus::Document.define :genes do
|
17
|
-
require 'rbbt/ner/banner'
|
18
|
-
Banner.new.match(self.text)
|
19
|
-
end
|
20
|
-
|
21
|
-
Corpus::Document.persist_in_global_tsv("genes")
|
22
|
-
Corpus::Document.persist_in_global_tsv(:words)
|
23
|
-
end
|
24
|
-
|
25
|
-
def test_query
|
26
|
-
docids = Document.corpus.add_pubmed_query("SARS-Cov-2", 2000, :abstract)
|
27
|
-
|
28
|
-
docids.each do |docid|
|
29
|
-
iif Document.corpus.docid(docid).text
|
30
|
-
end
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
@@ -1,82 +0,0 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
|
-
require 'rbbt/text/corpus/document'
|
3
|
-
|
4
|
-
class TestCorpusDocument < Test::Unit::TestCase
|
5
|
-
def setup
|
6
|
-
Log.severity = 0
|
7
|
-
|
8
|
-
Corpus::Document.define :words do
|
9
|
-
words = self.text.split(" ")
|
10
|
-
Segment.align(self.text, words)
|
11
|
-
end
|
12
|
-
|
13
|
-
Corpus::Document.define_multiple :words2 do |documents|
|
14
|
-
documents.collect do |doc|
|
15
|
-
words = doc.text.split(" ")
|
16
|
-
Segment.align(doc.text, words)
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
Open.mkdir Rbbt.tmp.test.annotations.find
|
21
|
-
|
22
|
-
Corpus::Document.persist_in_global_tsv(:words, Rbbt.tmp.test.anotations.words.find)
|
23
|
-
Corpus::Document.persist_in_global_tsv(:words2, Rbbt.tmp.test.anotations.counts.find)
|
24
|
-
end
|
25
|
-
|
26
|
-
def test_words
|
27
|
-
text = "This is a test document"
|
28
|
-
document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", text)
|
29
|
-
assert_equal Segment.sort(document.words), text.split(" ")
|
30
|
-
assert document.words.first.docid
|
31
|
-
assert document.words.first.segment_id.include?("TEST")
|
32
|
-
end
|
33
|
-
|
34
|
-
def test_words_multiple
|
35
|
-
document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
|
36
|
-
document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is another test document")
|
37
|
-
document3 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc3:3", "This is yet another test document")
|
38
|
-
|
39
|
-
docs = [document1, document2, document3]
|
40
|
-
|
41
|
-
Corpus::Document.prepare_multiple(docs, :words2)
|
42
|
-
|
43
|
-
assert document1.words.first.docid
|
44
|
-
assert document1.words.first.segment_id.include?("TEST")
|
45
|
-
|
46
|
-
assert_equal document1.words2, document1.text.split(" ")
|
47
|
-
assert_equal document2.words2, document2.text.split(" ")
|
48
|
-
assert_equal document3.words2, document3.text.split(" ")
|
49
|
-
|
50
|
-
document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
|
51
|
-
document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is another test document")
|
52
|
-
|
53
|
-
docs = [document1, document2]
|
54
|
-
|
55
|
-
Corpus::Document.prepare_multiple(docs, :words2)
|
56
|
-
end
|
57
|
-
|
58
|
-
def test_parallel
|
59
|
-
text =<<-EOF
|
60
|
-
This is a test document number
|
61
|
-
EOF
|
62
|
-
|
63
|
-
docs = []
|
64
|
-
100.times do |i|
|
65
|
-
docs << text.chomp + " " + i.to_s
|
66
|
-
end
|
67
|
-
|
68
|
-
Log.with_severity 0 do
|
69
|
-
TSV.traverse docs, :cpus => 10, :bar => true do |doc|
|
70
|
-
hash = Misc.digest(doc)
|
71
|
-
document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:test:#{hash}", doc)
|
72
|
-
assert_equal Segment.sort(document.words), document.text.split(" ")
|
73
|
-
end
|
74
|
-
TSV.traverse docs, :cpus => 10, :bar => true do |doc|
|
75
|
-
hash = Misc.digest(doc)
|
76
|
-
document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:test:#{hash}", doc)
|
77
|
-
assert_equal Segment.sort(document.words), document.text.split(" ")
|
78
|
-
end
|
79
|
-
end
|
80
|
-
end
|
81
|
-
end
|
82
|
-
|
File without changes
|
@@ -1,23 +0,0 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
|
-
require 'rbbt/text/segment/segmented'
|
3
|
-
|
4
|
-
class TestClass < Test::Unit::TestCase
|
5
|
-
def test_split
|
6
|
-
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
7
|
-
|
8
|
-
gene1 = "TP53"
|
9
|
-
gene1.extend Segment
|
10
|
-
gene1.offset = a.index gene1
|
11
|
-
|
12
|
-
gene2 = "CDK5R1"
|
13
|
-
gene2.extend Segment
|
14
|
-
gene2.offset = a.index gene2
|
15
|
-
|
16
|
-
gene3 = "TP53 gene"
|
17
|
-
gene3.extend Segment
|
18
|
-
gene3.offset = a.index gene3
|
19
|
-
|
20
|
-
Segmented.setup(a, [gene2, gene1, gene3])
|
21
|
-
assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], a.split_segments
|
22
|
-
end
|
23
|
-
end
|
@@ -1,34 +0,0 @@
|
|
1
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
-
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
3
|
-
require 'test/unit'
|
4
|
-
require 'rbbt-util'
|
5
|
-
require 'rbbt/text/corpus'
|
6
|
-
|
7
|
-
class Corpus::Document
|
8
|
-
|
9
|
-
define :words do
|
10
|
-
text.split(" ")
|
11
|
-
end
|
12
|
-
end
|
13
|
-
|
14
|
-
class TestClass < Test::Unit::TestCase
|
15
|
-
def test_document
|
16
|
-
Log.severity = 0
|
17
|
-
text = "This is a test document"
|
18
|
-
|
19
|
-
docid = nil
|
20
|
-
TmpFile.with_file do |dir|
|
21
|
-
corpus = Corpus.new dir
|
22
|
-
docid = corpus.add_document text, :TEST, :test_doc
|
23
|
-
document = corpus.docid(docid)
|
24
|
-
assert_equal text, document.text
|
25
|
-
|
26
|
-
corpus = Corpus.new dir
|
27
|
-
document = corpus.docid(docid)
|
28
|
-
assert_equal text, document.text
|
29
|
-
document = corpus.find(:TEST, :test_doc).first
|
30
|
-
assert_equal text, document.text
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
@@ -1,58 +0,0 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
-
require 'rbbt/text/document'
|
3
|
-
require 'rbbt/text/corpus/sources/pmid'
|
4
|
-
|
5
|
-
class TestDocument < Test::Unit::TestCase
|
6
|
-
def setup
|
7
|
-
Log.severity = 0
|
8
|
-
Document.corpus = Corpus.new Rbbt.tmp.test.document_corpus
|
9
|
-
|
10
|
-
Corpus::Document.define :words do
|
11
|
-
words = self.text.split(" ")
|
12
|
-
Segment.align(self.text, words)
|
13
|
-
end
|
14
|
-
|
15
|
-
Corpus::Document.define :genes do
|
16
|
-
require 'rbbt/ner/banner'
|
17
|
-
Banner.new.match(self.text)
|
18
|
-
end
|
19
|
-
|
20
|
-
Corpus::Document.persist_in_global_tsv("genes")
|
21
|
-
Corpus::Document.persist_in_global_tsv(:words)
|
22
|
-
end
|
23
|
-
|
24
|
-
def test_title_and_text
|
25
|
-
document = Document.setup('PMID:32272262')
|
26
|
-
|
27
|
-
assert document.text.downcase.include?("covid")
|
28
|
-
assert_equal "High-resolution Chest CT Features and Clinical Characteristics of Patients Infected with COVID-19 in Jiangsu, China.", document.title
|
29
|
-
end
|
30
|
-
|
31
|
-
def test_full_text
|
32
|
-
document = Document.setup('PMID:4304705')
|
33
|
-
assert document.text.length < document.full_text.length
|
34
|
-
end
|
35
|
-
|
36
|
-
def test_words
|
37
|
-
document = Document.setup('PMID:32272262')
|
38
|
-
words = document.entities :words
|
39
|
-
assert words.first.respond_to?(:offset)
|
40
|
-
end
|
41
|
-
|
42
|
-
def test_genes
|
43
|
-
text = "This is a mention to TP53, a gene that should be found"
|
44
|
-
document = Document.setup(Document.corpus.add_document(text, "TEST"))
|
45
|
-
genes = document.entities :genes
|
46
|
-
|
47
|
-
assert_equal "TP53", genes.first
|
48
|
-
assert genes.first.respond_to?(:offset)
|
49
|
-
|
50
|
-
text = "This is a mention to TP53, a gene that should be found"
|
51
|
-
document = Document.setup(Document.corpus.add_document(text, "TEST"))
|
52
|
-
genes = document.entities :genes
|
53
|
-
|
54
|
-
assert_equal "TP53", genes.first
|
55
|
-
assert genes.first.respond_to?(:offset)
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
@@ -1,100 +0,0 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
-
require 'rbbt/text/segment'
|
3
|
-
|
4
|
-
class TestSegment < Test::Unit::TestCase
|
5
|
-
def test_info
|
6
|
-
a = "test"
|
7
|
-
a.extend Segment
|
8
|
-
a.offset = 10
|
9
|
-
assert a.info.include? :offset
|
10
|
-
end
|
11
|
-
|
12
|
-
def test_sort
|
13
|
-
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
14
|
-
|
15
|
-
gene1 = "TP53"
|
16
|
-
gene1.extend Segment
|
17
|
-
gene1.offset = a.index gene1
|
18
|
-
|
19
|
-
gene2 = "CDK5R1"
|
20
|
-
gene2.extend Segment
|
21
|
-
gene2.offset = a.index gene2
|
22
|
-
|
23
|
-
assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
|
24
|
-
end
|
25
|
-
|
26
|
-
def test_clean_sort
|
27
|
-
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
28
|
-
|
29
|
-
gene1 = "TP53"
|
30
|
-
gene1.extend Segment
|
31
|
-
gene1.offset = a.index gene1
|
32
|
-
|
33
|
-
gene2 = "CDK5R1"
|
34
|
-
gene2.extend Segment
|
35
|
-
gene2.offset = a.index gene2
|
36
|
-
|
37
|
-
gene3 = "TP53 gene"
|
38
|
-
gene3.extend Segment
|
39
|
-
gene3.offset = a.index gene3
|
40
|
-
|
41
|
-
assert_equal [gene3,gene2], Segment.clean_sort([gene2,gene1,gene3])
|
42
|
-
end
|
43
|
-
|
44
|
-
def test_split
|
45
|
-
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
46
|
-
|
47
|
-
gene1 = "TP53"
|
48
|
-
gene1.extend Segment
|
49
|
-
gene1.offset = a.index gene1
|
50
|
-
|
51
|
-
gene2 = "CDK5R1"
|
52
|
-
gene2.extend Segment
|
53
|
-
gene2.offset = a.index gene2
|
54
|
-
|
55
|
-
gene3 = "TP53 gene"
|
56
|
-
gene3.extend Segment
|
57
|
-
gene3.offset = a.index gene3
|
58
|
-
|
59
|
-
assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(a, [gene2,gene1,gene3])
|
60
|
-
end
|
61
|
-
|
62
|
-
|
63
|
-
def test_align
|
64
|
-
text =<<-EOF
|
65
|
-
Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.
|
66
|
-
EOF
|
67
|
-
|
68
|
-
parts = text.split(/\W/)
|
69
|
-
Segment.align(text, parts)
|
70
|
-
|
71
|
-
assert_equal "Atypical teratoid/".length, parts.select{|s| s == "rhabdoid"}.first.offset
|
72
|
-
end
|
73
|
-
|
74
|
-
def test_segment_index
|
75
|
-
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
76
|
-
|
77
|
-
gene1 = "TP53"
|
78
|
-
gene1.extend Segment
|
79
|
-
gene1.offset = a.index gene1
|
80
|
-
|
81
|
-
gene2 = "CDK5R1"
|
82
|
-
gene2.extend Segment
|
83
|
-
gene2.offset = a.index gene2
|
84
|
-
|
85
|
-
gene3 = "TP53 gene"
|
86
|
-
gene3.extend Segment
|
87
|
-
gene3.offset = a.index gene3
|
88
|
-
|
89
|
-
index = Segment.index([gene1, gene2, gene3])
|
90
|
-
assert_equal %w(CDK5R1), index[gene2.offset + 1]
|
91
|
-
|
92
|
-
TmpFile.with_file do |fwt|
|
93
|
-
index = Segment.index([gene1, gene2, gene3], fwt)
|
94
|
-
assert_equal %w(CDK5R1), index[gene2.offset + 1]
|
95
|
-
index = Segment.index([gene1, gene2, gene3], fwt)
|
96
|
-
assert_equal %w(CDK5R1), index[gene2.offset + 1]
|
97
|
-
end
|
98
|
-
end
|
99
|
-
end
|
100
|
-
|