rbbt-text 1.1.9 → 1.3.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +56 -0
- data/lib/rbbt/document/annotation.rb +45 -0
- data/lib/rbbt/document/corpus.rb +61 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +42 -12
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +5 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +35 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/nlp/spaCy.rb +52 -0
- data/lib/rbbt/segment.rb +179 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -10
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +10 -8
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/share/install/software/OpenNLP +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +20 -3
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +39 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +42 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +182 -0
- data/test/test_helper.rb +5 -3
- data/test/test_spaCy.rb +32 -0
- metadata +44 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -361
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -355
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -52
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
@@ -1,49 +0,0 @@
|
|
1
|
-
require 'rbbt/annotations'
|
2
|
-
require 'rbbt/text/segment'
|
3
|
-
|
4
|
-
module Token
|
5
|
-
attr_accessor :offset, :original
|
6
|
-
|
7
|
-
def self.all_annotations
|
8
|
-
[:offset, :original]
|
9
|
-
end
|
10
|
-
|
11
|
-
def self.setup(text, start, original = nil)
|
12
|
-
text.extend Token
|
13
|
-
text.offset = start
|
14
|
-
text.original = original
|
15
|
-
text
|
16
|
-
end
|
17
|
-
|
18
|
-
def info
|
19
|
-
{:original => original, :offset => offset}
|
20
|
-
end
|
21
|
-
|
22
|
-
def id
|
23
|
-
Misc.hash2md5 info.merge :self => self
|
24
|
-
end
|
25
|
-
|
26
|
-
def end
|
27
|
-
offset + self.length - 1
|
28
|
-
end
|
29
|
-
|
30
|
-
def range
|
31
|
-
(offset..self.end)
|
32
|
-
end
|
33
|
-
|
34
|
-
def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
|
35
|
-
|
36
|
-
tokens = []
|
37
|
-
while matchdata = text.match(split_at)
|
38
|
-
tokens << Token.setup(matchdata.pre_match, start) unless matchdata.pre_match.empty?
|
39
|
-
tokens << Token.setup(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
|
40
|
-
start += matchdata.end(0)
|
41
|
-
text = matchdata.post_match
|
42
|
-
end
|
43
|
-
|
44
|
-
tokens << Token.setup(text, start) unless text.empty?
|
45
|
-
|
46
|
-
tokens
|
47
|
-
end
|
48
|
-
end
|
49
|
-
|
@@ -1,33 +0,0 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '../../../..', 'test_helper.rb')
|
2
|
-
require 'rbbt/text/document'
|
3
|
-
require 'rbbt/text/corpus'
|
4
|
-
require 'rbbt/text/corpus/sources/pmid'
|
5
|
-
|
6
|
-
class TestCorpusPMID < Test::Unit::TestCase
|
7
|
-
def setup
|
8
|
-
Log.severity = 0
|
9
|
-
Document.corpus = Corpus.new Rbbt.tmp.test.document_corpus
|
10
|
-
|
11
|
-
Corpus::Document.define :words do
|
12
|
-
words = self.text.split(" ")
|
13
|
-
Segment.align(self.text, words)
|
14
|
-
end
|
15
|
-
|
16
|
-
Corpus::Document.define :genes do
|
17
|
-
require 'rbbt/ner/banner'
|
18
|
-
Banner.new.match(self.text)
|
19
|
-
end
|
20
|
-
|
21
|
-
Corpus::Document.persist_in_global_tsv("genes")
|
22
|
-
Corpus::Document.persist_in_global_tsv(:words)
|
23
|
-
end
|
24
|
-
|
25
|
-
def test_query
|
26
|
-
docids = Document.corpus.add_pubmed_query("SARS-Cov-2", 2000, :abstract)
|
27
|
-
|
28
|
-
docids.each do |docid|
|
29
|
-
iif Document.corpus.docid(docid).text
|
30
|
-
end
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
@@ -1,52 +0,0 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
|
-
require 'rbbt/text/corpus/document'
|
3
|
-
|
4
|
-
class TestCorpusDocument < Test::Unit::TestCase
|
5
|
-
def setup
|
6
|
-
Log.severity = 0
|
7
|
-
|
8
|
-
Corpus::Document.define :words do
|
9
|
-
words = self.text.split(" ")
|
10
|
-
Segment.align(self.text, words)
|
11
|
-
end
|
12
|
-
|
13
|
-
Open.mkdir Rbbt.tmp.test.annotations.find
|
14
|
-
Corpus::Document.persist_in_global_tsv(:words, Rbbt.tmp.test.anotations.words.find)
|
15
|
-
|
16
|
-
|
17
|
-
Corpus::Document.define_multiple :words2 do |documents|
|
18
|
-
documents.collect do |doc|
|
19
|
-
words = doc.text.split(" ")
|
20
|
-
Segment.align(doc.text, words)
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
Corpus::Document.persist_in_global_tsv(:words2, Rbbt.tmp.test.anotations.counts.find)
|
25
|
-
end
|
26
|
-
|
27
|
-
def test_words
|
28
|
-
text = "This is a test document"
|
29
|
-
document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc", text)
|
30
|
-
assert_equal Segment.sort(document.words), text.split(" ")
|
31
|
-
end
|
32
|
-
|
33
|
-
def test_words_multiple
|
34
|
-
document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
|
35
|
-
document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is a another test document")
|
36
|
-
|
37
|
-
docs = [document1, document2]
|
38
|
-
|
39
|
-
Corpus::Document.prepare_multiple(docs, :words2)
|
40
|
-
|
41
|
-
assert_equal document1.words2, document1.text.split(" ")
|
42
|
-
assert_equal document2.words2, document2.text.split(" ")
|
43
|
-
|
44
|
-
document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
|
45
|
-
document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is a another test document")
|
46
|
-
|
47
|
-
docs = [document1, document2]
|
48
|
-
|
49
|
-
Corpus::Document.prepare_multiple(docs, :words2)
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
File without changes
|
@@ -1,23 +0,0 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
|
-
require 'rbbt/text/segment/segmented'
|
3
|
-
|
4
|
-
class TestClass < Test::Unit::TestCase
|
5
|
-
def test_split
|
6
|
-
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
7
|
-
|
8
|
-
gene1 = "TP53"
|
9
|
-
gene1.extend Segment
|
10
|
-
gene1.offset = a.index gene1
|
11
|
-
|
12
|
-
gene2 = "CDK5R1"
|
13
|
-
gene2.extend Segment
|
14
|
-
gene2.offset = a.index gene2
|
15
|
-
|
16
|
-
gene3 = "TP53 gene"
|
17
|
-
gene3.extend Segment
|
18
|
-
gene3.offset = a.index gene3
|
19
|
-
|
20
|
-
Segmented.setup(a, [gene2, gene1, gene3])
|
21
|
-
assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], a.split_segments
|
22
|
-
end
|
23
|
-
end
|
@@ -1,34 +0,0 @@
|
|
1
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
-
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
3
|
-
require 'test/unit'
|
4
|
-
require 'rbbt-util'
|
5
|
-
require 'rbbt/text/corpus'
|
6
|
-
|
7
|
-
class Corpus::Document
|
8
|
-
|
9
|
-
define :words do
|
10
|
-
text.split(" ")
|
11
|
-
end
|
12
|
-
end
|
13
|
-
|
14
|
-
class TestClass < Test::Unit::TestCase
|
15
|
-
def test_document
|
16
|
-
Log.severity = 0
|
17
|
-
text = "This is a test document"
|
18
|
-
|
19
|
-
docid = nil
|
20
|
-
TmpFile.with_file do |dir|
|
21
|
-
corpus = Corpus.new dir
|
22
|
-
docid = corpus.add_document text, :TEST, :test_doc
|
23
|
-
document = corpus.docid(docid)
|
24
|
-
assert_equal text, document.text
|
25
|
-
|
26
|
-
corpus = Corpus.new dir
|
27
|
-
document = corpus.docid(docid)
|
28
|
-
assert_equal text, document.text
|
29
|
-
document = corpus.find(:TEST, :test_doc).first
|
30
|
-
assert_equal text, document.text
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
@@ -1,58 +0,0 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
-
require 'rbbt/text/document'
|
3
|
-
require 'rbbt/text/corpus/sources/pmid'
|
4
|
-
|
5
|
-
class TestDocument < Test::Unit::TestCase
|
6
|
-
def setup
|
7
|
-
Log.severity = 0
|
8
|
-
Document.corpus = Corpus.new Rbbt.tmp.test.document_corpus
|
9
|
-
|
10
|
-
Corpus::Document.define :words do
|
11
|
-
words = self.text.split(" ")
|
12
|
-
Segment.align(self.text, words)
|
13
|
-
end
|
14
|
-
|
15
|
-
Corpus::Document.define :genes do
|
16
|
-
require 'rbbt/ner/banner'
|
17
|
-
Banner.new.match(self.text)
|
18
|
-
end
|
19
|
-
|
20
|
-
Corpus::Document.persist_in_global_tsv("genes")
|
21
|
-
Corpus::Document.persist_in_global_tsv(:words)
|
22
|
-
end
|
23
|
-
|
24
|
-
def test_title_and_text
|
25
|
-
document = Document.setup('PMID:32272262')
|
26
|
-
|
27
|
-
assert document.text.downcase.include?("covid")
|
28
|
-
assert_equal "High-resolution Chest CT Features and Clinical Characteristics of Patients Infected with COVID-19 in Jiangsu, China.", document.title
|
29
|
-
end
|
30
|
-
|
31
|
-
def test_full_text
|
32
|
-
document = Document.setup('PMID:4304705')
|
33
|
-
assert document.text.length < document.full_text.length
|
34
|
-
end
|
35
|
-
|
36
|
-
def test_words
|
37
|
-
document = Document.setup('PMID:32272262')
|
38
|
-
words = document.entities :words
|
39
|
-
assert words.first.respond_to?(:offset)
|
40
|
-
end
|
41
|
-
|
42
|
-
def test_genes
|
43
|
-
text = "This is a mention to TP53, a gene that should be found"
|
44
|
-
document = Document.setup(Document.corpus.add_document(text, "TEST"))
|
45
|
-
genes = document.entities :genes
|
46
|
-
|
47
|
-
assert_equal "TP53", genes.first
|
48
|
-
assert genes.first.respond_to?(:offset)
|
49
|
-
|
50
|
-
text = "This is a mention to TP53, a gene that should be found"
|
51
|
-
document = Document.setup(Document.corpus.add_document(text, "TEST"))
|
52
|
-
genes = document.entities :genes
|
53
|
-
|
54
|
-
assert_equal "TP53", genes.first
|
55
|
-
assert genes.first.respond_to?(:offset)
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
@@ -1,100 +0,0 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
-
require 'rbbt/text/segment'
|
3
|
-
|
4
|
-
class TestSegment < Test::Unit::TestCase
|
5
|
-
def test_info
|
6
|
-
a = "test"
|
7
|
-
a.extend Segment
|
8
|
-
a.offset = 10
|
9
|
-
assert a.info.include? :offset
|
10
|
-
end
|
11
|
-
|
12
|
-
def test_sort
|
13
|
-
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
14
|
-
|
15
|
-
gene1 = "TP53"
|
16
|
-
gene1.extend Segment
|
17
|
-
gene1.offset = a.index gene1
|
18
|
-
|
19
|
-
gene2 = "CDK5R1"
|
20
|
-
gene2.extend Segment
|
21
|
-
gene2.offset = a.index gene2
|
22
|
-
|
23
|
-
assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
|
24
|
-
end
|
25
|
-
|
26
|
-
def test_clean_sort
|
27
|
-
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
28
|
-
|
29
|
-
gene1 = "TP53"
|
30
|
-
gene1.extend Segment
|
31
|
-
gene1.offset = a.index gene1
|
32
|
-
|
33
|
-
gene2 = "CDK5R1"
|
34
|
-
gene2.extend Segment
|
35
|
-
gene2.offset = a.index gene2
|
36
|
-
|
37
|
-
gene3 = "TP53 gene"
|
38
|
-
gene3.extend Segment
|
39
|
-
gene3.offset = a.index gene3
|
40
|
-
|
41
|
-
assert_equal [gene3,gene2], Segment.clean_sort([gene2,gene1,gene3])
|
42
|
-
end
|
43
|
-
|
44
|
-
def test_split
|
45
|
-
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
46
|
-
|
47
|
-
gene1 = "TP53"
|
48
|
-
gene1.extend Segment
|
49
|
-
gene1.offset = a.index gene1
|
50
|
-
|
51
|
-
gene2 = "CDK5R1"
|
52
|
-
gene2.extend Segment
|
53
|
-
gene2.offset = a.index gene2
|
54
|
-
|
55
|
-
gene3 = "TP53 gene"
|
56
|
-
gene3.extend Segment
|
57
|
-
gene3.offset = a.index gene3
|
58
|
-
|
59
|
-
assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(a, [gene2,gene1,gene3])
|
60
|
-
end
|
61
|
-
|
62
|
-
|
63
|
-
def test_align
|
64
|
-
text =<<-EOF
|
65
|
-
Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.
|
66
|
-
EOF
|
67
|
-
|
68
|
-
parts = text.split(/\W/)
|
69
|
-
Segment.align(text, parts)
|
70
|
-
|
71
|
-
assert_equal "Atypical teratoid/".length, parts.select{|s| s == "rhabdoid"}.first.offset
|
72
|
-
end
|
73
|
-
|
74
|
-
def test_segment_index
|
75
|
-
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
76
|
-
|
77
|
-
gene1 = "TP53"
|
78
|
-
gene1.extend Segment
|
79
|
-
gene1.offset = a.index gene1
|
80
|
-
|
81
|
-
gene2 = "CDK5R1"
|
82
|
-
gene2.extend Segment
|
83
|
-
gene2.offset = a.index gene2
|
84
|
-
|
85
|
-
gene3 = "TP53 gene"
|
86
|
-
gene3.extend Segment
|
87
|
-
gene3.offset = a.index gene3
|
88
|
-
|
89
|
-
index = Segment.index([gene1, gene2, gene3])
|
90
|
-
assert_equal %w(CDK5R1), index[gene2.offset + 1]
|
91
|
-
|
92
|
-
TmpFile.with_file do |fwt|
|
93
|
-
index = Segment.index([gene1, gene2, gene3], fwt)
|
94
|
-
assert_equal %w(CDK5R1), index[gene2.offset + 1]
|
95
|
-
index = Segment.index([gene1, gene2, gene3], fwt)
|
96
|
-
assert_equal %w(CDK5R1), index[gene2.offset + 1]
|
97
|
-
end
|
98
|
-
end
|
99
|
-
end
|
100
|
-
|