rbbt-text 1.1.8 → 1.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +3 -3
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +2 -2
- data/lib/rbbt/ner/chemical_tagger.rb +1 -1
- data/lib/rbbt/ner/linnaeus.rb +1 -1
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +2 -2
- data/lib/rbbt/ner/oscar3.rb +1 -1
- data/lib/rbbt/ner/oscar4.rb +1 -1
- data/lib/rbbt/ner/patterns.rb +4 -4
- data/lib/rbbt/ner/regexpNER.rb +1 -1
- data/lib/rbbt/ner/token_trieNER.rb +2 -2
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
- data/lib/rbbt/nlp/nlp.rb +2 -2
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +1 -1
- data/lib/rbbt/{corpus → text}/corpus.rb +51 -11
- data/lib/rbbt/text/corpus/document.rb +361 -0
- data/lib/rbbt/text/corpus/document_repo.rb +68 -0
- data/lib/rbbt/text/corpus/sources/pmid.rb +34 -0
- data/lib/rbbt/text/document.rb +39 -0
- data/lib/rbbt/{ner → text}/segment.rb +11 -6
- data/lib/rbbt/{ner → text}/segment/docid.rb +1 -1
- data/lib/rbbt/{ner → text}/segment/named_entity.rb +2 -2
- data/lib/rbbt/{ner → text}/segment/relationship.rb +1 -1
- data/lib/rbbt/{ner → text}/segment/segmented.rb +1 -1
- data/lib/rbbt/{ner → text}/segment/token.rb +1 -1
- data/lib/rbbt/{ner → text}/segment/transformed.rb +47 -42
- data/test/rbbt/entity/test_document.rb +1 -0
- data/test/rbbt/ner/test_abner.rb +1 -0
- data/test/rbbt/ner/test_linnaeus.rb +1 -0
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +0 -1
- data/test/rbbt/text/corpus/sources/test_pmid.rb +33 -0
- data/test/rbbt/text/corpus/test_document.rb +52 -0
- data/test/rbbt/{ner → text}/segment/test_named_entity.rb +2 -2
- data/test/rbbt/{ner → text}/segment/test_relationship.rb +0 -0
- data/test/rbbt/{ner → text}/segment/test_segmented.rb +1 -1
- data/test/rbbt/{ner → text}/segment/test_transformed.rb +96 -3
- data/test/rbbt/text/test_corpus.rb +34 -0
- data/test/rbbt/text/test_document.rb +58 -0
- data/test/rbbt/{ner → text}/test_segment.rb +2 -2
- data/test/test_helper.rb +3 -3
- metadata +32 -24
- data/lib/rbbt/corpus/document.rb +0 -266
- data/lib/rbbt/corpus/document_repo.rb +0 -137
- data/lib/rbbt/corpus/sources/pubmed.rb +0 -27
- data/lib/rbbt/entity/document.rb +0 -75
data/test/rbbt/text/corpus/test_document.rb
@@ -0,0 +1,52 @@
+require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
+require 'rbbt/text/corpus/document'
+
+class TestCorpusDocument < Test::Unit::TestCase
+  def setup
+    Log.severity = 0
+
+    Corpus::Document.define :words do
+      words = self.text.split(" ")
+      Segment.align(self.text, words)
+    end
+
+    Open.mkdir Rbbt.tmp.test.annotations.find
+    Corpus::Document.persist_in_global_tsv(:words, Rbbt.tmp.test.anotations.words.find)
+
+
+    Corpus::Document.define_multiple :words2 do |documents|
+      documents.collect do |doc|
+        words = doc.text.split(" ")
+        Segment.align(doc.text, words)
+      end
+    end
+
+    Corpus::Document.persist_in_global_tsv(:words2, Rbbt.tmp.test.anotations.counts.find)
+  end
+
+  def test_words
+    text = "This is a test document"
+    document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc", text)
+    assert_equal Segment.sort(document.words), text.split(" ")
+  end
+
+  def test_words_multiple
+    document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
+    document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is a another test document")
+
+    docs = [document1, document2]
+
+    Corpus::Document.prepare_multiple(docs, :words2)
+
+    assert_equal document1.words2, document1.text.split(" ")
+    assert_equal document2.words2, document2.text.split(" ")
+
+    document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
+    document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is a another test document")
+
+    docs = [document1, document2]
+
+    Corpus::Document.prepare_multiple(docs, :words2)
+  end
+end
+
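The new test above exercises the Corpus::Document annotation workflow: Corpus::Document.define registers a block that produces segments for one document, define_multiple produces them for a batch, persist_in_global_tsv caches the results in a shared TSV, and prepare_multiple fills that cache for several documents at once so the later accessor calls (document.words2) read it back. A minimal sketch of the single-document pattern, reusing only calls that appear in the test (the temporary paths and docid are illustrative):

    require 'rbbt/text/corpus/document'

    # Register a producer for the :words annotation, as in the test above.
    Corpus::Document.define :words do
      words = self.text.split(" ")
      Segment.align(self.text, words)
    end

    # Cache :words annotations in a shared TSV (location chosen for the example).
    Corpus::Document.persist_in_global_tsv(:words, Rbbt.tmp.example.annotations.words.find)

    doc = Corpus::Document.new(Rbbt.tmp.example.persist, "TEST:example_doc", "This is a test document")
    doc.words   # produced by the block on first use, then served from the cached TSV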
data/test/rbbt/{ner → text}/segment/test_named_entity.rb
@@ -1,6 +1,6 @@
 require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
-require 'rbbt/
-require 'rbbt/
+require 'rbbt/text/segment'
+require 'rbbt/text/segment/named_entity'
 
 class TestClass < Test::Unit::TestCase
   def test_info
data/test/rbbt/{ner → text}/segment/test_relationship.rb
File without changes
data/test/rbbt/{ner → text}/segment/test_transformed.rb
@@ -1,6 +1,6 @@
 require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
-require 'rbbt/
-require 'rbbt/
+require 'rbbt/text/segment/transformed'
+require 'rbbt/text/segment/named_entity'
 require 'rexml/document'
 
 class TestClass < Test::Unit::TestCase
@@ -98,7 +98,6 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
 
     assert_equal original, a
 
-
     assert_equal original, a
 
     exp1, exp2 = nil, nil
@@ -286,5 +285,99 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
   end
 
   end
+
+  def test_by_sentence
+    a = "This is a first sentences. ILF can bind to purine-rich regulatory motifs such as the human T-cell leukemia virus-long terminal region and the interleukin-2 promoter."
+
+    sentence_pos = a.index('.')+2
+    sentence = a[sentence_pos..-1]
+    Segment.setup sentence, sentence_pos
+
+    gene1 = "ILF"
+    gene1.extend NamedEntity
+    gene1.offset = a.index gene1
+    gene1.type = "Gene"
+
+    Transformed.with_transform(sentence, [gene1], "[G]") do
+      assert_equal sentence.sub("ILF", "[G]"), sentence
+    end
+  end
+
+  def test_collisions
+    text =<<-EOF.chomp
+This is another sentence. Protein (nsp1), helicase (nsp13).
+    EOF
+
+    sentence_pos = text.index(".") + 2
+    sentence = Segment.setup(text[sentence_pos..-1], sentence_pos)
+
+    viral = %w(nsp1 nsp13)
+    human = %w(helicase)
+
+    viral = viral.collect do |e|
+      next unless text.index(e)
+      NamedEntity.setup(e, text.index(e), "VirGene")
+    end.compact
+
+    human = human.collect do |e|
+      next unless text.index(e)
+      NamedEntity.setup(e, text.index(e), "HumGene")
+    end
+
+    clean = human.reject{|s| s.overlaps(viral).any?}
+
+    Transformed.with_transform(sentence, viral, Proc.new{|e| "[VIRAL=#{e}]"}) do
+      assert_equal sentence, "Protein ([VIRAL=nsp1]), helicase ([VIRAL=nsp13])."
+      Transformed.with_transform(sentence, clean, Proc.new{|e| "[HUMAN=#{e}]"}) do
+        assert_equal sentence, "Protein ([VIRAL=nsp1]), [HUMAN=helicase] ([VIRAL=nsp13])."
+      end
+    end
+  end
+
+
+  def test_collisions2
+    text =<<-EOF.chomp
+This is another sentence. Among the nonstructural proteins, the leader protein (nsp1), the papain-like protease (nsp3), the nsp4, the 3C-like protease (nsp5), the nsp7, the nsp8, the nsp9, the nsp10, the RNA-directed RNA polymerase (nsp12), the helicase (nsp13), the guanine-N7 methyltransferase (nsp14), the uridylate-specific endoribonuclease (nsp15), the 2'-O-methyltransferase (nsp16), and the ORF7a protein could be built on the basis of homology templates.
+    EOF
+
+    sentence_pos = text.index(".") + 2
+    sentence = Segment.setup(text[sentence_pos..-1], sentence_pos)
+
+    target = sentence.dup
+
+    viral = %w(nsp1 nsp4 nsp5 nsp7 nsp8 nsp9 nsp10 nsp12 nsp13 nsp14 nsp15 ORF7a spike)
+    human = %w(helicase nsp5 nsp4 nsp3)
+
+    viral = viral.collect do |e|
+      next unless text.index(e)
+      NamedEntity.setup(e, text.index(e), "VirGene")
+    end.compact
+
+    human = human.collect do |e|
+      next unless text.index(e)
+      NamedEntity.setup(e, text.index(e), "HumGene")
+    end
+
+    clean = human.reject{|s| s.overlaps(viral).any?}
+
+    tag = Misc.digest("TAG")
+
+    viral.each do |e|
+      target.gsub!(/\b#{e}\b/, "[VIRAL=#{e}-#{tag}]")
+    end
+
+    target_tmp = target.dup
+
+    clean.each do |e|
+      target.gsub!(/\b#{e}\b/, "[HUMAN=#{e}-#{tag}]")
+    end
+
+    Transformed.with_transform(sentence, viral, Proc.new{|e| "[VIRAL=#{e}-#{tag}]"}) do
+      assert_equal sentence, target_tmp
+      Transformed.with_transform(sentence, clean, Proc.new{|e| "[HUMAN=#{e}-#{tag}]"}) do
+        assert_equal sentence, target
+      end
+    end
+  end
 end
 
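The added tests above pin down two behaviours of Transformed.with_transform: inside the block the segment text is rewritten in place, so nested transforms compose and the assertions compare against the substituted string, while the earlier assertions in this file (assert_equal original, a) check that the text is restored once the block exits. Overlap handling is left to the caller, which is why the tests drop human entities that collide with already-transformed viral ones via reject{|s| s.overlaps(viral).any?}. A condensed sketch of that nesting pattern, using only calls that appear in the tests (the sentence and tags are illustrative, not taken from the package):

    require 'rbbt/text/segment/transformed'
    require 'rbbt/text/segment/named_entity'

    sentence = Segment.setup("Protein (nsp1), helicase (nsp13).", 0)

    viral = %w(nsp1 nsp13).collect{|e| NamedEntity.setup(e, sentence.index(e), "VirGene") }
    human = [NamedEntity.setup("helicase", sentence.index("helicase"), "HumGene")]
    human = human.reject{|s| s.overlaps(viral).any? }   # discard collisions before transforming

    Transformed.with_transform(sentence, viral, Proc.new{|e| "[VIRAL=#{e}]" }) do
      Transformed.with_transform(sentence, human, Proc.new{|e| "[HUMAN=#{e}]" }) do
        puts sentence   # both tag sets applied, as in the assertions above
      end
    end
    # the wrapped text is restored when the blocks exit (cf. assert_equal original, a)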
data/test/rbbt/text/test_corpus.rb
@@ -0,0 +1,34 @@
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+$LOAD_PATH.unshift(File.dirname(__FILE__))
+require 'test/unit'
+require 'rbbt-util'
+require 'rbbt/text/corpus'
+
+class Corpus::Document
+
+  define :words do
+    text.split(" ")
+  end
+end
+
+class TestClass < Test::Unit::TestCase
+  def test_document
+    Log.severity = 0
+    text = "This is a test document"
+
+    docid = nil
+    TmpFile.with_file do |dir|
+      corpus = Corpus.new dir
+      docid = corpus.add_document text, :TEST, :test_doc
+      document = corpus.docid(docid)
+      assert_equal text, document.text
+
+      corpus = Corpus.new dir
+      document = corpus.docid(docid)
+      assert_equal text, document.text
+      document = corpus.find(:TEST, :test_doc).first
+      assert_equal text, document.text
+    end
+  end
+end
+
data/test/rbbt/text/test_document.rb
@@ -0,0 +1,58 @@
+require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
+require 'rbbt/text/document'
+require 'rbbt/text/corpus/sources/pmid'
+
+class TestDocument < Test::Unit::TestCase
+  def setup
+    Log.severity = 0
+    Document.corpus = Corpus.new Rbbt.tmp.test.document_corpus
+
+    Corpus::Document.define :words do
+      words = self.text.split(" ")
+      Segment.align(self.text, words)
+    end
+
+    Corpus::Document.define :genes do
+      require 'rbbt/ner/banner'
+      Banner.new.match(self.text)
+    end
+
+    Corpus::Document.persist_in_global_tsv("genes")
+    Corpus::Document.persist_in_global_tsv(:words)
+  end
+
+  def test_title_and_text
+    document = Document.setup('PMID:32272262')
+
+    assert document.text.downcase.include?("covid")
+    assert_equal "High-resolution Chest CT Features and Clinical Characteristics of Patients Infected with COVID-19 in Jiangsu, China.", document.title
+  end
+
+  def test_full_text
+    document = Document.setup('PMID:4304705')
+    assert document.text.length < document.full_text.length
+  end
+
+  def test_words
+    document = Document.setup('PMID:32272262')
+    words = document.entities :words
+    assert words.first.respond_to?(:offset)
+  end
+
+  def test_genes
+    text = "This is a mention to TP53, a gene that should be found"
+    document = Document.setup(Document.corpus.add_document(text, "TEST"))
+    genes = document.entities :genes
+
+    assert_equal "TP53", genes.first
+    assert genes.first.respond_to?(:offset)
+
+    text = "This is a mention to TP53, a gene that should be found"
+    document = Document.setup(Document.corpus.add_document(text, "TEST"))
+    genes = document.entities :genes
+
+    assert_equal "TP53", genes.first
+    assert genes.first.respond_to?(:offset)
+  end
+end
+
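test_document.rb shows how the new Document entity (lib/rbbt/text/document.rb) fronts the corpus: a shared corpus is attached with Document.corpus=, a document is addressed either by a PubMed id ('PMID:...') resolved through rbbt/text/corpus/sources/pmid or by the docid returned from add_document, and document.entities :name runs whatever producer was registered with Corpus::Document.define. A minimal sketch of that flow, limited to calls that appear in the test (corpus location and text are illustrative):

    require 'rbbt/text/document'
    require 'rbbt/text/corpus/sources/pmid'

    Document.corpus = Corpus.new Rbbt.tmp.example.document_corpus

    # Same :words producer as in the tests above.
    Corpus::Document.define :words do
      words = self.text.split(" ")
      Segment.align(self.text, words)
    end

    docid = Document.corpus.add_document("This is an example document", "TEST")
    document = Document.setup(docid)

    document.text              # the stored text
    document.entities :words   # segments produced by the :words definition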
data/test/rbbt/{ner → text}/test_segment.rb
@@ -1,7 +1,7 @@
 require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
-require 'rbbt/
+require 'rbbt/text/segment'
 
-class
+class TestSegment < Test::Unit::TestCase
   def test_info
     a = "test"
     a.extend Segment
data/test/test_helper.rb
CHANGED
@@ -6,7 +6,7 @@ require 'rbbt'
 require 'rbbt/persist'
 require 'rbbt/util/tmpfile'
 require 'rbbt/util/log'
-require 'rbbt/corpus
+require 'rbbt/text/corpus'
 
 class Test::Unit::TestCase
   def get_test_datafile(file)
@@ -22,8 +22,8 @@ class Test::Unit::TestCase
     FileUtils.rm_rf Rbbt.tmp.test.find :user
     Persist::CONNECTIONS.values.each do |c| c.close end
     Persist::CONNECTIONS.clear
-    DocumentRepo::TC_CONNECTIONS.values.each do |c| c.close end
-    DocumentRepo::TC_CONNECTIONS.clear
+    Corpus::DocumentRepo::TC_CONNECTIONS.values.each do |c| c.close end
+    Corpus::DocumentRepo::TC_CONNECTIONS.clear
   end
 
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-text
 version: !ruby/object:Gem::Version
-  version: 1.1.
+  version: 1.1.9
 platform: ruby
 authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-
+date: 2020-04-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util
@@ -78,11 +78,6 @@ files:
 - lib/rbbt/bow/bow.rb
 - lib/rbbt/bow/dictionary.rb
 - lib/rbbt/bow/misc.rb
-- lib/rbbt/corpus/corpus.rb
-- lib/rbbt/corpus/document.rb
-- lib/rbbt/corpus/document_repo.rb
-- lib/rbbt/corpus/sources/pubmed.rb
-- lib/rbbt/entity/document.rb
 - lib/rbbt/ner/NER.rb
 - lib/rbbt/ner/abner.rb
 - lib/rbbt/ner/banner.rb
@@ -99,17 +94,22 @@ files:
 - lib/rbbt/ner/rnorm.rb
 - lib/rbbt/ner/rnorm/cue_index.rb
 - lib/rbbt/ner/rnorm/tokens.rb
-- lib/rbbt/ner/segment.rb
-- lib/rbbt/ner/segment/docid.rb
-- lib/rbbt/ner/segment/named_entity.rb
-- lib/rbbt/ner/segment/relationship.rb
-- lib/rbbt/ner/segment/segmented.rb
-- lib/rbbt/ner/segment/token.rb
-- lib/rbbt/ner/segment/transformed.rb
 - lib/rbbt/ner/token_trieNER.rb
 - lib/rbbt/nlp/genia/sentence_splitter.rb
 - lib/rbbt/nlp/nlp.rb
 - lib/rbbt/nlp/open_nlp/sentence_splitter.rb
+- lib/rbbt/text/corpus.rb
+- lib/rbbt/text/corpus/document.rb
+- lib/rbbt/text/corpus/document_repo.rb
+- lib/rbbt/text/corpus/sources/pmid.rb
+- lib/rbbt/text/document.rb
+- lib/rbbt/text/segment.rb
+- lib/rbbt/text/segment/docid.rb
+- lib/rbbt/text/segment/named_entity.rb
+- lib/rbbt/text/segment/relationship.rb
+- lib/rbbt/text/segment/segmented.rb
+- lib/rbbt/text/segment/token.rb
+- lib/rbbt/text/segment/transformed.rb
 - share/install/software/ABNER
 - share/install/software/BANNER
 - share/install/software/ChemicalTagger
@@ -129,10 +129,6 @@ files:
 - test/rbbt/bow/test_dictionary.rb
 - test/rbbt/bow/test_misc.rb
 - test/rbbt/entity/test_document.rb
-- test/rbbt/ner/segment/test_named_entity.rb
-- test/rbbt/ner/segment/test_relationship.rb
-- test/rbbt/ner/segment/test_segmented.rb
-- test/rbbt/ner/segment/test_transformed.rb
 - test/rbbt/ner/test_NER.rb
 - test/rbbt/ner/test_abner.rb
 - test/rbbt/ner/test_banner.rb
@@ -146,11 +142,19 @@ files:
 - test/rbbt/ner/test_patterns.rb
 - test/rbbt/ner/test_regexpNER.rb
 - test/rbbt/ner/test_rnorm.rb
-- test/rbbt/ner/test_segment.rb
 - test/rbbt/ner/test_token_trieNER.rb
 - test/rbbt/nlp/genia/test_sentence_splitter.rb
 - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
 - test/rbbt/nlp/test_nlp.rb
+- test/rbbt/text/corpus/sources/test_pmid.rb
+- test/rbbt/text/corpus/test_document.rb
+- test/rbbt/text/segment/test_named_entity.rb
+- test/rbbt/text/segment/test_relationship.rb
+- test/rbbt/text/segment/test_segmented.rb
+- test/rbbt/text/segment/test_transformed.rb
+- test/rbbt/text/test_corpus.rb
+- test/rbbt/text/test_document.rb
+- test/rbbt/text/test_segment.rb
 - test/test_helper.rb
 homepage: http://github.com/mikisvaz/rbbt-util
 licenses: []
@@ -178,6 +182,15 @@ test_files:
 - test/rbbt/nlp/test_nlp.rb
 - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
 - test/rbbt/nlp/genia/test_sentence_splitter.rb
+- test/rbbt/text/test_document.rb
+- test/rbbt/text/corpus/sources/test_pmid.rb
+- test/rbbt/text/corpus/test_document.rb
+- test/rbbt/text/test_segment.rb
+- test/rbbt/text/test_corpus.rb
+- test/rbbt/text/segment/test_transformed.rb
+- test/rbbt/text/segment/test_relationship.rb
+- test/rbbt/text/segment/test_named_entity.rb
+- test/rbbt/text/segment/test_segmented.rb
 - test/rbbt/bow/test_bow.rb
 - test/rbbt/bow/test_misc.rb
 - test/rbbt/bow/test_dictionary.rb
@@ -194,11 +207,6 @@ test_files:
 - test/rbbt/ner/test_banner.rb
 - test/rbbt/ner/test_token_trieNER.rb
 - test/rbbt/ner/test_finder.rb
-- test/rbbt/ner/test_segment.rb
 - test/rbbt/ner/test_linnaeus.rb
-- test/rbbt/ner/segment/test_transformed.rb
-- test/rbbt/ner/segment/test_relationship.rb
-- test/rbbt/ner/segment/test_named_entity.rb
-- test/rbbt/ner/segment/test_segmented.rb
 - test/rbbt/ner/test_oscar4.rb
 - test/test_helper.rb
data/lib/rbbt/corpus/document.rb
DELETED
@@ -1,266 +0,0 @@
-require 'rbbt/ner/segment'
-require 'rbbt/ner/segment/segmented'
-require 'rbbt/tsv'
-require 'rbbt/resource/path'
-require 'rbbt/persist/tsv'
-require 'rbbt/util/misc'
-require 'json'
-
-class Document
-
-  attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indices, :persist_dir, :global_persistence
-  def initialize(persist_dir = nil, docid = nil, text = nil, global_persistence = nil)
-    @segments = {}
-    @segment_indices = {}
-
-    if not persist_dir.nil?
-      @persist_dir = persist_dir
-      @persist_dir = Path.setup(@persist_dir) if not Path == @persist_dir
-    end
-
-    @global_persistence = global_persistence
-
-    if not docid.nil?
-      @docid = docid
-      update_docid
-    end
-    @text = text unless text.nil?
-  end
-
-  def update_docid
-    @namespace, @id, @type, @hash = docid.split(":", -1)
-  end
-
-  def docid=(docid)
-    @docid = docid
-    update_docid
-  end
-
-  #{{{ PERSISTENCE
-
-  TSV_REPOS = {}
-  FIELDS_FOR_ENTITY_PERSISTENCE = {}
-  def self.persist(entity, fields = nil)
-
-    if not fields.nil?
-      fields = [fields] if not Array === fields
-      fields = fields.collect{|f| f.to_s}
-      FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
-    end
-
-    self.class_eval <<-EOC
-      def load_with_persistence_#{entity}(raw = false)
-        fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
-
-        tsv_file = File.join(@persist_dir.find, "#{ entity }")
-
-        return nil if raw == :check and File.exists? tsv_file
-
-        annotations = Persist.persist("Entity[#{ entity }]", :tsv, :file => tsv_file) do
-          segments = produce_#{entity}
-          tsv = Segment.tsv(segments, fields)
-        end
-
-        return annotations if raw
-
-        annotations.unnamed = true
-        annotations.collect{|id, annotation|
-          Segment.load_tsv_values(text, annotation, annotations.fields)
-        }
-      end
-    EOC
-  end
-
-  def self.persist_in_tsv(entity, tsv = nil, fields = nil)
-    if not tsv.nil? and not tsv.respond_to?(:keys)
-      fields = tsv
-      tsv = nil
-    end
-
-    TSV_REPOS[entity.to_s] = tsv
-
-    if not fields.nil?
-      fields = [fields] if not Array === fields
-      fields = fields.collect{|f| f.to_s}
-      FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
-    end
-
-    self.class_eval <<-EOC
-      def load_with_persistence_#{entity}(raw = false)
-        repo = TSV_REPOS["#{ entity }"]
-        if repo.nil?
-          raise "No persistence file or persistence dir for persist_in_tsv" if persist_dir.nil?
-          repo = Persist.open_tokyocabinet(persist_dir.annotations_by_type.find, true, :marshal_tsv)
-        end
-
-        fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
-        if not repo.include? "#{ entity }"
-          segments = produce_#{entity}
-          repo.write
-          repo["#{entity}"] = Segment.tsv(segments, fields)
-          repo.read
-        else
-          if raw == :check
-            repo.close
-            return nil
-          end
-        end
-
-
-        annotations = repo["#{entity}"]
-
-        repo.close
-
-
-        return annotations if raw
-
-        annotations.unnamed = true
-        annotations.collect{|id, annotation|
-          Segment.load_tsv_values(text, annotation, annotations.fields)
-        }
-      end
-    EOC
-  end
-
-  def self.persist_in_global_tsv(entity, tsv = nil, fields = nil, doc_field = nil, entity_field = nil)
-    doc_field ||= "Document ID"
-    entity_field ||= "Entity Type"
-
-    TSV_REPOS[entity.to_s] = tsv
-
-    if not fields.nil?
-      fields = [fields] if not Array === fields
-      fields = fields.collect{|f| f.to_s}
-    else
-      fields = nil
-    end
-
-    FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
-
-    self.class_eval <<-EOC
-      def load_with_persistence_#{entity}(raw = false)
-        fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
-
-        data = TSV_REPOS["#{ entity }"] || @global_persistence
-
-        data.read true
-
-        fields = data.fields if fields.nil? and data.respond_to? :fields
-
-
-        if data.respond_to? :persistence_path and String === data.persistence_path
-          data.filter(data.persistence_path + '.filters')
-        end
-        data.add_filter("field:#{ doc_field }", @docid)
-        data.add_filter("field:#{ entity_field }", "#{ entity }")
-        keys = data.keys
-        data.pop_filter
-        data.pop_filter
-
-        if keys.empty?
-          segments = produce_#{entity}
-          segments << Segment.setup("No #{entity} found in document " + @docid.to_s, -1) if segments.empty?
-          tsv = Segment.tsv(segments, *fields.reject{|f| ["#{doc_field}", "#{entity_field}", "Start", "End", "annotation_types"].include? f})
-
-          tsv.add_field "#{ doc_field }" do
-            @docid
-          end
-
-          tsv.add_field "#{ entity_field }" do
-            "#{ entity }"
-          end
-
-          data.add_filter("field:#{ doc_field }", @docid)
-          data.add_filter("field:#{ entity_field }", "#{ entity }")
-          data.write true
-          keys = tsv.collect do |key, value|
-            data[key] = value
-            key
-          end
-          data.pop_filter
-          data.pop_filter
-          data.read
-
-        else
-          if raw == :check
-            data.close
-            return nil
-          end
-        end
-
-        return data.values if raw
-
-        start_pos = data.identify_field "Start"
-        segments = data.values_at(*keys).collect{|annotation|
-          pos = annotation[start_pos]
-          Segment.load_tsv_values(text, annotation, data.fields) unless [-1, "-1", [-1], ["-1"]].include? pos
-        }.compact
-        data.close
-
-        segments
-      end
-    EOC
-  end
-
-
-  def self.define(entity, &block)
-    send :define_method, "produce_#{entity}", &block
-
-    self.class_eval <<-EOC
-      def load_#{entity}(raw = false)
-        return if segments.include? "#{ entity }"
-        if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
-          segments["#{entity}"] = load_with_persistence_#{entity}(raw)
-        else
-          segments["#{ entity }"] = produce_#{entity}
-        end
-      end
-
-      def #{entity}(raw = false)
-        begin
-          entities = segments["#{ entity }"]
-          if entities.nil?
-            load_#{entity}(raw)
-            entities = segments["#{ entity }"]
-          end
-        end
-
-        entities
-      end
-
-      def #{entity}_at(pos, persist = false)
-        segment_index("#{ entity }", persist ? File.join(@persist_dir, 'ranges') : nil)[pos]
-      end
-
-    EOC
-  end
-
-  def segment_index(name, persist_dir = nil)
-    @segment_indices[name] ||= Segment.index(self.send(name), persist_dir.nil? ? :memory : File.join(persist_dir, name + '.range'))
-  end
-
-  def load_into(segment, *annotations)
-    options = annotations.pop if Hash === annotations.last
-    options ||= {}
-
-    if options[:persist] and not @persist_dir.nil?
-      persist_dir = File.join(@persist_dir, 'ranges')
-    else
-      persist_dir = nil
-    end
-
-    Segmented.setup(segment, {})
-    annotations.collect do |name|
-      name = name.to_s
-      index = segment_index(name, persist_dir)
-      annotations = index[segment.range]
-      segment.segments[name] ||= {}
-      segment.segments[name] = annotations
-      class << segment
-        self
-      end.class_eval "def #{ name }; @segments['#{ name }']; end"
-    end
-
-    segment
-  end
-end