rbbt-text 1.1.9 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +56 -0
- data/lib/rbbt/document/annotation.rb +45 -0
- data/lib/rbbt/document/corpus.rb +61 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +42 -12
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +5 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +35 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/nlp/spaCy.rb +52 -0
- data/lib/rbbt/segment.rb +179 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -10
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +10 -8
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/share/install/software/OpenNLP +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +20 -3
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +39 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +42 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +182 -0
- data/test/test_helper.rb +5 -3
- data/test/test_spaCy.rb +32 -0
- metadata +44 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -361
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -355
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -52
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
@@ -0,0 +1,14 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
|
4
|
+
class TestDocument < Test::Unit::TestCase
|
5
|
+
|
6
|
+
def test_docid
|
7
|
+
text = "This is a document"
|
8
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
9
|
+
|
10
|
+
assert_equal ["TEST", "test_doc1", nil, Misc.digest(text)] * ":", text.docid
|
11
|
+
end
|
12
|
+
|
13
|
+
end
|
14
|
+
|
@@ -0,0 +1,182 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
|
6
|
+
class TestSegment < Test::Unit::TestCase
|
7
|
+
def test_segment
|
8
|
+
text = "This is a document"
|
9
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
10
|
+
|
11
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
12
|
+
|
13
|
+
assert_equal text.docid + ":" + segment.offset.to_s + ".." + segment.eend.to_s, segment.segid
|
14
|
+
end
|
15
|
+
|
16
|
+
def test_segid
|
17
|
+
text = "This is a document"
|
18
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
19
|
+
|
20
|
+
corpus = Document::Corpus.setup({})
|
21
|
+
|
22
|
+
corpus.add_document(text)
|
23
|
+
|
24
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
25
|
+
|
26
|
+
segid = segment.segid(corpus)
|
27
|
+
|
28
|
+
segment = segid.segment
|
29
|
+
assert_equal "is", segment
|
30
|
+
end
|
31
|
+
|
32
|
+
def test_info
|
33
|
+
segment = "test"
|
34
|
+
segment.extend Segment
|
35
|
+
segment.offset = 10
|
36
|
+
assert segment.info.include? :offset
|
37
|
+
end
|
38
|
+
|
39
|
+
def test_sort
|
40
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
41
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
42
|
+
|
43
|
+
corpus = Document::Corpus.setup({})
|
44
|
+
|
45
|
+
corpus.add_document(text)
|
46
|
+
|
47
|
+
gene1 = "TP53"
|
48
|
+
gene1.extend Segment
|
49
|
+
gene1.offset = text.index gene1
|
50
|
+
gene1.docid = text.docid
|
51
|
+
|
52
|
+
gene2 = "CDK5R1"
|
53
|
+
gene2.extend Segment
|
54
|
+
gene2.offset = text.index gene2
|
55
|
+
gene2.docid = text.docid
|
56
|
+
|
57
|
+
assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
|
58
|
+
|
59
|
+
assert_equal [gene1,gene2], Segment.sort([gene2.segid(corpus),gene1.segid(corpus)]).collect{|segid| segid.segment}
|
60
|
+
end
|
61
|
+
|
62
|
+
def test_clean_sort
|
63
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
64
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
65
|
+
|
66
|
+
corpus = Document::Corpus.setup({})
|
67
|
+
|
68
|
+
corpus.add_document(text)
|
69
|
+
|
70
|
+
gene1 = "TP53"
|
71
|
+
gene1.extend Segment
|
72
|
+
gene1.offset = text.index gene1
|
73
|
+
gene1.docid = text.docid
|
74
|
+
|
75
|
+
gene2 = "CDK5R1"
|
76
|
+
gene2.extend Segment
|
77
|
+
gene2.offset = text.index gene2
|
78
|
+
gene2.docid = text.docid
|
79
|
+
|
80
|
+
gene3 = "TP53 gene"
|
81
|
+
gene3.extend Segment
|
82
|
+
gene3.offset = text.index gene1
|
83
|
+
gene3.docid = text.docid
|
84
|
+
|
85
|
+
assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
|
86
|
+
|
87
|
+
assert_equal [gene3,gene2], Segment.clean_sort([gene2,gene1,gene3])
|
88
|
+
end
|
89
|
+
|
90
|
+
def test_split
|
91
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
92
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
93
|
+
|
94
|
+
corpus = Document::Corpus.setup({})
|
95
|
+
|
96
|
+
corpus.add_document(text)
|
97
|
+
|
98
|
+
gene1 = "TP53"
|
99
|
+
gene1.extend Segment
|
100
|
+
gene1.offset = text.index gene1
|
101
|
+
gene1.docid = text.docid
|
102
|
+
|
103
|
+
gene2 = "CDK5R1"
|
104
|
+
gene2.extend Segment
|
105
|
+
gene2.offset = text.index gene2
|
106
|
+
gene2.docid = text.docid
|
107
|
+
|
108
|
+
gene3 = "TP53 gene"
|
109
|
+
gene3.extend Segment
|
110
|
+
gene3.offset = text.index gene1
|
111
|
+
gene3.docid = text.docid
|
112
|
+
|
113
|
+
assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(text, [gene2, gene1, gene3])
|
114
|
+
|
115
|
+
assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(text, [gene2, gene1, gene3].collect{|s| s.segid})
|
116
|
+
end
|
117
|
+
|
118
|
+
|
119
|
+
def test_align
|
120
|
+
text =<<-EOF
|
121
|
+
Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.
|
122
|
+
EOF
|
123
|
+
|
124
|
+
parts = text.split(/\W/)
|
125
|
+
Segment.align(text, parts)
|
126
|
+
|
127
|
+
assert_equal "Atypical teratoid/".length, parts.select{|s| s == "rhabdoid"}.first.offset
|
128
|
+
|
129
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
130
|
+
|
131
|
+
parts = text.split(/\W/)
|
132
|
+
Segment.align(text, parts)
|
133
|
+
|
134
|
+
assert_equal parts.first.docid, text.docid
|
135
|
+
end
|
136
|
+
|
137
|
+
def test_segment_index
|
138
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
139
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
140
|
+
|
141
|
+
corpus = Document::Corpus.setup({})
|
142
|
+
|
143
|
+
corpus.add_document(text)
|
144
|
+
|
145
|
+
gene1 = "TP53"
|
146
|
+
gene1.extend Segment
|
147
|
+
gene1.offset = text.index gene1
|
148
|
+
gene1.docid = text.docid
|
149
|
+
|
150
|
+
gene2 = "CDK5R1"
|
151
|
+
gene2.extend Segment
|
152
|
+
gene2.offset = text.index gene2
|
153
|
+
gene2.docid = text.docid
|
154
|
+
|
155
|
+
gene3 = "TP53 gene"
|
156
|
+
gene3.extend Segment
|
157
|
+
gene3.offset = text.index gene1
|
158
|
+
gene3.docid = text.docid
|
159
|
+
|
160
|
+
index = Segment.index([gene1, gene2, gene3], corpus)
|
161
|
+
assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
|
162
|
+
|
163
|
+
TmpFile.with_file do |fwt|
|
164
|
+
index = Segment.index([gene1, gene2, gene3], corpus, fwt)
|
165
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
166
|
+
index = Segment.index([gene1, gene2, gene3], corpus, fwt)
|
167
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
168
|
+
end
|
169
|
+
|
170
|
+
index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus)
|
171
|
+
assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
|
172
|
+
|
173
|
+
TmpFile.with_file do |fwt|
|
174
|
+
index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus, fwt)
|
175
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
176
|
+
index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus, fwt)
|
177
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
181
|
+
end
|
182
|
+
|
data/test/test_helper.rb
CHANGED
@@ -6,7 +6,7 @@ require 'rbbt'
|
|
6
6
|
require 'rbbt/persist'
|
7
7
|
require 'rbbt/util/tmpfile'
|
8
8
|
require 'rbbt/util/log'
|
9
|
-
require 'rbbt/text/corpus'
|
9
|
+
#require 'rbbt/text/corpus'
|
10
10
|
|
11
11
|
class Test::Unit::TestCase
|
12
12
|
def get_test_datafile(file)
|
@@ -22,8 +22,10 @@ class Test::Unit::TestCase
|
|
22
22
|
FileUtils.rm_rf Rbbt.tmp.test.find :user
|
23
23
|
Persist::CONNECTIONS.values.each do |c| c.close end
|
24
24
|
Persist::CONNECTIONS.clear
|
25
|
-
|
26
|
-
|
25
|
+
if defined? Corpus
|
26
|
+
Corpus::DocumentRepo::TC_CONNECTIONS.values.each do |c| c.close end
|
27
|
+
Corpus::DocumentRepo::TC_CONNECTIONS.clear
|
28
|
+
end
|
27
29
|
end
|
28
30
|
|
29
31
|
end
|
data/test/test_spaCy.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '', 'test_helper.rb')
|
2
|
+
require 'rbbt/nlp/spaCy'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
|
5
|
+
class TestSpaCy < Test::Unit::TestCase
|
6
|
+
def _test_tokens
|
7
|
+
text = "I tell a story"
|
8
|
+
|
9
|
+
tokens = SpaCy.tokens(text)
|
10
|
+
|
11
|
+
assert_equal 4, tokens.length
|
12
|
+
assert_equal "tell", tokens[1].to_s
|
13
|
+
end
|
14
|
+
|
15
|
+
def test_segments
|
16
|
+
text = "I tell a story. It's a very good story."
|
17
|
+
|
18
|
+
corpus = Document::Corpus.setup({})
|
19
|
+
|
20
|
+
Document.setup(text, "TEST", "test_doc1", "simple_sentence")
|
21
|
+
|
22
|
+
corpus.add_document text
|
23
|
+
text.corpus = corpus
|
24
|
+
|
25
|
+
segments = SpaCy.segments(text)
|
26
|
+
|
27
|
+
segments.each do |segment|
|
28
|
+
assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.9
|
4
|
+
version: 1.3.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-06-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -78,6 +78,10 @@ files:
|
|
78
78
|
- lib/rbbt/bow/bow.rb
|
79
79
|
- lib/rbbt/bow/dictionary.rb
|
80
80
|
- lib/rbbt/bow/misc.rb
|
81
|
+
- lib/rbbt/document.rb
|
82
|
+
- lib/rbbt/document/annotation.rb
|
83
|
+
- lib/rbbt/document/corpus.rb
|
84
|
+
- lib/rbbt/document/corpus/pubmed.rb
|
81
85
|
- lib/rbbt/ner/NER.rb
|
82
86
|
- lib/rbbt/ner/abner.rb
|
83
87
|
- lib/rbbt/ner/banner.rb
|
@@ -98,18 +102,18 @@ files:
|
|
98
102
|
- lib/rbbt/nlp/genia/sentence_splitter.rb
|
99
103
|
- lib/rbbt/nlp/nlp.rb
|
100
104
|
- lib/rbbt/nlp/open_nlp/sentence_splitter.rb
|
101
|
-
- lib/rbbt/text/corpus.rb
|
102
|
-
- lib/rbbt/text/corpus/document.rb
|
103
|
-
- lib/rbbt/text/corpus/document_repo.rb
|
104
|
-
- lib/rbbt/text/corpus/sources/pmid.rb
|
105
|
-
- lib/rbbt/text/document.rb
|
106
|
-
- lib/rbbt/text/segment.rb
|
107
|
-
- lib/rbbt/text/segment/docid.rb
|
108
|
-
- lib/rbbt/text/segment/named_entity.rb
|
109
|
-
- lib/rbbt/text/segment/relationship.rb
|
110
|
-
- lib/rbbt/text/segment/segmented.rb
|
111
|
-
- lib/rbbt/text/segment/token.rb
|
112
|
-
- lib/rbbt/text/segment/transformed.rb
|
105
|
+
- lib/rbbt/nlp/spaCy.rb
|
106
|
+
- lib/rbbt/segment.rb
|
107
|
+
- lib/rbbt/segment/annotation.rb
|
108
|
+
- lib/rbbt/segment/encoding.rb
|
109
|
+
- lib/rbbt/segment/named_entity.rb
|
110
|
+
- lib/rbbt/segment/overlaps.rb
|
111
|
+
- lib/rbbt/segment/range_index.rb
|
112
|
+
- lib/rbbt/segment/relationship.rb
|
113
|
+
- lib/rbbt/segment/segmented.rb
|
114
|
+
- lib/rbbt/segment/token.rb
|
115
|
+
- lib/rbbt/segment/transformed.rb
|
116
|
+
- lib/rbbt/segment/tsv.rb
|
113
117
|
- share/install/software/ABNER
|
114
118
|
- share/install/software/BANNER
|
115
119
|
- share/install/software/ChemicalTagger
|
@@ -128,6 +132,9 @@ files:
|
|
128
132
|
- test/rbbt/bow/test_bow.rb
|
129
133
|
- test/rbbt/bow/test_dictionary.rb
|
130
134
|
- test/rbbt/bow/test_misc.rb
|
135
|
+
- test/rbbt/document/corpus/test_pubmed.rb
|
136
|
+
- test/rbbt/document/test_annotation.rb
|
137
|
+
- test/rbbt/document/test_corpus.rb
|
131
138
|
- test/rbbt/entity/test_document.rb
|
132
139
|
- test/rbbt/ner/test_NER.rb
|
133
140
|
- test/rbbt/ner/test_abner.rb
|
@@ -146,16 +153,17 @@ files:
|
|
146
153
|
- test/rbbt/nlp/genia/test_sentence_splitter.rb
|
147
154
|
- test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
|
148
155
|
- test/rbbt/nlp/test_nlp.rb
|
149
|
-
- test/rbbt/text/corpus/sources/test_pmid.rb
|
150
|
-
- test/rbbt/text/corpus/test_document.rb
|
151
|
-
- test/rbbt/text/segment/test_named_entity.rb
|
152
|
-
- test/rbbt/text/segment/test_relationship.rb
|
153
|
-
- test/rbbt/text/segment/test_segmented.rb
|
154
|
-
- test/rbbt/text/segment/test_transformed.rb
|
155
|
-
- test/rbbt/text/test_corpus.rb
|
156
|
-
- test/rbbt/text/test_document.rb
|
157
|
-
- test/rbbt/text/test_segment.rb
|
156
|
+
- test/rbbt/segment/test_annotation.rb
|
157
|
+
- test/rbbt/segment/test_corpus.rb
|
158
|
+
- test/rbbt/segment/test_encoding.rb
|
159
|
+
- test/rbbt/segment/test_named_entity.rb
|
160
|
+
- test/rbbt/segment/test_overlaps.rb
|
161
|
+
- test/rbbt/segment/test_range_index.rb
|
162
|
+
- test/rbbt/segment/test_transformed.rb
|
163
|
+
- test/rbbt/test_document.rb
|
164
|
+
- test/rbbt/test_segment.rb
|
158
165
|
- test/test_helper.rb
|
166
|
+
- test/test_spaCy.rb
|
159
167
|
homepage: http://github.com/mikisvaz/rbbt-util
|
160
168
|
licenses: []
|
161
169
|
metadata: {}
|
@@ -182,18 +190,13 @@ test_files:
|
|
182
190
|
- test/rbbt/nlp/test_nlp.rb
|
183
191
|
- test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
|
184
192
|
- test/rbbt/nlp/genia/test_sentence_splitter.rb
|
185
|
-
- test/rbbt/text/test_document.rb
|
186
|
-
- test/rbbt/text/corpus/sources/test_pmid.rb
|
187
|
-
- test/rbbt/text/corpus/test_document.rb
|
188
|
-
- test/rbbt/text/test_segment.rb
|
189
|
-
- test/rbbt/text/test_corpus.rb
|
190
|
-
- test/rbbt/text/segment/test_transformed.rb
|
191
|
-
- test/rbbt/text/segment/test_relationship.rb
|
192
|
-
- test/rbbt/text/segment/test_named_entity.rb
|
193
|
-
- test/rbbt/text/segment/test_segmented.rb
|
194
193
|
- test/rbbt/bow/test_bow.rb
|
195
194
|
- test/rbbt/bow/test_misc.rb
|
196
195
|
- test/rbbt/bow/test_dictionary.rb
|
196
|
+
- test/rbbt/test_document.rb
|
197
|
+
- test/rbbt/document/test_annotation.rb
|
198
|
+
- test/rbbt/document/corpus/test_pubmed.rb
|
199
|
+
- test/rbbt/document/test_corpus.rb
|
197
200
|
- test/rbbt/entity/test_document.rb
|
198
201
|
- test/rbbt/ner/test_patterns.rb
|
199
202
|
- test/rbbt/ner/test_NER.rb
|
@@ -209,4 +212,13 @@ test_files:
|
|
209
212
|
- test/rbbt/ner/test_finder.rb
|
210
213
|
- test/rbbt/ner/test_linnaeus.rb
|
211
214
|
- test/rbbt/ner/test_oscar4.rb
|
215
|
+
- test/rbbt/test_segment.rb
|
216
|
+
- test/rbbt/segment/test_transformed.rb
|
217
|
+
- test/rbbt/segment/test_overlaps.rb
|
218
|
+
- test/rbbt/segment/test_annotation.rb
|
219
|
+
- test/rbbt/segment/test_named_entity.rb
|
220
|
+
- test/rbbt/segment/test_encoding.rb
|
221
|
+
- test/rbbt/segment/test_range_index.rb
|
222
|
+
- test/rbbt/segment/test_corpus.rb
|
223
|
+
- test/test_spaCy.rb
|
212
224
|
- test/test_helper.rb
|
data/lib/rbbt/text/corpus.rb
DELETED
@@ -1,106 +0,0 @@
|
|
1
|
-
require 'rbbt/text/corpus/document'
|
2
|
-
require 'rbbt/text/corpus/document_repo'
|
3
|
-
|
4
|
-
class Corpus
|
5
|
-
class << self
|
6
|
-
attr_accessor :claims
|
7
|
-
def claim(namespace, &block)
|
8
|
-
@@claims = {}
|
9
|
-
@@claims[namespace] = block
|
10
|
-
end
|
11
|
-
|
12
|
-
end
|
13
|
-
attr_accessor :corpora_path, :document_repo, :persistence_dir, :global_annotations
|
14
|
-
|
15
|
-
def initialize(corpora_path = nil)
|
16
|
-
@corpora_path = case
|
17
|
-
when corpora_path.nil?
|
18
|
-
Rbbt.corpora
|
19
|
-
when (not Path === corpora_path)
|
20
|
-
Path.setup(corpora_path)
|
21
|
-
else
|
22
|
-
corpora_path
|
23
|
-
end
|
24
|
-
|
25
|
-
@corpora_path = @corpora_path.find
|
26
|
-
@persistence_dir = File.join(@corpora_path, "annotations")
|
27
|
-
|
28
|
-
Misc.lock(@persistence_dir) do
|
29
|
-
@global_annotations = TSV.setup(Persist.open_tokyocabinet(File.join(@persistence_dir, "global_annotations"), false, :list), :key => "ID", :fields => ["Start", "End", "JSON", "Document ID", "Entity Type"])
|
30
|
-
@global_annotations.unnamed = true
|
31
|
-
@global_annotations.close
|
32
|
-
end
|
33
|
-
|
34
|
-
Misc.lock(@corpora_path.document_repo) do
|
35
|
-
@document_repo = DocumentRepo.open_tokyocabinet @corpora_path.document_repo, false
|
36
|
-
@document_repo.close
|
37
|
-
end
|
38
|
-
|
39
|
-
end
|
40
|
-
|
41
|
-
def persistence_for(docid)
|
42
|
-
File.join(persistence_dir, docid)
|
43
|
-
end
|
44
|
-
|
45
|
-
|
46
|
-
def docid(docid)
|
47
|
-
begin
|
48
|
-
if @document_repo.include?(docid)
|
49
|
-
Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations, self)
|
50
|
-
else
|
51
|
-
namespace, id, type = docid.split(":")
|
52
|
-
if @@claims.include?(namespace)
|
53
|
-
|
54
|
-
docid = self.instance_exec id, type, &(@@claims[namespace])
|
55
|
-
docid = docid.first if Array === docid
|
56
|
-
self.docid(docid)
|
57
|
-
else
|
58
|
-
raise "Document '#{ docid }' was not found." unless @document_repo.include? docid
|
59
|
-
end
|
60
|
-
end
|
61
|
-
ensure
|
62
|
-
@document_repo.close
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
def document(namespace, id, type, hash)
|
67
|
-
docid = [namespace, id, type, hash] * ":"
|
68
|
-
self.docid(docid)
|
69
|
-
end
|
70
|
-
|
71
|
-
def add_document(text, namespace = nil, id = nil, type = nil)
|
72
|
-
text = Misc.fixutf8(text)
|
73
|
-
hash = Digest::MD5.hexdigest(text)
|
74
|
-
@document_repo.add(text, namespace, id, type, hash)
|
75
|
-
end
|
76
|
-
|
77
|
-
def add_docid(text, docid)
|
78
|
-
namespace, id, type, hash = docid.split(":")
|
79
|
-
@document_repo.add(text, namespace, id, type, hash)
|
80
|
-
end
|
81
|
-
|
82
|
-
|
83
|
-
def find(namespace=nil, id = nil, type = nil, hash = nil)
|
84
|
-
@document_repo.find(namespace, id, type, hash).collect{|docid|
|
85
|
-
self.docid(docid)
|
86
|
-
}
|
87
|
-
end
|
88
|
-
|
89
|
-
def find_docid(docid)
|
90
|
-
@document_repo.find_docid(docid).collect{|docid|
|
91
|
-
self.docid(docid)
|
92
|
-
}
|
93
|
-
end
|
94
|
-
|
95
|
-
def exists?(namespace=nil, id = nil, type = nil, hash = nil)
|
96
|
-
find(namespace, id, type, hash).any?
|
97
|
-
end
|
98
|
-
|
99
|
-
def [](docid)
|
100
|
-
self.docid(docid)
|
101
|
-
end
|
102
|
-
|
103
|
-
def include?(id)
|
104
|
-
@document_repo.include? id
|
105
|
-
end
|
106
|
-
end
|