rbbt-text 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/document.rb +46 -0
- data/lib/rbbt/document/annotation.rb +42 -0
- data/lib/rbbt/document/corpus.rb +38 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +19 -2
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +6 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +6 -6
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/segment.rb +177 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -11
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +7 -9
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +11 -1
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +13 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +40 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +12 -9
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +43 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +76 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +187 -0
- data/test/test_helper.rb +5 -3
- metadata +40 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -383
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -363
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -82
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
@@ -0,0 +1,14 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
|
4
|
+
class TestDocument < Test::Unit::TestCase
|
5
|
+
|
6
|
+
def test_docid
|
7
|
+
text = "This is a document"
|
8
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
9
|
+
|
10
|
+
assert_equal ["TEST", "test_doc1", nil, Misc.digest(text)] * ":", text.docid
|
11
|
+
end
|
12
|
+
|
13
|
+
end
|
14
|
+
|
@@ -0,0 +1,187 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
|
6
|
+
class TestSegment < Test::Unit::TestCase
|
7
|
+
def test_segment
|
8
|
+
text = "This is a document"
|
9
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
10
|
+
|
11
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
12
|
+
|
13
|
+
assert_equal text.docid + ":" + segment.offset.to_s + ".." + segment.eend.to_s, segment.segid
|
14
|
+
end
|
15
|
+
|
16
|
+
def test_segid
|
17
|
+
text = "This is a document"
|
18
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
19
|
+
|
20
|
+
corpus = {}
|
21
|
+
corpus.extend Document::Corpus
|
22
|
+
|
23
|
+
corpus.add_document(text)
|
24
|
+
|
25
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
26
|
+
|
27
|
+
segid = segment.segid(corpus)
|
28
|
+
|
29
|
+
segment = segid.segment
|
30
|
+
assert_equal "is", segment
|
31
|
+
end
|
32
|
+
|
33
|
+
def test_info
|
34
|
+
segment = "test"
|
35
|
+
segment.extend Segment
|
36
|
+
segment.offset = 10
|
37
|
+
assert segment.info.include? :offset
|
38
|
+
end
|
39
|
+
|
40
|
+
def test_sort
|
41
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
42
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
43
|
+
|
44
|
+
corpus = {}
|
45
|
+
corpus.extend Document::Corpus
|
46
|
+
|
47
|
+
corpus.add_document(text)
|
48
|
+
|
49
|
+
gene1 = "TP53"
|
50
|
+
gene1.extend Segment
|
51
|
+
gene1.offset = text.index gene1
|
52
|
+
gene1.docid = text.docid
|
53
|
+
|
54
|
+
gene2 = "CDK5R1"
|
55
|
+
gene2.extend Segment
|
56
|
+
gene2.offset = text.index gene2
|
57
|
+
gene2.docid = text.docid
|
58
|
+
|
59
|
+
assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
|
60
|
+
|
61
|
+
assert_equal [gene1,gene2], Segment.sort([gene2.segid(corpus),gene1.segid(corpus)]).collect{|segid| segid.segment}
|
62
|
+
end
|
63
|
+
|
64
|
+
def test_clean_sort
|
65
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
66
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
67
|
+
|
68
|
+
corpus = {}
|
69
|
+
corpus.extend Document::Corpus
|
70
|
+
|
71
|
+
corpus.add_document(text)
|
72
|
+
|
73
|
+
gene1 = "TP53"
|
74
|
+
gene1.extend Segment
|
75
|
+
gene1.offset = text.index gene1
|
76
|
+
gene1.docid = text.docid
|
77
|
+
|
78
|
+
gene2 = "CDK5R1"
|
79
|
+
gene2.extend Segment
|
80
|
+
gene2.offset = text.index gene2
|
81
|
+
gene2.docid = text.docid
|
82
|
+
|
83
|
+
gene3 = "TP53 gene"
|
84
|
+
gene3.extend Segment
|
85
|
+
gene3.offset = text.index gene1
|
86
|
+
gene3.docid = text.docid
|
87
|
+
|
88
|
+
assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
|
89
|
+
|
90
|
+
assert_equal [gene3,gene2], Segment.clean_sort([gene2,gene1,gene3])
|
91
|
+
end
|
92
|
+
|
93
|
+
def test_split
|
94
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
95
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
96
|
+
|
97
|
+
corpus = {}
|
98
|
+
corpus.extend Document::Corpus
|
99
|
+
|
100
|
+
corpus.add_document(text)
|
101
|
+
|
102
|
+
gene1 = "TP53"
|
103
|
+
gene1.extend Segment
|
104
|
+
gene1.offset = text.index gene1
|
105
|
+
gene1.docid = text.docid
|
106
|
+
|
107
|
+
gene2 = "CDK5R1"
|
108
|
+
gene2.extend Segment
|
109
|
+
gene2.offset = text.index gene2
|
110
|
+
gene2.docid = text.docid
|
111
|
+
|
112
|
+
gene3 = "TP53 gene"
|
113
|
+
gene3.extend Segment
|
114
|
+
gene3.offset = text.index gene1
|
115
|
+
gene3.docid = text.docid
|
116
|
+
|
117
|
+
assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(text, [gene2, gene1, gene3])
|
118
|
+
|
119
|
+
assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(text, [gene2, gene1, gene3].collect{|s| s.segid})
|
120
|
+
end
|
121
|
+
|
122
|
+
|
123
|
+
def test_align
|
124
|
+
text =<<-EOF
|
125
|
+
Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.
|
126
|
+
EOF
|
127
|
+
|
128
|
+
parts = text.split(/\W/)
|
129
|
+
Segment.align(text, parts)
|
130
|
+
|
131
|
+
assert_equal "Atypical teratoid/".length, parts.select{|s| s == "rhabdoid"}.first.offset
|
132
|
+
|
133
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
134
|
+
|
135
|
+
parts = text.split(/\W/)
|
136
|
+
Segment.align(text, parts)
|
137
|
+
|
138
|
+
assert_equal parts.first.docid, text.docid
|
139
|
+
end
|
140
|
+
|
141
|
+
def test_segment_index
|
142
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
143
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
144
|
+
|
145
|
+
corpus = {}
|
146
|
+
corpus.extend Document::Corpus
|
147
|
+
|
148
|
+
corpus.add_document(text)
|
149
|
+
|
150
|
+
gene1 = "TP53"
|
151
|
+
gene1.extend Segment
|
152
|
+
gene1.offset = text.index gene1
|
153
|
+
gene1.docid = text.docid
|
154
|
+
|
155
|
+
gene2 = "CDK5R1"
|
156
|
+
gene2.extend Segment
|
157
|
+
gene2.offset = text.index gene2
|
158
|
+
gene2.docid = text.docid
|
159
|
+
|
160
|
+
gene3 = "TP53 gene"
|
161
|
+
gene3.extend Segment
|
162
|
+
gene3.offset = text.index gene1
|
163
|
+
gene3.docid = text.docid
|
164
|
+
|
165
|
+
index = Segment.index([gene1, gene2, gene3], corpus)
|
166
|
+
assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
|
167
|
+
|
168
|
+
TmpFile.with_file do |fwt|
|
169
|
+
index = Segment.index([gene1, gene2, gene3], corpus, fwt)
|
170
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
171
|
+
index = Segment.index([gene1, gene2, gene3], corpus, fwt)
|
172
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
173
|
+
end
|
174
|
+
|
175
|
+
index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus)
|
176
|
+
assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
|
177
|
+
|
178
|
+
TmpFile.with_file do |fwt|
|
179
|
+
index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus, fwt)
|
180
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
181
|
+
index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus, fwt)
|
182
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
183
|
+
end
|
184
|
+
end
|
185
|
+
|
186
|
+
end
|
187
|
+
|
data/test/test_helper.rb
CHANGED
@@ -6,7 +6,7 @@ require 'rbbt'
|
|
6
6
|
require 'rbbt/persist'
|
7
7
|
require 'rbbt/util/tmpfile'
|
8
8
|
require 'rbbt/util/log'
|
9
|
-
require 'rbbt/text/corpus'
|
9
|
+
#require 'rbbt/text/corpus'
|
10
10
|
|
11
11
|
class Test::Unit::TestCase
|
12
12
|
def get_test_datafile(file)
|
@@ -22,8 +22,10 @@ class Test::Unit::TestCase
|
|
22
22
|
FileUtils.rm_rf Rbbt.tmp.test.find :user
|
23
23
|
Persist::CONNECTIONS.values.each do |c| c.close end
|
24
24
|
Persist::CONNECTIONS.clear
|
25
|
-
|
26
|
-
|
25
|
+
if defined? Corpus
|
26
|
+
Corpus::DocumentRepo::TC_CONNECTIONS.values.each do |c| c.close end
|
27
|
+
Corpus::DocumentRepo::TC_CONNECTIONS.clear
|
28
|
+
end
|
27
29
|
end
|
28
30
|
|
29
31
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.0
|
4
|
+
version: 1.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-04-
|
11
|
+
date: 2020-04-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -78,6 +78,10 @@ files:
|
|
78
78
|
- lib/rbbt/bow/bow.rb
|
79
79
|
- lib/rbbt/bow/dictionary.rb
|
80
80
|
- lib/rbbt/bow/misc.rb
|
81
|
+
- lib/rbbt/document.rb
|
82
|
+
- lib/rbbt/document/annotation.rb
|
83
|
+
- lib/rbbt/document/corpus.rb
|
84
|
+
- lib/rbbt/document/corpus/pubmed.rb
|
81
85
|
- lib/rbbt/ner/NER.rb
|
82
86
|
- lib/rbbt/ner/abner.rb
|
83
87
|
- lib/rbbt/ner/banner.rb
|
@@ -98,18 +102,16 @@ files:
|
|
98
102
|
- lib/rbbt/nlp/genia/sentence_splitter.rb
|
99
103
|
- lib/rbbt/nlp/nlp.rb
|
100
104
|
- lib/rbbt/nlp/open_nlp/sentence_splitter.rb
|
101
|
-
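- lib/rbbt/text/corpus.rb
|
102
|
-
- lib/rbbt/text/corpus/document.rb
|
103
|
-
- lib/rbbt/text/corpus/document_repo.rb
|
104
|
-
- lib/rbbt/text/corpus/sources/pmid.rb
|
105
|
-
- lib/rbbt/text/document.rb
|
106
|
-
- lib/rbbt/text/segment.rb
|
107
|
-
- lib/rbbt/text/segment/docid.rb
|
108
|
-
- lib/rbbt/text/segment/named_entity.rb
|
109
|
-
- lib/rbbt/text/segment/relationship.rb
|
110
|
-
- lib/rbbt/text/segment/segmented.rb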
|
111
|
-
- lib/rbbt/text/segment/token.rb
|
112
|
-
- lib/rbbt/text/segment/transformed.rb
|
105
|
+
- lib/rbbt/segment.rb
|
106
|
+
- lib/rbbt/segment/annotation.rb
|
107
|
+
- lib/rbbt/segment/encoding.rb
|
108
|
+
- lib/rbbt/segment/named_entity.rb
|
109
|
+
- lib/rbbt/segment/overlaps.rb
|
110
|
+
- lib/rbbt/segment/range_index.rb
|
111
|
+
- lib/rbbt/segment/segmented.rb
|
112
|
+
- lib/rbbt/segment/token.rb
|
113
|
+
- lib/rbbt/segment/transformed.rb
|
114
|
+
- lib/rbbt/segment/tsv.rb
|
113
115
|
- share/install/software/ABNER
|
114
116
|
- share/install/software/BANNER
|
115
117
|
- share/install/software/ChemicalTagger
|
@@ -128,6 +130,9 @@ files:
|
|
128
130
|
- test/rbbt/bow/test_bow.rb
|
129
131
|
- test/rbbt/bow/test_dictionary.rb
|
130
132
|
- test/rbbt/bow/test_misc.rb
|
133
|
+
- test/rbbt/document/corpus/test_pubmed.rb
|
134
|
+
- test/rbbt/document/test_annotation.rb
|
135
|
+
- test/rbbt/document/test_corpus.rb
|
131
136
|
- test/rbbt/entity/test_document.rb
|
132
137
|
- test/rbbt/ner/test_NER.rb
|
133
138
|
- test/rbbt/ner/test_abner.rb
|
@@ -146,15 +151,15 @@ files:
|
|
146
151
|
- test/rbbt/nlp/genia/test_sentence_splitter.rb
|
147
152
|
- test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
|
148
153
|
- test/rbbt/nlp/test_nlp.rb
|
149
|
-
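- test/rbbt/text/corpus/sources/test_pmid.rb
|
150
|
-
- test/rbbt/text/corpus/test_document.rb
|
151
|
-
- test/rbbt/text/segment/test_named_entity.rb
|
152
|
-
- test/rbbt/text/segment/test_relationship.rb
|
153
|
-
- test/rbbt/text/segment/test_segmented.rb
|
154
|
-
- test/rbbt/text/segment/test_transformed.rb
|
155
|
-
- test/rbbt/text/test_corpus.rb
|
156
|
-
- test/rbbt/text/test_document.rb
|
157
|
-
- test/rbbt/text/test_segment.rb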
|
154
|
+
- test/rbbt/segment/test_annotation.rb
|
155
|
+
- test/rbbt/segment/test_corpus.rb
|
156
|
+
- test/rbbt/segment/test_encoding.rb
|
157
|
+
- test/rbbt/segment/test_named_entity.rb
|
158
|
+
- test/rbbt/segment/test_overlaps.rb
|
159
|
+
- test/rbbt/segment/test_range_index.rb
|
160
|
+
- test/rbbt/segment/test_transformed.rb
|
161
|
+
- test/rbbt/test_document.rb
|
162
|
+
- test/rbbt/test_segment.rb
|
158
163
|
- test/test_helper.rb
|
159
164
|
homepage: http://github.com/mikisvaz/rbbt-util
|
160
165
|
licenses: []
|
@@ -182,18 +187,13 @@ test_files:
|
|
182
187
|
- test/rbbt/nlp/test_nlp.rb
|
183
188
|
- test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
|
184
189
|
- test/rbbt/nlp/genia/test_sentence_splitter.rb
|
185
|
-
- test/rbbt/text/test_document.rb
|
186
|
-
- test/rbbt/text/corpus/sources/test_pmid.rb
|
187
|
-
- test/rbbt/text/corpus/test_document.rb
|
188
|
-
- test/rbbt/text/test_segment.rb
|
189
|
-
- test/rbbt/text/test_corpus.rb
|
190
|
-
- test/rbbt/text/segment/test_transformed.rb
|
191
|
-
- test/rbbt/text/segment/test_relationship.rb
|
192
|
-
- test/rbbt/text/segment/test_named_entity.rb
|
193
|
-
- test/rbbt/text/segment/test_segmented.rb
|
194
190
|
- test/rbbt/bow/test_bow.rb
|
195
191
|
- test/rbbt/bow/test_misc.rb
|
196
192
|
- test/rbbt/bow/test_dictionary.rb
|
193
|
+
- test/rbbt/test_document.rb
|
194
|
+
- test/rbbt/document/test_annotation.rb
|
195
|
+
- test/rbbt/document/corpus/test_pubmed.rb
|
196
|
+
- test/rbbt/document/test_corpus.rb
|
197
197
|
- test/rbbt/entity/test_document.rb
|
198
198
|
- test/rbbt/ner/test_patterns.rb
|
199
199
|
- test/rbbt/ner/test_NER.rb
|
@@ -209,4 +209,12 @@ test_files:
|
|
209
209
|
- test/rbbt/ner/test_finder.rb
|
210
210
|
- test/rbbt/ner/test_linnaeus.rb
|
211
211
|
- test/rbbt/ner/test_oscar4.rb
|
212
|
+
- test/rbbt/test_segment.rb
|
213
|
+
- test/rbbt/segment/test_transformed.rb
|
214
|
+
- test/rbbt/segment/test_overlaps.rb
|
215
|
+
- test/rbbt/segment/test_annotation.rb
|
216
|
+
- test/rbbt/segment/test_named_entity.rb
|
217
|
+
- test/rbbt/segment/test_encoding.rb
|
218
|
+
- test/rbbt/segment/test_range_index.rb
|
219
|
+
- test/rbbt/segment/test_corpus.rb
|
212
220
|
- test/test_helper.rb
|
data/lib/rbbt/text/corpus.rb
DELETED
@@ -1,106 +0,0 @@
|
|
1
|
-
require 'rbbt/text/corpus/document'
|
2
|
-
require 'rbbt/text/corpus/document_repo'
|
3
|
-
|
4
|
-
class Corpus
|
5
|
-
class << self
|
6
|
-
attr_accessor :claims
|
7
|
-
def claim(namespace, &block)
|
8
|
-
@@claims = {}
|
9
|
-
@@claims[namespace] = block
|
10
|
-
end
|
11
|
-
|
12
|
-
end
|
13
|
-
attr_accessor :corpora_path, :document_repo, :persistence_dir, :global_annotations
|
14
|
-
|
15
|
-
def initialize(corpora_path = nil)
|
16
|
-
@corpora_path = case
|
17
|
-
when corpora_path.nil?
|
18
|
-
Rbbt.corpora
|
19
|
-
when (not Path === corpora_path)
|
20
|
-
Path.setup(corpora_path)
|
21
|
-
else
|
22
|
-
corpora_path
|
23
|
-
end
|
24
|
-
|
25
|
-
@corpora_path = @corpora_path.find
|
26
|
-
@persistence_dir = File.join(@corpora_path, "annotations")
|
27
|
-
|
28
|
-
Misc.lock(@persistence_dir) do
|
29
|
-
@global_annotations = TSV.setup(Persist.open_tokyocabinet(File.join(@persistence_dir, "global_annotations"), false, :list), :key => "ID", :fields => ["Start", "End", "JSON", "Document ID", "Entity Type"])
|
30
|
-
@global_annotations.unnamed = true
|
31
|
-
@global_annotations.close
|
32
|
-
end
|
33
|
-
|
34
|
-
Misc.lock(@corpora_path.document_repo) do
|
35
|
-
@document_repo = DocumentRepo.open_tokyocabinet @corpora_path.document_repo, false
|
36
|
-
@document_repo.close
|
37
|
-
end
|
38
|
-
|
39
|
-
end
|
40
|
-
|
41
|
-
def persistence_for(docid)
|
42
|
-
File.join(persistence_dir, docid)
|
43
|
-
end
|
44
|
-
|
45
|
-
|
46
|
-
def docid(docid)
|
47
|
-
begin
|
48
|
-
if @document_repo.include?(docid)
|
49
|
-
Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations, self)
|
50
|
-
else
|
51
|
-
namespace, id, type = docid.split(":")
|
52
|
-
if @@claims.include?(namespace)
|
53
|
-
|
54
|
-
docid = self.instance_exec id, type, &(@@claims[namespace])
|
55
|
-
docid = docid.first if Array === docid
|
56
|
-
self.docid(docid)
|
57
|
-
else
|
58
|
-
raise "Document '#{ docid }' was not found." unless @document_repo.include? docid
|
59
|
-
end
|
60
|
-
end
|
61
|
-
ensure
|
62
|
-
@document_repo.close
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
def document(namespace, id, type, hash)
|
67
|
-
docid = [namespace, id, type, hash] * ":"
|
68
|
-
self.docid(docid)
|
69
|
-
end
|
70
|
-
|
71
|
-
def add_document(text, namespace = nil, id = nil, type = nil)
|
72
|
-
text = Misc.fixutf8(text)
|
73
|
-
hash = Digest::MD5.hexdigest(text)
|
74
|
-
@document_repo.add(text, namespace, id, type, hash)
|
75
|
-
end
|
76
|
-
|
77
|
-
def add_docid(text, docid)
|
78
|
-
namespace, id, type, hash = docid.split(":")
|
79
|
-
@document_repo.add(text, namespace, id, type, hash)
|
80
|
-
end
|
81
|
-
|
82
|
-
|
83
|
-
def find(namespace=nil, id = nil, type = nil, hash = nil)
|
84
|
-
@document_repo.find(namespace, id, type, hash).collect{|docid|
|
85
|
-
self.docid(docid)
|
86
|
-
}
|
87
|
-
end
|
88
|
-
|
89
|
-
def find_docid(docid)
|
90
|
-
@document_repo.find_docid(docid).collect{|docid|
|
91
|
-
self.docid(docid)
|
92
|
-
}
|
93
|
-
end
|
94
|
-
|
95
|
-
def exists?(namespace=nil, id = nil, type = nil, hash = nil)
|
96
|
-
find(namespace, id, type, hash).any?
|
97
|
-
end
|
98
|
-
|
99
|
-
def [](docid)
|
100
|
-
self.docid(docid)
|
101
|
-
end
|
102
|
-
|
103
|
-
def include?(id)
|
104
|
-
@document_repo.include? id
|
105
|
-
end
|
106
|
-
end
|