rbbt-text 1.2.0 → 1.3.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +55 -0
  5. data/lib/rbbt/document/annotation.rb +45 -0
  6. data/lib/rbbt/document/corpus.rb +63 -0
  7. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  8. data/lib/rbbt/ner/NER.rb +3 -3
  9. data/lib/rbbt/ner/abner.rb +1 -1
  10. data/lib/rbbt/ner/banner.rb +1 -1
  11. data/lib/rbbt/ner/brat.rb +1 -1
  12. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  13. data/lib/rbbt/ner/g_norm_plus.rb +26 -3
  14. data/lib/rbbt/ner/linnaeus.rb +3 -3
  15. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  16. data/lib/rbbt/ner/oscar3.rb +1 -2
  17. data/lib/rbbt/ner/oscar4.rb +3 -3
  18. data/lib/rbbt/ner/patterns.rb +5 -5
  19. data/lib/rbbt/ner/regexpNER.rb +1 -2
  20. data/lib/rbbt/ner/token_trieNER.rb +35 -22
  21. data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
  22. data/lib/rbbt/nlp/nlp.rb +5 -5
  23. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  24. data/lib/rbbt/nlp/spaCy.rb +52 -0
  25. data/lib/rbbt/segment.rb +179 -0
  26. data/lib/rbbt/segment/annotation.rb +58 -0
  27. data/lib/rbbt/segment/encoding.rb +18 -0
  28. data/lib/rbbt/{text/segment → segment}/named_entity.rb +14 -11
  29. data/lib/rbbt/segment/overlaps.rb +63 -0
  30. data/lib/rbbt/segment/range_index.rb +35 -0
  31. data/lib/rbbt/segment/relationship.rb +7 -0
  32. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  33. data/lib/rbbt/segment/token.rb +23 -0
  34. data/lib/rbbt/{text/segment → segment}/transformed.rb +12 -10
  35. data/lib/rbbt/segment/tsv.rb +41 -0
  36. data/share/install/software/Linnaeus +1 -1
  37. data/share/install/software/OpenNLP +1 -1
  38. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  39. data/test/rbbt/document/test_annotation.rb +140 -0
  40. data/test/rbbt/document/test_corpus.rb +33 -0
  41. data/test/rbbt/ner/test_finder.rb +3 -3
  42. data/test/rbbt/ner/test_g_norm_plus.rb +20 -2
  43. data/test/rbbt/ner/test_patterns.rb +9 -9
  44. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  45. data/test/rbbt/ner/test_rnorm.rb +3 -4
  46. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  47. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
  48. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  49. data/test/rbbt/segment/test_annotation.rb +39 -0
  50. data/test/rbbt/segment/test_corpus.rb +36 -0
  51. data/test/rbbt/segment/test_encoding.rb +24 -0
  52. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
  53. data/test/rbbt/segment/test_overlaps.rb +69 -0
  54. data/test/rbbt/segment/test_range_index.rb +42 -0
  55. data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
  56. data/test/rbbt/test_document.rb +14 -0
  57. data/test/rbbt/test_segment.rb +182 -0
  58. data/test/test_helper.rb +5 -3
  59. data/test/test_spaCy.rb +32 -0
  60. metadata +44 -32
  61. data/lib/rbbt/text/corpus.rb +0 -106
  62. data/lib/rbbt/text/corpus/document.rb +0 -383
  63. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  64. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  65. data/lib/rbbt/text/document.rb +0 -39
  66. data/lib/rbbt/text/segment.rb +0 -363
  67. data/lib/rbbt/text/segment/docid.rb +0 -46
  68. data/lib/rbbt/text/segment/relationship.rb +0 -24
  69. data/lib/rbbt/text/segment/token.rb +0 -49
  70. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  71. data/test/rbbt/text/corpus/test_document.rb +0 -82
  72. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  73. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  74. data/test/rbbt/text/test_corpus.rb +0 -34
  75. data/test/rbbt/text/test_document.rb +0 -58
  76. data/test/rbbt/text/test_segment.rb +0 -100
@@ -6,7 +6,7 @@ require 'rbbt'
6
6
  require 'rbbt/persist'
7
7
  require 'rbbt/util/tmpfile'
8
8
  require 'rbbt/util/log'
9
- require 'rbbt/text/corpus'
9
+ #require 'rbbt/text/corpus'
10
10
 
11
11
  class Test::Unit::TestCase
12
12
  def get_test_datafile(file)
@@ -22,8 +22,10 @@ class Test::Unit::TestCase
22
22
  FileUtils.rm_rf Rbbt.tmp.test.find :user
23
23
  Persist::CONNECTIONS.values.each do |c| c.close end
24
24
  Persist::CONNECTIONS.clear
25
- Corpus::DocumentRepo::TC_CONNECTIONS.values.each do |c| c.close end
26
- Corpus::DocumentRepo::TC_CONNECTIONS.clear
25
+ if defined? Corpus
26
+ Corpus::DocumentRepo::TC_CONNECTIONS.values.each do |c| c.close end
27
+ Corpus::DocumentRepo::TC_CONNECTIONS.clear
28
+ end
27
29
  end
28
30
 
29
31
  end
@@ -0,0 +1,32 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '', 'test_helper.rb')
2
+ require 'rbbt/nlp/spaCy'
3
+ require 'rbbt/document/corpus'
4
+
5
+ class TestSpaCy < Test::Unit::TestCase
6
+ def _test_tokens
7
+ text = "I tell a story"
8
+
9
+ tokens = SpaCy.tokens(text)
10
+
11
+ assert_equal 4, tokens.length
12
+ assert_equal "tell", tokens[1].to_s
13
+ end
14
+
15
+ def test_segments
16
+ text = "I tell a story. It's a very good story."
17
+
18
+ corpus = Document::Corpus.setup({})
19
+
20
+ Document.setup(text, "TEST", "test_doc1", "simple_sentence")
21
+
22
+ corpus.add_document text
23
+ text.corpus = corpus
24
+
25
+ segments = SpaCy.segments(text)
26
+
27
+ segments.each do |segment|
28
+ assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
29
+ end
30
+ end
31
+ end
32
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.3.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-04-16 00:00:00.000000000 Z
11
+ date: 2020-07-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -78,6 +78,10 @@ files:
78
78
  - lib/rbbt/bow/bow.rb
79
79
  - lib/rbbt/bow/dictionary.rb
80
80
  - lib/rbbt/bow/misc.rb
81
+ - lib/rbbt/document.rb
82
+ - lib/rbbt/document/annotation.rb
83
+ - lib/rbbt/document/corpus.rb
84
+ - lib/rbbt/document/corpus/pubmed.rb
81
85
  - lib/rbbt/ner/NER.rb
82
86
  - lib/rbbt/ner/abner.rb
83
87
  - lib/rbbt/ner/banner.rb
@@ -98,18 +102,18 @@ files:
98
102
  - lib/rbbt/nlp/genia/sentence_splitter.rb
99
103
  - lib/rbbt/nlp/nlp.rb
100
104
  - lib/rbbt/nlp/open_nlp/sentence_splitter.rb
101
- - lib/rbbt/text/corpus.rb
102
- - lib/rbbt/text/corpus/document.rb
103
- - lib/rbbt/text/corpus/document_repo.rb
104
- - lib/rbbt/text/corpus/sources/pmid.rb
105
- - lib/rbbt/text/document.rb
106
- - lib/rbbt/text/segment.rb
107
- - lib/rbbt/text/segment/docid.rb
108
- - lib/rbbt/text/segment/named_entity.rb
109
- - lib/rbbt/text/segment/relationship.rb
110
- - lib/rbbt/text/segment/segmented.rb
111
- - lib/rbbt/text/segment/token.rb
112
- - lib/rbbt/text/segment/transformed.rb
105
+ - lib/rbbt/nlp/spaCy.rb
106
+ - lib/rbbt/segment.rb
107
+ - lib/rbbt/segment/annotation.rb
108
+ - lib/rbbt/segment/encoding.rb
109
+ - lib/rbbt/segment/named_entity.rb
110
+ - lib/rbbt/segment/overlaps.rb
111
+ - lib/rbbt/segment/range_index.rb
112
+ - lib/rbbt/segment/relationship.rb
113
+ - lib/rbbt/segment/segmented.rb
114
+ - lib/rbbt/segment/token.rb
115
+ - lib/rbbt/segment/transformed.rb
116
+ - lib/rbbt/segment/tsv.rb
113
117
  - share/install/software/ABNER
114
118
  - share/install/software/BANNER
115
119
  - share/install/software/ChemicalTagger
@@ -128,6 +132,9 @@ files:
128
132
  - test/rbbt/bow/test_bow.rb
129
133
  - test/rbbt/bow/test_dictionary.rb
130
134
  - test/rbbt/bow/test_misc.rb
135
+ - test/rbbt/document/corpus/test_pubmed.rb
136
+ - test/rbbt/document/test_annotation.rb
137
+ - test/rbbt/document/test_corpus.rb
131
138
  - test/rbbt/entity/test_document.rb
132
139
  - test/rbbt/ner/test_NER.rb
133
140
  - test/rbbt/ner/test_abner.rb
@@ -146,16 +153,17 @@ files:
146
153
  - test/rbbt/nlp/genia/test_sentence_splitter.rb
147
154
  - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
148
155
  - test/rbbt/nlp/test_nlp.rb
149
- - test/rbbt/text/corpus/sources/test_pmid.rb
150
- - test/rbbt/text/corpus/test_document.rb
151
- - test/rbbt/text/segment/test_named_entity.rb
152
- - test/rbbt/text/segment/test_relationship.rb
153
- - test/rbbt/text/segment/test_segmented.rb
154
- - test/rbbt/text/segment/test_transformed.rb
155
- - test/rbbt/text/test_corpus.rb
156
- - test/rbbt/text/test_document.rb
157
- - test/rbbt/text/test_segment.rb
156
+ - test/rbbt/segment/test_annotation.rb
157
+ - test/rbbt/segment/test_corpus.rb
158
+ - test/rbbt/segment/test_encoding.rb
159
+ - test/rbbt/segment/test_named_entity.rb
160
+ - test/rbbt/segment/test_overlaps.rb
161
+ - test/rbbt/segment/test_range_index.rb
162
+ - test/rbbt/segment/test_transformed.rb
163
+ - test/rbbt/test_document.rb
164
+ - test/rbbt/test_segment.rb
158
165
  - test/test_helper.rb
166
+ - test/test_spaCy.rb
159
167
  homepage: http://github.com/mikisvaz/rbbt-util
160
168
  licenses: []
161
169
  metadata: {}
@@ -182,18 +190,13 @@ test_files:
182
190
  - test/rbbt/nlp/test_nlp.rb
183
191
  - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
184
192
  - test/rbbt/nlp/genia/test_sentence_splitter.rb
185
- - test/rbbt/text/test_document.rb
186
- - test/rbbt/text/corpus/sources/test_pmid.rb
187
- - test/rbbt/text/corpus/test_document.rb
188
- - test/rbbt/text/test_segment.rb
189
- - test/rbbt/text/test_corpus.rb
190
- - test/rbbt/text/segment/test_transformed.rb
191
- - test/rbbt/text/segment/test_relationship.rb
192
- - test/rbbt/text/segment/test_named_entity.rb
193
- - test/rbbt/text/segment/test_segmented.rb
194
193
  - test/rbbt/bow/test_bow.rb
195
194
  - test/rbbt/bow/test_misc.rb
196
195
  - test/rbbt/bow/test_dictionary.rb
196
+ - test/rbbt/test_document.rb
197
+ - test/rbbt/document/test_annotation.rb
198
+ - test/rbbt/document/corpus/test_pubmed.rb
199
+ - test/rbbt/document/test_corpus.rb
197
200
  - test/rbbt/entity/test_document.rb
198
201
  - test/rbbt/ner/test_patterns.rb
199
202
  - test/rbbt/ner/test_NER.rb
@@ -209,4 +212,13 @@ test_files:
209
212
  - test/rbbt/ner/test_finder.rb
210
213
  - test/rbbt/ner/test_linnaeus.rb
211
214
  - test/rbbt/ner/test_oscar4.rb
215
+ - test/rbbt/test_segment.rb
216
+ - test/rbbt/segment/test_transformed.rb
217
+ - test/rbbt/segment/test_overlaps.rb
218
+ - test/rbbt/segment/test_annotation.rb
219
+ - test/rbbt/segment/test_named_entity.rb
220
+ - test/rbbt/segment/test_encoding.rb
221
+ - test/rbbt/segment/test_range_index.rb
222
+ - test/rbbt/segment/test_corpus.rb
223
+ - test/test_spaCy.rb
212
224
  - test/test_helper.rb
@@ -1,106 +0,0 @@
1
- require 'rbbt/text/corpus/document'
2
- require 'rbbt/text/corpus/document_repo'
3
-
4
- class Corpus
5
- class << self
6
- attr_accessor :claims
7
- def claim(namespace, &block)
8
- @@claims = {}
9
- @@claims[namespace] = block
10
- end
11
-
12
- end
13
- attr_accessor :corpora_path, :document_repo, :persistence_dir, :global_annotations
14
-
15
- def initialize(corpora_path = nil)
16
- @corpora_path = case
17
- when corpora_path.nil?
18
- Rbbt.corpora
19
- when (not Path === corpora_path)
20
- Path.setup(corpora_path)
21
- else
22
- corpora_path
23
- end
24
-
25
- @corpora_path = @corpora_path.find
26
- @persistence_dir = File.join(@corpora_path, "annotations")
27
-
28
- Misc.lock(@persistence_dir) do
29
- @global_annotations = TSV.setup(Persist.open_tokyocabinet(File.join(@persistence_dir, "global_annotations"), false, :list), :key => "ID", :fields => ["Start", "End", "JSON", "Document ID", "Entity Type"])
30
- @global_annotations.unnamed = true
31
- @global_annotations.close
32
- end
33
-
34
- Misc.lock(@corpora_path.document_repo) do
35
- @document_repo = DocumentRepo.open_tokyocabinet @corpora_path.document_repo, false
36
- @document_repo.close
37
- end
38
-
39
- end
40
-
41
- def persistence_for(docid)
42
- File.join(persistence_dir, docid)
43
- end
44
-
45
-
46
- def docid(docid)
47
- begin
48
- if @document_repo.include?(docid)
49
- Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations, self)
50
- else
51
- namespace, id, type = docid.split(":")
52
- if @@claims.include?(namespace)
53
-
54
- docid = self.instance_exec id, type, &(@@claims[namespace])
55
- docid = docid.first if Array === docid
56
- self.docid(docid)
57
- else
58
- raise "Document '#{ docid }' was not found." unless @document_repo.include? docid
59
- end
60
- end
61
- ensure
62
- @document_repo.close
63
- end
64
- end
65
-
66
- def document(namespace, id, type, hash)
67
- docid = [namespace, id, type, hash] * ":"
68
- self.docid(docid)
69
- end
70
-
71
- def add_document(text, namespace = nil, id = nil, type = nil)
72
- text = Misc.fixutf8(text)
73
- hash = Digest::MD5.hexdigest(text)
74
- @document_repo.add(text, namespace, id, type, hash)
75
- end
76
-
77
- def add_docid(text, docid)
78
- namespace, id, type, hash = docid.split(":")
79
- @document_repo.add(text, namespace, id, type, hash)
80
- end
81
-
82
-
83
- def find(namespace=nil, id = nil, type = nil, hash = nil)
84
- @document_repo.find(namespace, id, type, hash).collect{|docid|
85
- self.docid(docid)
86
- }
87
- end
88
-
89
- def find_docid(docid)
90
- @document_repo.find_docid(docid).collect{|docid|
91
- self.docid(docid)
92
- }
93
- end
94
-
95
- def exists?(namespace=nil, id = nil, type = nil, hash = nil)
96
- find(namespace, id, type, hash).any?
97
- end
98
-
99
- def [](docid)
100
- self.docid(docid)
101
- end
102
-
103
- def include?(id)
104
- @document_repo.include? id
105
- end
106
- end
@@ -1,383 +0,0 @@
1
- require 'rbbt/text/segment'
2
- require 'rbbt/text/segment/segmented'
3
- require 'rbbt/text/segment/docid'
4
- require 'rbbt/tsv'
5
- require 'rbbt/resource/path'
6
- require 'rbbt/persist/tsv'
7
- require 'rbbt/util/misc'
8
- require 'rbbt/text/document'
9
- require 'json'
10
-
11
- class Corpus
12
- class Document
13
-
14
- class MultipleEntity < Exception; end
15
-
16
- attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indices, :persist_dir, :global_persistence, :corpus
17
-
18
- attr_accessor :multiple_result
19
-
20
- def initialize(persist_dir = nil, docid = nil, text = nil, global_persistence = nil, corpus = nil)
21
- @segments = {}
22
- @segment_indices = {}
23
- @corpus = corpus
24
-
25
- if not persist_dir.nil?
26
- @persist_dir = persist_dir
27
- @persist_dir = Path.setup(@persist_dir) if not Path == @persist_dir
28
- end
29
-
30
- @global_persistence = global_persistence
31
-
32
- if not docid.nil?
33
- @docid = docid
34
- update_docid
35
- end
36
- @text = text unless text.nil?
37
- end
38
-
39
- def update_docid
40
- @namespace, @id, @type, @hash = docid.split(":", -1)
41
- end
42
-
43
- def docid=(docid)
44
- @docid = docid
45
- update_docid
46
- end
47
-
48
- def self.define(entity, &block)
49
- send :define_method, "produce_#{entity}" do
50
- segments = self.instance_exec &block
51
-
52
- segments.each{|s| s.docid = docid }
53
- end
54
-
55
- self.class_eval <<-EOC, __FILE__, __LINE__ + 1
56
- def load_#{entity}(raw = false)
57
- return if segments.include? "#{ entity }"
58
- if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
59
- entities = load_with_persistence_#{entity}(raw)
60
- else
61
- entities = produce_#{entity}
62
- end
63
-
64
- segments["#{ entity }"] = entities
65
- end
66
-
67
- def #{entity}(raw = false)
68
- begin
69
- entities = segments["#{ entity }"]
70
- if entities.nil?
71
- load_#{entity}(raw)
72
- entities = segments["#{ entity }"]
73
- end
74
- end
75
-
76
- entities
77
- end
78
-
79
- def #{entity}_at(pos, persist = false)
80
- segment_index("#{ entity }", persist ? File.join(@persist_dir, 'ranges') : nil)[pos]
81
- end
82
-
83
- EOC
84
- end
85
-
86
- def self.define_multiple(entity, &block)
87
- send :define_method, "produce_#{entity}" do
88
- if self.multiple_result && self.multiple_result[entity]
89
- segments = self.multiple_result[entity]
90
- return segments.each{|s| s.docid = docid }
91
- end
92
- raise MultipleEntity, "Entity #{entity} runs with multiple documents, please prepare beforehand with prepare_multiple: #{self.docid}"
93
- end
94
-
95
- name = "multiple_produce_#{entity}"
96
- class << self
97
- self
98
- end.send :define_method, name, &block
99
-
100
- self.class_eval <<-EOC, __FILE__, __LINE__ + 1
101
- def load_#{entity}(raw = false)
102
- return if segments.include? "#{ entity }"
103
- if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
104
- entities = load_with_persistence_#{entity}(raw)
105
- else
106
- entities = produce_#{entity}
107
- end
108
-
109
- segments["#{ entity }"] = entities
110
- end
111
-
112
- def #{entity}(raw = false)
113
- begin
114
- entities = segments["#{ entity }"]
115
- if entities.nil?
116
- load_#{entity}(raw)
117
- entities = segments["#{ entity }"]
118
- end
119
- end
120
-
121
- entities
122
- end
123
-
124
- def #{entity}_at(pos, persist = false)
125
- segment_index("#{ entity }", persist ? File.join(@persist_dir, 'ranges') : nil)[pos]
126
- end
127
-
128
- EOC
129
- end
130
-
131
- def self.prepare_multiple(docs, entity)
132
- missing = []
133
- docs.each do |doc|
134
- begin
135
- doc.send(entity)
136
- rescue MultipleEntity
137
- missing << doc
138
- end
139
- end
140
- res = self.send("multiple_produce_#{entity.to_s}", missing) if missing.any?
141
- case res
142
- when Array
143
- res.each_with_index do |res,i|
144
- missing[i].multiple_result ||= {}
145
- missing[i].multiple_result[entity] = res
146
- end
147
- when Hash
148
- res.each do |document,res|
149
- case document
150
- when Corpus::Document
151
- document.multiple_result[entity] = res
152
- when String
153
- document = missing.select{|d| d.docid == document}.first
154
- document.multiple_result[entity] = res
155
- end
156
- end
157
- end
158
- missing.each{|doc|
159
- doc.send entity
160
- }
161
- end
162
-
163
-
164
- #{{{ PERSISTENCE
165
-
166
- TSV_REPOS = {}
167
- FIELDS_FOR_ENTITY_PERSISTENCE = {}
168
- def self.persist(entity, fields = nil)
169
-
170
- if not fields.nil?
171
- fields = [fields] if not Array === fields
172
- fields = fields.collect{|f| f.to_s}
173
- FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
174
- end
175
-
176
- self.class_eval <<-EOC, __FILE__, __LINE__
177
- def load_with_persistence_#{entity}(raw = false)
178
- fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
179
-
180
- tsv_file = File.join(@persist_dir.find, "#{ entity }")
181
-
182
- return nil if raw == :check and File.exists? tsv_file
183
-
184
- annotations = Persist.persist("Entity[#{ entity }]", :tsv, :file => tsv_file) do
185
- segments = produce_#{entity}
186
- tsv = Segment.tsv(segments, fields)
187
- end
188
-
189
- return annotations if raw
190
-
191
- annotations.unnamed = true
192
- annotations.collect{|id, annotation|
193
- Segment.load_tsv_values(text, annotation, annotations.fields)
194
- }
195
- end
196
- EOC
197
- end
198
-
199
- def self.persist_in_tsv(entity, tsv = nil, fields = nil)
200
- tsv = TSV.setup(Persist.open_tokyocabinet(tsv, false, :list), :key => "ID", :fields => ["Start", "End", "JSON", "Document ID", "Entity Type"]).tap{|t| t.unnamed = true, t.close} if Path === tsv
201
-
202
- if ! tsv.nil? && ! tsv.respond_to?(:keys)
203
- fields = tsv
204
- tsv = nil
205
- end
206
-
207
- TSV_REPOS[entity.to_s] = tsv
208
-
209
- if ! fields.nil?
210
- fields = [fields] if not Array === fields
211
- fields = fields.collect{|f| f.to_s}
212
- FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
213
- end
214
-
215
- self.class_eval <<-EOC, __FILE__, __LINE__ + 1
216
- def load_with_persistence_#{entity}(raw = false)
217
- repo = TSV_REPOS["#{ entity }"]
218
- if repo.nil?
219
- raise "No persistence file or persistence dir for persist_in_tsv" if persist_dir.nil?
220
- repo = Persist.open_tokyocabinet(persist_dir.annotations_by_type.find, true, :marshal_tsv)
221
- end
222
-
223
- fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
224
- begin
225
- if ! repo.include?("#{ entity }")
226
- segments = produce_#{entity}
227
- repo.write_and_read do
228
- repo["#{entity}"] = Segment.tsv(segments, fields) if segments.any?
229
- end
230
- else
231
- if raw == :check
232
- repo.close
233
- return nil
234
- end
235
- end
236
-
237
- annotations = repo["#{entity}"]
238
-
239
- repo.close
240
-
241
- return annotations if raw
242
-
243
- annotations.unnamed = true
244
- annotations.collect{|id, annotation|
245
- Segment.load_tsv_values(text, annotation, annotations.fields)
246
- }
247
- ensure
248
- repo.close
249
- end
250
- end
251
- EOC
252
- end
253
-
254
- def self.persist_in_global_tsv(entity, tsv = nil, fields = nil, doc_field = nil, entity_field = nil)
255
- tsv = TSV.setup(Persist.open_tokyocabinet(tsv, false, :list), :key => "ID", :fields => (fields || ["Start", "End", "JSON", "Document ID", "Entity Type"])).tap{|t| t.unnamed = true, t.close} if Path === tsv
256
-
257
- doc_field ||= "Document ID"
258
- entity_field ||= "Entity Type"
259
-
260
- TSV_REPOS[entity.to_s] = tsv
261
-
262
- if not fields.nil?
263
- fields = [fields] if not Array === fields
264
- fields = fields.collect{|f| f.to_s}
265
- else
266
- fields = nil
267
- end
268
-
269
- FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
270
-
271
- self.class_eval <<-EOC, __FILE__, __LINE__ + 1
272
- def load_with_persistence_#{entity}(raw = false)
273
- fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
274
-
275
- data = TSV_REPOS["#{ entity }"] || @global_persistence
276
-
277
- begin
278
-
279
- if data.respond_to? :persistence_path and String === data.persistence_path
280
- data.filter(data.persistence_path + '.filters')
281
- end
282
-
283
- keys = data.read_and_close do
284
-
285
- fields = data.fields if fields.nil? and data.respond_to? :fields
286
-
287
- data.add_filter("field:#{ doc_field }", @docid) if fields.include?("#{doc_field}")
288
- data.add_filter("field:#{ entity_field }", "#{ entity }") if fields.include?("#{entity_field}")
289
- keys = data.keys
290
- data.pop_filter if fields.include?("#{entity_field}")
291
- data.pop_filter if fields.include?("#{doc_field}")
292
-
293
- keys
294
- end
295
-
296
-
297
- if keys.empty?
298
- segments = produce_#{entity}
299
- segments << Segment.setup("No #{entity} found in document " + @docid.to_s, -1) if segments.empty?
300
- tsv = Segment.tsv(segments, *fields.reject{|f| ["#{doc_field}", "#{entity_field}", "Start", "End", "annotation_types"].include? f})
301
-
302
- tsv.add_field "#{ doc_field }" do
303
- @docid
304
- end
305
-
306
- tsv.add_field "#{ entity_field }" do
307
- "#{ entity }"
308
- end
309
-
310
- keys = data.write_and_close do
311
- data.add_filter("field:#{ doc_field }", @docid) if fields.include?("#{doc_field}")
312
- data.add_filter("field:#{ entity_field }", "#{ entity }") if fields.include?("#{entity_field}")
313
- keys = tsv.collect do |key, value|
314
- data[key] = value
315
- key
316
- end
317
- data.pop_filter if fields.include?("#{entity_field}")
318
- data.pop_filter if fields.include?("#{doc_field}")
319
- keys
320
- end
321
-
322
- else
323
- return nil if raw == :check
324
- end
325
-
326
- return data.values if raw
327
-
328
- start_pos = data.identify_field "Start"
329
- data.read_and_close do
330
- data.chunked_values_at(keys).collect{|annotation|
331
- begin
332
- pos = annotation[start_pos]
333
- Segment.load_tsv_values(text, annotation, fields) unless [-1, "-1", [-1], ["-1"]].include?(pos)
334
- rescue
335
- Log.exception $!
336
- iif keys
337
- iif [text, annotation]
338
- end
339
-
340
- }.compact
341
- end
342
- ensure
343
- data.close
344
- end
345
-
346
- end
347
- EOC
348
- end
349
-
350
- def segment_index(name, persist_dir = nil)
351
- @segment_indices[name] ||= Segment.index(self.send(name), persist_dir.nil? ? :memory : File.join(persist_dir, name + '.range'))
352
- end
353
-
354
- def load_into(segment, *annotations)
355
- options = annotations.pop if Hash === annotations.last
356
- options ||= {}
357
-
358
- if options[:persist] and not @persist_dir.nil?
359
- persist_dir = File.join(@persist_dir, 'ranges')
360
- else
361
- persist_dir = nil
362
- end
363
-
364
- Segmented.setup(segment, {})
365
- annotations.collect do |name|
366
- name = name.to_s
367
- index = segment_index(name, persist_dir)
368
- annotations = index[segment.range]
369
- segment.segments[name] ||= {}
370
- segment.segments[name] = annotations
371
- class << segment
372
- self
373
- end.class_eval "def #{ name }; @segments['#{ name }']; end", __FILE__, __LINE__ + 1
374
- end
375
-
376
- segment
377
- end
378
-
379
- def entity
380
- Object::Document.setup(self.docid, corpus)
381
- end
382
- end
383
- end