rbbt-text 0.2.1 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/get_ppis.rb +52 -0
- data/lib/rbbt/bow/dictionary.rb +9 -9
- data/lib/rbbt/bow/misc.rb +86 -2
- data/lib/rbbt/corpus/corpus.rb +55 -0
- data/lib/rbbt/corpus/document.rb +289 -0
- data/lib/rbbt/corpus/document_repo.rb +115 -0
- data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
- data/lib/rbbt/ner/NER.rb +7 -5
- data/lib/rbbt/ner/abner.rb +13 -2
- data/lib/rbbt/ner/annotations.rb +182 -51
- data/lib/rbbt/ner/annotations/annotated.rb +15 -0
- data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
- data/lib/rbbt/ner/annotations/relations.rb +25 -0
- data/lib/rbbt/ner/annotations/token.rb +28 -0
- data/lib/rbbt/ner/annotations/transformed.rb +170 -0
- data/lib/rbbt/ner/banner.rb +8 -5
- data/lib/rbbt/ner/chemical_tagger.rb +34 -0
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
- data/lib/rbbt/ner/oscar3.rb +1 -1
- data/lib/rbbt/ner/oscar4.rb +41 -0
- data/lib/rbbt/ner/patterns.rb +132 -0
- data/lib/rbbt/ner/rnorm.rb +141 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
- data/lib/rbbt/ner/token_trieNER.rb +185 -51
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
- data/lib/rbbt/nlp/nlp.rb +235 -0
- data/share/install/software/ABNER +0 -4
- data/share/install/software/ChemicalTagger +81 -0
- data/share/install/software/Gdep +115 -0
- data/share/install/software/Geniass +118 -0
- data/share/install/software/OSCAR4 +16 -0
- data/share/install/software/StanfordParser +15 -0
- data/share/patterns/drug_induce_disease +22 -0
- data/share/rnorm/cue_default +10 -0
- data/share/rnorm/tokens_default +86 -0
- data/share/{stopwords → wordlists/stopwords} +0 -0
- data/test/rbbt/bow/test_bow.rb +1 -1
- data/test/rbbt/bow/test_dictionary.rb +1 -1
- data/test/rbbt/bow/test_misc.rb +1 -1
- data/test/rbbt/corpus/test_corpus.rb +99 -0
- data/test/rbbt/corpus/test_document.rb +222 -0
- data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
- data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
- data/test/rbbt/ner/test_abner.rb +1 -1
- data/test/rbbt/ner/test_annotations.rb +64 -2
- data/test/rbbt/ner/test_banner.rb +1 -1
- data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
- data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
- data/test/rbbt/ner/test_patterns.rb +66 -0
- data/test/rbbt/ner/test_regexpNER.rb +1 -1
- data/test/rbbt/ner/test_rnorm.rb +47 -0
- data/test/rbbt/ner/test_token_trieNER.rb +60 -35
- data/test/rbbt/nlp/test_nlp.rb +88 -0
- data/test/test_helper.rb +20 -0
- metadata +93 -20
data/bin/get_ppis.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rbbt-util'
|
4
|
+
require 'rbbt/annotations/corpus'
|
5
|
+
require 'rbbt/annotations/corpus/pubmed'
|
6
|
+
require 'rbbt/annotations/relationships/ppi'
|
7
|
+
require 'rbbt/sources/pubmed'
|
8
|
+
require 'rbbt/ner/annotations'
|
9
|
+
require 'rbbt/ner/token_trieNER'
|
10
|
+
require 'rbbt/ner/annotations/transformed'
|
11
|
+
require 'rbbt/ner/chemical_tagger'
|
12
|
+
|
13
|
+
# Named-entity recognisers are created lazily on first use and cached in
# closure-captured locals. Class variables (@@var) at the top level of a script
# raise a RuntimeError on Ruby >= 3.0, so they are not used here.

# Recogniser for chemical compounds, backed by ChemicalTagger.
chemical_tagger = nil
Corpus.define_entity_ner "Compounds", false do |doc|
  chemical_tagger ||= ChemicalTagger.new
  chemical_tagger.entities(doc.text)
end

# Recogniser for diseases: a longest-match TokenTrieNER that merges the
# COSTART, CTCAE and Polysearch disease resources.
disease_tokenizer = nil
Corpus.define_entity_ner "Diseases", false do |doc|
  if disease_tokenizer.nil?
    disease_tokenizer = TokenTrieNER.new [], :longest_match => true
    disease_tokenizer.merge TSV.new(Rbbt.share.databases.COSTART.COSTART, :native => 0, :extra => 0, :flatten => true), :COSTART
    disease_tokenizer.merge TSV.new(Rbbt.share.databases.CTCAE.CTCAE, :native => 0, :extra => 1, :flatten => true), :CTCAE
    disease_tokenizer.merge Rbbt.share.databases.Polysearch.disease, :disease
  end
  disease_tokenizer.entities(doc.text)
end

corpus = Corpus.new Rbbt.tmp.corpus["PPIS2"].find

# Add the documents returned by the PubMed query "Cancer" (arguments: 5000, :abstract).
docids = corpus.add_pubmed_query("Cancer", 5000, :abstract)

Misc.profile do
  # Only the first 101 documents (indices 0..100) are processed.
  docids[0..100].each do |docid|
    puts "ARTICLE: #{ docid }"
    doc = corpus.docid(docid)
    diseases = doc.produce_diseases
    #puts "Diseases: #{diseases.collect{|g| [g,g.id,g.offset] * ":"} * ", "}"
    #sentences = doc.sentences
    #diseases_index = Segment.index(diseases)
    #sentences.each do |sentence|
    #  diseases_in_sentence = diseases_index[sentence.range]
    #  next if diseases_in_sentence.empty?
    #  Transformed.transform(sentence, sentence.make_relative(diseases_in_sentence.dup)) do |entity|
    #    entity.html
    #  end
    #  puts "---#{[sentence.id, sentence.offset] * ":"}"
    #  puts sentence
    #  puts "Diseases: #{diseases_in_sentence.collect{|g| [g,g.id,g.offset] * ":"} * ", "}"
    #  sentence.restore
    #end
  end
end
|
data/lib/rbbt/bow/dictionary.rb
CHANGED
@@ -74,15 +74,15 @@ class Dictionary::TF_IDF
|
|
74
74
|
end
|
75
75
|
|
76
76
|
def best(options = {})
|
77
|
-
|
77
|
+
high, low, limit = {
|
78
78
|
:low => 0,
|
79
|
-
:
|
79
|
+
:high => 1,
|
80
80
|
}.merge(options).
|
81
|
-
values_at(:
|
81
|
+
values_at(:high, :low, :limit)
|
82
82
|
|
83
83
|
num_docs = @num_docs.to_f
|
84
84
|
best = df.select{|term, value|
|
85
|
-
value >= low && value <=
|
85
|
+
value >= low && value <= high
|
86
86
|
}.collect{|p|
|
87
87
|
term = p.first
|
88
88
|
df_value = p.last
|
@@ -147,19 +147,19 @@ class Dictionary::KL
|
|
147
147
|
end
|
148
148
|
|
149
149
|
def best(options = {})
|
150
|
-
|
150
|
+
high, low, limit = {
|
151
151
|
:low => 0,
|
152
|
-
:
|
152
|
+
:high => 1,
|
153
153
|
}.merge(options).
|
154
|
-
values_at(:
|
154
|
+
values_at(:high, :low, :limit)
|
155
155
|
|
156
156
|
pos_df = @pos_dict.df
|
157
157
|
neg_df = @neg_dict.df
|
158
158
|
|
159
159
|
best = {}
|
160
160
|
terms.select{|term|
|
161
|
-
pos_df[term] >= low && pos_df[term] <=
|
162
|
-
neg_df[term] >= low && neg_df[term] <=
|
161
|
+
pos_df[term] >= low && pos_df[term] <= high ||
|
162
|
+
neg_df[term] >= low && neg_df[term] <= high
|
163
163
|
}.each{|term|
|
164
164
|
pos = pos_df[term]
|
165
165
|
neg = neg_df[term]
|
data/lib/rbbt/bow/misc.rb
CHANGED
@@ -1,7 +1,91 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rbbt/util/open'
|
3
3
|
|
4
|
-
Rbbt.
|
4
|
+
Rbbt.share.wordlists.trigger_terms.define_as_url "http://zope.bioinfo.cnio.es/hpylori/pubmedxml2dir_files/ppi_trigger_term_table.txt"
|
5
5
|
|
6
|
-
$stopwords
|
6
|
+
$stopwords = Rbbt.share.wordlists.stopwords.read.scan(/\w+/)
|
7
|
+
|
8
|
+
# Greek letter names mapped to a short Latin transliteration.
$greek = {
  "alpha" => "a",
  "beta" => "b",
  "gamma" => "g",
  "delta" => "d",
  "epsilon" => "e",
  "zeta" => "z",
  "eta" => "e",
  "theta" => "th",
  "iota" => "i",
  "kappa" => "k",
  "lambda" => "l",
  "mu" => "m",
  "nu" => "n",
  "xi" => "x",
  "omicron" => "o",
  "pi" => "p",
  "rho" => "r",
  "sigma" => "s",
  "tau" => "t",
  "upsilon" => "u",
  "phi" => "ph",
  "chi" => "ch",
  "psi" => "ps",
  "omega" => "o"
}

# Reverse lookup from transliteration to letter name. Two transliterations are
# shared ("e" by epsilon/eta, "o" by omicron/omega); the entry defined last
# wins, so "e" => "eta" and "o" => "omega".
$inverse_greek = $greek.invert
|
37
|
+
|
38
|
+
# Text-mining helpers added to String.
class String
  # Consonant clusters that are considered ordinary. Filled at load time from
  # the 'wordlists/consonants' file under Rbbt.datadir when that file exists;
  # otherwise it stays empty.
  CONSONANTS = []
  if File.exist? File.join(Rbbt.datadir, 'wordlists/consonants')
    Object::Open.read(File.join(Rbbt.datadir, 'wordlists/consonants')).each_line{|l| CONSONANTS << l.chomp}
  end

  # Uses heuristics to check if a string seems like a special word, like a
  # gene name. Returns true or false.
  def is_special?
    # Only consonants
    return true if self =~ /^[bcdfghjklmnpqrstvwxz]+$/i

    # Not a word
    return false if self =~ /[^\s]\s[^\s]/
    return false if self.length < 3

    # Alphanumeric
    return true if self =~ /[0-9]/ && self =~ /[a-z]/i
    # All Caps
    return true if self =~ /[A-Z]{2,}/
    # Caps Mix
    return true if self =~ /[a-z][A-Z]/
    # All consonants
    # NOTE(review): this pattern only matches a line made of one single
    # letter; possibly /^[a-z]+$/i was intended -- confirm before changing.
    return true if self =~ /^[a-z]$/i && self !~ /[aeiou]/i
    # Dashed word
    return true if self =~ /(^\w-|-\w$)/
    # Too many consonants (very heuristic): a run of three or more
    # non-vowel characters that is not listed in CONSONANTS.
    if self =~ /([^aeiouy]{3,})/i && !CONSONANTS.include?($1.downcase)
      return true
    end

    return false
  end

  # Returns a copy of the string with its first character in lowercase.
  # The rest of the string, including any newlines, is left untouched.
  def downcase_first
    return "" if self == ""
    self[0, 1].downcase + self[1..-1]
  end

  # Turns a roman number into arabic form if possible. Just simple
  # romans only (I, II, III, IV, V and X); returns nil for anything else.
  def arabic
    case self
    when /^I$/   then 1
    when /^II$/  then 2
    when /^III$/ then 3
    when /^IV$/  then 4
    when /^V$/   then 5
    when /^X$/   then 10
    else nil
    end
  end
end
|
7
91
|
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'rbbt/corpus/document'
|
2
|
+
require 'rbbt/corpus/document_repo'
|
3
|
+
|
4
|
+
# A collection of documents stored in a DocumentRepo, together with the
# location where per-document annotations are persisted and a TSV of
# annotations shared across documents.
class Corpus
  attr_accessor :corpora_path, :document_repo, :persistence_dir, :global_annotations

  # corpora_path:: location of the corpus; defaults to Rbbt.corpora. Anything
  #                that is not a Resource::Path is wrapped with Resource::Path.path.
  def initialize(corpora_path = nil)
    @corpora_path = case
                    when corpora_path.nil?
                      Rbbt.corpora
                    when (not Resource::Path === corpora_path)
                      Resource::Path.path(corpora_path)
                    else
                      corpora_path
                    end

    @document_repo = DocumentRepo.get @corpora_path.document_repo, false
    @persistence_dir = File.join(@corpora_path, "annotations")
    @global_annotations = TSV.new(TCHash.get(File.join(@persistence_dir, "global_annotations"), :list), :list, :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
    @global_annotations.unnamed = true
  end

  # Path under persistence_dir where annotations for +docid+ are kept.
  def persistence_for(docid)
    File.join(persistence_dir, docid)
  end

  # Returns the Document whose docid is "namespace:id:type:hash".
  def document(namespace, id, type, hash)
    build_document([namespace, id, type, hash] * ":")
  end

  # Returns the Document for a complete docid string.
  def docid(docid)
    build_document(docid)
  end

  # Adds +text+ to the document repository, using the MD5 hex digest of the
  # text as the hash part. Returns whatever the repository's +add+ returns.
  def add_document(text, namespace, id, type = nil)
    hash = Digest::MD5.hexdigest(text)
    @document_repo.add(text, namespace, id, type, hash)
  end

  # Returns a Document for every docid the repository finds for the given
  # (possibly nil) namespace, id, type and hash.
  def find(namespace=nil, id = nil, type = nil, hash = nil)
    @document_repo.find(namespace, id, type, hash).collect{|found| build_document(found) }
  end

  # Returns a Document for every docid the repository's +find_docid+ returns.
  def find_docid(docid)
    @document_repo.find_docid(docid).collect{|found| build_document(found) }
  end

  # True when at least one document is found for the given parts.
  def exists?(namespace=nil, id = nil, type = nil, hash = nil)
    find(namespace, id, type, hash).any?
  end

  private

  # Builds the Document for +docid+: its persistence path, its text from the
  # repository and the shared global annotations.
  def build_document(docid)
    Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations)
  end
end
|
@@ -0,0 +1,289 @@
|
|
1
|
+
require 'rbbt/ner/annotations'
|
2
|
+
require 'rbbt/util/tsv'
|
3
|
+
require 'rbbt/util/resource'
|
4
|
+
require 'rbbt/util/misc'
|
5
|
+
require 'json'
|
6
|
+
|
7
|
+
# A text document identified by a docid of the form "namespace:id:type:hash".
# Annotations (lists of segments) are attached per entity name. Entities are
# declared with Document.define and may be persisted with Document.persist,
# Document.persist_in_tsv or Document.persist_in_global_tsv.
class Document

  # NOTE(review): the :hash accessor replaces Object#hash for instances, so
  # Document objects should not be used as Hash keys -- confirm with callers.
  attr_accessor :text, :docid, :namespace, :id, :type, :hash, :annotations, :segment_indeces, :persistence_dir, :global_persistence

  # persistence_dir::    where annotations are persisted; wrapped with
  #                      Resource::Path.path unless it already is a Resource::Path.
  # docid::              "namespace:id:type:hash" identifier.
  # text::               document text.
  # global_persistence:: store used by persist_in_global_tsv when no explicit
  #                      TSV was registered for the entity.
  def initialize(persistence_dir = nil, docid = nil, text = nil, global_persistence = nil)
    @annotations = {}
    @segment_indeces = {}

    if not persistence_dir.nil?
      # Class === instance is the type check; Class == instance is always false.
      @persistence_dir = Resource::Path === persistence_dir ? persistence_dir : Resource::Path.path(persistence_dir)
    end

    @global_persistence = global_persistence

    if not docid.nil?
      @docid = docid
      update_docid
    end
    @text = text unless text.nil?
  end

  # Splits the docid into namespace, id, type and hash. The -1 limit keeps
  # trailing empty parts as empty strings.
  def update_docid
    @namespace, @id, @type, @hash = docid.split(":", -1)
  end

  # Sets the docid and refreshes the parts derived from it.
  def docid=(docid)
    @docid = docid
    update_docid
  end

  # Serialises a segment into an array.
  #
  # Without +fields+: [offset, end, info as JSON].
  # With +fields+: [offset, end, value of each (downcased) field in the
  # segment's info]; array values are joined with "|". In this mode the
  # segment's info hash is modified: a "literal" entry holding the segment
  # text (whitespace normalised to spaces) is added and the hash is extended
  # with IndiferentHash.
  #
  # The end is nil when the offset is nil and -1 when the offset is -1.
  def self.save_segment(segment, fields = nil)
    eend = case segment.offset
           when nil then nil
           when -1 then -1
           else segment.end
           end

    return [segment.offset, eend, segment.info.to_json] if fields.nil?

    info = segment.info
    info["literal"] = segment.to_s.gsub(/\s/,' ')
    info.extend IndiferentHash
    values = info.values_at(*fields.collect{|f| f.downcase}).collect{|v| Array === v ? v * "|" : v}
    [segment.offset, eend].concat values
  end

  # Rebuilds a segment from an annotation produced by save_segment.
  #
  # text::       document text handed to Segment.load.
  # annotation:: stored values; positions 0 and 1 are start and end.
  # fields::     nil when the info is stored as JSON in position 2; otherwise
  #              the field names whose values make up the info ("|" separated
  #              values are split into arrays).
  # docid::      document id handed to Segment.load (nil when omitted).
  def self.load_segment(text, annotation, fields = nil, docid = nil)
    if fields.nil?
      start, eend, info = annotation.values_at 0,1,2
      info = JSON.parse(info)
    else
      start, eend = annotation.values_at 0,1
      info = Misc.process_to_hash(fields) do |names| annotation.values_at(*names.collect{|f| f.downcase}).collect{|v| v.index("|").nil? ? v : v.split("|")} end
    end

    Segment.load(text, start, eend, info, docid)
  end

  # Builds a TSV keyed by segment id with fields Start, End and either "Info"
  # or the given +fields+. Segments with a nil offset are left out.
  def self.tsv(segments, fields = nil)
    tsv = TSV.new({}, :list, :key => "ID", :fields => %w(Start End))
    if fields.nil?
      tsv.fields += ["Info"]
    else
      tsv.fields += fields
    end

    segments.each{|segment| tsv[segment.id] = Document.save_segment(segment, fields) unless segment.offset.nil?}

    tsv
  end


  #{{{ PERSISTENCE

  # Explicit stores registered per entity name by persist_in_tsv and
  # persist_in_global_tsv.
  TSV_REPOS = {}
  # Field names registered per entity name; no entry means the info is stored as JSON.
  FIELDS_FOR_ENTITY_PERSISTENCE = {}

  # Normalises +fields+ to an array of strings and registers it for +entity+.
  # Does nothing when +fields+ is nil.
  def self.register_persistence_fields(entity, fields)
    return nil if fields.nil?
    fields = [fields] if not Array === fields
    fields = fields.collect{|f| f.to_s}
    FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
    fields
  end
  private_class_method :register_persistence_fields

  # Defines load_with_persistence_<entity>, which persists the segments
  # returned by produce_<entity> through Persistence.persist, in a file named
  # after the entity under the document's persistence_dir.
  def self.persist(entity, fields = nil)
    register_persistence_fields(entity, fields)

    self.class_eval <<-EOC
      def load_with_persistence_#{entity}
        fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]

        annotations = Persistence.persist("#{ entity }", :Entity, :tsv_string,
                            :persistence_file => File.join(@persistence_dir, "#{ entity }")) do
          Document.tsv(produce_#{entity}, fields)
        end

        annotations.collect{|id, annotation| Document.load_segment(text, annotation, fields, @docid)}
      end
    EOC
  end

  # Defines load_with_persistence_<entity>, which stores the entity's TSV
  # under the entity name in a repository. The repository is +tsv+ when given;
  # otherwise a TCHash located through persistence_dir.annotations_by_type.
  # When the second argument does not respond to :keys it is taken as +fields+.
  def self.persist_in_tsv(entity, tsv = nil, fields = nil)
    if not tsv.nil? and not tsv.respond_to?(:keys)
      fields = tsv
      tsv = nil
    end

    TSV_REPOS[entity.to_s] = tsv

    register_persistence_fields(entity, fields)

    self.class_eval <<-EOC
      def load_with_persistence_#{entity}
        repo = TSV_REPOS["#{ entity }"]
        if repo.nil?
          raise "No persistence file or persistence dir for persist_in_tsv" if persistence_dir.nil?
          repo = TCHash.get(persistence_dir.annotations_by_type.find, TCHash::TSVSerializer)
        end

        fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]

        if not repo.include? "#{ entity }"
          tsv = Document.tsv(produce_#{entity}, fields)
          repo.write
          repo["#{entity}"] = tsv
          repo.read
        end

        annotations = repo["#{entity}"]

        repo.close

        annotations.collect{|id, annotation| Document.load_segment(text, annotation, fields, @docid)}
      end
    EOC
  end

  # Defines load_with_persistence_<entity>, which keeps the annotations of all
  # documents and entities in one TSV: +tsv+ when given, otherwise the
  # document's global_persistence. Rows are selected by filtering on
  # +doc_field+ (default "Document ID") and +entity_field+ (default
  # "Entity Type"). When the second argument does not respond to :keys the
  # remaining arguments are shifted one position to the left.
  #
  # When a document yields no segments, a placeholder segment with offset -1
  # is stored so the document is not processed again; placeholders are left
  # out of the result.
  def self.persist_in_global_tsv(entity, tsv = nil, fields = nil, doc_field = nil, entity_field = nil)
    if not tsv.nil? and not tsv.respond_to?(:keys)
      entity_field = doc_field if doc_field
      doc_field = fields if fields
      fields = tsv if tsv
      tsv = nil
    end

    doc_field ||= "Document ID"
    entity_field ||= "Entity Type"

    TSV_REPOS[entity.to_s] = tsv

    register_persistence_fields(entity, fields)

    # "\#{" is escaped where the value must be resolved when the generated
    # method runs rather than when it is defined.
    self.class_eval <<-EOC
      def load_with_persistence_#{entity}
        fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]

        data = TSV_REPOS["#{ entity }"]

        if data.nil?
          data = global_persistence
        end

        data.filter
        data.add_filter("field:#{ doc_field }", @docid)
        data.add_filter("field:#{ entity_field }", "#{ entity }")

        if data.keys.empty?
          segments = produce_#{entity}
          segments << Segment.annotate("No #{entity} found in document \#{ @docid }", -1) if segments.empty?
          tsv = Document.tsv(segments, fields)

          tsv.add_field "#{ doc_field }" do
            @docid
          end

          tsv.add_field "#{ entity_field }" do
            "#{ entity }"
          end

          data.write
          data.merge!(tsv)
          data.read
        end

        segments = []
        data.each{|id, annotation| segments << Document.load_segment(text, annotation, fields, @docid) unless annotation[1].to_i == -1}

        data.pop_filter
        data.pop_filter

        segments
      end
    EOC
  end


  # Declares an entity. The block becomes produce_<entity>, and three methods
  # are generated:
  #
  # load_<entity>::  fills annotations[entity], through
  #                  load_with_persistence_<entity> when that method exists
  #                  and the document has a persistence_dir, otherwise through
  #                  produce_<entity>. Does nothing if the entry already exists.
  # <entity>::       returns annotations[entity], loading it first when nil.
  # <entity>_at::    returns the segment index lookup for a position.
  def self.define(entity, &block)
    send :define_method, "produce_#{entity}", &block

    self.class_eval <<-EOC
      def load_#{entity}
        return if annotations.include? "#{ entity }"
        if self.respond_to?("load_with_persistence_#{entity}") and not @persistence_dir.nil?
          annotations["#{entity}"] = load_with_persistence_#{entity}
        else
          annotations["#{ entity }"] = produce_#{entity}
        end
      end

      def #{entity}
        load_#{entity} if annotations["#{ entity }"].nil?
        annotations["#{ entity }"]
      end

      def #{entity}_at(pos, persist = false)
        segment_index("#{ entity }", persist ? File.join(@persistence_dir, 'ranges') : nil)[pos]
      end

    EOC
  end

  # Returns the memoised Segment.index over the segments of entity +name+.
  # The index is built with :memory, or with the file "<name>.range" under
  # +persistence_dir+ when one is given.
  def segment_index(name, persistence_dir = nil)
    @segment_indeces[name] ||= Segment.index(self.send(name), persistence_dir.nil? ? :memory : File.join(persistence_dir, name + '.range'))
  end

  # Attaches to +segment+ the annotations of each named entity that the
  # segment index returns for the segment's range, and defines a reader per
  # entity name on the segment. A trailing Hash is taken as options; with
  # :persist and a persistence_dir the indices use the 'ranges' directory.
  # Returns the segment.
  def load_into(segment, *annotations)
    options = annotations.pop if Hash === annotations.last
    options ||= {}
    if options[:persist] and not @persistence_dir.nil?
      persistence_dir = File.join(@persistence_dir, 'ranges')
    else
      persistence_dir = nil
    end

    segment.extend Annotated
    segment.annotations ||= {}
    annotations.each do |name|
      name = name.to_s
      segment.annotations[name] = segment_index(name, persistence_dir)[segment.range]
      class << segment
        self
      end.class_eval "def #{ name }; @annotations['#{ name }']; end"
    end

    segment
  end
end
|