rbbt-text 0.2.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/get_ppis.rb +52 -0
- data/lib/rbbt/bow/dictionary.rb +9 -9
- data/lib/rbbt/bow/misc.rb +86 -2
- data/lib/rbbt/corpus/corpus.rb +55 -0
- data/lib/rbbt/corpus/document.rb +289 -0
- data/lib/rbbt/corpus/document_repo.rb +115 -0
- data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
- data/lib/rbbt/ner/NER.rb +7 -5
- data/lib/rbbt/ner/abner.rb +13 -2
- data/lib/rbbt/ner/annotations.rb +182 -51
- data/lib/rbbt/ner/annotations/annotated.rb +15 -0
- data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
- data/lib/rbbt/ner/annotations/relations.rb +25 -0
- data/lib/rbbt/ner/annotations/token.rb +28 -0
- data/lib/rbbt/ner/annotations/transformed.rb +170 -0
- data/lib/rbbt/ner/banner.rb +8 -5
- data/lib/rbbt/ner/chemical_tagger.rb +34 -0
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
- data/lib/rbbt/ner/oscar3.rb +1 -1
- data/lib/rbbt/ner/oscar4.rb +41 -0
- data/lib/rbbt/ner/patterns.rb +132 -0
- data/lib/rbbt/ner/rnorm.rb +141 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
- data/lib/rbbt/ner/token_trieNER.rb +185 -51
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
- data/lib/rbbt/nlp/nlp.rb +235 -0
- data/share/install/software/ABNER +0 -4
- data/share/install/software/ChemicalTagger +81 -0
- data/share/install/software/Gdep +115 -0
- data/share/install/software/Geniass +118 -0
- data/share/install/software/OSCAR4 +16 -0
- data/share/install/software/StanfordParser +15 -0
- data/share/patterns/drug_induce_disease +22 -0
- data/share/rnorm/cue_default +10 -0
- data/share/rnorm/tokens_default +86 -0
- data/share/{stopwords → wordlists/stopwords} +0 -0
- data/test/rbbt/bow/test_bow.rb +1 -1
- data/test/rbbt/bow/test_dictionary.rb +1 -1
- data/test/rbbt/bow/test_misc.rb +1 -1
- data/test/rbbt/corpus/test_corpus.rb +99 -0
- data/test/rbbt/corpus/test_document.rb +222 -0
- data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
- data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
- data/test/rbbt/ner/test_abner.rb +1 -1
- data/test/rbbt/ner/test_annotations.rb +64 -2
- data/test/rbbt/ner/test_banner.rb +1 -1
- data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
- data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
- data/test/rbbt/ner/test_patterns.rb +66 -0
- data/test/rbbt/ner/test_regexpNER.rb +1 -1
- data/test/rbbt/ner/test_rnorm.rb +47 -0
- data/test/rbbt/ner/test_token_trieNER.rb +60 -35
- data/test/rbbt/nlp/test_nlp.rb +88 -0
- data/test/test_helper.rb +20 -0
- metadata +93 -20
data/bin/get_ppis.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rbbt-util'
|
4
|
+
require 'rbbt/annotations/corpus'
|
5
|
+
require 'rbbt/annotations/corpus/pubmed'
|
6
|
+
require 'rbbt/annotations/relationships/ppi'
|
7
|
+
require 'rbbt/sources/pubmed'
|
8
|
+
require 'rbbt/ner/annotations'
|
9
|
+
require 'rbbt/ner/token_trieNER'
|
10
|
+
require 'rbbt/ner/annotations/transformed'
|
11
|
+
require 'rbbt/ner/chemical_tagger'
|
12
|
+
|
13
|
+
# NER engines are built lazily, on the first document that needs them, and
# then reused. They are kept in plain locals captured by the blocks below:
# the previous `@@` class variables were accessed from the top level of the
# script, which raises "class variable access from toplevel" on Ruby >= 3.0.
chemical_tagger = nil
disease_tokenizer = nil

# "Compounds" entities: whatever ChemicalTagger finds in the document text.
Corpus.define_entity_ner "Compounds", false do |doc|
  chemical_tagger ||= ChemicalTagger.new
  chemical_tagger.entities(doc.text)
end

# "Diseases" entities: longest-match dictionary lookup over the COSTART,
# CTCAE and Polysearch disease vocabularies.
Corpus.define_entity_ner "Diseases", false do |doc|
  if disease_tokenizer.nil?
    tokenizer = TokenTrieNER.new [], :longest_match => true
    tokenizer.merge TSV.new(Rbbt.share.databases.COSTART.COSTART, :native => 0, :extra => 0, :flatten => true), :COSTART
    tokenizer.merge TSV.new(Rbbt.share.databases.CTCAE.CTCAE, :native => 0, :extra => 1, :flatten => true), :CTCAE
    tokenizer.merge Rbbt.share.databases.Polysearch.disease, :disease
    # Only publish the tokenizer once every dictionary has been merged, so a
    # failure half-way through does not leave a partial one behind.
    disease_tokenizer = tokenizer
  end
  disease_tokenizer.entities(doc.text)
end

corpus = Corpus.new Rbbt.tmp.corpus["PPIS2"].find

docids = corpus.add_pubmed_query("Cancer", 5000, :abstract)

# Profile disease recognition over the first 101 documents of the query.
Misc.profile do
  docids[0..100].each do |docid|
    puts "ARTICLE: #{ docid }"
    doc = corpus.docid(docid)
    diseases = doc.produce_diseases
    #puts "Diseases: #{diseases.collect{|g| [g,g.id,g.offset] * ":"} * ", "}"
    #sentences = doc.sentences
    #diseases_index = Segment.index(diseases)
    #sentences.each do |sentence|
    #  diseases_in_sentence = diseases_index[sentence.range]
    #  next if diseases_in_sentence.empty?
    #  Transformed.transform(sentence, sentence.make_relative(diseases_in_sentence.dup)) do |entity|
    #    entity.html
    #  end
    #  puts "---#{[sentence.id, sentence.offset] * ":"}"
    #  puts sentence
    #  puts "Diseases: #{diseases_in_sentence.collect{|g| [g,g.id,g.offset] * ":"} * ", "}"
    #  sentence.restore
    #end
  end
end
|
data/lib/rbbt/bow/dictionary.rb
CHANGED
@@ -74,15 +74,15 @@ class Dictionary::TF_IDF
|
|
74
74
|
end
|
75
75
|
|
76
76
|
def best(options = {})
|
77
|
-
|
77
|
+
high, low, limit = {
|
78
78
|
:low => 0,
|
79
|
-
:
|
79
|
+
:high => 1,
|
80
80
|
}.merge(options).
|
81
|
-
values_at(:
|
81
|
+
values_at(:high, :low, :limit)
|
82
82
|
|
83
83
|
num_docs = @num_docs.to_f
|
84
84
|
best = df.select{|term, value|
|
85
|
-
value >= low && value <=
|
85
|
+
value >= low && value <= high
|
86
86
|
}.collect{|p|
|
87
87
|
term = p.first
|
88
88
|
df_value = p.last
|
@@ -147,19 +147,19 @@ class Dictionary::KL
|
|
147
147
|
end
|
148
148
|
|
149
149
|
def best(options = {})
|
150
|
-
|
150
|
+
high, low, limit = {
|
151
151
|
:low => 0,
|
152
|
-
:
|
152
|
+
:high => 1,
|
153
153
|
}.merge(options).
|
154
|
-
values_at(:
|
154
|
+
values_at(:high, :low, :limit)
|
155
155
|
|
156
156
|
pos_df = @pos_dict.df
|
157
157
|
neg_df = @neg_dict.df
|
158
158
|
|
159
159
|
best = {}
|
160
160
|
terms.select{|term|
|
161
|
-
pos_df[term] >= low && pos_df[term] <=
|
162
|
-
neg_df[term] >= low && neg_df[term] <=
|
161
|
+
pos_df[term] >= low && pos_df[term] <= high ||
|
162
|
+
neg_df[term] >= low && neg_df[term] <= high
|
163
163
|
}.each{|term|
|
164
164
|
pos = pos_df[term]
|
165
165
|
neg = neg_df[term]
|
data/lib/rbbt/bow/misc.rb
CHANGED
@@ -1,7 +1,91 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rbbt/util/open'
|
3
3
|
|
4
|
-
Rbbt.
|
4
|
+
Rbbt.share.wordlists.trigger_terms.define_as_url "http://zope.bioinfo.cnio.es/hpylori/pubmedxml2dir_files/ppi_trigger_term_table.txt"
|
5
5
|
|
6
|
-
$stopwords
|
6
|
+
$stopwords = Rbbt.share.wordlists.stopwords.read.scan(/\w+/)
|
7
|
+
|
8
|
+
# Greek letter names mapped to the Latin letter(s) used to abbreviate them.
$greek = {
  "alpha" => "a",   "beta" => "b",     "gamma" => "g",    "delta" => "d",
  "epsilon" => "e", "zeta" => "z",     "eta" => "e",      "theta" => "th",
  "iota" => "i",    "kappa" => "k",    "lambda" => "l",   "mu" => "m",
  "nu" => "n",      "xi" => "x",       "omicron" => "o",  "pi" => "p",
  "rho" => "r",     "sigma" => "s",    "tau" => "t",      "upsilon" => "u",
  "phi" => "ph",    "chi" => "ch",     "psi" => "ps",     "omega" => "o"
}

# Reverse lookup from abbreviation to letter name. Abbreviations shared by two
# letters resolve to the one listed last above ("e" => "eta", "o" => "omega").
$inverse_greek = $greek.invert
|
37
|
+
|
38
|
+
# Text-mining helpers added to String.
class String
  # Consonant clusters that should not, on their own, make a word look
  # "special" in #is_special?. Read one per line from the wordlists/consonants
  # file under Rbbt.datadir when that file exists; empty otherwise.
  CONSONANTS = []
  consonants_file = File.join(Rbbt.datadir, 'wordlists/consonants')
  # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
  if File.exist? consonants_file
    Object::Open.read(consonants_file).each_line{|l| CONSONANTS << l.chomp}
  end

  # Uses heuristics to check if a string seems like a special word, like a
  # gene name. Returns true or false.
  def is_special?
    # Only consonants
    return true if self =~ /^[bcdfghjklmnpqrstvwxz]+$/i

    # Not a word: contains inner whitespace, or is too short
    return false if self =~ /[^\s]\s[^\s]/
    return false if self.length < 3
    # Alphanumeric
    return true if self =~ /[0-9]/ && self =~ /[a-z]/i
    # All Caps
    return true if self =~ /[A-Z]{2,}/
    # Caps Mix
    return true if self =~ /[a-z][A-Z]/
    # All consonants
    # NOTE(review): /^[a-z]$/ only matches a single letter, and strings
    # shorter than 3 characters have already returned false above, so this
    # check can never fire. Kept as-is to preserve the current classification.
    return true if self =~ /^[a-z]$/i && self !~ /[aeiou]/i
    # Dashed word
    return true if self =~ /(^\w-|-\w$)/
    # Too many consonants (very heuristic): three or more non-vowel characters
    # in a row, unless that cluster is whitelisted in CONSONANTS
    if self =~ /([^aeiouy]{3,})/i && !CONSONANTS.include?($1.downcase)
      return true
    end

    return false
  end

  # Returns a copy of the string with its first character in lowercase.
  # The rest of the string, including any newlines, is left untouched (the
  # previous scan(/./)-based version silently dropped newline characters).
  def downcase_first
    return "" if self == ""
    self[0, 1].downcase + self[1..-1]
  end

  # Turns a roman numeral into its arabic value if possible. Just simple
  # romans only (I, II, III, IV, V and X); returns nil for anything else.
  # The whole string must be the numeral: a multi-line string that merely
  # contains a line such as "IV" is not converted.
  def arabic
    case self
    when "I"   then 1
    when "II"  then 2
    when "III" then 3
    when "IV"  then 4
    when "V"   then 5
    when "X"   then 10
    else nil
    end
  end
end
|
7
91
|
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'rbbt/corpus/document'
|
2
|
+
require 'rbbt/corpus/document_repo'
|
3
|
+
|
4
|
+
# A collection of documents kept in a document repository, together with the
# location where their annotations are persisted.
class Corpus
  attr_accessor :corpora_path, :document_repo, :persistence_dir, :global_annotations

  # corpora_path:: where the corpus lives; defaults to Rbbt.corpora. Anything
  #                that is not already a Resource::Path is wrapped in one.
  def initialize(corpora_path = nil)
    @corpora_path =
      if corpora_path.nil?
        Rbbt.corpora
      elsif Resource::Path === corpora_path
        corpora_path
      else
        Resource::Path.path(corpora_path)
      end

    @document_repo   = DocumentRepo.get @corpora_path.document_repo, false
    @persistence_dir = File.join(@corpora_path, "annotations")

    annotation_store    = TCHash.get(File.join(@persistence_dir, "global_annotations"), :list)
    @global_annotations = TSV.new(annotation_store, :list, :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
    @global_annotations.unnamed = true
  end

  # Path under the persistence directory reserved for the given document id.
  def persistence_for(docid)
    File.join(persistence_dir, docid)
  end

  # Document for the id made of the four parts joined with ":".
  def document(namespace, id, type, hash)
    docid([namespace, id, type, hash].join(":"))
  end

  # Document for a full document id, with its text taken from the repository.
  def docid(docid)
    Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations)
  end

  # Stores text in the repository, using the MD5 of the text as the hash part.
  def add_document(text, namespace, id, type = nil)
    @document_repo.add(text, namespace, id, type, Digest::MD5.hexdigest(text))
  end

  # Documents for every id the repository's find returns for these parts.
  def find(namespace=nil, id = nil, type = nil, hash = nil)
    matches = @document_repo.find(namespace, id, type, hash)
    matches.collect{|found| docid(found) }
  end

  # Documents for every id the repository's find_docid returns.
  def find_docid(docid)
    matches = @document_repo.find_docid(docid)
    matches.collect{|found| self.docid(found) }
  end

  # True when #find returns at least one document for these parts.
  def exists?(namespace=nil, id = nil, type = nil, hash = nil)
    find(namespace, id, type, hash).any?
  end
end
|
@@ -0,0 +1,289 @@
|
|
1
|
+
require 'rbbt/ner/annotations'
|
2
|
+
require 'rbbt/util/tsv'
|
3
|
+
require 'rbbt/util/resource'
|
4
|
+
require 'rbbt/util/misc'
|
5
|
+
require 'json'
|
6
|
+
|
7
|
+
# A text document identified by a "namespace:id:type:hash" document id, with
# entity annotations that are computed on demand and optionally persisted.
#
# Entity types are declared with Document.define, which generates the
# produce_<entity>, load_<entity>, <entity> and <entity>_at methods. One of
# Document.persist, Document.persist_in_tsv or Document.persist_in_global_tsv
# can additionally generate load_with_persistence_<entity>.
class Document

  # NOTE(review): the :hash accessor replaces Object#hash with the last part
  # of the document id, so Document instances should not be used as Hash keys.
  attr_accessor :text, :docid, :namespace, :id, :type, :hash, :annotations, :segment_indeces, :persistence_dir, :global_persistence

  # persistence_dir::    directory for persisted annotations; wrapped in a
  #                      Resource::Path unless it already is one. May be nil.
  # docid::              "namespace:id:type:hash" identifier. May be nil.
  # text::               the document text. May be nil.
  # global_persistence:: store used by persist_in_global_tsv when no explicit
  #                      TSV was registered for an entity.
  def initialize(persistence_dir = nil, docid = nil, text = nil, global_persistence = nil)
    @annotations = {}
    @segment_indeces = {}

    if not persistence_dir.nil?
      @persistence_dir = persistence_dir
      # `===` tests class membership; the previous `==` compared the class
      # itself to the value and therefore never matched.
      @persistence_dir = Resource::Path.path(@persistence_dir) unless Resource::Path === @persistence_dir
    end

    @global_persistence = global_persistence

    if not docid.nil?
      @docid = docid
      update_docid
    end
    @text = text unless text.nil?
  end

  # Splits the document id into namespace, id, type and hash. The -1 limit
  # keeps trailing empty parts.
  def update_docid
    @namespace, @id, @type, @hash = docid.split(":", -1)
  end

  # Sets the document id and refreshes the parts derived from it.
  def docid=(docid)
    @docid = docid
    update_docid
  end

  # Serializes a segment into a row of values.
  #
  # Without fields the row is [offset, end, info as JSON]. With fields it is
  # [offset, end, value, ...], taking one value per (downcased) field name
  # from the segment info plus a "literal" entry holding the segment text with
  # whitespace characters turned into spaces; Array values are joined with "|".
  # The end is nil or -1 when the offset is nil or -1.
  def self.save_segment(segment, fields = nil)
    eend = case segment.offset
           when nil then nil
           when -1  then -1
           else segment.end
           end

    return [segment.offset, eend, segment.info.to_json] if fields.nil?

    # Work on a copy so that saving does not add "literal" to the segment's
    # own info hash.
    info = segment.info.dup
    info["literal"] = segment.to_s.gsub(/\s/,' ')
    info.extend IndiferentHash
    values = info.values_at(*fields.collect{|f| f.downcase}).collect{|v| Array === v ? v * "|" : v}
    [segment.offset, eend].concat values
  end

  # Rebuilds a segment from a row written by save_segment.
  #
  # text::       the document text, handed to Segment.load.
  # annotation:: the stored row.
  # fields::     the same field list used when saving (nil for the JSON form).
  # docid::      document id handed to Segment.load. This is a class method,
  #              so the document's @docid is not visible here and has to be
  #              passed in by the caller.
  def self.load_segment(text, annotation, fields = nil, docid = nil)
    if fields.nil?
      start, eend, info = annotation.values_at 0,1,2
      info = JSON.parse(info)
    else
      start, eend = annotation.values_at 0,1
      info = Misc.process_to_hash(fields) do |names|
        # Values holding "|" are split back into arrays; missing (nil) or
        # non-string values are passed through untouched.
        annotation.values_at(*names.collect{|f| f.downcase}).collect{|v| String === v && v.include?("|") ? v.split("|") : v}
      end
    end

    Segment.load(text, start, eend, info, docid)
  end

  # Builds a TSV keyed by segment id with "Start", "End" and either "Info" or
  # the given fields. Segments without an offset are left out.
  def self.tsv(segments, fields = nil)
    tsv = TSV.new({}, :list, :key => "ID", :fields => %w(Start End))
    if fields.nil?
      tsv.fields += ["Info"]
    else
      tsv.fields += fields
    end

    segments.each{|segment| tsv[segment.id] = Document.save_segment(segment, fields) unless segment.offset.nil?}

    tsv
  end


  #{{{ PERSISTENCE

  TSV_REPOS = {}
  FIELDS_FOR_ENTITY_PERSISTENCE = {}

  # Remembers which info fields are stored for an entity type. A nil fields
  # argument leaves any earlier registration untouched.
  def self.register_persistence_fields(entity, fields)
    return if fields.nil?
    fields = [fields] if not Array === fields
    FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields.collect{|f| f.to_s}
  end
  private_class_method :register_persistence_fields

  # Defines load_with_persistence_<entity>, which goes through
  # Persistence.persist with a file named after the entity inside the
  # document's persistence directory.
  def self.persist(entity, fields = nil)
    register_persistence_fields(entity, fields)

    self.class_eval <<-EOC
      def load_with_persistence_#{entity}
        fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]

        annotations = Persistence.persist("#{ entity }", :Entity, :tsv_string,
                              :persistence_file => File.join(@persistence_dir, "#{ entity }")) do
          Document.tsv(produce_#{entity}, fields)
        end

        annotations.collect{|id, annotation| Document.load_segment(text, annotation, fields, @docid)}
      end
    EOC
  end

  # Defines load_with_persistence_<entity>, which stores the entity TSV under
  # the entity name in a repository: the given tsv, or when none is given the
  # annotations_by_type file of the document's persistence directory.
  # The tsv argument may be omitted, in which case the second argument is
  # taken as the field list.
  def self.persist_in_tsv(entity, tsv = nil, fields = nil)
    if not tsv.nil? and not tsv.respond_to?(:keys)
      fields = tsv
      tsv = nil
    end

    TSV_REPOS[entity.to_s] = tsv
    register_persistence_fields(entity, fields)

    self.class_eval <<-EOC
      def load_with_persistence_#{entity}
        repo = TSV_REPOS["#{ entity }"]
        if repo.nil?
          raise "No persistence file or persistencr dir for persist_in_tsv" if persistence_dir.nil?
          repo = TCHash.get(persistence_dir.annotations_by_type.find, TCHash::TSVSerializer)
        end

        fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]

        if not repo.include? "#{ entity }"
          tsv = Document.tsv(produce_#{entity}, fields)
          repo.write
          repo["#{entity}"] = tsv
          repo.read
        end

        annotations = repo["#{entity}"]

        repo.close

        annotations.collect{|id, annotation| Document.load_segment(text, annotation, fields, @docid)}
      end
    EOC
  end

  # Defines load_with_persistence_<entity>, which keeps the annotations of all
  # documents and entity types in one TSV (the given tsv, or the document's
  # global_persistence), told apart by the doc_field and entity_field columns.
  # When a document has no entities a placeholder row with offset -1 is
  # stored, so the document is not processed again; placeholders are not
  # returned. The tsv argument may be omitted, shifting the remaining
  # arguments one position to the left.
  def self.persist_in_global_tsv(entity, tsv = nil, fields = nil, doc_field = nil, entity_field = nil)
    if not tsv.nil? and not tsv.respond_to?(:keys)
      entity_field = doc_field if doc_field
      doc_field = fields if fields
      fields = tsv if tsv
      tsv = nil
    end

    doc_field ||= "Document ID"
    entity_field ||= "Entity Type"

    TSV_REPOS[entity.to_s] = tsv
    register_persistence_fields(entity, fields)

    # In the generated code "\#{ @docid }" is escaped so that it is
    # interpolated when the method runs, on the document; unescaped it would
    # be interpolated right here, where @docid is always nil.
    self.class_eval <<-EOC
      def load_with_persistence_#{entity}
        fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]

        data = TSV_REPOS["#{ entity }"]
        data = global_persistence if data.nil?

        data.filter
        data.add_filter("field:#{ doc_field }", @docid)
        data.add_filter("field:#{ entity_field }", "#{ entity }")

        begin
          if data.keys.empty?
            segments = produce_#{entity}
            segments << Segment.annotate("No #{entity} found in document \#{ @docid }", -1) if segments.empty?
            tsv = Document.tsv(segments, fields)

            tsv.add_field "#{ doc_field }" do
              @docid
            end

            tsv.add_field "#{ entity_field }" do
              "#{ entity }"
            end

            data.write
            data.merge!(tsv)
            data.read
          end

          segments = []
          data.each{|id, annotation| segments << Document.load_segment(text, annotation, fields, @docid) unless annotation[1].to_i == -1}
          segments
        ensure
          # The store is shared: always drop the two filters added above,
          # even when producing or loading the segments fails.
          data.pop_filter
          data.pop_filter
        end
      end
    EOC
  end


  # Declares an entity type. The block becomes produce_<entity> and computes
  # the entities of a document. Also defines:
  #
  # load_<entity>::  fills annotations[entity], through
  #                  load_with_persistence_<entity> when that method exists
  #                  and the document has a persistence directory.
  # <entity>::       the entities, loading them on first use.
  # <entity>_at::    the entities the segment index returns for a position.
  def self.define(entity, &block)
    send :define_method, "produce_#{entity}", &block

    self.class_eval <<-EOC
      def load_#{entity}
        return if annotations.include? "#{ entity }"
        if self.respond_to?("load_with_persistence_#{entity}") and not @persistence_dir.nil?
          annotations["#{entity}"] = load_with_persistence_#{entity}
        else
          annotations["#{ entity }"] = produce_#{entity}
        end
      end

      def #{entity}
        load_#{entity} if annotations["#{ entity }"].nil?
        annotations["#{ entity }"]
      end

      def #{entity}_at(pos, persist = false)
        # Same rule as load_into: without a persistence directory the index
        # is kept in memory even when persistence was requested.
        ranges_dir = (persist and not @persistence_dir.nil?) ? File.join(@persistence_dir, 'ranges') : nil
        segment_index("#{ entity }", ranges_dir)[pos]
      end

    EOC
  end

  # Index over the entities of the given type, built once per type and cached.
  # With a persistence_dir the index is given the file "<name>.range" inside
  # it, otherwise :memory.
  def segment_index(name, persistence_dir = nil)
    @segment_indeces[name] ||= Segment.index(self.send(name), persistence_dir.nil? ? :memory : File.join(persistence_dir, name + '.range'))
  end

  # Attaches to segment the entities of each named type that the index
  # returns for the segment's range. They are stored in segment.annotations
  # and exposed through a reader named after the type. A trailing options
  # hash with :persist asks for the indices to be persisted in the 'ranges'
  # directory, which only happens when the document has a persistence
  # directory. Returns the segment.
  def load_into(segment, *annotations)
    options = annotations.pop if Hash === annotations.last
    options ||= {}
    ranges_dir = (options[:persist] and not @persistence_dir.nil?) ? File.join(@persistence_dir, 'ranges') : nil

    segment.extend Annotated
    segment.annotations ||= {}
    annotations.each do |name|
      name = name.to_s
      segment.annotations[name] = segment_index(name, ranges_dir)[segment.range]
      class << segment
        self
      end.class_eval "def #{ name }; @annotations['#{ name }']; end"
    end

    segment
  end
end
|