rbbt 1.1.7 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.rdoc +2 -138
- metadata +72 -136
- data/LICENSE +0 -20
- data/bin/rbbt_config +0 -246
- data/install_scripts/classifier/R/classify.R +0 -36
- data/install_scripts/classifier/Rakefile +0 -145
- data/install_scripts/get_abner.sh +0 -2
- data/install_scripts/get_banner.sh +0 -25
- data/install_scripts/get_biocreative.sh +0 -72
- data/install_scripts/get_crf++.sh +0 -26
- data/install_scripts/get_entrez.sh +0 -4
- data/install_scripts/get_go.sh +0 -4
- data/install_scripts/get_polysearch.sh +0 -8
- data/install_scripts/ner/Rakefile +0 -206
- data/install_scripts/ner/config/default.rb +0 -52
- data/install_scripts/norm/Rakefile +0 -219
- data/install_scripts/norm/config/cue_default.rb +0 -10
- data/install_scripts/norm/config/tokens_default.rb +0 -79
- data/install_scripts/norm/functions.sh +0 -23
- data/install_scripts/organisms/Rakefile +0 -43
- data/install_scripts/organisms/cgd.Rakefile +0 -84
- data/install_scripts/organisms/human.Rakefile +0 -145
- data/install_scripts/organisms/mgi.Rakefile +0 -77
- data/install_scripts/organisms/pombe.Rakefile +0 -40
- data/install_scripts/organisms/rake-include.rb +0 -258
- data/install_scripts/organisms/rgd.Rakefile +0 -88
- data/install_scripts/organisms/sgd.Rakefile +0 -66
- data/install_scripts/organisms/tair.Rakefile +0 -54
- data/install_scripts/organisms/worm.Rakefile +0 -109
- data/install_scripts/wordlists/consonants +0 -897
- data/install_scripts/wordlists/stopwords +0 -1
- data/lib/rbbt.rb +0 -86
- data/lib/rbbt/bow/bow.rb +0 -88
- data/lib/rbbt/bow/classifier.rb +0 -116
- data/lib/rbbt/bow/dictionary.rb +0 -187
- data/lib/rbbt/ner/abner.rb +0 -34
- data/lib/rbbt/ner/banner.rb +0 -73
- data/lib/rbbt/ner/dictionaryNER.rb +0 -98
- data/lib/rbbt/ner/regexpNER.rb +0 -70
- data/lib/rbbt/ner/rner.rb +0 -227
- data/lib/rbbt/ner/rnorm.rb +0 -143
- data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
- data/lib/rbbt/ner/rnorm/tokens.rb +0 -213
- data/lib/rbbt/sources/biocreative.rb +0 -75
- data/lib/rbbt/sources/biomart.rb +0 -105
- data/lib/rbbt/sources/entrez.rb +0 -211
- data/lib/rbbt/sources/go.rb +0 -40
- data/lib/rbbt/sources/organism.rb +0 -245
- data/lib/rbbt/sources/polysearch.rb +0 -117
- data/lib/rbbt/sources/pubmed.rb +0 -111
- data/lib/rbbt/util/arrayHash.rb +0 -255
- data/lib/rbbt/util/filecache.rb +0 -72
- data/lib/rbbt/util/index.rb +0 -47
- data/lib/rbbt/util/misc.rb +0 -106
- data/lib/rbbt/util/open.rb +0 -235
- data/lib/rbbt/util/rake.rb +0 -183
- data/lib/rbbt/util/simpleDSL.rb +0 -87
- data/lib/rbbt/util/tmpfile.rb +0 -19
- data/tasks/install.rake +0 -124
@@ -1,52 +0,0 @@
|
|
1
|
-
# Token-level feature definitions for the NER model.
#
# Every entry names a feature and supplies either a regular expression that is
# tested against the token, or a block that computes the feature value from
# the token +w+.

# --- Whole-token character classes -----------------------------------------
isLetters     /^[A-Z]+$/i
isUpper       /^[A-Z]+$/
isLower       /^[a-z]+$/
isDigits      /^[0-9]+$/i
isRoman       /^[IVX]+$/
isGreek       /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
isPunctuation /^[,.;]$/
isDelim       /^[\/()\[\]{}\-]$/
isNonWord     /^[^\w]+$/
# The alternatives are grouped so that both anchors apply to each of them.
# Ungrouped (/^and|or|&|,$/) the pattern matched any token starting with
# "and", containing "or" or "&", or ending in ",".
isConjunction /^(?:and|or|&|,)$/

# --- Character classes occurring anywhere in the token ----------------------
hasLetters     /[A-Z]/i
# Needs a character before the capital, so a leading capital alone does not count.
hasUpper       /.[A-Z]/
hasLower       /[a-z]/
hasDigits      /[0-9]/i
hasGreek       /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
hasPunctuation /[,.;]/
hasDelim       /[\/()\[\]{}\-]/
hasNonWord     /[^\w]/
caspMix        /[a-z].[A-Z]/
keywords       /(?:protein|gene|domain|ase)s?$/
hasSuffix      /[a-z][A-Z0-9]$/

# --- Counts ----------------------------------------------------------------
numLetters do |w| w.scan(/[A-Z]/i).length end
numDigits  do |w| w.scan(/[0-9]/).length end

# --- Leading / trailing 3 and 4 characters (captured by the group) ----------
prefix_3 /^(...)/
prefix_4 /^(....)/
suffix_3 /(...)$/
suffix_4 /(....)$/

# Word shape, one symbol per character: uppercase => A, lowercase => a,
# digit => 0, anything else => x.  gsub is required here: sub rewrote only the
# first character of each class and left the rest of the token verbatim.
token1 do |w|
  w.gsub(/[A-Z]/, 'A').
    gsub(/[a-z]/, 'a').
    gsub(/[0-9]/, '0').
    gsub(/[^0-9a-z]/i, 'x')
end

# Compressed word shape: like token1, but each run of same-class characters
# collapses to a single symbol (gsub for the same reason as above).
token2 do |w|
  w.gsub(/[A-Z]+/, 'A').
    gsub(/[a-z]+/, 'a').
    gsub(/[0-9]+/, '0').
    gsub(/[^0-9a-z]+/i, 'x')
end

# The lowercased token itself.
token3 do |w| w.downcase end
# Delegates to String#is_special?, which is defined outside this file.
special do |w| w.is_special? end

# NOTE(review): presumably the features listed in `context` are also emitted
# for neighbouring tokens at the relative offsets in `window` -- confirm
# against the DSL implementation.
context %w(special token2 isPunctuation isDelim)
window %w(1 2 3 -1 -2 -3)
#direction :reverse
|
51
|
-
|
52
|
-
|
@@ -1,219 +0,0 @@
|
|
1
|
-
require 'rbbt'
|
2
|
-
require 'rbbt/sources/organism'
|
3
|
-
require 'rbbt/util/open'
|
4
|
-
require 'rbbt/ner/rner'
|
5
|
-
require 'rbbt/ner/rnorm'
|
6
|
-
|
7
|
-
|
8
|
-
require 'progress-monitor'
|
9
|
-
|
10
|
-
# Run-time switches, read from the environment (the helper script in this
# project passes them on the rake command line, e.g. `rake ... ner=abner`).
$type    = ENV.fetch('ner', :rner)  # NER backend; Symbol by default, String when overridden
$debug   = ENV.key?('debug')        # any value enables the per-document debug report
$perfect = ENV.key?('perfect')      # any value makes match use gold-standard mentions
$docs    = ENV['docs']              # optional comma-separated list of document ids

# Maps the organism keys used by the rules below to rbbt organism directories.
# NOTE(review): 'fly' maps to 'sgd', the same value as 'yeast' -- this looks
# like a placeholder; confirm the intended organism.
$org2rbbt = {
  'yeast' => 'sgd',
  'mouse' => 'mgi',
  'fly'   => 'sgd',
  'bc2gn' => 'human',
}
|
22
|
-
|
23
|
-
# Builds the Normalizer for +org+ on top of +lexicon_file+.
# For every organism except 'bc2gn' an identifier table is loaded and handed
# over as :to_entrez; a per-organism 'config/<org>.config' is used if present.
def match_normalizer(org, lexicon_file)
  custom_file = File.join('config', org + '.config')

  to_entrez = false
  unless org == 'bc2gn'
    to_entrez = Open.to_hash(File.join(Rbbt.datadir, "organisms/#{$org2rbbt[org]}/identifiers"),
                             :native => 0, :extra => 1, :single => true, :sep => "\t|\\|",
                             :fix => proc { |l| l.sub(/S000/, 'S0') })
  end

  Normalizer.new(lexicon_file,
                 :to_entrez      => to_entrez,
                 :file           => (File.exist?(custom_file) ? custom_file : nil),
                 :max_candidates => 200)
end

# Prints one debug line per code, prefixed with +label+ ("FN" or "FP"),
# followed by the lexicon names of the code when the lexicon knows it.
def match_report(label, codes, lex)
  codes.each do |code|
    if lex[code]
      puts "#{label}: #{ code } => #{lex[code].flatten.join(", ")}"
    else
      puts "#{label}: #{ code }"
    end
  end
end

# Normalizes the gene mentions of every document in +filedir+ and writes one
# "<doc id>\t<code>\t<mention>" line per assigned code to +outfile+.
#
# org::          organism key; 'bc2gn' selects the BC2GN lexicon, any other
#                value the BC1GN synonym list of that organism
# filedir::      directory containing the "<doc id>.txt" documents
# goldstandard:: gold-standard file, read twice through Open.to_hash
#                (:extra => 1 for the expected codes, :extra => 2 for the
#                mentions used when $perfect is set)
# outfile::      result file, overwritten
#
# Also reads the globals $type (NER backend), $perfect, $docs (restrict the
# run to these comma-separated document ids) and $debug (print a report with
# false negatives / false positives per document).
def match(org, filedir, goldstandard, outfile)
  lexicon_file =
    if org == 'bc2gn'
      File.join(Rbbt.datadir, "biocreative/BC2GN/entrezGeneLexicon.list")
    else
      File.join(Rbbt.datadir, "biocreative/BC1GN/#{org}/synonyms.list")
    end

  started = Time.now
  norm = match_normalizer(org, lexicon_file)
  STDERR.puts "Loaded Normalizer #{Time.now - started}\n\n"

  ner =
    if $type.to_s == 'rner'
      NER.new('models/' + org)
    else
      Organism.ner($org2rbbt[org], $type)
    end

  gs          = Open.to_hash(goldstandard, :native => 0, :extra => 1)
  gs_mentions = Open.to_hash(goldstandard, :native => 0, :extra => 2)
  # The lexicon is only consulted by the debug report, so skip loading it otherwise.
  lex = $debug ? Open.to_hash(lexicon_file, :sep => "\t|\\|") : nil

  files =
    if $docs
      $docs.split(',').map { |doc| File.join(filedir, doc + '.txt') }
    else
      # File.join works whether or not filedir ends in a slash.
      Dir.glob(File.join(filedir, '*.txt')).sort
    end

  # Block form: the result file is closed even if a document raises.
  File.open(outfile, 'w') do |fout|
    # Keep this call directly in front of the loop it is meant to monitor.
    Progress.monitor("Processing Files")
    files.each do |f|
      # basename with a suffix strips only a trailing ".txt"; the former
      # sub(/.txt/, '') removed the first "<any char>txt" anywhere in the name.
      fid      = File.basename(f, '.txt')
      text     = Open.read(f)
      expected = (gs[fid] || []).flatten

      mentions =
        if $perfect
          (gs_mentions[fid] || []).flatten
        else
          ner.extract(text).uniq
        end

      if $debug
        puts "------------------------------------"
        puts "FILE #{fid}"
        puts
        puts text
        puts "CODES: #{expected.join(", ")}"
        puts "MENTIONS: #{mentions.join(", ")}"
      end

      found = []
      mentions.each do |mention|
        codes = norm.select(norm.match(mention), mention, text)
        found += codes
        codes.each { |code| fout.puts "#{ fid }\t#{ code}\t#{mention}" }
        puts "Mention: #{ mention } => #{ codes.join(", ") }" if $debug
      end

      next unless $debug

      found.uniq!
      match_report('FN', expected.uniq - found, lex)
      # False positives missing from the lexicon used to be printed as "FN:".
      match_report('FP', found - expected.uniq, lex)
    end
  end
end
|
131
|
-
|
132
|
-
# Writes the training feature file "models/<org>.features": the features of
# every name in the organism lexicon, one block per name separated by a blank
# line, followed by the pre-computed feature files from ../ner/data.
rule(/models\/(yeast|mouse|fly|bc2gn)\.features$/) do |t|
  org = File.basename(t.name).sub(/\.features/, '')

  if org == 'bc2gn'
    lexicon = File.join(Rbbt.datadir, "biocreative/BC2GN/entrezGeneLexicon.list")
  else
    lexicon = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/synonyms.list")
  end

  # File.foreach closes the lexicon when done; File.open(...).collect left the
  # handle open until garbage collection.
  names = []
  File.foreach(lexicon) do |line|
    # chomp first, otherwise the last name of every line keeps its "\n".
    synonyms = line.chomp.split(/\t/)
    synonyms.shift # the first column is not a name (presumably the gene id)
    names.concat(synonyms.reject { |synonym| synonym.empty? })
  end

  parser = NERFeatures.new

  # Block form guarantees the file is flushed and closed before appending below.
  File.open(t.name, 'w') do |fout|
    # Keep this call directly in front of the loop it is meant to monitor.
    Progress.monitor("CRFPP Features #{ org }")
    names.each do |name|
      parser.text_features(name, true).each do |feat|
        fout.puts feat.join(" ")
      end
      fout.puts
    end
  end

  if org != 'bc2gn'
    Open.append(t.name, Open.read('../ner/data/BC2.features'))
  else
    Open.append(t.name, Open.read('../ner/data/BC2GM.features'))
    Open.append(t.name, Open.read('../ner/data/BC2GN_Train.features'))
  end
end
|
167
|
-
|
168
|
-
# Trains the model "models/<org>" from "models/<org>.features", which is
# declared as the prerequisite of this rule.
rule(/models\/(yeast|mouse|fly|bc2gn)$/ => lambda { |name| name + '.features' }) do |t|
  # (the unused local `org` was dropped; only the task name is needed)
  parser = NERFeatures.new
  parser.train(t.name + '.features', t.name)
end
|
174
|
-
|
175
|
-
|
176
|
-
# Produces "results/<org>_<dataset>" by running match over the BC1GN documents
# of that organism and dataset. With the rner backend the organism model is
# built first.
rule(/results\/(yeast|mouse|fly)_(devtest|train|test)$/) do |t|
  org, dataset = File.basename(t.name).split(/_/)

  Rake::Task['models/' + org].invoke if $type.to_sym == :rner

  dataset_dir  = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }")
  filedir      = File.join(dataset_dir, "text/")
  goldstandard = File.join(dataset_dir, "genelist")

  match(org, filedir, goldstandard, t.name)
end
|
188
|
-
|
189
|
-
# Scores "results/<org>_<dataset>" with the BC1GN perl scorer and stores the
# scorer output in "results/<org>_<dataset>.eval".
#
# The ".eval" suffix is matched literally and only at the end of the name; the
# former /.eval/ accepted any character for the dot and matched anywhere.
rule(/results\/(.+)_(.+)\.eval$/ => lambda { |name| name.sub(/\.eval$/, '') }) do |t|
  results = t.name.sub(/\.eval$/, '')
  org, dataset = File.basename(results).split(/_/)

  scorer   = File.join(Rbbt.datadir, "biocreative/BC1GN/task1Bscorer.pl")
  genelist = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/genelist")

  cmd = "perl #{scorer} #{genelist} #{results} > #{t.name}"
  puts cmd
  # Report a failing scorer instead of silently leaving a bogus .eval file.
  warn "Scoring command failed: #{cmd}" unless system(cmd)
end
|
196
|
-
|
197
|
-
# Produces "results/bc2gn" by running match over the BC2GN test documents.
# With the rner backend the bc2gn model is built first.
rule(/results\/bc2gn$/) do |t|
  org = 'bc2gn'

  Rake::Task['models/' + org].invoke if $type.to_sym == :rner

  test_dir     = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/")
  goldstandard = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/genelist")

  match(org, test_dir, goldstandard, t.name)
end
|
209
|
-
|
210
|
-
# Scores "results/bc2gn" with the BC2GN python scorer and stores the scorer
# output in "results/bc2gn.eval".  The dots of ".eval" are now escaped so the
# rule and its prerequisite only react to the literal suffix.
rule(/results\/bc2gn\.eval$/ => lambda { |name| name.sub(/\.eval$/, '') }) do |t|
  cmd = "python #{Rbbt.datadir + '/biocreative/BC2GN/bc2scoring.py'} #{Rbbt.datadir + '/biocreative/BC2GN/Test/genelist'} results/bc2gn > #{t.name}"
  system cmd
end
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
@@ -1,10 +0,0 @@
|
|
1
|
-
# Cue definitions: each entry maps a name +w+ to a list of strings.

# The name itself.
equal    do |w| [w] end
# Lowercased words of the name, sorted and glued together.
standard do |w| [w.downcase.split(/\s+/).sort.join("")] end
# Lowercased name without anything after the first comma and without the
# (greedy) parenthesised part.
cleaned  do |w| [w.downcase.sub(/,.*/,'').sub(/\(.*\)/,'')] end
# Lowercased "special" words of the name with a trailing "p" removed.
# Inner block parameters no longer reuse the name `w` (on Ruby 1.8 a block
# parameter with the same name overwrites the outer variable), and the dead
# assignment `s = ...` is gone.
special  do |w| w.split.select { |word| word.is_special? }.collect { |word| word.downcase.sub(/p$/,'') } end
# Lowercased alphabetic runs of two or more letters, longest first. A name
# ending in "I" is first expanded to "<name> <name without the I>".
words do |w|
  w.sub(/(.*)I$/,'\1I \1').
    scan(/[a-z][a-z]+/i).
    sort { |a, b| b.length <=> a.length }.
    collect { |n| n.downcase }
end
|
@@ -1,79 +0,0 @@
|
|
1
|
-
require 'rbbt/util/misc'
|
2
|
-
# Token types. An entry is a type name followed by a regular expression or a
# block that decides whether a word +w+ belongs to the type. Entries that
# consist of a bare name carry no matcher of their own (presumably they match
# the word equal to the name -- confirm against the DSL implementation).
tokens do

  # Some (possible) single letters first
  receptor     /^(?:receptor|r)s?$/i
  protein      /^(?:protein|p)s?$/i
  roman        /^[IV]+$/
  greek_letter do |w| $inverse_greek[w.downcase] != nil end


  # Some words for removal
  stopword     do |w| $stopwords.include?( w.downcase_first) end
  # NOTE(review): unlike receptor/protein this pattern is not anchored, so it
  # also matches words that merely contain "gene" -- confirm this is intended.
  gene         /genes?/i
  dna
  cdna
  rna
  mrna
  trna
  # (a second, redundant `cdna` declaration used to follow here)
  component
  exon
  intron
  domain
  family


  # Important words
  number       /^(?:\d+[.,]?\d+|\d)$/
  greek        do |w| $greek[w.downcase] != nil end
  special      do |w| w.is_special? end
  promoter
  similar      /^(homolog.*|like|related|associated)$/
  ase          /ase$/
  in_end       /in$/
end
|
36
|
-
|
37
|
-
# Scoring rules applied when the tokens of two names are compared, plus the
# transformations that re-type tokens before comparison.
comparisons do

  # Score for the number tokens (l1, l2) of the two names; first matching
  # condition wins.
  compare.number do |l1, l2|
    if l1.empty? && l2.empty?
      0
    elsif l1.sort.uniq == l2.sort.uniq
      3
    elsif l1.any? && l1[0] == l2[0]
      -3
    elsif l1.empty? && l2 == ['1']
      -5
    else
      -10
    end
  end

  # Weights per token type.
  diff.promoter(-10)
  diff.receptor(-10)
  diff.similar(-10)
  diff.capital(-10)

  same.unknown(1)
  miss.unknown(-2)
  extr.unknown(-2)

  same.greek(1)
  miss.greek(-2)
  extr.greek(-2)

  same.special(4)
  miss.special(-3)
  extr.special(-3)

  # Each transform returns [new token, new type].
  transform.roman        do |t| [t.arabic, :number] end
  transform.greek_letter do |t| [$inverse_greek[t.downcase], :greek] end
  transform.ase          do |t| [t, :special] end
  transform.in_end       do |t| [t, :special] end
  # Unknown tokens shorter than four characters are treated as special.
  transform.unknown      do |t| [t, (t.length < 4 ? :special : :unknown)] end
end
|
79
|
-
|
@@ -1,23 +0,0 @@
|
|
1
|
-
#!/bin/bash
|
2
|
-
# norm ORGANISM DATASET NER [RAKE_ARGS...]
#
# Removes results/ORGANISM_DATASET, re-runs `rake results/ORGANISM_DATASET.eval
# ner=NER RAKE_ARGS...` with its output captured in ORGANISM_DATASET.log_NER,
# and prints the tail of the evaluation file.
#
# The steps are run directly. The previous version stored the whole pipeline
# in $CMD and executed it with a bare `$CMD`; the shell only word-splits such
# an expansion, so `;` and `>` were handed to `rm` as literal arguments and
# neither rake nor tail ever ran.
function norm(){
  if [ $# -lt 3 ]; then
    echo "usage: norm ORGANISM DATASET NER [RAKE_ARGS...]" >&2
    return 1
  fi

  local organism=$1
  local dataset=$2
  local ner=$3
  shift 3

  local result="results/${organism}_${dataset}"
  local log="${organism}_${dataset}.log_${ner}"

  # Show what is about to run, as before.
  echo "rm $result; rake $result.eval ner=$ner $* > $log; tail $result.eval"

  rm -f "$result"
  rake "$result.eval" "ner=$ner" "$@" > "$log"
  tail "$result.eval"
}
|
14
|
-
|
15
|
-
|
16
|
-
# norm_2 NER [RAKE_ARGS...]
#
# Removes results/bc2gn, re-runs `rake results/bc2gn.eval ner=NER RAKE_ARGS...`
# with its output captured in bc2gn.log_NER, and prints the tail of the
# evaluation file.
#
# The steps are run directly: executing a command line stored in a variable
# with a bare `$CMD` only word-splits it, so its `;` and `>` were passed to
# `rm` as literal arguments and the rake/tail steps never ran.
function norm_2(){
  if [ $# -lt 1 ]; then
    echo "usage: norm_2 NER [RAKE_ARGS...]" >&2
    return 1
  fi

  local ner=$1
  shift

  local log="bc2gn.log_${ner}"

  # Show what is about to run, as before.
  echo "rm results/bc2gn; rake results/bc2gn.eval ner=$ner $* > $log; tail results/bc2gn.eval"

  rm -f results/bc2gn
  rake results/bc2gn.eval "ner=$ner" "$@" > "$log"
  tail results/bc2gn.eval
}
|
@@ -1,43 +0,0 @@
|
|
1
|
-
# Organism to work on: a value already present in $org wins, otherwise the
# `organism` environment variable is used; nil when neither is set.
$org = [$org, ENV['organism']].compact.first
|
2
|
-
|
3
|
-
# Invokes the 'name' task of every sub-directory that has its own Rakefile.
# Each organism is handled in a forked child, so the chdir and the loaded
# Rakefile never affect this process; children run one after another.
task 'names' do
  entries = Dir.glob('*')
  orgs = entries.select do |entry|
    File.directory?(entry) && File.exist?(entry + '/Rakefile')
  end

  orgs.each do |org|
    child = Process.fork do
      Dir.chdir(org)
      load 'Rakefile'
      Rake::Task['name'].invoke
    end
    Process.waitpid child
  end
end
|
20
|
-
|
21
|
-
# Invokes the 'update' task of the organism named in $org or, when $org is not
# set, of every sub-directory that has its own Rakefile. Each organism is
# handled in a forked child, so the chdir and the loaded Rakefile never affect
# this process; children run one after another.
task 'default' do
  orgs =
    if $org
      [$org]
    else
      Dir.glob('*').select do |entry|
        File.directory?(entry) && File.exist?(entry + '/Rakefile')
      end
    end

  orgs.each do |org|
    puts "Updating #{ org }"
    child = Process.fork do
      Dir.chdir(org)
      load 'Rakefile'
      Rake::Task['update'].invoke
    end
    Process.waitpid child
  end
end
|
43
|
-
|
@@ -1,84 +0,0 @@
|
|
1
|
-
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
-
|
3
|
-
# Settings for Candida albicans, exposed as globals.

$name = "Candida albicans"

# Header used for the native identifier column.
$native_id = "Systematic Name"

# Options for translating Entrez Gene ids to native ids.
$entrez2native = {
  :tax    => 237561,
  # Rewrites a leading "CaO" to "orf".
  :fix    => proc { |code| code.sub(/^CaO/,'orf') },
  # Accepts only codes that start with "orf".
  :check  => proc { |code| code.match(/^orf/) },
  :native => 3
}

# Where the lexicon comes from and which columns to read.
$lexicon = {
  :file => {
    :url     => 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab',
    :native  => 0,
    :extra   => [8,1,2],
    # NOTE(review): a line starting with "!" can never start with "orf", so
    # the second condition is always true here -- confirm whether `||` was
    # intended.
    :exclude => proc { |l| l.match(/^!/) && !l.match(/^orf/) }
  },
}

# Where the identifier table comes from and how its columns are named.
$identifiers = {
  :file => {
    :url     => 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab',
    :native  => 0,
    :extra   => [8,1,2],
    # Skips lines starting with "!".
    :exclude => proc { |l| l.match(/^!/) },
    # NOTE(review): "GCD ID" may be a typo for "CGD ID"; left as is because the
    # string is emitted at run time.
    :fields  => ["GCD ID", "Gene Name", "Gene Alias"]
  },
}

# Gene Ontology association file and the columns holding code, GO term and
# PubMed id.
$go = {
  :url  => "http://www.candidagenome.org/go/gene_association.cgd.gz",
  :code => 10,
  :go   => 4,
  :pmid => 5,
  # Keeps only the first "|"-separated value of column 10.
  :fix  => proc do |l|
    columns = l.split(/\t/)
    columns[10] = (columns[10] || "").split('|').first
    columns.join("\t")
  end
}

# PubMed query selecting the articles for this organism.
$query = '"candida albicans"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
|
43
|
-
|
44
|
-
####
|
45
|
-
|
46
|
-
#Rake::Task['identifiers'].clear
|
47
|
-
#file 'identifiers' => ['lexicon'] do |t|
|
48
|
-
# identifiers = {}
|
49
|
-
# if $identifiers[:file]
|
50
|
-
# identifiers = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
|
51
|
-
# end
|
52
|
-
#
|
53
|
-
# orf2native = Open.to_hash('lexicon', :native => 1, :extra => 0, :single => true)
|
54
|
-
#
|
55
|
-
# translations = {}
|
56
|
-
#
|
57
|
-
# Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).each{|entrez, orfs|
|
58
|
-
# orfs.each{|orf|
|
59
|
-
# translations[orf] ||= []
|
60
|
-
# translations[orf] << entrez
|
61
|
-
# }
|
62
|
-
# }
|
63
|
-
#
|
64
|
-
# orf2native.each{|orf, native|
|
65
|
-
# next unless identifiers[native]
|
66
|
-
# identifiers[native] << [orf]
|
67
|
-
# if translations[orf]
|
68
|
-
# identifiers[native] << translations[orf]
|
69
|
-
# else
|
70
|
-
# identifiers[native] << []
|
71
|
-
# end
|
72
|
-
#
|
73
|
-
# }
|
74
|
-
#
|
75
|
-
# header = "#" + [$native_id, 'Gene Name', 'Orf', "Entrez Gene ID"].uniq.join("\t") + "\n"
|
76
|
-
# Open.write('identifiers',
|
77
|
-
# header +
|
78
|
-
# identifiers.collect{|code, name_lists|
|
79
|
-
# "#{ code }\t" + name_lists.collect{ |names| names.join("|") }.join("\t")
|
80
|
-
# }.join("\n")
|
81
|
-
# )
|
82
|
-
#end
|
83
|
-
#
|
84
|
-
#
|