rbbt 1.2.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
@@ -1,10 +0,0 @@
1
- equal do |w| [w] end
2
- standard do |w| [w.downcase.split(/\s+/).sort.join("")] end
3
- cleaned do |w| [w.downcase.sub(/,.*/,'').sub(/\(.*\)/,'').gsub(/s(?:=\W)/,'')] end
4
- special do |w| s = w.split.select{|w| w.is_special?}.collect{|w| w.downcase.sub(/p$/,'')} end
5
- words do |w|
6
- w.sub(/(.*)I$/,'\1I \1').
7
- scan(/[a-z][a-z]+/i).
8
- sort{|a,b| b.length <=> a.length}.
9
- collect{|n| n.downcase}
10
- end
@@ -1,86 +0,0 @@
1
- require 'rbbt/util/misc'
2
-
3
-
4
- plural = Proc.new do |t| t.sub(/s$/,'') end
5
-
6
- tokens do
7
-
8
- # Some (possible) single letters first
9
- receptor /^(?:receptor|r)s?$/i
10
- protein /^(?:protein|p)s?$/i
11
- roman /^[IV]+$/
12
- greek_letter do |w| $inverse_greek[w.downcase] != nil end
13
-
14
-
15
- # Some words for removal
16
- stopword do |w| $stopwords.include?( w.downcase_first) end
17
- gene /genes?/i
18
- dna
19
- cdna
20
- rna
21
- mrna
22
- trna
23
- cdna
24
- component
25
- exon
26
- intron
27
- domain
28
- family
29
-
30
-
31
- # Important words
32
- number /^(?:\d+[.,]?\d+|\d)$/
33
- greek do |w| $greek[w.downcase] != nil end
34
- special do |w| w.is_special? end
35
- promoter
36
- similar /^(homolog.*|like|related|associated)$/
37
- ase /ase$/
38
- in_end /in$/
39
- end
40
-
41
- comparisons do
42
-
43
- compare.number do |l1,l2|
44
- v = 0
45
- case
46
- when l1.empty? && l2.empty?
47
- v = 0
48
- when l1.sort.uniq == l2.sort.uniq
49
- v = 3
50
- when l1.any? && l1[0] == l2[0]
51
- v = -3
52
- when l1.empty? && l2 == ['1']
53
- v = -5
54
- else
55
- v = -10
56
- end
57
- v
58
- end
59
-
60
- diff.promoter -10
61
- diff.receptor -10
62
- diff.similar -10
63
- diff.capital -10
64
-
65
- same.unknown 1
66
- miss.unknown -2
67
- extr.unknown -2
68
-
69
- same.greek 1
70
- miss.greek -2
71
- extr.greek -2
72
-
73
- same.special 4
74
- miss.special -3
75
- extr.special -3
76
-
77
- transform.receptor plural
78
- transform.protein plural
79
-
80
- transform.roman do |t| [t.arabic, :number] end
81
- transform.greek_letter do |t| [$inverse_greek[t.downcase], :greek] end
82
- transform.ase do |t| [t, :special] end
83
- transform.in_end do |t| [t, :special] end
84
- transform.unknown do |t| [t, (t.length < 4 ? :special : :unknown)] end
85
- end
86
-
@@ -1,23 +0,0 @@
1
- #!/bin/bash
2
- function norm(){
3
- organism=$1
4
- shift
5
- dataset=$1
6
- shift
7
- ner=$1
8
- shift
9
-
10
- CMD="rm results/${organism}_$dataset; rake results/${organism}_$dataset.eval ner=$ner $@ > ${organism}_$dataset.log_$ner; tail results/${organism}_$dataset.eval"
11
- echo $CMD
12
- $CMD
13
- }
14
-
15
-
16
- function norm_2(){
17
- ner=$1
18
- shift
19
-
20
- CMD="rm results/bc2gn; rake results/bc2gn.eval ner=$ner $@ > bc2gn.log_$ner; tail results/bc2gn.eval"
21
- echo $CMD
22
- $CMD
23
- }
@@ -1,55 +0,0 @@
1
- require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
-
3
- $name = "Arabidopsis thaliana"
4
-
5
-
6
- $native_id = "TAIR Locus"
7
-
8
- $entrez2native = {
9
- :tax =>3702,
10
- :fix => proc{|code| code.sub(/^TAIR:/,'')},
11
- :check => proc{|code| true },
12
- }
13
-
14
- $lexicon = {
15
- :file => {
16
- :url => "ftp://ftp.arabidopsis.org/home/tair/Genes/gene_aliases.20100413",
17
- :native => 0,
18
- :extra => [1,2],
19
- },
20
- }
21
-
22
- $identifiers = {
23
- :file => {
24
- :url => "ftp://ftp.arabidopsis.org/home/tair/Microarrays/Affymetrix/affy_ATH1_array_elements-2009-7-29.txt",
25
- :native => 4,
26
- :extra => [0],
27
- :fields => ["Affymetrix"],
28
- },
29
- :biomart => {
30
- :database => 'athaliana_eg_gene',
31
- :main => ['TAIR Locus', 'tair_locus'],
32
- :extra => [
33
- ['Associated Gene Name' , "external_gene_id"] ,
34
- ['Gramene Gene ID' , "ensembl_gene_id"] ,
35
- ['RefSeq peptide' , "refseq_peptide"] ,
36
- ['Unigene' , "unigene"] ,
37
- ['Interpro ID' , "interpro"] ,
38
-
39
-
40
- ],
41
- :filter => ['with_tair_locus'], # This is needed as the filter is not with_mgi_id as was expected
42
- }
43
-
44
- }
45
-
46
- $go = {
47
- :url => "ftp://ftp.arabidopsis.org/home/tair/Ontologies/Gene_Ontology/ATH_GO_GOSLIM.txt",
48
- :code => 0,
49
- :go => 5,
50
- :pmid => 12,
51
- }
52
-
53
- $query = '("arabidopsis"[MeSH Terms] OR Arabidopsis[Text Word]) AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]))'
54
-
55
-
@@ -1,84 +0,0 @@
1
- require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
-
3
- $name = "Candida albicans"
4
-
5
-
6
- $native_id = "Systematic Name"
7
-
8
- $entrez2native = {
9
- :tax => 237561,
10
- :fix => proc{|code| code.sub(/^CaO/,'orf') },
11
- :check => proc{|code| code.match(/^orf/)},
12
- :native => 3
13
- }
14
-
15
- $lexicon = {
16
- :file => {
17
- :url => 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab',
18
- :native => 0,
19
- :extra => [8,1,2],
20
- :exclude => proc{|l| l.match(/^!/) && !l.match(/^orf/)}
21
- },
22
- }
23
-
24
- $identifiers = {
25
- :file => {
26
- :url => 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab',
27
- :native => 0,
28
- :extra => [8,1,2],
29
- :exclude => proc{|l| l.match(/^!/)},
30
- :fields => ["GCD ID", "Gene Name", "Gene Alias"]
31
- },
32
- }
33
-
34
- $go = {
35
- :url => "http://www.candidagenome.org/go/gene_association.cgd.gz",
36
- :code => 10,
37
- :go => 4,
38
- :pmid => 5,
39
- :fix => proc{|l| v = l.split(/\t/); v[10] = (v[10] || "").split('|').first; v.join("\t")}
40
- }
41
-
42
- $query = '"candida albicans"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
43
-
44
- ####
45
-
46
- #Rake::Task['identifiers'].clear
47
- #file 'identifiers' => ['lexicon'] do |t|
48
- # identifiers = {}
49
- # if $identifiers[:file]
50
- # identifiers = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
51
- # end
52
- #
53
- # orf2native = Open.to_hash('lexicon', :native => 1, :extra => 0, :single => true)
54
- #
55
- # translations = {}
56
- #
57
- # Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).each{|entrez, orfs|
58
- # orfs.each{|orf|
59
- # translations[orf] ||= []
60
- # translations[orf] << entrez
61
- # }
62
- # }
63
- #
64
- # orf2native.each{|orf, native|
65
- # next unless identifiers[native]
66
- # identifiers[native] << [orf]
67
- # if translations[orf]
68
- # identifiers[native] << translations[orf]
69
- # else
70
- # identifiers[native] << []
71
- # end
72
- #
73
- # }
74
- #
75
- # header = "#" + [$native_id, 'Gene Name', 'Orf', "Entrez Gene ID"].uniq.join("\t") + "\n"
76
- # Open.write('identifiers',
77
- # header +
78
- # identifiers.collect{|code, name_lists|
79
- # "#{ code }\t" + name_lists.collect{ |names| names.join("|") }.join("\t")
80
- # }.join("\n")
81
- # )
82
- #end
83
- #
84
- #
@@ -1,109 +0,0 @@
1
- require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
-
3
- $name = "Caenorhabditis elegans"
4
-
5
-
6
- $native_id = "WormBase ID"
7
-
8
- $entrez2native = {
9
- :tax => 6239,
10
- :fix => proc{|code| code.sub(/^WormBase:/,'')},
11
- :check => proc{|code| code.match(/^WBGene/)},
12
- }
13
-
14
- $lexicon = {
15
-
16
- :file =>{
17
-
18
- :url => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/GO/current.txt.gz",
19
- :native => 0,
20
- :extra => [1,2],
21
-
22
- # :url => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/gene_ids/current.gz",
23
- # :native => 0,
24
- # :extra => [2,3,4,5],
25
-
26
- },
27
- }
28
-
29
-
30
- $identifiers = {
31
-
32
- :file =>{
33
-
34
- :url => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/GO/current.txt.gz",
35
- :native => 0,
36
- :extra => [1,2],
37
-
38
- # :url => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/gene_ids/current.gz",
39
- # :native => 0,
40
- # :extra => [2,3,4,5],
41
-
42
- },
43
-
44
- :biomart => {
45
- :database => 'celegans_gene_ensembl',
46
- :main => ['Entrez Gene ID' , "entrezgene"],
47
- :extra => [
48
- ['WormBase gene', "wormbase_gene" ],
49
- ['Associated Gene Name ', "external_gene_id" ],
50
- ['WormPep id', "wormpep_id" ],
51
- [ 'Ensembl Gene ID', "ensembl_gene_id" ],
52
- [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
53
- [ 'Protein ID ', "protein_id" ],
54
- [ 'RefSeq Protein ID ', "refseq_peptide" ],
55
- [ 'Unigene ID ', "unigene" ],
56
- [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
57
- [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
58
- ['EMBL (Genbank) ID' , "embl"] ,
59
- ],
60
- :filter => [],
61
- }
62
- }
63
-
64
- $go = {
65
- :url => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/GO/current.txt.gz",
66
- :code => 0,
67
- :go => 3,
68
- :pmid => 3,
69
- }
70
-
71
- $query = '"caenorhabditis elegans"[MeSH Terms] OR Caenorhabditis elegans[Text Word]'
72
- ##########################
73
-
74
-
75
- module Open
76
-
77
- class << self
78
- alias_method :old_read, :read
79
-
80
- def read(url, options = {})
81
- content = old_read(url, options)
82
-
83
- if url =~ /GO/
84
- return content.gsub(/.*:.*\((GO:\d+)\)/,'\1').gsub(/\nGO/,"|GO").
85
- collect{|l|
86
- l = l.sub(/\|/,"\t")
87
- names, gos = l.chomp.split(/\t/)
88
-
89
- id, name, extra = names.split(/ /)
90
- extra = extra.gsub(/[()]/,'') if extra
91
-
92
- if gos
93
- gos.split(/\|/).collect{|go|
94
- [id, name, extra, go].join("\t")
95
- }.join("\n")
96
- else
97
- [id, name, extra].join("\t") + "\n"
98
- end
99
- }.join("\n")
100
- elsif url =~ /gene_ids/
101
- return content.gsub(/,/,"\t")
102
- else
103
- return content
104
- end
105
-
106
- end
107
- end
108
- end
109
-
@@ -1,140 +0,0 @@
1
- require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
-
3
- $name = "Homo sapiens"
4
-
5
-
6
- $native_id = "Entrez Gene ID"
7
-
8
- $entrez2native = {
9
- :tax => 9606,
10
- :fix => nil,
11
- :check => proc{|code| false},
12
- }
13
-
14
- $lexicon = {
15
- :biomart => {
16
- :database => 'hsapiens_gene_ensembl',
17
- :main => ['Entrez Gene ID' , "entrezgene"],
18
- :extra => [
19
- [ 'Associated Gene Name' , "external_gene_id"],
20
- [ 'HGNC symbol', "hgnc_symbol" ],
21
- [ 'HGNC automatic gene name', "hgnc_automatic_gene_name" ],
22
- [ 'HGNC curated gene name ', "hgnc_curated_gene_name" ],
23
- ],
24
- }
25
-
26
- }
27
-
28
- $identifiers = {
29
- :biomart => {
30
- :database => 'hsapiens_gene_ensembl',
31
- :main => ['Entrez Gene ID' , "entrezgene"],
32
- :extra => [
33
- [ 'Ensembl Gene ID', "ensembl_gene_id" ],
34
- [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
35
- [ 'Associated Gene Name', "external_gene_id" ],
36
- [ 'CCDS ID', "ccds" ],
37
- [ 'Protein ID', "protein_id" ],
38
- [ 'RefSeq Protein ID', "refseq_peptide" ],
39
- [ 'Unigene ID', "unigene" ],
40
- [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
41
- [ 'HGNC ID', "hgnc_id", 'HGNC'],
42
- ['EMBL (Genbank) ID' , "embl"] ,
43
-
44
- # Affymetrix
45
- [ 'AFFY HC G110', 'affy_hc_g110' ],
46
- [ 'AFFY HG FOCUS', 'affy_hg_focus' ],
47
- [ 'AFFY HG U133-PLUS-2', 'affy_hg_u133_plus_2' ],
48
- [ 'AFFY HG U133A_2', 'affy_hg_u133a_2' ],
49
- [ 'AFFY HG U133A', 'affy_hg_u133a' ],
50
- [ 'AFFY HG U133B', 'affy_hg_u133b' ],
51
- [ 'AFFY HG U95AV2', 'affy_hg_u95av2' ],
52
- [ 'AFFY HG U95B', 'affy_hg_u95b' ],
53
- [ 'AFFY HG U95C', 'affy_hg_u95c' ],
54
- [ 'AFFY HG U95D', 'affy_hg_u95d' ],
55
- [ 'AFFY HG U95E', 'affy_hg_u95e' ],
56
- [ 'AFFY HG U95A', 'affy_hg_u95a' ],
57
- [ 'AFFY HUGENEFL', 'affy_hugenefl' ],
58
- [ 'AFFY HuEx', 'affy_huex_1_0_st_v2' ],
59
- [ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
60
- [ 'AFFY U133 X3P', 'affy_u133_x3p' ],
61
- [ 'Agilent WholeGenome',"agilent_wholegenome" ],
62
- [ 'Agilent CGH 44b', 'agilent_cgh_44b' ],
63
- [ 'Codelink ID', 'codelink' ],
64
- [ 'Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2' ],
65
- [ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
66
-
67
- ],
68
- :filter => [],
69
- }
70
- }
71
-
72
- $go = {
73
- :url => "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD",
74
- :code => 2,
75
- :go => 4,
76
- :pmid => 5,
77
- }
78
-
79
- $query = '"humans"[MeSH Terms] AND ((("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]) OR (("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word])) AND (hasabstract[text] AND "humans"[MeSH Terms] AND English[lang])'
80
- ##########################
81
-
82
- require 'rbbt/util/index'
83
-
84
- Rake::Task['gene.go'].clear
85
- file 'gene.go' => ['identifiers'] do
86
- if File.exists? 'identifiers'
87
- require 'rbbt/sources/organism'
88
- index = Organism.id_index('Hsa', :other => ['Associated Gene Name'])
89
- data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude])
90
-
91
- data = data.collect{|code, value_lists|
92
- code = index[code]
93
- [code, value_lists.flatten.select{|ref| ref =~ /GO:\d+/}.collect{|ref| ref.match(/(GO:\d+)/)[1]}]
94
- }.select{|p| p[0] && p[1].any?}
95
-
96
- Open.write('gene.go',
97
- data.collect{|p|
98
- "#{p[0]}\t#{p[1].uniq.join("|")}"
99
- }.join("\n")
100
- )
101
- end
102
- end
103
-
104
- Rake::Task['gene_go.pmid'].clear
105
- file 'gene_go.pmid' => ['identifiers'] do
106
- if File.exists? 'identifiers'
107
- index = Index.index('identifiers')
108
- data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude])
109
-
110
- data = data.collect{|code, value_lists|
111
- code = index[code]
112
- [code, value_lists.flatten.select{|ref| ref =~ /PMID:\d+/}.collect{|ref| ref.match(/PMID:(\d+)/)[1]}]
113
- }.select{|p| p[0] && p[1].any?}
114
-
115
- Open.write('gene_go.pmid',
116
- data.collect{|p|
117
- "#{p[0]}\t#{p[1].uniq.join("|")}"
118
- }.join("\n")
119
- )
120
- end
121
- end
122
-
123
-
124
- Rake::Task['lexicon'].clear
125
- file 'lexicon' => ['identifiers'] do
126
- if File.exists? 'identifiers'
127
- require 'rbbt/sources/organism'
128
- HGNC_URL = 'http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_acc_ids&status=Approved&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag'
129
- names = Open.to_hash(HGNC_URL, :exclude => proc{|l| l.match(/^HGNC ID/)}, :flatten => true)
130
- translations = Organism.id_index('Hsa', :native => 'Entrez Gene ID', :other => ['HGNC ID'])
131
-
132
- Open.write('lexicon',
133
- names.collect{|code, names|
134
- next unless translations[code]
135
- ([translations[code]] + names).join("\t")
136
- }.compact.join("\n")
137
- )
138
- end
139
-
140
- end