rbbt 1.2.5 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
@@ -1,10 +0,0 @@
1
- equal do |w| [w] end
2
- standard do |w| [w.downcase.split(/\s+/).sort.join("")] end
3
- cleaned do |w| [w.downcase.sub(/,.*/,'').sub(/\(.*\)/,'').gsub(/s(?:=\W)/,'')] end
4
- special do |w| s = w.split.select{|w| w.is_special?}.collect{|w| w.downcase.sub(/p$/,'')} end
5
- words do |w|
6
- w.sub(/(.*)I$/,'\1I \1').
7
- scan(/[a-z][a-z]+/i).
8
- sort{|a,b| b.length <=> a.length}.
9
- collect{|n| n.downcase}
10
- end
@@ -1,86 +0,0 @@
1
- require 'rbbt/util/misc'
2
-
3
-
4
- plural = Proc.new do |t| t.sub(/s$/,'') end
5
-
6
- tokens do
7
-
8
- # Some (possible) single letters first
9
- receptor /^(?:receptor|r)s?$/i
10
- protein /^(?:protein|p)s?$/i
11
- roman /^[IV]+$/
12
- greek_letter do |w| $inverse_greek[w.downcase] != nil end
13
-
14
-
15
- # Some words for removal
16
- stopword do |w| $stopwords.include?( w.downcase_first) end
17
- gene /genes?/i
18
- dna
19
- cdna
20
- rna
21
- mrna
22
- trna
23
- cdna
24
- component
25
- exon
26
- intron
27
- domain
28
- family
29
-
30
-
31
- # Important words
32
- number /^(?:\d+[.,]?\d+|\d)$/
33
- greek do |w| $greek[w.downcase] != nil end
34
- special do |w| w.is_special? end
35
- promoter
36
- similar /^(homolog.*|like|related|associated)$/
37
- ase /ase$/
38
- in_end /in$/
39
- end
40
-
41
- comparisons do
42
-
43
- compare.number do |l1,l2|
44
- v = 0
45
- case
46
- when l1.empty? && l2.empty?
47
- v = 0
48
- when l1.sort.uniq == l2.sort.uniq
49
- v = 3
50
- when l1.any? && l1[0] == l2[0]
51
- v = -3
52
- when l1.empty? && l2 == ['1']
53
- v = -5
54
- else
55
- v = -10
56
- end
57
- v
58
- end
59
-
60
- diff.promoter -10
61
- diff.receptor -10
62
- diff.similar -10
63
- diff.capital -10
64
-
65
- same.unknown 1
66
- miss.unknown -2
67
- extr.unknown -2
68
-
69
- same.greek 1
70
- miss.greek -2
71
- extr.greek -2
72
-
73
- same.special 4
74
- miss.special -3
75
- extr.special -3
76
-
77
- transform.receptor plural
78
- transform.protein plural
79
-
80
- transform.roman do |t| [t.arabic, :number] end
81
- transform.greek_letter do |t| [$inverse_greek[t.downcase], :greek] end
82
- transform.ase do |t| [t, :special] end
83
- transform.in_end do |t| [t, :special] end
84
- transform.unknown do |t| [t, (t.length < 4 ? :special : :unknown)] end
85
- end
86
-
@@ -1,23 +0,0 @@
1
- #!/bin/bash
2
- function norm(){
3
- organism=$1
4
- shift
5
- dataset=$1
6
- shift
7
- ner=$1
8
- shift
9
-
10
- CMD="rm results/${organism}_$dataset; rake results/${organism}_$dataset.eval ner=$ner $@ > ${organism}_$dataset.log_$ner; tail results/${organism}_$dataset.eval"
11
- echo $CMD
12
- $CMD
13
- }
14
-
15
-
16
- function norm_2(){
17
- ner=$1
18
- shift
19
-
20
- CMD="rm results/bc2gn; rake results/bc2gn.eval ner=$ner $@ > bc2gn.log_$ner; tail results/bc2gn.eval"
21
- echo $CMD
22
- $CMD
23
- }
@@ -1,55 +0,0 @@
1
- require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
-
3
- $name = "Arabidopsis thaliana"
4
-
5
-
6
- $native_id = "TAIR Locus"
7
-
8
- $entrez2native = {
9
- :tax =>3702,
10
- :fix => proc{|code| code.sub(/^TAIR:/,'')},
11
- :check => proc{|code| true },
12
- }
13
-
14
- $lexicon = {
15
- :file => {
16
- :url => "ftp://ftp.arabidopsis.org/home/tair/Genes/gene_aliases.20100413",
17
- :native => 0,
18
- :extra => [1,2],
19
- },
20
- }
21
-
22
- $identifiers = {
23
- :file => {
24
- :url => "ftp://ftp.arabidopsis.org/home/tair/Microarrays/Affymetrix/affy_ATH1_array_elements-2009-7-29.txt",
25
- :native => 4,
26
- :extra => [0],
27
- :fields => ["Affymetrix"],
28
- },
29
- :biomart => {
30
- :database => 'athaliana_eg_gene',
31
- :main => ['TAIR Locus', 'tair_locus'],
32
- :extra => [
33
- ['Associated Gene Name' , "external_gene_id"] ,
34
- ['Gramene Gene ID' , "ensembl_gene_id"] ,
35
- ['RefSeq peptide' , "refseq_peptide"] ,
36
- ['Unigene' , "unigene"] ,
37
- ['Interpro ID' , "interpro"] ,
38
-
39
-
40
- ],
41
- :filter => ['with_tair_locus'], # This is needed as the filter is not with_mgi_id as was expected
42
- }
43
-
44
- }
45
-
46
- $go = {
47
- :url => "ftp://ftp.arabidopsis.org/home/tair/Ontologies/Gene_Ontology/ATH_GO_GOSLIM.txt",
48
- :code => 0,
49
- :go => 5,
50
- :pmid => 12,
51
- }
52
-
53
- $query = '("arabidopsis"[MeSH Terms] OR Arabidopsis[Text Word]) AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]))'
54
-
55
-
@@ -1,84 +0,0 @@
1
- require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
-
3
- $name = "Candida albicans"
4
-
5
-
6
- $native_id = "Systematic Name"
7
-
8
- $entrez2native = {
9
- :tax => 237561,
10
- :fix => proc{|code| code.sub(/^CaO/,'orf') },
11
- :check => proc{|code| code.match(/^orf/)},
12
- :native => 3
13
- }
14
-
15
- $lexicon = {
16
- :file => {
17
- :url => 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab',
18
- :native => 0,
19
- :extra => [8,1,2],
20
- :exclude => proc{|l| l.match(/^!/) && !l.match(/^orf/)}
21
- },
22
- }
23
-
24
- $identifiers = {
25
- :file => {
26
- :url => 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab',
27
- :native => 0,
28
- :extra => [8,1,2],
29
- :exclude => proc{|l| l.match(/^!/)},
30
- :fields => ["GCD ID", "Gene Name", "Gene Alias"]
31
- },
32
- }
33
-
34
- $go = {
35
- :url => "http://www.candidagenome.org/go/gene_association.cgd.gz",
36
- :code => 10,
37
- :go => 4,
38
- :pmid => 5,
39
- :fix => proc{|l| v = l.split(/\t/); v[10] = (v[10] || "").split('|').first; v.join("\t")}
40
- }
41
-
42
- $query = '"candida albicans"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
43
-
44
- ####
45
-
46
- #Rake::Task['identifiers'].clear
47
- #file 'identifiers' => ['lexicon'] do |t|
48
- # identifiers = {}
49
- # if $identifiers[:file]
50
- # identifiers = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
51
- # end
52
- #
53
- # orf2native = Open.to_hash('lexicon', :native => 1, :extra => 0, :single => true)
54
- #
55
- # translations = {}
56
- #
57
- # Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).each{|entrez, orfs|
58
- # orfs.each{|orf|
59
- # translations[orf] ||= []
60
- # translations[orf] << entrez
61
- # }
62
- # }
63
- #
64
- # orf2native.each{|orf, native|
65
- # next unless identifiers[native]
66
- # identifiers[native] << [orf]
67
- # if translations[orf]
68
- # identifiers[native] << translations[orf]
69
- # else
70
- # identifiers[native] << []
71
- # end
72
- #
73
- # }
74
- #
75
- # header = "#" + [$native_id, 'Gene Name', 'Orf', "Entrez Gene ID"].uniq.join("\t") + "\n"
76
- # Open.write('identifiers',
77
- # header +
78
- # identifiers.collect{|code, name_lists|
79
- # "#{ code }\t" + name_lists.collect{ |names| names.join("|") }.join("\t")
80
- # }.join("\n")
81
- # )
82
- #end
83
- #
84
- #
@@ -1,109 +0,0 @@
1
- require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
-
3
- $name = "Caenorhabditis elegans"
4
-
5
-
6
- $native_id = "WormBase ID"
7
-
8
- $entrez2native = {
9
- :tax => 6239,
10
- :fix => proc{|code| code.sub(/^WormBase:/,'')},
11
- :check => proc{|code| code.match(/^WBGene/)},
12
- }
13
-
14
- $lexicon = {
15
-
16
- :file =>{
17
-
18
- :url => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/GO/current.txt.gz",
19
- :native => 0,
20
- :extra => [1,2],
21
-
22
- # :url => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/gene_ids/current.gz",
23
- # :native => 0,
24
- # :extra => [2,3,4,5],
25
-
26
- },
27
- }
28
-
29
-
30
- $identifiers = {
31
-
32
- :file =>{
33
-
34
- :url => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/GO/current.txt.gz",
35
- :native => 0,
36
- :extra => [1,2],
37
-
38
- # :url => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/gene_ids/current.gz",
39
- # :native => 0,
40
- # :extra => [2,3,4,5],
41
-
42
- },
43
-
44
- :biomart => {
45
- :database => 'celegans_gene_ensembl',
46
- :main => ['Entrez Gene ID' , "entrezgene"],
47
- :extra => [
48
- ['WormBase gene', "wormbase_gene" ],
49
- ['Associated Gene Name ', "external_gene_id" ],
50
- ['WormPep id', "wormpep_id" ],
51
- [ 'Ensembl Gene ID', "ensembl_gene_id" ],
52
- [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
53
- [ 'Protein ID ', "protein_id" ],
54
- [ 'RefSeq Protein ID ', "refseq_peptide" ],
55
- [ 'Unigene ID ', "unigene" ],
56
- [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
57
- [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
58
- ['EMBL (Genbank) ID' , "embl"] ,
59
- ],
60
- :filter => [],
61
- }
62
- }
63
-
64
- $go = {
65
- :url => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/GO/current.txt.gz",
66
- :code => 0,
67
- :go => 3,
68
- :pmid => 3,
69
- }
70
-
71
- $query = '"caenorhabditis elegans"[MeSH Terms] OR Caenorhabditis elegans[Text Word]'
72
- ##########################
73
-
74
-
75
- module Open
76
-
77
- class << self
78
- alias_method :old_read, :read
79
-
80
- def read(url, options = {})
81
- content = old_read(url, options)
82
-
83
- if url =~ /GO/
84
- return content.gsub(/.*:.*\((GO:\d+)\)/,'\1').gsub(/\nGO/,"|GO").
85
- collect{|l|
86
- l = l.sub(/\|/,"\t")
87
- names, gos = l.chomp.split(/\t/)
88
-
89
- id, name, extra = names.split(/ /)
90
- extra = extra.gsub(/[()]/,'') if extra
91
-
92
- if gos
93
- gos.split(/\|/).collect{|go|
94
- [id, name, extra, go].join("\t")
95
- }.join("\n")
96
- else
97
- [id, name, extra].join("\t") + "\n"
98
- end
99
- }.join("\n")
100
- elsif url =~ /gene_ids/
101
- return content.gsub(/,/,"\t")
102
- else
103
- return content
104
- end
105
-
106
- end
107
- end
108
- end
109
-
@@ -1,140 +0,0 @@
1
- require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
-
3
- $name = "Homo sapiens"
4
-
5
-
6
- $native_id = "Entrez Gene ID"
7
-
8
- $entrez2native = {
9
- :tax => 9606,
10
- :fix => nil,
11
- :check => proc{|code| false},
12
- }
13
-
14
- $lexicon = {
15
- :biomart => {
16
- :database => 'hsapiens_gene_ensembl',
17
- :main => ['Entrez Gene ID' , "entrezgene"],
18
- :extra => [
19
- [ 'Associated Gene Name' , "external_gene_id"],
20
- [ 'HGNC symbol', "hgnc_symbol" ],
21
- [ 'HGNC automatic gene name', "hgnc_automatic_gene_name" ],
22
- [ 'HGNC curated gene name ', "hgnc_curated_gene_name" ],
23
- ],
24
- }
25
-
26
- }
27
-
28
- $identifiers = {
29
- :biomart => {
30
- :database => 'hsapiens_gene_ensembl',
31
- :main => ['Entrez Gene ID' , "entrezgene"],
32
- :extra => [
33
- [ 'Ensembl Gene ID', "ensembl_gene_id" ],
34
- [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
35
- [ 'Associated Gene Name', "external_gene_id" ],
36
- [ 'CCDS ID', "ccds" ],
37
- [ 'Protein ID', "protein_id" ],
38
- [ 'RefSeq Protein ID', "refseq_peptide" ],
39
- [ 'Unigene ID', "unigene" ],
40
- [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
41
- [ 'HGNC ID', "hgnc_id", 'HGNC'],
42
- ['EMBL (Genbank) ID' , "embl"] ,
43
-
44
- # Affymetrix
45
- [ 'AFFY HC G110', 'affy_hc_g110' ],
46
- [ 'AFFY HG FOCUS', 'affy_hg_focus' ],
47
- [ 'AFFY HG U133-PLUS-2', 'affy_hg_u133_plus_2' ],
48
- [ 'AFFY HG U133A_2', 'affy_hg_u133a_2' ],
49
- [ 'AFFY HG U133A', 'affy_hg_u133a' ],
50
- [ 'AFFY HG U133B', 'affy_hg_u133b' ],
51
- [ 'AFFY HG U95AV2', 'affy_hg_u95av2' ],
52
- [ 'AFFY HG U95B', 'affy_hg_u95b' ],
53
- [ 'AFFY HG U95C', 'affy_hg_u95c' ],
54
- [ 'AFFY HG U95D', 'affy_hg_u95d' ],
55
- [ 'AFFY HG U95E', 'affy_hg_u95e' ],
56
- [ 'AFFY HG U95A', 'affy_hg_u95a' ],
57
- [ 'AFFY HUGENEFL', 'affy_hugenefl' ],
58
- [ 'AFFY HuEx', 'affy_huex_1_0_st_v2' ],
59
- [ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
60
- [ 'AFFY U133 X3P', 'affy_u133_x3p' ],
61
- [ 'Agilent WholeGenome',"agilent_wholegenome" ],
62
- [ 'Agilent CGH 44b', 'agilent_cgh_44b' ],
63
- [ 'Codelink ID', 'codelink' ],
64
- [ 'Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2' ],
65
- [ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
66
-
67
- ],
68
- :filter => [],
69
- }
70
- }
71
-
72
- $go = {
73
- :url => "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD",
74
- :code => 2,
75
- :go => 4,
76
- :pmid => 5,
77
- }
78
-
79
- $query = '"humans"[MeSH Terms] AND ((("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]) OR (("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word])) AND (hasabstract[text] AND "humans"[MeSH Terms] AND English[lang])'
80
- ##########################
81
-
82
- require 'rbbt/util/index'
83
-
84
- Rake::Task['gene.go'].clear
85
- file 'gene.go' => ['identifiers'] do
86
- if File.exists? 'identifiers'
87
- require 'rbbt/sources/organism'
88
- index = Organism.id_index('Hsa', :other => ['Associated Gene Name'])
89
- data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude])
90
-
91
- data = data.collect{|code, value_lists|
92
- code = index[code]
93
- [code, value_lists.flatten.select{|ref| ref =~ /GO:\d+/}.collect{|ref| ref.match(/(GO:\d+)/)[1]}]
94
- }.select{|p| p[0] && p[1].any?}
95
-
96
- Open.write('gene.go',
97
- data.collect{|p|
98
- "#{p[0]}\t#{p[1].uniq.join("|")}"
99
- }.join("\n")
100
- )
101
- end
102
- end
103
-
104
- Rake::Task['gene_go.pmid'].clear
105
- file 'gene_go.pmid' => ['identifiers'] do
106
- if File.exists? 'identifiers'
107
- index = Index.index('identifiers')
108
- data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude])
109
-
110
- data = data.collect{|code, value_lists|
111
- code = index[code]
112
- [code, value_lists.flatten.select{|ref| ref =~ /PMID:\d+/}.collect{|ref| ref.match(/PMID:(\d+)/)[1]}]
113
- }.select{|p| p[0] && p[1].any?}
114
-
115
- Open.write('gene_go.pmid',
116
- data.collect{|p|
117
- "#{p[0]}\t#{p[1].uniq.join("|")}"
118
- }.join("\n")
119
- )
120
- end
121
- end
122
-
123
-
124
- Rake::Task['lexicon'].clear
125
- file 'lexicon' => ['identifiers'] do
126
- if File.exists? 'identifiers'
127
- require 'rbbt/sources/organism'
128
- HGNC_URL = 'http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_acc_ids&status=Approved&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag'
129
- names = Open.to_hash(HGNC_URL, :exclude => proc{|l| l.match(/^HGNC ID/)}, :flatten => true)
130
- translations = Organism.id_index('Hsa', :native => 'Entrez Gene ID', :other => ['HGNC ID'])
131
-
132
- Open.write('lexicon',
133
- names.collect{|code, names|
134
- next unless translations[code]
135
- ([translations[code]] + names).join("\t")
136
- }.compact.join("\n")
137
- )
138
- end
139
-
140
- end