RubyGems - rbbt - Versions diffs - 1.1.7 → 2.0.0 - Mend

rbbt 1.1.7 → 2.0.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (60)

checksums.yaml +7 -0
data/README.rdoc +2 -138
metadata +72 -136
data/LICENSE +0 -20
data/bin/rbbt_config +0 -246
data/install_scripts/classifier/R/classify.R +0 -36
data/install_scripts/classifier/Rakefile +0 -145
data/install_scripts/get_abner.sh +0 -2
data/install_scripts/get_banner.sh +0 -25
data/install_scripts/get_biocreative.sh +0 -72
data/install_scripts/get_crf++.sh +0 -26
data/install_scripts/get_entrez.sh +0 -4
data/install_scripts/get_go.sh +0 -4
data/install_scripts/get_polysearch.sh +0 -8
data/install_scripts/ner/Rakefile +0 -206
data/install_scripts/ner/config/default.rb +0 -52
data/install_scripts/norm/Rakefile +0 -219
data/install_scripts/norm/config/cue_default.rb +0 -10
data/install_scripts/norm/config/tokens_default.rb +0 -79
data/install_scripts/norm/functions.sh +0 -23
data/install_scripts/organisms/Rakefile +0 -43
data/install_scripts/organisms/cgd.Rakefile +0 -84
data/install_scripts/organisms/human.Rakefile +0 -145
data/install_scripts/organisms/mgi.Rakefile +0 -77
data/install_scripts/organisms/pombe.Rakefile +0 -40
data/install_scripts/organisms/rake-include.rb +0 -258
data/install_scripts/organisms/rgd.Rakefile +0 -88
data/install_scripts/organisms/sgd.Rakefile +0 -66
data/install_scripts/organisms/tair.Rakefile +0 -54
data/install_scripts/organisms/worm.Rakefile +0 -109
data/install_scripts/wordlists/consonants +0 -897
data/install_scripts/wordlists/stopwords +0 -1
data/lib/rbbt.rb +0 -86
data/lib/rbbt/bow/bow.rb +0 -88
data/lib/rbbt/bow/classifier.rb +0 -116
data/lib/rbbt/bow/dictionary.rb +0 -187
data/lib/rbbt/ner/abner.rb +0 -34
data/lib/rbbt/ner/banner.rb +0 -73
data/lib/rbbt/ner/dictionaryNER.rb +0 -98
data/lib/rbbt/ner/regexpNER.rb +0 -70
data/lib/rbbt/ner/rner.rb +0 -227
data/lib/rbbt/ner/rnorm.rb +0 -143
data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
data/lib/rbbt/ner/rnorm/tokens.rb +0 -213
data/lib/rbbt/sources/biocreative.rb +0 -75
data/lib/rbbt/sources/biomart.rb +0 -105
data/lib/rbbt/sources/entrez.rb +0 -211
data/lib/rbbt/sources/go.rb +0 -40
data/lib/rbbt/sources/organism.rb +0 -245
data/lib/rbbt/sources/polysearch.rb +0 -117
data/lib/rbbt/sources/pubmed.rb +0 -111
data/lib/rbbt/util/arrayHash.rb +0 -255
data/lib/rbbt/util/filecache.rb +0 -72
data/lib/rbbt/util/index.rb +0 -47
data/lib/rbbt/util/misc.rb +0 -106
data/lib/rbbt/util/open.rb +0 -235
data/lib/rbbt/util/rake.rb +0 -183
data/lib/rbbt/util/simpleDSL.rb +0 -87
data/lib/rbbt/util/tmpfile.rb +0 -19
data/tasks/install.rake +0 -124

data/install_scripts/organisms/human.Rakefile DELETED

@@ -1,145 +0,0 @@
-require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
-$name = "Homo sapiens"
-$native_id = "Entrez Gene ID"
-$entrez2native = {
-  :tax => 9606,
-  :fix => nil,
-  :check => proc{|code| false},
-}
-$lexicon = {
-  :biomart => {
-    :database => 'hsapiens_gene_ensembl',
-    :main => ['Entrez Gene ID' , "entrezgene"],
-    :extra => [
-      [ 'Associated Gene Name' , "external_gene_id"],
-      [ 'HGNC symbol', "hgnc_symbol"  ],
-      [ 'HGNC automatic gene name', "hgnc_automatic_gene_name"  ],
-      [ 'HGNC curated gene name ', "hgnc_curated_gene_name"  ],
-    ],
-  }
-}
-$identifiers = {
-  :biomart => {
-    :database => 'hsapiens_gene_ensembl',
-    :main => ['Entrez Gene ID' , "entrezgene"],
-    :extra => [
-      [ 'Ensembl Gene ID', "ensembl_gene_id"  ],
-      [ 'Ensembl Protein ID', "ensembl_peptide_id"  ],
-      [ 'Associated Gene Name', "external_gene_id"  ],
-      [ 'CCDS ID', "ccds"  ],
-      [ 'Protein ID', "protein_id"  ],
-      [ 'RefSeq Protein ID', "refseq_peptide"  ],
-      [ 'Unigene ID', "unigene"  ],
-      [ 'UniProt/SwissProt ID', "uniprot_swissprot"  ],
-      [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"  ],
-      [ 'HGNC ID', "hgnc_id", 'HGNC'],
-      ['EMBL (Genbank) ID' , "embl"] ,
-      # Affymetrix
-      [ 'AFFY HC G110', 'affy_hc_g110' ],
-      [ 'AFFY HG FOCUS', 'affy_hg_focus' ],
-      [ 'AFFY HG U133-PLUS-2', 'affy_hg_u133_plus_2' ],
-      [ 'AFFY HG U133A_2', 'affy_hg_u133a_2' ],
-      [ 'AFFY HG U133A', 'affy_hg_u133a' ],
-      [ 'AFFY HG U133B', 'affy_hg_u133b' ],
-      [ 'AFFY HG U95AV2', 'affy_hg_u95av2' ],
-      [ 'AFFY HG U95B', 'affy_hg_u95b' ],
-      [ 'AFFY HG U95C', 'affy_hg_u95c' ],
-      [ 'AFFY HG U95D', 'affy_hg_u95d' ],
-      [ 'AFFY HG U95E', 'affy_hg_u95e' ],
-      [ 'AFFY HG U95A', 'affy_hg_u95a' ],
-      [ 'AFFY HUGENEFL', 'affy_hugenefl' ],
-      [ 'AFFY HuEx', 'affy_huex_1_0_st_v2' ],
-      [ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
-      [ 'AFFY U133 X3P', 'affy_u133_x3p' ],
-      [ 'Agilent WholeGenome',"agilent_wholegenome" ],
-      [ 'Agilent CGH 44b', 'agilent_cgh_44b' ],
-      [ 'Codelink ID', 'codelink' ],
-      [ 'Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2' ],
-      [ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
-    ],
-    :filter => [],
-  }
-}
-$go = {
- :url => "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD",
- :code => 2,
- :go   => 4,
- :pmid => 5,
-}
-$query = '"humans"[MeSH Terms] AND ((("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]) OR (("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word])) AND (hasabstract[text] AND "humans"[MeSH Terms] AND English[lang])'
-##########################
-require 'rbbt/util/index'
-Rake::Task['gene.go'].clear
-file 'gene.go' => ['identifiers'] do
-  if File.exists? 'identifiers'
-    require 'rbbt/sources/organism'
-    index = Organism.id_index('human', :other => ['Associated Gene Name'])
-    data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude])
-    data = data.collect{|code, value_lists|
-      code = index[code]
-      [code, value_lists.flatten.select{|ref| ref =~ /GO:\d+/}.collect{|ref| ref.match(/(GO:\d+)/)[1]}]
-    }.select{|p| p[0] && p[1].any?}
-    Open.write('gene.go',
-               data.collect{|p|
-                 p[1].uniq.collect{|go|
-                   "#{p[0]}\t#{go}"
-                 }.join("\n")
-               }.join("\n")
-              )
-  end
-end
-Rake::Task['gene_go.pmid'].clear
-file 'gene_go.pmid' => ['identifiers'] do
-  if File.exists? 'identifiers'
-    index = Index.index('identifiers')
-    data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude])
-    data = data.collect{|code, value_lists|
-      code = index[code]
-      [code, value_lists.flatten.select{|ref| ref =~ /PMID:\d+/}.collect{|ref| ref.match(/PMID:(\d+)/)[1]}]
-    }.select{|p| p[0] && p[1].any?}
-    Open.write('gene_go.pmid',
-               data.collect{|p|
-                 p[1].uniq.collect{|pmid|
-                   "#{p[0]}\t#{pmid}"
-                 }.join("\n")
-               }.join("\n")
-              )
-  end
-end
-Rake::Task['lexicon'].clear
-file 'lexicon' => ['identifiers'] do
-  if File.exists? 'identifiers'
-    require 'rbbt/sources/organism'
-    HGNC_URL = 'http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_acc_ids&status=Approved&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag'
-    names = Open.to_hash(HGNC_URL, :exclude => proc{|l| l.match(/^HGNC ID/)}, :flatten => true)
-    translations = Organism.id_index('human', :native => 'Entrez Gene ID', :other => ['HGNC ID'])
-    Open.write('lexicon',
-               names.collect{|code, names|
-                 next unless translations[code]
-                 ([translations[code]] + names).join("\t")
-               }.compact.join("\n")
-               )
-  end
-end

data/install_scripts/organisms/mgi.Rakefile DELETED

@@ -1,77 +0,0 @@
-require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
-$name = "Mus musculus"
-$native_id = "MGI DB ID"
-$entrez2native = {
-  :tax => 10090,
-  :fix => nil,
-  :check => proc{|code| code.match(/^MGI/)},
-}
-$lexicon = {
-  :file => {
-    :url =>  "ftp://ftp.informatics.jax.org/pub/reports/MGI_Coordinate.rpt",
-    :native => 0,
-    :extra => [2,3],
-    :exclude => proc{|l| l.split(/\t/)[1] != "Gene"},
-  },
-}
-$identifiers = {
-  :file => {
-    :url =>  "ftp://ftp.informatics.jax.org/pub/reports/MGI_Coordinate.rpt",
-    :native => 0,
-    :extra => [],
-    :exclude => proc{|l| l.split(/\t/)[1] != "Gene"},
-  },
-  :biomart => {
-    :database => 'mmusculus_gene_ensembl',
-    :main => ['MGI DB ID', 'mgi_id'] ,
-    :extra => [
-      ['Associated Gene Name' , "external_gene_id"],
-      ['Protein ID' , "protein_id"] ,
-      ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
-      ['Unigene ID' , "unigene"] ,
-      ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
-      ['RefSeq Protein ID' , "refseq_peptide"] ,
-      ['EMBL (Genbank) ID' , "embl"] ,
-      ['Affy mg u74a',"affy_mg_u74a" ],
-      ['Affy mg u74av2',"affy_mg_u74av2" ],
-      ['Affy mg u74b',"affy_mg_u74b" ],
-      ['Affy mg u74bv2',"affy_mg_u74bv2" ],
-      ['Affy mg u74c',"affy_mg_u74c" ],
-      ['Affy mg u74cv2',"affy_mg_u74cv2" ],
-      ['Affy moe430a',"affy_moe430a" ],
-      ['Affy moe430b',"affy_moe430b" ],
-      ['AFFY MoEx',"affy_moex_1_0_st_v1" ],
-      ['AFFY MoGene',"affy_mogene_1_0_st_v1" ],
-      ['Affy mouse430 2',"affy_mouse430_2" ],
-      ['Affy mouse430a 2',"affy_mouse430a_2" ],
-      ['Affy mu11ksuba',"affy_mu11ksuba" ],
-      ['Affy mu11ksubb',"affy_mu11ksubb" ],
-      ['Agilent WholeGenome',"agilent_wholegenome" ],
-      ['Codelink ID',"codelink" ],
-      ['Illumina MouseWG 6 v1',"illumina_mousewg_6_v1" ],
-      ['Illumina MouseWG 6 v2',"illumina_mousewg_6_v2" ],
-    ],
-    :filter => ['with_mgi'], # This is needed as the filter is not with_mgi_id as was expected
-  }
-}
-$go = {
-  :url => "ftp://ftp.geneontology.org/go/gene-associations/gene_association.mgi.gz",
-  :code => 1,
-  :go   => 4,
-  :pmid => 5,
-}
-$query = '(("mice"[TIAB] NOT Medline[SB]) OR "mice"[MeSH Terms] OR mouse[Text Word]) AND ((("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]) OR (("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]))'
-##########################

data/install_scripts/organisms/pombe.Rakefile DELETED

@@ -1,40 +0,0 @@
-require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
-$name = "Schizosaccharomyces pombe"
-$native_id = "GeneDB Id"
-$entrez2native = {
-  :tax => 4896,
-  :fix => proc{|code| code.sub(/GeneDB:SP/,'SP') },
-  :check => proc{|code| code.match(/^SP/)},
-}
-$lexicon = {
-  :file => {
-    :url => 'ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Mappings/allNames.txt',
-    :native => 0,
-    :extra => [1,2,3,4,5,6,7,8]
-  },
-}
-$identifiers = {
-  :file => {
-    :url => 'ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Mappings/allNames.txt',
-    :native => 0,
-    :extra => [],
-  },
-}
-$go = {
-  :url => "ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Gene_ontology/gene_association.GeneDB_Spombe",
-  :code => 1,
-  :go   => 4,
-  :pmid => 5,
-}
-$query = 'pombe[All Fields] AND (hasabstract[text] AND English[lang])'
-####

data/install_scripts/organisms/rake-include.rb DELETED

@@ -1,258 +0,0 @@
-require 'rbbt'
-require 'rbbt/util/open'
-require 'rbbt/util/arrayHash'
-require 'rbbt/sources/biomart'
-require 'rbbt/sources/entrez'
-require 'rbbt/sources/pubmed'
-file 'name' do
-  Open.write('name', $name)
-end
-file 'all.pmid' do
-  Open.write('all.pmid', PubMed.query($query).join("\n"))
-end
-file 'lexicon' do
-  begin
-    data = nil
-    # Read from file
-    if $lexicon[:file]
-      file = Open.to_hash($lexicon[:file][:url], $lexicon[:file])
-      data = ArrayHash.new(file, $native_id)
-    end
-    # Translate from entrez to native if needed
-    if $entrez2native
-      translations = {}
-      Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).
-        each{|k,v|
-          translations[k] = [v.join("|")]
-      }
-      translations_data = ArrayHash.new(translations,'Entrez Gene ID', [$native_id])
-      if data
-        data.merge(translations_data)
-      else
-        data = translations_data
-      end
-    end
-    # Read from Biomart and merge with previous data
-    if $lexicon[:biomart]
-      biomart = {}
-      BioMart.query(
-        $lexicon[:biomart][:database],
-        $lexicon[:biomart][:main][1],
-        $lexicon[:biomart][:extra].collect{|v| v[1]},
-        $lexicon[:biomart][:filter]
-      ).each{|key, values_list|
-        values = values_list.values_at(*$lexicon[:biomart][:extra].collect{|v| v[1]}).compact.collect{|list| list.select{|e| e.to_s != ""}.uniq.join("|")}
-        biomart[key] = values
-      }
-      biomart_data = ArrayHash.new(biomart, $lexicon[:biomart][:main][0], $lexicon[:biomart][:extra].collect{|v| v[0]})
-      if data
-        if $lexicon[:biomart][:extra].collect{|v| v[1]}.include?( $native_id )|| $lexicon[:biomart][:main][0] == $native_id
-          field = $native_id
-        else
-          field =  'Entrez Gene ID'
-        end
-        data.merge(biomart_data, field)
-      else
-        data = biomart_data
-      end
-    end
-    if $entrez2native
-      gene_alias = {}
-      Entrez.entrez2native($entrez2native[:tax],4).
-        each{|k,v|
-        gene_alias[k] = [v.select{|e| e.to_s != ""}.join("|")]
-      }
-      if gene_alias.keys.any?
-        gene_alias_data = ArrayHash.new(gene_alias,'Entrez Gene ID', ['Entrez Gene Alias'])
-        data.merge(gene_alias_data, 'Entrez Gene ID')
-      end
-    end
-    data.remove('Entrez Gene ID')
-    data.clean
-    Open.write('lexicon', data.data.collect{|code, name_lists|
-      "#{ code }\t" + name_lists.flatten.select{|n| n.to_s != ""}.uniq.join("\t")
-    }.join("\n"))
-rescue Entrez::NoFileError
-  puts "Lexicon not produced for #{$name}, install the entrez gene_info file (rbbt_config install entrez)."
-end
-end
-file 'identifiers' do
-  begin
-    data = nil
-    if $identifiers[:file]
-      file = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
-      data = ArrayHash.new(file, $native_id, $identifiers[:file][:fields])
-    end
-    # Translate from entrez to native if needed
-    if $entrez2native
-      translations = {}
-      Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).
-        each{|k,v|
-          translations[k] = [v.join("|")]
-      }
-      if translations.keys.any?
-        translations_data = ArrayHash.new(translations,'Entrez Gene ID', [$native_id])
-        if data
-          data.merge(translations_data)
-        else
-          data = translations_data
-        end
-      end
-    end
-    # Read from Biomart and merge with previous data
-    if $identifiers[:biomart]
-      biomart = {}
-      BioMart.query(
-        $identifiers[:biomart][:database],
-        $identifiers[:biomart][:main][1],
-        $identifiers[:biomart][:extra].collect{|v| v[1]},
-        $identifiers[:biomart][:filter]
-      ).each{|key, values_list|
-        values = values_list.values_at(*$identifiers[:biomart][:extra].collect{|v| v[1]}).compact.collect{|list| list.select{|e| e.to_s != ""}.uniq.join("|")}
-        biomart[key] = values
-      }
-      biomart_data = ArrayHash.new(biomart, $identifiers[:biomart][:main][0], $identifiers[:biomart][:extra].collect{|v| v[0]})
-      $identifiers[:biomart][:extra].each{|values|
-        if values[2]
-          biomart_data.process(values[0]){|n| "#{values[2]}:#{n}"}
-        end
-      }
-      if data
-        if $identifiers[:biomart][:extra].collect{|v| v[1]}.include?( $native_id ) || $identifiers[:biomart][:main][0] == $native_id
-          field = $native_id
-        else
-          field = 'Entrez Gene ID'
-        end
-        data.merge(biomart_data, field)
-      else
-        data = biomart_data
-      end
-    end
-    # Add the alias at the end
-    if $entrez2native
-      gene_alias = {}
-      Entrez.entrez2native($entrez2native[:tax],4).
-       each{|k,v|
-         gene_alias[k] = [v.join("|")]
-      }
-      if gene_alias.keys.any?
-        gene_alias_data = ArrayHash.new(gene_alias,'Entrez Gene ID', ['Entrez Gene Alias'])
-        if data
-          data.merge(gene_alias_data, 'Entrez Gene ID')
-        else
-          data = gene_alias_data
-        end
-      end
-    end
-    # Write ids to file
-    fout = File.open('identifiers', 'w')
-    fout.puts "##{$native_id}\t" + data.fields.join("\t")
-    data.clean
-    data.data.each{|code, values|
-      fout.puts code + "\t" + values.join("\t")
-    }
-    fout.close
-  rescue Entrez::NoFileError
-    puts "Identifiers not produced for #{$name}, install the entrez gene_info file (rbbt_config install entrez)."
-  end
-end
-file 'gene.go' do
-  data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude], :fix => $go[:fix])
-  data = data.collect{|code, value_lists|
-    [code, value_lists.flatten.select{|ref| ref =~ /GO:\d+/}.collect{|ref| ref.match(/(GO:\d+)/)[1]}]
-  }.select{|p|  p[1].any?}
-  Open.write('gene.go',
-              data.collect{|p|
-                p[1].uniq.collect{|go|
-                  "#{p[0]}\t#{go}"
-                }.join("\n")
-              }.join("\n")
-            )
-end
-file 'gene_go.pmid' do
-  data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude], :fix => $go[:fix])
-  data = data.collect{|code, value_lists|
-    [code, value_lists.flatten.select{|ref| ref =~ /PMID:\d+/}.collect{|ref| ref.match(/PMID:(\d+)/)[1]}]
-  }.select{|p|  p[1].any?}
-  Open.write('gene_go.pmid',
-              data.collect{|p|
-                p[1].uniq.collect{|pmid| "#{p[0]}\t#{pmid}" }.join("\n")
-              }.join("\n")
-            )
-end
-file 'gene.pmid' do
-  begin
-    translations = Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)) if $native_id != "Entrez Gene ID"
-    data = Entrez.entrez2pubmed($entrez2native[:tax])
-    Open.write('gene.pmid',
-               data.collect{|code,pmids|
-      next if translations && ! translations[code]
-      code = translations[code].first if translations
-      pmids.collect{|pmid|
-                 "#{ code }\t#{pmid}"
-      }.compact.join("\n")
-    }.compact.join("\n")
-              )
-  rescue Entrez::NoFileError
-    puts "Gene article associations from entrez not produced, install the gene2pumbed file (rbbt_config install entrez)."
-  end
-end
-task 'all' => ['name', 'lexicon', 'identifiers', 'gene_go.pmid', 'gene.pmid', 'gene.go', 'all.pmid']
-task 'clean' do
-  `rm -f 'name' 'lexicon' 'identifiers' 'gene_go.pmid' 'gene.pmid' 'gene.go' 'all.pmid'`
-end
-task 'update' do
-  Rake::Task['clean'].invoke if $force
-  Rake::Task['all'].invoke
-end