rbbt 1.1.7 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.rdoc +2 -138
- metadata +72 -136
- data/LICENSE +0 -20
- data/bin/rbbt_config +0 -246
- data/install_scripts/classifier/R/classify.R +0 -36
- data/install_scripts/classifier/Rakefile +0 -145
- data/install_scripts/get_abner.sh +0 -2
- data/install_scripts/get_banner.sh +0 -25
- data/install_scripts/get_biocreative.sh +0 -72
- data/install_scripts/get_crf++.sh +0 -26
- data/install_scripts/get_entrez.sh +0 -4
- data/install_scripts/get_go.sh +0 -4
- data/install_scripts/get_polysearch.sh +0 -8
- data/install_scripts/ner/Rakefile +0 -206
- data/install_scripts/ner/config/default.rb +0 -52
- data/install_scripts/norm/Rakefile +0 -219
- data/install_scripts/norm/config/cue_default.rb +0 -10
- data/install_scripts/norm/config/tokens_default.rb +0 -79
- data/install_scripts/norm/functions.sh +0 -23
- data/install_scripts/organisms/Rakefile +0 -43
- data/install_scripts/organisms/cgd.Rakefile +0 -84
- data/install_scripts/organisms/human.Rakefile +0 -145
- data/install_scripts/organisms/mgi.Rakefile +0 -77
- data/install_scripts/organisms/pombe.Rakefile +0 -40
- data/install_scripts/organisms/rake-include.rb +0 -258
- data/install_scripts/organisms/rgd.Rakefile +0 -88
- data/install_scripts/organisms/sgd.Rakefile +0 -66
- data/install_scripts/organisms/tair.Rakefile +0 -54
- data/install_scripts/organisms/worm.Rakefile +0 -109
- data/install_scripts/wordlists/consonants +0 -897
- data/install_scripts/wordlists/stopwords +0 -1
- data/lib/rbbt.rb +0 -86
- data/lib/rbbt/bow/bow.rb +0 -88
- data/lib/rbbt/bow/classifier.rb +0 -116
- data/lib/rbbt/bow/dictionary.rb +0 -187
- data/lib/rbbt/ner/abner.rb +0 -34
- data/lib/rbbt/ner/banner.rb +0 -73
- data/lib/rbbt/ner/dictionaryNER.rb +0 -98
- data/lib/rbbt/ner/regexpNER.rb +0 -70
- data/lib/rbbt/ner/rner.rb +0 -227
- data/lib/rbbt/ner/rnorm.rb +0 -143
- data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
- data/lib/rbbt/ner/rnorm/tokens.rb +0 -213
- data/lib/rbbt/sources/biocreative.rb +0 -75
- data/lib/rbbt/sources/biomart.rb +0 -105
- data/lib/rbbt/sources/entrez.rb +0 -211
- data/lib/rbbt/sources/go.rb +0 -40
- data/lib/rbbt/sources/organism.rb +0 -245
- data/lib/rbbt/sources/polysearch.rb +0 -117
- data/lib/rbbt/sources/pubmed.rb +0 -111
- data/lib/rbbt/util/arrayHash.rb +0 -255
- data/lib/rbbt/util/filecache.rb +0 -72
- data/lib/rbbt/util/index.rb +0 -47
- data/lib/rbbt/util/misc.rb +0 -106
- data/lib/rbbt/util/open.rb +0 -235
- data/lib/rbbt/util/rake.rb +0 -183
- data/lib/rbbt/util/simpleDSL.rb +0 -87
- data/lib/rbbt/util/tmpfile.rb +0 -19
- data/tasks/install.rake +0 -124
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: ccb5da58d650ebae30d3a24f39eaf8c12e864f160e83d4b2ba382d6ea487e758
|
4
|
+
data.tar.gz: '0963980b784617afdfdcf8c20b3545ed2cd95175f98366707d67d8a0077e9eca'
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3fbc5b33566884fd427a6844de3c1eed4b394512cd5c7e3d725d6669a151743979925344f592c7da8bbb1fc7504998e653acdefa67d4d5465e0b0fe5322fcbf3
|
7
|
+
data.tar.gz: c8a62ea9e741658c398ae286a468c72307c917d0e6d070488a46c9c21e71395e9d52e8f19f4986a363844e33c131a1540c7064d8c7f0cb0e8f4670588a08a491
|
data/README.rdoc
CHANGED
@@ -1,140 +1,4 @@
|
|
1
|
-
|
1
|
+
This is the meta-package for the Rbbt. It produces a gem that installs rbbt-util, rbbt-rest, and rbbt-sources.
|
2
2
|
|
3
|
-
|
4
|
-
for SENT[http://sent.dacya.ucm.es], but its functionality has been used
|
5
|
-
for other applications as well, such as MARQ[http://marq.dacya.ucm.es].
|
3
|
+
I'm working on a documentation page: http://mikisvaz.github.io/rbbt
|
6
4
|
|
7
|
-
== Important Note
|
8
|
-
|
9
|
-
Some unexpected gem dependencies may appear.
|
10
|
-
|
11
|
-
Rbbt covers several functionalities, some will work right away, some require to
|
12
|
-
install dependencies or download and process data from the internet. Since not
|
13
|
-
all users are likely to need all the functionalities, this gem's dependencies
|
14
|
-
include only the very basic requirements. Dependencies may appear unexpectedly
|
15
|
-
when using new parts of the API.
|
16
|
-
|
17
|
-
== Functionality
|
18
|
-
|
19
|
-
=== Data sources interface
|
20
|
-
|
21
|
-
PubMed:: Making queries and retrieving articles.
|
22
|
-
|
23
|
-
BioMart:: Making queries to BioMart programmatically. It can divide a large query into smaller ones and merge the results.
|
24
|
-
|
25
|
-
Entrez:: Retrieving gene entries, associated articles, and gene synonyms and aliases.
|
26
|
-
|
27
|
-
Biocreative:: Using the competition test and training data to train and evaluate Named Entity Extraction models and Gene Mention Normalization.
|
28
|
-
|
29
|
-
|
30
|
-
=== Text mining tasks
|
31
|
-
|
32
|
-
BagOfWords:: Bag-of-words representation of text. Chunk text into terms, which can be unigrams or bi-grams, remove stopwords, build a term thesaurus using a TF_IDF (term frequency inverse document frequency) or a KL (Kullback-Leibler divergence) Dictionary, and extract a bag-of-words representations suitable for the Classifier.
|
33
|
-
|
34
|
-
Classifier:: Using R to build classification models and to use them to classify new entries. Currently the models are Support Vector Machines.
|
35
|
-
|
36
|
-
NER:: Named Entity Extraction. Currently there are 4 alternatives to do this: Abner, Banner, RegExpNER, and NER. The first two are third party Java systems that require the rjb[rjb.rubyforge.org/] (Ruby Java Bridge) gem to be installed. The third one, RegExpNER, is a simple regular-expression based system which can be used when there is not enough data to train a CRF based system, for example, to find Polysearch terms. The last one, the default, is a reimplementation of a CRF-based system, such as Abner and Banner, completely configurable using a simple DSL (domain specific language).
|
37
|
-
|
38
|
-
Normalizer:: Resolve gene mentions to the actual genes they refer to. It compares the gene mention to all possible gene names and synonyms to find the best match. It is configurable using a DSL.
|
39
|
-
|
40
|
-
=== Organisms support
|
41
|
-
|
42
|
-
Using configuration files rbbt can support different organisms. The system is prepared to parse organism specific database files and merge them with Entrez and BioMart. Basically producing the following information
|
43
|
-
|
44
|
-
Lexicon:: Listing the synonyms for each gene
|
45
|
-
|
46
|
-
Identifiers:: Listing different identifiers for each gene like Entrez Gene Ids, Unigene, Affymetrix probe ids, etc. This is not the same as the lexicon which holds names, not identifiers.
|
47
|
-
|
48
|
-
GO:: Listing associations of genes to GO terms.
|
49
|
-
|
50
|
-
PubMed articles:: List articles associated to each gene, as listed in Entrez or listed in support of GO associations.
|
51
|
-
|
52
|
-
With this information rbbt offers the following functionality via the Organism class
|
53
|
-
|
54
|
-
NER and Normalization:: Loads custom models for Named Entity Extraction and Gene Mention Normalization
|
55
|
-
|
56
|
-
Identifiers translation:: Translates gene identifiers between formats.
|
57
|
-
|
58
|
-
Organisms in rbbt are identified using a keyword. This is the list of organisms currently supported with their associated keywords:
|
59
|
-
|
60
|
-
Candida albicans:: cgd
|
61
|
-
Mus musculus:: mgi
|
62
|
-
Rattus norvegicus:: rgd
|
63
|
-
Saccharomyces cerevisiae:: sgd
|
64
|
-
Arabidopsis thaliana:: tair
|
65
|
-
Caenorhabditis elegans:: worm
|
66
|
-
Homo sapiens:: human
|
67
|
-
Schizosaccharomyces pombe:: pombe
|
68
|
-
|
69
|
-
|
70
|
-
=== Other
|
71
|
-
|
72
|
-
Cache:: The system caches PubMed articles and Entrez gene entries, this is considered a persistent cache since these items are unlikely to change. Also caches any data downloaded from the internet, like BioMart queries for example, into a non-persistent cache that can be purged to perform updates to the system.
|
73
|
-
|
74
|
-
Tab separated file helpers:: The data in rbbt is saved into tab separated files and is loaded into Hash. Modules like Open or ArrayHash help dealing with these files and data structures.
|
75
|
-
|
76
|
-
= Installation
|
77
|
-
|
78
|
-
Install the gem normally <tt>gem install rbbt</tt>. The gem includes a configuration tool rbbt_config. The first time you run it, it will ask you to configure some paths. After that you may use it to process data for different tasks. Let's see some scenarios:
|
79
|
-
|
80
|
-
=== Using rbbt to translate identifiers
|
81
|
-
|
82
|
-
1. Do <tt>rbbt_config prepare identifiers</tt> to deploy the configuration files and download Entrez data; this needs to be done just once.
|
83
|
-
3. Now you may do <tt>rbbt_config install organisms</tt> to process all the organisms, or <tt>rbbt_config install organisms -o sgd</tt> to process only yeast (sgd).
|
84
|
-
4. You may now use a script like this to translate gene identifiers from yeast fed from the standard input:
|
85
|
-
require 'rbbt/sources/organism'
|
86
|
-
|
87
|
-
index = Organism.id_index('sgd', :native => 'Entrez Gene Id')
|
88
|
-
|
89
|
-
STDIN.each_line{|l| puts "#{l.chomp} => #{index[l.chomp]}"}
|
90
|
-
|
91
|
-
=== Using rbbt to find gene mentions in text
|
92
|
-
|
93
|
-
First prepare the organisms as you did in the previous section. Next, if you want to use the default NER module:
|
94
|
-
|
95
|
-
1. Install the Biocreative data used to train the model and compile the CRF++ plugin, <tt>rbbt_config prepare rner</tt>. You may need at this point to install ParseTree and ruby2ruby
|
96
|
-
2. Build the module for a particular organism <tt>rbbt_config install ner -o sgd</tt>. You need to have the gems ParseTree and ruby2ruby for this to work. This process can take a long time.
|
97
|
-
|
98
|
-
Or, if you want to use Abner or Banner:
|
99
|
-
|
100
|
-
1. Download and install the packages <tt>rbbt_config prepare java_ner</tt>
|
101
|
-
|
102
|
-
You may now, for example, find mentions to genes in articles from a PubMed query using this script
|
103
|
-
|
104
|
-
require 'rbbt/sources/organism'
|
105
|
-
require 'rbbt/sources/pubmed'
|
106
|
-
|
107
|
-
# type = :abner
|
108
|
-
# type = :banner
|
109
|
-
type = :rner
|
110
|
-
|
111
|
-
ner = Organism.ner('sgd', type )
|
112
|
-
pmids = PubMed.query(ARGV[0], 500)
|
113
|
-
|
114
|
-
PubMed.get_article(pmids).each{|pmid,article|
|
115
|
-
mentions = ner.extract(article.text)
|
116
|
-
puts pmid
|
117
|
-
puts article.text
|
118
|
-
puts "Mentions: " << mentions.uniq.join(", ")
|
119
|
-
puts
|
120
|
-
}
|
121
|
-
|
122
|
-
== More Installation Guidelines
|
123
|
-
|
124
|
-
This is the complete list of gem requirements: <tt>ParseTree ruby2ruby simpleconsole rjb rsruby stemmer rand rake progress-monitor</tt>. Some of these gems do not work with ruby 1.9 at this time, or may be a bit more complicated to install; for that reason *they are not reported as dependencies and are only required when they are about to be used*. Note that some of these gems are in the gemcutter repository, so you may need to install the <tt>gemcutter</tt> gem and do <tt>gem tumble</tt>.
|
125
|
-
|
126
|
-
Some of the API requires some data to have been processed using rbbt_config. This command is used to install third party software, download data from the internet, or build models. The command <tt>rbbt_config prepare all</tt> will install and process everything; this will take a long time, especially building the NER models. So you might want to start with the basic install and include more things as they are needed.
|
127
|
-
|
128
|
-
|
129
|
-
= Note on Patches/Pull Requests
|
130
|
-
|
131
|
-
* Fork the project.
|
132
|
-
* Make your feature addition or bug fix.
|
133
|
-
* Add tests for it. This is important so I don't break it in a future version unintentionally.
|
134
|
-
* Commit, do not mess with rakefile, version, or history.
|
135
|
-
(if you want to have your own version, that is fine, but bump version in a commit by itself that I can ignore when I pull)
|
136
|
-
* Send me a pull request. Bonus points for topic branches.
|
137
|
-
|
138
|
-
= Copyright
|
139
|
-
|
140
|
-
Copyright (c) 2009 Miguel Vazquez. See LICENSE for details.
|
metadata
CHANGED
@@ -1,163 +1,99 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
|
-
authors:
|
6
|
+
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
type: :runtime
|
18
|
-
version_requirement:
|
19
|
-
version_requirements: !ruby/object:Gem::Requirement
|
20
|
-
requirements:
|
11
|
+
date: 2020-05-28 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rbbt-util
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
21
17
|
- - ">="
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 0
|
24
|
-
version:
|
25
|
-
- !ruby/object:Gem::Dependency
|
26
|
-
name: simpleconsole
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
27
20
|
type: :runtime
|
28
|
-
|
29
|
-
version_requirements: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rbbt-rest
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
31
|
- - ">="
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version:
|
34
|
-
version:
|
35
|
-
- !ruby/object:Gem::Dependency
|
36
|
-
name: stemmer
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
37
34
|
type: :runtime
|
38
|
-
|
39
|
-
version_requirements: !ruby/object:Gem::Requirement
|
40
|
-
requirements:
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
41
38
|
- - ">="
|
42
|
-
- !ruby/object:Gem::Version
|
43
|
-
version:
|
44
|
-
|
45
|
-
|
46
|
-
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rbbt-sources
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
47
48
|
type: :runtime
|
48
|
-
|
49
|
-
version_requirements: !ruby/object:Gem::Requirement
|
50
|
-
requirements:
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rbbt-dm
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
51
59
|
- - ">="
|
52
|
-
- !ruby/object:Gem::Version
|
53
|
-
version:
|
54
|
-
version:
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: simpleconsole
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
57
62
|
type: :runtime
|
58
|
-
|
59
|
-
version_requirements: !ruby/object:Gem::Requirement
|
60
|
-
requirements:
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
61
66
|
- - ">="
|
62
|
-
- !ruby/object:Gem::Version
|
63
|
-
version:
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
classification, as well as data integration modules that interface with PubMed, Entrez Gene, BioMart.
|
68
|
-
email: miguel.vazquez@fdi.ucm.es
|
69
|
-
executables:
|
70
|
-
- rbbt_config
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
description: Meta package for a gem that requires the basic Rbbt packages
|
70
|
+
email: miguel.vazquez@cnio.es
|
71
|
+
executables: []
|
71
72
|
extensions: []
|
72
|
-
|
73
|
-
extra_rdoc_files:
|
74
|
-
- LICENSE
|
73
|
+
extra_rdoc_files:
|
75
74
|
- README.rdoc
|
76
|
-
files:
|
77
|
-
- install_scripts/classifier/R/classify.R
|
78
|
-
- install_scripts/classifier/Rakefile
|
79
|
-
- install_scripts/get_abner.sh
|
80
|
-
- install_scripts/get_banner.sh
|
81
|
-
- install_scripts/get_biocreative.sh
|
82
|
-
- install_scripts/get_crf++.sh
|
83
|
-
- install_scripts/get_entrez.sh
|
84
|
-
- install_scripts/get_go.sh
|
85
|
-
- install_scripts/get_polysearch.sh
|
86
|
-
- install_scripts/ner/Rakefile
|
87
|
-
- install_scripts/ner/config/default.rb
|
88
|
-
- install_scripts/norm/Rakefile
|
89
|
-
- install_scripts/norm/config/cue_default.rb
|
90
|
-
- install_scripts/norm/config/tokens_default.rb
|
91
|
-
- install_scripts/norm/functions.sh
|
92
|
-
- install_scripts/organisms/Rakefile
|
93
|
-
- install_scripts/organisms/cgd.Rakefile
|
94
|
-
- install_scripts/organisms/human.Rakefile
|
95
|
-
- install_scripts/organisms/mgi.Rakefile
|
96
|
-
- install_scripts/organisms/pombe.Rakefile
|
97
|
-
- install_scripts/organisms/rake-include.rb
|
98
|
-
- install_scripts/organisms/rgd.Rakefile
|
99
|
-
- install_scripts/organisms/sgd.Rakefile
|
100
|
-
- install_scripts/organisms/tair.Rakefile
|
101
|
-
- install_scripts/organisms/worm.Rakefile
|
102
|
-
- install_scripts/wordlists/consonants
|
103
|
-
- install_scripts/wordlists/stopwords
|
104
|
-
- lib/rbbt.rb
|
105
|
-
- lib/rbbt/bow/bow.rb
|
106
|
-
- lib/rbbt/bow/classifier.rb
|
107
|
-
- lib/rbbt/bow/dictionary.rb
|
108
|
-
- lib/rbbt/ner/abner.rb
|
109
|
-
- lib/rbbt/ner/banner.rb
|
110
|
-
- lib/rbbt/ner/dictionaryNER.rb
|
111
|
-
- lib/rbbt/ner/regexpNER.rb
|
112
|
-
- lib/rbbt/ner/rner.rb
|
113
|
-
- lib/rbbt/ner/rnorm.rb
|
114
|
-
- lib/rbbt/ner/rnorm/cue_index.rb
|
115
|
-
- lib/rbbt/ner/rnorm/tokens.rb
|
116
|
-
- lib/rbbt/sources/biocreative.rb
|
117
|
-
- lib/rbbt/sources/biomart.rb
|
118
|
-
- lib/rbbt/sources/entrez.rb
|
119
|
-
- lib/rbbt/sources/go.rb
|
120
|
-
- lib/rbbt/sources/organism.rb
|
121
|
-
- lib/rbbt/sources/polysearch.rb
|
122
|
-
- lib/rbbt/sources/pubmed.rb
|
123
|
-
- lib/rbbt/util/arrayHash.rb
|
124
|
-
- lib/rbbt/util/filecache.rb
|
125
|
-
- lib/rbbt/util/index.rb
|
126
|
-
- lib/rbbt/util/misc.rb
|
127
|
-
- lib/rbbt/util/open.rb
|
128
|
-
- lib/rbbt/util/rake.rb
|
129
|
-
- lib/rbbt/util/simpleDSL.rb
|
130
|
-
- lib/rbbt/util/tmpfile.rb
|
131
|
-
- tasks/install.rake
|
132
|
-
- LICENSE
|
75
|
+
files:
|
133
76
|
- README.rdoc
|
134
|
-
has_rdoc: true
|
135
77
|
homepage: http://github.com/mikisvaz/rbbt
|
136
78
|
licenses: []
|
137
|
-
|
79
|
+
metadata: {}
|
138
80
|
post_install_message:
|
139
|
-
rdoc_options:
|
140
|
-
|
141
|
-
require_paths:
|
81
|
+
rdoc_options: []
|
82
|
+
require_paths:
|
142
83
|
- lib
|
143
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
144
|
-
requirements:
|
84
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
145
86
|
- - ">="
|
146
|
-
- !ruby/object:Gem::Version
|
147
|
-
version:
|
148
|
-
|
149
|
-
|
150
|
-
requirements:
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '0'
|
89
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
90
|
+
requirements:
|
151
91
|
- - ">="
|
152
|
-
- !ruby/object:Gem::Version
|
153
|
-
version:
|
154
|
-
version:
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
155
94
|
requirements: []
|
156
|
-
|
157
|
-
rubyforge_project:
|
158
|
-
rubygems_version: 1.3.5
|
95
|
+
rubygems_version: 3.0.6
|
159
96
|
signing_key:
|
160
|
-
specification_version:
|
161
|
-
summary:
|
97
|
+
specification_version: 4
|
98
|
+
summary: Ruby bioinformatics toolbox. Meta package
|
162
99
|
test_files: []
|
163
|
-
|
data/LICENSE
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
Copyright (c) 2009 Miguel Vazquez
|
2
|
-
|
3
|
-
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
-
a copy of this software and associated documentation files (the
|
5
|
-
"Software"), to deal in the Software without restriction, including
|
6
|
-
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
-
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
-
permit persons to whom the Software is furnished to do so, subject to
|
9
|
-
the following conditions:
|
10
|
-
|
11
|
-
The above copyright notice and this permission notice shall be
|
12
|
-
included in all copies or substantial portions of the Software.
|
13
|
-
|
14
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
-
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
-
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
-
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
-
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
-
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
-
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/bin/rbbt_config
DELETED
@@ -1,246 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby
|
2
|
-
|
3
|
-
require 'rubygems'
|
4
|
-
require 'rake'
|
5
|
-
|
6
|
-
|
7
|
-
require 'simpleconsole'
|
8
|
-
|
9
|
-
begin
|
10
|
-
require 'rbbt'
|
11
|
-
rescue Rbbt::NoConfig
|
12
|
-
$noconfig = true
|
13
|
-
end
|
14
|
-
|
15
|
-
TASKS= %w(organisms ner norm classifier biocreative entrez go wordlists polysearch abner banner crf++)
|
16
|
-
|
17
|
-
$USAGE =<<EOT
|
18
|
-
#{__FILE__} <action> [<subaction>] [--update] [--organism <org>]
|
19
|
-
|
20
|
-
actions:
|
21
|
-
|
22
|
-
* configure: Set paths for data, cache, and tmp directories
|
23
|
-
|
24
|
-
* prepare:
|
25
|
-
|
26
|
-
Basic subactions:
|
27
|
-
|
28
|
-
* organisms: Install processing scripts to process organisms
|
29
|
-
* ner: Install processing scripts for Named Entity Recognition
|
30
|
-
* norm: Install processing scripts for Gene Mention Normalization
|
31
|
-
* classifier: Install processing scripts for Classification
|
32
|
-
|
33
|
-
* biocreative: Download and train and test data from BioCreative
|
34
|
-
* entrez: Download and install data from Entrez
|
35
|
-
* go: Download and install data from The Gene Ontology
|
36
|
-
* wordlists: Install word lists
|
37
|
-
* polysearch: Download and install Polysearch dictionaries
|
38
|
-
|
39
|
-
* abner: Download and install Abner NER system: http://pages.cs.wisc.edu/~bsettles/abner/
|
40
|
-
* banner: Download and install Banner NER system: http://sourceforge.net/projects/banner/
|
41
|
-
* crf++: Download and install CRF++ a CRF framework: http://crfpp.sourceforge.net/
|
42
|
-
|
43
|
-
Subactions grouped by task:
|
44
|
-
|
45
|
-
* identifiers: entrez, organisms
|
46
|
-
* rner: entrez, organisms, biocreative, ner, crf++
|
47
|
-
* java_ner: entrez, organisms, abner, banner
|
48
|
-
* norm: entrez organisms, biocreative, crf++, norm, polysearch
|
49
|
-
* bow: organisms, wordlists
|
50
|
-
* classifier: organisms, wordlists, classifier, go
|
51
|
-
* all: #{TASKS.join(", ")}
|
52
|
-
|
53
|
-
* install:
|
54
|
-
* organisms: Gather organisms data
|
55
|
-
* ner: Build Named Entity Recognition Models. Mention Normalization needs no training.
|
56
|
-
* classification: Build Function/Process Classifiers
|
57
|
-
|
58
|
-
--update: Rebuild models or reprocess organism data even if present. You may want to purge the cache
|
59
|
-
to be up to date with the data in the internet.
|
60
|
-
|
61
|
-
--organism: Gather data only for that particular organism. The organism must be specified by the
|
62
|
-
keyword. Use '#{__FILE__} organisms' to see find the keywords.
|
63
|
-
|
64
|
-
* purge_cache: Clean the non-persistent cache, which holds general things
|
65
|
-
downloaded using Open.read, like organism identifiers downloaded from
|
66
|
-
BioMart. The persistent cache, which hold pubmed articles or entrez gene
|
67
|
-
descriptions, is not cleaned, as these are not likely to change
|
68
|
-
|
69
|
-
* organisms: Show a list of all organisms along with their identifier in the system
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
EOT
|
74
|
-
|
75
|
-
class Controller < SimpleConsole::Controller
|
76
|
-
|
77
|
-
params :bool => {:u => :update},
|
78
|
-
:string => {:o => :organism}
|
79
|
-
|
80
|
-
def organisms
|
81
|
-
end
|
82
|
-
|
83
|
-
|
84
|
-
def default
|
85
|
-
render :action => :usage
|
86
|
-
end
|
87
|
-
|
88
|
-
def help
|
89
|
-
render :action => :usage
|
90
|
-
end
|
91
|
-
|
92
|
-
def install
|
93
|
-
raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig
|
94
|
-
|
95
|
-
case params[:id]
|
96
|
-
when "organisms"
|
97
|
-
@location = File.join(Rbbt.datadir,'organisms')
|
98
|
-
when "ner"
|
99
|
-
@location = File.join(Rbbt.datadir,'ner')
|
100
|
-
when "classifier"
|
101
|
-
@location = File.join(Rbbt.datadir,'classifier')
|
102
|
-
else
|
103
|
-
redirect_to :action => :help, :id => :update
|
104
|
-
end
|
105
|
-
|
106
|
-
$force = true if params[:update]
|
107
|
-
$org = params[:organism] if params[:organism]
|
108
|
-
|
109
|
-
end
|
110
|
-
|
111
|
-
def prepare
|
112
|
-
raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig
|
113
|
-
case params[:id]
|
114
|
-
when "identifiers"
|
115
|
-
require 'rbbt/sources/organism'
|
116
|
-
require 'rbbt/sources/entrez'
|
117
|
-
@tasks = %w(entrez organisms)
|
118
|
-
when "rner"
|
119
|
-
require 'rbbt/ner/rner'
|
120
|
-
require 'rbbt/sources/entrez'
|
121
|
-
@tasks = %w(entrez organisms biocreative ner crf++)
|
122
|
-
when "java_ner"
|
123
|
-
require 'rjb'
|
124
|
-
@tasks = %w(entrez organisms abner banner)
|
125
|
-
when "norm"
|
126
|
-
require 'rbbt/ner/rner'
|
127
|
-
require 'rbbt/ner/rnorm'
|
128
|
-
require 'rbbt/ner/regexpNER'
|
129
|
-
require 'rbbt/sources/entrez'
|
130
|
-
@tasks = %w(entrez organisms biocreative crf++ norm polysearch)
|
131
|
-
when "bow"
|
132
|
-
require 'rbbt/bow/bow'
|
133
|
-
require 'rbbt/bow/dictionary'
|
134
|
-
@tasks = %w(organisms wordlists)
|
135
|
-
when "classifier"
|
136
|
-
require 'rbbt/bow/bow'
|
137
|
-
require 'rbbt/bow/dictionary'
|
138
|
-
require 'rbbt/bow/classifier'
|
139
|
-
@tasks = %w(organisms wordlists classifier go)
|
140
|
-
when "all"
|
141
|
-
@tasks = TASKS
|
142
|
-
when nil
|
143
|
-
redirect_to :action => :help, :id => :install
|
144
|
-
else
|
145
|
-
redirect_to :action => :help, :id => :install if ! TASKS.include? params[:id]
|
146
|
-
@tasks = [params[:id]]
|
147
|
-
end
|
148
|
-
|
149
|
-
$force = true if params[:update]
|
150
|
-
$org = params[:organism] if params[:organism]
|
151
|
-
|
152
|
-
end
|
153
|
-
|
154
|
-
def configure
|
155
|
-
end
|
156
|
-
|
157
|
-
def purge_cache
|
158
|
-
end
|
159
|
-
|
160
|
-
end
|
161
|
-
|
162
|
-
class View < SimpleConsole::View
|
163
|
-
def usage
|
164
|
-
puts $USAGE
|
165
|
-
end
|
166
|
-
|
167
|
-
def organisms
|
168
|
-
require 'rbbt/sources/organism'
|
169
|
-
all = Organism.all(false)
|
170
|
-
installed = Organism.all
|
171
|
-
|
172
|
-
all.each{|org|
|
173
|
-
puts "#{Organism.name(org)}: #{org} #{installed.include?(org) ? "(installed)" : ""}"
|
174
|
-
}
|
175
|
-
end
|
176
|
-
|
177
|
-
|
178
|
-
def prepare
|
179
|
-
load File.join(Rbbt.rootdir, 'tasks/install.rake')
|
180
|
-
|
181
|
-
@tasks.each{|t|
|
182
|
-
puts "Invoking #{ t }"
|
183
|
-
Rake::Task[t].invoke
|
184
|
-
}
|
185
|
-
end
|
186
|
-
|
187
|
-
def install
|
188
|
-
|
189
|
-
puts "Changing directory to #{@location}"
|
190
|
-
chdir @location
|
191
|
-
|
192
|
-
load "./Rakefile"
|
193
|
-
|
194
|
-
Rake::Task['default'].invoke
|
195
|
-
end
|
196
|
-
|
197
|
-
|
198
|
-
def configure
|
199
|
-
|
200
|
-
defaultdir = File.join(ENV['HOME'],'rbbt')
|
201
|
-
|
202
|
-
cachedir = File.join(defaultdir, 'cache')
|
203
|
-
tmpdir = File.join(defaultdir, 'tmp')
|
204
|
-
datadir = File.join(defaultdir, 'data')
|
205
|
-
|
206
|
-
puts "Please indicate where you wish to place the data directories"
|
207
|
-
puts
|
208
|
-
|
209
|
-
puts
|
210
|
-
puts "* Cache Directory: This directory will hold downloads, from PubMed,
|
211
|
-
Entrez and other, for local store. It might grow considerably."
|
212
|
-
print "[#{ cachedir }]? "
|
213
|
-
input = STDIN.gets
|
214
|
-
cachedir = input if input =~ /\w/
|
215
|
-
|
216
|
-
puts
|
217
|
-
puts "* Tmp Directory: Temporary files."
|
218
|
-
print "[#{ tmpdir }]? "
|
219
|
-
input = STDIN.gets
|
220
|
-
tmpdir = input if input =~ /\w/
|
221
|
-
|
222
|
-
puts
|
223
|
-
puts "* Data Directory: Holds data from organisms, databases, third party software, etc."
|
224
|
-
print "[#{ datadir }]? "
|
225
|
-
input = STDIN.gets
|
226
|
-
datadir = input if input =~ /\w/
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
fout = File.open(File.join(ENV['HOME'], '.rbbt'),'w')
|
231
|
-
fout.puts "cachedir: #{cachedir}"
|
232
|
-
fout.puts "tmpdir: #{tmpdir}"
|
233
|
-
fout.puts "datadir: #{datadir}"
|
234
|
-
fout.close
|
235
|
-
|
236
|
-
end
|
237
|
-
|
238
|
-
def purge_cache
|
239
|
-
FileUtils.rm Dir.glob(File.join(Rbbt.cachedir,'open-remote','*'))
|
240
|
-
end
|
241
|
-
|
242
|
-
end
|
243
|
-
|
244
|
-
SimpleConsole::Application.run(ARGV, Controller, View)
|
245
|
-
|
246
|
-
|