reubypathdb 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/eupathdb_gene_information_table.rb +67 -12
- data/lib/eupathdb_gff.rb +215 -0
- data/lib/jgi_genes.rb +300 -0
- data/lib/reubypathdb.rb +2 -0
- data/reubypathdb.gemspec +55 -0
- metadata +12 -9
- data/.gitignore +0 -21
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.0
|
@@ -1,29 +1,80 @@
|
|
1
|
+
# Code for interacting with EuPathDB gene information files e.g. http://cryptodb.org/common/downloads/release-4.3/Cmuris/txt/CmurisGene_CryptoDB-4.3.txt
|
2
|
+
# These gene information files contain a large amount of information about individual genes/proteins in EuPathDBs.
|
1
3
|
|
4
|
+
require 'tempfile'
|
5
|
+
|
6
|
+
# A class for extracting gene info from a particular gene from the information file
|
7
|
+
class EuPathDBGeneInformationFileExtractor
  # A filename path to the gene information file
  attr_accessor :filename

  # filename:: path to a EuPathDB gene information file (may be set later
  #            through the accessor).
  def initialize(filename = nil)
    @filename = filename
  end

  # Returns a EuPathDBGeneInformation object corresponding to the wanted key. If
  # there are multiple in the file, only the first is returned. If none are
  # found, nil is returned.
  #
  # If grep_hack_lines is defined (as a non-zero integer), then a shortcut is
  # applied to speed things up: instead of parsing the whole gene information
  # file, grep is asked for the "Gene Id: .." line plus grep_hack_lines lines
  # after it, and only that output is fed into the parser.
  #
  # Raises Exception if grep_hack_lines is given (and its to_i is non-zero) but
  # it is not an Integer.
  def extract_gene_info(wanted_gene_id, grep_hack_lines = nil)
    if grep_hack_lines and grep_hack_lines.to_i != 0
      # grep however many lines from past the point. Rather dodgy, but faster.
      raise Exception, "grep_hack_lines should be an integer" unless grep_hack_lines.is_a?(Integer)

      # The command is passed as an argument array so no shell is involved:
      # neither the gene id nor the filename can be interpreted as shell
      # syntax. -F makes the gene id a literal string rather than a regex.
      command = [
        'grep', '-F', '-A', grep_hack_lines.to_s,
        '-e', "Gene Id: #{wanted_gene_id}",
        '--', @filename
      ]
      IO.popen(command) do |grep_output|
        return first_matching_gene(grep_output, wanted_gene_id)
      end
    else
      # no grep hack. Parse the whole gene information file
      File.open(@filename) do |io|
        return first_matching_gene(io, wanted_gene_id)
      end
    end
  end

  private

  # Parse genes from io and return the first one whose 'Gene Id' info equals
  # wanted_gene_id, or nil if the stream holds no such gene.
  def first_matching_gene(io, wanted_gene_id)
    EuPathDBGeneInformationTable.new(io).each do |gene|
      return gene if wanted_gene_id == gene.get_info('Gene Id')
    end
    nil
  end
end
|
43
|
+
|
44
|
+
# A class for parsing the 'gene information table' files from EuPathDB, such
|
45
|
+
# as http://cryptodb.org/common/downloads/release-4.3/Cmuris/txt/CmurisGene_CryptoDB-4.3.txt
|
46
|
+
#
|
47
|
+
# The usual way of interacting with these is the use of the each method,
|
48
|
+
# which returns a EuPathDBGeneInformation object with all of the recorded
|
49
|
+
# information in it.
|
2
50
|
class EuPathDBGeneInformationTable
|
3
51
|
include Enumerable
|
4
|
-
|
52
|
+
|
53
|
+
# Initialise using an IO object, say File.open('/path/to/CmurisGene_CryptoDB-4.3.txt'). After opening, the #each method can be used to iterate over the genes that are present in the file
|
5
54
|
def initialize(io)
|
6
55
|
@io = io
|
7
56
|
end
|
8
|
-
|
57
|
+
|
58
|
+
# Return a EuPathDBGeneInformation object with
|
59
|
+
# the contained info in it, one at a time
|
9
60
|
def each
|
10
61
|
while g = next_gene
|
11
62
|
yield g
|
12
63
|
end
|
13
64
|
end
|
14
|
-
|
65
|
+
|
15
66
|
# Returns a EuPathDBGeneInformation object with all the data you could
|
16
67
|
# possibly want.
|
17
68
|
def next_gene
|
18
69
|
info = EuPathDBGeneInformation.new
|
19
|
-
|
70
|
+
|
20
71
|
# first, read the table, which should start with the ID column
|
21
72
|
line = @io.readline.strip
|
22
73
|
while line == ''
|
23
74
|
return nil if @io.eof?
|
24
75
|
line = @io.readline.strip
|
25
76
|
end
|
26
|
-
|
77
|
+
|
27
78
|
while line != ''
|
28
79
|
if matches = line.match(/^(.*?)\: (.*)$/)
|
29
80
|
info.add_information(matches[1], matches[2])
|
@@ -33,7 +84,7 @@ class EuPathDBGeneInformationTable
|
|
33
84
|
|
34
85
|
line = @io.readline.strip
|
35
86
|
end
|
36
|
-
|
87
|
+
|
37
88
|
# now read each of the tables, which should start with the
|
38
89
|
# 'TABLE: <name>' entry
|
39
90
|
line = @io.readline.strip
|
@@ -44,7 +95,7 @@ class EuPathDBGeneInformationTable
|
|
44
95
|
if line == ''
|
45
96
|
# add it to the stack unless we are just starting out
|
46
97
|
info.add_table(table_name, headers, data) unless table_name.nil?
|
47
|
-
|
98
|
+
|
48
99
|
# reset things
|
49
100
|
table_name = nil
|
50
101
|
headers = nil
|
@@ -63,32 +114,36 @@ class EuPathDBGeneInformationTable
|
|
63
114
|
end
|
64
115
|
line = @io.readline.strip
|
65
116
|
end
|
66
|
-
|
117
|
+
|
67
118
|
# return the object that has been created
|
68
119
|
return info
|
69
120
|
end
|
70
121
|
end
|
71
122
|
|
123
|
+
# Each gene in the gene information table is represented
|
124
|
+
# by 2 types of information - info and tables.
|
125
|
+
# info are 1 line data, whereas tables are tables of
|
126
|
+
# data with possibly multiple rows
|
72
127
|
class EuPathDBGeneInformation
|
73
128
|
def info
|
74
129
|
@info
|
75
130
|
end
|
76
|
-
|
131
|
+
|
77
132
|
def get_info(key)
|
78
133
|
@info[key]
|
79
134
|
end
|
80
135
|
alias_method :[], :get_info
|
81
|
-
|
136
|
+
|
82
137
|
def get_table(table_name)
|
83
138
|
@tables[table_name]
|
84
139
|
end
|
85
|
-
|
140
|
+
|
86
141
|
def add_information(key, value)
|
87
142
|
@info ||= {}
|
88
143
|
@info[key] = value
|
89
144
|
"Added info #{key}, now is #{@info[key]}"
|
90
145
|
end
|
91
|
-
|
146
|
+
|
92
147
|
def add_table(name, headers, data)
|
93
148
|
@tables ||= {}
|
94
149
|
@tables[name] = []
|
data/lib/eupathdb_gff.rb
ADDED
@@ -0,0 +1,215 @@
|
|
1
|
+
|
2
|
+
require 'rubygems'
|
3
|
+
require 'bio'
|
4
|
+
require 'jgi_genes'
|
5
|
+
require 'cgi'
|
6
|
+
|
7
|
+
# Unlike JGI genes files, ApiDB files have several differences:
|
8
|
+
# - genes on the reverse strand appear in order of their exons, and so
|
9
|
+
# the exons are not all in the correct order with respect to the underlying
|
10
|
+
# sequence.
|
11
|
+
class EupathDBGFF < JgiGenesGff
  # Feature types (GFF column 3) whose records are skipped while parsing.
  attr_accessor :features_to_ignore

  # Opens the GFF file at path and pre-reads its first record so that
  # next_gene always has one record of lookahead available.
  def initialize(path)
    @file = File.open path, 'r'
    @next_gff = read_record
    @features_to_ignore = [
      'rRNA',
      'tRNA',
      'snRNA',
      'transcript'
    ]
  end

  # Returns the next gene in the file as a PositionedGeneWithOntology, or nil
  # when there are no more genes. Each gene is expected to be laid out as a
  # 'gene' line, an 'mRNA' line, one or more 'CDS' lines and then one or more
  # 'exon' lines. Genes whose second line is an ignored record (see
  # ignore_record?) are skipped.
  #
  # Raises Exception when the records do not follow that layout, including
  # when the file ends in the middle of a gene.
  def next_gene
    cur = @next_gff
    return nil if !cur

    # Ignore the supercontigs at the start of the file
    while ignore_line?(cur) or ignore_record?(cur)
      @next_gff = read_record
      cur = @next_gff
      return nil if !cur
    end

    if cur.feature != 'gene'
      raise Exception, "Badly parsed apidb line: #{cur}. Expected gene first."
    end

    # save the gene line so its values can be applied once the mRNA line has
    # been checked
    gene_line = cur

    # First mRNA
    cur = read_record
    if cur.nil?
      # The file ended straight after a gene line. Clear the lookahead so a
      # later call returns nil instead of re-reading the dangling gene.
      @next_gff = nil
      raise Exception, "Badly parsed apidb line: end of file. Expected mRNA next."
    end

    if cur.feature != 'mRNA'
      unless ignore_record?(cur)
        raise Exception, "Badly parsed apidb line: #{cur}. Expected mRNA next."
      end

      # skip rRNA type genes because they are not relevant:
      # move forward to the next gene line
      while cur.feature != 'gene'
        cur = read_record
        if cur.nil?
          # we have reached the end on an ignored gene; drop the stale
          # lookahead so that further calls keep returning nil
          @next_gff = nil
          return nil
        end
      end
      @next_gff = cur
      return next_gene
    end

    # Setup the gene in itself
    gene = setup_gene_from_first_line gene_line

    # setup stuff from mRNA line
    ids = cur.attributes['Ontology_term']
    gene.go_identifiers = ids.split ',' if ids

    # Next CDS
    cur = read_record
    expect_feature(cur, 'CDS')
    gene.cds = []
    while cur and cur.feature == 'CDS'
      gene.cds.push location_from(cur)
      cur = read_record
    end

    # next exons
    expect_feature(cur, 'exon')
    gene.exons = []
    while cur and cur.feature == 'exon'
      gene.exons.push location_from(cur)
      cur = read_record
    end

    # whatever follows the exons (or nil at end of file) starts the next gene
    @next_gff = cur

    return gene
  end

  # ignore this line when parsing the file
  def ignore_line?(cur)
    return ['supercontig', 'introgressed_chromosome_region'].include?(cur.feature)
  end

  # Certain things I don't want uploaded, like apicoplast genome, etc.
  # Returns true for a nil record, a record without a seqname, a record whose
  # feature is listed in features_to_ignore, or one whose seqname matches one
  # of the hard-coded apidb prefixes below.
  def ignore_record?(record)
    if !record or !record.seqname or
        @features_to_ignore.include?(record.feature) or
        record.seqname.match(/^apidb\|NC\_/) or
        record.seqname.match(/^apidb\|API_IRAB/) or
        record.seqname.match(/^apidb\|M76611/) or
        record.seqname.match(/^apidb\|X95276/) #or
      #      record.seqname.match(/^apidb\|Pf/)
      return true
    else
      return false
    end
  end

  private

  # Read lines until one that is neither blank nor a comment is found and
  # return it as a EupathDBGFFRecord. Returns nil at end of file or on the
  # ##FASTA marker, which means all the genes have already been defined.
  def read_record
    line = ""

    while line.strip.empty? or line.match(/^\#/)
      line = @file.gets
      return nil if !line or line.match(/^\#\#FASTA/)
    end

    return EupathDBGFFRecord.new(line)
  end

  # Raise Exception unless record is present and has the given feature type.
  # A nil record means the file ended in the middle of a gene.
  def expect_feature(record, feature)
    return if record and record.feature == feature
    @next_gff = nil if record.nil?
    raise Exception, "Badly parsed apidb line: #{record || 'end of file'}. Expected #{feature} next."
  end

  # Build a Bio::Location spanning the start and end columns of a record.
  def location_from(record)
    location = Bio::Location.new
    location.from = record.start
    location.to = record.end
    location
  end

  # Given a line describing a gene in an apidb gff file, setup all the
  # stuff associated with the 'gene' line
  def setup_gene_from_first_line(gene_line)
    gene = PositionedGeneWithOntology.new
    gene.start = gene_line.start
    gene.strand = gene_line.strand
    aliai = gene_line.attributes['Alias']
    if aliai
      aliai.chomp!
      gene.alternate_ids = aliai.split ','
    end

    # make description proper (it is URL-encoded in the file); a gene line
    # without a description simply leaves it unset
    description = gene_line.attributes['description']
    gene.description = CGI::unescape(description) if description

    # name - remove the 'apidb|' bit
    id = gene_line.attributes['ID']
    match = id ? id.match('apidb\|(.*)') : nil
    if !match or !match[1] or match[1] == ''
      raise Exception, "Badly parsed gene name: #{gene_line.attributes['ID']}."
    end
    gene.name = match[1]
    gene.seqname = gene_line.seqname

    return gene
  end
end
|
182
|
+
|
183
|
+
|
184
|
+
|
185
|
+
class EupathDBGFFRecord < JgiGffRecord
  # Parse the attributes column of an apidb GFF line into the @attributes hash.
  #
  # eg. ID=apidb|X95275;Name=X95275;description=Plasmodium+falciparum+complete+gene+map+of+plastid-like+DNA+%28IR-A%29.
  #
  # Attributes are separated by ';' and each is a key=value couple. Only the
  # first '=' separates key from value, so values may themselves contain '='.
  # A missing attributes column (nil) leaves @attributes empty.
  #
  # Raises Exception for a couple that contains no '=' at all.
  def parse_attributes(attributes_string)
    @attributes = Hash.new
    return if attributes_string.nil?

    attributes_string.split(';').each do |couple|
      cs = couple.split('=', 2)
      if cs.length != 2
        raise Exception, "Badly handled attributes bit in api db gff: '#{cs}' from '#{attributes_string}'"
      end

      key, value = cs
      # deal with attributes like 'Note=;' by ignoring them
      # I once found one of these in the yeast genome gff
      next if value.empty?

      @attributes[key] = value
    end
  end
end
|
204
|
+
|
205
|
+
|
206
|
+
class PositionedGeneWithOntology < PositionedGene
  attr_accessor :alternate_ids, :description
  attr_writer :go_identifiers

  # The assigned GO identifiers, sorted and with duplicates removed, or nil
  # when none have been assigned.
  def go_identifiers
    @go_identifiers ? @go_identifiers.sort.uniq : nil
  end
end
|
215
|
+
|
data/lib/jgi_genes.rb
ADDED
@@ -0,0 +1,300 @@
|
|
1
|
+
require 'bio'
|
2
|
+
|
3
|
+
|
4
|
+
|
5
|
+
|
6
|
+
|
7
|
+
|
8
|
+
|
9
|
+
#
|
10
|
+
class JgiGenesGff

  # Opens the JGI GFF file at path and pre-reads its first record, so that
  # next_gene always has one record of lookahead available.
  def initialize(path)
    @jgi_file = File.open(path, "r")
    @next_gff = read_record
  end

  # Return the next gene in the file as a PositionedGene, or nil if none
  # exists. A gene is the run of consecutive records sharing the same 'name'
  # attribute; its first record must be an exon.
  #
  # Raises Exception when the first record of a gene is not an exon, when
  # records of one gene disagree on strand, seqname or source, or when an
  # unknown feature type is found.
  def next_gene
    # Parse the first line into data structures for current gene
    cur = @next_gff
    return nil if !cur

    # Make sure the assumption that the first one is an exon is true
    # I'm not sure if this is detrimental or not, but to be safe..
    if cur.feature != 'exon'
      raise Exception, "Assumption failed: exon is not first feature in the gene"
    end

    seqname = cur.seqname
    strand = cur.strand
    source = cur.source
    name = parse_name(cur.attributes)

    exons = [location_of(cur)]
    cds = []
    protein_id = nil #Unknown until we have a CDS line in the file

    # Continue reading until finished gene or finished file
    finished_gene = false
    while !finished_gene and (cur = read_record)
      if parse_name(cur.attributes) == name
        # still in the same gene
        if cur.strand != strand or cur.seqname != seqname or cur.source != source
          raise Exception, 'Data bug in JGI file or parsing is being done incorrectly'
        end

        case cur.feature
        when 'exon'
          exons.push location_of(cur)
        when 'CDS'
          cds.push location_of(cur)
          protein_id = parse_protein_id(cur.attributes)
        when 'start_codon', 'stop_codon'
          # recognised, but nothing is recorded for these
        else
          raise Exception, "Unknown feature type #{cur.feature} found."
        end
      else
        finished_gene = true
      end
    end

    # make ready for the next gene: cur is the first record of the next gene,
    # or nil at the end of the file
    @next_gff = cur

    # create a new positioned gene with the useful characteristics
    g = PositionedGene.new
    g.seqname = seqname
    g.name = name
    g.strand = strand
    g.start = exons[0].from
    g.exons = exons
    g.cds = cds
    g.protein_id = protein_id
    return g
  end

  # Return a JgiGenesIterator over the genes of this file.
  def distance_iterator
    return JgiGenesIterator.new(self)
  end

  private

  # Read a line from the file, and create the next gff object,
  # or nil if none exists. Blank lines are skipped.
  def read_record
    line = ""

    while line.strip.empty?
      line = @jgi_file.gets
      return nil if !line
    end

    return JgiGffRecord.new(line)
  end

  # Build a Bio::Location spanning the start and end columns of a record.
  def location_of(record)
    location = Bio::Location.new
    location.from = record.start
    location.to = record.end
    location
  end

  # Return the name of the gene, given the attributes hash
  # (double quotes are removed from the 'name' attribute).
  def parse_name(attributes)
    return attributes['name'].gsub('"','')
  end

  # Return the 'proteinId' attribute converted with to_i.
  def parse_protein_id(attributes)
    return attributes['proteinId'].to_i
  end
end
|
129
|
+
|
130
|
+
|
131
|
+
|
132
|
+
# A gene as read from the gff file.
|
133
|
+
# cds and exons are assumed to be in increasing order in terms of their
|
134
|
+
# positions
|
135
|
+
# along the positive strand.
|
136
|
+
class PositionedGene
  attr_accessor :seqname, :name, :strand, :start, :exons, :cds, :protein_id

  # Return the position of the cds start (the 'from' of the first cds), or
  # nil if the gene has no coding regions or cds has not been assigned.
  def cds_start
    first_cds = @cds ? @cds.first : nil
    first_cds ? first_cds.from : nil
  end

  # Return the position of the cds end (the 'to' of the last cds), or nil if
  # the gene has no coding regions or cds has not been assigned.
  def cds_end
    last_cds = @cds ? @cds.last : nil
    last_cds ? last_cds.to : nil
  end

  # True when the strand is '+'.
  def positive_strand?
    @strand == '+'
  end
end
|
160
|
+
|
161
|
+
|
162
|
+
|
163
|
+
|
164
|
+
|
165
|
+
# Fixes up JGI to GFF problems. I don't mean to blame anyone but
|
166
|
+
# they just don't seem to go together
|
167
|
+
class JgiGffRecord < Bio::GFF::Record

  # Zero-based positions of the tab-separated columns of a GFF line
  SEQNAME_COL = 0
  SOURCE_COL = 1
  FEATURE_COL = 2
  START_COL = 3
  END_COL = 4
  SCORE_COL = 5
  STRAND_COL = 6
  FRAME_COL = 7
  ATTRIBUTES_COL = 8

  # Parse a single tab-separated GFF line. The line must have 8 columns, or 9
  # when an attributes column is present; otherwise Exception is raised.
  def initialize(line)
    @line = line

    parts = line.split("\t")
    if parts.length != 9 and parts.length != 8
      raise Exception, "Badly formatted GFF line - doesn't have correct number of components '#{line}'"
    end

    parse_mandatory_columns(parts)
    parse_attributes(parts[ATTRIBUTES_COL])
  end

  # Given an array of 8 strings, parse the columns into something
  # that can be understood by this object. Values are stored as the strings
  # found in the line; no numeric conversion is done here.
  def parse_mandatory_columns(parts)
    @seqname = parts[SEQNAME_COL]
    @source = parts[SOURCE_COL]
    @feature = parts[FEATURE_COL]
    @start = parts[START_COL]
    @end = parts[END_COL]
    @score = parts[SCORE_COL]
    @strand = parts[STRAND_COL]
    @frame = parts[FRAME_COL]
  end

  # Parse the last part of a line into a hash stored in the @attributes
  # instance variable. Attributes are separated by '; ' and each one is a
  # 'key value' pair separated by whitespace; double quotes are removed from
  # the value. A nil attribute_string leaves @attributes empty.
  #
  # Raises Exception when an attribute does not split into exactly two parts.
  def parse_attributes(attribute_string)
    @attributes = Hash.new #define empty attributes even if there are none

    if attribute_string
      attribute_string.split('; ').each do |bit|
        hbits = bit.split ' '
        if hbits.length != 2
          # report the stored line; the local 'line' of initialize is not
          # visible in this method
          raise Exception, "Failed to parse attributes in line: #{@line}"
        end
        @attributes[hbits[0]] = hbits[1].gsub(/\"/, '').strip
      end
    end
  end

  # The original, unparsed line.
  def to_s
    @line
  end
end
|
234
|
+
|
235
|
+
|
236
|
+
|
237
|
+
|
238
|
+
|
239
|
+
class JgiGenesIterator

  # jgiGenesGffObj:: any object whose next_gene method returns successive
  #                  genes and then nil. Two genes are read straight away so
  #                  that the current gene and the one after it are known.
  def initialize(jgiGenesGffObj)
    @genbank = jgiGenesGffObj

    # Setup cycle for iterator
    @cur_gene = @genbank.next_gene
    @next_gene = @genbank.next_gene
    @next_is_first = true
  end

  # True when next_distance can be called again without overrunning.
  # Before the first call that depends on the first gene existing (so a file
  # holding a single gene is still visited); afterwards it depends on there
  # being a gene after the current one.
  def has_next_distance
    upcoming = @next_is_first ? @cur_gene : @next_gene
    return !upcoming.nil?
  end

  # Return the gene that the most recent next_distance call worked on
  # (before any call to next_distance, the first gene).
  def next_gene
    return @cur_gene
  end

  # Advance to the next gene and return its upstream distance: for a gene on
  # the positive strand, its cds start minus the previous gene's cds end; for
  # any other strand, the following gene's cds start minus its cds end.
  # Returns nil when the neighbouring gene does not exist or is on a
  # different seqname.
  #
  # Raises Exception when called after the genes have run out.
  def next_distance
    if @next_is_first
      # the first gene: cycle has already been setup in initialisation
      @next_is_first = false
    else
      # cycle through things
      if !@cur_gene #if nothing is found
        raise Exception, 'Unexpected nil cur_gene - a software coding error?'
      end
      @prev_gene = @cur_gene
      @cur_gene = @next_gene
      @next_gene = @genbank.next_gene
    end

    if !@cur_gene
      raise Exception, 'Overrun iterator - no more genes available. Use has_next_distance'
    end

    # We look at the current gene, and return its upstream distance.
    # Upstream of a positive strand gene is the previous gene; otherwise it
    # is the following gene.
    if @cur_gene.positive_strand?
      # if last gene undefined or on a different scaffold, return nothing
      return nil if !@prev_gene or @prev_gene.seqname != @cur_gene.seqname
      return @cur_gene.cds_start.to_i - @prev_gene.cds_end.to_i
    else
      return nil if !@next_gene or @next_gene.seqname != @cur_gene.seqname
      return @next_gene.cds_start.to_i - @cur_gene.cds_end.to_i
    end
  end

end
|
data/lib/reubypathdb.rb
ADDED
data/reubypathdb.gemspec
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{reubypathdb}
|
8
|
+
s.version = "0.2.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Ben J Woodcroft"]
|
12
|
+
s.date = %q{2011-04-19}
|
13
|
+
s.description = %q{Classes to help parsing EuPathDB data files}
|
14
|
+
s.email = %q{donttrustben near gmail.com}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE",
|
17
|
+
"README.rdoc"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".document",
|
21
|
+
"LICENSE",
|
22
|
+
"README.rdoc",
|
23
|
+
"Rakefile",
|
24
|
+
"VERSION",
|
25
|
+
"lib/eupathdb_gene_information_table.rb",
|
26
|
+
"lib/eupathdb_gff.rb",
|
27
|
+
"lib/jgi_genes.rb",
|
28
|
+
"lib/reubypathdb.rb",
|
29
|
+
"reubypathdb.gemspec",
|
30
|
+
"test/data/eupathGeneInformation.txt",
|
31
|
+
"test/helper.rb",
|
32
|
+
"test/test_eupathdb_gene_information_table.rb"
|
33
|
+
]
|
34
|
+
s.homepage = %q{http://github.com/wwood/reubypathdb}
|
35
|
+
s.require_paths = ["lib"]
|
36
|
+
s.rubygems_version = %q{1.6.2}
|
37
|
+
s.summary = %q{Classes to help parsing EuPathDB data files}
|
38
|
+
s.test_files = [
|
39
|
+
"test/helper.rb",
|
40
|
+
"test/test_eupathdb_gene_information_table.rb"
|
41
|
+
]
|
42
|
+
|
43
|
+
if s.respond_to? :specification_version then
|
44
|
+
s.specification_version = 3
|
45
|
+
|
46
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
47
|
+
s.add_development_dependency(%q<thoughtbot-shoulda>, [">= 0"])
|
48
|
+
else
|
49
|
+
s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
|
50
|
+
end
|
51
|
+
else
|
52
|
+
s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: reubypathdb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
5
|
-
prerelease:
|
4
|
+
hash: 23
|
5
|
+
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
8
|
+
- 2
|
9
9
|
- 0
|
10
|
-
version: 0.
|
10
|
+
version: 0.2.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Ben J Woodcroft
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-04-19 00:00:00 +10:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -43,12 +43,15 @@ extra_rdoc_files:
|
|
43
43
|
- README.rdoc
|
44
44
|
files:
|
45
45
|
- .document
|
46
|
-
- .gitignore
|
47
46
|
- LICENSE
|
48
47
|
- README.rdoc
|
49
48
|
- Rakefile
|
50
49
|
- VERSION
|
51
50
|
- lib/eupathdb_gene_information_table.rb
|
51
|
+
- lib/eupathdb_gff.rb
|
52
|
+
- lib/jgi_genes.rb
|
53
|
+
- lib/reubypathdb.rb
|
54
|
+
- reubypathdb.gemspec
|
52
55
|
- test/data/eupathGeneInformation.txt
|
53
56
|
- test/helper.rb
|
54
57
|
- test/test_eupathdb_gene_information_table.rb
|
@@ -57,8 +60,8 @@ homepage: http://github.com/wwood/reubypathdb
|
|
57
60
|
licenses: []
|
58
61
|
|
59
62
|
post_install_message:
|
60
|
-
rdoc_options:
|
61
|
-
|
63
|
+
rdoc_options: []
|
64
|
+
|
62
65
|
require_paths:
|
63
66
|
- lib
|
64
67
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -82,7 +85,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
82
85
|
requirements: []
|
83
86
|
|
84
87
|
rubyforge_project:
|
85
|
-
rubygems_version: 1.
|
88
|
+
rubygems_version: 1.6.2
|
86
89
|
signing_key:
|
87
90
|
specification_version: 3
|
88
91
|
summary: Classes to help parsing EuPathDB data files
|