reubypathdb 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/eupathdb_gene_information_table.rb +67 -12
- data/lib/eupathdb_gff.rb +215 -0
- data/lib/jgi_genes.rb +300 -0
- data/lib/reubypathdb.rb +2 -0
- data/reubypathdb.gemspec +55 -0
- metadata +12 -9
- data/.gitignore +0 -21
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.0
|
@@ -1,29 +1,80 @@
|
|
1
|
+
# Code for interacting with EuPathDB gene information files e.g. http://cryptodb.org/common/downloads/release-4.3/Cmuris/txt/CmurisGene_CryptoDB-4.3.txt
|
2
|
+
# These gene information files contain a large amount of information about individual genes/proteins in EuPathDBs.
|
1
3
|
|
4
|
+
require 'tempfile'
|
5
|
+
|
6
|
+
# A class for extracting gene info from a particular gene from the information file
class EuPathDBGeneInformationFileExtractor
  # A filename path to the gene information file
  attr_accessor :filename

  # filename is the path to the gene information file. It may be left nil
  # here and assigned later through #filename=.
  def initialize(filename = nil)
    @filename = filename
  end

  # Returns a EuPathDBGeneInformation object corresponding to the wanted key. If
  # there are multiple in the file, only the first is returned. If none are found, nil is returned.
  #
  # If grep_hack_lines is defined (as a non-zero integer), then a shortcut is
  # applied to speed things up. Before parsing the gene info file, grep that
  # many lines after the "Gene Id: .." line into a temporary file, then feed
  # only that into the parser.
  #
  # Raises Exception when grep_hack_lines is given, converts to a non-zero
  # number, but is not an Integer.
  def extract_gene_info(wanted_gene_id, grep_hack_lines = nil)
    unless grep_hack_lines and grep_hack_lines.to_i != 0
      # no grep hack. Parse the whole gene information file
      return first_matching_gene(@filename, wanted_gene_id)
    end

    raise Exception, "grep_hack_lines should be an integer" unless grep_hack_lines.is_a?(Integer)

    # grep however many lines from past the point. Rather dodgy, but faster.
    tempfile = Tempfile.new('reubypathdb_grep_hack')
    begin
      tempfile.close
      # Arguments are passed as a list so that neither the gene id nor the
      # filename is ever interpreted by a shell.
      system('grep', '-A', grep_hack_lines.to_s,
             '-e', "Gene Id: #{wanted_gene_id}",
             '--', @filename.to_s,
             :out => tempfile.path)
      # The table parser reads a line before checking for end of file, so an
      # empty grep result has to be handled here: it means "not found".
      return nil if File.zero?(tempfile.path)
      first_matching_gene(tempfile.path, wanted_gene_id)
    ensure
      # close and delete the temporary file whatever happened above
      tempfile.close!
    end
  end

  private

  # Parse the gene information file at path and return the first gene whose
  # 'Gene Id' equals wanted_gene_id, or nil when there is none. The file is
  # closed again before returning.
  def first_matching_gene(path, wanted_gene_id)
    File.open(path) do |io|
      EuPathDBGeneInformationTable.new(io).each do |gene|
        return gene if wanted_gene_id == gene.get_info('Gene Id')
      end
    end
    nil
  end
end
|
43
|
+
|
44
|
+
# A class for parsing the 'gene information table' files from EuPathDB, such
|
45
|
+
# as http://cryptodb.org/common/downloads/release-4.3/Cmuris/txt/CmurisGene_CryptoDB-4.3.txt
|
46
|
+
#
|
47
|
+
# The usual way of interacting with these is the use of the each method,
|
48
|
+
# which returns a EuPathDBGeneInformation object with all of the recorded
|
49
|
+
# information in it.
|
2
50
|
class EuPathDBGeneInformationTable
|
3
51
|
include Enumerable
|
4
|
-
|
52
|
+
|
53
|
+
# Initialise using an IO object, say File.open('/path/to/CmurisGene_CryptoDB-4.3.txt'). After opening, the #each method can be used to iterate over the genes that are present in the file
|
5
54
|
def initialize(io)
|
6
55
|
@io = io
|
7
56
|
end
|
8
|
-
|
57
|
+
|
58
|
+
# Return a EuPathDBGeneInformation object with
|
59
|
+
# the contained info in it, one at a time
|
9
60
|
def each
|
10
61
|
while g = next_gene
|
11
62
|
yield g
|
12
63
|
end
|
13
64
|
end
|
14
|
-
|
65
|
+
|
15
66
|
# Returns a EuPathDBGeneInformation object with all the data you could
|
16
67
|
# possibly want.
|
17
68
|
def next_gene
|
18
69
|
info = EuPathDBGeneInformation.new
|
19
|
-
|
70
|
+
|
20
71
|
# first, read the table, which should start with the ID column
|
21
72
|
line = @io.readline.strip
|
22
73
|
while line == ''
|
23
74
|
return nil if @io.eof?
|
24
75
|
line = @io.readline.strip
|
25
76
|
end
|
26
|
-
|
77
|
+
|
27
78
|
while line != ''
|
28
79
|
if matches = line.match(/^(.*?)\: (.*)$/)
|
29
80
|
info.add_information(matches[1], matches[2])
|
@@ -33,7 +84,7 @@ class EuPathDBGeneInformationTable
|
|
33
84
|
|
34
85
|
line = @io.readline.strip
|
35
86
|
end
|
36
|
-
|
87
|
+
|
37
88
|
# now read each of the tables, which should start with the
|
38
89
|
# 'TABLE: <name>' entry
|
39
90
|
line = @io.readline.strip
|
@@ -44,7 +95,7 @@ class EuPathDBGeneInformationTable
|
|
44
95
|
if line == ''
|
45
96
|
# add it to the stack unless we are just starting out
|
46
97
|
info.add_table(table_name, headers, data) unless table_name.nil?
|
47
|
-
|
98
|
+
|
48
99
|
# reset things
|
49
100
|
table_name = nil
|
50
101
|
headers = nil
|
@@ -63,32 +114,36 @@ class EuPathDBGeneInformationTable
|
|
63
114
|
end
|
64
115
|
line = @io.readline.strip
|
65
116
|
end
|
66
|
-
|
117
|
+
|
67
118
|
# return the object that has been created
|
68
119
|
return info
|
69
120
|
end
|
70
121
|
end
|
71
122
|
|
123
|
+
# Each gene in the gene information table is represented
|
124
|
+
# by 2 types of information - info and tables.
|
125
|
+
# info are 1 line data, whereas tables are tables of
|
126
|
+
# data with possibly multiple rows
|
72
127
|
class EuPathDBGeneInformation
|
73
128
|
def info
|
74
129
|
@info
|
75
130
|
end
|
76
|
-
|
131
|
+
|
77
132
|
def get_info(key)
|
78
133
|
@info[key]
|
79
134
|
end
|
80
135
|
alias_method :[], :get_info
|
81
|
-
|
136
|
+
|
82
137
|
def get_table(table_name)
|
83
138
|
@tables[table_name]
|
84
139
|
end
|
85
|
-
|
140
|
+
|
86
141
|
def add_information(key, value)
|
87
142
|
@info ||= {}
|
88
143
|
@info[key] = value
|
89
144
|
"Added info #{key}, now is #{@info[key]}"
|
90
145
|
end
|
91
|
-
|
146
|
+
|
92
147
|
def add_table(name, headers, data)
|
93
148
|
@tables ||= {}
|
94
149
|
@tables[name] = []
|
data/lib/eupathdb_gff.rb
ADDED
@@ -0,0 +1,215 @@
|
|
1
|
+
|
2
|
+
require 'rubygems'
|
3
|
+
require 'bio'
|
4
|
+
require 'jgi_genes'
|
5
|
+
require 'cgi'
|
6
|
+
|
7
|
+
# Unlike JGI genes files, ApiDB files have several differences:
# - genes on the reverse strand appear in order of their exons, and so
# the exons are not all in the correct order with respect to the underlying
# sequence.
#
# Records are expected in the order gene, mRNA, one or more CDS, one or
# more exon. Anything else raises an Exception.
class EupathDBGFF < JgiGenesGff
  # Array of feature names; records with one of these features are skipped
  attr_accessor :features_to_ignore

  # Open the GFF file at path and read ahead to the first record.
  # The parent initializer is deliberately not called: this class keeps its
  # own file handle and has its own read_record.
  def initialize(path)
    @file = File.open path, 'r'
    @next_gff = read_record
    @features_to_ignore = [
      'rRNA',
      'tRNA',
      'snRNA',
      'transcript'
    ]
  end

  # Return the next gene in the file as a PositionedGeneWithOntology, or nil
  # when there are no more genes. Genes whose first sub-feature is ignorable
  # (see ignore_record?) are skipped.
  #
  # Raises Exception when the records do not follow the expected
  # gene / mRNA / CDS / exon order, including when the file ends part way
  # through a gene.
  def next_gene
    cur = @next_gff
    return nil if !cur

    # Ignore the supercontigs at the start of the file
    while ignore_line?(cur) or ignore_record?(cur)
      @next_gff = read_record
      cur = @next_gff
      return nil if !cur
    end

    raise_badly_parsed(cur, 'gene first') if cur.feature != 'gene'

    # save the gene line so the gene can be set up from it later, once the
    # following line has shown that this gene is not one to be skipped
    gene_line = cur

    # First mRNA
    cur = read_record
    if cur.nil? or cur.feature != 'mRNA'
      # skip rRNA type genes because they are not relevant
      if cur.nil? or !ignore_record?(cur)
        raise_badly_parsed(cur, 'mRNA next')
      end

      # skip forward to the next gene
      while cur.feature != 'gene'
        cur = read_record
        if cur.nil?
          # we have reached the end on an ignored gene; forget the stale
          # gene line so that later calls keep returning nil
          @next_gff = nil
          return nil
        end
      end
      @next_gff = cur
      return next_gene
    end

    # Setup the gene in itself
    gene = setup_gene_from_first_line gene_line

    # setup stuff from mRNA line
    ids = cur.attributes['Ontology_term']
    gene.go_identifiers = ids.split(',') if ids

    # Next CDS
    cur = read_record
    raise_badly_parsed(cur, 'CDS next') if cur.nil? or cur.feature != 'CDS'
    gene.cds = []
    while cur and cur.feature == 'CDS'
      gene.cds.push location_for(cur)
      cur = read_record
    end

    # next exons
    raise_badly_parsed(cur, 'exon next') if cur.nil? or cur.feature != 'exon'
    gene.exons = []
    while cur and cur.feature == 'exon'
      gene.exons.push location_for(cur)
      cur = read_record
    end

    # whatever followed the last exon (or nil) starts the next call
    @next_gff = cur

    return gene
  end

  # ignore this line when parsing the file
  def ignore_line?(cur)
    return ['supercontig', 'introgressed_chromosome_region'].include?(cur.feature)
  end

  # Certain things I don't want uploaded, like apicoplast genome, etc.
  # True for a nil record, a record without a seqname, a record whose
  # feature is listed in features_to_ignore, or a record on one of the
  # sequences matched below.
  def ignore_record?(record)
    return true if !record or !record.seqname
    return true if @features_to_ignore.include?(record.feature)

    # (a pattern for /^apidb\|Pf/ used to be here but is disabled)
    [
      /^apidb\|NC\_/,
      /^apidb\|API_IRAB/,
      /^apidb\|M76611/,
      /^apidb\|X95276/
    ].any? { |pattern| record.seqname.match(pattern) }
  end

  private

  # Read lines until one that is neither blank nor a comment, and return it
  # as a EupathDBGFFRecord. Returns nil at end of file or at the ##FASTA
  # marker, which means all the genes have already been defined.
  def read_record
    line = ""

    while line.lstrip.rstrip.empty? or line.match(/^\#/)
      line = @file.gets
      return nil if !line or line.match(/^\#\#FASTA/)
    end

    return EupathDBGFFRecord.new(line)
  end

  # Build a Bio::Location spanning the start and end of the given record
  def location_for(record)
    location = Bio::Location.new
    location.from = record.start
    location.to = record.end
    location
  end

  # Raise the standard parse failure. record may be nil when the file ended
  # where another record was expected.
  def raise_badly_parsed(record, expected)
    shown = record.nil? ? 'end of file' : record
    raise Exception, "Badly parsed apidb line: #{shown}. Expected #{expected}."
  end

  # Given a line describing a gene in an apidb gff file, setup all the
  # stuff associated with the 'gene' line. Raises Exception when the ID
  # attribute is missing or has nothing after the 'apidb|' prefix.
  def setup_gene_from_first_line(gene_line)
    gene = PositionedGeneWithOntology.new
    gene.start = gene_line.start
    gene.strand = gene_line.strand

    aliai = gene_line.attributes['Alias']
    gene.alternate_ids = aliai.chomp.split(',') if aliai

    # make description proper: it is URL-encoded in the file
    description = gene_line.attributes['description']
    gene.description = CGI::unescape(description) if description

    # name - remove the 'apidb|' bit
    id = gene_line.attributes['ID']
    match = id && id.match(/apidb\|(.*)/)
    if !match or !match[1] or match[1] === ''
      raise Exception, "Badly parsed gene name: #{id}."
    end
    gene.name = match[1]
    gene.seqname = gene_line.seqname

    return gene
  end
end
|
182
|
+
|
183
|
+
|
184
|
+
|
185
|
+
# A GFF record whose attribute column uses the 'key=value;key=value' form.
class EupathDBGFFRecord < JgiGffRecord
  # Parse the attribute column into the @attributes hash.
  # eg. ID=apidb|X95275;Name=X95275;description=Plasmodium+falciparum+complete+gene+map+of+plastid-like+DNA+%28IR-A%29.
  #
  # attributes_string may be nil (a line with no attribute column), in which
  # case @attributes is left empty. A line terminator at the end of the
  # string is dropped so it does not end up inside the last value. Only the
  # first '=' of each part separates key from value.
  #
  # Raises Exception for a part that contains no '=' at all.
  def parse_attributes(attributes_string)
    @attributes = Hash.new
    return if attributes_string.nil?

    attributes_string.chomp.split(';').each do |couple|
      key, value = couple.split('=', 2)
      if value.nil?
        raise Exception, "Badly handled attributes bit in api db gff: '#{couple}' from '#{attributes_string}'"
      end
      # deal with attributes like 'Note=;' by ignoring them
      # I once found one of these in the yeast genome gff
      next if value.empty?
      @attributes[key] = value
    end
  end
end
|
204
|
+
|
205
|
+
|
206
|
+
# A PositionedGene that additionally carries alternate ids, a description
# and a list of ontology identifiers.
class PositionedGeneWithOntology < PositionedGene
  attr_accessor :alternate_ids, :description
  attr_writer :go_identifiers

  # The assigned identifiers, sorted and with duplicates removed, or nil
  # when none have been assigned.
  def go_identifiers
    @go_identifiers ? @go_identifiers.sort.uniq : nil
  end
end
|
215
|
+
|
data/lib/jgi_genes.rb
ADDED
@@ -0,0 +1,300 @@
|
|
1
|
+
require 'bio'
|
2
|
+
|
3
|
+
|
4
|
+
|
5
|
+
|
6
|
+
|
7
|
+
|
8
|
+
|
9
|
+
#
|
10
|
+
# Reads genes one at a time from a JGI-style GFF file, grouping consecutive
# records that share the same 'name' attribute into a single gene.
class JgiGenesGff
  # Open the file at path and read ahead to the first record
  def initialize(path)
    @jgi_file = File.open(path, "r")
    @next_gff = read_record
  end

  # Return the next gene as a PositionedGene, or nil if none exists.
  # Raises Exception if the gene does not start with an exon record, if its
  # records disagree on strand, seqname or source, or if an unknown feature
  # type is found.
  def next_gene
    first = @next_gff
    return nil unless first

    # Make sure the assumption that the first one is an exon is true
    unless first.feature === 'exon'
      p first.feature
      # I'm not sure if this is detrimental or not, but to be safe..
      raise Exception, "Assumption failed: exon is not first feature in the gene"
    end

    seqname = first.seqname
    strand = first.strand
    source = first.source
    name = parse_name(first.attributes)

    exons = [span_of(first)]
    cds = []
    protein_id = nil # Unknown until we have a CDS line in the file

    # Continue reading until finished gene or finished file
    upcoming = nil
    while (upcoming = read_record)
      # a different name means the next gene has started
      break unless parse_name(upcoming.attributes) === name

      if upcoming.strand != strand or upcoming.seqname != seqname or upcoming.source != source
        puts "EXCEPTION !!!!!!!!!!!!!!!!!!!"
        raise Exception, 'Data bug in JGI file or parsing is being done incorrectly'
      end

      span = span_of(upcoming)
      case upcoming.feature
      when 'exon'
        exons.push span
      when 'CDS'
        cds.push span
        protein_id = parse_protein_id(upcoming.attributes)
      when 'start_codon', 'stop_codon'
        # recognised, but nothing is recorded for these
      else
        puts "EXCEPTION !!!!!!!!!!!!!!!!!!!"
        raise Exception, "Unknown feature type #{upcoming.feature} found."
      end
    end

    # make ready for the next gene (nil when the file is finished)
    @next_gff = upcoming

    # create a new positioned gene with the useful characteristics
    gene = PositionedGene.new
    gene.seqname = seqname
    gene.name = name
    gene.strand = strand
    gene.start = exons[0].from
    gene.exons = exons
    gene.cds = cds
    gene.protein_id = protein_id
    gene
  end

  # Return a JgiGenesIterator that walks over the genes of this file
  def distance_iterator
    JgiGenesIterator.new(self)
  end

  private

  # Read a line from the file, and create the next gff object,
  # or nil if none exists. Blank lines are skipped.
  def read_record
    loop do
      line = @jgi_file.gets
      return nil unless line
      return JgiGffRecord.new(line) unless line.lstrip.rstrip.empty?
    end
  end

  # Build a Bio::Location covering the start and end of the record
  def span_of(record)
    span = Bio::Location.new
    span.from = record.start
    span.to = record.end
    span
  end

  # Return the name of the gene, given the attributes hash
  def parse_name(attributes)
    attributes['name'].gsub('"', '')
  end

  # Return the 'proteinId' attribute converted to an integer
  def parse_protein_id(attributes)
    attributes['proteinId'].to_i
  end
end
|
129
|
+
|
130
|
+
|
131
|
+
|
132
|
+
# A gene as read from the gff file.
# cds and exons are assumed to be in increasing order in terms of their
# positions
# along the positive strand.
class PositionedGene
  attr_accessor :seqname, :name, :strand, :start, :exons, :cds, :protein_id

  # Return the position of the cds start, that is the 'from' of the first
  # cds entry. Returns nil if the gene has no coding regions, including when
  # cds has never been assigned.
  def cds_start
    first_cds = @cds && @cds.first
    first_cds && first_cds.from
  end

  # Return the position of the cds end, that is the 'to' of the last cds
  # entry. Returns nil if the gene has no coding regions, including when
  # cds has never been assigned.
  def cds_end
    last_cds = @cds && @cds.last
    last_cds && last_cds.to
  end

  # True when the strand is '+'
  def positive_strand?
    @strand == '+'
  end
end
|
160
|
+
|
161
|
+
|
162
|
+
|
163
|
+
|
164
|
+
|
165
|
+
# Fixes up JGI to GFF problems. I don't mean to blame anyone but
# they just don't seem to go together
#
# A record is built from one tab-separated line of 8 or 9 columns. The
# optional ninth column holds attributes written as 'key value; key value'.
class JgiGffRecord < Bio::GFF::Record

  # indexes of the columns within a tab-separated line
  SEQNAME_COL = 0
  SOURCE_COL = 1
  FEATURE_COL = 2
  START_COL = 3
  END_COL = 4
  SCORE_COL = 5
  STRAND_COL = 6
  FRAME_COL = 7
  ATTRIBUTES_COL = 8

  # Parse one line of the file. Raises Exception unless the line has 8 or 9
  # tab-separated columns, or if its attributes cannot be parsed.
  def initialize(line)
    # kept so that to_s and error messages can show the original text
    @line = line

    parts = line.split("\t")
    if parts.length != 9 and parts.length != 8
      raise Exception, "Badly formatted GFF line - doesn't have correct number of components '#{line}"
    end

    parse_mandatory_columns(parts)

    # nil when the line only has 8 columns
    parse_attributes(parts[ATTRIBUTES_COL])
  end

  # Given an array of 8 strings, parse the columns into something
  # that can be understood by this object. Values are stored as the
  # strings found in the line.
  def parse_mandatory_columns(parts)
    @seqname = parts[SEQNAME_COL]
    @source = parts[SOURCE_COL]
    @feature = parts[FEATURE_COL]
    @start = parts[START_COL]
    @end = parts[END_COL]
    @score = parts[SCORE_COL]
    @strand = parts[STRAND_COL]
    @frame = parts[FRAME_COL]
  end

  # parse the last part of a line into a hash stored in the @attributes
  # instance variable. Each '; '-separated part must consist of exactly two
  # space-separated words, key then value; double quotes and surrounding
  # whitespace are removed from the value. Raises Exception otherwise.
  def parse_attributes(attribute_string)
    @attributes = Hash.new #define empty attributes even if there are none

    if attribute_string
      #let the fancy parsing begin
      aparts = attribute_string.split '; '

      aparts.each do |bit|
        hbits = bit.split ' '
        if !hbits or hbits.length != 2
          # the original line is only available through @line here
          raise Exception, "Failed to parse attributes in line: #{@line}"
        end
        str = hbits[1].gsub(/\"/, '').rstrip.lstrip
        @attributes[hbits[0]] = str
      end
    end
  end

  # The line this record was created from, unchanged
  def to_s
    @line
  end
end
|
234
|
+
|
235
|
+
|
236
|
+
|
237
|
+
|
238
|
+
|
239
|
+
# Walks over the genes supplied by an object that responds to next_gene
# (returning nil when exhausted), giving the upstream distance of each gene
# in turn.
#
# Typical use:
#   while iterator.has_next_distance
#     distance = iterator.next_distance
#     gene = iterator.next_gene
#   end
class JgiGenesIterator

  def initialize(jgiGenesGffObj)
    @genbank = jgiGenesGffObj

    # Setup cycle for iterator
    @cur_gene = @genbank.next_gene
    @next_gene = @genbank.next_gene
    @next_is_first = true
  end

  # True when next_distance can be called again without overrunning.
  # Before the first call that depends on there being any gene at all, so
  # that a source holding a single gene is still visited; afterwards it
  # depends on whether another gene follows the current one.
  def has_next_distance
    return !@cur_gene.nil? if @next_is_first
    return !@next_gene.nil?
  end

  # Return the gene currently being worked on, that is the gene the most
  # recent next_distance call was about (or the first gene, before any call)
  def next_gene
    return @cur_gene
  end

  # Advance to the next gene and return its upstream distance: for a gene on
  # the positive strand its cds start minus the previous gene's cds end, for
  # any other gene the following gene's cds start minus its own cds end.
  # Returns nil when the neighbouring gene does not exist or has a different
  # seqname.
  #
  # Raises Exception when called after the genes have run out.
  def next_distance
    if @next_is_first
      # cycle has already been setup in initialisation
      @next_is_first = false
    else
      # cycle through things
      if !@cur_gene #if nothing is found
        raise Exception, 'Unexpected nil cur_gene - a software coding error?'
      end
      @prev_gene = @cur_gene
      @cur_gene = @next_gene
      @next_gene = @genbank.next_gene
    end

    if !@cur_gene
      raise Exception, 'Overrun iterator - no more genes available. Use has_next_distance'
    end

    # We look at the current gene, and return its upstream distance
    if @cur_gene.positive_strand?
      # so we want the distance between cur and last then

      # if last gene undefined or on a different scaffold, return nothing
      if !@prev_gene or @prev_gene.seqname != @cur_gene.seqname
        return nil
      end
      return @cur_gene.cds_start.to_i - @prev_gene.cds_end.to_i
    else
      # upstream of a reverse strand gene is towards the following gene
      if !@next_gene or @next_gene.seqname != @cur_gene.seqname
        return nil
      end
      return @next_gene.cds_start.to_i - @cur_gene.cds_end.to_i
    end
  end
end
|
data/lib/reubypathdb.rb
ADDED
data/reubypathdb.gemspec
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{reubypathdb}
|
8
|
+
s.version = "0.2.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Ben J Woodcroft"]
|
12
|
+
s.date = %q{2011-04-19}
|
13
|
+
s.description = %q{Classes to help parsing EuPathDB data files}
|
14
|
+
s.email = %q{donttrustben near gmail.com}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE",
|
17
|
+
"README.rdoc"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".document",
|
21
|
+
"LICENSE",
|
22
|
+
"README.rdoc",
|
23
|
+
"Rakefile",
|
24
|
+
"VERSION",
|
25
|
+
"lib/eupathdb_gene_information_table.rb",
|
26
|
+
"lib/eupathdb_gff.rb",
|
27
|
+
"lib/jgi_genes.rb",
|
28
|
+
"lib/reubypathdb.rb",
|
29
|
+
"reubypathdb.gemspec",
|
30
|
+
"test/data/eupathGeneInformation.txt",
|
31
|
+
"test/helper.rb",
|
32
|
+
"test/test_eupathdb_gene_information_table.rb"
|
33
|
+
]
|
34
|
+
s.homepage = %q{http://github.com/wwood/reubypathdb}
|
35
|
+
s.require_paths = ["lib"]
|
36
|
+
s.rubygems_version = %q{1.6.2}
|
37
|
+
s.summary = %q{Classes to help parsing EuPathDB data files}
|
38
|
+
s.test_files = [
|
39
|
+
"test/helper.rb",
|
40
|
+
"test/test_eupathdb_gene_information_table.rb"
|
41
|
+
]
|
42
|
+
|
43
|
+
if s.respond_to? :specification_version then
|
44
|
+
s.specification_version = 3
|
45
|
+
|
46
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
47
|
+
s.add_development_dependency(%q<thoughtbot-shoulda>, [">= 0"])
|
48
|
+
else
|
49
|
+
s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
|
50
|
+
end
|
51
|
+
else
|
52
|
+
s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: reubypathdb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
5
|
-
prerelease:
|
4
|
+
hash: 23
|
5
|
+
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
8
|
+
- 2
|
9
9
|
- 0
|
10
|
-
version: 0.
|
10
|
+
version: 0.2.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Ben J Woodcroft
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-04-19 00:00:00 +10:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -43,12 +43,15 @@ extra_rdoc_files:
|
|
43
43
|
- README.rdoc
|
44
44
|
files:
|
45
45
|
- .document
|
46
|
-
- .gitignore
|
47
46
|
- LICENSE
|
48
47
|
- README.rdoc
|
49
48
|
- Rakefile
|
50
49
|
- VERSION
|
51
50
|
- lib/eupathdb_gene_information_table.rb
|
51
|
+
- lib/eupathdb_gff.rb
|
52
|
+
- lib/jgi_genes.rb
|
53
|
+
- lib/reubypathdb.rb
|
54
|
+
- reubypathdb.gemspec
|
52
55
|
- test/data/eupathGeneInformation.txt
|
53
56
|
- test/helper.rb
|
54
57
|
- test/test_eupathdb_gene_information_table.rb
|
@@ -57,8 +60,8 @@ homepage: http://github.com/wwood/reubypathdb
|
|
57
60
|
licenses: []
|
58
61
|
|
59
62
|
post_install_message:
|
60
|
-
rdoc_options:
|
61
|
-
|
63
|
+
rdoc_options: []
|
64
|
+
|
62
65
|
require_paths:
|
63
66
|
- lib
|
64
67
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -82,7 +85,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
82
85
|
requirements: []
|
83
86
|
|
84
87
|
rubyforge_project:
|
85
|
-
rubygems_version: 1.
|
88
|
+
rubygems_version: 1.6.2
|
86
89
|
signing_key:
|
87
90
|
specification_version: 3
|
88
91
|
summary: Classes to help parsing EuPathDB data files
|