qu-mfeindex 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +1 -0
- data/bin/mfeindex +128 -0
- data/lib/qu/mfeindex.rb +166 -0
- data/lib/qu/mfeindex/data.rb +24 -0
- data/lib/qu/mfeindex/version.rb +5 -0
- data/lib/qu/pymfeindex +283 -0
- data/qu-mfeindex.gemspec +26 -0
- metadata +112 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 078d0efd0389e0fb00ddb3434b227f57a329960e
|
4
|
+
data.tar.gz: 3e899029902dd46daec7ec7ba7341c7ea04f19a0
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 0196c1634bdfc2455fce27addf74687745bdb9106894d1ed84780800cabf64fc89832cf9c28f97f24e4c6be88a027a4605263bb7545c2737772bd0fbbe051ba5
|
7
|
+
data.tar.gz: 84950a7f7cc5003a464d8dcc53a3a712a56748960af70abd70b35956a00a9dd84430698002b80aac5804f9d837e40dddbf3a54d8a4bc0c5d949606c6560130bf
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Wubin Qu
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Qu::Mfeindex
|
2
|
+
|
3
|
+
DNA sequence indexer originally developed for MFEprimer-2.0
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'qu-mfeindex'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install qu-mfeindex
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
`mfeindex fasta_file [kvalue]`
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/bin/mfeindex
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'qu/mfeindex'
|
4
|
+
|
5
|
+
# Lists the chunk files already present in +big_db_dir+ for +big_db+.
#
# Chunks are named "<basename>.<index>". The index may have any number of
# digits, and the basename is escaped so that characters such as "." are
# matched literally. Paths are returned in ascending chunk order.
def split_db_existing_chunks(big_db, big_db_dir)
  pattern = /\A#{Regexp.escape(File.basename(big_db))}\.\d+\z/
  names = Dir.foreach(big_db_dir).select { |name| name =~ pattern }
  names.sort_by { |name| name[/\d+\z/].to_i }.map { |name| File.join(big_db_dir, name) }
end

# Asks on stdout/stdin whether an already split database should be split
# again. An empty first answer and end-of-input both mean "no".
def split_db_resplit?(big_db)
  $stdout.print "#{big_db} has been splited, do you want to resplit it? [y/N]:"
  first_try = true
  loop do
    line = $stdin.gets
    return false if line.nil? # EOF: keep the existing chunks

    answer = line.chomp.downcase
    answer = 'n' if first_try && answer.empty?
    return answer == 'y' if %w[y n].include?(answer)

    first_try = false
    $stdout.print "The choice should be 'y' or 'n':"
  end
end

# Splits the FASTA file +big_db+ into chunk files inside its "big db"
# directory (created when missing) and returns the list of chunk paths.
#
# A new chunk is started at a record header (a line starting with ">") once
# the current chunk has reached Qu::Mfeindex::BIG_DB_SPLIT_CUTOFF bytes, so a
# record is never divided between two chunks. When the existing chunks already
# add up to the size of +big_db+, the user is asked whether to split again;
# answering "no" returns the existing chunks untouched.
def split_db(big_db)
  big_db_dir = Qu::Mfeindex.get_big_db_dir(big_db)
  Dir.mkdir(big_db_dir) unless File.directory?(big_db_dir)

  existing = split_db_existing_chunks(big_db, big_db_dir)
  existing_size = existing.inject(0) { |sum, path| sum + File.size(path) }
  return existing if existing_size >= File.size(big_db) && !split_db_resplit?(big_db)

  chunk_paths = []
  open_chunk = lambda do
    path = File.join(big_db_dir, "#{File.basename(big_db)}.#{chunk_paths.size}")
    chunk_paths << path
    File.open(path, 'w')
  end

  # The first chunk is always created, even for an empty input file.
  out = open_chunk.call
  begin
    File.foreach(big_db) do |line|
      if line.start_with?('>') && out.size >= Qu::Mfeindex::BIG_DB_SPLIT_CUTOFF
        out.close
        out = open_chunk.call
      end
      out.write(line)
    end
  ensure
    out.close unless out.closed?
  end

  chunk_paths
end
|
54
|
+
|
55
|
+
|
56
|
+
# Reads a yes/no answer for the reindex question from stdin.
#
# An empty first answer and end-of-input both mean "no"; anything other than
# "y"/"n" triggers a new prompt.
def index_db_reindex?
  first_try = true
  loop do
    line = $stdin.gets
    return false if line.nil? # EOF: fall back to the default answer

    answer = line.chomp.downcase
    answer = 'n' if first_try && answer.empty?
    return answer == 'y' if %w[y n].include?(answer)

    first_try = false
    $stdout.print "The choice should be 'y' or 'n':"
  end
end

# Indexes +fasta_file+ with k-mer size +kvalue+.
#
# A database that has not been indexed yet is always indexed. For an already
# indexed database the user is asked whether to reindex; the answer is passed
# on as the +reindex+ flag of Qu::Mfeindex.MFEprimerIndex.
def index_db(fasta_file, kvalue)
  reindex = true
  if Qu::Mfeindex.db_indexed?(fasta_file)
    $stdout.print "#{fasta_file} has already been indexed, do you want to reindex? [y/N]:"
    reindex = index_db_reindex?
  end

  Qu::Mfeindex.MFEprimerIndex(fasta_file, kvalue, reindex)
end
|
75
|
+
|
76
|
+
|
77
|
+
# Command-line entry point: validate the arguments, optionally split an
# oversized FASTA file into chunks, then index the database (or every chunk).
usage = "Index database for MFEprimer-2.0

Usage:

#{File.basename($0)} fasta_file [kvalue]


Options:

kvalue: Default is 9 [Integer].

Contact: Wubin Qu <quwubin@gmail.com>
"

fasta_file = ARGV[0]
kvalue =
  case ARGV.size
  when 1
    9
  when 2
    begin
      # Integer() rejects non-numeric input instead of silently yielding 0.
      Integer(ARGV[1], 10)
    rescue ArgumentError
      nil
    end
  end

# Wrong argument count or an unusable k-mer size: show the usage and fail.
if kvalue.nil? || kvalue < 1
  $stderr.puts usage
  exit 1
end

unless File.exist?(fasta_file)
  $stderr.puts "Error: #{fasta_file} is not exitst.\n"
  $stderr.puts usage
  exit 1
end

split_first = false
if File.size(fasta_file) > Qu::Mfeindex::BIG_DB_SPLIT_CUTOFF
  $stdout.print "#{fasta_file} is too large, do you want to split it first? [Y/n]:"
  first_try = true
  loop do
    line = $stdin.gets
    if line.nil?
      # EOF: use the default answer (split).
      split_first = true
      break
    end

    answer = line.chomp.downcase
    answer = 'y' if first_try && answer.empty?
    if %w[y n].include?(answer)
      split_first = (answer == 'y')
      break
    end

    first_try = false
    $stdout.print "The choice should be 'y' or 'n':"
  end
end

targets = split_first ? split_db(fasta_file) : [fasta_file]
targets.each { |path| index_db(path, kvalue) }
|
data/lib/qu/mfeindex.rb
ADDED
@@ -0,0 +1,166 @@
|
|
1
|
+
require 'qu/utils'
|
2
|
+
require 'qu/cmdwrapper'
|
3
|
+
|
4
|
+
require_relative "mfeindex/data"
|
5
|
+
require_relative "mfeindex/version"
|
6
|
+
require 'json'
|
7
|
+
|
8
|
+
module Qu
  # Builds and queries the k-mer index files used by MFEprimer-2.0.
  #
  # Every method is a module function (call it as Qu::Mfeindex.name). The
  # file-suffix constants, BIG_DB_SPLIT_CUTOFF and the D2I base table are
  # defined in mfeindex/data.rb.
  module Mfeindex
    module_function

    # Returns the directory that holds the chunk files of the large
    # database +db+.
    def get_big_db_dir(db)
      db + BIG_DB
    end

    # Returns the chunk files "<basename>.<index>" found in the big-db
    # directory of +db+, in ascending index order. The index may have any
    # number of digits and the basename is matched literally.
    def small_db_files(db)
      dir = get_big_db_dir(db)
      pattern = /\A#{Regexp.escape(File.basename(db))}\.\d+\z/
      names = Dir.foreach(dir).select { |name| name =~ pattern }
      names.sort_by { |name| name[/\d+\z/].to_i }.map { |name| File.join(dir, name) }
    end

    # Resolves each database in +db_list+ to the files that should be used:
    # the database itself when it is indexed (or small enough to be used
    # directly), or its chunk files when it has been split.
    #
    # Prints a message to stderr and exits with status 1 when a database
    # does not exist, or is larger than BIG_DB_SPLIT_CUTOFF and has not been
    # split yet.
    def check_db(db_list)
      db_list.each_with_object([]) do |db, usable|
        if db_indexed?(db)
          usable << db
        elsif File.directory?(get_big_db_dir(db))
          usable.concat(small_db_files(db))
        elsif !File.exist?(db)
          $stderr.puts "Error: #{db} is not exists."
          exit 1
        elsif File.size(db) > BIG_DB_SPLIT_CUTOFF
          $stderr.puts "Warning: #{db} is too large, please use mfeindex to index the db first."
          exit 1
        else
          usable << db
        end
      end
    end

    # True when the SQLite3, JSON and 2bit index files of +db+ all exist.
    def db_indexed?(db)
      [DB_SQLITE3, DB_JSON, DB_2BIT].all? { |suffix| File.exist?(db + suffix) }
    end

    # Builds the index files for +fasta_file+ with k-mer size +k+.
    #
    # Does nothing when the database is already indexed and +reindex+ is
    # false. Otherwise it writes a renumbered copy of the FASTA file, the
    # JSON record table, the 2bit file and, through the bundled pymfeindex
    # script, the SQLite3 position database. Exits with status 1 when
    # +fasta_file+ does not exist.
    def MFEprimerIndex(fasta_file, k = 9, reindex = false)
      return if !reindex && db_indexed?(fasta_file)

      unless File.exist?(fasta_file)
        $stderr.puts "Error: #{fasta_file} is not exists."
        exit 1
      end

      info_json = {}
      uni_fasta = fasta_file + '.unifasta'

      # Records are renamed to their running index; the original id and
      # description are kept in the JSON table.
      File.open(uni_fasta, 'w') do |out|
        File.open(fasta_file) do |input|
          Bio::FlatFile.new(Bio::FastaFormat, input).each_with_index do |record, index|
            info_json[index] = { 'id' => record.entry_name, 'desc' => record.desc, 'size' => record.naseq.size }
            out.write ">#{index}\n#{record.naseq}\n"
          end
        end
      end

      File.open(fasta_file + DB_JSON, 'w') { |fh| fh.write(JSON.dump(info_json)) }

      Qu::Cmdwrapper.faToTwoBit(uni_fasta, fasta_file + DB_2BIT)

      cmd = File.join(__dir__, 'pymfeindex')
      $stderr.puts "Begin index database: #{fasta_file}"
      # Argument-list form: no shell is involved, so paths containing spaces
      # or shell metacharacters are passed through unchanged. The script's
      # progress output on stdout is discarded.
      indexed = system(cmd, '-f', uni_fasta, '-k', k.to_s, '-o', fasta_file + DB_SQLITE3, out: File::NULL)
      $stderr.puts "Error: #{cmd} failed while indexing #{fasta_file}." unless indexed

      begin
        File.delete(uni_fasta)
      rescue SystemCallError
        $stderr.puts "You can delete the file #{uni_fasta} by hand." if File.exist?(uni_fasta)
      end
      $stderr.puts "Done index database: #{fasta_file}"
    end

    # Converts the k-mer id +int+ back to its DNA string, left-padded with
    # "A" (digit 0) to +k+ characters.
    def int2dna(int, k = 9, base_number = 4)
      digits = int.to_s(base_number)
      dna = digits.each_char.map { |digit| D2I.fetch(digit.to_i) }.join
      'A' * (k - digits.length) + dna
    end

    # Converts the DNA string +dna+ (case-insensitive) to its k-mer id,
    # reading the bases as digits in base +base_number+, most significant
    # first.
    def dna2int(dna, base_number = 4)
      dna = dna.upcase
      last = dna.length - 1
      dna.each_char.with_index.inject(0) do |sum, (base, index)|
        sum + D2I[base] * base_number**(last - index)
      end
    end

    # Parses a position string from the SQLite3 database
    # ("id:pos,pos;id:pos,...") into { record id => [positions] }.
    def split_pos(data)
      data.split(';').each_with_object({}) do |hit_record, pos_hash|
        hit_id, hit_pos = hit_record.split(':')
        pos_hash[hit_id.to_i] = hit_pos.split(',').map(&:to_i)
      end
    end

    # Returns the largest k with 4**k <= +mer_num+, computed with integer
    # arithmetic so that exact powers of four are never rounded down.
    # Falls back to 9 when +mer_num+ is not a positive integer.
    def kvalue_from_mer_count(mer_num)
      return 9 unless mer_num.is_a?(Integer) && mer_num > 0

      kvalue = 0
      kvalue += 1 while 4**(kvalue + 1) <= mer_num
      kvalue
    end

    # Derives the k-mer size of the index +db_file+ from the row count of
    # its "pos" table. Returns 9 when the table cannot be read.
    def detect_kvalue(db_file)
      db = SQLite3::Database.new(db_file)
      begin
        kvalue_from_mer_count(db.execute('select count(*) from pos')[0][0])
      rescue StandardError
        9
      ensure
        db.close unless db.closed?
      end
    end

    # Returns the k-mer size shared by the indexed databases in +db_list+
    # (9 when none of them is indexed). Prints a message to stderr and exits
    # with status 1 when the databases were indexed with different sizes.
    def get_kvalue(db_list)
      kvalues = db_list.select { |db| db_indexed?(db) }
                       .map { |db| detect_kvalue(db + DB_SQLITE3) }
                       .uniq

      if kvalues.size > 1
        $stderr.puts "Different index kmer value among #{db_list.join(', ')}."
        exit 1
      end

      kvalues.first || 9
    end

    # Looks up the k-mer ids +mer_id_list+ in the index +db_file+.
    #
    # Returns { mer_id => { :plus => positions, :minus => positions } },
    # with positions in the format produced by split_pos; a strand without
    # hits has no key. Ids are coerced with Integer() before they are placed
    # in the SQL text.
    def query_sqlite3(db_file, mer_id_list)
      pos = {}
      return pos if mer_id_list.empty?

      ids = mer_id_list.map { |mer_id| Integer(mer_id) }.join(', ')
      db = SQLite3::Database.new(db_file)
      begin
        db.execute("select mer_id, plus, minus from pos where mer_id in (#{ids})") do |row|
          mer_id, plus, minus = row
          entry = (pos[mer_id] ||= {})
          entry[:plus] = split_pos(plus) unless plus.nil? || plus.empty?
          entry[:minus] = split_pos(minus) unless minus.nil? || minus.empty?
        end
      ensure
        db.close unless db.closed?
      end

      pos
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Qu
  # Constants shared by the indexer and its callers. All of them are frozen
  # so they cannot be modified by accident at runtime.
  module Mfeindex
    # Suffixes appended to a database path to name its index files.
    DB_JSON = '.uni'.freeze
    DB_SQLITE3 = '.sqlite3.db'.freeze
    DB_2BIT = '.2bit'.freeze

    # Suffix of the directory that holds the chunks of a split database.
    BIG_DB = '_BIG_MFE_DB'.freeze

    # Size in bytes (1 GiB) above which a database is split before indexing.
    BIG_DB_SPLIT_CUTOFF = 1024**3

    # Two-way table between bases and their numeric codes: String keys map a
    # base to its code, Integer keys map a code back to its base.
    D2I = {
      'A' => 0,
      'T' => 3,
      'C' => 2,
      'G' => 1,
      '-' => 4, # For bubble, added by Zheyan Liu
      0 => 'A',
      1 => 'G',
      2 => 'C',
      3 => 'T',
      4 => '-', # For bubble, added by Zheyan Liu
    }.freeze

    ANTISENSE_CHARS = %w[A G C T -].freeze
  end
end
|
data/lib/qu/pymfeindex
ADDED
@@ -0,0 +1,283 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
from __future__ import division
|
3
|
+
|
4
|
+
import os
|
5
|
+
import sys
|
6
|
+
import datetime
|
7
|
+
from time import time
|
8
|
+
from optparse import OptionParser
|
9
|
+
import sqlite3
|
10
|
+
|
11
|
+
import platform
|
12
|
+
import subprocess
|
13
|
+
import re
|
14
|
+
|
15
|
+
|
16
|
+
# Base -> 2-bit code; upper- and lower-case bases share the same code.
D2n_dic = dict(A=0, T=3, C=2, G=1, a=0, t=3, c=2, g=1)
# 2-bit code -> base. Each code appears exactly once: listing the lower-case
# letters under the same keys would silently replace the upper-case ones.
n2D_dic = {0: 'A', 1: 'G', 2: 'C', 3: 'T'}
|
18
|
+
|
19
|
+
def print_usage():
    '''Print the command-line usage text to stdout.'''
    prog = os.path.basename(sys.argv[0])
    # A single parenthesised argument prints the same text under Python 2
    # (print statement) and Python 3 (print function).
    print('''
%s: Index DB for MFEprimer-2.0

Usage:

%s -f human.genomic -k 9 -o index_db_name

Author: Wubin Qu <quwubin@gmail.com>
Last updated: 2012-9-28
''' % (prog, prog))
|
30
|
+
|
31
|
+
def optget():
    '''Parse the command-line options.

    Returns the optparse options object with ``filename``, ``k`` (default 9)
    and ``out`` (default: ``filename`` + '.sqlite3.db'). When no input file
    is given, prints the usage text and exits with status 1.
    '''
    parser = OptionParser()
    parser.add_option("-f", "--file", dest = "filename", help = "DNA file in fasta to be indexed")
    parser.add_option("-k", "--k", dest = "k", type='int', help = "K mer , default is 9", default = 9)
    parser.add_option("-o", "--out", dest = "out", help = "Index db file name")

    (options, args) = parser.parse_args()

    if not options.filename:
        print_usage()
        # sys.exit is always available; the bare exit() helper is only
        # installed by the site module.
        sys.exit(1)

    if not options.out:
        options.out = options.filename + '.sqlite3.db'

    return options
|
48
|
+
|
49
|
+
def parse_fasta_format(fh):
    '''
    Iterate over the records of a FASTA file handle.

    Yields one ``(id, description, sequence)`` tuple per record. Everything
    before the first '>' line is ignored, blank lines are skipped, and spaces
    and carriage returns are removed from the sequence lines.
    '''
    header = None
    pieces = []

    for raw in iter(fh.readline, ''):
        text = raw.strip()

        if text.startswith('>'):
            # A new header closes the record collected so far.
            if header is not None:
                name, _, desc = header[1:].partition(' ')
                yield (name, desc, ''.join(pieces))
            header = text
            pieces = []
        elif header is not None and text:
            pieces.append(text.replace(' ', '').replace("\r", ''))

    # End of input closes the last record, if there was one.
    if header is not None:
        name, _, desc = header[1:].partition(' ')
        yield (name, desc, ''.join(pieces))
|
91
|
+
|
92
|
+
def get_free_memory_percent():
    '''Return the percentage (0-100) of physical memory that is still usable.

    macOS: installed memory minus the wired, active and inactive pages
    reported by ``vm_stat``. Linux: ``MemAvailable`` from /proc/meminfo, or
    free + buffers + cached when the kernel does not report it. Any other
    platform prints a notice and returns 0.
    '''
    system = platform.system()

    if system == 'Darwin':
        vm = subprocess.Popen(['vm_stat'], stdout=subprocess.PIPE).communicate()[0]
        mem = subprocess.Popen(['sysctl', '-n', 'hw.memsize'], stdout=subprocess.PIPE).communicate()[0]
        # Popen returns bytes on Python 3.
        if not isinstance(vm, str):
            vm = vm.decode('utf-8', 'replace')
        if not isinstance(mem, str):
            mem = mem.decode('utf-8', 'replace')
        installed_memory = float(mem)

        lines = vm.splitlines()

        # The page size is announced in the first line of vm_stat; 4096 is
        # only a fallback for output that lacks the header.
        page_size = 4096
        if lines:
            found = re.search(r'page size of (\d+) bytes', lines[0])
            if found:
                page_size = int(found.group(1))

        stats = {}
        for row in lines[1:]:
            name, sep, value = row.partition(':')
            value = value.strip().rstrip('.')
            if sep and value.isdigit():
                stats[name.strip()] = int(value) * page_size

        total_consumed = (stats.get("Pages wired down", 0)
                          + stats.get("Pages active", 0)
                          + stats.get("Pages inactive", 0))

        return (installed_memory - total_consumed) / installed_memory * 100

    elif system == 'Linux':
        # /proc/meminfo is parsed by field name, so the result does not
        # depend on the column layout of a particular `free` version.
        meminfo = {}
        with open('/proc/meminfo') as fh:
            for row in fh:
                name, sep, rest = row.partition(':')
                fields = rest.split()
                if sep and fields and fields[0].isdigit():
                    meminfo[name.strip()] = float(fields[0])

        total = meminfo.get('MemTotal', 0.0)
        if total <= 0:
            return 0

        if 'MemAvailable' in meminfo:
            free = meminfo['MemAvailable']
        else:
            free = (meminfo.get('MemFree', 0.0)
                    + meminfo.get('Buffers', 0.0)
                    + meminfo.get('Cached', 0.0))

        return free / total * 100
    else:
        print("Sorry, currently only support Mac OS and Linux.")
        return 0
|
120
|
+
|
121
|
+
|
122
|
+
def insert_db(conn, mer_count, plus, minus):
    '''Insert one row per k-mer id (0 .. mer_count-1) into table ``pos``.

    ``plus`` and ``minus`` are indexed by k-mer id and hold the position
    text of the plus and minus strand. The inserts are committed at the end.
    '''
    def rows():
        # Plain counter loop: lazy on Python 2 and 3 without xrange.
        mer_id = 0
        while mer_id < mer_count:
            yield (mer_id, plus[mer_id], minus[mer_id])
            mer_id += 1

    conn.executemany("insert into pos (mer_id, plus, minus) values (?, ?, ?)", rows())
    conn.commit()
|
128
|
+
|
129
|
+
def update_db(conn, mer_count, plus, minus):
    '''Append new position text to the rows already stored in table ``pos``.

    For every k-mer id below ``mer_count`` the text in ``plus``/``minus`` is
    appended to the stored text, separated by ';'. K-mers without new
    positions are left untouched; a k-mer with no row yet gets one. The
    changes are committed at the end.
    '''
    def merged(stored, new):
        # Join with ';' only when both sides carry data.
        if stored and new:
            return '%s;%s' % (stored, new)
        return stored or new

    mer_id = 0
    while mer_id < mer_count:
        new_plus = plus[mer_id]
        new_minus = minus[mer_id]

        # Nothing to append: the stored row would be rewritten unchanged.
        if new_plus or new_minus:
            row = conn.execute("select plus, minus from pos where mer_id=?", [mer_id]).fetchone()
            if row is None:
                conn.execute("insert into pos (mer_id, plus, minus) values (?, ?, ?)",
                             [mer_id, new_plus, new_minus])
            else:
                conn.execute("update pos set plus=?, minus=? where mer_id=?",
                             [merged(row[0], new_plus), merged(row[1], new_minus), mer_id])

        mer_id += 1

    conn.commit()
|
152
|
+
|
153
|
+
def baseN(num, b):
    '''Convert the non-negative integer ``num`` to its representation in
    base ``b`` (2-36), using the digits 0-9 followed by a-z.

    Raises ValueError for a negative number or a base outside 2-36.
    '''
    if not 2 <= b <= 36:
        raise ValueError("base must be between 2 and 36")
    if num < 0:
        raise ValueError("num must be non-negative")

    alphabet = "0123456789abcdefghijklmnopqrstuvwxyz"
    if num == 0:
        return '0'

    # Collect the digits least-significant first, then reverse.
    digits = []
    while num:
        num, remainder = divmod(num, b)
        digits.append(alphabet[remainder])

    return ''.join(reversed(digits))
|
157
|
+
|
158
|
+
def int2DNA(num, k):
    '''Convert the k-mer id ``num`` back to its upper-case DNA string,
    left-padded with 'A' (code 0) to ``k`` characters.

    Raises ValueError for a negative id.
    '''
    if num < 0:
        raise ValueError("num must be non-negative")

    # Position in this string == 2-bit code (A=0, G=1, C=2, T=3), the
    # inverse of the D2n_dic encoding.
    bases = 'AGCT'

    letters = []
    while num:
        num, code = divmod(num, 4)
        letters.append(bases[code])

    # Zero is written as a single 'A', like the digit string '0'.
    if not letters:
        letters.append('A')

    return ''.join(reversed(letters)).rjust(k, 'A')
|
161
|
+
|
162
|
+
def DNA2int_2(seq):
    '''Encode ``seq`` as two non-negative integers.

    Returns ``(plus_mer, minus_mer)``: the base-4 value of the sequence and
    the base-4 value of its reverse complement. A letter missing from
    D2n_dic raises KeyError.
    '''
    plus_mer = 0
    minus_mer = 0
    weight = 1  # 4 ** position, for the reversed (minus strand) reading

    for letter in seq:
        code = D2n_dic[letter]
        # Plus strand: shift the digits read so far and append this one.
        plus_mer = plus_mer * 4 + code
        # Minus strand: the complement (3 - code) becomes digit number i.
        minus_mer += (3 - code) * weight
        weight *= 4

    return plus_mer, minus_mer
|
172
|
+
|
173
|
+
def DNA2int(seq):
    '''Encode ``seq`` as a non-negative integer: its bases read as base-4
    digits, most significant first. A letter missing from D2n_dic raises
    KeyError.
    '''
    value = 0
    for letter in seq:
        # Shift the digits read so far and append this one.
        value = value * 4 + D2n_dic[letter]

    return value
|
181
|
+
|
182
|
+
def index(filename, k, dbname):
    '''Build the k-mer position index of the FASTA file ``filename`` in the
    SQLite3 database ``dbname``.

    The table ``pos`` gets one row per possible k-mer (4**k rows). Each row
    stores, for the plus and the minus strand, the text
    "record:pos,pos;record:pos,..." of every occurrence. K-mers containing a
    letter unknown to D2n_dic (such as 'N') are skipped. When free memory
    drops below 30 percent the collected positions are written to the
    database and the in-memory lists are emptied.
    '''
    try:
        lazy_range = xrange  # Python 2: do not build a list of positions
    except NameError:
        lazy_range = range   # Python 3: range is already lazy

    start = time()

    mer_count = 4**k

    conn = sqlite3.connect(dbname)
    try:
        cur = conn.cursor()
        cur.executescript('''
drop table if exists pos;
create table pos(
mer_id integer primary key,
plus text,
minus text
);''')

        plus = ['']*mer_count
        minus = ['']*mer_count

        is_empty = False
        is_db_new = True

        with open(filename) as fasta_fh:
            for record_id, record_desc, fasta_seq in parse_fasta_format(fasta_fh):
                is_empty = False
                print(record_id)

                # Positions of this record only, keyed by k-mer id.
                plus_mer_list = {}
                minus_mer_list = {}

                for i in lazy_range(len(fasta_seq)-k + 1):
                    kmer = fasta_seq[i:(i+k)]

                    try:
                        plus_mer_id, minus_mer_id = DNA2int_2(kmer)
                    except KeyError:
                        # Skip the unrecognized base, such as 'N'
                        continue

                    # Plus strand hits are stored by their last base.
                    if plus_mer_id in plus_mer_list:
                        plus_mer_list[plus_mer_id] += ',%i' % (i+k-1)
                    else:
                        plus_mer_list[plus_mer_id] = str(i+k-1)

                    # Minus strand hits are stored by their first base.
                    if minus_mer_id in minus_mer_list:
                        minus_mer_list[minus_mer_id] += ',%i' % (i)
                    else:
                        minus_mer_list[minus_mer_id] = str(i)

                # Merge this record into the per-k-mer accumulators.
                for mer_id, pos in plus_mer_list.items():
                    if plus[mer_id]:
                        plus[mer_id] += ';%s:%s' % (record_id, pos)
                    else:
                        plus[mer_id] = '%s:%s' % (record_id, pos)

                for mer_id, pos in minus_mer_list.items():
                    if minus[mer_id]:
                        minus[mer_id] += ';%s:%s' % (record_id, pos)
                    else:
                        minus[mer_id] = '%s:%s' % (record_id, pos)

                memory_percent = get_free_memory_percent()
                if memory_percent < 30:
                    if is_db_new:
                        insert_db(conn, mer_count, plus, minus)
                        is_db_new = False
                    else:
                        update_db(conn, mer_count, plus, minus)

                    # Empty the container
                    plus = ['']*mer_count
                    minus = ['']*mer_count
                    is_empty = True

                    print('Empty plus and minus due to the memory problem.')

        # Write whatever has been collected since the last flush.
        if not is_empty:
            if is_db_new:
                insert_db(conn, mer_count, plus, minus)
            else:
                update_db(conn, mer_count, plus, minus)
    finally:
        conn.close()

    print("Time used: %s" % str(datetime.timedelta(seconds=(time() - start))))
    print('Done.')
|
276
|
+
|
277
|
+
def main():
    '''Command-line entry point: parse the options and build the index.'''
    opts = optget()
    index(filename=opts.filename, k=opts.k, dbname=opts.out)


if __name__ == "__main__":
    main()
|
data/qu-mfeindex.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'qu/mfeindex/version'
|
5
|
+
|
6
|
+
# Gem specification for qu-mfeindex: metadata, packaged files and
# dependencies.
Gem::Specification.new do |spec|
  spec.name          = "qu-mfeindex"
  spec.version       = Qu::Mfeindex::VERSION
  spec.authors       = ["Wubin Qu"]
  spec.email         = ["quwubin@gmail.com"]
  spec.description   = %q{DNA sequence indexer originally developed for MFEprimer-2.0}
  spec.summary       = %q{A DNA sequence indexer}
  spec.homepage      = "https://github.com/quwubin/qu-mfeindex"
  spec.license       = "MIT"

  # Package every file tracked by git; scripts under bin/ become executables.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_runtime_dependency 'qu-utils', '~> 1.0'
  spec.add_runtime_dependency 'qu-cmdwrapper', '~> 1.0'

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
end
|
metadata
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: qu-mfeindex
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Wubin Qu
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-04-28 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: qu-utils
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: qu-cmdwrapper
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.3'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.3'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
description: DNA sequence indexer originally developed for MFEprimer-2.0
|
70
|
+
email:
|
71
|
+
- quwubin@gmail.com
|
72
|
+
executables:
|
73
|
+
- mfeindex
|
74
|
+
extensions: []
|
75
|
+
extra_rdoc_files: []
|
76
|
+
files:
|
77
|
+
- ".gitignore"
|
78
|
+
- Gemfile
|
79
|
+
- LICENSE.txt
|
80
|
+
- README.md
|
81
|
+
- Rakefile
|
82
|
+
- bin/mfeindex
|
83
|
+
- lib/qu/mfeindex.rb
|
84
|
+
- lib/qu/mfeindex/data.rb
|
85
|
+
- lib/qu/mfeindex/version.rb
|
86
|
+
- lib/qu/pymfeindex
|
87
|
+
- qu-mfeindex.gemspec
|
88
|
+
homepage: https://github.com/quwubin/qu-mfeindex
|
89
|
+
licenses:
|
90
|
+
- MIT
|
91
|
+
metadata: {}
|
92
|
+
post_install_message:
|
93
|
+
rdoc_options: []
|
94
|
+
require_paths:
|
95
|
+
- lib
|
96
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - ">="
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: '0'
|
101
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
102
|
+
requirements:
|
103
|
+
- - ">="
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: '0'
|
106
|
+
requirements: []
|
107
|
+
rubyforge_project:
|
108
|
+
rubygems_version: 2.2.0
|
109
|
+
signing_key:
|
110
|
+
specification_version: 4
|
111
|
+
summary: A DNA sequence idnexer
|
112
|
+
test_files: []
|