qu-mfeindex 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +1 -0
- data/bin/mfeindex +128 -0
- data/lib/qu/mfeindex.rb +166 -0
- data/lib/qu/mfeindex/data.rb +24 -0
- data/lib/qu/mfeindex/version.rb +5 -0
- data/lib/qu/pymfeindex +283 -0
- data/qu-mfeindex.gemspec +26 -0
- metadata +112 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 078d0efd0389e0fb00ddb3434b227f57a329960e
|
4
|
+
data.tar.gz: 3e899029902dd46daec7ec7ba7341c7ea04f19a0
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 0196c1634bdfc2455fce27addf74687745bdb9106894d1ed84780800cabf64fc89832cf9c28f97f24e4c6be88a027a4605263bb7545c2737772bd0fbbe051ba5
|
7
|
+
data.tar.gz: 84950a7f7cc5003a464d8dcc53a3a712a56748960af70abd70b35956a00a9dd84430698002b80aac5804f9d837e40dddbf3a54d8a4bc0c5d949606c6560130bf
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Wubin Qu
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Qu::Mfeindex
|
2
|
+
|
3
|
+
DNA sequence indexer originally developed for MFEprimer-2.0
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'qu-mfeindex'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install qu-mfeindex
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
`mfeindex fasta_file [kvalue]`
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/bin/mfeindex
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'qu/mfeindex'
|
4
|
+
|
5
|
+
# Split a large FASTA database into chunk files stored in the directory
# returned by Qu::Mfeindex.get_big_db_dir.
#
# Chunks are named "<basename>.<index>" and a new chunk is only started on a
# record header line ('>') once the current chunk has reached
# Qu::Mfeindex::BIG_DB_SPLIT_CUTOFF bytes, so a record is never cut in half.
#
# If chunk files whose total size is at least the size of +big_db+ already
# exist, the user is asked whether to split again (default: no).
#
# @param big_db [String] path of the FASTA file to split
# @return [Array<String>] paths of the chunk files
def split_db(big_db)
  big_db_dir = Qu::Mfeindex.get_big_db_dir(big_db)
  Dir.mkdir(big_db_dir) unless File.directory?(big_db_dir)

  prefix = File.basename(big_db)
  # Escape the prefix (file names usually contain dots) and accept chunk
  # indices with more than one digit.
  chunk_pattern = /\A#{Regexp.escape(prefix)}\.\d+\z/

  existing = Dir.foreach(big_db_dir)
                .select { |name| name =~ chunk_pattern }
                .map { |name| File.join(big_db_dir, name) }
  existing_size = existing.inject(0) { |sum, path| sum + File.size(path) }

  if existing_size >= File.size(big_db)
    $stdout.print "#{big_db} has already been split, do you want to resplit it? [y/N]:"
    # gets returns nil on EOF; treat that like an empty answer (the default).
    choice = ($stdin.gets || '').chomp.downcase
    choice = 'n' if choice.empty?
    until %w[y n].include?(choice)
      $stdout.print "The choice should be 'y' or 'n':"
      reply = $stdin.gets
      choice = reply ? reply.chomp.downcase : 'n'
    end

    return existing if choice == 'n'
  end

  chunks = []
  open_chunk = lambda do
    path = File.join(big_db_dir, "#{prefix}.#{chunks.size}")
    chunks << path
    File.open(path, 'w')
  end

  # The first chunk is created up front, so an empty input still yields one
  # (empty) chunk file.
  out = open_chunk.call
  begin
    File.foreach(big_db) do |line|
      if line.start_with?('>') && out.size >= Qu::Mfeindex::BIG_DB_SPLIT_CUTOFF
        out.close
        out = open_chunk.call
      end
      out.write(line)
    end
  ensure
    out.close unless out.closed?
  end

  chunks
end
|
54
|
+
|
55
|
+
|
56
|
+
# Index one FASTA file, asking before overwriting an existing index.
#
# When Qu::Mfeindex.db_indexed? reports that the index files already exist,
# the user is asked whether to rebuild them (default: no). Otherwise the
# index is built unconditionally.
#
# @param fasta_file [String] path of the FASTA file to index
# @param kvalue [Integer] k-mer size passed on to the indexer
def index_db(fasta_file, kvalue)
  reindex = true

  if Qu::Mfeindex.db_indexed?(fasta_file)
    $stdout.print "#{fasta_file} has already been indexed, do you want to reindex? [y/N]:"
    # gets returns nil on EOF; treat that like an empty answer (the default).
    choice = ($stdin.gets || '').chomp.downcase
    choice = 'n' if choice.empty?
    until %w[y n].include?(choice)
      $stdout.print "The choice should be 'y' or 'n':"
      reply = $stdin.gets
      choice = reply ? reply.chomp.downcase : 'n'
    end
    reindex = (choice == 'y')
  end

  Qu::Mfeindex.MFEprimerIndex(fasta_file, kvalue, reindex)
end
|
75
|
+
|
76
|
+
|
77
|
+
usage = "Index database for MFEprimer-2.0

Usage:

#{File.basename($0)} fasta_file [kvalue]


Options:

kvalue: Default is 9 [Integer].

Contact: Wubin Qu <quwubin@gmail.com>
"

# Command line: a FASTA path and an optional k-mer size (default 9).
fasta_file = nil
kvalue = 9

case ARGV.size
when 1, 2
  fasta_file = ARGV[0]
  if ARGV[1]
    # Integer() rejects junk that String#to_i would silently turn into 0.
    begin
      kvalue = Integer(ARGV[1], 10)
    rescue ArgumentError
      kvalue = 0
    end
    unless kvalue > 0
      $stderr.puts "Error: kvalue should be a positive integer.\n"
      $stderr.puts usage
      exit 1
    end
  end
else
  $stderr.puts usage
  exit 1
end

# File.exists? was removed in Ruby 3.2; File.exist? works on every version.
unless File.exist?(fasta_file)
  $stderr.puts "Error: #{fasta_file} does not exist.\n"
  $stderr.puts usage
  exit 1
end

# Files above the cutoff are split into chunks first unless the user declines.
if File.size(fasta_file) > Qu::Mfeindex::BIG_DB_SPLIT_CUTOFF
  $stdout.print "#{fasta_file} is too large, do you want to split it first? [Y/n]:"
  # gets returns nil on EOF; treat that like an empty answer (the default).
  choice = ($stdin.gets || '').chomp.downcase
  choice = 'y' if choice.empty?
  until %w[y n].include?(choice)
    $stdout.print "The choice should be 'y' or 'n':"
    reply = $stdin.gets
    choice = reply ? reply.chomp.downcase : 'y'
  end
else
  choice = 'n'
end

if choice == 'y'
  split_db(fasta_file).each { |small_file| index_db(small_file, kvalue) }
else
  index_db(fasta_file, kvalue)
end
|
data/lib/qu/mfeindex.rb
ADDED
@@ -0,0 +1,166 @@
|
|
1
|
+
require 'qu/utils'
|
2
|
+
require 'qu/cmdwrapper'
|
3
|
+
|
4
|
+
require_relative "mfeindex/data"
|
5
|
+
require_relative "mfeindex/version"
|
6
|
+
require 'json'
|
7
|
+
|
8
|
+
module Qu
  # Builds and queries the index files used by MFEprimer-2.0: a 2bit sequence
  # file, a JSON file with per-record metadata and a SQLite3 database of
  # k-mer positions. All methods are module functions.
  module Mfeindex
    module_function

    # Directory that holds the chunks of a split database.
    #
    # @param db [String] path of the database
    # @return [String] +db+ with the BIG_DB suffix appended
    def get_big_db_dir(db)
      db + BIG_DB
    end

    # Resolve a list of databases into the list of files that can be queried.
    #
    # An indexed database is kept as is. A database with a chunk directory is
    # replaced by its chunk files. A missing or over-sized database prints a
    # message on stderr and terminates the process.
    #
    # @param db_list [Array<String>] database paths
    # @return [Array<String>] usable database paths
    def check_db(db_list)
      new_db_list = []

      db_list.each do |db|
        big_db_dir = get_big_db_dir(db)
        if db_indexed?(db)
          new_db_list << db
        elsif File.directory?(big_db_dir)
          # Escape the prefix (file names usually contain dots) and accept
          # chunk indices with more than one digit.
          chunk_pattern = /\A#{Regexp.escape(File.basename(db))}\.\d+\z/
          Dir.foreach(big_db_dir) do |small_file|
            next unless small_file =~ chunk_pattern
            new_db_list << File.join(big_db_dir, small_file)
          end
        elsif !File.exist?(db)
          $stderr.puts "Error: #{db} does not exist."
          exit 1
        elsif File.size(db) > BIG_DB_SPLIT_CUTOFF
          $stderr.puts "Warning: #{db} is too large, please use mfeindex to index the db first."
          exit 1
        else
          new_db_list << db
        end
      end

      new_db_list
    end

    # @param db [String] path of the database
    # @return [Boolean] true when the SQLite3, JSON and 2bit files all exist
    def db_indexed?(db)
      [DB_SQLITE3, DB_JSON, DB_2BIT].all? { |suffix| File.exist?(db + suffix) }
    end

    # Build the index files for a FASTA file.
    #
    # Records are renumbered 0..n-1 in a temporary ".unifasta" copy; the
    # original id, description and length of every record go to the JSON
    # file. The copy is converted to 2bit and handed to the bundled
    # pymfeindex script, then removed.
    #
    # @param fasta_file [String] path of the FASTA file
    # @param k [Integer] k-mer size
    # @param reindex [Boolean] rebuild even if the index files already exist
    # @return [nil]
    def MFEprimerIndex(fasta_file, k = 9, reindex = false)
      return if !reindex && db_indexed?(fasta_file)

      unless File.exist?(fasta_file)
        $stderr.puts "Error: #{fasta_file} does not exist."
        exit 1
      end

      info_json = {}
      uni_fasta = fasta_file + '.unifasta'

      File.open(uni_fasta, 'w') do |fh|
        # Block form so the input handle is closed when parsing is done.
        File.open(fasta_file) do |input|
          Bio::FlatFile.new(Bio::FastaFormat, input).each_with_index do |record, index|
            info_json[index] = {
              'id' => record.entry_name,
              'desc' => record.desc,
              'size' => record.naseq.size
            }
            fh.write ">#{index}\n#{record.naseq}\n"
          end
        end
      end

      File.open(fasta_file + DB_JSON, 'w') { |fh| fh.write(JSON.dump(info_json)) }

      Qu::Cmdwrapper::faToTwoBit(uni_fasta, fasta_file + DB_2BIT)

      cmd = File.join(__dir__, 'pymfeindex')
      $stderr.puts "Begin index database: #{fasta_file}"
      # Argument-list form: no shell is involved, so paths containing spaces
      # or shell metacharacters are passed through untouched. The script's
      # stdout is discarded, as it was when the command ran in backticks.
      indexed = system(cmd, '-f', uni_fasta, '-k', k.to_s,
                       '-o', fasta_file + DB_SQLITE3, out: File::NULL)

      begin
        File.delete(uni_fasta)
      rescue SystemCallError
        $stderr.puts "You can delete the file #{uni_fasta} by hand." if File.exist?(uni_fasta)
      end

      if indexed
        $stderr.puts "Done index database: #{fasta_file}"
      else
        $stderr.puts "Error: failed to index database: #{fasta_file}"
      end
      nil
    end

    # Convert a k-mer id back to its sequence.
    #
    # @param int [Integer] non-negative k-mer id
    # @param k [Integer] k-mer size; shorter results are left-padded with 'A'
    # @param base_number [Integer] radix of the encoding
    # @return [String]
    def int2dna(int, k = 9, base_number = 4)
      digits = int.to_s(base_number)
      dna = digits.each_char.map { |digit| D2I[digit.to_i] }.join
      # rjust pads like the former 'A' * (k - length) but does not raise
      # when the number needs more than k digits.
      dna.rjust(k, 'A')
    end

    # Convert a sequence to its k-mer id (plus strand), case-insensitively.
    #
    # @param dna [String] sequence made of the bases known to D2I
    # @param base_number [Integer] radix of the encoding
    # @return [Integer]
    # @raise [ArgumentError] if the sequence contains an unknown base
    def dna2int(dna, base_number = 4)
      dna.upcase.each_char.inject(0) do |value, base|
        code = D2I.fetch(base) { raise ArgumentError, "Unknown base: #{base.inspect}" }
        value * base_number + code
      end
    end

    # Parse a position string as stored in the SQLite3 database:
    # "id:pos,pos;id:pos".
    #
    # @param data [String]
    # @return [Hash{Integer => Array<Integer>}] positions keyed by record id
    def split_pos(data)
      data.split(';').each_with_object({}) do |hit_record, pos_hash|
        hit_id, hit_pos = hit_record.split(':')
        pos_hash[hit_id.to_i] = hit_pos.split(',').map(&:to_i)
      end
    end

    # Derive the k-mer size of an index from the row count of its pos table
    # (the indexer writes 4**k rows). Falls back to 9 when the table cannot
    # be read or is empty.
    #
    # @param db_file [String] path of the SQLite3 index
    # @return [Integer]
    def detect_kvalue(db_file)
      db = SQLite3::Database.new(db_file)

      begin
        mer_num = db.execute('select count(*) from pos')[0][0]
        # round, not to_i: a float logarithm may land just below the exact
        # integer and would then be truncated to k - 1.
        Math.log(mer_num, 4).round
      rescue StandardError
        9
      ensure
        db.close
      end
    end

    # k-mer size shared by all indexed databases in +db_list+ (9 when none of
    # them is indexed). Terminates the process if the sizes differ.
    #
    # @param db_list [Array<String>] database paths
    # @return [Integer]
    def get_kvalue(db_list)
      kmer_list = db_list.select { |db| db_indexed?(db) }
                         .map { |db| detect_kvalue(db + DB_SQLITE3) }
                         .uniq

      if kmer_list.size > 1
        $stderr.puts "Different index kmer value among #{db_list.join(', ')}."
        exit 1
      end

      kmer_list.first || 9
    end

    # Look up the plus/minus strand positions of a list of k-mer ids.
    #
    # @param db_file [String] path of the SQLite3 index
    # @param mer_id_list [Array<Integer>] k-mer ids
    # @return [Hash] mer_id => { plus: {...}, minus: {...} }; a strand key is
    #   only present when positions are stored for it
    # @raise [ArgumentError, TypeError] if an id is not an integer
    def query_sqlite3(db_file, mer_id_list)
      # The ids are interpolated into the statement, so force them to be
      # integers first.
      ids = mer_id_list.map { |mer_id| Integer(mer_id) }.join(', ')

      pos = {}
      db = SQLite3::Database.new(db_file)
      begin
        db.execute("select mer_id, plus, minus from pos where mer_id in (#{ids})") do |row|
          mer_id, plus, minus = row
          pos[mer_id] ||= {}
          pos[mer_id][:plus] = split_pos(plus) unless plus.to_s.empty?
          pos[mer_id][:minus] = split_pos(minus) unless minus.to_s.empty?
        end
      ensure
        db.close
      end

      pos
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Qu
  module Mfeindex
    # Suffixes appended to a database path to name its index files.
    DB_JSON = '.uni'.freeze
    DB_SQLITE3 = '.sqlite3.db'.freeze
    DB_2BIT = '.2bit'.freeze

    # Suffix of the directory that holds the chunks of a split database.
    BIG_DB = '_BIG_MFE_DB'.freeze
    # Size in bytes (1 GiB) used as the threshold for splitting a database.
    BIG_DB_SPLIT_CUTOFF = 1024**3

    # Two-way lookup between bases and their numeric codes: String keys map
    # a base to its code, Integer keys map a code back to its base.
    D2I = {
      'A' => 0,
      'T' => 3,
      'C' => 2,
      'G' => 1,
      '-' => 4, # For bubble, added by Zheyan Liu
      0 => 'A',
      1 => 'G',
      2 => 'C',
      3 => 'T',
      4 => '-' # For bubble, added by Zheyan Liu
    }.freeze

    # NOTE(review): these are the bases ordered by their D2I code (0..4);
    # the intended use is not visible in this gem - confirm with callers.
    ANTISENSE_CHARS = %w[A G C T -].freeze
  end
end
|
data/lib/qu/pymfeindex
ADDED
@@ -0,0 +1,283 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
from __future__ import division
|
3
|
+
|
4
|
+
import os
|
5
|
+
import sys
|
6
|
+
import datetime
|
7
|
+
from time import time
|
8
|
+
from optparse import OptionParser
|
9
|
+
import sqlite3
|
10
|
+
|
11
|
+
import platform
|
12
|
+
import subprocess
|
13
|
+
import re
|
14
|
+
|
15
|
+
|
16
|
+
# Base -> numeric code, case-insensitive. The codes are chosen so that the
# complement of a base is always 3 - code (A<->T, G<->C).
D2n_dic = dict(A=0, T=3, C=2, G=1, a=0, t=3, c=2, g=1)

# Numeric code -> base. A dict holds one value per key, so listing both the
# upper- and lower-case letter for the same code silently kept only the
# lower-case one; map every code to its upper-case base instead.
n2D_dic = {0: 'A', 1: 'G', 2: 'C', 3: 'T'}
|
18
|
+
|
19
|
+
def print_usage():
    '''Print the usage text, with the script name filled in, to stdout.'''
    script = os.path.basename(sys.argv[0])
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('''
%s: Index DB for MFEprimer-2.0

Usage:

%s -f human.genomic -k 9 -o index_db_name

Author: Wubin Qu <quwubin@gmail.com>
Last updated: 2012-9-28
''' % (script, script))
|
30
|
+
|
31
|
+
def optget():
    '''Parse the command line.

    Returns the optparse options object with `filename`, `k` (default 9)
    and `out` (default: `filename` + '.sqlite3.db'). Prints the usage text
    and exits with status 1 when no input file is given.
    '''
    parser = OptionParser()
    parser.add_option("-f", "--file", dest = "filename", help = "DNA file in fasta to be indexed")
    parser.add_option("-k", "--k", dest = "k", type='int', help = "K mer , default is 9", default = 9)
    parser.add_option("-o", "--out", dest = "out", help = "Index db file name")

    (options, args) = parser.parse_args()

    if not options.filename:
        print_usage()
        # sys.exit instead of the exit() helper, which is only installed by
        # the site module; a usage error is reported with a non-zero status.
        sys.exit(1)

    if not options.out:
        options.out = options.filename + '.sqlite3.db'

    return options
|
48
|
+
|
49
|
+
def parse_fasta_format(fh):
    '''Iterate over the records of a FASTA file.

    `fh` is an open text file (anything with a readline() method). Yields
    one `(id, desc, sequence)` tuple per record: `id` is the header text up
    to the first space, `desc` the rest of the header, and `sequence` the
    record's sequence lines joined together with spaces, carriage returns
    and blank lines removed. Everything before the first '>' header is
    ignored.
    '''
    header = None
    seq_lines = []

    def build(header_line, lines):
        record_id, _sep, desc = header_line[1:].partition(' ')
        return (record_id, desc, ''.join(lines))

    # readline() returns '' only at end of file.
    for raw_line in iter(fh.readline, ''):
        line = raw_line.strip()

        if line.startswith('>'):
            if header is not None:
                yield build(header, seq_lines)
            header = line
            seq_lines = []
        elif header is not None and line:
            seq_lines.append(line.replace(' ', '').replace("\r", ''))

    if header is not None:
        yield build(header, seq_lines)
|
91
|
+
|
92
|
+
def get_free_memory_percent():
    '''Return the percentage (0-100) of physical memory that is available.

    macOS: installed memory minus the wired, active and inactive pages
    reported by `vm_stat`. Linux: `MemAvailable` from /proc/meminfo, or
    free + buffers + cached on kernels that do not report it. On any other
    platform a message is printed and 0 is returned.
    '''
    def run(cmd):
        out = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0]
        # Python 3 returns bytes from a pipe; normalise to text.
        return out.decode('utf-8', 'replace') if isinstance(out, bytes) else out

    system = platform.system()

    if system == 'Darwin':
        vm = run(['vm_stat'])
        installed_memory = float(run(['sysctl', '-n', 'hw.memsize']))

        # vm_stat states the page size in its first line; it is not 4096
        # on every machine.
        header = re.search(r'page size of (\d+) bytes', vm)
        page_size = int(header.group(1)) if header else 4096

        vm_stats = {}
        for row in vm.splitlines():
            # Rows look like 'Pages active:      12345.' (some names quoted).
            match = re.match(r'^"?([^":]+)"?:\s+(\d+)\.?\s*$', row.strip())
            if match:
                vm_stats[match.group(1)] = int(match.group(2)) * page_size

        total_consumed = vm_stats["Pages wired down"] + vm_stats["Pages active"] + vm_stats["Pages inactive"]

        return (installed_memory - total_consumed) / installed_memory * 100

    elif system == 'Linux':
        # /proc/meminfo is read instead of parsing `free -m`, whose column
        # layout differs between procps versions.
        meminfo = {}
        with open('/proc/meminfo') as fh:
            for line in fh:
                key, _sep, rest = line.partition(':')
                fields = rest.split()
                if fields:
                    meminfo[key] = float(fields[0])

        total = meminfo['MemTotal']
        if 'MemAvailable' in meminfo:
            free = meminfo['MemAvailable']
        else:
            free = meminfo['MemFree'] + meminfo.get('Buffers', 0.0) + meminfo.get('Cached', 0.0)

        return free / total * 100
    else:
        print("Sorry, currently only support Mac OS and Linux.")
        return 0
|
120
|
+
|
121
|
+
|
122
|
+
def insert_db(conn, mer_count, plus, minus):
    '''Insert one row per k-mer id (0 .. mer_count-1) into the pos table.

    `plus` and `minus` are sequences indexed by k-mer id holding the
    position strings for each strand. The insert is committed before
    returning.
    '''
    # range works on Python 2 and 3 (xrange is Python 2 only); executemany
    # consumes the generator row by row.
    rows = ((mer_id, plus[mer_id], minus[mer_id]) for mer_id in range(mer_count))
    conn.executemany("insert into pos (mer_id, plus, minus) values (?, ?, ?)", rows)

    conn.commit()
|
128
|
+
|
129
|
+
def update_db(conn, mer_count, plus, minus):
    '''Append new position strings to the rows already in the pos table.

    For every k-mer id (0 .. mer_count-1) the strings in `plus` / `minus`
    are appended to the stored values, separated by ';'. Ids with nothing
    new on either strand are left untouched. The update is committed before
    returning.
    '''
    def merge(stored, new):
        if stored and new:
            return '%s;%s' % (stored, new)
        return stored or new

    for mer_id in range(mer_count):
        new_plus = plus[mer_id]
        new_minus = minus[mer_id]
        if not new_plus and not new_minus:
            # Nothing to append: skip the select/update round trip.
            continue

        row = conn.execute("select plus, minus from pos where mer_id=?", [mer_id]).fetchone()
        if row is None:
            # No row for this id yet: create it instead of failing.
            conn.execute("insert into pos (mer_id, plus, minus) values (?, ?, ?)",
                         [mer_id, new_plus, new_minus])
            continue

        conn.execute("update pos set plus=?, minus=? where mer_id=?",
                     [merge(row[0], new_plus), merge(row[1], new_minus), mer_id])

    conn.commit()
|
152
|
+
|
153
|
+
def baseN(num, b):
    '''Convert the non-negative integer `num` to its representation in
    base `b` (2-36), using the digits 0-9 followed by a-z.

    Raises ValueError for a negative number or a base outside 2-36.
    '''
    if num < 0:
        raise ValueError("num must be non-negative")
    if not 2 <= b <= 36:
        raise ValueError("base must be between 2 and 36")

    if num == 0:
        return '0'

    digits = "0123456789abcdefghijklmnopqrstuvwxyz"
    out = []
    while num:
        num, remainder = divmod(num, b)
        out.append(digits[remainder])

    # Digits were produced least-significant first.
    return ''.join(reversed(out))
|
157
|
+
|
158
|
+
def int2DNA(num, k):
    '''Convert a k-mer id back to its upper-case sequence.

    The id is written in base 4, each digit is mapped to a base through
    n2D_dic, and the result is left-padded with 'A' (digit 0) to length k.
    '''
    seq = baseN(num, 4)
    letters = ''.join([n2D_dic[int(base)] for base in seq])
    # upper() keeps the letters in the same case as the 'A' padding
    # whatever case the lookup table uses.
    return letters.upper().rjust(k, 'A')
|
161
|
+
|
162
|
+
def DNA2int_2(seq):
    '''Convert a sub-sequence to a pair of non-negative integers:
    the id of the sequence itself and the id of its reverse complement.

    Raises KeyError for a letter that is not in D2n_dic.
    '''
    codes = [D2n_dic[letter] for letter in seq]

    # Plus strand: base-4 number with the first letter most significant.
    plus_mer = 0
    for code in codes:
        plus_mer = plus_mer * 4 + code

    # Minus strand: complemented codes (3 - code) with the last letter
    # most significant.
    minus_mer = 0
    for code in reversed(codes):
        minus_mer = minus_mer * 4 + (3 - code)

    return plus_mer, minus_mer
|
172
|
+
|
173
|
+
def DNA2int(seq):
    '''Convert a sub-sequence to a non-negative integer: its letters read
    as base-4 digits (through D2n_dic), first letter most significant.

    Raises KeyError for a letter that is not in D2n_dic.
    '''
    value = 0
    for base in seq:
        value = value * 4 + D2n_dic[base]

    return value
|
181
|
+
|
182
|
+
def index(filename, k, dbname):
    '''Build the k-mer position index of a FASTA file.

    Creates (or recreates) the `pos` table in the SQLite3 database `dbname`
    with one row per possible k-mer id (4**k rows). For every record of
    `filename` and every k-mer in it, the plus-strand id gets the position
    of the k-mer's last base and the minus-strand id the position of its
    first base, stored as "record_id:pos,pos;record_id:pos". K-mers that
    contain a letter other than A/C/G/T (such as 'N') are skipped.

    The in-memory tables are written out early whenever less than 30% of
    memory is free after a record has been processed.
    '''
    try:
        int_range = xrange  # Python 2: avoid building a sequence-sized list
    except NameError:
        int_range = range

    start = time()

    mer_count = 4**k

    conn = sqlite3.connect(dbname)
    try:
        conn.executescript('''
            drop table if exists pos;
            create table pos(
                mer_id integer primary key,
                plus text,
                minus text
            );''')

        plus = ['']*mer_count
        minus = ['']*mer_count

        is_empty = False
        is_db_new = True

        with open(filename) as fasta:
            for record_id, record_desc, fasta_seq in parse_fasta_format(fasta):
                is_empty = False
                print(record_id)

                # k-mer id -> list of positions (as strings) in this record
                plus_mer_list = {}
                minus_mer_list = {}

                for i in int_range(len(fasta_seq)-k + 1):
                    kmer = fasta_seq[i:(i+k)]

                    try:
                        plus_mer_id, minus_mer_id = DNA2int_2(kmer)
                    except KeyError:
                        # Skip the unrecognized base, such as 'N'
                        continue

                    plus_mer_list.setdefault(plus_mer_id, []).append(str(i+k-1))
                    minus_mer_list.setdefault(minus_mer_id, []).append(str(i))

                for mer_id, positions in plus_mer_list.items():
                    entry = '%s:%s' % (record_id, ','.join(positions))
                    if plus[mer_id]:
                        plus[mer_id] += ';' + entry
                    else:
                        plus[mer_id] = entry

                for mer_id, positions in minus_mer_list.items():
                    entry = '%s:%s' % (record_id, ','.join(positions))
                    if minus[mer_id]:
                        minus[mer_id] += ';' + entry
                    else:
                        minus[mer_id] = entry

                memory_percent = get_free_memory_percent()
                if memory_percent < 30:
                    if is_db_new:
                        insert_db(conn, mer_count, plus, minus)
                        is_db_new = False
                    else:
                        update_db(conn, mer_count, plus, minus)

                    # Empty the container
                    plus = ['']*mer_count
                    minus = ['']*mer_count
                    is_empty = True

                    print('Empty plus and minus due to the memory problem.')

        # Write whatever has been collected since the last flush.
        if not is_empty:
            if is_db_new:
                insert_db(conn, mer_count, plus, minus)
            else:
                update_db(conn, mer_count, plus, minus)
    finally:
        conn.close()

    print("Time used: %s" % str(datetime.timedelta(seconds=(time() - start))))
    print('Done.')
|
276
|
+
|
277
|
+
def main():
    '''Read the command-line options and build the index they describe.'''
    opts = optget()
    source, kmer_size, target = opts.filename, opts.k, opts.out
    index(source, kmer_size, target)
|
281
|
+
|
282
|
+
if __name__ == "__main__":
|
283
|
+
main()
|
data/qu-mfeindex.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'qu/mfeindex/version'
|
5
|
+
|
6
|
+
# Gem description for qu-mfeindex. The version comes from
# Qu::Mfeindex::VERSION and the file list from `git ls-files`, so the gem
# has to be built from a git checkout.
Gem::Specification.new do |spec|
  spec.name          = "qu-mfeindex"
  spec.version       = Qu::Mfeindex::VERSION
  spec.authors       = ["Wubin Qu"]
  spec.email         = ["quwubin@gmail.com"]
  spec.description   = %q{DNA sequence indexer originally developed for MFEprimer-2.0}
  spec.summary       = %q{A DNA sequence indexer}
  spec.homepage      = "https://github.com/quwubin/qu-mfeindex"
  spec.license       = "MIT"

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_runtime_dependency 'qu-utils', '~> 1.0'
  spec.add_runtime_dependency 'qu-cmdwrapper', '~> 1.0'

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
end
|
metadata
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: qu-mfeindex
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Wubin Qu
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-04-28 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: qu-utils
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: qu-cmdwrapper
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.3'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.3'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
description: DNA sequence indexer originally developed for MFEprimer-2.0
|
70
|
+
email:
|
71
|
+
- quwubin@gmail.com
|
72
|
+
executables:
|
73
|
+
- mfeindex
|
74
|
+
extensions: []
|
75
|
+
extra_rdoc_files: []
|
76
|
+
files:
|
77
|
+
- ".gitignore"
|
78
|
+
- Gemfile
|
79
|
+
- LICENSE.txt
|
80
|
+
- README.md
|
81
|
+
- Rakefile
|
82
|
+
- bin/mfeindex
|
83
|
+
- lib/qu/mfeindex.rb
|
84
|
+
- lib/qu/mfeindex/data.rb
|
85
|
+
- lib/qu/mfeindex/version.rb
|
86
|
+
- lib/qu/pymfeindex
|
87
|
+
- qu-mfeindex.gemspec
|
88
|
+
homepage: https://github.com/quwubin/qu-mfeindex
|
89
|
+
licenses:
|
90
|
+
- MIT
|
91
|
+
metadata: {}
|
92
|
+
post_install_message:
|
93
|
+
rdoc_options: []
|
94
|
+
require_paths:
|
95
|
+
- lib
|
96
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - ">="
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: '0'
|
101
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
102
|
+
requirements:
|
103
|
+
- - ">="
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: '0'
|
106
|
+
requirements: []
|
107
|
+
rubyforge_project:
|
108
|
+
rubygems_version: 2.2.0
|
109
|
+
signing_key:
|
110
|
+
specification_version: 4
|
111
|
+
summary: A DNA sequence indexer
|
112
|
+
test_files: []
|