big_simon 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +21 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/COPYING +674 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +41 -0
- data/README.md +37 -0
- data/Rakefile +6 -0
- data/big_simon.gemspec +30 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/big_simon/version.rb +3 -0
- data/lib/big_simon.rb +100 -0
- data/vendor/bin/mac/WIsH +0 -0
- data/vendor/bin/mac/computeMeasure.out +0 -0
- data/vendor/bin/mac/computeMeasure_onlyd2star.out +0 -0
- data/vendor/bin/mac/countKmer.out +0 -0
- data/vendor/bin/mac/vhm.py +381 -0
- metadata +118 -0
data/Gemfile.lock
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
big_simon (0.0.1)
|
5
|
+
rya (~> 0.1.3)
|
6
|
+
|
7
|
+
GEM
|
8
|
+
remote: https://rubygems.org/
|
9
|
+
specs:
|
10
|
+
abort_if (0.2.0)
|
11
|
+
diff-lcs (1.3)
|
12
|
+
rake (10.5.0)
|
13
|
+
rspec (3.7.0)
|
14
|
+
rspec-core (~> 3.7.0)
|
15
|
+
rspec-expectations (~> 3.7.0)
|
16
|
+
rspec-mocks (~> 3.7.0)
|
17
|
+
rspec-core (3.7.1)
|
18
|
+
rspec-support (~> 3.7.0)
|
19
|
+
rspec-expectations (3.7.0)
|
20
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
21
|
+
rspec-support (~> 3.7.0)
|
22
|
+
rspec-mocks (3.7.0)
|
23
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
24
|
+
rspec-support (~> 3.7.0)
|
25
|
+
rspec-support (3.7.1)
|
26
|
+
rya (0.1.3)
|
27
|
+
abort_if (~> 0.2.0)
|
28
|
+
systemu (~> 2.6, >= 2.6.5)
|
29
|
+
systemu (2.6.5)
|
30
|
+
|
31
|
+
PLATFORMS
|
32
|
+
ruby
|
33
|
+
|
34
|
+
DEPENDENCIES
|
35
|
+
big_simon!
|
36
|
+
bundler (~> 1.16)
|
37
|
+
rake (~> 10.0)
|
38
|
+
rspec (~> 3.0)
|
39
|
+
|
40
|
+
BUNDLED WITH
|
41
|
+
1.16.3
|
data/README.md
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# BigSimon
|
2
|
+
|
3
|
+
A pipeline for finding hosts of viruses!
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
gem 'big_simon'
|
11
|
+
```
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install big_simon
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
TODO
|
24
|
+
|
25
|
+
## Development
|
26
|
+
|
27
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
28
|
+
|
29
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
30
|
+
|
31
|
+
## Contributing
|
32
|
+
|
33
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/mooreryan/big_simon. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
34
|
+
|
35
|
+
## Code of Conduct
|
36
|
+
|
37
|
+
Everyone interacting in the BigSimon project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/mooreryan/big_simon/blob/master/CODE_OF_CONDUCT.md).
|
data/Rakefile
ADDED
data/big_simon.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
|
2
|
+
lib = File.expand_path("../lib", __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require "big_simon/version"
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "big_simon"
|
8
|
+
spec.version = BigSimon::VERSION
|
9
|
+
spec.authors = ["Ryan Moore"]
|
10
|
+
spec.email = ["moorer@udel.edu"]
|
11
|
+
|
12
|
+
spec.summary = %q{Viral host discovery pipeline.}
|
13
|
+
spec.description = %q{Viral host discovery pipeline.}
|
14
|
+
spec.homepage = "https://github.com/mooreryan/big_simon"
|
15
|
+
|
16
|
+
# Specify which files should be added to the gem when it is released.
|
17
|
+
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
18
|
+
spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
|
19
|
+
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
20
|
+
end
|
21
|
+
spec.bindir = "exe"
|
22
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
23
|
+
spec.require_paths = ["lib"]
|
24
|
+
|
25
|
+
spec.add_development_dependency "bundler", "~> 1.16"
|
26
|
+
spec.add_development_dependency "rake", "~> 10.0"
|
27
|
+
spec.add_development_dependency "rspec", "~> 3.0"
|
28
|
+
|
29
|
+
spec.add_runtime_dependency "rya", "~> 0.1.3"
|
30
|
+
end
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "big_simon"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start(__FILE__)
|
data/bin/setup
ADDED
data/lib/big_simon.rb
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
require "fileutils"
require "shellwords"

require "rya"

require "big_simon/version"
|
4
|
+
|
5
|
+
Time.extend Rya::CoreExtensions::Time
|
6
|
+
Process.extend Rya::CoreExtensions::Process
|
7
|
+
|
8
|
+
module BigSimon
|
9
|
+
|
10
|
+
# Project directories
|
11
|
+
ROOT = File.join __dir__, ".."
|
12
|
+
BIN = File.join ROOT, "vendor", "bin", "mac"
|
13
|
+
SPEC = File.join ROOT, "spec"
|
14
|
+
TEST_FILES = File.join SPEC, "test_files"
|
15
|
+
|
16
|
+
|
48
|
+
|
49
|
+
class Runners

  # Runs the WIsH program: builds a model from +host_dir+ into
  # <outdir>/model, predicts hosts for +vir_dir+ with results written to
  # +outdir+, then removes the model directory.
  #
  # Every argument is shell-escaped before the command string is handed to
  # Process.run_and_time_it!, so paths containing spaces or shell
  # metacharacters are passed through literally.  As a consequence the shell
  # no longer expands things like a leading "~"; pass expanded paths.
  #
  # @param exe [String] path to the WIsH executable
  # @param vir_dir [String] directory given to WIsH via -g when predicting
  # @param host_dir [String] directory given to WIsH via -g when building
  # @param outdir [String] output directory (created if missing)
  # @param threads [Integer, String] value for the -t option
  #
  # @raise [AbortIf::Exit] if commands fail
  def self.wish(exe, vir_dir, host_dir, outdir, threads)
    model_dir = File.join outdir, "model"

    FileUtils.mkdir_p model_dir

    build_model = Shellwords.join(
      [exe, "-t", threads, "-c", "build", "-g", host_dir, "-m", model_dir].map(&:to_s)
    )

    predict = Shellwords.join(
      [exe, "-t", threads, "-c", "predict", "-g", vir_dir, "-m", model_dir, "-r", outdir, "-b"].map(&:to_s)
    )

    Process.run_and_time_it! "Building model", build_model
    Process.run_and_time_it! "Predicting host", predict

    FileUtils.rm_r model_dir if Dir.exist? model_dir
  end

  # Runs the VirHostMatcher script with "-d 1" (only compute d2star
  # dissimilarity), then deletes <outdir>/tmp and the two by-product files
  # d2star_k6_main.html and hostTaxa.txt_new.txt from +outdir+.
  #
  # Arguments are shell-escaped exactly as in .wish.
  #
  # @param exe [String] path to the vhm.py script (run with "python")
  # @param vir_dir [String] directory passed as -v
  # @param host_dir [String] directory passed as -b
  # @param outdir [String] output directory (created if missing)
  def self.vir_host_matcher(exe, vir_dir, host_dir, outdir)
    FileUtils.mkdir_p outdir

    cmd = Shellwords.join(
      ["python", exe, "-v", vir_dir, "-b", host_dir, "-o", outdir, "-d", "1"].map(&:to_s)
    )

    Process.run_and_time_it! "Computing d2star dissimilarity", cmd

    tmp_dir = File.join outdir, "tmp"
    FileUtils.rm_r tmp_dir if Dir.exist? tmp_dir

    bad_files = %w[d2star_k6_main.html hostTaxa.txt_new.txt]
    bad_files.each do |fname|
      path = File.join outdir, fname

      FileUtils.rm path if File.exist? path
    end
  end
end
|
100
|
+
end
|
data/vendor/bin/mac/WIsH
ADDED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,381 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
import os,sys
|
3
|
+
import optparse
|
4
|
+
import subprocess
|
5
|
+
from subprocess import call
|
6
|
+
import time
|
7
|
+
import platform
|
8
|
+
import numpy
|
9
|
+
|
10
|
+
|
11
|
+
# Name of this script, used as a prefix in error messages.
prog_base = os.path.split(sys.argv[0])[1]

# Command-line interface: -v and -b are required, the rest are optional.
parser = optparse.OptionParser()
parser.add_option("-v", "--virusFaDir", action = "store", type = "string", dest = "virusFaDir",
                  help = "the directory to the folder containing bacteria virus fasta files")
parser.add_option("-b", "--hostFaDir", action = "store", type = "string", dest = "hostFaDir",
                  help = "the directory to the folder containing bacteria host fasta files")
parser.add_option("-o", "--out", action = "store", type = "string", dest = "outDir",
                  default='./', help = "output directory")
parser.add_option("-t", "--taxa", action = "store", type = "string", dest = "hostTaxaFile",
                  help = "the host taxa file (including the path) ")
parser.add_option("-d", "--d2star", action = "store", type = "string", dest = "onlyD2star",
                  default=0, help = "compute only d2star dissimilarity? 1 for yes, 0 for no")
#parser.add_option("-u", "--continue", action = "store", type = "string", dest = "onlyComputeMeasure",
#                  default=0, help = "kmer count is ready only compute measures? 1 for yes, 0 for no")
#parser.add_option("-k", "--kLen", action = "store", type = "string", dest = "kLen",
#                  help = "the length of k-tuple")

(options, args) = parser.parse_args()
if options.virusFaDir is None or options.hostFaDir is None :
    # The log file is opened only after the output directory exists, so this
    # error can be reported on stderr alone; writing to the log here would
    # raise NameError and hide the real message.
    sys.stderr.write(prog_base + ": ERROR: missing required command-line argument\n")
    parser.print_help()
    # Non-zero status so callers that check the exit code see the failure.
    sys.exit(1)
|
36
|
+
|
37
|
+
|
38
|
+
## tmp file directory
|
39
|
+
if not os.path.exists(options.outDir) :
|
40
|
+
os.makedirs(options.outDir)
|
41
|
+
tmpDir = os.path.join(options.outDir, "tmp")
|
42
|
+
if not os.path.exists(tmpDir) :
|
43
|
+
os.makedirs(tmpDir)
|
44
|
+
|
45
|
+
## log file ##
|
46
|
+
filelog = open(os.path.join(tmpDir, 'vhm.log'), 'w')
|
47
|
+
|
48
|
+
## name length ##
|
49
|
+
nameLen = 93 - len(options.outDir)
|
50
|
+
#### possibly because of the kmercount folder name for each contig is too long?
|
51
|
+
|
52
|
+
|
53
|
+
#################### 0: preparation ############################
|
54
|
+
|
55
|
+
## path to the programs
|
56
|
+
vhmPath = os.path.dirname(sys.argv[0])
|
57
|
+
if len(vhmPath) == 0 :
|
58
|
+
vhmPath="./"
|
59
|
+
#print vhmPath
|
60
|
+
|
61
|
+
## kmer length and MC order
|
62
|
+
kmax = 6
|
63
|
+
order = 2
|
64
|
+
|
65
|
+
## compile c++ code if not
|
66
|
+
optSys = platform.system()
|
67
|
+
if optSys == 'Linux' :
|
68
|
+
exePath = os.path.join(vhmPath, "bin", "linux64")
|
69
|
+
elif optSys == 'Darwin' :
|
70
|
+
exePath = os.path.join(vhmPath, "bin", "macDarwin")
|
71
|
+
elif optSys == 'Windows' :
|
72
|
+
exePath = os.path.join(vhmPath, "bin", "windows64")
|
73
|
+
else :
|
74
|
+
#sys.stderr.write("WARNING: can't recognize the operating system" + optSys + " \n")
|
75
|
+
exePath = "unknown"
|
76
|
+
|
77
|
+
|
78
|
+
|
79
|
+
## helper executables: countKmer, computeMeasure, computeMeasure_onlyd2star

def _reportBoth(message) :
    """Write `message` to stderr and to the run log."""
    sys.stderr.write(message)
    filelog.write(message)


def _ensureExecutable(baseName) :
    """Return the path of <vhmPath>/<baseName>.out, provisioning it if missing.

    When the executable is absent it is copied from the platform directory
    `exePath`, or, if the platform was not recognised, compiled from
    <vhmPath>/<baseName>.cpp with g++.  The script exits with status 1 when
    the executable cannot be provided.
    """
    import shutil  # local import: only needed on the copy path

    cppPath = os.path.join(vhmPath, baseName + ".cpp")
    outPath = os.path.join(vhmPath, baseName + ".out")

    if not os.path.exists(outPath) :
        _reportBoth("WARNING: can't find the file " + outPath + ", try to copy one from bin \n")
        if exePath != "unknown" :
            ## recognize OS, copy exe files to vhmPath
            prebuilt = os.path.join(exePath, baseName + ".out")
            if not os.path.exists(prebuilt) :
                _reportBoth("ERROR: can't find file " + prebuilt + " \n Please run make under the main directory! \n")
                sys.exit(1)
            shutil.copy(prebuilt, vhmPath)
        else :
            ## can't recognize OS, try to compile
            _reportBoth("WARNING: can't recognize the operating system" + optSys + " \n")
            _reportBoth("Trying to compile..." + "\n")
            if not os.path.exists(cppPath) :
                _reportBoth("ERROR: can't find file " + cppPath + " \n Please run make under the main directory! \n")
                sys.exit(1)
            # Argument list instead of a shell string: paths with spaces are safe.
            call(["g++", cppPath, "-o", outPath, "-std=c++0x"])

        # Fail here rather than later, when the missing binary would be run.
        if not os.path.exists(outPath) :
            _reportBoth("ERROR: can't find file " + outPath + " \n Please run make under the main directory! \n")
            sys.exit(1)

    # Only touch permissions when needed, and never make the binary
    # world-writable.  Best effort: the file may belong to another user.
    if not os.access(outPath, os.X_OK) :
        try :
            os.chmod(outPath, 0o755)
        except OSError :
            _reportBoth("WARNING: can't make " + outPath + " executable \n")

    filelog.flush()
    return outPath


## countKmer c++ code
countKmerOut = _ensureExecutable("countKmer")

## computeMeasure c++ code
computeMeasureOut = _ensureExecutable("computeMeasure")

## computeMeasure (d2star only) c++ code
computed2starOut = _ensureExecutable("computeMeasure_onlyd2star")
|
170
|
+
|
171
|
+
## if only compute d2star
|
172
|
+
if int(options.onlyD2star) == 1 :
|
173
|
+
computeMeasureOut=computed2starOut
|
174
|
+
else :
|
175
|
+
computeMeasureOut=computeMeasureOut
|
176
|
+
|
177
|
+
|
178
|
+
## kmer count directory
|
179
|
+
kmerCountPath = os.path.join(tmpDir, "KC")
|
180
|
+
if not os.path.exists(kmerCountPath) :
|
181
|
+
os.makedirs(kmerCountPath)
|
182
|
+
|
183
|
+
## virusFaList, hostFaList
|
184
|
+
virusFaList = os.listdir(options.virusFaDir)
|
185
|
+
#virusFaList = os.path.join(options.virusFaDir, os.listdir(options.virusFaDir))
|
186
|
+
hostFaList = os.listdir(options.hostFaDir)
|
187
|
+
|
188
|
+
## virus list file, host list file
|
189
|
+
virusListFile = os.path.join(tmpDir, "virusList")
|
190
|
+
hostListFile = os.path.join(tmpDir, "hostList")
|
191
|
+
|
192
|
+
virusListFileWrite = open(virusListFile, 'w') ## make file blank
|
193
|
+
virusListFileWrite.close()
|
194
|
+
virusListFileWrite = open(virusListFile, 'a')
|
195
|
+
|
196
|
+
hostListFileWrite = open(hostListFile, 'w') ## make file blank
|
197
|
+
hostListFileWrite.close()
|
198
|
+
hostListFileWrite = open(hostListFile, 'a')
|
199
|
+
|
200
|
+
#time.sleep(6) # delays for 10 seconds
|
201
|
+
|
202
|
+
################ 00: hostTaxa issues: ##########################
|
203
|
+
############## 1. hostName=hostFileName(with extension) ######
|
204
|
+
############## 2. hostTaxa should be no missing ########
|
205
|
+
|
206
|
+
#################### 00: if hostTaxa missing ###################
|
207
|
+
if options.hostTaxaFile is None :
|
208
|
+
hostTaxaFile = os.path.join(options.outDir, "hostTaxa.txt_new.txt")
|
209
|
+
sys.stdout.write("WARNING: no hostTaxa file provided, creating a dummy one \n")
|
210
|
+
filelog.write("WARNING: no hostTaxa file provided, creating a dummy one \n")
|
211
|
+
hostTaxaFileWrite = open(hostTaxaFile, 'w') ## make file blank
|
212
|
+
hostTaxaFileWrite.close()
|
213
|
+
hostTaxaFileWrite = open(hostTaxaFile, 'a')
|
214
|
+
|
215
|
+
hostTaxaFileWrite.write("hostNCBIName hostName hostSuperkingdom hostPhylum hostClass hostOrder hostFamily hostGenus hostSpecies\n")
|
216
|
+
for currentFileName in hostFaList :
|
217
|
+
if currentFileName.startswith('.') :
|
218
|
+
continue
|
219
|
+
if len(currentFileName) > nameLen :
|
220
|
+
sys.stdout.write( "WARNING: the file name has more than " + str(nameLen) + " letters! Use the first " + str(nameLen) + " letters as the name \n")
|
221
|
+
filelog.write( "WARNING: the file name has more than " + str(nameLen) + " letters! Use the first " + str(nameLen) + " letters as the name \n")
|
222
|
+
currentFileNameS = currentFileName[:nameLen]
|
223
|
+
else :
|
224
|
+
currentFileNameS = currentFileName
|
225
|
+
hostTaxaFileWrite.write(currentFileNameS)
|
226
|
+
for i in range(1,9) :
|
227
|
+
hostTaxaFileWrite.write("\t" + "unknown")
|
228
|
+
hostTaxaFileWrite.write("\n")
|
229
|
+
hostTaxaFileWrite.close()
|
230
|
+
options.hostTaxaFile = hostTaxaFile
|
231
|
+
|
232
|
+
################### REFORMAT hostTaxa (fill missing) ###############
|
233
|
+
else :
|
234
|
+
hostTaxaFile = os.path.join(options.outDir, os.path.basename(options.hostTaxaFile)+"_new.txt")
|
235
|
+
hostTaxaTable = numpy.genfromtxt(options.hostTaxaFile,delimiter="\t", dtype=str)
|
236
|
+
hostTaxaTable[hostTaxaTable=='']='unknown'
|
237
|
+
numpy.savetxt(hostTaxaFile, hostTaxaTable, fmt="%s", delimiter='\t', newline='\n')
|
238
|
+
|
239
|
+
filelog.flush()
|
240
|
+
|
241
|
+
#################### 1: count kmer and prepare list files ############################

def _countKmers(faDir, faList, listFileWrite, label, labelPlural) :
    """Count k-mers (k = 1..kmax) for every fasta file in `faList`.

    faDir         -- directory containing the files named in faList
    faList        -- file names to process; names starting with '.' are skipped
    listFileWrite -- open file that receives one line per processed sequence
    label         -- "virus" or "host", used in progress messages
    labelPlural   -- plural of label, used in the ETR message

    Exits with status 1 if an entry is a directory or if the k-mer count
    directory does not hold the expected kmax + 1 entries afterwards.
    """
    start_time = time.time()
    count = 0

    for currentFileName in faList :
        if currentFileName.startswith('.') :
            continue
        if os.path.isdir(os.path.join(faDir, currentFileName)) :
            sys.stderr.write( "ERROR: zero bytes of file " + currentFileName + "\n")
            filelog.write( "ERROR: zero bytes of file " + currentFileName + "\n")
            sys.exit(1)

        # Names longer than nameLen are truncated; shorter ones are unchanged.
        currentFileNameS = currentFileName[:nameLen]

        sys.stdout.write("Step 1: counting kmers for " + label + " " + currentFileNameS + "\n")
        filelog.write("Step 1: counting kmers for " + label + " " + currentFileNameS + "\n")

        currentFilePath = os.path.join(faDir, currentFileName)
        currentKmerCountPath = os.path.join(kmerCountPath, currentFileNameS)

        for w in range(1, (kmax+1)) :
            # Argument list (no shell) so file names with spaces or shell
            # metacharacters reach countKmer unchanged.
            cmdKmerOut = subprocess.Popen([countKmerOut, "-l", "-k", str(w),
                                           "-i", currentFilePath,
                                           "-o", currentKmerCountPath,
                                           "-s", currentFileNameS],
                                          stderr = subprocess.PIPE,
                                          stdout = subprocess.PIPE)
            # communicate() drains both pipes; wait() alone can deadlock once
            # the child fills a pipe buffer.
            cmdKmerOut.communicate()

        if os.path.isdir(currentKmerCountPath) and len(os.listdir(currentKmerCountPath)) == ( kmax + 1 ) :
            listFileWrite.write(currentFileNameS + " " +
                                currentKmerCountPath + " " +
                                str(2) + "\n")
        else :
            sys.stderr.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
            filelog.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
            sys.exit(1)

        filelog.flush()

        end_time = time.time()
        count += 1
        average = " (Average time for counting kmers for one " + label + ": " + str(round((end_time - start_time)/count, 4)) + "s) \n"
        remaining = " (ETR for counting kmers for " + labelPlural + ": " + str(round((end_time - start_time)/count*(len(faList)-count), 4)) + "s) \n"
        sys.stdout.write(average)
        sys.stdout.write(remaining)
        filelog.write(average)
        filelog.write(remaining)


_countKmers(options.virusFaDir, virusFaList, virusListFileWrite, "virus", "viruses")
virusListFileWrite.close()

_countKmers(options.hostFaDir, hostFaList, hostListFileWrite, "host", "hosts")
hostListFileWrite.close()
|
344
|
+
|
345
|
+
|
346
|
+
|
347
|
+
#time.sleep(6) # delays for 10 seconds
|
348
|
+
|
349
|
+
################### 2: compute measures #####################
# Runs the selected computeMeasure executable on the virus and host list
# files, echoing each line of its stderr to stdout and to the log together
# with running timing estimates.
start_time = time.time()
count = 0

sys.stdout.write("Step 2: compute distance/dissimilarity measures \n")
filelog.write("Step 2: compute distance/dissimilarity measures \n")
cmdCptMeasureArgs = [computeMeasureOut, "-k", str(kmax),
                     "-i", virusListFile, "-j", hostListFile,
                     "-o", options.outDir, "-t", hostTaxaFile]
cmdCptMeasure = " ".join(cmdCptMeasureArgs)
print(cmdCptMeasure)

# The child's stdout was never read; send it to the null device instead of an
# unread pipe so a chatty child cannot block on a full pipe buffer.
devnull = open(os.devnull, 'w')
# Argument list (no shell) so paths with spaces are passed through unchanged.
cmdCptMeasureOut = subprocess.Popen(cmdCptMeasureArgs,
                                    stderr = subprocess.PIPE,
                                    stdout = devnull)
# NOTE(review): indentation is lost in the source; the timing lines are taken
# to be inside this loop, the only placement where count can vary — confirm.
for c in iter(cmdCptMeasureOut.stderr.readline, b''):
    sys.stdout.write(c.decode("utf-8"))
    filelog.write(c.decode("utf-8"))
    filelog.flush()

    end_time = time.time()
    count += 1
    average = " (Average time for computing dissimilarities for one virus-host pair: " + str(round((end_time - start_time)/count/len(virusFaList), 4)) + "s) \n"
    remaining = " (ETR for computing dissimilarities for virus-host pairs: " + str(round((end_time - start_time)/count*(len(hostFaList)-count), 4)) + "s) \n"
    sys.stdout.write(average)
    sys.stdout.write(remaining)
    filelog.write(average)
    filelog.write(remaining)

exitCode = cmdCptMeasureOut.wait()
devnull.close()

# Propagate a failed run instead of reporting success.
if exitCode != 0 :
    sys.stderr.write("ERROR: " + computeMeasureOut + " exited with status " + str(exitCode) + "\n")
    filelog.write("ERROR: " + computeMeasureOut + " exited with status " + str(exitCode) + "\n")
    filelog.close()
    sys.exit(1)

filelog.close()

sys.stdout.write("done \n")
|