big_simon 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile.lock ADDED
@@ -0,0 +1,41 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ big_simon (0.0.1)
5
+ rya (~> 0.1.3)
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ abort_if (0.2.0)
11
+ diff-lcs (1.3)
12
+ rake (10.5.0)
13
+ rspec (3.7.0)
14
+ rspec-core (~> 3.7.0)
15
+ rspec-expectations (~> 3.7.0)
16
+ rspec-mocks (~> 3.7.0)
17
+ rspec-core (3.7.1)
18
+ rspec-support (~> 3.7.0)
19
+ rspec-expectations (3.7.0)
20
+ diff-lcs (>= 1.2.0, < 2.0)
21
+ rspec-support (~> 3.7.0)
22
+ rspec-mocks (3.7.0)
23
+ diff-lcs (>= 1.2.0, < 2.0)
24
+ rspec-support (~> 3.7.0)
25
+ rspec-support (3.7.1)
26
+ rya (0.1.3)
27
+ abort_if (~> 0.2.0)
28
+ systemu (~> 2.6, >= 2.6.5)
29
+ systemu (2.6.5)
30
+
31
+ PLATFORMS
32
+ ruby
33
+
34
+ DEPENDENCIES
35
+ big_simon!
36
+ bundler (~> 1.16)
37
+ rake (~> 10.0)
38
+ rspec (~> 3.0)
39
+
40
+ BUNDLED WITH
41
+ 1.16.3
data/README.md ADDED
@@ -0,0 +1,37 @@
1
+ # BigSimon
2
+
3
+ A pipeline for finding hosts of viruses!
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'big_simon'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install big_simon
20
+
21
+ ## Usage
22
+
23
+ TODO
24
+
25
+ ## Development
26
+
27
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
28
+
29
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
30
+
31
+ ## Contributing
32
+
33
+ Bug reports and pull requests are welcome on GitHub at https://github.com/mooreryan/big_simon. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
34
+
35
+ ## Code of Conduct
36
+
37
+ Everyone interacting in the BigSimon project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/mooreryan/big_simon/blob/master/CODE_OF_CONDUCT.md).
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
data/big_simon.gemspec ADDED
@@ -0,0 +1,30 @@
1
+
2
+ lib = File.expand_path("../lib", __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require "big_simon/version"
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "big_simon"
8
+ spec.version = BigSimon::VERSION
9
+ spec.authors = ["Ryan Moore"]
10
+ spec.email = ["moorer@udel.edu"]
11
+
12
+ spec.summary = %q{Viral host discovery pipeline.}
13
+ spec.description = %q{Viral host discovery pipeline.}
14
+ spec.homepage = "https://github.com/mooreryan/big_simon"
15
+
16
+ # Specify which files should be added to the gem when it is released.
17
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
18
+ spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
19
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
20
+ end
21
+ spec.bindir = "exe"
22
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
+ spec.require_paths = ["lib"]
24
+
25
+ spec.add_development_dependency "bundler", "~> 1.16"
26
+ spec.add_development_dependency "rake", "~> 10.0"
27
+ spec.add_development_dependency "rspec", "~> 3.0"
28
+
29
+ spec.add_runtime_dependency "rya", "~> 0.1.3"
30
+ end
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "big_simon"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,3 @@
1
+ module BigSimon
2
+ VERSION = "0.0.1"
3
+ end
data/lib/big_simon.rb ADDED
@@ -0,0 +1,100 @@
1
+ require "rya"
2
+
3
+ require "big_simon/version"
4
+
5
+ Time.extend Rya::CoreExtensions::Time
6
+ Process.extend Rya::CoreExtensions::Process
7
+
8
+ module BigSimon
9
+
10
+ # Project directories
11
+ ROOT = File.join __dir__, ".."
12
+ BIN = File.join ROOT, "vendor", "bin", "mac"
13
+ SPEC = File.join ROOT, "spec"
14
+ TEST_FILES = File.join SPEC, "test_files"
15
+
16
class Parsers

  # Parses a comma-separated distance table into per-virus host rankings.
  #
  # The first line of the file is treated as the header: a leading label
  # cell followed by one host name per column.  Every later line is a virus
  # name followed by one distance per host column.  A single trailing comma
  # on any line is ignored, and blank lines after the header are skipped.
  #
  # @param fname [String] path to the distance table
  #
  # @return [Hash] maps each virus name to
  #   `{ best: host, all: [[host, dist], ...] }`, where `all` is sorted by
  #   ascending distance and `best` is the host of the first pair (nil when
  #   the row has no distance columns)
  def self.vir_host_matcher(fname)
    hosts = nil
    host_info = {}

    # Block form so the handle is closed even if parsing raises.
    File.open(fname, "rt") do |file|
      file.each_line.with_index do |line, line_idx|
        # Get rid of the line ending and a single trailing comma.
        fields = line.chomp.sub(/,$/, "").split(",")

        if line_idx.zero?
          # Drop the leading label cell; the rest are the host names.
          hosts = fields.drop(1)
          next
        end

        # A blank line names no virus; previously it raised NoMethodError.
        next if fields.empty?

        virus, *raw_dists = fields

        dists = raw_dists.each_with_index.
                  map { |dist, col| [hosts[col], dist.to_f] }.
                  sort_by { |_, dist| dist }

        host_info[virus] = {
          best: dists.empty? ? nil : dists.first.first,
          all: dists
        }
      end
    end

    host_info
  end
end
48
+
49
# Required here because this class calls FileUtils and Shellwords directly
# and the visible file does not load them itself.
require "fileutils"
require "shellwords"

class Runners

  # Runs the WIsH program: builds a model from the host genomes, predicts
  # hosts for the viruses, then removes the intermediate model directory.
  #
  # Every argument is shell-escaped before being placed on the command
  # line, so paths containing spaces or shell metacharacters are passed to
  # the program literally (this also means a leading `~` is not expanded by
  # the shell).
  #
  # @param exe [String] path to the WIsH executable
  # @param vir_dir [String] directory passed to `-g` for the predict step
  # @param host_dir [String] directory passed to `-g` for the build step
  # @param outdir [String] directory passed to `-r`; the model is written
  #   to `<outdir>/model` and deleted afterwards
  # @param threads [Integer, String] value passed to `-t`
  #
  # @raise [AbortIf::Exit] if commands fail
  def self.wish(exe, vir_dir, host_dir, outdir, threads)
    model_dir = File.join outdir, "model"

    FileUtils.mkdir_p model_dir

    build_model = Shellwords.join(
      [exe, "-t", threads.to_s, "-c", "build", "-g", host_dir, "-m", model_dir]
    )

    predict = Shellwords.join(
      [exe, "-t", threads.to_s, "-c", "predict",
       "-g", vir_dir, "-m", model_dir, "-r", outdir, "-b"]
    )

    Process.run_and_time_it! "Building model", build_model
    Process.run_and_time_it! "Predicting host", predict

    FileUtils.rm_r model_dir if Dir.exist? model_dir
  end

  # Runs the VirHostMatcher Python script in d2star-only mode, then removes
  # the `tmp` directory and the extra report files it leaves in outdir.
  #
  # Arguments are shell-escaped in the same way as in {wish}.
  #
  # @param exe [String] path to the Python script, run as `python <exe>`
  # @param vir_dir [String] directory passed to `-v`
  # @param host_dir [String] directory passed to `-b`
  # @param outdir [String] directory passed to `-o`; created if missing
  def self.vir_host_matcher(exe, vir_dir, host_dir, outdir)
    FileUtils.mkdir_p outdir

    # "-d 1": only compute d2star dissimilarity
    cmd = Shellwords.join(
      ["python", exe, "-v", vir_dir, "-b", host_dir, "-o", outdir, "-d", "1"]
    )

    Process.run_and_time_it! "Computing d2star dissimilarity", cmd

    tmp_dir = File.join outdir, "tmp"
    FileUtils.rm_r tmp_dir if Dir.exist? tmp_dir

    # Files the script writes into outdir that are not wanted afterwards.
    bad_files = %w[d2star_k6_main.html hostTaxa.txt_new.txt]
    bad_files.each do |fname|
      path = File.join outdir, fname

      FileUtils.rm path if File.exist? path
    end
  end
end
100
+ end
Binary file
Binary file
Binary file
@@ -0,0 +1,381 @@
1
+ #!/usr/bin/env python
2
+ import os,sys
3
+ import optparse
4
+ import subprocess
5
+ from subprocess import call
6
+ import time
7
+ import platform
8
+ import numpy
9
+
10
+
11
+ prog_base = os.path.split(sys.argv[0])[1]
12
+
13
+ parser = optparse.OptionParser()
14
+ parser.add_option("-v", "--virusFaDir", action = "store", type = "string", dest = "virusFaDir",
15
+ help = "the directory to the folder containing bacteria virus fasta files")
16
+ parser.add_option("-b", "--hostFaDir", action = "store", type = "string", dest = "hostFaDir",
17
+ help = "the directory to the folder containing bacteria host fasta files")
18
+ parser.add_option("-o", "--out", action = "store", type = "string", dest = "outDir",
19
+ default='./', help = "output directory")
20
+ parser.add_option("-t", "--taxa", action = "store", type = "string", dest = "hostTaxaFile",
21
+ help = "the host taxa file (including the path) ")
22
+ parser.add_option("-d", "--d2star", action = "store", type = "string", dest = "onlyD2star",
23
+ default=0, help = "compute only d2star dissimilarity? 1 for yes, 0 for no")
24
+ #parser.add_option("-u", "--continue", action = "store", type = "string", dest = "onlyComputeMeasure",
25
+ # default=0, help = "kmer count is ready only compute measures? 1 for yes, 0 for no")
26
+ #parser.add_option("-k", "--kLen", action = "store", type = "string", dest = "kLen",
27
+ # help = "the length of k-tuple")
28
+
29
(options, args) = parser.parse_args()
# Both input directories are required; without them nothing can be counted.
if options.virusFaDir is None or options.hostFaDir is None:
    # The log file is opened only after the output directory has been
    # created further down, so this error can be reported on stderr only.
    # (Writing to the not-yet-defined log here used to raise NameError and
    # hide the usage text.)
    sys.stderr.write(prog_base + ": ERROR: missing required command-line argument\n")
    parser.print_help()
    # Non-zero status so callers can tell the run did not happen.
    sys.exit(1)
36
+
37
+
38
+ ## tmp file directory
39
+ if not os.path.exists(options.outDir) :
40
+ os.makedirs(options.outDir)
41
+ tmpDir = os.path.join(options.outDir, "tmp")
42
+ if not os.path.exists(tmpDir) :
43
+ os.makedirs(tmpDir)
44
+
45
+ ## log file ##
46
+ filelog = open(os.path.join(tmpDir, 'vhm.log'), 'w')
47
+
48
+ ## name length ##
49
+ nameLen = 93 - len(options.outDir)
50
+ #### possibly because of the kmercount folder name for each contig is too long?
51
+
52
+
53
+ #################### 0: preparation ############################
54
+
55
+ ## path to the programs
56
+ vhmPath = os.path.dirname(sys.argv[0])
57
+ if len(vhmPath) == 0 :
58
+ vhmPath="./"
59
+ #print vhmPath
60
+
61
+ ## kmer length and MC order
62
+ kmax = 6
63
+ order = 2
64
+
65
+ ## compile c++ code if not
66
+ optSys = platform.system()
67
+ if optSys == 'Linux' :
68
+ exePath = os.path.join(vhmPath, "bin", "linux64")
69
+ elif optSys == 'Darwin' :
70
+ exePath = os.path.join(vhmPath, "bin", "macDarwin")
71
+ elif optSys == 'Windows' :
72
+ exePath = os.path.join(vhmPath, "bin", "windows64")
73
+ else :
74
+ #sys.stderr.write("WARNING: can't recognize the operating system" + optSys + " \n")
75
+ exePath = "unknown"
76
+
77
+
78
+
79
+ ## countKmer c++ code
80
+ countKmerCpp = os.path.join(vhmPath, "countKmer.cpp")
81
+ countKmerOut = os.path.join(vhmPath, "countKmer.out")
82
+ if not os.path.exists(countKmerOut) :
83
+ ## recognize OS, copy exe files to vhmPath
84
+ sys.stderr.write("WARNING: can't find the file " + countKmerOut + ", try to copy one from bin \n")
85
+ filelog.write("WARNING: can't find the file " + countKmerOut + ", try to copy one from bin \n")
86
+ preCountKmerOut = os.path.join(exePath, "countKmer.out")
87
+ if exePath != "unknown" :
88
+ if os.path.exists(preCountKmerOut) :
89
+ os.system("cp " + preCountKmerOut + " " + vhmPath)
90
+ else :
91
+ sys.stderr.write( "ERROR: can't find file " + preCountKmerOut + " \n Please run make under the main directory! \n" )
92
+ filelog.write( "ERROR: can't find file " + preCountKmerOut + " \n Please run make under the main directory! \n" )
93
+ sys.exit(0)
94
+ else :
95
+ ## can't recognize OS, try to compile
96
+ sys.stderr.write("WARNING: can't recognize the operating system" + optSys + " \n")
97
+ sys.stderr.write( "Trying to compile..." + "\n")
98
+ filelog.write("WARNING: can't recognize the operating system" + optSys + " \n")
99
+ filelog.write( "Trying to compile..." + "\n")
100
+ if os.path.exists(countKmerCpp) :
101
+ os.system("g++ " + countKmerCpp + " -o " + countKmerOut + " -std=c++0x" )
102
+ else :
103
+ sys.stderr.write( "ERROR: can't find file " + countKmerCpp + " \n Please run make under the main directory! \n" )
104
+ filelog.write( "ERROR: can't find file " + countKmerCpp + " \n Please run make under the main directory! \n" )
105
+ sys.exit(0)
106
+
107
+ os.system("chmod 777 " + countKmerOut)
108
+ filelog.flush()
109
+
110
+ ## computeMeasure c++ code
111
+ computeMeasureCpp = os.path.join(vhmPath, "computeMeasure.cpp")
112
+ computeMeasureOut = os.path.join(vhmPath, "computeMeasure.out")
113
+ if not os.path.exists(computeMeasureOut) :
114
+ ## recognize OS, copy exe files to vhmPath
115
+ sys.stderr.write("WARNING: can't find the file " + computeMeasureOut + ", try to copy one from bin \n")
116
+ filelog.write("WARNING: can't find the file " + computeMeasureOut + ", try to copy one from bin \n")
117
+ preComputeMeasureOut = os.path.join(exePath, "computeMeasure.out")
118
+ if exePath != "unknown" :
119
+ if os.path.exists(preComputeMeasureOut) :
120
+ os.system("cp " + preComputeMeasureOut + " " + vhmPath)
121
+ else :
122
+ sys.stderr.write( "ERROR: can't find file " + preComputeMeasureOut + " \n Please run make under the main directory! \n" )
123
+ filelog.write( "ERROR: can't find file " + preComputeMeasureOut + " \n Please run make under the main directory! \n" )
124
+ sys.exit(0)
125
+ else :
126
+ ## can't recognize OS, try to compile
127
+ sys.stderr.write("WARNING: can't recognize the operating system" + optSys + " \n")
128
+ sys.stderr.write( "Trying to compile..." + "\n")
129
+ filelog.write("WARNING: can't recognize the operating system" + optSys + " \n")
130
+ filelog.write( "Trying to compile..." + "\n")
131
+ if os.path.exists(computeMeasureCpp) :
132
+ os.system("g++ " + computeMeasureCpp + " -o " + computeMeasureOut + " -std=c++0x" )
133
+ else :
134
+ sys.stderr.write( "ERROR: can't find file " + computeMeasureCpp + " \n Please run make under the main directory! \n" )
135
+ filelog.write( "ERROR: can't find file " + computeMeasureCpp + " \n Please run make under the main directory! \n" )
136
+ sys.exit(0)
137
+
138
+ os.system("chmod 777 " + computeMeasureOut)
139
+ filelog.flush()
140
+
141
+ ## computeMeasure c++ code
142
+ computed2starCpp = os.path.join(vhmPath, "computeMeasure_onlyd2star.cpp")
143
+ computed2starOut = os.path.join(vhmPath, "computeMeasure_onlyd2star.out")
144
+ if not os.path.exists(computed2starOut) :
145
+ ## recognize OS, copy exe files to vhmPath
146
+ sys.stderr.write("WARNING: can't find the file " + computed2starOut + ", try to copy one from bin \n")
147
+ preComputed2starOut = os.path.join(exePath, "computeMeasure_onlyd2star.out")
148
+ if exePath != "unknown" :
149
+ if os.path.exists(preComputed2starOut) :
150
+ os.system("cp " + preComputed2starOut + " " + vhmPath)
151
+ else :
152
+ sys.stderr.write( "ERROR: can't find file " + preComputed2starOut + " \n Please run make under the main directory! \n" )
153
+ filelog.write( "ERROR: can't find file " + preComputed2starOut + " \n Please run make under the main directory! \n" )
154
+ sys.exit(0)
155
+ else :
156
+ ## can't recognize OS, try to compile
157
+ sys.stderr.write("WARNING: can't recognize the operating system" + optSys + " \n")
158
+ sys.stderr.write( "Trying to compile..." + "\n")
159
+ filelog.write("WARNING: can't recognize the operating system" + optSys + " \n")
160
+ filelog.write( "Trying to compile..." + "\n")
161
+ if os.path.exists(computed2starCpp) :
162
+ os.system("g++ " + computed2starCpp + " -o " + computed2starOut + " -std=c++0x" )
163
+ else :
164
+ sys.stderr.write( "ERROR: can't find file " + computed2starCpp + " \n Please run make under the main directory! \n" )
165
+ filelog.write( "ERROR: can't find file " + computed2starCpp + " \n Please run make under the main directory! \n" )
166
+ sys.exit(0)
167
+
168
+ os.system("chmod 777 " + computed2starOut)
169
+ filelog.flush()
170
+
171
+ ## if only compute d2star
172
+ if int(options.onlyD2star) == 1 :
173
+ computeMeasureOut=computed2starOut
174
+ else :
175
+ computeMeasureOut=computeMeasureOut
176
+
177
+
178
+ ## kmer count directory
179
+ kmerCountPath = os.path.join(tmpDir, "KC")
180
+ if not os.path.exists(kmerCountPath) :
181
+ os.makedirs(kmerCountPath)
182
+
183
+ ## virusFaList, hostFaList
184
+ virusFaList = os.listdir(options.virusFaDir)
185
+ #virusFaList = os.path.join(options.virusFaDir, os.listdir(options.virusFaDir))
186
+ hostFaList = os.listdir(options.hostFaDir)
187
+
188
+ ## virus list file, host list file
189
+ virusListFile = os.path.join(tmpDir, "virusList")
190
+ hostListFile = os.path.join(tmpDir, "hostList")
191
+
192
+ virusListFileWrite = open(virusListFile, 'w') ## make file blank
193
+ virusListFileWrite.close()
194
+ virusListFileWrite = open(virusListFile, 'a')
195
+
196
+ hostListFileWrite = open(hostListFile, 'w') ## make file blank
197
+ hostListFileWrite.close()
198
+ hostListFileWrite = open(hostListFile, 'a')
199
+
200
+ #time.sleep(6) # delays for 10 seconds
201
+
202
+ ################ 00: hostTaxa issues: ##########################
203
+ ############## 1. hostName=hostFileName(with extension) ######
204
+ ############## 2. hostTaxa should be no missing ########
205
+
206
+ #################### 00: if hostTaxa missing ###################
207
+ if options.hostTaxaFile is None :
208
+ hostTaxaFile = os.path.join(options.outDir, "hostTaxa.txt_new.txt")
209
+ sys.stdout.write("WARNING: no hostTaxa file provided, creating a dummy one \n")
210
+ filelog.write("WARNING: no hostTaxa file provided, creating a dummy one \n")
211
+ hostTaxaFileWrite = open(hostTaxaFile, 'w') ## make file blank
212
+ hostTaxaFileWrite.close()
213
+ hostTaxaFileWrite = open(hostTaxaFile, 'a')
214
+
215
+ hostTaxaFileWrite.write("hostNCBIName hostName hostSuperkingdom hostPhylum hostClass hostOrder hostFamily hostGenus hostSpecies\n")
216
+ for currentFileName in hostFaList :
217
+ if currentFileName.startswith('.') :
218
+ continue
219
+ if len(currentFileName) > nameLen :
220
+ sys.stdout.write( "WARNING: the file name has more than " + str(nameLen) + " letters! Use the first " + str(nameLen) + " letters as the name \n")
221
+ filelog.write( "WARNING: the file name has more than " + str(nameLen) + " letters! Use the first " + str(nameLen) + " letters as the name \n")
222
+ currentFileNameS = currentFileName[:nameLen]
223
+ else :
224
+ currentFileNameS = currentFileName
225
+ hostTaxaFileWrite.write(currentFileNameS)
226
+ for i in range(1,9) :
227
+ hostTaxaFileWrite.write("\t" + "unknown")
228
+ hostTaxaFileWrite.write("\n")
229
+ hostTaxaFileWrite.close()
230
+ options.hostTaxaFile = hostTaxaFile
231
+
232
+ ################### REFORMAT hostTaxa (fill missing) ###############
233
+ else :
234
+ hostTaxaFile = os.path.join(options.outDir, os.path.basename(options.hostTaxaFile)+"_new.txt")
235
+ hostTaxaTable = numpy.genfromtxt(options.hostTaxaFile,delimiter="\t", dtype=str)
236
+ hostTaxaTable[hostTaxaTable=='']='unknown'
237
+ numpy.savetxt(hostTaxaFile, hostTaxaTable, fmt="%s", delimiter='\t', newline='\n')
238
+
239
+ filelog.flush()
240
+
241
+ #################### 1: count kmer and prepare list files ############################
242
+ #sys.stdout.write("Step 1: counting kmers \n")
243
+
244
+ start_time = time.time()
245
+ count = 0
246
+
247
+ for currentFileName in virusFaList :
248
+ if currentFileName.startswith('.') :
249
+ continue
250
+ if os.path.isdir(os.path.join(options.virusFaDir, currentFileName)) :
251
+ sys.stderr.write( "ERROR: zero bytes of file " + currentFileName + "\n")
252
+ filelog.write( "ERROR: zero bytes of file " + currentFileName + "\n")
253
+ sys.exit(0)
254
+ if len(currentFileName) > nameLen :
255
+ currentFileNameS = currentFileName[:nameLen]
256
+ else :
257
+ currentFileNameS = currentFileName
258
+ sys.stdout.write("Step 1: counting kmers for virus " + currentFileNameS + "\n")
259
+ filelog.write("Step 1: counting kmers for virus " + currentFileNameS + "\n")
260
+ for w in range(1, (kmax+1)) :
261
+ currentFilePath = os.path.join(options.virusFaDir, currentFileName)
262
+
263
+ currentKmerCountPath = os.path.join(kmerCountPath, currentFileNameS)
264
+ cmdKmer = countKmerOut + " -l -k " + str(w) + \
265
+ " -i " + currentFilePath +\
266
+ " -o " + currentKmerCountPath +\
267
+ " -s " + currentFileNameS
268
+ cmdKmerOut = subprocess.Popen(cmdKmer, shell=True, \
269
+ stderr = subprocess.PIPE, \
270
+ stdout = subprocess.PIPE)
271
+ cmdKmerOut.wait()
272
+
273
+ if len(os.listdir(currentKmerCountPath)) == ( kmax + 1 ):
274
+ virusListFileWrite.write(currentFileNameS + " " + \
275
+ currentKmerCountPath + " " +\
276
+ str(2) + "\n")
277
+ else :
278
+ sys.stderr.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
279
+ filelog.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
280
+ sys.exit(0)
281
+
282
+ filelog.flush()
283
+
284
+ end_time = time.time()
285
+ count += 1
286
+ #sys.stdout.write(str(end_time - start_time) + "s for " + str(count) + " seqs \n")
287
+ sys.stdout.write(" (Average time for counting kmers for one virus: " + str(round((end_time - start_time)/count,4)) + "s) \n" )
288
+ sys.stdout.write(" (ETR for counting kmers for viruses: " + str(round((end_time - start_time)/count*(len(virusFaList)-count), 4)) + "s) \n")
289
+ filelog.write(" (Average time for counting kmers for one virus: " + str(round((end_time - start_time)/count,4)) + "s) \n" )
290
+ filelog.write(" (ETR for counting kmers for viruses: " + str(round((end_time - start_time)/count*(len(virusFaList)-count), 4)) + "s) \n")
291
+
292
+ virusListFileWrite.close()
293
+
294
+
295
+ start_time = time.time()
296
+ count = 0
297
+
298
+ for currentFileName in hostFaList :
299
+ if currentFileName.startswith('.') :
300
+ continue
301
+ if os.path.isdir(os.path.join(options.hostFaDir, currentFileName)) :
302
+ sys.stderr.write( "ERROR: zero bytes of file " + currentFileName + "\n")
303
+ filelog.write( "ERROR: zero bytes of file " + currentFileName + "\n")
304
+ sys.exit(0)
305
+ if len(currentFileName) > nameLen :
306
+ currentFileNameS = currentFileName[:nameLen]
307
+ else :
308
+ currentFileNameS = currentFileName
309
+ sys.stdout.write("Step 1: counting kmers for host " + currentFileNameS + "\n")
310
+ filelog.write("Step 1: counting kmers for host " + currentFileNameS + "\n")
311
+ for w in range(1, (kmax+1)) :
312
+ currentFilePath = os.path.join(options.hostFaDir, currentFileName)
313
+ currentKmerCountPath = os.path.join(kmerCountPath, currentFileNameS)
314
+ cmdKmer = countKmerOut + " -l -k " + str(w) + \
315
+ " -i " + currentFilePath +\
316
+ " -o " + currentKmerCountPath +\
317
+ " -s " + currentFileNameS
318
+ #print cmdKmer
319
+ cmdKmerOut = subprocess.Popen(cmdKmer, shell=True, \
320
+ stderr = subprocess.PIPE, \
321
+ stdout = subprocess.PIPE)
322
+ cmdKmerOut.wait()
323
+
324
+ if len(os.listdir(currentKmerCountPath)) == ( kmax + 1 ) :
325
+ hostListFileWrite.write(currentFileNameS + " " + \
326
+ currentKmerCountPath + " " +\
327
+ str(2) + "\n")
328
+ else :
329
+ sys.stderr.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
330
+ filelog.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
331
+ sys.exit(0)
332
+
333
+ filelog.flush()
334
+
335
+ end_time = time.time()
336
+ count += 1
337
+ #sys.stdout.write(str(end_time - start_time) + "s for " + str(count) + " seqs \n")
338
+ sys.stdout.write(" (Average time for counting kmers for one host: " + str(round((end_time - start_time)/count, 4)) + "s) \n")
339
+ sys.stdout.write(" (ETR for counting kmers for hosts: " + str(round((end_time - start_time)/count*(len(hostFaList)-count), 4)) + "s) \n")
340
+ filelog.write(" (Average time for counting kmers for one host: " + str(round((end_time - start_time)/count, 4)) + "s) \n")
341
+ filelog.write(" (ETR for counting kmers for hosts: " + str(round((end_time - start_time)/count*(len(hostFaList)-count), 4)) + "s) \n")
342
+
343
+ hostListFileWrite.close()
344
+
345
+
346
+
347
+ #time.sleep(6) # delays for 10 seconds
348
+
349
+ ################### 2: compute measures #####################
350
+ start_time = time.time()
351
+ count = 0
352
+
353
+ sys.stdout.write("Step 2: compute distance/dissimilarity measures \n")
354
+ filelog.write("Step 2: compute distance/dissimilarity measures \n")
355
+ cmdCptMeasure = computeMeasureOut + " -k " + str(kmax) + \
356
+ " -i " + virusListFile + " -j " + hostListFile + \
357
+ " -o " + options.outDir + " -t " + hostTaxaFile
358
+ print(cmdCptMeasure)
359
+
360
+ #with open(os.path.join(tmpDir, 'computeMeasureOut.log'), 'w') as filelog:
361
+ cmdCptMeasureOut = subprocess.Popen(cmdCptMeasure, shell=True, \
362
+ stderr = subprocess.PIPE, \
363
+ stdout = subprocess.PIPE)
364
+ #for c in iter(lambda: cmdCptMeasureOut.stderr.read(1), ''):
365
+ for c in iter(cmdCptMeasureOut.stderr.readline, b''):
366
+ sys.stdout.write(c.decode("utf-8"))
367
+ filelog.write(c.decode("utf-8"))
368
+ filelog.flush()
369
+
370
+ end_time = time.time()
371
+ count += 1
372
+ sys.stdout.write(" (Average time for computing dissimilarities for one virus-host pair: " + str(round((end_time - start_time)/count/len(virusFaList), 4)) + "s) \n")
373
+ sys.stdout.write(" (ETR for computing dissimilarities for virus-host pairs: " + str(round((end_time - start_time)/count*(len(hostFaList)-count), 4)) + "s) \n")
374
+ filelog.write(" (Average time for computing dissimilarities for one virus-host pair: " + str(round((end_time - start_time)/count/len(virusFaList), 4)) + "s) \n")
375
+ filelog.write(" (ETR for computing dissimilarities for virus-host pairs: " + str(round((end_time - start_time)/count*(len(hostFaList)-count), 4)) + "s) \n")
376
+
377
+ cmdCptMeasureOut.wait()
378
+
379
+ filelog.close()
380
+
381
+ sys.stdout.write("done \n")