big_simon 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile.lock ADDED
@@ -0,0 +1,41 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ big_simon (0.0.1)
5
+ rya (~> 0.1.3)
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ abort_if (0.2.0)
11
+ diff-lcs (1.3)
12
+ rake (10.5.0)
13
+ rspec (3.7.0)
14
+ rspec-core (~> 3.7.0)
15
+ rspec-expectations (~> 3.7.0)
16
+ rspec-mocks (~> 3.7.0)
17
+ rspec-core (3.7.1)
18
+ rspec-support (~> 3.7.0)
19
+ rspec-expectations (3.7.0)
20
+ diff-lcs (>= 1.2.0, < 2.0)
21
+ rspec-support (~> 3.7.0)
22
+ rspec-mocks (3.7.0)
23
+ diff-lcs (>= 1.2.0, < 2.0)
24
+ rspec-support (~> 3.7.0)
25
+ rspec-support (3.7.1)
26
+ rya (0.1.3)
27
+ abort_if (~> 0.2.0)
28
+ systemu (~> 2.6, >= 2.6.5)
29
+ systemu (2.6.5)
30
+
31
+ PLATFORMS
32
+ ruby
33
+
34
+ DEPENDENCIES
35
+ big_simon!
36
+ bundler (~> 1.16)
37
+ rake (~> 10.0)
38
+ rspec (~> 3.0)
39
+
40
+ BUNDLED WITH
41
+ 1.16.3
data/README.md ADDED
@@ -0,0 +1,37 @@
1
+ # BigSimon
2
+
3
+ A pipeline for finding hosts of viruses!
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'big_simon'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install big_simon
20
+
21
+ ## Usage
22
+
23
+ TODO
24
+
25
+ ## Development
26
+
27
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
28
+
29
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
30
+
31
+ ## Contributing
32
+
33
+ Bug reports and pull requests are welcome on GitHub at https://github.com/mooreryan/big_simon. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
34
+
35
+ ## Code of Conduct
36
+
37
+ Everyone interacting in the BigSimon project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/mooreryan/big_simon/blob/master/CODE_OF_CONDUCT.md).
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
data/big_simon.gemspec ADDED
@@ -0,0 +1,30 @@
1
+
2
+ lib = File.expand_path("../lib", __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require "big_simon/version"
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "big_simon"
8
+ spec.version = BigSimon::VERSION
9
+ spec.authors = ["Ryan Moore"]
10
+ spec.email = ["moorer@udel.edu"]
11
+
12
+ spec.summary = %q{Viral host discovery pipeline.}
13
+ spec.description = %q{Viral host discovery pipeline.}
14
+ spec.homepage = "https://github.com/mooreryan/big_simon"
15
+
16
+ # Specify which files should be added to the gem when it is released.
17
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
18
+ spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
19
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
20
+ end
21
+ spec.bindir = "exe"
22
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
+ spec.require_paths = ["lib"]
24
+
25
+ spec.add_development_dependency "bundler", "~> 1.16"
26
+ spec.add_development_dependency "rake", "~> 10.0"
27
+ spec.add_development_dependency "rspec", "~> 3.0"
28
+
29
+ spec.add_runtime_dependency "rya", "~> 0.1.3"
30
+ end
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "big_simon"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,3 @@
1
+ module BigSimon
2
+ VERSION = "0.0.1"
3
+ end
data/lib/big_simon.rb ADDED
@@ -0,0 +1,100 @@
1
+ require "rya"
2
+
3
+ require "big_simon/version"
4
+
5
+ Time.extend Rya::CoreExtensions::Time
6
+ Process.extend Rya::CoreExtensions::Process
7
+
8
+ module BigSimon
9
+
10
+ # Project directories
11
+ ROOT = File.join __dir__, ".."
12
+ BIN = File.join ROOT, "vendor", "bin", "mac"
13
+ SPEC = File.join ROOT, "spec"
14
+ TEST_FILES = File.join SPEC, "test_files"
15
+
16
class Parsers

  # Parses a comma-separated dissimilarity table into per-virus host rankings.
  #
  # The first line of the file is treated as the header: its first cell is
  # ignored and the remaining cells are taken as host names.  Every later
  # line is read as a virus name followed by one value per host column.  A
  # single trailing comma on any line is dropped before splitting.
  #
  # Values are converted with +to_f+, so a non-numeric cell becomes 0.0.
  # Lines that carry no values (blank lines, or a name with nothing after
  # it) are skipped rather than raising.
  #
  # @param fname [String] path to the table file
  #
  # @return [Hash{String => Hash}] virus name => { best: host with the
  #   smallest value, all: [[host, value], ...] sorted by ascending value }
  def self.vir_host_matcher(fname)
    hosts = nil
    host_info = {}

    # Block form so the file handle is closed even if parsing raises.
    File.open(fname, "rt") do |file|
      file.each_line.with_index do |line, line_idx|
        # Get rid of the newline and of a trailing comma, then split.
        fields = line.chomp.sub(/,$/, "").split(",")

        if line_idx.zero?
          # Header row: the first cell is only a label for the table.
          _label, *hosts = fields
          next
        end

        virus, *raw_dists = fields

        # Nothing to rank on this line.
        next if raw_dists.empty?

        # Pair each value with the host of the same column.  Zipping from
        # the values keeps one entry per value even if the header is short.
        dists = raw_dists.
                  zip(hosts).
                  map { |dist, host| [host, dist.to_f] }.
                  sort_by { |_, dist| dist }

        host_info[virus] = {
          best: dists.first.first,
          all: dists
        }
      end
    end

    host_info
  end
end
48
+
49
class Runners
  # Both are standard library; loaded here so this class does not depend on
  # another library having required them first.
  require "fileutils"
  require "shellwords"

  # Runs the WIsH program: builds a model from the host genomes, predicts
  # hosts for the viruses, then removes the model directory.
  #
  # The model is written to a "model" directory inside +outdir+, which is
  # created if needed.  All arguments are shell-escaped, so paths that
  # contain spaces or other shell metacharacters are passed through intact.
  #
  # @param exe [String] path to the WIsH executable
  # @param vir_dir [String] passed to the predict step as -g
  # @param host_dir [String] passed to the build step as -g
  # @param outdir [String] passed to the predict step as -r
  # @param threads [Integer, String] passed to both steps as -t
  #
  # @raise [AbortIf::Exit] if commands fail
  def self.wish(exe, vir_dir, host_dir, outdir, threads)
    model_dir = File.join(outdir, "model")

    FileUtils.mkdir_p(model_dir)

    build_model = shell_command(exe,
                                "-t", threads,
                                "-c", "build",
                                "-g", host_dir,
                                "-m", model_dir)

    predict = shell_command(exe,
                            "-t", threads,
                            "-c", "predict",
                            "-g", vir_dir,
                            "-m", model_dir,
                            "-r", outdir,
                            "-b")

    Process.run_and_time_it! "Building model", build_model
    Process.run_and_time_it! "Predicting host", predict

    FileUtils.rm_r(model_dir) if Dir.exist?(model_dir)
  end

  # Runs the VirHostMatcher Python script (via `python`) computing only the
  # d2star dissimilarity, then deletes the script's "tmp" directory and two
  # by-product files from +outdir+.
  #
  # +outdir+ is created if needed.  All arguments are shell-escaped, so
  # paths that contain spaces or other shell metacharacters are passed
  # through intact.
  #
  # @param exe [String] path to the Python script
  # @param vir_dir [String] passed to the script as -v
  # @param host_dir [String] passed to the script as -b
  # @param outdir [String] passed to the script as -o
  def self.vir_host_matcher(exe, vir_dir, host_dir, outdir)
    FileUtils.mkdir_p(outdir)

    cmd = shell_command("python", exe,
                        "-v", vir_dir,
                        "-b", host_dir,
                        "-o", outdir,
                        "-d", "1") # only compute d2star dissimilarity

    Process.run_and_time_it! "Computing d2star dissimilarity", cmd

    tmp_dir = File.join(outdir, "tmp")
    FileUtils.rm_r(tmp_dir) if Dir.exist?(tmp_dir)

    bad_files = %w[d2star_k6_main.html hostTaxa.txt_new.txt]
    bad_files.each do |fname|
      path = File.join(outdir, fname)

      FileUtils.rm(path) if File.exist?(path)
    end
  end

  # Joins the arguments into one shell-escaped command string.
  def self.shell_command(*args)
    Shellwords.join(args.map(&:to_s))
  end
  private_class_method :shell_command
end
100
+ end
Binary file
Binary file
Binary file
@@ -0,0 +1,381 @@
1
+ #!/usr/bin/env python
2
+ import os,sys
3
+ import optparse
4
+ import subprocess
5
+ from subprocess import call
6
+ import time
7
+ import platform
8
+ import numpy
9
+
10
+
11
+ prog_base = os.path.split(sys.argv[0])[1]
12
+
13
+ parser = optparse.OptionParser()
14
+ parser.add_option("-v", "--virusFaDir", action = "store", type = "string", dest = "virusFaDir",
15
+ help = "the directory to the folder containing bacteria virus fasta files")
16
+ parser.add_option("-b", "--hostFaDir", action = "store", type = "string", dest = "hostFaDir",
17
+ help = "the directory to the folder containing bacteria host fasta files")
18
+ parser.add_option("-o", "--out", action = "store", type = "string", dest = "outDir",
19
+ default='./', help = "output directory")
20
+ parser.add_option("-t", "--taxa", action = "store", type = "string", dest = "hostTaxaFile",
21
+ help = "the host taxa file (including the path) ")
22
+ parser.add_option("-d", "--d2star", action = "store", type = "string", dest = "onlyD2star",
23
+ default=0, help = "compute only d2star dissimilarity? 1 for yes, 0 for no")
24
+ #parser.add_option("-u", "--continue", action = "store", type = "string", dest = "onlyComputeMeasure",
25
+ # default=0, help = "kmer count is ready only compute measures? 1 for yes, 0 for no")
26
+ #parser.add_option("-k", "--kLen", action = "store", type = "string", dest = "kLen",
27
+ # help = "the length of k-tuple")
28
+
29
(options, args) = parser.parse_args()

# Both FASTA directories are required; everything below reads from them.
if options.virusFaDir is None or options.hostFaDir is None:
    # The log file is opened further down, after the output directory has
    # been created, so at this point the error can only go to stderr.
    sys.stderr.write(prog_base + ": ERROR: missing required command-line argument\n")
    parser.print_help()
    # Exit non-zero so a calling process can detect the failure.
    sys.exit(1)
36
+
37
+
38
+ ## tmp file directory
39
+ if not os.path.exists(options.outDir) :
40
+ os.makedirs(options.outDir)
41
+ tmpDir = os.path.join(options.outDir, "tmp")
42
+ if not os.path.exists(tmpDir) :
43
+ os.makedirs(tmpDir)
44
+
45
+ ## log file ##
46
+ filelog = open(os.path.join(tmpDir, 'vhm.log'), 'w')
47
+
48
+ ## name length ##
49
+ nameLen = 93 - len(options.outDir)
50
+ #### names longer than nameLen are truncated below -- possibly because the kmer-count folder path for each contig would otherwise be too long?
51
+
52
+
53
+ #################### 0: preparation ############################
54
+
55
+ ## path to the programs
56
+ vhmPath = os.path.dirname(sys.argv[0])
57
+ if len(vhmPath) == 0 :
58
+ vhmPath="./"
59
+ #print vhmPath
60
+
61
+ ## kmer length and MC order
62
+ kmax = 6
63
+ order = 2
64
+
65
+ ## pick the directory of precompiled binaries for this operating system
66
+ optSys = platform.system()
67
+ if optSys == 'Linux' :
68
+ exePath = os.path.join(vhmPath, "bin", "linux64")
69
+ elif optSys == 'Darwin' :
70
+ exePath = os.path.join(vhmPath, "bin", "macDarwin")
71
+ elif optSys == 'Windows' :
72
+ exePath = os.path.join(vhmPath, "bin", "windows64")
73
+ else :
74
+ #sys.stderr.write("WARNING: can't recognize the operating system" + optSys + " \n")
75
+ exePath = "unknown"
76
+
77
+
78
+
79
+ ## countKmer c++ code
80
+ countKmerCpp = os.path.join(vhmPath, "countKmer.cpp")
81
+ countKmerOut = os.path.join(vhmPath, "countKmer.out")
82
+ if not os.path.exists(countKmerOut) :
83
+ ## recognize OS, copy exe files to vhmPath
84
+ sys.stderr.write("WARNING: can't find the file " + countKmerOut + ", try to copy one from bin \n")
85
+ filelog.write("WARNING: can't find the file " + countKmerOut + ", try to copy one from bin \n")
86
+ preCountKmerOut = os.path.join(exePath, "countKmer.out")
87
+ if exePath != "unknown" :
88
+ if os.path.exists(preCountKmerOut) :
89
+ os.system("cp " + preCountKmerOut + " " + vhmPath)
90
+ else :
91
+ sys.stderr.write( "ERROR: can't find file " + preCountKmerOut + " \n Please run make under the main directory! \n" )
92
+ filelog.write( "ERROR: can't find file " + preCountKmerOut + " \n Please run make under the main directory! \n" )
93
+ sys.exit(0)
94
+ else :
95
+ ## can't recognize OS, try to compile
96
+ sys.stderr.write("WARNING: can't recognize the operating system" + optSys + " \n")
97
+ sys.stderr.write( "Trying to compile..." + "\n")
98
+ filelog.write("WARNING: can't recognize the operating system" + optSys + " \n")
99
+ filelog.write( "Trying to compile..." + "\n")
100
+ if os.path.exists(countKmerCpp) :
101
+ os.system("g++ " + countKmerCpp + " -o " + countKmerOut + " -std=c++0x" )
102
+ else :
103
+ sys.stderr.write( "ERROR: can't find file " + countKmerCpp + " \n Please run make under the main directory! \n" )
104
+ filelog.write( "ERROR: can't find file " + countKmerCpp + " \n Please run make under the main directory! \n" )
105
+ sys.exit(0)
106
+
107
+ os.system("chmod 777 " + countKmerOut)
108
+ filelog.flush()
109
+
110
+ ## computeMeasure c++ code
111
+ computeMeasureCpp = os.path.join(vhmPath, "computeMeasure.cpp")
112
+ computeMeasureOut = os.path.join(vhmPath, "computeMeasure.out")
113
+ if not os.path.exists(computeMeasureOut) :
114
+ ## recognize OS, copy exe files to vhmPath
115
+ sys.stderr.write("WARNING: can't find the file " + computeMeasureOut + ", try to copy one from bin \n")
116
+ filelog.write("WARNING: can't find the file " + computeMeasureOut + ", try to copy one from bin \n")
117
+ preComputeMeasureOut = os.path.join(exePath, "computeMeasure.out")
118
+ if exePath != "unknown" :
119
+ if os.path.exists(preComputeMeasureOut) :
120
+ os.system("cp " + preComputeMeasureOut + " " + vhmPath)
121
+ else :
122
+ sys.stderr.write( "ERROR: can't find file " + preComputeMeasureOut + " \n Please run make under the main directory! \n" )
123
+ filelog.write( "ERROR: can't find file " + preComputeMeasureOut + " \n Please run make under the main directory! \n" )
124
+ sys.exit(0)
125
+ else :
126
+ ## can't recognize OS, try to compile
127
+ sys.stderr.write("WARNING: can't recognize the operating system" + optSys + " \n")
128
+ sys.stderr.write( "Trying to compile..." + "\n")
129
+ filelog.write("WARNING: can't recognize the operating system" + optSys + " \n")
130
+ filelog.write( "Trying to compile..." + "\n")
131
+ if os.path.exists(computeMeasureCpp) :
132
+ os.system("g++ " + computeMeasureCpp + " -o " + computeMeasureOut + " -std=c++0x" )
133
+ else :
134
+ sys.stderr.write( "ERROR: can't find file " + computeMeasureCpp + " \n Please run make under the main directory! \n" )
135
+ filelog.write( "ERROR: can't find file " + computeMeasureCpp + " \n Please run make under the main directory! \n" )
136
+ sys.exit(0)
137
+
138
+ os.system("chmod 777 " + computeMeasureOut)
139
+ filelog.flush()
140
+
141
+ ## computeMeasure_onlyd2star c++ code
142
+ computed2starCpp = os.path.join(vhmPath, "computeMeasure_onlyd2star.cpp")
143
+ computed2starOut = os.path.join(vhmPath, "computeMeasure_onlyd2star.out")
144
+ if not os.path.exists(computed2starOut) :
145
+ ## recognize OS, copy exe files to vhmPath
146
+ sys.stderr.write("WARNING: can't find the file " + computed2starOut + ", try to copy one from bin \n")
147
+ preComputed2starOut = os.path.join(exePath, "computeMeasure_onlyd2star.out")
148
+ if exePath != "unknown" :
149
+ if os.path.exists(preComputed2starOut) :
150
+ os.system("cp " + preComputed2starOut + " " + vhmPath)
151
+ else :
152
+ sys.stderr.write( "ERROR: can't find file " + preComputed2starOut + " \n Please run make under the main directory! \n" )
153
+ filelog.write( "ERROR: can't find file " + preComputed2starOut + " \n Please run make under the main directory! \n" )
154
+ sys.exit(0)
155
+ else :
156
+ ## can't recognize OS, try to compile
157
+ sys.stderr.write("WARNING: can't recognize the operating system" + optSys + " \n")
158
+ sys.stderr.write( "Trying to compile..." + "\n")
159
+ filelog.write("WARNING: can't recognize the operating system" + optSys + " \n")
160
+ filelog.write( "Trying to compile..." + "\n")
161
+ if os.path.exists(computed2starCpp) :
162
+ os.system("g++ " + computed2starCpp + " -o " + computed2starOut + " -std=c++0x" )
163
+ else :
164
+ sys.stderr.write( "ERROR: can't find file " + computed2starCpp + " \n Please run make under the main directory! \n" )
165
+ filelog.write( "ERROR: can't find file " + computed2starCpp + " \n Please run make under the main directory! \n" )
166
+ sys.exit(0)
167
+
168
+ os.system("chmod 777 " + computed2starOut)
169
+ filelog.flush()
170
+
171
+ ## if only compute d2star
172
+ if int(options.onlyD2star) == 1 :
173
+ computeMeasureOut=computed2starOut
174
+ else :
175
+ computeMeasureOut=computeMeasureOut
176
+
177
+
178
+ ## kmer count directory
179
+ kmerCountPath = os.path.join(tmpDir, "KC")
180
+ if not os.path.exists(kmerCountPath) :
181
+ os.makedirs(kmerCountPath)
182
+
183
+ ## virusFaList, hostFaList
184
+ virusFaList = os.listdir(options.virusFaDir)
185
+ #virusFaList = os.path.join(options.virusFaDir, os.listdir(options.virusFaDir))
186
+ hostFaList = os.listdir(options.hostFaDir)
187
+
188
+ ## virus list file, host list file
189
+ virusListFile = os.path.join(tmpDir, "virusList")
190
+ hostListFile = os.path.join(tmpDir, "hostList")
191
+
192
+ virusListFileWrite = open(virusListFile, 'w') ## make file blank
193
+ virusListFileWrite.close()
194
+ virusListFileWrite = open(virusListFile, 'a')
195
+
196
+ hostListFileWrite = open(hostListFile, 'w') ## make file blank
197
+ hostListFileWrite.close()
198
+ hostListFileWrite = open(hostListFile, 'a')
199
+
200
+ #time.sleep(6) # delays for 6 seconds
201
+
202
+ ################ 00: hostTaxa issues: ##########################
203
+ ############## 1. hostName=hostFileName(with extension) ######
204
+ ############## 2. hostTaxa should be no missing ########
205
+
206
+ #################### 00: if hostTaxa missing ###################
207
+ if options.hostTaxaFile is None :
208
+ hostTaxaFile = os.path.join(options.outDir, "hostTaxa.txt_new.txt")
209
+ sys.stdout.write("WARNING: no hostTaxa file provided, creating a dummy one \n")
210
+ filelog.write("WARNING: no hostTaxa file provided, creating a dummy one \n")
211
+ hostTaxaFileWrite = open(hostTaxaFile, 'w') ## make file blank
212
+ hostTaxaFileWrite.close()
213
+ hostTaxaFileWrite = open(hostTaxaFile, 'a')
214
+
215
+ hostTaxaFileWrite.write("hostNCBIName hostName hostSuperkingdom hostPhylum hostClass hostOrder hostFamily hostGenus hostSpecies\n")
216
+ for currentFileName in hostFaList :
217
+ if currentFileName.startswith('.') :
218
+ continue
219
+ if len(currentFileName) > nameLen :
220
+ sys.stdout.write( "WARNING: the file name has more than " + str(nameLen) + " letters! Use the first " + str(nameLen) + " letters as the name \n")
221
+ filelog.write( "WARNING: the file name has more than " + str(nameLen) + " letters! Use the first " + str(nameLen) + " letters as the name \n")
222
+ currentFileNameS = currentFileName[:nameLen]
223
+ else :
224
+ currentFileNameS = currentFileName
225
+ hostTaxaFileWrite.write(currentFileNameS)
226
+ for i in range(1,9) :
227
+ hostTaxaFileWrite.write("\t" + "unknown")
228
+ hostTaxaFileWrite.write("\n")
229
+ hostTaxaFileWrite.close()
230
+ options.hostTaxaFile = hostTaxaFile
231
+
232
+ ################### REFORMAT hostTaxa (fill missing) ###############
233
+ else :
234
+ hostTaxaFile = os.path.join(options.outDir, os.path.basename(options.hostTaxaFile)+"_new.txt")
235
+ hostTaxaTable = numpy.genfromtxt(options.hostTaxaFile,delimiter="\t", dtype=str)
236
+ hostTaxaTable[hostTaxaTable=='']='unknown'
237
+ numpy.savetxt(hostTaxaFile, hostTaxaTable, fmt="%s", delimiter='\t', newline='\n')
238
+
239
+ filelog.flush()
240
+
241
+ #################### 1: count kmer and prepare list files ############################
242
+ #sys.stdout.write("Step 1: counting kmers \n")
243
+
244
+ start_time = time.time()
245
+ count = 0
246
+
247
+ for currentFileName in virusFaList :
248
+ if currentFileName.startswith('.') :
249
+ continue
250
+ if os.path.isdir(os.path.join(options.virusFaDir, currentFileName)) :
251
+ sys.stderr.write( "ERROR: zero bytes of file " + currentFileName + "\n")
252
+ filelog.write( "ERROR: zero bytes of file " + currentFileName + "\n")
253
+ sys.exit(0)
254
+ if len(currentFileName) > nameLen :
255
+ currentFileNameS = currentFileName[:nameLen]
256
+ else :
257
+ currentFileNameS = currentFileName
258
+ sys.stdout.write("Step 1: counting kmers for virus " + currentFileNameS + "\n")
259
+ filelog.write("Step 1: counting kmers for virus " + currentFileNameS + "\n")
260
+ for w in range(1, (kmax+1)) :
261
+ currentFilePath = os.path.join(options.virusFaDir, currentFileName)
262
+
263
+ currentKmerCountPath = os.path.join(kmerCountPath, currentFileNameS)
264
+ cmdKmer = countKmerOut + " -l -k " + str(w) + \
265
+ " -i " + currentFilePath +\
266
+ " -o " + currentKmerCountPath +\
267
+ " -s " + currentFileNameS
268
+ cmdKmerOut = subprocess.Popen(cmdKmer, shell=True, \
269
+ stderr = subprocess.PIPE, \
270
+ stdout = subprocess.PIPE)
271
+ cmdKmerOut.wait()
272
+
273
+ if len(os.listdir(currentKmerCountPath)) == ( kmax + 1 ):
274
+ virusListFileWrite.write(currentFileNameS + " " + \
275
+ currentKmerCountPath + " " +\
276
+ str(2) + "\n")
277
+ else :
278
+ sys.stderr.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
279
+ filelog.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
280
+ sys.exit(0)
281
+
282
+ filelog.flush()
283
+
284
+ end_time = time.time()
285
+ count += 1
286
+ #sys.stdout.write(str(end_time - start_time) + "s for " + str(count) + " seqs \n")
287
+ sys.stdout.write(" (Average time for counting kmers for one virus: " + str(round((end_time - start_time)/count,4)) + "s) \n" )
288
+ sys.stdout.write(" (ETR for counting kmers for viruses: " + str(round((end_time - start_time)/count*(len(virusFaList)-count), 4)) + "s) \n")
289
+ filelog.write(" (Average time for counting kmers for one virus: " + str(round((end_time - start_time)/count,4)) + "s) \n" )
290
+ filelog.write(" (ETR for counting kmers for viruses: " + str(round((end_time - start_time)/count*(len(virusFaList)-count), 4)) + "s) \n")
291
+
292
+ virusListFileWrite.close()
293
+
294
+
295
+ start_time = time.time()
296
+ count = 0
297
+
298
+ for currentFileName in hostFaList :
299
+ if currentFileName.startswith('.') :
300
+ continue
301
+ if os.path.isdir(os.path.join(options.hostFaDir, currentFileName)) :
302
+ sys.stderr.write( "ERROR: zero bytes of file " + currentFileName + "\n")
303
+ filelog.write( "ERROR: zero bytes of file " + currentFileName + "\n")
304
+ sys.exit(0)
305
+ if len(currentFileName) > nameLen :
306
+ currentFileNameS = currentFileName[:nameLen]
307
+ else :
308
+ currentFileNameS = currentFileName
309
+ sys.stdout.write("Step 1: counting kmers for host " + currentFileNameS + "\n")
310
+ filelog.write("Step 1: counting kmers for host " + currentFileNameS + "\n")
311
+ for w in range(1, (kmax+1)) :
312
+ currentFilePath = os.path.join(options.hostFaDir, currentFileName)
313
+ currentKmerCountPath = os.path.join(kmerCountPath, currentFileNameS)
314
+ cmdKmer = countKmerOut + " -l -k " + str(w) + \
315
+ " -i " + currentFilePath +\
316
+ " -o " + currentKmerCountPath +\
317
+ " -s " + currentFileNameS
318
+ #print cmdKmer
319
+ cmdKmerOut = subprocess.Popen(cmdKmer, shell=True, \
320
+ stderr = subprocess.PIPE, \
321
+ stdout = subprocess.PIPE)
322
+ cmdKmerOut.wait()
323
+
324
+ if len(os.listdir(currentKmerCountPath)) == ( kmax + 1 ) :
325
+ hostListFileWrite.write(currentFileNameS + " " + \
326
+ currentKmerCountPath + " " +\
327
+ str(2) + "\n")
328
+ else :
329
+ sys.stderr.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
330
+ filelog.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
331
+ sys.exit(0)
332
+
333
+ filelog.flush()
334
+
335
+ end_time = time.time()
336
+ count += 1
337
+ #sys.stdout.write(str(end_time - start_time) + "s for " + str(count) + " seqs \n")
338
+ sys.stdout.write(" (Average time for counting kmers for one host: " + str(round((end_time - start_time)/count, 4)) + "s) \n")
339
+ sys.stdout.write(" (ETR for counting kmers for hosts: " + str(round((end_time - start_time)/count*(len(hostFaList)-count), 4)) + "s) \n")
340
+ filelog.write(" (Average time for counting kmers for one host: " + str(round((end_time - start_time)/count, 4)) + "s) \n")
341
+ filelog.write(" (ETR for counting kmers for hosts: " + str(round((end_time - start_time)/count*(len(hostFaList)-count), 4)) + "s) \n")
342
+
343
+ hostListFileWrite.close()
344
+
345
+
346
+
347
+ #time.sleep(6) # delays for 6 seconds
348
+
349
+ ################### 2: compute measures #####################
350
+ start_time = time.time()
351
+ count = 0
352
+
353
+ sys.stdout.write("Step 2: compute distance/dissimilarity measures \n")
354
+ filelog.write("Step 2: compute distance/dissimilarity measures \n")
355
+ cmdCptMeasure = computeMeasureOut + " -k " + str(kmax) + \
356
+ " -i " + virusListFile + " -j " + hostListFile + \
357
+ " -o " + options.outDir + " -t " + hostTaxaFile
358
+ print(cmdCptMeasure)
359
+
360
+ #with open(os.path.join(tmpDir, 'computeMeasureOut.log'), 'w') as filelog:
361
+ cmdCptMeasureOut = subprocess.Popen(cmdCptMeasure, shell=True, \
362
+ stderr = subprocess.PIPE, \
363
+ stdout = subprocess.PIPE)
364
+ #for c in iter(lambda: cmdCptMeasureOut.stderr.read(1), ''):
365
+ for c in iter(cmdCptMeasureOut.stderr.readline, b''):
366
+ sys.stdout.write(c.decode("utf-8"))
367
+ filelog.write(c.decode("utf-8"))
368
+ filelog.flush()
369
+
370
+ end_time = time.time()
371
+ count += 1
372
+ sys.stdout.write(" (Average time for computing dissimilarities for one virus-host pair: " + str(round((end_time - start_time)/count/len(virusFaList), 4)) + "s) \n")
373
+ sys.stdout.write(" (ETR for computing dissimilarities for virus-host pairs: " + str(round((end_time - start_time)/count*(len(hostFaList)-count), 4)) + "s) \n")
374
+ filelog.write(" (Average time for computing dissimilarities for one virus-host pair: " + str(round((end_time - start_time)/count/len(virusFaList), 4)) + "s) \n")
375
+ filelog.write(" (ETR for computing dissimilarities for virus-host pairs: " + str(round((end_time - start_time)/count*(len(hostFaList)-count), 4)) + "s) \n")
376
+
377
+ cmdCptMeasureOut.wait()
378
+
379
+ filelog.close()
380
+
381
+ sys.stdout.write("done \n")