kssdtree 1.1.1__tar.gz → 1.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. {kssdtree-1.1.1/kssdtree.egg-info → kssdtree-1.1.2}/PKG-INFO +1 -1
  2. {kssdtree-1.1.1 → kssdtree-1.1.2/kssdtree.egg-info}/PKG-INFO +1 -1
  3. kssdtree-1.1.2/kssdtree.py +402 -0
  4. {kssdtree-1.1.1 → kssdtree-1.1.2}/setup.py +1 -1
  5. {kssdtree-1.1.1 → kssdtree-1.1.2}/toolutils.py +12 -6
  6. kssdtree-1.1.1/kssdtree.py +0 -390
  7. {kssdtree-1.1.1 → kssdtree-1.1.2}/MANIFEST.in +0 -0
  8. {kssdtree-1.1.1 → kssdtree-1.1.2}/README.md +0 -0
  9. {kssdtree-1.1.1 → kssdtree-1.1.2}/align.c +0 -0
  10. {kssdtree-1.1.1 → kssdtree-1.1.2}/buildtree.c +0 -0
  11. {kssdtree-1.1.1 → kssdtree-1.1.2}/bytescale.c +0 -0
  12. {kssdtree-1.1.1 → kssdtree-1.1.2}/cluster.c +0 -0
  13. {kssdtree-1.1.1 → kssdtree-1.1.2}/co2mco.c +0 -0
  14. {kssdtree-1.1.1 → kssdtree-1.1.2}/command_composite.c +0 -0
  15. {kssdtree-1.1.1 → kssdtree-1.1.2}/command_dist.c +0 -0
  16. {kssdtree-1.1.1 → kssdtree-1.1.2}/command_dist_wrapper.c +0 -0
  17. {kssdtree-1.1.1 → kssdtree-1.1.2}/command_set.c +0 -0
  18. {kssdtree-1.1.1 → kssdtree-1.1.2}/command_shuffle.c +0 -0
  19. {kssdtree-1.1.1 → kssdtree-1.1.2}/distancemat.c +0 -0
  20. {kssdtree-1.1.1 → kssdtree-1.1.2}/dnj.c +0 -0
  21. {kssdtree-1.1.1 → kssdtree-1.1.2}/dnjheaders/bytescale.h +0 -0
  22. {kssdtree-1.1.1 → kssdtree-1.1.2}/dnjheaders/dnj.h +0 -0
  23. {kssdtree-1.1.1 → kssdtree-1.1.2}/dnjheaders/filebuff.h +0 -0
  24. {kssdtree-1.1.1 → kssdtree-1.1.2}/dnjheaders/hclust.h +0 -0
  25. {kssdtree-1.1.1 → kssdtree-1.1.2}/dnjheaders/matrix.h +0 -0
  26. {kssdtree-1.1.1 → kssdtree-1.1.2}/dnjheaders/mman.h +0 -0
  27. {kssdtree-1.1.1 → kssdtree-1.1.2}/dnjheaders/nj.h +0 -0
  28. {kssdtree-1.1.1 → kssdtree-1.1.2}/dnjheaders/nwck.h +0 -0
  29. {kssdtree-1.1.1 → kssdtree-1.1.2}/dnjheaders/pherror.h +0 -0
  30. {kssdtree-1.1.1 → kssdtree-1.1.2}/dnjheaders/phy.h +0 -0
  31. {kssdtree-1.1.1 → kssdtree-1.1.2}/dnjheaders/qseqs.h +0 -0
  32. {kssdtree-1.1.1 → kssdtree-1.1.2}/dnjheaders/str.h +0 -0
  33. {kssdtree-1.1.1 → kssdtree-1.1.2}/dnjheaders/threader.h +0 -0
  34. {kssdtree-1.1.1 → kssdtree-1.1.2}/dnjheaders/tmp.h +0 -0
  35. {kssdtree-1.1.1 → kssdtree-1.1.2}/dnjheaders/vector.h +0 -0
  36. {kssdtree-1.1.1 → kssdtree-1.1.2}/filebuff.c +0 -0
  37. {kssdtree-1.1.1 → kssdtree-1.1.2}/global_basic.c +0 -0
  38. {kssdtree-1.1.1 → kssdtree-1.1.2}/hclust.c +0 -0
  39. {kssdtree-1.1.1 → kssdtree-1.1.2}/iseq2comem.c +0 -0
  40. {kssdtree-1.1.1 → kssdtree-1.1.2}/kssdheaders/co2mco.h +0 -0
  41. {kssdtree-1.1.1 → kssdtree-1.1.2}/kssdheaders/command_composite.h +0 -0
  42. {kssdtree-1.1.1 → kssdtree-1.1.2}/kssdheaders/command_dist.h +0 -0
  43. {kssdtree-1.1.1 → kssdtree-1.1.2}/kssdheaders/command_dist_wrapper.h +0 -0
  44. {kssdtree-1.1.1 → kssdtree-1.1.2}/kssdheaders/command_set.h +0 -0
  45. {kssdtree-1.1.1 → kssdtree-1.1.2}/kssdheaders/command_shuffle.h +0 -0
  46. {kssdtree-1.1.1 → kssdtree-1.1.2}/kssdheaders/global_basic.h +0 -0
  47. {kssdtree-1.1.1 → kssdtree-1.1.2}/kssdheaders/iseq2comem.h +0 -0
  48. {kssdtree-1.1.1 → kssdtree-1.1.2}/kssdheaders/mman.h +0 -0
  49. {kssdtree-1.1.1 → kssdtree-1.1.2}/kssdheaders/mytime.h +0 -0
  50. {kssdtree-1.1.1 → kssdtree-1.1.2}/kssdtree.egg-info/SOURCES.txt +0 -0
  51. {kssdtree-1.1.1 → kssdtree-1.1.2}/kssdtree.egg-info/dependency_links.txt +0 -0
  52. {kssdtree-1.1.1 → kssdtree-1.1.2}/kssdtree.egg-info/not-zip-safe +0 -0
  53. {kssdtree-1.1.1 → kssdtree-1.1.2}/kssdtree.egg-info/requires.txt +0 -0
  54. {kssdtree-1.1.1 → kssdtree-1.1.2}/kssdtree.egg-info/top_level.txt +0 -0
  55. {kssdtree-1.1.1 → kssdtree-1.1.2}/matrix.c +0 -0
  56. {kssdtree-1.1.1 → kssdtree-1.1.2}/mman.c +0 -0
  57. {kssdtree-1.1.1 → kssdtree-1.1.2}/mytime.c +0 -0
  58. {kssdtree-1.1.1 → kssdtree-1.1.2}/nj.c +0 -0
  59. {kssdtree-1.1.1 → kssdtree-1.1.2}/njheaders/align.h +0 -0
  60. {kssdtree-1.1.1 → kssdtree-1.1.2}/njheaders/buildtree.h +0 -0
  61. {kssdtree-1.1.1 → kssdtree-1.1.2}/njheaders/cluster.h +0 -0
  62. {kssdtree-1.1.1 → kssdtree-1.1.2}/njheaders/distancemat.h +0 -0
  63. {kssdtree-1.1.1 → kssdtree-1.1.2}/njheaders/sequence.h +0 -0
  64. {kssdtree-1.1.1 → kssdtree-1.1.2}/njheaders/tree.h +0 -0
  65. {kssdtree-1.1.1 → kssdtree-1.1.2}/njheaders/util.h +0 -0
  66. {kssdtree-1.1.1 → kssdtree-1.1.2}/nwck.c +0 -0
  67. {kssdtree-1.1.1 → kssdtree-1.1.2}/pherror.c +0 -0
  68. {kssdtree-1.1.1 → kssdtree-1.1.2}/phy.c +0 -0
  69. {kssdtree-1.1.1 → kssdtree-1.1.2}/pydnj.c +0 -0
  70. {kssdtree-1.1.1 → kssdtree-1.1.2}/pykssd.c +0 -0
  71. {kssdtree-1.1.1 → kssdtree-1.1.2}/pynj.c +0 -0
  72. {kssdtree-1.1.1 → kssdtree-1.1.2}/qseqs.c +0 -0
  73. {kssdtree-1.1.1 → kssdtree-1.1.2}/sequence.c +0 -0
  74. {kssdtree-1.1.1 → kssdtree-1.1.2}/setup.cfg +0 -0
  75. {kssdtree-1.1.1 → kssdtree-1.1.2}/str.c +0 -0
  76. {kssdtree-1.1.1 → kssdtree-1.1.2}/tmp.c +0 -0
  77. {kssdtree-1.1.1 → kssdtree-1.1.2}/tree.c +0 -0
  78. {kssdtree-1.1.1 → kssdtree-1.1.2}/util.c +0 -0
  79. {kssdtree-1.1.1 → kssdtree-1.1.2}/vector.c +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 1.1
2
2
  Name: kssdtree
3
- Version: 1.1.1
3
+ Version: 1.1.2
4
4
  Summary: Kssdtree is a versatile Python package for phylogenetic analysis. It also provides one-stop tree construction and visualization. It can handle DNA sequences of both fasta or fastq format, whether gzipped or not.
5
5
  Home-page: https://github.com/yhlink/kssdtree
6
6
  Author: Hang Yang
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 1.1
2
2
  Name: kssdtree
3
- Version: 1.1.1
3
+ Version: 1.1.2
4
4
  Summary: Kssdtree is a versatile Python package for phylogenetic analysis. It also provides one-stop tree construction and visualization. It can handle DNA sequences of both fasta or fastq format, whether gzipped or not.
5
5
  Home-page: https://github.com/yhlink/kssdtree
6
6
  Author: Hang Yang
@@ -0,0 +1,402 @@
1
+ import kssd
2
+ import nj
3
+ import dnj
4
+ import toolutils
5
+ import os
6
+ import platform
7
+ import shutil
8
+ import time
9
+ import requests
10
+
11
+
12
def shuffle(k=None, s=None, l=None, o=None):
    """Write a dimension-shuffle file through the ``kssd`` extension.

    The four arguments are forwarded positionally, unchanged, to
    ``kssd.write_dim_shuffle_file``.  ``sketch`` in this module calls it with
    the number after ``K`` in the shuffle name, ``5`` or ``6``, the digit
    after ``L``, and the shuffle name without its extension.

    :return: None
    """
    forwarded = (k, s, l, o)
    kssd.write_dim_shuffle_file(*forwarded)
14
+
15
+
16
def _download_shuffle_file(shuf_file, max_seconds=120):
    """Download *shuf_file* into the working directory; return True on success.

    The payload is written to ``<shuf_file>.part`` and only renamed to its
    final name after a complete, successful transfer, so a failed download
    never leaves a truncated or bogus shuffle file behind.
    """
    import http.client

    print('Downloading...', shuf_file)
    url = 'http://www.metakssdcoabundance.link/kssdtree/shuffle/' + shuf_file
    part_file = shuf_file + '.part'
    # The request is forced down to HTTP/1.0; remember the process-wide
    # values so they can be restored once the download is over.
    saved_vsn = http.client.HTTPConnection._http_vsn
    saved_vsn_str = http.client.HTTPConnection._http_vsn_str
    http.client.HTTPConnection._http_vsn = 10
    http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0'
    succeeded = False
    start_time = time.time()
    try:
        with requests.get(url, stream=True, timeout=max_seconds) as response:
            # Refuse to store an HTTP error page as the shuffle file.
            response.raise_for_status()
            with open(part_file, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1024):
                    if time.time() - start_time > max_seconds:
                        raise requests.exceptions.Timeout(
                            'download exceeded %d seconds' % max_seconds)
                    if chunk:
                        file.write(chunk)
        os.replace(part_file, shuf_file)
        succeeded = True
    except requests.exceptions.Timeout:
        print(
            "Network timeout, please manually download from github (https://github.com/yhlink/kssdtree/tree/master/shuffle_file)")
    except (requests.exceptions.RequestException, OSError) as error:
        print('Download failed:', error)
    finally:
        http.client.HTTPConnection._http_vsn = saved_vsn
        http.client.HTTPConnection._http_vsn_str = saved_vsn_str
        if not succeeded and os.path.exists(part_file):
            os.remove(part_file)
    if succeeded:
        print('Download finished: ', shuf_file)
    return succeeded


def sketch(shuf_file=None, genomes_file=None, output=None, set_opt=None):
    """Sketch genomes with ``kssd`` using the given shuffle file.

    If *shuf_file* does not exist it is obtained first: ``L3K9.shuf`` is
    downloaded, ``L2K8``/``L2K9``/``L3K10``/``L3K11`` are generated locally
    through ``shuffle``; any other missing name is an error.

    :param shuf_file: Path or well-known name of the shuffle file.
    :param genomes_file: Existing genome file, or a directory of genome
        files.  Every entry must be accepted by ``toolutils.allowed_file``.
    :param output: Output location passed to ``kssd.dist_dispatch``.
    :param set_opt: Truthy selects mode argument ``1`` of
        ``kssd.dist_dispatch`` instead of ``0``.  Defaults to False.
    :return: True once sketching has run, False on any validation,
        download or argument error (a message is printed).
    """
    if shuf_file is None or genomes_file is None or output is None:
        print('Args error!!!')
        return False
    if not os.path.exists(genomes_file):
        print('No such file or directory: ', genomes_file)
        return False
    if set_opt is None:
        set_opt = False
    if not toolutils.allowed_file(genomes_file):
        # Not an accepted genome file itself, so it has to be a directory
        # whose entries are all accepted genome files.
        if not os.path.isdir(genomes_file):
            print('Genome format error for file:', genomes_file)
            return False
        for filename in os.listdir(genomes_file):
            if not toolutils.allowed_file(filename):
                print('Genome format error for file:', filename)
                return False
    if not os.path.exists(shuf_file):
        if shuf_file in ['L3K9.shuf']:
            if not _download_shuffle_file(shuf_file):
                return False
        elif shuf_file in ['L2K8.shuf', 'L3K10.shuf', 'L2K9.shuf', 'L3K11.shuf']:
            print('Shuffling...', shuf_file)
            # Names look like "L<l>K<k>.shuf": one digit after L, rest after K.
            file_name = shuf_file.split('.')[0]
            k = int(file_name[3:])
            s = 6 if k in (10, 11) else 5
            level = int(file_name[1])
            shuffle(k, s, level, file_name)
            print('Shuffle finished: ', shuf_file)
        else:
            print('No such file or directory: ', shuf_file)
            return False
    print('Sketching...')
    start = time.time()
    kssd.dist_dispatch(shuf_file, genomes_file, output, 1 if set_opt else 0, 0, 0)
    end = time.time()
    print('Sketch spend time:%.2fs' % (end - start))
    print('Sketch finished!')
    return True
74
+
75
+
76
def dist(ref_sketch=None, qry_sketch=None, output=None, flag=None):
    """Compute a distance matrix between two sketches in phylip format.

    :param ref_sketch: Existing path of the reference sketch.
    :param qry_sketch: Existing path of the query sketch.
    :param output: Output filename; must end in ``.phy`` or ``.phylip``.
        A missing parent directory is created.
    :param flag: Last argument forwarded to ``kssd.dist_dispatch``; callers
        in this module pass 0 for the ``nj`` method and 1 otherwise.
        Defaults to 0.
    :return: True on success, False on any argument or path error
        (a message is printed).
    """
    if ref_sketch is None or qry_sketch is None or output is None:
        print('Args error!!!')
        return False
    for sketch_path in (ref_sketch, qry_sketch):
        if not os.path.exists(sketch_path):
            print('No such file or directory: ', sketch_path)
            return False
    if flag is None:
        flag = 0
    # os.path.split understands the platform's separators, unlike a literal
    # '/' test, and yields an empty directory part for bare filenames.
    output_dir, output_name = os.path.split(output)
    # Validate the name before touching the filesystem so a rejected call
    # does not leave a freshly created directory behind.
    if not output_name.endswith(('.phy', '.phylip')):
        print('Output type error, only supports .phylip (.phy) format:', output_name)
        return False
    print('Disting...')
    start = time.time()
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print("Created directory:", output_dir)
    kssd.dist_dispatch(ref_sketch, output, qry_sketch, 2, 0, flag)
    end = time.time()
    print('Dist spend time:%.2fs' % (end - start))
    print('Dist finished!')
    return True
108
+
109
+
110
def retrieve(ref_sketch=None, qry_sketch=None, output=None, N=None, method=None):
    """Request a tree for *qry_sketch* against the GTDB r214 reference.

    The query sketch is handed to ``toolutils.upload_request``; the two
    values it returns are written to ``output.newick`` and
    ``output_accession_taxonomy.txt`` inside *output*.

    :param ref_sketch: Must be the literal ``'gtdbr214_sketch'``.
    :param qry_sketch: Existing path of the query sketch.
    :param output: Output directory; created when missing.
    :param N: Forwarded unchanged to ``toolutils.upload_request``.
    :param method: ``'nj'`` or ``'dnj'``.  Defaults to ``'nj'``.
    :return: True on success, False on any argument error (a message is
        printed).
    """
    if ref_sketch is None or qry_sketch is None or output is None:
        print('Args error!!!')
        return False
    if method is None:
        method = 'nj'
    if method not in ['nj', 'dnj']:
        print('Only support nj and dnj methods!!!')
        # Explicit False keeps the failure value uniform with the other
        # error paths of this function.
        return False
    if not os.path.exists(qry_sketch):
        print('No such file or directory: ', qry_sketch)
        return False
    if ref_sketch != 'gtdbr214_sketch':
        print("ref_sketch must be set to 'gtdbr214_sketch'")
        return False
    print('Retrieving...')
    start = time.time()
    if not os.path.exists(output):
        os.makedirs(output)
        print("Created directory:", output)
    newick, accession_taxonomy = toolutils.upload_request(qry_sketch=qry_sketch, method=method, N=N)
    with open(os.path.join(output, 'output.newick'), 'w') as f:
        f.write(newick)
    with open(os.path.join(output, 'output_accession_taxonomy.txt'), 'w') as f:
        # One "<key> <value>" line per returned mapping entry.
        for key, value in accession_taxonomy.items():
            f.write("%s %s\n" % (key, value))
    end = time.time()
    print('Retrieve spend time:%.2fs' % (end - start))
    print('Retrieve finished!')
    return True
142
+
143
+
144
def build(phylip=None, output=None, method=None):
    """Build a tree from a phylip distance matrix and write it as newick.

    ``dnj.build`` is used only for ``method='dnj'`` on Linux; every other
    combination goes through ``nj.build``.

    :param phylip: Existing distance-matrix file.
    :param output: Output filename ending in ``.nwk`` or ``.newick``.
        Defaults to ``'kssdtree.newick'``.  A missing parent directory is
        created.
    :param method: ``'nj'`` or ``'dnj'``.  Defaults to ``'nj'``.
    :return: True once the builder has run, False on any argument or path
        error (a message is printed).
    """
    if phylip is None:
        print('Args error!!!')
        return False
    if not os.path.exists(phylip):
        print('No such file or directory: ', phylip)
        return False
    if method is None:
        method = 'nj'
    if method not in ['nj', 'dnj']:
        print('Only support nj and dnj methods!!!')
        return False
    if output is None:
        # The signature advertises output=None, so fall back to a default
        # name instead of failing on the path handling below.
        output = 'kssdtree.newick'
    output_dir, output_name = os.path.split(output)
    # Reject a bad name up front, with a message, before any directory is
    # created or any builder is invoked.
    if not output_name.endswith(('.nwk', '.newick')):
        print('Output type error, only supports .newick (.nwk) format:', output_name)
        return False
    print('Building...')
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print("Created directory:", output_dir)
    start = time.time()
    if method == 'dnj' and platform.system() == 'Linux':
        state = dnj.build(phylip, output, method)
    else:
        state = nj.build(phylip, output)
    # NOTE(review): 1 appears to be the builders' success code - confirm
    # against the nj/dnj extensions.  Only then is the file rewritten.
    if state == 1:
        with open(output, 'r') as f:
            newick = f.read().replace('\n', '')
        # Store the tree as a single line.
        with open(output, 'w') as f:
            f.write(newick)
    end = time.time()
    print('Build spend time:%.2fs' % (end - start))
    print('Build finished!')
    return True
186
+
187
+
188
def visualize(newick=None, taxonomy=None, mode=None):
    """Display a newick tree through ``toolutils.view_tree``.

    :param newick: Existing tree file in newick format.
    :param taxonomy: Passed unchanged to ``toolutils.view_tree``; may be
        None.
    :param mode: Passed as ``mode`` to ``toolutils.view_tree``.  Defaults
        to ``'r'``.
    :return: True after the viewer call returns, False when *newick* is
        missing or does not exist (a message is printed).
    """
    if newick is None:
        print('Args error!!!')
        return False
    if not os.path.exists(newick):
        print('No such file or directory: ', newick)
        return False
    if mode is None:
        mode = 'r'
    toolutils.view_tree(newick, taxonomy, mode=mode)
    # Report success explicitly so callers can tell it apart from the
    # False returned on the failure paths.
    return True
199
+
200
+
201
def union(ref_sketch=None, output=None):
    """Run ``kssd.sketch_union`` on *ref_sketch*, writing to *output*.

    :param ref_sketch: Existing path of the sketch to process.
    :param output: Output location passed to ``kssd.sketch_union``.
    :return: True on success, False when an argument is missing or
        *ref_sketch* does not exist (a message is printed).
    """
    if ref_sketch is None or output is None:
        # Report the problem like the other functions in this module
        # instead of failing silently.
        print('Args error!!!')
        return False
    if not os.path.exists(ref_sketch):
        print('No such file or directory: ', ref_sketch)
        return False
    kssd.sketch_union(ref_sketch, output)
    return True
210
+
211
+
212
def subtract(ref_sketch=None, genomes_sketch=None, output=None, flag=None):
    """Run ``kssd.sketch_operate`` on a reference and a genomes sketch.

    With ``flag == 1`` *ref_sketch* is used as given.  Otherwise the genome
    names of *ref_sketch* are listed through ``kssd.print_gnames``; when
    more than one line is listed, the reference is first passed through
    ``union`` into a temporary sketch which is used instead.

    :param ref_sketch: Existing path of the reference sketch.
    :param genomes_sketch: Existing path of the genomes sketch.
    :param output: Output location passed to ``kssd.sketch_operate``.
    :param flag: ``1`` skips the union step; any other value enables it.
    :return: True on success, False on any argument, path or union error
        (a message is printed).
    """
    if ref_sketch is None or genomes_sketch is None or output is None:
        print('Args error!!!')
        return False
    for sketch_path in (ref_sketch, genomes_sketch):
        if not os.path.exists(sketch_path):
            print('No such file or directory: ', sketch_path)
            return False
    if flag == 1:
        print('Subtracting...')
        start = time.time()
        kssd.sketch_operate(ref_sketch, output, genomes_sketch)
        end = time.time()
        print('Subtract spend time:%.2fs' % (end - start))
        print('Subtract finished!')
        return True

    stamp = int(time.time())
    print('Subtracting...')
    start = time.time()
    # Timestamped scratch name so a user's own "ref.txt" in the working
    # directory is never overwritten or deleted.
    temp_txt = 'ref_gnames_%d.txt' % stamp
    # Stays None unless this call creates a union sketch of its own.
    temp_union_sketch = None
    try:
        kssd.print_gnames(ref_sketch, temp_txt)
        with open(temp_txt, 'r') as file:
            nums = sum(1 for _ in file)
        if nums == 1:
            operand = ref_sketch
        else:
            temp_union_sketch = 'ref_union_sketch_' + str(stamp)
            if not union(ref_sketch=ref_sketch, output=temp_union_sketch):
                print('Union error!!!')
                return False
            operand = temp_union_sketch
        kssd.sketch_operate(operand, output, genomes_sketch)
        end = time.time()
    finally:
        # NOTE(review): cleanup has always been Linux-only here; the reason
        # is not visible in this file - confirm before widening it.
        if platform.system() == 'Linux':
            # Only remove the union sketch this call created.  With a
            # single-genome reference the operand is the caller's own
            # ref_sketch, which must never be deleted.
            if temp_union_sketch is not None:
                temp_dir = os.path.join(os.getcwd(), temp_union_sketch)
                if os.path.exists(temp_dir):
                    shutil.rmtree(temp_dir)
            if os.path.exists(temp_txt):
                os.remove(temp_txt)
    print('Subtract spend time:%.2fs' % (end - start))
    print('Subtract finished!')
    return True
263
+
264
+
265
def _remove_temp_paths(dir_names, file_names):
    """Delete the given working-directory folders and files if present."""
    current_directory = os.getcwd()
    for dir_name in dir_names:
        temp_dir = os.path.join(current_directory, dir_name)
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
    for file_name in file_names:
        if os.path.exists(file_name):
            os.remove(file_name)


def _quick_routine(shuf_file, genomes_file, output, taxonomy, method, mode):
    """Routine workflow: sketch -> dist -> build (-> visualize off Linux)."""
    stamp = int(time.time())
    temp_sketch = genomes_file + '_sketch_' + str(stamp)
    temp_phy = 'temp.phy'
    print('Step1...')
    if not sketch(shuf_file=shuf_file, genomes_file=genomes_file, output=temp_sketch, set_opt=False):
        return False
    print('Step2...')
    dist_flag = 0 if method == 'nj' else 1
    if not dist(ref_sketch=temp_sketch, qry_sketch=temp_sketch, output=temp_phy, flag=dist_flag):
        return False
    print('Step3...')
    if not build(phylip=temp_phy, output=output, method=method):
        return False
    if platform.system() == 'Linux':
        # On Linux the intermediates are removed and no viewer is opened.
        _remove_temp_paths([temp_sketch, 'distout'], [temp_phy])
        return True
    print('Step4...')
    print('Tree visualization finished!')
    return visualize(newick=output, taxonomy=taxonomy, mode=mode) is not False


def _quick_placement(shuf_file, genomes_file, output, reference, method, N):
    """Placement workflow against 'gtdbr214_sketch': sketch -> retrieve."""
    if not toolutils.is_positive_integer(N):
        print("N must >0 !!!")
        return False
    if shuf_file != 'L3K9.shuf':
        print("shuffle file must be set to 'L3K9.shuf'")
        return False
    stamp = int(time.time())
    qry_sketch = genomes_file + '_sketch_' + str(stamp)
    if not sketch(shuf_file=shuf_file, genomes_file=genomes_file, output=qry_sketch, set_opt=True):
        return False
    if not retrieve(ref_sketch=reference, qry_sketch=qry_sketch, output=output, N=N, method=method):
        return False
    if platform.system() == 'Linux':
        return True
    print('Tree visualization finished!')
    shown = visualize(newick=os.path.join(output, 'output.newick'),
                      taxonomy=os.path.join(output, 'output_accession_taxonomy.txt'), mode=None)
    return shown is not False


def _quick_subtraction(shuf_file, genomes_file, output, reference, taxonomy, method, mode):
    """Reference-subtraction workflow: sketch both, union, subtract, dist, build."""
    if not os.path.exists(reference):
        # Checked first so the file count below never runs on a missing path.
        print('No such file or directory: ', reference)
        return False
    stamp = int(time.time())
    temp_reference_sketch = 'ref_sketch_' + str(stamp)
    temp_genomes_sketch = genomes_file + '_sketch_' + str(stamp)
    # A separate union sketch is used only when the reference is not a
    # single accepted file and its file count is not exactly one.
    temp_union_sketch = temp_reference_sketch
    if not toolutils.allowed_file(reference):
        ref_path = os.path.join(os.getcwd(), reference)
        if toolutils.get_file_num(ref_path) != 1:
            temp_union_sketch = 'ref_union_sketch_' + str(stamp)
    temp_subtract_sketch = genomes_file + '_subtract_sketch_' + str(stamp)
    temp_phy = 'temp.phy'
    print('Step1...')
    if not sketch(shuf_file=shuf_file, genomes_file=reference, output=temp_reference_sketch, set_opt=True):
        return False
    if not sketch(shuf_file=shuf_file, genomes_file=genomes_file, output=temp_genomes_sketch, set_opt=True):
        return False
    print('Step2...')
    if not union(ref_sketch=temp_reference_sketch, output=temp_union_sketch):
        return False
    if not subtract(ref_sketch=temp_union_sketch, genomes_sketch=temp_genomes_sketch,
                    output=temp_subtract_sketch, flag=1):
        return False
    print('Step3...')
    dist_flag = 0 if method == 'nj' else 1
    if not dist(ref_sketch=temp_subtract_sketch, qry_sketch=temp_subtract_sketch, output=temp_phy,
                flag=dist_flag):
        return False
    print('Step4...')
    if not build(phylip=temp_phy, output=output, method=method):
        return False
    if platform.system() == 'Linux':
        _remove_temp_paths(
            [temp_reference_sketch, temp_genomes_sketch, temp_union_sketch,
             temp_subtract_sketch, 'distout'],
            [temp_phy])
        return True
    print('Step5...')
    print('Tree visualization finished!')
    return visualize(newick=output, taxonomy=taxonomy, mode=mode) is not False


def quick(shuf_file=None, genomes_file=None, output=None, reference=None, taxonomy=None, method='nj', mode='r', N=0):
    """Run one of three end-to-end workflows, chosen by *reference*.

    * ``reference is None``: routine workflow (sketch, dist, build).
    * ``reference == 'gtdbr214_sketch'`` with no *taxonomy*: placement
      workflow (sketch, retrieve); requires ``shuf_file == 'L3K9.shuf'``
      and a positive *N*.
    * anything else: reference-subtraction workflow (sketch reference and
      genomes, union, subtract, dist, build).

    On Linux the routine and subtraction workflows delete their
    intermediates and no workflow opens the viewer; elsewhere the
    resulting tree is passed to ``visualize``.

    :param shuf_file: Shuffle file handed to ``sketch``.
    :param genomes_file: Genome file or directory handed to ``sketch``.
    :param output: Tree filename, or the output directory for placement.
    :param reference: None, ``'gtdbr214_sketch'``, or a reference genome
        file/directory.
    :param taxonomy: Passed to ``visualize``; may be None.
    :param method: ``'nj'`` or ``'dnj'``.
    :param mode: Passed to ``visualize``.
    :param N: Passed to ``retrieve`` in the placement workflow.
    :return: True on success, False on any failure (a message is printed).
    """
    if shuf_file is None or genomes_file is None or output is None:
        print('Args error!!!')
        return False
    if reference is None:
        # Decided on `reference` alone: a taxonomy file without a reference
        # still means the routine workflow, with taxonomy used for display.
        return _quick_routine(shuf_file, genomes_file, output, taxonomy, method, mode)
    if reference == 'gtdbr214_sketch' and taxonomy is None:
        return _quick_placement(shuf_file, genomes_file, output, reference, method, N)
    if method not in ['nj', 'dnj']:
        print('Args error!!!')
        return False
    return _quick_subtraction(shuf_file, genomes_file, output, reference, taxonomy, method, mode)
@@ -93,7 +93,7 @@ require_pakages = [
93
93
 
94
94
  setup(
95
95
  name='kssdtree',
96
- version='1.1.1',
96
+ version='1.1.2',
97
97
  author='Hang Yang',
98
98
  author_email='yhlink1207@gmail.com',
99
99
  description="Kssdtree is a versatile Python package for phylogenetic analysis. It also provides one-stop tree construction and visualization. It can handle DNA sequences of both fasta or fastq format, whether gzipped or not. ",
@@ -47,13 +47,20 @@ def get_file_num(cwd):
47
47
  return len(res)
48
48
 
49
49
 
50
- def upload_request(dir_name, method, N):
51
- zip_file = dir_name + '.zip'
50
def decompress_zip(zip_path, dir_path):
    """Extract every member of the zip archive *zip_path* into *dir_path*.

    :param zip_path: Path of the archive to read.
    :param dir_path: Destination directory for the extracted members.
    :return: None
    """
    # The context manager closes the archive even when extraction raises.
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(dir_path)
55
+
56
+
57
+ def upload_request(qry_sketch, method, N):
58
+ zip_file = qry_sketch + '.zip'
52
59
  zip = zipfile.ZipFile(zip_file, 'w', zipfile.ZIP_DEFLATED)
53
- for item in os.listdir(dir_name):
54
- zip.write(dir_name + os.sep + item)
60
+ for item in os.listdir(qry_sketch):
61
+ zip.write(qry_sketch + os.sep + item)
55
62
  zip.close()
56
- url = "http://18.205.53.149:8000/kssdtree/upload"
63
+ url = "http://www.metakssdcoabundance.link/kssdtree/upload"
57
64
  header = {
58
65
  "kssdtree": 'upload'
59
66
  }
@@ -219,7 +226,6 @@ def view_tree(newick, taxonomy, mode):
219
226
  # t.render("bubble_map.png", w=600, dpi=300, tree_style=ts)
220
227
  t.show(tree_style=ts)
221
228
 
222
-
223
229
  def deal_gtdb_txt(temp_dist_output):
224
230
  data = pd.read_csv(temp_dist_output, delimiter='\t', header=None, skiprows=1)
225
231
  column_2 = data.iloc[:, 1]
@@ -1,390 +0,0 @@
1
- import kssd
2
- import nj
3
- import dnj
4
- import toolutils
5
- import os
6
- import platform
7
- import shutil
8
- import time
9
- import requests
10
-
11
-
12
- def shuffle(k=8, s=5, l=2, o='default'):
13
- print('shuffling...')
14
- kssd.write_dim_shuffle_file(k, s, l, o)
15
- print('shuffle finished!')
16
-
17
-
18
- def sketch(shuffle=None, genomes=None, output=None, set_opt=None):
19
- """
20
- sketch: sketching genomes into sketch and generating sketch files.
21
- :param shuffle: Kssdtree provide 'L3K9.shuf' and 'L3K10.shuf' files as input for genome sketching or decomposition. The default is 'L3K10.shuf'.
22
- :param genomes: The folder path for genome files. It supports the input of genome files in fasta/fastq formats.
23
- :param output: The output folder path for sketch result files of genome files.
24
- :param set_opt: Whether to do the set operation, default is False, if you want to do the set operation, you can set set_opt=True.
25
- :return: null
26
- """
27
- if set_opt is None:
28
- set_opt = False
29
- if shuffle is not None and genomes is not None and output is not None:
30
- current_directory = os.getcwd()
31
- shuf_file_path = os.path.join(current_directory, shuffle)
32
- if not os.path.exists(shuf_file_path):
33
- if shuffle == 'L3K9.shuf':
34
- print('downloading...', shuffle)
35
- import http.client
36
- http.client.HTTPConnection._http_vsn = 10
37
- http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0'
38
- url = 'http://18.205.53.149:8000/kssdtree/shuffle/' + shuffle
39
- headers = {
40
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
41
- }
42
- r = requests.get(url, headers=headers, stream=True)
43
- with open(os.getcwd() + "\\" + shuffle, mode="wb") as f:
44
- f.write(r.content)
45
- print('download finished!', shuffle)
46
- else:
47
- file_name = shuffle.split('.')[0]
48
- k = int(file_name[3:])
49
- if k == 10:
50
- s = 6
51
- else:
52
- s = 5
53
- l = int(file_name[1])
54
- shuffle(k=k, s=s, l=l, o=file_name)
55
- print('sketching...')
56
- start = time.time()
57
- if set_opt:
58
- kssd.dist_dispatch(shuffle, genomes, output, 1, 0, 0)
59
- else:
60
- kssd.dist_dispatch(shuffle, genomes, output, 0, 0, 0)
61
- end = time.time()
62
- print('sketch spend time:%.2fs' % (end - start))
63
- print('sketch finished!')
64
- else:
65
- print('args error!!!')
66
-
67
-
68
- def dist(ref_sketch=None, qry_sketch=None, output=None, flag=None):
69
- """
70
- computing pairwise distances between reference and query genomes, and then generating a distance matrix in phylip format.
71
- :param ref_sketch: The folder path for sketch result files of reference genome files.
72
- :param qry_sketch: The folder path for sketch result files of query genome files.
73
- :param output: The output filename of distance matrix in phylip format.
74
- :param flag: 0 or 1. 0,1 is used to generate the distance matrix required by NJ (0 for diagonal elements) and DNJ (no diagonal elements) respectively.
75
- :return: null
76
- """
77
- if flag is None:
78
- flag = 0
79
- if ref_sketch is not None and qry_sketch is not None and output is not None:
80
- print('disting...')
81
- start = time.time()
82
- kssd.dist_dispatch(ref_sketch, output, qry_sketch, 2, 0, flag)
83
- end = time.time()
84
- print('dist spend time:%.2fs' % (end - start))
85
- print('dist finished!')
86
- else:
87
- print('args error!!!')
88
-
89
-
90
- def retrieve(ref_sketch=None, qry_sketch=None, output=None, N=None):
91
- """
92
- retrieving N closest sketches from reference or GTDB (R214) sketches and combining query sketch files.
93
- :param ref_sketch: The folder path for sketch result files of reference genome files.
94
- :param qry_sketch: The folder path for sketch result files of query genome files.
95
- :param output: The output folder path for retrieve sketch result files of genome files.
96
- :param N: Max number of nearest reference genomes.
97
- :return: null
98
- """
99
- if ref_sketch is not None and qry_sketch is not None and output is not None:
100
- if ref_sketch == 'gtdbr214_sketch':
101
- print('retrieving...')
102
- start = time.time()
103
- temp_related_sketch = 'related_sketch'
104
- reference = 'static/gtdbr214_sketch'
105
- kssd.dist_dispatch(reference, output, qry_sketch, 2, N, 3)
106
- kssd.print_gnames(reference, 'gtdb.txt')
107
- file_path1 = os.path.join(os.getcwd(), 'distout', 'distance.out')
108
- toolutils.deal_gtdb_txt(file_path1)
109
- kssd.grouping_genomes('new_gtdb.txt', reference, temp_related_sketch)
110
- kssd.dist_dispatch(output, qry_sketch, temp_related_sketch, 3, 0, 0)
111
- end = time.time()
112
- print('retrieve spend time:%.2fs' % (end - start))
113
- print('retrieve finished!')
114
- else:
115
- print('retrieving...')
116
- start = time.time()
117
- timeStamp = int(time.mktime(time.localtime(time.time())))
118
- temp_related_sketch = 'related_sketch_' + str(timeStamp)
119
- kssd.dist_dispatch(ref_sketch, output, qry_sketch, 2, N, 3)
120
- kssd.print_gnames(ref_sketch, 'gtdb.txt')
121
- file_path1 = os.path.join(os.getcwd(), 'distout', 'distance.out')
122
- toolutils.deal_gtdb_txt(file_path1)
123
- kssd.grouping_genomes('new_gtdb.txt', ref_sketch, temp_related_sketch)
124
- kssd.dist_dispatch(output, qry_sketch, temp_related_sketch, 3, 0, 0)
125
- end = time.time()
126
- file_path1 = 'new.txt'
127
- file_path2 = 'gtdb.txt'
128
- file_path3 = 'new_gtdb.txt'
129
- file_path4 = 'related_genomes_values.txt'
130
- file_path5 = 'modified_file.txt'
131
- file_path6 = 'new_accession_taxonomy.txt'
132
- if os.path.exists(file_path1):
133
- os.remove(file_path1)
134
- if os.path.exists(file_path2):
135
- os.remove(file_path2)
136
- if os.path.exists(file_path3):
137
- os.remove(file_path3)
138
- if os.path.exists(file_path4):
139
- os.remove(file_path4)
140
- if os.path.exists(file_path5):
141
- os.remove(file_path5)
142
- if os.path.exists(file_path6):
143
- os.remove(file_path6)
144
- print('retrieve spend time:%.2fs' % (end - start))
145
- print('retrieve finished!')
146
- else:
147
- print('args error!!!')
148
-
149
-
150
- def build(phylip=None, output=None, method=None):
151
- """
152
- constructing tree with NJ or DNJ and generating tree in newick format.
153
- :param phylip: The distance matrix in phylip format.
154
- :param output: 'nj'(NJ) or 'dnj'(DNJ) method for constructing tree. The default is 'nj'.
155
- :param method: The output filename of tree in newick format.
156
- :return: null
157
- """
158
- if method is None:
159
- method = 'nj'
160
- if method not in ['nj', 'dnj']:
161
- print('method only support nj and dnj!!!')
162
- return
163
- if phylip is not None:
164
- print('building...')
165
- start = time.time()
166
- if output is None:
167
- output = 'kssdtree.newick'
168
- if method == 'nj':
169
- state = nj.build(phylip, output)
170
- else:
171
- if platform.system() == 'Linux':
172
- state = dnj.build(phylip, output, method)
173
- else:
174
- state = nj.build(phylip, output)
175
- if state == 1:
176
- nwk_path = os.path.join(os.getcwd(), output)
177
- with open(nwk_path, 'r') as f:
178
- lines = f.readlines()
179
- newick = ''.join(lines)
180
- newick = newick.replace('\n', '')
181
- with open(nwk_path, 'w') as f:
182
- f.write(newick)
183
- end = time.time()
184
- print('build spend time:%.2fs' % (end - start))
185
- print('build finished!')
186
- else:
187
- print('args error!!!')
188
-
189
-
190
- def visualize(newick=None, taxonomy=None, mode=None):
191
- """
192
- visualizing tree with ETE3 toolkit.
193
- :param newick: The tree in newick format.
194
- :param taxonomy: The taxonomy information in txt format, which records the name (accession) of genome and its taxonomy. The default is None.
195
- :param mode: 'r'(rectangle) or 'c'(circle) mode for visualizing tree. The default is 'r'.
196
- :return: null
197
- """
198
- if mode is None:
199
- mode = 'r'
200
- if newick is not None:
201
- toolutils.view_tree(newick, taxonomy, mode=mode)
202
- else:
203
- print('args error!!!')
204
-
205
-
206
- def union(ref_sketch=None, output=None):
207
- """
208
- :param sketch:
209
- :param output:
210
- :return:
211
- """
212
- if ref_sketch is not None and output is not None:
213
- kssd.sketch_union(sketch, output)
214
- else:
215
- print('args error!!!')
216
-
217
-
218
def subtract(ref_sketch=None, genomes_sketch=None, output=None, flag=0):
    """
    Subtract the ref_sketch from genomes_sketch and create the remainder sketch files.

    :param ref_sketch: The folder path for reference sketch result files.
    :param genomes_sketch: The folder path for sketch result files of genome files.
    :param output: The output folder path for remainder sketch result files.
    :param flag: When 1, ref_sketch is passed to kssd.sketch_operate as given. For any
                 other value (default 0) a temporary union sketch is first built with
                 union() and used as the reference instead.
    :return: null
    """
    if ref_sketch is None or genomes_sketch is None or output is None:
        print('args error!!!')
        return

    if flag == 1:
        print('subtracting...')
        began = time.time()
        kssd.sketch_operate(ref_sketch, output, genomes_sketch)
        elapsed = time.time() - began
    else:
        # Name the temporary union folder after the current time in whole seconds.
        stamp = int(time.mktime(time.localtime(time.time())))
        union_name = 'ref_union_sketch_' + str(stamp)
        print('subtracting...')
        began = time.time()
        union(ref_sketch=ref_sketch, output=union_name)
        kssd.sketch_operate(union_name, output, genomes_sketch)
        elapsed = time.time() - began
        # The temporary union folder is only removed on Linux.
        union_path = os.path.join(os.getcwd(), union_name)
        if platform.system() == 'Linux' and os.path.exists(union_path):
            shutil.rmtree(union_path)

    print('subtract spend time:%.2fs' % elapsed)
    print('subtract finished!')
254
-
255
-
256
def _quick_stamp():
    """Return the current local time in whole seconds, used to name temporary folders."""
    return int(time.mktime(time.localtime(time.time())))


def _quick_genomes_ok(genomes):
    """Return False (after reporting the file) at the first entry rejected by toolutils.allowed_file."""
    for entry in os.listdir(genomes):
        if not toolutils.allowed_file(entry):
            print('Genome format error for file:', entry)
            return False
    return True


def _quick_cleanup(names):
    """On Linux, delete each named folder under the current directory if it exists."""
    base = os.getcwd()
    targets = [os.path.join(base, name) for name in names]
    if platform.system() == 'Linux':
        for target in targets:
            # A folder listed twice is simply skipped the second time.
            if os.path.exists(target):
                shutil.rmtree(target)


def quick(shuffle=None, genomes=None, output=None, reference=None, taxonomy=None, method='nj', mode='r', N=0):
    """
    Run the whole pipeline in one call, without the caller organizing intermediate files.

    Three workflows are selected from the arguments:

    * reference is None and taxonomy is None: routine workflow
      (sketch, dist, build).
    * reference == 'gtdbr214' and taxonomy is None: phylogenetic placement
      (sketch, then toolutils.upload_request; writes output and 'accession_taxonomy.txt').
    * anything else: reference subtraction workflow
      (sketch reference and genomes, union, subtract, dist, build).

    On Linux the visualization step is skipped and, in the routine and subtraction
    workflows, the temporary folders are removed. On other platforms visualize() is
    called and the temporary folders are left in place.

    :param shuffle: Kssdtree provide frequently-used 'L3K9.shuf' and 'L3K10.shuf' files as input for genome sketching or decomposition. The default is 'L3K10.shuf'. If you want to perform phylogenetic placement, you must use 'L3K9.shuf' file.
    :param genomes: The folder path for genome files. It supports the input of genome files in fasta/fastq formats.
    :param output: The output filename of tree in newick format.
    :param reference: The default is None, will perform the routine workflow. If you want to perform the reference subtraction workflow, you can set reference to the reference genome file or folder path. If you want to perform the phylogenetic placement, you must set reference to 'gtdbr214'.
    :param taxonomy: The filename of taxonomy information in txt format, which records the name (accession) of genome and its taxonomy. The default is None.
    :param method: 'nj'(NJ) or 'dnj'(DNJ) method for constructing tree. The default is 'nj'.
    :param mode: 'r'(rectangle) or 'c'(circle) mode for visualizing tree. The default is 'r'.
    :param N: Max number of nearest reference genomes. The default is 0 for computing pairwise distances between genomes on routine and reference subtraction workflows. If you want to perform the phylogenetic placement, you can set N > 0.
    :return: 0 when a genome file is rejected by toolutils.allowed_file, or when the
             placement workflow is requested with a shuffle other than 'L3K9.shuf';
             null otherwise.
    """
    has_core_args = shuffle is not None and genomes is not None and output is not None

    # ---- routine workflow -------------------------------------------------
    if reference is None and taxonomy is None:
        if not has_core_args:
            print('args error!!!')
            return
        if not _quick_genomes_ok(genomes):
            return 0
        sketch_dir = genomes + '_sketch_' + str(_quick_stamp())
        phylip = 'temp.phy'
        print('step1...')
        sketch(shuffle=shuffle, genomes=genomes, output=sketch_dir, set_opt=False)
        print('step2...')
        # dist() takes flag=0 for 'nj' and flag=1 for every other method value.
        dist(ref_sketch=sketch_dir, qry_sketch=sketch_dir, output=phylip,
             flag=0 if method == 'nj' else 1)
        print('step3...')
        build(phylip=phylip, output=output, method=method)
        if platform.system() != 'Linux':
            print('step4...')
            print('tree visualization finished!')
            visualize(newick=output, taxonomy=taxonomy, mode=mode)
        _quick_cleanup([sketch_dir, 'distout'])
        return

    # ---- phylogenetic placement -------------------------------------------
    if reference == 'gtdbr214' and taxonomy is None:
        if not (has_core_args and toolutils.is_positive_integer(N)):
            print('args error or N<=0!!!')
            return
        if shuffle != 'L3K9.shuf':
            print("shuffle must be set to 'L3K9.shuf'")
            return 0
        if not _quick_genomes_ok(genomes):
            return 0
        sketch_dir = genomes + '_sketch_' + str(_quick_stamp())
        sketch(shuffle=shuffle, genomes=genomes, output=sketch_dir, set_opt=True)
        newick, accession_taxonomy = toolutils.upload_request(dir_name=sketch_dir, method=method, N=N)
        with open(output, 'w') as tree_file:
            tree_file.write(newick)
        with open('accession_taxonomy.txt', 'w') as taxonomy_file:
            taxonomy_file.writelines("%s %s\n" % pair for pair in accession_taxonomy.items())
        if platform.system() != 'Linux':
            print('tree visualization finished!')
            # NOTE(review): mode=None is passed here rather than the caller's `mode`;
            # kept as in the original - confirm whether that is intended.
            visualize(newick=output, taxonomy='accession_taxonomy.txt', mode=None)
        return

    # ---- reference subtraction workflow -----------------------------------
    # NOTE(review): this branch is also reached when reference is None but taxonomy
    # is given; `reference` is then passed to toolutils.allowed_file as None - verify.
    # The original repeated this exact check in two nested `if`s; a single check
    # guarantees the error message is printed whenever the arguments are invalid.
    if not (has_core_args and method in ['nj', 'dnj']):
        print('args error!!!')
        return

    stamp = str(_quick_stamp())
    ref_sketch_dir = 'ref_sketch_' + stamp
    genomes_sketch_dir = genomes + '_sketch_' + stamp
    # The union reuses the reference sketch folder unless `reference` is not an
    # allowed file name and toolutils.get_file_num reports a count other than 1.
    union_dir = ref_sketch_dir
    if not toolutils.allowed_file(reference):
        ref_path = os.path.join(os.getcwd(), reference)
        if toolutils.get_file_num(ref_path) != 1:
            union_dir = 'ref_union_sketch_' + stamp
    subtract_dir = genomes + '_subtract_sketch_' + stamp
    phylip = 'temp.phy'
    print('step1...')
    sketch(shuffle=shuffle, genomes=reference, output=ref_sketch_dir, set_opt=True)
    sketch(shuffle=shuffle, genomes=genomes, output=genomes_sketch_dir, set_opt=True)
    print('step2...')
    union(ref_sketch=ref_sketch_dir, output=union_dir)
    subtract(ref_sketch=union_dir, genomes_sketch=genomes_sketch_dir,
             output=subtract_dir, flag=1)
    print('step3...')
    dist(ref_sketch=subtract_dir, qry_sketch=subtract_dir, output=phylip,
         flag=0 if method == 'nj' else 1)
    print('step4...')
    build(phylip=phylip, output=output, method=method)
    if platform.system() != 'Linux':
        print('step5...')
        print('tree visualization finished!')
        visualize(newick=output, taxonomy=taxonomy, mode=mode)
    _quick_cleanup([ref_sketch_dir, genomes_sketch_dir, union_dir, subtract_dir, 'distout'])
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes