zipstrain 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ Metadata-Version: 2.3
2
+ Name: zipstrain
3
+ Version: 0.1.3
4
+ Summary:
5
+ Author: ParsaGhadermazi
6
+ Author-email: 54489047+ParsaGhadermazi@users.noreply.github.com
7
+ Requires-Python: >=3.12
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Classifier: Programming Language :: Python :: 3.13
11
+ Requires-Dist: aiofiles (>=25.1.0,<26.0.0)
12
+ Requires-Dist: click (>=8.3.0,<9.0.0)
13
+ Requires-Dist: intervaltree (>=3.1.0,<4.0.0)
14
+ Requires-Dist: matplotlib (>=3.10.7,<4.0.0)
15
+ Requires-Dist: numpy (>=2.3.4,<3.0.0)
16
+ Requires-Dist: pandas (>=2.3.3,<3.0.0)
17
+ Requires-Dist: plotly (>=6.3.1,<7.0.0)
18
+ Requires-Dist: polars (>=1.34.0,<2.0.0)
19
+ Requires-Dist: psutil (>=7.1.2,<8.0.0)
20
+ Requires-Dist: pyarrow (>=22.0.0,<23.0.0)
21
+ Requires-Dist: pydantic (>=2.12.3,<3.0.0)
22
+ Requires-Dist: rich (>=14.2.0,<15.0.0)
23
+ Requires-Dist: rich-click (>=1.9.4,<2.0.0)
24
+ Requires-Dist: scipy (>=1.16.2,<2.0.0)
25
+ Requires-Dist: seaborn (>=0.13.2,<0.14.0)
26
+ Description-Content-Type: text/markdown
27
+
28
+
File without changes
@@ -0,0 +1,37 @@
1
+ [project]
2
+ name = "zipstrain"
3
+ version = "0.1.3"
4
+ description = ""
5
+ authors = [
6
+ {name = "ParsaGhadermazi",email = "54489047+ParsaGhadermazi@users.noreply.github.com"}
7
+ ]
8
+ readme = "README.md"
9
+ requires-python = ">=3.12"
10
+ dependencies = [
11
+ "polars (>=1.34.0,<2.0.0)",
12
+ "plotly (>=6.3.1,<7.0.0)",
13
+ "seaborn (>=0.13.2,<0.14.0)",
14
+ "numpy (>=2.3.4,<3.0.0)",
15
+ "matplotlib (>=3.10.7,<4.0.0)",
16
+ "pandas (>=2.3.3,<3.0.0)",
17
+ "pyarrow (>=22.0.0,<23.0.0)",
18
+ "intervaltree (>=3.1.0,<4.0.0)",
19
+ "scipy (>=1.16.2,<2.0.0)",
20
+ "aiofiles (>=25.1.0,<26.0.0)",
21
+ "pydantic (>=2.12.3,<3.0.0)",
22
+ "rich (>=14.2.0,<15.0.0)",
23
+ "click (>=8.3.0,<9.0.0)",
24
+ "psutil (>=7.1.2,<8.0.0)",
25
+ "rich-click (>=1.9.4,<2.0.0)"
26
+ ]
27
+
28
+ [tool.poetry]
29
+ packages = [{include = "zipstrain", from = "src"}]
30
+
31
+
32
+ [build-system]
33
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
34
+ build-backend = "poetry.core.masonry.api"
35
+
36
+ [tool.poetry.scripts]
37
+ zipstrain = "zipstrain.cli:cli"
Binary file
@@ -0,0 +1,7 @@
1
+ """
2
+ ZipStrain
3
+ ========================
4
+ ZipStrain is a bioinformatics tool for strain-level analysis of metagenomics data. It is designed with scalability and efficiency in mind, leveraging advanced data processing techniques to handle large datasets.
5
+
6
+ """
7
+ __version__ = "0.1.0"
@@ -0,0 +1,430 @@
1
+ """
2
+ zipstrain.cli
3
+ ========================
4
+ This module contains the command-line interface (CLI) implementation for the zipstrain application.
5
+ """
6
+ import rich_click as click
7
+ import zipstrain.utils as ut
8
+ import zipstrain.compare as cp
9
+ import zipstrain.profile as pf
10
+ import zipstrain.task_manager as tm
11
+ import zipstrain.database as db
12
+ import polars as pl
13
+ import pathlib
14
+
15
+
16
+
17
@click.group()
def cli():
    """Main CLI app"""
    # Root command group; all sub-groups (utilities, gene-tools, compare, run)
    # are registered against this entry point.
    pass
21
+
22
@cli.group()
def utilities():
    """Utility commands for building null models, BED files, and merging/collecting tables."""
    pass
25
+
26
+ @utilities.command("build-null-model")
27
+ @click.option('--error-rate', '-e', default=0.001, help="Error rate for the sequencing technology.")
28
+ @click.option('--max-total-reads', '-m', default=10000, help="Maximum coverage to consider for a base")
29
+ @click.option('--p-threshold', '-p', default=0.05, help="Significance threshold for the Poisson distribution.")
30
+ @click.option('--output-file', '-o', required=True, help="Path to save the output Parquet file.")
31
+ @click.option('--model-type', '-t', default="poisson", type=click.Choice(['poisson']), help="Type of null model to build.")
32
+ def build_null_model(error_rate, max_total_reads, p_threshold, output_file, model_type):
33
+ """
34
+ Build a null model for sequencing errors based on the Poisson distribution.
35
+
36
+ Args:
37
+ error_rate (float): Error rate for the sequencing technology.
38
+ max_total_reads (int): Maximum total reads to consider.
39
+ p_threshold (float): Significance threshold for the Poisson distribution.
40
+ """
41
+ if model_type == "poisson":
42
+ df_thresh = ut.build_null_poisson(error_rate, max_total_reads, p_threshold)
43
+ else:
44
+ raise ValueError(f"Unsupported model type: {model_type}")
45
+ df_thresh = pl.DataFrame(df_thresh, schema=["cov", "max_error_count"])
46
+ df_thresh.write_parquet(output_file)
47
+
48
+ @utilities.command("merge_parquet")
49
+ @click.option('--input-dir', '-i', required=True, help="Directory containing Parquet files to merge.")
50
+ @click.option('--output-file', '-o', required=True, help="Path to save the merged Parquet file.")
51
+ def merge_parquet(input_dir, output_file):
52
+ """
53
+ Merge multiple Parquet files in a directory into a single Parquet file, adding gene information.
54
+
55
+ Args:
56
+ input_dir (str): Directory containing Parquet files to merge.
57
+ output_file (str): Path to save the merged Parquet file.
58
+ """
59
+ input_path = pathlib.Path(input_dir)
60
+ parquet_files = list(input_path.glob("*.parquet"))
61
+ if not parquet_files:
62
+ raise ValueError(f"No Parquet files found in directory: {input_dir}")
63
+
64
+ mpileup_df = pl.concat([pl.scan_parquet(pf) for pf in parquet_files])
65
+ mpileup_df.sink_parquet(pathlib.Path(output_file), compression='zstd')
66
+
67
+
68
+ @utilities.command("process_mpileup")
69
+ @click.option('--gene-range-table-loc', '-g', required=True, help="Location of the gene range table in TSV format.")
70
+ @click.option('--batch-bed', '-b', required=True, help="Location of the batch BED file.")
71
+ @click.option('--batch-size', '-s', default=10000, help="Buffer size for processing stdin from samtools.")
72
+ @click.option('--output-file', '-o', required=True, help="Location to save the output Parquet file.")
73
+ def process_mpileup(gene_range_table_loc, batch_bed, batch_size, output_file):
74
+ """
75
+ Process mpileup files and save the results in a Parquet file.
76
+
77
+ Args:
78
+ gene_range_table_loc (str): Path to the gene range table in TSV format.
79
+ batch_bed (str): Path to the batch BED file.
80
+ output_file (str): Path to save the output Parquet file.
81
+ """
82
+ ut.process_mpileup_function(gene_range_table_loc, batch_bed, batch_size, output_file)
83
+
84
+ @utilities.command("make_bed")
85
+ @click.option('--db-fasta-dir', '-d', required=True, help="Path to the database in fasta format.")
86
+ @click.option('--max-scaffold-length', '-m', default=500000, help="Maximum scaffold length to split into multiple entries.")
87
+ @click.option('--output-file', '-o', required=True, help="Path to save the output BED file.")
88
+ def make_bed(db_fasta_dir, max_scaffold_length, output_file):
89
+ """
90
+ Create a BED file from the database in fasta format.
91
+
92
+ Args:
93
+ db_fasta_dir (str): Path to the fasta file.
94
+ max_scaffold_length (int): Splits scaffolds longer than this into multiple entries of length <= max_scaffold_length.
95
+ output_file (str): Path to save the output BED file.
96
+ """
97
+ bed_df = ut.make_the_bed(db_fasta_dir, max_scaffold_length)
98
+ bed_df.write_csv(output_file, separator='\t', include_header=False)
99
+
100
+ @utilities.command("get_genome_lengths")
101
+ @click.option('--stb-file', '-s', required=True, help="Path to the scaffold-to-genome mapping file.")
102
+ @click.option('--bed-file', '-b', required=True, help="Path to the BED file.")
103
+ @click.option('--output-file', '-o', required=True, help="Path to save the output Parquet file.")
104
+ def get_genome_lengths(stb_file, bed_file, output_file):
105
+ """
106
+ Extract the genome length information from the scaffold-to-genome mapping table.
107
+
108
+ Args:
109
+ stb_file (str): Path to the scaffold-to-genome mapping file.
110
+ bed_file (str): Path to the BED file containing genomic regions.
111
+ output_file (str): Path to save the output Parquet file.
112
+ """
113
+ stb = pl.scan_csv(stb_file, separator='\t',has_header=False).with_columns(
114
+ pl.col("column_1").alias("scaffold"),
115
+ pl.col("column_2").alias("genome")
116
+ )
117
+
118
+ bed_table = pl.scan_csv(bed_file, separator='\t',has_header=False).with_columns(
119
+ pl.col("column_1").alias("scaffold"),
120
+ pl.col("column_2").cast(pl.Int64).alias("start"),
121
+ pl.col("column_3").cast(pl.Int64).alias("end")
122
+ ).select(["scaffold", "start", "end"])
123
+ genome_length = ut.extract_genome_length(stb, bed_table)
124
+ genome_length.sink_parquet(output_file, compression='zstd')
125
+
126
+ @utilities.command("genome_breadth_matrix")
127
+ @click.option('--profile', '-p', type=str, required=True, help="Path to the profile Parquet file.")
128
+ @click.option('--genome-length', '-g', type=str, required=True, help="Path to the genome length Parquet file.")
129
+ @click.option('--stb', '-s', type=str, required=True, help="Path to the scaffold-to-genome mapping file.")
130
+ @click.option('--min-cov', '-c', default=1, help="Minimum coverage to consider a position.")
131
+ @click.option('--output-file', '-o', required=True, help="Path to save the output Parquet file.")
132
+ def genome_breadth_matrix(profile, genome_length, stb, min_cov, output_file):
133
+ """
134
+ Generate a genome breadth matrix from the given profiles and scaffold-to-genome mapping.
135
+
136
+ Args:
137
+ profiles (list): List of profiles in the format 'name:path_to_profile'.
138
+ genome_length (str): Path to the genome length Parquet file.
139
+ stb_file (str): Path to the scaffold-to-genome mapping file.
140
+ min_cov (int): Minimum coverage to consider a position.
141
+ output_file (str): Path to save the output Parquet file.
142
+ """
143
+ genome_length = pl.scan_parquet(genome_length)
144
+ stb = pl.scan_csv(stb, separator='\t', has_header=False).select(
145
+ pl.col("column_1").alias("scaffold"),
146
+ pl.col("column_2").alias("genome")
147
+ )
148
+ profile_dir= pathlib.Path(profile)
149
+ profile = pl.scan_parquet(profile)
150
+ lf=ut.get_genome_breadth_matrix(profile,profile_dir.name, genome_length,stb, min_cov)
151
+ lf.sink_parquet(output_file, compression='zstd')
152
+
153
+ @utilities.command("collect_breadth_tables")
154
+ @click.option('--breadth-tables-dir', '-d', required=True, help="Directory containing breadth tables in Parquet format.")
155
+ @click.option('--extension', '-e', default='parquet', help="File extension of the breadth tables.")
156
+ @click.option('--output-file', '-o', required=True, help="Path to save the collected breadth tables.")
157
+ def collect_breadth(breadth_tables_dir, extension, output_file):
158
+ """
159
+ Collect multiple genome breadth tables into a single LazyFrame.
160
+
161
+ Args:
162
+ breadth_tables_dir (str): Directory containing breadth tables in Parquet format.
163
+ extension (str): File extension of the breadth tables.
164
+ output_file (str): Path to save the collected breadth tables.
165
+ """
166
+ breadth_tables = list(pathlib.Path(breadth_tables_dir).glob(f"*.{extension}"))
167
+ if not breadth_tables:
168
+ raise ValueError(f"No breadth tables found in directory: {breadth_tables_dir}")
169
+
170
+ lazy_frames = [pl.scan_parquet(str(pf)) for pf in breadth_tables]
171
+ combined_lf = ut.collect_breadth_tables(lazy_frames)
172
+ combined_lf.sink_parquet(output_file, compression='zstd')
173
+
174
+
175
@cli.group()
def gene_tools():
    """Commands for building gene range and gene location tables."""
    pass
179
+
180
+ @gene_tools.command("gene-range-table")
181
+ @click.option('--gene-file', '-g', required=True, help="location of gene file. Prodigal's nucleotide fasta output")
182
+ @click.option('--output-file', '-o', required=True, help="location to save output tsv file")
183
+ def get_gene_range_table(gene_file, output_file):
184
+ """
185
+ Main function to build and save the gene location table.
186
+
187
+ Args:
188
+ gene_file (str): Path to the gene FASTA file.
189
+ output_file (str): Path to save the output TSV file.
190
+ """
191
+ gene_locs=pf.build_gene_range_table(pathlib.Path(gene_file))
192
+ gene_locs.sink_csv(pathlib.Path(output_file), separator="\t", include_header=False)
193
+
194
+
195
+ @gene_tools.command("gene-loc-table")
196
+ @click.option('--gene-file', '-g', required=True, help="location of gene file. Prodigal's nucleotide fasta output")
197
+ @click.option('--scaffold-list', '-s', required=True, help="location of scaffold list. A text file with each line containing a scaffold name.")
198
+ @click.option('--output-file', '-o', required=True, help="location to save output parquet file")
199
+ def get_gene_loc_table(gene_file, scaffold_list, output_file):
200
+ """
201
+ Main function to build and save the gene location table.
202
+
203
+ Args:
204
+ gene_file (str): Path to the gene FASTA file.
205
+ scaffold_list (str): Path to the scaffold list file.
206
+ output_file (str): Path to save the output Parquet file.
207
+ """
208
+ scaffolds=set(pl.read_csv(pathlib.Path(scaffold_list), has_header=False,separator="\t").select(pl.col("column_1")).to_series().to_list())
209
+ gene_locs=pf.build_gene_loc_table(pathlib.Path(gene_file), scaffolds)
210
+ gene_locs.write_parquet(pathlib.Path(output_file))
211
+
212
+
213
+
214
@cli.group()
def compare():
    """Commands for comparing mpileup profiles between samples."""
    pass
217
+
218
+ @compare.command("single_compare_genome")
219
+ @click.option('--mpileup-contig-1', '-m1', required=True, help="Path to the first mpileup file.")
220
+ @click.option('--mpileup-contig-2', '-m2', required=True, help="Path to the second mpileup file.")
221
+ @click.option('--scaffolds-1', '-s1', required=True, help="Path to the list of scaffolds for the first mpileup file.")
222
+ @click.option('--scaffolds-2', '-s2', required=True, help="Path to the list of scaffolds for the second mpileup file.")
223
+ @click.option('--null-model', '-n', required=True, help="Path to the null model Parquet file.")
224
+ @click.option('--stb-file', '-s', required=True, help="Path to the scaffold to genome mapping file.")
225
+ @click.option('--min-cov', '-c', default=5, help="Minimum coverage to consider a position.")
226
+ @click.option('--min-gene-compare-len', '-l', default=100, help="Minimum gene length to consider for comparison.")
227
+ @click.option('--memory-mode', '-m', default="heavy", type=click.Choice(['heavy', 'light'], case_sensitive=False), help="Memory mode for processing: 'heavy' or 'light'.")
228
+ @click.option('--chrom-batch-size', '-b', default=10000, help="Batch size for processing chromosomes. Only used in light memory mode.")
229
+ @click.option('--output-file', '-o', required=True, help="Path to save the parquet file.")
230
+ @click.option('--genome', '-g', default="all", help="If provided, do the comparison only for the specified genome.")
231
+ @click.option('--engine', '-e', default="streaming", type=click.Choice(['streaming', 'gpu',"auto"], case_sensitive=False), help="Engine to use for processing: 'streaming', 'gpu' or 'auto'.")
232
+ def single_compare_genome(mpileup_contig_1, mpileup_contig_2, scaffolds_1, scaffolds_2, null_model, stb_file, min_cov, min_gene_compare_len, memory_mode, chrom_batch_size, output_file, genome, engine):
233
+ """
234
+ Main function to compare two mpileup files and calculate genome and gene statistics.
235
+
236
+ Args:
237
+ mpileup_contig_1 (str): Path to the first mpileup file.
238
+ mpileup_contig_2 (str): Path to the second mpileup file.
239
+ scaffolds_1 (str): Path to the list of scaffolds for the first mpileup file.
240
+ scaffolds_2 (str): Path to the list of scaffolds for the second mpileup file.
241
+ null_model (str): Path to the null model Parquet file.
242
+ gene_locs (str): Path to the gene locations Parquet file.
243
+ min_cov (int): Minimum coverage to consider a position.
244
+ min_gene_compare_len (int): Minimum gene length to consider for comparison.
245
+ memory_mode (str): Memory mode for processing: 'heavy' or 'light'.
246
+ chrom_batch_size (int): Batch size for processing chromosomes. Only used in light memory mode.
247
+ output_file (str): Path to save the parquet file.
248
+ genome (str): If provided, do the comparison only for the specified genome.
249
+ stb_file (str): Path to the scaffold to genome mapping file.
250
+ """
251
+ with pl.StringCache():
252
+ mpile_contig_1 = pl.scan_parquet(mpileup_contig_1).with_columns(
253
+ (pl.col("chrom").cast(pl.Categorical).alias("chrom"),
254
+ pl.col("gene").cast(pl.Categorical).alias("gene"))
255
+ )
256
+ mpile_contig_2 = pl.scan_parquet(mpileup_contig_2).with_columns(
257
+ (pl.col("chrom").cast(pl.Categorical).alias("chrom"),
258
+ pl.col("gene").cast(pl.Categorical).alias("gene"))
259
+ )
260
+
261
+ stb = pl.scan_csv(stb_file, separator="\t", has_header=False).with_columns(
262
+ pl.col("column_1").alias("scaffold").cast(pl.Categorical),
263
+ pl.col("column_2").alias("genome").cast(pl.Categorical)
264
+ ).select(["scaffold", "genome"])
265
+
266
+ null_model = pl.scan_parquet(null_model)
267
+ mpile_contig_1_name = pathlib.Path(mpileup_contig_1).name
268
+ mpile_contig_2_name = pathlib.Path(mpileup_contig_2).name
269
+ if genome != "all":
270
+ scaffold_scope = stb.filter(pl.col("genome") == genome).collect()["scaffold"].to_list()
271
+ else:
272
+ scaffold_scope = None
273
+
274
+ if memory_mode == "light":
275
+ scaffolds_1 = pl.scan_csv(scaffolds_1, separator="\t", has_header=False).select(pl.col("column_1").alias("scaffold"))
276
+ scaffolds_2 = pl.scan_csv(scaffolds_2, separator="\t", has_header=False).select(pl.col("column_1").alias("scaffold"))
277
+ shared_scaffolds = list(set(scaffolds_1["scaffold"].to_list()).intersection(set(scaffolds_2["scaffold"].to_list())))
278
+ mpile_contig_1 = mpile_contig_1.filter(pl.col("chrom").is_in(shared_scaffolds))
279
+ mpile_contig_2 = mpile_contig_2.filter(pl.col("chrom").is_in(shared_scaffolds))
280
+ else:
281
+ shared_scaffolds=None
282
+
283
+
284
+ comp = cp.compare_genomes(mpile_contig_1=mpile_contig_1,
285
+ mpile_contig_2=mpile_contig_2,
286
+ null_model=null_model,
287
+ scaffold_to_genome=stb,
288
+ min_cov=min_cov,
289
+ min_gene_compare_len=min_gene_compare_len,
290
+ memory_mode=memory_mode,
291
+ chrom_batch_size=chrom_batch_size,
292
+ shared_scaffolds=shared_scaffolds,
293
+ scaffold_scope=scaffold_scope,
294
+ engine=engine)
295
+
296
+ comp=comp.with_columns(pl.lit(mpile_contig_1_name).alias("sample_1"), pl.lit(mpile_contig_2_name).alias("sample_2")).fill_null(0)
297
+ comp.sink_parquet(output_file,engine=engine)
298
+
299
@cli.group()
def run():
    """Commands for preparing and executing profiling and comparison runs."""
    pass
302
+
303
+ @run.command("prepare_profiling",help="Prepare the files needed for profiling bam files and save them in the specified output directory.")
304
+ @click.option('--reference-fasta', '-r', required=True, help="Path to the reference genome in FASTA format.")
305
+ @click.option('--gene-fasta', '-g', required=True, help="Path to the gene annotations in FASTA format.")
306
+ @click.option('--stb-file', '-s', required=True, help="Path to the scaffold-to-genome mapping file.")
307
+ @click.option('--output-dir', '-o', required=True, help="Directory to save the profiling database.")
308
+ def prepare_profiling(reference_fasta, gene_fasta, stb_file, output_dir):
309
+ """
310
+ Prepare the files needed for profiling bam files and save them in the specified output directory.
311
+ """
312
+ output_dir=pathlib.Path(output_dir)
313
+ output_dir.mkdir(parents=True, exist_ok=True)
314
+ bed_df = ut.make_the_bed(reference_fasta)
315
+ bed_df.write_csv(output_dir / "genomes_bed_file.bed", separator='\t', include_header=False)
316
+ gene_range_table = pf.build_gene_range_table(pathlib.Path(gene_fasta))
317
+ gene_range_table.write_csv(output_dir / "gene_range_table.tsv", separator='\t', include_header=False)
318
+
319
+ stb = pl.scan_csv(stb_file, separator='\t',has_header=False).with_columns(
320
+ pl.col("column_1").alias("scaffold"),
321
+ pl.col("column_2").alias("genome")
322
+ )
323
+
324
+ bed_df = bed_df.lazy()
325
+ genome_length = ut.extract_genome_length(stb, bed_df)
326
+ genome_length.sink_parquet(output_dir / "genome_lengths.parquet", compression='zstd')
327
+
328
+ @run.command("compare_genomes")
329
+ @click.option("--genome-comparison-object", "-g", required=True, help="Path to the genome comparison object in json format.")
330
+ @click.option("--run-dir", "-r", required=True, help="Directory to save the run data.")
331
+ @click.option("--max-concurrent-batches", "-m", default=5, help="Maximum number of concurrent batches to run.")
332
+ @click.option("--poll-interval", "-p", default=1, help="Polling interval in seconds to check the status of batches.")
333
+ @click.option("--execution-mode", "-e", default="local", help="Execution mode: 'local' or 'slurm'.")
334
+ @click.option("--slurm-config", "-s", default=None, help="Path to the SLURM configuration file in json format. Required if execution mode is 'slurm'.")
335
+ @click.option("--container-engine", "-c", default="local", help="Container engine to use: 'local', 'docker' or 'apptainer'.")
336
+ @click.option("--task-per-batch", "-t", default=10, help="Number of tasks to include in each batch.")
337
+ @click.option("--polars-engine", "-a", default="streaming", type=click.Choice(['streaming', 'gpu', 'auto'], case_sensitive=False), help="Polars engine to use: 'streaming', 'gpu' or 'auto'.")
338
+ @click.option("--chrom-batch-size", "-b", default=10000, help="Batch size for processing chromosomes. Only used in light memory mode.")
339
+ @click.option("--memory-mode", "-h", default="heavy", type=click.Choice(['heavy', 'light'], case_sensitive=False), help="Memory mode for processing: 'heavy' or 'light'.")
340
+ def compare_genomes(genome_comparison_object, run_dir, max_concurrent_batches, poll_interval, execution_mode, slurm_config, container_engine, task_per_batch, polars_engine, chrom_batch_size, memory_mode):
341
+ """
342
+ Run genome comparisons in batches using the specified execution mode and container engine.
343
+
344
+ Args:
345
+ genome_comparison_object (str): Path to the genome comparison object in json format.
346
+ run_dir (str): Directory to save the run data.
347
+ max_concurrent_batches (int): Maximum number of concurrent batches to run.
348
+ poll_interval (int): Polling interval in seconds to check the status of batches.
349
+ execution_mode (str): Execution mode: 'local' or 'slurm'.
350
+ slurm_config (str): Path to the SLURM configuration file in json format. Required if execution mode is 'slurm'.
351
+ container_engine (str): Container engine to use: 'local', 'docker' or 'apptainer'.
352
+ task_per_batch (int): Number of tasks to include in each batch.
353
+ """
354
+ genome_comp_db=db.GenomeComparisonDatabase.load_obj(pathlib.Path(genome_comparison_object))
355
+ run_dir=pathlib.Path(run_dir)
356
+ slurm_conf=None
357
+ if execution_mode == "slurm":
358
+ if slurm_config is None:
359
+ raise ValueError("SLURM configuration file must be provided when execution mode is 'slurm'.")
360
+ slurm_conf = tm.SlurmConfig.from_json(slurm_config)
361
+
362
+ if container_engine == "local":
363
+ container_engine_obj = tm.LocalEngine(address="")
364
+ elif container_engine == "docker":
365
+ container_engine_obj = tm.DockerEngine(address="parsaghadermazi/zipstrain:amd64") #could go to a toml or json config file
366
+ elif container_engine == "apptainer":
367
+ container_engine_obj = tm.ApptainerEngine(address="docker://parsaghadermazi/zipstrain:amd64") #could go to a toml or json config file
368
+ else:
369
+ raise ValueError("Invalid container engine. Choose from 'local', 'docker', or 'apptainer'.")
370
+ tm.lazy_run_compares(
371
+ comps_db=genome_comp_db,
372
+ container_engine=container_engine_obj,
373
+ run_dir=run_dir,
374
+ max_concurrent_batches=max_concurrent_batches,
375
+ polars_engine=polars_engine,
376
+ execution_mode=execution_mode,
377
+ slurm_config=slurm_conf,
378
+ memory_mode=memory_mode,
379
+ chrom_batch_size=chrom_batch_size,
380
+ tasks_per_batch=task_per_batch,
381
+ poll_interval=poll_interval,
382
+ )
383
+
384
+ @run.command("build-comp-database")
385
+ @click.option("--profile-db-dir", "-p", required=True, help="Directory containing profile either in parquet format.")
386
+ @click.option("--config-file", "-c", required=True, help="Path to the genome comparsion database config file in json format.")
387
+ @click.option("--output-dir", "-o", required=True, help="Directory to genome comparison database object.")
388
+ @click.option("--comp-db-file", "-f", required=False, help="The initial database file. If provided only additional comparisons will be added to this database.")
389
+ def build_comp_database(profile_db_dir, config_file, output_dir, comp_db_file):
390
+ """
391
+ Build a genome comparison database from the given profiles and configuration.
392
+
393
+ Parameters:
394
+ profile_db_dir (str): Directory containing profile either in parquet format.
395
+ config_file (str): Path to the genome comparison database config file in json format.
396
+ """
397
+ profile_db_dir=pathlib.Path(profile_db_dir)
398
+ profile_db=db.ProfileDatabase(
399
+ db_loc=profile_db_dir,
400
+ )
401
+ existing_db_loc=pathlib.Path(comp_db_file) if comp_db_file is not None else None
402
+ if existing_db_loc is not None and not existing_db_loc.exists():
403
+ raise FileNotFoundError(f"{existing_db_loc} does not exist.")
404
+ obj=db.GenomeComparisonDatabase(
405
+ profile_db=profile_db,
406
+ config=db.GenomeComparisonConfig.from_json(pathlib.Path(config_file)),
407
+ comp_db_loc=existing_db_loc,
408
+ )
409
+ obj.dump_obj(pathlib.Path(output_dir))
410
+
411
+ @run.command("to-complete-table")
412
+ @click.option("--genome-comparison-object", "-g", required=True, help="Path to the genome comparison object in json format.")
413
+ @click.option("--output-file", "-o", required=True, help="Path to save the completed pairs Parquet file.")
414
+ def to_complete_table(genome_comparison_object, output_file):
415
+ """
416
+ Generate a table of completed genome comparison pairs and save it to a Parquet file.
417
+
418
+ Parameters:
419
+ genome_comparison_object (str): Path to the genome comparison object in json format.
420
+ output_file (str): Path to save the completed pairs Parquet file.
421
+ """
422
+ genome_comp_db=db.GenomeComparisonDatabase.load_obj(pathlib.Path(genome_comparison_object))
423
+ completed_pairs=genome_comp_db.to_complete_input_table()
424
+ completed_pairs.sink_parquet(pathlib.Path(output_file), compression='zstd', engine="streaming")
425
+
426
+
427
+
428
+
429
+ if __name__ == "__main__":
430
+ cli()