pybiolib 1.1.2236__py3-none-any.whl → 1.1.2250__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biolib/_internal/utils/multinode.py +264 -0
- biolib/cli/data_record.py +2 -0
- {pybiolib-1.1.2236.dist-info → pybiolib-1.1.2250.dist-info}/METADATA +1 -1
- {pybiolib-1.1.2236.dist-info → pybiolib-1.1.2250.dist-info}/RECORD +7 -6
- {pybiolib-1.1.2236.dist-info → pybiolib-1.1.2250.dist-info}/LICENSE +0 -0
- {pybiolib-1.1.2236.dist-info → pybiolib-1.1.2250.dist-info}/WHEEL +0 -0
- {pybiolib-1.1.2236.dist-info → pybiolib-1.1.2250.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,264 @@
|
|
1
|
+
import glob
|
2
|
+
import os
|
3
|
+
import re
|
4
|
+
import shutil
|
5
|
+
import subprocess
|
6
|
+
import tempfile
|
7
|
+
|
8
|
+
import biolib
|
9
|
+
from biolib.utils import SeqUtil
|
10
|
+
|
11
|
+
|
12
|
+
def natsorted(lst):
    """Sort the list using the natural sort key."""

    def _natural_sort_key(s):
        """A key function for natural sorting."""
        key = []
        # Split into alternating non-digit / digit chunks so 'job_10' > 'job_2'
        for chunk in re.split('([0-9]+)', s):
            key.append(int(chunk) if chunk.isdigit() else chunk.lower())
        return key

    return sorted(lst, key=_natural_sort_key)
|
20
|
+
|
21
|
+
|
22
|
+
def fasta_above_threshold(fasta_file, work_threshold, work_per_residue=1, verbose=False):
    """True if total FASTA residue work is at or above work_threshold.

    Args:
        fasta_file: Path to a FASTA file.
        work_threshold: Work-unit count at which this function returns True.
        work_per_residue: Work units attributed to each residue.
        verbose: If True, print how the threshold was (not) reached.

    Returns:
        True as soon as the accumulated work units reach work_threshold
        (remaining records are not scanned), otherwise False.
    """
    records = SeqUtil.parse_fasta(fasta_file)

    # Calculate work units, stopping early once the threshold is reached
    total_work_units = 0
    for i, record in enumerate(records):
        sequence_work_units = len(record.sequence) * work_per_residue
        total_work_units += sequence_work_units

        if total_work_units >= work_threshold:
            if verbose:
                print(f'FASTA above threshold (stopped at {total_work_units}) >= {work_threshold}')
                print(f'From {i+1}/{len(records)} sequences in {fasta_file}')
            return True

    if verbose:
        print(f'FASTA below threshold ({total_work_units}) < {work_threshold}')
        print(f'From {len(records)} sequences in {fasta_file}')

    return False
|
44
|
+
|
45
|
+
|
46
|
+
def run_locally(command_list, args):
    """Run script locally (no multi-node processing).

    Args:
        command_list: Base command, e.g. ['python3', 'predict.py'].
        args: argparse.Namespace holding the script arguments; any argument
            whose name starts with 'multinode' is stripped before running.
    """
    # Copy the mapping: vars(args) returns the live __dict__ of the Namespace,
    # so deleting keys from it directly would mutate the caller's args.
    new_args = dict(vars(args))

    # Delete multinode-specific input arguments
    for key in list(new_args.keys()):
        if str(key).startswith('multinode'):
            del new_args[key]

    # Convert to list format
    new_args_list = _args_dict_to_args_list(new_args)

    # Prepare command, e.g. ["python3", "predict.py"] + new_args_list
    command = command_list + new_args_list

    if args.verbose >= 1:
        print(f'Running {command}')

    # Run command; check=False so a non-zero exit is reported, not raised
    result = subprocess.run(command, capture_output=True, text=True, check=False)
    if result.returncode == 0:
        print(f'{result.stdout}')
    else:
        print(f'Error: {result.stderr}')
|
72
|
+
|
73
|
+
|
74
|
+
def fasta_batch_records(fasta_file, work_per_batch_min, work_per_residue=1, verbose=False):
    """Converts FASTA records to batches of records, based on thresholds.

    Args:
        fasta_file: Path to a FASTA file.
        work_per_batch_min: Minimum work units per batch; a batch is closed
            as soon as its accumulated work reaches this value.
        work_per_residue: Work units attributed to each residue.
        verbose: If True, print per-batch statistics.

    Returns:
        List of batches; each batch is a list of FASTA records.
    """

    def log_batches(batches):
        # One stats line per batch
        for i, batch in enumerate(batches):
            n_seqs = len(batch)
            n_res = sum(len(record.sequence) for record in batch)
            print(f'Batch {i+1}: {n_res} residues from {n_seqs} sequences')

    records = SeqUtil.parse_fasta(fasta_file)

    batches = []
    batch = []
    current_work_units = 0
    for record in records:
        # Add to batch, then account for its work
        batch.append(record)
        current_work_units += len(record.sequence) * work_per_residue

        # If at/above the limit, close this batch and start a new one
        if current_work_units >= work_per_batch_min:
            batches.append(batch)
            batch = []
            current_work_units = 0

    # Append last (partial) batch if present
    if batch:
        batches.append(batch)

    if verbose:
        log_batches(batches)

    return batches
|
119
|
+
|
120
|
+
|
121
|
+
def fasta_send_batches_biolib(app_url, batches, args, args_fasta='fasta', verbose=1):
    """
    Send jobs through pybiolib interface

    Each batch of FASTA records is written to a temporary FASTA file and
    submitted as a non-blocking job to the Biolib app at *app_url*; job
    outputs are then streamed sequentially and saved to job_output/job_<i+1>.

    NOTE(review): new_args = vars(args) is the live __dict__ of the caller's
    Namespace, so the keys set below (the FASTA path and
    'multinode_only_local') mutate the caller's args — confirm this is
    intended.
    NOTE(review): both the 'verbose' parameter and args.verbose are consulted
    in different places below — presumably intentional, but verify.
    """

    if args.verbose >= 1:
        print(f'Sending {len(batches)} batches to Biolib')

    # Login to biolib, prepare app
    # current_app = biolib.load(Runtime.get_app_uri())
    biolib.login()
    current_app = biolib.load(app_url)  # Nb: uses "_" not "-"

    # Compute results
    job_list = []
    for i, batch_records in enumerate(batches):  # MH
        # Write FASTA, send to server
        with tempfile.TemporaryDirectory() as tempdir:
            # New arguments (see NOTE above: this aliases args.__dict__)
            new_args = vars(args)

            # Write batched FASTA to send
            fasta_path = f'{tempdir}/input.fasta'
            SeqUtil.write_records_to_fasta(fasta_path, batch_records)
            new_args[args_fasta] = fasta_path
            # Tell the remote invocation to run locally (no further fan-out)
            new_args['multinode_only_local'] = True

            # Convert to list
            new_args_list = _args_dict_to_args_list(new_args)

            # Send job (non-blocking: all batches are submitted before streaming)
            job = current_app.cli(args=new_args_list, blocking=False)
            job_list.append(job)

            # Job stats
            if args.verbose:
                batch_dict = _get_batch_stats(batch_records)
                n_seqs, n_res = batch_dict['records'], batch_dict['residues']
                print(f'Sending job {i+1}: {n_res} residues from {n_seqs} sequences -> arg_list = {new_args_list}')

    # Stream job output at a time
    print('Streaming job outputs ...')
    for i, job in enumerate(job_list):
        job.stream_logs()

        # Check if job succeeded
        assert job.get_exit_code() == 0, f'Job failed with exit code {job.get_exit_code()}'

        # Write to disk
        output_dir = f'job_output/job_{i+1}'
        job.save_files(output_dir=output_dir)

        if verbose:
            print(f'Saving to {output_dir}')
|
175
|
+
|
176
|
+
|
177
|
+
def merge_folder(folder_name, job_out_dir='job_output', out_dir='output', verbose=1):
    """Merge <job_dir>/<folder_name> from every job directory into one folder.

    The first job's folder is moved wholesale to <out_dir>/<folder_name>;
    files from all remaining job folders are then moved into it.

    Args:
        folder_name: Name of the per-job output folder to merge.
        job_out_dir: Directory containing the job_* directories.
        out_dir: Destination directory for the merged folder.
        verbose: If truthy, print progress details.

    Raises:
        FileNotFoundError: If no job_* directories exist under job_out_dir.
    """
    os.makedirs(out_dir, exist_ok=True)

    job_dirs = natsorted(glob.glob(f'{job_out_dir}/job_*'))

    # Fail early with a clear message instead of an IndexError on job_dirs[0]
    if not job_dirs:
        raise FileNotFoundError(f'No job directories found in {job_out_dir}')

    # Move first folder, prepare to merge the rest into it
    first_folder = f'{job_dirs[0]}/{folder_name}'
    merged_folder = f'{out_dir}/{folder_name}'
    shutil.move(first_folder, merged_folder)

    if verbose:
        print(f'Merging {folder_name} from {len(job_dirs)} directories to {merged_folder}')

    # Move over files from every remaining job folder (no-op for a single job)
    for job_dir in job_dirs[1:]:
        extra_folder = f'{job_dir}/{folder_name}'
        for file_name in os.listdir(extra_folder):
            shutil.move(f'{extra_folder}/{file_name}', merged_folder)
|
203
|
+
|
204
|
+
|
205
|
+
def merge_file(
    file_name,
    header_lines_int=1,
    job_out_dir='job_output',
    out_dir='output',
    verbose=1,
):
    """Merge <job_dir>/<file_name> from every job directory into one file.

    The first job's file (including its header) is moved to
    <out_dir>/<file_name>; the remaining jobs' files are appended with their
    first header_lines_int lines skipped.

    Args:
        file_name: Name of the per-job output file to merge.
        header_lines_int: Number of header lines to skip in each extra file.
        job_out_dir: Directory containing the job_* directories.
        out_dir: Destination directory for the merged file.
        verbose: If truthy, print progress details.

    Raises:
        FileNotFoundError: If no job_* directories exist under job_out_dir.
    """
    os.makedirs(out_dir, exist_ok=True)

    job_dirs = natsorted(glob.glob(f'{job_out_dir}/job_*'))

    # Fail early with a clear message instead of an IndexError on job_dirs[0]
    if not job_dirs:
        raise FileNotFoundError(f'No job directories found in {job_out_dir}')

    # Move first file, prepare to merge the rest into it
    first_file = f'{job_dirs[0]}/{file_name}'
    merged_file = f'{out_dir}/{file_name}'
    shutil.move(first_file, merged_file)

    if verbose:
        print(f'Merging {file_name} from {len(job_dirs)} directories to {merged_file}')

    # Append remaining files to the first one, skipping their headers
    if len(job_dirs) >= 2:
        with open(merged_file, 'a') as merged_file_handle:
            for job_dir in job_dirs[1:]:
                with open(f'{job_dir}/{file_name}') as extra_file_handle:
                    # Skip first n header lines
                    for _ in range(header_lines_int):
                        next(extra_file_handle)

                    # Append remaining content to the merged file
                    merged_file_handle.write(extra_file_handle.read())
|
243
|
+
|
244
|
+
|
245
|
+
def _get_batch_stats(batch):
    """Return {'records': ..., 'residues': ...} stats for a batch of records."""
    residue_count = 0
    for record in batch:
        residue_count += len(record.sequence)

    return {'records': len(batch), 'residues': residue_count}
|
252
|
+
|
253
|
+
|
254
|
+
def _args_dict_to_args_list(new_args):
    """Converts args dict to list of arguments for Biolib."""
    # Each (key, value) pair becomes two consecutive tokens: '--key', 'value'
    return [token for key, value in new_args.items() for token in (f'--{key}', f'{value}')]
|
biolib/cli/data_record.py
CHANGED
@@ -6,6 +6,7 @@ from typing import Dict, List
|
|
6
6
|
import click
|
7
7
|
|
8
8
|
from biolib._data_record.data_record import DataRecord
|
9
|
+
from biolib.biolib_api_client import BiolibApiClient
|
9
10
|
from biolib.biolib_logging import logger, logger_no_user_data
|
10
11
|
from biolib.typing_utils import Optional
|
11
12
|
|
@@ -57,6 +58,7 @@ def download(uri: str, file: Optional[str], path_filter: Optional[str]) -> None:
|
|
57
58
|
@click.argument('uri', required=True)
|
58
59
|
@click.option('--json', 'output_as_json', is_flag=True, default=False, required=False, help='Format output as JSON')
|
59
60
|
def describe(uri: str, output_as_json: bool) -> None:
|
61
|
+
BiolibApiClient.assert_is_signed_in(authenticated_action_description='get Data Record description')
|
60
62
|
record = DataRecord.get_by_uri(uri)
|
61
63
|
files_info: List[Dict] = []
|
62
64
|
total_size_in_bytes = 0
|
@@ -23,6 +23,7 @@ biolib/_internal/types/experiment.py,sha256=D94iBdn2nS92lRW-TOs1a2WKXJD5ZtmzL4yp
|
|
23
23
|
biolib/_internal/types/resource.py,sha256=G-vPkZoe4Um6FPxsQZtRzAlbSW5sDW4NFkbjn21I3V4,372
|
24
24
|
biolib/_internal/types/typing.py,sha256=D4EKKEe7kDx0K6lJi-H_XLtk-8w6nu2fdqn9bvzI-Xo,288
|
25
25
|
biolib/_internal/utils/__init__.py,sha256=p5vsIFyu-zYqBgdSMfwW9NC_jk7rXvvCbV4Bzd3As7c,630
|
26
|
+
biolib/_internal/utils/multinode.py,sha256=UnM08GXc8U-p0eoSleer4BIgngIsn_fgh9FxRQJkIiI,8068
|
26
27
|
biolib/_runtime/runtime.py,sha256=oVgTnDDJv9L4BUP1_sd0oAj4LLyyiPSQdhp7ixWARvw,2923
|
27
28
|
biolib/api/__init__.py,sha256=mQ4u8FijqyLzjYMezMUUbbBGNB3iFmkNdjXnWPZ7Jlw,138
|
28
29
|
biolib/api/client.py,sha256=FRpdH5aI187b_I_4HUNi680v4iOP65z5f2RcUo8D8MA,3559
|
@@ -57,7 +58,7 @@ biolib/biolib_errors.py,sha256=5m4lK2l39DafpoXBImEBD4EPH3ayXBX0JgtPzmGClow,689
|
|
57
58
|
biolib/biolib_logging.py,sha256=J3E5H_LL5k6ZUim2C8gqN7E6lCBZMTpO4tnMpOPwG9U,2854
|
58
59
|
biolib/cli/__init__.py,sha256=0v3c_J-U0k46c5ZWeQjLG_kTaKDJm81LBxQpDO2B_aI,1286
|
59
60
|
biolib/cli/auth.py,sha256=rpWGmXs6Fz6CGrO9K8ibPRszOdXG78Vig_boKaVCD9A,2082
|
60
|
-
biolib/cli/data_record.py,sha256=
|
61
|
+
biolib/cli/data_record.py,sha256=t8DfJK2EZ_SNZ9drDA_N5Jqy8DNwf9f5SlFrIaOvtv0,3501
|
61
62
|
biolib/cli/download_container.py,sha256=HIZVHOPmslGE5M2Dsp9r2cCkAEJx__vcsDz5Wt5LRos,483
|
62
63
|
biolib/cli/init.py,sha256=wQOfii_au-d30Hp7DdH-WVw-WVraKvA_zY4za1w7DE8,821
|
63
64
|
biolib/cli/lfs.py,sha256=z2qHUwink85mv9yDgifbVKkVwuyknGhMDTfly_gLKJM,4151
|
@@ -116,8 +117,8 @@ biolib/utils/cache_state.py,sha256=u256F37QSRIVwqKlbnCyzAX4EMI-kl6Dwu6qwj-Qmag,3
|
|
116
117
|
biolib/utils/multipart_uploader.py,sha256=XvGP1I8tQuKhAH-QugPRoEsCi9qvbRk-DVBs5PNwwJo,8452
|
117
118
|
biolib/utils/seq_util.py,sha256=ZQFcaE37B2dtucN2zDjOmdya_X0ITc1zBFZJNQY13XA,5183
|
118
119
|
biolib/utils/zip/remote_zip.py,sha256=0wErYlxir5921agfFeV1xVjf29l9VNgGQvNlWOlj2Yc,23232
|
119
|
-
pybiolib-1.1.
|
120
|
-
pybiolib-1.1.
|
121
|
-
pybiolib-1.1.
|
122
|
-
pybiolib-1.1.
|
123
|
-
pybiolib-1.1.
|
120
|
+
pybiolib-1.1.2250.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
|
121
|
+
pybiolib-1.1.2250.dist-info/METADATA,sha256=IwmPFCDmfGZxhiEc-2YbSTyVc7tARTMCYIeVlk0NHSo,1508
|
122
|
+
pybiolib-1.1.2250.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
123
|
+
pybiolib-1.1.2250.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
|
124
|
+
pybiolib-1.1.2250.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|