PyPI - geoseeq - Versions diffs - 0.6.15.dev1__py3-none-any.whl → 0.7.1__py3-none-any.whl - Mend

geoseeq 0.6.15.dev1__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

geoseeq/cli/download.py +2 -2
geoseeq/cli/main.py +1 -1
geoseeq/cli/upload/__init__.py +4 -3
geoseeq/cli/upload/upload_advanced.py +230 -0
geoseeq/cli/upload/upload_reads.py +14 -5
geoseeq/constants.py +4 -1
geoseeq/dashboard/dashboard.py +103 -0
geoseeq/file_system/filesystem_download.py +434 -0
geoseeq/file_system/main.py +122 -0
geoseeq/id_constructors/from_ids.py +20 -14
geoseeq/id_constructors/utils.py +17 -0
geoseeq/plotting/README.md +4 -0
geoseeq/upload_download_manager.py +16 -6
geoseeq/vc/README.md +48 -0
{geoseeq-0.6.15.dev1.dist-info → geoseeq-0.7.1.dist-info}/METADATA +8 -10
{geoseeq-0.6.15.dev1.dist-info → geoseeq-0.7.1.dist-info}/RECORD +20 -18
{geoseeq-0.6.15.dev1.dist-info → geoseeq-0.7.1.dist-info}/WHEEL +1 -2
geoseeq-0.6.15.dev1.dist-info/top_level.txt +0 -2
tests/test_api_client.py +0 -283
tests/test_plotting.py +0 -29
/tests/__init__.py → /geoseeq/contrib/ncbi/README.md +0 -0
{geoseeq-0.6.15.dev1.dist-info → geoseeq-0.7.1.dist-info}/entry_points.txt +0 -0
{geoseeq-0.6.15.dev1.dist-info → geoseeq-0.7.1.dist-info/licenses}/LICENSE +0 -0

geoseeq/cli/download.py CHANGED Viewed

@@ -414,7 +414,7 @@ def _make_read_configs(download_results, config_dir="."):
         "reads_1": ["small.fq.gz"],
         "reads_2": [],
         "fastq_checksum": "",
-        "data_type": "short-read",
+        "data_type": "single",
         "bdx_result_dir": "results",
         "geoseeq_uuid": "05bf22e9-9d25-42db-af25-31bc538a7006"
     }
@@ -428,7 +428,7 @@ def _make_read_configs(download_results, config_dir="."):
                 "reads_1": [],
                 "reads_2": [],
                 "fastq_checksum": "",
-                "data_type": "short-read",
+                "data_type": read_type,
                 "bdx_result_dir": "results",
                 "geoseeq_uuid": sample.uuid,
             }

geoseeq/cli/main.py CHANGED Viewed

@@ -55,7 +55,7 @@ def version():
     Use of this tool implies acceptance of the GeoSeeq End User License Agreement.
     Run `geoseeq eula show` to view the EULA.
     """
-    click.echo('0.6.15dev1')  # remember to update pyproject.toml
+    click.echo('0.7.1')  # remember to update pyproject.toml
 @main.group('advanced')

geoseeq/cli/upload/__init__.py CHANGED Viewed

@@ -8,7 +8,7 @@ from .upload import (
     cli_upload_smart_tree,
 )
 from .upload_reads import cli_upload_reads_wizard
-from .upload_advanced import cli_find_urls_for_reads
+from .upload_advanced import cli_find_urls_for_reads, cli_upload_from_config
 @click.group('upload')
 def cli_upload():
@@ -22,9 +22,10 @@ cli_upload.add_command(cli_metadata)
 cli_upload.add_command(cli_upload_smart_table)
 cli_upload.add_command(cli_upload_smart_tree)
-@click.group('upload')
+@cli_upload.group('advanced')
 def cli_upload_advanced():
     """Advanced tools to upload files to GeoSeeq."""
     pass
-cli_upload_advanced.add_command(cli_find_urls_for_reads)
+cli_upload_advanced.add_command(cli_find_urls_for_reads)
+cli_upload_advanced.add_command(cli_upload_from_config)

geoseeq/cli/upload/upload_advanced.py CHANGED Viewed

@@ -24,6 +24,16 @@ from geoseeq.cli.shared_params import (
 from geoseeq.constants import FASTQ_MODULE_NAMES
 from geoseeq.cli.progress_bar import PBarManager
+import pandas as pd
+from typing import Dict, Optional
+from geoseeq.id_constructors.from_ids import (
+    org_from_id,
+    project_from_id,
+    sample_from_id,
+    result_folder_from_id,
+    result_file_from_id,
+)
+from geoseeq.upload_download_manager import GeoSeeqUploadManager
 logger = logging.getLogger('geoseeq_api')
@@ -90,3 +100,223 @@ def cli_find_urls_for_reads(state, cores, overwrite, yes, regex, private, module
     groups = _group_files(knex, filepaths, module_name, regex, yes)
     for file_name, target_url in _find_target_urls(groups, module_name, proj, filepaths, overwrite, cores, state):
         print(f'{file_name}\t{target_url}', file=state.outfile)
+def _get_result_file_from_record_with_ids(knex, record: Dict) -> Dict:
+    """Resolve GeoSeeq objects for a config record using only directly addressable IDs.
+    The filename, folder, sample and project columns are tried in that order with the
+    matching `*_from_id` constructor. The first column that resolves also fills in its
+    parent objects and ends the search. A ValueError is taken to mean the value is not
+    a GRN, UUID or absolute name, and the next column is tried.
+    Returns a dict with 'org', 'project', 'sample', 'folder', and 'result_file' keys.
+    Entries that were not resolved stay None. If no column resolves, the organization
+    column is looked up with `org_from_id` so that 'org' is populated.
+    """
+    found = dict.fromkeys(('org', 'project', 'sample', 'folder', 'result_file'))
+    def _fill_parents_of_folder():
+        # A folder exposing a `sample` attribute is walked up through that sample,
+        # otherwise it is walked up through its project.
+        folder = found['folder']
+        if hasattr(folder, 'sample'):
+            found['sample'] = folder.sample
+            found['project'] = found['sample'].project
+        else:
+            found['project'] = folder.project
+        found['org'] = found['project'].org
+    # Most specific first: the filename column may directly identify a result file
+    try:
+        found['result_file'] = result_file_from_id(knex, record['filename'])
+        found['folder'] = found['result_file'].folder
+        _fill_parents_of_folder()
+    except ValueError:
+        pass  # Not a GRN, UUID or abs name. Fall through to the folder column
+    else:
+        return found
+    # Next: the folder column may directly identify a result folder
+    try:
+        found['folder'] = result_folder_from_id(knex, record['folder'])
+        _fill_parents_of_folder()
+    except ValueError:
+        pass  # Not a GRN, UUID or abs name. Fall through to the sample column
+    else:
+        return found
+    # Next: the sample column, which is allowed to be blank
+    if pd.notna(record['sample']):
+        try:
+            found['sample'] = sample_from_id(knex, record['sample'])
+            found['project'] = found['sample'].project
+            found['org'] = found['project'].org
+        except ValueError:
+            pass  # Not a GRN, UUID or abs name. Fall through to the project column
+        else:
+            return found
+    # Next: the project column
+    try:
+        found['project'] = project_from_id(knex, record['project'])
+        found['org'] = found['project'].org
+    except ValueError:
+        pass  # Not a GRN/UUID, fall back to the organization column
+    else:
+        return found
+    if found['org'] is None:  # Get org directly if we don't have one yet
+        found['org'] = org_from_id(knex, record['organization'])
+    return found
+def _get_result_file_from_record(knex, record: Dict) -> Dict:
+    """Get all relevant objects from a record, handling GRNs/UUIDs without requiring parent objects.
+    Objects that could not be resolved directly from IDs are looked up by name under
+    their parent: project under org, sample under project, folder under the sample
+    (or under the project when the record has no sample), result file under folder.
+    `idem()` is then called on the result file.
+    Returns a dict with 'org', 'project', 'sample', 'folder', and 'result_file' keys.
+    'sample' is None when the record does not specify one.
+    """
+    objects = _get_result_file_from_record_with_ids(knex, record)
+    if objects['project'] is None:
+        objects['project'] = objects['org'].project(record['project'])
+    if objects['sample'] is None and pd.notna(record['sample']):
+        objects['sample'] = objects['project'].sample(record['sample'])
+    # The folder belongs to the sample whenever there is one, whether it was resolved
+    # by ID or by name. Only sample-less records put the folder under the project.
+    parent = objects['sample'] if objects['sample'] is not None else objects['project']
+    if objects['folder'] is None:
+        objects['folder'] = parent.result_folder(record['folder'])
+    if objects['result_file'] is None:
+        objects['result_file'] = objects['folder'].result_file(record['filename'])
+    objects['result_file'].idem()
+    # Logged rather than printed so the CLI's stdout stays clean
+    logger.debug('Resolved upload target objects: %s', objects)
+    return objects
+def _add_record_to_upload_manager_local_file(record: Dict, result_file, upload_manager: GeoSeeqUploadManager) -> None:
+    """Queue the local file at `record['path']` on the upload manager for `result_file`."""
+    local_path = record['path']
+    upload_manager.add_result_file(result_file, local_path, link_type='upload')
+def _add_record_to_upload_manager_s3_file(record: Dict, result_file, upload_manager: GeoSeeqUploadManager) -> None:
+    """Add an S3 file link to the upload manager.
+    Handles two types of S3 URLs:
+    1. https://endpoint/bucket/key - Full URL with endpoint included
+    2. s3://bucket/key - S3 protocol URL that needs endpoint added
+    Raises ValueError if the path is not a string starting with one of those prefixes,
+    or if an s3:// path is given without an endpoint_url.
+    """
+    path = record['path']
+    if not isinstance(path, str):
+        # A blank CSV cell is read by pandas as NaN (a float), which has no startswith()
+        raise ValueError("S3 URLs must start with either 's3://' or 'https://'")
+    if path.startswith('s3://'):
+        endpoint_url = record['endpoint_url']
+        # NaN is truthy, so a bare `not endpoint_url` check would let a blank cell through
+        if not isinstance(endpoint_url, str) or not endpoint_url:
+            raise ValueError("endpoint_url is required for s3:// URLs")
+        # Remove s3:// prefix and combine with endpoint
+        bucket_and_key = path[len('s3://'):]
+        path = f"{endpoint_url.rstrip('/')}/{bucket_and_key}"
+    elif not path.startswith('https://'):
+        raise ValueError("S3 URLs must start with either 's3://' or 'https://'")
+    upload_manager.add_result_file(result_file, path, link_type='s3')
+def _upload_one_record(knex, record: Dict, overwrite: bool, upload_manager: GeoSeeqUploadManager) -> Dict:
+    """Resolve the target result file for one config row and queue it on the upload manager.
+    The row's `type` column (case-insensitive) selects how it is queued: 'local' or 's3'.
+    Returns the dict of resolved objects. Raises ValueError when no result file could be
+    resolved or when `type` is neither 'local' nor 's3'. `overwrite` is accepted but not
+    consulted here.
+    """
+    resolved = _get_result_file_from_record(knex, record)
+    target_file = resolved['result_file']
+    if not target_file:
+        raise ValueError(f"Could not find or create result_file from record: {record}")
+    # Map each supported type to the function that queues it
+    queue_by_type = {
+        'local': _add_record_to_upload_manager_local_file,
+        's3': _add_record_to_upload_manager_s3_file,
+    }
+    enqueue = queue_by_type.get(record['type'].lower())
+    if enqueue is None:
+        raise ValueError(f"Unknown file type: {record['type']}")
+    enqueue(record, target_file, upload_manager)
+    return resolved
+# Columns that must be present in the header of the config CSV (cells may still be blank)
+REQUIRED_COLUMNS = [
+    'organization', 'project', 'sample', 'folder',
+    'filename', 'path', 'type', 'endpoint_url'
+]
+@click.command('from-config')
+@use_common_state
+@click.option('--cores', default=1, help='Number of uploads to run in parallel')
+@click.option('--sep', default=',', help='Separator character for the CSV file')
+@overwrite_option
+@yes_option
+@click.argument('config_file', type=click.Path(exists=True))
+def cli_upload_from_config(state, cores, sep, overwrite, yes, config_file):
+    """Upload files to GeoSeeq based on a configuration CSV file.
+    \b
+    The CSV file must have the following columns:
+    - organization: Organization name, GRN, or UUID (optional if project/sample/folder specified by GRN/UUID)
+    - project: Project name, GRN, or UUID (optional if sample/folder specified by GRN/UUID)
+    - sample: Sample name, GRN, or UUID (optional, also optional if folder specified by GRN/UUID)
+    - folder: Folder name, GRN, or UUID
+    - filename: Name to give the file on GeoSeeq
+    - path: Path to local file or S3 URL
+    - type: Either "local" or "s3"
+    - endpoint_url: S3 endpoint URL (required for S3 files)
+    \b
+    When using GRNs or UUIDs, you can omit the parent object IDs. For example:
+    - If folder is a GRN/UUID, organization/project/sample can be blank
+    - If sample is a GRN/UUID, organization/project can be blank
+    - If project is a GRN/UUID, organization can be blank
+    \b
+    Example config.csv:
+    organization,project,sample,folder,filename,path,type,endpoint_url
+    MyOrg,MyProject,Sample1,reads,file1.fastq,/path/to/file1.fastq,local,
+    ,grn:project:uuid,Sample2,reads,file2.fastq,/path/to/file2.fastq,local,
+    ,,grn:sample:uuid,reads,file3.fastq,/path/to/file3.fastq,local,
+    ,,,grn:folder:uuid,file4.fastq,s3://bucket/file4.fastq,s3,https://s3.amazonaws.com
+    \b
+    Example with tab separator:
+    $ geoseeq upload advanced from-config --sep $'\\t' config.tsv
+    """
+    knex = state.get_knex()
+    # Read and validate config file
+    df = pd.read_csv(config_file, sep=sep)
+    # Report missing columns in REQUIRED_COLUMNS order so the message is deterministic
+    missing_cols = [col for col in REQUIRED_COLUMNS if col not in df.columns]
+    if missing_cols:
+        raise click.UsageError(f"Config file missing required columns: {', '.join(missing_cols)}")
+    # Create upload manager
+    upload_manager = GeoSeeqUploadManager(
+        n_parallel_uploads=cores,
+        progress_tracker_factory=PBarManager().get_new_bar,
+        log_level=state.log_level,
+        overwrite=overwrite,
+        use_cache=state.use_cache,
+    )
+    # Resolve every row and queue it. Nothing is uploaded until the user confirms below
+    for _, record in df.iterrows():
+        _upload_one_record(knex, record, overwrite, upload_manager)
+    # Show a preview of the queued uploads on stderr
+    click.echo(upload_manager.get_preview_string(), err=True)
+    if not yes:
+        click.confirm('Do you want to proceed with these uploads?', abort=True)
+    # Perform uploads
+    upload_manager.upload_files()

geoseeq/cli/upload/upload_reads.py CHANGED Viewed

@@ -2,7 +2,7 @@ import logging
 import click
 import requests
 from os.path import basename
+import pandas as pd
 from multiprocessing import Pool, current_process
 from geoseeq.cli.constants import *
@@ -67,8 +67,12 @@ def _get_regex(knex, filepaths, module_name, lib, regex):
     return regex
-def _group_files(knex, filepaths, module_name, regex, yes):
+def _group_files(knex, filepaths, module_name, regex, yes, name_map):
     """Group the files into samples, confirm, and return the groups."""
+    if name_map is not None:
+        name_map_filename, cur_col, new_col = name_map
+        name_map = pd.read_csv(name_map_filename)[[cur_col, new_col]]
+        name_map = name_map.set_index(cur_col).to_dict()
     seq_length, seq_type = module_name.split('::')[:2]
     groups = knex.post('bulk_upload/group_files', json={
         'filenames': list(filepaths.keys()),
@@ -76,7 +80,11 @@ def _group_files(knex, filepaths, module_name, regex, yes):
         'regex': regex
     })
     for group in groups:
-        click.echo(f'sample_name: {group["sample_name"]}', err=True)
+        sample_name = group["sample_name"]
+        if name_map:
+            sample_name = name_map.get(sample_name, sample_name)
+            group["sample_name"] = sample_name
+        click.echo(f'sample_name: {sample_name}', err=True)
         click.echo(f'  module_name: {module_name}', err=True)
         for field_name, filename in group['fields'].items():
             path = filepaths[filename]
@@ -173,10 +181,11 @@ def flatten_list_of_bams(filepaths):
 @private_option
 @link_option
 @no_new_versions_option
+@click.option('--name-map', default=None, nargs=3, help="A file to use for converting names. Takes three arguments: a file name, a column name for current names, and a column name for new names.")
 @module_option(FASTQ_MODULE_NAMES)
 @project_id_arg
 @click.argument('fastq_files', type=click.Path(exists=True), nargs=-1)
-def cli_upload_reads_wizard(state, cores, overwrite, yes, regex, private, link_type, no_new_versions, module_name, project_id, fastq_files):
+def cli_upload_reads_wizard(state, cores, overwrite, yes, regex, private, link_type, no_new_versions, name_map, module_name, project_id, fastq_files):
     """Upload fastq read files to GeoSeeq.
     This command automatically groups files by their sample name, lane number
@@ -229,7 +238,7 @@ def cli_upload_reads_wizard(state, cores, overwrite, yes, regex, private, link_t
     filepaths = {basename(line): line for line in flatten_list_of_fastxs(fastq_files)}
     click.echo(f'Found {len(filepaths)} files to upload.', err=True)
     regex = _get_regex(knex, filepaths, module_name, proj, regex)
-    groups = _group_files(knex, filepaths, module_name, regex, yes)
+    groups = _group_files(knex, filepaths, module_name, regex, yes, name_map)
     _do_upload(groups, module_name, link_type, proj, filepaths, overwrite, no_new_versions, cores, state)

geoseeq/constants.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from os import environ
 from os.path import join
+from typing import Literal
 FIVE_MB = 5 * (1024 ** 2)
 FASTQ_MODULE_NAMES = [
@@ -13,4 +14,6 @@ DEFAULT_ENDPOINT = "https://backend.geoseeq.com"
 CONFIG_FOLDER = environ.get("XDG_CONFIG_HOME", join(environ["HOME"], ".config"))
 CONFIG_DIR = environ.get("GEOSEEQ_CONFIG_DIR", join(CONFIG_FOLDER, "geoseeq"))
-PROFILES_PATH = join(CONFIG_DIR, "profiles.json")
+PROFILES_PATH = join(CONFIG_DIR, "profiles.json")
+OBJECT_TYPE_STR = Literal['org', 'project', 'sample', 'sample_result_folder', 'project_result_folder', 'sample_result_file', 'project_result_file']

geoseeq/dashboard/dashboard.py ADDED Viewed

@@ -0,0 +1,103 @@
+import logging
+import json
+from typing import Literal
+from geoseeq.remote_object import RemoteObject
+from geoseeq.id_constructors import result_file_from_blob
+from geoseeq import ProjectResultFile
+logger = logging.getLogger("geoseeq_api")
+class Dashboard(RemoteObject):
+    """A named dashboard attached to a project, holding a list of DashboardTile objects."""
+    parent_field = "project"
+    remote_fields = ["is_default"]
+    def __init__(self, knex, project, name="Default dashboard", is_default=False):
+        # NOTE(review): `self` is passed explicitly in addition to the bound instance;
+        # presumably RemoteObject.__init__ tolerates extra positional args - confirm.
+        super().__init__(self)
+        self.knex = knex
+        self.project = project
+        self._name = name
+        self.tiles = []
+        self.is_default = is_default
+    def _get(self, allow_overwrite=False):
+        """Fetch the project's dashboard list and load the entry stored under this dashboard's name.
+        Tiles from the response are appended to `self.tiles`; the list is not cleared first.
+        """
+        blob = self.knex.get(f"sample_groups/{self.project.uuid}/dashboard-list")
+        blob = blob["dashboard_data"][self.name]
+        for tile_blob in blob["tiles"]:
+            tile = DashboardTile.from_blob(self, tile_blob)
+            self.tiles.append(tile)
+        # Tiles are handled above, so they are removed before the remaining fields are loaded
+        blob.pop("tiles")
+        self.load_blob(blob, allow_overwrite=allow_overwrite)
+    def _save(self):
+        """Save this dashboard by saving its tiles."""
+        self.save_tiles()
+    def save_tiles(self):
+        """POST the post-data of every tile in `self.tiles` to this dashboard's tiles endpoint."""
+        post_data = {"tiles": [tile._get_post_data() for tile in self.tiles]}
+        blob = self.knex.post(f"sample_groups/{self.project.uuid}/dashboard/{self.name}/tiles", json=post_data, json_response=False)
+        # Logged rather than printed so library callers do not get stray stdout output
+        logger.debug("Saved dashboard tiles, server response: %s", blob)
+    def _create(self):
+        """POST this dashboard's name and is_default flag, then load the returned blob."""
+        post_data = {"name": self.name, "is_default": self.is_default}
+        blob = self.knex.post(f"sample_groups/{self.project.uuid}/dashboard", json=post_data)
+        self.load_blob(blob)
+    def tile(self, title, result_file, style: Literal["col-span-1", "col-span-2"]="col-span-1"):
+        """Create a tile for `result_file`, append it to this dashboard, and return it.
+        Calls `result_file.get()` first and marks the dashboard as modified.
+        """
+        result_file.get()
+        tile = DashboardTile(self.knex, self, title, result_file, style=style)
+        self.tiles.append(tile)
+        self._modified = True
+        return tile
+    def add_tile(self, tile):
+        """Append an existing tile to this dashboard and mark the dashboard as modified."""
+        self.tiles.append(tile)
+        self._modified = True
+    @property
+    def name(self):
+        """The dashboard name given at construction (read-only)."""
+        return self._name
+    def __str__(self):
+        return f"<Geoseeq Dashboard: {self.project.grn} \"{self.name}\"/>"
+    def __repr__(self):
+        return str(self)
+    @property
+    def grn(self):
+        """Identifier string built from the project UUID and the quoted dashboard name."""
+        return f"grn:dashboard:{self.project.uuid}:\"{self.name}\""
+    def pre_hash(self):
+        """Return the string "DASH" followed by the project UUID and the dashboard name."""
+        return "DASH" + self.project.uuid + self.name
+class DashboardTile:
+    """One tile on a Dashboard: a title and a style applied to a single result file."""
+    def __init__(self, knex, dashboard, title, result_file, style="col-span-1"):
+        self.knex = knex
+        self.dashboard = dashboard
+        self.title = title
+        self.style = style
+        self.result_file = result_file
+    def _get_post_data(self):
+        """Build the dict that describes this tile when the dashboard's tiles are POSTed."""
+        # Project-level result files are sent as "group"; everything else as "sample"
+        if isinstance(self.result_file, ProjectResultFile):
+            field_type = "group"
+        else:
+            field_type = "sample"
+        return {
+            "field_uuid": self.result_file.uuid,
+            "field_type": field_type,
+            "style": self.style,
+            "title": self.title,
+            "has_related_field": False,
+        }
+    @classmethod
+    def from_blob(cls, dashboard, blob):
+        """Build a tile for `dashboard` from a tile blob with "viz_field", "title" and "style" keys."""
+        # NOTE(review): only the blob is passed to result_file_from_blob here - confirm
+        # that the constructor does not also expect a knex argument.
+        viz_file = result_file_from_blob(blob["viz_field"])
+        tile_title = blob["title"]
+        tile_style = blob["style"]
+        return cls(dashboard.knex, dashboard, tile_title, viz_file, style=tile_style)
+    def __str__(self) -> str:
+        return f"<Geoseeq DashboardTile: {self.dashboard.grn} \"{self.title}\" />"
+    def __repr__(self) -> str:
+        return str(self)

geoseeq 0.6.15.dev1__py3-none-any.whl → 0.7.1__py3-none-any.whl

geoseeq 0.6.15.dev1__py3-none-any.whl → 0.7.1__py3-none-any.whl