geoseeq 0.6.15.dev1__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
geoseeq/cli/download.py CHANGED
@@ -414,7 +414,7 @@ def _make_read_configs(download_results, config_dir="."):
414
414
  "reads_1": ["small.fq.gz"],
415
415
  "reads_2": [],
416
416
  "fastq_checksum": "",
417
- "data_type": "short-read",
417
+ "data_type": "single",
418
418
  "bdx_result_dir": "results",
419
419
  "geoseeq_uuid": "05bf22e9-9d25-42db-af25-31bc538a7006"
420
420
  }
@@ -428,7 +428,7 @@ def _make_read_configs(download_results, config_dir="."):
428
428
  "reads_1": [],
429
429
  "reads_2": [],
430
430
  "fastq_checksum": "",
431
- "data_type": "short-read",
431
+ "data_type": read_type,
432
432
  "bdx_result_dir": "results",
433
433
  "geoseeq_uuid": sample.uuid,
434
434
  }
geoseeq/cli/main.py CHANGED
@@ -55,7 +55,7 @@ def version():
55
55
  Use of this tool implies acceptance of the GeoSeeq End User License Agreement.
56
56
  Run `geoseeq eula show` to view the EULA.
57
57
  """
58
- click.echo('0.6.15dev1') # remember to update pyproject.toml
58
+ click.echo('0.7.1') # remember to update pyproject.toml
59
59
 
60
60
 
61
61
  @main.group('advanced')
@@ -8,7 +8,7 @@ from .upload import (
8
8
  cli_upload_smart_tree,
9
9
  )
10
10
  from .upload_reads import cli_upload_reads_wizard
11
- from .upload_advanced import cli_find_urls_for_reads
11
+ from .upload_advanced import cli_find_urls_for_reads, cli_upload_from_config
12
12
 
13
13
  @click.group('upload')
14
14
  def cli_upload():
@@ -22,9 +22,10 @@ cli_upload.add_command(cli_metadata)
22
22
  cli_upload.add_command(cli_upload_smart_table)
23
23
  cli_upload.add_command(cli_upload_smart_tree)
24
24
 
25
- @click.group('upload')
25
+ @cli_upload.group('advanced')
26
26
  def cli_upload_advanced():
27
27
  """Advanced tools to upload files to GeoSeeq."""
28
28
  pass
29
29
 
30
- cli_upload_advanced.add_command(cli_find_urls_for_reads)
30
+ cli_upload_advanced.add_command(cli_find_urls_for_reads)
31
+ cli_upload_advanced.add_command(cli_upload_from_config)
@@ -24,6 +24,16 @@ from geoseeq.cli.shared_params import (
24
24
 
25
25
  from geoseeq.constants import FASTQ_MODULE_NAMES
26
26
  from geoseeq.cli.progress_bar import PBarManager
27
+ import pandas as pd
28
+ from typing import Dict, Optional
29
+ from geoseeq.id_constructors.from_ids import (
30
+ org_from_id,
31
+ project_from_id,
32
+ sample_from_id,
33
+ result_folder_from_id,
34
+ result_file_from_id,
35
+ )
36
+ from geoseeq.upload_download_manager import GeoSeeqUploadManager
27
37
 
28
38
  logger = logging.getLogger('geoseeq_api')
29
39
 
@@ -90,3 +100,223 @@ def cli_find_urls_for_reads(state, cores, overwrite, yes, regex, private, module
90
100
  groups = _group_files(knex, filepaths, module_name, regex, yes)
91
101
  for file_name, target_url in _find_target_urls(groups, module_name, proj, filepaths, overwrite, cores, state):
92
102
  print(f'{file_name}\t{target_url}', file=state.outfile)
103
+
104
+
105
def _get_result_file_from_record_with_ids(knex, record: Dict) -> Dict:
    """Resolve a record's objects from identifiers that need no parent lookup.

    Identifiers are tried from most to least specific: the record's
    'filename', then 'folder', then 'sample' (only when not NA), then
    'project'. The first lookup that does not raise ValueError also fills in
    every parent reachable from the object it produced, and the search stops
    there. If none of them resolves, only the organization is looked up,
    from the record's 'organization' field.

    Returns a dict with 'org', 'project', 'sample', 'folder', and
    'result_file' keys. Entries that were not resolved are left as None;
    the 'org' entry is always assigned before returning.
    """
    found = dict.fromkeys(('org', 'project', 'sample', 'folder', 'result_file'))

    def climb_from_project():
        found['org'] = found['project'].org

    def climb_from_sample():
        found['project'] = found['sample'].project
        climb_from_project()

    def climb_from_folder():
        # A folder exposing a `sample` attribute hangs off a sample;
        # otherwise it is attached to the project directly.
        if hasattr(found['folder'], 'sample'):
            found['sample'] = found['folder'].sample
            climb_from_sample()
        else:
            found['project'] = found['folder'].project
            climb_from_project()

    # File given directly: everything above it can be derived.
    try:
        found['result_file'] = result_file_from_id(knex, record['filename'])
        found['folder'] = found['result_file'].folder
        climb_from_folder()
    except ValueError:
        pass  # not directly resolvable, fall through to the folder
    else:
        return found

    # Folder given directly.
    try:
        found['folder'] = result_folder_from_id(knex, record['folder'])
        climb_from_folder()
    except ValueError:
        pass  # not directly resolvable, fall through to the sample
    else:
        return found

    # Sample given directly (the column may be blank).
    if pd.notna(record['sample']):
        try:
            found['sample'] = sample_from_id(knex, record['sample'])
            climb_from_sample()
        except ValueError:
            pass  # not directly resolvable, fall through to the project
        else:
            return found

    # Project given directly.
    try:
        found['project'] = project_from_id(knex, record['project'])
        climb_from_project()
    except ValueError:
        pass  # not directly resolvable, fall through to the organization
    else:
        return found

    # Last resort: look the organization up on its own.
    if found['org'] is None:
        found['org'] = org_from_id(knex, record['organization'])

    return found
171
+
172
+
173
def _get_result_file_from_record(knex, record: Dict) -> Dict:
    """Resolve every object a config record refers to, down to the result file.

    Objects that `_get_result_file_from_record_with_ids` could resolve
    directly are reused; the remaining ones are obtained by name from their
    parent (project from org, sample from project, folder from sample or
    project, file from folder). `idem()` is then called on the result file.

    Args:
        knex: API client handle passed through to the id constructors.
        record: Mapping with at least 'project', 'sample', 'folder' and
            'filename' entries ('sample' may be NA).

    Returns:
        A dict with 'org', 'project', 'sample', 'folder', and 'result_file'
        keys. 'sample' stays None when the record names no sample.
    """
    objects = _get_result_file_from_record_with_ids(knex, record)

    if objects['project'] is None:
        objects['project'] = objects['org'].project(record['project'])

    if objects['sample'] is None and pd.notna(record['sample']):
        objects['sample'] = objects['project'].sample(record['sample'])

    if objects['folder'] is None:
        # Choose the parent from the resolved state rather than from the
        # branch taken above: a sample that was already resolved by GRN/UUID
        # must still be used as the folder's parent.
        if objects['sample'] is not None:
            parent = objects['sample']
        else:
            parent = objects['project']
        objects['folder'] = parent.result_folder(record['folder'])

    if objects['result_file'] is None:
        objects['result_file'] = objects['folder'].result_file(record['filename'])

    objects['result_file'].idem()
    # Diagnostic output goes to the module logger instead of stdout.
    logger.debug('Resolved upload targets: %s', objects)
    return objects
200
+
201
+
202
def _add_record_to_upload_manager_local_file(record: Dict, result_file, upload_manager: GeoSeeqUploadManager) -> None:
    """Queue the file at the record's 'path' for upload to `result_file`."""
    local_path = record['path']
    upload_manager.add_result_file(result_file, local_path, link_type='upload')
205
+
206
+
207
def _add_record_to_upload_manager_s3_file(record: Dict, result_file, upload_manager: "GeoSeeqUploadManager") -> None:
    """Add an S3 file link to the upload manager.

    Handles two types of S3 URLs:
    1. https://endpoint/bucket/key - Full URL with endpoint included
    2. s3://bucket/key - S3 protocol URL that needs endpoint added

    Args:
        record: Mapping with 'path' and 'endpoint_url' entries.
        result_file: Target result file handed to the upload manager.
        upload_manager: Object whose `add_result_file` queues the link.

    Raises:
        ValueError: If the path is not an s3:// or https:// string, or if an
            s3:// path is given without an endpoint URL.
    """
    path = record['path']
    if not isinstance(path, str):
        # A blank path cell is not a string; report it like any other
        # malformed URL.
        raise ValueError("S3 URLs must start with either 's3://' or 'https://'")

    if path.startswith('s3://'):
        endpoint_url = record['endpoint_url']
        # pandas reads a blank CSV cell as NaN, which is truthy, so a plain
        # `not endpoint_url` check would let it through.
        if pd.isna(endpoint_url) or not str(endpoint_url).strip():
            raise ValueError("endpoint_url is required for s3:// URLs")

        # Drop the scheme and graft bucket/key onto the endpoint.
        bucket_and_key = path[len('s3://'):]
        path = f"{str(endpoint_url).strip().rstrip('/')}/{bucket_and_key}"
    elif not path.startswith('https://'):
        raise ValueError("S3 URLs must start with either 's3://' or 'https://'")

    upload_manager.add_result_file(result_file, path, link_type='s3')
228
+
229
+
230
def _upload_one_record(knex, record: Dict, overwrite: bool, upload_manager: GeoSeeqUploadManager) -> Dict:
    """Resolve one config record and queue it on the upload manager.

    The record's 'type' field (case-insensitive) selects how the file is
    queued: 'local' for a file on disk, 's3' for an S3 link. `overwrite` is
    accepted but not used by this function.

    Returns the dict of resolved objects for the record.
    Raises ValueError if no result file was resolved or the type is unknown.
    """
    objects = _get_result_file_from_record(knex, record)
    result_file = objects['result_file']
    if not result_file:
        raise ValueError(f"Could not find or create result_file from record: {record}")

    # Pick the queueing routine by file type.
    adders = {
        'local': _add_record_to_upload_manager_local_file,
        's3': _add_record_to_upload_manager_s3_file,
    }
    add_to_manager = adders.get(record['type'].lower())
    if add_to_manager is None:
        raise ValueError(f"Unknown file type: {record['type']}")
    add_to_manager(record, result_file, upload_manager)

    return objects
245
+
246
+
247
# Column headers the upload config file must contain.
REQUIRED_COLUMNS = [
    'organization',
    'project',
    'sample',
    'folder',
    'filename',
    'path',
    'type',
    'endpoint_url',
]
251
+
252
+
253
@click.command('from-config')
@use_common_state
@click.option('--cores', default=1, help='Number of uploads to run in parallel')
@click.option('--sep', default=',', help='Separator character for the CSV file')
@overwrite_option
@yes_option
@click.argument('config_file', type=click.Path(exists=True))
def cli_upload_from_config(state, cores, sep, overwrite, yes, config_file):
    """Upload files to GeoSeeq based on a configuration CSV file.

    \b
    The CSV file must have the following columns:
    - organization: Organization name, GRN, or UUID (optional if project/sample/folder specified by GRN/UUID)
    - project: Project name, GRN, or UUID (optional if sample/folder specified by GRN/UUID)
    - sample: Sample name, GRN, or UUID (optional, also optional if folder specified by GRN/UUID)
    - folder: Folder name, GRN, or UUID
    - filename: Name to give the file on GeoSeeq
    - path: Path to local file or S3 URL
    - type: Either "local" or "s3"
    - endpoint_url: S3 endpoint URL (required for S3 files)

    \b
    When using GRNs or UUIDs, you can omit the parent object IDs. For example:
    - If folder is a GRN/UUID, organization/project/sample can be blank
    - If sample is a GRN/UUID, organization/project can be blank
    - If project is a GRN/UUID, organization can be blank

    \b
    Example config.csv:
    organization,project,sample,folder,filename,path,type,endpoint_url
    MyOrg,MyProject,Sample1,reads,file1.fastq,/path/to/file1.fastq,local,
    ,grn:project:uuid,Sample2,reads,file2.fastq,/path/to/file2.fastq,local,
    ,,grn:sample:uuid,reads,file3.fastq,/path/to/file3.fastq,local,
    ,,,grn:folder:uuid,file4.fastq,s3://bucket/file4.fastq,s3,https://s3.amazonaws.com

    \b
    Example with tab separator:
    $ geoseeq upload advanced from-config --sep $'\\t' config.tsv
    """
    # NOTE: the docstring above is the command's help text. `\b` is the
    # literal marker click uses to keep a paragraph unwrapped, while the tab
    # example is written `\\t` so the help shows the two characters `\t`
    # rather than an actual tab.
    knex = state.get_knex()

    # Read the config and fail early if any required column is absent.
    df = pd.read_csv(config_file, sep=sep)
    missing_cols = set(REQUIRED_COLUMNS) - set(df.columns)
    if missing_cols:
        raise click.UsageError(f"Config file missing required columns: {missing_cols}")

    upload_manager = GeoSeeqUploadManager(
        n_parallel_uploads=cores,
        progress_tracker_factory=PBarManager().get_new_bar,
        log_level=state.log_level,
        overwrite=overwrite,
        use_cache=state.use_cache,
    )

    # Resolve each row and queue it; nothing is transferred yet.
    for _, record in df.iterrows():
        _upload_one_record(knex, record, overwrite, upload_manager)

    # Show what is queued and let the user back out before transferring.
    click.echo(upload_manager.get_preview_string(), err=True)

    if not yes:
        click.confirm('Do you want to proceed with these uploads?', abort=True)

    upload_manager.upload_files()
@@ -2,7 +2,7 @@ import logging
2
2
  import click
3
3
  import requests
4
4
  from os.path import basename
5
-
5
+ import pandas as pd
6
6
  from multiprocessing import Pool, current_process
7
7
 
8
8
  from geoseeq.cli.constants import *
@@ -67,8 +67,12 @@ def _get_regex(knex, filepaths, module_name, lib, regex):
67
67
  return regex
68
68
 
69
69
 
70
- def _group_files(knex, filepaths, module_name, regex, yes):
70
+ def _group_files(knex, filepaths, module_name, regex, yes, name_map):
71
71
  """Group the files into samples, confirm, and return the groups."""
72
+ if name_map is not None:
73
+ name_map_filename, cur_col, new_col = name_map
74
+ name_map = pd.read_csv(name_map_filename)[[cur_col, new_col]]
75
+ name_map = name_map.set_index(cur_col).to_dict()
72
76
  seq_length, seq_type = module_name.split('::')[:2]
73
77
  groups = knex.post('bulk_upload/group_files', json={
74
78
  'filenames': list(filepaths.keys()),
@@ -76,7 +80,11 @@ def _group_files(knex, filepaths, module_name, regex, yes):
76
80
  'regex': regex
77
81
  })
78
82
  for group in groups:
79
- click.echo(f'sample_name: {group["sample_name"]}', err=True)
83
+ sample_name = group["sample_name"]
84
+ if name_map:
85
+ sample_name = name_map.get(sample_name, sample_name)
86
+ group["sample_name"] = sample_name
87
+ click.echo(f'sample_name: {sample_name}', err=True)
80
88
  click.echo(f' module_name: {module_name}', err=True)
81
89
  for field_name, filename in group['fields'].items():
82
90
  path = filepaths[filename]
@@ -173,10 +181,11 @@ def flatten_list_of_bams(filepaths):
173
181
  @private_option
174
182
  @link_option
175
183
  @no_new_versions_option
184
+ @click.option('--name-map', default=None, nargs=3, help="A file to use for converting names. Takes three arguments: a file name, a column name for current names, and a column name for new names.")
176
185
  @module_option(FASTQ_MODULE_NAMES)
177
186
  @project_id_arg
178
187
  @click.argument('fastq_files', type=click.Path(exists=True), nargs=-1)
179
- def cli_upload_reads_wizard(state, cores, overwrite, yes, regex, private, link_type, no_new_versions, module_name, project_id, fastq_files):
188
+ def cli_upload_reads_wizard(state, cores, overwrite, yes, regex, private, link_type, no_new_versions, name_map, module_name, project_id, fastq_files):
180
189
  """Upload fastq read files to GeoSeeq.
181
190
 
182
191
  This command automatically groups files by their sample name, lane number
@@ -229,7 +238,7 @@ def cli_upload_reads_wizard(state, cores, overwrite, yes, regex, private, link_t
229
238
  filepaths = {basename(line): line for line in flatten_list_of_fastxs(fastq_files)}
230
239
  click.echo(f'Found {len(filepaths)} files to upload.', err=True)
231
240
  regex = _get_regex(knex, filepaths, module_name, proj, regex)
232
- groups = _group_files(knex, filepaths, module_name, regex, yes)
241
+ groups = _group_files(knex, filepaths, module_name, regex, yes, name_map)
233
242
  _do_upload(groups, module_name, link_type, proj, filepaths, overwrite, no_new_versions, cores, state)
234
243
 
235
244
 
geoseeq/constants.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from os import environ
2
2
  from os.path import join
3
+ from typing import Literal
3
4
 
4
5
  FIVE_MB = 5 * (1024 ** 2)
5
6
  FASTQ_MODULE_NAMES = [
@@ -13,4 +14,6 @@ DEFAULT_ENDPOINT = "https://backend.geoseeq.com"
13
14
 
14
15
  CONFIG_FOLDER = environ.get("XDG_CONFIG_HOME", join(environ["HOME"], ".config"))
15
16
  CONFIG_DIR = environ.get("GEOSEEQ_CONFIG_DIR", join(CONFIG_FOLDER, "geoseeq"))
16
- PROFILES_PATH = join(CONFIG_DIR, "profiles.json")
17
+ PROFILES_PATH = join(CONFIG_DIR, "profiles.json")
18
+
19
# Type alias restricting a string to one of the listed object-type tags.
OBJECT_TYPE_STR = Literal[
    'org',
    'project',
    'sample',
    'sample_result_folder',
    'project_result_folder',
    'sample_result_file',
    'project_result_file',
]
@@ -0,0 +1,103 @@
1
+
2
+ import logging
3
+ import json
4
+ from typing import Literal
5
+ from geoseeq.remote_object import RemoteObject
6
+ from geoseeq.id_constructors import result_file_from_blob
7
+ from geoseeq import ProjectResultFile
8
+
9
+ logger = logging.getLogger("geoseeq_api")
10
+
11
+
12
class Dashboard(RemoteObject):
    """A named dashboard of tiles belonging to a project.

    Tiles are kept locally in `self.tiles`; `save_tiles` posts them to the
    server in one request.
    """
    parent_field = "project"
    remote_fields = ["is_default"]

    def __init__(self, knex, project, name="Default dashboard", is_default=False):
        # NOTE(review): `self` is passed explicitly on top of the bound
        # instance - confirm RemoteObject.__init__ expects this.
        super().__init__(self)
        self.knex = knex
        self.project = project
        self._name = name
        self.tiles = []
        self.is_default = is_default

    def _get(self, allow_overwrite=False):
        """Load this dashboard's tiles and remote fields from the project's dashboard list."""
        blob = self.knex.get(f"sample_groups/{self.project.uuid}/dashboard-list")
        blob = blob["dashboard_data"][self.name]
        # NOTE(review): tiles are appended to whatever is already in
        # self.tiles, so a repeated fetch would accumulate entries - confirm
        # whether that is intended.
        for tile_blob in blob["tiles"]:
            tile = DashboardTile.from_blob(self, tile_blob)
            self.tiles.append(tile)
        # Tiles are handled above; only the remaining fields go to load_blob.
        blob.pop("tiles")
        self.load_blob(blob, allow_overwrite=allow_overwrite)

    def _save(self):
        """Persist the dashboard by saving its tiles."""
        self.save_tiles()

    def save_tiles(self):
        """Post the current list of tiles to the server."""
        post_data = {"tiles": [tile._get_post_data() for tile in self.tiles]}
        blob = self.knex.post(f"sample_groups/{self.project.uuid}/dashboard/{self.name}/tiles", json=post_data, json_response=False)
        # Library code should not write to stdout; keep the response
        # available through the module logger instead.
        logger.debug("Saved dashboard tiles, server response: %s", blob)

    def _create(self):
        """Create the dashboard on the server and load the returned blob."""
        post_data = {"name": self.name, "is_default": self.is_default}
        blob = self.knex.post(f"sample_groups/{self.project.uuid}/dashboard", json=post_data)
        self.load_blob(blob)

    def tile(self, title, result_file, style: Literal["col-span-1", "col-span-2"]="col-span-1"):
        """Build a tile for `result_file`, add it to this dashboard, and return it.

        `result_file.get()` is called first, then `_modified` is set.
        """
        result_file.get()
        tile = DashboardTile(self.knex, self, title, result_file, style=style)
        self.tiles.append(tile)
        self._modified = True
        return tile

    def add_tile(self, tile):
        """Append an existing tile to this dashboard and set `_modified`."""
        self.tiles.append(tile)
        self._modified = True

    @property
    def name(self):
        """The dashboard's name (read-only)."""
        return self._name

    def __str__(self):
        return f"<Geoseeq Dashboard: {self.project.grn} \"{self.name}\"/>"

    def __repr__(self):
        return str(self)

    @property
    def grn(self):
        """Identifier string built from the project UUID and the quoted dashboard name."""
        return f"grn:dashboard:{self.project.uuid}:\"{self.name}\""

    def pre_hash(self):
        """Return the string combining a 'DASH' prefix, the project UUID, and the name."""
        return "DASH" + self.project.uuid + self.name
73
+
74
+
75
class DashboardTile:
    """One tile on a dashboard: a titled, styled reference to a result file."""

    def __init__(self, knex, dashboard, title, result_file, style="col-span-1"):
        self.knex = knex
        self.dashboard = dashboard
        self.title = title
        self.style = style
        self.result_file = result_file

    def _get_post_data(self):
        """Return the dict describing this tile in a tiles POST request."""
        tile_file = self.result_file
        payload = {"field_uuid": tile_file.uuid}
        # Project-level files are tagged "group"; everything else "sample".
        payload["field_type"] = "group" if isinstance(tile_file, ProjectResultFile) else "sample"
        payload.update(style=self.style, title=self.title, has_related_field=False)
        return payload

    @classmethod
    def from_blob(cls, dashboard, blob):
        """Build a tile for `dashboard` from a tile blob with 'viz_field', 'title' and 'style' keys."""
        # NOTE(review): called with the blob only - confirm that
        # result_file_from_blob does not also require a knex argument.
        viz_file = result_file_from_blob(blob["viz_field"])
        return cls(dashboard.knex, dashboard, blob["title"], viz_file, style=blob["style"])

    def __str__(self) -> str:
        return f"<Geoseeq DashboardTile: {self.dashboard.grn} \"{self.title}\" />"

    def __repr__(self) -> str:
        return str(self)