cool-seq-tool 0.3.0.dev0__py3-none-any.whl → 0.4.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- """Module for UTA queries."""
1
+ """Provide transcript lookup and metadata tools via the UTA database."""
2
2
  import ast
3
3
  import base64
4
4
  import logging
@@ -14,24 +14,34 @@ from asyncpg.exceptions import InterfaceError, InvalidAuthorizationSpecification
14
14
  from botocore.exceptions import ClientError
15
15
  from pyliftover import LiftOver
16
16
 
17
- from cool_seq_tool.schemas import AnnotationLayer, Assembly
17
+ from cool_seq_tool.schemas import AnnotationLayer, Assembly, Strand
18
18
 
19
- # use `bound` to upper-bound UTADatabase or child classes
20
- UTADatabaseType = TypeVar("UTADatabaseType", bound="UTADatabase")
19
+ # use `bound` to upper-bound UtaDatabase or child classes
20
+ UTADatabaseType = TypeVar("UTADatabaseType", bound="UtaDatabase")
21
21
 
22
22
  # Environment variables for paths to chain files for pyliftover
23
23
  LIFTOVER_CHAIN_37_TO_38 = environ.get("LIFTOVER_CHAIN_37_TO_38")
24
24
  LIFTOVER_CHAIN_38_TO_37 = environ.get("LIFTOVER_CHAIN_38_TO_37")
25
25
 
26
26
  UTA_DB_URL = environ.get(
27
- "UTA_DB_URL", "postgresql://uta_admin:uta@localhost:5433/uta/uta_20210129"
27
+ "UTA_DB_URL", "postgresql://uta_admin:uta@localhost:5433/uta/uta_20210129b"
28
28
  )
29
29
 
30
30
  logger = logging.getLogger(__name__)
31
31
 
32
32
 
33
- class UTADatabase:
34
- """Class for connecting and querying UTA database."""
33
+ class UtaDatabase:
34
+ """Provide transcript lookup and metadata tools via the Universal Transcript Archive
35
+ (UTA) database.
36
+
37
+ Users should use the ``create()`` method to construct a new instance. Note that
38
+ almost all public methods are defined as ``async`` -- see the :ref:`Usage section <async_note>`
39
+ for more information.
40
+
41
+ >>> import asyncio
42
+ >>> from cool_seq_tool.sources.uta_database import UtaDatabase
43
+ >>> uta_db = asyncio.run(UtaDatabase.create())
44
+ """
35
45
 
36
46
  def __init__(
37
47
  self,
@@ -39,19 +49,19 @@ class UTADatabase:
39
49
  chain_file_37_to_38: Optional[str] = None,
40
50
  chain_file_38_to_37: Optional[str] = None,
41
51
  ) -> None:
42
- """Initialize DB class. Downstream libraries should use the create()
43
- method to construct a new instance: await UTADatabase.create()
52
+ """Initialize DB class. Should only be used by ``create()`` method, and not
53
+ be called directly by a user.
44
54
 
45
55
  :param db_url: PostgreSQL connection URL
46
- Format: `driver://user:password@host/database/schema`
56
+ Format: ``driver://user:password@host/database/schema``
47
57
  :param chain_file_37_to_38: Optional path to chain file for 37 to 38 assembly.
48
- This is used for pyliftover. If this is not provided, will check to see if
49
- LIFTOVER_CHAIN_37_TO_38 env var is set. If neither is provided, will allow
50
- pyliftover to download a chain file from UCSC
58
+ This is used for ``pyliftover``. If this is not provided, will check to see
59
+ if ``LIFTOVER_CHAIN_37_TO_38`` env var is set. If neither is provided, will
60
+ allow ``pyliftover`` to download a chain file from UCSC
51
61
  :param chain_file_38_to_37: Optional path to chain file for 38 to 37 assembly.
52
- This is used for pyliftover. If this is not provided, will check to see if
53
- LIFTOVER_CHAIN_38_TO_37 env var is set. If neither is provided, will allow
54
- pyliftover to download a chain file from UCSC
62
+ This is used for ``pyliftover``. If this is not provided, will check to see
63
+ if ``LIFTOVER_CHAIN_38_TO_37`` env var is set. If neither is provided, will
64
+ allow ``pyliftover`` to download a chain file from UCSC
55
65
  """
56
66
  self.schema = None
57
67
  self._connection_pool = None
@@ -137,10 +147,17 @@ class UTADatabase:
137
147
  async def create(
138
148
  cls: Type[UTADatabaseType], db_url: str = UTA_DB_URL
139
149
  ) -> UTADatabaseType:
140
- """Provide fully-initialized class instance (a la factory pattern)
141
- :param UTADatabaseType cls: supplied implicitly
142
- :param str db_url: PostgreSQL connection URL
143
- Format: `driver://user:password@host/database/schema`
150
+ """Manufacture a fully-initialized class instance (a la factory pattern). This
151
+ method should be used instead of calling the class directly to create a new
152
+ instance.
153
+
154
+ >>> import asyncio
155
+ >>> from cool_seq_tool.sources.uta_database import UtaDatabase
156
+ >>> uta_db = asyncio.run(UtaDatabase.create())
157
+
158
+ :param cls: supplied implicitly
159
+ :param db_url: PostgreSQL connection URL
160
+ Format: ``driver://user:password@host/database/schema``
144
161
  :return: UTA DB access class instance
145
162
  """
146
163
  self = cls(db_url)
@@ -151,7 +168,7 @@ class UTADatabase:
151
168
  async def execute_query(self, query: str) -> Any: # noqa: ANN401
152
169
  """Execute a query and return its result.
153
170
 
154
- :param str query: Query to make on database
171
+ :param query: Query to make on database
155
172
  :return: Query's result
156
173
  """
157
174
 
@@ -185,7 +202,7 @@ class UTADatabase:
185
202
  genomic_table_exists = genomic_table_exists[0].get("exists")
186
203
  if genomic_table_exists is None:
187
204
  logger.critical(
188
- "SELECT EXISTS query in UTADatabase._create_genomic_table "
205
+ "SELECT EXISTS query in UtaDatabase._create_genomic_table "
189
206
  "returned invalid response"
190
207
  )
191
208
  raise ValueError("SELECT EXISTS query returned invalid response")
@@ -212,8 +229,8 @@ class UTADatabase:
212
229
  await self.execute_query(create_genomic_table)
213
230
 
214
231
  indexes = [
215
- f"""CREATE INDEX alt_pos_index ON {self.schema}.genomic (alt_ac, alt_start_i, alt_end_i);""", # noqa: E501
216
- f"""CREATE INDEX gene_alt_index ON {self.schema}.genomic (hgnc, alt_ac);""", # noqa: E501
232
+ f"""CREATE INDEX alt_pos_index ON {self.schema}.genomic (alt_ac, alt_start_i, alt_end_i);""",
233
+ f"""CREATE INDEX gene_alt_index ON {self.schema}.genomic (hgnc, alt_ac);""",
217
234
  f"""CREATE INDEX alt_ac_index ON {self.schema}.genomic (alt_ac);""",
218
235
  ]
219
236
  for create_index in indexes:
@@ -223,7 +240,7 @@ class UTADatabase:
223
240
  def _transform_list(li: List) -> List[List[Any]]:
224
241
  """Transform list to only contain field values
225
242
 
226
- :param List li: List of asyncpg.Record objects
243
+ :param li: List of asyncpg.Record objects
227
244
  :return: List of list of objects
228
245
  """
229
246
  results = list()
@@ -231,30 +248,33 @@ class UTADatabase:
231
248
  results.append([field for field in item])
232
249
  return results
233
250
 
234
- async def chr_to_gene_and_accessions(
251
+ async def get_genes_and_alt_acs(
235
252
  self,
236
- chromosome: int,
237
253
  pos: int,
238
- strand: Optional[int] = None,
254
+ strand: Optional[Strand] = None,
255
+ chromosome: Optional[int] = None,
239
256
  alt_ac: Optional[str] = None,
240
257
  gene: Optional[str] = None,
241
258
  ) -> Tuple[Optional[Dict], Optional[str]]:
242
- """Return genes and genomic accessions related to a position on a chr.
243
-
244
- :param int chromosome: Chromosome number
245
- :param int pos: Genomic position
246
- :param Optional[int] strand: Strand. Must be either `-1` or `1`
247
- :param Optional[str] alt_ac: Genomic accession
248
- :param Optional[str] gene: Gene symbol
249
- :return: Dictionary containing genes and genomic accessions and
250
- warnings if found
259
+ """Return genes and genomic accessions for a position on a chromosome or alt_ac
260
+
261
+ :param pos: Genomic position
262
+ :param strand: Strand
263
+ :param chromosome: Chromosome. Must give chromosome without a prefix
264
+ (i.e. ``1`` or ``X``). If not provided, must provide ``alt_ac``.
265
+ If ``alt_ac`` is also provided, ``alt_ac`` will be used.
266
+ :param alt_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
267
+ must provide ``chromosome``. If ``chromosome`` is also provided, ``alt_ac``
268
+ will be used.
269
+ :param gene: Gene symbol
270
+ :return: Dictionary containing genes and genomic accessions and warnings if found
251
271
  """
252
272
  alt_ac_cond = (
253
273
  f"WHERE alt_ac = '{alt_ac}'"
254
274
  if alt_ac
255
275
  else f"WHERE alt_ac ~ '^NC_[0-9]+0{chromosome}.[0-9]+$'"
256
276
  )
257
- strand_cond = f"AND alt_strand = '{strand}'" if strand else ""
277
+ strand_cond = f"AND alt_strand = '{strand.value}'" if strand else ""
258
278
  gene_cond = f"AND hgnc = '{gene}'" if gene else ""
259
279
 
260
280
  query = f"""
@@ -275,7 +295,10 @@ class UTADatabase:
275
295
  f" is mapped between an exon's start and end coordinates"
276
296
  )
277
297
  if strand:
278
- msg += f" on the " f"{'positive' if strand == 1 else 'negative'} strand"
298
+ msg += (
299
+ f" on the "
300
+ f"{'positive' if strand == Strand.POSITIVE else 'negative'} strand"
301
+ )
279
302
  if gene:
280
303
  msg += f" and on gene {gene}"
281
304
  return None, msg
@@ -293,12 +316,12 @@ class UTADatabase:
293
316
  ) -> Tuple[Optional[List[Tuple[int, int]]], Optional[str]]:
294
317
  """Get list of transcript exons start/end coordinates.
295
318
 
296
- :param str tx_ac: Transcript accession
297
- :param Optional[str] alt_ac: Genomic accession
319
+ :param tx_ac: Transcript accession
320
+ :param alt_ac: Genomic accession
298
321
  :return: List of a transcript's exon start/end coordinates, and warnings if found
299
322
  """
300
323
  if alt_ac:
301
- # We know what asesmbly we're looking for since we have the
324
+ # We know what assembly we're looking for since we have the
302
325
  # genomic accession
303
326
  query = f"""
304
327
  SELECT DISTINCT tx_start_i, tx_end_i
@@ -329,126 +352,17 @@ class UTADatabase:
329
352
  tx_exons = [(r["tx_start_i"], r["tx_end_i"]) for r in result]
330
353
  return tx_exons, None
331
354
 
332
- @staticmethod
333
- def _validate_exon(
334
- transcript: str, tx_exons: List[Tuple[int, int]], exon_number: int
335
- ) -> Tuple[Optional[Tuple[int, int]], Optional[str]]:
336
- """Validate that exon number is valid
337
-
338
- :param str transcript: Transcript accession
339
- :param List tx_exons: List of transcript's exons
340
- :param Optional[int] exon_number: Exon number to validate
341
- :return: Transcript coordinates and warnings if found
342
- """
343
- msg = f"Exon {exon_number} does not exist on {transcript}"
344
- try:
345
- if exon_number < 1:
346
- return None, msg
347
- exon = tx_exons[exon_number - 1]
348
- except IndexError:
349
- return None, msg
350
- return exon, None
351
-
352
- def get_tx_exon_coords(
353
- self,
354
- transcript: str,
355
- tx_exons: List[Tuple[int, int]],
356
- exon_start: Optional[int] = None,
357
- exon_end: Optional[int] = None,
358
- ) -> Tuple[
359
- Optional[Tuple[Optional[Tuple[int, int]], Optional[Tuple[int, int]]]],
360
- Optional[str],
361
- ]:
362
- """Get transcript exon coordinates
363
-
364
- :param transcript: Transcript accession
365
- :param tx_exons: List of transcript exons
366
- :param exon_start: Start exon number
367
- :param exon_end: End exon number
368
- :return: [Transcript start exon coords, Transcript end exon coords],
369
- and warnings if found
370
- """
371
- if exon_start is not None:
372
- tx_exon_start, warning = self._validate_exon(
373
- transcript, tx_exons, exon_start
374
- )
375
- if not tx_exon_start:
376
- return None, warning
377
- else:
378
- tx_exon_start = None
379
-
380
- if exon_end is not None:
381
- tx_exon_end, warning = self._validate_exon(transcript, tx_exons, exon_end)
382
- if not tx_exon_end:
383
- return None, warning
384
- else:
385
- tx_exon_end = None
386
- return (tx_exon_start, tx_exon_end), None
387
-
388
- async def get_alt_ac_start_and_end(
389
- self,
390
- tx_ac: str,
391
- tx_exon_start: Optional[List[str]] = None,
392
- tx_exon_end: Optional[List[str]] = None,
393
- gene: Optional[str] = None,
394
- ) -> Tuple[Optional[Tuple[Tuple, Tuple]], Optional[str]]:
395
- """Get genomic coordinates for related transcript exon start and end.
396
-
397
- :param str tx_ac: Transcript accession
398
- :param Optional[List[str]] tx_exon_start: Transcript's exon start
399
- coordinates
400
- :param Optional[List[str]] tx_exon_end: Transcript's exon end
401
- coordinates
402
- :param str gene: Gene symbol
403
- :return: Alt ac start and end data, and warnings if found
404
- """
405
- if tx_exon_start:
406
- alt_ac_start, warning = await self.get_alt_ac_start_or_end(
407
- tx_ac, int(tx_exon_start[0]), int(tx_exon_start[1]), gene=gene
408
- )
409
- if not alt_ac_start:
410
- return None, warning
411
- else:
412
- alt_ac_start = None
413
-
414
- if tx_exon_end:
415
- alt_ac_end, warning = await self.get_alt_ac_start_or_end(
416
- tx_ac, int(tx_exon_end[0]), int(tx_exon_end[1]), gene=gene
417
- )
418
- if not alt_ac_end:
419
- return None, warning
420
- else:
421
- alt_ac_end = None
422
-
423
- if alt_ac_start is None and alt_ac_end is None:
424
- msg = "Unable to find `alt_ac_start` or `alt_ac_end`"
425
- logger.warning(msg)
426
- return None, msg
427
-
428
- # validate
429
- if alt_ac_start and alt_ac_end:
430
- for i in (0, 1, 4):
431
- if alt_ac_start[i] != alt_ac_end[i]:
432
- if i == 0:
433
- error = "Gene symbol does not match"
434
- elif i == 1:
435
- error = "Chromosome does not match"
436
- else:
437
- error = "Strand does not match"
438
- logger.warning(f"{error}: " f"{alt_ac_start[i]} != {alt_ac_end[i]}")
439
- return (alt_ac_start, alt_ac_end), None
440
-
441
355
  async def get_alt_ac_start_or_end(
442
356
  self, tx_ac: str, tx_exon_start: int, tx_exon_end: int, gene: Optional[str]
443
357
  ) -> Tuple[Optional[Tuple[str, str, int, int, int]], Optional[str]]:
444
358
  """Get genomic data for related transcript exon start or end.
445
359
 
446
- :param str tx_ac: Transcript accession
447
- :param int tx_exon_start: Transcript's exon start coordinate
448
- :param int tx_exon_end: Transcript's exon end coordinate
449
- :param Optional[str] gene: Gene symbol
360
+ :param tx_ac: Transcript accession
361
+ :param tx_exon_start: Transcript's exon start coordinate
362
+ :param tx_exon_end: Transcript's exon end coordinate
363
+ :param gene: HGNC gene symbol
450
364
  :return: [hgnc symbol, genomic accession for chromosome,
451
- start exon's end coordinate, end exon's start coordinate, strand],
365
+ aligned genomic start coordinate, aligned genomic end coordinate, strand],
452
366
  and warnings if found
453
367
  """
454
368
  if gene:
@@ -487,7 +401,7 @@ class UTADatabase:
487
401
  async def get_cds_start_end(self, tx_ac: str) -> Optional[Tuple[int, int]]:
488
402
  """Get coding start and end site
489
403
 
490
- :param str tx_ac: Transcript accession
404
+ :param tx_ac: Transcript accession
491
405
  :return: [Coding start site, Coding end site]
492
406
  """
493
407
  if tx_ac.startswith("ENS"):
@@ -511,7 +425,7 @@ class UTADatabase:
511
425
  async def get_newest_assembly_ac(self, ac: str) -> List[str]:
512
426
  """Find accession associated to latest genomic assembly
513
427
 
514
- :param str ac: Accession
428
+ :param ac: Accession
515
429
  :return: List of accessions associated to latest genomic assembly. Order by
516
430
  desc
517
431
  """
@@ -540,8 +454,8 @@ class UTADatabase:
540
454
  async def validate_genomic_ac(self, ac: str) -> bool:
541
455
  """Return whether or not genomic accession exists.
542
456
 
543
- :param str ac: Genomic accession
544
- :return: `True` if genomic accession exists. `False` otherwise.
457
+ :param ac: Genomic accession
458
+ :return: ``True`` if genomic accession exists. ``False`` otherwise.
545
459
  """
546
460
  query = f"""
547
461
  SELECT EXISTS(
@@ -554,10 +468,19 @@ class UTADatabase:
554
468
  return result[0][0]
555
469
 
556
470
  async def get_ac_descr(self, ac: str) -> Optional[str]:
557
- """Return accession description.
558
- Typically description exists if not GRCh38 assembly.
559
-
560
- :param str ac: Accession
471
+ """Return accession description. This is typically available only for accessions
472
+ from older (pre-GRCh38) builds.
473
+
474
+ >>> import asyncio
475
+ >>> from cool_seq_tool.sources.uta_database import UtaDatabase
476
+ >>> async def describe():
477
+ ... uta_db = await UtaDatabase.create()
478
+ ... result = await uta_db.get_ac_descr("NC_000001.10")
479
+ ... return result
480
+ >>> asyncio.run(describe())
481
+ 'Homo sapiens chromosome 1, GRCh37.p13 Primary Assembly'
482
+
483
+ :param ac: chromosome accession, e.g. ``"NC_000001.10"``
561
484
  :return: Description containing assembly and chromosome
562
485
  """
563
486
  query = f"""
@@ -580,23 +503,23 @@ class UTADatabase:
580
503
  tx_ac: str,
581
504
  start_pos: int,
582
505
  end_pos: int,
583
- alt_ac: str = None,
506
+ alt_ac: Optional[str] = None,
584
507
  use_tx_pos: bool = True,
585
508
  like_tx_ac: bool = False,
586
509
  ) -> List:
587
510
  """Return queried data from tx_exon_aln_v table.
588
511
 
589
- :param str tx_ac: accession on c. coordinate
590
- :param int start_pos: Start position change
591
- :param int end_pos: End position change
592
- :param str alt_ac: accession on g. coordinate
593
- :param bool use_tx_pos: `True` if querying on transcript position. This means
594
- `start_pos` and `end_pos` are on the c. coordinate
595
- `False` if querying on genomic position. This means `start_pos` and
596
- `end_pos` are on the g. coordinate
597
- :param bool like_tx_ac: `True` if tx_ac condition should be a like statement.
512
+ :param tx_ac: accession on c. coordinate
513
+ :param start_pos: Start position change
514
+ :param end_pos: End position change
515
+ :param alt_ac: accession on g. coordinate
516
+ :param use_tx_pos: ``True`` if querying on transcript position. This means
517
+ ``start_pos`` and ``end_pos`` are on the c. coordinate
518
+ ``False`` if querying on genomic position. This means ``start_pos`` and
519
+ ``end_pos`` are on the g. coordinate
520
+ :param like_tx_ac: ``True`` if tx_ac condition should be a like statement.
598
521
  This is used when you want to query an accession regardless of its version
599
- `False` if tx_condition will be exact match
522
+ ``False`` if tx_condition will be exact match
600
523
  :return: List of tx_exon_aln_v data
601
524
  """
602
525
  if end_pos is None:
@@ -659,16 +582,13 @@ class UTADatabase:
659
582
  def data_from_result(result: List) -> Optional[Dict]:
660
583
  """Return data found from result.
661
584
 
662
- :param List result: Data from tx_exon_aln_v table
585
+ :param result: Data from tx_exon_aln_v table
663
586
  :return: Gene, strand, and position ranges for tx and alt_ac
664
587
  """
665
588
  gene = result[0]
666
- if result[7] == -1:
667
- strand = "-"
668
- else:
669
- strand = "+"
670
589
  tx_pos_range = result[2], result[3]
671
590
  alt_pos_range = result[5], result[6]
591
+ strand = Strand(result[7])
672
592
  alt_aln_method = result[8]
673
593
  tx_exon_id = result[9]
674
594
  alt_exon_id = result[10]
@@ -694,13 +614,30 @@ class UTADatabase:
694
614
  async def get_mane_c_genomic_data(
695
615
  self, ac: str, alt_ac: Optional[str], start_pos: int, end_pos: int
696
616
  ) -> Optional[Dict]:
697
- """Get MANE Transcript and genomic data.
698
-
699
- Used when going from g -> MANE c
700
- :param str ac: MANE Transcript accession
701
- :param str alt_ac: NC Accession
702
- :param int start_pos: Genomic start position change
703
- :param int end_pos: Genomic end position change
617
+ """Get MANE transcript and genomic data. Used when going from g. to MANE c.
618
+ representation.
619
+
620
+ >>> import asyncio
621
+ >>> from cool_seq_tool.sources import UtaDatabase
622
+ >>> async def get_braf_mane():
623
+ ... uta_db = await UtaDatabase.create()
624
+ ... result = await uta_db.get_mane_c_genomic_data(
625
+ ... "NM_004333.6",
626
+ ... None,
627
+ ... 140753335,
628
+ ... 140753335,
629
+ ... )
630
+ ... return result
631
+ >>> braf = asyncio.run(get_braf_mane())
632
+ >>> braf["alt_ac"]
633
+ 'NC_000007.14'
634
+
635
+ :param ac: MANE transcript accession
636
+ :param alt_ac: NC accession. Used to triangulate on correct genomic data. Can
637
+ be set to ``None`` if unavailable.
638
+ :param start_pos: Genomic start position
639
+ :param end_pos: Genomic end position
640
+ :return: MANE transcript results if successful
704
641
  """
705
642
  results = await self.get_tx_exon_aln_v_data(
706
643
  ac, start_pos, end_pos, alt_ac=alt_ac, use_tx_pos=False
@@ -723,9 +660,7 @@ class UTADatabase:
723
660
  data["coding_start_site"] = coding_start_site[0]
724
661
  data["coding_end_site"] = coding_start_site[1]
725
662
 
726
- if data["strand"] == "-":
727
- end_pos += 1
728
- start_pos += 1
663
+ if data["strand"] == Strand.NEGATIVE:
729
664
  data["alt_pos_change_range"] = (end_pos, start_pos)
730
665
  data["alt_pos_change"] = (
731
666
  data["alt_pos_range"][1] - data["alt_pos_change_range"][0],
@@ -752,13 +687,12 @@ class UTADatabase:
752
687
  ) -> Optional[Dict]:
753
688
  """Get transcript mapping to genomic data.
754
689
 
755
- :param str tx_ac: Accession on c. coordinate
756
- :param Tuple pos: (start pos, end pos)
757
- :param Union[AnnotationLayer.CDNA, AnnotationLayer.GENOMIC] annotation_layer:
758
- Annotation layer for `ac` and `pos`
759
- :param Optional[str] alt_ac: Accession on g. coordinate
760
- :param Assembly target_genome_assembly: Genome assembly to get genomic data for.
761
- If `alt_ac` is provided, it will return the associated assembly.
690
+ :param tx_ac: Accession on c. coordinate
691
+ :param pos: (start pos, end pos)
692
+ :param annotation_layer: Annotation layer for ``tx_ac`` and ``pos``
693
+ :param alt_ac: Accession on g. coordinate
694
+ :param target_genome_assembly: Genome assembly to get genomic data for.
695
+ If ``alt_ac`` is provided, it will return the associated assembly.
762
696
  :return: Gene, Transcript accession and position change,
763
697
  Altered transcript accession and position change, Strand
764
698
  """
@@ -789,7 +723,7 @@ class UTADatabase:
789
723
  )
790
724
 
791
725
  if annotation_layer == AnnotationLayer.CDNA:
792
- if data["strand"] == "-":
726
+ if data["strand"] == Strand.NEGATIVE:
793
727
  data["alt_pos_change_range"] = (
794
728
  data["alt_pos_range"][1] - data["pos_change"][0],
795
729
  data["alt_pos_range"][0] + data["pos_change"][1],
@@ -800,8 +734,8 @@ class UTADatabase:
800
734
  data["alt_pos_range"][1] - data["pos_change"][1],
801
735
  )
802
736
  else:
803
- if data["strand"] == "-":
804
- data["alt_pos_change_range"] = (pos[1] + 1, pos[0] + 1)
737
+ if data["strand"] == Strand.NEGATIVE:
738
+ data["alt_pos_change_range"] = (pos[1], pos[0])
805
739
  else:
806
740
  data["alt_pos_change_range"] = pos
807
741
 
@@ -810,7 +744,7 @@ class UTADatabase:
810
744
  async def get_ac_from_gene(self, gene: str) -> List[str]:
811
745
  """Return genomic accession(s) associated to a gene.
812
746
 
813
- :param str gene: Gene symbol
747
+ :param gene: Gene symbol
814
748
  :return: List of genomic accessions, sorted in desc order
815
749
  """
816
750
  query = f"""
@@ -832,11 +766,20 @@ class UTADatabase:
832
766
  async def get_gene_from_ac(
833
767
  self, ac: str, start_pos: int, end_pos: int
834
768
  ) -> Optional[List[str]]:
835
- """Get transcripts from NC accession and positions.
836
-
837
- :param str ac: NC Accession
838
- :param int start_pos: Start position change
839
- :param int end_pos: End position change
769
+ """Get gene(s) within the provided coordinate range
770
+
771
+ >>> import asyncio
772
+ >>> from cool_seq_tool.sources import UtaDatabase
773
+ >>> async def get_gene():
774
+ ... uta_db = await UtaDatabase.create()
775
+ ... result = await uta_db.get_gene_from_ac("NC_000017.11", 43044296, 43045802)
776
+ ... return result
777
+ >>> asyncio.run(get_gene())
778
+ ['BRCA1']
779
+
780
+ :param ac: NC accession, e.g. ``"NC_000001.11"``
781
+ :param start_pos: Start position change
782
+ :param end_pos: End position change
840
783
  :return: List of HGNC gene symbols
841
784
  """
842
785
  if end_pos is None:
@@ -871,20 +814,20 @@ class UTADatabase:
871
814
  use_tx_pos: bool = True,
872
815
  alt_ac: Optional[str] = None,
873
816
  ) -> pl.DataFrame:
874
- """Get transcripts for a given `gene` or `alt_ac` related to optional positions.
817
+ """Get transcripts for a given ``gene`` or ``alt_ac`` related to optional positions.
875
818
 
876
819
  :param start_pos: Start position change
877
- If not provided and `end_pos` not provided, all transcripts associated with
820
+ If not provided and ``end_pos`` not provided, all transcripts associated with
878
821
  the gene and/or accession will be returned
879
822
  :param end_pos: End position change
880
- If not provided and `start_pos` not provided, all transcripts associated
823
+ If not provided and ``start_pos`` not provided, all transcripts associated
881
824
  with the gene and/or accession will be returned
882
825
  :param gene: HGNC gene symbol
883
- :param use_tx_pos: `True` if querying on transcript position. This means
884
- `start_pos` and `end_pos` are c. coordinate positions. `False` if querying
885
- on genomic position. This means `start_pos` and `end_pos` are g. coordinate
826
+ :param use_tx_pos: ``True`` if querying on transcript position. This means
827
+ ``start_pos`` and ``end_pos`` are c. coordinate positions. ``False`` if querying
828
+ on genomic position. This means ``start_pos`` and ``end_pos`` are g. coordinate
886
829
  positions
887
- :param alt_ac: Genomic accession. If not provided, must provide `gene`
830
+ :param alt_ac: Genomic accession. If not provided, must provide ``gene``
888
831
  :return: Data Frame containing transcripts associated with a gene.
889
832
  Transcripts are ordered by most recent NC accession, then by
890
833
  descending transcript length
@@ -938,12 +881,15 @@ class UTADatabase:
938
881
  results = [
939
882
  (r["pro_ac"], r["tx_ac"], r["alt_ac"], r["cds_start_i"]) for r in results
940
883
  ]
941
- return pl.DataFrame(results, schema=schema).unique()
884
+ results_df = pl.DataFrame(results, schema=schema)
885
+ if results:
886
+ results_df = results_df.unique()
887
+ return results_df
942
888
 
943
889
  async def get_chr_assembly(self, ac: str) -> Optional[Tuple[str, str]]:
944
890
  """Get chromosome and assembly for NC accession if not in GRCh38.
945
891
 
946
- :param str ac: NC accession
892
+ :param ac: NC accession
947
893
  :return: Chromosome and Assembly accession is on
948
894
  """
949
895
  descr = await self.get_ac_descr(ac)
@@ -966,8 +912,8 @@ class UTADatabase:
966
912
  async def liftover_to_38(self, genomic_tx_data: Dict) -> None:
967
913
  """Liftover genomic_tx_data to hg38 assembly.
968
914
 
969
- :param Dict genomic_tx_data: Dictionary containing gene, nc_accession,
970
- alt_pos, and strand
915
+ :param genomic_tx_data: Dictionary containing gene, nc_accession, alt_pos, and
916
+ strand
971
917
  """
972
918
  descr = await self.get_chr_assembly(genomic_tx_data["alt_ac"])
973
919
  if descr is None:
@@ -1022,9 +968,9 @@ class UTADatabase:
1022
968
  ) -> Optional[Tuple]:
1023
969
  """Get new genome assembly data for a position on a chromosome.
1024
970
 
1025
- :param str chromosome: The chromosome number. Must be prefixed with `chr`
1026
- :param int pos: Position on the chromosome
1027
- :param Assembly liftover_to_assembly: Assembly to liftover to
971
+ :param chromosome: The chromosome number. Must be prefixed with ``chr``
972
+ :param pos: Position on the chromosome
973
+ :param liftover_to_assembly: Assembly to liftover to
1028
974
  :return: [Target chromosome, target position, target strand,
1029
975
  conversion_chain_score] for assembly
1030
976
  """
@@ -1055,11 +1001,11 @@ class UTADatabase:
1055
1001
  ) -> None:
1056
1002
  """Update genomic_tx_data to have coordinates for given assembly.
1057
1003
 
1058
- :param Dict genomic_tx_data: Dictionary containing gene, nc_accession,
1059
- alt_pos, and strand
1060
- :param str key: Key to access coordinate positions
1061
- :param str chromosome: Chromosome, must be prefixed with `chr`
1062
- :param Assembly liftover_to_assembly: Assembly to liftover to
1004
+ :param genomic_tx_data: Dictionary containing gene, nc_accession, alt_pos, and
1005
+ strand
1006
+ :param key: Key to access coordinate positions
1007
+ :param chromosome: Chromosome, must be prefixed with ``chr``
1008
+ :param liftover_to_assembly: Assembly to liftover to
1063
1009
  """
1064
1010
  liftover_start_i = self.get_liftover(
1065
1011
  chromosome, genomic_tx_data[key][0], liftover_to_assembly
@@ -1084,11 +1030,12 @@ class UTADatabase:
1084
1030
  genomic_tx_data[key] = liftover_start_i[1], liftover_end_i[1]
1085
1031
 
1086
1032
  async def p_to_c_ac(self, p_ac: str) -> List[str]:
1087
- """Return c. accession from p. accession.
1033
+ """Return cDNA reference sequence accession from protein reference sequence
1034
+ accession (i.e. ``p.`` to ``c.`` in HGVS syntax)
1088
1035
 
1089
- :param str p_ac: Protein accession
1090
- :return: List of rows containing c. accessions that are associated
1091
- with the given p. accession. In ascending order.
1036
+ :param p_ac: Protein accession
1037
+ :return: List of rows containing c. accessions that are associated with the
1038
+ given p. accession. In ascending order.
1092
1039
  """
1093
1040
  # Ensembl accessions do not have versions
1094
1041
  if p_ac.startswith("EN"):
@@ -1115,8 +1062,8 @@ class UTADatabase:
1115
1062
  ) -> List[str]:
1116
1063
  """Get transcripts associated to a genomic ac and position.
1117
1064
 
1118
- :param str alt_ac: Genomic accession
1119
- :param int g_pos: Genomic position
1065
+ :param alt_ac: Genomic accession
1066
+ :param g_pos: Genomic position
1120
1067
  :return: RefSeq transcripts on c. coordinate
1121
1068
  """
1122
1069
  query = f"""
@@ -1133,7 +1080,7 @@ class UTADatabase:
1133
1080
 
1134
1081
  @staticmethod
1135
1082
  def get_secret() -> str:
1136
- """Get secrets for UTA DB instances."""
1083
+ """Get secrets for UTA DB instances. Used for deployment on AWS."""
1137
1084
  secret_name = environ["UTA_DB_SECRET"]
1138
1085
  region_name = "us-east-2"
1139
1086