pydna 5.5.3__py3-none-any.whl → 5.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pydna/alphabet.py ADDED
@@ -0,0 +1,995 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ dscode - The nucleic acid alphabet used in pydna
6
+
7
+ This file serves to define dscode, the DNA alphabet used in pydna.
8
+ Each symbol represents a basepair (two opposing bases in the two antiparallel
9
+ DNA strands).
10
+
11
+ The alphabet is defined at the end of this docstring, which serves as the single
12
+ source of truth. The alphabet is used to construct the codestrings dictionary
13
+ which has the following keys (strings) in the order indicated:
14
+
15
+ 1. un_ambiguous_ds_dna
16
+ 2. ds_rna
17
+ 3. ambiguous_ds_dna
18
+ 4. single_stranded_dna_rna
19
+ 5. loops_dna_rna
20
+ 6. mismatched_dna_rna
21
+ 7. gap
22
+
23
+ Each value of the codestrings dictionary is a multiline string. This string
24
+ has five lines following this form:
25
+
26
+ ::
27
+
28
+ W 1 Watson symbol
29
+ | 2 Pipe
30
+ C 3 Crick symbol
31
+ <empty line> 4
32
+ S 5 dscode symbol
33
+
34
+ W (line 1) and C (line 3) are complementary bases in a double stranded DNA
35
+ molecule and S (line 5) are the symbols of the alphabet used to
36
+ describe the base pair above the symbol.
37
+
38
+ Line 2 must contain only the pipe character, indicating basepairing and
39
+ line 4 must be empty. The lines must be of equal length and a series of
40
+ tests are performed to ensure the integrity of the alphabet.
41
+
42
+ The string definition as well as the keys for the codestrings dict follow this
43
+ line and is contained in the last 13 lines of the docstring:
44
+
45
+ un_ambiguous_ds_dna
46
+ | ds_rna
47
+ | | ambiguous_ds_dna
48
+ | | | single_stranded_dna_rna
49
+ | | | | loops_dna_rna
50
+ | | | | | mismatched_dna_rna
51
+ | | | | | | gap
52
+ | | | | | | |
53
+ GATC UA RYMKSWHBVDN GATC••••U• -----AGCTU AAACCCGGGTTTUUUGCT •
54
+ |||| || ||||||||||| |||||||||| |||||||||| |||||||||||||||||| |
55
+ CTAG AU YRKMSWDVBHN ••••CTAG•U AGCTU----- ACGACTAGTCGTGCTUUU •
56
+
57
+ GATC UO RYMKSWHBVDN PEXIQFZJ$% 0123456789 !#{}&*()<>@:?[]=_; •
58
+
59
+ """
60
+ import re
61
+ from dataclasses import dataclass
62
+
63
+ __all__ = [
64
+ # Core alphabet dictionaries
65
+ "basepair_dict",
66
+ "annealing_dict",
67
+ "annealing_dict_w_holes",
68
+ "complement_dict_for_dscode",
69
+ # Translation tables (str.translate, bytes.translate)
70
+ "complement_table_for_dscode",
71
+ "dscode_to_watson_table",
72
+ "dscode_to_crick_table",
73
+ "dscode_to_watson_tail_table",
74
+ "dscode_to_crick_tail_table",
75
+ "dscode_to_full_sequence_table",
76
+ # Alphabet subsets
77
+ "ds_letters",
78
+ "ss_letters_watson",
79
+ "ss_letters_crick",
80
+ # Regex helpers and factories
81
+ "iupac_compl_regex",
82
+ "regex_ss_melt_factory",
83
+ "regex_ds_melt_factory",
84
+ # Data structures
85
+ "DseqParts",
86
+ # Public helper functions
87
+ "get_parts",
88
+ "dsbreaks",
89
+ "representation_tuple",
90
+ "anneal_strands",
91
+ ]
92
+
93
+
94
# Named constant for the blank (ASCII 32) that marks an unoccupied strand
# position.
emptyspace = chr(32)

# ============================================================================
# Read the alphabet definition back from the module docstring
# ============================================================================

# Only the final 13 lines of the docstring carry the definition.
lines = __doc__.rstrip().splitlines()[-13:]

assert not lines[-2]  # line 4 has to be empty
assert set(lines[-4]) == {" ", "|"}  # line 2 has to have pipes only.

# Tail layout: Watson row, pipe row, Crick row, blank row, dscode row.
uppers, pipes, lowers = lines[-5:-2]
dscode = lines[-1]

# Every row must break into the same number of whitespace-separated groups
# (one group per category).
assert (
    len(uppers.split())
    == len(lowers.split())
    == len(pipes.split())
    == len(dscode.split())
)

# The leading lines hold one category name each, decorated with pipes and
# blanks that are stripped away here.
names = [ln.strip("| ") for ln in lines[: len(dscode.split())]]

# ============================================================================
# Build the codestrings dict: category name -> five line code string
# ============================================================================

codestrings = {}

for upper, pipe, lower, code, name in zip(
    uppers.split(), pipes.split(), lowers.split(), dscode.split(), names
):
    # Watson / pipes / Crick / empty / symbols, each terminated by a newline.
    codestring = "\n".join((upper, pipe, lower, "", code)) + "\n"
    # The bullet is only a visible stand-in for a blank in the docstring.
    codestrings[name.strip()] = codestring.replace("•", emptyspace)
134
+
135
+
136
# ============================================================================
# Define ascii letters not used in the alphabet
# ============================================================================

letters_not_in_dscode = "lL\"',-./\\^`|+~"


# ============================================================================
# Consistency checks, run once at import time for every code string.
# Each code string must consist of exactly five lines:
#   Watson symbols / pipes / Crick symbols / empty line / dscode symbols
# ============================================================================

for name, codestring in codestrings.items():

    lines = codestring.splitlines()

    assert len(lines) == 5, f'codestring["{name}"] does not have 5 lines'

    watsn, pipes, crick, empty, symbl = lines

    # The fourth line is a separator and has to be empty.
    assert not empty, f'codestring["{name}"] has a non-empty line 4'

    # Check so that all letters are ascii symbols.
    assert all(
        ln.isascii() for ln in (watsn, crick, symbl)
    ), f'codestring["{name}"] has non-ascii letters'

    # Verify that every char that has an uppercase form is uppercase.
    # Comparing against upper() also covers rows that mix letters with
    # blanks, digits or punctuation, which a whole-line isalpha() filter
    # would silently skip.
    assert all(
        ln == ln.upper() for ln in (watsn, crick, symbl)
    ), f'codestring["{name}"] has non-uppercase letters'

    # check so that pipes contain only "|"
    assert set(pipes) == {
        "|"
    }, f'codestring["{name}"] has non-pipe character(s) in line 2'

    # check so strings are the same length
    assert all(
        len(ln) == len(watsn) for ln in (watsn, pipes, crick, symbl)
    ), f'codestring["{name}"] has lines of unequal length'

    # Symbols listed in letters_not_in_dscode must not be part of the
    # alphabet.
    assert not any(
        letter in letters_not_in_dscode for letter in symbl
    ), f'codestring["{name}"] has chars outside alphabet'
183
+
184
+
185
# ============================================================================
# `codes` is a dict of dicts. The outer keys are the code string names; each
# inner dict maps a base pair tuple to its symbol:
#
#   (Watson letter, Crick letter): dscode symbol
# ============================================================================

codes = {}

for name, codestring in codestrings.items():

    lines = codestring.splitlines()

    # Lines 2 (pipes) and 4 (empty) carry no information.
    watsons, _, cricks, _, symbols = lines

    # Pair up the columns: (watson, crick) -> symbol.
    codes[name] = dict(zip(zip(watsons, cricks), symbols))

# ============================================================================
# `basepair_dict` merges a subset of the `codes` dicts. The
# "mismatched_dna_rna" and "loops_dna_rna" categories are not merged in.
# ============================================================================

basepair_dict = {
    **codes["un_ambiguous_ds_dna"],
    **codes["ambiguous_ds_dna"],
    **codes["ds_rna"],
    **codes["single_stranded_dna_rna"],
    **codes["gap"],
}
222
+
223
+
224
# ============================================================================
# `annealing_dict` maps a pair of single stranded dscode symbols to the
# dscode symbol of the base pair they form when annealed.
# ============================================================================

# Keys are (s1, s2) tuples of symbols taken from the values of
# codes["single_stranded_dna_rna"]; the value is the dscode symbol of the
# resulting base pair. Every pairing is stored under both key orders.
#
# Example: ('P', 'Q'): 'G' describes the first of the four base pairs formed
# between fragments a and b below.
#
#    gggPEXI        (dscode for a)
#
#    gggGATC        (a)
#    ccc
#
#           aaa     (b)
#       CTAGttt
#
#       QFZJaaa     (dscode for b)
#
#    gggGATCaaa     (annealing product between a and b)
#    cccCTAGttt

annealing_dict = {}

# Fully paired reference base pairs (DNA and RNA).
temp = {**codes["un_ambiguous_ds_dna"], **codes["ds_rna"]}

# Short alias for readability inside the loop.
d = codes["single_stranded_dna_rna"]

# Every entry has exactly one occupied position; the first iteration sees
# (x, y), symbol == ("G", " "), "P".
for (x, y), symbol in d.items():
    if y == emptyspace:
        # Watson-only base: look up the first reference partner for x and
        # the single stranded symbol of that partner on the Crick strand.
        other = next(b for a, b in temp if a == x)
        symbol_other = d[emptyspace, other]
        annealing_dict[symbol, symbol_other] = annealing_dict[
            symbol_other, symbol
        ] = temp[x, other]
    elif x == emptyspace:
        # Crick-only base: same lookup from the other side.
        other = next(a for a, b in temp if b == y)
        symbol_other = d[other, emptyspace]
        annealing_dict[symbol, symbol_other] = annealing_dict[
            symbol_other, symbol
        ] = temp[other, y]
    else:
        raise ValueError("This should not happen")

del d, temp

# ============================================================================
# `annealing_dict_w_holes` holds everything in `annealing_dict` plus keys
# where one of the two positions is empty; those map to the lone symbol.
# ============================================================================

temp = {}

for x, y in annealing_dict:
    temp[x, emptyspace] = x
    temp[emptyspace, y] = y

annealing_dict_w_holes = {**annealing_dict, **temp}

del temp
301
+
302
+
303
+ # ============================================================================
304
+ # translation tables
305
+ # ============================================================================
306
+
307
+ # A collection of translation tables are a practical way to obtain Watson and Crick
308
+ # from dscode or the reverse complement strands when needed.
309
+
310
+ # These are meant to be used by the str.translate or bytes.translate methods.
311
+
312
+
313
# ============================================================================
# `complement_table_for_dscode` translates a sequence in dscode format into
# its complement (use with bytes.translate / str.translate).
# ============================================================================

# The complement of a symbol is the symbol of the same base pair with the
# Watson and Crick letters swapped.
complement_dict_for_dscode = {
    code: basepair_dict[lower_base, upper_base]
    for (upper_base, lower_base), code in basepair_dict.items()
}

from_letters = "".join(complement_dict_for_dscode)
to_letters = "".join(complement_dict_for_dscode.values())

# Cover lower case input as well.
from_letters = from_letters + from_letters.lower()
to_letters = to_letters + to_letters.lower()

complement_table_for_dscode = bytes.maketrans(
    bytes(from_letters, "ascii"), bytes(to_letters, "ascii")
)
331
+
332
+
333
# ============================================================================
# dscode_to_watson_table and dscode_to_crick_table
# ============================================================================

# dscode_to_watson_table and dscode_to_crick_table are used to obtain the
# Watson and (reverse) Crick strands from dscode.

# Two placeholder letters (placeholder1, placeholder2) are added to the
# tables. representation_tuple inserts them, or the `interval` character,
# into shortened sequences so that range indicators ("..") end up in the
# Watson or Crick string of the representation.

dscode_sense = ""
dscode_compl = ""
watson = ""
crick = ""
dscode_sense_lower = ""
dscode_compl_lower = ""
watson_lower = ""
crick_lower = ""

# NOTE: the loop variable rebinds the module-level name `dscode`.
for (w, c), dscode in basepair_dict.items():
    dscode_sense += dscode
    dscode_compl += basepair_dict[c, w]
    watson += w
    crick += c
    dscode_lower = dscode.lower()
    # Symbols without a distinct lower case form (digits, punctuation,
    # blank) are already present; adding them again would only duplicate
    # table entries.
    if dscode_lower in dscode_sense:
        continue
    dscode_sense_lower += dscode_lower
    watson_lower += w.lower()
    crick_lower += c.lower()
    # Append only the complement of the current symbol so that this string
    # stays parallel to dscode_sense_lower.
    dscode_compl_lower += basepair_dict[c, w].lower()

placeholder1 = "~"
placeholder2 = "+"
interval = "."

# The helper characters must never collide with alphabet symbols.
assert placeholder1 in letters_not_in_dscode
assert placeholder2 in letters_not_in_dscode
assert interval in letters_not_in_dscode

# placeholder1 -> blank in Watson, interval in Crick;
# placeholder2 -> interval in Watson, blank in Crick.
dscode_to_watson_table = bytes.maketrans(
    (dscode_sense + dscode_sense_lower + placeholder1 + placeholder2).encode("ascii"),
    (watson + watson_lower + emptyspace + interval).encode("ascii"),
)

dscode_to_crick_table = bytes.maketrans(
    (dscode_sense + dscode_sense_lower + placeholder1 + placeholder2).encode("ascii"),
    (crick + crick_lower + interval + emptyspace).encode("ascii"),
)
389
+
390
+
391
# ============================================================================
# dscode_to_watson_tail_table
# Maps a base letter to the symbol for that base alone in the Watson strand.
# ============================================================================

watson_tail_letter_dict = {
    base: code
    for (base, opposite), code in codes["single_stranded_dna_rna"].items()
    if opposite.isspace()
}

from_letters = "".join(watson_tail_letter_dict)
to_letters = "".join(watson_tail_letter_dict.values())

# Cover lower case as well.
from_letters = from_letters + from_letters.lower()
to_letters = to_letters + to_letters.lower()

dscode_to_watson_tail_table = bytes.maketrans(
    bytes(from_letters, "ascii"), bytes(to_letters, "ascii")
)

# Symbols for Watson-only positions; also the first half of the reverse
# mapping (single stranded symbol -> base letter) assembled below.
five_prime_ss_letters = to_letters
from_letters_full = five_prime_ss_letters
to_letters_full = from_letters

# ============================================================================
# dscode_to_crick_tail_table
# Maps the complement of a Crick-only base to the symbol for that base alone
# in the Crick strand.
# ============================================================================

crick_tail_letter_dict = {
    complement_dict_for_dscode[base]: code
    for (opposite, base), code in codes["single_stranded_dna_rna"].items()
    if opposite.isspace()
}

from_letters = "".join(crick_tail_letter_dict)
to_letters = "".join(crick_tail_letter_dict.values())

from_letters = from_letters + from_letters.lower()
to_letters = to_letters + to_letters.lower()

dscode_to_crick_tail_table = bytes.maketrans(
    bytes(from_letters, "ascii"), bytes(to_letters, "ascii")
)

# Symbols for Crick-only positions, and the second half of the reverse
# mapping.
three_prime_ss_letters = to_letters
from_letters_full = from_letters_full + to_letters
to_letters_full = to_letters_full + from_letters


# ============================================================================
# dscode_to_full_sequence_table
# Replaces every single stranded symbol by the corresponding letter from the
# two tail tables above.
# ============================================================================

dscode_to_full_sequence_table = bytes.maketrans(
    bytes(from_letters_full, "ascii"), bytes(to_letters_full, "ascii")
)
446
+
447
+
448
# Collect lower and mixed case variants of every base pair. When the two
# letters differ in case, the symbol takes the case of the Watson letter.
mixed_case_dict = {}

for (x, y), symbol in basepair_dict.items():
    mixed_case_dict.update(
        {
            (x.lower(), y.lower()): symbol.lower(),
            (x.lower(), y.upper()): symbol.lower(),
            (x.upper(), y.lower()): symbol.upper(),
        }
    )

    # A blank has no case, so when one position is empty the symbol takes
    # the case of the occupied position. These entries overwrite the
    # generic ones written just above.
    if x == emptyspace:
        mixed_case_dict.update(
            {(x, y.lower()): symbol.lower(), (x, y.upper()): symbol.upper()}
        )
    if y == emptyspace:
        mixed_case_dict.update(
            {(x.lower(), y): symbol.lower(), (x.upper(), y): symbol.upper()}
        )

# Add the case variants to the dict
basepair_dict.update(mixed_case_dict)

# Same expansion for the annealing pairs (no empty positions occur here).
# mixed_case_dict keeps these annealing variants after this point.
mixed_case_dict = {}

for (x, y), symbol in annealing_dict.items():
    mixed_case_dict.update(
        {
            (x.lower(), y.lower()): symbol.lower(),
            (x.lower(), y.upper()): symbol.lower(),
            (x.upper(), y.lower()): symbol.upper(),
        }
    )

# Add the case variants to the dict
annealing_dict.update(mixed_case_dict)
476
+
477
# All symbols that stand for a complete base pair (DNA, RNA, ambiguous).
ds_letters = "".join(
    "".join(codes[key].values())
    for key in ("un_ambiguous_ds_dna", "ds_rna", "ambiguous_ds_dna")
)

# Symbols with a base in the Watson strand only (Crick position empty).
ss_letters_watson = "".join(
    code
    for (_, lower_base), code in codes["single_stranded_dna_rna"].items()
    if lower_base == emptyspace
)

# Symbols with a base in the Crick strand only (Watson position empty).
ss_letters_crick = "".join(
    code
    for (upper_base, _), code in codes["single_stranded_dna_rna"].items()
    if upper_base == emptyspace
)

# Append the lower case forms of each subset.
ds_letters = ds_letters + ds_letters.lower()
ss_letters_watson = ss_letters_watson + ss_letters_watson.lower()
ss_letters_crick = ss_letters_crick + ss_letters_crick.lower()
493
+
494
+
495
# ============================================================================
# iupac_compl_regex maps each IUPAC nucleotide code to a regex fragment
# (non-capturing alternation) matching the codes that can pair with it.
# It is used in the amplify module.
#
# For an ambiguity code the alternatives are the complements of the bases it
# stands for, plus the complementary ambiguity code itself, e.g.
# K (G or T) -> C, A or M (A or C).
# ============================================================================

iupac_compl_regex = {
    "A": "(?:T|U)",
    "C": "(?:G)",
    "G": "(?:C)",
    "T": "(?:A)",
    "U": "(?:A)",
    "R": "(?:T|C|Y)",
    "Y": "(?:G|A|R)",
    "S": "(?:G|C|S)",
    "W": "(?:A|T|W)",
    "K": "(?:C|A|M)",
    "M": "(?:T|G|K)",
    "B": "(?:C|G|A|V)",
    "D": "(?:A|C|T|H)",
    "H": "(?:A|G|T|D)",
    "V": "(?:T|C|G|B)",
    "N": "(?:A|G|C|T|N)",
}
518
+
519
# Add lower and mixed case variants to annealing_dict_w_holes.
#
# When the two symbols differ in case, the result takes the case of the
# first (Watson) symbol. A blank has no case, so keys with an empty Watson
# position need separate handling: there the result takes the case of the
# Crick symbol. Without this, the generic assignments overwrite each other
# and store the entries with swapped case, e.g. (" ", "Q") -> "q".
mixed_case_dict = {}

for (x, y), symbol in annealing_dict_w_holes.items():
    mixed_case_dict[x.lower(), y.lower()] = symbol.lower()
    if x == emptyspace:
        mixed_case_dict[x, y.upper()] = symbol.upper()
    else:
        mixed_case_dict[x.lower(), y.upper()] = symbol.lower()
        mixed_case_dict[x.upper(), y.lower()] = symbol.upper()

# Add mixed case entries to the dict
annealing_dict_w_holes.update(mixed_case_dict)
528
+
529
+ # ============================================================================
530
+ # DseqParts dataclass
531
+ # ============================================================================
532
+
533
+
534
@dataclass
class DseqParts:
    """
    The seven string parts of a dscode sequence, as produced by get_parts.

    Instances can be unpacked, iterated and indexed like a tuple; the order
    is the field order below.
    """

    sticky_left5: str
    sticky_left3: str
    middle: str
    sticky_right3: str
    sticky_right5: str
    single_watson: str
    single_crick: str

    def __iter__(self):
        """
        Iterate over the seven fields in declaration order.

        This is what makes tuple-style unpacking work.

        >>> from pydna.alphabet import get_parts
        >>> sticky_left5, sticky_left3, middle, sticky_right3, sticky_right5, single_watson, single_crick = get_parts("eeATCGuggCCGgg")
        >>> sticky_left5
        'ee'
        >>> middle
        'ATCGuggCCGgg'
        """
        ordered_fields = (
            self.sticky_left5,
            self.sticky_left3,
            self.middle,
            self.sticky_right3,
            self.sticky_right5,
            self.single_watson,
            self.single_crick,
        )
        return iter(ordered_fields)

    def __getitem__(self, index: int) -> str:
        """
        Return the field at position ``index`` (same order as iteration).

        >>> from pydna.alphabet import get_parts
        >>> parts = get_parts("eeATCGuggCCGgg")
        >>> parts[0]
        'ee'
        >>> parts[2]
        'ATCGuggCCGgg'
        """
        return (*self,)[index]
577
+
578
+
579
def get_parts(datastring: str) -> DseqParts:
    """
    Returns a DseqParts instance containing the parts of a dsDNA sequence.

    The datastring argument should contain a string with dscode symbols.

    A regular expression is used to capture the single stranded regions at
    the ends as well as the ds region in the middle, if any.

    The figures below number the regex capture groups (counted from zero,
    matching the DseqParts field order) and show what each one captures.

    ::

        group 0 "sticky_left5"
        |
        |      group 4 "sticky_right5"
        |      |
       ---    ---
       GGGATCC
          TAGGTCA
          ----
            |
            group 2 "middle"



        group 1 "sticky_left3"
        |
        |      group 3 "sticky_right3"
        |      |
       ---    ---
          ATCCAGT
       CCCTAGG
          ----
            |
            group 2 "middle"



        group 5 "single_watson" (only an upper strand)
        |
       -------
       ATCCAGT
       |||||||



        group 6 "single_crick" (only a lower strand)
        |
       -------

       |||||||
       CCCTAGG

    Examples
    --------
    >>> from pydna.alphabet import get_parts
    >>> parts = get_parts("PEXIGATCQFZJ")
    >>> parts.sticky_left5, parts.middle, parts.sticky_right5
    ('PEXI', 'GATC', 'QFZJ')

    Up to seven groups (0..6) are captured. Some are alternatives, so for a
    well-formed molecule at least one of them is an empty string:

    0 or 1, a DNA fragment has either a 5' or a 3' sticky end on the left.

    2 or 5 or 6, a DNA molecule has a ds region or is entirely single stranded.

    3 or 4, either a 3' or a 5' sticky end on the right.

    Note that internal single stranded regions are not identified and will
    be contained in the middle part if they are present.

    Parameters
    ----------
    datastring : str
        A string with dscode.

    Returns
    -------
    DseqParts
        Seven string fields describing the DNA molecule. Groups that did not
        take part in the match are empty strings; if nothing matches at all
        (e.g. an empty string) every field is empty:
        DseqParts(sticky_left5='', sticky_left3='',
                  middle='',
                  sticky_right3='', sticky_right5='',
                  single_watson='', single_crick='')

    """

    # Escape the alphabet subsets before placing them inside character
    # classes (as dsbreaks does) so that symbols such as "$" can never be
    # read as regex syntax.
    wl = re.escape(ss_letters_watson)
    cl = re.escape(ss_letters_crick)
    dl = re.escape(ds_letters)

    m = re.match(
        f"([{wl}]*)"  # group 0: ssDNA in watson strand, left end
        f"([{cl}]*)"  # group 1: ssDNA in crick strand, left end
        f"(?=[{dl}])"  # positive lookahead for dsDNA, no capture
        "(.*)"  # group 2: everything in the middle
        f"(?<=[{dl}])"  # positive look behind for dsDNA, no capture
        f"([{wl}]*)"  # group 3: ssDNA in watson strand, right end
        f"([{cl}]*)|"  # group 4: ssDNA in crick strand, right end
        f"([{wl}]+)|"  # group 5: data contains only upper strand
        f"([{cl}]+)",  # group 6: data contains only lower strand
        datastring,
    )

    # Groups of alternatives that did not participate are None.
    groups = m.groups() if m else (None,) * 7
    result = ["" if g is None else g for g in groups]

    return DseqParts(
        sticky_left5=result[0],
        sticky_left3=result[1],
        middle=result[2],
        sticky_right3=result[3],
        sticky_right5=result[4],
        single_watson=result[5],
        single_crick=result[6],
    )
692
+
693
+
694
def dsbreaks(datastring: str) -> list[str]:
    """
    Find double strand breaks in DNA in dscode format.

    A Watson-only symbol directly next to a Crick-only symbol means that
    neither strand is continuous at that point. Each such site is reported
    together with up to three symbols of context on either side. This
    function is used to show breaks in DNA in Dseq.__init__.

    >>> from pydna.alphabet import dsbreaks
    >>> x, = dsbreaks("GATPFTAA")
    >>> print(x)
    [0:8]
    GATG TAA
    CTA TATT
    >>> dsbreaks("GATC")
    []

    Parameters
    ----------
    datastring : str
        A string representing DNA in dscode format.

    Returns
    -------
    list[str]
        One three-line string per break: the ``[start:end]`` span of the
        reported chunk followed by its Watson and Crick representation.

    """

    watson_only = re.escape(five_prime_ss_letters)
    crick_only = re.escape(three_prime_ss_letters)

    pattern = "".join(
        (
            "(.{0,3})",  # leading context, if present
            # two adjacent single stranded symbols on opposite strands
            f"([{watson_only}][{crick_only}]|[{crick_only}][{watson_only}])",
            "(.{0,3})",  # trailing context, if present
        )
    )

    found = []
    for hit in re.finditer(pattern, datastring):
        upper, lower = representation_tuple(hit.group())
        found.append(f"[{hit.start()}:{hit.end()}]\n{upper}\n{lower}\n")
    return found
737
+
738
+
739
def _abbreviate(part: str, limit: int, chunk: int, filler: str) -> str:
    """Shorten ``part`` to ``chunk`` leading and trailing symbols joined by
    two ``filler`` characters when it is longer than ``limit``."""
    if len(part) > limit:
        return part[:chunk] + filler * 2 + part[-chunk:]
    return part


def representation_tuple(
    datastring: str = "", length_limit_for_repr: int = 30, chunk: int = 4
) -> tuple[str, str]:
    """
    Watson and Crick string representation of a sequence of dscode symbols.

    See pydna.alphabet module for the definition of the pydna dscode
    alphabet. The dscode has a symbol (ascii) character for base pairs
    and single stranded DNA.

    This function is used by the Dseq.__repr__() method.

    Sequences longer than ``length_limit_for_repr`` are shortened: each part
    (left overhang, double stranded middle, right overhang) that is longer
    than a third of the limit is reduced to its first and last ``chunk``
    symbols joined by an interval marker, e.g.::

        Dseq(-71)
        GAAA..AATCaaaa..aaaa
                  tttt..ttttCTAA..AAAG

    Input that is entirely single stranded is never shortened.

    Parameters
    ----------
    datastring : str, optional
        DNA in dscode format. The default is "".
    length_limit_for_repr : int, optional
        Sequences longer than this are shortened. The default is 30.
    chunk : int, optional
        Number of symbols kept at each end of a shortened part.
        The default is 4.

    Returns
    -------
    tuple[str, str]
        The Watson and the Crick strand, each with trailing whitespace
        removed.

    """

    parts = get_parts(datastring)

    # Name the overhangs after the strand that carries them; that strand
    # decides which placeholder is needed when shortening.
    left_watson = parts.sticky_left5
    left_crick = parts.sticky_left3
    middle = parts.middle
    right_watson = parts.sticky_right3
    right_crick = parts.sticky_right5

    if len(datastring) > length_limit_for_repr:
        # The representation has three parts, so each gets a third.
        part_limit = length_limit_for_repr // 3

        # placeholder2 becomes ".." in the Watson string and blanks in the
        # Crick string; placeholder1 does the opposite. `interval` is not
        # translated and so shows up as ".." in both strings.
        left_watson = _abbreviate(left_watson, part_limit, chunk, placeholder2)
        left_crick = _abbreviate(left_crick, part_limit, chunk, placeholder1)
        middle = _abbreviate(middle, part_limit, chunk, interval)
        right_watson = _abbreviate(right_watson, part_limit, chunk, placeholder2)
        right_crick = _abbreviate(right_crick, part_limit, chunk, placeholder1)

    # The processed string that will be used to obtain a watson and crick
    # strand. If there is no double stranded part at all, fall back on the
    # single stranded fields.
    processed_dscode = (
        (left_watson or left_crick) + middle + (right_watson or right_crick)
    ) or (parts.single_watson + parts.single_crick)

    watson = processed_dscode.translate(dscode_to_watson_table).rstrip()
    crick = processed_dscode.translate(dscode_to_crick_table).rstrip()

    return watson, crick
827
+
828
+
829
def regex_ss_melt_factory(length: int) -> re.Pattern:
    """
    Build a regex for short double stranded patches whose melting sheds a
    single stranded fragment.

    The returned (bytes) pattern finds double stranded regions of at most
    ``length`` symbols that are flanked by empty positions on the same side
    in dscode format (see figure below). Melting such a site leads to the
    shedding of a single stranded fragment. The match is reported in the
    named group ``watson`` or ``crick``.

    ::

        GFTTAJA       <-- dscode representing the ds DNA below.

        G TTA A       <-- "TTA" is found by the regex for length <= 3
        CTAATGT


    Examples
    --------
    >>> from pydna.dseq import Dseq
    >>> regex = regex_ss_melt_factory(3)
    >>> s = Dseq("GFTTAJA")
    >>> s
    Dseq(-7)
    G TTA A
    CTAATGT
    >>> mobj = regex.search(s._data)
    >>> mobj.groupdict()
    {'watson': b'TTA', 'crick': None}


    Parameters
    ----------
    length : int
        Max length of double stranded region flanked by single stranded
        regions.

    Returns
    -------
    re.Pattern
        Compiled regular expression operating on bytes.

    """

    # A run of 1..length double stranded symbols.
    ds_run = f"([{ds_letters}]{{1,{length}}})"

    # Preceded by a Crick-only symbol and followed by something that is
    # neither Watson-only nor double stranded.
    watson_branch = (
        f"(?P<watson>((?<=[{ss_letters_crick}]))"
        + ds_run
        + f"((?=[^{ss_letters_watson}{ds_letters}])))"
    )

    # Mirror image of the branch above.
    crick_branch = (
        f"(?P<crick>((?<=[{ss_letters_watson}]))"
        + ds_run
        + f"((?=[^{ss_letters_crick}{ds_letters}])))"
    )

    pattern = watson_branch + "|" + crick_branch

    return re.compile(pattern.encode("ascii"))
888
+
889
+
890
def regex_ds_melt_factory(length: int) -> re.Pattern:
    """
    Build a regex for short double stranded patches whose melting separates
    the molecule into multiple double stranded fragments.

    The returned (bytes) pattern finds double stranded regions of at most
    ``length`` symbols that are flanked by empty positions on opposite sides
    in dscode format (see figure below); the start or end of the sequence
    is also accepted as a flank. Melting such a site leads to separation
    into multiple double stranded fragments. The match is reported in the
    named group ``watson`` or ``crick``.

    ::

        aaaGFTTAIAttt  <-- dscode

        aaaG TTACAttt  <-- "TTA" is found by the regex for length <= 3
        tttCTAAT Taaa

    Examples
    --------

    >>> from pydna.dseq import Dseq
    >>> regex = regex_ds_melt_factory(3)
    >>> s = Dseq("aaaGFTTAIAttt")
    >>> s
    Dseq(-13)
    aaaG TTACAttt
    tttCTAAT Taaa
    >>> mobj = regex.search(s._data)
    >>> mobj.groupdict()
    {'watson': None, 'crick': b'TTA'}

    Parameters
    ----------
    length : int
        Max length of double stranded region flanked by single stranded
        regions.

    Returns
    -------
    re.Pattern
        Compiled regular expression operating on bytes.

    """

    # A run of 1..length double stranded symbols.
    ds_run = f"([{ds_letters}]{{1,{length}}})"

    # Preceded by a Watson-only symbol (or the start) and followed by
    # something that is neither Watson-only nor double stranded (or the end).
    watson_branch = (
        f"(?P<watson>((?<=[{ss_letters_watson}])|^)"
        + ds_run
        + f"((?=[^{ss_letters_watson}{ds_letters}])|$))"
    )

    # Mirror image of the branch above.
    crick_branch = (
        f"(?P<crick>((?<=[{ss_letters_crick}])|^)"
        + ds_run
        + f"((?=[^{ss_letters_crick}{ds_letters}])|$))"
    )

    pattern = watson_branch + "|" + crick_branch

    return re.compile(pattern.encode("ascii"))
947
+
948
+
949
def anneal_strands(strand_a: str, strand_b: str) -> bool:
    """
    Test if two DNA strands containing dscode anneal or not.

    Both strands are assumed to be given in 5' -> 3' direction. The second
    strand is reversed and every resulting pair of opposing letters is
    looked up in ``basepair_dict``. Only positions present in both strands
    are compared.

    Examples
    --------

    >>> from pydna.alphabet import anneal_strands
    >>> a = "TTA"
    >>> b = "AAT"[::-1]
    >>> anneal_strands(a, b)
    True
    >>> anneal_strands(b, a)
    True
    >>> c = "UUA"
    >>> anneal_strands(c, b)
    True
    >>> anneal_strands(a.lower(), b)
    True
    >>> anneal_strands("TG", "AA")
    False

    Parameters
    ----------
    strand_a : str
        A single DNA strand.
    strand_b : str
        A single DNA strand.

    Returns
    -------
    bool
        True if every compared pair of letters is a known base pair.

    """
    upper = strand_a.translate(dscode_to_watson_table)
    complemented = strand_b.translate(complement_table_for_dscode)
    # Reverse so that position i of both strings face each other.
    lower = complemented.translate(dscode_to_crick_table)[::-1]
    return all(pair in basepair_dict for pair in zip(upper, lower))