pydna 5.5.2__py3-none-any.whl → 5.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,553 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ This module provides classes that roughly map to the `OpenCloning <https://opencloning.org>`_
4
+ data model, which is defined using `LinkML <https://linkml.io>`_, and available as a python
5
+ package `opencloning-linkml <https://pypi.org/project/opencloning-linkml/>`_. These classes
6
+ are documented there, and the ones in this module essentially replace the fields pointing to
7
+ sequences and primers (which use ids in the data model) to ``Dseqrecord`` and ``Primer``
8
+ objects, respectively. Similarly, it uses Location from ``Biopython`` instead of a string,
9
+ which is what the data model uses.
10
+
11
+ When using pydna to plan cloning, it stores the provenance of ``Dseqrecord`` objects in
12
+ their ``source`` attribute. Not all methods generate sources so far, so refer to the
13
+ documentation notebooks for examples on how to use this feature. The ``history`` method of
14
+ ``Dseqrecord`` objects can be used to get a string representation of the provenance of the
15
+ sequence. You can also use the ``CloningStrategy`` class to create a JSON representation of
16
+ the cloning strategy. That ``CloningStrategy`` can be loaded in the OpenCloning web interface
17
+ to see a representation of the cloning strategy.
18
+
19
+ """
20
+ from __future__ import annotations
21
+
22
+ from typing import Optional, Union, Any, ClassVar, Type
23
+ from pydantic_core import core_schema
24
+ from contextlib import contextmanager
25
+ from threading import local
26
+
27
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
28
+
29
+ from opencloning_linkml.datamodel import (
30
+ CloningStrategy as _BaseCloningStrategy,
31
+ Primer as _PrimerModel,
32
+ Source as _Source,
33
+ TextFileSequence as _TextFileSequence,
34
+ AssemblySource as _AssemblySource,
35
+ SourceInput as _SourceInput,
36
+ AssemblyFragment as _AssemblyFragment,
37
+ ManuallyTypedSource as _ManuallyTypedSource,
38
+ RestrictionAndLigationSource as _RestrictionAndLigationSource,
39
+ GibsonAssemblySource as _GibsonAssemblySource,
40
+ RestrictionEnzymeDigestionSource as _RestrictionEnzymeDigestionSource,
41
+ SequenceCutSource as _SequenceCutSource,
42
+ RestrictionSequenceCut as _RestrictionSequenceCut,
43
+ SequenceCut as _SequenceCut,
44
+ InFusionSource as _InFusionSource,
45
+ OverlapExtensionPCRLigationSource as _OverlapExtensionPCRLigationSource,
46
+ InVivoAssemblySource as _InVivoAssemblySource,
47
+ LigationSource as _LigationSource,
48
+ GatewaySource as _GatewaySource,
49
+ GatewayReactionType,
50
+ HomologousRecombinationSource as _HomologousRecombinationSource,
51
+ CreLoxRecombinationSource as _CreLoxRecombinationSource,
52
+ PCRSource as _PCRSource,
53
+ CRISPRSource as _CRISPRSource,
54
+ )
55
+ from Bio.SeqFeature import Location, LocationParserError
56
+ from Bio.Restriction.Restriction import AbstractCut
57
+ import networkx as nx
58
+ from typing import List
59
+
60
+ from Bio.SeqIO.InsdcIO import _insdc_location_string as format_feature_location
61
+
62
+ from pydna.types import CutSiteType, SubFragmentRepresentationAssembly
63
+ from pydna.utils import create_location
64
+ from typing import TYPE_CHECKING
65
+
66
+ if TYPE_CHECKING: # pragma: no cover
67
+ from pydna.dseqrecord import Dseqrecord
68
+ from pydna.primer import Primer
69
+
70
+
71
+ # Thread-local storage for ID strategy
72
+ _thread_local = local()
73
+
74
+
75
@contextmanager
def id_mode(use_python_internal_id: bool = True):
    """Temporarily select how ``get_id`` derives ids for the current thread.

    While the context is active, the chosen strategy is stored in thread-local
    storage. With ``use_python_internal_id=True`` the built-in ``id()`` of each
    object is used; with ``False`` the object's ``.id`` attribute (a string
    holding an integer) is used instead, which keeps ids that are already
    meaningful. On exit, the value that was active before entering is restored,
    even if the body raised.

    Parameters
    ----------
    use_python_internal_id: bool
        If True, use Python's built-in id() function.
        If False, use the object's .id attribute (must be a string integer).

    Examples
    --------
    >>> from pydna.dseqrecord import Dseqrecord
    >>> from pydna.opencloning_models import get_id, id_mode
    >>> dseqr = Dseqrecord("ATGC")
    >>> dseqr.name = "my_sequence"
    >>> dseqr.id = "123"
    >>> get_id(dseqr) == id(dseqr)
    True
    >>> with id_mode(use_python_internal_id=False):
    ...     get_id(dseqr)
    123
    """
    # An unset attribute means the default strategy (python ids) is in effect.
    previous = getattr(_thread_local, "use_python_internal_id", True)
    setattr(_thread_local, "use_python_internal_id", use_python_internal_id)
    try:
        yield
    finally:
        setattr(_thread_local, "use_python_internal_id", previous)
111
+
112
+
113
def get_id(obj: "Primer | Dseqrecord") -> int:
    """Get the id of ``obj`` using the strategy currently set by ``id_mode``.

    Parameters
    ----------
    obj: Primer | Dseqrecord
        The object to get the id of

    Returns
    -------
    int: ``id(obj)`` when python internal ids are in use (the default),
        otherwise ``int(obj.id)``.

    Raises
    ------
    ValueError
        If python internal ids are disabled and ``obj.id`` is not a string
        made only of decimal digits.
    """
    use_python_internal_id = getattr(_thread_local, "use_python_internal_id", True)
    if use_python_internal_id:
        return id(obj)
    # isdecimal() rather than isdigit(): isdigit() also accepts characters such
    # as superscript digits, which int() rejects with a less helpful message.
    if not isinstance(obj.id, str) or not obj.id.isdecimal():
        raise ValueError(
            f"If use_python_internal_id is False, id must be a string representing an integer, "
            f"but object {obj} has an invalid id: {obj.id}"
        )
    return int(obj.id)
134
+
135
+
136
class SequenceLocationStr(str):
    """GenBank-like text form of a sequence location, usable as a pydantic field type."""

    # TODO: this should handle origin-spanning simple locations (split)
    @classmethod
    def from_biopython_location(cls, location: Location):
        """Build the string from a Biopython ``Location``."""
        location_text = format_feature_location(location, None)
        return cls(location_text)

    def to_biopython_location(self) -> Location:
        """Parse this string back into a Biopython ``Location``."""
        return Location.fromstring(self)

    @classmethod
    def field_validator(cls, v):
        """Return ``v`` as an instance of this class, or raise ``ValueError``.

        Non-string input is rejected, and so is a string that Biopython cannot
        parse as a location.
        """
        if not isinstance(v, str):
            raise ValueError(f"Location must be a string or a {cls.__name__}")
        value = cls(v)
        try:
            # Parsed only to check validity; the parsed object is discarded.
            value.to_biopython_location()
        except LocationParserError as err:
            raise ValueError(f"Location {v!r} is not a valid location") from err
        return value

    @classmethod
    def __get_pydantic_core_schema__(
        cls,
        source_type,
        handler,
    ) -> core_schema.CoreSchema:
        """Pydantic hook: validate as a plain string, then run ``_validate``."""
        string_schema = core_schema.str_schema()
        return core_schema.with_info_after_validator_function(
            cls._validate,
            string_schema,
        )

    @classmethod
    def _validate(cls, value: str, info):
        """After-validator used by the core schema; ``info`` is unused."""
        return cls.field_validator(value)

    @classmethod
    def from_start_and_end(
        cls, start: int, end: int, seq_len: int | None = None, strand: int | None = 1
    ):
        """Build the string from the arguments accepted by ``create_location``."""
        location = create_location(start, end, seq_len, strand)
        return cls.from_biopython_location(location)
180
+
181
+
182
# Shared pydantic base for the models in this module: assignments and defaults
# are validated, unknown fields are rejected, and arbitrary (non-pydantic)
# types are allowed as field types.
class ConfiguredBaseModel(BaseModel):
    model_config = ConfigDict(
        strict=False,
        extra="forbid",
        use_enum_values=True,
        validate_default=True,
        validate_assignment=True,
        arbitrary_types_allowed=True,
    )
192
+
193
+
194
# opencloning-linkml ``TextFileSequence`` extended with a constructor that
# takes a ``Dseqrecord``.
class TextFileSequence(_TextFileSequence):

    @classmethod
    def from_dseqrecord(cls, dseqr: "Dseqrecord"):
        """Serialise ``dseqr`` as GenBank text together with its overhangs.

        The model id comes from ``get_id``, so it follows the active ``id_mode``.
        """
        dseq = dseqr.seq
        fields = {
            "id": get_id(dseqr),
            "sequence_file_format": "genbank",
            "overhang_crick_3prime": dseq.ovhg,
            "overhang_watson_3prime": dseq.watson_ovhg(),
            "file_content": dseqr.format("genbank"),
        }
        return cls(**fields)
205
+
206
+
207
# opencloning-linkml ``Primer`` model extended with a constructor that takes a
# pydna ``Primer``.
class PrimerModel(_PrimerModel):

    @classmethod
    def from_primer(cls, primer: "Primer"):
        """Build the model from ``primer`` (id via ``get_id``, name, sequence text)."""
        sequence_text = str(primer.seq)
        return cls(id=get_id(primer), name=primer.name, sequence=sequence_text)
216
+
217
+
218
# Input of a source that holds the actual ``Dseqrecord`` / ``Primer`` object;
# ``to_pydantic_model`` replaces the object by its id.
class SourceInput(ConfiguredBaseModel):
    sequence: object

    @field_validator("sequence")
    @classmethod
    def _validate_sequence_field(cls, value: Any):
        """Accept only ``Dseqrecord`` or ``Primer`` instances.

        The field is typed ``object`` and checked here so that the pydna
        classes can be imported lazily, avoiding circular imports.
        """
        from pydna.dseqrecord import Dseqrecord
        from pydna.primer import Primer

        if not isinstance(value, (Dseqrecord, Primer)):
            module = type(value).__module__
            name = type(value).__name__
            raise TypeError(f"sequence must be Dseqrecord or Primer; got {module}.{name}")
        return value

    def to_pydantic_model(self) -> _SourceInput:
        """Return the data-model input, referencing the sequence by id."""
        sequence_id = get_id(self.sequence)
        return _SourceInput(sequence=sequence_id)
237
+
238
+
239
# A source input that also records which parts of the sequence take part in an
# assembly, as Biopython locations, and whether the fragment is reverse
# complemented.
class AssemblyFragment(SourceInput):

    left_location: Optional[Location] = Field(default=None)
    right_location: Optional[Location] = Field(default=None)
    reverse_complemented: bool

    @staticmethod
    def from_biopython_location(location: Location | None):
        """Convert ``location`` to a ``SequenceLocationStr``; ``None`` stays ``None``."""
        return (
            None
            if location is None
            else SequenceLocationStr.from_biopython_location(location)
        )

    def to_pydantic_model(self) -> _AssemblyFragment:
        """Return the data-model fragment (sequence by id, locations as strings)."""
        left = self.from_biopython_location(self.left_location)
        right = self.from_biopython_location(self.right_location)
        return _AssemblyFragment(
            sequence=get_id(self.sequence),
            left_location=left,
            right_location=right,
            reverse_complemented=self.reverse_complemented,
        )
258
+
259
+
260
# Base class for sequence provenance. ``TARGET_MODEL`` is the opencloning-linkml
# class that ``to_pydantic_model`` instantiates; subclasses override it and
# extend ``_kwargs`` with their own fields.
class Source(ConfiguredBaseModel):
    input: list[Union[SourceInput, AssemblyFragment]] = Field(default_factory=list)
    TARGET_MODEL: ClassVar[Type[_Source]] = _Source

    def input_models(self):
        """Return every input converted with its own ``to_pydantic_model``."""
        return [source_input.to_pydantic_model() for source_input in self.input]

    def _kwargs(self, seq_id: int) -> dict:
        """Keyword arguments used to build ``TARGET_MODEL``."""
        return dict(id=seq_id, input=self.input_models())

    def to_pydantic_model(self, seq_id: int):
        """Instantiate ``TARGET_MODEL`` with ``seq_id`` as its id."""
        return self.TARGET_MODEL(**self._kwargs(seq_id))

    def add_to_history_graph(self, history_graph: nx.DiGraph, seq: "Dseqrecord"):
        """
        Add ``seq``, this source and, recursively, its inputs to ``history_graph``.

        Nodes are keyed with the built-in ``id()`` rather than ``get_id``: the
        keys only need to be unique within the graph and are not stored anywhere.
        """
        from pydna.dseqrecord import Dseqrecord

        seq_node, source_node = id(seq), id(self)
        history_graph.add_node(seq_node, label=f"{seq.name} ({repr(seq)})")
        history_graph.add_node(source_node, label=str(self.TARGET_MODEL.__name__))
        history_graph.add_edge(seq_node, source_node)

        for source_input in self.input:
            parent = source_input.sequence
            # A Primer (or a Dseqrecord without a source) is a leaf of the graph.
            has_own_history = (
                isinstance(parent, Dseqrecord) and parent.source is not None
            )
            if has_own_history:
                parent.source.add_to_history_graph(history_graph, parent)
            else:
                history_graph.add_node(
                    id(parent),
                    label=f"{parent.name} ({repr(parent)})",
                )
            history_graph.add_edge(source_node, id(parent))

    def history_string(self, seq: "Dseqrecord"):
        """
        Return the cloning history of ``seq`` as text, one tree line per node.
        See dseqrecord.history() for examples.
        """
        graph = nx.DiGraph()
        self.add_to_history_graph(graph, seq)
        lines = nx.generate_network_text(graph, with_labels=True, sources=[id(seq)])
        return "\n".join(lines)
311
+
312
+
313
# Source for sequences produced by joining fragments; adds the ``circular``
# flag to the keyword arguments passed to the target model.
class AssemblySource(Source):
    circular: bool

    TARGET_MODEL: ClassVar[Type[_AssemblySource]] = _AssemblySource

    def _kwargs(self, seq_id: int) -> dict:
        """Base keyword arguments plus ``circular``."""
        kwargs = super()._kwargs(seq_id)
        kwargs["circular"] = self.circular
        return kwargs

    def to_pydantic_model(self, seq_id: int):
        """Instantiate ``TARGET_MODEL`` with ``seq_id`` as its id."""
        kwargs = self._kwargs(seq_id)
        return self.TARGET_MODEL(**kwargs)

    @classmethod
    def from_subfragment_representation(
        cls,
        assembly: SubFragmentRepresentationAssembly,
        fragments: list["Dseqrecord"],
        is_circular: bool,
    ):
        """Build an ``AssemblySource`` from ``(index, left, right)`` triples.

        Each index is 1-based into ``fragments``; a negative index marks the
        fragment as reverse complemented.
        """
        fragment_inputs = [
            AssemblyFragment(
                sequence=fragments[abs(f_index) - 1],
                left_location=loc1,
                right_location=loc2,
                reverse_complemented=f_index < 0,
            )
            for f_index, loc1, loc2 in assembly
        ]
        # NOTE(review): this always builds the base AssemblySource, not ``cls``,
        # even when called on a subclass -- confirm that this is intended.
        return AssemblySource(input=fragment_inputs, circular=is_circular)
347
+
348
+
349
# Assembly source that also records the restriction enzymes; they are passed
# to the target model by name (``str(enzyme)``).
class RestrictionAndLigationSource(AssemblySource):
    restriction_enzymes: list[AbstractCut]

    TARGET_MODEL: ClassVar[
        Type[_RestrictionAndLigationSource]
    ] = _RestrictionAndLigationSource

    def _kwargs(self, seq_id: int) -> dict:
        """Assembly keyword arguments plus the enzyme names."""
        kwargs = super()._kwargs(seq_id)
        kwargs["restriction_enzymes"] = list(map(str, self.restriction_enzymes))
        return kwargs
361
+
362
+
363
# Assembly source whose target is the opencloning-linkml GibsonAssemblySource.
class GibsonAssemblySource(AssemblySource):
    TARGET_MODEL: ClassVar[Type[_GibsonAssemblySource]] = (
        _GibsonAssemblySource
    )
365
+
366
+
367
# Assembly source whose target is the opencloning-linkml InFusionSource.
class InFusionSource(AssemblySource):
    TARGET_MODEL: ClassVar[Type[_InFusionSource]] = (
        _InFusionSource
    )
369
+
370
+
371
# Assembly source whose target is the opencloning-linkml
# OverlapExtensionPCRLigationSource.
class OverlapExtensionPCRLigationSource(AssemblySource):
    TARGET_MODEL: ClassVar[
        Type[_OverlapExtensionPCRLigationSource]
    ] = _OverlapExtensionPCRLigationSource
375
+
376
+
377
# Assembly source whose target is the opencloning-linkml InVivoAssemblySource.
class InVivoAssemblySource(AssemblySource):
    TARGET_MODEL: ClassVar[Type[_InVivoAssemblySource]] = (
        _InVivoAssemblySource
    )
379
+
380
+
381
# Assembly source whose target is the opencloning-linkml LigationSource.
class LigationSource(AssemblySource):
    TARGET_MODEL: ClassVar[Type[_LigationSource]] = (
        _LigationSource
    )
383
+
384
+
385
# Assembly source for Gateway reactions; forwards ``reaction_type`` and
# ``greedy`` to the opencloning-linkml GatewaySource.
class GatewaySource(AssemblySource):
    TARGET_MODEL: ClassVar[Type[_GatewaySource]] = _GatewaySource
    reaction_type: GatewayReactionType
    greedy: bool = Field(default=False)

    def _kwargs(self, seq_id: int) -> dict:
        """Assembly keyword arguments plus ``reaction_type`` and ``greedy``."""
        kwargs = super()._kwargs(seq_id)
        kwargs.update(reaction_type=self.reaction_type, greedy=self.greedy)
        return kwargs
396
+
397
+
398
# Assembly source whose target is the opencloning-linkml
# HomologousRecombinationSource.
class HomologousRecombinationSource(AssemblySource):
    TARGET_MODEL: ClassVar[
        Type[_HomologousRecombinationSource]
    ] = _HomologousRecombinationSource
402
+
403
+
404
# Homologous-recombination source whose target is the opencloning-linkml
# CRISPRSource.
class CRISPRSource(HomologousRecombinationSource):
    TARGET_MODEL: ClassVar[Type[_CRISPRSource]] = (
        _CRISPRSource
    )
406
+
407
+
408
# Assembly source whose target is the opencloning-linkml
# CreLoxRecombinationSource.
class CreLoxRecombinationSource(AssemblySource):
    TARGET_MODEL: ClassVar[
        Type[_CreLoxRecombinationSource]
    ] = _CreLoxRecombinationSource
412
+
413
+
414
# Assembly source for PCR; forwards ``add_primer_features`` to the
# opencloning-linkml PCRSource.
class PCRSource(AssemblySource):
    TARGET_MODEL: ClassVar[Type[_PCRSource]] = _PCRSource
    add_primer_features: bool = Field(default=False)

    def _kwargs(self, seq_id: int) -> dict:
        """Assembly keyword arguments plus ``add_primer_features``."""
        kwargs = super()._kwargs(seq_id)
        kwargs["add_primer_features"] = self.add_primer_features
        return kwargs
423
+
424
+
425
# Source for a sequence obtained by cutting a parent sequence. It serialises
# to ``ENZYME_MODEL`` when at least one edge carries a restriction enzyme and
# to ``BASE_MODEL`` otherwise.
class SequenceCutSource(Source):
    left_edge: CutSiteType | None
    right_edge: CutSiteType | None

    BASE_MODEL: ClassVar[Type[_SequenceCutSource]] = _SequenceCutSource
    ENZYME_MODEL: ClassVar[
        Type[_RestrictionEnzymeDigestionSource]
    ] = _RestrictionEnzymeDigestionSource

    @staticmethod
    def _cutsite_to_model(cut_site: CutSiteType | None):
        """Convert a ``((watson, overhang), enzyme)`` cut site to its data model.

        ``None`` is returned unchanged. The enzyme name is included only when
        the second element is an ``AbstractCut``.
        """
        if cut_site is None:
            return None
        (watson, overhang), cutter = cut_site[0], cut_site[1]
        if not isinstance(cutter, AbstractCut):
            return _SequenceCut(cut_watson=watson, overhang=overhang)
        return _RestrictionSequenceCut(
            cut_watson=watson,
            overhang=overhang,
            restriction_enzyme=str(cutter),
        )

    @classmethod
    def from_parent(
        cls, parent: "Dseqrecord", left_edge: CutSiteType, right_edge: CutSiteType
    ):
        """Create the source with ``parent`` as its only input."""
        parent_input = SourceInput(sequence=parent)
        return cls(input=[parent_input], left_edge=left_edge, right_edge=right_edge)

    def _has_enzyme(self) -> bool:
        """True if either edge is set and its second element is an ``AbstractCut``."""
        return any(
            edge is not None and isinstance(edge[1], AbstractCut)
            for edge in (self.left_edge, self.right_edge)
        )

    def _target_model(self):
        """Pick the data-model class to serialise to."""
        if self._has_enzyme():
            return self.ENZYME_MODEL
        return self.BASE_MODEL

    def _kwargs(self, seq_id: int) -> dict:
        """Base keyword arguments plus both edges converted to data models."""
        kwargs = super()._kwargs(seq_id)
        kwargs["left_edge"] = self._cutsite_to_model(self.left_edge)
        kwargs["right_edge"] = self._cutsite_to_model(self.right_edge)
        return kwargs

    def to_pydantic_model(self, seq_id: int):
        """Instantiate the selected data-model class with ``seq_id`` as its id."""
        model_cls = self._target_model()
        return model_cls(**self._kwargs(seq_id))
476
+
477
+
478
# opencloning-linkml ``CloningStrategy`` extended with helpers that populate
# it from pydna objects.
class CloningStrategy(_BaseCloningStrategy):

    # For now, we don't add anything, but the classes will not have the new
    # methods if this is used
    # It will be used for validation for now
    primers: Optional[List[PrimerModel]] = Field(
        default_factory=list,
        description="""The primers that are used in the cloning strategy""",
        json_schema_extra={
            "linkml_meta": {"alias": "primers", "domain_of": ["CloningStrategy"]}
        },
    )

    def add_primer(self, primer: "Primer"):
        """Add ``primer`` unless a primer with the same id is already present."""
        # ``primers`` is Optional, so it may have been set to None explicitly.
        if self.primers is None:
            self.primers = []
        existing_ids = {existing.id for existing in self.primers}
        if get_id(primer) in existing_ids:
            return
        self.primers.append(PrimerModel.from_primer(primer))

    def add_dseqrecord(self, dseqr: "Dseqrecord"):
        """Add ``dseqr``, its source and, recursively, everything it derives from.

        A sequence whose id is already present is skipped. A sequence without
        a ``source`` gets a ``ManuallyTypedSource`` placeholder so that every
        sequence has a source entry.
        """
        from pydna.dseqrecord import Dseqrecord

        seq_id = get_id(dseqr)
        existing_ids = {seq.id for seq in self.sequences}
        if seq_id in existing_ids:
            return
        self.sequences.append(TextFileSequence.from_dseqrecord(dseqr))

        if dseqr.source is None:
            self.sources.append(
                _ManuallyTypedSource(id=seq_id, input=[], user_input="A")
            )
            return

        this_source: Source = dseqr.source
        # Sequence and source share the same id.
        self.sources.append(this_source.to_pydantic_model(seq_id))
        for source_input in this_source.input:
            if isinstance(source_input.sequence, Dseqrecord):
                self.add_dseqrecord(source_input.sequence)
            else:
                self.add_primer(source_input.sequence)

    def reassign_ids(self):
        """Renumber all ids in place as 1..N, following the sorted order of the old ids.

        Sequences, sources and primers are renumbered, and the references held
        in each source's inputs are updated to the new ids.
        """
        primers = self.primers or []
        all_ids = (
            {seq.id for seq in self.sequences}
            | {source.id for source in self.sources}
            | {primer.id for primer in primers}
        )
        id_mappings = {
            old_id: new_id for new_id, old_id in enumerate(sorted(all_ids), start=1)
        }
        for seq in self.sequences:
            seq.id = id_mappings[seq.id]
        for primer in primers:
            primer.id = id_mappings[primer.id]
        for source in self.sources:
            source.id = id_mappings[source.id]
            for assembly_fragment in source.input:
                assembly_fragment.sequence = id_mappings[assembly_fragment.sequence]

    @classmethod
    def from_dseqrecords(cls, dseqrs: list["Dseqrecord"], description: str = ""):
        """Build a strategy containing every record in ``dseqrs`` and its history."""
        cloning_strategy = cls(sources=[], sequences=[], description=description)
        for dseqr in dseqrs:
            cloning_strategy.add_dseqrecord(dseqr)
        return cloning_strategy

    def _dump_target(self) -> "CloningStrategy":
        """Return the object that should actually be serialised.

        When python internal ids are in use (the default ``id_mode``), this is
        a deep copy with ids renumbered by ``reassign_ids``, so ``self`` is
        left untouched. Otherwise it is ``self``.
        """
        if not getattr(_thread_local, "use_python_internal_id", True):
            return self
        cs = self.model_copy(deep=True)
        cs.reassign_ids()
        return cs

    def model_dump_json(self, *args, **kwargs):
        """Dump to JSON, renumbering ids first when python internal ids are in use."""
        # Explicit two-argument super: the dump runs on the copy, not on self,
        # and must not re-enter this override.
        target = self._dump_target()
        return super(CloningStrategy, target).model_dump_json(*args, **kwargs)

    def model_dump(self, *args, **kwargs):
        """Dump to a dict, renumbering ids first when python internal ids are in use."""
        target = self._dump_target()
        return super(CloningStrategy, target).model_dump(*args, **kwargs)
pydna/parsers.py CHANGED
@@ -208,3 +208,26 @@ def parse(data, ds=True):
208
208
  def parse_primers(data):
209
209
  """docstring."""
210
210
  return [_Primer(x) for x in parse(data, ds=False)]
211
+
212
+
213
def parse_snapgene(file_path: str) -> list[_Dseqrecord]:
    """Parse a SnapGene file and return its sequence in a one-element list.

    Only the first record yielded by the parser is used. The record is
    treated as circular when its ``topology`` annotation equals ``"circular"``.

    Parameters
    ----------
    file_path : str
        The path to the SnapGene file to parse.

    Returns
    -------
    list[Dseqrecord]
        A list containing the single parsed Dseqrecord.

    Raises
    ------
    StopIteration
        If the parser yields no record.

    """
    with open(file_path, "rb") as f:
        parsed_seq = next(_SeqIO.parse(f, "snapgene"))
    # A missing "topology" annotation yields None, which is not "circular".
    circular = parsed_seq.annotations.get("topology") == "circular"
    return [_Dseqrecord(parsed_seq, circular=circular)]
pydna/seqrecord.py CHANGED
@@ -197,7 +197,7 @@ class SeqRecord(_SeqRecord):
197
197
  def translate(self):
198
198
  """docstring."""
199
199
  p = super().translate()
200
- return ProteinSeqRecord(_ProteinSeq(p.seq[:-1]))
200
+ return ProteinSeqRecord(_ProteinSeq(p.seq))
201
201
 
202
202
  def add_colors_to_features_for_ape(self):
203
203
  """Assign colors to features.
@@ -0,0 +1,44 @@
1
+ # -*- coding: utf-8 -*-
2
+ from pydna.dseqrecord import Dseqrecord as _Dseqrecord
3
+ import re
4
+ from Bio.Data.IUPACData import ambiguous_dna_values as _ambiguous_dna_values
5
+
6
# IUPAC ambiguity codes only: copy Biopython's table and drop the four
# unambiguous bases.
ambiguous_only_dna_values = dict(_ambiguous_dna_values)
for normal_base in "ACGT":
    ambiguous_only_dna_values.pop(normal_base)
9
+
10
+
11
def compute_regex_site(site: str) -> str:
    """
    Turn a site that may contain degenerate bases into a regex pattern.

    The site is upper-cased, each ambiguity code is replaced by a character
    class listing the bases it stands for, and the result is prefixed with
    ``(?i)`` so that matching ignores case.

    Args:
        site: The string to convert to a regex pattern.

    Returns:
        The regex pattern.
    """
    pattern = site.upper()
    for code, bases in ambiguous_only_dna_values.items():
        if len(bases) <= 1:
            continue
        pattern = pattern.replace(code, "[" + "".join(bases) + "]")

    # Make case insensitive
    return "(?i)" + pattern
29
+
30
+
31
def dseqrecord_finditer(pattern: str, seq: "_Dseqrecord") -> list[re.Match]:
    """
    Finds all matches of a regex pattern in a Dseqrecord.

    For a circular sequence the search runs on the sequence concatenated with
    itself, so matches spanning the origin are found. Only matches that start
    within the first copy are kept, so each match is reported once.

    Args:
        pattern: The regex pattern to search for.
        seq: The Dseqrecord to search in.

    Returns:
        The matches, as a lazy iterator (a generator, not a list; convert
        with ``list()`` if needed).
    """
    circular = seq.circular
    query = str(seq.seq) * 2 if circular else str(seq.seq)
    seq_len = len(seq)
    matches = re.finditer(pattern, query)
    # Strictly less than: on the doubled query a match starting at ``seq_len``
    # is the same site as the match starting at 0.
    return (m for m in matches if not circular or m.start() < seq_len)
pydna/types.py CHANGED
@@ -12,8 +12,11 @@ from typing import (
12
12
  Callable as _Callable,
13
13
  )
14
14
 
15
+ # Import AbstractCut at runtime for CutSiteType
16
+ from Bio.Restriction.Restriction import AbstractCut as _AbstractCut
17
+ from pydna.crispr import _cas as __cas
18
+
15
19
  if TYPE_CHECKING:
16
- from Bio.Restriction import AbstractCut as _AbstractCut
17
20
  from Bio.Restriction import RestrictionBatch as _RestrictionBatch
18
21
  from pydna.dseq import Dseq
19
22
  from Bio.SeqFeature import Location as _Location
@@ -25,7 +28,7 @@ DseqType = _TypeVar("DseqType", bound="Dseq")
25
28
  EnzymesType = _TypeVar(
26
29
  "EnzymesType", "_RestrictionBatch", _Iterable["_AbstractCut"], "_AbstractCut"
27
30
  )
28
- CutSiteType = _Tuple[_Tuple[int, int], _Union["_AbstractCut", None]]
31
+ CutSiteType = _Tuple[_Tuple[int, int], _Union[_AbstractCut, None, __cas]]
29
32
  AssemblyEdgeType = _Tuple[int, int, "_Location | None", "_Location | None"]
30
33
  AssemblySubFragmentType = _Tuple[int, "_Location | None", "_Location | None"]
31
34
  EdgeRepresentationAssembly = list[AssemblyEdgeType]