pydna 5.5.4__py3-none-any.whl → 5.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydna/__init__.py +30 -195
- pydna/_pretty.py +8 -8
- pydna/_thermodynamic_data.py +3 -3
- pydna/all.py +1 -12
- pydna/alphabet.py +995 -0
- pydna/amplicon.py +19 -24
- pydna/amplify.py +75 -95
- pydna/assembly.py +64 -81
- pydna/assembly2.py +375 -310
- pydna/codon.py +4 -4
- pydna/common_sub_strings.py +6 -8
- pydna/contig.py +203 -10
- pydna/design.py +176 -60
- pydna/dseq.py +1788 -718
- pydna/dseqrecord.py +197 -179
- pydna/gateway.py +6 -6
- pydna/gel.py +5 -5
- pydna/genbank.py +43 -46
- pydna/genbankfixer.py +89 -92
- pydna/ladders.py +11 -12
- pydna/oligonucleotide_hybridization.py +124 -0
- pydna/opencloning_models.py +187 -60
- pydna/parsers.py +45 -32
- pydna/primer.py +4 -4
- pydna/primer_screen.py +833 -0
- pydna/readers.py +14 -9
- pydna/seq.py +137 -47
- pydna/seqrecord.py +54 -62
- pydna/sequence_picker.py +2 -5
- pydna/sequence_regex.py +6 -6
- pydna/tm.py +17 -17
- pydna/types.py +19 -19
- pydna/utils.py +97 -75
- {pydna-5.5.4.dist-info → pydna-5.5.6.dist-info}/METADATA +8 -8
- pydna-5.5.6.dist-info/RECORD +42 -0
- {pydna-5.5.4.dist-info → pydna-5.5.6.dist-info}/WHEEL +1 -1
- pydna/conftest.py +0 -42
- pydna/download.py +0 -32
- pydna/genbankfile.py +0 -42
- pydna/genbankrecord.py +0 -168
- pydna/goldengate.py +0 -45
- pydna/ligate.py +0 -62
- pydna/user_cloning.py +0 -29
- pydna-5.5.4.dist-info/RECORD +0 -46
- {pydna-5.5.4.dist-info → pydna-5.5.6.dist-info}/licenses/LICENSE.txt +0 -0
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
This module contains the functions for oligonucleotide hybridization.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from pydna.common_sub_strings import common_sub_strings
|
|
7
|
+
from Bio.Seq import reverse_complement
|
|
8
|
+
from pydna.primer import Primer
|
|
9
|
+
from pydna.dseqrecord import Dseqrecord
|
|
10
|
+
from pydna.dseq import Dseq
|
|
11
|
+
from pydna.opencloning_models import OligoHybridizationSource, SourceInput
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def oligonucleotide_hybridization_overhangs(
    fwd_oligo_seq: str, rvs_oligo_seq: str, minimal_annealing: int
) -> list[int]:
    """
    Return the possible overhangs between two oligos that anneal to each other.

    Both sequences are lower-cased and ``fwd_oligo_seq`` is compared against the
    reverse complement of ``rvs_oligo_seq`` with ``common_sub_strings``, using
    ``minimal_annealing`` as the length limit. Every match produces one overhang:
    the match position in the reverse-complemented reverse oligo minus the match
    position in the forward oligo.

    see https://github.com/manulera/OpenCloning_backend/issues/302 for notation

    Parameters
    ----------
    fwd_oligo_seq : str
        Sequence of the forward oligo.
    rvs_oligo_seq : str
        Sequence of the reverse oligo.
    minimal_annealing : int
        Minimal annealing length passed on to ``common_sub_strings``.

    Returns
    -------
    list[int]
        One overhang value per match; empty if there is no match.

    Raises
    ------
    ValueError
        If a match starts after the first position of both sequences, or ends
        before the last position of both sequences.

    Examples
    --------
    >>> from pydna.oligonucleotide_hybridization import oligonucleotide_hybridization_overhangs
    >>> oligonucleotide_hybridization_overhangs("ATGGC", "GCCAT", 3)
    [0]
    >>> oligonucleotide_hybridization_overhangs("aATGGC", "GCCAT", 5)
    [-1]
    >>> oligonucleotide_hybridization_overhangs("ATGGC", "GCCATa", 5)
    [1]
    >>> oligonucleotide_hybridization_overhangs("ATGGC", "GCCATaaGCCAT", 5)
    [0, 7]

    If the minimal annealing length is longer than the length of the shortest oligo, it returns an empty list.

    >>> oligonucleotide_hybridization_overhangs("ATGGC", "GCCATaaGCCAT", 100)
    []

    If it's possible to anneal for ``minimal_annealing`` length, but with mismatches, it raises an error.

    >>> oligonucleotide_hybridization_overhangs("cATGGC", "GCCATa", 5)
    Traceback (most recent call last):
        ...
    ValueError: The oligonucleotides can anneal with mismatches
    """
    watson = fwd_oligo_seq.lower()
    crick_rc = reverse_complement(rvs_oligo_seq.lower())
    shared = common_sub_strings(watson, crick_rc, minimal_annealing)

    fwd_len = len(fwd_oligo_seq)
    rvs_len = len(rvs_oligo_seq)

    overhangs = []
    for start_fwd, start_rvs, span in shared:
        # Bases precede the match in both sequences.
        unpaired_before = start_fwd != 0 and start_rvs != 0
        # Bases follow the match in both sequences.
        unpaired_after = start_fwd + span < fwd_len and start_rvs + span < rvs_len
        if unpaired_before or unpaired_after:
            raise ValueError("The oligonucleotides can anneal with mismatches")
        overhangs.append(start_rvs - start_fwd)

    return overhangs
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def oligonucleotide_hybridization(
    fwd_primer: Primer, rvs_primer: Primer, minimal_annealing: int
) -> list[Dseqrecord]:
    """
    Return the double-stranded products of hybridizing two primers.

    The overhangs are computed with ``oligonucleotide_hybridization_overhangs``
    from the two primer sequences. For each overhang, a ``Dseqrecord`` is built
    from a ``Dseq`` with the forward primer sequence, the reverse primer sequence
    and that overhang as ``ovhg``. Its ``source`` is an ``OligoHybridizationSource``
    that records the overhang and both primers as inputs.

    Parameters
    ----------
    fwd_primer : Primer
        Forward primer.
    rvs_primer : Primer
        Reverse primer.
    minimal_annealing : int
        Minimal annealing length.

    Returns
    -------
    list[Dseqrecord]
        One record per possible overhang; empty if there is none.

    Raises
    ------
    ValueError
        Propagated from ``oligonucleotide_hybridization_overhangs`` when the
        primers can anneal with mismatches.

    Examples
    --------
    >>> from pydna.primer import Primer
    >>> from pydna.oligonucleotide_hybridization import oligonucleotide_hybridization
    >>> fwd_primer = Primer("ATGGC")
    >>> rvs_primer = Primer("GCCA")
    >>> products = oligonucleotide_hybridization(fwd_primer, rvs_primer, 3)
    >>> products[0].seq  # doctest: +NORMALIZE_WHITESPACE
    Dseq(-5)
    ATGGC
     ACCG

    Multiple values can be returned:

    >>> rvs_primer2 = Primer("GCCATaaGCCAT")
    >>> products2 = oligonucleotide_hybridization(fwd_primer, rvs_primer2, 3)
    >>> products2[0].seq  # doctest: +NORMALIZE_WHITESPACE
    Dseq(-12)
    ATGGC
    TACCGaaTACCG
    >>> products2[1].seq  # doctest: +NORMALIZE_WHITESPACE
    Dseq(-12)
           ATGGC
    TACCGaaTACCG

    If no possible overhangs are found, it returns an empty list.

    >>> oligonucleotide_hybridization(fwd_primer, rvs_primer, 100)
    []

    If there are mismatches given the minimal annealing length, it raises an error.

    >>> fwd_primer3 = Primer("cATGGC")
    >>> rvs_primer3 = Primer("GCCATa")
    >>> oligonucleotide_hybridization(fwd_primer3, rvs_primer3, 5)
    Traceback (most recent call last):
        ...
    ValueError: The oligonucleotides can anneal with mismatches
    """
    watson = str(fwd_primer.seq)
    crick = str(rvs_primer.seq)

    products = []
    for ovhg in oligonucleotide_hybridization_overhangs(
        watson, crick, minimal_annealing
    ):
        source = OligoHybridizationSource(
            overhang_crick_3prime=ovhg,
            input=[SourceInput(sequence=fwd_primer), SourceInput(sequence=rvs_primer)],
        )
        # Read the overhang back from the source, as the record and its source
        # must agree on the value.
        duplex = Dseq(watson, crick, ovhg=source.overhang_crick_3prime)
        products.append(Dseqrecord(duplex, source=source))

    return products
|
pydna/opencloning_models.py
CHANGED
|
@@ -16,6 +16,17 @@ sequence. You can also use the ``CloningStrategy`` class to create a JSON repres
|
|
|
16
16
|
the cloning strategy. That ``CloningStrategy`` can be loaded in the OpenCloning web interface
|
|
17
17
|
to see a representation of the cloning strategy.
|
|
18
18
|
|
|
19
|
+
|
|
20
|
+
Contributing
|
|
21
|
+
============
|
|
22
|
+
|
|
23
|
+
Not all fields can be readily serialized to be converted to regular types in pydantic. For
|
|
24
|
+
instance, the ``coordinates`` field of the ``GenomeCoordinatesSource`` class is a
|
|
25
|
+
``SimpleLocation`` object, or the ``input`` field of ``Source`` is a list of ``SourceInput``
|
|
26
|
+
objects, which can be ``Dseqrecord`` or ``Primer`` objects, or ``AssemblyFragment`` objects.
|
|
27
|
+
For these types of fields, you have to define a ``field_serializer`` method to serialize them
|
|
28
|
+
to the correct type.
|
|
29
|
+
|
|
19
30
|
"""
|
|
20
31
|
from __future__ import annotations
|
|
21
32
|
|
|
@@ -24,10 +35,11 @@ from pydantic_core import core_schema
|
|
|
24
35
|
from contextlib import contextmanager
|
|
25
36
|
from threading import local
|
|
26
37
|
|
|
27
|
-
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
38
|
+
from pydantic import BaseModel, ConfigDict, Field, field_serializer, field_validator
|
|
28
39
|
|
|
29
40
|
from opencloning_linkml.datamodel import (
|
|
30
41
|
CloningStrategy as _BaseCloningStrategy,
|
|
42
|
+
DatabaseSource as _DatabaseSource,
|
|
31
43
|
Primer as _PrimerModel,
|
|
32
44
|
Source as _Source,
|
|
33
45
|
TextFileSequence as _TextFileSequence,
|
|
@@ -47,12 +59,32 @@ from opencloning_linkml.datamodel import (
|
|
|
47
59
|
LigationSource as _LigationSource,
|
|
48
60
|
GatewaySource as _GatewaySource,
|
|
49
61
|
GatewayReactionType,
|
|
62
|
+
AnnotationTool,
|
|
50
63
|
HomologousRecombinationSource as _HomologousRecombinationSource,
|
|
51
64
|
CreLoxRecombinationSource as _CreLoxRecombinationSource,
|
|
52
65
|
PCRSource as _PCRSource,
|
|
53
66
|
CRISPRSource as _CRISPRSource,
|
|
67
|
+
RepositoryIdSource as _RepositoryIdSource,
|
|
68
|
+
UploadedFileSource as _UploadedFileSource,
|
|
69
|
+
AddgeneIdSource as _AddgeneIdSource,
|
|
70
|
+
AddgeneSequenceType,
|
|
71
|
+
BenchlingUrlSource as _BenchlingUrlSource,
|
|
72
|
+
SnapGenePlasmidSource as _SnapGenePlasmidSource,
|
|
73
|
+
EuroscarfSource as _EuroscarfSource,
|
|
74
|
+
WekWikGeneIdSource as _WekWikGeneIdSource,
|
|
75
|
+
SEVASource as _SEVASource,
|
|
76
|
+
IGEMSource as _IGEMSource,
|
|
77
|
+
OpenDNACollectionsSource as _OpenDNACollectionsSource,
|
|
78
|
+
GenomeCoordinatesSource as _GenomeCoordinatesSource,
|
|
79
|
+
OligoHybridizationSource as _OligoHybridizationSource,
|
|
80
|
+
PolymeraseExtensionSource as _PolymeraseExtensionSource,
|
|
81
|
+
AnnotationSource as _AnnotationSource,
|
|
82
|
+
AnnotationReport as _AnnotationReport,
|
|
83
|
+
PlannotateAnnotationReport as _PlannotateAnnotationReport,
|
|
84
|
+
ReverseComplementSource as _ReverseComplementSource,
|
|
85
|
+
NCBISequenceSource as _NCBISequenceSource,
|
|
54
86
|
)
|
|
55
|
-
from Bio.SeqFeature import Location, LocationParserError
|
|
87
|
+
from Bio.SeqFeature import Location, LocationParserError, SimpleLocation
|
|
56
88
|
from Bio.Restriction.Restriction import AbstractCut
|
|
57
89
|
import networkx as nx
|
|
58
90
|
from typing import List
|
|
@@ -78,8 +110,9 @@ def id_mode(use_python_internal_id: bool = True):
|
|
|
78
110
|
mapping them to the OpenCloning data model. If ``use_python_internal_id`` is True,
|
|
79
111
|
the built-in python ``id()`` function is used to assign ids to objects. That function
|
|
80
112
|
produces a unique integer for each object in python, so it's guaranteed to be unique.
|
|
81
|
-
If ``use_python_internal_id`` is False, the object's ``.id`` attribute
|
|
82
|
-
is used to assign ids to objects. This is useful
|
|
113
|
+
If ``use_python_internal_id`` is False, the object's ``.id`` attribute
|
|
114
|
+
(must be a string integer) is used to assign ids to objects. This is useful
|
|
115
|
+
when the objects already have meaningful ids,
|
|
83
116
|
and you want to keep references to them in ``SourceInput`` objects (which sequences and
|
|
84
117
|
primers are used in a particular source).
|
|
85
118
|
|
|
@@ -136,7 +169,6 @@ def get_id(obj: "Primer" | "Dseqrecord") -> int:
|
|
|
136
169
|
class SequenceLocationStr(str):
|
|
137
170
|
"""A string representation of a sequence location, genbank-like."""
|
|
138
171
|
|
|
139
|
-
# TODO: this should handle origin-spanning simple locations (splitted)
|
|
140
172
|
@classmethod
|
|
141
173
|
def from_biopython_location(cls, location: Location):
|
|
142
174
|
return cls(format_feature_location(location, None))
|
|
@@ -178,6 +210,14 @@ class SequenceLocationStr(str):
|
|
|
178
210
|
):
|
|
179
211
|
return cls.from_biopython_location(create_location(start, end, seq_len, strand))
|
|
180
212
|
|
|
213
|
+
def get_ncbi_format_coordinates(self) -> str:
|
|
214
|
+
"""Return start, end, strand in the same format as the NCBI eutils API (1-based, inclusive)"""
|
|
215
|
+
return (
|
|
216
|
+
self.to_biopython_location().start + 1,
|
|
217
|
+
self.to_biopython_location().end,
|
|
218
|
+
self.to_biopython_location().strand,
|
|
219
|
+
)
|
|
220
|
+
|
|
181
221
|
|
|
182
222
|
class ConfiguredBaseModel(BaseModel):
|
|
183
223
|
model_config = ConfigDict(
|
|
@@ -199,7 +239,7 @@ class TextFileSequence(_TextFileSequence):
|
|
|
199
239
|
id=get_id(dseqr),
|
|
200
240
|
sequence_file_format="genbank",
|
|
201
241
|
overhang_crick_3prime=dseqr.seq.ovhg,
|
|
202
|
-
overhang_watson_3prime=dseqr.seq.watson_ovhg
|
|
242
|
+
overhang_watson_3prime=dseqr.seq.watson_ovhg,
|
|
203
243
|
file_content=dseqr.format("genbank"),
|
|
204
244
|
)
|
|
205
245
|
|
|
@@ -261,18 +301,23 @@ class Source(ConfiguredBaseModel):
|
|
|
261
301
|
input: list[Union[SourceInput, AssemblyFragment]] = Field(default_factory=list)
|
|
262
302
|
TARGET_MODEL: ClassVar[Type[_Source]] = _Source
|
|
263
303
|
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
return
|
|
269
|
-
"id": seq_id,
|
|
270
|
-
"input": self.input_models(),
|
|
271
|
-
}
|
|
304
|
+
@field_serializer("input")
def serialize_input(
    self, input: list[Union[SourceInput, AssemblyFragment]]
) -> list[_SourceInput | _AssemblyFragment]:
    """Serialize ``input`` by calling ``to_pydantic_model()`` on each entry."""
    converted = [entry.to_pydantic_model() for entry in input]
    return converted
|
|
272
309
|
|
|
273
310
|
def to_pydantic_model(self, seq_id: int):
    """Build a ``TARGET_MODEL`` instance from this source's dumped fields.

    The result of ``model_dump()`` is passed as keyword arguments to
    ``TARGET_MODEL``, with ``id`` set to ``seq_id``.
    """
    # A later "id" entry replaces the dumped value in place, or is appended if absent.
    return self.TARGET_MODEL(**{**self.model_dump(), "id": seq_id})
|
|
314
|
+
|
|
315
|
+
def to_unserialized_dict(self):
    """
    Return a dictionary of this object's fields without serializing them.

    Each name in ``__pydantic_fields__`` is mapped to the attribute value as
    stored on the instance. This is used to be able to recast.
    """
    field_names = self.__pydantic_fields__
    return {name: getattr(self, name) for name in field_names}
|
|
276
321
|
|
|
277
322
|
def add_to_history_graph(self, history_graph: nx.DiGraph, seq: "Dseqrecord"):
|
|
278
323
|
"""
|
|
@@ -315,15 +360,6 @@ class AssemblySource(Source):
|
|
|
315
360
|
|
|
316
361
|
TARGET_MODEL: ClassVar[Type[_AssemblySource]] = _AssemblySource
|
|
317
362
|
|
|
318
|
-
def _kwargs(self, seq_id: int) -> dict:
|
|
319
|
-
return {
|
|
320
|
-
**super()._kwargs(seq_id),
|
|
321
|
-
"circular": self.circular,
|
|
322
|
-
}
|
|
323
|
-
|
|
324
|
-
def to_pydantic_model(self, seq_id: int):
|
|
325
|
-
return self.TARGET_MODEL(**self._kwargs(seq_id))
|
|
326
|
-
|
|
327
363
|
@classmethod
|
|
328
364
|
def from_subfragment_representation(
|
|
329
365
|
cls,
|
|
@@ -346,6 +382,90 @@ class AssemblySource(Source):
|
|
|
346
382
|
return AssemblySource(input=input_list, circular=is_circular)
|
|
347
383
|
|
|
348
384
|
|
|
385
|
+
# Source subclass paired with the ``DatabaseSource`` model imported from
# ``opencloning_linkml.datamodel``.
class DatabaseSource(Source):
    # Class instantiated by ``to_pydantic_model``.
    TARGET_MODEL: ClassVar[Type[_DatabaseSource]] = _DatabaseSource

    # Required integer; presumably the record's id in the database — TODO confirm.
    database_id: int
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
# Source subclass describing a sequence read from a file; paired with the
# ``UploadedFileSource`` model imported from ``opencloning_linkml.datamodel``.
class UploadedFileSource(Source):

    # Class instantiated by ``to_pydantic_model``.
    TARGET_MODEL: ClassVar[Type[_UploadedFileSource]] = _UploadedFileSource

    # Name/path of the file, as a string.
    file_name: str
    # Integer index of the sequence within the file.
    index_in_file: int
    # Name of the file format as a string (e.g. "genbank", "snapgene").
    sequence_file_format: str
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
# Source subclass identified by a repository id; base class of the
# repository-specific sources declared after it.
class RepositoryIdSource(Source):

    # Class instantiated by ``to_pydantic_model``.
    TARGET_MODEL: ClassVar[Type[_RepositoryIdSource]] = _RepositoryIdSource

    # Required identifier, stored as a string.
    repository_id: str
    # location: Location
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
class RepositoryIdSourceWithSequenceFileUrl(RepositoryIdSource):
    """
    Auxiliary class to avoid code duplication in the sources that have
    a sequence file url.
    """

    # Optional URL of the sequence file; defaults to None.
    sequence_file_url: Optional[str] = None
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
# Repository source paired with the ``AddgeneIdSource`` model; inherits
# ``repository_id`` and ``sequence_file_url`` from its base classes.
class AddgeneIdSource(RepositoryIdSourceWithSequenceFileUrl):
    TARGET_MODEL: ClassVar[Type[_AddgeneIdSource]] = _AddgeneIdSource

    # Optional ``AddgeneSequenceType`` value; defaults to None.
    addgene_sequence_type: Optional[AddgeneSequenceType] = None
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
# Repository source paired with the ``BenchlingUrlSource`` model; adds no fields
# of its own.
class BenchlingUrlSource(RepositoryIdSource):
    TARGET_MODEL: ClassVar[Type[_BenchlingUrlSource]] = _BenchlingUrlSource
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
# Repository source paired with the ``SnapGenePlasmidSource`` model; adds no
# fields of its own.
class SnapGenePlasmidSource(RepositoryIdSource):
    TARGET_MODEL: ClassVar[Type[_SnapGenePlasmidSource]] = _SnapGenePlasmidSource
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
# Repository source paired with the ``EuroscarfSource`` model; adds no fields of
# its own.
class EuroscarfSource(RepositoryIdSource):
    TARGET_MODEL: ClassVar[Type[_EuroscarfSource]] = _EuroscarfSource
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
# Repository source paired with the ``WekWikGeneIdSource`` model; inherits the
# optional ``sequence_file_url`` field and adds no fields of its own.
class WekWikGeneIdSource(RepositoryIdSourceWithSequenceFileUrl):
    TARGET_MODEL: ClassVar[Type[_WekWikGeneIdSource]] = _WekWikGeneIdSource
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
# Repository source paired with the ``SEVASource`` model; inherits the optional
# ``sequence_file_url`` field and adds no fields of its own.
class SEVASource(RepositoryIdSourceWithSequenceFileUrl):
    TARGET_MODEL: ClassVar[Type[_SEVASource]] = _SEVASource
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
# Repository source paired with the ``IGEMSource`` model; inherits the optional
# ``sequence_file_url`` field and adds no fields of its own.
class IGEMSource(RepositoryIdSourceWithSequenceFileUrl):
    TARGET_MODEL: ClassVar[Type[_IGEMSource]] = _IGEMSource
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
# Repository source paired with the ``OpenDNACollectionsSource`` model; inherits
# the optional ``sequence_file_url`` field and adds no fields of its own.
class OpenDNACollectionsSource(RepositoryIdSourceWithSequenceFileUrl):
    TARGET_MODEL: ClassVar[Type[_OpenDNACollectionsSource]] = _OpenDNACollectionsSource
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
# Repository source paired with the ``NCBISequenceSource`` model.
class NCBISequenceSource(RepositoryIdSource):
    TARGET_MODEL: ClassVar[Type[_NCBISequenceSource]] = _NCBISequenceSource
    # Optional Biopython ``SimpleLocation``; defaults to None.
    # NOTE(review): no ``field_serializer`` for ``coordinates`` is visible on this
    # class (only on the ``GenomeCoordinatesSource`` subclass) — confirm that a
    # non-None value is serialized as intended.
    coordinates: SimpleLocation | None = None
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
# NCBI sequence source paired with the ``GenomeCoordinatesSource`` model.
class GenomeCoordinatesSource(NCBISequenceSource):
    TARGET_MODEL: ClassVar[Type[_GenomeCoordinatesSource]] = _GenomeCoordinatesSource

    # Optional descriptors; all default to None.
    assembly_accession: Optional[str] = None
    locus_tag: Optional[str] = None
    gene_id: Optional[int] = None
    # Re-declared without a default, so it is required here (the parent class
    # declares it as optional).
    coordinates: SimpleLocation

    @field_serializer("coordinates")
    def serialize_coordinates(self, coordinates: SimpleLocation) -> str:
        # ``SequenceLocationStr`` is a ``str`` subclass, so the result is the
        # genbank-like string form of the location.
        return SequenceLocationStr.from_biopython_location(coordinates)
|
|
467
|
+
|
|
468
|
+
|
|
349
469
|
class RestrictionAndLigationSource(AssemblySource):
|
|
350
470
|
restriction_enzymes: list[AbstractCut]
|
|
351
471
|
|
|
@@ -353,11 +473,11 @@ class RestrictionAndLigationSource(AssemblySource):
|
|
|
353
473
|
_RestrictionAndLigationSource
|
|
354
474
|
)
|
|
355
475
|
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
476
|
+
@field_serializer("restriction_enzymes")
def serialize_restriction_enzymes(
    self, restriction_enzymes: list[AbstractCut]
) -> list[str]:
    """Serialize the enzymes as the list of their ``str()`` representations."""
    return list(map(str, restriction_enzymes))
|
|
361
481
|
|
|
362
482
|
|
|
363
483
|
class GibsonAssemblySource(AssemblySource):
|
|
@@ -387,13 +507,6 @@ class GatewaySource(AssemblySource):
|
|
|
387
507
|
reaction_type: GatewayReactionType
|
|
388
508
|
greedy: bool = Field(default=False)
|
|
389
509
|
|
|
390
|
-
def _kwargs(self, seq_id: int) -> dict:
|
|
391
|
-
return {
|
|
392
|
-
**super()._kwargs(seq_id),
|
|
393
|
-
"reaction_type": self.reaction_type,
|
|
394
|
-
"greedy": self.greedy,
|
|
395
|
-
}
|
|
396
|
-
|
|
397
510
|
|
|
398
511
|
class HomologousRecombinationSource(AssemblySource):
|
|
399
512
|
TARGET_MODEL: ClassVar[Type[_HomologousRecombinationSource]] = (
|
|
@@ -415,21 +528,24 @@ class PCRSource(AssemblySource):
|
|
|
415
528
|
TARGET_MODEL: ClassVar[Type[_PCRSource]] = _PCRSource
|
|
416
529
|
add_primer_features: bool = Field(default=False)
|
|
417
530
|
|
|
418
|
-
def _kwargs(self, seq_id: int) -> dict:
|
|
419
|
-
return {
|
|
420
|
-
**super()._kwargs(seq_id),
|
|
421
|
-
"add_primer_features": self.add_primer_features,
|
|
422
|
-
}
|
|
423
|
-
|
|
424
531
|
|
|
425
532
|
class SequenceCutSource(Source):
|
|
426
533
|
left_edge: CutSiteType | None
|
|
427
534
|
right_edge: CutSiteType | None
|
|
428
535
|
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
536
|
+
@property
def TARGET_MODEL(self):
    """Model class to instantiate for this cut.

    ``_RestrictionEnzymeDigestionSource`` when ``_has_enzyme()`` is true,
    otherwise ``_SequenceCutSource``.
    """
    if self._has_enzyme():
        return _RestrictionEnzymeDigestionSource
    return _SequenceCutSource
|
|
543
|
+
|
|
544
|
+
# Shared serializer for both edges: each value (possibly None) is converted by
# ``_cutsite_to_model``.
@field_serializer("left_edge", "right_edge")
def serialize_cut_site(
    self, cut_site: CutSiteType | None
) -> _RestrictionSequenceCut | _SequenceCut | None:
    return self._cutsite_to_model(cut_site)
|
|
433
549
|
|
|
434
550
|
@staticmethod
|
|
435
551
|
def _cutsite_to_model(cut_site: CutSiteType | None):
|
|
@@ -461,18 +577,31 @@ class SequenceCutSource(Source):
|
|
|
461
577
|
|
|
462
578
|
return has_enzyme(self.left_edge) or has_enzyme(self.right_edge)
|
|
463
579
|
|
|
464
|
-
def _target_model(self):
|
|
465
|
-
return self.ENZYME_MODEL if self._has_enzyme() else self.BASE_MODEL
|
|
466
580
|
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
"right_edge": self._cutsite_to_model(self.right_edge),
|
|
472
|
-
}
|
|
581
|
+
# Source subclass paired with the ``OligoHybridizationSource`` model imported
# from ``opencloning_linkml.datamodel``.
class OligoHybridizationSource(Source):
    TARGET_MODEL: ClassVar[Type[_OligoHybridizationSource]] = _OligoHybridizationSource

    # Optional integer overhang; defaults to None.
    overhang_crick_3prime: Optional[int] = None
|
|
473
585
|
|
|
474
|
-
|
|
475
|
-
|
|
586
|
+
|
|
587
|
+
# Source subclass paired with the ``PolymeraseExtensionSource`` model; adds no
# fields of its own.
class PolymeraseExtensionSource(Source):
    TARGET_MODEL: ClassVar[Type[_PolymeraseExtensionSource]] = (
        _PolymeraseExtensionSource
    )
|
|
591
|
+
|
|
592
|
+
|
|
593
|
+
# Source subclass paired with the ``AnnotationSource`` model imported from
# ``opencloning_linkml.datamodel``.
class AnnotationSource(Source):
    TARGET_MODEL: ClassVar[Type[_AnnotationSource]] = _AnnotationSource

    # Required ``AnnotationTool`` value.
    annotation_tool: AnnotationTool
    # Optional version string; defaults to None.
    annotation_tool_version: Optional[str] = None
    # Optional list of report entries of either report model; defaults to None.
    annotation_report: Optional[
        list[_AnnotationReport | _PlannotateAnnotationReport]
    ] = None
|
|
601
|
+
|
|
602
|
+
|
|
603
|
+
# Source subclass paired with the ``ReverseComplementSource`` model; adds no
# fields of its own.
class ReverseComplementSource(Source):
    TARGET_MODEL: ClassVar[Type[_ReverseComplementSource]] = _ReverseComplementSource
|
|
476
605
|
|
|
477
606
|
|
|
478
607
|
class CloningStrategy(_BaseCloningStrategy):
|
|
@@ -510,9 +639,7 @@ class CloningStrategy(_BaseCloningStrategy):
|
|
|
510
639
|
else:
|
|
511
640
|
self.add_primer(source_input.sequence)
|
|
512
641
|
else:
|
|
513
|
-
self.sources.append(
|
|
514
|
-
_ManuallyTypedSource(id=get_id(dseqr), input=[], user_input="A")
|
|
515
|
-
)
|
|
642
|
+
self.sources.append(_ManuallyTypedSource(id=get_id(dseqr), input=[]))
|
|
516
643
|
|
|
517
644
|
def reassign_ids(self):
|
|
518
645
|
all_ids = (
|
pydna/parsers.py
CHANGED
|
@@ -7,26 +7,23 @@
|
|
|
7
7
|
|
|
8
8
|
"""Provides two functions, parse and parse_primers"""
|
|
9
9
|
|
|
10
|
-
|
|
11
|
-
import
|
|
12
|
-
import
|
|
13
|
-
import textwrap as _textwrap
|
|
10
|
+
import re
|
|
11
|
+
import io
|
|
12
|
+
import textwrap
|
|
14
13
|
|
|
15
|
-
from Bio import SeqIO
|
|
16
|
-
|
|
17
|
-
from pydna.dseqrecord import Dseqrecord
|
|
18
|
-
from
|
|
14
|
+
from Bio import SeqIO
|
|
15
|
+
|
|
16
|
+
from pydna.dseqrecord import Dseqrecord
|
|
17
|
+
from Bio.SeqRecord import SeqRecord
|
|
18
|
+
from pydna.opencloning_models import UploadedFileSource
|
|
19
|
+
from pydna.primer import Primer
|
|
19
20
|
|
|
20
|
-
# from pydna.amplify import pcr as _pcr
|
|
21
|
-
# from copy import deepcopy as _deepcopy
|
|
22
|
-
# from Bio.SeqFeature import SeqFeature as _SeqFeature
|
|
23
|
-
# import xml.etree.ElementTree as _et
|
|
24
21
|
|
|
25
22
|
try:
|
|
26
|
-
from itertools import pairwise
|
|
23
|
+
from itertools import pairwise
|
|
27
24
|
except ImportError:
|
|
28
25
|
|
|
29
|
-
def
|
|
26
|
+
def pairwise(iterable):
|
|
30
27
|
# pairwise('ABCDEFG') → AB BC CD DE EF FG
|
|
31
28
|
iterator = iter(iterable)
|
|
32
29
|
a = next(iterator, None)
|
|
@@ -51,8 +48,8 @@ gb_fasta_embl_regex = (
|
|
|
51
48
|
|
|
52
49
|
def extract_from_text(text):
|
|
53
50
|
"""docstring."""
|
|
54
|
-
data =
|
|
55
|
-
mos = list(
|
|
51
|
+
data = textwrap.dedent(str(text))
|
|
52
|
+
mos = list(re.finditer(gb_fasta_embl_regex, data + "\n\n", flags=re.MULTILINE))
|
|
56
53
|
|
|
57
54
|
class Fakemo(object):
|
|
58
55
|
def start(self):
|
|
@@ -65,7 +62,7 @@ def extract_from_text(text):
|
|
|
65
62
|
|
|
66
63
|
gaps = []
|
|
67
64
|
|
|
68
|
-
for mo1, mo2 in
|
|
65
|
+
for mo1, mo2 in pairwise([mofirst] + mos + [molast]):
|
|
69
66
|
gaps.append(data[mo1.end() : mo2.start()])
|
|
70
67
|
|
|
71
68
|
return tuple(mo.group(0) for mo in mos), tuple(gaps)
|
|
@@ -85,19 +82,22 @@ def embl_gb_fasta(text):
|
|
|
85
82
|
# topology = "linear"
|
|
86
83
|
|
|
87
84
|
for chunk in chunks:
|
|
88
|
-
handle =
|
|
85
|
+
handle = io.StringIO(chunk)
|
|
89
86
|
# circular = False
|
|
90
87
|
first_line = chunk.splitlines()[0].lower().split()
|
|
91
88
|
try:
|
|
92
|
-
parsed =
|
|
89
|
+
parsed = SeqIO.read(handle, "embl")
|
|
90
|
+
parsed.annotations["pydna_parse_sequence_file_format"] = "embl"
|
|
93
91
|
except ValueError:
|
|
94
92
|
handle.seek(0)
|
|
95
93
|
try:
|
|
96
|
-
parsed =
|
|
94
|
+
parsed = SeqIO.read(handle, "genbank")
|
|
95
|
+
parsed.annotations["pydna_parse_sequence_file_format"] = "genbank"
|
|
97
96
|
except ValueError:
|
|
98
97
|
handle.seek(0)
|
|
99
98
|
try:
|
|
100
|
-
parsed =
|
|
99
|
+
parsed = SeqIO.read(handle, "fasta-blast")
|
|
100
|
+
parsed.annotations["pydna_parse_sequence_file_format"] = "fasta"
|
|
101
101
|
except ValueError:
|
|
102
102
|
handle.close()
|
|
103
103
|
continue
|
|
@@ -126,7 +126,7 @@ def embl_gb_fasta(text):
|
|
|
126
126
|
return tuple(result_list)
|
|
127
127
|
|
|
128
128
|
|
|
129
|
-
def parse(data, ds=True):
|
|
129
|
+
def parse(data, ds=True) -> list[Dseqrecord | SeqRecord]:
|
|
130
130
|
"""Return *all* DNA sequences found in data.
|
|
131
131
|
|
|
132
132
|
If no sequences are found, an empty list is returned. This is a greedy
|
|
@@ -191,15 +191,22 @@ def parse(data, ds=True):
|
|
|
191
191
|
path = item
|
|
192
192
|
finally:
|
|
193
193
|
newsequences = embl_gb_fasta(raw)
|
|
194
|
-
# nfs = [_SeqFeature() for f in parsed.features]
|
|
195
|
-
# for f, nf in zip(parsed.features, nfs):
|
|
196
|
-
# nf.__dict__ = _deepcopy(f.__dict__)
|
|
197
|
-
# parsed.features = nfs
|
|
198
194
|
for s in newsequences:
|
|
199
195
|
if ds and path:
|
|
200
|
-
|
|
196
|
+
from pydna.opencloning_models import UploadedFileSource
|
|
197
|
+
|
|
198
|
+
result = Dseqrecord.from_SeqRecord(s)
|
|
199
|
+
result.source = UploadedFileSource(
|
|
200
|
+
file_name=str(path), # we use str to handle PosixPath
|
|
201
|
+
sequence_file_format=s.annotations[
|
|
202
|
+
"pydna_parse_sequence_file_format"
|
|
203
|
+
],
|
|
204
|
+
index_in_file=0,
|
|
205
|
+
)
|
|
206
|
+
sequences.append(result)
|
|
207
|
+
# sequences.append(_GenbankFile.from_SeqRecord(s, path=path))
|
|
201
208
|
elif ds:
|
|
202
|
-
sequences.append(
|
|
209
|
+
sequences.append(Dseqrecord.from_SeqRecord(s))
|
|
203
210
|
else:
|
|
204
211
|
sequences.append(s)
|
|
205
212
|
return sequences
|
|
@@ -207,10 +214,10 @@ def parse(data, ds=True):
|
|
|
207
214
|
|
|
208
215
|
def parse_primers(data):
    """Return the sequences found in ``data`` as a list of ``Primer`` objects.

    ``data`` is handed to ``parse`` with ``ds=False`` and every record it
    returns is wrapped in ``Primer``.
    """
    records = parse(data, ds=False)
    return [Primer(record) for record in records]
|
|
211
218
|
|
|
212
219
|
|
|
213
|
-
def parse_snapgene(file_path: str) -> list[
|
|
220
|
+
def parse_snapgene(file_path: str) -> list[Dseqrecord]:
|
|
214
221
|
"""Parse a SnapGene file and return a list containing one Dseqrecord object.
|
|
215
222
|
|
|
216
223
|
Parameters
|
|
@@ -225,9 +232,15 @@ def parse_snapgene(file_path: str) -> list[_Dseqrecord]:
|
|
|
225
232
|
|
|
226
233
|
"""
|
|
227
234
|
with open(file_path, "rb") as f:
|
|
228
|
-
parsed_seq = next(
|
|
235
|
+
parsed_seq = next(SeqIO.parse(f, "snapgene"))
|
|
229
236
|
circular = (
|
|
230
237
|
"topology" in parsed_seq.annotations.keys()
|
|
231
238
|
and parsed_seq.annotations["topology"] == "circular"
|
|
232
239
|
)
|
|
233
|
-
|
|
240
|
+
|
|
241
|
+
source = UploadedFileSource(
|
|
242
|
+
file_name=str(file_path),
|
|
243
|
+
sequence_file_format="snapgene",
|
|
244
|
+
index_in_file=0,
|
|
245
|
+
)
|
|
246
|
+
return [Dseqrecord(parsed_seq, circular=circular, source=source)]
|
pydna/primer.py
CHANGED
|
@@ -7,11 +7,11 @@
|
|
|
7
7
|
|
|
8
8
|
"""This module provide the Primer class that is a subclass of the biopython SeqRecord."""
|
|
9
9
|
|
|
10
|
-
from pydna.seq import Seq
|
|
11
|
-
from pydna.seqrecord import SeqRecord
|
|
10
|
+
from pydna.seq import Seq
|
|
11
|
+
from pydna.seqrecord import SeqRecord
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
class Primer(
|
|
14
|
+
class Primer(SeqRecord):
|
|
15
15
|
"""Primer and its position on a template, footprint and tail."""
|
|
16
16
|
|
|
17
17
|
def __init__(
|
|
@@ -23,7 +23,7 @@ class Primer(_SeqRecord):
|
|
|
23
23
|
elif hasattr(record, "transcribe"): # Seq
|
|
24
24
|
super().__init__(record, *args, **kwargs)
|
|
25
25
|
else: # string?
|
|
26
|
-
super().__init__(
|
|
26
|
+
super().__init__(Seq(record), *args, **kwargs)
|
|
27
27
|
self.amplicon = amplicon
|
|
28
28
|
self.position = position
|
|
29
29
|
self._fp = footprint or len(record)
|