pydna 5.5.4__py3-none-any.whl → 5.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,124 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ This module contains the functions for oligonucleotide hybridization.
4
+ """
5
+
6
+ from pydna.common_sub_strings import common_sub_strings
7
+ from Bio.Seq import reverse_complement
8
+ from pydna.primer import Primer
9
+ from pydna.dseqrecord import Dseqrecord
10
+ from pydna.dseq import Dseq
11
+ from pydna.opencloning_models import OligoHybridizationSource, SourceInput
12
+
13
+
14
def oligonucleotide_hybridization_overhangs(
    fwd_oligo_seq: str, rvs_oligo_seq: str, minimal_annealing: int
) -> list[int]:
    """
    List the overhangs with which two oligos can anneal.

    The forward oligo is compared (case-insensitively) with the reverse
    complement of the reverse oligo. Every common stretch of at least
    ``minimal_annealing`` bases gives one overhang value, computed as the
    position of the stretch in the reverse-complemented reverse oligo minus
    its position in the forward oligo.

    see https://github.com/manulera/OpenCloning_backend/issues/302 for notation

    >>> from pydna.oligonucleotide_hybridization import oligonucleotide_hybridization_overhangs
    >>> oligonucleotide_hybridization_overhangs("ATGGC", "GCCAT", 3)
    [0]
    >>> oligonucleotide_hybridization_overhangs("aATGGC", "GCCAT", 5)
    [-1]
    >>> oligonucleotide_hybridization_overhangs("ATGGC", "GCCATa", 5)
    [1]
    >>> oligonucleotide_hybridization_overhangs("ATGGC", "GCCATaaGCCAT", 5)
    [0, 7]

    If the minimal annealing length is longer than the length of the shortest
    oligo, an empty list is returned.

    >>> oligonucleotide_hybridization_overhangs("ATGGC", "GCCATaaGCCAT", 100)
    []

    If the oligos can anneal over ``minimal_annealing`` bases, but only with
    mismatches, a ``ValueError`` is raised.

    >>> oligonucleotide_hybridization_overhangs("cATGGC", "GCCATa", 5)
    Traceback (most recent call last):
        ...
    ValueError: The oligonucleotides can anneal with mismatches
    """
    fwd_length = len(fwd_oligo_seq)
    rvs_length = len(rvs_oligo_seq)

    annealings = common_sub_strings(
        fwd_oligo_seq.lower(),
        reverse_complement(rvs_oligo_seq.lower()),
        minimal_annealing,
    )

    overhangs = []
    for start_fwd, start_rvs, size in annealings:
        # A mismatch-free annealing must run up to the start of at least one
        # oligo and up to the end of at least one oligo; otherwise the bases
        # flanking the common stretch face each other without pairing.
        touches_a_start = start_fwd == 0 or start_rvs == 0
        touches_an_end = (
            start_fwd + size >= fwd_length or start_rvs + size >= rvs_length
        )
        if not (touches_a_start and touches_an_end):
            raise ValueError("The oligonucleotides can anneal with mismatches")
        overhangs.append(start_rvs - start_fwd)

    return overhangs
61
+
62
+
63
def oligonucleotide_hybridization(
    fwd_primer: Primer, rvs_primer: Primer, minimal_annealing: int
) -> list[Dseqrecord]:
    """
    Hybridize two primers and return one Dseqrecord per possible annealing.

    Each returned record is built from the forward primer (watson strand) and
    the reverse primer (crick strand), and carries an
    ``OligoHybridizationSource`` that records the overhang and both primers.

    >>> from pydna.primer import Primer
    >>> from pydna.oligonucleotide_hybridization import oligonucleotide_hybridization
    >>> fwd_primer = Primer("ATGGC")
    >>> rvs_primer = Primer("GCCA")
    >>> oligonucleotide_hybridization(fwd_primer, rvs_primer, 3)[0].seq
    Dseq(-5)
    ATGGC
     ACCG

    Multiple values can be returned:

    >>> rvs_primer2 = Primer("GCCATaaGCCAT")
    >>> oligonucleotide_hybridization(fwd_primer, rvs_primer2, 3)[0].seq
    Dseq(-12)
    ATGGC
    TACCGaaTACCG
    >>> oligonucleotide_hybridization(fwd_primer, rvs_primer2, 3)[1].seq
    Dseq(-12)
           ATGGC
    TACCGaaTACCG

    If no possible overhangs are found, it returns an empty list.

    >>> oligonucleotide_hybridization(fwd_primer, rvs_primer, 100)
    []

    If there are mismatches given the minimal annealing length, it raises an error.

    >>> fwd_primer3 = Primer("cATGGC")
    >>> rvs_primer3 = Primer("GCCATa")
    >>> oligonucleotide_hybridization(fwd_primer3, rvs_primer3, 5)
    Traceback (most recent call last):
        ...
    ValueError: The oligonucleotides can anneal with mismatches
    """
    watson = str(fwd_primer.seq)
    crick = str(rvs_primer.seq)

    products = []
    for overhang in oligonucleotide_hybridization_overhangs(
        watson, crick, minimal_annealing
    ):
        source = OligoHybridizationSource(
            overhang_crick_3prime=overhang,
            input=[SourceInput(sequence=fwd_primer), SourceInput(sequence=rvs_primer)],
        )
        # The overhang is read back from the source so the sequence and its
        # provenance are built from the same stored value.
        duplex = Dseq(watson, crick, ovhg=source.overhang_crick_3prime)
        products.append(Dseqrecord(duplex, source=source))

    return products
@@ -16,6 +16,17 @@ sequence. You can also use the ``CloningStrategy`` class to create a JSON repres
16
16
  the cloning strategy. That ``CloningStrategy`` can be loaded in the OpenCloning web interface
17
17
  to see a representation of the cloning strategy.
18
18
 
19
+
20
+ Contributing
21
+ ============
22
+
23
+ Not all fields can be readily serialized to be converted to regular types in pydantic. For
24
+ instance, the ``coordinates`` field of the ``GenomeCoordinatesSource`` class is a
25
+ ``SimpleLocation`` object, or the ``input`` field of ``Source`` is a list of ``SourceInput``
26
+ objects, which can be ``Dseqrecord`` or ``Primer`` objects, or ``AssemblyFragment`` objects.
27
+ For these types of fields, you have to define a ``field_serializer`` method to serialize them
28
+ to the correct type.
29
+
19
30
  """
20
31
  from __future__ import annotations
21
32
 
@@ -24,10 +35,11 @@ from pydantic_core import core_schema
24
35
  from contextlib import contextmanager
25
36
  from threading import local
26
37
 
27
- from pydantic import BaseModel, ConfigDict, Field, field_validator
38
+ from pydantic import BaseModel, ConfigDict, Field, field_serializer, field_validator
28
39
 
29
40
  from opencloning_linkml.datamodel import (
30
41
  CloningStrategy as _BaseCloningStrategy,
42
+ DatabaseSource as _DatabaseSource,
31
43
  Primer as _PrimerModel,
32
44
  Source as _Source,
33
45
  TextFileSequence as _TextFileSequence,
@@ -47,12 +59,32 @@ from opencloning_linkml.datamodel import (
47
59
  LigationSource as _LigationSource,
48
60
  GatewaySource as _GatewaySource,
49
61
  GatewayReactionType,
62
+ AnnotationTool,
50
63
  HomologousRecombinationSource as _HomologousRecombinationSource,
51
64
  CreLoxRecombinationSource as _CreLoxRecombinationSource,
52
65
  PCRSource as _PCRSource,
53
66
  CRISPRSource as _CRISPRSource,
67
+ RepositoryIdSource as _RepositoryIdSource,
68
+ UploadedFileSource as _UploadedFileSource,
69
+ AddgeneIdSource as _AddgeneIdSource,
70
+ AddgeneSequenceType,
71
+ BenchlingUrlSource as _BenchlingUrlSource,
72
+ SnapGenePlasmidSource as _SnapGenePlasmidSource,
73
+ EuroscarfSource as _EuroscarfSource,
74
+ WekWikGeneIdSource as _WekWikGeneIdSource,
75
+ SEVASource as _SEVASource,
76
+ IGEMSource as _IGEMSource,
77
+ OpenDNACollectionsSource as _OpenDNACollectionsSource,
78
+ GenomeCoordinatesSource as _GenomeCoordinatesSource,
79
+ OligoHybridizationSource as _OligoHybridizationSource,
80
+ PolymeraseExtensionSource as _PolymeraseExtensionSource,
81
+ AnnotationSource as _AnnotationSource,
82
+ AnnotationReport as _AnnotationReport,
83
+ PlannotateAnnotationReport as _PlannotateAnnotationReport,
84
+ ReverseComplementSource as _ReverseComplementSource,
85
+ NCBISequenceSource as _NCBISequenceSource,
54
86
  )
55
- from Bio.SeqFeature import Location, LocationParserError
87
+ from Bio.SeqFeature import Location, LocationParserError, SimpleLocation
56
88
  from Bio.Restriction.Restriction import AbstractCut
57
89
  import networkx as nx
58
90
  from typing import List
@@ -78,8 +110,9 @@ def id_mode(use_python_internal_id: bool = True):
78
110
  mapping them to the OpenCloning data model. If ``use_python_internal_id`` is True,
79
111
  the built-in python ``id()`` function is used to assign ids to objects. That function
80
112
  produces a unique integer for each object in python, so it's guaranteed to be unique.
81
- If ``use_python_internal_id`` is False, the object's ``.id`` attribute (must be a string integer)
82
- is used to assign ids to objects. This is useful when the objects already have meaningful ids,
113
+ If ``use_python_internal_id`` is False, the object's ``.id`` attribute
114
+ (must be a string integer) is used to assign ids to objects. This is useful
115
+ when the objects already have meaningful ids,
83
116
  and you want to keep references to them in ``SourceInput`` objects (which sequences and
84
117
  primers are used in a particular source).
85
118
 
@@ -136,7 +169,6 @@ def get_id(obj: "Primer" | "Dseqrecord") -> int:
136
169
  class SequenceLocationStr(str):
137
170
  """A string representation of a sequence location, genbank-like."""
138
171
 
139
- # TODO: this should handle origin-spanning simple locations (splitted)
140
172
  @classmethod
141
173
  def from_biopython_location(cls, location: Location):
142
174
  return cls(format_feature_location(location, None))
@@ -178,6 +210,14 @@ class SequenceLocationStr(str):
178
210
  ):
179
211
  return cls.from_biopython_location(create_location(start, end, seq_len, strand))
180
212
 
213
def get_ncbi_format_coordinates(self) -> tuple[int, int, int | None]:
    """Return ``(start, end, strand)`` in the same format as the NCBI eutils API.

    The coordinates are 1-based and inclusive: the start of the biopython
    location is shifted by one, while its end and strand are passed through
    unchanged.

    Returns
    -------
    tuple
        ``(start, end, strand)`` of the location represented by this string.
    """
    # Parse the location string once instead of once per returned component.
    location = self.to_biopython_location()
    return (location.start + 1, location.end, location.strand)
220
+
181
221
 
182
222
  class ConfiguredBaseModel(BaseModel):
183
223
  model_config = ConfigDict(
@@ -199,7 +239,7 @@ class TextFileSequence(_TextFileSequence):
199
239
  id=get_id(dseqr),
200
240
  sequence_file_format="genbank",
201
241
  overhang_crick_3prime=dseqr.seq.ovhg,
202
- overhang_watson_3prime=dseqr.seq.watson_ovhg(),
242
+ overhang_watson_3prime=dseqr.seq.watson_ovhg,
203
243
  file_content=dseqr.format("genbank"),
204
244
  )
205
245
 
@@ -261,18 +301,23 @@ class Source(ConfiguredBaseModel):
261
301
  input: list[Union[SourceInput, AssemblyFragment]] = Field(default_factory=list)
262
302
  TARGET_MODEL: ClassVar[Type[_Source]] = _Source
263
303
 
264
- def input_models(self):
265
- return [fragment.to_pydantic_model() for fragment in self.input]
266
-
267
- def _kwargs(self, seq_id: int) -> dict:
268
- return {
269
- "id": seq_id,
270
- "input": self.input_models(),
271
- }
304
@field_serializer("input")
def serialize_input(
    self,
    input: list[Union[SourceInput, AssemblyFragment]],
) -> list[_SourceInput | _AssemblyFragment]:
    """Serialize ``input`` by converting every element with its own ``to_pydantic_model()``."""
    return [element.to_pydantic_model() for element in input]
272
309
 
273
310
  def to_pydantic_model(self, seq_id: int):
274
- kwargs = self._kwargs(seq_id)
275
- return self.TARGET_MODEL(**kwargs)
311
+ model_dict = self.model_dump()
312
+ model_dict["id"] = seq_id
313
+ return self.TARGET_MODEL(**model_dict)
314
+
315
def to_unserialized_dict(self):
    """
    Return a ``{field name: value}`` dictionary with the raw attribute values.

    Unlike ``model_dump()``, no field serializer is applied: each value is the
    object currently stored on the instance. This is used to be able to recast.
    """
    raw_values = {}
    for field_name in self.__pydantic_fields__:
        raw_values[field_name] = getattr(self, field_name)
    return raw_values
276
321
 
277
322
  def add_to_history_graph(self, history_graph: nx.DiGraph, seq: "Dseqrecord"):
278
323
  """
@@ -315,15 +360,6 @@ class AssemblySource(Source):
315
360
 
316
361
  TARGET_MODEL: ClassVar[Type[_AssemblySource]] = _AssemblySource
317
362
 
318
- def _kwargs(self, seq_id: int) -> dict:
319
- return {
320
- **super()._kwargs(seq_id),
321
- "circular": self.circular,
322
- }
323
-
324
- def to_pydantic_model(self, seq_id: int):
325
- return self.TARGET_MODEL(**self._kwargs(seq_id))
326
-
327
363
  @classmethod
328
364
  def from_subfragment_representation(
329
365
  cls,
@@ -346,6 +382,90 @@ class AssemblySource(Source):
346
382
  return AssemblySource(input=input_list, circular=is_circular)
347
383
 
348
384
 
385
class DatabaseSource(Source):
    """Source identified by a ``database_id``.

    Exported through ``TARGET_MODEL`` as the opencloning_linkml
    ``DatabaseSource`` model.
    """

    TARGET_MODEL: ClassVar[Type[_DatabaseSource]] = _DatabaseSource

    # Integer identifier; presumably the id of the entry in the external
    # database (confirm against the opencloning_linkml data model).
    database_id: int
389
+
390
+
391
class UploadedFileSource(Source):
    """Source describing a sequence that was read from a file.

    Exported through ``TARGET_MODEL`` as the opencloning_linkml
    ``UploadedFileSource`` model.
    """

    TARGET_MODEL: ClassVar[Type[_UploadedFileSource]] = _UploadedFileSource

    # Name (or path, as a string) of the file the sequence came from.
    file_name: str
    # Index of the sequence within the file.
    index_in_file: int
    # Format the file was parsed as (e.g. "genbank", "snapgene").
    sequence_file_format: str
398
+
399
+
400
class RepositoryIdSource(Source):
    """Source identified by a string ``repository_id``.

    Exported through ``TARGET_MODEL`` as the opencloning_linkml
    ``RepositoryIdSource`` model. The repository-specific sources below
    derive from this class.
    """

    TARGET_MODEL: ClassVar[Type[_RepositoryIdSource]] = _RepositoryIdSource

    # Identifier of the entry in the repository.
    repository_id: str
    # location: Location
406
+
407
+
408
class RepositoryIdSourceWithSequenceFileUrl(RepositoryIdSource):
    """
    Auxiliary class to avoid code duplication in the sources that have
    a sequence file url.

    It only adds the optional ``sequence_file_url`` field and does not set a
    ``TARGET_MODEL`` of its own; concrete subclasses are expected to do so.
    """

    # URL of the sequence file; optional, defaults to None.
    sequence_file_url: Optional[str] = None
415
+
416
+
417
class AddgeneIdSource(RepositoryIdSourceWithSequenceFileUrl):
    """Repository source exported as the opencloning_linkml ``AddgeneIdSource`` model.

    Besides the inherited ``repository_id`` and ``sequence_file_url`` it can
    record the Addgene sequence type.
    """

    TARGET_MODEL: ClassVar[Type[_AddgeneIdSource]] = _AddgeneIdSource

    # One of the ``AddgeneSequenceType`` values; optional, defaults to None.
    addgene_sequence_type: Optional[AddgeneSequenceType] = None
421
+
422
+
423
class BenchlingUrlSource(RepositoryIdSource):
    """Repository source exported as the opencloning_linkml ``BenchlingUrlSource`` model.

    Adds no fields to ``RepositoryIdSource``.
    """

    TARGET_MODEL: ClassVar[Type[_BenchlingUrlSource]] = _BenchlingUrlSource
425
+
426
+
427
class SnapGenePlasmidSource(RepositoryIdSource):
    """Repository source exported as the opencloning_linkml ``SnapGenePlasmidSource`` model.

    Adds no fields to ``RepositoryIdSource``.
    """

    TARGET_MODEL: ClassVar[Type[_SnapGenePlasmidSource]] = _SnapGenePlasmidSource
429
+
430
+
431
class EuroscarfSource(RepositoryIdSource):
    """Repository source exported as the opencloning_linkml ``EuroscarfSource`` model.

    Adds no fields to ``RepositoryIdSource``.
    """

    TARGET_MODEL: ClassVar[Type[_EuroscarfSource]] = _EuroscarfSource
433
+
434
+
435
class WekWikGeneIdSource(RepositoryIdSourceWithSequenceFileUrl):
    """Repository source exported as the opencloning_linkml ``WekWikGeneIdSource`` model.

    Adds no fields to ``RepositoryIdSourceWithSequenceFileUrl``.
    """

    TARGET_MODEL: ClassVar[Type[_WekWikGeneIdSource]] = _WekWikGeneIdSource
437
+
438
+
439
class SEVASource(RepositoryIdSourceWithSequenceFileUrl):
    """Repository source exported as the opencloning_linkml ``SEVASource`` model.

    Adds no fields to ``RepositoryIdSourceWithSequenceFileUrl``.
    """

    TARGET_MODEL: ClassVar[Type[_SEVASource]] = _SEVASource
441
+
442
+
443
class IGEMSource(RepositoryIdSourceWithSequenceFileUrl):
    """Repository source exported as the opencloning_linkml ``IGEMSource`` model.

    Adds no fields to ``RepositoryIdSourceWithSequenceFileUrl``.
    """

    TARGET_MODEL: ClassVar[Type[_IGEMSource]] = _IGEMSource
445
+
446
+
447
class OpenDNACollectionsSource(RepositoryIdSourceWithSequenceFileUrl):
    """Repository source exported as the opencloning_linkml ``OpenDNACollectionsSource`` model.

    Adds no fields to ``RepositoryIdSourceWithSequenceFileUrl``.
    """

    TARGET_MODEL: ClassVar[Type[_OpenDNACollectionsSource]] = _OpenDNACollectionsSource
449
+
450
+
451
class NCBISequenceSource(RepositoryIdSource):
    """Repository source exported as the opencloning_linkml ``NCBISequenceSource`` model.

    Besides the inherited ``repository_id`` it can hold the ``coordinates`` of
    a region, stored as a biopython ``SimpleLocation``.
    """

    TARGET_MODEL: ClassVar[Type[_NCBISequenceSource]] = _NCBISequenceSource
    # Optional location; ``None`` when no coordinates are given.
    coordinates: SimpleLocation | None = None

    @field_serializer("coordinates")
    def serialize_coordinates(
        self, coordinates: SimpleLocation | None
    ) -> str | None:
        """Serialize ``coordinates`` to a genbank-like location string.

        ``SimpleLocation`` is not a regular type for pydantic, so without this
        serializer ``model_dump()`` would hand the raw object to
        ``TARGET_MODEL``. ``None`` is passed through unchanged. The method
        name matches the serializer defined in ``GenomeCoordinatesSource`` so
        that pydantic treats the subclass definition as an override.
        """
        if coordinates is None:
            return None
        return SequenceLocationStr.from_biopython_location(coordinates)
454
+
455
+
456
class GenomeCoordinatesSource(NCBISequenceSource):
    """Source exported as the opencloning_linkml ``GenomeCoordinatesSource`` model.

    Extends ``NCBISequenceSource`` with optional assembly / locus / gene
    identifiers and makes ``coordinates`` mandatory.
    """

    TARGET_MODEL: ClassVar[Type[_GenomeCoordinatesSource]] = _GenomeCoordinatesSource

    # Optional identifiers, all defaulting to None.
    assembly_accession: Optional[str] = None
    locus_tag: Optional[str] = None
    gene_id: Optional[int] = None
    # Redeclared without a default: required here, optional on the parent class.
    coordinates: SimpleLocation

    @field_serializer("coordinates")
    def serialize_coordinates(self, coordinates: SimpleLocation) -> str:
        """Serialize ``coordinates`` with ``SequenceLocationStr.from_biopython_location``."""
        return SequenceLocationStr.from_biopython_location(coordinates)
467
+
468
+
349
469
  class RestrictionAndLigationSource(AssemblySource):
350
470
  restriction_enzymes: list[AbstractCut]
351
471
 
@@ -353,11 +473,11 @@ class RestrictionAndLigationSource(AssemblySource):
353
473
  _RestrictionAndLigationSource
354
474
  )
355
475
 
356
- def _kwargs(self, seq_id: int) -> dict:
357
- return {
358
- **super()._kwargs(seq_id),
359
- "restriction_enzymes": [str(enzyme) for enzyme in self.restriction_enzymes],
360
- }
476
@field_serializer("restriction_enzymes")
def serialize_restriction_enzymes(
    self,
    restriction_enzymes: list[AbstractCut],
) -> list[str]:
    """Serialize the enzymes as the list of their ``str()`` representations."""
    return list(map(str, restriction_enzymes))
361
481
 
362
482
 
363
483
  class GibsonAssemblySource(AssemblySource):
@@ -387,13 +507,6 @@ class GatewaySource(AssemblySource):
387
507
  reaction_type: GatewayReactionType
388
508
  greedy: bool = Field(default=False)
389
509
 
390
- def _kwargs(self, seq_id: int) -> dict:
391
- return {
392
- **super()._kwargs(seq_id),
393
- "reaction_type": self.reaction_type,
394
- "greedy": self.greedy,
395
- }
396
-
397
510
 
398
511
  class HomologousRecombinationSource(AssemblySource):
399
512
  TARGET_MODEL: ClassVar[Type[_HomologousRecombinationSource]] = (
@@ -415,21 +528,24 @@ class PCRSource(AssemblySource):
415
528
  TARGET_MODEL: ClassVar[Type[_PCRSource]] = _PCRSource
416
529
  add_primer_features: bool = Field(default=False)
417
530
 
418
- def _kwargs(self, seq_id: int) -> dict:
419
- return {
420
- **super()._kwargs(seq_id),
421
- "add_primer_features": self.add_primer_features,
422
- }
423
-
424
531
 
425
532
  class SequenceCutSource(Source):
426
533
  left_edge: CutSiteType | None
427
534
  right_edge: CutSiteType | None
428
535
 
429
- BASE_MODEL: ClassVar[Type[_SequenceCutSource]] = _SequenceCutSource
430
- ENZYME_MODEL: ClassVar[Type[_RestrictionEnzymeDigestionSource]] = (
431
- _RestrictionEnzymeDigestionSource
432
- )
536
@property
def TARGET_MODEL(self):
    """Model class to export to, chosen per instance.

    ``_RestrictionEnzymeDigestionSource`` when ``_has_enzyme()`` is true,
    ``_SequenceCutSource`` otherwise.
    """
    if self._has_enzyme():
        return _RestrictionEnzymeDigestionSource
    return _SequenceCutSource
543
+
544
@field_serializer("left_edge", "right_edge")
def serialize_cut_site(
    self, cut_site: CutSiteType | None
) -> _RestrictionSequenceCut | _SequenceCut | None:
    """Serialize ``left_edge`` / ``right_edge`` by delegating to ``_cutsite_to_model``."""
    return self._cutsite_to_model(cut_site)
433
549
 
434
550
  @staticmethod
435
551
  def _cutsite_to_model(cut_site: CutSiteType | None):
@@ -461,18 +577,31 @@ class SequenceCutSource(Source):
461
577
 
462
578
  return has_enzyme(self.left_edge) or has_enzyme(self.right_edge)
463
579
 
464
- def _target_model(self):
465
- return self.ENZYME_MODEL if self._has_enzyme() else self.BASE_MODEL
466
580
 
467
- def _kwargs(self, seq_id: int) -> dict:
468
- return {
469
- **super()._kwargs(seq_id),
470
- "left_edge": self._cutsite_to_model(self.left_edge),
471
- "right_edge": self._cutsite_to_model(self.right_edge),
472
- }
581
class OligoHybridizationSource(Source):
    """Source exported as the opencloning_linkml ``OligoHybridizationSource`` model.

    Used for sequences produced by hybridizing two oligonucleotides.
    """

    TARGET_MODEL: ClassVar[Type[_OligoHybridizationSource]] = _OligoHybridizationSource

    # Overhang of the hybridized product; optional, defaults to None.
    # ``pydna.oligonucleotide_hybridization`` passes this value as the ``ovhg``
    # of the resulting ``Dseq``.
    overhang_crick_3prime: Optional[int] = None
473
585
 
474
- def to_pydantic_model(self, seq_id: int):
475
- return self._target_model()(**self._kwargs(seq_id))
586
+
587
class PolymeraseExtensionSource(Source):
    """Source exported as the opencloning_linkml ``PolymeraseExtensionSource`` model.

    Adds no fields to ``Source``.
    """

    TARGET_MODEL: ClassVar[Type[_PolymeraseExtensionSource]] = (
        _PolymeraseExtensionSource
    )
591
+
592
+
593
class AnnotationSource(Source):
    """Source exported as the opencloning_linkml ``AnnotationSource`` model.

    Records which annotation tool was used and, optionally, its version and
    report.
    """

    TARGET_MODEL: ClassVar[Type[_AnnotationSource]] = _AnnotationSource

    # Required: one of the ``AnnotationTool`` values.
    annotation_tool: AnnotationTool
    # Optional version string of the tool; defaults to None.
    annotation_tool_version: Optional[str] = None
    # Optional list of report entries (opencloning_linkml models); defaults to None.
    annotation_report: Optional[
        list[_AnnotationReport | _PlannotateAnnotationReport]
    ] = None
601
+
602
+
603
class ReverseComplementSource(Source):
    """Source exported as the opencloning_linkml ``ReverseComplementSource`` model.

    Adds no fields to ``Source``.
    """

    TARGET_MODEL: ClassVar[Type[_ReverseComplementSource]] = _ReverseComplementSource
476
605
 
477
606
 
478
607
  class CloningStrategy(_BaseCloningStrategy):
@@ -510,9 +639,7 @@ class CloningStrategy(_BaseCloningStrategy):
510
639
  else:
511
640
  self.add_primer(source_input.sequence)
512
641
  else:
513
- self.sources.append(
514
- _ManuallyTypedSource(id=get_id(dseqr), input=[], user_input="A")
515
- )
642
+ self.sources.append(_ManuallyTypedSource(id=get_id(dseqr), input=[]))
516
643
 
517
644
  def reassign_ids(self):
518
645
  all_ids = (
pydna/parsers.py CHANGED
@@ -7,26 +7,23 @@
7
7
 
8
8
  """Provides two functions, parse and parse_primers"""
9
9
 
10
- # import os as _os
11
- import re as _re
12
- import io as _io
13
- import textwrap as _textwrap
10
+ import re
11
+ import io
12
+ import textwrap
14
13
 
15
- from Bio import SeqIO as _SeqIO
16
- from pydna.genbankfile import GenbankFile as _GenbankFile
17
- from pydna.dseqrecord import Dseqrecord as _Dseqrecord
18
- from pydna.primer import Primer as _Primer
14
+ from Bio import SeqIO
15
+
16
+ from pydna.dseqrecord import Dseqrecord
17
+ from Bio.SeqRecord import SeqRecord
18
+ from pydna.opencloning_models import UploadedFileSource
19
+ from pydna.primer import Primer
19
20
 
20
- # from pydna.amplify import pcr as _pcr
21
- # from copy import deepcopy as _deepcopy
22
- # from Bio.SeqFeature import SeqFeature as _SeqFeature
23
- # import xml.etree.ElementTree as _et
24
21
 
25
22
  try:
26
- from itertools import pairwise as _pairwise
23
+ from itertools import pairwise
27
24
  except ImportError:
28
25
 
29
- def _pairwise(iterable):
26
+ def pairwise(iterable):
30
27
  # pairwise('ABCDEFG') → AB BC CD DE EF FG
31
28
  iterator = iter(iterable)
32
29
  a = next(iterator, None)
@@ -51,8 +48,8 @@ gb_fasta_embl_regex = (
51
48
 
52
49
  def extract_from_text(text):
53
50
  """docstring."""
54
- data = _textwrap.dedent(str(text))
55
- mos = list(_re.finditer(gb_fasta_embl_regex, data + "\n\n", flags=_re.MULTILINE))
51
+ data = textwrap.dedent(str(text))
52
+ mos = list(re.finditer(gb_fasta_embl_regex, data + "\n\n", flags=re.MULTILINE))
56
53
 
57
54
  class Fakemo(object):
58
55
  def start(self):
@@ -65,7 +62,7 @@ def extract_from_text(text):
65
62
 
66
63
  gaps = []
67
64
 
68
- for mo1, mo2 in _pairwise([mofirst] + mos + [molast]):
65
+ for mo1, mo2 in pairwise([mofirst] + mos + [molast]):
69
66
  gaps.append(data[mo1.end() : mo2.start()])
70
67
 
71
68
  return tuple(mo.group(0) for mo in mos), tuple(gaps)
@@ -85,19 +82,22 @@ def embl_gb_fasta(text):
85
82
  # topology = "linear"
86
83
 
87
84
  for chunk in chunks:
88
- handle = _io.StringIO(chunk)
85
+ handle = io.StringIO(chunk)
89
86
  # circular = False
90
87
  first_line = chunk.splitlines()[0].lower().split()
91
88
  try:
92
- parsed = _SeqIO.read(handle, "embl")
89
+ parsed = SeqIO.read(handle, "embl")
90
+ parsed.annotations["pydna_parse_sequence_file_format"] = "embl"
93
91
  except ValueError:
94
92
  handle.seek(0)
95
93
  try:
96
- parsed = _SeqIO.read(handle, "genbank")
94
+ parsed = SeqIO.read(handle, "genbank")
95
+ parsed.annotations["pydna_parse_sequence_file_format"] = "genbank"
97
96
  except ValueError:
98
97
  handle.seek(0)
99
98
  try:
100
- parsed = _SeqIO.read(handle, "fasta-blast")
99
+ parsed = SeqIO.read(handle, "fasta-blast")
100
+ parsed.annotations["pydna_parse_sequence_file_format"] = "fasta"
101
101
  except ValueError:
102
102
  handle.close()
103
103
  continue
@@ -126,7 +126,7 @@ def embl_gb_fasta(text):
126
126
  return tuple(result_list)
127
127
 
128
128
 
129
- def parse(data, ds=True):
129
+ def parse(data, ds=True) -> list[Dseqrecord | SeqRecord]:
130
130
  """Return *all* DNA sequences found in data.
131
131
 
132
132
  If no sequences are found, an empty list is returned. This is a greedy
@@ -191,15 +191,22 @@ def parse(data, ds=True):
191
191
  path = item
192
192
  finally:
193
193
  newsequences = embl_gb_fasta(raw)
194
- # nfs = [_SeqFeature() for f in parsed.features]
195
- # for f, nf in zip(parsed.features, nfs):
196
- # nf.__dict__ = _deepcopy(f.__dict__)
197
- # parsed.features = nfs
198
194
  for s in newsequences:
199
195
  if ds and path:
200
- sequences.append(_GenbankFile.from_SeqRecord(s, path=path))
196
+ from pydna.opencloning_models import UploadedFileSource
197
+
198
+ result = Dseqrecord.from_SeqRecord(s)
199
+ result.source = UploadedFileSource(
200
+ file_name=str(path), # we use str to handle PosixPath
201
+ sequence_file_format=s.annotations[
202
+ "pydna_parse_sequence_file_format"
203
+ ],
204
+ index_in_file=0,
205
+ )
206
+ sequences.append(result)
207
+ # sequences.append(_GenbankFile.from_SeqRecord(s, path=path))
201
208
  elif ds:
202
- sequences.append(_Dseqrecord.from_SeqRecord(s))
209
+ sequences.append(Dseqrecord.from_SeqRecord(s))
203
210
  else:
204
211
  sequences.append(s)
205
212
  return sequences
@@ -207,10 +214,10 @@ def parse(data, ds=True):
207
214
 
208
215
  def parse_primers(data):
209
216
  """docstring."""
210
- return [_Primer(x) for x in parse(data, ds=False)]
217
+ return [Primer(x) for x in parse(data, ds=False)]
211
218
 
212
219
 
213
- def parse_snapgene(file_path: str) -> list[_Dseqrecord]:
220
+ def parse_snapgene(file_path: str) -> list[Dseqrecord]:
214
221
  """Parse a SnapGene file and return a Dseqrecord object.
215
222
 
216
223
  Parameters
@@ -225,9 +232,15 @@ def parse_snapgene(file_path: str) -> list[_Dseqrecord]:
225
232
 
226
233
  """
227
234
  with open(file_path, "rb") as f:
228
- parsed_seq = next(_SeqIO.parse(f, "snapgene"))
235
+ parsed_seq = next(SeqIO.parse(f, "snapgene"))
229
236
  circular = (
230
237
  "topology" in parsed_seq.annotations.keys()
231
238
  and parsed_seq.annotations["topology"] == "circular"
232
239
  )
233
- return [_Dseqrecord(parsed_seq, circular=circular)]
240
+
241
+ source = UploadedFileSource(
242
+ file_name=str(file_path),
243
+ sequence_file_format="snapgene",
244
+ index_in_file=0,
245
+ )
246
+ return [Dseqrecord(parsed_seq, circular=circular, source=source)]
pydna/primer.py CHANGED
@@ -7,11 +7,11 @@
7
7
 
8
8
  """This module provide the Primer class that is a subclass of the biopython SeqRecord."""
9
9
 
10
- from pydna.seq import Seq as _Seq
11
- from pydna.seqrecord import SeqRecord as _SeqRecord
10
+ from pydna.seq import Seq
11
+ from pydna.seqrecord import SeqRecord
12
12
 
13
13
 
14
- class Primer(_SeqRecord):
14
+ class Primer(SeqRecord):
15
15
  """Primer and its position on a template, footprint and tail."""
16
16
 
17
17
  def __init__(
@@ -23,7 +23,7 @@ class Primer(_SeqRecord):
23
23
  elif hasattr(record, "transcribe"): # Seq
24
24
  super().__init__(record, *args, **kwargs)
25
25
  else: # string?
26
- super().__init__(_Seq(record), *args, **kwargs)
26
+ super().__init__(Seq(record), *args, **kwargs)
27
27
  self.amplicon = amplicon
28
28
  self.position = position
29
29
  self._fp = footprint or len(record)