@datagrok/bio 2.11.30 → 2.11.34
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +19 -0
- package/dist/36.js +1 -1
- package/dist/36.js.map +1 -1
- package/dist/42.js +1 -1
- package/dist/42.js.map +1 -1
- package/dist/590.js +2 -0
- package/dist/590.js.map +1 -0
- package/dist/709.js +1 -2
- package/dist/709.js.map +1 -1
- package/dist/79.js.map +1 -1
- package/dist/895.js +3 -0
- package/dist/895.js.map +1 -0
- package/dist/package-test.js +8 -1
- package/dist/package-test.js.LICENSE.txt +1 -0
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +8 -1
- package/dist/package.js.LICENSE.txt +1 -0
- package/dist/package.js.map +1 -1
- package/files/{data → monomer-libraries}/HELMCoreLibrary.json +594 -594
- package/files/tests/libraries/HELMmonomerSchema.json +96 -0
- package/package.json +13 -11
- package/scripts/sequence_generator.md +48 -0
- package/scripts/sequence_generator.py +515 -256
- package/src/package-test.ts +4 -0
- package/src/package.ts +26 -24
- package/src/tests/WebLogo-layout-tests.ts +37 -0
- package/src/tests/WebLogo-positions-test.ts +5 -0
- package/src/tests/WebLogo-project-tests.ts +63 -0
- package/src/tests/activity-cliffs-tests.ts +3 -2
- package/src/tests/monomer-libraries-tests.ts +7 -4
- package/src/tests/scoring.ts +3 -2
- package/src/tests/substructure-filters-tests.ts +3 -2
- package/src/tests/to-atomic-level-tests.ts +3 -2
- package/src/utils/helm-to-molfile.ts +3 -3
- package/src/utils/monomer-lib/lib-manager.ts +116 -0
- package/src/utils/monomer-lib/library-file-manager/consts.ts +1 -0
- package/src/utils/monomer-lib/library-file-manager/custom-monomer-lib-handlers.ts +80 -0
- package/src/utils/monomer-lib/library-file-manager/event-manager.ts +58 -0
- package/src/utils/monomer-lib/library-file-manager/file-manager.ts +187 -0
- package/src/utils/monomer-lib/library-file-manager/file-validator.ts +56 -0
- package/src/utils/monomer-lib/library-file-manager/style.css +8 -0
- package/src/utils/monomer-lib/library-file-manager/ui.ts +224 -0
- package/src/utils/monomer-lib/monomer-lib.ts +114 -0
- package/src/utils/poly-tool/const.ts +28 -0
- package/src/utils/poly-tool/monomer-lib-handler.ts +115 -0
- package/src/utils/poly-tool/types.ts +6 -0
- package/src/utils/poly-tool/ui.ts +2 -2
- package/src/viewers/vd-regions-viewer.ts +5 -4
- package/src/viewers/web-logo-viewer.ts +6 -5
- package/src/widgets/bio-substructure-filter.ts +4 -1
- package/files/libraries/HELMCoreLibrary.json +0 -18218
- package/src/utils/monomer-lib.ts +0 -305
- /package/dist/{709.js.LICENSE.txt → 895.js.LICENSE.txt} +0 -0
|
@@ -3,65 +3,120 @@
|
|
|
3
3
|
# description: Create the model peptides/DNA sequences with peptides data
|
|
4
4
|
# language: python
|
|
5
5
|
# tags: template, demo
|
|
6
|
-
# input: int clusters = 5 { caption:
|
|
7
|
-
# input: int num_sequences = 50 { caption: Number of sequences in each cluster
|
|
8
|
-
# input:
|
|
9
|
-
# input: int
|
|
10
|
-
# input: int
|
|
11
|
-
# input: int
|
|
12
|
-
# input:
|
|
13
|
-
# input: double
|
|
14
|
-
# input: double
|
|
15
|
-
# input:
|
|
16
|
-
# input:
|
|
17
|
-
# input:
|
|
18
|
-
# input: string
|
|
19
|
-
#
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
6
|
+
# input: int clusters = 5 { caption: Clusters; category: Clusters } [Number of clusters]
|
|
7
|
+
# input: int num_sequences = 50 { caption: Sequences; category: Clusters } [Number of sequences in each cluster]
|
|
8
|
+
# input: string alphabet_key = "Protein" { choices: ["Protein", "DNA", "RNA", "Protein_EXT"]; caption: Alphabet; category: Clusters;} [Sequence alphabet. Ignored if the HELM library is specified.]
|
|
9
|
+
# input: int motif_length = 12 { caption: Motif length; category: Motif } [Average length of motif]
|
|
10
|
+
# input: int max_variants_position = 3 { caption: Position variants; category: Motif } [Maximum number of different letters in a conservative position of the motif]
|
|
11
|
+
# input: int random_length = 3 { caption: Random length; category: Motif } [Average length of random sequence parts before and after motif]
|
|
12
|
+
# input: int dispersion = 2 { caption: Length variation; category: Motif } [Variation of total sequence length]
|
|
13
|
+
# input: double activity_range = 0.2 { caption: Activity range; category: Activity parameters; format: 0.000} [Range of the mean activity value difference between clusters]
|
|
14
|
+
# input: double cliff_probability = 0.05 { caption: Cliff probability; category: Activity parameters; format: 0.000} [Probability to make activity cliff of a sequence]
|
|
15
|
+
# input: double cliff_strength = 5.0 { caption: Cliff strength; category: Activity parameters } [The size of the cliff comparing to the dispersion of the initial activity]
|
|
16
|
+
# input: double cliff_strength_dispersion = 1.0 { caption: Cliff dispersion; category: Activity parameters } [Dispersion of cliff strength]
|
|
17
|
+
# input: string assay_noise_levels = "0.4, 0.85" { caption: Noise levels; category: Assay settings } [List of assay noise levels, separated by comma]
|
|
18
|
+
# input: string assay_scales = "(0|10), (0|150.0)" { caption: Assay scales; category: Assay settings } [Typical scale size for each assay. Assays are separated by comma. Minimum and maximum values are separated by pipe. Brackets are optional]
|
|
19
|
+
# input: bool disable_negatives = true { caption: Crop negatives; category: Assay settings } [Set negative measurements for assay to zero]
|
|
20
|
+
# input: string fasta_separator = "" { caption: Fasta separator; nullable: true; category: Output format} [Monomers separator for FASTA format]
|
|
21
|
+
# input: file helm_library_file { caption: HELM library; nullable: true; category: Output format} [HELM library to load alphabet. Output format is set to HELM if the HELM library is specified]
|
|
22
|
+
# input: string helm_connection_mode = "linear" { choices: ["linear", "cyclic", "mixed"]; caption: Connection mode; category: Output format} [Peptides connection mode for HELM output]
|
|
23
|
+
# output: dataframe sequences_data
|
|
24
|
+
|
|
25
|
+
description = """The utility generates clusters of macromolecule sequences to test SAR functionality.
|
|
26
|
+
Each cluster contains a randomly generated sequence motif.
|
|
24
27
|
Each sequence has activity - a Gauss-distributed random value.
|
|
25
|
-
All sequences in the cluster has activities from the same distibution.
|
|
26
28
|
The utility can simulate activity cliffs - random changes in the conservative motif letters,
|
|
27
|
-
leading to
|
|
28
|
-
"""
|
|
29
|
+
leading to the significant change in the activity.
|
|
30
|
+
Utility can simulate multiple experimental assays measuring activity, with different scales and noise levels."""
|
|
29
31
|
|
|
30
32
|
import random
|
|
31
33
|
import argparse
|
|
32
34
|
import sys
|
|
35
|
+
from collections import namedtuple
|
|
33
36
|
from enum import Enum
|
|
34
37
|
|
|
35
|
-
from typing import List, Tuple, Dict,
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
# --- Type definitions ---
|
|
38
|
+
from typing import List, Tuple, NamedTuple, Dict, Set, Any
|
|
39
39
|
|
|
40
|
+
# ===== Type definitions =====
|
|
40
41
|
Letter = str
|
|
41
|
-
Alphabet = List[
|
|
42
|
-
|
|
42
|
+
Alphabet = List[Letter]
|
|
43
43
|
LetterChoice = List[Letter]
|
|
44
44
|
MotifTemplate = List[LetterChoice]
|
|
45
45
|
|
|
46
|
-
|
|
46
|
+
# The sequence is a list of monomers from the alphabet.
|
|
47
|
+
# We can't use string because monomers can have several letters
|
|
48
|
+
Sequence = List[Letter]
|
|
49
|
+
SequenceList = List[Sequence]
|
|
47
50
|
SequenceSquashed = str # Sequence, joined together in string form
|
|
48
51
|
|
|
49
|
-
|
|
50
|
-
|
|
52
|
+
CliffPair = Tuple[int, int]
|
|
53
|
+
CliffList = List[CliffPair]
|
|
54
|
+
|
|
55
|
+
Activity = float
|
|
56
|
+
ActivityList = List[Activity]
|
|
57
|
+
|
|
58
|
+
ClusterParameters = NamedTuple(
|
|
59
|
+
"ClusterParameters",
|
|
60
|
+
[
|
|
61
|
+
("motif_length", int),
|
|
62
|
+
("max_variants_per_position", int),
|
|
63
|
+
("random_length", int),
|
|
64
|
+
("dispersion", int),
|
|
65
|
+
],
|
|
66
|
+
)
|
|
67
|
+
# Parameters of the activity cliffs.
# Declared with typed fields, the same way as ClusterParameters and AssayParameters.
CliffParameters = NamedTuple(
    "CliffParameters",
    [
        ("cliff_probability", float),
        ("cliff_strength", float),
        ("cliff_strength_dispersion", float),
    ],
)
|
|
71
|
+
AssayParameters = NamedTuple(
|
|
72
|
+
"AssayParameters", [("noise_level", float), ("min", float), ("max", float)]
|
|
73
|
+
)
|
|
51
74
|
|
|
52
|
-
|
|
75
|
+
DataLine = Tuple[
|
|
76
|
+
Any, ...
|
|
77
|
+
] # Contains strings and 1+ number of floats - can't type more exactly
|
|
53
78
|
|
|
79
|
+
# ===== Constants =====
|
|
80
|
+
OutputFormat = Enum("OutputFormat", ["Fasta", "Helm"])
|
|
54
81
|
HelmConnectionMode = Enum("HelmConnectionMode", ["linear", "cyclic", "mixed"])
|
|
55
82
|
|
|
56
|
-
alphabets: Dict[str,
|
|
57
|
-
"
|
|
58
|
-
"DNA": "A,T,G,C",
|
|
59
|
-
"RNA": "A,U,G,C",
|
|
60
|
-
"
|
|
83
|
+
alphabets: Dict[str, Alphabet] = {
|
|
84
|
+
"Protein": "A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y".split(","),
|
|
85
|
+
"DNA": "A,T,G,C".split(","),
|
|
86
|
+
"RNA": "A,U,G,C".split(","),
|
|
87
|
+
"Protein_EXT": "A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,dA,dC,dD,dE,dF,dH,dI,dK,dL,dM,dN,dP,dQ,dR,dS,dT,dV,dW,dY,meA,meD,meS,meT,meV,meY,meE,meG,meI,meK,meM,meN,meQ,meC,meR,meW,meF,meH,meL,Nle,Nva,Orn,Iva,aIle,gGlu,Hcy,Hse,Hyp,D-gGlu,D-Nle,D-hPhe,D-Hyp,D-Nva,D-Orn,Pyr,Phe_3Cl,Phe_4Cl,Phe_4NH2,Phg,Ser_tBu,Tyr_Bn,Tza,1Nal,Cha,Lys_Boc,aThr,D-2Nal,D-2Thi,D-aHyp,D-aIle,D-Phg,D-Ser_tBu,Cya,Lys_Me3,Pen,Phe_4Me,Ser_Bn,Tyr_tBu,2Nal,Thi,aHyp,Ala_tBu,hPhe,D-1Nal,D-aThr,D-Cha,D-Pen,D-Phe_4Cl,D-Ser_Bn,Wil,Oic_3aS-7aS,Pip,3Pal,4Pal,Abu,Apm,Chg,Dab,Dap,D-3Pal,D-aMeAbu,D-Chg,D-Cit,D-Dab,D-Pip,D-Tic,Aca,Tic,Aad,Cit,Aze,Ac5c,Aib,D-2Pal,D-Abu,D-Dap,Asu,D-Thz,D-Trp_For,D-Tyr_Et,Lys_Ac,Asp_OMe,Phe_ab-dehydro,Sta_3xi4xi,Tyr_ab-dehydroMe,App,Cap,Cys_SEt,Dsu,pnC,pnG,Pqa,Pro_4Me3OH,Met_O2,Phe_2Me,Phe_34diCl,Phe_4Br,Phe_4I,Phe_4Sdihydroorotamido,Pyl,Ser_PO3H2,Thr_PO3H2,Thz,Trp_Me,Tyr_26diMe,Tyr_3I,Tyr_3NO2,Tyr_Ph4OH,Tyr_SO3H,Val_3OH,xiIle,NMe2Abz,NMebAla,aMePhe,aMePro,aMeTyr_3OH,Bmt,Bmt_E,Cys_Bn,Gla,hHis,His_1Me,Gly_allyl,Gly_cPr,Asp_Ph2NH2,Azi,2Abz,3Abz,4Abz,Ac3c,Ac6c,bAla,D-Bmt,D-Bmt_E,D-hArg,D-Phe_4F,D-Trp_2Me,D-Tyr_Me,D-xiIle,Lys_iPr,Phe_ab-dehydro_3NO2,Sta_3S4S,Bux,Dpm,pnA,pnT,seC,Met_O,nTyr,Oic_3aR-7aS,Oic_3axi-7axi,Phe_2F,Phe_3F,Phe_4F,Phe_4NO2,Phe_bbdiMe,Trp_5OH,Trp_Ome,Tyr_35diI,Tyr_3OH,Tyr_Me,Tyr_PO3H2,xiHyp,xiThr,NMe4Abz,aMeTyr,Aoda,Bpa,Cys_Me,Dip,hArg,His_1Bn,His_3Me,Hyl_5xi,Bip,Abu_23dehydro,D-Dip,Dha,D-hArg_Et2,D-Met_S-O,D-His_1Bn,D-nTyr,D-Phe_4ureido".split(
|
|
88
|
+
","
|
|
89
|
+
),
|
|
61
90
|
}
|
|
62
91
|
|
|
92
|
+
# ===== Motif and sequence generation functions =====
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def alphabet_from_helm(helm_library_file: str) -> Alphabet:
    """
    Reads the HELM library from a JSON file and extracts only backbone monomers suitable for sequence generation

    A monomer is suitable when its "polymerType" is "PEPTIDE", its "monomerType" is "Backbone"
    and it has exactly two entries in "rgroups". Entries lacking any of these properties are skipped.

    :param helm_library_file: path to the JSON file; its top-level value is iterated as a sequence of monomers
    :return: the "symbol" of every suitable monomer, in file order
    """
    import json

    def is_monomer_suitable(monomer: Any) -> bool:
        # .get() keeps incomplete library entries from aborting the whole load
        return (
            monomer.get("polymerType") == "PEPTIDE"
            and monomer.get("monomerType") == "Backbone"
            and len(monomer.get("rgroups") or []) == 2
        )

    # Read as UTF-8 explicitly so the result does not depend on the platform default encoding
    with open(helm_library_file, encoding="utf-8") as helm_library:
        return [
            monomer["symbol"]
            for monomer in json.load(helm_library)
            if is_monomer_suitable(monomer)
        ]
|
|
114
|
+
|
|
63
115
|
|
|
64
116
|
def mean_range(mean: int, disp: int) -> int:
    """
    Returns random non-negative value around some mean with selected dispersion

    The value is drawn uniformly from [mean - |disp|, mean + |disp|], with the lower bound
    clamped to 0. If the whole interval is below zero, 0 is returned.

    :param mean: center of the interval
    :param disp: half-width of the interval; the sign is ignored
    :return: random integer from the clamped interval
    """
    spread = abs(disp)
    lower = max(mean - spread, 0)
    # Keep the interval non-empty: randint raises ValueError when upper < lower
    upper = max(mean + spread, lower)
    return random.randint(lower, upper)
|
|
66
121
|
|
|
67
122
|
|
|
@@ -69,8 +124,11 @@ def generate_motif_template(
|
|
|
69
124
|
motif_length: int,
|
|
70
125
|
alphabet: Alphabet,
|
|
71
126
|
max_variants_cluster: int,
|
|
72
|
-
prob_any: float = 0.2,
|
|
127
|
+
prob_any: float = 0.2, # The probability to have a non-conservative letter (the `?` sign in notation) inside motif
|
|
73
128
|
) -> MotifTemplate:
|
|
129
|
+
"""
|
|
130
|
+
Generated random template from the alphabet
|
|
131
|
+
"""
|
|
74
132
|
motif_template = []
|
|
75
133
|
for position in range(motif_length):
|
|
76
134
|
# Selecting letters for position i
|
|
@@ -84,32 +142,44 @@ def generate_motif_template(
|
|
|
84
142
|
|
|
85
143
|
|
|
86
144
|
def generate_motif(template: MotifTemplate, alphabet: Alphabet) -> Sequence:
    """
    Build one concrete motif from a motif template

    For every template position one letter is picked at random: from the whole alphabet
    when the position contains "?", otherwise from the letters listed for that position.
    """
    motif: Sequence = []
    for position_letters in template:
        candidates = alphabet if "?" in position_letters else position_letters
        motif.append(random.choice(candidates))
    return motif
|
|
91
152
|
|
|
92
153
|
|
|
93
|
-
def motif_notation(motif_template: MotifTemplate) -> str:
|
|
154
|
+
def motif_notation(motif_template: MotifTemplate, fasta_separator: str = "") -> str:
    """
    Render a motif template as a single string

    A position with exactly one letter is written as that letter followed by the separator.
    Any other position is written as its letters joined by the separator inside square brackets.
    """
    codes = []
    for letter_choice in motif_template:
        if len(letter_choice) == 1:
            codes.append(letter_choice[0] + fasta_separator)
        else:
            codes.append("[" + fasta_separator.join(letter_choice) + "]")
    return "".join(codes)
|
|
103
168
|
|
|
104
169
|
|
|
105
|
-
def
|
|
170
|
+
def generate_random_sequence(n: int, alphabet: Alphabet) -> Sequence:
    """
    Produce a sequence of n letters, each drawn independently at random from the alphabet
    """
    sequence: Sequence = []
    for _ in range(n):
        sequence.append(random.choice(alphabet))
    return sequence
|
|
107
175
|
|
|
108
176
|
|
|
109
|
-
def
|
|
177
|
+
def make_motif_cliff(
|
|
110
178
|
motif_template: MotifTemplate, alphabet: Alphabet, motif: Sequence
|
|
111
179
|
) -> Sequence:
|
|
112
|
-
|
|
180
|
+
"""
|
|
181
|
+
Mutates a random conservative letter in the motif
|
|
182
|
+
"""
|
|
113
183
|
motif_len = len(motif_template)
|
|
114
184
|
pos = random.randrange(motif_len)
|
|
115
185
|
while "?" in motif_template[pos]:
|
|
@@ -127,17 +197,61 @@ def make_cliff(
|
|
|
127
197
|
)
|
|
128
198
|
|
|
129
199
|
|
|
200
|
+
def generate_cluster_sequences(
    n_sequences: int,
    motif_template: MotifTemplate,
    prefix_length: int,
    suffix_length: int,
    alphabet: Alphabet,
    cliff_probability: float,
) -> Tuple[SequenceList, CliffList]:
    """
    Generates the sequences of one cluster together with the list of sequence cliffs

    Every sequence is a random prefix, a motif generated from the template and a random suffix.
    With probability cliff_probability a generated sequence is followed by its cliff partner:
    the same prefix and suffix around a mutated motif. Each cliff is reported as the pair of
    indexes (original, partner) in the returned sequence list.
    """
    sequences: SequenceList = []
    cliffs: CliffList = []

    while len(sequences) < n_sequences:
        motif = generate_motif(motif_template, alphabet)
        prefix = generate_random_sequence(prefix_length, alphabet)
        suffix = generate_random_sequence(suffix_length, alphabet)
        sequences.append(prefix + motif + suffix)

        if len(sequences) >= n_sequences:
            # That was the last sequence - no room left for a cliff partner
            break
        if random.random() > cliff_probability:
            continue

        # Making sequence cliff: the partner goes right after the sequence just added
        original_index = len(sequences) - 1
        cliff_motif = make_motif_cliff(motif_template, alphabet, motif)
        sequences.append(prefix + cliff_motif + suffix)
        cliffs.append((original_index, original_index + 1))

    return sequences, cliffs
|
|
235
|
+
|
|
236
|
+
|
|
130
237
|
def sequence_to_fasta(sequence: Sequence, separator: str) -> SequenceSquashed:
    """
    Squashes the sequence into a FASTA string: its monomers joined by the separator
    """
    squashed: SequenceSquashed = separator.join(sequence)
    return squashed
|
|
132
242
|
|
|
133
243
|
|
|
134
244
|
def sequence_to_helm(
|
|
135
|
-
sequence: Sequence,
|
|
245
|
+
sequence: Sequence,
|
|
246
|
+
helm_connection_mode: HelmConnectionMode = HelmConnectionMode.linear,
|
|
136
247
|
) -> SequenceSquashed:
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
248
|
+
"""
|
|
249
|
+
Converts the sequence to HELM format
|
|
250
|
+
"""
|
|
251
|
+
|
|
252
|
+
def is_cyclic(helm_connection_mode: HelmConnectionMode) -> bool:
|
|
253
|
+
return helm_connection_mode == HelmConnectionMode.cyclic or (
|
|
254
|
+
helm_connection_mode == HelmConnectionMode.mixed and random.random() < 0.5
|
|
141
255
|
)
|
|
142
256
|
|
|
143
257
|
sequence_escaped: Sequence = [
|
|
@@ -149,135 +263,211 @@ def sequence_to_helm(
|
|
|
149
263
|
return f"PEPTIDE1{{{sequence_to_fasta(sequence_escaped,'.')}}}${connection_format}$$$V2.0"
|
|
150
264
|
|
|
151
265
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
cliff_probability: float,
|
|
161
|
-
cliff_strength: float,
|
|
162
|
-
) -> Iterator[SequenceRecord]:
|
|
163
|
-
# Making a motif template
|
|
164
|
-
motif_template = generate_motif_template(
|
|
165
|
-
motif_length, alphabet, max_variants_per_position
|
|
166
|
-
)
|
|
167
|
-
# Setting average and dispersion for activity
|
|
168
|
-
activity_average = random.random() * 10
|
|
169
|
-
activity_dispersion = random.random()
|
|
170
|
-
sys.stderr.write(f"Motif template: {motif_notation(motif_template)}\n")
|
|
171
|
-
|
|
172
|
-
for n_seq in range(n_sequences):
|
|
173
|
-
activity = random.gauss(activity_average, activity_dispersion)
|
|
174
|
-
|
|
175
|
-
motif = generate_motif(motif_template, alphabet)
|
|
176
|
-
prefix = generate_random(prefix_length, alphabet)
|
|
177
|
-
suffix = generate_random(suffix_length, alphabet)
|
|
178
|
-
seq = prefix + motif + suffix
|
|
179
|
-
sequence_record: SequenceRecord = (n_seq, seq, activity, False)
|
|
180
|
-
yield sequence_record
|
|
181
|
-
|
|
182
|
-
is_cliff = make_cliffs and (random.random() <= cliff_probability)
|
|
183
|
-
if is_cliff:
|
|
184
|
-
# Making activity cliff
|
|
185
|
-
cliff_motif = make_cliff(motif_template, alphabet, motif)
|
|
186
|
-
cliff_seq = prefix + cliff_motif + suffix
|
|
187
|
-
# Recalculating activity
|
|
188
|
-
cliff_disp = activity_dispersion * cliff_strength * (0.5 + random.random())
|
|
189
|
-
activity = activity_average - cliff_disp
|
|
190
|
-
cliff_activity = activity_average + cliff_disp
|
|
191
|
-
|
|
192
|
-
# sys.stderr.write(f"Cliff for sequence #{line_number:4}, cluster {n_cluster} \n")
|
|
193
|
-
# sys.stderr.write(f"{activity_average}\t{motif}\t{activity}\n")
|
|
194
|
-
# sys.stderr.write(f"{activity_average}\t{cliff_motif}\t{cliff_activity}\n")
|
|
195
|
-
n_seq += 1
|
|
196
|
-
sequence_record = (n_seq, cliff_seq, cliff_activity, is_cliff)
|
|
197
|
-
yield sequence_record
|
|
266
|
+
# ===== Activity generation functions =====
|
|
267
|
+
def generate_ideal_activities(n: int, activity_range: float = 0) -> ActivityList:
    """
    Draws n "ideal" activities from a Gauss distribution with sigma 1

    The distribution center is picked uniformly from [-activity_range, activity_range];
    it is 0 when activity_range is not positive.
    """
    if activity_range > 0:
        center = random.uniform(-activity_range, activity_range)
    else:
        center = 0

    activities: ActivityList = []
    for _ in range(n):
        activities.append(random.gauss(center, 1))
    return activities
|
|
198
274
|
|
|
199
275
|
|
|
200
|
-
def
|
|
276
|
+
def make_activity_cliff(
    activities: ActivityList,
    cliffs: List[CliffPair],
    cliff_strength: float,
    cliff_strength_dispersion: float,
) -> ActivityList:
    """
    Introduce activity cliffs -
    make a pair of activities differ for random gauss-distributed value defined by cliff_strength and cliff_strength_dispersion

    The two activities of a pair are moved apart symmetrically around their average, so the
    average is kept and the gap between them becomes the drawn value. The input list is not
    modified, and every pair is computed from the input values.

    :param activities: initial activities
    :param cliffs: pairs of indexes in activities that must form a cliff
    :param cliff_strength: mean of the gap between the two activities of a pair
    :param cliff_strength_dispersion: sigma of the gap
    :return: copy of activities with the cliff pairs moved apart
    """
    cliff_activities = activities[:]
    for first, second in cliffs:
        activity1 = activities[first]
        activity2 = activities[second]
        average = (activity1 + activity2) / 2
        gap = random.gauss(cliff_strength, cliff_strength_dispersion)
        difference = abs(activity1 - activity2)
        if difference == 0:
            # Equal activities give no direction to stretch along and scaling would
            # divide by zero - place them half a gap on each side of the common value
            cliff_activities[first] = average + gap / 2
            cliff_activities[second] = average - gap / 2
            continue
        scale = gap / difference
        cliff_activities[first] = average + (activity1 - average) * scale
        cliff_activities[second] = average + (activity2 - average) * scale
    return cliff_activities
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def generate_assay_activities(
    activities: ActivityList,
    assay: AssayParameters,
    disable_negatives: bool = True,
) -> ActivityList:
    """
    Turns "ideal" activities into the values measured by one assay

    Each activity gets uniform noise proportional to assay.noise_level, is rescaled so that
    the interval [-3 * (1 + noise_level), 3 * (1 + noise_level)] maps to [0, 1], and is then
    mapped to the assay scale [assay.min, assay.max]. With disable_negatives set, negative
    results are replaced by 0.
    """
    # 3 sigma of the ideal activity plus the largest possible noise
    half_width = 3 * (1 + assay.noise_level)
    full_width = half_width * 2
    assay_span = assay.max - assay.min

    measured = []
    for ideal in activities:
        jitter = random.uniform(-3, 3)  # noise in [-3, 3] - 3 sigma of the ideal activity
        noised = ideal + jitter * assay.noise_level
        unit_value = noised / full_width + 0.5
        value = assay.min + (unit_value * assay_span)
        measured.append(max(value, 0) if disable_negatives else value)
    return measured
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def generate_data(
    n_clusters: int,
    n_sequences: int,
    cluster_parameters: ClusterParameters,
    assays: List[AssayParameters],
    disable_negatives: bool,
    alphabet: Alphabet,
    output_format: OutputFormat,
    fasta_separator: str,
    helm_connection_mode: HelmConnectionMode,
    activity_range: float,
    cliff_probability: float = 0.05,
    cliff_strength: float = 5.0,
    cliff_dispersion: float = 1.0,
) -> Tuple[List[str], List[DataLine]]:
    """
    Main function generating all data set - sequences, activities, etc

    For every cluster it picks the motif and random part lengths, generates a motif template
    (written to stderr), generates the sequences with their cliffs, converts the sequences to
    the output format and generates one activity column per assay.

    :param n_clusters: number of clusters
    :param n_sequences: number of sequences in each cluster
    :param cluster_parameters: motif length, variants per position, random part length and dispersion
    :param assays: parameters of the assays; one "Assay_<k>" column is produced for each
    :param disable_negatives: passed to generate_assay_activities
    :param alphabet: letters used for motifs and random sequence parts
    :param output_format: OutputFormat.Fasta or OutputFormat.Helm
    :param fasta_separator: separator for FASTA output and for the motif notation
    :param helm_connection_mode: passed to sequence_to_helm for HELM output
    :param activity_range: passed to generate_ideal_activities
    :param cliff_probability: passed to generate_cluster_sequences
    :param cliff_strength: passed to make_activity_cliff
    :param cliff_dispersion: passed to make_activity_cliff as the cliff strength dispersion
    :return: column headers and data rows (cluster, sequence id, sequence, is_cliff, assay values...)
    """
    headers: List[str] = ["cluster", "sequence_id", "sequence", "is_cliff"]
    headers += [f"Assay_{i+1}" for i in range(len(assays))]
    data: List[DataLine] = []

    def cliffs_to_positions(cliffs: CliffList) -> Set[int]:
        """
        Convert CliffList to a set containing positions of cliffs
        """
        unique_pos = {pos for cliff in cliffs for pos in cliff}
        return unique_pos

    for n_cluster in range(n_clusters):
        motif_length = mean_range(
            cluster_parameters.motif_length, cluster_parameters.dispersion
        )
        total_length = (
            mean_range(
                cluster_parameters.random_length * 2, cluster_parameters.dispersion
            )
            + motif_length
        )
        prefix_length = mean_range(
            cluster_parameters.random_length, cluster_parameters.dispersion // 2
        )
        suffix_length = total_length - motif_length - prefix_length

        # Making a motif template
        motif_template = generate_motif_template(
            motif_length, alphabet, cluster_parameters.max_variants_per_position
        )
        sys.stderr.write(
            f"Motif template for cluster {n_cluster}: {motif_notation(motif_template, fasta_separator)}\n"
        )

        sequences, cliffs = generate_cluster_sequences(
            n_sequences,
            motif_template,
            prefix_length,
            suffix_length,
            alphabet,
            cliff_probability,
        )

        if output_format == OutputFormat.Fasta:
            squashed_sequences = [
                sequence_to_fasta(seq, fasta_separator) for seq in sequences
            ]
        elif output_format == OutputFormat.Helm:
            squashed_sequences = [
                sequence_to_helm(seq, helm_connection_mode) for seq in sequences
            ]
        else:
            # Diagnostics go to stderr like the motif templates above.
            # sys.exit is used because the exit() builtin is only installed by the site module.
            sys.stderr.write("Unsupported output format\n")
            sys.exit(-1)

        ideal_activities = generate_ideal_activities(n_sequences, activity_range)
        cliffed_activities = make_activity_cliff(
            ideal_activities, cliffs, cliff_strength, cliff_dispersion
        )

        # One list of measured activities per assay, all derived from the same ideal values
        assay_activities = [
            generate_assay_activities(cliffed_activities, assay, disable_negatives)
            for assay in assays
        ]

        cliffs_positions = cliffs_to_positions(cliffs)
        is_cliffs = [pos in cliffs_positions for pos in range(n_sequences)]
        sequence_IDs = [f"c{n_cluster}_s{n:03d}" for n in range(n_sequences)]

        # Transposing the columns into rows
        cluster_data = zip(
            [n_cluster] * n_sequences,
            sequence_IDs,
            squashed_sequences,
            is_cliffs,
            *assay_activities,
        )

        data.extend(cluster_data)

    return headers, data
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
def repack_assays(noise_levels_str: str, scales_str: str) -> List[AssayParameters]:
    """
    Converts strings passed from the input data to the list of AssayParameters namedtuples

    Exits with code -1 when the number of noise levels differs from the number of scales.

    :param noise_levels_str: noise levels separated by comma, e.g. "0.4, 0.85"
    :param scales_str: scales separated by comma, minimum and maximum of each scale separated
        by pipe, brackets are optional, e.g. "(0|10), (0|150.0)"
    :return: one AssayParameters(noise_level, min, max) per assay, in input order
    """
    noise_levels = [float(s) for s in noise_levels_str.split(",")]

    minmaxes = []
    for scale in scales_str.split(","):
        bounds = scale.strip().split("|")
        # Brackets and surrounding spaces are optional decoration of both bounds
        low = float(bounds[0].strip("() "))
        high = float(bounds[1].strip("() "))
        minmaxes.append((low, high))

    if len(noise_levels) != len(minmaxes):
        # sys.exit is used because the exit() builtin is only installed by the site module
        sys.stderr.write("Not equal range of parameters for assay definition\n")
        sys.exit(-1)

    return [
        AssayParameters(noise, low, high)
        for noise, (low, high) in zip(noise_levels, minmaxes)
    ]
|
|
262
446
|
|
|
263
447
|
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
448
|
+
# ===== Tests =====
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
def test_activities_correlation() -> None:
    """
    Checks that two assays measuring the same activities stay correlated

    Two assay readouts with different noise levels and scales are generated from the same
    activities (with one activity cliff), and their correlation coefficient must be at least 0.5.
    """
    import numpy as np

    # The check is statistical: a fixed seed and a larger sample keep it reproducible.
    # The generator state is restored afterwards so other code is not affected by the seed.
    rng_state = random.getstate()
    random.seed(20240101)
    try:
        ideal_activities = generate_ideal_activities(200, 0.1)
        cliff_activities = make_activity_cliff(
            ideal_activities, [(0, 1)], cliff_strength=5.0, cliff_strength_dispersion=1.0
        )
        assay_parameters = AssayParameters(0.3, 0, 10)
        x = generate_assay_activities(cliff_activities, assay_parameters)
        assay_parameters = AssayParameters(0.5, 0, 250)
        y = generate_assay_activities(cliff_activities, assay_parameters)
    finally:
        random.setstate(rng_state)

    print("Assay1: " + ",".join([str(a) for a in x]))
    print("Assay2: " + ",".join([str(a) for a in y]))
    corr = np.corrcoef(x, y)
    print("Correlation: ", corr[1, 0])
    assert corr[1, 0] >= 0.5
|
|
271
468
|
|
|
272
|
-
def alphabet_from_helm(helm_library_file: str) -> Alphabet:
|
|
273
|
-
import json
|
|
274
469
|
|
|
275
|
-
|
|
276
|
-
with open(helm_library_file) as helm_library:
|
|
277
|
-
for monomer in json.load(helm_library):
|
|
278
|
-
if is_monomer_suitable(monomer):
|
|
279
|
-
alphabet.append(monomer["symbol"])
|
|
280
|
-
return alphabet
|
|
470
|
+
# ===== Command-line arguments parsing =====
|
|
281
471
|
|
|
282
472
|
|
|
283
473
|
def parse_command_line_args() -> Any:
|
|
@@ -285,30 +475,52 @@ def parse_command_line_args() -> Any:
|
|
|
285
475
|
prog="MotifSequencesGenerator",
|
|
286
476
|
description=description,
|
|
287
477
|
epilog="Utility author and support: Gennadii Zakharov <Gennadiy.Zakharov@gmail.com>",
|
|
478
|
+
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
|
288
479
|
)
|
|
289
480
|
|
|
290
|
-
parser.
|
|
481
|
+
cluster_group = parser.add_argument_group("Cluster parameters")
|
|
482
|
+
|
|
483
|
+
cluster_group.add_argument(
|
|
291
484
|
"-c", "--clusters", type=int, default=5, help="Number of clusters"
|
|
292
485
|
)
|
|
293
|
-
|
|
486
|
+
cluster_group.add_argument(
|
|
294
487
|
"-s",
|
|
295
488
|
"--sequences",
|
|
296
489
|
type=int,
|
|
297
490
|
default=50,
|
|
298
491
|
help="Number of sequences in each supercluster",
|
|
299
492
|
)
|
|
300
|
-
|
|
493
|
+
|
|
494
|
+
available_alphabets = ",".join(list(alphabets.keys()))
|
|
495
|
+
cluster_group.add_argument(
|
|
496
|
+
"--alphabet",
|
|
497
|
+
type=str,
|
|
498
|
+
default=list(alphabets.keys())[0],
|
|
499
|
+
help=f"Sequence alphabet: {available_alphabets}.\n"
|
|
500
|
+
+ "Ignored if the HELM library is specified",
|
|
501
|
+
)
|
|
502
|
+
|
|
503
|
+
motif_group = parser.add_argument_group("Motif parameters")
|
|
504
|
+
|
|
505
|
+
motif_group.add_argument(
|
|
301
506
|
"-m,", "--motif-length", type=int, default=12, help="Average length of motif"
|
|
302
507
|
)
|
|
303
508
|
|
|
304
|
-
|
|
509
|
+
motif_group.add_argument(
|
|
510
|
+
"--max-variants-position",
|
|
511
|
+
type=int,
|
|
512
|
+
default=3,
|
|
513
|
+
help="Maximum number of different letters in a conservative position of the motif",
|
|
514
|
+
)
|
|
515
|
+
|
|
516
|
+
motif_group.add_argument(
|
|
305
517
|
"-r,",
|
|
306
518
|
"--random-length",
|
|
307
519
|
type=int,
|
|
308
520
|
default=3,
|
|
309
521
|
help="Average length of random sequence parts before and after motif",
|
|
310
522
|
)
|
|
311
|
-
|
|
523
|
+
motif_group.add_argument(
|
|
312
524
|
"-d,",
|
|
313
525
|
"--dispersion",
|
|
314
526
|
type=int,
|
|
@@ -316,124 +528,171 @@ def parse_command_line_args() -> Any:
|
|
|
316
528
|
help="Variation of total sequence length",
|
|
317
529
|
)
|
|
318
530
|
|
|
319
|
-
parser.
|
|
320
|
-
"-h,",
|
|
321
|
-
"--helm-library-file",
|
|
322
|
-
type=str,
|
|
323
|
-
help="JSON file containing the HELM monomer library in the same format as used for Datagrok. "
|
|
324
|
-
+ "The alphabet property is ignored when helm library is specified.",
|
|
325
|
-
)
|
|
531
|
+
cliffs_group = parser.add_argument_group("Activity parameters")
|
|
326
532
|
|
|
327
|
-
|
|
328
|
-
"--
|
|
329
|
-
type=
|
|
330
|
-
default=
|
|
331
|
-
help=
|
|
533
|
+
cliffs_group.add_argument(
|
|
534
|
+
"--activity-range",
|
|
535
|
+
type=float,
|
|
536
|
+
default=0.5,
|
|
537
|
+
help="Range of the mean activity value difference between clusters",
|
|
332
538
|
)
|
|
333
539
|
|
|
334
|
-
|
|
335
|
-
parser.add_argument(
|
|
336
|
-
"--alphabet",
|
|
337
|
-
type=str,
|
|
338
|
-
default=list(alphabets.keys())[0],
|
|
339
|
-
help=f"Sequence alphabet: {available_alphabets}. Custom alphabet is a list of values separated "
|
|
340
|
-
f"by comma",
|
|
341
|
-
)
|
|
342
|
-
parser.add_argument(
|
|
343
|
-
"--max-variants-position",
|
|
344
|
-
type=int,
|
|
345
|
-
default=3,
|
|
346
|
-
help="Maximum number of different letters in conservative position in motif",
|
|
347
|
-
)
|
|
348
|
-
parser.add_argument(
|
|
540
|
+
cliffs_group.add_argument(
|
|
349
541
|
"--cliff-probability",
|
|
350
542
|
type=float,
|
|
351
|
-
default=0.
|
|
543
|
+
default=0.05,
|
|
352
544
|
help="Probability to make activity cliff of a sequence",
|
|
353
545
|
)
|
|
354
|
-
|
|
546
|
+
cliffs_group.add_argument(
|
|
355
547
|
"--cliff-strength",
|
|
356
548
|
type=float,
|
|
357
|
-
default=
|
|
358
|
-
help="
|
|
549
|
+
default=5.0,
|
|
550
|
+
help="Average strength of cliff",
|
|
359
551
|
)
|
|
360
|
-
|
|
361
|
-
|
|
552
|
+
|
|
553
|
+
cliffs_group.add_argument(
|
|
554
|
+
"--cliff-strength-dispersion",
|
|
555
|
+
type=float,
|
|
556
|
+
default=1.0,
|
|
557
|
+
help="Cliff strength dispersion",
|
|
558
|
+
)
|
|
559
|
+
|
|
560
|
+
assay_group = parser.add_argument_group("Assay parameters")
|
|
561
|
+
|
|
562
|
+
assay_group.add_argument(
|
|
563
|
+
"--assay-noise-levels",
|
|
564
|
+
type=str,
|
|
565
|
+
default="0.4, 0.85",
|
|
566
|
+
help="Noise level(s) for assays. A list of values separated by comma.",
|
|
567
|
+
)
|
|
568
|
+
|
|
569
|
+
assay_group.add_argument(
|
|
570
|
+
"--assay-scales",
|
|
571
|
+
type=str,
|
|
572
|
+
default="(0|10), (0|150.0)",
|
|
573
|
+
help="Typical scale size for each assay. Assays are separated by comma. Minimum and maximum values are separated by pipe. Brackets are optional."
|
|
574
|
+
+ "Activity outliers may be located outside the specified scale",
|
|
575
|
+
)
|
|
576
|
+
|
|
577
|
+
assay_group.add_argument(
|
|
578
|
+
"--enable-negatives",
|
|
362
579
|
type=bool,
|
|
363
|
-
|
|
364
|
-
|
|
580
|
+
help="Enable negative values for assays results",
|
|
581
|
+
)
|
|
582
|
+
|
|
583
|
+
output_group = parser.add_argument_group("Output parameters")
|
|
584
|
+
|
|
585
|
+
output_group.add_argument(
|
|
586
|
+
"--custom-alphabet",
|
|
587
|
+
type=str,
|
|
588
|
+
default="",
|
|
589
|
+
help=f"Custom sequence alphabet: list of letters separated by comma. Used only if the --alphabet=custom",
|
|
365
590
|
)
|
|
366
|
-
|
|
591
|
+
|
|
592
|
+
output_group.add_argument(
|
|
367
593
|
"--fasta-separator",
|
|
368
594
|
type=str,
|
|
369
595
|
default="",
|
|
370
|
-
help="
|
|
596
|
+
help="Monomers separator for FASTA format",
|
|
597
|
+
)
|
|
598
|
+
|
|
599
|
+
output_group.add_argument(
|
|
600
|
+
"-H,",
|
|
601
|
+
"--helm-library-file",
|
|
602
|
+
type=str,
|
|
603
|
+
help="JSON file containing the HELM monomer library. "
|
|
604
|
+
+ "The alphabet property is ignored when helm library is specified.",
|
|
371
605
|
)
|
|
606
|
+
|
|
607
|
+
output_group.add_argument(
|
|
608
|
+
"--helm-connection-mode",
|
|
609
|
+
type=str,
|
|
610
|
+
default=HelmConnectionMode.linear.name,
|
|
611
|
+
help=f"HELM peptide generation mode: {'/'.join([mode.name for mode in HelmConnectionMode])}",
|
|
612
|
+
)
|
|
613
|
+
|
|
372
614
|
command_line_args = parser.parse_args()
|
|
373
615
|
|
|
374
616
|
return command_line_args
|
|
375
617
|
|
|
376
618
|
|
|
377
|
-
#
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
619
|
+
# ===== Main part of script =====
|
|
620
|
+
|
|
621
|
+
if __name__ == "__main__":
    # The presence of a global named "clusters" is used as the signal that the
    # script inputs were provided by the host (Datagrok) rather than by a shell.
    grok = "clusters" in globals()

    if not grok:
        # We are not in Datagrok - need to parse command line arguments
        args = parse_command_line_args()
        # Cluster parameters
        clusters = args.clusters
        num_sequences = args.sequences
        alphabet_key = args.alphabet
        # Motif parameters
        motif_length = args.motif_length
        max_variants_position = args.max_variants_position
        random_length = args.random_length
        dispersion = args.dispersion
        # Activity parameters
        activity_range = args.activity_range
        cliff_probability = args.cliff_probability
        cliff_strength = args.cliff_strength
        cliff_strength_dispersion = args.cliff_strength_dispersion
        # Assay parameters
        assay_noise_levels = args.assay_noise_levels
        assay_scales = args.assay_scales
        disable_negatives = not args.enable_negatives
        # Output parameters
        custom_alphabet = args.custom_alphabet
        fasta_separator = args.fasta_separator
        helm_library_file = args.helm_library_file
        helm_connection_mode = args.helm_connection_mode

    # A HELM library, when given, overrides the alphabet selection entirely.
    helm_init = helm_library_file is not None and helm_library_file != ""

    if helm_init:
        alphabet = alphabet_from_helm(helm_library_file)
        output_format = OutputFormat.Helm
        fasta_separator = "|"
    else:
        output_format = OutputFormat.Fasta
        if alphabet_key in alphabets:
            alphabet = alphabets[alphabet_key]
        else:
            # Custom alphabet: the key is not one of the predefined alphabets,
            # so build the alphabet from the comma-separated custom list.
            # globals().get() is used because, when the inputs come from the
            # host, this variable is not guaranteed to have been defined.
            # NOTE(review): assumes an alphabet is a list of monomer strings,
            # like the values of `alphabets` - confirm against generate_data.
            custom_letters = globals().get("custom_alphabet") or ""
            alphabet = [
                letter.strip()
                for letter in custom_letters.split(",")
                if letter.strip()
            ]
            if not alphabet:
                # Fail with an actionable message instead of a bare KeyError.
                raise ValueError(
                    f"Unknown alphabet '{alphabet_key}' and no custom alphabet "
                    f"provided. Available alphabets: {', '.join(alphabets)}"
                )

    # Packing parameters to structures to simplify function signatures
    cluster_parameters = ClusterParameters(
        motif_length, max_variants_position, random_length, dispersion
    )
    assays = repack_assays(assay_noise_levels, assay_scales)

    # Running sequence generator
    header, data = generate_data(
        clusters,
        num_sequences,
        cluster_parameters,
        assays,
        disable_negatives,
        alphabet,
        output_format,
        fasta_separator,
        HelmConnectionMode[helm_connection_mode],
        activity_range,
        cliff_probability,
        cliff_strength,
        cliff_strength_dispersion,
    )

    if grok:
        # Exporting data to Datagrok as a Pandas dataframe
        import pandas as pd

        sequences_data = pd.DataFrame.from_records(data, columns=header)
    else:
        # Writing results to stdout - no need to work with big and heavy Pandas
        import csv

        csv_writer = csv.writer(sys.stdout, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(header)
        for line in data:
            csv_writer.writerow(line)
|