@datagrok/bio 2.11.30 → 2.11.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/CHANGELOG.md +19 -0
  2. package/dist/36.js +1 -1
  3. package/dist/36.js.map +1 -1
  4. package/dist/42.js +1 -1
  5. package/dist/42.js.map +1 -1
  6. package/dist/590.js +2 -0
  7. package/dist/590.js.map +1 -0
  8. package/dist/709.js +1 -2
  9. package/dist/709.js.map +1 -1
  10. package/dist/79.js.map +1 -1
  11. package/dist/895.js +3 -0
  12. package/dist/895.js.map +1 -0
  13. package/dist/package-test.js +8 -1
  14. package/dist/package-test.js.LICENSE.txt +1 -0
  15. package/dist/package-test.js.map +1 -1
  16. package/dist/package.js +8 -1
  17. package/dist/package.js.LICENSE.txt +1 -0
  18. package/dist/package.js.map +1 -1
  19. package/files/{data → monomer-libraries}/HELMCoreLibrary.json +594 -594
  20. package/files/tests/libraries/HELMmonomerSchema.json +96 -0
  21. package/package.json +13 -11
  22. package/scripts/sequence_generator.md +48 -0
  23. package/scripts/sequence_generator.py +515 -256
  24. package/src/package-test.ts +4 -0
  25. package/src/package.ts +26 -24
  26. package/src/tests/WebLogo-layout-tests.ts +37 -0
  27. package/src/tests/WebLogo-positions-test.ts +5 -0
  28. package/src/tests/WebLogo-project-tests.ts +63 -0
  29. package/src/tests/activity-cliffs-tests.ts +3 -2
  30. package/src/tests/monomer-libraries-tests.ts +7 -4
  31. package/src/tests/scoring.ts +3 -2
  32. package/src/tests/substructure-filters-tests.ts +3 -2
  33. package/src/tests/to-atomic-level-tests.ts +3 -2
  34. package/src/utils/helm-to-molfile.ts +3 -3
  35. package/src/utils/monomer-lib/lib-manager.ts +116 -0
  36. package/src/utils/monomer-lib/library-file-manager/consts.ts +1 -0
  37. package/src/utils/monomer-lib/library-file-manager/custom-monomer-lib-handlers.ts +80 -0
  38. package/src/utils/monomer-lib/library-file-manager/event-manager.ts +58 -0
  39. package/src/utils/monomer-lib/library-file-manager/file-manager.ts +187 -0
  40. package/src/utils/monomer-lib/library-file-manager/file-validator.ts +56 -0
  41. package/src/utils/monomer-lib/library-file-manager/style.css +8 -0
  42. package/src/utils/monomer-lib/library-file-manager/ui.ts +224 -0
  43. package/src/utils/monomer-lib/monomer-lib.ts +114 -0
  44. package/src/utils/poly-tool/const.ts +28 -0
  45. package/src/utils/poly-tool/monomer-lib-handler.ts +115 -0
  46. package/src/utils/poly-tool/types.ts +6 -0
  47. package/src/utils/poly-tool/ui.ts +2 -2
  48. package/src/viewers/vd-regions-viewer.ts +5 -4
  49. package/src/viewers/web-logo-viewer.ts +6 -5
  50. package/src/widgets/bio-substructure-filter.ts +4 -1
  51. package/files/libraries/HELMCoreLibrary.json +0 -18218
  52. package/src/utils/monomer-lib.ts +0 -305
  53. /package/dist/{709.js.LICENSE.txt → 895.js.LICENSE.txt} +0 -0
@@ -3,65 +3,120 @@
3
3
  # description: Create the model peptides/DNA sequences with peptides data
4
4
  # language: python
5
5
  # tags: template, demo
6
- # input: int clusters = 5 { caption: Number of clusters; category: Clusters }
7
- # input: int num_sequences = 50 { caption: Number of sequences in each cluster; category: Clusters }
8
- # input: int motif_length = 12 { caption: Average length of motif; category: Motif }
9
- # input: int max_variants_position = 3 { caption: Maximum number of different letters in conservative position in motif; category: Motif }
10
- # input: int random_length = 3 { caption: Average length of random sequence parts before and after motif; category: Motif }
11
- # input: int dispersion = 2 { caption: Variation of total sequence length; category: Motif }
12
- # input: bool enable_cliffs = true { caption: Enable activity cliffs; category: Activity cliffs }
13
- # input: double cliff_probability = 0.01 { caption: Probability to make activity cliff of a sequence; category: Activity cliffs; format: 0.000}
14
- # input: double cliff_strength = 4.0 { caption: Strength of cliff; category: Activity cliffs }
15
- # input: string alphabet_key = "PT" { caption: Sequence alphabet; category: Output format; hint: PT/DNA/RNA/custom. Custom alphabet is a list of values separated by comma}
16
- # input: string fasta_separator = "" { caption: Fasta format separator; nullable: true; category: Output format}
17
- # input: file helm_library_file { caption: HELM library to produce HELM output; nullable: true; category: Output format}
18
- # input: string helm_connection_mode = "linear" { choices: ["linear", "cyclic", "mixed"]; caption: Peptides connection mode (HELM only); category: Output format}
19
- # output: dataframe sequences
20
-
21
-
22
- description="""The utility generates clusters of macromolecule sequences to test SAR fucntionality.
23
- Each cluster contains randomly generated sequence motif.
6
+ # input: int clusters = 5 { caption: Clusters; category: Clusters } [Number of clusters]
7
+ # input: int num_sequences = 50 { caption: Sequences; category: Clusters } [Number of sequences in each cluster]
8
+ # input: string alphabet_key = "Protein" { choices: ["Protein", "DNA", "RNA", "Protein_EXT"]; caption: Alphabet; category: Clusters;} [Sequence alphabet. Ignored if the HELM library is specified.]
9
+ # input: int motif_length = 12 { caption: Motif length; category: Motif } [Average length of motif]
10
+ # input: int max_variants_position = 3 { caption: Position variants; category: Motif } [Maximum number of different letters in a conservative position of the motif]
11
+ # input: int random_length = 3 { caption: Randon length; category: Motif } [Average length of random sequence parts before and after motif]
12
+ # input: int dispersion = 2 { caption: Length variation; category: Motif } [Variation of total sequence length]
13
+ # input: double activity_range = 0.2 { caption: Activity range; category: Activity parameters; format: 0.000} [Range of the mean activity value difference between clusters]
14
+ # input: double cliff_probability = 0.05 { caption: Cliff probability; category: Activity parameters; format: 0.000} [Probability to make activity cliff of a sequence]
15
+ # input: double cliff_strength = 5.0 { caption: Cliff strength; category: Activity parameters } [The size of the cliff comparing to the dispersion of the initial activity]
16
+ # input: double cliff_strength_dispersion = 1.0 { caption: Cliff dispersion; category: Activity parameters } [Dispersion of cliff strength]
17
+ # input: string assay_noise_levels = "0.4, 0.85" { caption: Noise levels; category: Assay settings } [List of assay noise levels, separated by comma]
18
+ # input: string assay_scales = "(0|10), (0|150.0)" { caption: Assay scales; category: Assay settings } [Typical scale size for each assay. Assays are separated by comma. Minimum and maximum values are separated by pipe. Brackets are optional]
19
+ # input: bool disable_negatives = true { caption: Crop negatives; category: Assay settings } [Set negative measurements for assay to zero]
20
+ # input: string fasta_separator = "" { caption: Fasta separator; nullable: true; category: Output format} [Monomers separator for FASTA format]
21
+ # input: file helm_library_file { caption: HELM library; nullable: true; category: Output format} [HELM library to load alphabet. Output format is set to HELM if the HELM library is specified]
22
+ # input: string helm_connection_mode = "linear" { choices: ["linear", "cyclic", "mixed"]; caption: Connection mode; category: Output format} [Peptides connection mode for HELM output)]
23
+ # output: dataframe sequences_data
24
+
25
+ description = """The utility generates clusters of macromolecule sequences to test SAR functionality.
26
+ Each cluster contains a randomly generated sequence motif.
24
27
  Each sequence has activity - a Gauss-distributed random value.
25
- All sequences in the cluster has activities from the same distibution.
26
28
  The utility can simulate activity cliffs - random changes in the conservative motif letters,
27
- leading to drastical change in the activity.
28
- """
29
+ leading to the significant change in the activity.
30
+ Utility can simulate multiple experimental assays measuring activity, with different scales and noise levels."""
29
31
 
30
32
  import random
31
33
  import argparse
32
34
  import sys
35
+ from collections import namedtuple
33
36
  from enum import Enum
34
37
 
35
- from typing import List, Tuple, Dict, Iterator, Any
36
-
37
-
38
- # --- Type definitions ---
38
+ from typing import List, Tuple, NamedTuple, Dict, Set, Any
39
39
 
40
+ # ===== Type definitions =====
40
41
Letter = str
Alphabet = List[Letter]
LetterChoice = List[Letter]
MotifTemplate = List[LetterChoice]

# A sequence is a list of monomers taken from the alphabet.
# A plain string can't be used because a monomer name may consist of several letters
Sequence = List[Letter]
SequenceList = List[Sequence]
SequenceSquashed = str  # Sequence, joined together in string form

# A cliff is a pair of indices of two sequences inside one cluster
CliffPair = Tuple[int, int]
CliffList = List[CliffPair]

Activity = float
ActivityList = List[Activity]

ClusterParameters = NamedTuple(
    "ClusterParameters",
    [
        ("motif_length", int),
        ("max_variants_per_position", int),
        ("random_length", int),
        ("dispersion", int),
    ],
)
# Typed like the neighbouring parameter tuples; field names and order are unchanged
CliffParameters = NamedTuple(
    "CliffParameters",
    [
        ("cliff_probability", float),
        ("cliff_strength", float),
        ("cliff_strength_dispersion", float),
    ],
)
AssayParameters = NamedTuple(
    "AssayParameters", [("noise_level", float), ("min", float), ("max", float)]
)

# Contains strings and 1+ number of floats - can't type more exactly
DataLine = Tuple[Any, ...]
53
78
 
79
+ # ===== Constants =====
80
class OutputFormat(Enum):
    """Textual representation used for the generated sequences."""

    Fasta = 1
    Helm = 2


class HelmConnectionMode(Enum):
    """How the peptide is connected in HELM output."""

    linear = 1
    cyclic = 2
    mixed = 3
55
82
 
56
- alphabets: Dict[str, str] = {
57
- "PT": "A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y",
58
- "DNA": "A,T,G,C",
59
- "RNA": "A,U,G,C",
60
- "PT_HELM": "A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,dA,dC,dD,dE,dF,dH,dI,dK,dL,dM,dN,dP,dQ,dR,dS,dT,dV,dW,dY,meA,meD,meS,meT,meV,meY,meE,meG,meI,meK,meM,meN,meQ,meC,meR,meW,meF,meH,meL,Nle,Nva,Orn,Iva,aIle,gGlu,Hcy,Hse,Hyp,D-gGlu,D-Nle,D-hPhe,D-Hyp,D-Nva,D-Orn,Pyr,Phe_3Cl,Phe_4Cl,Phe_4NH2,Phg,Ser_tBu,Tyr_Bn,Tza,1Nal,Cha,Lys_Boc,aThr,D-2Nal,D-2Thi,D-aHyp,D-aIle,D-Phg,D-Ser_tBu,Cya,Lys_Me3,Pen,Phe_4Me,Ser_Bn,Tyr_tBu,2Nal,Thi,aHyp,Ala_tBu,hPhe,D-1Nal,D-aThr,D-Cha,D-Pen,D-Phe_4Cl,D-Ser_Bn,Wil,Oic_3aS-7aS,Pip,3Pal,4Pal,Abu,Apm,Chg,Dab,Dap,D-3Pal,D-aMeAbu,D-Chg,D-Cit,D-Dab,D-Pip,D-Tic,Aca,Tic,Aad,Cit,Aze,Ac5c,Aib,D-2Pal,D-Abu,D-Dap,Asu,D-Thz,D-Trp_For,D-Tyr_Et,Lys_Ac,Asp_OMe,Phe_ab-dehydro,Sta_3xi4xi,Tyr_ab-dehydroMe,App,Cap,Cys_SEt,Dsu,pnC,pnG,Pqa,Pro_4Me3OH,Met_O2,Phe_2Me,Phe_34diCl,Phe_4Br,Phe_4I,Phe_4Sdihydroorotamido,Pyl,Ser_PO3H2,Thr_PO3H2,Thz,Trp_Me,Tyr_26diMe,Tyr_3I,Tyr_3NO2,Tyr_Ph4OH,Tyr_SO3H,Val_3OH,xiIle,NMe2Abz,NMebAla,aMePhe,aMePro,aMeTyr_3OH,Bmt,Bmt_E,Cys_Bn,Gla,hHis,His_1Me,Gly_allyl,Gly_cPr,Asp_Ph2NH2,Azi,2Abz,3Abz,4Abz,Ac3c,Ac6c,bAla,D-Bmt,D-Bmt_E,D-hArg,D-Phe_4F,D-Trp_2Me,D-Tyr_Me,D-xiIle,Lys_iPr,Phe_ab-dehydro_3NO2,Sta_3S4S,Bux,Dpm,pnA,pnT,seC,Met_O,nTyr,Oic_3aR-7aS,Oic_3axi-7axi,Phe_2F,Phe_3F,Phe_4F,Phe_4NO2,Phe_bbdiMe,Trp_5OH,Trp_Ome,Tyr_35diI,Tyr_3OH,Tyr_Me,Tyr_PO3H2,xiHyp,xiThr,NMe4Abz,aMeTyr,Aoda,Bpa,Cys_Me,Dip,hArg,His_1Bn,His_3Me,Hyl_5xi,Bip,Abu_23dehydro,D-Dip,Dha,D-hArg_Et2,D-Met_S-O,D-His_1Bn,D-nTyr,D-Phe_4ureido",
83
+ alphabets: Dict[str, Alphabet] = {
84
+ "Protein": "A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y".split(","),
85
+ "DNA": "A,T,G,C".split(","),
86
+ "RNA": "A,U,G,C".split(","),
87
+ "Protein_EXT": "A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,dA,dC,dD,dE,dF,dH,dI,dK,dL,dM,dN,dP,dQ,dR,dS,dT,dV,dW,dY,meA,meD,meS,meT,meV,meY,meE,meG,meI,meK,meM,meN,meQ,meC,meR,meW,meF,meH,meL,Nle,Nva,Orn,Iva,aIle,gGlu,Hcy,Hse,Hyp,D-gGlu,D-Nle,D-hPhe,D-Hyp,D-Nva,D-Orn,Pyr,Phe_3Cl,Phe_4Cl,Phe_4NH2,Phg,Ser_tBu,Tyr_Bn,Tza,1Nal,Cha,Lys_Boc,aThr,D-2Nal,D-2Thi,D-aHyp,D-aIle,D-Phg,D-Ser_tBu,Cya,Lys_Me3,Pen,Phe_4Me,Ser_Bn,Tyr_tBu,2Nal,Thi,aHyp,Ala_tBu,hPhe,D-1Nal,D-aThr,D-Cha,D-Pen,D-Phe_4Cl,D-Ser_Bn,Wil,Oic_3aS-7aS,Pip,3Pal,4Pal,Abu,Apm,Chg,Dab,Dap,D-3Pal,D-aMeAbu,D-Chg,D-Cit,D-Dab,D-Pip,D-Tic,Aca,Tic,Aad,Cit,Aze,Ac5c,Aib,D-2Pal,D-Abu,D-Dap,Asu,D-Thz,D-Trp_For,D-Tyr_Et,Lys_Ac,Asp_OMe,Phe_ab-dehydro,Sta_3xi4xi,Tyr_ab-dehydroMe,App,Cap,Cys_SEt,Dsu,pnC,pnG,Pqa,Pro_4Me3OH,Met_O2,Phe_2Me,Phe_34diCl,Phe_4Br,Phe_4I,Phe_4Sdihydroorotamido,Pyl,Ser_PO3H2,Thr_PO3H2,Thz,Trp_Me,Tyr_26diMe,Tyr_3I,Tyr_3NO2,Tyr_Ph4OH,Tyr_SO3H,Val_3OH,xiIle,NMe2Abz,NMebAla,aMePhe,aMePro,aMeTyr_3OH,Bmt,Bmt_E,Cys_Bn,Gla,hHis,His_1Me,Gly_allyl,Gly_cPr,Asp_Ph2NH2,Azi,2Abz,3Abz,4Abz,Ac3c,Ac6c,bAla,D-Bmt,D-Bmt_E,D-hArg,D-Phe_4F,D-Trp_2Me,D-Tyr_Me,D-xiIle,Lys_iPr,Phe_ab-dehydro_3NO2,Sta_3S4S,Bux,Dpm,pnA,pnT,seC,Met_O,nTyr,Oic_3aR-7aS,Oic_3axi-7axi,Phe_2F,Phe_3F,Phe_4F,Phe_4NO2,Phe_bbdiMe,Trp_5OH,Trp_Ome,Tyr_35diI,Tyr_3OH,Tyr_Me,Tyr_PO3H2,xiHyp,xiThr,NMe4Abz,aMeTyr,Aoda,Bpa,Cys_Me,Dip,hArg,His_1Bn,His_3Me,Hyl_5xi,Bip,Abu_23dehydro,D-Dip,Dha,D-hArg_Et2,D-Met_S-O,D-His_1Bn,D-nTyr,D-Phe_4ureido".split(
88
+ ","
89
+ ),
61
90
  }
62
91
 
92
+ # ===== Motif and sequence generation functions =====
93
+
94
+
95
+ def alphabet_from_helm(helm_library_file: str) -> Alphabet:
96
+ """
97
+ Reads the HELM library from a JSON file and extracts only backbone monomers suitable for sequence generation
98
+ """
99
+ import json
100
+
101
+ def is_monomer_suitable(monomer: Any) -> bool:
102
+ return (
103
+ monomer["polymerType"] == "PEPTIDE"
104
+ and monomer["monomerType"] == "Backbone"
105
+ and len(monomer["rgroups"]) == 2
106
+ )
107
+
108
+ alphabet: Alphabet = []
109
+ with open(helm_library_file) as helm_library:
110
+ for monomer in json.load(helm_library):
111
+ if is_monomer_suitable(monomer):
112
+ alphabet.append(monomer["symbol"])
113
+ return alphabet
114
+
63
115
 
64
116
def mean_range(mean: int, disp: int) -> int:
    """
    Returns a random non-negative integer around some mean with selected dispersion.

    The value is drawn uniformly from [mean - |disp|, mean + |disp|] with the lower
    bound clipped at zero. If the whole interval lies below zero, 0 is returned.
    """
    spread = abs(disp)
    lower = max(mean - spread, 0)
    # Keep the interval non-empty: randint raises ValueError when upper < lower
    upper = max(mean + spread, lower)
    return random.randint(lower, upper)
66
121
 
67
122
 
@@ -69,8 +124,11 @@ def generate_motif_template(
69
124
  motif_length: int,
70
125
  alphabet: Alphabet,
71
126
  max_variants_cluster: int,
72
- prob_any: float = 0.2,
127
+ prob_any: float = 0.2, # The probability to have a non-conservative letter (the `?` sign in notation) inside motif
73
128
  ) -> MotifTemplate:
129
+ """
130
+ Generated random template from the alphabet
131
+ """
74
132
  motif_template = []
75
133
  for position in range(motif_length):
76
134
  # Selecting letters for position i
@@ -84,32 +142,44 @@ def generate_motif_template(
84
142
 
85
143
 
86
144
def generate_motif(template: "MotifTemplate", alphabet: "Alphabet") -> "Sequence":
    """
    Generate sequence motif by motif template.

    One letter is chosen at random for every template position. A position whose
    letter list contains "?" is non-conservative: its letter is taken from the
    whole alphabet instead of the listed letters.
    """
    return [
        random.choice(alphabet if "?" in letters else letters) for letters in template
    ]
91
152
 
92
153
 
93
- def motif_notation(motif_template: MotifTemplate) -> str:
154
def motif_notation(motif_template: "MotifTemplate", fasta_separator: str = "") -> str:
    """
    Returns string representation of motif template.

    A position with a single letter is written as that letter; a position with
    several letters is written as the letters in square brackets. fasta_separator
    is put between positions and between the letters inside brackets, so the
    result has no trailing separator.
    """

    def motif_notation_code(letter_choice: "LetterChoice") -> str:
        if len(letter_choice) == 1:
            return letter_choice[0]
        return f"[{fasta_separator.join(letter_choice)}]"

    return fasta_separator.join(
        motif_notation_code(letter_choice) for letter_choice in motif_template
    )
103
168
 
104
169
 
105
- def generate_random(n: int, alphabet: Alphabet) -> Sequence:
170
def generate_random_sequence(n: int, alphabet: Alphabet) -> Sequence:
    """
    Build a sequence of n letters, each drawn independently at random from the alphabet
    """
    letters = []
    for _ in range(n):
        letters.append(random.choice(alphabet))
    return letters
107
175
 
108
176
 
109
- def make_cliff(
177
+ def make_motif_cliff(
110
178
  motif_template: MotifTemplate, alphabet: Alphabet, motif: Sequence
111
179
  ) -> Sequence:
112
- # Mutate conservative letter in motif
180
+ """
181
+ Mutates a random conservative letter in the motif
182
+ """
113
183
  motif_len = len(motif_template)
114
184
  pos = random.randrange(motif_len)
115
185
  while "?" in motif_template[pos]:
@@ -127,17 +197,61 @@ def make_cliff(
127
197
  )
128
198
 
129
199
 
200
def generate_cluster_sequences(
    n_sequences: int,
    motif_template: MotifTemplate,
    prefix_length: int,
    suffix_length: int,
    alphabet: Alphabet,
    cliff_probability: float,
) -> Tuple[SequenceList, CliffList]:
    """
    Returns the sequences for one cluster together with the sequence cliffs in them.

    Each regular sequence is a random prefix + a motif generated from motif_template
    + a random suffix. With probability cliff_probability a regular sequence is
    directly followed by its cliff partner, which has the same prefix and suffix and
    a motif changed by make_motif_cliff. Exactly n_sequences sequences are produced.

    Only sequences are handled here; activities for the cliff pairs are not touched.

    Returns a tuple (sequences, cliffs) where every cliff is a pair of indices
    (i, i + 1) into sequences.
    """
    sequences: SequenceList = []
    cliffs: CliffList = []

    while len(sequences) < n_sequences:
        motif = generate_motif(motif_template, alphabet)
        prefix = generate_random_sequence(prefix_length, alphabet)
        suffix = generate_random_sequence(suffix_length, alphabet)
        sequences.append(prefix + motif + suffix)

        if len(sequences) >= n_sequences:
            break  # This is the last sequence - no room left for a cliff partner

        # Strict comparison: random.random() may return exactly 0.0, so `<=` could
        # still create a cliff when cliff_probability is 0
        if random.random() < cliff_probability:
            cliff_motif = make_motif_cliff(motif_template, alphabet, motif)
            sequences.append(prefix + cliff_motif + suffix)
            partner = len(sequences) - 1
            cliffs.append((partner - 1, partner))
    return sequences, cliffs
235
+
236
+
130
237
def sequence_to_fasta(sequence: Sequence, separator: str) -> SequenceSquashed:
    """
    Join the monomers of the sequence into one FASTA string, with separator between them
    """
    squashed = separator.join(sequence)
    return squashed
132
242
 
133
243
 
134
244
  def sequence_to_helm(
135
- sequence: Sequence, helm_connection_mode: str = HelmConnectionMode.linear.name
245
+ sequence: Sequence,
246
+ helm_connection_mode: HelmConnectionMode = HelmConnectionMode.linear,
136
247
  ) -> SequenceSquashed:
137
- def is_cyclic(helm_connection_mode: str) -> bool:
138
- return helm_connection_mode == HelmConnectionMode.cyclic.name or (
139
- helm_connection_mode == HelmConnectionMode.mixed.name
140
- and random.random() < 0.5
248
+ """
249
+ Converts the sequence to HELM format
250
+ """
251
+
252
+ def is_cyclic(helm_connection_mode: HelmConnectionMode) -> bool:
253
+ return helm_connection_mode == HelmConnectionMode.cyclic or (
254
+ helm_connection_mode == HelmConnectionMode.mixed and random.random() < 0.5
141
255
  )
142
256
 
143
257
  sequence_escaped: Sequence = [
@@ -149,135 +263,211 @@ def sequence_to_helm(
149
263
  return f"PEPTIDE1{{{sequence_to_fasta(sequence_escaped,'.')}}}${connection_format}$$$V2.0"
150
264
 
151
265
 
152
- def generate_cluster(
153
- n_sequences: int,
154
- motif_length: int,
155
- prefix_length: int,
156
- suffix_length: int,
157
- max_variants_per_position: int,
158
- make_cliffs: bool,
159
- alphabet: Alphabet,
160
- cliff_probability: float,
161
- cliff_strength: float,
162
- ) -> Iterator[SequenceRecord]:
163
- # Making a motif template
164
- motif_template = generate_motif_template(
165
- motif_length, alphabet, max_variants_per_position
166
- )
167
- # Setting average and dispersion for activity
168
- activity_average = random.random() * 10
169
- activity_dispersion = random.random()
170
- sys.stderr.write(f"Motif template: {motif_notation(motif_template)}\n")
171
-
172
- for n_seq in range(n_sequences):
173
- activity = random.gauss(activity_average, activity_dispersion)
174
-
175
- motif = generate_motif(motif_template, alphabet)
176
- prefix = generate_random(prefix_length, alphabet)
177
- suffix = generate_random(suffix_length, alphabet)
178
- seq = prefix + motif + suffix
179
- sequence_record: SequenceRecord = (n_seq, seq, activity, False)
180
- yield sequence_record
181
-
182
- is_cliff = make_cliffs and (random.random() <= cliff_probability)
183
- if is_cliff:
184
- # Making activity cliff
185
- cliff_motif = make_cliff(motif_template, alphabet, motif)
186
- cliff_seq = prefix + cliff_motif + suffix
187
- # Recalculating activity
188
- cliff_disp = activity_dispersion * cliff_strength * (0.5 + random.random())
189
- activity = activity_average - cliff_disp
190
- cliff_activity = activity_average + cliff_disp
191
-
192
- # sys.stderr.write(f"Cliff for sequence #{line_number:4}, cluster {n_cluster} \n")
193
- # sys.stderr.write(f"{activity_average}\t{motif}\t{activity}\n")
194
- # sys.stderr.write(f"{activity_average}\t{cliff_motif}\t{cliff_activity}\n")
195
- n_seq += 1
196
- sequence_record = (n_seq, cliff_seq, cliff_activity, is_cliff)
197
- yield sequence_record
266
+ # ===== Activity generation functions =====
267
def generate_ideal_activities(n: int, activity_range: float = 0) -> ActivityList:
    """
    Produce n Gauss-distributed "ideal" activities with unit sigma.
    The distribution center is 0 unless activity_range is positive; in that case
    the center is picked uniformly from [-activity_range, activity_range]
    """
    if activity_range > 0:
        center = random.uniform(-activity_range, activity_range)
    else:
        center = 0

    ideal = []
    for _ in range(n):
        ideal.append(random.gauss(center, 1))
    return ideal
198
274
 
199
275
 
200
- def generate_sequences(
276
def make_activity_cliff(
    activities: "ActivityList",
    cliffs: "List[CliffPair]",
    cliff_strength: float,
    cliff_strength_dispersion: float,
) -> "ActivityList":
    """
    Introduce activity cliffs.

    For every (first, second) pair in cliffs the two activities are moved apart
    around their common average so that they differ by a random Gauss-distributed
    value defined by cliff_strength and cliff_strength_dispersion. The input list
    is not modified; a changed copy is returned.
    """
    cliff_activities = activities[:]
    for first, second in cliffs:
        activity1 = activities[first]
        activity2 = activities[second]
        average = (activity1 + activity2) / 2
        strength = random.gauss(cliff_strength, cliff_strength_dispersion)
        difference = abs(activity1 - activity2)
        if difference == 0:
            # Equal activities give no direction to stretch along (and would divide
            # by zero below), so spread them symmetrically around the average
            cliff_activities[first] = average - strength / 2
            cliff_activities[second] = average + strength / 2
            continue
        scale = strength / difference
        cliff_activities[first] = average + (activity1 - average) * scale
        cliff_activities[second] = average + (activity2 - average) * scale
    return cliff_activities
297
+
298
+
299
def generate_assay_activities(
    activities: ActivityList,
    assay: AssayParameters,
    disable_negatives: bool = True,
) -> ActivityList:
    """
    Emulate the measurement of "ideal" activities in one assay.
    Every activity gets uniform noise scaled by the assay noise level, is normalized
    and then mapped onto the assay scale [assay.min, assay.max].
    With disable_negatives set, negative measurements are replaced by zero
    """
    # real activity 3-sigma in the interval [-scale_factor,+scale_factor]
    scale_factor = 3 * (1 + assay.noise_level)

    def measure(activity: float) -> float:
        # some random noise in [-3,3] - 3 sigma for ideal activity
        noise = random.uniform(-3, 3)
        noised_activity = activity + noise * assay.noise_level
        # rescaling activity to the interval [0;1]
        rescaled_activity = (noised_activity / (scale_factor * 2)) + 0.5
        assay_result = assay.min + (rescaled_activity * (assay.max - assay.min))
        return 0 if disable_negatives and assay_result < 0 else assay_result

    return [measure(activity) for activity in activities]
329
+
330
+
331
def generate_data(
    n_clusters: int,
    n_sequences: int,
    cluster_parameters: ClusterParameters,
    assays: List[AssayParameters],
    disable_negatives: bool,
    alphabet: Alphabet,
    output_format: OutputFormat,
    fasta_separator: str,
    helm_connection_mode: HelmConnectionMode,
    activity_range: float,
    cliff_probability: float = 0.05,
    cliff_strength: float = 5.0,
    cliff_dispersion: float = 1.0,
) -> Tuple[List[str], List[DataLine]]:
    """
    Main function generating all data set - sequences, activities, etc.

    For each of n_clusters clusters it picks the motif/prefix/suffix lengths,
    generates a motif template (reported on stderr), n_sequences sequences with
    sequence cliffs, converts them to output_format, generates activities with
    activity cliffs and derives one measurement column per assay.

    Returns (headers, data): headers are "cluster", "sequence_id", "sequence",
    "is_cliff" followed by "Assay_1".."Assay_N"; each data line holds the values
    in the same order.

    Terminates the program with exit code -1 if output_format is not supported.
    """
    headers: List[str] = ["cluster", "sequence_id", "sequence", "is_cliff"]
    headers += [f"Assay_{i+1}" for i in range(len(assays))]
    data: List[DataLine] = []

    for n_cluster in range(n_clusters):
        motif_length = mean_range(
            cluster_parameters.motif_length, cluster_parameters.dispersion
        )
        total_length = (
            mean_range(
                cluster_parameters.random_length * 2, cluster_parameters.dispersion
            )
            + motif_length
        )
        prefix_length = mean_range(
            cluster_parameters.random_length, cluster_parameters.dispersion // 2
        )
        suffix_length = total_length - motif_length - prefix_length

        # Making a motif template
        motif_template = generate_motif_template(
            motif_length, alphabet, cluster_parameters.max_variants_per_position
        )
        sys.stderr.write(
            f"Motif template for cluster {n_cluster}: {motif_notation(motif_template, fasta_separator)}\n"
        )

        sequences, cliffs = generate_cluster_sequences(
            n_sequences,
            motif_template,
            prefix_length,
            suffix_length,
            alphabet,
            cliff_probability,
        )

        if output_format == OutputFormat.Fasta:
            squashed_sequences = [
                sequence_to_fasta(seq, fasta_separator) for seq in sequences
            ]
        elif output_format == OutputFormat.Helm:
            squashed_sequences = [
                sequence_to_helm(seq, helm_connection_mode) for seq in sequences
            ]
        else:
            # Diagnostics go to stderr like the motif report above; sys.exit is used
            # because the bare exit() helper is only installed by the site module
            sys.stderr.write("Unsupported output format\n")
            sys.exit(-1)

        ideal_activities = generate_ideal_activities(n_sequences, activity_range)
        cliffed_activities = make_activity_cliff(
            ideal_activities, cliffs, cliff_strength, cliff_dispersion
        )

        # One list of measurements per assay, all derived from the same activities
        assay_activities = [
            generate_assay_activities(cliffed_activities, assay, disable_negatives)
            for assay in assays
        ]

        # A sequence is marked as a cliff if it takes part in any cliff pair
        cliffs_positions: Set[int] = {pos for cliff in cliffs for pos in cliff}
        is_cliffs = [pos in cliffs_positions for pos in range(n_sequences)]
        sequence_IDs = [f"c{n_cluster}_s{n:03d}" for n in range(n_sequences)]

        cluster_data = zip(
            [n_cluster] * n_sequences,
            sequence_IDs,
            squashed_sequences,
            is_cliffs,
            *assay_activities,
        )

        data.extend(cluster_data)

    return headers, data
429
+
430
+
431
def repack_assays(noise_levels_str: str, scales_str: str) -> List[AssayParameters]:
    """
    Converts strings passed from the input data to the list of AssayParameters namedtuples.

    noise_levels_str is a comma-separated list of numbers. scales_str is a
    comma-separated list of "min|max" pairs, each optionally wrapped in brackets,
    e.g. "(0|10), (0|150.0)". Both lists must have the same number of entries.

    Raises ValueError if a number can't be parsed or a scale is not a "min|max" pair.
    Terminates the program with exit code -1 if the lists differ in length.
    """
    noise_levels = [float(s) for s in noise_levels_str.split(",")]

    minmaxes = []
    for scale in scales_str.split(","):
        bounds = scale.strip().strip("()").split("|")
        if len(bounds) != 2:
            raise ValueError(
                f"Assay scale must look like 'min|max', got: {scale.strip()!r}"
            )
        # float() tolerates the whitespace left around the numbers
        minmaxes.append((float(bounds[0]), float(bounds[1])))

    if len(noise_levels) != len(minmaxes):
        sys.stderr.write("Not equal range of parameters for assay definition\n")
        sys.exit(-1)

    return [
        AssayParameters(noise, scale_min, scale_max)
        for noise, (scale_min, scale_max) in zip(noise_levels, minmaxes)
    ]
262
446
 
263
447
 
264
- def is_monomer_suitable(monomer: Any) -> bool:
265
- return (
266
- monomer["polymerType"] == "PEPTIDE"
267
- and monomer["monomerType"] == "Backbone"
268
- and len(monomer["rgroups"]) == 2
448
+ # ===== Tests =====
449
+
450
+
451
def test_activities_correlation() -> None:
    """
    Check that two assays measuring the same activities stay correlated.
    Activities with one cliff are measured in two assays with different noise
    levels and scales; the correlation coefficient must be at least 0.5.
    No random seed is set here, so the values differ from run to run
    """
    import numpy as np

    ideal = generate_ideal_activities(25, 0.1)
    with_cliff = make_activity_cliff(
        ideal, [(0, 1)], cliff_strength=5.0, cliff_strength_dispersion=1.0
    )
    first_assay = generate_assay_activities(with_cliff, AssayParameters(0.3, 0, 10))
    second_assay = generate_assay_activities(with_cliff, AssayParameters(0.5, 0, 250))

    print("Assay1: " + ",".join(map(str, first_assay)))
    print("Assay2: " + ",".join(map(str, second_assay)))
    correlation = np.corrcoef(first_assay, second_assay)[1, 0]
    print("Correlation: ", correlation)
    assert correlation >= 0.5
271
468
 
272
- def alphabet_from_helm(helm_library_file: str) -> Alphabet:
273
- import json
274
469
 
275
- alphabet: Alphabet = []
276
- with open(helm_library_file) as helm_library:
277
- for monomer in json.load(helm_library):
278
- if is_monomer_suitable(monomer):
279
- alphabet.append(monomer["symbol"])
280
- return alphabet
470
+ # ===== Command-line arguments parsing =====
281
471
 
282
472
 
283
473
  def parse_command_line_args() -> Any:
@@ -285,30 +475,52 @@ def parse_command_line_args() -> Any:
285
475
  prog="MotifSequencesGenerator",
286
476
  description=description,
287
477
  epilog="Utility author and support: Gennadii Zakharov <Gennadiy.Zakharov@gmail.com>",
478
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
288
479
  )
289
480
 
290
- parser.add_argument(
481
+ cluster_group = parser.add_argument_group("Cluster parameters")
482
+
483
+ cluster_group.add_argument(
291
484
  "-c", "--clusters", type=int, default=5, help="Number of clusters"
292
485
  )
293
- parser.add_argument(
486
+ cluster_group.add_argument(
294
487
  "-s",
295
488
  "--sequences",
296
489
  type=int,
297
490
  default=50,
298
491
  help="Number of sequences in each supercluster",
299
492
  )
300
- parser.add_argument(
493
+
494
+ available_alphabets = ",".join(list(alphabets.keys()))
495
+ cluster_group.add_argument(
496
+ "--alphabet",
497
+ type=str,
498
+ default=list(alphabets.keys())[0],
499
+ help=f"Sequence alphabet: {available_alphabets}.\n"
500
+ + "Ignored if the HELM library is specified",
501
+ )
502
+
503
+ motif_group = parser.add_argument_group("Motif parameters")
504
+
505
+ motif_group.add_argument(
301
506
  "-m,", "--motif-length", type=int, default=12, help="Average length of motif"
302
507
  )
303
508
 
304
- parser.add_argument(
509
+ motif_group.add_argument(
510
+ "--max-variants-position",
511
+ type=int,
512
+ default=3,
513
+ help="Maximum number of different letters in a conservative position of the motif",
514
+ )
515
+
516
+ motif_group.add_argument(
305
517
  "-r,",
306
518
  "--random-length",
307
519
  type=int,
308
520
  default=3,
309
521
  help="Average length of random sequence parts before and after motif",
310
522
  )
311
- parser.add_argument(
523
+ motif_group.add_argument(
312
524
  "-d,",
313
525
  "--dispersion",
314
526
  type=int,
@@ -316,124 +528,171 @@ def parse_command_line_args() -> Any:
316
528
  help="Variation of total sequence length",
317
529
  )
318
530
 
319
- parser.add_argument(
320
- "-h,",
321
- "--helm-library-file",
322
- type=str,
323
- help="JSON file containing the HELM monomer library in the same format as used for Datagrok. "
324
- + "The alphabet property is ignored when helm library is specified.",
325
- )
531
+ cliffs_group = parser.add_argument_group("Activity parameters")
326
532
 
327
- parser.add_argument(
328
- "--helm-connection-mode",
329
- type=str,
330
- default=HelmConnectionMode.linear.value,
331
- help=f"HELM peptide generation mode: {'/'.join([mode.name for mode in HelmConnectionMode])}",
533
+ cliffs_group.add_argument(
534
+ "--activity-range",
535
+ type=float,
536
+ default=0.5,
537
+ help="Range of the mean activity value difference between clusters",
332
538
  )
333
539
 
334
- available_alphabets = ",".join(list(alphabets.keys()) + ["custom"])
335
- parser.add_argument(
336
- "--alphabet",
337
- type=str,
338
- default=list(alphabets.keys())[0],
339
- help=f"Sequence alphabet: {available_alphabets}. Custom alphabet is a list of values separated "
340
- f"by comma",
341
- )
342
- parser.add_argument(
343
- "--max-variants-position",
344
- type=int,
345
- default=3,
346
- help="Maximum number of different letters in conservative position in motif",
347
- )
348
- parser.add_argument(
540
+ cliffs_group.add_argument(
349
541
  "--cliff-probability",
350
542
  type=float,
351
- default=0.01,
543
+ default=0.05,
352
544
  help="Probability to make activity cliff of a sequence",
353
545
  )
354
- parser.add_argument(
546
+ cliffs_group.add_argument(
355
547
  "--cliff-strength",
356
548
  type=float,
357
- default=4.0,
358
- help="Strength of cliff",
549
+ default=5.0,
550
+ help="Average strength of cliff",
359
551
  )
360
- parser.add_argument(
361
- "--disable-cliffs",
552
+
553
+ cliffs_group.add_argument(
554
+ "--cliff-strength-dispersion",
555
+ type=float,
556
+ default=1.0,
557
+ help="Cliff strength dispersion",
558
+ )
559
+
560
# Assay options. This run of statements belongs to the body of
# parse_command_line_args and relies on its local `parser`.
assay_group = parser.add_argument_group("Assay parameters")

assay_group.add_argument(
    "--assay-noise-levels",
    type=str,
    default="0.4, 0.85",
    help="Noise level(s) for assays. A list of values separated by comma.",
)

assay_group.add_argument(
    "--assay-scales",
    type=str,
    default="(0|10), (0|150.0)",
    # The sentences are joined with explicit trailing spaces; previously
    # "optional." and "Activity" were glued together in the help output.
    help="Typical scale size for each assay. Assays are separated by comma. "
    + "Minimum and maximum values are separated by pipe. Brackets are optional. "
    + "Activity outliers may be located outside the specified scale",
)


def _parse_bool_flag(value: str) -> bool:
    """Convert a command-line token to a real boolean.

    `type=bool` must not be used with argparse: it calls bool() on the raw
    string, so any non-empty text (including "False") becomes True.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised
            true/false spelling.
    """
    normalized = value.strip().lower()
    if normalized in ("1", "true", "t", "yes", "y", "on"):
        return True
    if normalized in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


assay_group.add_argument(
    "--enable-negatives",
    # A bare "--enable-negatives" switches the option on (const); the older
    # "--enable-negatives <value>" form is still accepted and now honours
    # false-like values instead of treating every non-empty string as True.
    nargs="?",
    const=True,
    default=False,
    type=_parse_bool_flag,
    help="Enable negative values for assays results",
)
582
+
583
+ output_group = parser.add_argument_group("Output parameters")
584
+
585
+ output_group.add_argument(
586
+ "--custom-alphabet",
587
+ type=str,
588
+ default="",
589
+ help=f"Custom sequence alphabet: list of letters separated by comma. Used only if the --alphabet=custom",
365
590
  )
366
- parser.add_argument(
591
+
592
+ output_group.add_argument(
367
593
  "--fasta-separator",
368
594
  type=str,
369
595
  default="",
370
- help="Separator symbol for FASTA sequence",
596
+ help="Monomers separator for FASTA format",
597
+ )
598
+
599
+ output_group.add_argument(
600
+ "-H,",
601
+ "--helm-library-file",
602
+ type=str,
603
+ help="JSON file containing the HELM monomer library. "
604
+ + "The alphabet property is ignored when helm library is specified.",
371
605
  )
606
+
607
+ output_group.add_argument(
608
+ "--helm-connection-mode",
609
+ type=str,
610
+ default=HelmConnectionMode.linear.name,
611
+ help=f"HELM peptide generation mode: {'/'.join([mode.name for mode in HelmConnectionMode])}",
612
+ )
613
+
372
614
  command_line_args = parser.parse_args()
373
615
 
374
616
  return command_line_args
375
617
 
376
618
 
377
- # ====================================================================================
378
-
379
- grok = "clusters" in globals()
380
-
381
- if not grok:
382
- # We are not in Datagrok - need to parse command line arguments
383
- args = parse_command_line_args()
384
- clusters = args.clusters
385
- num_sequences = args.sequences
386
- motif_length = args.motif_length
387
- max_variants_position = args.max_variants_position
388
- random_length = args.random_length
389
- dispersion = args.dispersion
390
- alphabet_key = args.alphabet
391
- enable_cliffs = not args.disable_cliffs
392
- cliff_probability = args.cliff_probability
393
- cliff_strength = args.cliff_strength
394
- fasta_separator = args.fasta_separator
395
- helm_library_file = args.helm_library_file
396
- helm_connection_mode = args.helm_connection_mode
397
-
398
- helm_init = "helm_library_file" in globals() and helm_library_file is not None and helm_library_file != ''
399
-
400
- if not helm_init:
401
- alphabet: Alphabet = (
402
- alphabets[alphabet_key].split(",")
403
- if alphabet_key in alphabets
404
- else alphabet_key.split(",")
619
+ # ===== Main part of script =====
620
+
621
def parse_custom_alphabet(letters: str) -> list:
    """Split a comma-separated custom alphabet into a list of letters.

    Whitespace around each letter is removed and empty items are dropped.

    Args:
        letters: Value of the --custom-alphabet option, e.g. "A, C, G".

    Returns:
        The list of letters in input order; empty if nothing usable was given.
    """
    return [symbol.strip() for symbol in letters.split(",") if symbol.strip()]


if __name__ == "__main__":
    # If `clusters` is already a global, the script inputs are assumed to be
    # predefined by Datagrok; otherwise they come from the command line.
    grok = "clusters" in globals()

    if not grok:
        # We are not in Datagrok - need to parse command line arguments
        args = parse_command_line_args()
        # Cluster parameters
        clusters = args.clusters
        num_sequences = args.sequences
        alphabet_key = args.alphabet
        # Motif parameters
        motif_length = args.motif_length
        max_variants_position = args.max_variants_position
        random_length = args.random_length
        dispersion = args.dispersion
        # Activity parameters
        activity_range = args.activity_range
        cliff_probability = args.cliff_probability
        cliff_strength = args.cliff_strength
        cliff_strength_dispersion = args.cliff_strength_dispersion
        # Assay parameters
        assay_noise_levels = args.assay_noise_levels
        assay_scales = args.assay_scales
        disable_negatives = not args.enable_negatives
        # Output parameters
        custom_alphabet = args.custom_alphabet
        fasta_separator = args.fasta_separator
        helm_library_file = args.helm_library_file
        helm_connection_mode = args.helm_connection_mode

    helm_init = helm_library_file is not None and helm_library_file != ""

    if helm_init:
        # A HELM library overrides the alphabet and the monomer separator.
        alphabet = alphabet_from_helm(helm_library_file)
        output_format = OutputFormat.Helm
        fasta_separator = "|"
    else:
        output_format = OutputFormat.Fasta
        if alphabet_key in alphabets:
            alphabet = alphabets[alphabet_key]
        else:
            # Not a predefined alphabet: use the letters from --custom-alphabet.
            # Read through globals() because the variable may be missing when
            # the inputs were predefined by the host instead of argparse.
            alphabet = parse_custom_alphabet(globals().get("custom_alphabet") or "")
            if not alphabet:
                raise ValueError(
                    f"Unknown alphabet '{alphabet_key}': choose one of "
                    f"{', '.join(alphabets)} or pass a non-empty --custom-alphabet"
                )

    # Packing parameters to structures to simplify function signatures
    cluster_parameters = ClusterParameters(
        motif_length, max_variants_position, random_length, dispersion
    )
    assays = repack_assays(assay_noise_levels, assay_scales)

    # Running sequence generator
    header, data = generate_data(
        clusters,
        num_sequences,
        cluster_parameters,
        assays,
        disable_negatives,
        alphabet,
        output_format,
        fasta_separator,
        HelmConnectionMode[helm_connection_mode],
        activity_range,
        cliff_probability,
        cliff_strength,
        cliff_strength_dispersion,
    )

    if grok:
        # Exporting data to Datagrok as a Pandas dataframe
        import pandas as pd

        sequences_data = pd.DataFrame.from_records(data, columns=header)
    else:
        # Writing results to stdout - no need to work with big and heavy Pandas
        import csv

        csv_writer = csv.writer(sys.stdout, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(header)
        csv_writer.writerows(data)