npm - @datagrok/bio - Versions diffs - 2.11.30 → 2.11.34 - Mend

@datagrok/bio 2.11.30 → 2.11.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53)

package/CHANGELOG.md +19 -0
package/dist/36.js +1 -1
package/dist/36.js.map +1 -1
package/dist/42.js +1 -1
package/dist/42.js.map +1 -1
package/dist/590.js +2 -0
package/dist/590.js.map +1 -0
package/dist/709.js +1 -2
package/dist/709.js.map +1 -1
package/dist/79.js.map +1 -1
package/dist/895.js +3 -0
package/dist/895.js.map +1 -0
package/dist/package-test.js +8 -1
package/dist/package-test.js.LICENSE.txt +1 -0
package/dist/package-test.js.map +1 -1
package/dist/package.js +8 -1
package/dist/package.js.LICENSE.txt +1 -0
package/dist/package.js.map +1 -1
package/files/{data → monomer-libraries}/HELMCoreLibrary.json +594 -594
package/files/tests/libraries/HELMmonomerSchema.json +96 -0
package/package.json +13 -11
package/scripts/sequence_generator.md +48 -0
package/scripts/sequence_generator.py +515 -256
package/src/package-test.ts +4 -0
package/src/package.ts +26 -24
package/src/tests/WebLogo-layout-tests.ts +37 -0
package/src/tests/WebLogo-positions-test.ts +5 -0
package/src/tests/WebLogo-project-tests.ts +63 -0
package/src/tests/activity-cliffs-tests.ts +3 -2
package/src/tests/monomer-libraries-tests.ts +7 -4
package/src/tests/scoring.ts +3 -2
package/src/tests/substructure-filters-tests.ts +3 -2
package/src/tests/to-atomic-level-tests.ts +3 -2
package/src/utils/helm-to-molfile.ts +3 -3
package/src/utils/monomer-lib/lib-manager.ts +116 -0
package/src/utils/monomer-lib/library-file-manager/consts.ts +1 -0
package/src/utils/monomer-lib/library-file-manager/custom-monomer-lib-handlers.ts +80 -0
package/src/utils/monomer-lib/library-file-manager/event-manager.ts +58 -0
package/src/utils/monomer-lib/library-file-manager/file-manager.ts +187 -0
package/src/utils/monomer-lib/library-file-manager/file-validator.ts +56 -0
package/src/utils/monomer-lib/library-file-manager/style.css +8 -0
package/src/utils/monomer-lib/library-file-manager/ui.ts +224 -0
package/src/utils/monomer-lib/monomer-lib.ts +114 -0
package/src/utils/poly-tool/const.ts +28 -0
package/src/utils/poly-tool/monomer-lib-handler.ts +115 -0
package/src/utils/poly-tool/types.ts +6 -0
package/src/utils/poly-tool/ui.ts +2 -2
package/src/viewers/vd-regions-viewer.ts +5 -4
package/src/viewers/web-logo-viewer.ts +6 -5
package/src/widgets/bio-substructure-filter.ts +4 -1
package/files/libraries/HELMCoreLibrary.json +0 -18218
package/src/utils/monomer-lib.ts +0 -305
package/dist/{709.js.LICENSE.txt → 895.js.LICENSE.txt} +0 -0

package/scripts/sequence_generator.py CHANGED

@@ -3,65 +3,120 @@
 # description: Create the model peptides/DNA sequences with peptides data
 # language: python
 # tags: template, demo
-# input: int clusters = 5 { caption: Number of clusters; category: Clusters }
-# input: int num_sequences = 50 { caption: Number of sequences in each cluster; category: Clusters }
-# input: int motif_length = 12 { caption: Average length of motif; category: Motif }
-# input: int max_variants_position = 3 { caption: Maximum number of different letters in conservative position in motif; category: Motif }
-# input: int random_length = 3 { caption: Average length of random sequence parts before and after motif; category: Motif }
-# input: int dispersion = 2 { caption: Variation of total sequence length; category: Motif }
-# input: bool enable_cliffs = true { caption: Enable activity cliffs; category: Activity cliffs }
-# input: double cliff_probability = 0.01 { caption: Probability to make activity cliff of a sequence; category: Activity cliffs; format: 0.000}
-# input: double cliff_strength = 4.0 { caption: Strength of cliff; category: Activity cliffs }
-# input: string alphabet_key = "PT" { caption: Sequence alphabet; category: Output format; hint: PT/DNA/RNA/custom. Custom alphabet is a list of values separated by comma}
-# input: string fasta_separator = "" { caption: Fasta format separator; nullable: true; category: Output format}
-# input: file helm_library_file { caption: HELM library to produce HELM output; nullable: true; category: Output format}
-# input: string helm_connection_mode = "linear" { choices: ["linear", "cyclic", "mixed"]; caption: Peptides connection mode (HELM only); category: Output format}
-# output: dataframe sequences
-description="""The utility generates clusters of macromolecule sequences to test SAR fucntionality.
-Each cluster contains randomly generated sequence motif.
+# input: int clusters = 5 { caption: Clusters; category: Clusters } [Number of clusters]
+# input: int num_sequences = 50 { caption: Sequences; category: Clusters } [Number of sequences in each cluster]
+# input: string alphabet_key = "Protein" {  choices: ["Protein", "DNA", "RNA", "Protein_EXT"]; caption: Alphabet; category: Clusters;} [Sequence alphabet. Ignored if the HELM library is specified.]
+# input: int motif_length = 12 { caption: Motif length; category: Motif } [Average length of motif]
+# input: int max_variants_position = 3 { caption: Position variants; category: Motif } [Maximum number of different letters in a conservative position of the motif]
+# input: int random_length = 3 { caption: Randon length; category: Motif } [Average length of random sequence parts before and after motif]
+# input: int dispersion = 2 { caption: Length variation; category: Motif } [Variation of total sequence length]
+# input: double activity_range = 0.2 { caption: Activity range; category: Activity parameters; format: 0.000} [Range of the mean activity value difference between clusters]
+# input: double cliff_probability = 0.05 { caption: Cliff probability; category: Activity parameters; format: 0.000} [Probability to make activity cliff of a sequence]
+# input: double cliff_strength = 5.0 { caption: Cliff strength; category: Activity parameters } [The size of the cliff comparing to the dispersion of the initial activity]
+# input: double cliff_strength_dispersion = 1.0 { caption: Cliff dispersion; category: Activity parameters } [Dispersion of cliff strength]
+# input: string assay_noise_levels = "0.4, 0.85" { caption: Noise levels; category: Assay settings } [List of assay noise levels, separated by comma]
+# input: string assay_scales = "(0|10), (0|150.0)"  { caption: Assay scales; category: Assay settings } [Typical scale size for each assay. Assays are separated by comma. Minimum and maximum values are separated by pipe. Brackets are optional]
+# input: bool disable_negatives = true { caption: Crop negatives; category: Assay settings } [Set negative measurements for assay to zero]
+# input: string fasta_separator = "" { caption: Fasta separator; nullable: true; category: Output format} [Monomers separator for FASTA format]
+# input: file helm_library_file { caption: HELM library; nullable: true; category: Output format} [HELM library to load alphabet. Output format is set to HELM if the HELM library is specified]
+# input: string helm_connection_mode = "linear" { choices: ["linear", "cyclic", "mixed"]; caption: Connection mode; category: Output format} [Peptides connection mode for HELM output)]
+# output: dataframe sequences_data
+description = """The utility generates clusters of macromolecule sequences to test SAR functionality.
+Each cluster contains a randomly generated sequence motif.
 Each sequence has activity - a Gauss-distributed random value.
-All sequences in the cluster has activities from the same distibution.
 The utility can simulate activity cliffs - random changes in the conservative motif letters,
-leading to drastical change in the activity.
-"""
+leading to the significant change in the activity.
+Utility can simulate multiple experimental assays measuring activity, with different scales and noise levels."""
 import random
 import argparse
 import sys
+from collections import namedtuple
 from enum import Enum
-from typing import List, Tuple, Dict, Iterator, Any
-# --- Type definitions ---
+from typing import List, Tuple, NamedTuple, Dict, Set, Any
+# ===== Type definitions =====
 Letter = str
-Alphabet = List[str]
+Alphabet = List[Letter]
 LetterChoice = List[Letter]
 MotifTemplate = List[LetterChoice]
-Sequence = List[Letter]  # The sequence in a form of list
+# The sequence in a list of a monomers from the alphabet.
+# We can't use string because monomers can have several letters
+Sequence = List[Letter]
+SequenceList = List[Sequence]
 SequenceSquashed = str  # Sequence, joined together in string form
-SequenceRecord = Tuple[int, Sequence, float, bool]
-ClusterSequenceRecord = Tuple[int, str, Sequence, float, bool]
+CliffPair = Tuple[int, int]
+CliffList = List[CliffPair]
+Activity = float
+ActivityList = List[Activity]
+ClusterParameters = NamedTuple(
+    "ClusterParameters",
+    [
+        ("motif_length", int),
+        ("max_variants_per_position", int),
+        ("random_length", int),
+        ("dispersion", int),
+    ],
+)
+CliffParameters = namedtuple(
+    "CliffParameters",
+    ["cliff_probability", "cliff_strength", "cliff_strength_dispersion"],
+)
+AssayParameters = NamedTuple(
+    "AssayParameters", [("noise_level", float), ("min", float), ("max", float)]
+)
-# --- constants ---
+DataLine = Tuple[
+    Any, ...
+]  # Contains strings and 1+ number of floats - can't type more exactly
+# ===== Constants =====
+OutputFormat = Enum("OutputFormat", ["Fasta", "Helm"])
 HelmConnectionMode = Enum("HelmConnectionMode", ["linear", "cyclic", "mixed"])
-alphabets: Dict[str, str] = {
-    "PT": "A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y",
-    "DNA": "A,T,G,C",
-    "RNA": "A,U,G,C",
-    "PT_HELM": "A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,dA,dC,dD,dE,dF,dH,dI,dK,dL,dM,dN,dP,dQ,dR,dS,dT,dV,dW,dY,meA,meD,meS,meT,meV,meY,meE,meG,meI,meK,meM,meN,meQ,meC,meR,meW,meF,meH,meL,Nle,Nva,Orn,Iva,aIle,gGlu,Hcy,Hse,Hyp,D-gGlu,D-Nle,D-hPhe,D-Hyp,D-Nva,D-Orn,Pyr,Phe_3Cl,Phe_4Cl,Phe_4NH2,Phg,Ser_tBu,Tyr_Bn,Tza,1Nal,Cha,Lys_Boc,aThr,D-2Nal,D-2Thi,D-aHyp,D-aIle,D-Phg,D-Ser_tBu,Cya,Lys_Me3,Pen,Phe_4Me,Ser_Bn,Tyr_tBu,2Nal,Thi,aHyp,Ala_tBu,hPhe,D-1Nal,D-aThr,D-Cha,D-Pen,D-Phe_4Cl,D-Ser_Bn,Wil,Oic_3aS-7aS,Pip,3Pal,4Pal,Abu,Apm,Chg,Dab,Dap,D-3Pal,D-aMeAbu,D-Chg,D-Cit,D-Dab,D-Pip,D-Tic,Aca,Tic,Aad,Cit,Aze,Ac5c,Aib,D-2Pal,D-Abu,D-Dap,Asu,D-Thz,D-Trp_For,D-Tyr_Et,Lys_Ac,Asp_OMe,Phe_ab-dehydro,Sta_3xi4xi,Tyr_ab-dehydroMe,App,Cap,Cys_SEt,Dsu,pnC,pnG,Pqa,Pro_4Me3OH,Met_O2,Phe_2Me,Phe_34diCl,Phe_4Br,Phe_4I,Phe_4Sdihydroorotamido,Pyl,Ser_PO3H2,Thr_PO3H2,Thz,Trp_Me,Tyr_26diMe,Tyr_3I,Tyr_3NO2,Tyr_Ph4OH,Tyr_SO3H,Val_3OH,xiIle,NMe2Abz,NMebAla,aMePhe,aMePro,aMeTyr_3OH,Bmt,Bmt_E,Cys_Bn,Gla,hHis,His_1Me,Gly_allyl,Gly_cPr,Asp_Ph2NH2,Azi,2Abz,3Abz,4Abz,Ac3c,Ac6c,bAla,D-Bmt,D-Bmt_E,D-hArg,D-Phe_4F,D-Trp_2Me,D-Tyr_Me,D-xiIle,Lys_iPr,Phe_ab-dehydro_3NO2,Sta_3S4S,Bux,Dpm,pnA,pnT,seC,Met_O,nTyr,Oic_3aR-7aS,Oic_3axi-7axi,Phe_2F,Phe_3F,Phe_4F,Phe_4NO2,Phe_bbdiMe,Trp_5OH,Trp_Ome,Tyr_35diI,Tyr_3OH,Tyr_Me,Tyr_PO3H2,xiHyp,xiThr,NMe4Abz,aMeTyr,Aoda,Bpa,Cys_Me,Dip,hArg,His_1Bn,His_3Me,Hyl_5xi,Bip,Abu_23dehydro,D-Dip,Dha,D-hArg_Et2,D-Met_S-O,D-His_1Bn,D-nTyr,D-Phe_4ureido",
+alphabets: Dict[str, Alphabet] = {
+    "Protein": "A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y".split(","),
+    "DNA": "A,T,G,C".split(","),
+    "RNA": "A,U,G,C".split(","),
+    "Protein_EXT": "A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,dA,dC,dD,dE,dF,dH,dI,dK,dL,dM,dN,dP,dQ,dR,dS,dT,dV,dW,dY,meA,meD,meS,meT,meV,meY,meE,meG,meI,meK,meM,meN,meQ,meC,meR,meW,meF,meH,meL,Nle,Nva,Orn,Iva,aIle,gGlu,Hcy,Hse,Hyp,D-gGlu,D-Nle,D-hPhe,D-Hyp,D-Nva,D-Orn,Pyr,Phe_3Cl,Phe_4Cl,Phe_4NH2,Phg,Ser_tBu,Tyr_Bn,Tza,1Nal,Cha,Lys_Boc,aThr,D-2Nal,D-2Thi,D-aHyp,D-aIle,D-Phg,D-Ser_tBu,Cya,Lys_Me3,Pen,Phe_4Me,Ser_Bn,Tyr_tBu,2Nal,Thi,aHyp,Ala_tBu,hPhe,D-1Nal,D-aThr,D-Cha,D-Pen,D-Phe_4Cl,D-Ser_Bn,Wil,Oic_3aS-7aS,Pip,3Pal,4Pal,Abu,Apm,Chg,Dab,Dap,D-3Pal,D-aMeAbu,D-Chg,D-Cit,D-Dab,D-Pip,D-Tic,Aca,Tic,Aad,Cit,Aze,Ac5c,Aib,D-2Pal,D-Abu,D-Dap,Asu,D-Thz,D-Trp_For,D-Tyr_Et,Lys_Ac,Asp_OMe,Phe_ab-dehydro,Sta_3xi4xi,Tyr_ab-dehydroMe,App,Cap,Cys_SEt,Dsu,pnC,pnG,Pqa,Pro_4Me3OH,Met_O2,Phe_2Me,Phe_34diCl,Phe_4Br,Phe_4I,Phe_4Sdihydroorotamido,Pyl,Ser_PO3H2,Thr_PO3H2,Thz,Trp_Me,Tyr_26diMe,Tyr_3I,Tyr_3NO2,Tyr_Ph4OH,Tyr_SO3H,Val_3OH,xiIle,NMe2Abz,NMebAla,aMePhe,aMePro,aMeTyr_3OH,Bmt,Bmt_E,Cys_Bn,Gla,hHis,His_1Me,Gly_allyl,Gly_cPr,Asp_Ph2NH2,Azi,2Abz,3Abz,4Abz,Ac3c,Ac6c,bAla,D-Bmt,D-Bmt_E,D-hArg,D-Phe_4F,D-Trp_2Me,D-Tyr_Me,D-xiIle,Lys_iPr,Phe_ab-dehydro_3NO2,Sta_3S4S,Bux,Dpm,pnA,pnT,seC,Met_O,nTyr,Oic_3aR-7aS,Oic_3axi-7axi,Phe_2F,Phe_3F,Phe_4F,Phe_4NO2,Phe_bbdiMe,Trp_5OH,Trp_Ome,Tyr_35diI,Tyr_3OH,Tyr_Me,Tyr_PO3H2,xiHyp,xiThr,NMe4Abz,aMeTyr,Aoda,Bpa,Cys_Me,Dip,hArg,His_1Bn,His_3Me,Hyl_5xi,Bip,Abu_23dehydro,D-Dip,Dha,D-hArg_Et2,D-Met_S-O,D-His_1Bn,D-nTyr,D-Phe_4ureido".split(
+        ","
+    ),
 }
+# ===== Motif and sequence generation functions =====
+def alphabet_from_helm(helm_library_file: str) -> Alphabet:
+    """
+    Reads the HELM library from a JSON file and extracts only backbone monomers suitable for sequence generation
+    """
+    import json
+    def is_monomer_suitable(monomer: Any) -> bool:
+        return (
+            monomer["polymerType"] == "PEPTIDE"
+            and monomer["monomerType"] == "Backbone"
+            and len(monomer["rgroups"]) == 2
+        )
+    alphabet: Alphabet = []
+    with open(helm_library_file) as helm_library:
+        for monomer in json.load(helm_library):
+            if is_monomer_suitable(monomer):
+                alphabet.append(monomer["symbol"])
+    return alphabet
 def mean_range(mean: int, disp: int) -> int:
+    """
+    Returns random positive value around some mean with selected dispersion
+    """
     return random.randint(max(mean - disp, 0), mean + disp)
@@ -69,8 +124,11 @@ def generate_motif_template(
     motif_length: int,
     alphabet: Alphabet,
     max_variants_cluster: int,
-    prob_any: float = 0.2,
+    prob_any: float = 0.2,  # The probability to have a non-conservative letter (the `?` sign in notation) inside motif
 ) -> MotifTemplate:
+    """
+    Generated random template from the alphabet
+    """
     motif_template = []
     for position in range(motif_length):
         # Selecting letters for position i
@@ -84,32 +142,44 @@ def generate_motif_template(
 def generate_motif(template: MotifTemplate, alphabet: Alphabet) -> Sequence:
+    """
+    Generate sequence motif by motif template
+    """
     template_with_any = [
         (letters if not "?" in letters else alphabet) for letters in template
     ]
     return [random.choice(letters) for letters in template_with_any]
-def motif_notation(motif_template: MotifTemplate) -> str:
+def motif_notation(motif_template: MotifTemplate, fasta_separator: str = "") -> str:
+    """
+    Returns string representation of motif template
+    """
     def motif_notation_code(letter_choice: LetterChoice) -> str:
         if len(letter_choice) == 1:
-            return letter_choice[0]
+            return letter_choice[0] + fasta_separator
         else:
-            return f"[{''.join(letter_choice)}]"
+            return f"[{fasta_separator.join(letter_choice)}]"
     return "".join(
         [motif_notation_code(letter_choice) for letter_choice in motif_template]
     )
-def generate_random(n: int, alphabet: Alphabet) -> Sequence:
+def generate_random_sequence(n: int, alphabet: Alphabet) -> Sequence:
+    """
+    Generate a sequence containing n random letters from the alphabet
+    """
     return [random.choice(alphabet) for i in range(n)]
-def make_cliff(
+def make_motif_cliff(
     motif_template: MotifTemplate, alphabet: Alphabet, motif: Sequence
 ) -> Sequence:
-    # Mutate conservative letter in motif
+    """
+    Mutates a random conservative letter in the motif
+    """
     motif_len = len(motif_template)
     pos = random.randrange(motif_len)
     while "?" in motif_template[pos]:
@@ -127,17 +197,61 @@ def make_cliff(
     )
+def generate_cluster_sequences(
+    n_sequences: int,
+    motif_template: MotifTemplate,
+    prefix_length: int,
+    suffix_length: int,
+    alphabet: Alphabet,
+    cliff_probability: float,
+) -> Tuple[SequenceList, CliffList]:
+    """
+    Returns set of sequences for one cluster and introduces sequence cliffs
+    Also makes activity cliffs
+    """
+    n_seq = 0
+    sequences: SequenceList = []
+    cliffs: CliffList = []
+    while n_seq < n_sequences:
+        motif = generate_motif(motif_template, alphabet)
+        prefix = generate_random_sequence(prefix_length, alphabet)
+        suffix = generate_random_sequence(suffix_length, alphabet)
+        seq = prefix + motif + suffix
+        sequences.append(seq)
+        n_seq += 1
+        if n_seq >= n_sequences:
+            break  # This is the last sequence - can't do cliff
+        is_cliff = random.random() <= cliff_probability
+        if is_cliff:
+            # Making activity cliff
+            cliff_motif = make_motif_cliff(motif_template, alphabet, motif)
+            cliff_seq = prefix + cliff_motif + suffix
+            sequences.append(cliff_seq)
+            cliffs.append((n_seq - 1, n_seq))
+            n_seq += 1
+            # sys.stderr.write(f"Cliff for sequence #{line_number:4}, cluster {n_cluster} \n")
+    return sequences, cliffs
 def sequence_to_fasta(sequence: Sequence, separator: str) -> SequenceSquashed:
+    """
+    Converts the sequence to FASTA format
+    """
     return separator.join(sequence)
 def sequence_to_helm(
-    sequence: Sequence, helm_connection_mode: str = HelmConnectionMode.linear.name
+    sequence: Sequence,
+    helm_connection_mode: HelmConnectionMode = HelmConnectionMode.linear,
 ) -> SequenceSquashed:
-    def is_cyclic(helm_connection_mode: str) -> bool:
-        return helm_connection_mode == HelmConnectionMode.cyclic.name or (
-            helm_connection_mode == HelmConnectionMode.mixed.name
-            and random.random() < 0.5
+    """
+    Converts the sequence to HELM format
+    """
+    def is_cyclic(helm_connection_mode: HelmConnectionMode) -> bool:
+        return helm_connection_mode == HelmConnectionMode.cyclic or (
+            helm_connection_mode == HelmConnectionMode.mixed and random.random() < 0.5
         )
     sequence_escaped: Sequence = [
@@ -149,135 +263,211 @@ def sequence_to_helm(
     return f"PEPTIDE1{{{sequence_to_fasta(sequence_escaped,'.')}}}${connection_format}$$$V2.0"
-def generate_cluster(
-    n_sequences: int,
-    motif_length: int,
-    prefix_length: int,
-    suffix_length: int,
-    max_variants_per_position: int,
-    make_cliffs: bool,
-    alphabet: Alphabet,
-    cliff_probability: float,
-    cliff_strength: float,
-) -> Iterator[SequenceRecord]:
-    # Making a motif template
-    motif_template = generate_motif_template(
-        motif_length, alphabet, max_variants_per_position
-    )
-    # Setting average and dispersion for activity
-    activity_average = random.random() * 10
-    activity_dispersion = random.random()
-    sys.stderr.write(f"Motif template: {motif_notation(motif_template)}\n")
-    for n_seq in range(n_sequences):
-        activity = random.gauss(activity_average, activity_dispersion)
-        motif = generate_motif(motif_template, alphabet)
-        prefix = generate_random(prefix_length, alphabet)
-        suffix = generate_random(suffix_length, alphabet)
-        seq = prefix + motif + suffix
-        sequence_record: SequenceRecord = (n_seq, seq, activity, False)
-        yield sequence_record
-        is_cliff = make_cliffs and (random.random() <= cliff_probability)
-        if is_cliff:
-            # Making activity cliff
-            cliff_motif = make_cliff(motif_template, alphabet, motif)
-            cliff_seq = prefix + cliff_motif + suffix
-            # Recalculating activity
-            cliff_disp = activity_dispersion * cliff_strength * (0.5 + random.random())
-            activity = activity_average - cliff_disp
-            cliff_activity = activity_average + cliff_disp
-            # sys.stderr.write(f"Cliff for sequence #{line_number:4}, cluster {n_cluster} \n")
-            # sys.stderr.write(f"{activity_average}\t{motif}\t{activity}\n")
-            # sys.stderr.write(f"{activity_average}\t{cliff_motif}\t{cliff_activity}\n")
-            n_seq += 1
-            sequence_record = (n_seq, cliff_seq, cliff_activity, is_cliff)
-            yield sequence_record
+# ===== Activity generation functions =====
+def generate_ideal_activities(n: int, activity_range: float = 0) -> ActivityList:
+    """
+    Generate ideal activities with Gauss distribution
+    The distribution center is chosen randomly with some dispersion
+    """
+    mean = random.uniform(-activity_range, activity_range) if activity_range > 0 else 0
+    return [random.gauss(mean, 1) for _ in range(n)]
-def generate_sequences(
+def make_activity_cliff(
+    activities: ActivityList,
+    cliffs: List[CliffPair],
+    cliff_strength: float,
+    cliff_strength_dispersion: float,
+) -> ActivityList:
+    """
+    Introduce activity cliffs -
+    make a pair of activities differ for random gauss-distributed value defined by cliff_strength and cliff_strength_dispersion
+    """
+    cliff_activities = activities[:]
+    for first, second in cliffs:
+        activity1 = activities[first]
+        activity2 = activities[second]
+        average = (activity1 + activity2) / 2
+        scale = random.gauss(cliff_strength, cliff_strength_dispersion) / abs(
+            activity1 - activity2
+        )
+        cliff_activities[first] = average + (activity1 - average) * scale
+        cliff_activities[second] = average + (activity2 - average) * scale
+    return cliff_activities
+def generate_assay_activities(
+    activities: ActivityList,
+    assay: AssayParameters,
+    disable_negatives: bool = True,
+) -> ActivityList:
+    """
+    Generates activities measured in assay from some "ideal" activities.
+    Adds noise and scales the values to emulate some assay measurement scale
+    """
+    assay_activities = []
+    scale_factor = 3 * (
+        1 + assay.noise_level
+    )  # real activity 3-sigma in the interval [-scale_factor,+scale_factor]
+    for activity in activities:
+        noise = random.uniform(
+            -3, 3
+        )  # some random noize in [-3,3] - 3 sigma for ideal activity
+        # Adding noize and normalizing
+        noised_activity = activity + noise * assay.noise_level
+        rescaled_activity = (
+            noised_activity / (scale_factor * 2)
+        ) + 0.5  # rescaling activity to the interval [0;1]
+        assay_result = assay.min + (rescaled_activity * (assay.max - assay.min))
+        if disable_negatives and assay_result < 0:
+            assay_result = 0
+        assay_activities.append(assay_result)
+    return assay_activities
+def generate_data(
     n_clusters: int,
     n_sequences: int,
-    average_motif_length: int,
-    max_variants_per_position: int,
-    average_random_length: int,
-    dispersion: int,
+    cluster_parameters: ClusterParameters,
+    assays: List[AssayParameters],
+    disable_negatives: bool,
     alphabet: Alphabet,
-    make_cliffs: bool,
-    cliff_probability: float,
-    cliff_strength: float,
-) -> Tuple[List[str], List[ClusterSequenceRecord]]:
-    headers: List[str] = ["cluster", "sequence_id", "sequence", "activity", "is_cliff"]
-    sequences: List[ClusterSequenceRecord] = []
+    output_format: OutputFormat,
+    fasta_separator: str,
+    helm_connection_mode: HelmConnectionMode,
+    activity_range: float,
+    cliff_probability: float = 0.05,
+    cliff_strength: float = 5.0,
+    cliff_dispersion: float = 1.0,
+) -> Tuple[List[str], List[DataLine]]:
+    """
+    Main function generating all data set - sequences, activities, etc
+    """
+    headers: List[str] = ["cluster", "sequence_id", "sequence", "is_cliff"]
+    headers += [f"Assay_{i+1}" for i in range(len(assays))]
+    data: List[DataLine] = []
+    def cliffs_to_positions(cliffs: CliffList) -> Set[int]:
+        """
+        Convert CliffList to a set containing positions of cliffs
+        """
+        unique_pos = {pos for cliff in cliffs for pos in cliff}
+        return unique_pos
     for n_cluster in range(n_clusters):
-        motif_length = mean_range(average_motif_length, dispersion)
-        # sys.stderr.write(f"Cluster {n_cluster:2} motif template: {motif_notation(motif_template)}\n")
-        total_length = mean_range(average_random_length * 2, dispersion) + motif_length
-        prefix_length = mean_range(average_random_length, dispersion // 2)
+        motif_length = mean_range(
+            cluster_parameters.motif_length, cluster_parameters.dispersion
+        )
+        total_length = (
+            mean_range(
+                cluster_parameters.random_length * 2, cluster_parameters.dispersion
+            )
+            + motif_length
+        )
+        prefix_length = mean_range(
+            cluster_parameters.random_length, cluster_parameters.dispersion // 2
+        )
         suffix_length = total_length - motif_length - prefix_length
-        sys.stderr.write(f"Generating sequences for cluster {n_cluster}\n")
-        for n_seq, seq, activity, is_cliff in generate_cluster(
+        # Making a motif template
+        motif_template = generate_motif_template(
+            motif_length, alphabet, cluster_parameters.max_variants_per_position
+        )
+        sys.stderr.write(
+            f"Motif template for cluster {n_cluster}: {motif_notation(motif_template, fasta_separator)}\n"
+        )
+        sequences, cliffs = generate_cluster_sequences(
             n_sequences,
-            motif_length,
+            motif_template,
             prefix_length,
             suffix_length,
-            max_variants_per_position,
-            make_cliffs,
             alphabet,
             cliff_probability,
-            cliff_strength,
-        ):
-            sequences.append(
-                (n_cluster, f"c{n_cluster}_s{n_seq:03d}", seq, activity, is_cliff)
-            )
-    return headers, sequences
+        )
+        if output_format == OutputFormat.Fasta:
+            squashed_sequences = [
+                sequence_to_fasta(seq, fasta_separator) for seq in sequences
+            ]
+        elif output_format == OutputFormat.Helm:
+            squashed_sequences = [
+                sequence_to_helm(seq, helm_connection_mode) for seq in sequences
+            ]
+        else:
+            print("Unsupported output format")
+            exit(-1)
-def convert_to_fasta(
-    cluster_sequence_records: List[ClusterSequenceRecord], separator: str
-) -> List[Tuple[int, str, str, float, bool]]:
-    return [
-        (n_cluster, name_cluster, sequence_to_fasta(seq, separator), activity, is_cliff)
-        for n_cluster, name_cluster, seq, activity, is_cliff in cluster_sequence_records
-    ]
+        ideal_activities = generate_ideal_activities(n_sequences, activity_range)
+        cliffed_activities = make_activity_cliff(
+            ideal_activities, cliffs, cliff_strength, cliff_dispersion
+        )
+        assay_activities = [
+            generate_assay_activities(cliffed_activities, assay, disable_negatives)
+            for assay in assays
+        ]
+        cliffs_positions = cliffs_to_positions(cliffs)
+        is_cliffs = [pos in cliffs_positions for pos in range(n_sequences)]
+        sequence_IDs = [f"c{n_cluster}_s{n:03d}" for n in range(n_sequences)]
-def convert_to_helm(
-    cluster_sequence_records: List[ClusterSequenceRecord], helm_connection_mode: str
-) -> List[Tuple[int, str, str, float, bool]]:
-    return [
-        (
-            n_cluster,
-            name_cluster,
-            sequence_to_helm(seq, helm_connection_mode),
-            activity,
-            is_cliff,
+        cluster_data = zip(
+            [n_cluster] * n_sequences,
+            sequence_IDs,
+            squashed_sequences,
+            is_cliffs,
+            *assay_activities,
         )
-        for n_cluster, name_cluster, seq, activity, is_cliff in cluster_sequence_records
+        data.extend(cluster_data)
+    return headers, data
+def repack_assays(noise_levels_str: str, scales_str: str) -> List[AssayParameters]:
+    """
+    Converts strings passed from the input data to the list of AssayParameters namedtuples
+    """
+    noise_levels = [float(s) for s in noise_levels_str.split(",")]
+    scales = [s.strip().split("|") for s in scales_str.split(",")]
+    minmaxes = [(float(x[0].strip("() ")), float(x[1].strip("()"))) for x in scales]
+    if not (len(noise_levels) == len(minmaxes)):
+        print("Not equal range of parameters for assay definition")
+        exit(-1)
+    assays = [
+        AssayParameters(noise, min, max)
+        for noise, (min, max) in zip(noise_levels, minmaxes)
     ]
+    return assays
-def is_monomer_suitable(monomer: Any) -> bool:
-    return (
-        monomer["polymerType"] == "PEPTIDE"
-        and monomer["monomerType"] == "Backbone"
-        and len(monomer["rgroups"]) == 2
+# ===== Tests =====
+def test_activities_correlation() -> None:
+    import numpy as np
+    ideal_activities = generate_ideal_activities(25, 0.1)
+    cliff_activities = make_activity_cliff(
+        ideal_activities, [(0, 1)], cliff_strength=5.0, cliff_strength_dispersion=1.0
     )
+    assay_parameters = AssayParameters(0.3, 0, 10)
+    x = generate_assay_activities(cliff_activities, assay_parameters)
+    assay_parameters = AssayParameters(0.5, 0, 250)
+    y = generate_assay_activities(cliff_activities, assay_parameters)
+    print("Assay1: " + ",".join([str(a) for a in x]))
+    print("Assay2: " + ",".join([str(a) for a in y]))
+    corr = np.corrcoef(x, y)
+    print("Correlation: ", corr[1, 0])
+    assert corr[1, 0] >= 0.5
-def alphabet_from_helm(helm_library_file: str) -> Alphabet:
-    import json
-    alphabet: Alphabet = []
-    with open(helm_library_file) as helm_library:
-        for monomer in json.load(helm_library):
-            if is_monomer_suitable(monomer):
-                alphabet.append(monomer["symbol"])
-    return alphabet
+# ===== Command-line arguments parsing =====
 def parse_command_line_args() -> Any:
@@ -285,30 +475,52 @@ def parse_command_line_args() -> Any:
         prog="MotifSequencesGenerator",
         description=description,
         epilog="Utility author and support: Gennadii Zakharov <Gennadiy.Zakharov@gmail.com>",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
     )
-    parser.add_argument(
+    cluster_group = parser.add_argument_group("Cluster parameters")
+    cluster_group.add_argument(
         "-c", "--clusters", type=int, default=5, help="Number of clusters"
     )
-    parser.add_argument(
+    cluster_group.add_argument(
         "-s",
         "--sequences",
         type=int,
         default=50,
         help="Number of sequences in each supercluster",
     )
-    parser.add_argument(
+    available_alphabets = ",".join(list(alphabets.keys()))
+    cluster_group.add_argument(
+        "--alphabet",
+        type=str,
+        default=list(alphabets.keys())[0],
+        help=f"Sequence alphabet: {available_alphabets}.\n"
+        + "Ignored if the HELM library is specified",
+    )
+    motif_group = parser.add_argument_group("Motif parameters")
+    motif_group.add_argument(
         "-m,", "--motif-length", type=int, default=12, help="Average length of motif"
     )
-    parser.add_argument(
+    motif_group.add_argument(
+        "--max-variants-position",
+        type=int,
+        default=3,
+        help="Maximum number of different letters in a conservative position of the motif",
+    )
+    motif_group.add_argument(
         "-r,",
         "--random-length",
         type=int,
         default=3,
         help="Average length of random sequence parts before and after motif",
     )
-    parser.add_argument(
+    motif_group.add_argument(
         "-d,",
         "--dispersion",
         type=int,
@@ -316,124 +528,171 @@ def parse_command_line_args() -> Any:
         help="Variation of total sequence length",
     )
-    parser.add_argument(
-        "-h,",
-        "--helm-library-file",
-        type=str,
-        help="JSON file containing the HELM monomer library in the same format as used for Datagrok. "
-        + "The alphabet property is ignored when helm library is specified.",
-    )
+    cliffs_group = parser.add_argument_group("Activity parameters")
-    parser.add_argument(
-        "--helm-connection-mode",
-        type=str,
-        default=HelmConnectionMode.linear.value,
-        help=f"HELM peptide generation mode: {'/'.join([mode.name for mode in HelmConnectionMode])}",
+    cliffs_group.add_argument(
+        "--activity-range",
+        type=float,
+        default=0.5,
+        help="Range of the mean activity value difference between clusters",
     )
-    available_alphabets = ",".join(list(alphabets.keys()) + ["custom"])
-    parser.add_argument(
-        "--alphabet",
-        type=str,
-        default=list(alphabets.keys())[0],
-        help=f"Sequence alphabet: {available_alphabets}. Custom alphabet is a list of values separated "
-        f"by comma",
-    )
-    parser.add_argument(
-        "--max-variants-position",
-        type=int,
-        default=3,
-        help="Maximum number of different letters in conservative position in motif",
-    )
-    parser.add_argument(
+    cliffs_group.add_argument(
         "--cliff-probability",
         type=float,
-        default=0.01,
+        default=0.05,
         help="Probability to make activity cliff of a sequence",
     )
-    parser.add_argument(
+    cliffs_group.add_argument(
         "--cliff-strength",
         type=float,
-        default=4.0,
-        help="Strength of cliff",
+        default=5.0,
+        help="Average strength of cliff",
     )
-    parser.add_argument(
-        "--disable-cliffs",
+    cliffs_group.add_argument(
+        "--cliff-strength-dispersion",
+        type=float,
+        default=1.0,
+        help="Cliff strength dispersion",
+    )
+    assay_group = parser.add_argument_group("Assay parameters")
+    assay_group.add_argument(
+        "--assay-noise-levels",
+        type=str,
+        default="0.4, 0.85",
+        help="Noise level(s) for assays. A list of values separated by comma.",
+    )
+    assay_group.add_argument(
+        "--assay-scales",
+        type=str,
+        default="(0|10), (0|150.0)",
+        help="Typical scale size for each assay. Assays are separated by comma. Minimum and maximum values are separated by pipe. Brackets are optional."
+        + "Activity outliers may be located outside the specified scale",
+    )
+    assay_group.add_argument(
+        "--enable-negatives",
         type=bool,
-        default=False,
-        help="Disable generation of cliffs",
+        help="Enable negative values for assays results",
+    )
+    output_group = parser.add_argument_group("Output parameters")
+    output_group.add_argument(
+        "--custom-alphabet",
+        type=str,
+        default="",
+        help=f"Custom sequence alphabet: list of letters separated by comma. Used only if the --alphabet=custom",
     )
-    parser.add_argument(
+    output_group.add_argument(
         "--fasta-separator",
         type=str,
         default="",
-        help="Separator symbol for FASTA sequence",
+        help="Monomers separator for FASTA format",
+    )
+    output_group.add_argument(
+        "-H,",
+        "--helm-library-file",
+        type=str,
+        help="JSON file containing the HELM monomer library. "
+        + "The alphabet property is ignored when helm library is specified.",
     )
+    output_group.add_argument(
+        "--helm-connection-mode",
+        type=str,
+        default=HelmConnectionMode.linear.name,
+        help=f"HELM peptide generation mode: {'/'.join([mode.name for mode in HelmConnectionMode])}",
+    )
     command_line_args = parser.parse_args()
     return command_line_args
-# ====================================================================================
-grok = "clusters" in globals()
-if not grok:
-    # We are not in Datagrok - need to parse command line arguments
-    args = parse_command_line_args()
-    clusters = args.clusters
-    num_sequences = args.sequences
-    motif_length = args.motif_length
-    max_variants_position = args.max_variants_position
-    random_length = args.random_length
-    dispersion = args.dispersion
-    alphabet_key = args.alphabet
-    enable_cliffs = not args.disable_cliffs
-    cliff_probability = args.cliff_probability
-    cliff_strength = args.cliff_strength
-    fasta_separator = args.fasta_separator
-    helm_library_file = args.helm_library_file
-    helm_connection_mode = args.helm_connection_mode
-helm_init = "helm_library_file" in globals() and helm_library_file is not None and helm_library_file != ''
-if not helm_init:
-    alphabet: Alphabet = (
-        alphabets[alphabet_key].split(",")
-        if alphabet_key in alphabets
-        else alphabet_key.split(",")
+# ===== Main part of script =====
+if __name__ == "__main__":
+    grok = "clusters" in globals()
+    if not grok:
+        # We are not in Datagrok - need to parse command line arguments
+        args = parse_command_line_args()
+        #
+        clusters = args.clusters
+        num_sequences = args.sequences
+        alphabet_key = args.alphabet
+        #
+        motif_length = args.motif_length
+        max_variants_position = args.max_variants_position
+        random_length = args.random_length
+        dispersion = args.dispersion
+        #
+        activity_range = args.activity_range
+        cliff_probability = args.cliff_probability
+        cliff_strength = args.cliff_strength
+        cliff_strength_dispersion = args.cliff_strength_dispersion
+        #
+        assay_noise_levels = args.assay_noise_levels
+        assay_scales = args.assay_scales
+        disable_negatives = not args.enable_negatives
+        #
+        custom_alphabet = args.custom_alphabet
+        fasta_separator = args.fasta_separator
+        helm_library_file = args.helm_library_file
+        helm_connection_mode = args.helm_connection_mode
+    helm_init = helm_library_file is not None and helm_library_file != ""
+    if helm_init:
+        alphabet = alphabet_from_helm(helm_library_file)
+        output_format = OutputFormat.Helm
+        fasta_separator = "|"
+    else:
+        output_format = OutputFormat.Fasta
+        if not alphabet_key in alphabets:
+            pass  # TBD: custom alphabet
+        alphabet = alphabets[alphabet_key]
+    # Packing parameters to structures to simplify function signatures
+    cluster_parameters = ClusterParameters(
+        motif_length, max_variants_position, random_length, dispersion
     )
-else:
-    alphabet = alphabet_from_helm(helm_library_file)
-# Running sequence generator
-header, data = generate_sequences(
-    clusters,
-    num_sequences,
-    motif_length,
-    max_variants_position,
-    random_length,
-    dispersion,
-    alphabet,
-    enable_cliffs,
-    cliff_probability,
-    cliff_strength,
-)
-if not helm_init:
-    data_formatted = convert_to_fasta(data, fasta_separator)
-else:
-    data_formatted = convert_to_helm(data, helm_connection_mode)
-if grok:
-    # Exporting data to Datagrok as a Pandas dataframe
-    import pandas as pd
-    sequences = pd.DataFrame.from_records(data_formatted, columns=header)
-else:
-    # Writing results to stdout - no need to work with big and heavy Pandas
-    import csv
-    csv_writer = csv.writer(sys.stdout, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
-    csv_writer.writerow(header)
-    for line in data_formatted:
-        csv_writer.writerow(line)
+    assays = repack_assays(assay_noise_levels, assay_scales)
+    # Running sequence generator
+    header, data = generate_data(
+        clusters,
+        num_sequences,
+        cluster_parameters,
+        assays,
+        disable_negatives,
+        alphabet,
+        output_format,
+        fasta_separator,
+        HelmConnectionMode[helm_connection_mode],
+        activity_range,
+        cliff_probability,
+        cliff_strength,
+        cliff_strength_dispersion,
+    )
+    if grok:
+        # Exporting data to Datagrok as a Pandas dataframe
+        import pandas as pd
+        sequences_data = pd.DataFrame.from_records(data, columns=header)
+    else:
+        # Writing results to stdout - no need to work with big and heavy Pandas
+        import csv
+        csv_writer = csv.writer(sys.stdout, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
+        csv_writer.writerow(header)
+        for line in data:
+            csv_writer.writerow(line)