grnsight 6.0.7 → 7.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/.eslintrc.yml +4 -4
  2. package/.github/workflows/node.js.yml +35 -0
  3. package/README.md +1 -1
  4. package/database/README.md +218 -97
  5. package/database/constants.py +42 -0
  6. package/database/filter_update.py +168 -0
  7. package/database/grnsettings-database/README.md +52 -0
  8. package/database/grnsettings-database/schema.sql +4 -0
  9. package/database/loader.py +30 -0
  10. package/database/loader_update.py +36 -0
  11. package/database/network-database/scripts/generate_network.py +15 -23
  12. package/database/network-database/scripts/generate_new_network_version.py +17 -24
  13. package/database/protein-protein-database/README.md +71 -0
  14. package/database/protein-protein-database/schema.sql +37 -0
  15. package/database/protein-protein-database/scripts/generate_protein_network.py +227 -0
  16. package/database/protein-protein-database/scripts/remove_duplicates.sh +4 -0
  17. package/database/utils.py +418 -0
  18. package/package.json +3 -2
  19. package/server/app.js +2 -0
  20. package/server/config/config.js +4 -4
  21. package/server/controllers/additional-sheet-parser.js +2 -1
  22. package/server/controllers/constants.js +5 -0
  23. package/server/controllers/custom-workbook-controller.js +4 -3
  24. package/server/controllers/demo-workbooks.js +1462 -6
  25. package/server/controllers/export-constants.js +3 -2
  26. package/server/controllers/exporters/sif.js +6 -1
  27. package/server/controllers/exporters/xlsx.js +8 -3
  28. package/server/controllers/expression-sheet-parser.js +0 -6
  29. package/server/controllers/grnsettings-database-controller.js +17 -0
  30. package/server/controllers/importers/sif.js +30 -11
  31. package/server/controllers/network-database-controller.js +2 -2
  32. package/server/controllers/network-sheet-parser.js +54 -12
  33. package/server/controllers/protein-database-controller.js +18 -0
  34. package/server/controllers/sif-constants.js +11 -4
  35. package/server/controllers/spreadsheet-controller.js +44 -1
  36. package/server/controllers/workbook-constants.js +21 -4
  37. package/server/dals/expression-dal.js +4 -4
  38. package/server/dals/grnsetting-dal.js +49 -0
  39. package/server/dals/network-dal.js +14 -15
  40. package/server/dals/protein-dal.js +106 -0
  41. package/test/additional-sheet-parser-tests.js +1 -1
  42. package/test/export-tests.js +136 -9
  43. package/test/import-sif-tests.js +67 -13
  44. package/test/test.js +1 -1
  45. package/test-files/additional-sheet-test-files/optimization-parameters-default.xlsx +0 -0
  46. package/test-files/demo-files/18_proteins_81_edges_PPI.xlsx +0 -0
  47. package/test-files/expression-data-test-sheets/expression_sheet_missing_data_ok_export_exact.xlsx +0 -0
  48. package/web-client/config/config.js +4 -4
  49. package/web-client/public/js/api/grnsight-api.js +18 -3
  50. package/web-client/public/js/constants.js +27 -12
  51. package/web-client/public/js/generateNetwork.js +170 -72
  52. package/web-client/public/js/graph.js +424 -161
  53. package/web-client/public/js/grnsight.js +25 -4
  54. package/web-client/public/js/grnstate.js +4 -1
  55. package/web-client/public/js/iframe-coordination.js +3 -3
  56. package/web-client/public/js/setup-handlers.js +76 -61
  57. package/web-client/public/js/setup-load-and-import-handlers.js +32 -7
  58. package/web-client/public/js/update-app.js +119 -28
  59. package/web-client/public/js/upload.js +142 -85
  60. package/web-client/public/js/warnings.js +25 -0
  61. package/web-client/public/lib/bootstrap.file-input/bootstrap.file-input.js +0 -1
  62. package/web-client/public/stylesheets/grnsight.styl +40 -16
  63. package/web-client/views/components/demo.pug +7 -5
  64. package/web-client/views/upload.pug +64 -50
  65. package/database/network-database/scripts/filter_genes.py +0 -76
  66. package/database/network-database/scripts/loader.py +0 -79
  67. package/database/network-database/scripts/loader_updates.py +0 -99
@@ -0,0 +1,52 @@
1
+ # GRNsettings Database
2
+ The schema of this database lives within this directory.
3
+
4
+ ## The basics
5
+
6
+ ### Schema
7
+ The default database name is stored within the settings schema on our Postgres database.
8
+
9
+ The schema is located within this directory at the top level in the file `schema.sql`. It creates the schema as well as defining the table located within the settings schema.
10
+
11
+ 1. Move to the directory that contains the `schema.sql` file, which is the `database/grnsettings-database` folder
12
+
13
+ 2. Load database
14
+
15
+ Example of loading to local database
16
+
17
+ For Windows:
18
+ ```
19
+ psql -U postgres -f schema.sql postgresql://localhost/postgres
20
+ ```
21
+
22
+ For Mac:
23
+ ```
24
+ psql -f schema.sql postgresql://localhost/postgres
25
+ ```
26
+
27
+ ### Changing the default database name
28
+
29
+ 1. In order to change the default database name you would first need to login to the database using the following command:
30
+
31
+ For Windows:
32
+ ```
33
+ psql -U postgres <address to database>
34
+ ```
35
+ For Mac:
36
+ ```
37
+ psql <address to database>
38
+ ```
39
+
40
+ 2. Then you will need to set your search path to the settings schema using the following command:
41
+ ```
42
+ SET SEARCH_PATH TO settings;
43
+ ```
44
+ 3. After that you will simply delete the current default database name using this command
45
+ ```
46
+ DELETE FROM grnsettings;
47
+ ```
48
+ 4. And then insert the new database name using the following command
49
+ ```
50
+ INSERT INTO grnsettings(expression_dataset) VALUES ('the new default database name');
51
+ ```
52
+ _The current default database is 'dahlquist_2018'_
@@ -0,0 +1,4 @@
1
+ CREATE SCHEMA settings;
2
+ CREATE TABLE settings.grnsettings (
3
+ expression_dataset VARCHAR PRIMARY KEY
4
+ );
@@ -0,0 +1,30 @@
1
+ import csv
2
+ from utils import *
3
+ from constants import Constants
4
+ # python3 loader.py | psql postgresql://localhost/postgres
5
+ import os
6
+
7
+ if not os.path.exists('union-gene-data'):
8
+ os.makedirs('union-gene-data')
9
+
10
+ # Get union gene data
11
+ Utils.create_union_file([Constants.EXPRESSION_GENE_SOURCE, Constants.PPI_GENE_SOURCE, Constants.GRN_GENE_SOURCE], Constants.GENE_DATA_DIRECTORY)
12
+
13
+ # Regulatory Network
14
+ Utils.load_sources(Constants.GRN_SOURCE_TABLE_DATA_DIRECTORY, Constants.GRN_DATABASE_NAMESPACE)
15
+ Utils.load_grn_genes(Constants.GRN_GENE_SOURCE, Constants.GRN_DATABASE_NAMESPACE)
16
+ Utils.load_grn_network(Constants.GRN_NETWORK_TABLE_DATA_DIRECTORY, Constants.GRN_DATABASE_NAMESPACE)
17
+
18
+ # Protein-protein-interactions
19
+ Utils.load_sources(Constants.PPI_SOURCE_TABLE_DATA_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
20
+ Utils.load_ppi_genes(Constants.PPI_GENE_SOURCE, Constants.PPI_DATABASE_NAMESPACE)
21
+ Utils.load_proteins(Constants.PPI_PROTEIN_TABLE_DATA_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
22
+ Utils.load_ppi_network(Constants.PPI_NETWORK_TABLE_DATA_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
23
+
24
+ # Expression data
25
+ Utils.load_refs(Constants.EXPRESSION_REFS_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
26
+ Utils.load_expression_genes(Constants.EXPRESSION_GENE_SOURCE, Constants.EXPRESISON_DATABASE_NAMESPACE)
27
+ Utils.load_expression_metadata(Constants.EXPRESSION_METADATA_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
28
+ Utils.load_expression_data(Constants.EXPRESSION_EXPRESSION_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
29
+ Utils.load_production_rates(Constants.EXPRESSION_PRODUCTION_RATE_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
30
+ Utils.load_degradation_rates(Constants.EXPRESSION_DEGRADATION_RATE_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
@@ -0,0 +1,36 @@
1
+ import argparse
2
+ from constants import Constants
3
+ from utils import Utils
4
+
5
+ def load_grn_data_into_database():
6
+ Utils.load_sources(Constants.GRN_SOURCE_TABLE_DATA_DIRECTORY, Constants.GRN_DATABASE_NAMESPACE)
7
+ Utils.update_grn_genes(Constants.GRN_UPDATE_GENE_DIRECTORY, Constants.GRN_DATABASE_NAMESPACE)
8
+ Utils.load_grn_genes(Constants.GRN_MISSING_GENE_DIRECTORY, Constants.GRN_DATABASE_NAMESPACE)
9
+ Utils.load_grn_network(Constants.GRN_NETWORK_TABLE_DATA_DIRECTORY, Constants.GRN_DATABASE_NAMESPACE)
10
+
11
+ def load_ppi_data_into_database():
12
+ Utils.load_sources(Constants.PPI_SOURCE_TABLE_DATA_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
13
+ Utils.update_ppi_genes(Constants.PPI_UPDATE_GENE_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
14
+ Utils.update_ppi_proteins(Constants.PPI_UPDATE_PROTEIN_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
15
+ Utils.load_ppi_genes(Constants.PPI_MISSING_GENE_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
16
+ Utils.load_proteins(Constants.PPI_MISSING_PROTEIN_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
17
+ Utils.load_ppi_network(Constants.PPI_NETWORK_TABLE_DATA_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
18
+
19
+ def main():
20
+ # Set up argument parsing
21
+ parser = argparse.ArgumentParser(description="Load data into database for GRN or PPI networks.")
22
+ parser.add_argument('--network', choices=['GRN', 'PPI'], required=True, help="Specify the network type to load data for")
23
+
24
+ # Parse arguments
25
+ args = parser.parse_args()
26
+
27
+ # Execute the relevant operations based on the argument
28
+ if args.network == 'GRN':
29
+ load_grn_data_into_database()
30
+ elif args.network == 'PPI':
31
+ load_ppi_data_into_database()
32
+ else:
33
+ print("Invalid network type. Please choose 'GRN' or 'PPI'.")
34
+
35
+ if __name__ == "__main__":
36
+ main()
@@ -111,29 +111,21 @@ for regulator in regulators_to_targets:
111
111
  if regulator != None:
112
112
  regulators_list.append(regulator)
113
113
 
114
- print(f'Creating REGULATORS TO TARGETS MATRIX\n')
115
- regulator_to_target_file = open(REGULATORS_TO_TARGETS_MATRIX, 'w')
116
- headers = "cols regulators/rows targets"
117
- headers += '\t'.join(regulators_list)
118
- regulator_to_target_file.write(f'{headers}\n')
119
- for target in targets:
120
- result = create_regulator_to_target_row(target, regulators_to_targets)
121
- if result != False:
122
- regulator_to_target_file.write(f'{result}\n')
123
- regulator_to_target_file.close()
124
-
125
- print(f'Creating REGULATORS TO TARGETS MATRIX\n')
126
- regulator_to_regulator_file = open(REGULATORS_TO_REGULATORS_MATRIX, 'w')
127
- headers = "cols regulators/rows targets"
128
- headers += '\t'.join(regulators_list)
129
- regulator_to_regulator_file.write(f'{headers}\n')
130
- for target in targets:
131
- result = create_regulator_to_target_row(target, regulators_to_targets)
132
- if result != False:
133
- regulator_to_regulator_file.write(f'{result}\n')
134
- regulator_to_regulator_file.close()
135
-
136
-
114
+ def createMatrix(fileName: str):
115
+ statusMessage: str = 'Creating REGULATORS TO TARGETS MATRIX\n' if fileName == REGULATORS_TO_TARGETS_MATRIX else 'Creating REGULATORS TO REGULATORS MATRIX\n'
116
+ print(statusMessage)
117
+ regulator_to_target_file = open(fileName, 'w')
118
+ headers = "cols regulators/rows targets\t"
119
+ headers += '\t'.join(regulators_list)
120
+ regulator_to_target_file.write(f'{headers}\n')
121
+ for target in targets:
122
+ result = create_regulator_to_target_row(target, regulators_to_targets)
123
+ if result:
124
+ regulator_to_target_file.write(f'{result}\n')
125
+ regulator_to_target_file.close()
126
+
127
+ createMatrix(REGULATORS_TO_TARGETS_MATRIX)
128
+ createMatrix(REGULATORS_TO_REGULATORS_MATRIX)
137
129
 
138
130
  # Create loader-files
139
131
 
@@ -17,7 +17,7 @@ def get_all_genes():
17
17
  with db.connect() as connection:
18
18
  result_set = connection.execute(
19
19
  f"""
20
- SELECT display_gene_id, gene_id FROM gene_regulatory_network.gene;
20
+ SELECT display_gene_id, gene_id FROM gene_regulatory_network.gene
21
21
  """)
22
22
  result = result_set.fetchall()
23
23
  return list(result)
@@ -126,29 +126,22 @@ for regulator in regulators_to_targets:
126
126
  if regulator != None:
127
127
  regulators_list.append(regulator)
128
128
 
129
- print(f'Creating REGULATORS TO TARGETS MATRIX\n')
130
- regulator_to_target_file = open(REGULATORS_TO_TARGETS_MATRIX, 'w')
131
- headers = "cols regulators/rows targets"
132
- headers += '\t'.join(regulators_list)
133
- regulator_to_target_file.write(f'{headers}\n')
134
- for target in targets:
135
- result = create_regulator_to_target_row(target, regulators_to_targets)
136
- if result != False:
137
- regulator_to_target_file.write(f'{result}\n')
138
- regulator_to_target_file.close()
139
-
140
- print(f'Creating REGULATORS TO TARGETS MATRIX\n')
141
- regulator_to_regulator_file = open(REGULATORS_TO_REGULATORS_MATRIX, 'w')
142
- headers = "cols regulators/rows targets"
143
- headers += '\t'.join(regulators_list)
144
- regulator_to_regulator_file.write(f'{headers}\n')
145
- for target in targets:
146
- result = create_regulator_to_target_row(target, regulators_to_targets)
147
- if result != False:
148
- regulator_to_regulator_file.write(f'{result}\n')
149
- regulator_to_regulator_file.close()
150
-
151
-
129
+ # this is the same method used in generate_network.py to create Regulators to Targets Matrix and Regulators to Regulators Matrix
130
+ def createMatrix(fileName: str):
131
+ statusMessage: str = 'Creating REGULATORS TO TARGETS MATRIX\n' if fileName == REGULATORS_TO_TARGETS_MATRIX else 'Creating REGULATORS TO REGULATORS MATRIX\n'
132
+ print(statusMessage)
133
+ regulator_to_target_file = open(fileName, 'w')
134
+ headers = "cols regulators/rows targets\t"
135
+ headers += '\t'.join(regulators_list)
136
+ regulator_to_target_file.write(f'{headers}\n')
137
+ for target in targets:
138
+ result = create_regulator_to_target_row(target, regulators_to_targets)
139
+ if result:
140
+ regulator_to_target_file.write(f'{result}\n')
141
+ regulator_to_target_file.close()
142
+
143
+ createMatrix(REGULATORS_TO_TARGETS_MATRIX)
144
+ createMatrix(REGULATORS_TO_REGULATORS_MATRIX)
152
145
 
153
146
  # Create loader-files
154
147
 
@@ -0,0 +1,71 @@
1
+ # Protein-Protein Database
2
+
3
+ All files pertaining to the protein-protein database live within this directory.
4
+
5
+ ## The basics
6
+
7
+ #### Schema
8
+
9
+ All network data is stored within the protein_protein_interactions schema on our Postgres database.
10
+
11
+ The schema is located within this directory at the top level in the file `schema.sql`. It defines the tables located within the protein_protein_interactions schema.
12
+
13
+ Usage:
14
+ To load to local database
15
+ ```
16
+ psql -f schema.sql postgresql://localhost/postgres
17
+ ```
18
+ To load to production database
19
+ ```
20
+ psql -f schema.sql <address to database>
21
+ ```
22
+
23
+ ### Scripts
24
+
25
+ All scripts live within the subdirectory `scripts`, located in the top-level of the protein-protein database directory.
26
+
27
+ Any source files required to run the scripts live within the subdirectory `source-files`, located in the top-level of the protein-protein database directory. As source files may be large, you must create this directory yourself and add any source files you need to use there.
28
+
29
+ All generated results of the scripts live in the subdirectory `script-results`, located in the top-level of the protein-protein database directory. Currently, all scripts that generate code create the directory if it does not currently exist. When adding a new script that generates resulting code, best practice is to create the script-results directory and any subdirectories if it does not exist, in order to prevent errors and snafus for recently cloned repositories.
30
+
31
+ Within the scripts directory, there are the following files:
32
+
33
+ - `generate_protein_network.py`
34
+ - `remove_duplicates.sh`
35
+ - `loader.py`
36
+
37
+ #### Data Generator
38
+
39
+ This script (`generate_protein_network.py`) generates the genes, protein information and the physical interactions between these genes from Yeastmine; then it writes this into the csv files used to load the database. Please make sure you have enough time (around 1.5 - 2 hours) to run this script. The files (`gene.csv`), (`physical_interaction.csv`), (`protein.csv`) will be generated in the `processed-loader-files` sub-directory of the script-results directory.
40
+
41
+ Usage:
42
+ ```
43
+ python3 generate_protein_network.py
44
+ ```
45
+
46
+ Once you have finished generating the loader files, you need to remove duplicate entries from the physical interactions file. The bash script (`remove_duplicates.sh`) does this for you. The resultant file (`physical_interaction_no_dupe.csv`) will be generated in the `processed-loader-files` sub-directory of the script-results directory. If your machine doesn't support bash shell scripts, then you have to make a new script that removes duplicate lines from a file and writes the results to a file. Sorry!
47
+
48
+ Usage:
49
+ ```
50
+ chmod u+x remove_duplicates.sh
51
+
52
+ ./remove_duplicates.sh
53
+ ```
54
+
55
+ #### Database Loader
56
+
57
+ This script (`loader.py`) is to be used to load your preprocessed protein-protein interaction data into the database.
58
+
59
+ This program generates direct SQL statements from the source files generated by the data preprocessor in order to populate a relational database with those files’ data.
60
+
61
+ Note: You may get an error saying that there was a duplicate protein. You have to manually check which protein was being inserted twice, go to the SGD website (or Yeastmine) and confirm the correct protein gene interaction. Currently this occurs with the protein 'Aad6p'. To fix it go to your protein.csv file and make sure that 'Aad6p' is paired with the gene 'YFL056C', and 'Aad16p' is paired with the gene 'YFL057C'. If any other issues arise, you must manually confirm on the SGD website. Sorry!
62
+
63
+ Usage:
64
+ To load to local database
65
+ ```
66
+ python3 loader.py | psql postgresql://localhost/postgres
67
+ ```
68
+ To load to production database
69
+ ```
70
+ python3 loader.py | psql <path to database>
71
+ ```
@@ -0,0 +1,37 @@
1
+ CREATE TABLE protein_protein_interactions.source (
2
+ time_stamp TIMESTAMP WITH TIME ZONE,
3
+ source VARCHAR,
4
+ display_name VARCHAR,
5
+ PRIMARY KEY(time_stamp, source)
6
+ );
7
+
8
+ CREATE TABLE protein_protein_interactions.gene (
9
+ gene_id VARCHAR, -- systematic like name
10
+ display_gene_id VARCHAR, -- standard like name
11
+ species VARCHAR,
12
+ taxon_id VARCHAR,
13
+ PRIMARY KEY(gene_id, taxon_id)
14
+ );
15
+
16
+ CREATE TABLE protein_protein_interactions.protein (
17
+ standard_name VARCHAR PRIMARY KEY,
18
+ gene_systematic_name VARCHAR,
19
+ length FLOAT,
20
+ molecular_weight FLOAT,
21
+ PI FLOAT,
22
+ taxon_id VARCHAR,
23
+ FOREIGN KEY (gene_systematic_name, taxon_id) REFERENCES protein_protein_interactions.gene(gene_id, taxon_id)
24
+ );
25
+
26
+ CREATE TABLE protein_protein_interactions.physical_interactions (
27
+ protein1 VARCHAR,
28
+ protein2 VARCHAR,
29
+ interaction_detection_methods_identifier VARCHAR,
30
+ experiment_name VARCHAR,
31
+ time_stamp TIMESTAMP WITH TIME ZONE,
32
+ source VARCHAR,
33
+ FOREIGN KEY (protein1) REFERENCES protein_protein_interactions.protein(standard_name),
34
+ FOREIGN KEY (protein2) REFERENCES protein_protein_interactions.protein(standard_name),
35
+ FOREIGN KEY (time_stamp, source) REFERENCES protein_protein_interactions.source(time_stamp, source),
36
+ CONSTRAINT unique_physical_interaction UNIQUE (protein1, protein2, interaction_detection_methods_identifier, experiment_name, time_stamp, source)
37
+ );
@@ -0,0 +1,227 @@
1
+ from __future__ import print_function
2
+
3
+ from intermine.webservice import Service
4
+ service = Service("https://yeastmine.yeastgenome.org/yeastmine/service")
5
+
6
+ import csv
7
+ import re
8
+ import sys
9
+ import os
10
+ import gc
11
+ import datetime
12
+
13
+
14
+ # Create files
15
+
16
+ # Create folder paths
17
+ if not os.path.exists('../script-results'):
18
+ os.makedirs('../script-results')
19
+
20
+ if not os.path.exists('../script-results/processed-loader-files'):
21
+ os.makedirs('../script-results/processed-loader-files')
22
+
23
+
24
+
25
+ # Files to be generated
26
+ GENE_FILE = '../script-results/processed-loader-files/gene.csv'
27
+ PROTEIN_FILE = '../script-results/processed-loader-files/protein.csv'
28
+ PHYSICAL_INTERACTION_FILE = '../script-results/processed-loader-files/physical_interaction.csv'
29
+ SOURCE_DESTINATION = '../script-results/processed-loader-files/source.csv'
30
+
31
+ # Instantiate Source variables
32
+ timestamp = datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0)
33
+ source = "YeastMine - Saccharomyces Genome Database"
34
+ display_name = "Yeastmine - SGD"
35
+
36
+ # Get Network Data from Yeastmine
37
+
38
+ def getPhysicalInteractions(gene):
39
+ print("Query data from Yeastmine to get Physical Interactions")
40
+ query = service.new_query("Gene")
41
+ query.add_constraint("interactions.participant2", "Gene")
42
+ query.add_view(
43
+ "primaryIdentifier", "symbol", "secondaryIdentifier", "sgdAlias", "name",
44
+ "organism.shortName", "interactions.details.annotationType",
45
+ "interactions.details.role1", "interactions.participant2.symbol",
46
+ "interactions.participant2.secondaryIdentifier",
47
+ "interactions.details.experiment.interactionDetectionMethods.identifier",
48
+ "interactions.details.experiment.name",
49
+ "interactions.details.relationshipType", "interactions.details.note"
50
+ )
51
+ query.add_constraint("organism.shortName", "=", "S. cerevisiae", code="B")
52
+ query.add_constraint("interactions.details.relationshipType", "=", "physical", code="C")
53
+ query.add_constraint("Gene", "LOOKUP", gene, code="A")
54
+ return query
55
+
56
+ def getProteinFromGene(gene):
57
+ print("Query data from Yeastmine to get Protein information")
58
+ query = service.new_query("Gene")
59
+ query.add_view(
60
+ "primaryIdentifier", "proteins.symbol", "sgdAlias", "proteins.length",
61
+ "proteins.molecularWeight", "proteins.pI", "featureType", "qualifier",
62
+ "description", "proteins.sequence.residues"
63
+ )
64
+ query.add_constraint("organism.shortName", "=", "S. cerevisiae", code="B")
65
+ query.add_constraint("Gene", "LOOKUP", gene, code="A")
66
+ return query
67
+
68
+ def getAllProteins():
69
+ print("Query data from Yeastmine to get all proteins")
70
+
71
+ query = service.new_query("Protein")
72
+
73
+ query.add_view(
74
+ "genes.primaryIdentifier", "genes.secondaryIdentifier", "symbol", "length",
75
+ "molecularWeight", "pI", "genes.featureType", "genes.qualifier",
76
+ "genes.sgdAlias", "genes.description"
77
+ )
78
+ query.add_sort_order("Protein.symbol", "ASC")
79
+ query.add_constraint("genes.featureType", "=", "transposable_element_gene", code="G")
80
+ query.add_constraint("genes.featureType", "=", "ORF", code="F")
81
+ query.add_constraint("genes.status", "=", "Active", code="D")
82
+ query.add_constraint("genes.featureType", "=", "blocked_reading_frame", code="E")
83
+ query.add_constraint("genes.featureType", "=", "intein_encoding_region", code="H")
84
+ query.add_constraint("organism.name", "=", "Saccharomyces cerevisiae", code="A")
85
+ query.set_logic("A and D and (F or G or E or H)")
86
+
87
+ return query
88
+
89
+
90
+
91
+ query = getAllProteins()
92
+ all_proteins = {}
93
+
94
+ genes = {
95
+ # stored as gene systematic name : {
96
+ # proteins : {protein standard name : {protein info}}
97
+ # }
98
+ }
99
+ print("COLLECTING PROTEINS FROM QUERY RESULTS\n")
100
+
101
+ for row in query.rows():
102
+ gene_systematic_name = row["genes.secondaryIdentifier"]
103
+ protein_standard_name = row["symbol"]
104
+ length = row["length"]
105
+ molecular_weight = row["molecularWeight"]
106
+ PI = row["pI"]
107
+ genes[gene_systematic_name] = {
108
+ "standard_name" : None,
109
+ "protein" : {
110
+ "standard_name": protein_standard_name,
111
+ "length": length,
112
+ "molecular_weight": molecular_weight,
113
+ "PI": PI
114
+ }
115
+ }
116
+
117
+
118
+ print("COLLECTING/WRITING INTERACTIONS\n")
119
+ file = open(PHYSICAL_INTERACTION_FILE,"w")
120
+ print(f"Open file {PHYSICAL_INTERACTION_FILE} and write data into that file")
121
+ file.write(f"Protein1\tProtein2\tInteraction Detection Methods Identifier\tExperiment Name\tTime_Stamp\tSource\n")
122
+
123
+
124
+ exceptions = []
125
+ print("Processing Physical Interactions")
126
+ for gene in genes:
127
+ query = getPhysicalInteractions(gene)
128
+ first_row = True
129
+
130
+ for row in query.rows():
131
+ gene1 = row["secondaryIdentifier"]
132
+ gene2 = row["interactions.participant2.secondaryIdentifier"]
133
+ if first_row:
134
+ # update the gene's standard name
135
+ genes[gene]["standard_name"] = row["symbol"] if row["symbol"] != None else gene
136
+ first_row = False
137
+ if gene2 in genes:
138
+ g = sorted([genes[gene1]["protein"]["standard_name"], genes[gene2]["protein"]["standard_name"]])
139
+ idmi = row["interactions.details.experiment.interactionDetectionMethods.identifier"]
140
+ exp_name = row["interactions.details.experiment.name"]
141
+
142
+ if gene2 in genes and gene1 in genes:
143
+ file.write(f'{g[0]}\t{g[1]}\t{idmi}\t{exp_name}\t{timestamp}\t{source}\n')
144
+ else:
145
+ exceptions.append(gene2)
146
+
147
+ print("Handling Exceptions")
148
+ failed_genes = []
149
+ while exceptions != None:
150
+ acceptable_genes = []
151
+ for gene in exceptions:
152
+ query = getProteinFromGene(gene)
153
+ rows = query.rows()
154
+ for row in rows:
155
+ acceptable_genes.append(gene)
156
+ protein_standard_name = row["proteins.symbol"]
157
+ length = row["proteins.length"]
158
+ molecular_weight = row["proteins.molecularWeight"]
159
+ PI = row["proteins.pI"]
160
+ genes[gene] = {
161
+ "standard_name" : None,
162
+ "protein" : {
163
+ "standard_name": protein_standard_name,
164
+ "length": length,
165
+ "molecular_weight": molecular_weight,
166
+ "PI": PI
167
+ }
168
+ }
169
+ if len(rows) == 0:
170
+ failed_genes.append(gene)
171
+
172
+ more_exceptions = []
173
+ for gene in acceptable_genes:
174
+ query = getPhysicalInteractions(gene)
175
+ first_row = True
176
+ for row in query.rows():
177
+ gene1 = row["secondaryIdentifier"]
178
+ gene2 = row["interactions.participant2.secondaryIdentifier"]
179
+ if first_row:
180
+ # update the gene's standard name
181
+ genes[gene]["standard_name"] = row["symbol"] if row["symbol"] != None else gene
182
+ first_row = False
183
+ if gene2 in genes:
184
+ g = sorted([genes[gene1]["protein"]["standard_name"], genes[gene2]["protein"]["standard_name"]])
185
+ idmi = row["interactions.details.experiment.interactionDetectionMethods.identifier"]
186
+ exp_name = row["interactions.details.experiment.name"]
187
+
188
+ if gene2 in genes and gene1 in genes:
189
+ file.write(f'{g[0]}\t{g[1]}\t{idmi}\t{exp_name}\t{timestamp}\t{source}\n')
190
+ elif gene not in failed_genes:
191
+ more_exceptions.append(gene2)
192
+ if len(more_exceptions) == 0:
193
+ exceptions = None
194
+ else :
195
+ exceptions = more_exceptions
196
+
197
+
198
+ file.close()
199
+
200
+ # Source Table
201
+
202
+ print(f"Completed {PHYSICAL_INTERACTION_FILE} Starting{SOURCE_DESTINATION}")
203
+
204
+ source_file = open(SOURCE_DESTINATION, 'w')
205
+ headers = f'Timestamp\tSource\tDisplay Name\n{timestamp}\t{source}\t{display_name}'
206
+ source_file.write(f'{headers}\n')
207
+ source_file.close()
208
+
209
+
210
+ species = "Saccharomyces cerevisiae"
211
+ taxon_id = "559292"
212
+
213
+ # create gene csv
214
+ print(f"Completed {SOURCE_DESTINATION} Starting{GENE_FILE}")
215
+ file = open(GENE_FILE,"w")
216
+ file.write(f"Gene ID\tDisplay Gene ID\tSpecies\tTaxon ID\n")
217
+ for gene in genes:
218
+ file.write(f"{gene}\t{genes[gene]['standard_name']}\t{species}\t{taxon_id}\n")
219
+ file.close()
220
+
221
+ # create protein csv
222
+ print(f"Completed {GENE_FILE} Starting{PROTEIN_FILE}")
223
+ file = open(PROTEIN_FILE, "w")
224
+ file.write(f"Standard Name\tGene Systematic Name\tLength\tMolecular Weight\tPI\tTaxon ID\n")
225
+ for gene in genes:
226
+ file.write(f"{genes[gene]['protein']['standard_name']}\t{gene}\t{genes[gene]['protein']['length']}\t{genes[gene]['protein']['molecular_weight']}\t{genes[gene]['protein']['PI']}\t{taxon_id}\n")
227
+ file.close()
@@ -0,0 +1,4 @@
1
+ #!/bin/bash
2
+ output="../script-results/processed-loader-files/physical_interaction_no_dupe.csv"
3
+ source="../script-results/processed-loader-files/physical_interaction.csv"
4
+ awk '!x[$0]++' $source > $output