grnsight 6.0.7 → 7.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.yml +4 -4
- package/.github/workflows/node.js.yml +35 -0
- package/README.md +1 -1
- package/database/README.md +218 -97
- package/database/constants.py +42 -0
- package/database/filter_update.py +168 -0
- package/database/grnsettings-database/README.md +52 -0
- package/database/grnsettings-database/schema.sql +4 -0
- package/database/loader.py +30 -0
- package/database/loader_update.py +36 -0
- package/database/network-database/scripts/generate_network.py +15 -23
- package/database/network-database/scripts/generate_new_network_version.py +17 -24
- package/database/protein-protein-database/README.md +71 -0
- package/database/protein-protein-database/schema.sql +37 -0
- package/database/protein-protein-database/scripts/generate_protein_network.py +227 -0
- package/database/protein-protein-database/scripts/remove_duplicates.sh +4 -0
- package/database/utils.py +418 -0
- package/package.json +3 -2
- package/server/app.js +2 -0
- package/server/config/config.js +4 -4
- package/server/controllers/additional-sheet-parser.js +2 -1
- package/server/controllers/constants.js +5 -0
- package/server/controllers/custom-workbook-controller.js +4 -3
- package/server/controllers/demo-workbooks.js +1462 -6
- package/server/controllers/export-constants.js +3 -2
- package/server/controllers/exporters/sif.js +6 -1
- package/server/controllers/exporters/xlsx.js +8 -3
- package/server/controllers/expression-sheet-parser.js +0 -6
- package/server/controllers/grnsettings-database-controller.js +17 -0
- package/server/controllers/importers/sif.js +30 -11
- package/server/controllers/network-database-controller.js +2 -2
- package/server/controllers/network-sheet-parser.js +54 -12
- package/server/controllers/protein-database-controller.js +18 -0
- package/server/controllers/sif-constants.js +11 -4
- package/server/controllers/spreadsheet-controller.js +44 -1
- package/server/controllers/workbook-constants.js +21 -4
- package/server/dals/expression-dal.js +4 -4
- package/server/dals/grnsetting-dal.js +49 -0
- package/server/dals/network-dal.js +14 -15
- package/server/dals/protein-dal.js +106 -0
- package/test/additional-sheet-parser-tests.js +1 -1
- package/test/export-tests.js +136 -9
- package/test/import-sif-tests.js +67 -13
- package/test/test.js +1 -1
- package/test-files/additional-sheet-test-files/optimization-parameters-default.xlsx +0 -0
- package/test-files/demo-files/18_proteins_81_edges_PPI.xlsx +0 -0
- package/test-files/expression-data-test-sheets/expression_sheet_missing_data_ok_export_exact.xlsx +0 -0
- package/web-client/config/config.js +4 -4
- package/web-client/public/js/api/grnsight-api.js +18 -3
- package/web-client/public/js/constants.js +27 -12
- package/web-client/public/js/generateNetwork.js +170 -72
- package/web-client/public/js/graph.js +424 -161
- package/web-client/public/js/grnsight.js +25 -4
- package/web-client/public/js/grnstate.js +4 -1
- package/web-client/public/js/iframe-coordination.js +3 -3
- package/web-client/public/js/setup-handlers.js +76 -61
- package/web-client/public/js/setup-load-and-import-handlers.js +32 -7
- package/web-client/public/js/update-app.js +119 -28
- package/web-client/public/js/upload.js +142 -85
- package/web-client/public/js/warnings.js +25 -0
- package/web-client/public/lib/bootstrap.file-input/bootstrap.file-input.js +0 -1
- package/web-client/public/stylesheets/grnsight.styl +40 -16
- package/web-client/views/components/demo.pug +7 -5
- package/web-client/views/upload.pug +64 -50
- package/database/network-database/scripts/filter_genes.py +0 -76
- package/database/network-database/scripts/loader.py +0 -79
- package/database/network-database/scripts/loader_updates.py +0 -99
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# GRNsettings Database
|
|
2
|
+
The schema of this database lives within this directory.
|
|
3
|
+
|
|
4
|
+
## The basics
|
|
5
|
+
|
|
6
|
+
### Schema
|
|
7
|
+
The default database name is stored within the settings schema on our Postgres database.
|
|
8
|
+
|
|
9
|
+
The schema is located within this directory at the top level of this file `schema.sql`. It creates the schema as well as defining the table located within the settings schema.
|
|
10
|
+
|
|
11
|
+
1. Move to the directory that contains the `schema.sql` file, which is under the `database/grnsettings-database` folder
|
|
12
|
+
|
|
13
|
+
2. Load database
|
|
14
|
+
|
|
15
|
+
Example of loading to local database
|
|
16
|
+
|
|
17
|
+
For Windows:
|
|
18
|
+
```
|
|
19
|
+
psql -U postgres -f schema.sql postgresql://localhost/postgres
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
For Mac:
|
|
23
|
+
```
|
|
24
|
+
psql -f schema.sql postgresql://localhost/postgres
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
### Changing the default database name
|
|
28
|
+
|
|
29
|
+
1. In order to change the default database name you would first need to login to the database using the following command:
|
|
30
|
+
|
|
31
|
+
For Windows:
|
|
32
|
+
```
|
|
33
|
+
psql -U postgres <address to database>
|
|
34
|
+
```
|
|
35
|
+
For Mac:
|
|
36
|
+
```
|
|
37
|
+
psql <address to database>
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
2. Then you will need to set your search path to the settings schema using the following command:
|
|
41
|
+
```
|
|
42
|
+
SET SEARCH_PATH TO settings;
|
|
43
|
+
```
|
|
44
|
+
3. After that you will simply delete the current default database name using this command:
|
|
45
|
+
```
|
|
46
|
+
DELETE FROM grnsettings;
|
|
47
|
+
```
|
|
48
|
+
4. And then insert the new database name using the following command:
|
|
49
|
+
```
|
|
50
|
+
INSERT INTO grnsettings(expression_dataset) VALUES ('the new default database name');
|
|
51
|
+
```
|
|
52
|
+
_The current default database is 'dahlquist_2018'_
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
from utils import *
|
|
3
|
+
from constants import Constants
|
|
4
|
+
# python3 loader.py | psql postgresql://localhost/postgres
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
if not os.path.exists('union-gene-data'):
|
|
8
|
+
os.makedirs('union-gene-data')
|
|
9
|
+
|
|
10
|
+
# Get union gene data
|
|
11
|
+
Utils.create_union_file([Constants.EXPRESSION_GENE_SOURCE, Constants.PPI_GENE_SOURCE, Constants.GRN_GENE_SOURCE], Constants.GENE_DATA_DIRECTORY)
|
|
12
|
+
|
|
13
|
+
# Regulatory Network
|
|
14
|
+
Utils.load_sources(Constants.GRN_SOURCE_TABLE_DATA_DIRECTORY, Constants.GRN_DATABASE_NAMESPACE)
|
|
15
|
+
Utils.load_grn_genes(Constants.GRN_GENE_SOURCE, Constants.GRN_DATABASE_NAMESPACE)
|
|
16
|
+
Utils.load_grn_network(Constants.GRN_NETWORK_TABLE_DATA_DIRECTORY, Constants.GRN_DATABASE_NAMESPACE)
|
|
17
|
+
|
|
18
|
+
# Protein-protein-interactions
|
|
19
|
+
Utils.load_sources(Constants.PPI_SOURCE_TABLE_DATA_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
|
|
20
|
+
Utils.load_ppi_genes(Constants.PPI_GENE_SOURCE, Constants.PPI_DATABASE_NAMESPACE)
|
|
21
|
+
Utils.load_proteins(Constants.PPI_PROTEIN_TABLE_DATA_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
|
|
22
|
+
Utils.load_ppi_network(Constants.PPI_NETWORK_TABLE_DATA_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
|
|
23
|
+
|
|
24
|
+
# Expression data
|
|
25
|
+
Utils.load_refs(Constants.EXPRESSION_REFS_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
|
|
26
|
+
Utils.load_expression_genes(Constants.EXPRESSION_GENE_SOURCE, Constants.EXPRESISON_DATABASE_NAMESPACE)
|
|
27
|
+
Utils.load_expression_metadata(Constants.EXPRESSION_METADATA_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
|
|
28
|
+
Utils.load_expression_data(Constants.EXPRESSION_EXPRESSION_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
|
|
29
|
+
Utils.load_production_rates(Constants.EXPRESSION_PRODUCTION_RATE_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
|
|
30
|
+
Utils.load_degradation_rates(Constants.EXPRESSION_DEGRADATION_RATE_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from constants import Constants
|
|
3
|
+
from utils import Utils
|
|
4
|
+
|
|
5
|
+
def load_grn_data_into_database():
|
|
6
|
+
Utils.load_sources(Constants.GRN_SOURCE_TABLE_DATA_DIRECTORY, Constants.GRN_DATABASE_NAMESPACE)
|
|
7
|
+
Utils.update_grn_genes(Constants.GRN_UPDATE_GENE_DIRECTORY, Constants.GRN_DATABASE_NAMESPACE)
|
|
8
|
+
Utils.load_grn_genes(Constants.GRN_MISSING_GENE_DIRECTORY, Constants.GRN_DATABASE_NAMESPACE)
|
|
9
|
+
Utils.load_grn_network(Constants.GRN_NETWORK_TABLE_DATA_DIRECTORY, Constants.GRN_DATABASE_NAMESPACE)
|
|
10
|
+
|
|
11
|
+
def load_ppi_data_into_database():
|
|
12
|
+
Utils.load_sources(Constants.PPI_SOURCE_TABLE_DATA_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
|
|
13
|
+
Utils.update_ppi_genes(Constants.PPI_UPDATE_GENE_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
|
|
14
|
+
Utils.update_ppi_proteins(Constants.PPI_UPDATE_PROTEIN_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
|
|
15
|
+
Utils.load_ppi_genes(Constants.PPI_MISSING_GENE_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
|
|
16
|
+
Utils.load_proteins(Constants.PPI_MISSING_PROTEIN_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
|
|
17
|
+
Utils.load_ppi_network(Constants.PPI_NETWORK_TABLE_DATA_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
|
|
18
|
+
|
|
19
|
+
def main():
|
|
20
|
+
# Set up argument parsing
|
|
21
|
+
parser = argparse.ArgumentParser(description="Load data into database for GRN or PPI networks.")
|
|
22
|
+
parser.add_argument('--network', choices=['GRN', 'PPI'], required=True, help="Specify the network type to load data for")
|
|
23
|
+
|
|
24
|
+
# Parse arguments
|
|
25
|
+
args = parser.parse_args()
|
|
26
|
+
|
|
27
|
+
# Execute the relevant operations based on the argument
|
|
28
|
+
if args.network == 'GRN':
|
|
29
|
+
load_grn_data_into_database()
|
|
30
|
+
elif args.network == 'PPI':
|
|
31
|
+
load_ppi_data_into_database()
|
|
32
|
+
else:
|
|
33
|
+
print("Invalid network type. Please choose 'GRN' or 'PPI'.")
|
|
34
|
+
|
|
35
|
+
if __name__ == "__main__":
|
|
36
|
+
main()
|
|
@@ -111,29 +111,21 @@ for regulator in regulators_to_targets:
|
|
|
111
111
|
if regulator != None:
|
|
112
112
|
regulators_list.append(regulator)
|
|
113
113
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
regulator_to_regulator_file.write(f'{headers}\n')
|
|
130
|
-
for target in targets:
|
|
131
|
-
result = create_regulator_to_target_row(target, regulators_to_targets)
|
|
132
|
-
if result != False:
|
|
133
|
-
regulator_to_regulator_file.write(f'{result}\n')
|
|
134
|
-
regulator_to_regulator_file.close()
|
|
135
|
-
|
|
136
|
-
|
|
114
|
+
def createMatrix(fileName: str):
|
|
115
|
+
statusMessage: str = 'Creating REGULATORS TO TARGETS MATRIX\n' if fileName == REGULATORS_TO_TARGETS_MATRIX else 'Creating REGULATORS TO REGULATORS MATRIX\n'
|
|
116
|
+
print(statusMessage)
|
|
117
|
+
regulator_to_target_file = open(fileName, 'w')
|
|
118
|
+
headers = "cols regulators/rows targets\t"
|
|
119
|
+
headers += '\t'.join(regulators_list)
|
|
120
|
+
regulator_to_target_file.write(f'{headers}\n')
|
|
121
|
+
for target in targets:
|
|
122
|
+
result = create_regulator_to_target_row(target, regulators_to_targets)
|
|
123
|
+
if result:
|
|
124
|
+
regulator_to_target_file.write(f'{result}\n')
|
|
125
|
+
regulator_to_target_file.close()
|
|
126
|
+
|
|
127
|
+
createMatrix(REGULATORS_TO_TARGETS_MATRIX)
|
|
128
|
+
createMatrix(REGULATORS_TO_REGULATORS_MATRIX)
|
|
137
129
|
|
|
138
130
|
# Create loader-files
|
|
139
131
|
|
|
@@ -17,7 +17,7 @@ def get_all_genes():
|
|
|
17
17
|
with db.connect() as connection:
|
|
18
18
|
result_set = connection.execute(
|
|
19
19
|
f"""
|
|
20
|
-
SELECT display_gene_id, gene_id FROM gene_regulatory_network.gene
|
|
20
|
+
SELECT display_gene_id, gene_id FROM gene_regulatory_network.gene
|
|
21
21
|
""")
|
|
22
22
|
result = result_set.fetchall()
|
|
23
23
|
return list(result)
|
|
@@ -126,29 +126,22 @@ for regulator in regulators_to_targets:
|
|
|
126
126
|
if regulator != None:
|
|
127
127
|
regulators_list.append(regulator)
|
|
128
128
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
regulator_to_target_file
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
for target in targets:
|
|
146
|
-
result = create_regulator_to_target_row(target, regulators_to_targets)
|
|
147
|
-
if result != False:
|
|
148
|
-
regulator_to_regulator_file.write(f'{result}\n')
|
|
149
|
-
regulator_to_regulator_file.close()
|
|
150
|
-
|
|
151
|
-
|
|
129
|
+
# this is the same method used in generate_network.py to create Regulators to Targets Matrix and Regulators to Regulators Matrix
|
|
130
|
+
def createMatrix(fileName: str):
|
|
131
|
+
statusMessage: str = 'Creating REGULATORS TO TARGETS MATRIX\n' if fileName == REGULATORS_TO_TARGETS_MATRIX else 'Creating REGULATORS TO REGULATORS MATRIX\n'
|
|
132
|
+
print(statusMessage)
|
|
133
|
+
regulator_to_target_file = open(fileName, 'w')
|
|
134
|
+
headers = "cols regulators/rows targets\t"
|
|
135
|
+
headers += '\t'.join(regulators_list)
|
|
136
|
+
regulator_to_target_file.write(f'{headers}\n')
|
|
137
|
+
for target in targets:
|
|
138
|
+
result = create_regulator_to_target_row(target, regulators_to_targets)
|
|
139
|
+
if result:
|
|
140
|
+
regulator_to_target_file.write(f'{result}\n')
|
|
141
|
+
regulator_to_target_file.close()
|
|
142
|
+
|
|
143
|
+
createMatrix(REGULATORS_TO_TARGETS_MATRIX)
|
|
144
|
+
createMatrix(REGULATORS_TO_REGULATORS_MATRIX)
|
|
152
145
|
|
|
153
146
|
# Create loader-files
|
|
154
147
|
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# Protein-Protein Database
|
|
2
|
+
|
|
3
|
+
All files pertaining to the protein-protein database live within this directory.
|
|
4
|
+
|
|
5
|
+
## The basics
|
|
6
|
+
|
|
7
|
+
#### Schema
|
|
8
|
+
|
|
9
|
+
All network data is stored within the protein_protein_interactions schema on our Postgres database.
|
|
10
|
+
|
|
11
|
+
The schema is located within this directory at the top level in the file `schema.sql`. It defines the tables located within the protein_protein_interactions schema.
|
|
12
|
+
|
|
13
|
+
Usage:
|
|
14
|
+
To load to local database
|
|
15
|
+
```
|
|
16
|
+
psql -f schema.sql postgresql://localhost/postgres
|
|
17
|
+
```
|
|
18
|
+
To load to production database
|
|
19
|
+
```
|
|
20
|
+
psql -f schema.sql <address to database>
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
### Scripts
|
|
24
|
+
|
|
25
|
+
All scripts live within the subdirectory `scripts`, located in the top-level of the network database directory.
|
|
26
|
+
|
|
27
|
+
Any source files required to run the scripts live within the subdirectory `source-files`, located in the top-level of the network database directory. As source files may be large, you must create this directory yourself and add any source files you need to use there.
|
|
28
|
+
|
|
29
|
+
All generated results of the scripts live in the subdirectory `script-results`, located in the top-level of the network database directory. Currently, all scripts that generate code create the directory if it does not currently exist. When adding a new script that generates resulting code, best practice is to create the script-results directory and any subdirectories if it does not exist, in order to prevent errors and snafus for recently cloned repositories.
|
|
30
|
+
|
|
31
|
+
Within the scripts directory, there are the following files:
|
|
32
|
+
|
|
33
|
+
- `generate_protein_network.py`
|
|
34
|
+
- `remove_duplicates.sh`
|
|
35
|
+
- `loader.py`
|
|
36
|
+
|
|
37
|
+
#### Data Generator
|
|
38
|
+
|
|
39
|
+
This script (`generate_protein_network.py`) generates the genes, protein information and the physical interactions between these genes from Yeastmine; then it writes this into the csv files used to load the database. Please make sure you have enough time (around 1.5 - 2 hours) to run this script. The files (`gene.csv`), (`physical_interaction.csv`), (`protein.csv`) will be generated in the script-results directory located in the sub-directory processed-loader-files.
|
|
40
|
+
|
|
41
|
+
Usage:
|
|
42
|
+
```
|
|
43
|
+
python3 generate_protein_network.py
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Once you have finished generating the loader files, you need to remove duplicate entries from the physical interactions file. The bash script (`remove_duplicates.sh`) does this for you. The resultant file (`no_dupe.csv`) will be generated in the script-results directory located in the sub-directory processed-loader-files. If your machine doesn't support bash shell scripts, then you have to make a new script that removes duplicate lines from a file and writes the results to a file. Sorry!
|
|
47
|
+
|
|
48
|
+
Usage:
|
|
49
|
+
```
|
|
50
|
+
chmod u+x remove_duplicates.sh
|
|
51
|
+
|
|
52
|
+
./remove_duplicates.sh
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
#### Database Loader
|
|
56
|
+
|
|
57
|
+
This script (`loader.py`) is to be used to load your preprocessed expression data into the database.
|
|
58
|
+
|
|
59
|
+
This program generates direct SQL statements from the source files generated by the data preprocessor in order to populate a relational database with those files’ data
|
|
60
|
+
|
|
61
|
+
Note: You may get an error saying that there was a duplicate protein. You have to manually check which protein was being inserted twice, go to the SGD website (or Yeastmine) and confirm the correct protein gene interaction. Currently this occurs with the protein 'Aad6p'. To fix it go to your protein.csv file and make sure that 'Aad6p' is paired with the gene 'YFL056C', and 'Aad16p' is paired with the gene 'YFL057C'. If any other issues arise, you must manually confirm on the SGD website. Sorry!
|
|
62
|
+
|
|
63
|
+
Usage:
|
|
64
|
+
To load to local database
|
|
65
|
+
```
|
|
66
|
+
python3 loader.py | psql postgresql://localhost/postgres
|
|
67
|
+
```
|
|
68
|
+
To load to production database
|
|
69
|
+
```
|
|
70
|
+
python3 loader.py | psql <path to database>
|
|
71
|
+
```
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
CREATE TABLE protein_protein_interactions.source (
|
|
2
|
+
time_stamp TIMESTAMP WITH TIME ZONE,
|
|
3
|
+
source VARCHAR,
|
|
4
|
+
display_name VARCHAR,
|
|
5
|
+
PRIMARY KEY(time_stamp, source)
|
|
6
|
+
);
|
|
7
|
+
|
|
8
|
+
CREATE TABLE protein_protein_interactions.gene (
|
|
9
|
+
gene_id VARCHAR, -- systematic like name
|
|
10
|
+
display_gene_id VARCHAR, -- standard like name
|
|
11
|
+
species VARCHAR,
|
|
12
|
+
taxon_id VARCHAR,
|
|
13
|
+
PRIMARY KEY(gene_id, taxon_id)
|
|
14
|
+
);
|
|
15
|
+
|
|
16
|
+
CREATE TABLE protein_protein_interactions.protein (
|
|
17
|
+
standard_name VARCHAR PRIMARY KEY,
|
|
18
|
+
gene_systematic_name VARCHAR,
|
|
19
|
+
length FLOAT,
|
|
20
|
+
molecular_weight FLOAT,
|
|
21
|
+
PI FLOAT,
|
|
22
|
+
taxon_id VARCHAR,
|
|
23
|
+
FOREIGN KEY (gene_systematic_name, taxon_id) REFERENCES protein_protein_interactions.gene(gene_id, taxon_id)
|
|
24
|
+
);
|
|
25
|
+
|
|
26
|
+
CREATE TABLE protein_protein_interactions.physical_interactions (
|
|
27
|
+
protein1 VARCHAR,
|
|
28
|
+
protein2 VARCHAR,
|
|
29
|
+
interaction_detection_methods_identifier VARCHAR,
|
|
30
|
+
experiment_name VARCHAR,
|
|
31
|
+
time_stamp TIMESTAMP WITH TIME ZONE,
|
|
32
|
+
source VARCHAR,
|
|
33
|
+
FOREIGN KEY (protein1) REFERENCES protein_protein_interactions.protein(standard_name),
|
|
34
|
+
FOREIGN KEY (protein2) REFERENCES protein_protein_interactions.protein(standard_name),
|
|
35
|
+
FOREIGN KEY (time_stamp, source) REFERENCES protein_protein_interactions.source(time_stamp, source),
|
|
36
|
+
CONSTRAINT unique_physical_interaction UNIQUE (protein1, protein2, interaction_detection_methods_identifier, experiment_name, time_stamp, source)
|
|
37
|
+
);
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
from __future__ import print_function
|
|
2
|
+
|
|
3
|
+
from intermine.webservice import Service
|
|
4
|
+
service = Service("https://yeastmine.yeastgenome.org/yeastmine/service")
|
|
5
|
+
|
|
6
|
+
import csv
|
|
7
|
+
import re
|
|
8
|
+
import sys
|
|
9
|
+
import os
|
|
10
|
+
import gc
|
|
11
|
+
import datetime
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Create files
|
|
15
|
+
|
|
16
|
+
# Create folder paths
|
|
17
|
+
if not os.path.exists('../script-results'):
|
|
18
|
+
os.makedirs('../script-results')
|
|
19
|
+
|
|
20
|
+
if not os.path.exists('../script-results/processed-loader-files'):
|
|
21
|
+
os.makedirs('../script-results/processed-loader-files')
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Files to be generated
|
|
26
|
+
GENE_FILE = '../script-results/processed-loader-files/gene.csv'
|
|
27
|
+
PROTEIN_FILE = '../script-results/processed-loader-files/protein.csv'
|
|
28
|
+
PHYSICAL_INTERACTION_FILE = '../script-results/processed-loader-files/physical_interaction.csv'
|
|
29
|
+
SOURCE_DESTINATION = '../script-results/processed-loader-files/source.csv'
|
|
30
|
+
|
|
31
|
+
# Instantiate Source variables
|
|
32
|
+
timestamp = datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0)
|
|
33
|
+
source = "YeastMine - Saccharomyces Genome Database"
|
|
34
|
+
display_name = "Yeastmine - SGD"
|
|
35
|
+
|
|
36
|
+
# Get Network Data from Yeastmine
|
|
37
|
+
|
|
38
|
+
def getPhysicalInteractions(gene):
|
|
39
|
+
print("Query data from Yeastmine to get Physical Interactions")
|
|
40
|
+
query = service.new_query("Gene")
|
|
41
|
+
query.add_constraint("interactions.participant2", "Gene")
|
|
42
|
+
query.add_view(
|
|
43
|
+
"primaryIdentifier", "symbol", "secondaryIdentifier", "sgdAlias", "name",
|
|
44
|
+
"organism.shortName", "interactions.details.annotationType",
|
|
45
|
+
"interactions.details.role1", "interactions.participant2.symbol",
|
|
46
|
+
"interactions.participant2.secondaryIdentifier",
|
|
47
|
+
"interactions.details.experiment.interactionDetectionMethods.identifier",
|
|
48
|
+
"interactions.details.experiment.name",
|
|
49
|
+
"interactions.details.relationshipType", "interactions.details.note"
|
|
50
|
+
)
|
|
51
|
+
query.add_constraint("organism.shortName", "=", "S. cerevisiae", code="B")
|
|
52
|
+
query.add_constraint("interactions.details.relationshipType", "=", "physical", code="C")
|
|
53
|
+
query.add_constraint("Gene", "LOOKUP", gene, code="A")
|
|
54
|
+
return query
|
|
55
|
+
|
|
56
|
+
def getProteinFromGene(gene):
|
|
57
|
+
print("Query data from Yeastmine to get Protein information")
|
|
58
|
+
query = service.new_query("Gene")
|
|
59
|
+
query.add_view(
|
|
60
|
+
"primaryIdentifier", "proteins.symbol", "sgdAlias", "proteins.length",
|
|
61
|
+
"proteins.molecularWeight", "proteins.pI", "featureType", "qualifier",
|
|
62
|
+
"description", "proteins.sequence.residues"
|
|
63
|
+
)
|
|
64
|
+
query.add_constraint("organism.shortName", "=", "S. cerevisiae", code="B")
|
|
65
|
+
query.add_constraint("Gene", "LOOKUP", gene, code="A")
|
|
66
|
+
return query
|
|
67
|
+
|
|
68
|
+
def getAllProteins():
|
|
69
|
+
print("Query data from Yeastmine to get all proteins")
|
|
70
|
+
|
|
71
|
+
query = service.new_query("Protein")
|
|
72
|
+
|
|
73
|
+
query.add_view(
|
|
74
|
+
"genes.primaryIdentifier", "genes.secondaryIdentifier", "symbol", "length",
|
|
75
|
+
"molecularWeight", "pI", "genes.featureType", "genes.qualifier",
|
|
76
|
+
"genes.sgdAlias", "genes.description"
|
|
77
|
+
)
|
|
78
|
+
query.add_sort_order("Protein.symbol", "ASC")
|
|
79
|
+
query.add_constraint("genes.featureType", "=", "transposable_element_gene", code="G")
|
|
80
|
+
query.add_constraint("genes.featureType", "=", "ORF", code="F")
|
|
81
|
+
query.add_constraint("genes.status", "=", "Active", code="D")
|
|
82
|
+
query.add_constraint("genes.featureType", "=", "blocked_reading_frame", code="E")
|
|
83
|
+
query.add_constraint("genes.featureType", "=", "intein_encoding_region", code="H")
|
|
84
|
+
query.add_constraint("organism.name", "=", "Saccharomyces cerevisiae", code="A")
|
|
85
|
+
query.set_logic("A and D and (F or G or E or H)")
|
|
86
|
+
|
|
87
|
+
return query
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
query = getAllProteins()
|
|
92
|
+
all_proteins = {}
|
|
93
|
+
|
|
94
|
+
genes = {
|
|
95
|
+
# stored as gene sysyematic name : {
|
|
96
|
+
# proteins : {protein standard name : {protein info}}
|
|
97
|
+
# }
|
|
98
|
+
}
|
|
99
|
+
print("COLLECTING PROTEINS FROM QUERY RESULTS\n")
|
|
100
|
+
|
|
101
|
+
for row in query.rows():
|
|
102
|
+
gene_systematic_name = row["genes.secondaryIdentifier"]
|
|
103
|
+
protein_standard_name = row["symbol"]
|
|
104
|
+
length = row["length"]
|
|
105
|
+
molecular_weight = row["molecularWeight"]
|
|
106
|
+
PI = row["pI"]
|
|
107
|
+
genes[gene_systematic_name] = {
|
|
108
|
+
"standard_name" : None,
|
|
109
|
+
"protein" : {
|
|
110
|
+
"standard_name": protein_standard_name,
|
|
111
|
+
"length": length,
|
|
112
|
+
"molecular_weight": molecular_weight,
|
|
113
|
+
"PI": PI
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
print("COLLECTING/WRITING INTERACTIONS\n")
|
|
119
|
+
file = open(PHYSICAL_INTERACTION_FILE,"w")
|
|
120
|
+
print(f"Open file {PHYSICAL_INTERACTION_FILE} and write data into that file")
|
|
121
|
+
file.write(f"Protein1\tProtein2\tInteraction Detection Methods Identifier\tExperiment Name\tTime_Stamp\tSource\n")
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
exceptions = []
|
|
125
|
+
print("Processing Physical Interactions")
|
|
126
|
+
for gene in genes:
|
|
127
|
+
query = getPhysicalInteractions(gene)
|
|
128
|
+
first_row = True
|
|
129
|
+
|
|
130
|
+
for row in query.rows():
|
|
131
|
+
gene1 = row["secondaryIdentifier"]
|
|
132
|
+
gene2 = row["interactions.participant2.secondaryIdentifier"]
|
|
133
|
+
if first_row:
|
|
134
|
+
# update the gene's standard name
|
|
135
|
+
genes[gene]["standard_name"] = row["symbol"] if row["symbol"] != None else gene
|
|
136
|
+
first_row = False
|
|
137
|
+
if gene2 in genes:
|
|
138
|
+
g = sorted([genes[gene1]["protein"]["standard_name"], genes[gene2]["protein"]["standard_name"]])
|
|
139
|
+
idmi = row["interactions.details.experiment.interactionDetectionMethods.identifier"]
|
|
140
|
+
exp_name = row["interactions.details.experiment.name"]
|
|
141
|
+
|
|
142
|
+
if gene2 in genes and gene1 in genes:
|
|
143
|
+
file.write(f'{g[0]}\t{g[1]}\t{idmi}\t{exp_name}\t{timestamp}\t{source}\n')
|
|
144
|
+
else:
|
|
145
|
+
exceptions.append(gene2)
|
|
146
|
+
|
|
147
|
+
print("Handling Exceptions")
|
|
148
|
+
failed_genes = []
|
|
149
|
+
while exceptions != None:
|
|
150
|
+
acceptable_genes = []
|
|
151
|
+
for gene in exceptions:
|
|
152
|
+
query = getProteinFromGene(gene)
|
|
153
|
+
rows = query.rows()
|
|
154
|
+
for row in rows:
|
|
155
|
+
acceptable_genes.append(gene)
|
|
156
|
+
protein_standard_name = row["proteins.symbol"]
|
|
157
|
+
length = row["proteins.length"]
|
|
158
|
+
molecular_weight = row["proteins.molecularWeight"]
|
|
159
|
+
PI = row["proteins.pI"]
|
|
160
|
+
genes[gene] = {
|
|
161
|
+
"standard_name" : None,
|
|
162
|
+
"protein" : {
|
|
163
|
+
"standard_name": protein_standard_name,
|
|
164
|
+
"length": length,
|
|
165
|
+
"molecular_weight": molecular_weight,
|
|
166
|
+
"PI": PI
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
if len(rows) == 0:
|
|
170
|
+
failed_genes.append(gene)
|
|
171
|
+
|
|
172
|
+
more_exceptions = []
|
|
173
|
+
for gene in acceptable_genes:
|
|
174
|
+
query = getPhysicalInteractions(gene)
|
|
175
|
+
first_row = True
|
|
176
|
+
for row in query.rows():
|
|
177
|
+
gene1 = row["secondaryIdentifier"]
|
|
178
|
+
gene2 = row["interactions.participant2.secondaryIdentifier"]
|
|
179
|
+
if first_row:
|
|
180
|
+
# update the gene's standard name
|
|
181
|
+
genes[gene]["standard_name"] = row["symbol"] if row["symbol"] != None else gene
|
|
182
|
+
first_row = False
|
|
183
|
+
if gene2 in genes:
|
|
184
|
+
g = sorted([genes[gene1]["protein"]["standard_name"], genes[gene2]["protein"]["standard_name"]])
|
|
185
|
+
idmi = row["interactions.details.experiment.interactionDetectionMethods.identifier"]
|
|
186
|
+
exp_name = row["interactions.details.experiment.name"]
|
|
187
|
+
|
|
188
|
+
if gene2 in genes and gene1 in genes:
|
|
189
|
+
file.write(f'{g[0]}\t{g[1]}\t{idmi}\t{exp_name}\t{timestamp}\t{source}\n')
|
|
190
|
+
elif gene not in failed_genes:
|
|
191
|
+
more_exceptions.append(gene2)
|
|
192
|
+
if len(more_exceptions) == 0:
|
|
193
|
+
exceptions = None
|
|
194
|
+
else :
|
|
195
|
+
exceptions = more_exceptions
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
file.close()
|
|
199
|
+
|
|
200
|
+
# Source Table
|
|
201
|
+
|
|
202
|
+
print(f"Completed {PHYSICAL_INTERACTION_FILE} Starting{SOURCE_DESTINATION}")
|
|
203
|
+
|
|
204
|
+
source_file = open(SOURCE_DESTINATION, 'w')
|
|
205
|
+
headers = f'Timestamp\tSource\tDisplay Name\n{timestamp}\t{source}\t{display_name}'
|
|
206
|
+
source_file.write(f'{headers}\n')
|
|
207
|
+
source_file.close()
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
species = "Saccharomyces cerevisiae"
|
|
211
|
+
taxon_id = "559292"
|
|
212
|
+
|
|
213
|
+
# create gene csv
|
|
214
|
+
print(f"Completed {SOURCE_DESTINATION} Starting{GENE_FILE}")
|
|
215
|
+
file = open(GENE_FILE,"w")
|
|
216
|
+
file.write(f"Gene ID\tDisplay Gene ID\tSpecies\tTaxon ID\n")
|
|
217
|
+
for gene in genes:
|
|
218
|
+
file.write(f"{gene}\t{genes[gene]['standard_name']}\t{species}\t{taxon_id}\n")
|
|
219
|
+
file.close()
|
|
220
|
+
|
|
221
|
+
# create protein csv
|
|
222
|
+
print(f"Completed {GENE_FILE} Starting{PROTEIN_FILE}")
|
|
223
|
+
file = open(PROTEIN_FILE, "w")
|
|
224
|
+
file.write(f"Standard Name\tGene Systematic Name\tLength\tMolecular Weight\tPI\tTaxon ID\n")
|
|
225
|
+
for gene in genes:
|
|
226
|
+
file.write(f"{genes[gene]['protein']['standard_name']}\t{gene}\t{genes[gene]['protein']['length']}\t{genes[gene]['protein']['molecular_weight']}\t{genes[gene]['protein']['PI']}\t{taxon_id}\n")
|
|
227
|
+
file.close()
|