npm - grnsight - Versions diffs - 6.0.7 → 7.2.0 - Mend

grnsight 6.0.7 → 7.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

package/.eslintrc.yml +4 -4
package/.github/workflows/node.js.yml +35 -0
package/README.md +1 -1
package/database/README.md +218 -97
package/database/constants.py +42 -0
package/database/filter_update.py +168 -0
package/database/grnsettings-database/README.md +52 -0
package/database/grnsettings-database/schema.sql +4 -0
package/database/loader.py +30 -0
package/database/loader_update.py +36 -0
package/database/network-database/scripts/generate_network.py +15 -23
package/database/network-database/scripts/generate_new_network_version.py +17 -24
package/database/protein-protein-database/README.md +71 -0
package/database/protein-protein-database/schema.sql +37 -0
package/database/protein-protein-database/scripts/generate_protein_network.py +227 -0
package/database/protein-protein-database/scripts/remove_duplicates.sh +4 -0
package/database/utils.py +418 -0
package/package.json +3 -2
package/server/app.js +2 -0
package/server/config/config.js +4 -4
package/server/controllers/additional-sheet-parser.js +2 -1
package/server/controllers/constants.js +5 -0
package/server/controllers/custom-workbook-controller.js +4 -3
package/server/controllers/demo-workbooks.js +1462 -6
package/server/controllers/export-constants.js +3 -2
package/server/controllers/exporters/sif.js +6 -1
package/server/controllers/exporters/xlsx.js +8 -3
package/server/controllers/expression-sheet-parser.js +0 -6
package/server/controllers/grnsettings-database-controller.js +17 -0
package/server/controllers/importers/sif.js +30 -11
package/server/controllers/network-database-controller.js +2 -2
package/server/controllers/network-sheet-parser.js +54 -12
package/server/controllers/protein-database-controller.js +18 -0
package/server/controllers/sif-constants.js +11 -4
package/server/controllers/spreadsheet-controller.js +44 -1
package/server/controllers/workbook-constants.js +21 -4
package/server/dals/expression-dal.js +4 -4
package/server/dals/grnsetting-dal.js +49 -0
package/server/dals/network-dal.js +14 -15
package/server/dals/protein-dal.js +106 -0
package/test/additional-sheet-parser-tests.js +1 -1
package/test/export-tests.js +136 -9
package/test/import-sif-tests.js +67 -13
package/test/test.js +1 -1
package/test-files/additional-sheet-test-files/optimization-parameters-default.xlsx +0 -0
package/test-files/demo-files/18_proteins_81_edges_PPI.xlsx +0 -0
package/test-files/expression-data-test-sheets/expression_sheet_missing_data_ok_export_exact.xlsx +0 -0
package/web-client/config/config.js +4 -4
package/web-client/public/js/api/grnsight-api.js +18 -3
package/web-client/public/js/constants.js +27 -12
package/web-client/public/js/generateNetwork.js +170 -72
package/web-client/public/js/graph.js +424 -161
package/web-client/public/js/grnsight.js +25 -4
package/web-client/public/js/grnstate.js +4 -1
package/web-client/public/js/iframe-coordination.js +3 -3
package/web-client/public/js/setup-handlers.js +76 -61
package/web-client/public/js/setup-load-and-import-handlers.js +32 -7
package/web-client/public/js/update-app.js +119 -28
package/web-client/public/js/upload.js +142 -85
package/web-client/public/js/warnings.js +25 -0
package/web-client/public/lib/bootstrap.file-input/bootstrap.file-input.js +0 -1
package/web-client/public/stylesheets/grnsight.styl +40 -16
package/web-client/views/components/demo.pug +7 -5
package/web-client/views/upload.pug +64 -50
package/database/network-database/scripts/filter_genes.py +0 -76
package/database/network-database/scripts/loader.py +0 -79
package/database/network-database/scripts/loader_updates.py +0 -99

package/.eslintrc.yml CHANGED Viewed

@@ -4,9 +4,9 @@ env:
   jquery: true
   mocha: true
   es6: true
-extends: 'eslint:recommended'
+extends: "eslint:recommended"
 parserOptions:
-  ecmaVersion: 6
+  ecmaVersion: 8
   sourceType: module
   ecmaFeatures:
     jsx: true
@@ -45,7 +45,7 @@ rules:
   brace-style:
     - error
     - 1tbs
-    - allowSingleLine: true
+    - allowSingleLine: true
   comma-spacing:
     - error
   max-len:
@@ -63,6 +63,6 @@ rules:
     - error
   space-before-function-paren:
     - error
-    - anonymous: 'always'
+    - anonymous: "always"
   no-trailing-spaces:
     - error

package/.github/workflows/node.js.yml ADDED Viewed

@@ -0,0 +1,35 @@
+# This workflow will do a clean installation of node dependencies, cache/restore them, build the source code and run tests across different versions of node
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-nodejs
+name: Node.js CI
+on: [push]
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        node-version: [18.x, 20.x, 22.x]
+    steps:
+    - uses: actions/checkout@v4
+    # install system dependencies needed by the 'canvas' package
+    - name: Install dependencies for canvas
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y libcairo2-dev libpango1.0-dev libjpeg62 libgif-dev librsvg2-dev
+    - name: Use Node.js ${{ matrix.node-version }}
+      uses: actions/setup-node@v4
+      with:
+        node-version: ${{ matrix.node-version }}
+        cache: 'npm'
+    - run: npm ci
+    - run: npm run lint
+    - run: npm run build --if-present
+    - run: npm test

package/README.md CHANGED Viewed

@@ -1,7 +1,7 @@
 GRNsight
 ========
 [![DOI](https://zenodo.org/badge/16195791.svg)](https://zenodo.org/badge/latestdoi/16195791)
-[![Build Status](https://app.travis-ci.com/dondi/GRNsight.svg?branch=master)](https://app.travis-ci.com/dondi/GRNsight)
+[![Node.js CI](https://github.com/dondi/GRNsight/actions/workflows/node.js.yml/badge.svg)](https://github.com/dondi/GRNsight/actions/workflows/node.js.yml)
 [![Coverage Status](https://coveralls.io/repos/github/dondi/GRNsight/badge.svg?branch=master)](https://coveralls.io/github/dondi/GRNsight?branch=master)
 http://dondi.github.io/GRNsight/

package/database/README.md CHANGED Viewed

@@ -1,111 +1,232 @@
 # GRNsight Database
 Here are the files pertaining to both the network and expression databases. Look within the README.md files of both folders for information pertinent to the schema that you intend to be using.
 ## Setting up a local postgres GRNsight Database
 1. Installing PostgreSQL on your computer
-    - MacOS and Windows can follow these instructions on how to install postgreSQL.
-        - Install the software at this [link](https://www.postgresql.org/download/)
-        - Initialize the database
-           - If your terminal emits a message that looks like `initdb --locale=C -E UTF-8 location-of-cluster` from Step 1B, then your installer has initialized a database for you.
-           - Open the terminal and type the command `initdb --locale=C -E UTF-8 location-of-cluster`
-           - "Cluster" is the PostgreSQL term for the file structure of a PostgreSQL database instance
-           - You will have to modify location-of-cluster to the folder name you want to store the database (you don't need to create a folder, the command will create the folder for you, just create the name)
-        - Start and stop the server
-            - Additionally, your installer may start the server for you upon installation (You can save this command for further reuse).
-            - To start the server yourself run `pg_ctl start -D location-of-cluster` (You can save this command for further reuse).
-            - To stop the server run `pg_ctl stop -D location-of-cluster`.
-    - Linux users
-      - The MacOS and Windows instructions will _probably_ not work for you. You can try at your own risk to check.
-      - Linux users can try these [instructions](https://www.geeksforgeeks.org/install-postgresql-on-linux/) and that should work for you (...maybe...). If it doesn't try googling instructions with your specific operating system. Sorry!
+   - MacOS and Windows can follow these instructions on how to install postgreSQL.
+     - Install the software at this [link](https://www.postgresql.org/download/)
+     - > MacOS users: It is recommended to install with homebrew rather than the interactive installation in order to correctly view the `initdb --locale=C -E UTF-8 location-of-cluster` message in the documentation.
+     - > Windows users: when prompted for a password at the end of the installation process, save this password. It is the password for the postgres user
+     - Initialize the database
+       - If your terminal emits a message that looks like `initdb --locale=C -E UTF-8 location-of-cluster` from Step 1B, then your installer has initialized a database for you.
+       - Open the terminal and type the command `initdb --locale=C -E UTF-8 location-of-cluster`
+       - "Cluster" is the PostgreSQL term for the file structure of a PostgreSQL database instance
+       - You will have to modify location-of-cluster to the folder name you want to store the database (you don't need to create a folder, the command will create the folder for you, just create the name)
+     - Start and stop the server
+       - Additionally, your installer may start the server for you upon installation (You can save this command for further reuse).
+       - To start the server yourself run `pg_ctl start -D location-of-cluster` (You can save this command for further reuse).
+       - To stop the server run `pg_ctl stop -D location-of-cluster`.
+         - After installing with homebrew on MacOS, you may receive an error when you try to start the server saying that the server is unable to be started, and when attempting to stop the server, the terminal states that there is no server running. In this case, you have to directly kill the process that is listening on the server's port.
+         - To double check that this is the issue, you can open the Activity Monitor app on your computer and search for the `postgres` activity. If there is one, that means the server is running, and we have to terminate the port that the server is running on.
+         - First, we have to check what port the server is running on. Navigate to your homebrew installation, which is the same `location-of-cluster` from when the database was initialized and open that location in VSCode.
+         - Search for `port =` in the file `postgresql.conf`. By default, the port should be port 5432, but keep note of this port in case it is different.
+         - Refer to this Stack Overflow documentation on how to kill a server:
+           - https://stackoverflow.com/questions/4075287/node-express-eaddrinuse-address-already-in-use-kill-server
+         - If that doesn't work, then refer to the different methods on this link from Stack Overflow:
+           - https://stackoverflow.com/questions/42416527/postgres-app-port-in-use
+   - Linux users
+     - The MacOS and Windows instructions will _probably_ not work for you. You can try at your own risk to check.
+     - Linux users can try these [instructions](https://www.geeksforgeeks.org/install-postgresql-on-linux/) and that should work for you (...maybe...). If it doesn't try googling instructions with your specific operating system. Sorry!
 2. Loading data to your database
-    1. Adding the Schemas to your database.
-        1. Go into your database using the following command:
-            ```
-            psql postgresql://localhost/postgres
-            ```
-           From there, create the schemas using the following commands:
-            ```
-            CREATE SCHEMA gene_regulatory_network;
-            ```
-            ```
-            CREATE SCHEMA gene_expression;
-            ```
-           Once they are created you can exit your database using the command `\q`.
-         2. Once your schema's are created, you can add the table specifications using the following commands:
-            ```
-            psql postgresql://localhost/postgres -f <path to GRNsight/database/network-database>/schema.sql
-            ```
-            ```
-            psql postgresql://localhost/postgres -f <path to GRNsight/database/expression-database>/schema.sql
-            ```
-            Your database is now ready to accept expression and network data!
-    2. Loading the GRNsight Network Data to your local database
-        1. GRNsight generates Network Data from SGD through YeastMine. In order to run the script that generates these Network files, you must pip3 install the dependencies used. If you get an error saying that a module doesn't exist, just run `pip3 install <Module Name>` and it should fix the error. If the error persists and is found in a specific file on your machine, you might have to manually go into that file and alter the naming conventions of the dependencies that are used. _Note: So far this issue has only occured on Ubuntu 22.04.1, so you might be lucky and not have to do it!_
-          ```
-          pip3 install pandas requests intermine tzlocal
-          ```
-          Once the dependencies have been installed, you can run
-          ```
-          cd <path to GRNsight/database/network-database/scripts>
-          python3 generate_network.py
-          ```
-          This will take a while to get all of the network data and generate all of the files. This will create a folder full of the processed files in `database/network-database/script-results`.
-          *** Note: *** If you get an error similar to the following image where it references the in then you are one of the unlucky few who has to edit the intermine.py file directly.
-        ![image](https://user-images.githubusercontent.com/21343072/213089777-dfe772bc-deca-4df7-816f-72703db24d1e.png)
-          - Navigate the referenced file ( \<path specific to your machine>/intermine/webservice.py )
-          - The try-catch block should look like this:
-              - ![image](https://user-images.githubusercontent.com/21343072/213094796-c48f54da-b76c-4266-81fb-6aaef24a36c9.png)
-          - Change it to the following, rerun the `generate_network.py` command and it should work! If it doesn't you may need to troubleshoot a bit further (´◕ ᵔ ◕`✿)*ᶜʳᶦᵉˢ*.
-              - ![image](https://user-images.githubusercontent.com/21343072/213094984-bff2deb3-d26b-4809-83d6-6a6615b6e3cf.png)
-        2. Load the processed files into your database.
-            ```
-            cd <path to GRNsight/database/network-database/scripts>
-            python3 loader.py | psql postgresql://localhost/postgres
-            ```
-            This should output a bunch of COPY print statements to your terminal. Once complete your database is now loaded with the network data.
-    3. Loading the GRNsight Expression Data to your local database
-        1. Create a directory (aka folder) in the database/expression-database folder called `source-files`.
+   1. Adding the Schemas to your database.
+      1. Go into your database using the following command:
+         ```
+         psql postgresql://localhost/postgres
+         ```
+         > For Windows users use this command:
+         ```
+         psql -U postgres postgresql://localhost/postgres
+         ```
+         When prompted for the password, use the password you specified earlier during the installation process. For all future commands requiring you to access postgres, you will need to add `-U postgres `
+         From there, create the schemas using the following commands:
+         ```
+         CREATE SCHEMA gene_regulatory_network;
+         ```
+         ```
+         CREATE SCHEMA gene_expression;
+         ```
+         ```
+         CREATE SCHEMA protein_protein_interactions;
+         ```
+         Once they are created you can exit your database using the command `\q`.
+      2. Once your schemas are created, you can add the table specifications using the following commands:
+         ```
+         psql -f <path to GRNsight/database/network-database>/schema.sql postgresql://localhost/postgres
+         ```
+         ```
+         psql -f <path to GRNsight/database/expression-database>/schema.sql postgresql://localhost/postgres
+         ```
+         ```
+         psql -f <path to GRNsight/database/protein-protein-database>/schema.sql postgresql://localhost/postgres
+         ```
+         Your database is now ready to accept expression and network data!
+      3. However, before you load the data, follow the steps of grnsettings-database README.md. Instructions are [located here!](https://github.com/dondi/GRNsight/tree/master/database/grnsettings-database)
+   2. Loading the GRNsight Network Data to your local database
+      1. Getting Data for Network
+         GRNsight generates Network Data from SGD through YeastMine. In order to run the script that generates these Network files, you must pip3 install the dependencies used. If you get an error saying that a module doesn't exist, just run `pip3 install <Module Name>` and it should fix the error. If the error persists and is found in a specific file on your machine, you might have to manually go into that file and alter the naming conventions of the dependencies that are used. _Note: So far this issue has only occurred on Ubuntu 22.04.1 and certain MacOS versions, so you might be lucky and not have to do it!_
+         ```
+         pip3 install pandas requests intermine tzlocal
+         ```
+         Once the dependencies have been installed, you can run
+         ```
+         cd <path to GRNsight/database/network-database/scripts>
+         python3 generate_network.py
+         ```
+         > Windows users should use `py` instead of `python3`.
+         This will take a while to get all of the network data and generate all of the files. This will create a folder full of the processed files in `database/network-database/script-results`.
+         **Note:** If you get the following error:
+         ImportError: urllib3 v2.0 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'OpenSSL 1.1.0h 27 Mar 2018'. See: Drop support for OpenSSL<1.1.1 urllib3/urllib3#2168
+         Run `pip install urllib3==1.26.6`
+         **Note:** If you get an error similar to the one in the following image, which references a file inside the `intermine` package, then you are one of the unlucky few who have to edit that file directly.
+         ![image](https://user-images.githubusercontent.com/21343072/213089777-dfe772bc-deca-4df7-816f-72703db24d1e.png)
+         - Navigate to the referenced file ( \<path specific to your machine>/intermine/webservice.py )
+         - The try-catch block should look like this:
+         - ![image](https://user-images.githubusercontent.com/21343072/213094796-c48f54da-b76c-4266-81fb-6aaef24a36c9.png)
+         - Change it to the following, rerun the `generate_network.py` command and it should work! If it doesn't you may need to troubleshoot a bit further (´◕ ᵔ ◕`✿)_ᶜʳᶦᵉˢ_.
+         - ![image](https://user-images.githubusercontent.com/21343072/213094984-bff2deb3-d26b-4809-83d6-6a6615b6e3cf.png)
+      2. Getting Data for Expression
+         1. Create a directory (aka folder) in the database/expression-database folder called `source-files`.
             ```
             mkdir <path to GRNsight/database/expression-database>/source-files
             ```
-        2. Download the _"Expression 2020"_ folder from Box located in `GRNsight > GRNsight Expression > Expression 2020` to your newly created `source-files` folder. Your the path should look like this: GRNsight > database > expression-database > source-files > Expression 2020 > [the actual csv and xlsx files are here!]
-        3. Run the pre-processing script on the data. This will create a folder full of the processed files in `database/expression-database/script-results`.
+         2. Download the _"Expression 2020"_ folder from Box located in `GRNsight > GRNsight Expression > Expression 2020` to your newly created `source-files` folder. Your path should look like this: GRNsight > database > expression-database > source-files > Expression 2020 > [the actual csv and xlsx files are here!]
+         3. Run the pre-processing script on the data. This will create a folder full of the processed files in `database/expression-database/script-results`.
             ```
             cd <path to GRNsight/database/expression-database/scripts>
             python3 preprocessing.py
             ```
-        4. Load the processed files into your database.
+         **Note:** If you receive a UnicodeEncodeError add `-X utf8` to the beginning of the command
+      3. Getting Data for Protein-Protein Interactions
+         1. GRNsight generates Protein-Protein Interactions from SGD through YeastMine. In order to run the script that generates these Network files, you must pip3 install the dependencies used. These are the same dependencies used when creating the Network Database, so if you have completed step 2.2.1, then you should be fine. Once the dependencies have been installed, you can run
             ```
-            cd <path to GRNsight/database/expression-database/scripts>
-            python3 loader.py | psql postgresql://localhost/postgres
+            cd <path to GRNsight/database/protein-protein-database/scripts>
+            python3 generate_protein_network.py
             ```
-            This should output a bunch of COPY print statements to your terminal. Once complete your database is now loaded with the expression data.
-3. Continue setting up in the [Initial Setup Wiki page](https://github.com/dondi/GRNsight/wiki/Initial-Setup)
+            This will take a while {almost 2 hours (´◕ ᵔ ◕\`✿)_ᶜʳᶦᵉˢ_} to get all of the network data and generate all of the files. This will create a folder full of the processed files in `database/protein-protein-database/script-results`.
+         2. Once you have finished generating the loader files, you need to remove duplicate entries from the physical interactions file. The bash script (`remove_duplicates.sh`) does this for you. The resultant file (`no_dupe.csv`) will be generated in the `processed-loader-files` sub-directory of the `script-results` directory. If your machine doesn't support bash shell scripts, then you have to write your own script that removes duplicate lines from a file and writes the results to a new file. Sorry!
+            Run the following:
+            ```
+            chmod u+x remove_duplicates.sh
+            ./remove_duplicates.sh
+            ```
+   3. Loading all processed files into your local database
+      Change to the `database` folder and run `loader.py` from there. The script first collects the union of the genes from the expression, network, and protein-protein interactions data. After that, it loads all of the data from the files generated in the "Getting Data" sections above into the database.
+      ```
+      cd <path to GRNsight/database>
+      ```
+      To load to local database
+      ```
+      python3 loader.py | psql postgresql://localhost/postgres
+      ```
+      To load to production database
+      ```
+      python3 loader.py | psql <path to database>
+      ```
+      This should output a bunch of COPY print statements to your terminal. Once complete your database is now loaded with the expression, network, and protein-protein interactions data.
+## Instructions for Updating Database
+1.  Getting new data
+    1. Generate a new network from Yeastmine using the script `generate_network.py` inside `network-database` folder.
+       ```
+       cd <path to GRNsight/database/network-database/scripts>
+       python3 generate_network.py
+       ```
+    2. Generate a new Protein-Protein Interactions from SGD using Yeastmine
+       ```
+       cd <path to GRNsight/database/protein-protein-database/scripts>
+       python3 generate_protein_network.py
+       ```
+2.  Filter all the missing genes and updated genes in both Network and Protein-Protein Interactions. You also need to filter the missing proteins and updated proteins in Protein-Protein Interactions. Everything is done in `filter_update.py`. The script will access the database and get all of the genes stored within. From there it will generate a csv file of all genes that are missing from your database, and all genes that have updated their display name (standard like name). After running this script, you will see `missing-genes.csv` and `update-genes.csv` in `processed-loader-files` for both `network-database` and `protein-protein-database`, and also `missing-proteins.csv` and `update-proteins.csv` for `protein-protein-database`.
+    ```
+    cd <path to GRNsight/database/>
+    DB_URL="postgresql://[<db_user>:<password>]@<address to database>/<database name>" python3 filter_update.py
+    ```
+    Ex:
+    ```
+    DB_URL="postgresql://postgres@localhost/postgres" python3 filter_update.py
+    ```
+3.  Loading all the updates from Network or Protein-Protein Interactions to database.
+    In the command below, the --network option specifies the network source, which can be either GRN or PPI. Ensure you select the correct network type.
+    To load to local database
+    ```
+    python3 loader_update.py --network=[GRN|PPI] | psql postgresql://localhost/postgres
+    ```
+    To load to production database
+    ```
+    python3 loader_update.py --network=[GRN|PPI] | psql <path to database>
+    ```
+Continue setting up in the [Initial Setup Wiki page](https://github.com/dondi/GRNsight/wiki/Initial-Setup)

package/database/constants.py ADDED Viewed

@@ -0,0 +1,42 @@
+class Constants:
+    """Folder names, data-file paths and database schema names used by the database scripts.
+
+    Every path is a relative string built by concatenating one of the
+    ``*_FOLDER_PATH`` values with a fixed suffix.
+    NOTE(review): presumably the scripts must be run from the ``database``
+    folder for these relative paths to resolve - confirm against the callers.
+    """
+    GRN_FOLDER_PATH = 'network-database'
+    PPI_FOLDER_PATH = 'protein-protein-database'
+    EXPRESSION_FOLDER_PATH = 'expression-database'
+    # Unlike the three folders above, this value already ends with a slash.
+    UNION_GENE_FOLDER_PATH = 'union-gene-data/'
+    # Gene data source file path
+    GRN_GENE_SOURCE = GRN_FOLDER_PATH + "/script-results/processed-loader-files/gene.csv"
+    PPI_GENE_SOURCE = PPI_FOLDER_PATH + "/script-results/processed-loader-files/gene.csv"
+    EXPRESSION_GENE_SOURCE = EXPRESSION_FOLDER_PATH + "/script-results/processed-expression/genes.csv"
+    # Union gene data
+    GENE_DATA_DIRECTORY = UNION_GENE_FOLDER_PATH + 'union_genes.csv'
+    MISSING_GENE_UNION_DIRECTORY = UNION_GENE_FOLDER_PATH + 'union-missing-genes.csv'
+    UPDATE_GENE_UNION_DIRECTORY = UNION_GENE_FOLDER_PATH + 'union-update-genes.csv'
+    # Constants name: NETWORK_<table_name>_DATA_DIRECTORY
+    GRN_DATABASE_NAMESPACE = 'gene_regulatory_network'
+    GRN_SOURCE_TABLE_DATA_DIRECTORY = GRN_FOLDER_PATH + '/script-results/processed-loader-files/source.csv'
+    GRN_NETWORK_TABLE_DATA_DIRECTORY = GRN_FOLDER_PATH + '/script-results/processed-loader-files/network.csv'
+    # Protein-protein-interactions
+    PPI_DATABASE_NAMESPACE = 'protein_protein_interactions'
+    PPI_SOURCE_TABLE_DATA_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/source.csv'
+    PPI_NETWORK_TABLE_DATA_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/physical_interaction_no_dupe.csv'
+    PPI_PROTEIN_TABLE_DATA_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/protein.csv'
+    # Expression data
+    # NOTE(review): "EXPRESISON" looks like a misspelling of "EXPRESSION"; code outside
+    # this file may reference this exact name, so it is left unchanged here.
+    EXPRESISON_DATABASE_NAMESPACE = 'gene_expression'
+    EXPRESSION_REFS_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/refs.csv'
+    EXPRESSION_METADATA_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/expression-metadata.csv'
+    EXPRESSION_EXPRESSION_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/expression-data.csv'
+    EXPRESSION_PRODUCTION_RATE_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/production-rates.csv'
+    EXPRESSION_DEGRADATION_RATE_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/degradation-rates.csv'
+    # Paths for update files
+    PPI_MISSING_GENE_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/missing-genes.csv'
+    PPI_UPDATE_GENE_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/update-genes.csv'
+    PPI_MISSING_PROTEIN_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/missing-proteins.csv'
+    PPI_UPDATE_PROTEIN_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/update-proteins.csv'
+    GRN_MISSING_GENE_DIRECTORY = GRN_FOLDER_PATH + '/script-results/processed-loader-files/missing-genes.csv'
+    GRN_UPDATE_GENE_DIRECTORY = GRN_FOLDER_PATH + '/script-results/processed-loader-files/update-genes.csv'

package/database/filter_update.py ADDED Viewed

@@ -0,0 +1,168 @@
+import os
+import csv
+from sqlalchemy import create_engine
+from sqlalchemy import text
+from constants import Constants
+from utils import Utils
+# Tab-separated header rows written as the first line of the generated gene files.
+# The GRN header has one extra trailing column (Regulator) compared with the PPI one.
+PROTEIN_GENE_HEADER = f'Gene ID\tDisplay Gene ID\tSpecies\tTaxon ID'
+GRN_GENE_HEADER = f'Gene ID\tDisplay Gene ID\tSpecies\tTaxon ID\tRegulator'
+def _get_all_data_from_database_table(database_namespace, table_name):
+    """Return every row of ``<database_namespace>.<table_name>``.
+
+    The connection URL is read from the ``DB_URL`` environment variable, so a
+    missing variable raises ``KeyError``. A new engine is created on each call.
+    Both arguments are interpolated directly into the SQL text, so they must be
+    trusted identifiers (the callers in this file pass fixed names).
+    """
+    db = create_engine(os.environ['DB_URL'])
+    with db.connect() as connection:
+        result_set = connection.execute(text(f"SELECT * FROM {database_namespace}.{table_name}"))
+        return result_set.fetchall()
+def _get_all_db_genes(database_namespace):
+    """Load the ``gene`` table of ``database_namespace`` into a dict.
+
+    Each row is keyed by (column 0, column 3). The value is (column 1, column 2),
+    with column 4 appended when the row has more than four columns.
+    NOTE(review): judging by how callers unpack these tuples, the columns are
+    presumably gene ID, display gene ID, species, taxon ID and (GRN only)
+    regulator - confirm against the schema.
+    """
+    gene_records = _get_all_data_from_database_table(database_namespace, "gene")
+    genes = {}
+    for gene in gene_records:
+        key = (gene[0], gene[3])
+        # Rows with a fifth column carry it along as a third value item.
+        if len(gene) > 4:
+            value = (gene[1], gene[2], gene[4])
+        else:
+            value = (gene[1], gene[2])
+        genes[key] = value
+    return genes
+def _get_all_db_grn_genes():
+    """Return the gene dict for the gene regulatory network schema."""
+    return _get_all_db_genes(Constants.GRN_DATABASE_NAMESPACE)
+def _get_all_db_ppi_genes():
+    """Return the gene dict for the protein-protein interactions schema."""
+    return _get_all_db_genes(Constants.PPI_DATABASE_NAMESPACE)
+def _get_all_genes():
+    db_grn_genes = _get_all_db_grn_genes()
+    db_ppi_genes = _get_all_db_ppi_genes()
+    if not os.path.exists('union-gene-data'):
+        os.makedirs('union-gene-data')
+    Utils.create_union_file([Constants.PPI_GENE_SOURCE, Constants.GRN_GENE_SOURCE], Constants.GENE_DATA_DIRECTORY)
+    genes = db_grn_genes
+    for gene in db_ppi_genes:
+        if gene not in genes:
+            display_gene_id, species = db_ppi_genes[gene]
+            genes[gene] = [display_gene_id, species, False]
+    with open(Constants.GENE_DATA_DIRECTORY, 'r+', encoding="UTF-8") as f:
+        i = 0
+        reader = csv.reader(f)
+        for row in reader:
+            if i != 0:
+                row = row[0].split('\t')
+                gene_id = row[0]
+                display_gene_id = row[1]
+                species = row[2]
+                taxon_id = row[3]
+                regulator = row[4].capitalize()
+                key = (gene_id, taxon_id)
+                value = (display_gene_id, species, regulator)
+                if key not in genes:
+                    genes[key] = value
+                elif genes[key][0] != display_gene_id:
+                    if display_gene_id != "None":
+                        genes[key] = value
+            i+=1
+    return genes
+def get_all_proteins():
+    """Load the ``protein`` table of the protein-protein interactions schema into a dict.
+
+    Each row is keyed by (column 0, column 5) and maps to columns 1 through 4.
+    NOTE(review): presumably these are standard name / taxon ID for the key and
+    gene systematic name, length, molecular weight and pI for the value, matching
+    the layout used by processing_protein_file - confirm against the schema.
+    """
+    protein_records = _get_all_data_from_database_table(Constants.PPI_DATABASE_NAMESPACE, "protein")
+    proteins = {}
+    for protein in protein_records:
+        key = (protein[0], protein[5])
+        value = (protein[1], protein[2], protein[3], protein[4])
+        proteins[key] = value
+    return proteins
+def processing_grn_gene_file():
+    """Return (missing_genes, genes_to_update) for the GRN schema, with regulator values."""
+    return _processing_gene_file(_get_all_db_grn_genes(), is_protein=False)
+def processing_ppi_gene_file():
+    """Return (missing_genes, genes_to_update) for the PPI schema, without regulator values."""
+    return _processing_gene_file(_get_all_db_ppi_genes())
+def _processing_gene_file(db_genes, is_protein=True):
+    """Work out which genes are missing from, or renamed relative to, ``db_genes``.
+
+    Args:
+        db_genes: Mapping of (gene_id, taxon_id) to a tuple whose first item is the
+            display gene ID currently stored in the database.
+        is_protein: When True the returned values are (display_gene_id, species);
+            otherwise they are (display_gene_id, species, regulator).
+
+    Returns:
+        A (missing_genes, genes_to_update) pair of dicts keyed like ``db_genes``.
+    """
+    print(f'Processing gene')
+    missing_genes = {}
+    genes_to_update = {}
+    # Rebuilds the merged gene set (database queries plus union file) on every call.
+    all_genes = _get_all_genes()
+    for gene in all_genes:
+        display_gene_id, species, regulator = all_genes[gene]
+        values_for_ppi = (display_gene_id, species)
+        values_for_grn = (display_gene_id, species, regulator)
+        if gene not in db_genes:
+            if is_protein:
+                missing_genes[gene] = values_for_ppi
+            else:
+                missing_genes[gene] = values_for_grn
+        elif gene in db_genes and db_genes[gene][0] != display_gene_id:
+            # NOTE(review): the update is skipped when the *stored* display ID is the
+            # string "None"; confirm this is intended rather than testing the new value.
+            if db_genes[gene][0] != "None":
+                if is_protein:
+                    genes_to_update[gene] = values_for_ppi
+                else:
+                    genes_to_update[gene] = values_for_grn
+    return missing_genes, genes_to_update
+def processing_protein_file(file_path, db_proteins):
+    print(f'Processing file {file_path}')
+    ppi_missing_proteins = {}
+    ppi_proteins_to_update = {}
+    with open(file_path, 'r+', encoding="UTF-8") as f:
+        i = 0
+        reader = csv.reader(f)
+        for row in reader:
+            if i != 0:
+                row = row[0].split('\t')
+                standard_name = row[0]
+                gene_systematic_name = row[1]
+                length = float(row[2]) if row[2] != "None" else 0
+                molecular_weight = float(row[3]) if row[3] != "None" else 0
+                pi = float(row[4]) if row[4] != "None" else 0
+                taxon_id = row[5]
+                key = (standard_name, taxon_id)
+                value = (gene_systematic_name, length, molecular_weight, pi)
+                if key not in db_proteins:
+                    ppi_missing_proteins[key] = value
+                elif db_proteins[key] != value:
+                    ppi_proteins_to_update[key] = value
+            i+=1
+    return ppi_missing_proteins, ppi_proteins_to_update
+def create_grn_gene_file(file_path, data):
+    """Write ``data`` to ``file_path`` with the GRN header and a trailing Regulator column."""
+    _create_gene_file(file_path, GRN_GENE_HEADER, data, is_protein=False)
+def create_ppi_gene_file(file_path, data):
+    """Write ``data`` to ``file_path`` with the PPI header (no Regulator column)."""
+    _create_gene_file(file_path, PROTEIN_GENE_HEADER, data)
+def _create_gene_file(file_path, headers, data, is_protein=True):
+    print(f'Creating {file_path}\n')
+    gene_file = open(file_path, 'w')
+    gene_file.write(f'{headers}\n')
+    for gene in data:
+        if is_protein:
+            gene_file.write(f'{gene[0]}\t{data[gene][0]}\t{data[gene][1]}\t{gene[1]}\n')
+        else:
+            gene_file.write(f'{gene[0]}\t{data[gene][0]}\t{data[gene][1]}\t{gene[1]}\t{data[gene][2]}\n')
+    gene_file.close()
+def create_ppi_protein_file(file_path, data):
+    print(f'Creating {file_path}\n')
+    protein_file = open(file_path, 'w')
+    headers = f'Standard Name\tGene Systematic Name\tLength\tMolecular Weight\tPI\tTaxon ID'
+    protein_file.write(f'{headers}\n')
+    for protein in data:
+        protein_file.write(f'{protein[0]}\t{data[protein][0]}\t{data[protein][1]}\t{data[protein][2]}\t{data[protein][3]}\t{protein[1]}\n')
+    protein_file.close()
+# Processing gene files
+# NOTE: these statements sit at module level with no __main__ guard, so they run
+# (database queries and file writes included) whenever this module is executed or imported.
+ppi_missing_genes, ppi_genes_to_update = processing_ppi_gene_file()
+grn_missing_genes, grn_genes_to_update = processing_grn_gene_file()
+# Compare the generated protein file with the proteins currently in the database.
+ppi_missing_proteins, ppi_proteins_to_update = processing_protein_file(Constants.PPI_PROTEIN_TABLE_DATA_DIRECTORY, get_all_proteins())
+# Write the missing/update files for each network.
+create_grn_gene_file(Constants.GRN_MISSING_GENE_DIRECTORY, grn_missing_genes)
+create_grn_gene_file(Constants.GRN_UPDATE_GENE_DIRECTORY, grn_genes_to_update)
+create_ppi_gene_file(Constants.PPI_MISSING_GENE_DIRECTORY, ppi_missing_genes)
+create_ppi_gene_file(Constants.PPI_UPDATE_GENE_DIRECTORY, ppi_genes_to_update)
+create_ppi_protein_file(Constants.PPI_MISSING_PROTEIN_DIRECTORY, ppi_missing_proteins)
+create_ppi_protein_file(Constants.PPI_UPDATE_PROTEIN_DIRECTORY, ppi_proteins_to_update)