cazy-webscraper 2.2.8__tar.gz → 2.3.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cazy_webscraper-2.2.8/cazy_webscraper.egg-info → cazy_webscraper-2.3.0.2}/PKG-INFO +139 -29
- cazy_webscraper-2.2.8/PKG-INFO → cazy_webscraper-2.3.0.2/README.md +127 -50
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/__init__.py +35 -4
- cazy_webscraper-2.3.0.2/cazy_webscraper/cache/__init__.py +41 -0
- cazy_webscraper-2.3.0.2/cazy_webscraper/cache/cazy.py +78 -0
- cazy_webscraper-2.3.0.2/cazy_webscraper/cache/ncbi.py +127 -0
- cazy_webscraper-2.3.0.2/cazy_webscraper/cache/uniprot.py +131 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/cazy/__init__.py +53 -21
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/cazy_scraper.py +31 -4
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py +1 -79
- cazy_webscraper-2.3.0.2/cazy_webscraper/expand/uniprot/get_uniprot_data.py +697 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/ncbi/gene_names/__init__.py +1 -2
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/ncbi/taxonomy/multiple_taxa.py +92 -82
- cazy_webscraper-2.3.0.2/cazy_webscraper/sql/get_schema.py +79 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/add_data/add_cazyme_data.py +5 -2
- cazy_webscraper-2.3.0.2/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py +501 -0
- cazy_webscraper-2.3.0.2/cazy_webscraper/sql/sql_interface/delete_data/__init__.py +189 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/get_data/get_table_dicts.py +28 -3
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/get_data/get_taxonomies.py +1 -1
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_orm.py +37 -7
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/cazy_webscraper_parser.py +20 -0
- cazy_webscraper-2.3.0.2/cazy_webscraper/utilities/parsers/get_schema_parser.py +102 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/uniprot_parser.py +72 -55
- cazy_webscraper-2.2.8/README.md → cazy_webscraper-2.3.0.2/cazy_webscraper.egg-info/PKG-INFO +160 -27
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper.egg-info/SOURCES.txt +7 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper.egg-info/entry_points.txt +1 -0
- cazy_webscraper-2.3.0.2/cazy_webscraper.egg-info/requires.txt +10 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/setup.py +7 -5
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_add_cazymes.py +1 -1
- cazy_webscraper-2.3.0.2/tests/test_cazy_init.py +274 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_parsers.py +1 -1
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_sql_ad_genbank.py +1 -1
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_taxonomy.py +17 -7
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_uniprot.py +28 -19
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_webscraper.py +13 -2
- cazy_webscraper-2.2.8/cazy_webscraper/expand/uniprot/get_uniprot_data.py +0 -663
- cazy_webscraper-2.2.8/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py +0 -414
- cazy_webscraper-2.2.8/cazy_webscraper.egg-info/requires.txt +0 -9
- cazy_webscraper-2.2.8/tests/test_cazy_init.py +0 -208
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/LICENSE +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/api/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/api/cw_query_database.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/crawler/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/crawler/get_validation_data.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/extract_seqs/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/extract_seqs/extract_db_seqs.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/genbank/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/genbank/genomes/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/genbank/genomes/entrez.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/genbank/genomes/get_genome_accs.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/genbank/sequences/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/genbank/taxonomy/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/genbank/taxonomy/get_ncbi_taxs.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/gtdb/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/gtdb/get_gtdb_tax.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/pdb/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/pdb/get_pdb_structures.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/uniprot/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/ncbi/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/ncbi/genomes/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/ncbi/sequences/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/ncbi/taxonomy/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/ncbi/taxonomy/lineage.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/add_data/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/add_data/add_genbank_data.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/add_data/add_genome_data.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/add_data/add_gtdb_tax.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/add_data/add_ncbi_tax_data.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/get_data/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/get_data/get_api_data.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/get_data/get_assemblies.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/get_data/get_records.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/get_data/get_selected_gbks.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/get_data/get_selected_pdbs.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parse_configuration/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parse_configuration/cazy_class_synonym_dict.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/__init__.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/api_parser.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/extract_seq_parser.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/gbk_seq_parser.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/get_genomes_parser.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/get_gtdb_parser.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/pdb_strctre_parser.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/tax_ncbi_parser.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper.egg-info/dependency_links.txt +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper.egg-info/top_level.txt +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/setup.cfg +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_api.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_crawler.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_cw_init.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_expand.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_extract_seqs.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_get_ncbi_tax.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_gtdb.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_ncbi.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_ncbi_genomes.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_orm.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_parse_config.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_pdb.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_sql_ad_genomes.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_sql_ad_gtdb.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_sql_ad_ncbi_tax.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_sql_interf_gd_get_records.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_sql_interf_gd_get_tax.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_sql_interface.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_sql_queries.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_table_dicts.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_utilities.py +0 -0
- {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_validation_data.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
|
-
Name:
|
|
3
|
-
Version: 2.2
|
|
2
|
+
Name: cazy_webscraper
|
|
3
|
+
Version: 2.3.0.2
|
|
4
4
|
Summary: A tool to automate retrieving data from CAZy, build a local CAZyme SQL database, and thoroughly interrogate the data. Also, automate retrieving protein data, sequences, EC numbers and structure files for specific datasets in the CAZyme database from UniProt, GenBank and PDB.
|
|
5
5
|
Home-page: https://github.com/HobnobMancer/cazy_webscraper
|
|
6
6
|
Author: Emma E. M. Hobbs
|
|
@@ -20,6 +20,16 @@ Classifier: Programming Language :: Python :: 3.8
|
|
|
20
20
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
21
21
|
Description-Content-Type: text/markdown
|
|
22
22
|
License-File: LICENSE
|
|
23
|
+
Requires-Dist: biopython
|
|
24
|
+
Requires-Dist: bioservices>=1.11.0
|
|
25
|
+
Requires-Dist: mechanicalsoup
|
|
26
|
+
Requires-Dist: pandas
|
|
27
|
+
Requires-Dist: pyyaml
|
|
28
|
+
Requires-Dist: requests
|
|
29
|
+
Requires-Dist: saintBioutils>=0.0.25
|
|
30
|
+
Requires-Dist: sqlalchemy==1.4.20
|
|
31
|
+
Requires-Dist: tqdm
|
|
32
|
+
Requires-Dist: html5lib
|
|
23
33
|
|
|
24
34
|
# cazy_webscraper
|
|
25
35
|
|
|
@@ -33,16 +43,19 @@ License-File: LICENSE
|
|
|
33
43
|
[](https://anaconda.org/bioconda/cazy_webscraper)
|
|
34
44
|
[](https://anaconda.org/bioconda/cazy_webscraper)
|
|
35
45
|
[](https://pypi.python.org/pypi/cazy_webscraper)
|
|
36
|
-
[](https://pepy.tech/project/cazy-webscraper)
|
|
46
|
+
[](https://pepy.tech/project/cazy-webscraper)
|
|
47
|
+
[](https://github.com/HobnobMancer/cazy_webscraper/actions/workflows/main.yml)
|
|
37
48
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
> `cazy_webscraper` version 1 is depracted. Please ensure you are using version 2 or newer.
|
|
41
|
-
> `bioconda` installation is fixed for >= v2.1.3.1
|
|
49
|
+
--------------------------------
|
|
42
50
|
|
|
43
51
|
## cazy_webscraper
|
|
44
52
|
|
|
45
53
|
`cazy_webscraper` is an application and Python3 package for the automated retrieval of protein data from the [CAZy](http://www.cazy.org/) database. The code is distributed under the MIT license.
|
|
54
|
+
The full documentation can be found at [Read the Docs](https://cazy-webscraper.readthedocs.io/en/latest/?badge=latest).
|
|
55
|
+
|
|
56
|
+
For full details see our publication in [Microbial Genomics](https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.001086), which includes example analyses.
|
|
57
|
+
|
|
58
|
+
> Hobbs, E. E. M, Gloster, T. M., Pritchard, L. (2023) cazy_webscraper: local compilation and interrogation of comprehensive CAZyme datasets, _Microbial Genomics_, 9(8). [https://doi.org/10.1099/mgen.0.001086](https://doi.org/10.1099/mgen.0.001086)
|
|
46
59
|
|
|
47
60
|
**`cazy_webscraper` retrieves protein data from the [CAZy database](https://www.cazy.org) and stores the data in a local SQLite3 database.** This enables users to integrate the dataset into analytical pipelines, and interrogate the data in a manner unachievable through the CAZy website.
|
|
48
61
|
|
|
@@ -55,12 +68,13 @@ License-File: LICENSE
|
|
|
55
68
|
- Latest taxonomic classification - including complete lineage (including phylum, class, order and family) (version >=2.1.2)
|
|
56
69
|
- Latest genomic assembly data (GenBank and RefSeq (when available) version accession and ID numbers) (version >=2.1.3)
|
|
57
70
|
|
|
58
|
-
**[UniProt](https://www.uniprot.org/):**
|
|
71
|
+
**[UniProt](https://www.uniprot.org/):**
|
|
72
|
+
- UniProt ID/accession
|
|
59
73
|
- Protein name
|
|
60
|
-
- UniProt accession
|
|
61
74
|
- EC numbers
|
|
62
75
|
- PDB accessions
|
|
63
|
-
- Protein
|
|
76
|
+
- Protein sequence (and date sequence was last updated)
|
|
77
|
+
- Taxonomic classification (genus and species)
|
|
64
78
|
|
|
65
79
|
**[Research Collaboratory for Structural Bioinformatics (RCSB) Protein Data Bank (PDB)](https://www.rcsb.org/):**
|
|
66
80
|
- Protein structure files
|
|
@@ -80,12 +94,56 @@ Protein sequences (retrieved from GenBank and/or UniProt) from the local CAZyme
|
|
|
80
94
|
|
|
81
95
|
Please see the [full documentation at ReadTheDocs](https://cazy-webscraper.readthedocs.io/en/latest/?badge=latest).
|
|
82
96
|
|
|
97
|
+
## Updates
|
|
98
|
+
|
|
99
|
+
**New in version 2.3.0**
|
|
100
|
+
* Downloading protein data from UniProt is several orders of magnitude faster than before - and should have fewer issues with using older versions of `bioservices`
|
|
101
|
+
- Uses `bioservices` mapping to map directly from NCBI protein version accession to UniProt
|
|
102
|
+
- `cw_get_uniprot_data` no longer calls NCBI and thus no longer requires an email address as a positional argument
|
|
103
|
+
* Updated database schema: Changed `Genbanks 1--* Uniprots` to `Genbanks *--1 Uniprots`. `Uniprots.uniprot_id` is now listed in the `Genbanks` table, instead of listing `Genbanks.genbank_id` in the `Uniprots` table
|
|
104
|
+
|
|
105
|
+
* Retrieve taxonomic classifications from UniProt
|
|
106
|
+
* Use the `--taxonomy`/`-t` flag to retrieve the scientific name (genus and species) for proteins of interest
|
|
107
|
+
* Adds downloaded taxonomic information to the `UniprotsTaxs` table
|
|
108
|
+
|
|
109
|
+
* Improved clarification of deleting old records when using `cw_get_uniprot_data`
|
|
110
|
+
- Separate arguments to delete Genbanks-EC number and Genbanks-PDB accession relationships that are no longer listed in UniProt, for those proteins in the local CAZyme database for which data is downloaded from UniProt
|
|
111
|
+
- New args:
|
|
112
|
+
- `--delete_old_ec_relationships` = deletes Genbank(protein)-EC number relationships no longer in UniProt
|
|
113
|
+
- `--delete_old_ecs` = deletes EC numbers in the local db not linked to any proteins
|
|
114
|
+
- `--delete_old_pdb_relationships` = deletes Genbank(protein)-PDB relationships no longer in UniProt
|
|
115
|
+
- `--delete_old_pdbs` = deletes PDB accessions in the local db not linked to any proteins
|
|
116
|
+
|
|
117
|
+
* Retrieve the local db schema
|
|
118
|
+
- New command `cw_get_db_schema` added.
|
|
119
|
+
- Retrieves the SQLite schema of a local CAZyme database and prints it to the terminal
|
|
120
|
+
|
|
121
|
+
* Added option to skip retrieving the latest taxonomic classifications from NCBI
|
|
122
|
+
- By default, when retrieving data from CAZy, `cazy_webscraper` retrieves the latest taxonomic classifications for proteins listed under multiple taxa
|
|
123
|
+
- To reduce scraping time, and to reduce the burden on the NCBI-Entrez server, if this data is not needed (e.g. GTDB taxs will be used) this step can be skipped by using the new `--skip_ncbi_tax` flag.
|
|
124
|
+
- When skipping retrieval of the latest taxa classifications from NCBI, `cazy_webscraper` will add the first taxa retrieved from CAZy for those proteins listed under multiple taxa
|
|
125
|
+
|
|
126
|
+
|
|
83
127
|
## Documentation
|
|
84
128
|
|
|
129
|
+
The full documentation can be found at [Read the Docs](https://cazy-webscraper.readthedocs.io/en/latest/?badge=latest).
|
|
130
|
+
|
|
131
|
+
### Our paper: Implementation and demonstration of use
|
|
132
|
+
|
|
85
133
|
For a full description of the operation and examples of use, please see our paper in [BioRxiv](https://www.biorxiv.org/content/10.1101/2022.12.02.518825v1.full).
|
|
86
134
|
|
|
87
135
|
> Hobbs, E. E. M., Gloster, T. M., and Pritchard, L. (2022) 'cazy_webscraper: local compilation and interrogation of comprehensive CAZyme datasets', _bioRxiv_, [https://doi.org/10.1101/2022.12.02.518825](https://www.biorxiv.org/content/10.1101/2022.12.02.518825v1.full)
|
|
88
136
|
|
|
137
|
+
### Database structure
|
|
138
|
+
|
|
139
|
+
You can view the database schema [here](#database-schema) and find a PDF of the database schema [here](https://hobnobmancer.github.io/cazy_webscraper/database_schema.pdf).
|
|
140
|
+
|
|
141
|
+
## Contributions
|
|
142
|
+
|
|
143
|
+
We welcome contributions and suggestions. You can raise issues at this repository, or fork the repository and submit pull requests, at the links below:
|
|
144
|
+
|
|
145
|
+
- [Issues](https://github.com/HobnobMancer/cazy_webscraper/issues)
|
|
146
|
+
- [Pull Requests](https://github.com/HobnobMancer/cazy_webscraper/pulls)
|
|
89
147
|
|
|
90
148
|
## Table of Contents
|
|
91
149
|
<!-- TOC -->
|
|
@@ -118,6 +176,7 @@ For a full description of the operation and examples of use, please see our pape
|
|
|
118
176
|
- [CAZy coverage of GenBank](#cazy-coverage-of-genbank)
|
|
119
177
|
- [Configure calculating CAZy coverage of GenBank](#configure-calculating-cazy-coverage-of-genbank)
|
|
120
178
|
- [Integrating a local CAZyme database](#integrating-a-local-cazyme-database)
|
|
179
|
+
- [Database schema](#database-schema)
|
|
121
180
|
- [Contributions](#contributions)
|
|
122
181
|
- [License and copyright](#license-and-copyright)
|
|
123
182
|
<!-- /TOC -->
|
|
@@ -126,15 +185,16 @@ For a full description of the operation and examples of use, please see our pape
|
|
|
126
185
|
## Features in the pipeline:
|
|
127
186
|
- Retrieve and store PubMed IDs in the local CAZyme database
|
|
128
187
|
- Fix any remaining bugs we can find (if you find a bug, please report it and provide as detailed a bug report as possible!)
|
|
129
|
-
-
|
|
130
|
-
- Update the documentation
|
|
188
|
+
- Increase unit test coverage
|
|
131
189
|
- Automate analysing the taxonomic distribution across CAZy and datasets of interest, including generating a final report
|
|
132
190
|
|
|
133
191
|
## Citation
|
|
134
192
|
|
|
135
193
|
If you use `cazy_webscraper`, please cite the following publication:
|
|
136
194
|
|
|
137
|
-
> Hobbs, E. E. M
|
|
195
|
+
> Hobbs, E. E. M, Gloster, T. M., Pritchard, L. (2023) cazy_webscraper: local compilation and interrogation of comprehensive CAZyme datasets, _Microbial Genomics_, 9(8). [https://doi.org/10.1099/mgen.0.001086](https://doi.org/10.1099/mgen.0.001086)
|
|
196
|
+
|
|
197
|
+
The supplementary information for this manuscript is available via the BioRxiv server, and in the `manuscript` directory in this repository.
|
|
138
198
|
|
|
139
199
|
cazy_webscraper depends on a number of tools. To recognise the contributions that the
|
|
140
200
|
authors and developers have made, please also cite the following:
|
|
@@ -194,13 +254,21 @@ cazy_webscraper <user_email>
|
|
|
194
254
|
To retrieve the version, use the following command:
|
|
195
255
|
|
|
196
256
|
```bash
|
|
197
|
-
cazy_webscraper
|
|
257
|
+
cazy_webscraper -V
|
|
258
|
+
```
|
|
259
|
+
or
|
|
260
|
+
```bash
|
|
261
|
+
cazy_webscraper --version
|
|
198
262
|
```
|
|
199
263
|
|
|
200
264
|
To retrieve the citation to use:
|
|
201
265
|
|
|
202
266
|
```bash
|
|
203
|
-
cazy_webscraper
|
|
267
|
+
cazy_webscraper -C
|
|
268
|
+
```
|
|
269
|
+
or
|
|
270
|
+
```bash
|
|
271
|
+
cazy_webscraper --citation
|
|
204
272
|
```
|
|
205
273
|
|
|
206
274
|
### Command summary
|
|
@@ -240,11 +308,27 @@ To protein structure files from PDB use the `cw_get_pdb_structures` command.
|
|
|
240
308
|
|
|
241
309
|
To interrogate the database, use the `cw_query_database` command.
|
|
242
310
|
|
|
311
|
+
### Local CAZyme database schema
|
|
312
|
+
|
|
313
|
+
The schema of a local CAZyme database can be retrieved using `cazy_webscraper`:
|
|
314
|
+
|
|
315
|
+
```bash
|
|
316
|
+
cw_get_db_schema <path to local CAZyme database>
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
Alternatively, `sqlite3` can be used to retrieve the schema:
|
|
320
|
+
```bash
|
|
321
|
+
sqlite3 <path to local CAZyme database> .schema
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
A visual representation of the db schema when using `cazy_webscraper` version >= 2.3.0 can be found [here](https://hobnobmancer.github.io/cazy_webscraper/database_schema.pdf).
|
|
325
|
+
|
|
243
326
|
## Creating a local CAZyme database
|
|
244
327
|
Command line options for `cazy_webscraper`, which is used to scrape CAZy and compile a local SQLite database.
|
|
245
328
|
Options are written in alphabetical order.
|
|
246
329
|
|
|
247
330
|
`email` - \[REQUIRED\] User email address. This is required by NCBI Entrez for querying the Entrez server.
|
|
331
|
+
**Email address is not required when printing out the citation and version number information**
|
|
248
332
|
|
|
249
333
|
`--cache_dir` - Path to cache dir to be used instead of default cache dir path.
|
|
250
334
|
|
|
@@ -282,12 +366,16 @@ _If `--db_output` **and** `--database` are **not** called, `cazy_webscraper` wri
|
|
|
282
366
|
|
|
283
367
|
__When the `--db_output` flag is used, `cazy_webscraper` will create any necessary parent directories. If the direct/immediate parent directory of the database exists, by default `cazy_webscraper` will delete the content in this parent directory._
|
|
284
368
|
|
|
369
|
+
`--ncbi_batch_size` - The number of protein IDs submitted per batch to NCBI, when retrieving taxonomic classifications. Default 200.
|
|
370
|
+
|
|
285
371
|
`--nodelete_cache` - When called, content in the existing cache dir will **not** be deleted. Default: False (existing content is deleted).
|
|
286
372
|
|
|
287
373
|
`--nodelete_log` - When called, content in the existing log dir will **not** be deleted. Default: False (existing content is deleted).
|
|
288
374
|
|
|
289
375
|
`--retries`, `-r` - Define the number of times to retry making a connection to CAZy if the connection should fail. Default: 10.
|
|
290
376
|
|
|
377
|
+
`--skip_ncbi_tax` - Skip retrieving the latest taxonomic information from NCBI where multiple taxonomic classifications are retrieved from CAZy for a protein. When called, the first taxonomy retrieved from CAZy for each such protein will be added to the local CAZyme database. Default: False (the latest taxonomic classification is retrieved from NCBI).
|
|
378
|
+
|
|
291
379
|
`--sql_echo` - Set SQLite engine echo parameter to True, causing SQLite to print log messages. Default: False.
|
|
292
380
|
|
|
293
381
|
`--subfamilies`, `-s` - Enable retrieval of CAZy subfamilies, otherwise **only** CAZy family annotations will be retrieved. Default: False.
|
|
@@ -341,7 +429,7 @@ Data can be retrieived for all proteins in the local CAZyme database, or a speci
|
|
|
341
429
|
|
|
342
430
|
To retrieve all UniProt data for all proteins in a local CAZyme database, use the following command:
|
|
343
431
|
```bash
|
|
344
|
-
cw_get_uniprot_data <path_to_local_CAZyme_db>
|
|
432
|
+
cw_get_uniprot_data <path_to_local_CAZyme_db> --ec --pdb --sequence
|
|
345
433
|
```
|
|
346
434
|
|
|
347
435
|
### Configuring UniProt data retrieval
|
|
@@ -350,11 +438,7 @@ Below are listed the command-line flags for configuring the retrieval of UniProt
|
|
|
350
438
|
|
|
351
439
|
`database` - \[REQUIRED\] Path to a local CAZyme database to add UniProt data to.
|
|
352
440
|
|
|
353
|
-
`
|
|
354
|
-
|
|
355
|
-
`--ncbi_batch_size` - Size of batch query posted to NCBI Entrez. Default 150.
|
|
356
|
-
|
|
357
|
-
`--uniprot_batch_size` - Change the query batch size submitted via [`bioservices`](https://bioservices.readthedocs.io/en/master/) to UniProt to retrieve protein data. Default is 150. `bioservices` recommands queries not larger than 200 objects.
|
|
441
|
+
`--bioservices_batch_size` - Change the query batch size submitted via [`bioservices`](https://bioservices.readthedocs.io/en/master/) to UniProt to retrieve protein data. Default is 1000. See the [UniProt REST API documentation](https://www.uniprot.org/help/id_mapping) for batch size limits.
|
|
358
442
|
|
|
359
443
|
`--cache_dir` - Path to cache dir to be used instead of default cache dir path.
|
|
360
444
|
|
|
@@ -364,10 +448,6 @@ Below are listed the command-line flags for configuring the retrieval of UniProt
|
|
|
364
448
|
|
|
365
449
|
`--config`, `-c` - Path to a configuration YAML file. Default: None.
|
|
366
450
|
|
|
367
|
-
`--delete_old_ec` - Boolean, delete EC number - Protein relationships that are no longer listed in UniProt, i.e. an EC number annotation is no longer included in UniProt but is in the local database. If set to TRUE these relationships will be DELETED from the database.
|
|
368
|
-
|
|
369
|
-
`--delete_old_pdbs` - Boolean, delete PDB accessions - Protein relationships that are no longer listed in UniProt, i.e. an PDB accessions that are no longer included in UniProt but is in the local database. If set to TRUE these relationships will be DELETED from the database.
|
|
370
|
-
|
|
371
451
|
`--ec`, `-e` - Enable retrieval of EC number annotations from UniProt
|
|
372
452
|
|
|
373
453
|
`--ec_filter` - Limit retrieval of protein data to proteins annotated with a provided list of EC numbers. Separate the EC numbers by single commas without spaces. It is recommended to wrap the entire str in quotation marks, for example:
|
|
@@ -397,21 +477,33 @@ cw_get_uniprot_data my_cazyme_db/cazyme_db.db --ec_filter 'EC1.2.3.4,EC2.3.1.-'
|
|
|
397
477
|
|
|
398
478
|
`--retries`, `-r` - Define the number of times to retry making a connection to CAZy if the connection should fail. Default: 10.
|
|
399
479
|
|
|
480
|
+
`--delete_old_ec_relationships` - Boolean, delete old Genbanks-EC number relationships - For those proteins in the local db for which data is downloaded from UniProt, compare the current links between the proteins in the Genbanks table and EC numbers in the Ecs table. Delete Genbanks-Ecs relationships that are no longer listed in the respective protein records in UniProt.
|
|
481
|
+
|
|
482
|
+
`--delete_old_ecs` - Boolean, delete EC numbers - Delete EC numbers that are not linked to any proteins listed in the Genbanks table. These can arise from multiple retrievals of data from UniProt over a period of time during which UniProt records have been updated.
|
|
483
|
+
|
|
484
|
+
`--delete_old_pdb_relationships` - Boolean, delete old Genbanks-PDB relationships - For those proteins in the local db for which data is downloaded from UniProt, compare the current links between the proteins in the Genbanks table and PDB accessions in the Pdbs table. Delete Genbanks-Pdbs relationships that are no longer listed in the respective protein records in UniProt.
|
|
485
|
+
|
|
486
|
+
`--delete_old_pdbs` - Boolean, delete PDB accessions - Delete PDB accessions in the local database that are not linked to any proteins listed in the Genbanks table, e.g. PDB accessions that are no longer included in UniProt but are still in the local database. If set to TRUE these PDB accessions will be DELETED from the database.
|
|
487
|
+
|
|
400
488
|
`--use_uniprot_cache` - Path to a JSON file, keyed by UniProt accessions/IDs and valued by dicts containing `{'gbk_acc': str, 'db_id': int}`. This file is part of the cache created by `cw_get_uniprot_data`. This option skips retrieving the UniProt IDs for a set of GenBank accessions when retrieving data for the same dataset (this saves a lot of time!)
|
|
401
489
|
|
|
402
490
|
`skip_download` - Bool, default False. If set to True, only uses data from UniProt cache and will not download new data from UniProt.
|
|
403
491
|
|
|
404
492
|
`--sequence`, `-s` - Retrieve protein amino acid sequences from UniProt
|
|
405
493
|
|
|
406
|
-
`--seq_update` - If a newer version of the protein sequence is available, overwrite the existing sequence for the protein in the database. Default is false, the protein sequence is **not** overwritten and updated.
|
|
407
|
-
|
|
408
494
|
`--sql_echo` - Set SQLite engine echo parameter to True, causing SQLite to print log messages. Default: False.
|
|
409
495
|
|
|
410
496
|
`--species` - List of species (written as Genus Species) to restrict the scraping of CAZymes to. CAZymes will be retrieved for **all** strains of each given species.
|
|
411
497
|
|
|
412
498
|
`--strains` - List of specific species strains to restrict the scraping of CAZymes to.
|
|
413
499
|
|
|
414
|
-
`--
|
|
500
|
+
`--taxonomy`, `-t` - Retrieve taxonomic classifications (genus species) and add to the local CAZyme database
|
|
501
|
+
|
|
502
|
+
`--timeout` - Connection timeout limit (seconds). Default: 45.
|
|
503
|
+
|
|
504
|
+
`--update_name` - If a newer version of the protein name is available, overwrite the existing name for the protein in the database. Default is false, the protein name is **not** overwritten and updated.
|
|
505
|
+
|
|
506
|
+
`--update_seq` - If a newer version of the protein sequence is available, overwrite the existing sequence for the protein in the database. Default is false, the protein sequence is **not** overwritten and updated.
|
|
415
507
|
|
|
416
508
|
`--use_uniprot_cache` - Path to JSON file containing data previously retrieved from UniProt by `cazy_webscraper`, use if an error occurred while adding the data to the local CAZyme database. This will skip the retrieval of data from UniProt, and the cached data will be added to the local CAZyme database. This can also be shared with others to add the same data to their local CAZyme database.
|
|
417
509
|
|
|
@@ -419,6 +511,18 @@ cw_get_uniprot_data my_cazyme_db/cazyme_db.db --ec_filter 'EC1.2.3.4,EC2.3.1.-'
|
|
|
419
511
|
|
|
420
512
|
`--verbose`, `-v` - Enable verbose logging. This does not set the SQLite engine `echo` parameter to True. Default: False.
|
|
421
513
|
|
|
514
|
+
**UniProt batch sizes:**
|
|
515
|
+
Note that according to UniProt (June 2022), there are various limits on ID Mapping Job Submission:
|
|
516
|
+
|
|
517
|
+
========= =====================================================================================
|
|
518
|
+
Limit Details
|
|
519
|
+
========= =====================================================================================
|
|
520
|
+
100,000 Total number of ids allowed in comma separated param ids in /idmapping/run api
|
|
521
|
+
500,000 Total number of "mapped to" ids allowed
|
|
522
|
+
100,000 Total number of "mapped to" ids allowed to be enriched by UniProt data
|
|
523
|
+
10,000 Total number of "mapped to" ids allowed with filtering
|
|
524
|
+
========= =====================================================================================
|
|
525
|
+
|
|
422
526
|
### UniProt data retrieval cache
|
|
423
527
|
|
|
424
528
|
- The tables retrieved from UniProt are converted to dataframes, and written out as CSV files in the cache directory
|
|
@@ -1024,7 +1128,7 @@ When listing EC numbers, the 'EC' prefix can be included or excluded. For exampl
|
|
|
1024
1128
|
|
|
1025
1129
|
`cazy_webscraper` performs a direct EC number comparison. Therefore, supplying `cazy_webscraper` with the EC number EC1.2.3.- will only retrieve proteins specifically annotated with EC1.2.3.-. `cazy_webscraper` will **not** retrieve proteins with all completed EC numbers under EC1.2.3.-, thus, `cazy_webscraper` will **not** retrieve data for proteins annotated with EC1.2.3.1, EC1.2.3.2, EC1.2.3.3, etc.
|
|
1026
1130
|
|
|
1027
|
-
Example configuration files, and an empty configuraiton file template are located in the
|
|
1131
|
+
Example configuration files, and an empty configuration file template, are located in the `configuration_files/` directory of this repo.
|
|
1028
1132
|
|
|
1029
1133
|
|
|
1030
1134
|
## Integrating a local CAZyme database
|
|
@@ -1045,6 +1149,12 @@ Import the function into the `Python` script using:
|
|
|
1045
1149
|
from cazy_webscraper.sql.sql_orm import get_db_connection
|
|
1046
1150
|
```
|
|
1047
1151
|
|
|
1152
|
+
## Database Schema
|
|
1153
|
+
|
|
1154
|
+
This is the structure of the local SQLite3 database compiled by `cazy_webscraper` version >=2.3.0:
|
|
1155
|
+
|
|
1156
|
+

|
|
1157
|
+
|
|
1048
1158
|
|
|
1049
1159
|
## Contributions
|
|
1050
1160
|
|