cazy-webscraper 2.2.8__tar.gz → 2.3.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. {cazy_webscraper-2.2.8/cazy_webscraper.egg-info → cazy_webscraper-2.3.0.2}/PKG-INFO +139 -29
  2. cazy_webscraper-2.2.8/PKG-INFO → cazy_webscraper-2.3.0.2/README.md +127 -50
  3. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/__init__.py +35 -4
  4. cazy_webscraper-2.3.0.2/cazy_webscraper/cache/__init__.py +41 -0
  5. cazy_webscraper-2.3.0.2/cazy_webscraper/cache/cazy.py +78 -0
  6. cazy_webscraper-2.3.0.2/cazy_webscraper/cache/ncbi.py +127 -0
  7. cazy_webscraper-2.3.0.2/cazy_webscraper/cache/uniprot.py +131 -0
  8. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/cazy/__init__.py +53 -21
  9. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/cazy_scraper.py +31 -4
  10. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py +1 -79
  11. cazy_webscraper-2.3.0.2/cazy_webscraper/expand/uniprot/get_uniprot_data.py +697 -0
  12. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/ncbi/gene_names/__init__.py +1 -2
  13. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/ncbi/taxonomy/multiple_taxa.py +92 -82
  14. cazy_webscraper-2.3.0.2/cazy_webscraper/sql/get_schema.py +79 -0
  15. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/add_data/add_cazyme_data.py +5 -2
  16. cazy_webscraper-2.3.0.2/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py +501 -0
  17. cazy_webscraper-2.3.0.2/cazy_webscraper/sql/sql_interface/delete_data/__init__.py +189 -0
  18. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/get_data/get_table_dicts.py +28 -3
  19. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/get_data/get_taxonomies.py +1 -1
  20. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_orm.py +37 -7
  21. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/cazy_webscraper_parser.py +20 -0
  22. cazy_webscraper-2.3.0.2/cazy_webscraper/utilities/parsers/get_schema_parser.py +102 -0
  23. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/uniprot_parser.py +72 -55
  24. cazy_webscraper-2.2.8/README.md → cazy_webscraper-2.3.0.2/cazy_webscraper.egg-info/PKG-INFO +160 -27
  25. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper.egg-info/SOURCES.txt +7 -0
  26. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper.egg-info/entry_points.txt +1 -0
  27. cazy_webscraper-2.3.0.2/cazy_webscraper.egg-info/requires.txt +10 -0
  28. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/setup.py +7 -5
  29. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_add_cazymes.py +1 -1
  30. cazy_webscraper-2.3.0.2/tests/test_cazy_init.py +274 -0
  31. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_parsers.py +1 -1
  32. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_sql_ad_genbank.py +1 -1
  33. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_taxonomy.py +17 -7
  34. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_uniprot.py +28 -19
  35. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_webscraper.py +13 -2
  36. cazy_webscraper-2.2.8/cazy_webscraper/expand/uniprot/get_uniprot_data.py +0 -663
  37. cazy_webscraper-2.2.8/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py +0 -414
  38. cazy_webscraper-2.2.8/cazy_webscraper.egg-info/requires.txt +0 -9
  39. cazy_webscraper-2.2.8/tests/test_cazy_init.py +0 -208
  40. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/LICENSE +0 -0
  41. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/api/__init__.py +0 -0
  42. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/api/cw_query_database.py +0 -0
  43. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/crawler/__init__.py +0 -0
  44. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/crawler/get_validation_data.py +0 -0
  45. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/__init__.py +0 -0
  46. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/extract_seqs/__init__.py +0 -0
  47. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/extract_seqs/extract_db_seqs.py +0 -0
  48. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/genbank/__init__.py +0 -0
  49. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/genbank/genomes/__init__.py +0 -0
  50. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/genbank/genomes/entrez.py +0 -0
  51. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/genbank/genomes/get_genome_accs.py +0 -0
  52. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/genbank/sequences/__init__.py +0 -0
  53. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/genbank/taxonomy/__init__.py +0 -0
  54. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/genbank/taxonomy/get_ncbi_taxs.py +0 -0
  55. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/gtdb/__init__.py +0 -0
  56. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/gtdb/get_gtdb_tax.py +0 -0
  57. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/pdb/__init__.py +0 -0
  58. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/pdb/get_pdb_structures.py +0 -0
  59. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/expand/uniprot/__init__.py +0 -0
  60. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/ncbi/__init__.py +0 -0
  61. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/ncbi/genomes/__init__.py +0 -0
  62. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/ncbi/sequences/__init__.py +0 -0
  63. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/ncbi/taxonomy/__init__.py +0 -0
  64. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/ncbi/taxonomy/lineage.py +0 -0
  65. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/__init__.py +0 -0
  66. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/__init__.py +0 -0
  67. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/add_data/__init__.py +0 -0
  68. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/add_data/add_genbank_data.py +0 -0
  69. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/add_data/add_genome_data.py +0 -0
  70. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/add_data/add_gtdb_tax.py +0 -0
  71. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/add_data/add_ncbi_tax_data.py +0 -0
  72. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/get_data/__init__.py +0 -0
  73. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/get_data/get_api_data.py +0 -0
  74. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/get_data/get_assemblies.py +0 -0
  75. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/get_data/get_records.py +0 -0
  76. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/get_data/get_selected_gbks.py +0 -0
  77. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/sql/sql_interface/get_data/get_selected_pdbs.py +0 -0
  78. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/__init__.py +0 -0
  79. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parse_configuration/__init__.py +0 -0
  80. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parse_configuration/cazy_class_synonym_dict.py +0 -0
  81. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/__init__.py +0 -0
  82. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/api_parser.py +0 -0
  83. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/extract_seq_parser.py +0 -0
  84. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/gbk_seq_parser.py +0 -0
  85. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/get_genomes_parser.py +0 -0
  86. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/get_gtdb_parser.py +0 -0
  87. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/pdb_strctre_parser.py +0 -0
  88. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper/utilities/parsers/tax_ncbi_parser.py +0 -0
  89. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper.egg-info/dependency_links.txt +0 -0
  90. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/cazy_webscraper.egg-info/top_level.txt +0 -0
  91. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/setup.cfg +0 -0
  92. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_api.py +0 -0
  93. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_crawler.py +0 -0
  94. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_cw_init.py +0 -0
  95. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_expand.py +0 -0
  96. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_extract_seqs.py +0 -0
  97. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_get_ncbi_tax.py +0 -0
  98. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_gtdb.py +0 -0
  99. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_ncbi.py +0 -0
  100. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_ncbi_genomes.py +0 -0
  101. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_orm.py +0 -0
  102. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_parse_config.py +0 -0
  103. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_pdb.py +0 -0
  104. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_sql_ad_genomes.py +0 -0
  105. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_sql_ad_gtdb.py +0 -0
  106. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_sql_ad_ncbi_tax.py +0 -0
  107. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_sql_interf_gd_get_records.py +0 -0
  108. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_sql_interf_gd_get_tax.py +0 -0
  109. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_sql_interface.py +0 -0
  110. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_sql_queries.py +0 -0
  111. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_table_dicts.py +0 -0
  112. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_utilities.py +0 -0
  113. {cazy_webscraper-2.2.8 → cazy_webscraper-2.3.0.2}/tests/test_validation_data.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
- Name: cazy-webscraper
3
- Version: 2.2.8
2
+ Name: cazy_webscraper
3
+ Version: 2.3.0.2
4
4
  Summary: A tool to automate retrieving data from CAZy, building a local CAZyme SQL database, and thoroughly interrogating the data. Also, automate retrieving protein data, sequences, EC numbers and structure files for specific datasets in the CAZyme database from UniProt, GenBank and PDB.
5
5
  Home-page: https://github.com/HobnobMancer/cazy_webscraper
6
6
  Author: Emma E. M. Hobbs
@@ -20,6 +20,16 @@ Classifier: Programming Language :: Python :: 3.8
20
20
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
21
21
  Description-Content-Type: text/markdown
22
22
  License-File: LICENSE
23
+ Requires-Dist: biopython
24
+ Requires-Dist: bioservices>=1.11.0
25
+ Requires-Dist: mechanicalsoup
26
+ Requires-Dist: pandas
27
+ Requires-Dist: pyyaml
28
+ Requires-Dist: requests
29
+ Requires-Dist: saintBioutils>=0.0.25
30
+ Requires-Dist: sqlalchemy==1.4.20
31
+ Requires-Dist: tqdm
32
+ Requires-Dist: html5lib
23
33
 
24
34
  # cazy_webscraper
25
35
 
@@ -33,16 +43,19 @@ License-File: LICENSE
33
43
  [![Anaconda-Server Badge](https://anaconda.org/bioconda/cazy_webscraper/badges/version.svg)](https://anaconda.org/bioconda/cazy_webscraper)
34
44
  [![Anaconda-Update Badge](https://anaconda.org/bioconda/cazy_webscraper/badges/latest_release_date.svg)](https://anaconda.org/bioconda/cazy_webscraper)
35
45
  [![pyani PyPi version](https://img.shields.io/pypi/v/cazy_webscraper "PyPI version")](https://pypi.python.org/pypi/cazy_webscraper)
36
- [![Downloads](https://pepy.tech/badge/cazy-webscraper)](https://pepy.tech/project/cazy-webscraper)
46
+ [![Downloads](https://static.pepy.tech/badge/cazy-webscraper)](https://pepy.tech/project/cazy-webscraper)
47
+ [![CITATION.cff](https://github.com/HobnobMancer/cazy_webscraper/actions/workflows/main.yml/badge.svg)](https://github.com/HobnobMancer/cazy_webscraper/actions/workflows/main.yml)
37
48
 
38
- -------------------------------
39
-
40
- > `cazy_webscraper` version 1 is depracted. Please ensure you are using version 2 or newer.
41
- > `bioconda` installation is fixed for >= v2.1.3.1
49
+ --------------------------------
42
50
 
43
51
  ## cazy_webscraper
44
52
 
45
53
  `cazy_webscraper` is an application and Python3 package for the automated retrieval of protein data from the [CAZy](http://www.cazy.org/) database. The code is distributed under the MIT license.
54
+ The full documentation can be found at [Read the Docs](https://cazy-webscraper.readthedocs.io/en/latest/?badge=latest).
55
+
56
+ For full details see our publication in [Microbial Genomics](https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.001086), which includes example analyses.
57
+
58
+ > Hobbs, E. E. M, Gloster, T. M., Pritchard, L. (2023) cazy_webscraper: local compilation and interrogation of comprehensive CAZyme datasets, _Microbial Genomics_, 9(8). [https://doi.org/10.1099/mgen.0.001086](https://doi.org/10.1099/mgen.0.001086)
46
59
 
47
60
  **`cazy_webscraper` retrieves protein data from the [CAZy database](https://www.cazy.org) and stores the data in a local SQLite3 database.** This enables users to integrate the dataset into analytical pipelines, and interrogate the data in a manner unachievable through the CAZy website.
48
61
 
@@ -55,12 +68,13 @@ License-File: LICENSE
55
68
  - Latest taxonomic classification - including complete lineage (including phylum, class, order and family) (version >=2.1.2)
56
69
  - Latest genomic assembly data (GenBank and RefSeq (when available) version accession and ID numbers) (version >=2.1.3)
57
70
 
58
- **[UniProt](https://www.uniprot.org/):**
71
+ **[UniProt](https://www.uniprot.org/):**
72
+ - UniProt ID/accession
59
73
  - Protein name
60
- - UniProt accession
61
74
  - EC numbers
62
75
  - PDB accessions
63
- - Protein sequences
76
+ - Protein sequence (and date sequence was last updated)
77
+ - Taxonomic classification (genus and species)
64
78
 
65
79
  **[Research Collaboratory for Structural Bioinformatics (RCSB) Protein Data Bank (PDB)](https://www.rcsb.org/):**
66
80
  - Protein structure files
@@ -80,12 +94,56 @@ Protein sequences (retrieved from GenBank and/or UniProt) from the local CAZyme
80
94
 
81
95
  Please see the [full documentation at ReadTheDocs](https://cazy-webscraper.readthedocs.io/en/latest/?badge=latest).
82
96
 
97
+ ## Updates
98
+
99
+ **New in version 2.3.0**
100
+ * Downloading protein data from UniProt is several orders of magnitude faster than before - and should have fewer issues when using older versions of `bioservices`
101
+ - Uses `bioservices` mapping to map directly from NCBI protein version accession to UniProt
102
+ - `cw_get_uniprot_data` no longer calls NCBI and thus no longer requires an email address as a positional argument
103
+ * Updated database schema: Changed `Genbanks 1--* Uniprots` to `Genbanks *--1 Uniprots`. `Uniprots.uniprot_id` is now listed in the `Genbanks` table, instead of listing `Genbanks.genbank_id` in the `Uniprots` table
104
+
105
+ * Retrieve taxonomic classifications from UniProt
106
+ * Use the `--taxonomy`/`-t` flag to retrieve the scientific name (genus and species) for proteins of interest
107
+ * Adds downloaded taxonomic information to the `UniprotsTaxs` table
108
+
109
+ * Improved clarification of deleting old records when using `cw_get_uniprot_data`
110
+ - Separate arguments to delete Genbanks-EC number and Genbanks-PDB accession relationships that are no longer listed in UniProt, for those proteins in the local CAZyme database whose data is downloaded from UniProt
111
+ - New args:
112
+ - `--delete_old_ec_relationships` = deletes Genbank(protein)-EC number relationships no longer in UniProt
113
+ - `--delete_old_ecs` = deletes EC numbers in the local db not linked to any proteins
114
+ - `--delete_old_pdb_relationships` = deletes Genbank(protein)-PDB relationships no longer in UniProt
115
+ - `--delete_old_pdbs` = deletes PDB accessions in the local db not linked to any proteins
116
+
117
+ * Retrieve the local db schema
118
+ - New command `cw_get_db_schema` added.
119
+ - Retrieves the SQLite schema of a local CAZyme database and prints it to the terminal
120
+
121
+ * Added option to skip retrieving the latest taxonomic classifications from NCBI
122
+ - By default, when retrieving data from CAZy, `cazy_webscraper` retrieves the latest taxonomic classifications for proteins listed under multiple taxa
123
+ - To reduce scraping time, and to reduce the burden on the NCBI-Entrez server, if this data is not needed (e.g. GTDB taxonomies will be used) this step can be skipped by using the new `--skip_ncbi_tax` flag.
124
+ - When skipping retrieval of the latest taxa classifications from NCBI, `cazy_webscraper` will add the first taxon retrieved from CAZy for those proteins listed under multiple taxa
125
+
126
+
83
127
  ## Documentation
84
128
 
129
+ The full documentation can be found at [Read the Docs](https://cazy-webscraper.readthedocs.io/en/latest/?badge=latest).
130
+
131
+ ### Our paper: Implementation and demonstration of use
132
+
85
133
  For a full description of the operation and examples of use, please see our paper in [BioRxiv](https://www.biorxiv.org/content/10.1101/2022.12.02.518825v1.full).
86
134
 
87
135
  > Hobbs, E. E. M., Gloster, T. M., and Pritchard, L. (2022) 'cazy_webscraper: local compilation and interrogation of comprehensive CAZyme datasets', _bioRxiv_, [https://doi.org/10.1101/2022.12.02.518825](https://www.biorxiv.org/content/10.1101/2022.12.02.518825v1.full)
88
136
 
137
+ ### Database structure
138
+
139
+ You can view the database schema [here](#database-schema) and find a PDF of the database schema [here](https://hobnobmancer.github.io/cazy_webscraper/database_schema.pdf).
140
+
141
+ ## Contributions
142
+
143
+ We welcome contributions and suggestions. You can raise issues at this repository, or fork the repository and submit pull requests, at the links below:
144
+
145
+ - [Issues](https://github.com/HobnobMancer/cazy_webscraper/issues)
146
+ - [Pull Requests](https://github.com/HobnobMancer/cazy_webscraper/pulls)
89
147
 
90
148
  ## Table of Contents
91
149
  <!-- TOC -->
@@ -118,6 +176,7 @@ For a full description of the operation and examples of use, please see our pape
118
176
  - [CAZy coverage of GenBank](#cazy-coverage-of-genbank)
119
177
  - [Configure calculating CAZy coverage of GenBank](#configure-calculating-cazy-coverage-of-genbank)
120
178
  - [Integrating a local CAZyme database](#integrating-a-local-cazyme-database)
179
+ - [Database schema](#database-schema)
121
180
  - [Contributions](#contributions)
122
181
  - [License and copyright](#license-and-copyright)
123
182
  <!-- /TOC -->
@@ -126,15 +185,16 @@ For a full description of the operation and examples of use, please see our pape
126
185
  ## Features in the pipeline:
127
186
  - Retrieve and store PubMed IDs in the local CAZyme database
128
187
  - Fix any remaining bugs we can find (if you find a bug, please report it and provide as detailed a bug report as possible!)
129
- - Update the unit tests to work with the new `cazy_webscraper` architecture
130
- - Update the documentation
188
+ - Increase unit test coverage
131
189
  - Automate analysing the taxonomic distribution across CAZy and datasets of interest, including generating a final report
132
190
 
133
191
  ## Citation
134
192
 
135
193
  If you use `cazy_webscraper`, please cite the following publication:
136
194
 
137
- > Hobbs, E. E. M., Gloster, T. M., and Pritchard, L. (2022) 'cazy_webscraper: local compilation and interrogation of comprehensive CAZyme datasets', _bioRxiv_, [https://doi.org/10.1101/2022.12.02.518825](https://www.biorxiv.org/content/10.1101/2022.12.02.518825v1)
195
+ > Hobbs, E. E. M, Gloster, T. M., Pritchard, L. (2023) cazy_webscraper: local compilation and interrogation of comprehensive CAZyme datasets, _Microbial Genomics_, 9(8). [https://doi.org/10.1099/mgen.0.001086](https://doi.org/10.1099/mgen.0.001086)
196
+
197
+ The supplementary information for this manuscript is available via the BioRxiv server, and in the `manuscript` directory in this repository.
138
198
 
139
199
  cazy_webscraper depends on a number of tools. To recognise the contributions that the
140
200
  authors and developers have made, please also cite the following:
@@ -194,13 +254,21 @@ cazy_webscraper <user_email>
194
254
  To retrieve the version, use the following command:
195
255
 
196
256
  ```bash
197
- cazy_webscraper <user_email> -V
257
+ cazy_webscraper -V
258
+ ```
259
+ or
260
+ ```bash
261
+ cazy_webscraper --version
198
262
  ```
199
263
 
200
264
  To retrieve the citation to use:
201
265
 
202
266
  ```bash
203
- cazy_webscraper <user_email> -C
267
+ cazy_webscraper -C
268
+ ```
269
+ or
270
+ ```bash
271
+ cazy_webscraper --citation
204
272
  ```
205
273
 
206
274
  ### Command summary
@@ -240,11 +308,27 @@ To protein structure files from PDB use the `cw_get_pdb_structures` command.
240
308
 
241
309
  To interrogate the database, use the `cw_query_database` command.
242
310
 
311
+ ### Local CAZyme database schema
312
+
313
+ The schema of a local CAZyme database can be retrieved using `cazy_webscraper`:
314
+
315
+ ```bash
316
+ cw_get_db_schema <path to local CAZyme database>
317
+ ```
318
+
319
+ Alternatively, `sqlite3` can be used to retrieve the schema:
320
+ ```bash
321
+ sqlite3 <path to local CAZyme database> .schema
322
+ ```
323
+
324
+ A visual representation of the db schema when using `cazy_webscraper` version >= 2.3.0 can be found [here](https://hobnobmancer.github.io/cazy_webscraper/database_schema.pdf).
325
+
243
326
  ## Creating a local CAZyme database
244
327
  Command line options for `cazy_webscraper`, which is used to scrape CAZy and compile a local SQLite database.
245
328
  Options are written in alphabetical order.
246
329
 
247
330
  `email` - \[REQUIRED\] User email address. This is required by NCBI Entrez for querying the Entrez server.
331
+ **Email address is not required when printing out the citation and version number information**
248
332
 
249
333
  `--cache_dir` - Path to cache dir to be used instead of default cache dir path.
250
334
 
@@ -282,12 +366,16 @@ _If `--db_output` **and** `--database` are **not** called, `cazy_webscraper` wri
282
366
 
283
367
  _When the `--db_output` flag is used, `cazy_webscraper` will create any necessary parent directories. If the direct/immediate parent directory of the database exists, by default `cazy_webscraper` will delete the content in this parent directory._
284
368
 
369
+ `--ncbi_batch_size` - The number of protein IDs submitted per batch to NCBI, when retrieving taxonomic classifications. Default 200.
370
+
285
371
  `--nodelete_cache` - When called, content in the existing cache dir will **not** be deleted. Default: False (existing content is deleted).
286
372
 
287
373
  `--nodelete_log` - When called, content in the existing log dir will **not** be deleted. Default: False (existing content is deleted).
288
374
 
289
375
  `--retries`, `-r` - Define the number of times to retry making a connection to CAZy if the connection should fail. Default: 10.
290
376
 
377
+ `--skip_ncbi_tax` - Skip retrieving the latest taxonomic information from NCBI where multiple taxonomic classifications are retrieved from CAZy for a protein. When called, the first taxon retrieved from CAZy for each such protein is added to the local CAZyme database. Default: False (the latest taxonomic classifications are retrieved from NCBI).
378
+
291
379
  `--sql_echo` - Set SQLite engine echo parameter to True, causing SQLite to print log messages. Default: False.
292
380
 
293
381
  `--subfamilies`, `-s` - Enable retrieval of CAZy subfamilies, otherwise **only** CAZy family annotations will be retrieved. Default: False.
@@ -341,7 +429,7 @@ Data can be retrieived for all proteins in the local CAZyme database, or a speci
341
429
 
342
430
  To retrieve all UniProt data for all proteins in a local CAZyme database, use the following command:
343
431
  ```bash
344
- cw_get_uniprot_data <path_to_local_CAZyme_db> <user_email> --ec --pdb --sequence
432
+ cw_get_uniprot_data <path_to_local_CAZyme_db> --ec --pdb --sequence
345
433
  ```
346
434
 
347
435
  ### Configuring UniProt data retrieval
@@ -350,11 +438,7 @@ Below are listed the command-line flags for configuring the retrieval of UniProt
350
438
 
351
439
  `database` - \[REQUIRED\] Path to a local CAZyme database to add UniProt data to.
352
440
 
353
- `email` - \[REQUIRED\] User email address. This is required by NCBI Entrez for querying the Entrez server.
354
-
355
- `--ncbi_batch_size` - Size of batch query posted to NCBI Entrez. Default 150.
356
-
357
- `--uniprot_batch_size` - Change the query batch size submitted via [`bioservices`](https://bioservices.readthedocs.io/en/master/) to UniProt to retrieve protein data. Default is 150. `bioservices` recommands queries not larger than 200 objects.
441
+ `--bioservices_batch_size` - Change the query batch size submitted via [`bioservices`](https://bioservices.readthedocs.io/en/master/) to UniProt to retrieve protein data. Default is 1000. See the [UniProt REST API documentation](https://www.uniprot.org/help/id_mapping) for batch size limits.
358
442
 
359
443
  `--cache_dir` - Path to cache dir to be used instead of default cache dir path.
360
444
 
@@ -364,10 +448,6 @@ Below are listed the command-line flags for configuring the retrieval of UniProt
364
448
 
365
449
  `--config`, `-c` - Path to a configuration YAML file. Default: None.
366
450
 
367
- `--delete_old_ec` - Boolean, delete EC number - Protein relationships that are no longer listed in UniProt, i.e. an EC number annotation is no longer included in UniProt but is in the local database. If set to TRUE these relationships will be DELETED from the database.
368
-
369
- `--delete_old_pdbs` - Boolean, delete PDB accessions - Protein relationships that are no longer listed in UniProt, i.e. an PDB accessions that are no longer included in UniProt but is in the local database. If set to TRUE these relationships will be DELETED from the database.
370
-
371
451
  `--ec`, `-e` - Enable retrieval of EC number annotations from UniProt
372
452
 
373
453
  `--ec_filter` - Limit retrieval of protein data to proteins annotated with a provided list of EC numbers. Separate the EC numbers by single commas without spaces. Recommend to wrap the entire str in quotation marks, for example:
@@ -397,21 +477,33 @@ cw_get_uniprot_data my_cazyme_db/cazyme_db.db --ec_filter 'EC1.2.3.4,EC2.3.1.-'
397
477
 
398
478
  `--retries`, `-r` - Define the number of times to retry making a connection to CAZy if the connection should fail. Default: 10.
399
479
 
480
+ `--delete_old_ec_relationships` - Boolean, delete old Genbanks-EC number relationships - For those proteins in the local db for which data is downloaded from UniProt, compare the current links between the proteins in the Genbanks table and EC numbers in the Ecs table. Delete Genbanks-Ecs relationships that are no longer listed in the respective protein records in UniProt.
481
+
482
+ `--delete_old_ecs` - Boolean, delete EC numbers - Delete EC numbers that are not linked to any proteins listed in the Genbanks table. These can arise from multiple retrievals of data from UniProt over a period of time during which UniProt records have been updated.
483
+
484
+ `--delete_old_pdb_relationships` - Boolean, delete old Genbanks-PDB relationships - For those proteins in the local db for which data is downloaded from UniProt, compare the current links between the proteins in the Genbanks table and PDB accessions in the Pdbs table. Delete Genbanks-Pdbs relationships that are no longer listed in the respective protein records in UniProt.
485
+
486
+ `--delete_old_pdbs` - Boolean, delete PDB accessions - Delete PDB accessions in the local database that are not linked to any proteins listed in the Genbanks table. If set to TRUE these PDB accessions will be DELETED from the database.
487
+
400
488
  `--use_uniprot_cache` - Path to a JSON file, keyed by UniProt accessions/IDs and valued by dicts containing `{'gbk_acc': str, 'db_id': int}`. This file is part of the cache created by `cw_get_uniprot_data`. This is an option to skip retrieving the UniProt IDs for a set of GenBank accessions, if retrieving data for the same dataset (this saves a lot of time!)
401
489
 
402
490
  `skip_download` - Bool, default False. If set to True, only uses data from UniProt cache and will not download new data from UniProt.
403
491
 
404
492
  `--sequence`, `-s` - Retrieve protein amino acid sequences from UniProt
405
493
 
406
- `--seq_update` - If a newer version of the protein sequence is available, overwrite the existing sequence for the protein in the database. Default is false, the protein sequence is **not** overwritten and updated.
407
-
408
494
  `--sql_echo` - Set SQLite engine echo parameter to True, causing SQLite to print log messages. Default: False.
409
495
 
410
496
  `--species` - List of species (written as Genus Species) to restrict the scraping of CAZymes to. CAZymes will be retrieved for **all** strains of each given species.
411
497
 
412
498
  `--strains` - List of specific species strains to restrict the scraping of CAZymes to.
413
499
 
414
- `--timeout`, `-t` - Connection timout limit (seconds). Default: 45.
500
+ `--taxonomy`, `-t` - Retrieve taxonomic classifications (genus species) and add to the local CAZyme database
501
+
502
+ `--timeout` - Connection timeout limit (seconds). Default: 45.
503
+
504
+ `--update_name` - If a newer version of the protein name is available, overwrite the existing name for the protein in the database. Default is false, the protein name is **not** overwritten and updated.
505
+
506
+ `--update_seq` - If a newer version of the protein sequence is available, overwrite the existing sequence for the protein in the database. Default is false, the protein sequence is **not** overwritten and updated.
415
507
 
416
508
  `--use_uniprot_cache` - Path to JSON file containing data previously retrieved from UniProt by `cazy_webscraper`, use if an error occurred while adding the data to the local CAZyme database. This will skip the retrieval of data from UniProt, and the cached data will be added to the local CAZyme database. This can also be shared with others to add the same data to their local CAZyme database.
417
509
 
@@ -419,6 +511,18 @@ cw_get_uniprot_data my_cazyme_db/cazyme_db.db --ec_filter 'EC1.2.3.4,EC2.3.1.-'
419
511
 
420
512
  `--verbose`, `-v` - Enable verbose logging. This does not set the SQLite engine `echo` parameter to True. Default: False.
421
513
 
514
+ **UniProt batch sizes:**
515
+ Note that according to Uniprot (June 2022), there are various limits on ID Mapping Job Submission:
516
+
517
+ ========= =====================================================================================
518
+ Limit Details
519
+ ========= =====================================================================================
520
+ 100,000 Total number of ids allowed in comma separated param ids in /idmapping/run api
521
+ 500,000 Total number of "mapped to" ids allowed
522
+ 100,000 Total number of "mapped to" ids allowed to be enriched by UniProt data
523
+ 10,000 Total number of "mapped to" ids allowed with filtering
524
+ ========= =====================================================================================
525
+
422
526
  ### UniProt data retrieval cache
423
527
 
424
528
  - The tables retrieved from UniProt are converted to dataframes, and written out as CSV files in the cache directory
@@ -1024,7 +1128,7 @@ When listing EC numbers, the 'EC' prefix can be included or excluded. For exampl
1024
1128
 
1025
1129
  `cazy_webscraper` performs a direct EC number comparison. Therefore, supplying `cazy_webscraper` with the EC number EC1.2.3.- will only retrieve proteins specifically annotated with EC1.2.3.-. `cazy_webscraper` will **not** retrieve proteins with all completed EC numbers under EC1.2.3.-, thus, `cazy_webscraper` will **not** retrieve data for proteins annotated with EC1.2.3.1, EC1.2.3.2, EC1.2.3.3, etc.
1026
1130
 
1027
- Example configuration files, and an empty configuraiton file template are located in the [`config_files`]() directory of this repo.
1131
+ Example configuration files, and an empty configuration file template are located in the `configuration_files/` directory of this repo.
1028
1132
 
1029
1133
 
1030
1134
  ## Integrating a local CAZyme database
@@ -1045,6 +1149,12 @@ Import the function into the `Python` script using:
1045
1149
  from cazy_webscraper.sql.sql_orm import get_db_connection
1046
1150
  ```
1047
1151
 
1152
+ ## Database Schema
1153
+
1154
+ This is the structure of the local SQLite3 database compiled by `cazy_webscraper` version >=2.3.0:
1155
+
1156
+ ![database schema](assets/cazy_webscraper_v2.3+.svg "database schema")
1157
+
1048
1158
 
1049
1159
  ## Contributions
1050
1160