boldigger3 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
boldigger3/__init__.py ADDED
File without changes
boldigger3/__main__.py ADDED
@@ -0,0 +1,125 @@
1
+ import argparse, sys, datetime, time
2
+ from boldigger3 import id_engine, additional_data_download, select_top_hit
3
+ from importlib.metadata import version
4
+ from get_pypi_latest_version import GetPyPiLatestVersion
5
+
6
+
7
+ # main function to program the commandline interface
8
def main() -> None:
    """Define and run the boldigger3 commandline interface.

    Builds the argument parser with the ``identify`` subcommand, prints a
    warning if the installed boldigger3 version differs from the latest
    version reported for PyPI and, for ``identify``, runs the identification
    engine, the additional data download and the top hit selection in that
    order.
    """
    # distribution name used for every version lookup; this has to be the
    # name of this package, otherwise the lookup fails (or reports the version
    # of a different package) when only boldigger3 is installed
    package_name = "boldigger3"

    # widen the column that holds the option names in the help output
    formatter = lambda prog: argparse.HelpFormatter(prog, max_help_position=35)

    # define the parser
    parser = argparse.ArgumentParser(
        prog="boldigger3",
        description="A Python package to identify and organise sequences with the Barcode of Life Data systems.",
        formatter_class=formatter,
    )

    # display help when no argument is called
    parser.set_defaults(func=lambda x: parser.print_help())

    # add the subparsers
    subparsers = parser.add_subparsers(dest="function")

    # add the identify parser
    parser_identify = subparsers.add_parser(
        "identify", help="Run the BOLD v5 identification engine"
    )

    # positional argument: fasta file to identify
    parser_identify.add_argument(
        "fasta_file",
        help="Path to the fasta file or fasta file in the current working directory to be identified.",
        type=str,
    )

    # required option: database to query
    parser_identify.add_argument(
        "--db",
        required=True,
        help="Integer that defines which database to use (1 to 7). See readme for details",
        type=int,
        choices=range(1, 8),
    )

    # required option: operating mode
    parser_identify.add_argument(
        "--mode",
        required=True,
        help="Integer that defines which operating mode to use (1 to 3). See readme for details.",
        type=int,
        choices=range(1, 4),
    )

    # optional: user defined thresholds for the top hit selection
    parser_identify.add_argument(
        "--thresholds",
        nargs="+",
        type=int,
        help="Thresholds to use for the selection of the top hit.",
    )

    # version control: compare the installed version with the latest release
    current_version = version(package_name)
    obtainer = GetPyPiLatestVersion()
    latest_version = obtainer(package_name)

    # give a user warning if the latest version is not installed
    if current_version != latest_version:
        print(
            "{}: Your boldigger3 version is outdated. Consider updating to the latest version.".format(
                datetime.datetime.now().strftime("%H:%M:%S")
            )
        )

    # add the version argument, reusing the version looked up above
    parser.add_argument("--version", action="version", version=current_version)

    # parse the arguments
    arguments = parser.parse_args()

    # print help if no argument is provided
    if len(sys.argv) == 1:
        arguments.func(arguments)
        sys.exit()

    # only use the thresholds provided by the user (at most as many as there
    # are defaults) and fill the remaining positions with the defaults
    default_thresholds = [97, 95, 90, 85]
    user_thresholds = (arguments.thresholds or [])[: len(default_thresholds)]
    thresholds = user_thresholds + default_thresholds[len(user_thresholds) :]

    if arguments.thresholds:
        # give user output
        print(
            "{}: Default thresholds changed!\n{}: Species: {}, Genus: {}, Family: {}, Order: {}".format(
                datetime.datetime.now().strftime("%H:%M:%S"),
                datetime.datetime.now().strftime("%H:%M:%S"),
                *thresholds
            )
        )

    # run the identification pipeline
    if arguments.function == "identify":
        # run the id engine
        id_engine.main(
            arguments.fasta_file,
            database=arguments.db,
            operating_mode=arguments.mode,
        )
        # download the additional data
        additional_data_download.main(arguments.fasta_file)
        # select the top hit
        select_top_hit.main(arguments.fasta_file, thresholds=thresholds)
121
+
122
+
123
+ # run only if called as a top level script
124
# start the commandline interface only when this module is executed directly,
# not when it is imported
if __name__ == "__main__":
    main()
@@ -0,0 +1,357 @@
1
+ import asyncio, requests_html_playwright, more_itertools, datetime
2
+ from boldigger3.id_engine import parse_fasta
3
+ from bs4 import BeautifulSoup as BSoup
4
+ from requests import Response
5
+ from tqdm.asyncio import tqdm
6
+ import pandas as pd
7
+ from requests.adapters import HTTPAdapter
8
+ from urllib3.util.retry import Retry
9
+ from requests.exceptions import RetryError
10
+
11
+
12
+ # function to collect the process ids from the hdf storage
13
def collect_process_ids(hdf_name_results: str) -> list:
    """Collect the distinct, non-empty process ids from the downloaded data.

    Args:
        hdf_name_results (str): Path to the hdf storage that is read with the
            key "results_unsorted".

    Returns:
        list: The unique process ids found in the "process_id" column. Empty
        strings are dropped. The order of the ids is not defined, because they
        are deduplicated through a set.
    """
    # load the stored hits from the hdf storage
    stored_hits = pd.read_hdf(hdf_name_results, key="results_unsorted")

    # deduplicate the ids and skip empty entries in one pass
    distinct_ids = {
        process_id for process_id in stored_hits["process_id"] if process_id != ""
    }

    return list(distinct_ids)
32
+
33
+
34
+ # function to check if the additional data or parts of it have already been downloaded
35
def check_already_downloaded(hdf_name_results: str, unique_process_ids: list) -> list:
    """Drop all process ids whose additional data is already in the storage.

    Args:
        hdf_name_results (str): Path to the hdf data storage.
        unique_process_ids (list): Unique process ids requested for download.

    Returns:
        list: The requested process ids that are not yet listed in the
        "process_id" column of the "additional_data" table. If reading that
        table raises a KeyError, the requested ids are returned unchanged.
    """
    try:
        stored_data = pd.read_hdf(hdf_name_results, key="additional_data")
        already_downloaded = set(stored_data["process_id"])
    except KeyError:
        # presumably no additional data has been stored yet, so everything
        # still has to be downloaded
        return unique_process_ids

    # keep the requested order, only remove what is already there
    return [
        process_id
        for process_id in unique_process_ids
        if process_id not in already_downloaded
    ]
59
+
60
+
61
+ # function to parse a record page
62
def parse_record_page(html_response: object, url: str) -> list:
    """Parse the page of a single record on BOLD into one line of data.

    Args:
        html_response (object): Response object to parse. Its ``status_code``
            is always read, its ``text`` only for status code 200.
        url (str): Requested url. The part after the last "/" is used as the
            process id.

    Returns:
        list: Nine values in the order process id, status, sex, lifestage,
        institution storing, country or ocean, identifier, identification
        method and record page. For every status code other than 200 the last
        seven values are empty strings.

    Raises:
        AttributeError: If a page with status code 200 lacks one of the
            expected headings, tables or table rows.
    """
    # extract the process id from the url
    process_id = url.split("/")[-1]

    # translate the status code into a status; codes that are not listed
    # (e.g. 403, 429 or 503) are reported as "unavailable" instead of raising
    # a KeyError that would abort the whole batch of requests
    status_code_to_status = {200: "public", 404: "private", 500: "unavailable"}
    status = status_code_to_status.get(html_response.status_code, "unavailable")

    # only public records have a page that can be parsed
    if html_response.status_code != 200:
        return [process_id, status] + [""] * 7

    # transform html into beautifulsoup to parse out the data
    soup = BSoup(html_response.text, "html5lib")

    def table_below(heading: str) -> object:
        # the data of each section is in the first table after its h1 heading
        return soup.find("h1", string=heading).find_next("table")

    def cell_value(table: object, label: str) -> str:
        # the value is in the first td that follows the th with the given label
        return table.find("th", string=label).find_next("td").text

    # sex and lifestage are both part of the specimen table
    specimen = table_below("Specimen")
    sex = cell_value(specimen, "Sex:")
    lifestage = cell_value(specimen, "Life Stage:")

    # remaining values, one per section
    institution_storing = cell_value(table_below("Identifiers"), "Deposited In:")
    country_ocean = cell_value(table_below("Collection"), "Country/Ocean:")
    identifier = cell_value(table_below("Attribution"), "Specimen Identification:")
    id_method = cell_value(table_below("Taxonomy"), "Identification Method:")

    # link to the record page
    record_page = "https://portal.boldsystems.org/record/{}".format(process_id)

    # return the resulting line
    return [
        process_id,
        status,
        sex,
        lifestage,
        institution_storing,
        country_ocean,
        identifier,
        id_method,
        record_page,
    ]
127
+
128
+
129
+ # async function to perform the request
130
async def as_request(url, as_session) -> list:
    """Request a single url asynchronously and parse the result.

    Args:
        url (_type_): url to request
        as_session (_type_): async session whose ``get`` is awaited

    Returns:
        list: The line of data that parse_record_page produces for the url.
    """
    try:
        page = await as_session.get(url)
    except RetryError:
        # the request failed with a RetryError: continue with an empty
        # response that carries status code 500
        page = Response()
        page.status_code = 500

    # hand the parsed line back to the caller, who appends it to the hdf storage
    return parse_record_page(page, url)
151
+
152
+
153
+ # function to limit concurrency of the code
154
async def limit_concurrency(url, as_session, semaphore) -> object:
    """Run as_request for one url while holding the semaphore.

    Args:
        url (_type_): url to request.
        as_session (_type_): session to use for the request
        semaphore (_type_): semaphore that is acquired for the duration of
            the request

    Returns:
        object: The value returned by as_request for the url.
    """
    # the semaphore is released as soon as the request has finished
    async with semaphore:
        parsed_line = await as_request(url, as_session)

    return parsed_line
167
+
168
+
169
+ # function to launch the async session including retry strategy
170
async def as_session(download_urls, semaphore) -> list:
    """Create the async session and download all given urls with it.

    Args:
        download_urls (_type_): All urls to download.
        semaphore (_type_): semaphore handed to limit_concurrency for every url

    Returns:
        list: The gathered results, one entry per url.
    """
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"

    # session that sends the user agent above with every request
    session = requests_html_playwright.AsyncHTMLSession()
    session.headers.update({"User-Agent": user_agent})

    # retry up to 5 times with backoff, but only for status code 500
    retries = Retry(total=5, status_forcelist=[500], backoff_factor=2)
    session.mount("https://", HTTPAdapter(max_retries=retries))

    # one request per url, each of them guarded by the semaphore
    pending = [limit_concurrency(url, session, semaphore) for url in download_urls]

    # gather everything while showing a progress bar
    return await tqdm.gather(*pending, desc="Downloading additional data")
200
+
201
+
202
+ # function to transform the response to a pandas dataframe
203
def response_to_dataframe(parsed_response: list) -> object:
    """Turn the parsed rows into a dataframe with named columns.

    Args:
        parsed_response (list): List of rows, each holding nine values in the
            order of the column names below.

    Returns:
        object: A pandas dataframe with one row per entry of parsed_response.
    """
    # column names in the order the values appear in every row
    column_names = (
        "process_id",
        "status",
        "sex",
        "lifestage",
        "institution_storing",
        "country_or_ocean",
        "identifier",
        "id_method",
        "record_page",
    )

    return pd.DataFrame(parsed_response, columns=list(column_names))
229
+
230
+
231
+ # function to add the data to the hdf storage
232
def add_to_hdf(hdf_name_results: str, response: object) -> None:
    """Append the downloaded additional data to the hdf storage.

    Args:
        hdf_name_results (str): Path to the hdf storage, opened in append mode.
        response (object): Dataframe that is appended under the key
            "additional_data".
    """
    # the same compression settings are used for the store and the table
    compression = {"complib": "blosc:blosclz", "complevel": 9}

    # minimum item size per column of the table
    min_sizes = dict(
        process_id=30,
        status=11,
        sex=8,
        lifestage=80,
        institution_storing=150,
        country_or_ocean=80,
        identifier=80,
        id_method=150,
        record_page=70,
    )

    # append the rows in table format with all columns as data columns
    with pd.HDFStore(hdf_name_results, mode="a", **compression) as store:
        store.append(
            key="additional_data",
            value=response,
            format="t",
            data_columns=True,
            min_itemsize=min_sizes,
            **compression,
        )
265
+
266
+
267
+ # main function to run the additional data download
268
def main(fasta_path: str) -> None:
    """Run the additional data download for one fasta file.

    Collects the process ids from the result storage that belongs to the fasta
    file, skips the ids that already have additional data and downloads the
    record pages of the remaining ids in batches. Every batch is appended to
    the hdf storage right after it has been downloaded.

    Args:
        fasta_path (str): Path to the fasta file to be identified
    """
    urls_per_batch = 5000
    concurrent_requests = 25

    def timestamp() -> str:
        # current time in the format used by all user output
        return datetime.datetime.now().strftime("%H:%M:%S")

    tqdm.write("{}: Collecting process ids.".format(timestamp()))

    # the fasta file defines the name and the location of the hdf storage
    _, fasta_name, project_directory = parse_fasta(fasta_path)
    hdf_name_results = project_directory.joinpath(
        "{}_result_storage.h5.lz".format(fasta_name)
    )

    # all process ids of the results minus the ones that are already stored
    pending_ids = check_already_downloaded(
        hdf_name_results, collect_process_ids(hdf_name_results)
    )

    # nothing left to download
    if not pending_ids:
        tqdm.write(
            "{}: Additional data has already been completly downloaded.".format(
                timestamp()
            )
        )
        return

    # one record url per process id, split into batches
    record_urls = [
        "https://portal.boldsystems.org/record/{}".format(process_id)
        for process_id in pending_ids
    ]
    batches = list(more_itertools.chunked(record_urls, urls_per_batch))
    batch_count = len(batches)

    tqdm.write(
        "{}: Divided {} unique process ids into {} batch(es) for download.".format(
            timestamp(), len(pending_ids), batch_count
        )
    )

    for batch_number, batch in enumerate(batches, start=1):
        tqdm.write(
            "{}: Downloading batch {} of {}.".format(
                timestamp(), batch_number, batch_count
            )
        )

        # every batch gets a fresh semaphore that limits the concurrent requests
        semaphore = asyncio.Semaphore(concurrent_requests)
        rows = asyncio.run(as_session(batch, semaphore))

        # store the batch before the next one is requested
        add_to_hdf(hdf_name_results, response_to_dataframe(rows))

        tqdm.write(
            "{}: Saving batch {} of {}.".format(timestamp(), batch_number, batch_count)
        )