PyPI - boldigger3 - Versions diffs - 1.0.0__py3-none-any.whl - Mend

boldigger3 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

boldigger3/__init__.py +0 -0
boldigger3/__main__.py +125 -0
boldigger3/additional_data_download.py +357 -0
boldigger3/id_engine.py +479 -0
boldigger3/select_top_hit.py +518 -0
boldigger3-1.0.0.dist-info/LICENSE +21 -0
boldigger3-1.0.0.dist-info/METADATA +190 -0
boldigger3-1.0.0.dist-info/RECORD +11 -0
boldigger3-1.0.0.dist-info/WHEEL +5 -0
boldigger3-1.0.0.dist-info/entry_points.txt +2 -0
boldigger3-1.0.0.dist-info/top_level.txt +1 -0

boldigger3/__init__.py ADDED Viewed

File without changes

boldigger3/__main__.py ADDED Viewed

@@ -0,0 +1,125 @@
+import argparse, sys, datetime, time
+from boldigger3 import id_engine, additional_data_download, select_top_hit
+from importlib.metadata import version
+from get_pypi_latest_version import GetPyPiLatestVersion
+# main function to program the commandline interface
+def _warn_if_outdated(package_name: str, installed_version: str) -> None:
+    """Print a notice if the latest release on PyPI differs from the installed version.
+    Args:
+        package_name (str): Name of the distribution to look up on PyPI.
+        installed_version (str): Version string of the locally installed distribution.
+    """
+    # the update check is a convenience only, a failed lookup must never
+    # prevent the actual identification from running
+    try:
+        latest_version = GetPyPiLatestVersion()(package_name)
+    except Exception:
+        print(
+            "{}: Could not check for a newer boldigger3 version.".format(
+                datetime.datetime.now().strftime("%H:%M:%S")
+            )
+        )
+        return
+    # give a user warning if the latest version is not installed,
+    # skip the warning if the lookup did not return a version at all
+    if latest_version and installed_version != latest_version:
+        print(
+            "{}: Your boldigger3 version is outdated. Consider updating to the latest version.".format(
+                datetime.datetime.now().strftime("%H:%M:%S")
+            )
+        )
+def _resolve_thresholds(user_thresholds, default_thresholds: list) -> list:
+    """Fill up the thresholds given by the user with the defaults.
+    Args:
+        user_thresholds (list | None): Thresholds from the commandline, None if the option was not used.
+        default_thresholds (list): Default thresholds, one per position.
+    Returns:
+        list: As many thresholds as there are defaults. User values come first, every position the user did not provide keeps its default.
+    """
+    # surplus user values beyond the number of defaults are ignored
+    provided = list(user_thresholds or [])[: len(default_thresholds)]
+    return provided + default_thresholds[len(provided) :]
+def main() -> None:
+    """Function to define the commandline interface.
+    Builds the argument parser, prints a notice if the installed boldigger3
+    version differs from the latest one on PyPI and, for the "identify" command,
+    runs the identification engine, the additional data download and the top hit
+    selection in that order.
+    """
+    # name of this distribution, used for the version lookups
+    package_name = "boldigger3"
+    # initialize the default behaviour if boldigger3 is called without any argument
+    formatter = lambda prog: argparse.HelpFormatter(prog, max_help_position=35)
+    # define the parser
+    parser = argparse.ArgumentParser(
+        prog="boldigger3",
+        description="A Python package to identify and organise sequences with the Barcode of Life Data systems.",
+        formatter_class=formatter,
+    )
+    # display help when no argument is called
+    parser.set_defaults(func=lambda x: parser.print_help())
+    # add the subparsers
+    subparsers = parser.add_subparsers(dest="function")
+    # add the identify parser
+    parser_identify = subparsers.add_parser(
+        "identify", help="Run the BOLD v5 identification engine"
+    )
+    # add the fasta path argument
+    parser_identify.add_argument(
+        "fasta_file",
+        help="Path to the fasta file or fasta file in the current working directory to be identified.",
+        type=str,
+    )
+    # add the database argument
+    parser_identify.add_argument(
+        "--db",
+        required=True,
+        help="Integer that defines which database to use (1 to 7). See readme for details",
+        type=int,
+        choices=range(1, 8),
+    )
+    # add the operating mode argument
+    parser_identify.add_argument(
+        "--mode",
+        required=True,
+        help="Integer that defines which operating mode to use (1 to 3). See readme for details.",
+        type=int,
+        choices=range(1, 4),
+    )
+    # add the optional argument thresholds
+    parser_identify.add_argument(
+        "--thresholds",
+        nargs="+",
+        type=int,
+        help="Thresholds to use for the selection of the top hit.",
+    )
+    # add version control
+    # get the installed version of this package (not of its predecessor boldigger2)
+    current_version = version(package_name)
+    _warn_if_outdated(package_name, current_version)
+    # add the version argument
+    parser.add_argument("--version", action="version", version=current_version)
+    # parse the arguments
+    arguments = parser.parse_args()
+    # print help if no argument is provided
+    if len(sys.argv) == 1:
+        arguments.func(arguments)
+        sys.exit()
+    # only use the threshold provided by the user replace the rest with defaults
+    # the thresholds attribute only exists if the identify parser was used
+    user_thresholds = getattr(arguments, "thresholds", None)
+    thresholds = _resolve_thresholds(user_thresholds, [97, 95, 90, 85])
+    if user_thresholds:
+        # give user output
+        print(
+            "{}: Default thresholds changed!\n{}: Species: {}, Genus: {}, Family: {}, Order: {}".format(
+                datetime.datetime.now().strftime("%H:%M:%S"),
+                datetime.datetime.now().strftime("%H:%M:%S"),
+                *thresholds
+            )
+        )
+    # run the identification engine
+    if arguments.function == "identify":
+        # run the id engine
+        id_engine.main(
+            arguments.fasta_file,
+            database=arguments.db,
+            operating_mode=arguments.mode,
+        )
+        # download the additional data
+        additional_data_download.main(arguments.fasta_file)
+        # select the top hit
+        select_top_hit.main(arguments.fasta_file, thresholds=thresholds)
+# entry point guard: call main() only when this module is executed directly
+# (e.g. via "python -m boldigger3"), not when it is imported
+if __name__ == "__main__":
+    main()

boldigger3/additional_data_download.py ADDED Viewed

@@ -0,0 +1,357 @@
+import asyncio, requests_html_playwright, more_itertools, datetime
+from boldigger3.id_engine import parse_fasta
+from bs4 import BeautifulSoup as BSoup
+from requests import Response
+from tqdm.asyncio import tqdm
+import pandas as pd
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+from requests.exceptions import RetryError
+# function to collect the process ids from the hdf storage
+def collect_process_ids(hdf_name_results: str) -> list:
+    """Function to collect all process ids from the downloaded data.
+    Args:
+        hdf_name_results (str): Path to the hdf storage that's generated by the data download
+    Returns:
+        list: Unique, non-empty process ids in the order of their first appearance in the results.
+    """
+    # read the results from hdf storage
+    unsorted_results = pd.read_hdf(hdf_name_results, key="results_unsorted")
+    # remove duplicates and empty strings from process ids
+    # dict.fromkeys keeps the first-seen order, so repeated runs on the same
+    # storage return the ids in the same order (a set of strings does not)
+    return list(
+        dict.fromkeys(idx for idx in unsorted_results["process_id"] if idx != "")
+    )
+# function to check if the additional data or parts of it have already been downloaded
+def check_already_downloaded(hdf_name_results: str, unique_process_ids: list) -> list:
+    """Drop all process ids whose additional data is already in the hdf storage.
+    Args:
+        hdf_name_results (str): Path to the hdf data storage.
+        unique_process_ids (list): List of unique process ids that are requested for data download.
+    Returns:
+        list: The requested process ids that are not stored yet. The input list is
+        returned as it is if reading the stored additional data raises a KeyError.
+    """
+    try:
+        stored_data = pd.read_hdf(hdf_name_results, key="additional_data")
+        already_stored = set(stored_data["process_id"])
+    except KeyError:
+        # nothing has been stored under this key so far, everything is pending
+        return unique_process_ids
+    # keep only the ids that still have to be downloaded
+    pending_ids = []
+    for process_id in unique_process_ids:
+        if process_id not in already_stored:
+            pending_ids.append(process_id)
+    return pending_ids
+# function to parse a record page
+def _record_field(soup: object, section: str, label: str) -> str:
+    """Return the text of the table cell that follows a label within a section of the record page.
+    Args:
+        soup (object): Parsed record page to search in.
+        section (str): Text of the h1 heading that precedes the table to search.
+        label (str): Text of the th cell whose following td cell holds the value.
+    Returns:
+        str: Text of the value cell, an empty string if the heading, the table, the label or the value cell cannot be found.
+    """
+    heading = soup.find("h1", string=section)
+    if heading is None:
+        return ""
+    table = heading.find_next("table")
+    if table is None:
+        return ""
+    label_cell = table.find("th", string=label)
+    if label_cell is None:
+        return ""
+    value_cell = label_cell.find_next("td")
+    if value_cell is None:
+        return ""
+    return value_cell.text
+def parse_record_page(html_response: object, url: str) -> list:
+    """This function parses a page for any given record on bold
+    Args:
+        html_response (object): Requests response object to parse
+        url (string): url as string to parse the process id
+    Returns:
+        list: A line of data representing the record data: process id, status, sex,
+        lifestage, institution storing, country or ocean, identifier, identification
+        method and record page. All fields after the status are empty strings if the
+        status code is not 200.
+    """
+    # extract the process id from the url
+    process_id = url.split("/")[-1]
+    # get the status from the status code
+    status_code_to_status = {200: "public", 404: "private", 500: "unavailable"}
+    # status codes without an explicit mapping are reported as unavailable
+    # instead of aborting the whole download with a KeyError
+    status = status_code_to_status.get(html_response.status_code, "unavailable")
+    # only a successfully loaded page carries data to parse
+    if html_response.status_code != 200:
+        return [process_id, status] + [""] * 7
+    # transform html into beautifulsoup to parse out the data
+    soup = BSoup(html_response.text, "html5lib")
+    # section heading and label of every value, in the order of the output line:
+    # sex, lifestage, institution storing, country or ocean, identifier, id method
+    field_locations = [
+        ("Specimen", "Sex:"),
+        ("Specimen", "Life Stage:"),
+        ("Identifiers", "Deposited In:"),
+        ("Collection", "Country/Ocean:"),
+        ("Attribution", "Specimen Identification:"),
+        ("Taxonomy", "Identification Method:"),
+    ]
+    field_values = [
+        _record_field(soup, section, label) for section, label in field_locations
+    ]
+    # record page
+    record_page = "https://portal.boldsystems.org/record/{}".format(process_id)
+    # return the resulting line
+    return [process_id, status, *field_values, record_page]
+# async function to perform the request
+async def as_request(url, as_session) -> list:
+    """Request a single record url and hand the answer to the record page parser.
+    Args:
+        url (_type_): url to request
+        as_session (_type_): async session to perform the request with
+    Returns:
+        list: Returns the parsed response as list.
+    """
+    try:
+        # request the url from BOLD
+        page = await as_session.get(url)
+    except RetryError:
+        # the request ended in a RetryError: substitute an empty response with
+        # status code 500 so the parser still returns a line for this url
+        page = Response()
+        page.status_code = 500
+    # parse the response and return the line to the caller to append it to hdf
+    return parse_record_page(page, url)
+# function to limit concurrency of the code
+async def limit_concurrency(url, as_session, semaphore) -> object:
+    """Run a single request while holding the semaphore.
+    Args:
+        url (_type_): url to request.
+        as_session (_type_): session to use for the requests
+        semaphore (_type_): semaphore object
+    Returns:
+        object: The parsed response returned by as_request for this url.
+    """
+    # the request only starts once the semaphore has been acquired
+    async with semaphore:
+        parsed_line = await as_request(url, as_session)
+    return parsed_line
+# function to launch the async session including retry strategy
+async def as_session(download_urls, semaphore) -> list:
+    """Open an async session and download all given urls through it.
+    Args:
+        download_urls (_type_): All urls to download.
+        semaphore (_type_): semaphore to limit the concurrency
+    Returns:
+        list: returns a list of list with the gathered responses of the batch downloaded
+    """
+    browser_headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
+    }
+    # only retry if the page failed to load (status code 500)
+    retries = Retry(total=5, status_forcelist=[500], backoff_factor=2)
+    # create the session with correct headers and the retry strategy
+    session = requests_html_playwright.AsyncHTMLSession()
+    session.headers.update(browser_headers)
+    session.mount("https://", HTTPAdapter(max_retries=retries))
+    # one semaphore-guarded request per url, gathered behind a progress bar
+    pending = [limit_concurrency(url, session, semaphore) for url in download_urls]
+    return await tqdm.gather(*pending, desc="Downloading additional data")
+# function to transform the response to a pandas dataframe
+def response_to_dataframe(parsed_response: list) -> object:
+    """Turn the parsed record lines into a dataframe with named columns.
+    Args:
+        parsed_response (list): List of lists that is returned from parsing BOLD
+    Returns:
+        object: Returns a pandas dataframe with one column per field of a record line.
+    """
+    # the column order has to match the order of the values within each line
+    column_names = (
+        "process_id",
+        "status",
+        "sex",
+        "lifestage",
+        "institution_storing",
+        "country_or_ocean",
+        "identifier",
+        "id_method",
+        "record_page",
+    )
+    return pd.DataFrame(parsed_response, columns=list(column_names))
+# function to add the data to the hdf storage
+def add_to_hdf(hdf_name_results: str, response: object) -> None:
+    """Append a dataframe of additional data to the hdf storage.
+    Args:
+        hdf_name_results (str): Path to the hdf storage.
+        response (object): Dataframe to be appended to the hdf storage.
+    """
+    # compression settings, used for the store as well as for the appended table
+    compression = {"complib": "blosc:blosclz", "complevel": 9}
+    # define the item sizes for the hdf storage
+    item_sizes = {
+        "process_id": 30,
+        "status": 11,
+        "sex": 8,
+        "lifestage": 80,
+        "institution_storing": 150,
+        "country_or_ocean": 80,
+        "identifier": 80,
+        "id_method": 150,
+        "record_page": 70,
+    }
+    # save results in hdf storage, the store is opened in append mode
+    with pd.HDFStore(hdf_name_results, mode="a", **compression) as hdf_output:
+        hdf_output.append(
+            "additional_data",
+            response,
+            format="t",
+            data_columns=True,
+            min_itemsize=item_sizes,
+            **compression,
+        )
+# main function to run the additional data download
+async def _download_batch(urls: list, max_concurrency: int) -> list:
+    """Download one batch of record urls with a limited number of concurrent requests.
+    Args:
+        urls (list): Record urls of the batch.
+        max_concurrency (int): Initial value of the semaphore that is handed to as_session.
+    Returns:
+        list: The responses of the batch as returned by as_session.
+    """
+    # create the semaphore inside the running event loop: before Python 3.10
+    # asyncio primitives bind to the loop that is current at construction time,
+    # which is not the loop that asyncio.run starts
+    semaphore = asyncio.Semaphore(max_concurrency)
+    return await as_session(urls, semaphore)
+def main(fasta_path: str) -> None:
+    """Main function to run the additional data download. Collects the process ids
+    from the result storage, downloads the record pages of all ids that are not
+    stored yet in batches and appends the parsed data of each batch to the hdf storage.
+    Args:
+        fasta_path (str): Path to the fasta file to be identified
+    """
+    # number of urls per batch and number of concurrent requests within a batch
+    batch_size = 5000
+    max_concurrency = 25
+    # user output
+    tqdm.write(
+        "{}: Collecting process ids.".format(
+            datetime.datetime.now().strftime("%H:%M:%S")
+        )
+    )
+    # read the input fasta, only the name and the project directory are needed
+    _, fasta_name, project_directory = parse_fasta(fasta_path)
+    # generate the name of the hdf storage that holds the downloaded data
+    hdf_name_results = project_directory.joinpath(
+        "{}_result_storage.h5.lz".format(fasta_name)
+    )
+    # collect all process ids first
+    unique_process_ids = collect_process_ids(hdf_name_results)
+    # check already downloaded data
+    unique_process_ids = check_already_downloaded(hdf_name_results, unique_process_ids)
+    # nothing left to download
+    if not unique_process_ids:
+        # user output
+        tqdm.write(
+            "{}: Additional data has already been completly downloaded.".format(
+                datetime.datetime.now().strftime("%H:%M:%S")
+            )
+        )
+        return
+    urls = [
+        "https://portal.boldsystems.org/record/{}".format(idx)
+        for idx in unique_process_ids
+    ]
+    # split the urls into batches once, the number of batches follows from that
+    batches = list(more_itertools.chunked(urls, batch_size))
+    batch_count = len(batches)
+    # user output
+    tqdm.write(
+        "{}: Divided {} unique process ids into {} batch(es) for download.".format(
+            datetime.datetime.now().strftime("%H:%M:%S"),
+            len(unique_process_ids),
+            batch_count,
+        )
+    )
+    for batch_counter, chunk in enumerate(batches, start=1):
+        tqdm.write(
+            "{}: Downloading batch {} of {}.".format(
+                datetime.datetime.now().strftime("%H:%M:%S"),
+                batch_counter,
+                batch_count,
+            )
+        )
+        # request the additional data asynchronously and gather the responses
+        response = asyncio.run(_download_batch(chunk, max_concurrency))
+        # transform to dataframe for saving
+        response = response_to_dataframe(response)
+        # user output, given before the save so it announces the running step
+        tqdm.write(
+            "{}: Saving batch {} of {}.".format(
+                datetime.datetime.now().strftime("%H:%M:%S"),
+                batch_counter,
+                batch_count,
+            )
+        )
+        # saving the data
+        add_to_hdf(hdf_name_results, response)