PyPI - scibite-toolkit - Versions diffs - 1.0.0__py3-none-any.whl - Mend

scibite-toolkit 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

scibite_toolkit/__init__.py +4 -0
scibite_toolkit/centree.py +158 -0
scibite_toolkit/docstore.py +324 -0
scibite_toolkit/scibite_search.py +990 -0
scibite_toolkit/termite.py +1119 -0
scibite_toolkit/texpress.py +592 -0
scibite_toolkit/utilities.py +108 -0
scibite_toolkit/workbench.py +780 -0
scibite_toolkit-1.0.0.data/data/LICENSE.txt +1 -0
scibite_toolkit-1.0.0.dist-info/LICENSE.txt +1 -0
scibite_toolkit-1.0.0.dist-info/METADATA +241 -0
scibite_toolkit-1.0.0.dist-info/RECORD +14 -0
scibite_toolkit-1.0.0.dist-info/WHEEL +5 -0
scibite_toolkit-1.0.0.dist-info/top_level.txt +1 -0

scibite_toolkit/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .termite import *
+from .texpress import *

scibite_toolkit/centree.py ADDED Viewed

@@ -0,0 +1,158 @@
+import requests
+import logging
+# Get the logger for this module
+logger = logging.getLogger(__name__)
class CentreeRequestBuilder:
    """
    Class for creating CENtree Requests.

    One instance keeps the base URL, the ``Authorization`` header, the HTTP
    timeout and the SSL-verification setting, and sends every request through
    a single ``requests.Session``.
    """

    def __init__(self, timeout: int = 10):
        """
        Initialize the CentreeRequestBuilder.

        Parameters
        ----------
        timeout : int, optional
            The timeout for HTTP requests in seconds (default is 10 seconds).
        """
        self.centree_url = ''
        self.headers = {}
        self.session = requests.Session()
        self.timeout = timeout
        # SSL verification applied to every request; set_authentication()
        # overwrites it with the value the caller authenticated with.
        self.verification = True
        self.logger: logging.Logger = logger

    def set_url(self, centree_url: str):
        """
        Set the URL of the CENtree instance.

        Trailing slashes are stripped so endpoint paths can be appended
        directly.

        Parameters
        ----------
        centree_url : str
            The URL of the CENtree instance to be hit.

        Examples
        --------
        >>> crb.set_url("http://example.com")
        """
        self.centree_url = centree_url.rstrip('/')
        self.logger.info(f"Set CENtree URL to {self.centree_url}")

    def set_authentication(self, username: str, password: str, remember_me: bool = True, verification: bool = True):
        """
        Authenticates with the CENtree token API using username and password, generates an access token,
        and sets the request header.

        The ``verification`` value is also stored on the instance so that
        later requests (e.g. ``search_classes``) verify certificates the same
        way as the authentication call did.

        Parameters
        ----------
        username : str
            The username for authentication.
        password : str
            The password for authentication.
        remember_me : bool, optional
            Whether to remember the user (default is True).
        verification : bool, optional
            Whether to verify SSL certificates (default is True).

        Raises
        ------
        requests.exceptions.HTTPError
            If the token endpoint answers with an error status.
        requests.exceptions.RequestException
            If the request itself fails (connection, timeout, ...).
        ValueError
            If the response carries no ``id_token``.

        Examples
        --------
        >>> crb.set_authentication("user", "pass")
        """
        authenticate_url = f"{self.centree_url}/api/authenticate"
        # Remember the caller's choice; without this, follow-up requests would
        # fall back to default verification and fail against the same host.
        self.verification = verification
        try:
            token_response = self.session.post(
                authenticate_url,
                json={
                    "rememberMe": remember_me,
                    "username": username,
                    "password": password,
                },
                headers={"Content-Type": "application/json"},
                verify=verification,
                timeout=self.timeout
            )
            token_response.raise_for_status()
            access_token = token_response.json().get("id_token")
            if not access_token:
                raise ValueError("Access token not found in the response.")
        except requests.exceptions.HTTPError as http_err:
            self.logger.error(f"HTTP error occurred: {http_err.response.status_code} - {http_err.response.reason}")
            raise  # bare raise keeps the original traceback
        except requests.exceptions.RequestException as req_err:
            self.logger.error(f"Request error: {req_err}")
            raise
        except ValueError as val_err:
            self.logger.error(f"Value error: {val_err}")
            raise
        except Exception as err:
            self.logger.error(f"An error occurred: {err}")
            raise
        self.headers = {"Authorization": f"Bearer {access_token}"}
        self.logger.info("Authentication successful")

    def search_classes(self, query: str, ontology_id: str = None, exact: bool = False, obsolete: bool = False,
                       page_from: int = 0, page_size: int = 10) -> dict:
        """
        Search classes in the CENtree ontology.

        Parameters
        ----------
        query : str
            The search query.
        ontology_id : str, optional
            The ontology ID to search within.
        exact : bool, optional
            Whether to perform an exact search (default is False).
        obsolete : bool, optional
            Whether to include obsolete classes (default is False).
        page_from : int, optional
            The starting page number (default is 0).
        page_size : int, optional
            The number of results per page (default is 10).

        Returns
        -------
        dict
            The JSON response from the search endpoint. If the request fails
            the error is logged and ``None`` is returned instead.

        Examples
        --------
        >>> result = crb.search_classes("diabetes")
        """
        candidate_params = {
            "q": query,
            "ontology": ontology_id,
            "from": page_from,
            "size": page_size
        }
        # Drop unset parameters so they are not sent as empty query arguments.
        params = {k: v for k, v in candidate_params.items() if v is not None}
        # '/obsolete' comes before '/exact' when both flags are set.
        endpoint_suffix = ''
        if obsolete:
            endpoint_suffix += '/obsolete'
        if exact:
            endpoint_suffix += '/exact'
        search_endpoint = f"{self.centree_url}/api/search{endpoint_suffix}"
        try:
            response = self.session.get(
                search_endpoint,
                params=params,
                headers=self.headers,
                verify=self.verification,
                timeout=self.timeout
            )
            response.raise_for_status()
            self.logger.info("Search request successful")
            return response.json()
        except requests.exceptions.HTTPError as http_err:
            self.logger.error(f"HTTP error occurred: {http_err}")
        except requests.exceptions.RequestException as req_err:
            self.logger.error(f"Request error occurred: {req_err}")
        except Exception as err:
            self.logger.error(f"An error occurred: {err}")
        # Best-effort contract: failures are logged above, not raised.
        return None

scibite_toolkit/docstore.py ADDED Viewed

@@ -0,0 +1,324 @@
+"""
+  ____       _ ____  _ _         _____           _ _    _ _
+ / ___|  ___(_) __ )(_) |_ ___  |_   _|__   ___ | | | _(_) |_
+ \___ \ / __| |  _ \| | __/ _ \   | |/ _ \ / _ \| | |/ / | __|
+  ___) | (__| | |_) | | ||  __/   | | (_) | (_) | |   <| | |_
+ |____/ \___|_|____/|_|\__\___|   |_|\___/ \___/|_|_|\_\_|\__|
+Preprocessing functions- using your TERMite output to make AI-ready data
+"""
+__author__ = 'SciBite'
+__copyright__ = '(c) 2024, SciBite Ltd'
+__license__ = 'Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License'
+import requests
+import pandas as pd
class DocStoreRequestBuilder:
    """
    Class for creating DOCStore requests

    Holds the base URL, the basic-auth credentials and the SSL verification
    setting, and exposes one method per DOCStore search/lookup endpoint. Every
    method returns the decoded JSON body of the response.
    """

    def __init__(self):
        self.url = ""
        self.input_file_path = ''
        self.payload = {"output": "json"}
        self.options = {}
        self.binary_content = None
        self.basic_auth = ()
        self.verify_request = True

    def set_basic_auth(self, username='', password='', verification=True):
        """
        Pass basic authentication credentials
        **ONLY change verification if you are calling a known source**

        :param username: username to be used for basic authentication
        :param password: password to be used for basic authentication
        :param verification: if set to False requests will ignore verifying the SSL certificate, can also pass the path
        to a certificate file
        """
        self.basic_auth = (username, password)
        self.verify_request = verification

    def set_url(self, url):
        """
        Set the URL of the DOCStore instance

        :param url: the URL of the DOCStore instance to be hit (trailing slashes are stripped)
        """
        self.url = url.rstrip('/')

    @staticmethod
    def _merge_options(defaults, options_dict):
        """
        Overlay caller-supplied options onto an endpoint's default parameters.

        Only keys already present in ``defaults`` are overridden; unknown keys
        are ignored. ``None`` or an empty mapping leaves the defaults as is.

        :param defaults: default query parameters (updated in place)
        :param options_dict: caller overrides, or None
        :return: the updated ``defaults`` dict
        """
        if options_dict:
            for key, value in options_dict.items():
                if key in defaults:
                    defaults[key] = value
        return defaults

    def _get_json(self, path, params):
        """
        GET ``self.url + path`` using the stored credentials and SSL setting.

        :param path: endpoint path, starting with '/'
        :param params: query parameters
        :return: decoded JSON body of the response
        """
        response = requests.get(
            self.url + path,
            params=params,
            auth=self.basic_auth,
            # Honour the setting stored by set_basic_auth().
            verify=self.verify_request,
        )
        return response.json()

    def get_dcc_docs(self, entity_list, source='*', options_dict=None):
        """
        - Document co-occurrence -
        Retrieve document co-occurrence of provided entities

        :param entity_list: list of entities to be searched for
        :param source: name of data source(s) to be searched against
        :param options_dict: search parameters
        :return: results of search in json format
        """
        options = {"fmt": "json",
                   "fields": "*",
                   "terms": " ".join(entity_list),
                   "limit": "10",
                   "from": "0",
                   "facettype": "NONE",
                   "significantTerms": "false",
                   "excludehits": "false",
                   "sortby": "document_date:desc",
                   }
        self._merge_options(options, options_dict)
        return self._get_json("/api/ds/v1/search/co/document/{}/*/*/*".format(source), options)

    def get_boolean_docs(self, query_string, source='*', options_dict=None):
        """
        - Document-level query of Doc Store -
        Document-level query of Doc Store, produces both hit and facet data

        :param query_string: query to be completed
        :param source: name of data source(s) to be searched against
        :param options_dict: search parameters
        :return: results of search in json format
        """
        options = {"fmt": "json",
                   "fields": "*",
                   "query": query_string,
                   "limit": "10",
                   "from": "0",
                   "facettype": "NONE",
                   "significantTerms": "false",
                   "excludehits": "false",
                   "sortby": "document_date:desc",
                   "filters": ""
                   }
        self._merge_options(options, options_dict)
        return self._get_json("/api/ds/v1/search/document/{}/*/*/*".format(source), options)

    def get_docs(self, query_string, source='*', options_dict=None):
        """
        - Document-level query of Doc Store, returning only the documents hit,
        no facet data. -
        The output is TERMite/TEXpress ready

        :param query_string: query to be completed
        :param source: name of data source(s) to be searched against
        :param options_dict: search parameters
        :return: results of search in json format
        """
        options = {"fields": "*",
                   "fmt": "json",
                   "query": query_string,
                   "limit": "10",
                   "from": "0",
                   "sortby": "document_date:desc",
                   "filters": "",
                   "zip": "false",
                   "metaonly": "false"
                   }
        self._merge_options(options, options_dict)
        return self._get_json('/api/ds/v1/search/document/docs/{}/*/*/*'.format(source), options)

    def get_scc_docs(self, entity_list, source='*', options_dict=None):
        """
        - Sentence co-occurrence on entity ids or types, returns documents
        containing sentences fulfilling the co-occurrence. -

        :param entity_list: list of entities to be searched for
        :param source: name of data source(s) to be searched against
        :param options_dict: search parameters
        :return: results of search in json format
        """
        options = {"fmt": "json",
                   "fields": "*",
                   "terms": " ".join(entity_list),
                   "inorder": "false",
                   "slop": "2",
                   "limit": "10",
                   "from": "0",
                   "sortby": "document_date:desc",
                   "zip": "false"}
        self._merge_options(options, options_dict)
        return self._get_json(
            "/api/ds/v1/search/co/sentence/sentencedetail/flat/{}/*/*/*".format(source), options)

    def get_doc_by_id(self, doc_id, fmt='json'):
        """
        Retrieves document by its unique ID

        :param doc_id: unique ID of the document, sent as the ``uid`` parameter
        :param fmt: value of the ``fmt`` parameter; the response is always decoded as JSON
        :return: decoded JSON body of the response
        """
        options = {"fmt": fmt,
                   "uid": doc_id}
        return self._get_json("/api/ds/v1/lookup/doc", options)

    def entity_lookup_id(self, syn, entity_type, options_dict=None):
        """
        Lookup IDs for a synonym and type

        :param syn: synonym to look up
        :param entity_type: entity type, sent as the ``type`` parameter
        :param options_dict: accepted for signature compatibility; not used by this lookup
        :return: decoded JSON body of the response
        """
        options = {"syn": syn,
                   "type": entity_type}
        return self._get_json("/api/entity/v1/lookup/id", options)

    def get_facets_only(self, query_string, facetFilter, source='*', significantTerms=False, options_dict=None):
        """
        Document-level query of Doc Store, returning only the facets

        :param query_string: query to be completed
        :param facetFilter: value of the ``facetFilter`` parameter
        :param source: name of data source(s) to be searched against
        :param significantTerms: whether to request significant terms (bool, or the strings "true"/"false")
        :param options_dict: search parameters; overrides the defaults, including ``significantTerms``
        :return: results of search in json format
        """
        # Accept both booleans and their string spellings; anything else means "false".
        significant = "true" if str(significantTerms).lower() == "true" else "false"
        options = {"fmt": "json",
                   "fields": "*",
                   "query": query_string,
                   "facetFilter": facetFilter,
                   "limit": "10",
                   "from": "0",
                   "facettype": "BY_TYPE",
                   "significantTerms": significant,
                   "excludehits": "false",
                   }
        self._merge_options(options, options_dict)
        return self._get_json('/api/ds/v1/search/document/facets/{}/*/*/*'.format(source), options)
def get_docstore_dcc_df(json):
    """
    Converts document co-occurrence json into a dataframe

    One row is produced per entry of ``json["hits"]``. ``id`` is required on
    every hit; the date, title, authors and citation fall back to an empty
    string when absent. The title is rebuilt by joining the ``p`` value of
    each entry in the first highlighted section's ``titleWords``.

    :param json: dcc json (a dict with a "hits" list)
    :return: dcc dataframe with columns document_id, document_date, title,
        authors, citation (present even when there are no hits)
    """
    columns = ["document_id", "document_date", "title", "authors", "citation"]
    df_rows = []
    for hit in json["hits"]:
        # Keep only the first 10 characters of the date; a missing or
        # non-sliceable value yields "".
        try:
            doc_date = hit["documentDate"][0:10]
        except (KeyError, TypeError):
            doc_date = ""
        # A hit without highlighted title words gets an empty title rather
        # than aborting the whole conversion.
        sections = hit.get("highlightedSections") or [{}]
        title_words = sections[0].get("titleWords") or []
        title = " ".join(word["p"].rstrip() for word in title_words)
        df_rows.append({
            "document_id": hit["id"],
            "document_date": doc_date,
            "title": title,
            "authors": hit.get("authors", ""),
            "citation": hit.get("citation", ""),
        })
    # Explicit columns keep the frame's shape stable for an empty hit list.
    return pd.DataFrame(df_rows, columns=columns)
def get_docstore_scc_df(json):
    """
    Converts sentence co-occurrence json into a dataframe

    One row is produced per entry of ``json["hits"]``. ``docId`` and
    ``sentence`` are required on every hit; ``docDate`` is optional and is
    truncated to its first 10 characters.

    :param json: scc json (a dict with a "hits" list)
    :return: scc dataframe with columns document_id, document_date, scc_sentence
    """
    df_rows = []
    for hit in json["hits"]:
        # A missing or non-sliceable date yields ""; other errors propagate.
        try:
            doc_date = hit["docDate"][0:10]
        except (KeyError, TypeError):
            doc_date = ""
        df_rows.append({
            "document_id": hit["docId"],
            "document_date": doc_date,
            "scc_sentence": hit["sentence"],
        })
    return pd.DataFrame(df_rows, columns=["document_id", "document_date", "scc_sentence"])