PyPI - biolmai - Versions diffs - 0.1.3__tar.gz → 0.1.5__tar.gz - Mend

biolmai 0.1.3__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biolmai might be problematic. Click here for more details.

Files changed (74) hide show

{biolmai-0.1.3 → biolmai-0.1.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: biolmai
-Version: 0.1.3
+Version: 0.1.5
 Summary: Python client and SDK for https://biolm.ai
 Home-page: https://github.com/BioLM/py-biolm
 Author: Nikhil Haas

biolmai-0.1.5/biolmai/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+"""Top-level package for BioLM AI."""
+__author__ = """Nikhil Haas"""
+__email__ = 'nikhil@biolm.ai'
+__version__ = '0.1.5'
+from biolmai.auth import get_api_token
+from biolmai.cls import ESMFoldSingleChain, ESMFoldMultiChain, ESM2Embeddings, ESM1v1, ESM1v2, ESM1v3, ESM1v4, ESM1v5
+# Names imported above are re-exported as the package's public API; listing
+# them here makes ``from biolmai import *`` expose them.
+__all__ = [
+    'get_api_token',
+    'ESMFoldSingleChain',
+    'ESMFoldMultiChain',
+    'ESM2Embeddings',
+    'ESM1v1',
+    'ESM1v2',
+    'ESM1v3',
+    'ESM1v4',
+    'ESM1v5',
+]

{biolmai-0.1.3 → biolmai-0.1.5}/biolmai/api.py RENAMED Viewed

@@ -1,61 +1,22 @@
 """References to API endpoints."""
-from biolmai import biolmai
+import datetime
+import time
+import requests
+from requests.adapters import HTTPAdapter
+import biolmai.auth
+import biolmai
 import inspect
 import pandas as pd
 import numpy as np
+from biolmai.asynch import async_api_call_wrapper
-from biolmai.biolmai import get_user_auth_header
+from biolmai.biolmai import log
 from biolmai.const import MULTIPROCESS_THREADS
-if MULTIPROCESS_THREADS:
-    from pandarallel import pandarallel
-    pandarallel.initialize(progress_bar=False,
-                           nb_workers=int(MULTIPROCESS_THREADS), verbose=2)
 from functools import lru_cache
-from biolmai.payloads import INST_DAT_TXT
-from biolmai.validate import UnambiguousAA
-def predict_resp_many_in_one_to_many_singles(resp_json, status_code,
-                                             batch_id, local_err, batch_size):
-    expected_root_key = 'predictions'
-    to_ret = []
-    if not local_err and status_code and status_code == 200:
-        list_of_individual_seq_results = resp_json[expected_root_key]
-    elif local_err:
-        list_of_individual_seq_results = [{'error': resp_json}]
-    elif status_code and status_code != 200 and isinstance(resp_json, dict):
-        list_of_individual_seq_results = [resp_json] * batch_size
-    else:
-        raise ValueError("Unexpected response in parser")
-    for idx, item in enumerate(list_of_individual_seq_results):
-        d = {'status_code': status_code,
-             'batch_id': batch_id,
-             'batch_item': idx}
-        if not status_code or status_code != 200:
-            d.update(item)  # Put all resp keys at root there
-        else:
-            # We just append one item, mimicking a single seq in POST req/resp
-            d[expected_root_key] = []
-            d[expected_root_key].append(item)
-        to_ret.append(d)
-    return to_ret
-def api_call_wrapper(df, args):
-    """Wrap API calls to assist with sequence validation as a pre-cursor to
-    each API call.
-    """
-    model_name, action, payload_maker, response_key = args
-    payload = payload_maker(df)
-    headers = get_user_auth_header()  # Need to pull each time
-    api_resp = biolmai.api_call(model_name, action, headers, payload,
-                                response_key)
-    resp_json = api_resp.json()
-    batch_id = int(df.batch.iloc[0])
-    batch_size = df.shape[0]
-    response = predict_resp_many_in_one_to_many_singles(
-        resp_json, api_resp.status_code, batch_id, None, batch_size)
-    return response
+from biolmai.payloads import INST_DAT_TXT, predict_resp_many_in_one_to_many_singles
 @lru_cache(maxsize=64)
@@ -92,7 +53,8 @@ def validate(f):
         # Is the function we decorated a class method?
         if is_method:
-            name = '{}.{}.{}'.format(f.__module__, args[0].__class__.__name__,
+            name = '{}.{}.{}'.format(f.__module__,
+                                     class_obj_self.__class__.__name__,
                                      f.__name__)
         else:
             name = '{}.{}'.format(f.__module__, f.__name__)
@@ -111,9 +73,9 @@ def validate(f):
         for c in class_obj_self.seq_classes:
             # Validate input data against regex
             if class_obj_self.multiprocess_threads:
-                validation = input_data.text.parallel_apply(text_validator, args=(c(), ))
+                validation = input_data.text.apply(text_validator, args=(c, ))
             else:
-                validation = input_data.text.apply(text_validator, args=(c(), ))
+                validation = input_data.text.apply(text_validator, args=(c, ))
             if 'validation' not in input_data.columns:
                 input_data['validation'] = validation
             else:
@@ -138,7 +100,7 @@ def validate(f):
 def convert_input(f):
     def wrapper(*args, **kwargs):
-    # Get the user-input data argument to the decorated function
+        # Get the user-input data argument to the decorated function
         class_obj_self = args[0]
         input_data = args[1]
         # Make sure we have expected input types
@@ -172,44 +134,35 @@ class APIEndpoint(object):
         else:
             self.multiprocess_threads = MULTIPROCESS_THREADS  # Could be False
         # Get correct auth-like headers
-        self.auth_headers = biolmai.get_user_auth_header()
+        self.auth_headers = biolmai.auth.get_user_auth_header()
         self.action_class_strings = tuple([
             c.__name__.replace('Action', '').lower() for c in self.action_classes
         ])
-    @convert_input
-    @validate
-    def predict(self, dat):
+    def post_batches(self, dat, slug, action, payload_maker, resp_key):
         keep_batches = dat.loc[~dat.batch.isnull(), ['text', 'batch']]
         if keep_batches.shape[0] == 0:
-            err = "No inputs found following local validation"
+            pass  # Do nothing - we made nice JSON errors to return in the DF
+            # err = "No inputs found following local validation"
             # raise AssertionError(err)
-        elif self.multiprocess_threads:
-            api_resps = keep_batches.groupby('batch').parallel_apply(
-                api_call_wrapper,
-                (
-                    self.slug,
-                    'predict',
-                    INST_DAT_TXT,
-                    'predictions'
-                ),
-            )
-        else:
-            api_resps = keep_batches.groupby('batch').apply(
-                api_call_wrapper,
-                (
-                    self.slug,
-                    'predict',
-                    INST_DAT_TXT,
-                    'predictions'
-                ),
-            )
         if keep_batches.shape[0] > 0:
-            batch_res = api_resps.explode('api_resp')  # Should be lists of results
+            api_resps = async_api_call_wrapper(
+                keep_batches,
+                slug,
+                action,
+                payload_maker,
+                resp_key
+            )
+            if isinstance(api_resps, pd.DataFrame):
+                batch_res = api_resps.explode('api_resp')  # Should be lists of results
+                len_res = batch_res.shape[0]
+            else:
+                batch_res = pd.DataFrame({'api_resp': api_resps})
+                len_res = batch_res.shape[0]
             orig_request_rows = keep_batches.shape[0]
-            if batch_res.shape[0] != orig_request_rows:
+            if len_res != orig_request_rows:
                 err = "Response rows ({}) mismatch with input rows ({})"
-                err = err.format(batch_res.shape[0], orig_request_rows)
+                err = err.format(len_res, orig_request_rows)
                 raise AssertionError(err)
             # Stack the results horizontally w/ original rows of batches
@@ -221,28 +174,100 @@ class APIEndpoint(object):
             dat = dat.join(keep_batches.reindex(['api_resp'], axis=1))
         else:
             dat['api_resp'] = None
+        return dat
+    def unpack_local_validations(self, dat):
         dat.loc[
             dat.api_resp.isnull(), 'api_resp'
         ] = dat.loc[~dat.validation.isnull(), 'validation'].apply(
             predict_resp_many_in_one_to_many_singles,
             args=(None, None, True, None)).explode()
+        return dat
+    @convert_input
+    @validate
+    def predict(self, dat):
+        """Run this endpoint's 'predict' action on ``dat``.
+        The batched rows are POSTed through ``post_batches`` (payloads built
+        by INST_DAT_TXT, results read from the 'predictions' key), then rows
+        still lacking an API response are filled in by
+        ``unpack_local_validations``.
+        Returns the ``api_resp`` column as a list, with NaN replaced by None.
+        """
+        # NOTE(review): ``dat`` is presumably the DataFrame produced by the
+        # two decorators above - confirm against convert_input/validate.
+        dat = self.post_batches(dat, self.slug, 'predict', INST_DAT_TXT, 'predictions')
+        dat = self.unpack_local_validations(dat)
+        return dat.api_resp.replace(np.nan, None).tolist()
     def infer(self, dat):
         """Alias for ``predict``: forwards ``dat`` and returns its result."""
         return self.predict(dat)
+    @convert_input
     @validate
-    def tokenize(self, dat):
-        payload = {"instances": [{"data": {"text": dat}}]}
-        resp = biolmai.api_call(
-            model_name=self.slug,
-            headers=self.auth_headers,  # From APIEndpoint base class
-            action='tokenize',
-            payload=payload
-        )
-        return resp
+    def transform(self, dat):
+        dat = self.post_batches(dat, self.slug, 'transform', INST_DAT_TXT, 'predictions')
+        dat = self.unpack_local_validations(dat)
+        return dat.api_resp.replace(np.nan, None).tolist()
+    @convert_input
+    @validate
+    def generate(self, dat):
+        """Run this endpoint's 'generate' action on ``dat``.
+        The batched rows are POSTed through ``post_batches`` (payloads built
+        by INST_DAT_TXT, results read from the 'generated' key), then rows
+        still lacking an API response are filled in by
+        ``unpack_local_validations``.
+        Returns the ``api_resp`` column as a list, with NaN replaced by None.
+        """
+        dat = self.post_batches(dat, self.slug, 'generate', INST_DAT_TXT, 'generated')
+        dat = self.unpack_local_validations(dat)
+        return dat.api_resp.replace(np.nan, None).tolist()
+def retry_minutes(sess, URL, HEADERS, dat, timeout, mins):
+    """POST ``dat`` to ``URL``, retrying on failure for up to ``mins`` minutes.
+    At most five attempts are made, five seconds apart. An attempt fails when
+    the request raises, when the status is an HTTP error other than 400/404,
+    when the body cannot be parsed as JSON, or when the JSON has an 'error'
+    key.
+    :param sess: object exposing ``post`` (e.g. a ``requests.Session``).
+    :param URL: endpoint to POST to.
+    :param HEADERS: request headers; not modified, a copy gets the JSON
+        Content-Type added.
+    :param dat: request body, passed as ``data=``.
+    :param timeout: passed through to ``sess.post``.
+    :param mins: time budget in minutes.
+    :returns: the response from the last attempt, or ``None`` if that attempt
+        produced no response or no attempt was made. Never raises for HTTP
+        failures; callers must inspect the returned value.
+    """
+    import json  # Local import: only needed to render error payloads
+    # Copy so the caller's header dict is not changed as a side effect
+    headers = dict(HEADERS)
+    headers['Content-Type'] = 'application/json'
+    attempts, max_attempts = 0, 5
+    # Bound up front so the return below is safe even if the loop never runs
+    response = None
+    try_until = datetime.datetime.now() + datetime.timedelta(minutes=mins)
+    while datetime.datetime.now() < try_until and attempts < max_attempts:
+        response = None
+        try:
+            log.info('Trying {}'.format(datetime.datetime.now()))
+            response = sess.post(
+                URL,
+                headers=headers,
+                data=dat,
+                timeout=timeout
+            )
+            # 400/404 are treated as final answers rather than retried
+            if response.status_code not in (400, 404):
+                response.raise_for_status()
+            body = response.json()
+            if 'error' in body:
+                raise ValueError(json.dumps(body))
+            break
+        except Exception as e:
+            log.warning(e)
+            # Compare against None: a requests.Response is falsy for 4xx/5xx
+            # statuses, which are exactly the bodies worth logging here
+            if response is not None:
+                log.warning(response.text)
+            time.sleep(5)  # Wait 5 seconds between tries
+        attempts += 1
+    # Callers always get whatever the last attempt produced; record why it
+    # may be unusable instead of raising an error nobody can observe
+    if response is None:
+        log.warning("Got Nonetype response")
+    elif 'Server Error' in response.text:
+        log.warning("Got Server Error")
+    return response
+def requests_retry_session(
+    retries=3,
+    backoff_factor=0.3,
+    status_forcelist=list(range(400, 599)),
+    session=None,
+):
+    """Return a session whose HTTP and HTTPS adapters retry failed requests.
+    ``retries`` caps total, read and connect retries alike; ``backoff_factor``
+    and ``status_forcelist`` are passed straight to urllib3's ``Retry``.
+    A new ``requests.Session`` is created unless ``session`` is given, in
+    which case that session gets the adapters mounted and is returned.
+    """
+    # ``Retry`` is not among this module's imports, so bring it into scope
+    # here; requests.adapters exposes urllib3's Retry class.
+    from requests.adapters import Retry
+    if session is None:
+        session = requests.Session()
+    retry = Retry(
+        total=retries,
+        read=retries,
+        connect=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist
+    )
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount('http://', adapter)
+    session.mount('https://', adapter)
+    return session
 class PredictAction(object):
@@ -250,21 +275,25 @@ class PredictAction(object):
     def __str__(self):
         return 'PredictAction'
 class GenerateAction(object):
     def __str__(self):
         return 'GenerateAction'
-class TokenizeAction(object):
+class TransformAction(object):
     def __str__(self):
-        return 'TokenizeAction'
+        return 'TransformAction'
 class ExplainAction(object):
     def __str__(self):
         return 'ExplainAction'
 class SimilarityAction(object):
     def __str__(self):
@@ -275,17 +304,3 @@ class FinetuneAction(object):
     def __str__(self):
         return 'FinetuneAction'
-class ESMFoldSingleChain(APIEndpoint):
-    slug = 'esmfold-singlechain'
-    action_classes = (PredictAction, )
-    seq_classes = (UnambiguousAA, )
-    batch_size = 2
-class ESMFoldMultiChain(APIEndpoint):
-    slug = 'esmfold-multichain'
-    action_classes = (PredictAction, )
-    seq_classes = (UnambiguousAA, )
-    batch_size = 2

biolmai-0.1.5/biolmai/asynch.py ADDED Viewed

@@ -0,0 +1,224 @@
+import aiohttp.resolver
+from biolmai.auth import get_user_auth_header
+from biolmai.const import BASE_API_URL, MULTIPROCESS_THREADS
+aiohttp.resolver.DefaultResolver = aiohttp.resolver.AsyncResolver
+from aiohttp import ClientSession, TCPConnector
+from typing import List
+import json
+import asyncio
+from asyncio import create_task, gather, run, sleep
+async def get_one(session: ClientSession, slug: str, action: str,
+                  payload: dict, response_key: str):
+    """Placeholder with no behavior.
+    A second ``get_one`` defined later in this module rebinds the name, so
+    after import this version is no longer reachable as ``get_one``.
+    """
+    pass
+from aiohttp import ClientSession
+async def get_one(session: ClientSession, url: str) -> str:
+    """GET ``url`` and return the first line of its stripped response body.
+    Progress is reported with ``print`` before and after the request.
+    """
+    print("Requesting", url)
+    async with session.get(url) as resp:
+        text = await resp.text()
+        # Keep only the first line of the whitespace-stripped body
+        text_resp = text.strip().split("\n", 1)[0]
+        print("Got response from", url, text_resp)
+        return text_resp
+async def get_one_biolm(session: ClientSession,
+                        url: str,
+                        pload: dict,
+                        headers: dict,
+                        response_key: str = None) -> List:
+    """POST one batch payload and split the reply into per-item results.
+    ``pload`` must carry the bookkeeping keys 'batch' and 'batch_size'; both
+    are stripped from the request body before it is sent.
+    Returns one dict per item holding 'status_code', 'batch_id' and
+    'batch_item'. On HTTP 200 each item is wrapped in a one-element list under
+    ``response_key``; otherwise the keys of the JSON error body are merged
+    into each of the ``batch_size`` dicts.
+    Raises ValueError when a non-200 reply is not a JSON object.
+    """
+    print("Requesting", url)
+    # Work on a copy so the caller's payload keeps its bookkeeping keys
+    pload = dict(pload)
+    pload_batch = pload.pop('batch')
+    pload_batch_size = pload.pop('batch_size')
+    t = aiohttp.ClientTimeout(
+        total=1600,  # Seconds for the whole request (~27 mins)
+        sock_connect=None,  # No separate limit on connecting to the peer
+        sock_read=None  # No separate limit on reading a portion of data
+    )
+    async with session.post(url, headers=headers, json=pload, timeout=t) as resp:
+        resp_json = await resp.json()
+        status_code = resp.status
+        # Only a JSON object can be tagged; anything else is handled below
+        if isinstance(resp_json, dict):
+            resp_json['batch'] = pload_batch
+        if status_code == 200:
+            list_of_individual_seq_results = resp_json[response_key]
+        elif status_code and isinstance(resp_json, dict):
+            # One copy of the error per input so row counts still line up
+            list_of_individual_seq_results = [resp_json] * pload_batch_size
+        else:
+            raise ValueError("Unexpected response in parser")
+        to_ret = []
+        for idx, item in enumerate(list_of_individual_seq_results):
+            d = {'status_code': status_code,
+                 'batch_id': pload_batch,
+                 'batch_item': idx}
+            if status_code != 200:
+                d.update(item)  # Put all resp keys at root there
+            else:
+                # We just append one item, mimicking a single seq in POST req/resp
+                d[response_key] = [item]
+            to_ret.append(d)
+        return to_ret
+async def async_range(count):
+    """Asynchronous counterpart of ``range(count)``.
+    After each value is yielded, control is handed back to the event loop
+    with a zero-length sleep.
+    """
+    for value in range(count):
+        yield value
+        await asyncio.sleep(0.0)
+async def get_all(urls: List[str], num_concurrent: int) -> List:
+    """Fetch every URL with ``get_one`` in waves of ``num_concurrent``.
+    All requests of a wave are awaited together before the next wave is
+    started; the results of every wave are collected into one list.
+    """
+    remaining = iter(urls)
+    exhausted = False
+    collected = []
+    async with ClientSession() as session:
+        while not exhausted:
+            wave = []
+            for _ in range(num_concurrent):
+                try:
+                    target = next(remaining)
+                except StopIteration:
+                    # Nothing left to schedule; finish after this wave
+                    exhausted = True
+                    break
+                wave.append(create_task(get_one(session, target)))
+            collected.extend(await gather(*wave))
+    return collected
+async def get_all_biolm(url: str,
+                        ploads: List[dict],
+                        headers: dict,
+                        num_concurrent: int,
+                        response_key: str = None) -> List:
+    """POST every payload to ``url`` via ``get_one_biolm`` in waves.
+    Up to ``num_concurrent`` requests are started per wave and awaited
+    together before the next wave begins. Returns the results of all waves
+    in one list.
+    """
+    remaining = iter(ploads)
+    exhausted = False
+    collected = []
+    pool = aiohttp.TCPConnector(limit=100,
+                                limit_per_host=50,
+                                ttl_dns_cache=60)
+    # Session-wide limits are all switched off (None means unlimited)
+    session_timeout = aiohttp.ClientTimeout(
+        total=None,  # Whole-request budget, including waiting for a pooled connection
+        sock_connect=None,  # Connecting to a peer for a new connection
+        sock_read=None  # Reading a portion of data from a peer
+    )
+    async with ClientSession(connector=pool, timeout=session_timeout) as session:
+        while not exhausted:
+            wave = []
+            for _ in range(num_concurrent):
+                try:
+                    pload = next(remaining)
+                except StopIteration:
+                    # Nothing left to schedule; finish after this wave
+                    exhausted = True
+                    break
+                wave.append(create_task(
+                    get_one_biolm(session, url, pload, headers, response_key)))
+            collected.extend(await gather(*wave))
+    return collected
+async def async_main(urls, concurrency) -> List:
+    """Await ``get_all(urls, concurrency)`` and return its list of results."""
+    return await get_all(urls, concurrency)
+async def async_api_calls(model_name,
+                          action,
+                          headers,
+                          payloads,
+                          response_key=None):
+    """Hit an arbitrary BioLM model inference API.
+    Builds ``{BASE_API_URL}/models/{model_name}/{action}/`` and POSTs
+    ``payloads`` to it through ``get_all_biolm``, returning that call's
+    result.
+    Raises AssertionError when ``payloads`` is neither a list nor a dict.
+    """
+    url = f'{BASE_API_URL}/models/{model_name}/{action}/'
+    if not isinstance(payloads, (list, dict)):
+        err = "API request payload must be a list or dict, got {}"
+        raise AssertionError(err.format(type(payloads)))
+    # MULTIPROCESS_THREADS may be falsy (api.py notes it "Could be False").
+    # A concurrency of 0 would make get_all_biolm loop forever without ever
+    # consuming a payload, so always allow at least one request in flight.
+    concurrency = max(1, int(MULTIPROCESS_THREADS or 1))
+    # NOTE: the earlier synchronous implementation retried the request and
+    # refreshed an expired token on HTTP 401; this async path does neither.
+    return await get_all_biolm(url, payloads, headers, concurrency,
+                               response_key)
+def async_api_call_wrapper(grouped_df, slug, action, payload_maker,
+                           response_key):
+    """Build one payload per batch, POST them all, and return the results.
+    ``grouped_df`` is grouped on its 'batch' column and ``payload_maker`` is
+    applied to each group with ``include_batch_size=True``. Every payload is
+    tagged with its batch id and sent through ``async_api_calls`` using
+    freshly read auth headers.
+    Returns a flat list of per-item result dicts sorted by 'batch_id'.
+    """
+    per_batch = grouped_df.groupby('batch').apply(payload_maker, include_batch_size=True)
+    ploads = per_batch.to_list()
+    # The groupby index holds each payload's batch id, in matching order
+    for inst, b in zip(ploads, per_batch.index.to_list()):
+        inst['batch'] = b
+    headers = get_user_auth_header()  # Need to pull each time
+    api_resp = run(async_api_calls(slug, action, headers,
+                                   ploads, response_key))
+    # Each request yields a list of per-item dicts; flatten before sorting
+    api_resp = [item for sublist in api_resp for item in sublist]
+    return sorted(api_resp, key=lambda x: x['batch_id'])

{biolmai-0.1.3 → biolmai-0.1.5}/biolmai/auth.py RENAMED Viewed

@@ -125,3 +125,46 @@ def save_access_refresh_token(access_refresh_dict):
     access = access_refresh_dict.get('access')
     refresh = access_refresh_dict.get('refresh')
     validate_user_auth(access=access, refresh=refresh)
+def get_api_token():
+    """Get a BioLM API token to use with future API requests.
+    Copied from https://api.biolm.ai/#d7f87dfd-321f-45ae-99b6-eb203519ddeb.
+    The BIOLM_USER and BIOLM_PASSWORD environment variables are sent as a
+    JSON body to the token endpoint; the decoded JSON reply is returned.
+    """
+    credentials = {
+        "username": os.environ.get("BIOLM_USER"),
+        "password": os.environ.get("BIOLM_PASSWORD"),
+    }
+    response = requests.request(
+        "POST",
+        "https://biolm.ai/api/auth/token/",
+        headers={'Content-Type': 'application/json'},
+        data=json.dumps(credentials),
+    )
+    return response.json()
+def get_user_auth_header():
+    """Returns a dict with the appropriate Authorization header, either using
+    an API token from BIOLMAI_TOKEN environment variable, or by reading the
+    credentials file at ~/.biolmai/credentials next.
+    Raises AssertionError when neither the variable nor the file is found."""
+    api_token = os.environ.get('BIOLMAI_TOKEN', None)
+    if api_token:
+        # An explicit API token takes precedence over stored credentials
+        headers = {'Authorization': f'Token {api_token}'}
+    elif os.path.exists(ACCESS_TOK_PATH):
+        with open(ACCESS_TOK_PATH, 'r') as f:
+            access_refresh_dict = json.load(f)
+        access = access_refresh_dict.get('access')
+        refresh = access_refresh_dict.get('refresh')
+        # Stored credentials are sent as cookies rather than an
+        # Authorization header
+        headers = {
+            'Cookie': 'access={};refresh={}'.format(access, refresh),
+            'Content-Type': 'application/json'
+        }
+    else:
+        err = "No https://biolm.ai credentials found. Please run `biolmai status` to debug."
+        raise AssertionError(err)
+    return headers

biolmai-0.1.5/biolmai/biolmai.py ADDED Viewed

@@ -0,0 +1,7 @@
+"""Main module."""
+import logging
+log = logging.getLogger('biolm_util')

biolmai 0.1.3__tar.gz → 0.1.5__tar.gz

Potentially problematic release.

biolmai 0.1.3__tar.gz → 0.1.5__tar.gz