PyPI - haystack-ml-stack - Versions diffs - 0.2.3__tar.gz → 0.2.5__tar.gz - Mend

haystack-ml-stack 0.2.3__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

{haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: haystack-ml-stack
-Version: 0.2.3
+Version: 0.2.5
 Summary: Functions related to Haystack ML
 Author-email: Oscar Vega <oscar@haystack.tv>
 License: MIT

{haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.5}/pyproject.toml RENAMED Viewed

@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "haystack-ml-stack"
-version = "0.2.3"
+version = "0.2.5"
 description = "Functions related to Haystack ML"
 readme = "README.md"
 authors = [{ name = "Oscar Vega", email = "oscar@haystack.tv" }]

{haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.5}/src/haystack_ml_stack/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
 from .app import create_app
 __all__ = ["create_app"]
-__version__ = "0.2.3"
+__version__ = "0.2.5"

{haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.5}/src/haystack_ml_stack/app.py RENAMED Viewed

@@ -5,10 +5,13 @@ import sys
 from http import HTTPStatus
 from typing import Any, Dict, List, Optional
 import time
+from contextlib import asynccontextmanager, AsyncExitStack
 import aiobotocore.session
+from aiobotocore.config import AioConfig
 from fastapi import FastAPI, HTTPException, Request, Response
 from fastapi.encoders import jsonable_encoder
+import newrelic.agent
 from .cache import make_features_cache
@@ -24,8 +27,7 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
-import newrelic.agent
+MAX_POOL_CONNECTIONS = int(os.environ.get("MAX_POOL_CONNECTIONS", 50))
 def create_app(
@@ -39,12 +41,6 @@ def create_app(
     """
     cfg = settings or Settings()
-    app = FastAPI(
-        title="ML Stream Scorer",
-        description="Scores video streams using a pre-trained ML model and DynamoDB features.",
-        version="1.0.0",
-    )
     # Mutable state: cache + model
     features_cache = make_features_cache(cfg.cache_maxsize)
     state: Dict[str, Any] = {
@@ -55,24 +51,59 @@ def create_app(
         ),
     }
-    @app.on_event("startup")
-    async def _startup() -> None:
-        if state["model"] is not None:
-            logger.info("Using preloaded model.")
-            return
-        if not cfg.s3_model_path:
-            logger.critical("S3_MODEL_PATH not set; service will be unhealthy.")
-            return
-        try:
-            state["model"] = await download_and_load_model(
-                cfg.s3_model_path, aio_session=state["session"]
+    @asynccontextmanager
+    async def lifespan(app_server: FastAPI):
+        """
+        Handles startup and shutdown logic.
+        Everything before 'yield' runs on startup.
+        Everything after 'yield' runs on shutdown.
+        """
+        async with AsyncExitStack() as stack:
+            # 1. Initialize DynamoDB Client (Persistent Pool)
+            session = state["session"]
+            state["dynamo_client"] = await stack.enter_async_context(
+                session.create_client(
+                    "dynamodb",
+                    # Ensure the pool is large enough for ML concurrency
+                    config=AioConfig(max_pool_connections=MAX_POOL_CONNECTIONS),
+                )
             )
-            state["stream_features"] = state["model"].get("stream_features", [])
-            logger.info("Model loaded on startup.")
-        except Exception as e:
-            logger.critical("Failed to load model: %s", e)
+            logger.info("DynamoDB persistent client initialized.")
+            # 2. Load ML Model
+            if state["model"] is None:
+                if not cfg.s3_model_path:
+                    logger.critical("S3_MODEL_PATH not set; service will be unhealthy.")
+                else:
+                    try:
+                        # Pass the persistent session/client if needed
+                        state["model"] = await download_and_load_model(
+                            cfg.s3_model_path, aio_session=state["session"]
+                        )
+                        state["stream_features"] = state["model"].get(
+                            "stream_features", []
+                        )
+                        state["user_features"] = state["model"].get("user_features", [])
+                        newrelic.agent.add_custom_attribute(
+                            "total_stream_features", len(state["stream_features"])
+                        )
+                        logger.info("Model loaded successfully.")
+                    except Exception as e:
+                        logger.critical("Failed to load model: %s", e)
+            yield
+            # 3. Shutdown Logic
+            # The AsyncExitStack automatically closes the DynamoDB client pool here
+            logger.info("Shutting down: Connection pools closed.")
+    app = FastAPI(
+        title="ML Stream Scorer",
+        description="Scores video streams using a pre-trained ML model and DynamoDB features.",
+        version="1.0.0",
+        lifespan=lifespan,
+    )
     @app.get("/health", status_code=HTTPStatus.OK)
     async def health():
@@ -121,11 +152,16 @@ def create_app(
         model = state["model"]
         stream_features = model.get("stream_features", []) or []
         retrieval_meta = FeatureRetrievalMeta(
-            cache_misses=0, retrieval_ms=0, success=True, cache_delay_minutes=0
+            cache_misses=0,
+            retrieval_ms=0,
+            success=True,
+            cache_delay_minutes=0,
+            dynamo_ms=0,
+            parsing_ms=0,
         )
         if stream_features:
             retrieval_meta = await set_stream_features(
-                aio_session=state["session"],
+                dynamo_client=state["dynamo_client"],
                 streams=streams,
                 stream_features=stream_features,
                 features_cache=features_cache,
@@ -166,10 +202,12 @@ def create_app(
                 "cache_misses": retrieval_meta.cache_misses,
                 "retrieval_success": int(retrieval_meta.success),
                 "cache_delay_minutes": retrieval_meta.cache_delay_minutes,
+                "dynamo_ms": retrieval_meta.dynamo_ms,
+                "dynamo_parse_ms": retrieval_meta.parsing_ms,
                 "retrieval_ms": retrieval_meta.retrieval_ms,
                 "preprocess_ms": (predict_start - preprocess_start) * 1e-6,
                 "predict_ms": (predict_end - predict_start) * 1e-6,
-                "total_scores": len(model_output),
+                "total_streams": len(model_output),
             },
         )
         if model_output:

{haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.5}/src/haystack_ml_stack/cache.py RENAMED Viewed

@@ -5,14 +5,14 @@ from cachetools import TLRUCache
 def _ttu(_, value: Any, now: float) -> float:
     """Time-To-Use policy: allow per-item TTL via 'cache_ttl_in_seconds' or fallback."""
-    ONE_YEAR = 365 * 24 * 60 * 60
+    ONE_WEEK = 7 * 24 * 60 * 60
     try:
         ttl = int(value.get("cache_ttl_in_seconds", -1))
         if ttl > 0:
             return now + ttl
     except Exception:
         pass
-    return now + ONE_YEAR
+    return now + ONE_WEEK
 def make_features_cache(maxsize: int) -> TLRUCache:

{haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.5}/src/haystack_ml_stack/dynamo.py RENAMED Viewed

@@ -2,19 +2,29 @@ from typing import Any, Dict, List, NamedTuple
 import logging
 import time
 import datetime
-import aiobotocore.session
+from boto3.dynamodb.types import TypeDeserializer
 import newrelic.agent
+import asyncio
 logger = logging.getLogger(__name__)
+class FloatDeserializer(TypeDeserializer):
+    def _deserialize_n(self, value):
+        return float(value)
+_deser = FloatDeserializer()
 class FeatureRetrievalMeta(NamedTuple):
     cache_misses: int
     retrieval_ms: float
     success: bool
     cache_delay_minutes: float
+    dynamo_ms: float
+    parsing_ms: float
 @newrelic.agent.function_trace()
@@ -25,68 +35,54 @@ async def async_batch_get(
     Asynchronous batch_get_item with chunking for requests > 100 keys
     and handling for unprocessed keys.
     """
-    all_items: List[Dict[str, Any]] = []
     # DynamoDB's BatchGetItem has a 100-item limit per request.
     CHUNK_SIZE = 100
-    # Split the keys into chunks of 100
-    for i in range(0, len(keys), CHUNK_SIZE):
-        chunk_keys = keys[i : i + CHUNK_SIZE]
-        to_fetch = {table_name: {"Keys": chunk_keys}}
-        # Inner loop to handle unprocessed keys for the current chunk
-        # Max retries of 3
-        retries = 3
-        while to_fetch and retries > 0:
-            retries -= 1
-            try:
-                resp = await dynamo_client.batch_get_item(RequestItems=to_fetch)
-                if "Responses" in resp and table_name in resp["Responses"]:
-                    all_items.extend(resp["Responses"][table_name])
-                unprocessed = resp.get("UnprocessedKeys", {})
-                # If there are unprocessed keys, set them to be fetched in the next iteration
-                if unprocessed and unprocessed.get(table_name):
-                    logger.warning(
-                        "Retrying %d unprocessed keys.",
-                        len(unprocessed[table_name]["Keys"]),
-                    )
-                    to_fetch = unprocessed
-                else:
-                    # All keys in the chunk were processed, exit the inner loop
-                    to_fetch = {}
-            except Exception as e:
-                logger.error("Error during batch_get_item for a chunk: %s", e)
-                # Stop trying to process this chunk on error and move to the next
+    if len(keys) <= CHUNK_SIZE:
+        all_items = await _fetch_chunk(dynamo_client, table_name, keys)
+    else:
+        chunks = [keys[i : i + CHUNK_SIZE] for i in range(0, len(keys), CHUNK_SIZE)]
+        tasks = [_fetch_chunk(dynamo_client, table_name, chunk) for chunk in chunks]
+        results = await asyncio.gather(*tasks)
+        all_items = [item for batch in results for item in batch]
+    return all_items
+async def _fetch_chunk(dynamo_client, table_name: str, chunk_keys):
+    """Fetch a single chunk of up to 100 keys with retry handling."""
+    to_fetch = {table_name: {"Keys": chunk_keys}}
+    retries = 3
+    items = []
+    while to_fetch and retries > 0:
+        retries -= 1
+        try:
+            resp = await dynamo_client.batch_get_item(RequestItems=to_fetch)
+            # Collect retrieved items
+            if "Responses" in resp and table_name in resp["Responses"]:
+                items.extend(resp["Responses"][table_name])
+            # Check for unprocessed keys
+            unprocessed = resp.get("UnprocessedKeys", {})
+            if unprocessed and unprocessed.get(table_name):
+                unp = unprocessed[table_name]["Keys"]
+                logger.warning("Retrying %d unprocessed keys.", len(unp))
+                to_fetch = {table_name: {"Keys": unp}}
+            else:
                 to_fetch = {}
-    return all_items
+        except Exception as e:
+            logger.error("Error in batch_get_item chunk: %s", e)
+            break
+    return items
-@newrelic.agent.function_trace()
 def parse_dynamo_item(item: Dict[str, Any]) -> Dict[str, Any]:
     """Parse a DynamoDB attribute map (low-level) to Python types."""
-    out: Dict[str, Any] = {}
-    for k, v in item.items():
-        if "N" in v:
-            out[k] = float(v["N"])
-        elif "S" in v:
-            out[k] = v["S"]
-        elif "SS" in v:
-            out[k] = v["SS"]
-        elif "NS" in v:
-            out[k] = [float(n) for n in v["NS"]]
-        elif "BOOL" in v:
-            out[k] = v["BOOL"]
-        elif "NULL" in v:
-            out[k] = None
-        elif "L" in v:
-            out[k] = [parse_dynamo_item({"value": i})["value"] for i in v["L"]]
-        elif "M" in v:
-            out[k] = parse_dynamo_item(v["M"])
-    return out
+    # out: Dict[str, Any] = {}
+    return {k: _deser.deserialize(v) for k, v in item.items()}
 @newrelic.agent.function_trace()
@@ -98,7 +94,7 @@ async def set_stream_features(
     features_table: str,
     stream_pk_prefix: str,
     cache_sep: str,
-    aio_session: aiobotocore.session.Session | None = None,
+    dynamo_client,
 ) -> FeatureRetrievalMeta:
     time_start = time.perf_counter_ns()
     """Fetch missing features for streams from DynamoDB and fill them into streams."""
@@ -108,6 +104,8 @@ async def set_stream_features(
             retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
             success=True,
             cache_delay_minutes=0,
+            dynamo_ms=0,
+            parsing_ms=0,
         )
     cache_miss: Dict[str, Dict[str, Any]] = {}
@@ -122,7 +120,8 @@ async def set_stream_features(
                 if cached["value"] is not None:
                     s[f] = cached["value"]
                     cache_delay_obj[f] = max(
-                        cache_delay_obj[f], (now - cached["updated_at"]).total_seconds()
+                        cache_delay_obj[f],
+                        (now - cached["inserted_at"]).total_seconds(),
                     )
             else:
                 cache_miss[key] = s
@@ -135,6 +134,8 @@ async def set_stream_features(
             retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
             success=True,
             cache_delay_minutes=cache_delay / 60,
+            dynamo_ms=0,
+            parsing_ms=0,
         )
     cache_misses = len(cache_miss)
     logger.info("Cache miss for %d items", cache_misses)
@@ -146,19 +147,21 @@ async def set_stream_features(
         pk = f"{stream_pk_prefix}{stream_url}"
         keys.append({"pk": {"S": pk}, "sk": {"S": sk}})
-    session = aio_session or aiobotocore.session.get_session()
-    async with session.create_client("dynamodb") as dynamodb:
-        try:
-            items = await async_batch_get(dynamodb, features_table, keys)
-        except Exception as e:
-            logger.error("DynamoDB batch_get failed: %s", e)
-            return FeatureRetrievalMeta(
-                cache_misses=cache_misses,
-                retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
-                success=False,
-                cache_delay_minutes=cache_delay / 60,
-            )
+    dynamo_start = time.perf_counter_ns()
+    try:
+        items = await async_batch_get(dynamo_client, features_table, keys)
+    except Exception as e:
+        logger.error("DynamoDB batch_get failed: %s", e)
+        end_time = time.perf_counter_ns()
+        return FeatureRetrievalMeta(
+            cache_misses=cache_misses,
+            retrieval_ms=(end_time - time_start) * 1e-6,
+            success=False,
+            cache_delay_minutes=cache_delay / 60,
+            dynamo_ms=(end_time - dynamo_start) * 1e-6,
+            parsing_ms=0,
+        )
+    dynamo_end = time.perf_counter_ns()
     updated_keys = set()
     for item in items:
         stream_url = item["pk"]["S"].removeprefix(stream_pk_prefix)
@@ -169,22 +172,23 @@ async def set_stream_features(
         features_cache[cache_key] = {
             "value": parsed.get("value"),
             "cache_ttl_in_seconds": int(parsed.get("cache_ttl_in_seconds", -1)),
-            "updated_at": datetime.datetime.fromisoformat(
-                parsed.get("updated_at")
-            ).replace(tzinfo=None),
+            "inserted_at": datetime.datetime.utcnow(),
         }
         if cache_key in cache_miss:
             cache_miss[cache_key][feature_name] = parsed.get("value")
             updated_keys.add(cache_key)
+    parsing_end = time.perf_counter_ns()
     # Save keys that were not found in DynamoDB with None value
     if len(updated_keys) < len(cache_miss):
         missing_keys = set(cache_miss.keys()) - updated_keys
         for k in missing_keys:
             features_cache[k] = {"value": None, "cache_ttl_in_seconds": 300}
+    end_time = time.perf_counter_ns()
     return FeatureRetrievalMeta(
         cache_misses=cache_misses,
-        retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
+        retrieval_ms=(end_time - time_start) * 1e-6,
         success=True,
         cache_delay_minutes=cache_delay / 60,
+        dynamo_ms=(dynamo_end - dynamo_start) * 1e-6,
+        parsing_ms=(parsing_end - dynamo_end) * 1e-6,
     )

{haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.5}/src/haystack_ml_stack/utils.py RENAMED Viewed

@@ -4,8 +4,13 @@ import typing as _t
 def stream_favorites_cleanup(
-    stream, user_favorite_tags: list[str], user_favorite_authors: list[str]
+    stream,
+    user_favorite_tags: list[str],
+    user_favorite_authors: list[str],
+    out: dict = None,
 ) -> dict:
+    if out is None:
+        out = {}
     stream_tags = stream.get("haystackTags", [])
     is_favorite_tag = (
         any(stream_tag in user_favorite_tags for stream_tag in stream_tags)
@@ -17,15 +22,15 @@ def stream_favorites_cleanup(
         if user_favorite_authors is not None
         else False
     )
-    return {
-        "IS_FAVORITE_TAG": is_favorite_tag,
-        "IS_FAVORITE_AUTHOR": is_favorite_author,
-    }
+    out["IS_FAVORITE_TAG"] = is_favorite_tag
+    out["IS_FAVORITE_AUTHOR"] = is_favorite_author
+    return out
 def browsed_count_cleanups(
     stream,
     position_debiasing: _t.Literal["4_browsed", "all_browsed"] = "4_browsed",
+    out: dict = None,
 ) -> dict:
     position_alias_mapping = {
         "0": "1ST_POS",
@@ -43,7 +48,8 @@ def browsed_count_cleanups(
     total_selects = 0
     total_browsed = 0
     total_selects_and_watched = 0
-    feats = {}
+    if out is None:
+        out = {}
     for position in position_alias_mapping.keys():
         pos_counts = browsed_count_obj.get(position, {})
         total_browsed += pos_counts.get("total_browsed", 0)
@@ -55,16 +61,17 @@ def browsed_count_cleanups(
         suffix = ""
     else:
         raise ValueError("Should not be here.")
-    feats[f"STREAM_24H_TOTAL_BROWSED{suffix}"] = total_browsed
-    feats[f"STREAM_24H_TOTAL_SELECTS{suffix}"] = total_selects
-    feats[f"STREAM_24H_TOTAL_SELECTS_AND_WATCHED{suffix}"] = total_selects_and_watched
-    return feats
+    out[f"STREAM_24H_TOTAL_BROWSED{suffix}"] = total_browsed
+    out[f"STREAM_24H_TOTAL_SELECTS{suffix}"] = total_selects
+    out[f"STREAM_24H_TOTAL_SELECTS_AND_WATCHED{suffix}"] = total_selects_and_watched
+    return out
 def device_split_browsed_count_cleanups(
     stream,
     device_type: _t.Literal["TV", "MOBILE"],
     position_debiasing: _t.Literal["4_browsed", "all_browsed"] = "4_browsed",
+    out: dict = None,
 ) -> dict:
     position_alias_mapping = {
         "0": "1ST_POS",
@@ -87,21 +94,24 @@ def device_split_browsed_count_cleanups(
     total_selects = 0
     total_browsed = 0
     total_selects_and_watched = 0
-    feats = {}
+    if out is None:
+        out = {}
     for position, alias in position_alias_mapping.items():
         pos_counts = browsed_count_obj.get(position, {})
         total_browsed = pos_counts.get("total_browsed", 0)
         total_selects = pos_counts.get("total_selects", 0)
         total_selects_and_watched = pos_counts.get("total_selects_and_watched", 0)
-        feats[f"STREAM_{alias}_{device_type}_24H_TOTAL_BROWSED{suffix}"] = total_browsed
-        feats[f"STREAM_{alias}_{device_type}_24H_TOTAL_SELECTS{suffix}"] = total_selects
-        feats[f"STREAM_{alias}_{device_type}_24H_TOTAL_SELECTS_AND_WATCHED{suffix}"] = (
+        out[f"STREAM_{alias}_{device_type}_24H_TOTAL_BROWSED{suffix}"] = total_browsed
+        out[f"STREAM_{alias}_{device_type}_24H_TOTAL_SELECTS{suffix}"] = total_selects
+        out[f"STREAM_{alias}_{device_type}_24H_TOTAL_SELECTS_AND_WATCHED{suffix}"] = (
             total_selects_and_watched
         )
-    return feats
+    return out
-def watched_count_cleanups(stream, entry_contexts: list[str] = None) -> dict:
+def watched_count_cleanups(
+    stream, entry_contexts: list[str] = None, out: dict = None
+) -> dict:
     if entry_contexts is None:
         entry_contexts = [
             "autoplay",
@@ -113,19 +123,20 @@ def watched_count_cleanups(stream, entry_contexts: list[str] = None) -> dict:
     _validate_pwatched_entry_context(entry_contexts)
     counts_obj = stream.get(f"PWATCHED#24H", {})
-    feats = {}
+    if out is None:
+        out = {}
     for entry_context in entry_contexts:
         attempts = counts_obj.get(entry_context, {}).get("attempts", 0)
         watched = counts_obj.get(entry_context, {}).get("watched", 0)
         context_key = entry_context if "launch" not in entry_context else "launch"
         context_key = context_key.upper().replace(" ", "_")
-        feats[f"STREAM_{context_key}_24H_TOTAL_WATCHED"] = watched
-        feats[f"STREAM_{context_key}_24H_TOTAL_ATTEMPTS"] = attempts
-    return feats
+        out[f"STREAM_{context_key}_24H_TOTAL_WATCHED"] = watched
+        out[f"STREAM_{context_key}_24H_TOTAL_ATTEMPTS"] = attempts
+    return out
 def device_watched_count_cleanups(
-    stream, device_type: str, entry_contexts: list[str] = None
+    stream, device_type: str, entry_contexts: list[str] = None, out: dict = None
 ) -> dict:
     if entry_contexts is None:
         entry_contexts = [
@@ -140,23 +151,24 @@ def device_watched_count_cleanups(
     _validate_device_type(device_type)
     counts_obj = stream.get(f"PWATCHED#24H#{device_type}", {})
-    feats = {}
+    if out is None:
+        out = {}
     for entry_context in entry_contexts:
         attempts = counts_obj.get(entry_context, {}).get("attempts", 0)
         watched = counts_obj.get(entry_context, {}).get("watched", 0)
         context_key = entry_context if "launch" not in entry_context else "launch"
         context_key = context_key.upper().replace(" ", "_")
-        feats[f"STREAM_{context_key}_{device_type}_24H_TOTAL_WATCHED"] = watched
-        feats[f"STREAM_{context_key}_{device_type}_24H_TOTAL_ATTEMPTS"] = attempts
-    return feats
+        out[f"STREAM_{context_key}_{device_type}_24H_TOTAL_WATCHED"] = watched
+        out[f"STREAM_{context_key}_{device_type}_24H_TOTAL_ATTEMPTS"] = attempts
+    return out
 def generic_beta_adjust_features(
     data: pd.DataFrame,
     prefix: str,
-    pwatched_beta_params: dict,
-    pselect_beta_params: dict,
-    pslw_beta_params: dict,
+    pwatched_beta_params: dict = None,
+    pselect_beta_params: dict = None,
+    pslw_beta_params: dict = None,
     use_low_sample_flags: bool = False,
     low_sample_threshold: int = 3,
     use_attempt_features: bool = False,
@@ -164,67 +176,92 @@ def generic_beta_adjust_features(
     debiased_pselect: bool = True,
     use_logodds: bool = False,
 ) -> pd.DataFrame:
-    pwatched_features = {}
-    for context, (alpha, beta) in pwatched_beta_params.items():
-        total_watched = data[f"{prefix}_{context}_TOTAL_WATCHED"].fillna(0)
-        total_attempts = data[f"{prefix}_{context}_TOTAL_ATTEMPTS"].fillna(0)
-        pwatched_features[f"{prefix}_{context}_ADJ_PWATCHED"] = (
-            total_watched + alpha
-        ) / (total_attempts + alpha + beta)
-        if use_low_sample_flags:
-            pwatched_features[f"{prefix}_{context}_LOW_SAMPLE"] = total_attempts.le(
-                low_sample_threshold
-            ).astype(int)
-        if use_attempt_features:
-            pwatched_features[f"{prefix}_{context}_ATTEMPTS"] = total_attempts.clip(
-                upper=max_attempt_cap
+    features = {}
+    counting_feature_cols = [
+        c
+        for c in data.columns
+        if "TOTAL_WATCHED" in c
+        or "TOTAL_ATTEMPTS" in c
+        or "SELECT" in c
+        or "BROWSED" in c
+    ]
+    data_arr = data[counting_feature_cols].to_numpy(dtype=float)
+    col_to_idx = {col: i for i, col in enumerate(counting_feature_cols)}
+    if pwatched_beta_params is not None:
+        for context, (alpha, beta) in pwatched_beta_params.items():
+            total_watched = np.nan_to_num(
+                data_arr[:, col_to_idx[f"{prefix}_{context}_TOTAL_WATCHED"]]
+            )
+            total_attempts = np.nan_to_num(
+                data_arr[:, col_to_idx[f"{prefix}_{context}_TOTAL_ATTEMPTS"]]
             )
+            features[f"{prefix}_{context}_ADJ_PWATCHED"] = (total_watched + alpha) / (
+                total_attempts + alpha + beta
+            )
+            low_sample_arr = np.empty_like(total_attempts, dtype=float)
+            if use_low_sample_flags:
+                features[f"{prefix}_{context}_LOW_SAMPLE"] = np.less_equal(
+                    total_attempts, low_sample_threshold, out=low_sample_arr
+                )
+            if use_attempt_features:
+                features[f"{prefix}_{context}_ATTEMPTS"] = np.clip(
+                    total_attempts, a_min=None, a_max=max_attempt_cap
+                )
-    pselect_features = {}
     debias_suffix = "_UP_TO_4_BROWSED" if debiased_pselect else ""
-    for key, (alpha, beta) in pselect_beta_params.items():
-        total_selects = data[f"{prefix}_{key}_TOTAL_SELECTS{debias_suffix}"].fillna(0)
-        total_browsed = data[f"{prefix}_{key}_TOTAL_BROWSED{debias_suffix}"].fillna(0)
-        pselect_features[f"{prefix}_{key}_ADJ_PSELECT{debias_suffix}"] = (
-            total_selects + alpha
-        ) / (total_selects + total_browsed + alpha + beta)
-        if use_low_sample_flags:
-            pselect_features[f"{prefix}_{key}_PSELECT_LOW_SAMPLE{debias_suffix}"] = (
-                (total_selects + total_browsed).le(low_sample_threshold).astype(int)
-            )
-        if use_attempt_features:
-            pselect_features[f"{prefix}_{key}_PSELECT_ATTEMPTS{debias_suffix}"] = (
-                total_selects + total_browsed
-            ).clip(upper=max_attempt_cap)
-        total_slw = data[
-            f"{prefix}_{key}_TOTAL_SELECTS_AND_WATCHED{debias_suffix}"
-        ].fillna(0)
-        pslw_alpha, pslw_beta = pslw_beta_params[key]
-        pselect_features[f"{prefix}_{key}_ADJ_PSLW{debias_suffix}"] = (
-            total_slw + pslw_alpha
-        ) / (total_selects + total_browsed + pslw_alpha + pslw_beta)
-        pselect_features[f"{prefix}_{key}_PSelNotW{debias_suffix}"] = (
-            pselect_features[f"{prefix}_{key}_ADJ_PSELECT{debias_suffix}"]
-            - pselect_features[f"{prefix}_{key}_ADJ_PSLW{debias_suffix}"]
-        )
+    if pselect_beta_params is not None or pslw_beta_params is not None:
+        for key, (alpha, beta) in pselect_beta_params.items():
+            total_selects_idx = col_to_idx[
+                f"{prefix}_{key}_TOTAL_SELECTS{debias_suffix}"
+            ]
+            total_browsed_idx = col_to_idx[
+                f"{prefix}_{key}_TOTAL_BROWSED{debias_suffix}"
+            ]
+            total_slw_idx = col_to_idx[
+                f"{prefix}_{key}_TOTAL_SELECTS_AND_WATCHED{debias_suffix}"
+            ]
+            total_selects = np.nan_to_num(data_arr[:, total_selects_idx])
+            total_browsed = np.nan_to_num(data_arr[:, total_browsed_idx])
+            total_slw = np.nan_to_num(data_arr[:, total_slw_idx])
+            if pselect_beta_params is not None:
+                features[f"{prefix}_{key}_ADJ_PSELECT{debias_suffix}"] = (
+                    total_selects + alpha
+                ) / (total_selects + total_browsed + alpha + beta)
+            if use_low_sample_flags:
+                low_sample_arr = np.empty_like(total_selects, dtype=float)
+                features[f"{prefix}_{key}_PSELECT_LOW_SAMPLE{debias_suffix}"] = (
+                    np.less_equal(
+                        total_selects + total_browsed,
+                        low_sample_threshold,
+                        out=low_sample_arr,
+                    )
+                )
+            if use_attempt_features:
+                features[f"{prefix}_{key}_PSELECT_ATTEMPTS{debias_suffix}"] = np.clip(
+                    total_selects + total_browsed, a_min=0, a_max=max_attempt_cap
+                )
+            if pslw_beta_params is not None:
+                pslw_alpha, pslw_beta = pslw_beta_params[key]
+                features[f"{prefix}_{key}_ADJ_PSLW{debias_suffix}"] = (
+                    total_slw + pslw_alpha
+                ) / (total_selects + total_browsed + pslw_alpha + pslw_beta)
+            if pslw_beta_params is not None and pselect_beta_params is not None:
+                features[f"{prefix}_{key}_PSelNotW{debias_suffix}"] = (
+                    features[f"{prefix}_{key}_ADJ_PSELECT{debias_suffix}"]
+                    - features[f"{prefix}_{key}_ADJ_PSLW{debias_suffix}"]
+                )
-    adjusted_feats = pd.DataFrame({**pwatched_features, **pselect_features})
+    adjusted_feats = pd.DataFrame(features, index=data.index)
     if use_logodds:
-        adjusted_feats = adjusted_feats.pipe(
-            lambda x: x.assign(
-                **x[
-                    [
-                        c
-                        for c in x.columns
-                        if "PSELECT" in c
-                        or "PSLW" in c
-                        or "PWATCHED" in c
-                        or "PSelNotW" in c
-                    ]
-                ]
-                .clip(lower=0.001)
-                .pipe(prob_to_logodds)
-            )
+        arr = adjusted_feats.to_numpy()
+        col_idxs = [
+            i
+            for i, c in enumerate(adjusted_feats.columns)
+            if ("PSELECT" in c or "PSLW" in c or "PWATCHED" in c or "PSelNotW" in c)
+            and ("LOW_SAMPLE" not in c and "ATTEMPTS" not in c)
+        ]
+        arr[:, col_idxs] = prob_to_logodds(
+            np.clip(arr[:, col_idxs], a_min=0.001, a_max=None)
         )
     return adjusted_feats
@@ -251,7 +288,10 @@ def sigmoid(x: float) -> float:
 def generic_logistic_predict(
     data: pd.DataFrame, coeffs: pd.Series, intercept: float
 ) -> pd.Series:
-    return ((data[coeffs.index] * coeffs).sum(axis=1) + intercept).pipe(sigmoid)
+    scores = (data[coeffs.index] * coeffs).sum(axis=1) + intercept
+    raw_arr = scores.to_numpy()
+    raw_arr[:] = sigmoid(raw_arr)
+    return scores
 def _validate_device_type(device_type: str):

{haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.5}/src/haystack_ml_stack.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: haystack-ml-stack
-Version: 0.2.3
+Version: 0.2.5
 Summary: Functions related to Haystack ML
 Author-email: Oscar Vega <oscar@haystack.tv>
 License: MIT

{haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.5}/src/haystack_ml_stack.egg-info/SOURCES.txt RENAMED Viewed

@@ -11,4 +11,5 @@ src/haystack_ml_stack.egg-info/PKG-INFO
 src/haystack_ml_stack.egg-info/SOURCES.txt
 src/haystack_ml_stack.egg-info/dependency_links.txt
 src/haystack_ml_stack.egg-info/requires.txt
-src/haystack_ml_stack.egg-info/top_level.txt
+src/haystack_ml_stack.egg-info/top_level.txt
+tests/test_utils.py

haystack_ml_stack-0.2.5/tests/test_utils.py ADDED Viewed

@@ -0,0 +1,76 @@
+import pytest
+import pandas as pd
+from haystack_ml_stack import utils
+import numpy as np
+def test_sigmoid():
+    values_to_test = np.array([-1, 0, 1])
+    expected = np.array([0.26894142136992605, 0.5, 0.731058578630074])
+    actual = utils.sigmoid(values_to_test)
+    assert np.isclose(actual, expected).all()
+def test_prob_to_logodds():
+    values_to_test = np.array([0.25, 0.5, 0.75])
+    expected = np.array([-1.0986122886681096, 0, 1.0986122886681096])
+    actual = utils.prob_to_logodds(values_to_test)
+    assert np.isclose(actual, expected).all(), print(actual - expected)
+def test_generic_beta_adjust_features():
+    data_to_test = pd.DataFrame(
+        {
+            "STREAM_AUTOPLAY_24H_TOTAL_ATTEMPTS": [1, 2],
+            "STREAM_AUTOPLAY_24H_TOTAL_WATCHED": [0, 1],
+            "STREAM_24H_TOTAL_SELECTS_UP_TO_4_BROWSED": [1, 1],
+            "STREAM_24H_TOTAL_SELECTS_AND_WATCHED_UP_TO_4_BROWSED": [0, 1],
+            "STREAM_24H_TOTAL_BROWSED_UP_TO_4_BROWSED": [2, 0],
+        },
+        dtype=float,
+    )
+    actual = utils.generic_beta_adjust_features(
+        data=data_to_test,
+        prefix="STREAM",
+        pwatched_beta_params={"AUTOPLAY_24H": (2, 1)},
+        pselect_beta_params={"24H": (1, 1)},
+        pslw_beta_params={"24H": (0.5, 1)},
+        use_low_sample_flags=True,
+    )
+    # print(actual)
+    expected = pd.DataFrame(
+        {
+            "STREAM_AUTOPLAY_24H_ADJ_PWATCHED": [
+                (0 + 2) / (1 + 2 + 1),
+                (1 + 2) / (2 + 2 + 1),
+            ],
+            "STREAM_24H_ADJ_PSELECT_UP_TO_4_BROWSED": [
+                (1 + 1) / (1 + 2 + 1 + 1),
+                (1 + 1) / (1 + 0 + 1 + 1),
+            ],
+            "STREAM_24H_ADJ_PSLW_UP_TO_4_BROWSED": [
+                (0 + 0.5) / (1 + 2 + 0.5 + 1),
+                (1 + 0.5) / (1 + 0 + 0.5 + 1),
+            ],
+            "STREAM_24H_PSelNotW_UP_TO_4_BROWSED": [
+                (1 + 1) / (1 + 2 + 1 + 1) - (0 + 0.5) / (1 + 2 + 0.5 + 1),
+                (1 + 1) / (1 + 0 + 1 + 1) - (1 + 0.5) / (1 + 0 + 0.5 + 1),
+            ],
+            "STREAM_AUTOPLAY_24H_LOW_SAMPLE": [1, 1],
+            "STREAM_24H_PSELECT_LOW_SAMPLE_UP_TO_4_BROWSED": [1, 1],
+        }
+    )
+    assert (actual[expected.columns] == expected).all(axis=None), actual - expected
+def test_generic_logistic_predict():
+    features = pd.DataFrame({"feat1": [0, 1, 2], "feat2": [3, 3, 5]}, dtype=float)
+    coeffs = pd.Series({"feat1": 1, "feat2": 2})
+    intercept = 1
+    expected = utils.sigmoid(
+        pd.Series([0 * 1 + 2 * 3, 1 * 1 + 2 * 3, 2 * 1 + 5 * 2]) + 1
+    )
+    actual = utils.generic_logistic_predict(
+        data=features, coeffs=coeffs, intercept=intercept
+    )
+    assert (expected == actual).all(), actual - expected

{haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.5}/README.md RENAMED Viewed

File without changes

{haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.5}/setup.cfg RENAMED Viewed

File without changes

{haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.5}/src/haystack_ml_stack/model_store.py RENAMED Viewed

File without changes

{haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.5}/src/haystack_ml_stack/settings.py RENAMED Viewed

File without changes

{haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.5}/src/haystack_ml_stack.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.5}/src/haystack_ml_stack.egg-info/requires.txt RENAMED Viewed

File without changes

{haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.5}/src/haystack_ml_stack.egg-info/top_level.txt RENAMED Viewed

File without changes

haystack-ml-stack 0.2.3__tar.gz → 0.2.5__tar.gz

haystack-ml-stack 0.2.3__tar.gz → 0.2.5__tar.gz