PyPI - haystack-ml-stack - Versions diffs - 0.1.2__tar.gz → 0.2.1__tar.gz - Mend

haystack-ml-stack 0.1.2 (tar.gz) → 0.2.1 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

{haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: haystack-ml-stack
-Version: 0.1.2
+Version: 0.2.1
 Summary: Functions related to Haystack ML
 Author-email: Oscar Vega <oscar@haystack.tv>
 License: MIT
@@ -12,6 +12,7 @@ Requires-Dist: cloudpickle==2.2.1
 Requires-Dist: aioboto3==12.0.0
 Requires-Dist: fastapi==0.104.1
 Requires-Dist: pydantic-settings==2.2
+Requires-Dist: newrelic==11.1.0
 # Haystack ML Stack

{haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/pyproject.toml RENAMED Viewed

@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "haystack-ml-stack"
-version = "0.1.2"
+version = "0.2.1"
 description = "Functions related to Haystack ML"
 readme = "README.md"
 authors = [{ name = "Oscar Vega", email = "oscar@haystack.tv" }]
@@ -16,6 +16,7 @@ dependencies = [
     "cloudpickle==2.2.1",
     "aioboto3==12.0.0",
     "fastapi==0.104.1",
-    "pydantic-settings==2.2"
+    "pydantic-settings==2.2",
+    "newrelic==11.1.0"
 ]
 license = { text = "MIT" }

{haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/src/haystack_ml_stack/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
 from .app import create_app
 __all__ = ["create_app"]
-__version__ = "0.1.2"
+__version__ = "0.2.1"

{haystack_ml_stack-0.1.2/src/haystack_test_package → haystack_ml_stack-0.2.1/src/haystack_ml_stack}/app.py RENAMED Viewed

@@ -4,13 +4,15 @@ import random
 import sys
 from http import HTTPStatus
 from typing import Any, Dict, List, Optional
+import time
 import aiobotocore.session
 from fastapi import FastAPI, HTTPException, Request, Response
 from fastapi.encoders import jsonable_encoder
 from .cache import make_features_cache
-from .dynamo import set_stream_features
+from .dynamo import set_stream_features, FeatureRetrievalMeta
 from .model_store import download_and_load_model
 from .settings import Settings
@@ -23,6 +25,8 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
+import newrelic.agent
 def create_app(
     settings: Optional[Settings] = None,
@@ -96,11 +100,15 @@ def create_app(
         try:
             data = await request.json()
-        except Exception:
+        except Exception as e:
             raise HTTPException(
                 status_code=HTTPStatus.BAD_REQUEST, detail="Invalid JSON payload"
-            )
+            ) from e
+        query_params = {}
+        for k in request.query_params.keys():
+            values = request.query_params.getlist(k)
+            # flatten single-element lists
+            query_params[k] = values[0] if len(values) == 1 else values
         user = data.get("user", {})
         streams: List[Dict[str, Any]] = data.get("streams", [])
         playlist = data.get("playlist", {})
@@ -112,9 +120,11 @@ def create_app(
         # Feature fetch (optional based on model)
         model = state["model"]
         stream_features = model.get("stream_features", []) or []
+        retrieval_meta = FeatureRetrievalMeta(
+            cache_misses=0, retrieval_ms=0, success=True, cache_delay_minutes=0
+        )
         if stream_features:
-            logger.info("Fetching stream features for user %s", user.get("userid", ""))
-            await set_stream_features(
+            retrieval_meta = await set_stream_features(
                 aio_session=state["session"],
                 streams=streams,
                 stream_features=stream_features,
@@ -124,24 +134,52 @@ def create_app(
                 cache_sep=cfg.cache_separator,
             )
+        random_number = random.random()
+        userid = user.get("userid", "")
         # Sampling logs
-        if random.random() < cfg.logs_fraction:
+        if random_number < cfg.logs_fraction:
             logger.info("User %s streams: %s", user.get("userid", ""), streams)
         # Synchronous model execution (user code)
         try:
+            preprocess_start = time.perf_counter_ns()
             model_input = model["preprocess"](
-                user, streams, playlist, model.get("params")
+                user,
+                streams,
+                playlist,
+                {**model.get("params"), "query_params": query_params},
+            )
+            predict_start = time.perf_counter_ns()
+            model_output = model["predict"](
+                model_input, {**model.get("params"), "query_params": query_params}
             )
-            model_output = model["predict"](model_input, model.get("params"))
+            predict_end = time.perf_counter_ns()
         except Exception as e:
             logger.error("Model prediction failed: %s", e)
             raise HTTPException(
                 status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
                 detail="Model prediction failed",
-            )
+            ) from e
+        newrelic.agent.record_custom_event(
+            "Inference",
+            {
+                "cache_misses": retrieval_meta.cache_misses,
+                "retrieval_success": int(retrieval_meta.success),
+                "cache_delay_minutes": retrieval_meta.cache_delay_minutes,
+                "retrieval_ms": retrieval_meta.retrieval_ms,
+                "preprocess_ms": (predict_start - preprocess_start) * 1e-6,
+                "predict_ms": (predict_end - predict_start) * 1e-6,
+                "total_scores": len(model_output),
+            },
+        )
         if model_output:
+            if random_number < cfg.logs_fraction:
+                logger.info(
+                    "User %s - model output %s",
+                    userid,
+                    model_output,
+                )
             return jsonable_encoder(model_output)
         raise HTTPException(

{haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/src/haystack_ml_stack/dynamo.py RENAMED Viewed

@@ -1,11 +1,23 @@
-from typing import Any, Dict, List
+from typing import Any, Dict, List, NamedTuple
 import logging
+import time
+import datetime
 import aiobotocore.session
+import newrelic.agent
 logger = logging.getLogger(__name__)
+class FeatureRetrievalMeta(NamedTuple):
+    cache_misses: int
+    retrieval_ms: float
+    success: bool
+    cache_delay_minutes: float
+@newrelic.agent.function_trace()
 async def async_batch_get(
     dynamo_client, table_name: str, keys: List[Dict[str, Any]]
 ) -> List[Dict[str, Any]]:
@@ -53,6 +65,7 @@ async def async_batch_get(
     return all_items
+@newrelic.agent.function_trace()
 def parse_dynamo_item(item: Dict[str, Any]) -> Dict[str, Any]:
     """Parse a DynamoDB attribute map (low-level) to Python types."""
     out: Dict[str, Any] = {}
@@ -76,6 +89,7 @@ def parse_dynamo_item(item: Dict[str, Any]) -> Dict[str, Any]:
     return out
+@newrelic.agent.function_trace()
 async def set_stream_features(
     *,
     streams: List[Dict[str, Any]],
@@ -85,12 +99,20 @@ async def set_stream_features(
     stream_pk_prefix: str,
     cache_sep: str,
     aio_session: aiobotocore.session.Session | None = None,
-) -> None:
+) -> FeatureRetrievalMeta:
+    time_start = time.perf_counter_ns()
     """Fetch missing features for streams from DynamoDB and fill them into streams."""
     if not streams or not stream_features:
-        return
+        return FeatureRetrievalMeta(
+            cache_misses=0,
+            retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
+            success=True,
+            cache_delay_minutes=0,
+        )
     cache_miss: Dict[str, Dict[str, Any]] = {}
+    cache_delay_obj: dict[str, float] = {f: 0 for f in stream_features}
+    now = datetime.datetime.utcnow()
     for f in stream_features:
         for s in streams:
             key = f"{s['streamUrl']}{cache_sep}{f}"
@@ -99,13 +121,23 @@ async def set_stream_features(
                 cached = features_cache.get(key)
                 if cached["value"] is not None:
                     s[f] = cached["value"]
+                    cache_delay_obj[f] = max(
+                        cache_delay_obj[f], (now - cached["updated_at"]).total_seconds()
+                    )
             else:
                 cache_miss[key] = s
+    valid_cache_delays = list(v for v in cache_delay_obj.values() if v > 0)
+    cache_delay = min(valid_cache_delays) if valid_cache_delays else 0
     if not cache_miss:
-        return
-    logger.info("Cache miss for %d items", len(cache_miss))
+        return FeatureRetrievalMeta(
+            cache_misses=0,
+            retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
+            success=True,
+            cache_delay_minutes=cache_delay / 60,
+        )
+    cache_misses = len(cache_miss)
+    logger.info("Cache miss for %d items", cache_misses)
     # Prepare keys
     keys = []
@@ -120,7 +152,12 @@ async def set_stream_features(
             items = await async_batch_get(dynamodb, features_table, keys)
         except Exception as e:
             logger.error("DynamoDB batch_get failed: %s", e)
-            return
+            return FeatureRetrievalMeta(
+                cache_misses=cache_misses,
+                retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
+                success=False,
+                cache_delay_minutes=cache_delay / 60,
+            )
     updated_keys = set()
     for item in items:
@@ -132,6 +169,9 @@ async def set_stream_features(
         features_cache[cache_key] = {
             "value": parsed.get("value"),
             "cache_ttl_in_seconds": int(parsed.get("cache_ttl_in_seconds", -1)),
+            "updated_at": datetime.datetime.fromisoformat(
+                parsed.get("updated_at")
+            ).replace(tzinfo=None),
         }
         if cache_key in cache_miss:
             cache_miss[cache_key][feature_name] = parsed.get("value")
@@ -142,3 +182,9 @@ async def set_stream_features(
         missing_keys = set(cache_miss.keys()) - updated_keys
         for k in missing_keys:
             features_cache[k] = {"value": None, "cache_ttl_in_seconds": 300}
+    return FeatureRetrievalMeta(
+        cache_misses=cache_misses,
+        retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
+        success=True,
+        cache_delay_minutes=cache_delay / 60,
+    )

{haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/src/haystack_ml_stack/model_store.py RENAMED Viewed

@@ -4,10 +4,12 @@ from typing import Any, Dict
 import aiobotocore.session
 import cloudpickle
+import newrelic.agent
 logger = logging.getLogger(__name__)
+@newrelic.agent.function_trace()
 async def download_and_load_model(
     s3_url: str, aio_session: aiobotocore.session.Session | None = None
 ) -> Dict[str, Any]:
@@ -33,4 +35,4 @@ async def download_and_load_model(
     with open(local_path, "rb") as f:
         model: Dict[str, Any] = cloudpickle.load(f)
-    return model
+    return model

haystack_ml_stack-0.2.1/src/haystack_ml_stack/utils.py ADDED Viewed

@@ -0,0 +1,276 @@
+import pandas as pd
+import numpy as np
+import typing as _t
+def stream_tags_cleanup(
+    stream, user_favorite_tags: list[str], user_favorite_authors: list[str]
+) -> dict:
+    stream_tags = stream.get("haystackTags", [])
+    is_favorite_tag = (
+        any(stream_tag in user_favorite_tags for stream_tag in stream_tags)
+        if user_favorite_tags is not None
+        else False
+    )
+    is_favorite_author = (
+        stream.get("author", None) in user_favorite_authors
+        if user_favorite_authors is not None
+        else False
+    )
+    return {
+        "IS_FAVORITE_TAG": is_favorite_tag,
+        "IS_FAVORITE_AUTHOR": is_favorite_author,
+    }
+def browsed_count_cleanups(
+    stream,
+    position_debiasing: _t.Literal["4_browsed", "all_browsed"] = "4_browsed",
+) -> dict:
+    position_alias_mapping = {
+        "0": "1ST_POS",
+        "1": "2ND_POS",
+        "2": "3RD_POS",
+        "3+": "REST_POS",
+    }
+    if position_debiasing == "4_browsed":
+        suffix = "_UP_TO_4_BROWSED"
+    elif position_debiasing == "all_browsed":
+        suffix = ""
+    else:
+        raise ValueError(f"Unexpected position debiasing '{position_debiasing}'.")
+    browsed_count_obj = stream.get("PSELECT#24H", {}).get(position_debiasing, {})
+    total_selects = 0
+    total_browsed = 0
+    total_selects_and_watched = 0
+    feats = {}
+    for position in position_alias_mapping.keys():
+        pos_counts = browsed_count_obj.get(position, {})
+        total_browsed += pos_counts.get("total_browsed", 0)
+        total_selects += pos_counts.get("total_selects", 0)
+        total_selects_and_watched += pos_counts.get("total_selects_and_watched", 0)
+    if position_debiasing == "4_browsed":
+        suffix = "_UP_TO_4_BROWSED"
+    elif position_debiasing == "all_browsed":
+        suffix = ""
+    else:
+        raise ValueError("Should not be here.")
+    feats[f"STREAM_24H_TOTAL_BROWSED{suffix}"] = total_browsed
+    feats[f"STREAM_24H_TOTAL_SELECTS{suffix}"] = total_selects
+    feats[f"STREAM_24H_TOTAL_SELECTS_AND_WATCHED{suffix}"] = total_selects_and_watched
+    return feats
+def device_split_browsed_count_cleanups(
+    stream,
+    device_type: _t.Literal["TV", "MOBILE"],
+    position_debiasing: _t.Literal["4_browsed", "all_browsed"] = "4_browsed",
+) -> dict:
+    position_alias_mapping = {
+        "0": "1ST_POS",
+        "1": "2ND_POS",
+        "2": "3RD_POS",
+        "3+": "REST_POS",
+    }
+    if position_debiasing == "4_browsed":
+        suffix = "_UP_TO_4_BROWSED"
+    elif position_debiasing == "all_browsed":
+        suffix = ""
+    else:
+        raise ValueError(f"Unexpected position debiasing '{position_debiasing}'.")
+    _validate_device_type(device_type)
+    browsed_count_obj = stream.get(f"PSELECT#24H#{device_type}", {}).get(
+        position_debiasing, {}
+    )
+    total_selects = 0
+    total_browsed = 0
+    total_selects_and_watched = 0
+    feats = {}
+    for position, alias in position_alias_mapping.items():
+        pos_counts = browsed_count_obj.get(position, {})
+        total_browsed = pos_counts.get("total_browsed", 0)
+        total_selects = pos_counts.get("total_selects", 0)
+        total_selects_and_watched = pos_counts.get("total_selects_and_watched", 0)
+        feats[f"STREAM_{alias}_{device_type}_24H_TOTAL_BROWSED{suffix}"] = total_browsed
+        feats[f"STREAM_{alias}_{device_type}_24H_TOTAL_SELECTS{suffix}"] = total_selects
+        feats[f"STREAM_{alias}_{device_type}_24H_TOTAL_SELECTS_AND_WATCHED{suffix}"] = (
+            total_selects_and_watched
+        )
+    return feats
+def watched_count_cleanups(stream, entry_contexts: list[str] = None) -> dict:
+    if entry_contexts is None:
+        entry_contexts = [
+            "autoplay",
+            "choose next",
+            "ch swtch",
+            "sel thumb",
+            "launch first in session",
+        ]
+    _validate_pwatched_entry_context(entry_contexts)
+    counts_obj = stream.get(f"PWATCHED#24H", {})
+    feats = {}
+    for entry_context in entry_contexts:
+        attempts = counts_obj.get(entry_context, {}).get("attempts", 0)
+        watched = counts_obj.get(entry_context, {}).get("watched", 0)
+        context_key = entry_context if "launch" not in entry_context else "launch"
+        context_key = context_key.upper().replace(" ", "_")
+        feats[f"STREAM_{context_key}_24H_TOTAL_WATCHED"] = watched
+        feats[f"STREAM_{context_key}_24H_TOTAL_ATTEMPTS"] = attempts
+    return feats
+def device_watched_count_cleanups(
+    stream, device_type: str, entry_contexts: list[str] = None
+) -> dict:
+    if entry_contexts is None:
+        entry_contexts = [
+            "autoplay",
+            "choose next",
+            "ch swtch",
+            "sel thumb",
+            "launch first in session",
+        ]
+    _validate_pwatched_entry_context(entry_contexts)
+    _validate_device_type(device_type)
+    counts_obj = stream.get(f"PWATCHED#24H#{device_type}", {})
+    feats = {}
+    for entry_context in entry_contexts:
+        attempts = counts_obj.get(entry_context, {}).get("attempts", 0)
+        watched = counts_obj.get(entry_context, {}).get("watched", 0)
+        context_key = entry_context if "launch" not in entry_context else "launch"
+        context_key = context_key.upper().replace(" ", "_")
+        feats["features"][f"STREAM_{context_key}_{device_type}_24H_TOTAL_WATCHED"] = (
+            watched
+        )
+        feats["features"][f"STREAM_{context_key}_{device_type}_24H_TOTAL_ATTEMPTS"] = (
+            attempts
+        )
+    return feats
+def generic_beta_adjust_features(
+    data: pd.DataFrame,
+    prefix: str,
+    pwatched_beta_params: dict,
+    pselect_beta_params: dict,
+    pslw_beta_params: dict,
+    use_low_sample_flags: bool = False,
+    low_sample_threshold: int = 3,
+    use_attempt_features: bool = False,
+    max_attempt_cap: int = 100,
+    debiased_pselect: bool = True,
+    use_logodds: bool = False,
+) -> pd.DataFrame:
+    pwatched_features = {}
+    for context, (alpha, beta) in pwatched_beta_params.items():
+        total_watched = data[f"{prefix}_{context}_TOTAL_WATCHED"].fillna(0)
+        total_attempts = data[f"{prefix}_{context}_TOTAL_ATTEMPTS"].fillna(0)
+        pwatched_features[f"{prefix}_{context}_ADJ_PWATCHED"] = (
+            total_watched + alpha
+        ) / (total_attempts + alpha + beta)
+        if use_low_sample_flags:
+            pwatched_features[f"{prefix}_{context}_LOW_SAMPLE"] = total_attempts.le(
+                low_sample_threshold
+            ).astype(int)
+        if use_attempt_features:
+            pwatched_features[f"{prefix}_{context}_ATTEMPTS"] = total_attempts.clip(
+                upper=max_attempt_cap
+            )
+    pselect_features = {}
+    debias_suffix = "_UP_TO_4_BROWSED" if debiased_pselect else ""
+    for key, (alpha, beta) in pselect_beta_params.items():
+        total_selects = data[f"{prefix}_{key}_TOTAL_SELECTS{debias_suffix}"].fillna(0)
+        total_browsed = data[f"{prefix}_{key}_TOTAL_BROWSED{debias_suffix}"].fillna(0)
+        pselect_features[f"{prefix}_{key}_ADJ_PSELECT{debias_suffix}"] = (
+            total_selects + alpha
+        ) / (total_selects + total_browsed + alpha + beta)
+        if use_low_sample_flags:
+            pselect_features[f"{prefix}_{key}_PSELECT_LOW_SAMPLE{debias_suffix}"] = (
+                (total_selects + total_browsed).le(low_sample_threshold).astype(int)
+            )
+        if use_attempt_features:
+            pselect_features[f"{prefix}_{key}_PSELECT_ATTEMPTS{debias_suffix}"] = (
+                total_selects + total_browsed
+            ).clip(upper=max_attempt_cap)
+        total_slw = data[
+            f"{prefix}_{key}_TOTAL_SELECTS_AND_WATCHED{debias_suffix}"
+        ].fillna(0)
+        pslw_alpha, pslw_beta = pslw_beta_params[key]
+        pselect_features[f"{prefix}_{key}_ADJ_PSLW{debias_suffix}"] = (
+            total_slw + pslw_alpha
+        ) / (total_selects + total_browsed + pslw_alpha + pslw_beta)
+        pselect_features[f"{prefix}_{key}_PSelNotW{debias_suffix}"] = (
+            pselect_features[f"{prefix}_{key}_ADJ_PSELECT{debias_suffix}"]
+            - pselect_features[f"{prefix}_{key}_ADJ_PSLW{debias_suffix}"]
+        )
+    adjusted_feats = pd.DataFrame({**pwatched_features, **pselect_features})
+    if use_logodds:
+        adjusted_feats = adjusted_feats.pipe(
+            lambda x: x.assign(
+                **x[
+                    [
+                        c
+                        for c in x.columns
+                        if "PSELECT" in c
+                        or "PSLW" in c
+                        or "PWATCHED" in c
+                        or "PSelNotW" in c
+                    ]
+                ]
+                .clip(lower=0.001)
+                .pipe(prob_to_logodds)
+            )
+        )
+    return adjusted_feats
+def prob_to_logodds(prob: float) -> float:
+    return np.log(prob) - np.log(1 - prob)
+def scale_preds(
+    preds: pd.Series,
+    original_mean: float,
+    original_std: float,
+    target_mean: float,
+    target_std: float,
+) -> pd.Series:
+    z_score = (preds - original_mean) / original_std
+    return z_score * target_std + target_mean
+def sigmoid(x: float) -> float:
+    return 1 / (1 + np.exp(-x))
+def generic_logistic_predict(
+    data: pd.DataFrame, coeffs: pd.Series, intercept: float
+) -> pd.Series:
+    return ((data[coeffs.index] * coeffs).sum(axis=1) + intercept).pipe(sigmoid)
+def _validate_device_type(device_type: str):
+    if device_type not in ("TV", "MOBILE"):
+        raise ValueError(f"Invalid device type '{device_type}")
+def _validate_pwatched_entry_context(entry_contexts: list[str]):
+    valid_contexts = [
+        "autoplay",
+        "choose next",
+        "ch swtch",
+        "sel thumb",
+        "launch first in session",
+    ]
+    invalid_contexts = [c for c in entry_contexts if c not in valid_contexts]
+    if invalid_contexts:
+        raise ValueError(f"Invalid entry contexts found: {invalid_contexts}")

{haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/src/haystack_ml_stack.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: haystack-ml-stack
-Version: 0.1.2
+Version: 0.2.1
 Summary: Functions related to Haystack ML
 Author-email: Oscar Vega <oscar@haystack.tv>
 License: MIT
@@ -12,6 +12,7 @@ Requires-Dist: cloudpickle==2.2.1
 Requires-Dist: aioboto3==12.0.0
 Requires-Dist: fastapi==0.104.1
 Requires-Dist: pydantic-settings==2.2
+Requires-Dist: newrelic==11.1.0
 # Haystack ML Stack

{haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/src/haystack_ml_stack.egg-info/SOURCES.txt RENAMED Viewed

@@ -6,14 +6,9 @@ src/haystack_ml_stack/cache.py
 src/haystack_ml_stack/dynamo.py
 src/haystack_ml_stack/model_store.py
 src/haystack_ml_stack/settings.py
+src/haystack_ml_stack/utils.py
 src/haystack_ml_stack.egg-info/PKG-INFO
 src/haystack_ml_stack.egg-info/SOURCES.txt
 src/haystack_ml_stack.egg-info/dependency_links.txt
 src/haystack_ml_stack.egg-info/requires.txt
-src/haystack_ml_stack.egg-info/top_level.txt
-src/haystack_test_package/__init__.py
-src/haystack_test_package/app.py
-src/haystack_test_package/cache.py
-src/haystack_test_package/dynamo.py
-src/haystack_test_package/model_store.py
-src/haystack_test_package/settings.py
+src/haystack_ml_stack.egg-info/top_level.txt

{haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/src/haystack_ml_stack.egg-info/requires.txt RENAMED Viewed

@@ -4,3 +4,4 @@ cloudpickle==2.2.1
 aioboto3==12.0.0
 fastapi==0.104.1
 pydantic-settings==2.2
+newrelic==11.1.0

haystack_ml_stack-0.2.1/src/haystack_ml_stack.egg-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+haystack_ml_stack

haystack_ml_stack-0.1.2/src/haystack_ml_stack/app.py DELETED Viewed

@@ -1,158 +0,0 @@
-import logging
-import os
-import random
-import sys
-from http import HTTPStatus
-from typing import Any, Dict, List, Optional
-import aiobotocore.session
-from fastapi import FastAPI, HTTPException, Request, Response
-from fastapi.encoders import jsonable_encoder
-from .cache import make_features_cache
-from .dynamo import set_stream_features
-from .model_store import download_and_load_model
-from .settings import Settings
-logging.basicConfig(
-    level=logging.INFO,
-    format="[%(levelname)s] [%(process)d] %(name)s : %(message)s",
-    handlers=[logging.StreamHandler(sys.stdout)],
-    force=True,
-)
-logger = logging.getLogger(__name__)
-def create_app(
-    settings: Optional[Settings] = None,
-    *,
-    preloaded_model: Optional[Dict[str, Any]] = None,
-) -> FastAPI:
-    """
-    Build a FastAPI app with injectable settings and model.
-    If `preloaded_model` is None, the app will load from S3 on startup.
-    """
-    cfg = settings or Settings()
-    app = FastAPI(
-        title="ML Stream Scorer",
-        description="Scores video streams using a pre-trained ML model and DynamoDB features.",
-        version="1.0.0",
-    )
-    # Mutable state: cache + model
-    features_cache = make_features_cache(cfg.cache_maxsize)
-    state: Dict[str, Any] = {
-        "model": preloaded_model,
-        "session": aiobotocore.session.get_session(),
-        "model_name": (
-            os.path.basename(cfg.s3_model_path) if cfg.s3_model_path else None
-        ),
-    }
-    @app.on_event("startup")
-    async def _startup() -> None:
-        if state["model"] is not None:
-            logger.info("Using preloaded model.")
-            return
-        if not cfg.s3_model_path:
-            logger.critical("S3_MODEL_PATH not set; service will be unhealthy.")
-            return
-        try:
-            state["model"] = await download_and_load_model(
-                cfg.s3_model_path, aio_session=state["session"]
-            )
-            state["stream_features"] = state["model"].get("stream_features", [])
-            logger.info("Model loaded on startup.")
-        except Exception as e:
-            logger.critical("Failed to load model: %s", e)
-    @app.get("/health", status_code=HTTPStatus.OK)
-    async def health():
-        model_ok = state["model"] is not None
-        if not model_ok:
-            raise HTTPException(
-                status_code=HTTPStatus.SERVICE_UNAVAILABLE,
-                detail="ML Model not loaded",
-            )
-        return {
-            "status": "ok",
-            "model_loaded": True,
-            "cache_size": len(features_cache),
-            "model_name": state.get("model_name"),
-            "stream_features": state.get("stream_features", []),
-        }
-    @app.post("/score", status_code=HTTPStatus.OK)
-    async def score_stream(request: Request, response: Response):
-        if state["model"] is None:
-            raise HTTPException(
-                status_code=HTTPStatus.SERVICE_UNAVAILABLE,
-                detail="ML Model not loaded",
-            )
-        try:
-            data = await request.json()
-        except Exception:
-            raise HTTPException(
-                status_code=HTTPStatus.BAD_REQUEST, detail="Invalid JSON payload"
-            )
-        user = data.get("user", {})
-        streams: List[Dict[str, Any]] = data.get("streams", [])
-        playlist = data.get("playlist", {})
-        if not streams:
-            logger.warning("No streams provided for user %s", user.get("userid", ""))
-            return {}
-        # Feature fetch (optional based on model)
-        model = state["model"]
-        stream_features = model.get("stream_features", []) or []
-        if stream_features:
-            logger.info("Fetching stream features for user %s", user.get("userid", ""))
-            await set_stream_features(
-                aio_session=state["session"],
-                streams=streams,
-                stream_features=stream_features,
-                features_cache=features_cache,
-                features_table=cfg.features_table,
-                stream_pk_prefix=cfg.stream_pk_prefix,
-                cache_sep=cfg.cache_separator,
-            )
-        # Sampling logs
-        if random.random() < cfg.logs_fraction:
-            logger.info("User %s streams: %s", user.get("userid", ""), streams)
-        # Synchronous model execution (user code)
-        try:
-            model_input = model["preprocess"](
-                user, streams, playlist, model.get("params")
-            )
-            model_output = model["predict"](model_input, model.get("params"))
-        except Exception as e:
-            logger.error("Model prediction failed: %s", e)
-            raise HTTPException(
-                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
-                detail="Model prediction failed",
-            )
-        if model_output:
-            return jsonable_encoder(model_output)
-        raise HTTPException(
-            status_code=HTTPStatus.NOT_FOUND, detail="No model output generated"
-        )
-    @app.get("/", status_code=HTTPStatus.OK)
-    async def root():
-        return {
-            "message": "ML Scoring Service is running.",
-            "model_name": state.get("model_name"),
-        }
-    return app

haystack_ml_stack-0.1.2/src/haystack_ml_stack.egg-info/top_level.txt DELETED Viewed

@@ -1,2 +0,0 @@
-haystack_ml_stack
-haystack_test_package

haystack_ml_stack-0.1.2/src/haystack_test_package/__init__.py DELETED Viewed

@@ -1,4 +0,0 @@
-from .app import create_app
-__all__ = ["create_app"]
-__version__ = "0.1.3"

haystack_ml_stack-0.1.2/src/haystack_test_package/cache.py DELETED Viewed

@@ -1,19 +0,0 @@
-from typing import Any
-from cachetools import TLRUCache
-def _ttu(_, value: Any, now: float) -> float:
-    """Time-To-Use policy: allow per-item TTL via 'cache_ttl_in_seconds' or fallback."""
-    ONE_YEAR = 365 * 24 * 60 * 60
-    try:
-        ttl = int(value.get("cache_ttl_in_seconds", -1))
-        if ttl > 0:
-            return now + ttl
-    except Exception:
-        pass
-    return now + ONE_YEAR
-def make_features_cache(maxsize: int) -> TLRUCache:
-    return TLRUCache(maxsize=maxsize, ttu=_ttu)

haystack_ml_stack-0.1.2/src/haystack_test_package/dynamo.py DELETED Viewed

@@ -1,137 +0,0 @@
-from typing import Any, Dict, List
-import logging
-import aiobotocore.session
-logger = logging.getLogger(__name__)
-async def async_batch_get(
-    dynamo_client, table_name: str, keys: List[Dict[str, Any]]
-) -> List[Dict[str, Any]]:
-    """
-    Asynchronous batch_get_item with chunking for requests > 100 keys
-    and handling for unprocessed keys.
-    """
-    all_items: List[Dict[str, Any]] = []
-    # DynamoDB's BatchGetItem has a 100-item limit per request.
-    CHUNK_SIZE = 100
-    # Split the keys into chunks of 100
-    for i in range(0, len(keys), CHUNK_SIZE):
-        chunk_keys = keys[i : i + CHUNK_SIZE]
-        to_fetch = {table_name: {"Keys": chunk_keys}}
-        # Inner loop to handle unprocessed keys for the current chunk
-        # Max retries of 3
-        retries = 3
-        while to_fetch and retries > 0:
-            retries -= 1
-            try:
-                resp = await dynamo_client.batch_get_item(RequestItems=to_fetch)
-                if "Responses" in resp and table_name in resp["Responses"]:
-                    all_items.extend(resp["Responses"][table_name])
-                unprocessed = resp.get("UnprocessedKeys", {})
-                # If there are unprocessed keys, set them to be fetched in the next iteration
-                if unprocessed and unprocessed.get(table_name):
-                    logger.warning(
-                        "Retrying %d unprocessed keys.",
-                        len(unprocessed[table_name]["Keys"]),
-                    )
-                    to_fetch = unprocessed
-                else:
-                    # All keys in the chunk were processed, exit the inner loop
-                    to_fetch = {}
-            except Exception as e:
-                logger.error("Error during batch_get_item for a chunk: %s", e)
-                # Stop trying to process this chunk on error and move to the next
-                to_fetch = {}
-    return all_items
-def parse_dynamo_item(item: Dict[str, Any]) -> Dict[str, Any]:
-    """Parse a DynamoDB attribute map (low-level) to Python types."""
-    out: Dict[str, Any] = {}
-    for k, v in item.items():
-        if "N" in v:
-            out[k] = float(v["N"])
-        elif "S" in v:
-            out[k] = v["S"]
-        elif "SS" in v:
-            out[k] = v["SS"]
-        elif "NS" in v:
-            out[k] = [float(n) for n in v["NS"]]
-        elif "BOOL" in v:
-            out[k] = v["BOOL"]
-        elif "NULL" in v:
-            out[k] = None
-        elif "L" in v:
-            out[k] = [parse_dynamo_item({"value": i})["value"] for i in v["L"]]
-        elif "M" in v:
-            out[k] = parse_dynamo_item(v["M"])
-    return out
-async def set_stream_features(
-    *,
-    streams: List[Dict[str, Any]],
-    stream_features: List[str],
-    features_cache,
-    features_table: str,
-    stream_pk_prefix: str,
-    cache_sep: str,
-    aio_session: aiobotocore.session.Session | None = None,
-) -> None:
-    """Fetch missing features for streams from DynamoDB and fill them into streams."""
-    if not streams or not stream_features:
-        return
-    cache_miss: Dict[str, Dict[str, Any]] = {}
-    for f in stream_features:
-        for s in streams:
-            key = f"{s['streamUrl']}{cache_sep}{f}"
-            cached = features_cache.get(key)
-            if cached is not None:
-                s[f] = cached["value"]
-            else:
-                cache_miss[key] = s
-    if not cache_miss:
-        return
-    logger.info("Cache miss for %d items", len(cache_miss))
-    # Prepare keys
-    keys = []
-    for k in cache_miss.keys():
-        stream_url, sk = k.split(cache_sep, 1)
-        pk = f"{stream_pk_prefix}{stream_url}"
-        keys.append({"pk": {"S": pk}, "sk": {"S": sk}})
-    logger.info("Keys prepared for DynamoDB: %s", keys)
-    session = aio_session or aiobotocore.session.get_session()
-    async with session.create_client("dynamodb") as dynamodb:
-        try:
-            items = await async_batch_get(dynamodb, features_table, keys)
-        except Exception as e:
-            logger.error("DynamoDB batch_get failed: %s", e)
-            return
-        logger.info("DynamoDB returned %d items", len(items))
-    for item in items:
-        stream_url = item["pk"]["S"].removeprefix(stream_pk_prefix)
-        feature_name = item["sk"]["S"]
-        cache_key = f"{stream_url}{cache_sep}{feature_name}"
-        parsed = parse_dynamo_item(item)
-        logger.info("DynamoDB item parsed: %s for %s", parsed, cache_key)
-        features_cache[cache_key] = {
-            "value": parsed.get("value"),
-            "cache_ttl_in_seconds": int(parsed.get("cache_ttl_in_seconds", -1)),
-        }
-        if cache_key in cache_miss:
-            cache_miss[cache_key][feature_name] = parsed.get("value")

haystack_ml_stack-0.1.2/src/haystack_test_package/model_store.py DELETED Viewed

@@ -1,36 +0,0 @@
-import logging
-import os
-from typing import Any, Dict
-import aiobotocore.session
-import cloudpickle
-logger = logging.getLogger(__name__)
-async def download_and_load_model(
-    s3_url: str, aio_session: aiobotocore.session.Session | None = None
-) -> Dict[str, Any]:
-    """
-    Downloads cloudpickled model dict from S3 and loads it.
-    Expected keys: 'preprocess', 'predict', 'params', optional 'stream_features'.
-    """
-    if not s3_url or not s3_url.startswith("s3://"):
-        raise ValueError("S3_MODEL_PATH must be a valid s3:// URL")
-    bucket, key = s3_url.replace("s3://", "").split("/", 1)
-    pid = os.getpid()
-    local_path = f"/tmp/model_{pid}.pkl"
-    session = aio_session or aiobotocore.session.get_session()
-    async with session.create_client("s3") as s3:
-        logger.info("Downloading model from %s...", s3_url)
-        resp = await s3.get_object(Bucket=bucket, Key=key)
-        data = await resp["Body"].read()
-        with open(local_path, "wb") as f:
-            f.write(data)
-        logger.info("Model downloaded to %s", local_path)
-    with open(local_path, "rb") as f:
-        model: Dict[str, Any] = cloudpickle.load(f)
-    return model

haystack_ml_stack-0.1.2/src/haystack_test_package/settings.py DELETED Viewed

@@ -1,22 +0,0 @@
-from pydantic_settings import BaseSettings
-from pydantic import Field
-class Settings(BaseSettings):
-    # Logging
-    logs_fraction: float = Field(0.01, alias="LOGS_FRACTION")
-    # Model (S3)
-    s3_model_path: str | None = Field(default=None, alias="S3_MODEL_PATH")
-    # DynamoDB
-    features_table: str = Field("features", alias="FEATURES_TABLE")
-    stream_pk_prefix: str = "STREAM#"
-    # Cache
-    cache_maxsize: int = 50_000
-    cache_separator: str = "--"
-    class Config:
-        env_file = ".env"
-        env_file_encoding = "utf-8"
-        extra = "ignore"

{haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/README.md RENAMED Viewed

File without changes

{haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/setup.cfg RENAMED Viewed

File without changes

{haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/src/haystack_ml_stack/cache.py RENAMED Viewed

File without changes

{haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/src/haystack_ml_stack/settings.py RENAMED Viewed

File without changes

{haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/src/haystack_ml_stack.egg-info/dependency_links.txt RENAMED Viewed

File without changes

haystack-ml-stack 0.1.2__tar.gz → 0.2.1__tar.gz

haystack-ml-stack 0.1.2tar.gz → 0.2.1tar.gz