haystack-ml-stack 0.3.3__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/PKG-INFO +3 -1
  2. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/pyproject.toml +3 -2
  3. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/src/haystack_ml_stack/__init__.py +1 -1
  4. haystack_ml_stack-0.4.0/src/haystack_ml_stack/_kafka.py +88 -0
  5. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/src/haystack_ml_stack/app.py +66 -18
  6. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/src/haystack_ml_stack/settings.py +8 -2
  7. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/src/haystack_ml_stack/utils.py +1 -1
  8. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/src/haystack_ml_stack.egg-info/PKG-INFO +3 -1
  9. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/src/haystack_ml_stack.egg-info/SOURCES.txt +1 -0
  10. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/src/haystack_ml_stack.egg-info/requires.txt +2 -0
  11. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/tests/test_utils.py +75 -0
  12. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/README.md +0 -0
  13. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/setup.cfg +0 -0
  14. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/src/haystack_ml_stack/_serializers.py +0 -0
  15. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/src/haystack_ml_stack/cache.py +0 -0
  16. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/src/haystack_ml_stack/dynamo.py +0 -0
  17. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/src/haystack_ml_stack/exceptions.py +0 -0
  18. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/src/haystack_ml_stack/generated/__init__.py +0 -0
  19. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/src/haystack_ml_stack/generated/v1/__init__.py +0 -0
  20. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/src/haystack_ml_stack/generated/v1/features_pb2.py +0 -0
  21. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/src/haystack_ml_stack/generated/v1/features_pb2.pyi +0 -0
  22. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/src/haystack_ml_stack/model_store.py +0 -0
  23. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/src/haystack_ml_stack.egg-info/dependency_links.txt +0 -0
  24. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/src/haystack_ml_stack.egg-info/top_level.txt +0 -0
  25. {haystack_ml_stack-0.3.3 → haystack_ml_stack-0.4.0}/tests/test_serializers.py +0 -0
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: haystack-ml-stack
3
- Version: 0.3.3
3
+ Version: 0.4.0
4
4
  Summary: Functions related to Haystack ML
5
5
  Author-email: Oscar Vega <oscar@haystack.tv>
6
6
  License: MIT
7
7
  Requires-Python: >=3.11
8
8
  Description-Content-Type: text/markdown
9
9
  Requires-Dist: protobuf==6.33.2
10
+ Requires-Dist: orjson==3.11.7
10
11
  Provides-Extra: server
11
12
  Requires-Dist: pydantic==2.5.0; extra == "server"
12
13
  Requires-Dist: cachetools==5.5.2; extra == "server"
@@ -15,6 +16,7 @@ Requires-Dist: aioboto3==12.0.0; extra == "server"
15
16
  Requires-Dist: fastapi==0.104.1; extra == "server"
16
17
  Requires-Dist: pydantic-settings==2.2; extra == "server"
17
18
  Requires-Dist: newrelic==11.1.0; extra == "server"
19
+ Requires-Dist: confluent-kafka==2.13.0; extra == "server"
18
20
 
19
21
  # Haystack ML Stack
20
22
 
@@ -5,13 +5,13 @@ build-backend = "setuptools.build_meta"
5
5
 
6
6
  [project]
7
7
  name = "haystack-ml-stack"
8
- version = "0.3.3"
8
+ version = "0.4.0"
9
9
  description = "Functions related to Haystack ML"
10
10
  readme = "README.md"
11
11
  authors = [{ name = "Oscar Vega", email = "oscar@haystack.tv" }]
12
12
  requires-python = ">=3.11"
13
13
  dependencies = [
14
- "protobuf==6.33.2",
14
+ "protobuf==6.33.2", "orjson==3.11.7"
15
15
  ]
16
16
  license = { text = "MIT" }
17
17
 
@@ -24,4 +24,5 @@ server = [
24
24
  "fastapi==0.104.1",
25
25
  "pydantic-settings==2.2",
26
26
  "newrelic==11.1.0",
27
+ "confluent-kafka==2.13.0"
27
28
  ]
@@ -11,4 +11,4 @@ from ._serializers import SerializerRegistry, FeatureRegistryId
11
11
 
12
12
  __all__ = [*__all__, "SerializerRegistry", "FeatureRegistryId"]
13
13
 
14
- __version__ = "0.3.3"
14
+ __version__ = "0.4.0"
@@ -0,0 +1,88 @@
1
+ from confluent_kafka.aio import AIOProducer
2
+ import orjson
3
+ from google.protobuf.message import Message
4
+ import base64
5
+ import os
6
+ import logging
7
+ from .settings import Settings
8
+ from . import __version__
9
+ import hashlib
10
+
11
+ logger = logging.getLogger(__name__)
12
+ SECURITY_PROTOCOL = "SASL_SSL"
13
+ SASL_MECHANISM = "SCRAM-SHA-512"
14
+
15
+
16
async def send_to_kafka(
    producer: AIOProducer,
    topic: str,
    user: dict,
    streams: list[dict],
    playlist: dict,
    state: dict,
    model_output: dict,
    monitoring_meta: dict,
) -> None:
    """Serialize one inference request/response pair and publish it to Kafka.

    Silently no-ops when either ``topic`` or ``producer`` is missing, so the
    caller can schedule this unconditionally even when Kafka is disabled.

    Args:
        producer: Async Kafka producer, or None when Kafka is disabled.
        topic: Destination topic name, or None when Kafka is disabled.
        user: User feature payload; only ``userid`` is read directly.
        streams: Candidate stream feature payloads (forwarded verbatim).
        playlist: Request playlist payload; ``clientOs``/``category`` are read.
        state: App state dict; ``model_name`` and the feature lists are read.
        model_output: Model scores to log alongside the input.
        monitoring_meta: Timing/cache metrics gathered by the scoring endpoint.
    """
    if topic is None or producer is None:
        return
    # .get() instead of state["model_name"]: a missing key should not make the
    # background logging task raise KeyError (and it avoids the double lookup).
    model_name = state.get("model_name")
    message = {
        "userid": user.get("userid"),
        "client_os": playlist.get("clientOs"),
        "model_input": {"user": user, "streams": streams, "playlist": playlist},
        "model_output": model_output,
        "model_name": model_name.replace(".pkl", "") if model_name else None,
        "model_type": "streams",
        "meta": {
            "monitoring": monitoring_meta,
            "haystack_ml_stack_version": __version__,
            "playlist_category": playlist.get("category"),
            "user_features": state.get("user_features", []),
            "stream_features": state.get("stream_features", []),
        },
    }
    # produce() returns an awaitable delivery future; awaiting it confirms
    # broker acknowledgement before the task finishes.
    delivery_future = await producer.produce(
        topic, orjson.dumps(message, default=default_serialization)
    )
    await delivery_future
50
+
51
+
52
def default_serialization(obj):
    """orjson ``default`` hook: encode protobuf messages as base64 blobs.

    Args:
        obj: Any value orjson cannot serialize natively.

    Returns:
        dict: ``{"version", "proto"}`` for protobuf ``Message`` instances,
        where ``proto`` is the base64-encoded wire format.

    Raises:
        TypeError: for any other type — the documented contract for an orjson
            ``default`` callback (orjson surfaces it as ``JSONEncodeError``).
    """
    if isinstance(obj, Message):
        return {
            # NOTE(review): assumes every logged message type exposes a
            # `version` field — confirm against the feature protos.
            "version": obj.version,
            "proto": base64.b64encode(obj.SerializeToString()).decode("ascii"),
        }
    # Include the offending type so a failed produce is debuggable from logs.
    raise TypeError(f"Unknown data type to serialize: {type(obj).__name__}")
59
+
60
+
61
def initialize_kafka_producer(app_config: Settings) -> AIOProducer:
    """Build an async Kafka producer from the SECRET_KEYS credentials.

    Reads broker credentials from the ``SECRET_KEYS`` environment variable
    (a JSON object), materializes the broker CA certificate on disk, and
    returns an ``AIOProducer`` configured for SASL/SCRAM over SSL with lz4
    compression.

    Args:
        app_config: Settings carrying bootstrap servers and the target topic.

    Raises:
        ValueError: when ``SECRET_KEYS`` is unset or empty.
    """
    credentials = orjson.loads(os.getenv("SECRET_KEYS") or "{}")
    if not credentials:
        raise ValueError("No Kafka credentials found.")
    # The CA cert is stored base64-encoded in the secret; librdkafka needs a
    # file path, so write it out before building the config.
    ca_path = "/tmp/ca.pem"
    ca_pem = base64.b64decode(credentials["KAFKA_BROKER_CA_CERTIFICATE"]).decode()
    with open(ca_path, "w") as cert_file:
        cert_file.write(ca_pem)
    producer_config = {
        "bootstrap.servers": app_config.kafka_bootstrap_servers,
        "security.protocol": SECURITY_PROTOCOL,
        "sasl.username": credentials["KAFKA_WRITER_USER"],
        "sasl.password": credentials["KAFKA_WRITER_PASSWORD"],
        "sasl.mechanism": SASL_MECHANISM,
        "ssl.ca.location": ca_path,
        "compression.type": "lz4",
    }
    logger.info(
        "Initializing kafka producer pushing to topic %s", app_config.kafka_topic
    )
    producer = AIOProducer(producer_config)
    logger.info("Producer initialized!")
    return producer
82
+
83
+
84
def should_log_user(userid: str, kafka_fraction: float) -> bool:
    """Deterministically decide whether this user is sampled for Kafka logging.

    Maps the SHA-256 digest of ``userid`` onto [0, 1) and samples users whose
    bucket falls below ``kafka_fraction`` — the same user is therefore always
    in (or always out of) the sample for a fixed fraction.

    Args:
        userid: User identifier; falsy values are never sampled.
        kafka_fraction: Target sampling rate in [0, 1].

    Returns:
        bool: True when the user's traffic should be sent to Kafka.
    """
    if not userid:
        return False
    digest = hashlib.sha256(userid.encode()).hexdigest()
    bucket = int(digest, 16) / (2 ** 256)
    return bucket < kafka_fraction
@@ -7,20 +7,22 @@ from typing import Any, Dict, List, Optional
7
7
  import time
8
8
  from contextlib import asynccontextmanager, AsyncExitStack
9
9
  import traceback
10
+ import json
11
+ import asyncio
10
12
 
11
13
  import aiobotocore.session
12
14
  from aiobotocore.config import AioConfig
13
- from fastapi import FastAPI, HTTPException, Request, Response
15
+ from fastapi import FastAPI, HTTPException, Request, Response, BackgroundTasks
14
16
  from fastapi.encoders import jsonable_encoder
15
17
  import newrelic.agent
16
18
 
17
-
18
19
  from .cache import make_features_cache
19
20
  from .dynamo import set_all_features, FeatureRetrievalMeta
20
21
  from .model_store import download_and_load_model
21
22
  from .settings import Settings
22
23
  from . import exceptions
23
24
  from ._serializers import SerializerRegistry
25
+ from ._kafka import send_to_kafka, initialize_kafka_producer, should_log_user
24
26
  from google.protobuf import text_format
25
27
 
26
28
  logging.basicConfig(
@@ -122,6 +124,10 @@ def create_app(
122
124
  # 1. Load ML Model
123
125
  if state["model"] is None:
124
126
  await load_model(state, cfg)
127
+ kafka_producer = None
128
+ if cfg.kafka_bootstrap_servers is not None:
129
+ kafka_producer = initialize_kafka_producer(app_config=cfg)
130
+ state["kafka_producer"] = kafka_producer
125
131
  async with AsyncExitStack() as stack:
126
132
  # 2. Initialize DynamoDB Client (Persistent Pool)
127
133
  session = state["session"]
@@ -138,6 +144,17 @@ def create_app(
138
144
  # 3. Shutdown Logic
139
145
  # The AsyncExitStack automatically closes the DynamoDB client pool here
140
146
  logger.info("Shutting down: Connection pools closed.")
147
+ logger.info("Shutting down: Flushing Kafka queue.")
148
+ if kafka_producer is not None:
149
+ try:
150
+ await kafka_producer.flush()
151
+ except Exception:
152
+ logger.error(
153
+ "Unknown exception while flushing kafka queue, shutting down producer.\n%s",
154
+ traceback.format_exc(),
155
+ )
156
+ finally:
157
+ await kafka_producer.close()
141
158
 
142
159
  app = FastAPI(
143
160
  title="ML Stream Scorer",
@@ -161,10 +178,13 @@ def create_app(
161
178
  "user_cache_size": len(user_features_cache),
162
179
  "model_name": state.get("model_name"),
163
180
  "stream_features": state.get("stream_features", []),
181
+ "user_features": state.get("user_features", []),
164
182
  }
165
183
 
166
184
  @app.post("/score", status_code=HTTPStatus.OK)
167
- async def score_stream(request: Request, response: Response):
185
+ async def score_stream(
186
+ request: Request, response: Response, background_tasks: BackgroundTasks
187
+ ):
168
188
  if state["model"] is None:
169
189
  raise HTTPException(
170
190
  status_code=HTTPStatus.SERVICE_UNAVAILABLE,
@@ -173,10 +193,24 @@ def create_app(
173
193
 
174
194
  try:
175
195
  data = await request.json()
176
- except Exception as e:
196
+ except json.JSONDecodeError as e:
197
+ body = await request.body()
198
+ logger.error(
199
+ "Received malformed json. Raw body: %s\n%s",
200
+ body.decode(errors="replace"),
201
+ traceback.format_exc(),
202
+ )
177
203
  raise HTTPException(
178
204
  status_code=HTTPStatus.BAD_REQUEST, detail="Invalid JSON payload"
179
205
  ) from e
206
+ except Exception as e:
207
+ logger.error(
208
+ "Unexpected exception when parsing request.\n %s",
209
+ traceback.format_exc(),
210
+ )
211
+ raise HTTPException(
212
+ status_code=HTTPStatus.INTERNAL_SERVER_ERROR, detail="Unknown exception"
213
+ ) from e
180
214
  query_params = {}
181
215
  for k in request.query_params.keys():
182
216
  values = request.query_params.getlist(k)
@@ -259,22 +293,24 @@ def create_app(
259
293
  status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
260
294
  detail="Model prediction failed",
261
295
  ) from e
262
-
296
+ monitoring_meta = {
297
+ "cache_misses": retrieval_meta.cache_misses,
298
+ "user_cache_misses": retrieval_meta.user_cache_misses,
299
+ "stream_cache_misses": retrieval_meta.stream_cache_misses,
300
+ "user_cache_size": len(user_features_cache),
301
+ "stream_cache_size": len(stream_features_cache),
302
+ "retrieval_success": int(retrieval_meta.success),
303
+ "cache_delay_minutes": retrieval_meta.cache_delay_minutes,
304
+ "dynamo_ms": retrieval_meta.dynamo_ms,
305
+ "dynamo_parse_ms": retrieval_meta.parsing_ms,
306
+ "retrieval_ms": retrieval_meta.retrieval_ms,
307
+ "preprocess_ms": (predict_start - preprocess_start) * 1e-6,
308
+ "predict_ms": (predict_end - predict_start) * 1e-6,
309
+ "total_streams": len(model_output),
310
+ }
263
311
  newrelic.agent.record_custom_event(
264
312
  "Inference",
265
- {
266
- "cache_misses": retrieval_meta.cache_misses,
267
- "user_cache_misses": retrieval_meta.user_cache_misses,
268
- "stream_cache_misses": retrieval_meta.stream_cache_misses,
269
- "retrieval_success": int(retrieval_meta.success),
270
- "cache_delay_minutes": retrieval_meta.cache_delay_minutes,
271
- "dynamo_ms": retrieval_meta.dynamo_ms,
272
- "dynamo_parse_ms": retrieval_meta.parsing_ms,
273
- "retrieval_ms": retrieval_meta.retrieval_ms,
274
- "preprocess_ms": (predict_start - preprocess_start) * 1e-6,
275
- "predict_ms": (predict_end - predict_start) * 1e-6,
276
- "total_streams": len(model_output),
277
- },
313
+ monitoring_meta,
278
314
  )
279
315
  if model_output:
280
316
  if random_number < cfg.logs_fraction:
@@ -283,6 +319,18 @@ def create_app(
283
319
  userid,
284
320
  model_output,
285
321
  )
322
+ if should_log_user(userid=userid, kafka_fraction=cfg.kafka_fraction):
323
+ background_tasks.add_task(
324
+ send_to_kafka,
325
+ producer=state["kafka_producer"],
326
+ topic=cfg.kafka_topic,
327
+ user=user,
328
+ streams=streams,
329
+ playlist=playlist,
330
+ state=state,
331
+ model_output=model_output,
332
+ monitoring_meta=monitoring_meta,
333
+ )
286
334
  return jsonable_encoder(model_output)
287
335
 
288
336
  raise HTTPException(
@@ -1,9 +1,15 @@
1
1
  from pydantic_settings import BaseSettings
2
2
  from pydantic import Field
3
3
 
4
+
4
5
  class Settings(BaseSettings):
5
6
  # Logging
6
7
  logs_fraction: float = Field(0.01, alias="LOGS_FRACTION")
8
+ kafka_bootstrap_servers: str | None = Field(
9
+ default=None, alias="KAFKA_BOOTSTRAP_SERVERS"
10
+ )
11
+ kafka_fraction: float = Field(0.01, alias="KAFKA_FRACTION")
12
+ kafka_topic: str = Field(default=None, alias="KAFKA_TOPIC")
7
13
 
8
14
  # Model (S3)
9
15
  s3_model_path: str | None = Field(default=None, alias="S3_MODEL_PATH")
@@ -14,10 +20,10 @@ class Settings(BaseSettings):
14
20
 
15
21
  # Cache
16
22
  stream_cache_maxsize: int = 50_000
17
- user_cache_maxsize: int = 500_000
23
+ user_cache_maxsize: int = 80_000
18
24
  cache_separator: str = "--"
19
25
 
20
26
  class Config:
21
27
  env_file = ".env"
22
28
  env_file_encoding = "utf-8"
23
- extra = "ignore"
29
+ extra = "ignore"
@@ -336,7 +336,7 @@ def user_pwatched_cleanup(
336
336
  "launch_first_in_session",
337
337
  ]
338
338
  _validate_pwatched_entry_context(entry_contexts)
339
- counts_obj = user.get("PWATCHED#6M", UserPWatched())
339
+ counts_obj = user.get("PWATCHED#6M", UserPWatched()).data
340
340
  out = _cleanup_entry_context_counts(
341
341
  counts_obj=counts_obj,
342
342
  entry_contexts=entry_contexts,
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: haystack-ml-stack
3
- Version: 0.3.3
3
+ Version: 0.4.0
4
4
  Summary: Functions related to Haystack ML
5
5
  Author-email: Oscar Vega <oscar@haystack.tv>
6
6
  License: MIT
7
7
  Requires-Python: >=3.11
8
8
  Description-Content-Type: text/markdown
9
9
  Requires-Dist: protobuf==6.33.2
10
+ Requires-Dist: orjson==3.11.7
10
11
  Provides-Extra: server
11
12
  Requires-Dist: pydantic==2.5.0; extra == "server"
12
13
  Requires-Dist: cachetools==5.5.2; extra == "server"
@@ -15,6 +16,7 @@ Requires-Dist: aioboto3==12.0.0; extra == "server"
15
16
  Requires-Dist: fastapi==0.104.1; extra == "server"
16
17
  Requires-Dist: pydantic-settings==2.2; extra == "server"
17
18
  Requires-Dist: newrelic==11.1.0; extra == "server"
19
+ Requires-Dist: confluent-kafka==2.13.0; extra == "server"
18
20
 
19
21
  # Haystack ML Stack
20
22
 
@@ -1,6 +1,7 @@
1
1
  README.md
2
2
  pyproject.toml
3
3
  src/haystack_ml_stack/__init__.py
4
+ src/haystack_ml_stack/_kafka.py
4
5
  src/haystack_ml_stack/_serializers.py
5
6
  src/haystack_ml_stack/app.py
6
7
  src/haystack_ml_stack/cache.py
@@ -1,4 +1,5 @@
1
1
  protobuf==6.33.2
2
+ orjson==3.11.7
2
3
 
3
4
  [server]
4
5
  pydantic==2.5.0
@@ -8,3 +9,4 @@ aioboto3==12.0.0
8
9
  fastapi==0.104.1
9
10
  pydantic-settings==2.2
10
11
  newrelic==11.1.0
12
+ confluent-kafka==2.13.0
@@ -560,3 +560,78 @@ def test_stream_similarity_top_category_functions():
560
560
  assert all(
561
561
  actual_key == expected_key for actual_key, expected_key in zip(actual, expected)
562
562
  )
563
+
564
+
565
def test_user_pwatched_cleanup():
    """PWATCHED#6M counts flow through cleanup; absent contexts default to 0."""
    raw_counts = {
        "version": 1,
        "data": {
            "sel_thumb": {"attempts": 1, "watched": 1},
            "ch_swtch": {"attempts": 2, "watched": 0},
        },
    }
    msg = features_pb2_v1.UserPWatched()
    ProtoParseDict(js_dict=raw_counts, message=msg)
    out = {}
    utils.user_pwatched_cleanup(
        user={"PWATCHED#6M": msg},
        entry_contexts=["autoplay", "sel_thumb", "ch_swtch"],
        out=out,
    )
    expected = pd.Series(
        {
            "USER_AUTOPLAY_6M_TOTAL_ATTEMPTS": 0,
            "USER_AUTOPLAY_6M_TOTAL_WATCHED": 0,
            "USER_SEL_THUMB_6M_TOTAL_ATTEMPTS": 1,
            "USER_SEL_THUMB_6M_TOTAL_WATCHED": 1,
            "USER_CH_SWTCH_6M_TOTAL_ATTEMPTS": 2,
            "USER_CH_SWTCH_6M_TOTAL_WATCHED": 0,
        }
    )
    assert (pd.Series(out).loc[expected.index] == expected).all()
592
+
593
+
594
def test_user_pselect_cleanup():
    """Position-debiased PSELECT#6M totals are emitted for the chosen bucket."""
    raw_counts = {
        "version": 1,
        "data": {
            "all_browsed": {
                "first_pos": {
                    "total_selects": 0,
                    "total_selects_and_watched": 0,
                    "total_browsed": 1,
                },
                "rest_pos": {
                    "total_selects": 2,
                    "total_selects_and_watched": 2,
                    "total_browsed": 1,
                },
            },
            "up_to_4_browsed": {
                "first_pos": {
                    "total_selects": 0,
                    "total_selects_and_watched": 0,
                    "total_browsed": 1,
                },
                "rest_pos": {
                    "total_selects": 2,
                    "total_selects_and_watched": 2,
                    "total_browsed": 0,
                },
            },
        },
    }
    msg = features_pb2_v1.UserPSelect()
    ProtoParseDict(js_dict=raw_counts, message=msg)
    out = {}
    utils.user_pselect_cleanup(
        user={"PSELECT#6M": msg}, position_debiasing="up_to_4_browsed", out=out
    )
    expected = pd.Series(
        {
            "USER_6M_TOTAL_BROWSED_UP_TO_4_BROWSED": 1,
            "USER_6M_TOTAL_SELECTS_UP_TO_4_BROWSED": 2,
            "USER_6M_TOTAL_SELECTS_AND_WATCHED_UP_TO_4_BROWSED": 2,
        }
    )
    assert (pd.Series(out).loc[expected.index] == expected).all()