haystack-ml-stack 0.2.1 (py3-none-any.whl)

+++ haystack_ml_stack/__init__.py
@@ -0,0 +1,4 @@
+ from .app import create_app
+
+ __all__ = ["create_app"]
+ __version__ = "0.2.1"
+++ haystack_ml_stack/app.py
@@ -0,0 +1,196 @@
+ import logging
+ import os
+ import random
+ import sys
+ import time
+ from http import HTTPStatus
+ from typing import Any, Dict, List, Optional
+
+ import aiobotocore.session
+ import newrelic.agent
+ from fastapi import FastAPI, HTTPException, Request, Response
+ from fastapi.encoders import jsonable_encoder
+
+ from .cache import make_features_cache
+ from .dynamo import set_stream_features, FeatureRetrievalMeta
+ from .model_store import download_and_load_model
+ from .settings import Settings
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="[%(levelname)s] [%(process)d] %(name)s : %(message)s",
+     handlers=[logging.StreamHandler(sys.stdout)],
+     force=True,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ def create_app(
+     settings: Optional[Settings] = None,
+     *,
+     preloaded_model: Optional[Dict[str, Any]] = None,
+ ) -> FastAPI:
+     """
+     Build a FastAPI app with injectable settings and model.
+     If `preloaded_model` is None, the app loads the model from S3 on startup.
+     """
+     cfg = settings or Settings()
+
+     app = FastAPI(
+         title="ML Stream Scorer",
+         description="Scores video streams using a pre-trained ML model and DynamoDB features.",
+         version="1.0.0",
+     )
+
+     # Mutable state: cache + model
+     features_cache = make_features_cache(cfg.cache_maxsize)
+     state: Dict[str, Any] = {
+         "model": preloaded_model,
+         "session": aiobotocore.session.get_session(),
+         "model_name": (
+             os.path.basename(cfg.s3_model_path) if cfg.s3_model_path else None
+         ),
+     }
+
+     @app.on_event("startup")
+     async def _startup() -> None:
+         if state["model"] is not None:
+             logger.info("Using preloaded model.")
+             state["stream_features"] = state["model"].get("stream_features", [])
+             return
+
+         if not cfg.s3_model_path:
+             logger.critical("S3_MODEL_PATH not set; service will be unhealthy.")
+             return
+
+         try:
+             state["model"] = await download_and_load_model(
+                 cfg.s3_model_path, aio_session=state["session"]
+             )
+             state["stream_features"] = state["model"].get("stream_features", [])
+             logger.info("Model loaded on startup.")
+         except Exception as e:
+             logger.critical("Failed to load model: %s", e)
+
+     @app.get("/health", status_code=HTTPStatus.OK)
+     async def health():
+         if state["model"] is None:
+             raise HTTPException(
+                 status_code=HTTPStatus.SERVICE_UNAVAILABLE,
+                 detail="ML Model not loaded",
+             )
+         return {
+             "status": "ok",
+             "model_loaded": True,
+             "cache_size": len(features_cache),
+             "model_name": state.get("model_name"),
+             "stream_features": state.get("stream_features", []),
+         }
+
+     @app.post("/score", status_code=HTTPStatus.OK)
+     async def score_stream(request: Request, response: Response):
+         if state["model"] is None:
+             raise HTTPException(
+                 status_code=HTTPStatus.SERVICE_UNAVAILABLE,
+                 detail="ML Model not loaded",
+             )
+
+         try:
+             data = await request.json()
+         except Exception as e:
+             raise HTTPException(
+                 status_code=HTTPStatus.BAD_REQUEST, detail="Invalid JSON payload"
+             ) from e
+
+         query_params = {}
+         for k in request.query_params.keys():
+             values = request.query_params.getlist(k)
+             # Flatten single-element lists
+             query_params[k] = values[0] if len(values) == 1 else values
+
+         user = data.get("user", {})
+         streams: List[Dict[str, Any]] = data.get("streams", [])
+         playlist = data.get("playlist", {})
+
+         if not streams:
+             logger.warning("No streams provided for user %s", user.get("userid", ""))
+             return {}
+
+         # Feature fetch (optional, based on the model)
+         model = state["model"]
+         stream_features = model.get("stream_features", []) or []
+         retrieval_meta = FeatureRetrievalMeta(
+             cache_misses=0, retrieval_ms=0, success=True, cache_delay_minutes=0
+         )
+         if stream_features:
+             retrieval_meta = await set_stream_features(
+                 aio_session=state["session"],
+                 streams=streams,
+                 stream_features=stream_features,
+                 features_cache=features_cache,
+                 features_table=cfg.features_table,
+                 stream_pk_prefix=cfg.stream_pk_prefix,
+                 cache_sep=cfg.cache_separator,
+             )
+
+         random_number = random.random()
+         userid = user.get("userid", "")
+         # Sampled request logging
+         if random_number < cfg.logs_fraction:
+             logger.info("User %s streams: %s", userid, streams)
+
+         # Synchronous model execution (user code); guard against a missing "params" key
+         params = {**(model.get("params") or {}), "query_params": query_params}
+         try:
+             preprocess_start = time.perf_counter_ns()
+             model_input = model["preprocess"](user, streams, playlist, params)
+             predict_start = time.perf_counter_ns()
+             model_output = model["predict"](model_input, params)
+             predict_end = time.perf_counter_ns()
+         except Exception as e:
+             logger.error("Model prediction failed: %s", e)
+             raise HTTPException(
+                 status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+                 detail="Model prediction failed",
+             ) from e
+
+         newrelic.agent.record_custom_event(
+             "Inference",
+             {
+                 "cache_misses": retrieval_meta.cache_misses,
+                 "retrieval_success": int(retrieval_meta.success),
+                 "cache_delay_minutes": retrieval_meta.cache_delay_minutes,
+                 "retrieval_ms": retrieval_meta.retrieval_ms,
+                 "preprocess_ms": (predict_start - preprocess_start) * 1e-6,
+                 "predict_ms": (predict_end - predict_start) * 1e-6,
+                 "total_scores": len(model_output),
+             },
+         )
+
+         if model_output:
+             if random_number < cfg.logs_fraction:
+                 logger.info("User %s - model output %s", userid, model_output)
+             return jsonable_encoder(model_output)
+
+         raise HTTPException(
+             status_code=HTTPStatus.NOT_FOUND, detail="No model output generated"
+         )
+
+     @app.get("/", status_code=HTTPStatus.OK)
+     async def root():
+         return {
+             "message": "ML Scoring Service is running.",
+             "model_name": state.get("model_name"),
+         }
+
+     return app
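
A usage sketch (not part of the package): `create_app` accepts an injected model, which skips the S3 download on startup. The toy model and the `uvicorn` runner below are assumptions for illustration; `uvicorn` is not a declared dependency.

```python
# Hypothetical local harness for the service defined in app.py above.
from haystack_ml_stack import create_app

# Minimal model dict matching the contract app.py expects:
#   preprocess(user, streams, playlist, params) -> model_input
#   predict(model_input, params) -> model_output
toy_model = {
    "preprocess": lambda user, streams, playlist, params: streams,
    "predict": lambda streams, params: {s["streamUrl"]: 0.5 for s in streams},
    "params": {},
    "stream_features": [],  # empty list disables the DynamoDB feature fetch
}

app = create_app(preloaded_model=toy_model)

if __name__ == "__main__":
    import uvicorn  # assumed installed; not pinned by this package

    uvicorn.run(app, host="0.0.0.0", port=8000)
```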
+++ haystack_ml_stack/cache.py
@@ -0,0 +1,19 @@
+ from typing import Any
+
+ from cachetools import TLRUCache
+
+
+ def _ttu(_, value: Any, now: float) -> float:
+     """Time-to-use policy: allow per-item TTL via 'cache_ttl_in_seconds', else fall back."""
+     ONE_YEAR = 365 * 24 * 60 * 60
+     try:
+         ttl = int(value.get("cache_ttl_in_seconds", -1))
+         if ttl > 0:
+             return now + ttl
+     except Exception:
+         pass
+     return now + ONE_YEAR
+
+
+ def make_features_cache(maxsize: int) -> TLRUCache:
+     return TLRUCache(maxsize=maxsize, ttu=_ttu)
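
A quick sketch of the per-item TTL behavior (keys and timings are illustrative):

```python
import time

from haystack_ml_stack.cache import make_features_cache

cache = make_features_cache(maxsize=100)

# This entry names its own TTL and expires after about one second...
cache["streamA--score"] = {"value": 0.9, "cache_ttl_in_seconds": 1}
# ...while this one falls back to the one-year default in _ttu.
cache["streamB--score"] = {"value": 0.1}

time.sleep(1.5)
print("streamA--score" in cache)  # False: per-item TTL elapsed
print("streamB--score" in cache)  # True: default TTL still in effect
```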
+++ haystack_ml_stack/dynamo.py
@@ -0,0 +1,190 @@
+ import datetime
+ import logging
+ import time
+ from typing import Any, Dict, List, NamedTuple
+
+ import aiobotocore.session
+ import newrelic.agent
+
+ logger = logging.getLogger(__name__)
+
+
+ class FeatureRetrievalMeta(NamedTuple):
+     cache_misses: int
+     retrieval_ms: float
+     success: bool
+     cache_delay_minutes: float
+
+
+ @newrelic.agent.function_trace()
+ async def async_batch_get(
+     dynamo_client, table_name: str, keys: List[Dict[str, Any]]
+ ) -> List[Dict[str, Any]]:
+     """
+     Asynchronous batch_get_item with chunking for requests of more than 100 keys
+     and retry handling for unprocessed keys.
+     """
+     all_items: List[Dict[str, Any]] = []
+     # DynamoDB's BatchGetItem has a 100-key limit per request.
+     CHUNK_SIZE = 100
+
+     # Split the keys into chunks of 100
+     for i in range(0, len(keys), CHUNK_SIZE):
+         chunk_keys = keys[i : i + CHUNK_SIZE]
+         to_fetch = {table_name: {"Keys": chunk_keys}}
+
+         # Inner loop to handle unprocessed keys for the current chunk,
+         # with a maximum of 3 attempts.
+         retries = 3
+         while to_fetch and retries > 0:
+             retries -= 1
+             try:
+                 resp = await dynamo_client.batch_get_item(RequestItems=to_fetch)
+
+                 if "Responses" in resp and table_name in resp["Responses"]:
+                     all_items.extend(resp["Responses"][table_name])
+
+                 unprocessed = resp.get("UnprocessedKeys", {})
+                 # If there are unprocessed keys, fetch them on the next iteration
+                 if unprocessed and unprocessed.get(table_name):
+                     logger.warning(
+                         "Retrying %d unprocessed keys.",
+                         len(unprocessed[table_name]["Keys"]),
+                     )
+                     to_fetch = unprocessed
+                 else:
+                     # All keys in the chunk were processed; exit the inner loop
+                     to_fetch = {}
+
+             except Exception as e:
+                 logger.error("Error during batch_get_item for a chunk: %s", e)
+                 # Stop retrying this chunk on error and move on to the next
+                 to_fetch = {}
+
+     return all_items
+
+
+ @newrelic.agent.function_trace()
+ def parse_dynamo_item(item: Dict[str, Any]) -> Dict[str, Any]:
+     """Parse a low-level DynamoDB attribute map into Python types."""
+     out: Dict[str, Any] = {}
+     for k, v in item.items():
+         if "N" in v:
+             out[k] = float(v["N"])
+         elif "S" in v:
+             out[k] = v["S"]
+         elif "SS" in v:
+             out[k] = v["SS"]
+         elif "NS" in v:
+             out[k] = [float(n) for n in v["NS"]]
+         elif "BOOL" in v:
+             out[k] = v["BOOL"]
+         elif "NULL" in v:
+             out[k] = None
+         elif "L" in v:
+             out[k] = [parse_dynamo_item({"value": i})["value"] for i in v["L"]]
+         elif "M" in v:
+             out[k] = parse_dynamo_item(v["M"])
+     return out
+
+
+ @newrelic.agent.function_trace()
+ async def set_stream_features(
+     *,
+     streams: List[Dict[str, Any]],
+     stream_features: List[str],
+     features_cache,
+     features_table: str,
+     stream_pk_prefix: str,
+     cache_sep: str,
+     aio_session: aiobotocore.session.Session | None = None,
+ ) -> FeatureRetrievalMeta:
+     """Fetch missing features for streams from DynamoDB and fill them into `streams`."""
+     time_start = time.perf_counter_ns()
+     if not streams or not stream_features:
+         return FeatureRetrievalMeta(
+             cache_misses=0,
+             retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
+             success=True,
+             cache_delay_minutes=0,
+         )
+
+     cache_miss: Dict[str, Dict[str, Any]] = {}
+     cache_delay_obj: dict[str, float] = {f: 0.0 for f in stream_features}
+     now = datetime.datetime.utcnow()
+     for f in stream_features:
+         for s in streams:
+             key = f"{s['streamUrl']}{cache_sep}{f}"
+             if key in features_cache:
+                 # Only set if the cached value is not None
+                 cached = features_cache.get(key)
+                 if cached["value"] is not None:
+                     s[f] = cached["value"]
+                     cache_delay_obj[f] = max(
+                         cache_delay_obj[f],
+                         (now - cached["updated_at"]).total_seconds(),
+                     )
+             else:
+                 cache_miss[key] = s
+     valid_cache_delays = [v for v in cache_delay_obj.values() if v > 0]
+     cache_delay = min(valid_cache_delays) if valid_cache_delays else 0
+
+     if not cache_miss:
+         return FeatureRetrievalMeta(
+             cache_misses=0,
+             retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
+             success=True,
+             cache_delay_minutes=cache_delay / 60,
+         )
+     cache_misses = len(cache_miss)
+     logger.info("Cache miss for %d items", cache_misses)
+
+     # Prepare keys
+     keys = []
+     for k in cache_miss.keys():
+         stream_url, sk = k.split(cache_sep, 1)
+         pk = f"{stream_pk_prefix}{stream_url}"
+         keys.append({"pk": {"S": pk}, "sk": {"S": sk}})
+
+     session = aio_session or aiobotocore.session.get_session()
+     async with session.create_client("dynamodb") as dynamodb:
+         try:
+             items = await async_batch_get(dynamodb, features_table, keys)
+         except Exception as e:
+             logger.error("DynamoDB batch_get failed: %s", e)
+             return FeatureRetrievalMeta(
+                 cache_misses=cache_misses,
+                 retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
+                 success=False,
+                 cache_delay_minutes=cache_delay / 60,
+             )
+
+     updated_keys = set()
+     for item in items:
+         stream_url = item["pk"]["S"].removeprefix(stream_pk_prefix)
+         feature_name = item["sk"]["S"]
+         cache_key = f"{stream_url}{cache_sep}{feature_name}"
+         parsed = parse_dynamo_item(item)
+
+         features_cache[cache_key] = {
+             "value": parsed.get("value"),
+             "cache_ttl_in_seconds": int(parsed.get("cache_ttl_in_seconds", -1)),
+             "updated_at": datetime.datetime.fromisoformat(
+                 parsed.get("updated_at")
+             ).replace(tzinfo=None),
+         }
+         if cache_key in cache_miss:
+             cache_miss[cache_key][feature_name] = parsed.get("value")
+             updated_keys.add(cache_key)
+
+     # Cache keys that were not found in DynamoDB with a None value
+     if len(updated_keys) < len(cache_miss):
+         missing_keys = set(cache_miss.keys()) - updated_keys
+         for k in missing_keys:
+             features_cache[k] = {"value": None, "cache_ttl_in_seconds": 300}
+     return FeatureRetrievalMeta(
+         cache_misses=cache_misses,
+         retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
+         success=True,
+         cache_delay_minutes=cache_delay / 60,
+     )
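
For reference, a sketch of what `parse_dynamo_item` does with a low-level attribute map; the item below is made up but follows the `pk`/`sk`/`value` layout that `set_stream_features` reads:

```python
from haystack_ml_stack.dynamo import parse_dynamo_item

raw = {
    "pk": {"S": "STREAM#https://example.com/live/abc"},
    "sk": {"S": "popularity"},
    "value": {"N": "0.87"},
    "cache_ttl_in_seconds": {"N": "300"},
    "updated_at": {"S": "2024-01-01T00:00:00"},
}
print(parse_dynamo_item(raw))
# {'pk': 'STREAM#https://example.com/live/abc', 'sk': 'popularity',
#  'value': 0.87, 'cache_ttl_in_seconds': 300.0,
#  'updated_at': '2024-01-01T00:00:00'}
```

Note that `"N"` attributes come back as floats; `set_stream_features` converts `cache_ttl_in_seconds` back to an `int` before caching.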
+++ haystack_ml_stack/model_store.py
@@ -0,0 +1,38 @@
+ import logging
+ import os
+ from typing import Any, Dict
+
+ import aiobotocore.session
+ import cloudpickle
+ import newrelic.agent
+
+ logger = logging.getLogger(__name__)
+
+
+ @newrelic.agent.function_trace()
+ async def download_and_load_model(
+     s3_url: str, aio_session: aiobotocore.session.Session | None = None
+ ) -> Dict[str, Any]:
+     """
+     Download a cloudpickled model dict from S3 and load it.
+     Expected keys: 'preprocess', 'predict', 'params', optional 'stream_features'.
+     """
+     if not s3_url or not s3_url.startswith("s3://"):
+         raise ValueError("S3_MODEL_PATH must be a valid s3:// URL")
+
+     bucket, key = s3_url.replace("s3://", "").split("/", 1)
+     pid = os.getpid()
+     local_path = f"/tmp/model_{pid}.pkl"
+
+     session = aio_session or aiobotocore.session.get_session()
+     async with session.create_client("s3") as s3:
+         logger.info("Downloading model from %s...", s3_url)
+         resp = await s3.get_object(Bucket=bucket, Key=key)
+         data = await resp["Body"].read()
+         with open(local_path, "wb") as f:
+             f.write(data)
+         logger.info("Model downloaded to %s", local_path)
+
+     with open(local_path, "rb") as f:
+         model: Dict[str, Any] = cloudpickle.load(f)
+     return model
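
A hedged sketch of producing a compatible model artifact; the functions, bucket, and key below are placeholders:

```python
import cloudpickle


def preprocess(user, streams, playlist, params):
    # Turn the raw request payload into whatever predict() needs.
    return streams


def predict(model_input, params):
    # Return one score per stream; the output shape is up to the model author.
    return [{"streamUrl": s["streamUrl"], "score": 0.0} for s in model_input]


model = {
    "preprocess": preprocess,
    "predict": predict,
    "params": {},
    "stream_features": ["popularity"],  # fetched from DynamoDB per stream
}

with open("model.pkl", "wb") as f:
    cloudpickle.dump(model, f)

# Upload model.pkl to S3 (e.g. s3://my-ml-models/stream-scorer/latest.pkl)
# and point S3_MODEL_PATH at it.
```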
+++ haystack_ml_stack/settings.py
@@ -0,0 +1,22 @@
+ from pydantic import Field
+ from pydantic_settings import BaseSettings
+
+
+ class Settings(BaseSettings):
+     # Logging
+     logs_fraction: float = Field(0.01, alias="LOGS_FRACTION")
+
+     # Model (S3)
+     s3_model_path: str | None = Field(default=None, alias="S3_MODEL_PATH")
+
+     # DynamoDB
+     features_table: str = Field("features", alias="FEATURES_TABLE")
+     stream_pk_prefix: str = "STREAM#"
+
+     # Cache
+     cache_maxsize: int = 50_000
+     cache_separator: str = "--"
+
+     class Config:
+         env_file = ".env"
+         env_file_encoding = "utf-8"
+         extra = "ignore"
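
Settings resolve from the environment (or a `.env` file); a quick sketch with placeholder values:

```python
import os

from haystack_ml_stack.settings import Settings

os.environ["S3_MODEL_PATH"] = "s3://my-ml-models/stream-scorer/latest.pkl"
os.environ["LOGS_FRACTION"] = "0.05"

cfg = Settings()
print(cfg.s3_model_path)   # s3://my-ml-models/stream-scorer/latest.pkl
print(cfg.logs_fraction)   # 0.05
print(cfg.features_table)  # 'features' (default)
```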
+++ haystack_ml_stack/utils.py
@@ -0,0 +1,276 @@
+ import typing as _t
+
+ import numpy as np
+ import pandas as pd
+
+
+ def stream_tags_cleanup(
+     stream, user_favorite_tags: list[str], user_favorite_authors: list[str]
+ ) -> dict:
+     stream_tags = stream.get("haystackTags", [])
+     is_favorite_tag = (
+         any(stream_tag in user_favorite_tags for stream_tag in stream_tags)
+         if user_favorite_tags is not None
+         else False
+     )
+     is_favorite_author = (
+         stream.get("author", None) in user_favorite_authors
+         if user_favorite_authors is not None
+         else False
+     )
+     return {
+         "IS_FAVORITE_TAG": is_favorite_tag,
+         "IS_FAVORITE_AUTHOR": is_favorite_author,
+     }
+
+
+ def browsed_count_cleanups(
+     stream,
+     position_debiasing: _t.Literal["4_browsed", "all_browsed"] = "4_browsed",
+ ) -> dict:
+     position_alias_mapping = {
+         "0": "1ST_POS",
+         "1": "2ND_POS",
+         "2": "3RD_POS",
+         "3+": "REST_POS",
+     }
+     if position_debiasing == "4_browsed":
+         suffix = "_UP_TO_4_BROWSED"
+     elif position_debiasing == "all_browsed":
+         suffix = ""
+     else:
+         raise ValueError(f"Unexpected position debiasing '{position_debiasing}'.")
+     browsed_count_obj = stream.get("PSELECT#24H", {}).get(position_debiasing, {})
+     total_selects = 0
+     total_browsed = 0
+     total_selects_and_watched = 0
+     feats = {}
+     for position in position_alias_mapping.keys():
+         pos_counts = browsed_count_obj.get(position, {})
+         total_browsed += pos_counts.get("total_browsed", 0)
+         total_selects += pos_counts.get("total_selects", 0)
+         total_selects_and_watched += pos_counts.get("total_selects_and_watched", 0)
+     feats[f"STREAM_24H_TOTAL_BROWSED{suffix}"] = total_browsed
+     feats[f"STREAM_24H_TOTAL_SELECTS{suffix}"] = total_selects
+     feats[f"STREAM_24H_TOTAL_SELECTS_AND_WATCHED{suffix}"] = total_selects_and_watched
+     return feats
+
+
+ def device_split_browsed_count_cleanups(
+     stream,
+     device_type: _t.Literal["TV", "MOBILE"],
+     position_debiasing: _t.Literal["4_browsed", "all_browsed"] = "4_browsed",
+ ) -> dict:
+     position_alias_mapping = {
+         "0": "1ST_POS",
+         "1": "2ND_POS",
+         "2": "3RD_POS",
+         "3+": "REST_POS",
+     }
+     if position_debiasing == "4_browsed":
+         suffix = "_UP_TO_4_BROWSED"
+     elif position_debiasing == "all_browsed":
+         suffix = ""
+     else:
+         raise ValueError(f"Unexpected position debiasing '{position_debiasing}'.")
+
+     _validate_device_type(device_type)
+
+     browsed_count_obj = stream.get(f"PSELECT#24H#{device_type}", {}).get(
+         position_debiasing, {}
+     )
+     feats = {}
+     for position, alias in position_alias_mapping.items():
+         pos_counts = browsed_count_obj.get(position, {})
+         total_browsed = pos_counts.get("total_browsed", 0)
+         total_selects = pos_counts.get("total_selects", 0)
+         total_selects_and_watched = pos_counts.get("total_selects_and_watched", 0)
+         feats[f"STREAM_{alias}_{device_type}_24H_TOTAL_BROWSED{suffix}"] = total_browsed
+         feats[f"STREAM_{alias}_{device_type}_24H_TOTAL_SELECTS{suffix}"] = total_selects
+         feats[f"STREAM_{alias}_{device_type}_24H_TOTAL_SELECTS_AND_WATCHED{suffix}"] = (
+             total_selects_and_watched
+         )
+     return feats
+
+
+ def watched_count_cleanups(stream, entry_contexts: list[str] | None = None) -> dict:
+     if entry_contexts is None:
+         entry_contexts = [
+             "autoplay",
+             "choose next",
+             "ch swtch",
+             "sel thumb",
+             "launch first in session",
+         ]
+     _validate_pwatched_entry_context(entry_contexts)
+
+     counts_obj = stream.get("PWATCHED#24H", {})
+     feats = {}
+     for entry_context in entry_contexts:
+         attempts = counts_obj.get(entry_context, {}).get("attempts", 0)
+         watched = counts_obj.get(entry_context, {}).get("watched", 0)
+         context_key = entry_context if "launch" not in entry_context else "launch"
+         context_key = context_key.upper().replace(" ", "_")
+         feats[f"STREAM_{context_key}_24H_TOTAL_WATCHED"] = watched
+         feats[f"STREAM_{context_key}_24H_TOTAL_ATTEMPTS"] = attempts
+     return feats
+
+
+ def device_watched_count_cleanups(
+     stream, device_type: str, entry_contexts: list[str] | None = None
+ ) -> dict:
+     if entry_contexts is None:
+         entry_contexts = [
+             "autoplay",
+             "choose next",
+             "ch swtch",
+             "sel thumb",
+             "launch first in session",
+         ]
+
+     _validate_pwatched_entry_context(entry_contexts)
+     _validate_device_type(device_type)
+
+     counts_obj = stream.get(f"PWATCHED#24H#{device_type}", {})
+     feats = {}
+     for entry_context in entry_contexts:
+         attempts = counts_obj.get(entry_context, {}).get("attempts", 0)
+         watched = counts_obj.get(entry_context, {}).get("watched", 0)
+         context_key = entry_context if "launch" not in entry_context else "launch"
+         context_key = context_key.upper().replace(" ", "_")
+         feats[f"STREAM_{context_key}_{device_type}_24H_TOTAL_WATCHED"] = watched
+         feats[f"STREAM_{context_key}_{device_type}_24H_TOTAL_ATTEMPTS"] = attempts
+     return feats
+
+
+ def generic_beta_adjust_features(
+     data: pd.DataFrame,
+     prefix: str,
+     pwatched_beta_params: dict,
+     pselect_beta_params: dict,
+     pslw_beta_params: dict,
+     use_low_sample_flags: bool = False,
+     low_sample_threshold: int = 3,
+     use_attempt_features: bool = False,
+     max_attempt_cap: int = 100,
+     debiased_pselect: bool = True,
+     use_logodds: bool = False,
+ ) -> pd.DataFrame:
+     pwatched_features = {}
+     for context, (alpha, beta) in pwatched_beta_params.items():
+         total_watched = data[f"{prefix}_{context}_TOTAL_WATCHED"].fillna(0)
+         total_attempts = data[f"{prefix}_{context}_TOTAL_ATTEMPTS"].fillna(0)
+         pwatched_features[f"{prefix}_{context}_ADJ_PWATCHED"] = (
+             total_watched + alpha
+         ) / (total_attempts + alpha + beta)
+         if use_low_sample_flags:
+             pwatched_features[f"{prefix}_{context}_LOW_SAMPLE"] = total_attempts.le(
+                 low_sample_threshold
+             ).astype(int)
+         if use_attempt_features:
+             pwatched_features[f"{prefix}_{context}_ATTEMPTS"] = total_attempts.clip(
+                 upper=max_attempt_cap
+             )
+
+     pselect_features = {}
+     debias_suffix = "_UP_TO_4_BROWSED" if debiased_pselect else ""
+     for key, (alpha, beta) in pselect_beta_params.items():
+         total_selects = data[f"{prefix}_{key}_TOTAL_SELECTS{debias_suffix}"].fillna(0)
+         total_browsed = data[f"{prefix}_{key}_TOTAL_BROWSED{debias_suffix}"].fillna(0)
+         pselect_features[f"{prefix}_{key}_ADJ_PSELECT{debias_suffix}"] = (
+             total_selects + alpha
+         ) / (total_selects + total_browsed + alpha + beta)
+         if use_low_sample_flags:
+             pselect_features[f"{prefix}_{key}_PSELECT_LOW_SAMPLE{debias_suffix}"] = (
+                 (total_selects + total_browsed).le(low_sample_threshold).astype(int)
+             )
+         if use_attempt_features:
+             pselect_features[f"{prefix}_{key}_PSELECT_ATTEMPTS{debias_suffix}"] = (
+                 total_selects + total_browsed
+             ).clip(upper=max_attempt_cap)
+         total_slw = data[
+             f"{prefix}_{key}_TOTAL_SELECTS_AND_WATCHED{debias_suffix}"
+         ].fillna(0)
+         pslw_alpha, pslw_beta = pslw_beta_params[key]
+         pselect_features[f"{prefix}_{key}_ADJ_PSLW{debias_suffix}"] = (
+             total_slw + pslw_alpha
+         ) / (total_selects + total_browsed + pslw_alpha + pslw_beta)
+         pselect_features[f"{prefix}_{key}_PSelNotW{debias_suffix}"] = (
+             pselect_features[f"{prefix}_{key}_ADJ_PSELECT{debias_suffix}"]
+             - pselect_features[f"{prefix}_{key}_ADJ_PSLW{debias_suffix}"]
+         )
+
+     adjusted_feats = pd.DataFrame({**pwatched_features, **pselect_features})
+     if use_logodds:
+         prob_cols = [
+             c
+             for c in adjusted_feats.columns
+             if "PSELECT" in c or "PSLW" in c or "PWATCHED" in c or "PSelNotW" in c
+         ]
+         adjusted_feats = adjusted_feats.assign(
+             **adjusted_feats[prob_cols].clip(lower=0.001).pipe(prob_to_logodds)
+         )
+     return adjusted_feats
+
+
+ def prob_to_logodds(prob):
+     return np.log(prob) - np.log(1 - prob)
+
+
+ def scale_preds(
+     preds: pd.Series,
+     original_mean: float,
+     original_std: float,
+     target_mean: float,
+     target_std: float,
+ ) -> pd.Series:
+     z_score = (preds - original_mean) / original_std
+     return z_score * target_std + target_mean
+
+
+ def sigmoid(x):
+     return 1 / (1 + np.exp(-x))
+
+
+ def generic_logistic_predict(
+     data: pd.DataFrame, coeffs: pd.Series, intercept: float
+ ) -> pd.Series:
+     return ((data[coeffs.index] * coeffs).sum(axis=1) + intercept).pipe(sigmoid)
+
+
+ def _validate_device_type(device_type: str):
+     if device_type not in ("TV", "MOBILE"):
+         raise ValueError(f"Invalid device type '{device_type}'.")
+
+
+ def _validate_pwatched_entry_context(entry_contexts: list[str]):
+     valid_contexts = [
+         "autoplay",
+         "choose next",
+         "ch swtch",
+         "sel thumb",
+         "launch first in session",
+     ]
+     invalid_contexts = [c for c in entry_contexts if c not in valid_contexts]
+     if invalid_contexts:
+         raise ValueError(f"Invalid entry contexts found: {invalid_contexts}")
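
The beta adjustment in `generic_beta_adjust_features` is a smoothed rate: with a Beta(alpha, beta) prior, the adjusted watch probability is `(watched + alpha) / (attempts + alpha + beta)`. A tiny sketch with made-up column data (the context/key names are arbitrary labels that only need to match the column naming convention):

```python
import pandas as pd

from haystack_ml_stack.utils import generic_beta_adjust_features

data = pd.DataFrame(
    {
        "STREAM_AUTOPLAY_24H_TOTAL_WATCHED": [3, 0],
        "STREAM_AUTOPLAY_24H_TOTAL_ATTEMPTS": [10, 0],
        "STREAM_1ST_POS_24H_TOTAL_SELECTS_UP_TO_4_BROWSED": [5, 1],
        "STREAM_1ST_POS_24H_TOTAL_BROWSED_UP_TO_4_BROWSED": [20, 2],
        "STREAM_1ST_POS_24H_TOTAL_SELECTS_AND_WATCHED_UP_TO_4_BROWSED": [4, 0],
    }
)
feats = generic_beta_adjust_features(
    data,
    prefix="STREAM",
    pwatched_beta_params={"AUTOPLAY_24H": (1.0, 4.0)},
    pselect_beta_params={"1ST_POS_24H": (1.0, 9.0)},
    pslw_beta_params={"1ST_POS_24H": (1.0, 9.0)},
)
# Row 0: (3 + 1) / (10 + 1 + 4) = 0.2667; row 1 falls back to the prior mean 0.2.
print(feats["STREAM_AUTOPLAY_24H_ADJ_PWATCHED"])
```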
+++ haystack_ml_stack-0.2.1.dist-info/METADATA
@@ -0,0 +1,97 @@
+ Metadata-Version: 2.4
+ Name: haystack-ml-stack
+ Version: 0.2.1
+ Summary: Functions related to Haystack ML
+ Author-email: Oscar Vega <oscar@haystack.tv>
+ License: MIT
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+ Requires-Dist: pydantic==2.5.0
+ Requires-Dist: cachetools==5.5.2
+ Requires-Dist: cloudpickle==2.2.1
+ Requires-Dist: aioboto3==12.0.0
+ Requires-Dist: fastapi==0.104.1
+ Requires-Dist: pydantic-settings==2.2
+ Requires-Dist: newrelic==11.1.0
+
+ # Haystack ML Stack
+
+ Currently, this project contains a FastAPI-based service designed for low-latency scoring of stream data arriving via HTTP requests.
+
+ ## 🚀 Features
+
+ * **FastAPI Service:** Lightweight and fast web service for ML inference.
+ * **Asynchronous I/O:** Uses `aiobotocore` for non-blocking S3 and DynamoDB operations.
+ * **Model Loading:** Downloads and loads the ML model (via `cloudpickle`) from a configurable S3 path on startup.
+ * **Feature Caching:** Implements a time-aware LRU cache (`cachetools.TLRUCache`) with per-item TTLs for DynamoDB features, reducing latency and database load.
+ * **DynamoDB Integration:** Fetches stream-specific features from DynamoDB to enrich the data before scoring.
+ * **Health Check:** Provides a `/health` endpoint to monitor service status and model loading.
+
+ ## 📦 Installation
+
+ This project requires Python 3.11 or later.
+
+ 1. **Install the package** (dependencies are listed in `pyproject.toml` and installed automatically):
+
+    ```bash
+    pip install haystack-ml-stack
+    ```
+
+ ## ⚙️ Configuration
+
+ The service is configured via environment variables, managed by `pydantic-settings`. You can use a `.env` file for local development.
+
+ | Variable Name | Alias | Default | Description |
+ | :--- | :--- | :--- | :--- |
+ | `S3_MODEL_PATH` | `S3_MODEL_PATH` | `None` | **Required.** The `s3://bucket/key` URL for the cloudpickled ML model file. |
+ | `FEATURES_TABLE`| `FEATURES_TABLE`| `"features"` | Name of the DynamoDB table storing stream features. |
+ | `LOGS_FRACTION` | `LOGS_FRACTION` | `0.01` | Fraction of requests to log detailed stream data for sampling/debugging (0.0 to 1.0). |
+ | `CACHE_MAXSIZE` | *(none)* | `50000` | Maximum size of the in-memory feature cache. |
+
+ **Example env vars**
+
+ ```env
+ S3_MODEL_PATH="s3://my-ml-models/stream-scorer/latest.pkl"
+ FEATURES_TABLE="features"
+ LOGS_FRACTION=0.05
+ ```
+
+ ## 🌐 Endpoints
+
+ | Method | Path | Description |
+ | :--- | :--- | :--- |
+ | **GET** | `/` | Root endpoint; returns a simple running message. |
+ | **GET** | `/health` | Checks that the service is running and the ML model has been loaded. |
+ | **POST** | `/score` | **Main scoring endpoint.** Accepts stream data and returns model predictions. |
+
+ ## 💻 Technical Details
+
+ ### Model Structure
+ The ML model file downloaded from S3 is expected to be a cloudpickle-serialized Python dictionary with the following structure:
+
+ ```python
+ model = {
+     "preprocess": <function>,        # Transforms request data into model input.
+     "predict": <function>,           # Performs the actual model inference.
+     "params": <dict/any>,            # Optional parameters passed to preprocess/predict.
+     "stream_features": <list[str]>,  # Optional list of feature names to fetch from DynamoDB.
+ }
+ ```
+
+ ### Feature Caching (cache.py)
+ The features cache is a `cachetools.TLRUCache` whose `_ttu` (time-to-use) policy lets each feature specify its own TTL via a `cache_ttl_in_seconds` key in the stored value, falling back to a long default otherwise.
+
+ ### DynamoDB Feature Fetching (dynamo.py)
+ The `set_stream_features` function handles:
+
+ - Checking the in-memory cache for the required `stream_features`.
+ - Batch-fetching any missing features from DynamoDB.
+ - Parsing the low-level DynamoDB items into Python types.
+ - Populating the cache with the fetched data, respecting each feature's TTL.
+ - Injecting the fetched feature values back into the `streams` list in the request payload.
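
Finally, a sketch of calling `/score` with the payload shape `app.py` reads (`user`, `streams`, `playlist`); field values are illustrative and the service is assumed to be listening locally:

```python
import json
import urllib.request

payload = {
    "user": {"userid": "u123"},
    "streams": [{"streamUrl": "https://example.com/live/abc"}],
    "playlist": {},
}
req = urllib.request.Request(
    "http://localhost:8000/score",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read()))  # whatever the model's predict() returned
```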
+++ haystack_ml_stack-0.2.1.dist-info/RECORD
@@ -0,0 +1,11 @@
+ haystack_ml_stack/__init__.py,sha256=RAW9zUwU7LLT50KJD-hDRKn0Rn0UzhAwH3YOb4l1w80,75
+ haystack_ml_stack/app.py,sha256=08PSZJObC6p6CwW1140zGRwmkhDFNMAs7ij-07OzlxU,6756
+ haystack_ml_stack/cache.py,sha256=X5sWARTvxbkCnl6NdILa4lwDt7iAm5Wl9CcgoEv7E6s,506
+ haystack_ml_stack/dynamo.py,sha256=JovZAmv8GC6QPRrFXkbWT3Vk92_5v68SYL4rTOMxdOw,6776
+ haystack_ml_stack/model_store.py,sha256=50vuFeNwtYZknPaEB4vzglQ_AeVTjjPew0TNhaXyNCU,1235
+ haystack_ml_stack/settings.py,sha256=2JH-H7NtKaVbc11jq4Yl3h5TjCve98g--8pqNvQjfj4,574
+ haystack_ml_stack/utils.py,sha256=jZanwbgF6j_6eXzE2J1VhQiPz-rUnrDi8bgzIYym-qI,9892
+ haystack_ml_stack-0.2.1.dist-info/METADATA,sha256=9pxaz8HHDNQrHfVe695YmFEChnKCT9FeplN9vUyxA9U,3933
+ haystack_ml_stack-0.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ haystack_ml_stack-0.2.1.dist-info/top_level.txt,sha256=S3g0eH9BeMKygOIwmfB25jtCiAfj0b3CbwPTalcIwvc,18
+ haystack_ml_stack-0.2.1.dist-info/RECORD,,
+++ haystack_ml_stack-0.2.1.dist-info/WHEEL
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
+++ haystack_ml_stack-0.2.1.dist-info/top_level.txt
@@ -0,0 +1 @@
+ haystack_ml_stack