haystack-ml-stack 0.1.2__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23) hide show
  1. {haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/PKG-INFO +2 -1
  2. {haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/pyproject.toml +3 -2
  3. {haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/src/haystack_ml_stack/__init__.py +1 -1
  4. {haystack_ml_stack-0.1.2/src/haystack_test_package → haystack_ml_stack-0.2.1/src/haystack_ml_stack}/app.py +49 -11
  5. {haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/src/haystack_ml_stack/dynamo.py +53 -7
  6. {haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/src/haystack_ml_stack/model_store.py +3 -1
  7. haystack_ml_stack-0.2.1/src/haystack_ml_stack/utils.py +276 -0
  8. {haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/src/haystack_ml_stack.egg-info/PKG-INFO +2 -1
  9. {haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/src/haystack_ml_stack.egg-info/SOURCES.txt +2 -7
  10. {haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/src/haystack_ml_stack.egg-info/requires.txt +1 -0
  11. haystack_ml_stack-0.2.1/src/haystack_ml_stack.egg-info/top_level.txt +1 -0
  12. haystack_ml_stack-0.1.2/src/haystack_ml_stack/app.py +0 -158
  13. haystack_ml_stack-0.1.2/src/haystack_ml_stack.egg-info/top_level.txt +0 -2
  14. haystack_ml_stack-0.1.2/src/haystack_test_package/__init__.py +0 -4
  15. haystack_ml_stack-0.1.2/src/haystack_test_package/cache.py +0 -19
  16. haystack_ml_stack-0.1.2/src/haystack_test_package/dynamo.py +0 -137
  17. haystack_ml_stack-0.1.2/src/haystack_test_package/model_store.py +0 -36
  18. haystack_ml_stack-0.1.2/src/haystack_test_package/settings.py +0 -22
  19. {haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/README.md +0 -0
  20. {haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/setup.cfg +0 -0
  21. {haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/src/haystack_ml_stack/cache.py +0 -0
  22. {haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/src/haystack_ml_stack/settings.py +0 -0
  23. {haystack_ml_stack-0.1.2 → haystack_ml_stack-0.2.1}/src/haystack_ml_stack.egg-info/dependency_links.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: haystack-ml-stack
3
- Version: 0.1.2
3
+ Version: 0.2.1
4
4
  Summary: Functions related to Haystack ML
5
5
  Author-email: Oscar Vega <oscar@haystack.tv>
6
6
  License: MIT
@@ -12,6 +12,7 @@ Requires-Dist: cloudpickle==2.2.1
12
12
  Requires-Dist: aioboto3==12.0.0
13
13
  Requires-Dist: fastapi==0.104.1
14
14
  Requires-Dist: pydantic-settings==2.2
15
+ Requires-Dist: newrelic==11.1.0
15
16
 
16
17
  # Haystack ML Stack
17
18
 
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
5
5
 
6
6
  [project]
7
7
  name = "haystack-ml-stack"
8
- version = "0.1.2"
8
+ version = "0.2.1"
9
9
  description = "Functions related to Haystack ML"
10
10
  readme = "README.md"
11
11
  authors = [{ name = "Oscar Vega", email = "oscar@haystack.tv" }]
@@ -16,6 +16,7 @@ dependencies = [
16
16
  "cloudpickle==2.2.1",
17
17
  "aioboto3==12.0.0",
18
18
  "fastapi==0.104.1",
19
- "pydantic-settings==2.2"
19
+ "pydantic-settings==2.2",
20
+ "newrelic==11.1.0"
20
21
  ]
21
22
  license = { text = "MIT" }
@@ -1,4 +1,4 @@
1
1
  from .app import create_app
2
2
 
3
3
  __all__ = ["create_app"]
4
- __version__ = "0.1.2"
4
+ __version__ = "0.2.1"
@@ -4,13 +4,15 @@ import random
4
4
  import sys
5
5
  from http import HTTPStatus
6
6
  from typing import Any, Dict, List, Optional
7
+ import time
7
8
 
8
9
  import aiobotocore.session
9
10
  from fastapi import FastAPI, HTTPException, Request, Response
10
11
  from fastapi.encoders import jsonable_encoder
11
12
 
13
+
12
14
  from .cache import make_features_cache
13
- from .dynamo import set_stream_features
15
+ from .dynamo import set_stream_features, FeatureRetrievalMeta
14
16
  from .model_store import download_and_load_model
15
17
  from .settings import Settings
16
18
 
@@ -23,6 +25,8 @@ logging.basicConfig(
23
25
 
24
26
  logger = logging.getLogger(__name__)
25
27
 
28
+ import newrelic.agent
29
+
26
30
 
27
31
  def create_app(
28
32
  settings: Optional[Settings] = None,
@@ -96,11 +100,15 @@ def create_app(
96
100
 
97
101
  try:
98
102
  data = await request.json()
99
- except Exception:
103
+ except Exception as e:
100
104
  raise HTTPException(
101
105
  status_code=HTTPStatus.BAD_REQUEST, detail="Invalid JSON payload"
102
- )
103
-
106
+ ) from e
107
+ query_params = {}
108
+ for k in request.query_params.keys():
109
+ values = request.query_params.getlist(k)
110
+ # flatten single-element lists
111
+ query_params[k] = values[0] if len(values) == 1 else values
104
112
  user = data.get("user", {})
105
113
  streams: List[Dict[str, Any]] = data.get("streams", [])
106
114
  playlist = data.get("playlist", {})
@@ -112,9 +120,11 @@ def create_app(
112
120
  # Feature fetch (optional based on model)
113
121
  model = state["model"]
114
122
  stream_features = model.get("stream_features", []) or []
123
+ retrieval_meta = FeatureRetrievalMeta(
124
+ cache_misses=0, retrieval_ms=0, success=True, cache_delay_minutes=0
125
+ )
115
126
  if stream_features:
116
- logger.info("Fetching stream features for user %s", user.get("userid", ""))
117
- await set_stream_features(
127
+ retrieval_meta = await set_stream_features(
118
128
  aio_session=state["session"],
119
129
  streams=streams,
120
130
  stream_features=stream_features,
@@ -124,24 +134,52 @@ def create_app(
124
134
  cache_sep=cfg.cache_separator,
125
135
  )
126
136
 
137
+ random_number = random.random()
138
+ userid = user.get("userid", "")
127
139
  # Sampling logs
128
- if random.random() < cfg.logs_fraction:
140
+ if random_number < cfg.logs_fraction:
129
141
  logger.info("User %s streams: %s", user.get("userid", ""), streams)
130
142
 
131
143
  # Synchronous model execution (user code)
132
144
  try:
145
+ preprocess_start = time.perf_counter_ns()
133
146
  model_input = model["preprocess"](
134
- user, streams, playlist, model.get("params")
147
+ user,
148
+ streams,
149
+ playlist,
150
+ {**model.get("params"), "query_params": query_params},
151
+ )
152
+ predict_start = time.perf_counter_ns()
153
+ model_output = model["predict"](
154
+ model_input, {**model.get("params"), "query_params": query_params}
135
155
  )
136
- model_output = model["predict"](model_input, model.get("params"))
156
+ predict_end = time.perf_counter_ns()
137
157
  except Exception as e:
138
158
  logger.error("Model prediction failed: %s", e)
139
159
  raise HTTPException(
140
160
  status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
141
161
  detail="Model prediction failed",
142
- )
143
-
162
+ ) from e
163
+
164
+ newrelic.agent.record_custom_event(
165
+ "Inference",
166
+ {
167
+ "cache_misses": retrieval_meta.cache_misses,
168
+ "retrieval_success": int(retrieval_meta.success),
169
+ "cache_delay_minutes": retrieval_meta.cache_delay_minutes,
170
+ "retrieval_ms": retrieval_meta.retrieval_ms,
171
+ "preprocess_ms": (predict_start - preprocess_start) * 1e-6,
172
+ "predict_ms": (predict_end - predict_start) * 1e-6,
173
+ "total_scores": len(model_output),
174
+ },
175
+ )
144
176
  if model_output:
177
+ if random_number < cfg.logs_fraction:
178
+ logger.info(
179
+ "User %s - model output %s",
180
+ userid,
181
+ model_output,
182
+ )
145
183
  return jsonable_encoder(model_output)
146
184
 
147
185
  raise HTTPException(
@@ -1,11 +1,23 @@
1
- from typing import Any, Dict, List
1
+ from typing import Any, Dict, List, NamedTuple
2
2
  import logging
3
+ import time
4
+ import datetime
3
5
 
4
6
  import aiobotocore.session
7
+ import newrelic.agent
8
+
5
9
 
6
10
  logger = logging.getLogger(__name__)
7
11
 
8
12
 
13
+ class FeatureRetrievalMeta(NamedTuple):
14
+ cache_misses: int
15
+ retrieval_ms: float
16
+ success: bool
17
+ cache_delay_minutes: float
18
+
19
+
20
+ @newrelic.agent.function_trace()
9
21
  async def async_batch_get(
10
22
  dynamo_client, table_name: str, keys: List[Dict[str, Any]]
11
23
  ) -> List[Dict[str, Any]]:
@@ -53,6 +65,7 @@ async def async_batch_get(
53
65
  return all_items
54
66
 
55
67
 
68
+ @newrelic.agent.function_trace()
56
69
  def parse_dynamo_item(item: Dict[str, Any]) -> Dict[str, Any]:
57
70
  """Parse a DynamoDB attribute map (low-level) to Python types."""
58
71
  out: Dict[str, Any] = {}
@@ -76,6 +89,7 @@ def parse_dynamo_item(item: Dict[str, Any]) -> Dict[str, Any]:
76
89
  return out
77
90
 
78
91
 
92
+ @newrelic.agent.function_trace()
79
93
  async def set_stream_features(
80
94
  *,
81
95
  streams: List[Dict[str, Any]],
@@ -85,12 +99,20 @@ async def set_stream_features(
85
99
  stream_pk_prefix: str,
86
100
  cache_sep: str,
87
101
  aio_session: aiobotocore.session.Session | None = None,
88
- ) -> None:
102
+ ) -> FeatureRetrievalMeta:
103
+ time_start = time.perf_counter_ns()
89
104
  """Fetch missing features for streams from DynamoDB and fill them into streams."""
90
105
  if not streams or not stream_features:
91
- return
106
+ return FeatureRetrievalMeta(
107
+ cache_misses=0,
108
+ retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
109
+ success=True,
110
+ cache_delay_minutes=0,
111
+ )
92
112
 
93
113
  cache_miss: Dict[str, Dict[str, Any]] = {}
114
+ cache_delay_obj: dict[str, float] = {f: 0 for f in stream_features}
115
+ now = datetime.datetime.utcnow()
94
116
  for f in stream_features:
95
117
  for s in streams:
96
118
  key = f"{s['streamUrl']}{cache_sep}{f}"
@@ -99,13 +121,23 @@ async def set_stream_features(
99
121
  cached = features_cache.get(key)
100
122
  if cached["value"] is not None:
101
123
  s[f] = cached["value"]
124
+ cache_delay_obj[f] = max(
125
+ cache_delay_obj[f], (now - cached["updated_at"]).total_seconds()
126
+ )
102
127
  else:
103
128
  cache_miss[key] = s
129
+ valid_cache_delays = list(v for v in cache_delay_obj.values() if v > 0)
130
+ cache_delay = min(valid_cache_delays) if valid_cache_delays else 0
104
131
 
105
132
  if not cache_miss:
106
- return
107
-
108
- logger.info("Cache miss for %d items", len(cache_miss))
133
+ return FeatureRetrievalMeta(
134
+ cache_misses=0,
135
+ retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
136
+ success=True,
137
+ cache_delay_minutes=cache_delay / 60,
138
+ )
139
+ cache_misses = len(cache_miss)
140
+ logger.info("Cache miss for %d items", cache_misses)
109
141
 
110
142
  # Prepare keys
111
143
  keys = []
@@ -120,7 +152,12 @@ async def set_stream_features(
120
152
  items = await async_batch_get(dynamodb, features_table, keys)
121
153
  except Exception as e:
122
154
  logger.error("DynamoDB batch_get failed: %s", e)
123
- return
155
+ return FeatureRetrievalMeta(
156
+ cache_misses=cache_misses,
157
+ retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
158
+ success=False,
159
+ cache_delay_minutes=cache_delay / 60,
160
+ )
124
161
 
125
162
  updated_keys = set()
126
163
  for item in items:
@@ -132,6 +169,9 @@ async def set_stream_features(
132
169
  features_cache[cache_key] = {
133
170
  "value": parsed.get("value"),
134
171
  "cache_ttl_in_seconds": int(parsed.get("cache_ttl_in_seconds", -1)),
172
+ "updated_at": datetime.datetime.fromisoformat(
173
+ parsed.get("updated_at")
174
+ ).replace(tzinfo=None),
135
175
  }
136
176
  if cache_key in cache_miss:
137
177
  cache_miss[cache_key][feature_name] = parsed.get("value")
@@ -142,3 +182,9 @@ async def set_stream_features(
142
182
  missing_keys = set(cache_miss.keys()) - updated_keys
143
183
  for k in missing_keys:
144
184
  features_cache[k] = {"value": None, "cache_ttl_in_seconds": 300}
185
+ return FeatureRetrievalMeta(
186
+ cache_misses=cache_misses,
187
+ retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
188
+ success=True,
189
+ cache_delay_minutes=cache_delay / 60,
190
+ )
@@ -4,10 +4,12 @@ from typing import Any, Dict
4
4
 
5
5
  import aiobotocore.session
6
6
  import cloudpickle
7
+ import newrelic.agent
7
8
 
8
9
  logger = logging.getLogger(__name__)
9
10
 
10
11
 
12
+ @newrelic.agent.function_trace()
11
13
  async def download_and_load_model(
12
14
  s3_url: str, aio_session: aiobotocore.session.Session | None = None
13
15
  ) -> Dict[str, Any]:
@@ -33,4 +35,4 @@ async def download_and_load_model(
33
35
 
34
36
  with open(local_path, "rb") as f:
35
37
  model: Dict[str, Any] = cloudpickle.load(f)
36
- return model
38
+ return model
@@ -0,0 +1,276 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import typing as _t
4
+
5
+
6
def stream_tags_cleanup(
    stream, user_favorite_tags: list[str], user_favorite_authors: list[str]
) -> dict:
    """Flag whether a stream matches the user's favorite tags or authors.

    Args:
        stream: Mapping describing the stream. The ``"haystackTags"`` and
            ``"author"`` entries are read; both are optional.
        user_favorite_tags: Tags the user marked as favorite, or ``None``.
        user_favorite_authors: Authors the user marked as favorite, or ``None``.

    Returns:
        A dict with two booleans: ``"IS_FAVORITE_TAG"`` is true when at least
        one stream tag is in ``user_favorite_tags``; ``"IS_FAVORITE_AUTHOR"``
        is true when the stream's author is in ``user_favorite_authors``.
        A ``None`` favorites list always yields ``False`` for its flag.
    """
    # ``or []`` also covers a key that is present but explicitly None, which
    # would otherwise fail when iterated below.
    stream_tags = stream.get("haystackTags") or []
    is_favorite_tag = user_favorite_tags is not None and any(
        tag in user_favorite_tags for tag in stream_tags
    )
    is_favorite_author = (
        user_favorite_authors is not None
        and stream.get("author") in user_favorite_authors
    )
    return {
        "IS_FAVORITE_TAG": is_favorite_tag,
        "IS_FAVORITE_AUTHOR": is_favorite_author,
    }
24
+
25
+
26
def browsed_count_cleanups(
    stream,
    position_debiasing: _t.Literal["4_browsed", "all_browsed"] = "4_browsed",
) -> dict:
    """Sum a stream's 24-hour browse/select counters over all positions.

    Reads ``stream["PSELECT#24H"][position_debiasing]`` and adds up the
    ``total_browsed``, ``total_selects`` and ``total_selects_and_watched``
    counters of the position buckets ``"0"``, ``"1"``, ``"2"`` and ``"3+"``.
    Missing buckets or counters count as 0.

    Args:
        stream: Mapping holding the stream's features.
        position_debiasing: Which counter family to read. ``"4_browsed"``
            adds the ``_UP_TO_4_BROWSED`` suffix to the output keys,
            ``"all_browsed"`` adds no suffix.

    Returns:
        A dict with the keys ``STREAM_24H_TOTAL_BROWSED``,
        ``STREAM_24H_TOTAL_SELECTS`` and
        ``STREAM_24H_TOTAL_SELECTS_AND_WATCHED`` (each followed by the suffix).

    Raises:
        ValueError: If ``position_debiasing`` is not one of the two
            supported values.
    """
    if position_debiasing == "4_browsed":
        suffix = "_UP_TO_4_BROWSED"
    elif position_debiasing == "all_browsed":
        suffix = ""
    else:
        raise ValueError(f"Unexpected position debiasing '{position_debiasing}'.")

    # ``or {}`` also covers entries that are present but explicitly None.
    pselect_obj = stream.get("PSELECT#24H") or {}
    browsed_count_obj = pselect_obj.get(position_debiasing) or {}

    total_browsed = 0
    total_selects = 0
    total_selects_and_watched = 0
    for position in ("0", "1", "2", "3+"):
        pos_counts = browsed_count_obj.get(position) or {}
        total_browsed += pos_counts.get("total_browsed", 0)
        total_selects += pos_counts.get("total_selects", 0)
        total_selects_and_watched += pos_counts.get("total_selects_and_watched", 0)

    return {
        f"STREAM_24H_TOTAL_BROWSED{suffix}": total_browsed,
        f"STREAM_24H_TOTAL_SELECTS{suffix}": total_selects,
        f"STREAM_24H_TOTAL_SELECTS_AND_WATCHED{suffix}": total_selects_and_watched,
    }
62
+
63
+
64
def device_split_browsed_count_cleanups(
    stream,
    device_type: _t.Literal["TV", "MOBILE"],
    position_debiasing: _t.Literal["4_browsed", "all_browsed"] = "4_browsed",
) -> dict:
    """Extract per-position, per-device 24-hour browse/select counters.

    Reads ``stream[f"PSELECT#24H#{device_type}"][position_debiasing]`` and,
    for each position bucket, copies its ``total_browsed``, ``total_selects``
    and ``total_selects_and_watched`` counters into a flat feature dict.
    Unlike ``browsed_count_cleanups`` the counters are reported per position,
    not summed. Missing buckets or counters are reported as 0.

    Args:
        stream: Mapping holding the stream's features.
        device_type: Device family to read; checked by
            ``_validate_device_type``.
        position_debiasing: Which counter family to read. ``"4_browsed"``
            adds the ``_UP_TO_4_BROWSED`` suffix to the output keys,
            ``"all_browsed"`` adds no suffix.

    Returns:
        A dict keyed ``STREAM_{alias}_{device_type}_24H_TOTAL_{counter}{suffix}``
        where ``alias`` is ``1ST_POS``, ``2ND_POS``, ``3RD_POS`` or
        ``REST_POS``.

    Raises:
        ValueError: If ``position_debiasing`` is unsupported, or if
            ``_validate_device_type`` rejects ``device_type``.
    """
    position_alias_mapping = {
        "0": "1ST_POS",
        "1": "2ND_POS",
        "2": "3RD_POS",
        "3+": "REST_POS",
    }
    if position_debiasing == "4_browsed":
        suffix = "_UP_TO_4_BROWSED"
    elif position_debiasing == "all_browsed":
        suffix = ""
    else:
        raise ValueError(f"Unexpected position debiasing '{position_debiasing}'.")

    _validate_device_type(device_type)

    # ``or {}`` also covers entries that are present but explicitly None.
    pselect_obj = stream.get(f"PSELECT#24H#{device_type}") or {}
    browsed_count_obj = pselect_obj.get(position_debiasing) or {}

    feats = {}
    for position, alias in position_alias_mapping.items():
        pos_counts = browsed_count_obj.get(position) or {}
        key_prefix = f"STREAM_{alias}_{device_type}_24H_TOTAL"
        feats[f"{key_prefix}_BROWSED{suffix}"] = pos_counts.get("total_browsed", 0)
        feats[f"{key_prefix}_SELECTS{suffix}"] = pos_counts.get("total_selects", 0)
        feats[f"{key_prefix}_SELECTS_AND_WATCHED{suffix}"] = pos_counts.get(
            "total_selects_and_watched", 0
        )
    return feats
102
+
103
+
104
def watched_count_cleanups(
    stream, entry_contexts: _t.Optional[list[str]] = None
) -> dict:
    """Extract 24-hour watched/attempt counters per entry context.

    Reads ``stream["PWATCHED#24H"]`` and, for every requested entry context,
    copies its ``watched`` and ``attempts`` counters into a flat feature
    dict. Missing contexts or counters are reported as 0.

    Args:
        stream: Mapping holding the stream's features.
        entry_contexts: Entry contexts to extract. ``None`` selects
            ``"autoplay"``, ``"choose next"``, ``"ch swtch"``, ``"sel thumb"``
            and ``"launch first in session"``.

    Returns:
        A dict keyed ``STREAM_{CONTEXT}_24H_TOTAL_WATCHED`` and
        ``STREAM_{CONTEXT}_24H_TOTAL_ATTEMPTS``. ``CONTEXT`` is the entry
        context upper-cased with spaces replaced by underscores; any context
        containing ``"launch"`` is reported as ``LAUNCH``.

    Raises:
        ValueError: If ``_validate_pwatched_entry_context`` rejects one of
            the entry contexts.
    """
    if entry_contexts is None:
        entry_contexts = [
            "autoplay",
            "choose next",
            "ch swtch",
            "sel thumb",
            "launch first in session",
        ]
    _validate_pwatched_entry_context(entry_contexts)

    # ``or {}`` also covers entries that are present but explicitly None.
    counts_obj = stream.get("PWATCHED#24H") or {}
    feats = {}
    for entry_context in entry_contexts:
        context_counts = counts_obj.get(entry_context) or {}
        context_key = entry_context if "launch" not in entry_context else "launch"
        context_key = context_key.upper().replace(" ", "_")
        feats[f"STREAM_{context_key}_24H_TOTAL_WATCHED"] = context_counts.get(
            "watched", 0
        )
        feats[f"STREAM_{context_key}_24H_TOTAL_ATTEMPTS"] = context_counts.get(
            "attempts", 0
        )
    return feats
125
+
126
+
127
def device_watched_count_cleanups(
    stream, device_type: str, entry_contexts: _t.Optional[list[str]] = None
) -> dict:
    """Extract per-device 24-hour watched/attempt counters per entry context.

    Reads ``stream[f"PWATCHED#24H#{device_type}"]`` and, for every requested
    entry context, copies its ``watched`` and ``attempts`` counters into a
    flat feature dict. Missing contexts or counters are reported as 0.

    Args:
        stream: Mapping holding the stream's features.
        device_type: Device family to read; checked by
            ``_validate_device_type``.
        entry_contexts: Entry contexts to extract. ``None`` selects
            ``"autoplay"``, ``"choose next"``, ``"ch swtch"``, ``"sel thumb"``
            and ``"launch first in session"``.

    Returns:
        A dict keyed ``STREAM_{CONTEXT}_{device_type}_24H_TOTAL_WATCHED`` and
        ``STREAM_{CONTEXT}_{device_type}_24H_TOTAL_ATTEMPTS``. ``CONTEXT`` is
        the entry context upper-cased with spaces replaced by underscores;
        any context containing ``"launch"`` is reported as ``LAUNCH``.

    Raises:
        ValueError: If the entry contexts or the device type are rejected by
            their validators.
    """
    if entry_contexts is None:
        entry_contexts = [
            "autoplay",
            "choose next",
            "ch swtch",
            "sel thumb",
            "launch first in session",
        ]

    _validate_pwatched_entry_context(entry_contexts)
    _validate_device_type(device_type)

    # ``or {}`` also covers entries that are present but explicitly None.
    counts_obj = stream.get(f"PWATCHED#24H#{device_type}") or {}
    feats = {}
    for entry_context in entry_contexts:
        context_counts = counts_obj.get(entry_context) or {}
        context_key = entry_context if "launch" not in entry_context else "launch"
        context_key = context_key.upper().replace(" ", "_")
        # Features are stored flat, as in watched_count_cleanups. Writing
        # through a nested "features" key would raise KeyError because
        # ``feats`` starts out empty.
        feats[f"STREAM_{context_key}_{device_type}_24H_TOTAL_WATCHED"] = (
            context_counts.get("watched", 0)
        )
        feats[f"STREAM_{context_key}_{device_type}_24H_TOTAL_ATTEMPTS"] = (
            context_counts.get("attempts", 0)
        )
    return feats
156
+
157
+
158
def generic_beta_adjust_features(
    data: pd.DataFrame,
    prefix: str,
    pwatched_beta_params: dict,
    pselect_beta_params: dict,
    pslw_beta_params: dict,
    use_low_sample_flags: bool = False,
    low_sample_threshold: int = 3,
    use_attempt_features: bool = False,
    max_attempt_cap: int = 100,
    debiased_pselect: bool = True,
    use_logodds: bool = False,
) -> pd.DataFrame:
    """Build smoothed watch/select rate features from raw counters.

    Each rate is computed as ``(successes + alpha) / (trials + alpha + beta)``
    with NaN counters treated as 0.

    Args:
        data: Frame holding the raw counter columns
            ``{prefix}_{context}_TOTAL_WATCHED`` / ``..._TOTAL_ATTEMPTS`` and
            ``{prefix}_{key}_TOTAL_SELECTS`` / ``..._TOTAL_BROWSED`` /
            ``..._TOTAL_SELECTS_AND_WATCHED`` (the latter three followed by
            the debias suffix).
        prefix: Leading part of every input and output column name.
        pwatched_beta_params: Maps a context to its ``(alpha, beta)`` pair for
            the watched rate.
        pselect_beta_params: Maps a key to its ``(alpha, beta)`` pair for the
            select rate.
        pslw_beta_params: Maps the same keys as ``pselect_beta_params`` to the
            ``(alpha, beta)`` pair for the select-and-watched rate.
        use_low_sample_flags: Also emit 0/1 columns flagging rows whose trial
            count is ``<= low_sample_threshold``.
        low_sample_threshold: Threshold used by the low-sample flags.
        use_attempt_features: Also emit the trial counts, capped at
            ``max_attempt_cap``.
        max_attempt_cap: Upper cap applied to the attempt features.
        debiased_pselect: When true, select columns carry the
            ``_UP_TO_4_BROWSED`` suffix; otherwise no suffix.
        use_logodds: When true, the rate columns are clipped to at least
            0.001 and converted with ``prob_to_logodds``.

    Returns:
        A new frame with the adjusted features, watched features first and
        select features after, indexed like ``data``.

    Raises:
        KeyError: If an expected counter column is missing from ``data`` or a
            select key has no entry in ``pslw_beta_params``.
    """
    # Columns that hold rates. Only these are eligible for the log-odds
    # transform; low-sample flags and attempt counts are not probabilities
    # and would turn into inf/NaN if transformed.
    prob_columns = []

    pwatched_features = {}
    for context, (alpha, beta) in pwatched_beta_params.items():
        base = f"{prefix}_{context}"
        total_watched = data[f"{base}_TOTAL_WATCHED"].fillna(0)
        total_attempts = data[f"{base}_TOTAL_ATTEMPTS"].fillna(0)
        pwatched_col = f"{base}_ADJ_PWATCHED"
        pwatched_features[pwatched_col] = (total_watched + alpha) / (
            total_attempts + alpha + beta
        )
        prob_columns.append(pwatched_col)
        if use_low_sample_flags:
            pwatched_features[f"{base}_LOW_SAMPLE"] = total_attempts.le(
                low_sample_threshold
            ).astype(int)
        if use_attempt_features:
            pwatched_features[f"{base}_ATTEMPTS"] = total_attempts.clip(
                upper=max_attempt_cap
            )

    pselect_features = {}
    debias_suffix = "_UP_TO_4_BROWSED" if debiased_pselect else ""
    for key, (alpha, beta) in pselect_beta_params.items():
        base = f"{prefix}_{key}"
        total_selects = data[f"{base}_TOTAL_SELECTS{debias_suffix}"].fillna(0)
        total_browsed = data[f"{base}_TOTAL_BROWSED{debias_suffix}"].fillna(0)
        pselect_col = f"{base}_ADJ_PSELECT{debias_suffix}"
        pselect_features[pselect_col] = (total_selects + alpha) / (
            total_selects + total_browsed + alpha + beta
        )
        if use_low_sample_flags:
            pselect_features[f"{base}_PSELECT_LOW_SAMPLE{debias_suffix}"] = (
                (total_selects + total_browsed).le(low_sample_threshold).astype(int)
            )
        if use_attempt_features:
            pselect_features[f"{base}_PSELECT_ATTEMPTS{debias_suffix}"] = (
                total_selects + total_browsed
            ).clip(upper=max_attempt_cap)

        total_slw = data[f"{base}_TOTAL_SELECTS_AND_WATCHED{debias_suffix}"].fillna(0)
        pslw_alpha, pslw_beta = pslw_beta_params[key]
        pslw_col = f"{base}_ADJ_PSLW{debias_suffix}"
        pselect_features[pslw_col] = (total_slw + pslw_alpha) / (
            total_selects + total_browsed + pslw_alpha + pslw_beta
        )
        # Rate of "selected but not watched".
        sel_not_watched_col = f"{base}_PSelNotW{debias_suffix}"
        pselect_features[sel_not_watched_col] = (
            pselect_features[pselect_col] - pselect_features[pslw_col]
        )
        prob_columns.extend([pselect_col, pslw_col, sel_not_watched_col])

    adjusted_feats = pd.DataFrame({**pwatched_features, **pselect_features})
    if use_logodds:
        # The lower clip keeps the log finite for zero or negative rates
        # (PSelNotW is a difference and can drop to 0 or below).
        clipped = adjusted_feats[prob_columns].clip(lower=0.001)
        adjusted_feats = adjusted_feats.assign(**prob_to_logodds(clipped))
    return adjusted_feats
234
+
235
+
236
def prob_to_logodds(prob: float) -> float:
    """Convert a probability to log-odds, ``log(prob) - log(1 - prob)``.

    Evaluated with NumPy, so array-like inputs are transformed element-wise.
    """
    log_prob = np.log(prob)
    log_complement = np.log(1 - prob)
    return log_prob - log_complement
238
+
239
+
240
def scale_preds(
    preds: pd.Series,
    original_mean: float,
    original_std: float,
    target_mean: float,
    target_std: float,
) -> pd.Series:
    """Re-scale predictions from one mean/std to another.

    The predictions are standardized with ``original_mean`` and
    ``original_std`` and then mapped onto ``target_mean`` and ``target_std``.
    """
    centered = preds - original_mean
    standardized = centered / original_std
    rescaled = standardized * target_std
    return rescaled + target_mean
249
+
250
+
251
def sigmoid(x: float) -> float:
    """Return the logistic function ``1 / (1 + exp(-x))``.

    Evaluated with NumPy, so array-like inputs are transformed element-wise.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator
253
+
254
+
255
def generic_logistic_predict(
    data: pd.DataFrame, coeffs: pd.Series, intercept: float
) -> pd.Series:
    """Score rows with a logistic model.

    Selects the columns named by ``coeffs.index``, takes the weighted sum per
    row, adds ``intercept`` and passes the result through ``sigmoid``.
    """
    weighted = data[coeffs.index] * coeffs
    linear_term = weighted.sum(axis=1) + intercept
    return sigmoid(linear_term)
259
+
260
+
261
+ def _validate_device_type(device_type: str):
262
+ if device_type not in ("TV", "MOBILE"):
263
+ raise ValueError(f"Invalid device type '{device_type}")
264
+
265
+
266
+ def _validate_pwatched_entry_context(entry_contexts: list[str]):
267
+ valid_contexts = [
268
+ "autoplay",
269
+ "choose next",
270
+ "ch swtch",
271
+ "sel thumb",
272
+ "launch first in session",
273
+ ]
274
+ invalid_contexts = [c for c in entry_contexts if c not in valid_contexts]
275
+ if invalid_contexts:
276
+ raise ValueError(f"Invalid entry contexts found: {invalid_contexts}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: haystack-ml-stack
3
- Version: 0.1.2
3
+ Version: 0.2.1
4
4
  Summary: Functions related to Haystack ML
5
5
  Author-email: Oscar Vega <oscar@haystack.tv>
6
6
  License: MIT
@@ -12,6 +12,7 @@ Requires-Dist: cloudpickle==2.2.1
12
12
  Requires-Dist: aioboto3==12.0.0
13
13
  Requires-Dist: fastapi==0.104.1
14
14
  Requires-Dist: pydantic-settings==2.2
15
+ Requires-Dist: newrelic==11.1.0
15
16
 
16
17
  # Haystack ML Stack
17
18
 
@@ -6,14 +6,9 @@ src/haystack_ml_stack/cache.py
6
6
  src/haystack_ml_stack/dynamo.py
7
7
  src/haystack_ml_stack/model_store.py
8
8
  src/haystack_ml_stack/settings.py
9
+ src/haystack_ml_stack/utils.py
9
10
  src/haystack_ml_stack.egg-info/PKG-INFO
10
11
  src/haystack_ml_stack.egg-info/SOURCES.txt
11
12
  src/haystack_ml_stack.egg-info/dependency_links.txt
12
13
  src/haystack_ml_stack.egg-info/requires.txt
13
- src/haystack_ml_stack.egg-info/top_level.txt
14
- src/haystack_test_package/__init__.py
15
- src/haystack_test_package/app.py
16
- src/haystack_test_package/cache.py
17
- src/haystack_test_package/dynamo.py
18
- src/haystack_test_package/model_store.py
19
- src/haystack_test_package/settings.py
14
+ src/haystack_ml_stack.egg-info/top_level.txt
@@ -4,3 +4,4 @@ cloudpickle==2.2.1
4
4
  aioboto3==12.0.0
5
5
  fastapi==0.104.1
6
6
  pydantic-settings==2.2
7
+ newrelic==11.1.0
@@ -0,0 +1 @@
1
+ haystack_ml_stack
@@ -1,158 +0,0 @@
1
- import logging
2
- import os
3
- import random
4
- import sys
5
- from http import HTTPStatus
6
- from typing import Any, Dict, List, Optional
7
-
8
- import aiobotocore.session
9
- from fastapi import FastAPI, HTTPException, Request, Response
10
- from fastapi.encoders import jsonable_encoder
11
-
12
- from .cache import make_features_cache
13
- from .dynamo import set_stream_features
14
- from .model_store import download_and_load_model
15
- from .settings import Settings
16
-
17
- logging.basicConfig(
18
- level=logging.INFO,
19
- format="[%(levelname)s] [%(process)d] %(name)s : %(message)s",
20
- handlers=[logging.StreamHandler(sys.stdout)],
21
- force=True,
22
- )
23
-
24
- logger = logging.getLogger(__name__)
25
-
26
-
27
- def create_app(
28
- settings: Optional[Settings] = None,
29
- *,
30
- preloaded_model: Optional[Dict[str, Any]] = None,
31
- ) -> FastAPI:
32
- """
33
- Build a FastAPI app with injectable settings and model.
34
- If `preloaded_model` is None, the app will load from S3 on startup.
35
- """
36
- cfg = settings or Settings()
37
-
38
- app = FastAPI(
39
- title="ML Stream Scorer",
40
- description="Scores video streams using a pre-trained ML model and DynamoDB features.",
41
- version="1.0.0",
42
- )
43
-
44
- # Mutable state: cache + model
45
- features_cache = make_features_cache(cfg.cache_maxsize)
46
- state: Dict[str, Any] = {
47
- "model": preloaded_model,
48
- "session": aiobotocore.session.get_session(),
49
- "model_name": (
50
- os.path.basename(cfg.s3_model_path) if cfg.s3_model_path else None
51
- ),
52
- }
53
-
54
- @app.on_event("startup")
55
- async def _startup() -> None:
56
- if state["model"] is not None:
57
- logger.info("Using preloaded model.")
58
- return
59
-
60
- if not cfg.s3_model_path:
61
- logger.critical("S3_MODEL_PATH not set; service will be unhealthy.")
62
- return
63
-
64
- try:
65
- state["model"] = await download_and_load_model(
66
- cfg.s3_model_path, aio_session=state["session"]
67
- )
68
- state["stream_features"] = state["model"].get("stream_features", [])
69
- logger.info("Model loaded on startup.")
70
- except Exception as e:
71
- logger.critical("Failed to load model: %s", e)
72
-
73
- @app.get("/health", status_code=HTTPStatus.OK)
74
- async def health():
75
- model_ok = state["model"] is not None
76
- if not model_ok:
77
- raise HTTPException(
78
- status_code=HTTPStatus.SERVICE_UNAVAILABLE,
79
- detail="ML Model not loaded",
80
- )
81
- return {
82
- "status": "ok",
83
- "model_loaded": True,
84
- "cache_size": len(features_cache),
85
- "model_name": state.get("model_name"),
86
- "stream_features": state.get("stream_features", []),
87
- }
88
-
89
- @app.post("/score", status_code=HTTPStatus.OK)
90
- async def score_stream(request: Request, response: Response):
91
- if state["model"] is None:
92
- raise HTTPException(
93
- status_code=HTTPStatus.SERVICE_UNAVAILABLE,
94
- detail="ML Model not loaded",
95
- )
96
-
97
- try:
98
- data = await request.json()
99
- except Exception:
100
- raise HTTPException(
101
- status_code=HTTPStatus.BAD_REQUEST, detail="Invalid JSON payload"
102
- )
103
-
104
- user = data.get("user", {})
105
- streams: List[Dict[str, Any]] = data.get("streams", [])
106
- playlist = data.get("playlist", {})
107
-
108
- if not streams:
109
- logger.warning("No streams provided for user %s", user.get("userid", ""))
110
- return {}
111
-
112
- # Feature fetch (optional based on model)
113
- model = state["model"]
114
- stream_features = model.get("stream_features", []) or []
115
- if stream_features:
116
- logger.info("Fetching stream features for user %s", user.get("userid", ""))
117
- await set_stream_features(
118
- aio_session=state["session"],
119
- streams=streams,
120
- stream_features=stream_features,
121
- features_cache=features_cache,
122
- features_table=cfg.features_table,
123
- stream_pk_prefix=cfg.stream_pk_prefix,
124
- cache_sep=cfg.cache_separator,
125
- )
126
-
127
- # Sampling logs
128
- if random.random() < cfg.logs_fraction:
129
- logger.info("User %s streams: %s", user.get("userid", ""), streams)
130
-
131
- # Synchronous model execution (user code)
132
- try:
133
- model_input = model["preprocess"](
134
- user, streams, playlist, model.get("params")
135
- )
136
- model_output = model["predict"](model_input, model.get("params"))
137
- except Exception as e:
138
- logger.error("Model prediction failed: %s", e)
139
- raise HTTPException(
140
- status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
141
- detail="Model prediction failed",
142
- )
143
-
144
- if model_output:
145
- return jsonable_encoder(model_output)
146
-
147
- raise HTTPException(
148
- status_code=HTTPStatus.NOT_FOUND, detail="No model output generated"
149
- )
150
-
151
- @app.get("/", status_code=HTTPStatus.OK)
152
- async def root():
153
- return {
154
- "message": "ML Scoring Service is running.",
155
- "model_name": state.get("model_name"),
156
- }
157
-
158
- return app
@@ -1,2 +0,0 @@
1
- haystack_ml_stack
2
- haystack_test_package
@@ -1,4 +0,0 @@
1
- from .app import create_app
2
-
3
- __all__ = ["create_app"]
4
- __version__ = "0.1.3"
@@ -1,19 +0,0 @@
1
- from typing import Any
2
-
3
- from cachetools import TLRUCache
4
-
5
-
6
- def _ttu(_, value: Any, now: float) -> float:
7
- """Time-To-Use policy: allow per-item TTL via 'cache_ttl_in_seconds' or fallback."""
8
- ONE_YEAR = 365 * 24 * 60 * 60
9
- try:
10
- ttl = int(value.get("cache_ttl_in_seconds", -1))
11
- if ttl > 0:
12
- return now + ttl
13
- except Exception:
14
- pass
15
- return now + ONE_YEAR
16
-
17
-
18
- def make_features_cache(maxsize: int) -> TLRUCache:
19
- return TLRUCache(maxsize=maxsize, ttu=_ttu)
@@ -1,137 +0,0 @@
1
- from typing import Any, Dict, List
2
- import logging
3
-
4
- import aiobotocore.session
5
-
6
- logger = logging.getLogger(__name__)
7
-
8
-
9
- async def async_batch_get(
10
- dynamo_client, table_name: str, keys: List[Dict[str, Any]]
11
- ) -> List[Dict[str, Any]]:
12
- """
13
- Asynchronous batch_get_item with chunking for requests > 100 keys
14
- and handling for unprocessed keys.
15
- """
16
- all_items: List[Dict[str, Any]] = []
17
- # DynamoDB's BatchGetItem has a 100-item limit per request.
18
- CHUNK_SIZE = 100
19
-
20
- # Split the keys into chunks of 100
21
- for i in range(0, len(keys), CHUNK_SIZE):
22
- chunk_keys = keys[i : i + CHUNK_SIZE]
23
- to_fetch = {table_name: {"Keys": chunk_keys}}
24
-
25
- # Inner loop to handle unprocessed keys for the current chunk
26
- # Max retries of 3
27
- retries = 3
28
- while to_fetch and retries > 0:
29
- retries -= 1
30
- try:
31
- resp = await dynamo_client.batch_get_item(RequestItems=to_fetch)
32
-
33
- if "Responses" in resp and table_name in resp["Responses"]:
34
- all_items.extend(resp["Responses"][table_name])
35
-
36
- unprocessed = resp.get("UnprocessedKeys", {})
37
- # If there are unprocessed keys, set them to be fetched in the next iteration
38
- if unprocessed and unprocessed.get(table_name):
39
- logger.warning(
40
- "Retrying %d unprocessed keys.",
41
- len(unprocessed[table_name]["Keys"]),
42
- )
43
- to_fetch = unprocessed
44
- else:
45
- # All keys in the chunk were processed, exit the inner loop
46
- to_fetch = {}
47
-
48
- except Exception as e:
49
- logger.error("Error during batch_get_item for a chunk: %s", e)
50
- # Stop trying to process this chunk on error and move to the next
51
- to_fetch = {}
52
-
53
- return all_items
54
-
55
-
56
- def parse_dynamo_item(item: Dict[str, Any]) -> Dict[str, Any]:
57
- """Parse a DynamoDB attribute map (low-level) to Python types."""
58
- out: Dict[str, Any] = {}
59
- for k, v in item.items():
60
- if "N" in v:
61
- out[k] = float(v["N"])
62
- elif "S" in v:
63
- out[k] = v["S"]
64
- elif "SS" in v:
65
- out[k] = v["SS"]
66
- elif "NS" in v:
67
- out[k] = [float(n) for n in v["NS"]]
68
- elif "BOOL" in v:
69
- out[k] = v["BOOL"]
70
- elif "NULL" in v:
71
- out[k] = None
72
- elif "L" in v:
73
- out[k] = [parse_dynamo_item({"value": i})["value"] for i in v["L"]]
74
- elif "M" in v:
75
- out[k] = parse_dynamo_item(v["M"])
76
- return out
77
-
78
-
79
- async def set_stream_features(
80
- *,
81
- streams: List[Dict[str, Any]],
82
- stream_features: List[str],
83
- features_cache,
84
- features_table: str,
85
- stream_pk_prefix: str,
86
- cache_sep: str,
87
- aio_session: aiobotocore.session.Session | None = None,
88
- ) -> None:
89
- """Fetch missing features for streams from DynamoDB and fill them into streams."""
90
- if not streams or not stream_features:
91
- return
92
-
93
- cache_miss: Dict[str, Dict[str, Any]] = {}
94
- for f in stream_features:
95
- for s in streams:
96
- key = f"{s['streamUrl']}{cache_sep}{f}"
97
- cached = features_cache.get(key)
98
- if cached is not None:
99
- s[f] = cached["value"]
100
- else:
101
- cache_miss[key] = s
102
-
103
- if not cache_miss:
104
- return
105
-
106
- logger.info("Cache miss for %d items", len(cache_miss))
107
-
108
- # Prepare keys
109
- keys = []
110
- for k in cache_miss.keys():
111
- stream_url, sk = k.split(cache_sep, 1)
112
- pk = f"{stream_pk_prefix}{stream_url}"
113
- keys.append({"pk": {"S": pk}, "sk": {"S": sk}})
114
- logger.info("Keys prepared for DynamoDB: %s", keys)
115
-
116
- session = aio_session or aiobotocore.session.get_session()
117
- async with session.create_client("dynamodb") as dynamodb:
118
- try:
119
- items = await async_batch_get(dynamodb, features_table, keys)
120
- except Exception as e:
121
- logger.error("DynamoDB batch_get failed: %s", e)
122
- return
123
- logger.info("DynamoDB returned %d items", len(items))
124
-
125
- for item in items:
126
- stream_url = item["pk"]["S"].removeprefix(stream_pk_prefix)
127
- feature_name = item["sk"]["S"]
128
- cache_key = f"{stream_url}{cache_sep}{feature_name}"
129
- parsed = parse_dynamo_item(item)
130
- logger.info("DynamoDB item parsed: %s for %s", parsed, cache_key)
131
-
132
- features_cache[cache_key] = {
133
- "value": parsed.get("value"),
134
- "cache_ttl_in_seconds": int(parsed.get("cache_ttl_in_seconds", -1)),
135
- }
136
- if cache_key in cache_miss:
137
- cache_miss[cache_key][feature_name] = parsed.get("value")
@@ -1,36 +0,0 @@
1
- import logging
2
- import os
3
- from typing import Any, Dict
4
-
5
- import aiobotocore.session
6
- import cloudpickle
7
-
8
- logger = logging.getLogger(__name__)
9
-
10
-
11
- async def download_and_load_model(
12
- s3_url: str, aio_session: aiobotocore.session.Session | None = None
13
- ) -> Dict[str, Any]:
14
- """
15
- Downloads cloudpickled model dict from S3 and loads it.
16
- Expected keys: 'preprocess', 'predict', 'params', optional 'stream_features'.
17
- """
18
- if not s3_url or not s3_url.startswith("s3://"):
19
- raise ValueError("S3_MODEL_PATH must be a valid s3:// URL")
20
-
21
- bucket, key = s3_url.replace("s3://", "").split("/", 1)
22
- pid = os.getpid()
23
- local_path = f"/tmp/model_{pid}.pkl"
24
-
25
- session = aio_session or aiobotocore.session.get_session()
26
- async with session.create_client("s3") as s3:
27
- logger.info("Downloading model from %s...", s3_url)
28
- resp = await s3.get_object(Bucket=bucket, Key=key)
29
- data = await resp["Body"].read()
30
- with open(local_path, "wb") as f:
31
- f.write(data)
32
- logger.info("Model downloaded to %s", local_path)
33
-
34
- with open(local_path, "rb") as f:
35
- model: Dict[str, Any] = cloudpickle.load(f)
36
- return model
@@ -1,22 +0,0 @@
1
- from pydantic_settings import BaseSettings
2
- from pydantic import Field
3
-
4
- class Settings(BaseSettings):
5
- # Logging
6
- logs_fraction: float = Field(0.01, alias="LOGS_FRACTION")
7
-
8
- # Model (S3)
9
- s3_model_path: str | None = Field(default=None, alias="S3_MODEL_PATH")
10
-
11
- # DynamoDB
12
- features_table: str = Field("features", alias="FEATURES_TABLE")
13
- stream_pk_prefix: str = "STREAM#"
14
-
15
- # Cache
16
- cache_maxsize: int = 50_000
17
- cache_separator: str = "--"
18
-
19
- class Config:
20
- env_file = ".env"
21
- env_file_encoding = "utf-8"
22
- extra = "ignore"