haystack-ml-stack 0.1.1__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {haystack_ml_stack-0.1.1 → haystack_ml_stack-0.2.0}/PKG-INFO +2 -1
- {haystack_ml_stack-0.1.1 → haystack_ml_stack-0.2.0}/pyproject.toml +3 -2
- {haystack_ml_stack-0.1.1 → haystack_ml_stack-0.2.0}/src/haystack_ml_stack/__init__.py +1 -1
- {haystack_ml_stack-0.1.1 → haystack_ml_stack-0.2.0}/src/haystack_ml_stack/app.py +49 -11
- {haystack_ml_stack-0.1.1 → haystack_ml_stack-0.2.0}/src/haystack_ml_stack/dynamo.py +65 -11
- {haystack_ml_stack-0.1.1 → haystack_ml_stack-0.2.0}/src/haystack_ml_stack/model_store.py +3 -1
- haystack_ml_stack-0.2.0/src/haystack_ml_stack/utils.py +276 -0
- {haystack_ml_stack-0.1.1 → haystack_ml_stack-0.2.0}/src/haystack_ml_stack.egg-info/PKG-INFO +2 -1
- {haystack_ml_stack-0.1.1 → haystack_ml_stack-0.2.0}/src/haystack_ml_stack.egg-info/SOURCES.txt +1 -0
- {haystack_ml_stack-0.1.1 → haystack_ml_stack-0.2.0}/src/haystack_ml_stack.egg-info/requires.txt +1 -0
- {haystack_ml_stack-0.1.1 → haystack_ml_stack-0.2.0}/README.md +0 -0
- {haystack_ml_stack-0.1.1 → haystack_ml_stack-0.2.0}/setup.cfg +0 -0
- {haystack_ml_stack-0.1.1 → haystack_ml_stack-0.2.0}/src/haystack_ml_stack/cache.py +0 -0
- {haystack_ml_stack-0.1.1 → haystack_ml_stack-0.2.0}/src/haystack_ml_stack/settings.py +0 -0
- {haystack_ml_stack-0.1.1 → haystack_ml_stack-0.2.0}/src/haystack_ml_stack.egg-info/dependency_links.txt +0 -0
- {haystack_ml_stack-0.1.1 → haystack_ml_stack-0.2.0}/src/haystack_ml_stack.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: haystack-ml-stack
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Functions related to Haystack ML
|
|
5
5
|
Author-email: Oscar Vega <oscar@haystack.tv>
|
|
6
6
|
License: MIT
|
|
@@ -12,6 +12,7 @@ Requires-Dist: cloudpickle==2.2.1
|
|
|
12
12
|
Requires-Dist: aioboto3==12.0.0
|
|
13
13
|
Requires-Dist: fastapi==0.104.1
|
|
14
14
|
Requires-Dist: pydantic-settings==2.2
|
|
15
|
+
Requires-Dist: newrelic==11.1.0
|
|
15
16
|
|
|
16
17
|
# Haystack ML Stack
|
|
17
18
|
|
|
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
|
|
|
5
5
|
|
|
6
6
|
[project]
|
|
7
7
|
name = "haystack-ml-stack"
|
|
8
|
-
version = "0.1.1"
|
|
8
|
+
version = "0.2.0"
|
|
9
9
|
description = "Functions related to Haystack ML"
|
|
10
10
|
readme = "README.md"
|
|
11
11
|
authors = [{ name = "Oscar Vega", email = "oscar@haystack.tv" }]
|
|
@@ -16,6 +16,7 @@ dependencies = [
|
|
|
16
16
|
"cloudpickle==2.2.1",
|
|
17
17
|
"aioboto3==12.0.0",
|
|
18
18
|
"fastapi==0.104.1",
|
|
19
|
-
"pydantic-settings==2.2"
|
|
19
|
+
"pydantic-settings==2.2",
|
|
20
|
+
"newrelic==11.1.0"
|
|
20
21
|
]
|
|
21
22
|
license = { text = "MIT" }
|
|
@@ -4,13 +4,15 @@ import random
|
|
|
4
4
|
import sys
|
|
5
5
|
from http import HTTPStatus
|
|
6
6
|
from typing import Any, Dict, List, Optional
|
|
7
|
+
import time
|
|
7
8
|
|
|
8
9
|
import aiobotocore.session
|
|
9
10
|
from fastapi import FastAPI, HTTPException, Request, Response
|
|
10
11
|
from fastapi.encoders import jsonable_encoder
|
|
11
12
|
|
|
13
|
+
|
|
12
14
|
from .cache import make_features_cache
|
|
13
|
-
from .dynamo import set_stream_features
|
|
15
|
+
from .dynamo import set_stream_features, FeatureRetrievalMeta
|
|
14
16
|
from .model_store import download_and_load_model
|
|
15
17
|
from .settings import Settings
|
|
16
18
|
|
|
@@ -23,6 +25,8 @@ logging.basicConfig(
|
|
|
23
25
|
|
|
24
26
|
logger = logging.getLogger(__name__)
|
|
25
27
|
|
|
28
|
+
import newrelic.agent
|
|
29
|
+
|
|
26
30
|
|
|
27
31
|
def create_app(
|
|
28
32
|
settings: Optional[Settings] = None,
|
|
@@ -96,11 +100,15 @@ def create_app(
|
|
|
96
100
|
|
|
97
101
|
try:
|
|
98
102
|
data = await request.json()
|
|
99
|
-
except Exception:
|
|
103
|
+
except Exception as e:
|
|
100
104
|
raise HTTPException(
|
|
101
105
|
status_code=HTTPStatus.BAD_REQUEST, detail="Invalid JSON payload"
|
|
102
|
-
)
|
|
103
|
-
|
|
106
|
+
) from e
|
|
107
|
+
query_params = {}
|
|
108
|
+
for k in request.query_params.keys():
|
|
109
|
+
values = request.query_params.getlist(k)
|
|
110
|
+
# flatten single-element lists
|
|
111
|
+
query_params[k] = values[0] if len(values) == 1 else values
|
|
104
112
|
user = data.get("user", {})
|
|
105
113
|
streams: List[Dict[str, Any]] = data.get("streams", [])
|
|
106
114
|
playlist = data.get("playlist", {})
|
|
@@ -112,9 +120,11 @@ def create_app(
|
|
|
112
120
|
# Feature fetch (optional based on model)
|
|
113
121
|
model = state["model"]
|
|
114
122
|
stream_features = model.get("stream_features", []) or []
|
|
123
|
+
retrieval_meta = FeatureRetrievalMeta(
|
|
124
|
+
cache_misses=0, retrieval_ms=0, success=True, cache_delay_minutes=0
|
|
125
|
+
)
|
|
115
126
|
if stream_features:
|
|
116
|
-
|
|
117
|
-
await set_stream_features(
|
|
127
|
+
retrieval_meta = await set_stream_features(
|
|
118
128
|
aio_session=state["session"],
|
|
119
129
|
streams=streams,
|
|
120
130
|
stream_features=stream_features,
|
|
@@ -124,24 +134,52 @@ def create_app(
|
|
|
124
134
|
cache_sep=cfg.cache_separator,
|
|
125
135
|
)
|
|
126
136
|
|
|
137
|
+
random_number = random.random()
|
|
138
|
+
userid = user.get("userid", "")
|
|
127
139
|
# Sampling logs
|
|
128
|
-
if
|
|
140
|
+
if random_number < cfg.logs_fraction:
|
|
129
141
|
logger.info("User %s streams: %s", user.get("userid", ""), streams)
|
|
130
142
|
|
|
131
143
|
# Synchronous model execution (user code)
|
|
132
144
|
try:
|
|
145
|
+
preprocess_start = time.perf_counter_ns()
|
|
133
146
|
model_input = model["preprocess"](
|
|
134
|
-
user,
|
|
147
|
+
user,
|
|
148
|
+
streams,
|
|
149
|
+
playlist,
|
|
150
|
+
{**model.get("params"), "query_params": query_params},
|
|
151
|
+
)
|
|
152
|
+
predict_start = time.perf_counter_ns()
|
|
153
|
+
model_output = model["predict"](
|
|
154
|
+
model_input, {**model.get("params"), "query_params": query_params}
|
|
135
155
|
)
|
|
136
|
-
|
|
156
|
+
predict_end = time.perf_counter_ns()
|
|
137
157
|
except Exception as e:
|
|
138
158
|
logger.error("Model prediction failed: %s", e)
|
|
139
159
|
raise HTTPException(
|
|
140
160
|
status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
|
|
141
161
|
detail="Model prediction failed",
|
|
142
|
-
)
|
|
143
|
-
|
|
162
|
+
) from e
|
|
163
|
+
|
|
164
|
+
newrelic.agent.record_custom_event(
|
|
165
|
+
"Inference",
|
|
166
|
+
{
|
|
167
|
+
"cache_misses": retrieval_meta.cache_misses,
|
|
168
|
+
"retrieval_success": int(retrieval_meta.success),
|
|
169
|
+
"cache_delay_minutes": retrieval_meta.cache_delay_minutes,
|
|
170
|
+
"retrieval_ms": retrieval_meta.retrieval_ms,
|
|
171
|
+
"preprocess_ms": (predict_start - preprocess_start) * 1e-6,
|
|
172
|
+
"predict_ms": (predict_end - predict_start) * 1e-6,
|
|
173
|
+
"total_scores": len(model_output),
|
|
174
|
+
},
|
|
175
|
+
)
|
|
144
176
|
if model_output:
|
|
177
|
+
if random_number < cfg.logs_fraction:
|
|
178
|
+
logger.info(
|
|
179
|
+
"User %s - model output %s",
|
|
180
|
+
userid,
|
|
181
|
+
model_output,
|
|
182
|
+
)
|
|
145
183
|
return jsonable_encoder(model_output)
|
|
146
184
|
|
|
147
185
|
raise HTTPException(
|
|
@@ -1,11 +1,23 @@
|
|
|
1
|
-
from typing import Any, Dict, List
|
|
1
|
+
from typing import Any, Dict, List, NamedTuple
|
|
2
2
|
import logging
|
|
3
|
+
import time
|
|
4
|
+
import datetime
|
|
3
5
|
|
|
4
6
|
import aiobotocore.session
|
|
7
|
+
import newrelic.agent
|
|
8
|
+
|
|
5
9
|
|
|
6
10
|
logger = logging.getLogger(__name__)
|
|
7
11
|
|
|
8
12
|
|
|
13
|
+
class FeatureRetrievalMeta(NamedTuple):
    """Metadata describing one stream-feature retrieval pass.

    Returned by ``set_stream_features`` so callers can report how the
    retrieval went.
    """

    # Number of stream/feature cache keys that were not found in the cache.
    cache_misses: int
    # Elapsed time of the retrieval call, in milliseconds.
    retrieval_ms: float
    # False when the DynamoDB batch read raised; True otherwise.
    success: bool
    # Delay derived from the age of the cached entries that were used, in minutes.
    cache_delay_minutes: float
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@newrelic.agent.function_trace()
|
|
9
21
|
async def async_batch_get(
|
|
10
22
|
dynamo_client, table_name: str, keys: List[Dict[str, Any]]
|
|
11
23
|
) -> List[Dict[str, Any]]:
|
|
@@ -53,6 +65,7 @@ async def async_batch_get(
|
|
|
53
65
|
return all_items
|
|
54
66
|
|
|
55
67
|
|
|
68
|
+
@newrelic.agent.function_trace()
|
|
56
69
|
def parse_dynamo_item(item: Dict[str, Any]) -> Dict[str, Any]:
|
|
57
70
|
"""Parse a DynamoDB attribute map (low-level) to Python types."""
|
|
58
71
|
out: Dict[str, Any] = {}
|
|
@@ -76,6 +89,7 @@ def parse_dynamo_item(item: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
76
89
|
return out
|
|
77
90
|
|
|
78
91
|
|
|
92
|
+
@newrelic.agent.function_trace()
|
|
79
93
|
async def set_stream_features(
|
|
80
94
|
*,
|
|
81
95
|
streams: List[Dict[str, Any]],
|
|
@@ -85,25 +99,45 @@ async def set_stream_features(
|
|
|
85
99
|
stream_pk_prefix: str,
|
|
86
100
|
cache_sep: str,
|
|
87
101
|
aio_session: aiobotocore.session.Session | None = None,
|
|
88
|
-
) ->
|
|
102
|
+
) -> FeatureRetrievalMeta:
|
|
103
|
+
time_start = time.perf_counter_ns()
|
|
89
104
|
"""Fetch missing features for streams from DynamoDB and fill them into streams."""
|
|
90
105
|
if not streams or not stream_features:
|
|
91
|
-
return
|
|
106
|
+
return FeatureRetrievalMeta(
|
|
107
|
+
cache_misses=0,
|
|
108
|
+
retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
|
|
109
|
+
success=True,
|
|
110
|
+
cache_delay_minutes=0,
|
|
111
|
+
)
|
|
92
112
|
|
|
93
113
|
cache_miss: Dict[str, Dict[str, Any]] = {}
|
|
114
|
+
cache_delay_obj: dict[str, float] = {f: 0 for f in stream_features}
|
|
115
|
+
now = datetime.datetime.utcnow()
|
|
94
116
|
for f in stream_features:
|
|
95
117
|
for s in streams:
|
|
96
118
|
key = f"{s['streamUrl']}{cache_sep}{f}"
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
119
|
+
if key in features_cache:
|
|
120
|
+
# Only set if value is not None
|
|
121
|
+
cached = features_cache.get(key)
|
|
122
|
+
if cached["value"] is not None:
|
|
123
|
+
s[f] = cached["value"]
|
|
124
|
+
cache_delay_obj[f] = max(
|
|
125
|
+
cache_delay_obj[f], (now - cached["updated_at"]).total_seconds()
|
|
126
|
+
)
|
|
100
127
|
else:
|
|
101
128
|
cache_miss[key] = s
|
|
129
|
+
valid_cache_delays = list(v for v in cache_delay_obj.values() if v > 0)
|
|
130
|
+
cache_delay = min(valid_cache_delays) if valid_cache_delays else 0
|
|
102
131
|
|
|
103
132
|
if not cache_miss:
|
|
104
|
-
return
|
|
105
|
-
|
|
106
|
-
|
|
133
|
+
return FeatureRetrievalMeta(
|
|
134
|
+
cache_misses=0,
|
|
135
|
+
retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
|
|
136
|
+
success=True,
|
|
137
|
+
cache_delay_minutes=cache_delay / 60,
|
|
138
|
+
)
|
|
139
|
+
cache_misses = len(cache_miss)
|
|
140
|
+
logger.info("Cache miss for %d items", cache_misses)
|
|
107
141
|
|
|
108
142
|
# Prepare keys
|
|
109
143
|
keys = []
|
|
@@ -118,8 +152,14 @@ async def set_stream_features(
|
|
|
118
152
|
items = await async_batch_get(dynamodb, features_table, keys)
|
|
119
153
|
except Exception as e:
|
|
120
154
|
logger.error("DynamoDB batch_get failed: %s", e)
|
|
121
|
-
return
|
|
122
|
-
|
|
155
|
+
return FeatureRetrievalMeta(
|
|
156
|
+
cache_misses=cache_misses,
|
|
157
|
+
retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
|
|
158
|
+
success=False,
|
|
159
|
+
cache_delay_minutes=cache_delay / 60,
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
updated_keys = set()
|
|
123
163
|
for item in items:
|
|
124
164
|
stream_url = item["pk"]["S"].removeprefix(stream_pk_prefix)
|
|
125
165
|
feature_name = item["sk"]["S"]
|
|
@@ -129,6 +169,20 @@ async def set_stream_features(
|
|
|
129
169
|
features_cache[cache_key] = {
|
|
130
170
|
"value": parsed.get("value"),
|
|
131
171
|
"cache_ttl_in_seconds": int(parsed.get("cache_ttl_in_seconds", -1)),
|
|
172
|
+
"updated_at": datetime.datetime.fromisoformat(parsed.get("updated_at")),
|
|
132
173
|
}
|
|
133
174
|
if cache_key in cache_miss:
|
|
134
175
|
cache_miss[cache_key][feature_name] = parsed.get("value")
|
|
176
|
+
updated_keys.add(cache_key)
|
|
177
|
+
|
|
178
|
+
# Save keys that were not found in DynamoDB with None value
|
|
179
|
+
if len(updated_keys) < len(cache_miss):
|
|
180
|
+
missing_keys = set(cache_miss.keys()) - updated_keys
|
|
181
|
+
for k in missing_keys:
|
|
182
|
+
features_cache[k] = {"value": None, "cache_ttl_in_seconds": 300}
|
|
183
|
+
return FeatureRetrievalMeta(
|
|
184
|
+
cache_misses=cache_misses,
|
|
185
|
+
retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
|
|
186
|
+
success=True,
|
|
187
|
+
cache_delay_minutes=cache_delay / 60,
|
|
188
|
+
)
|
|
@@ -4,10 +4,12 @@ from typing import Any, Dict
|
|
|
4
4
|
|
|
5
5
|
import aiobotocore.session
|
|
6
6
|
import cloudpickle
|
|
7
|
+
import newrelic.agent
|
|
7
8
|
|
|
8
9
|
logger = logging.getLogger(__name__)
|
|
9
10
|
|
|
10
11
|
|
|
12
|
+
@newrelic.agent.function_trace()
|
|
11
13
|
async def download_and_load_model(
|
|
12
14
|
s3_url: str, aio_session: aiobotocore.session.Session | None = None
|
|
13
15
|
) -> Dict[str, Any]:
|
|
@@ -33,4 +35,4 @@ async def download_and_load_model(
|
|
|
33
35
|
|
|
34
36
|
with open(local_path, "rb") as f:
|
|
35
37
|
model: Dict[str, Any] = cloudpickle.load(f)
|
|
36
|
-
return model
|
|
38
|
+
return model
|
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
import typing as _t
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def stream_tags_cleanup(
    stream, user_favorite_tags: list[str], user_favorite_authors: list[str]
) -> dict:
    """Flag whether a stream matches the user's favorite tags or authors.

    Args:
        stream: Mapping describing a stream. The optional "haystackTags" and
            "author" keys are read.
        user_favorite_tags: Tags favored by the user, or None when unknown.
        user_favorite_authors: Authors favored by the user, or None when
            unknown.

    Returns:
        A dict with two booleans: "IS_FAVORITE_TAG" (any stream tag is among
        the favorite tags) and "IS_FAVORITE_AUTHOR" (the stream author is
        among the favorite authors). A None favorites list yields False.
    """
    # `or []` also covers a stream that carries the key with a None value,
    # which would otherwise make the iteration below raise TypeError.
    stream_tags = stream.get("haystackTags") or []
    is_favorite_tag = user_favorite_tags is not None and any(
        stream_tag in user_favorite_tags for stream_tag in stream_tags
    )
    is_favorite_author = (
        user_favorite_authors is not None
        and stream.get("author") in user_favorite_authors
    )
    return {
        "IS_FAVORITE_TAG": is_favorite_tag,
        "IS_FAVORITE_AUTHOR": is_favorite_author,
    }
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def browsed_count_cleanups(
    stream,
    position_debiasing: _t.Literal["4_browsed", "all_browsed"] = "4_browsed",
) -> dict:
    """Sum a stream's 24h browse/select counters over all list positions.

    Reads ``stream["PSELECT#24H"][position_debiasing]``, which is expected to
    map the position buckets "0", "1", "2" and "3+" to dicts holding
    "total_browsed", "total_selects" and "total_selects_and_watched".
    Missing buckets or counters count as 0.

    Args:
        stream: Mapping describing a stream.
        position_debiasing: Which counter variant to read. "4_browsed" adds
            the "_UP_TO_4_BROWSED" suffix to the output keys; "all_browsed"
            adds no suffix.

    Returns:
        A dict with the three summed totals, keyed
        "STREAM_24H_TOTAL_BROWSED", "STREAM_24H_TOTAL_SELECTS" and
        "STREAM_24H_TOTAL_SELECTS_AND_WATCHED" (plus the suffix).

    Raises:
        ValueError: If ``position_debiasing`` is not one of the two
            supported values.
    """
    # Validate once, up front; the suffix does not change afterwards.
    if position_debiasing == "4_browsed":
        suffix = "_UP_TO_4_BROWSED"
    elif position_debiasing == "all_browsed":
        suffix = ""
    else:
        raise ValueError(f"Unexpected position debiasing '{position_debiasing}'.")

    browsed_count_obj = stream.get("PSELECT#24H", {}).get(position_debiasing, {})
    per_position = [
        browsed_count_obj.get(position, {}) for position in ("0", "1", "2", "3+")
    ]
    return {
        f"STREAM_24H_TOTAL_BROWSED{suffix}": sum(
            counts.get("total_browsed", 0) for counts in per_position
        ),
        f"STREAM_24H_TOTAL_SELECTS{suffix}": sum(
            counts.get("total_selects", 0) for counts in per_position
        ),
        f"STREAM_24H_TOTAL_SELECTS_AND_WATCHED{suffix}": sum(
            counts.get("total_selects_and_watched", 0) for counts in per_position
        ),
    }
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def device_split_browsed_count_cleanups(
    stream,
    device_type: _t.Literal["TV", "MOBILE"],
    position_debiasing: _t.Literal["4_browsed", "all_browsed"] = "4_browsed",
) -> dict:
    """Extract per-position 24h browse/select counters for one device type.

    Reads ``stream[f"PSELECT#24H#{device_type}"][position_debiasing]`` and
    emits three counters for each of the position buckets "0", "1", "2" and
    "3+" (aliased 1ST_POS, 2ND_POS, 3RD_POS, REST_POS). Missing buckets or
    counters are reported as 0.

    Args:
        stream: Mapping describing a stream.
        device_type: Device split to read; checked by
            ``_validate_device_type``.
        position_debiasing: Counter variant to read. "4_browsed" adds the
            "_UP_TO_4_BROWSED" suffix to the output keys; "all_browsed" adds
            none.

    Returns:
        A flat dict of counters keyed
        ``STREAM_{alias}_{device_type}_24H_TOTAL_*``.

    Raises:
        ValueError: If ``position_debiasing`` is not a supported value.
    """
    if position_debiasing not in ("4_browsed", "all_browsed"):
        raise ValueError(f"Unexpected position debiasing '{position_debiasing}'.")
    suffix = "_UP_TO_4_BROWSED" if position_debiasing == "4_browsed" else ""

    _validate_device_type(device_type)

    device_counts = stream.get(f"PSELECT#24H#{device_type}", {})
    counts_by_position = device_counts.get(position_debiasing, {})

    bucket_aliases = (
        ("0", "1ST_POS"),
        ("1", "2ND_POS"),
        ("2", "3RD_POS"),
        ("3+", "REST_POS"),
    )
    feats = {}
    for position, alias in bucket_aliases:
        counts = counts_by_position.get(position, {})
        browsed = counts.get("total_browsed", 0)
        selects = counts.get("total_selects", 0)
        selects_and_watched = counts.get("total_selects_and_watched", 0)
        feats.update(
            {
                f"STREAM_{alias}_{device_type}_24H_TOTAL_BROWSED{suffix}": browsed,
                f"STREAM_{alias}_{device_type}_24H_TOTAL_SELECTS{suffix}": selects,
                f"STREAM_{alias}_{device_type}_24H_TOTAL_SELECTS_AND_WATCHED{suffix}": (
                    selects_and_watched
                ),
            }
        )
    return feats
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def watched_count_cleanups(stream, entry_contexts: list[str] = None) -> dict:
    """Extract 24h watched/attempt counters per entry context.

    Reads ``stream["PWATCHED#24H"]``, a mapping from entry context to a dict
    with "attempts" and "watched". Missing contexts or counters are reported
    as 0.

    Args:
        stream: Mapping describing a stream.
        entry_contexts: Entry contexts to extract; defaults to the five
            contexts listed below. Checked by
            ``_validate_pwatched_entry_context``.

    Returns:
        A flat dict with ``STREAM_{CONTEXT}_24H_TOTAL_WATCHED`` and
        ``STREAM_{CONTEXT}_24H_TOTAL_ATTEMPTS`` for each context.
    """
    if entry_contexts is None:
        entry_contexts = [
            "autoplay",
            "choose next",
            "ch swtch",
            "sel thumb",
            "launch first in session",
        ]
    _validate_pwatched_entry_context(entry_contexts)

    counts_by_context = stream.get("PWATCHED#24H", {})
    feats = {}
    for entry_context in entry_contexts:
        context_counts = counts_by_context.get(entry_context, {})
        # Any context mentioning "launch" is collapsed to the single label
        # "launch"; labels are upper-cased with spaces turned into underscores.
        label = "launch" if "launch" in entry_context else entry_context
        label = label.upper().replace(" ", "_")
        feats[f"STREAM_{label}_24H_TOTAL_WATCHED"] = context_counts.get("watched", 0)
        feats[f"STREAM_{label}_24H_TOTAL_ATTEMPTS"] = context_counts.get(
            "attempts", 0
        )
    return feats
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def device_watched_count_cleanups(
    stream, device_type: str, entry_contexts: list[str] = None
) -> dict:
    """Extract 24h watched/attempt counters per entry context for one device.

    Reads ``stream[f"PWATCHED#24H#{device_type}"]``, a mapping from entry
    context to a dict with "attempts" and "watched". Missing contexts or
    counters are reported as 0.

    Args:
        stream: Mapping describing a stream.
        device_type: Device split to read; checked by
            ``_validate_device_type``.
        entry_contexts: Entry contexts to extract; defaults to the five
            contexts listed below. Checked by
            ``_validate_pwatched_entry_context``.

    Returns:
        A flat dict with ``STREAM_{CONTEXT}_{device_type}_24H_TOTAL_WATCHED``
        and ``STREAM_{CONTEXT}_{device_type}_24H_TOTAL_ATTEMPTS`` for each
        context.
    """
    if entry_contexts is None:
        entry_contexts = [
            "autoplay",
            "choose next",
            "ch swtch",
            "sel thumb",
            "launch first in session",
        ]

    _validate_pwatched_entry_context(entry_contexts)
    _validate_device_type(device_type)

    counts_obj = stream.get(f"PWATCHED#24H#{device_type}", {})
    feats = {}
    for entry_context in entry_contexts:
        context_counts = counts_obj.get(entry_context, {})
        attempts = context_counts.get("attempts", 0)
        watched = context_counts.get("watched", 0)
        # Any context mentioning "launch" is collapsed to the single label
        # "launch"; labels are upper-cased with spaces turned into underscores.
        context_key = entry_context if "launch" not in entry_context else "launch"
        context_key = context_key.upper().replace(" ", "_")
        # Write straight into the flat result dict: indexing a nested
        # "features" entry of an empty dict raises KeyError.
        feats[f"STREAM_{context_key}_{device_type}_24H_TOTAL_WATCHED"] = watched
        feats[f"STREAM_{context_key}_{device_type}_24H_TOTAL_ATTEMPTS"] = attempts
    return feats
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def generic_beta_adjust_features(
    data: pd.DataFrame,
    prefix: str,
    pwatched_beta_params: dict,
    pselect_beta_params: dict,
    pslw_beta_params: dict,
    use_low_sample_flags: bool = False,
    low_sample_threshold: int = 3,
    use_attempt_features: bool = False,
    max_attempt_cap: int = 100,
    debiased_pselect: bool = True,
    use_logodds: bool = False,
) -> pd.DataFrame:
    """Turn raw count columns into (alpha, beta)-adjusted rate features.

    For every context in ``pwatched_beta_params`` the adjusted watch rate is
    ``(watched + alpha) / (attempts + alpha + beta)``. For every key in
    ``pselect_beta_params`` the adjusted select rate is
    ``(selects + alpha) / (selects + browsed + alpha + beta)``; the
    select-and-watched rate uses the same denominator form with the
    parameters from ``pslw_beta_params``, and ``PSelNotW`` is the difference
    of the two. NaN counts are treated as 0.

    Args:
        data: Frame holding the ``{prefix}_{context}_TOTAL_WATCHED`` /
            ``_TOTAL_ATTEMPTS`` columns and the ``{prefix}_{key}_TOTAL_SELECTS``
            / ``_TOTAL_BROWSED`` / ``_TOTAL_SELECTS_AND_WATCHED`` columns
            (the latter three with the debias suffix when enabled).
        prefix: Leading part of every input and output column name.
        pwatched_beta_params: Maps context -> ``(alpha, beta)``.
        pselect_beta_params: Maps key -> ``(alpha, beta)``.
        pslw_beta_params: Maps key -> ``(alpha, beta)``; must contain every
            key of ``pselect_beta_params``.
        use_low_sample_flags: Also emit 0/1 flags marking rows whose sample
            size is at most ``low_sample_threshold``.
        low_sample_threshold: Inclusive upper bound for the low-sample flag.
        use_attempt_features: Also emit the sample size, capped at
            ``max_attempt_cap``.
        max_attempt_cap: Upper clip applied to the sample-size features.
        debiased_pselect: When True the select columns are read and written
            with the "_UP_TO_4_BROWSED" suffix.
        use_logodds: Replace the rate columns by their log-odds, after
            clipping them to at least 0.001.

    Returns:
        A new DataFrame holding only the derived feature columns.
    """
    # Names of the rate columns. Only these hold probabilities, so only these
    # go through the log-odds transform; low-sample flags and attempt counts
    # must stay untouched (log-odds of a 0/1 flag or a count is inf/NaN).
    rate_columns = []

    pwatched_features = {}
    for context, (alpha, beta) in pwatched_beta_params.items():
        total_watched = data[f"{prefix}_{context}_TOTAL_WATCHED"].fillna(0)
        total_attempts = data[f"{prefix}_{context}_TOTAL_ATTEMPTS"].fillna(0)
        adj_pwatched_col = f"{prefix}_{context}_ADJ_PWATCHED"
        pwatched_features[adj_pwatched_col] = (total_watched + alpha) / (
            total_attempts + alpha + beta
        )
        rate_columns.append(adj_pwatched_col)
        if use_low_sample_flags:
            pwatched_features[f"{prefix}_{context}_LOW_SAMPLE"] = total_attempts.le(
                low_sample_threshold
            ).astype(int)
        if use_attempt_features:
            pwatched_features[f"{prefix}_{context}_ATTEMPTS"] = total_attempts.clip(
                upper=max_attempt_cap
            )

    pselect_features = {}
    debias_suffix = "_UP_TO_4_BROWSED" if debiased_pselect else ""
    for key, (alpha, beta) in pselect_beta_params.items():
        total_selects = data[f"{prefix}_{key}_TOTAL_SELECTS{debias_suffix}"].fillna(0)
        total_browsed = data[f"{prefix}_{key}_TOTAL_BROWSED{debias_suffix}"].fillna(0)
        # Sample size shared by the select and select-and-watched rates.
        total_impressions = total_selects + total_browsed

        adj_pselect_col = f"{prefix}_{key}_ADJ_PSELECT{debias_suffix}"
        pselect_features[adj_pselect_col] = (total_selects + alpha) / (
            total_impressions + alpha + beta
        )
        rate_columns.append(adj_pselect_col)
        if use_low_sample_flags:
            pselect_features[f"{prefix}_{key}_PSELECT_LOW_SAMPLE{debias_suffix}"] = (
                total_impressions.le(low_sample_threshold).astype(int)
            )
        if use_attempt_features:
            pselect_features[f"{prefix}_{key}_PSELECT_ATTEMPTS{debias_suffix}"] = (
                total_impressions.clip(upper=max_attempt_cap)
            )

        total_slw = data[
            f"{prefix}_{key}_TOTAL_SELECTS_AND_WATCHED{debias_suffix}"
        ].fillna(0)
        pslw_alpha, pslw_beta = pslw_beta_params[key]
        adj_pslw_col = f"{prefix}_{key}_ADJ_PSLW{debias_suffix}"
        pselect_features[adj_pslw_col] = (total_slw + pslw_alpha) / (
            total_impressions + pslw_alpha + pslw_beta
        )
        rate_columns.append(adj_pslw_col)

        sel_not_watched_col = f"{prefix}_{key}_PSelNotW{debias_suffix}"
        pselect_features[sel_not_watched_col] = (
            pselect_features[adj_pselect_col] - pselect_features[adj_pslw_col]
        )
        rate_columns.append(sel_not_watched_col)

    adjusted_feats = pd.DataFrame({**pwatched_features, **pselect_features})
    if use_logodds:
        # The lower clip keeps log(prob) finite, including for a PSelNotW
        # difference that came out at or below zero.
        logodds = (
            adjusted_feats[rate_columns].clip(lower=0.001).pipe(prob_to_logodds)
        )
        adjusted_feats = adjusted_feats.assign(**logodds)
    return adjusted_feats
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def prob_to_logodds(prob: float) -> float:
    """Return the log-odds of ``prob``, computed as ``log(prob) - log(1 - prob)``."""
    log_prob = np.log(prob)
    log_complement = np.log(1 - prob)
    return log_prob - log_complement
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def scale_preds(
    preds: pd.Series,
    original_mean: float,
    original_std: float,
    target_mean: float,
    target_std: float,
) -> pd.Series:
    """Re-scale predictions from one mean/std to another.

    The predictions are standardized with ``original_mean`` and
    ``original_std`` and then mapped onto ``target_mean`` / ``target_std``.
    """
    centered = preds - original_mean
    standardized = centered / original_std
    return standardized * target_std + target_mean
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def sigmoid(x: float) -> float:
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def generic_logistic_predict(
    data: pd.DataFrame, coeffs: pd.Series, intercept: float
) -> pd.Series:
    """Score each row of ``data`` with a logistic model.

    Args:
        data: Feature frame; must contain every column named in
            ``coeffs.index``.
        coeffs: Coefficient per feature, indexed by column name.
        intercept: Constant added to the weighted row sum.

    Returns:
        ``sigmoid`` of the weighted row sums plus the intercept, one value
        per row.
    """
    weighted = data[coeffs.index] * coeffs
    linear_score = weighted.sum(axis=1) + intercept
    return sigmoid(linear_score)
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _validate_device_type(device_type: str):
    """Raise ValueError unless ``device_type`` is "TV" or "MOBILE"."""
    if device_type not in ("TV", "MOBILE"):
        # Message now closes the quote around the offending value.
        raise ValueError(f"Invalid device type '{device_type}'")
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _validate_pwatched_entry_context(entry_contexts: list[str]):
    """Raise ValueError if any entry context is not a recognized one.

    The error message lists every rejected context, in input order.
    """
    allowed = (
        "autoplay",
        "choose next",
        "ch swtch",
        "sel thumb",
        "launch first in session",
    )
    rejected = [context for context in entry_contexts if context not in allowed]
    if not rejected:
        return
    raise ValueError(f"Invalid entry contexts found: {rejected}")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: haystack-ml-stack
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Functions related to Haystack ML
|
|
5
5
|
Author-email: Oscar Vega <oscar@haystack.tv>
|
|
6
6
|
License: MIT
|
|
@@ -12,6 +12,7 @@ Requires-Dist: cloudpickle==2.2.1
|
|
|
12
12
|
Requires-Dist: aioboto3==12.0.0
|
|
13
13
|
Requires-Dist: fastapi==0.104.1
|
|
14
14
|
Requires-Dist: pydantic-settings==2.2
|
|
15
|
+
Requires-Dist: newrelic==11.1.0
|
|
15
16
|
|
|
16
17
|
# Haystack ML Stack
|
|
17
18
|
|
{haystack_ml_stack-0.1.1 → haystack_ml_stack-0.2.0}/src/haystack_ml_stack.egg-info/SOURCES.txt
RENAMED
|
@@ -6,6 +6,7 @@ src/haystack_ml_stack/cache.py
|
|
|
6
6
|
src/haystack_ml_stack/dynamo.py
|
|
7
7
|
src/haystack_ml_stack/model_store.py
|
|
8
8
|
src/haystack_ml_stack/settings.py
|
|
9
|
+
src/haystack_ml_stack/utils.py
|
|
9
10
|
src/haystack_ml_stack.egg-info/PKG-INFO
|
|
10
11
|
src/haystack_ml_stack.egg-info/SOURCES.txt
|
|
11
12
|
src/haystack_ml_stack.egg-info/dependency_links.txt
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{haystack_ml_stack-0.1.1 → haystack_ml_stack-0.2.0}/src/haystack_ml_stack.egg-info/top_level.txt
RENAMED
|
File without changes
|