haystack-ml-stack 0.2.3__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.4}/PKG-INFO +1 -1
- {haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.4}/pyproject.toml +1 -1
- {haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.4}/src/haystack_ml_stack/__init__.py +1 -1
- {haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.4}/src/haystack_ml_stack/app.py +12 -4
- {haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.4}/src/haystack_ml_stack/dynamo.py +65 -61
- {haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.4}/src/haystack_ml_stack/utils.py +125 -85
- {haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.4}/src/haystack_ml_stack.egg-info/PKG-INFO +1 -1
- {haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.4}/src/haystack_ml_stack.egg-info/SOURCES.txt +2 -1
- haystack_ml_stack-0.2.4/tests/test_utils.py +76 -0
- {haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.4}/README.md +0 -0
- {haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.4}/setup.cfg +0 -0
- {haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.4}/src/haystack_ml_stack/cache.py +0 -0
- {haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.4}/src/haystack_ml_stack/model_store.py +0 -0
- {haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.4}/src/haystack_ml_stack/settings.py +0 -0
- {haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.4}/src/haystack_ml_stack.egg-info/dependency_links.txt +0 -0
- {haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.4}/src/haystack_ml_stack.egg-info/requires.txt +0 -0
- {haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.4}/src/haystack_ml_stack.egg-info/top_level.txt +0 -0
|
@@ -9,6 +9,7 @@ import time
|
|
|
9
9
|
import aiobotocore.session
|
|
10
10
|
from fastapi import FastAPI, HTTPException, Request, Response
|
|
11
11
|
from fastapi.encoders import jsonable_encoder
|
|
12
|
+
import newrelic.agent
|
|
12
13
|
|
|
13
14
|
|
|
14
15
|
from .cache import make_features_cache
|
|
@@ -24,8 +25,7 @@ logging.basicConfig(
|
|
|
24
25
|
)
|
|
25
26
|
|
|
26
27
|
logger = logging.getLogger(__name__)
|
|
27
|
-
|
|
28
|
-
import newrelic.agent
|
|
28
|
+
APP_NAME = os.environ.get("NEW_RELIC_APP_NAME", None)
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
def create_app(
|
|
@@ -121,7 +121,12 @@ def create_app(
|
|
|
121
121
|
model = state["model"]
|
|
122
122
|
stream_features = model.get("stream_features", []) or []
|
|
123
123
|
retrieval_meta = FeatureRetrievalMeta(
|
|
124
|
-
cache_misses=0,
|
|
124
|
+
cache_misses=0,
|
|
125
|
+
retrieval_ms=0,
|
|
126
|
+
success=True,
|
|
127
|
+
cache_delay_minutes=0,
|
|
128
|
+
dynamo_ms=0,
|
|
129
|
+
parsing_ms=0,
|
|
125
130
|
)
|
|
126
131
|
if stream_features:
|
|
127
132
|
retrieval_meta = await set_stream_features(
|
|
@@ -163,13 +168,16 @@ def create_app(
|
|
|
163
168
|
newrelic.agent.record_custom_event(
|
|
164
169
|
"Inference",
|
|
165
170
|
{
|
|
171
|
+
"app_name": APP_NAME,
|
|
166
172
|
"cache_misses": retrieval_meta.cache_misses,
|
|
167
173
|
"retrieval_success": int(retrieval_meta.success),
|
|
168
174
|
"cache_delay_minutes": retrieval_meta.cache_delay_minutes,
|
|
175
|
+
"dynamo_ms": retrieval_meta.dynamo_ms,
|
|
176
|
+
"dynamo_parse_ms": retrieval_meta.parsing_ms,
|
|
169
177
|
"retrieval_ms": retrieval_meta.retrieval_ms,
|
|
170
178
|
"preprocess_ms": (predict_start - preprocess_start) * 1e-6,
|
|
171
179
|
"predict_ms": (predict_end - predict_start) * 1e-6,
|
|
172
|
-
"
|
|
180
|
+
"total_streams": len(model_output),
|
|
173
181
|
},
|
|
174
182
|
)
|
|
175
183
|
if model_output:
|
|
@@ -2,19 +2,30 @@ from typing import Any, Dict, List, NamedTuple
|
|
|
2
2
|
import logging
|
|
3
3
|
import time
|
|
4
4
|
import datetime
|
|
5
|
-
|
|
6
5
|
import aiobotocore.session
|
|
6
|
+
from boto3.dynamodb.types import TypeDeserializer
|
|
7
7
|
import newrelic.agent
|
|
8
|
+
import asyncio
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
logger = logging.getLogger(__name__)
|
|
11
12
|
|
|
12
13
|
|
|
14
|
+
class FloatDeserializer(TypeDeserializer):
|
|
15
|
+
def _deserialize_n(self, value):
|
|
16
|
+
return float(value)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
_deser = FloatDeserializer()
|
|
20
|
+
|
|
21
|
+
|
|
13
22
|
class FeatureRetrievalMeta(NamedTuple):
|
|
14
23
|
cache_misses: int
|
|
15
24
|
retrieval_ms: float
|
|
16
25
|
success: bool
|
|
17
26
|
cache_delay_minutes: float
|
|
27
|
+
dynamo_ms: float
|
|
28
|
+
parsing_ms: float
|
|
18
29
|
|
|
19
30
|
|
|
20
31
|
@newrelic.agent.function_trace()
|
|
@@ -25,68 +36,51 @@ async def async_batch_get(
|
|
|
25
36
|
Asynchronous batch_get_item with chunking for requests > 100 keys
|
|
26
37
|
and handling for unprocessed keys.
|
|
27
38
|
"""
|
|
28
|
-
all_items: List[Dict[str, Any]] = []
|
|
29
39
|
# DynamoDB's BatchGetItem has a 100-item limit per request.
|
|
30
40
|
CHUNK_SIZE = 100
|
|
41
|
+
chunks = [keys[i : i + CHUNK_SIZE] for i in range(0, len(keys), CHUNK_SIZE)]
|
|
42
|
+
tasks = [_fetch_chunk(dynamo_client, table_name, chunk) for chunk in chunks]
|
|
43
|
+
results = await asyncio.gather(*tasks)
|
|
44
|
+
all_items = [item for batch in results for item in batch]
|
|
45
|
+
return all_items
|
|
31
46
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
to_fetch = unprocessed
|
|
56
|
-
else:
|
|
57
|
-
# All keys in the chunk were processed, exit the inner loop
|
|
58
|
-
to_fetch = {}
|
|
59
|
-
|
|
60
|
-
except Exception as e:
|
|
61
|
-
logger.error("Error during batch_get_item for a chunk: %s", e)
|
|
62
|
-
# Stop trying to process this chunk on error and move to the next
|
|
47
|
+
|
|
48
|
+
async def _fetch_chunk(dynamo_client, table_name: str, chunk_keys):
|
|
49
|
+
"""Fetch a single chunk of up to 100 keys with retry handling."""
|
|
50
|
+
to_fetch = {table_name: {"Keys": chunk_keys}}
|
|
51
|
+
retries = 3
|
|
52
|
+
items = []
|
|
53
|
+
|
|
54
|
+
while to_fetch and retries > 0:
|
|
55
|
+
retries -= 1
|
|
56
|
+
try:
|
|
57
|
+
resp = await dynamo_client.batch_get_item(RequestItems=to_fetch)
|
|
58
|
+
|
|
59
|
+
# Collect retrieved items
|
|
60
|
+
if "Responses" in resp and table_name in resp["Responses"]:
|
|
61
|
+
items.extend(resp["Responses"][table_name])
|
|
62
|
+
|
|
63
|
+
# Check for unprocessed keys
|
|
64
|
+
unprocessed = resp.get("UnprocessedKeys", {})
|
|
65
|
+
if unprocessed and unprocessed.get(table_name):
|
|
66
|
+
unp = unprocessed[table_name]["Keys"]
|
|
67
|
+
logger.warning("Retrying %d unprocessed keys.", len(unp))
|
|
68
|
+
to_fetch = {table_name: {"Keys": unp}}
|
|
69
|
+
else:
|
|
63
70
|
to_fetch = {}
|
|
64
71
|
|
|
65
|
-
|
|
72
|
+
except Exception as e:
|
|
73
|
+
logger.error("Error in batch_get_item chunk: %s", e)
|
|
74
|
+
break
|
|
75
|
+
|
|
76
|
+
return items
|
|
66
77
|
|
|
67
78
|
|
|
68
79
|
@newrelic.agent.function_trace()
|
|
69
80
|
def parse_dynamo_item(item: Dict[str, Any]) -> Dict[str, Any]:
|
|
70
81
|
"""Parse a DynamoDB attribute map (low-level) to Python types."""
|
|
71
|
-
out: Dict[str, Any] = {}
|
|
72
|
-
for k, v in item.items()
|
|
73
|
-
if "N" in v:
|
|
74
|
-
out[k] = float(v["N"])
|
|
75
|
-
elif "S" in v:
|
|
76
|
-
out[k] = v["S"]
|
|
77
|
-
elif "SS" in v:
|
|
78
|
-
out[k] = v["SS"]
|
|
79
|
-
elif "NS" in v:
|
|
80
|
-
out[k] = [float(n) for n in v["NS"]]
|
|
81
|
-
elif "BOOL" in v:
|
|
82
|
-
out[k] = v["BOOL"]
|
|
83
|
-
elif "NULL" in v:
|
|
84
|
-
out[k] = None
|
|
85
|
-
elif "L" in v:
|
|
86
|
-
out[k] = [parse_dynamo_item({"value": i})["value"] for i in v["L"]]
|
|
87
|
-
elif "M" in v:
|
|
88
|
-
out[k] = parse_dynamo_item(v["M"])
|
|
89
|
-
return out
|
|
82
|
+
# out: Dict[str, Any] = {}
|
|
83
|
+
return {k: _deser.deserialize(v) for k, v in item.items()}
|
|
90
84
|
|
|
91
85
|
|
|
92
86
|
@newrelic.agent.function_trace()
|
|
@@ -108,6 +102,8 @@ async def set_stream_features(
|
|
|
108
102
|
retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
|
|
109
103
|
success=True,
|
|
110
104
|
cache_delay_minutes=0,
|
|
105
|
+
dynamo_ms=0,
|
|
106
|
+
parsing_ms=0,
|
|
111
107
|
)
|
|
112
108
|
|
|
113
109
|
cache_miss: Dict[str, Dict[str, Any]] = {}
|
|
@@ -122,7 +118,8 @@ async def set_stream_features(
|
|
|
122
118
|
if cached["value"] is not None:
|
|
123
119
|
s[f] = cached["value"]
|
|
124
120
|
cache_delay_obj[f] = max(
|
|
125
|
-
cache_delay_obj[f],
|
|
121
|
+
cache_delay_obj[f],
|
|
122
|
+
(now - cached["inserted_at"]).total_seconds(),
|
|
126
123
|
)
|
|
127
124
|
else:
|
|
128
125
|
cache_miss[key] = s
|
|
@@ -135,6 +132,8 @@ async def set_stream_features(
|
|
|
135
132
|
retrieval_ms=(time.perf_counter_ns() - time_start) * 1e-6,
|
|
136
133
|
success=True,
|
|
137
134
|
cache_delay_minutes=cache_delay / 60,
|
|
135
|
+
dynamo_ms=0,
|
|
136
|
+
parsing_ms=0,
|
|
138
137
|
)
|
|
139
138
|
cache_misses = len(cache_miss)
|
|
140
139
|
logger.info("Cache miss for %d items", cache_misses)
|
|
@@ -147,18 +146,22 @@ async def set_stream_features(
|
|
|
147
146
|
keys.append({"pk": {"S": pk}, "sk": {"S": sk}})
|
|
148
147
|
|
|
149
148
|
session = aio_session or aiobotocore.session.get_session()
|
|
149
|
+
dynamo_start = time.perf_counter_ns()
|
|
150
150
|
async with session.create_client("dynamodb") as dynamodb:
|
|
151
151
|
try:
|
|
152
152
|
items = await async_batch_get(dynamodb, features_table, keys)
|
|
153
153
|
except Exception as e:
|
|
154
154
|
logger.error("DynamoDB batch_get failed: %s", e)
|
|
155
|
+
end_time = time.perf_counter_ns()
|
|
155
156
|
return FeatureRetrievalMeta(
|
|
156
157
|
cache_misses=cache_misses,
|
|
157
|
-
retrieval_ms=(
|
|
158
|
+
retrieval_ms=(end_time - time_start) * 1e-6,
|
|
158
159
|
success=False,
|
|
159
160
|
cache_delay_minutes=cache_delay / 60,
|
|
161
|
+
dynamo_ms=(end_time - dynamo_start) * 1e-6,
|
|
162
|
+
parsing_ms=0,
|
|
160
163
|
)
|
|
161
|
-
|
|
164
|
+
dynamo_end = time.perf_counter_ns()
|
|
162
165
|
updated_keys = set()
|
|
163
166
|
for item in items:
|
|
164
167
|
stream_url = item["pk"]["S"].removeprefix(stream_pk_prefix)
|
|
@@ -169,22 +172,23 @@ async def set_stream_features(
|
|
|
169
172
|
features_cache[cache_key] = {
|
|
170
173
|
"value": parsed.get("value"),
|
|
171
174
|
"cache_ttl_in_seconds": int(parsed.get("cache_ttl_in_seconds", -1)),
|
|
172
|
-
"
|
|
173
|
-
parsed.get("updated_at")
|
|
174
|
-
).replace(tzinfo=None),
|
|
175
|
+
"inserted_at": datetime.datetime.utcnow(),
|
|
175
176
|
}
|
|
176
177
|
if cache_key in cache_miss:
|
|
177
178
|
cache_miss[cache_key][feature_name] = parsed.get("value")
|
|
178
179
|
updated_keys.add(cache_key)
|
|
179
|
-
|
|
180
|
+
parsing_end = time.perf_counter_ns()
|
|
180
181
|
# Save keys that were not found in DynamoDB with None value
|
|
181
182
|
if len(updated_keys) < len(cache_miss):
|
|
182
183
|
missing_keys = set(cache_miss.keys()) - updated_keys
|
|
183
184
|
for k in missing_keys:
|
|
184
185
|
features_cache[k] = {"value": None, "cache_ttl_in_seconds": 300}
|
|
186
|
+
end_time = time.perf_counter_ns()
|
|
185
187
|
return FeatureRetrievalMeta(
|
|
186
188
|
cache_misses=cache_misses,
|
|
187
|
-
retrieval_ms=(
|
|
189
|
+
retrieval_ms=(end_time - time_start) * 1e-6,
|
|
188
190
|
success=True,
|
|
189
191
|
cache_delay_minutes=cache_delay / 60,
|
|
192
|
+
dynamo_ms=(dynamo_end - dynamo_start) * 1e-6,
|
|
193
|
+
parsing_ms=(parsing_end - dynamo_end) * 1e-6,
|
|
190
194
|
)
|
|
@@ -4,8 +4,13 @@ import typing as _t
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def stream_favorites_cleanup(
|
|
7
|
-
stream,
|
|
7
|
+
stream,
|
|
8
|
+
user_favorite_tags: list[str],
|
|
9
|
+
user_favorite_authors: list[str],
|
|
10
|
+
out: dict = None,
|
|
8
11
|
) -> dict:
|
|
12
|
+
if out is None:
|
|
13
|
+
out = {}
|
|
9
14
|
stream_tags = stream.get("haystackTags", [])
|
|
10
15
|
is_favorite_tag = (
|
|
11
16
|
any(stream_tag in user_favorite_tags for stream_tag in stream_tags)
|
|
@@ -17,15 +22,15 @@ def stream_favorites_cleanup(
|
|
|
17
22
|
if user_favorite_authors is not None
|
|
18
23
|
else False
|
|
19
24
|
)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
}
|
|
25
|
+
out["IS_FAVORITE_TAG"] = is_favorite_tag
|
|
26
|
+
out["IS_FAVORITE_AUTHOR"] = is_favorite_author
|
|
27
|
+
return out
|
|
24
28
|
|
|
25
29
|
|
|
26
30
|
def browsed_count_cleanups(
|
|
27
31
|
stream,
|
|
28
32
|
position_debiasing: _t.Literal["4_browsed", "all_browsed"] = "4_browsed",
|
|
33
|
+
out: dict = None,
|
|
29
34
|
) -> dict:
|
|
30
35
|
position_alias_mapping = {
|
|
31
36
|
"0": "1ST_POS",
|
|
@@ -43,7 +48,8 @@ def browsed_count_cleanups(
|
|
|
43
48
|
total_selects = 0
|
|
44
49
|
total_browsed = 0
|
|
45
50
|
total_selects_and_watched = 0
|
|
46
|
-
|
|
51
|
+
if out is None:
|
|
52
|
+
out = {}
|
|
47
53
|
for position in position_alias_mapping.keys():
|
|
48
54
|
pos_counts = browsed_count_obj.get(position, {})
|
|
49
55
|
total_browsed += pos_counts.get("total_browsed", 0)
|
|
@@ -55,16 +61,17 @@ def browsed_count_cleanups(
|
|
|
55
61
|
suffix = ""
|
|
56
62
|
else:
|
|
57
63
|
raise ValueError("Should not be here.")
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
return
|
|
64
|
+
out[f"STREAM_24H_TOTAL_BROWSED{suffix}"] = total_browsed
|
|
65
|
+
out[f"STREAM_24H_TOTAL_SELECTS{suffix}"] = total_selects
|
|
66
|
+
out[f"STREAM_24H_TOTAL_SELECTS_AND_WATCHED{suffix}"] = total_selects_and_watched
|
|
67
|
+
return out
|
|
62
68
|
|
|
63
69
|
|
|
64
70
|
def device_split_browsed_count_cleanups(
|
|
65
71
|
stream,
|
|
66
72
|
device_type: _t.Literal["TV", "MOBILE"],
|
|
67
73
|
position_debiasing: _t.Literal["4_browsed", "all_browsed"] = "4_browsed",
|
|
74
|
+
out: dict = None,
|
|
68
75
|
) -> dict:
|
|
69
76
|
position_alias_mapping = {
|
|
70
77
|
"0": "1ST_POS",
|
|
@@ -87,21 +94,24 @@ def device_split_browsed_count_cleanups(
|
|
|
87
94
|
total_selects = 0
|
|
88
95
|
total_browsed = 0
|
|
89
96
|
total_selects_and_watched = 0
|
|
90
|
-
|
|
97
|
+
if out is None:
|
|
98
|
+
out = {}
|
|
91
99
|
for position, alias in position_alias_mapping.items():
|
|
92
100
|
pos_counts = browsed_count_obj.get(position, {})
|
|
93
101
|
total_browsed = pos_counts.get("total_browsed", 0)
|
|
94
102
|
total_selects = pos_counts.get("total_selects", 0)
|
|
95
103
|
total_selects_and_watched = pos_counts.get("total_selects_and_watched", 0)
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
104
|
+
out[f"STREAM_{alias}_{device_type}_24H_TOTAL_BROWSED{suffix}"] = total_browsed
|
|
105
|
+
out[f"STREAM_{alias}_{device_type}_24H_TOTAL_SELECTS{suffix}"] = total_selects
|
|
106
|
+
out[f"STREAM_{alias}_{device_type}_24H_TOTAL_SELECTS_AND_WATCHED{suffix}"] = (
|
|
99
107
|
total_selects_and_watched
|
|
100
108
|
)
|
|
101
|
-
return
|
|
109
|
+
return out
|
|
102
110
|
|
|
103
111
|
|
|
104
|
-
def watched_count_cleanups(
|
|
112
|
+
def watched_count_cleanups(
|
|
113
|
+
stream, entry_contexts: list[str] = None, out: dict = None
|
|
114
|
+
) -> dict:
|
|
105
115
|
if entry_contexts is None:
|
|
106
116
|
entry_contexts = [
|
|
107
117
|
"autoplay",
|
|
@@ -113,19 +123,20 @@ def watched_count_cleanups(stream, entry_contexts: list[str] = None) -> dict:
|
|
|
113
123
|
_validate_pwatched_entry_context(entry_contexts)
|
|
114
124
|
|
|
115
125
|
counts_obj = stream.get(f"PWATCHED#24H", {})
|
|
116
|
-
|
|
126
|
+
if out is None:
|
|
127
|
+
out = {}
|
|
117
128
|
for entry_context in entry_contexts:
|
|
118
129
|
attempts = counts_obj.get(entry_context, {}).get("attempts", 0)
|
|
119
130
|
watched = counts_obj.get(entry_context, {}).get("watched", 0)
|
|
120
131
|
context_key = entry_context if "launch" not in entry_context else "launch"
|
|
121
132
|
context_key = context_key.upper().replace(" ", "_")
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
return
|
|
133
|
+
out[f"STREAM_{context_key}_24H_TOTAL_WATCHED"] = watched
|
|
134
|
+
out[f"STREAM_{context_key}_24H_TOTAL_ATTEMPTS"] = attempts
|
|
135
|
+
return out
|
|
125
136
|
|
|
126
137
|
|
|
127
138
|
def device_watched_count_cleanups(
|
|
128
|
-
stream, device_type: str, entry_contexts: list[str] = None
|
|
139
|
+
stream, device_type: str, entry_contexts: list[str] = None, out: dict = None
|
|
129
140
|
) -> dict:
|
|
130
141
|
if entry_contexts is None:
|
|
131
142
|
entry_contexts = [
|
|
@@ -140,23 +151,24 @@ def device_watched_count_cleanups(
|
|
|
140
151
|
_validate_device_type(device_type)
|
|
141
152
|
|
|
142
153
|
counts_obj = stream.get(f"PWATCHED#24H#{device_type}", {})
|
|
143
|
-
|
|
154
|
+
if out is None:
|
|
155
|
+
out = {}
|
|
144
156
|
for entry_context in entry_contexts:
|
|
145
157
|
attempts = counts_obj.get(entry_context, {}).get("attempts", 0)
|
|
146
158
|
watched = counts_obj.get(entry_context, {}).get("watched", 0)
|
|
147
159
|
context_key = entry_context if "launch" not in entry_context else "launch"
|
|
148
160
|
context_key = context_key.upper().replace(" ", "_")
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
return
|
|
161
|
+
out[f"STREAM_{context_key}_{device_type}_24H_TOTAL_WATCHED"] = watched
|
|
162
|
+
out[f"STREAM_{context_key}_{device_type}_24H_TOTAL_ATTEMPTS"] = attempts
|
|
163
|
+
return out
|
|
152
164
|
|
|
153
165
|
|
|
154
166
|
def generic_beta_adjust_features(
|
|
155
167
|
data: pd.DataFrame,
|
|
156
168
|
prefix: str,
|
|
157
|
-
pwatched_beta_params: dict,
|
|
158
|
-
pselect_beta_params: dict,
|
|
159
|
-
pslw_beta_params: dict,
|
|
169
|
+
pwatched_beta_params: dict = None,
|
|
170
|
+
pselect_beta_params: dict = None,
|
|
171
|
+
pslw_beta_params: dict = None,
|
|
160
172
|
use_low_sample_flags: bool = False,
|
|
161
173
|
low_sample_threshold: int = 3,
|
|
162
174
|
use_attempt_features: bool = False,
|
|
@@ -164,67 +176,92 @@ def generic_beta_adjust_features(
|
|
|
164
176
|
debiased_pselect: bool = True,
|
|
165
177
|
use_logodds: bool = False,
|
|
166
178
|
) -> pd.DataFrame:
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
179
|
+
features = {}
|
|
180
|
+
counting_feature_cols = [
|
|
181
|
+
c
|
|
182
|
+
for c in data.columns
|
|
183
|
+
if "TOTAL_WATCHED" in c
|
|
184
|
+
or "TOTAL_ATTEMPTS" in c
|
|
185
|
+
or "SELECT" in c
|
|
186
|
+
or "BROWSED" in c
|
|
187
|
+
]
|
|
188
|
+
data_arr = data[counting_feature_cols].to_numpy(dtype=float)
|
|
189
|
+
col_to_idx = {col: i for i, col in enumerate(counting_feature_cols)}
|
|
190
|
+
if pwatched_beta_params is not None:
|
|
191
|
+
for context, (alpha, beta) in pwatched_beta_params.items():
|
|
192
|
+
total_watched = np.nan_to_num(
|
|
193
|
+
data_arr[:, col_to_idx[f"{prefix}_{context}_TOTAL_WATCHED"]]
|
|
194
|
+
)
|
|
195
|
+
total_attempts = np.nan_to_num(
|
|
196
|
+
data_arr[:, col_to_idx[f"{prefix}_{context}_TOTAL_ATTEMPTS"]]
|
|
181
197
|
)
|
|
198
|
+
features[f"{prefix}_{context}_ADJ_PWATCHED"] = (total_watched + alpha) / (
|
|
199
|
+
total_attempts + alpha + beta
|
|
200
|
+
)
|
|
201
|
+
low_sample_arr = np.empty_like(total_attempts, dtype=float)
|
|
202
|
+
if use_low_sample_flags:
|
|
203
|
+
features[f"{prefix}_{context}_LOW_SAMPLE"] = np.less_equal(
|
|
204
|
+
total_attempts, low_sample_threshold, out=low_sample_arr
|
|
205
|
+
)
|
|
206
|
+
if use_attempt_features:
|
|
207
|
+
features[f"{prefix}_{context}_ATTEMPTS"] = np.clip(
|
|
208
|
+
total_attempts, a_min=None, a_max=max_attempt_cap
|
|
209
|
+
)
|
|
182
210
|
|
|
183
|
-
pselect_features = {}
|
|
184
211
|
debias_suffix = "_UP_TO_4_BROWSED" if debiased_pselect else ""
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
212
|
+
if pselect_beta_params is not None or pslw_beta_params is not None:
|
|
213
|
+
for key, (alpha, beta) in pselect_beta_params.items():
|
|
214
|
+
total_selects_idx = col_to_idx[
|
|
215
|
+
f"{prefix}_{key}_TOTAL_SELECTS{debias_suffix}"
|
|
216
|
+
]
|
|
217
|
+
total_browsed_idx = col_to_idx[
|
|
218
|
+
f"{prefix}_{key}_TOTAL_BROWSED{debias_suffix}"
|
|
219
|
+
]
|
|
220
|
+
total_slw_idx = col_to_idx[
|
|
221
|
+
f"{prefix}_{key}_TOTAL_SELECTS_AND_WATCHED{debias_suffix}"
|
|
222
|
+
]
|
|
223
|
+
total_selects = np.nan_to_num(data_arr[:, total_selects_idx])
|
|
224
|
+
total_browsed = np.nan_to_num(data_arr[:, total_browsed_idx])
|
|
225
|
+
total_slw = np.nan_to_num(data_arr[:, total_slw_idx])
|
|
226
|
+
if pselect_beta_params is not None:
|
|
227
|
+
features[f"{prefix}_{key}_ADJ_PSELECT{debias_suffix}"] = (
|
|
228
|
+
total_selects + alpha
|
|
229
|
+
) / (total_selects + total_browsed + alpha + beta)
|
|
230
|
+
if use_low_sample_flags:
|
|
231
|
+
low_sample_arr = np.empty_like(total_selects, dtype=float)
|
|
232
|
+
features[f"{prefix}_{key}_PSELECT_LOW_SAMPLE{debias_suffix}"] = (
|
|
233
|
+
np.less_equal(
|
|
234
|
+
total_selects + total_browsed,
|
|
235
|
+
low_sample_threshold,
|
|
236
|
+
out=low_sample_arr,
|
|
237
|
+
)
|
|
238
|
+
)
|
|
239
|
+
if use_attempt_features:
|
|
240
|
+
features[f"{prefix}_{key}_PSELECT_ATTEMPTS{debias_suffix}"] = np.clip(
|
|
241
|
+
total_selects + total_browsed, a_min=0, a_max=max_attempt_cap
|
|
242
|
+
)
|
|
243
|
+
if pslw_beta_params is not None:
|
|
244
|
+
pslw_alpha, pslw_beta = pslw_beta_params[key]
|
|
245
|
+
features[f"{prefix}_{key}_ADJ_PSLW{debias_suffix}"] = (
|
|
246
|
+
total_slw + pslw_alpha
|
|
247
|
+
) / (total_selects + total_browsed + pslw_alpha + pslw_beta)
|
|
248
|
+
if pslw_beta_params is not None and pselect_beta_params is not None:
|
|
249
|
+
features[f"{prefix}_{key}_PSelNotW{debias_suffix}"] = (
|
|
250
|
+
features[f"{prefix}_{key}_ADJ_PSELECT{debias_suffix}"]
|
|
251
|
+
- features[f"{prefix}_{key}_ADJ_PSLW{debias_suffix}"]
|
|
252
|
+
)
|
|
210
253
|
|
|
211
|
-
adjusted_feats = pd.DataFrame(
|
|
254
|
+
adjusted_feats = pd.DataFrame(features, index=data.index)
|
|
212
255
|
if use_logodds:
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
or "PSelNotW" in c
|
|
223
|
-
]
|
|
224
|
-
]
|
|
225
|
-
.clip(lower=0.001)
|
|
226
|
-
.pipe(prob_to_logodds)
|
|
227
|
-
)
|
|
256
|
+
arr = adjusted_feats.to_numpy()
|
|
257
|
+
col_idxs = [
|
|
258
|
+
i
|
|
259
|
+
for i, c in enumerate(adjusted_feats.columns)
|
|
260
|
+
if ("PSELECT" in c or "PSLW" in c or "PWATCHED" in c or "PSelNotW" in c)
|
|
261
|
+
and ("LOW_SAMPLE" not in c and "ATTEMPTS" not in c)
|
|
262
|
+
]
|
|
263
|
+
arr[:, col_idxs] = prob_to_logodds(
|
|
264
|
+
np.clip(arr[:, col_idxs], a_min=0.001, a_max=None)
|
|
228
265
|
)
|
|
229
266
|
return adjusted_feats
|
|
230
267
|
|
|
@@ -251,7 +288,10 @@ def sigmoid(x: float) -> float:
|
|
|
251
288
|
def generic_logistic_predict(
|
|
252
289
|
data: pd.DataFrame, coeffs: pd.Series, intercept: float
|
|
253
290
|
) -> pd.Series:
|
|
254
|
-
|
|
291
|
+
scores = (data[coeffs.index] * coeffs).sum(axis=1) + intercept
|
|
292
|
+
raw_arr = scores.to_numpy()
|
|
293
|
+
raw_arr[:] = sigmoid(raw_arr)
|
|
294
|
+
return scores
|
|
255
295
|
|
|
256
296
|
|
|
257
297
|
def _validate_device_type(device_type: str):
|
{haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.4}/src/haystack_ml_stack.egg-info/SOURCES.txt
RENAMED
|
@@ -11,4 +11,5 @@ src/haystack_ml_stack.egg-info/PKG-INFO
|
|
|
11
11
|
src/haystack_ml_stack.egg-info/SOURCES.txt
|
|
12
12
|
src/haystack_ml_stack.egg-info/dependency_links.txt
|
|
13
13
|
src/haystack_ml_stack.egg-info/requires.txt
|
|
14
|
-
src/haystack_ml_stack.egg-info/top_level.txt
|
|
14
|
+
src/haystack_ml_stack.egg-info/top_level.txt
|
|
15
|
+
tests/test_utils.py
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from haystack_ml_stack import utils
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def test_sigmoid():
|
|
8
|
+
values_to_test = np.array([-1, 0, 1])
|
|
9
|
+
expected = np.array([0.26894142136992605, 0.5, 0.731058578630074])
|
|
10
|
+
actual = utils.sigmoid(values_to_test)
|
|
11
|
+
assert np.isclose(actual, expected).all()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_prob_to_logodds():
|
|
15
|
+
values_to_test = np.array([0.25, 0.5, 0.75])
|
|
16
|
+
expected = np.array([-1.0986122886681096, 0, 1.0986122886681096])
|
|
17
|
+
actual = utils.prob_to_logodds(values_to_test)
|
|
18
|
+
assert np.isclose(actual, expected).all(), print(actual - expected)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_generic_beta_adjust_features():
|
|
22
|
+
data_to_test = pd.DataFrame(
|
|
23
|
+
{
|
|
24
|
+
"STREAM_AUTOPLAY_24H_TOTAL_ATTEMPTS": [1, 2],
|
|
25
|
+
"STREAM_AUTOPLAY_24H_TOTAL_WATCHED": [0, 1],
|
|
26
|
+
"STREAM_24H_TOTAL_SELECTS_UP_TO_4_BROWSED": [1, 1],
|
|
27
|
+
"STREAM_24H_TOTAL_SELECTS_AND_WATCHED_UP_TO_4_BROWSED": [0, 1],
|
|
28
|
+
"STREAM_24H_TOTAL_BROWSED_UP_TO_4_BROWSED": [2, 0],
|
|
29
|
+
},
|
|
30
|
+
dtype=float,
|
|
31
|
+
)
|
|
32
|
+
actual = utils.generic_beta_adjust_features(
|
|
33
|
+
data=data_to_test,
|
|
34
|
+
prefix="STREAM",
|
|
35
|
+
pwatched_beta_params={"AUTOPLAY_24H": (2, 1)},
|
|
36
|
+
pselect_beta_params={"24H": (1, 1)},
|
|
37
|
+
pslw_beta_params={"24H": (0.5, 1)},
|
|
38
|
+
use_low_sample_flags=True,
|
|
39
|
+
)
|
|
40
|
+
# print(actual)
|
|
41
|
+
expected = pd.DataFrame(
|
|
42
|
+
{
|
|
43
|
+
"STREAM_AUTOPLAY_24H_ADJ_PWATCHED": [
|
|
44
|
+
(0 + 2) / (1 + 2 + 1),
|
|
45
|
+
(1 + 2) / (2 + 2 + 1),
|
|
46
|
+
],
|
|
47
|
+
"STREAM_24H_ADJ_PSELECT_UP_TO_4_BROWSED": [
|
|
48
|
+
(1 + 1) / (1 + 2 + 1 + 1),
|
|
49
|
+
(1 + 1) / (1 + 0 + 1 + 1),
|
|
50
|
+
],
|
|
51
|
+
"STREAM_24H_ADJ_PSLW_UP_TO_4_BROWSED": [
|
|
52
|
+
(0 + 0.5) / (1 + 2 + 0.5 + 1),
|
|
53
|
+
(1 + 0.5) / (1 + 0 + 0.5 + 1),
|
|
54
|
+
],
|
|
55
|
+
"STREAM_24H_PSelNotW_UP_TO_4_BROWSED": [
|
|
56
|
+
(1 + 1) / (1 + 2 + 1 + 1) - (0 + 0.5) / (1 + 2 + 0.5 + 1),
|
|
57
|
+
(1 + 1) / (1 + 0 + 1 + 1) - (1 + 0.5) / (1 + 0 + 0.5 + 1),
|
|
58
|
+
],
|
|
59
|
+
"STREAM_AUTOPLAY_24H_LOW_SAMPLE": [1, 1],
|
|
60
|
+
"STREAM_24H_PSELECT_LOW_SAMPLE_UP_TO_4_BROWSED": [1, 1],
|
|
61
|
+
}
|
|
62
|
+
)
|
|
63
|
+
assert (actual[expected.columns] == expected).all(axis=None), actual - expected
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_generic_logistic_predict():
|
|
67
|
+
features = pd.DataFrame({"feat1": [0, 1, 2], "feat2": [3, 3, 5]}, dtype=float)
|
|
68
|
+
coeffs = pd.Series({"feat1": 1, "feat2": 2})
|
|
69
|
+
intercept = 1
|
|
70
|
+
expected = utils.sigmoid(
|
|
71
|
+
pd.Series([0 * 1 + 2 * 3, 1 * 1 + 2 * 3, 2 * 1 + 5 * 2]) + 1
|
|
72
|
+
)
|
|
73
|
+
actual = utils.generic_logistic_predict(
|
|
74
|
+
data=features, coeffs=coeffs, intercept=intercept
|
|
75
|
+
)
|
|
76
|
+
assert (expected == actual).all(), actual - expected
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.4}/src/haystack_ml_stack.egg-info/requires.txt
RENAMED
|
File without changes
|
{haystack_ml_stack-0.2.3 → haystack_ml_stack-0.2.4}/src/haystack_ml_stack.egg-info/top_level.txt
RENAMED
|
File without changes
|