inference-logging-client 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inference_logging_client-0.1.0/MANIFEST.in +3 -0
- inference_logging_client-0.1.0/PKG-INFO +222 -0
- inference_logging_client-0.1.0/inference_logging_client/__init__.py +443 -0
- inference_logging_client-0.1.0/inference_logging_client/__main__.py +6 -0
- inference_logging_client-0.1.0/inference_logging_client/cli.py +134 -0
- inference_logging_client-0.1.0/inference_logging_client/decoder.py +395 -0
- inference_logging_client-0.1.0/inference_logging_client/exceptions.py +31 -0
- inference_logging_client-0.1.0/inference_logging_client/formats.py +328 -0
- inference_logging_client-0.1.0/inference_logging_client/io.py +241 -0
- inference_logging_client-0.1.0/inference_logging_client/types.py +46 -0
- inference_logging_client-0.1.0/inference_logging_client/utils.py +133 -0
- inference_logging_client-0.1.0/inference_logging_client.egg-info/PKG-INFO +222 -0
- inference_logging_client-0.1.0/inference_logging_client.egg-info/SOURCES.txt +19 -0
- inference_logging_client-0.1.0/inference_logging_client.egg-info/dependency_links.txt +1 -0
- inference_logging_client-0.1.0/inference_logging_client.egg-info/entry_points.txt +2 -0
- inference_logging_client-0.1.0/inference_logging_client.egg-info/requires.txt +9 -0
- inference_logging_client-0.1.0/inference_logging_client.egg-info/top_level.txt +2 -0
- inference_logging_client-0.1.0/pyproject.toml +68 -0
- inference_logging_client-0.1.0/readme.md +190 -0
- inference_logging_client-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: inference-logging-client
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Decode MPLog feature logs from proto, arrow, or parquet format
|
|
5
|
+
Author-email: Dhruv Gupta <dhruv.gupta@meesho.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Meesho/BharatMLStack
|
|
8
|
+
Project-URL: Repository, https://github.com/Meesho/BharatMLStack
|
|
9
|
+
Project-URL: Issues, https://github.com/Meesho/BharatMLStack/issues
|
|
10
|
+
Keywords: mplog,decoder,inference,logging,proto,arrow,parquet
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering
|
|
22
|
+
Requires-Python: >=3.8
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
Requires-Dist: pandas>=1.3.0
|
|
25
|
+
Requires-Dist: pyarrow>=5.0.0
|
|
26
|
+
Requires-Dist: zstandard>=0.15.0
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
30
|
+
Requires-Dist: black>=22.0.0; extra == "dev"
|
|
31
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
32
|
+
|
|
33
|
+
# Inference Logging Client
|
|
34
|
+
|
|
35
|
+
A Python package for decoding MPLog feature logs from proto, arrow, or parquet format.
|
|
36
|
+
|
|
37
|
+
## Features
|
|
38
|
+
|
|
39
|
+
- Decode MPLog feature logs from multiple encoding formats:
|
|
40
|
+
- **Proto**: Custom binary encoding with generated flag + sequential features
|
|
41
|
+
- **Arrow**: Arrow IPC format with binary columns
|
|
42
|
+
- **Parquet**: Parquet format with feature map
|
|
43
|
+
- Automatic format detection from metadata
|
|
44
|
+
- Support for zstd compression
|
|
45
|
+
- Fetch feature schemas from inference API
|
|
46
|
+
- Convert decoded logs to pandas DataFrames
|
|
47
|
+
- Command-line interface for easy usage
|
|
48
|
+
|
|
49
|
+
## Installation
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install inference-logging-client
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Quick Start
|
|
56
|
+
|
|
57
|
+
### Python API
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
import inference_logging_client
|
|
61
|
+
|
|
62
|
+
# Decode MPLog from bytes
|
|
63
|
+
with open("log.bin", "rb") as f:
|
|
64
|
+
data = f.read()
|
|
65
|
+
|
|
66
|
+
df = inference_logging_client.decode_mplog(
|
|
67
|
+
log_data=data,
|
|
68
|
+
model_proxy_id="my-model",
|
|
69
|
+
version=1
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
print(df.head())
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### Decode from DataFrame
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
import pandas as pd
|
|
79
|
+
import inference_logging_client
|
|
80
|
+
|
|
81
|
+
# Read DataFrame with MPLog columns
|
|
82
|
+
df = pd.read_parquet("logs.parquet")
|
|
83
|
+
|
|
84
|
+
# Decode features
|
|
85
|
+
decoded_df = inference_logging_client.decode_mplog_dataframe(df)
|
|
86
|
+
|
|
87
|
+
print(decoded_df.head())
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Command Line Interface
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
# Decode with auto-detection
|
|
94
|
+
inference-logging-client --model-proxy-id my-model --version 1 input.bin
|
|
95
|
+
|
|
96
|
+
# Specify format explicitly
|
|
97
|
+
inference-logging-client --model-proxy-id my-model --version 1 --format proto input.bin
|
|
98
|
+
|
|
99
|
+
# Output to CSV
|
|
100
|
+
inference-logging-client --model-proxy-id my-model --version 1 input.bin -o output.csv
|
|
101
|
+
|
|
102
|
+
# Decode from stdin (base64)
|
|
103
|
+
echo "BASE64_DATA" | inference-logging-client --model-proxy-id my-model --version 1 --base64 -
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## Configuration
|
|
107
|
+
|
|
108
|
+
The package uses environment variables for configuration:
|
|
109
|
+
|
|
110
|
+
- `INFERENCE_HOST`: Inference service host URL (default: `http://localhost:8082`)
|
|
111
|
+
- `INFERENCE_PATH`: API path for schema fetching (default: `/api/v1/inference/mp-config-registry/get_feature_schema`)
|
|
112
|
+
|
|
113
|
+
## API Reference
|
|
114
|
+
|
|
115
|
+
### `decode_mplog()`
|
|
116
|
+
|
|
117
|
+
Decode MPLog bytes to a pandas DataFrame.
|
|
118
|
+
|
|
119
|
+
**Parameters:**
|
|
120
|
+
- `log_data` (bytes): The MPLog bytes (possibly compressed)
|
|
121
|
+
- `model_proxy_id` (str): The model proxy config ID
|
|
122
|
+
- `version` (int): The schema version
|
|
123
|
+
- `format_type` (Format, optional): The encoding format. If None, auto-detect from metadata.
|
|
124
|
+
- `inference_host` (str, optional): The inference service host URL
|
|
125
|
+
- `decompress` (bool): Whether to attempt zstd decompression (default: True)
|
|
126
|
+
- `schema` (list, optional): Pre-fetched schema (list of FeatureInfo). If provided, skips schema fetch.
|
|
127
|
+
|
|
128
|
+
**Returns:**
|
|
129
|
+
- `pd.DataFrame`: DataFrame with `entity_id` as first column and features as remaining columns
|
|
130
|
+
|
|
131
|
+
### `decode_mplog_dataframe()`
|
|
132
|
+
|
|
133
|
+
Decode MPLog features from a DataFrame with specific column structure.
|
|
134
|
+
|
|
135
|
+
**Parameters:**
|
|
136
|
+
- `df` (pd.DataFrame): Input DataFrame with MPLog data columns
|
|
137
|
+
- `inference_host` (str, optional): The inference service host URL
|
|
138
|
+
- `decompress` (bool): Whether to attempt zstd decompression (default: True)
|
|
139
|
+
- `features_column` (str): Name of the column containing encoded features (default: "features")
|
|
140
|
+
- `metadata_column` (str): Name of the column containing metadata byte (default: "metadata")
|
|
141
|
+
- `mp_config_id_column` (str): Name of the column containing model proxy config ID (default: "mp_config_id")
|
|
142
|
+
|
|
143
|
+
**Returns:**
|
|
144
|
+
- `pd.DataFrame`: DataFrame with decoded features, one row per entity
|
|
145
|
+
|
|
146
|
+
### `get_mplog_metadata()`
|
|
147
|
+
|
|
148
|
+
Extract metadata from MPLog bytes without full decoding.
|
|
149
|
+
|
|
150
|
+
**Parameters:**
|
|
151
|
+
- `log_data` (bytes): The MPLog bytes (possibly compressed)
|
|
152
|
+
- `decompress` (bool): Whether to attempt zstd decompression (default: True)
|
|
153
|
+
|
|
154
|
+
**Returns:**
|
|
155
|
+
- `DecodedMPLog`: Object with metadata fields populated
|
|
156
|
+
|
|
157
|
+
## Supported Feature Types
|
|
158
|
+
|
|
159
|
+
### Scalar Types
|
|
160
|
+
- Integer: INT8, INT16, INT32, INT64, UINT8, UINT16, UINT32, UINT64
|
|
161
|
+
- Float: FP16, FP32, FP64, FP8E5M2, FP8E4M3
|
|
162
|
+
- Boolean: BOOL
|
|
163
|
+
- String: STRING
|
|
164
|
+
|
|
165
|
+
### Vector Types
|
|
166
|
+
- All scalar types can be vectors (e.g., FP32VECTOR, INT64VECTOR)
|
|
167
|
+
- Vectors can be binary-encoded or JSON-encoded
|
|
168
|
+
|
|
169
|
+
## Encoding Formats
|
|
170
|
+
|
|
171
|
+
### Proto Format
|
|
172
|
+
- First byte: generated flag
|
|
173
|
+
- Scalars: fixed size bytes based on type
|
|
174
|
+
- Strings/Vectors: 2-byte little-endian size prefix + data bytes
|
|
175
|
+
|
|
176
|
+
### Arrow Format
|
|
177
|
+
- Arrow IPC format with binary columns
|
|
178
|
+
- Column names are feature indices ("0", "1", ...)
|
|
179
|
+
- Each column contains raw feature value bytes
|
|
180
|
+
|
|
181
|
+
### Parquet Format
|
|
182
|
+
- Parquet file with Features column (`map[int][]byte`)
|
|
183
|
+
- Each row represents an entity
|
|
184
|
+
|
|
185
|
+
## Metadata Byte Layout
|
|
186
|
+
|
|
187
|
+
The metadata byte encodes:
|
|
188
|
+
- Bit 0: compression flag (0=disabled, 1=enabled)
|
|
189
|
+
- Bit 1: reserved
|
|
190
|
+
- Bits 2-5: version (4 bits, 0-15)
|
|
191
|
+
- Bits 6-7: format type (00=proto, 01=arrow, 10=parquet)
|
|
192
|
+
|
|
193
|
+
## Development
|
|
194
|
+
|
|
195
|
+
```bash
|
|
196
|
+
# Install in development mode
|
|
197
|
+
pip install -e .
|
|
198
|
+
|
|
199
|
+
# Install with dev dependencies
|
|
200
|
+
pip install -e ".[dev]"
|
|
201
|
+
|
|
202
|
+
# Run tests
|
|
203
|
+
pytest
|
|
204
|
+
|
|
205
|
+
# Format code
|
|
206
|
+
black inference_logging_client/
|
|
207
|
+
|
|
208
|
+
# Lint code
|
|
209
|
+
ruff check inference_logging_client/
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
## License
|
|
213
|
+
|
|
214
|
+
MIT License
|
|
215
|
+
|
|
216
|
+
## Repository
|
|
217
|
+
|
|
218
|
+
[https://github.com/Meesho/BharatMLStack](https://github.com/Meesho/BharatMLStack)
|
|
219
|
+
|
|
220
|
+
## Contributing
|
|
221
|
+
|
|
222
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
@@ -0,0 +1,443 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Inference Logging Client - Decode MPLog feature logs from proto, arrow, or parquet format.
|
|
3
|
+
|
|
4
|
+
This package provides functionality to:
|
|
5
|
+
1. Decode MPLog feature logs from various encoding formats (proto, arrow, parquet)
|
|
6
|
+
2. Fetch feature schemas from inference API
|
|
7
|
+
3. Convert decoded logs to pandas DataFrames
|
|
8
|
+
|
|
9
|
+
Main functions:
|
|
10
|
+
- decode_mplog: Decode MPLog bytes to a DataFrame
|
|
11
|
+
- decode_mplog_dataframe: Decode MPLog features from a DataFrame
|
|
12
|
+
- get_mplog_metadata: Extract metadata from MPLog bytes
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import warnings
|
|
16
|
+
from typing import Optional
|
|
17
|
+
|
|
18
|
+
import pandas as pd
|
|
19
|
+
|
|
20
|
+
# Check for zstandard availability at import time for clear error messages
|
|
21
|
+
try:
|
|
22
|
+
import zstandard as zstd
|
|
23
|
+
_ZSTD_AVAILABLE = True
|
|
24
|
+
except ImportError:
|
|
25
|
+
_ZSTD_AVAILABLE = False
|
|
26
|
+
zstd = None
|
|
27
|
+
|
|
28
|
+
from .types import Format, FeatureInfo, DecodedMPLog, FORMAT_TYPE_MAP
|
|
29
|
+
from .io import get_feature_schema, parse_mplog_protobuf, get_mplog_metadata, clear_schema_cache
|
|
30
|
+
from .formats import decode_proto_format, decode_arrow_format, decode_parquet_format
|
|
31
|
+
from .utils import format_dataframe_floats, get_format_name, unpack_metadata_byte
|
|
32
|
+
from .exceptions import (
|
|
33
|
+
InferenceLoggingError,
|
|
34
|
+
SchemaFetchError,
|
|
35
|
+
SchemaNotFoundError,
|
|
36
|
+
DecodeError,
|
|
37
|
+
FormatError,
|
|
38
|
+
ProtobufError,
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
__version__ = "0.1.0"
|
|
42
|
+
|
|
43
|
+
# Maximum supported schema version (4 bits = 0-15)
|
|
44
|
+
_MAX_SCHEMA_VERSION = 15
|
|
45
|
+
|
|
46
|
+
__all__ = [
|
|
47
|
+
"decode_mplog",
|
|
48
|
+
"decode_mplog_dataframe",
|
|
49
|
+
"get_mplog_metadata",
|
|
50
|
+
"get_feature_schema",
|
|
51
|
+
"clear_schema_cache",
|
|
52
|
+
"Format",
|
|
53
|
+
"FeatureInfo",
|
|
54
|
+
"DecodedMPLog",
|
|
55
|
+
"get_format_name",
|
|
56
|
+
"unpack_metadata_byte",
|
|
57
|
+
# Exceptions
|
|
58
|
+
"InferenceLoggingError",
|
|
59
|
+
"SchemaFetchError",
|
|
60
|
+
"SchemaNotFoundError",
|
|
61
|
+
"DecodeError",
|
|
62
|
+
"FormatError",
|
|
63
|
+
"ProtobufError",
|
|
64
|
+
]
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _decompress_zstd(data: bytes) -> bytes:
|
|
68
|
+
"""Decompress zstd-compressed data.
|
|
69
|
+
|
|
70
|
+
Args:
|
|
71
|
+
data: Potentially zstd-compressed bytes
|
|
72
|
+
|
|
73
|
+
Returns:
|
|
74
|
+
Decompressed bytes, or original data if not compressed or zstd unavailable
|
|
75
|
+
|
|
76
|
+
Raises:
|
|
77
|
+
ImportError: If data is zstd-compressed but zstandard is not installed
|
|
78
|
+
"""
|
|
79
|
+
# Check for zstd magic number: 0x28 0xB5 0x2F 0xFD
|
|
80
|
+
if len(data) >= 4 and data[:4] == b'\x28\xB5\x2F\xFD':
|
|
81
|
+
if not _ZSTD_AVAILABLE:
|
|
82
|
+
raise ImportError(
|
|
83
|
+
"Data appears to be zstd-compressed but the 'zstandard' package is not installed. "
|
|
84
|
+
"Install it with: pip install zstandard"
|
|
85
|
+
)
|
|
86
|
+
decompressor = zstd.ZstdDecompressor()
|
|
87
|
+
return decompressor.decompress(data)
|
|
88
|
+
return data
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def decode_mplog(
    log_data: bytes,
    model_proxy_id: str,
    version: int,
    format_type: Optional[Format] = None,
    inference_host: Optional[str] = None,
    decompress: bool = True,
    schema: Optional[list] = None
) -> pd.DataFrame:
    """
    Decode MPLog bytes into a pandas DataFrame.

    Args:
        log_data: The MPLog bytes (possibly zstd-compressed)
        model_proxy_id: The model proxy config ID used for schema lookup
        version: The schema version (0-15, stored in 4 bits of the metadata byte)
        format_type: Explicit encoding format (proto, arrow, parquet).
            Auto-detected from the protobuf metadata when None.
        inference_host: The inference service host URL. Falls back to the
            INFERENCE_HOST env var (default http://localhost:8082) when None.
        decompress: Whether to attempt zstd decompression first
        schema: Optional pre-fetched schema (list of FeatureInfo); skips the
            schema fetch when provided.

    Returns:
        pandas DataFrame with entity_id as the first column followed by one
        column per feature in the schema

    Raises:
        ValueError: If version is out of valid range (0-15)
        ImportError: If data is zstd-compressed but zstandard is not installed
        FormatError: If the format is unsupported or data cannot be parsed

    Example:
        >>> import inference_logging_client
        >>> with open("log.bin", "rb") as f:
        ...     data = f.read()
        >>> df = inference_logging_client.decode_mplog(
        ...     log_data=data,
        ...     model_proxy_id="my-model",
        ...     version=1
        ... )
        >>> print(df.head())
    """
    import os

    # The metadata byte only has 4 bits for the version, so reject anything
    # that could not round-trip through it.
    if version < 0 or version > _MAX_SCHEMA_VERSION:
        raise ValueError(
            f"Version {version} is out of valid range (0-{_MAX_SCHEMA_VERSION}). "
            f"Version is encoded in 4 bits of the metadata byte."
        )

    host = inference_host if inference_host is not None else os.getenv(
        "INFERENCE_HOST", "http://localhost:8082"
    )

    payload = _decompress_zstd(log_data) if decompress else log_data

    # Resolve the encoding format: honor an explicit choice, otherwise read it
    # from the protobuf metadata, defaulting to proto when unrecognized.
    detected_format = format_type
    if detected_format is None:
        parsed = parse_mplog_protobuf(payload)
        detected_format = FORMAT_TYPE_MAP.get(parsed.format_type, Format.PROTO)

    # Use the caller-supplied schema when present; fetch otherwise.
    feature_schema = schema if schema is not None else get_feature_schema(
        model_proxy_id, version, host
    )

    # Dispatch to the decoder for the resolved format.
    decoders = {
        Format.PROTO: decode_proto_format,
        Format.ARROW: decode_arrow_format,
        Format.PARQUET: decode_parquet_format,
    }
    decoder = decoders.get(detected_format)
    if decoder is None:
        raise FormatError(f"Unsupported format: {detected_format}")
    entity_ids, decoded_rows = decoder(payload, feature_schema)

    if not decoded_rows:
        # Preserve the schema's column layout even when nothing decoded.
        return pd.DataFrame(columns=["entity_id"] + [f.name for f in feature_schema])

    result = pd.DataFrame(decoded_rows)
    result.insert(0, "entity_id", entity_ids)
    return result
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def decode_mplog_dataframe(
|
|
189
|
+
df: pd.DataFrame,
|
|
190
|
+
inference_host: Optional[str] = None,
|
|
191
|
+
decompress: bool = True,
|
|
192
|
+
features_column: str = "features",
|
|
193
|
+
metadata_column: str = "metadata",
|
|
194
|
+
mp_config_id_column: str = "mp_config_id"
|
|
195
|
+
) -> pd.DataFrame:
|
|
196
|
+
"""
|
|
197
|
+
Decode MPLog features from a DataFrame with specific column structure.
|
|
198
|
+
|
|
199
|
+
Expected DataFrame columns:
|
|
200
|
+
- prism_ingested_at, prism_extracted_at, created_at
|
|
201
|
+
- entities, features, metadata
|
|
202
|
+
- mp_config_id, parent_entity, tracking_id, user_id
|
|
203
|
+
- year, month, day, hour
|
|
204
|
+
|
|
205
|
+
Args:
|
|
206
|
+
df: Input DataFrame with MPLog data columns
|
|
207
|
+
inference_host: The inference service host URL. If None, reads from INFERENCE_HOST env var.
|
|
208
|
+
decompress: Whether to attempt zstd decompression
|
|
209
|
+
features_column: Name of the column containing encoded features (default: "features")
|
|
210
|
+
metadata_column: Name of the column containing metadata byte (default: "metadata")
|
|
211
|
+
mp_config_id_column: Name of the column containing model proxy config ID (default: "mp_config_id")
|
|
212
|
+
|
|
213
|
+
Returns:
|
|
214
|
+
pandas DataFrame with decoded features. Each row from input becomes multiple rows
|
|
215
|
+
(one per entity) with entity_id as first column and features as remaining columns.
|
|
216
|
+
Original row metadata (prism_ingested_at, mp_config_id, etc.) is preserved.
|
|
217
|
+
|
|
218
|
+
Example:
|
|
219
|
+
>>> import pandas as pd
|
|
220
|
+
>>> import inference_logging_client
|
|
221
|
+
>>> df = pd.read_parquet("logs.parquet")
|
|
222
|
+
>>> decoded_df = inference_logging_client.decode_mplog_dataframe(df)
|
|
223
|
+
>>> print(decoded_df.head())
|
|
224
|
+
"""
|
|
225
|
+
import os
|
|
226
|
+
import sys
|
|
227
|
+
import json
|
|
228
|
+
import base64
|
|
229
|
+
|
|
230
|
+
# Read from environment variable if not provided
|
|
231
|
+
if inference_host is None:
|
|
232
|
+
inference_host = os.getenv("INFERENCE_HOST", "http://localhost:8082")
|
|
233
|
+
|
|
234
|
+
# Track decode errors for summary
|
|
235
|
+
decode_errors = []
|
|
236
|
+
|
|
237
|
+
if df.empty:
|
|
238
|
+
return pd.DataFrame()
|
|
239
|
+
|
|
240
|
+
# Validate required columns
|
|
241
|
+
required_columns = [features_column, metadata_column, mp_config_id_column]
|
|
242
|
+
missing_columns = [col for col in required_columns if col not in df.columns]
|
|
243
|
+
if missing_columns:
|
|
244
|
+
raise ValueError(f"Missing required columns: {missing_columns}")
|
|
245
|
+
|
|
246
|
+
# Pre-fetch schemas for unique (mp_config_id, version) combinations to avoid
|
|
247
|
+
# redundant HTTP requests during row iteration.
|
|
248
|
+
# Key: (mp_config_id, version) only - host/path intentionally excluded as schemas are canonical
|
|
249
|
+
schema_cache: dict[tuple[str, int], list[FeatureInfo]] = {}
|
|
250
|
+
|
|
251
|
+
# First pass: collect unique (mp_config_id, version) pairs
|
|
252
|
+
for idx, row in df.iterrows():
|
|
253
|
+
# Extract metadata byte to get version
|
|
254
|
+
metadata_data = row[metadata_column]
|
|
255
|
+
metadata_byte = 0
|
|
256
|
+
if not pd.isna(metadata_data):
|
|
257
|
+
if isinstance(metadata_data, (int, float)):
|
|
258
|
+
metadata_byte = int(metadata_data)
|
|
259
|
+
elif isinstance(metadata_data, bytes) and len(metadata_data) > 0:
|
|
260
|
+
metadata_byte = metadata_data[0]
|
|
261
|
+
elif isinstance(metadata_data, (bytearray, memoryview)) and len(metadata_data) > 0:
|
|
262
|
+
metadata_byte = metadata_data[0]
|
|
263
|
+
elif isinstance(metadata_data, str):
|
|
264
|
+
try:
|
|
265
|
+
metadata_byte = int(metadata_data)
|
|
266
|
+
except ValueError:
|
|
267
|
+
pass
|
|
268
|
+
|
|
269
|
+
_, version, _ = unpack_metadata_byte(metadata_byte)
|
|
270
|
+
|
|
271
|
+
# Skip invalid versions
|
|
272
|
+
if not (0 <= version <= _MAX_SCHEMA_VERSION):
|
|
273
|
+
continue
|
|
274
|
+
|
|
275
|
+
# Extract mp_config_id
|
|
276
|
+
mp_config_id = row[mp_config_id_column]
|
|
277
|
+
if pd.isna(mp_config_id):
|
|
278
|
+
continue
|
|
279
|
+
mp_config_id = str(mp_config_id)
|
|
280
|
+
|
|
281
|
+
cache_key = (mp_config_id, version)
|
|
282
|
+
if cache_key not in schema_cache:
|
|
283
|
+
# Pre-fetch schema and store in local cache
|
|
284
|
+
try:
|
|
285
|
+
schema_cache[cache_key] = get_feature_schema(mp_config_id, version, inference_host)
|
|
286
|
+
except Exception as e:
|
|
287
|
+
# Log warning but don't fail - will be caught again in main loop
|
|
288
|
+
warnings.warn(f"Failed to pre-fetch schema for {cache_key}: {e}", UserWarning)
|
|
289
|
+
|
|
290
|
+
all_decoded_rows = []
|
|
291
|
+
|
|
292
|
+
for idx, row in df.iterrows():
|
|
293
|
+
# Extract features bytes
|
|
294
|
+
features_data = row[features_column]
|
|
295
|
+
if pd.isna(features_data):
|
|
296
|
+
continue
|
|
297
|
+
|
|
298
|
+
# Convert features to bytes (handle base64, hex, or raw bytes)
|
|
299
|
+
features_bytes = None
|
|
300
|
+
if isinstance(features_data, bytes):
|
|
301
|
+
features_bytes = features_data
|
|
302
|
+
elif isinstance(features_data, str):
|
|
303
|
+
# Try base64 first
|
|
304
|
+
try:
|
|
305
|
+
features_bytes = base64.b64decode(features_data)
|
|
306
|
+
except Exception:
|
|
307
|
+
# Try hex
|
|
308
|
+
try:
|
|
309
|
+
features_bytes = bytes.fromhex(features_data)
|
|
310
|
+
except Exception:
|
|
311
|
+
# Try UTF-8 encoding
|
|
312
|
+
features_bytes = features_data.encode('utf-8')
|
|
313
|
+
elif isinstance(features_data, (bytearray, memoryview)):
|
|
314
|
+
features_bytes = bytes(features_data)
|
|
315
|
+
else:
|
|
316
|
+
continue
|
|
317
|
+
|
|
318
|
+
if features_bytes is None or len(features_bytes) == 0:
|
|
319
|
+
continue
|
|
320
|
+
|
|
321
|
+
# Extract metadata byte
|
|
322
|
+
metadata_data = row[metadata_column]
|
|
323
|
+
metadata_byte = 0
|
|
324
|
+
if not pd.isna(metadata_data):
|
|
325
|
+
if isinstance(metadata_data, (int, float)):
|
|
326
|
+
metadata_byte = int(metadata_data)
|
|
327
|
+
elif isinstance(metadata_data, bytes) and len(metadata_data) > 0:
|
|
328
|
+
metadata_byte = metadata_data[0]
|
|
329
|
+
elif isinstance(metadata_data, (bytearray, memoryview)) and len(metadata_data) > 0:
|
|
330
|
+
metadata_byte = metadata_data[0]
|
|
331
|
+
elif isinstance(metadata_data, str):
|
|
332
|
+
try:
|
|
333
|
+
metadata_byte = int(metadata_data)
|
|
334
|
+
except ValueError:
|
|
335
|
+
pass
|
|
336
|
+
|
|
337
|
+
# Extract version from metadata byte
|
|
338
|
+
_, version, _ = unpack_metadata_byte(metadata_byte)
|
|
339
|
+
|
|
340
|
+
# Validate version range
|
|
341
|
+
if not (0 <= version <= _MAX_SCHEMA_VERSION):
|
|
342
|
+
warnings.warn(
|
|
343
|
+
f"Row {idx}: Version {version} extracted from metadata is out of valid range (0-{_MAX_SCHEMA_VERSION}). "
|
|
344
|
+
f"This may indicate corrupted metadata.",
|
|
345
|
+
UserWarning
|
|
346
|
+
)
|
|
347
|
+
continue
|
|
348
|
+
|
|
349
|
+
# Extract mp_config_id
|
|
350
|
+
mp_config_id = row[mp_config_id_column]
|
|
351
|
+
if pd.isna(mp_config_id):
|
|
352
|
+
continue
|
|
353
|
+
mp_config_id = str(mp_config_id)
|
|
354
|
+
|
|
355
|
+
# Lookup cached schema
|
|
356
|
+
cache_key = (mp_config_id, version)
|
|
357
|
+
cached_schema = schema_cache.get(cache_key)
|
|
358
|
+
|
|
359
|
+
# Decode this row's features using cached schema
|
|
360
|
+
try:
|
|
361
|
+
decoded_df = decode_mplog(
|
|
362
|
+
log_data=features_bytes,
|
|
363
|
+
model_proxy_id=mp_config_id,
|
|
364
|
+
version=version,
|
|
365
|
+
format_type=None, # Auto-detect from metadata
|
|
366
|
+
inference_host=inference_host,
|
|
367
|
+
decompress=decompress,
|
|
368
|
+
schema=cached_schema # Pass cached schema to avoid redundant fetches
|
|
369
|
+
)
|
|
370
|
+
|
|
371
|
+
# Add original row metadata to each decoded entity row
|
|
372
|
+
if not decoded_df.empty:
|
|
373
|
+
# Preserve original metadata columns
|
|
374
|
+
metadata_columns = [
|
|
375
|
+
"prism_ingested_at", "prism_extracted_at", "created_at",
|
|
376
|
+
"mp_config_id", "parent_entity", "tracking_id", "user_id",
|
|
377
|
+
"year", "month", "day", "hour"
|
|
378
|
+
]
|
|
379
|
+
|
|
380
|
+
for col in metadata_columns:
|
|
381
|
+
if col in df.columns:
|
|
382
|
+
decoded_df[col] = row[col]
|
|
383
|
+
|
|
384
|
+
# Update entity_id from entities column if available and matches count
|
|
385
|
+
if "entities" in df.columns and not pd.isna(row["entities"]):
|
|
386
|
+
# entities might be a list or string representation
|
|
387
|
+
entities_val = row["entities"]
|
|
388
|
+
if isinstance(entities_val, str):
|
|
389
|
+
try:
|
|
390
|
+
entities_val = json.loads(entities_val)
|
|
391
|
+
except (json.JSONDecodeError, ValueError):
|
|
392
|
+
entities_val = [entities_val]
|
|
393
|
+
elif not isinstance(entities_val, list):
|
|
394
|
+
entities_val = [entities_val]
|
|
395
|
+
|
|
396
|
+
# Match entities with decoded rows (only if counts match)
|
|
397
|
+
if len(entities_val) == len(decoded_df):
|
|
398
|
+
decoded_df["entity_id"] = entities_val
|
|
399
|
+
|
|
400
|
+
# Add parent_entity if it exists
|
|
401
|
+
if "parent_entity" in df.columns and not pd.isna(row["parent_entity"]):
|
|
402
|
+
parent_val = row["parent_entity"]
|
|
403
|
+
if isinstance(parent_val, str):
|
|
404
|
+
try:
|
|
405
|
+
parent_val = json.loads(parent_val)
|
|
406
|
+
except (json.JSONDecodeError, ValueError):
|
|
407
|
+
parent_val = [parent_val]
|
|
408
|
+
if isinstance(parent_val, list):
|
|
409
|
+
# If list, use first element or join if multiple
|
|
410
|
+
if len(parent_val) == 1:
|
|
411
|
+
decoded_df["parent_entity"] = parent_val[0]
|
|
412
|
+
elif len(parent_val) > 1:
|
|
413
|
+
decoded_df["parent_entity"] = str(parent_val)
|
|
414
|
+
else:
|
|
415
|
+
decoded_df["parent_entity"] = None
|
|
416
|
+
else:
|
|
417
|
+
decoded_df["parent_entity"] = parent_val
|
|
418
|
+
|
|
419
|
+
all_decoded_rows.append(decoded_df)
|
|
420
|
+
except Exception as e:
|
|
421
|
+
# Track error but continue processing other rows
|
|
422
|
+
decode_errors.append((idx, str(e)))
|
|
423
|
+
warnings.warn(f"Failed to decode row {idx}: {e}", UserWarning)
|
|
424
|
+
continue
|
|
425
|
+
|
|
426
|
+
if not all_decoded_rows:
|
|
427
|
+
return pd.DataFrame()
|
|
428
|
+
|
|
429
|
+
# Combine all decoded DataFrames
|
|
430
|
+
result_df = pd.concat(all_decoded_rows, ignore_index=True)
|
|
431
|
+
|
|
432
|
+
# Reorder columns: entity_id first, then metadata columns, then features
|
|
433
|
+
metadata_cols = ["entity_id"]
|
|
434
|
+
for col in ["prism_ingested_at", "prism_extracted_at", "created_at",
|
|
435
|
+
"mp_config_id", "parent_entity", "tracking_id", "user_id",
|
|
436
|
+
"year", "month", "day", "hour"]:
|
|
437
|
+
if col in result_df.columns:
|
|
438
|
+
metadata_cols.append(col)
|
|
439
|
+
|
|
440
|
+
feature_cols = [col for col in result_df.columns if col not in metadata_cols]
|
|
441
|
+
column_order = metadata_cols + feature_cols
|
|
442
|
+
|
|
443
|
+
return result_df[column_order]
|