inference-logging-client 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ include README.md
2
+ include LICENSE
3
+ include pyproject.toml
@@ -0,0 +1,222 @@
1
+ Metadata-Version: 2.4
2
+ Name: inference-logging-client
3
+ Version: 0.1.0
4
+ Summary: Decode MPLog feature logs from proto, arrow, or parquet format
5
+ Author-email: Dhruv Gupta <dhruv.gupta@meesho.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/Meesho/BharatMLStack
8
+ Project-URL: Repository, https://github.com/Meesho/BharatMLStack
9
+ Project-URL: Issues, https://github.com/Meesho/BharatMLStack/issues
10
+ Keywords: mplog,decoder,inference,logging,proto,arrow,parquet
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Classifier: Topic :: Scientific/Engineering
22
+ Requires-Python: >=3.8
23
+ Description-Content-Type: text/markdown
24
+ Requires-Dist: pandas>=1.3.0
25
+ Requires-Dist: pyarrow>=5.0.0
26
+ Requires-Dist: zstandard>=0.15.0
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
29
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
30
+ Requires-Dist: black>=22.0.0; extra == "dev"
31
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
32
+
33
+ # Inference Logging Client
34
+
35
+ A Python package for decoding MPLog feature logs from proto, arrow, or parquet format.
36
+
37
+ ## Features
38
+
39
+ - Decode MPLog feature logs from multiple encoding formats:
40
+ - **Proto**: Custom binary encoding with generated flag + sequential features
41
+ - **Arrow**: Arrow IPC format with binary columns
42
+ - **Parquet**: Parquet format with feature map
43
+ - Automatic format detection from metadata
44
+ - Support for zstd compression
45
+ - Fetch feature schemas from inference API
46
+ - Convert decoded logs to pandas DataFrames
47
+ - Command-line interface for easy usage
48
+
49
+ ## Installation
50
+
51
+ ```bash
52
+ pip install inference-logging-client
53
+ ```
54
+
55
+ ## Quick Start
56
+
57
+ ### Python API
58
+
59
+ ```python
60
+ import inference_logging_client
61
+
62
+ # Decode MPLog from bytes
63
+ with open("log.bin", "rb") as f:
64
+ data = f.read()
65
+
66
+ df = inference_logging_client.decode_mplog(
67
+ log_data=data,
68
+ model_proxy_id="my-model",
69
+ version=1
70
+ )
71
+
72
+ print(df.head())
73
+ ```
74
+
75
+ ### Decode from DataFrame
76
+
77
+ ```python
78
+ import pandas as pd
79
+ import inference_logging_client
80
+
81
+ # Read DataFrame with MPLog columns
82
+ df = pd.read_parquet("logs.parquet")
83
+
84
+ # Decode features
85
+ decoded_df = inference_logging_client.decode_mplog_dataframe(df)
86
+
87
+ print(decoded_df.head())
88
+ ```
89
+
90
+ ### Command Line Interface
91
+
92
+ ```bash
93
+ # Decode with auto-detection
94
+ inference-logging-client --model-proxy-id my-model --version 1 input.bin
95
+
96
+ # Specify format explicitly
97
+ inference-logging-client --model-proxy-id my-model --version 1 --format proto input.bin
98
+
99
+ # Output to CSV
100
+ inference-logging-client --model-proxy-id my-model --version 1 input.bin -o output.csv
101
+
102
+ # Decode from stdin (base64)
103
+ echo "BASE64_DATA" | inference-logging-client --model-proxy-id my-model --version 1 --base64 -
104
+ ```
105
+
106
+ ## Configuration
107
+
108
+ The package uses environment variables for configuration:
109
+
110
+ - `INFERENCE_HOST`: Inference service host URL (default: `http://localhost:8082`)
111
+ - `INFERENCE_PATH`: API path for schema fetching (default: `/api/v1/inference/mp-config-registry/get_feature_schema`)
112
+
113
+ ## API Reference
114
+
115
+ ### `decode_mplog()`
116
+
117
+ Decode MPLog bytes to a pandas DataFrame.
118
+
119
+ **Parameters:**
120
+ - `log_data` (bytes): The MPLog bytes (possibly compressed)
121
+ - `model_proxy_id` (str): The model proxy config ID
122
+ - `version` (int): The schema version
123
+ - `format_type` (Format, optional): The encoding format. If None, auto-detect from metadata.
124
+ - `inference_host` (str, optional): The inference service host URL
125
+ - `decompress` (bool): Whether to attempt zstd decompression (default: True)
126
+ - `schema` (list, optional): Pre-fetched schema (list of FeatureInfo). If provided, skips schema fetch.
127
+
128
+ **Returns:**
129
+ - `pd.DataFrame`: DataFrame with `entity_id` as first column and features as remaining columns
130
+
131
+ ### `decode_mplog_dataframe()`
132
+
133
+ Decode MPLog features from a DataFrame with specific column structure.
134
+
135
+ **Parameters:**
136
+ - `df` (pd.DataFrame): Input DataFrame with MPLog data columns
137
+ - `inference_host` (str, optional): The inference service host URL
138
+ - `decompress` (bool): Whether to attempt zstd decompression (default: True)
139
+ - `features_column` (str): Name of the column containing encoded features (default: "features")
140
+ - `metadata_column` (str): Name of the column containing metadata byte (default: "metadata")
141
+ - `mp_config_id_column` (str): Name of the column containing model proxy config ID (default: "mp_config_id")
142
+
143
+ **Returns:**
144
+ - `pd.DataFrame`: DataFrame with decoded features, one row per entity
145
+
146
+ ### `get_mplog_metadata()`
147
+
148
+ Extract metadata from MPLog bytes without full decoding.
149
+
150
+ **Parameters:**
151
+ - `log_data` (bytes): The MPLog bytes (possibly compressed)
152
+ - `decompress` (bool): Whether to attempt zstd decompression (default: True)
153
+
154
+ **Returns:**
155
+ - `DecodedMPLog`: Object with metadata fields populated
156
+
157
+ ## Supported Feature Types
158
+
159
+ ### Scalar Types
160
+ - Integer: INT8, INT16, INT32, INT64, UINT8, UINT16, UINT32, UINT64
161
+ - Float: FP16, FP32, FP64, FP8E5M2, FP8E4M3
162
+ - Boolean: BOOL
163
+ - String: STRING
164
+
165
+ ### Vector Types
166
+ - All scalar types can be vectors (e.g., FP32VECTOR, INT64VECTOR)
167
+ - Vectors can be binary-encoded or JSON-encoded
168
+
169
+ ## Encoding Formats
170
+
171
+ ### Proto Format
172
+ - First byte: generated flag
173
+ - Scalars: fixed size bytes based on type
174
+ - Strings/Vectors: 2-byte little-endian size prefix + data bytes
175
+
176
+ ### Arrow Format
177
+ - Arrow IPC format with binary columns
178
+ - Column names are feature indices ("0", "1", ...)
179
+ - Each column contains raw feature value bytes
180
+
181
+ ### Parquet Format
182
+ - Parquet file with Features column (`map[int][]byte`)
183
+ - Each row represents an entity
184
+
185
+ ## Metadata Byte Layout
186
+
187
+ The metadata byte encodes:
188
+ - Bit 0: compression flag (0=disabled, 1=enabled)
189
+ - Bit 1: reserved
190
+ - Bits 2-5: version (4 bits, 0-15)
191
+ - Bits 6-7: format type (00=proto, 01=arrow, 10=parquet)
192
+
193
+ ## Development
194
+
195
+ ```bash
196
+ # Install in development mode
197
+ pip install -e .
198
+
199
+ # Install with dev dependencies
200
+ pip install -e ".[dev]"
201
+
202
+ # Run tests
203
+ pytest
204
+
205
+ # Format code
206
+ black src/
207
+
208
+ # Lint code
209
+ ruff check src/
210
+ ```
211
+
212
+ ## License
213
+
214
+ MIT License
215
+
216
+ ## Repository
217
+
218
+ [https://github.com/Meesho/BharatMLStack](https://github.com/Meesho/BharatMLStack)
219
+
220
+ ## Contributing
221
+
222
+ Contributions are welcome! Please feel free to submit a Pull Request.
@@ -0,0 +1,443 @@
1
+ """
2
+ Inference Logging Client - Decode MPLog feature logs from proto, arrow, or parquet format.
3
+
4
+ This package provides functionality to:
5
+ 1. Decode MPLog feature logs from various encoding formats (proto, arrow, parquet)
6
+ 2. Fetch feature schemas from inference API
7
+ 3. Convert decoded logs to pandas DataFrames
8
+
9
+ Main functions:
10
+ - decode_mplog: Decode MPLog bytes to a DataFrame
11
+ - decode_mplog_dataframe: Decode MPLog features from a DataFrame
12
+ - get_mplog_metadata: Extract metadata from MPLog bytes
13
+ """
14
+
15
+ import warnings
16
+ from typing import Optional
17
+
18
+ import pandas as pd
19
+
20
+ # Check for zstandard availability at import time for clear error messages
21
+ try:
22
+ import zstandard as zstd
23
+ _ZSTD_AVAILABLE = True
24
+ except ImportError:
25
+ _ZSTD_AVAILABLE = False
26
+ zstd = None
27
+
28
+ from .types import Format, FeatureInfo, DecodedMPLog, FORMAT_TYPE_MAP
29
+ from .io import get_feature_schema, parse_mplog_protobuf, get_mplog_metadata, clear_schema_cache
30
+ from .formats import decode_proto_format, decode_arrow_format, decode_parquet_format
31
+ from .utils import format_dataframe_floats, get_format_name, unpack_metadata_byte
32
+ from .exceptions import (
33
+ InferenceLoggingError,
34
+ SchemaFetchError,
35
+ SchemaNotFoundError,
36
+ DecodeError,
37
+ FormatError,
38
+ ProtobufError,
39
+ )
40
+
41
+ __version__ = "0.1.0"
42
+
43
+ # Maximum supported schema version (4 bits = 0-15)
44
+ _MAX_SCHEMA_VERSION = 15
45
+
46
+ __all__ = [
47
+ "decode_mplog",
48
+ "decode_mplog_dataframe",
49
+ "get_mplog_metadata",
50
+ "get_feature_schema",
51
+ "clear_schema_cache",
52
+ "Format",
53
+ "FeatureInfo",
54
+ "DecodedMPLog",
55
+ "get_format_name",
56
+ "unpack_metadata_byte",
57
+ # Exceptions
58
+ "InferenceLoggingError",
59
+ "SchemaFetchError",
60
+ "SchemaNotFoundError",
61
+ "DecodeError",
62
+ "FormatError",
63
+ "ProtobufError",
64
+ ]
65
+
66
+
67
+ def _decompress_zstd(data: bytes) -> bytes:
68
+ """Decompress zstd-compressed data.
69
+
70
+ Args:
71
+ data: Potentially zstd-compressed bytes
72
+
73
+ Returns:
74
+ Decompressed bytes, or original data if not compressed or zstd unavailable
75
+
76
+ Raises:
77
+ ImportError: If data is zstd-compressed but zstandard is not installed
78
+ """
79
+ # Check for zstd magic number: 0x28 0xB5 0x2F 0xFD
80
+ if len(data) >= 4 and data[:4] == b'\x28\xB5\x2F\xFD':
81
+ if not _ZSTD_AVAILABLE:
82
+ raise ImportError(
83
+ "Data appears to be zstd-compressed but the 'zstandard' package is not installed. "
84
+ "Install it with: pip install zstandard"
85
+ )
86
+ decompressor = zstd.ZstdDecompressor()
87
+ return decompressor.decompress(data)
88
+ return data
89
+
90
+
91
def decode_mplog(
    log_data: bytes,
    model_proxy_id: str,
    version: int,
    format_type: Optional[Format] = None,
    inference_host: Optional[str] = None,
    decompress: bool = True,
    schema: Optional[list] = None
) -> pd.DataFrame:
    """
    Decode MPLog bytes into a pandas DataFrame.

    Args:
        log_data: The MPLog bytes (possibly zstd-compressed).
        model_proxy_id: The model proxy config ID used for schema lookup.
        version: The schema version (0-15; it is stored in 4 bits of the
            metadata byte).
        format_type: Encoding format (proto, arrow, parquet). When None the
            format is auto-detected from the embedded metadata.
        inference_host: Inference service host URL. When None, read from the
            INFERENCE_HOST env var (default http://localhost:8082).
        decompress: Whether to attempt zstd decompression first.
        schema: Pre-fetched schema (list of FeatureInfo). When provided, the
            HTTP schema fetch is skipped.

    Returns:
        DataFrame whose first column is ``entity_id`` followed by one column
        per feature.

    Raises:
        ValueError: ``version`` is outside the 0-15 range.
        ImportError: Data is zstd-compressed but zstandard is not installed.
        FormatError: The format is unsupported or the data cannot be parsed.

    Example:
        >>> import inference_logging_client
        >>> with open("log.bin", "rb") as f:
        ...     data = f.read()
        >>> df = inference_logging_client.decode_mplog(
        ...     log_data=data,
        ...     model_proxy_id="my-model",
        ...     version=1
        ... )
        >>> print(df.head())
    """
    import os

    # Only 4 bits of the metadata byte carry the version, so reject anything
    # that could never round-trip.
    if version < 0 or version > _MAX_SCHEMA_VERSION:
        raise ValueError(
            f"Version {version} is out of valid range (0-{_MAX_SCHEMA_VERSION}). "
            f"Version is encoded in 4 bits of the metadata byte."
        )

    # Resolve the host from the environment only when the caller gave none.
    host = os.getenv("INFERENCE_HOST", "http://localhost:8082") if inference_host is None else inference_host

    # Optionally strip zstd compression before looking at the payload.
    payload = _decompress_zstd(log_data) if decompress else log_data

    # Honour the caller's format choice; otherwise parse the protobuf wrapper
    # and read the format out of its metadata, defaulting to proto for any
    # unknown format code.
    fmt = format_type
    if fmt is None:
        parsed = parse_mplog_protobuf(payload)
        fmt = FORMAT_TYPE_MAP.get(parsed.format_type, Format.PROTO)

    # Use the pre-fetched schema when supplied; otherwise fetch it.
    feature_schema = schema
    if feature_schema is None:
        feature_schema = get_feature_schema(model_proxy_id, version, host)

    # Dispatch to the decoder that matches the resolved format.
    decoders = {
        Format.PROTO: decode_proto_format,
        Format.ARROW: decode_arrow_format,
        Format.PARQUET: decode_parquet_format,
    }
    decoder = decoders.get(fmt)
    if decoder is None:
        raise FormatError(f"Unsupported format: {fmt}")
    entity_ids, decoded_rows = decoder(payload, feature_schema)

    if not decoded_rows:
        # Preserve the expected column layout even when nothing decoded.
        return pd.DataFrame(columns=["entity_id"] + [f.name for f in feature_schema])

    result = pd.DataFrame(decoded_rows)
    result.insert(0, "entity_id", entity_ids)  # entity_id always leads
    return result
186
+
187
+
188
+ def decode_mplog_dataframe(
189
+ df: pd.DataFrame,
190
+ inference_host: Optional[str] = None,
191
+ decompress: bool = True,
192
+ features_column: str = "features",
193
+ metadata_column: str = "metadata",
194
+ mp_config_id_column: str = "mp_config_id"
195
+ ) -> pd.DataFrame:
196
+ """
197
+ Decode MPLog features from a DataFrame with specific column structure.
198
+
199
+ Expected DataFrame columns:
200
+ - prism_ingested_at, prism_extracted_at, created_at
201
+ - entities, features, metadata
202
+ - mp_config_id, parent_entity, tracking_id, user_id
203
+ - year, month, day, hour
204
+
205
+ Args:
206
+ df: Input DataFrame with MPLog data columns
207
+ inference_host: The inference service host URL. If None, reads from INFERENCE_HOST env var.
208
+ decompress: Whether to attempt zstd decompression
209
+ features_column: Name of the column containing encoded features (default: "features")
210
+ metadata_column: Name of the column containing metadata byte (default: "metadata")
211
+ mp_config_id_column: Name of the column containing model proxy config ID (default: "mp_config_id")
212
+
213
+ Returns:
214
+ pandas DataFrame with decoded features. Each row from input becomes multiple rows
215
+ (one per entity) with entity_id as first column and features as remaining columns.
216
+ Original row metadata (prism_ingested_at, mp_config_id, etc.) is preserved.
217
+
218
+ Example:
219
+ >>> import pandas as pd
220
+ >>> import inference_logging_client
221
+ >>> df = pd.read_parquet("logs.parquet")
222
+ >>> decoded_df = inference_logging_client.decode_mplog_dataframe(df)
223
+ >>> print(decoded_df.head())
224
+ """
225
+ import os
226
+ import sys
227
+ import json
228
+ import base64
229
+
230
+ # Read from environment variable if not provided
231
+ if inference_host is None:
232
+ inference_host = os.getenv("INFERENCE_HOST", "http://localhost:8082")
233
+
234
+ # Track decode errors for summary
235
+ decode_errors = []
236
+
237
+ if df.empty:
238
+ return pd.DataFrame()
239
+
240
+ # Validate required columns
241
+ required_columns = [features_column, metadata_column, mp_config_id_column]
242
+ missing_columns = [col for col in required_columns if col not in df.columns]
243
+ if missing_columns:
244
+ raise ValueError(f"Missing required columns: {missing_columns}")
245
+
246
+ # Pre-fetch schemas for unique (mp_config_id, version) combinations to avoid
247
+ # redundant HTTP requests during row iteration.
248
+ # Key: (mp_config_id, version) only - host/path intentionally excluded as schemas are canonical
249
+ schema_cache: dict[tuple[str, int], list[FeatureInfo]] = {}
250
+
251
+ # First pass: collect unique (mp_config_id, version) pairs
252
+ for idx, row in df.iterrows():
253
+ # Extract metadata byte to get version
254
+ metadata_data = row[metadata_column]
255
+ metadata_byte = 0
256
+ if not pd.isna(metadata_data):
257
+ if isinstance(metadata_data, (int, float)):
258
+ metadata_byte = int(metadata_data)
259
+ elif isinstance(metadata_data, bytes) and len(metadata_data) > 0:
260
+ metadata_byte = metadata_data[0]
261
+ elif isinstance(metadata_data, (bytearray, memoryview)) and len(metadata_data) > 0:
262
+ metadata_byte = metadata_data[0]
263
+ elif isinstance(metadata_data, str):
264
+ try:
265
+ metadata_byte = int(metadata_data)
266
+ except ValueError:
267
+ pass
268
+
269
+ _, version, _ = unpack_metadata_byte(metadata_byte)
270
+
271
+ # Skip invalid versions
272
+ if not (0 <= version <= _MAX_SCHEMA_VERSION):
273
+ continue
274
+
275
+ # Extract mp_config_id
276
+ mp_config_id = row[mp_config_id_column]
277
+ if pd.isna(mp_config_id):
278
+ continue
279
+ mp_config_id = str(mp_config_id)
280
+
281
+ cache_key = (mp_config_id, version)
282
+ if cache_key not in schema_cache:
283
+ # Pre-fetch schema and store in local cache
284
+ try:
285
+ schema_cache[cache_key] = get_feature_schema(mp_config_id, version, inference_host)
286
+ except Exception as e:
287
+ # Log warning but don't fail - will be caught again in main loop
288
+ warnings.warn(f"Failed to pre-fetch schema for {cache_key}: {e}", UserWarning)
289
+
290
+ all_decoded_rows = []
291
+
292
+ for idx, row in df.iterrows():
293
+ # Extract features bytes
294
+ features_data = row[features_column]
295
+ if pd.isna(features_data):
296
+ continue
297
+
298
+ # Convert features to bytes (handle base64, hex, or raw bytes)
299
+ features_bytes = None
300
+ if isinstance(features_data, bytes):
301
+ features_bytes = features_data
302
+ elif isinstance(features_data, str):
303
+ # Try base64 first
304
+ try:
305
+ features_bytes = base64.b64decode(features_data)
306
+ except Exception:
307
+ # Try hex
308
+ try:
309
+ features_bytes = bytes.fromhex(features_data)
310
+ except Exception:
311
+ # Try UTF-8 encoding
312
+ features_bytes = features_data.encode('utf-8')
313
+ elif isinstance(features_data, (bytearray, memoryview)):
314
+ features_bytes = bytes(features_data)
315
+ else:
316
+ continue
317
+
318
+ if features_bytes is None or len(features_bytes) == 0:
319
+ continue
320
+
321
+ # Extract metadata byte
322
+ metadata_data = row[metadata_column]
323
+ metadata_byte = 0
324
+ if not pd.isna(metadata_data):
325
+ if isinstance(metadata_data, (int, float)):
326
+ metadata_byte = int(metadata_data)
327
+ elif isinstance(metadata_data, bytes) and len(metadata_data) > 0:
328
+ metadata_byte = metadata_data[0]
329
+ elif isinstance(metadata_data, (bytearray, memoryview)) and len(metadata_data) > 0:
330
+ metadata_byte = metadata_data[0]
331
+ elif isinstance(metadata_data, str):
332
+ try:
333
+ metadata_byte = int(metadata_data)
334
+ except ValueError:
335
+ pass
336
+
337
+ # Extract version from metadata byte
338
+ _, version, _ = unpack_metadata_byte(metadata_byte)
339
+
340
+ # Validate version range
341
+ if not (0 <= version <= _MAX_SCHEMA_VERSION):
342
+ warnings.warn(
343
+ f"Row {idx}: Version {version} extracted from metadata is out of valid range (0-{_MAX_SCHEMA_VERSION}). "
344
+ f"This may indicate corrupted metadata.",
345
+ UserWarning
346
+ )
347
+ continue
348
+
349
+ # Extract mp_config_id
350
+ mp_config_id = row[mp_config_id_column]
351
+ if pd.isna(mp_config_id):
352
+ continue
353
+ mp_config_id = str(mp_config_id)
354
+
355
+ # Lookup cached schema
356
+ cache_key = (mp_config_id, version)
357
+ cached_schema = schema_cache.get(cache_key)
358
+
359
+ # Decode this row's features using cached schema
360
+ try:
361
+ decoded_df = decode_mplog(
362
+ log_data=features_bytes,
363
+ model_proxy_id=mp_config_id,
364
+ version=version,
365
+ format_type=None, # Auto-detect from metadata
366
+ inference_host=inference_host,
367
+ decompress=decompress,
368
+ schema=cached_schema # Pass cached schema to avoid redundant fetches
369
+ )
370
+
371
+ # Add original row metadata to each decoded entity row
372
+ if not decoded_df.empty:
373
+ # Preserve original metadata columns
374
+ metadata_columns = [
375
+ "prism_ingested_at", "prism_extracted_at", "created_at",
376
+ "mp_config_id", "parent_entity", "tracking_id", "user_id",
377
+ "year", "month", "day", "hour"
378
+ ]
379
+
380
+ for col in metadata_columns:
381
+ if col in df.columns:
382
+ decoded_df[col] = row[col]
383
+
384
+ # Update entity_id from entities column if available and matches count
385
+ if "entities" in df.columns and not pd.isna(row["entities"]):
386
+ # entities might be a list or string representation
387
+ entities_val = row["entities"]
388
+ if isinstance(entities_val, str):
389
+ try:
390
+ entities_val = json.loads(entities_val)
391
+ except (json.JSONDecodeError, ValueError):
392
+ entities_val = [entities_val]
393
+ elif not isinstance(entities_val, list):
394
+ entities_val = [entities_val]
395
+
396
+ # Match entities with decoded rows (only if counts match)
397
+ if len(entities_val) == len(decoded_df):
398
+ decoded_df["entity_id"] = entities_val
399
+
400
+ # Add parent_entity if it exists
401
+ if "parent_entity" in df.columns and not pd.isna(row["parent_entity"]):
402
+ parent_val = row["parent_entity"]
403
+ if isinstance(parent_val, str):
404
+ try:
405
+ parent_val = json.loads(parent_val)
406
+ except (json.JSONDecodeError, ValueError):
407
+ parent_val = [parent_val]
408
+ if isinstance(parent_val, list):
409
+ # If list, use first element or join if multiple
410
+ if len(parent_val) == 1:
411
+ decoded_df["parent_entity"] = parent_val[0]
412
+ elif len(parent_val) > 1:
413
+ decoded_df["parent_entity"] = str(parent_val)
414
+ else:
415
+ decoded_df["parent_entity"] = None
416
+ else:
417
+ decoded_df["parent_entity"] = parent_val
418
+
419
+ all_decoded_rows.append(decoded_df)
420
+ except Exception as e:
421
+ # Track error but continue processing other rows
422
+ decode_errors.append((idx, str(e)))
423
+ warnings.warn(f"Failed to decode row {idx}: {e}", UserWarning)
424
+ continue
425
+
426
+ if not all_decoded_rows:
427
+ return pd.DataFrame()
428
+
429
+ # Combine all decoded DataFrames
430
+ result_df = pd.concat(all_decoded_rows, ignore_index=True)
431
+
432
+ # Reorder columns: entity_id first, then metadata columns, then features
433
+ metadata_cols = ["entity_id"]
434
+ for col in ["prism_ingested_at", "prism_extracted_at", "created_at",
435
+ "mp_config_id", "parent_entity", "tracking_id", "user_id",
436
+ "year", "month", "day", "hour"]:
437
+ if col in result_df.columns:
438
+ metadata_cols.append(col)
439
+
440
+ feature_cols = [col for col in result_df.columns if col not in metadata_cols]
441
+ column_order = metadata_cols + feature_cols
442
+
443
+ return result_df[column_order]
@@ -0,0 +1,6 @@
1
+ """Allow package to be executed as a module: python -m inference-logging-client"""
2
+
3
+ from .cli import main
4
+
5
+ if __name__ == "__main__":
6
+ main()