brynq-sdk-brynq 4.2.6.dev0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Note: this release of brynq-sdk-brynq has been flagged as potentially problematic.
- brynq_sdk_brynq/__init__.py +1 -0
- brynq_sdk_brynq/brynq.py +289 -0
- brynq_sdk_brynq/credentials.py +157 -0
- brynq_sdk_brynq/customers.py +88 -0
- brynq_sdk_brynq/interfaces.py +234 -0
- brynq_sdk_brynq/mappings.py +107 -0
- brynq_sdk_brynq/organization_chart.py +251 -0
- brynq_sdk_brynq/roles.py +272 -0
- brynq_sdk_brynq/scenarios.py +3495 -0
- brynq_sdk_brynq/schemas/__init__.py +52 -0
- brynq_sdk_brynq/schemas/credentials.py +37 -0
- brynq_sdk_brynq/schemas/customers.py +108 -0
- brynq_sdk_brynq/schemas/interfaces.py +237 -0
- brynq_sdk_brynq/schemas/organization_chart.py +70 -0
- brynq_sdk_brynq/schemas/roles.py +95 -0
- brynq_sdk_brynq/schemas/scenarios.py +419 -0
- brynq_sdk_brynq/schemas/users.py +126 -0
- brynq_sdk_brynq/source_systems.py +175 -0
- brynq_sdk_brynq/users.py +405 -0
- brynq_sdk_brynq-4.2.6.dev0.dist-info/METADATA +17 -0
- brynq_sdk_brynq-4.2.6.dev0.dist-info/RECORD +23 -0
- brynq_sdk_brynq-4.2.6.dev0.dist-info/WHEEL +5 -0
- brynq_sdk_brynq-4.2.6.dev0.dist-info/top_level.txt +1 -0
brynq_sdk_brynq/scenarios.py @@ -0,0 +1,3495 @@
"""
Scenarios SDK for BrynQ.

This module provides the `Scenarios` class for fetching, parsing, and applying
data transformation scenarios from the BrynQ API. It handles field renaming,
value mapping, and structure validation based on configured scenarios.

This module also contains parsed scenario models (ParsedScenario, Record, FieldProperties)
and parsing logic that transforms raw API responses into usable business logic models.
"""
# imports
from __future__ import annotations

import re
import warnings
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime
from functools import cached_property
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union, Literal

import pandas as pd
import pandera as pa
from brynq_sdk_functions import BrynQPanderaDataFrameModel, Functions
from pandera.typing import Series, String  # type: ignore[attr-defined]
from pydantic import BaseModel, ConfigDict, Field
from pydantic.types import AwareDatetime

from .schemas.scenarios import (
    Scenario,
    ScenarioDetail,
    SourceOrTargetField,
    ScenarioMappingConfiguration,
    FieldType,
    SystemType,
    RelationType,
    CustomSourceOrTargetField,
    LibrarySourceOrTargetField,
    ConfigurationSourceOrTargetField,
    LibraryFieldValues,
    MappingValue,
    ConfigurationType,
    ConfigFieldValues,
    Template,
)

# ============================================================================
# Type Aliases for Parsed Models
# ============================================================================
FieldName = str
PythonicName = str
FieldPropertiesMap = Dict[FieldName, "FieldProperties"]
SourceToTargetMap = Dict[FieldName, List[FieldName]]
TargetToSourceMap = Dict[FieldName, Union[FieldName, List[FieldName]]]


# ============================================================================
# Extraction Helpers
# ============================================================================

def _sanitize_alias(alias: str) -> str:
    """Converts a raw string into a valid Python variable name.

    Converts names like "User ID" to "user_id" and "1st_Name" to "field_1st_name" to fix
    Python syntax issues (spaces, special characters, leading digits). Used in
    `_build_field_properties` and `_build_record` to create Python-safe aliases.

    Args:
        alias: The raw string to sanitize.

    Returns:
        A snake_case string safe for use as a class attribute.
    """
    # Replace non-word characters and leading digits with underscores to create a valid Python variable name
    pythonic_name = re.sub(r"\W|^(?=\d)", "_", alias)
    pythonic_name = re.sub(r"_+", "_", pythonic_name).strip("_").lower()
    if not pythonic_name:
        pythonic_name = "field"
    if pythonic_name[0].isdigit():  # double-check in case the regex missed a leading digit
        pythonic_name = f"field_{pythonic_name}"
    return pythonic_name
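
# Editor's sketch (not part of the released file): the sanitization above is pure
# string work, so its behavior can be shown directly with verified examples:
#     _sanitize_alias("User ID")   -> "user_id"          (space replaced, lowered)
#     _sanitize_alias("1st_Name")  -> "field_1st_name"   (leading digit guarded)
#     _sanitize_alias("e-mail!!")  -> "e_mail"           (runs of '_' collapsed, edges stripped)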

def _extract_names_from_fields(fields: SourceOrTargetField) -> List[str]:
    """Extracts a list of field names from a field object, preserving order.

    The API stores names in different places by field type (technical_name for CUSTOM,
    field/uuid for LIBRARY). This provides a single way to get names regardless of structure.
    Used during scenario parsing to build mapping dictionaries. Order is preserved from the API response.

    Args:
        fields: The SourceOrTargetField object to extract names from.

    Returns:
        List of field names (technical_name for CUSTOM, field/uuid for LIBRARY, uuid for CONFIGURATION) in API order.
        Empty list for FIXED/EMPTY fields.
    """
    if isinstance(fields, CustomSourceOrTargetField):
        names: List[str] = []
        seen = set()  # Track seen names to avoid duplicates while preserving order
        for item in fields.data:
            if item.technical_name and item.technical_name not in seen:
                names.append(item.technical_name)
                seen.add(item.technical_name)
        if not names:
            for item in fields.data:
                uuid = getattr(item, "uuid", None)
                if uuid and uuid not in seen:
                    names.append(str(uuid))
                    seen.add(str(uuid))
        return names

    if isinstance(fields, LibrarySourceOrTargetField):
        names: List[str] = []
        seen = set()  # Track seen names to avoid duplicates while preserving order
        for entry in fields.data:
            # Handle different formats the API may return:
            # - as a plain string: the string itself IS the field name/identifier
            # - as a LibraryFieldValues object: the field name is in the 'field' attribute (preferred)
            #   or the 'uuid' attribute (fallback if 'field' is missing)
            if isinstance(entry, str):
                # String entry is the field name itself
                if entry not in seen:
                    names.append(entry)
                    seen.add(entry)
            elif isinstance(entry, LibraryFieldValues):
                if entry.field and entry.field not in seen:
                    names.append(entry.field)
                    seen.add(entry.field)
                elif entry.uuid and entry.uuid not in seen:
                    names.append(str(entry.uuid))
                    seen.add(str(entry.uuid))
        return names

    if isinstance(fields, ConfigurationSourceOrTargetField):
        names: List[str] = []
        seen = set()  # Track seen names to avoid duplicates while preserving order
        for config_item in fields.data:
            # Configuration fields use UUID as identifier
            uuid_str = str(config_item.uuid)
            if uuid_str not in seen:
                names.append(uuid_str)
                seen.add(uuid_str)
        return names

    return []
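
# Editor's sketch (not part of the released file): every branch above relies on the
# same order-preserving dedupe idiom. Shown in isolation, with no assumptions about
# the schema classes:
def _dedupe_preserving_order(items: List[str]) -> List[str]:
    seen: Set[str] = set()
    out: List[str] = []
    for item in items:
        if item and item not in seen:  # skip empties and repeats, keep first occurrence
            out.append(item)
            seen.add(item)
    return out
# _dedupe_preserving_order(["id", "name", "id", ""]) -> ["id", "name"]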

def _extract_label_from_fields(
    fields: SourceOrTargetField,
    field_name: str
) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Extracts human-readable labels for customer-facing communication.

    CUSTOM fields use 'name' directly; LIBRARY fields have multi-language 'field_label'
    dictionaries. Prioritizes English, then falls back to any available value. Used in
    `_build_field_properties` and `_build_record`.

    Args:
        fields: The SourceOrTargetField object to extract from.
        field_name: The field name to look up.

    Returns:
        Tuple of (preferred label, English label, Dutch label).
    """
    if isinstance(fields, CustomSourceOrTargetField):
        for item in fields.data:
            # Custom fields don't have a multi-language field_label like library fields,
            # so we use 'name' directly as the label (this is what shows up in BrynQ)
            if item.technical_name == field_name or item.uuid == field_name:
                return (item.name, None, None)

    if isinstance(fields, LibrarySourceOrTargetField):
        for entry in fields.data:
            # Handle different formats the API may return:
            # - as a plain string: no label available (skip)
            # - as a LibraryFieldValues object: check field/uuid and extract field_label
            if not isinstance(entry, str) and (entry.field == field_name or entry.uuid == field_name) and entry.field_label:
                if isinstance(entry.field_label, dict):
                    l_en = entry.field_label.get("en")
                    # Return EN if present, else the first available value
                    return (l_en or next(iter(entry.field_label.values()), None), l_en, entry.field_label.get("nl"))

    return (None, None, None)
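
# Editor's sketch (not part of the released file): the label fallback above, in
# isolation. Given a multi-language dict, prefer "en", else take any existing value:
#     field_label = {"nl": "Functietitel"}
#     l_en = field_label.get("en")                                # None
#     preferred = l_en or next(iter(field_label.values()), None)  # "Functietitel"
# A missing or empty field_label falls through to the final (None, None, None).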

def _extract_uuid_from_fields(fields: SourceOrTargetField, field_name: str) -> Optional[str]:
    """Extracts the UUID for a given field name.

    The API's mappingValues reference fields by UUID. This extracts UUIDs so
    `UuidToFieldNameMapper` can convert UUID-based references to field names.

    Args:
        fields: SourceOrTargetField object to extract from.
        field_name: The field name to look up.

    Returns:
        UUID string if found, None otherwise.
    """
    if isinstance(fields, CustomSourceOrTargetField):
        for item in fields.data:
            if item.technical_name == field_name or item.uuid == field_name:
                return item.uuid

    if isinstance(fields, LibrarySourceOrTargetField):
        for entry in fields.data:
            if not isinstance(entry, str) and (entry.field == field_name or entry.uuid == field_name):
                return entry.uuid
    return None


def _extract_schema_from_fields(fields: SourceOrTargetField, field_name: str) -> Optional[str]:
    """Extracts the schema name identifying the source system or category.

    Used when building field properties to store metadata (not used in transformation logic).

    Args:
        fields: SourceOrTargetField object to extract from.
        field_name: The field name to look up.

    Returns:
        Schema name string if found, None otherwise.
        - For CUSTOM fields: returns CustomDataValues.source
        - For LIBRARY fields: returns category.technicalName
    """
    if isinstance(fields, CustomSourceOrTargetField):
        for item in fields.data:
            if item.technical_name == field_name or item.uuid == field_name:
                return item.source

    if isinstance(fields, LibrarySourceOrTargetField):
        for entry in fields.data:
            if not isinstance(entry, str) and (entry.field == field_name or entry.uuid == field_name):
                return entry.category.get("technicalName") if entry.category else None
    return None


def _extract_technical_name_from_fields(fields: SourceOrTargetField, field_name: str) -> Optional[str]:
    """Extracts the technical_name for a given field name.

    Technical names are system-specific identifiers (often numeric/encoded) that differ from
    human-readable names. Used by `UuidToFieldNameMapper` to convert UUID/schema pattern keys
    to field names, and as a fallback alias in Pandera field definitions. Only CUSTOM fields
    have technical names.

    Args:
        fields: SourceOrTargetField object to extract from.
        field_name: The field name to look up (should be the technical_name for CUSTOM fields).

    Returns:
        Technical name string if found, None otherwise.
        - For CUSTOM fields: returns the technical field ID needed for API calls to the system
          (often not human-readable, e.g., "custom_field_2839471293")
        - For LIBRARY fields: returns None (they use schema names instead, not technical identifiers)
    """
    if isinstance(fields, CustomSourceOrTargetField):
        for item in fields.data:
            # Match by technical_name (primary) or uuid (fallback).
            # field_name should be the technical_name extracted by _extract_names_from_fields.
            # Ensure technical_name exists and matches, or fall back to a uuid match.
            if item.technical_name and item.technical_name == field_name:
                return item.technical_name
            elif item.uuid == field_name:
                # If matched by uuid, return the technical_name if it exists
                return item.technical_name if item.technical_name else None
    return None


def _extract_description_from_fields(fields: SourceOrTargetField, field_name: str) -> Optional[str]:
    """Extracts the description explaining what a field represents.

    Used when building field properties to store metadata for documentation.
    Only CUSTOM fields have descriptions.

    Args:
        fields: SourceOrTargetField object to extract from.
        field_name: The field name to look up.

    Returns:
        Description string for CUSTOM fields, None otherwise.
    """
    if isinstance(fields, CustomSourceOrTargetField):
        for item in fields.data:
            if item.technical_name == field_name or item.uuid == field_name:
                return item.description
    return None


def _extract_config_props(fields: SourceOrTargetField, field_name: str) -> Dict[str, Any]:
    """Extracts configuration field properties (question, type, value).

    Used when building field properties for CONFIGURATION field types.
    Extracts the question (as dict, en, nl, and preferred string), config_type, and config_value.

    Args:
        fields: SourceOrTargetField object to extract from.
        field_name: The field name to look up (UUID for CONFIGURATION fields).

    Returns:
        Dictionary with config properties: question, question_dict, question_en, question_nl, config_type, config_value.
        Returns a dict of None values if not a CONFIGURATION field or not found.
    """
    if isinstance(fields, ConfigurationSourceOrTargetField):
        for config_item in fields.data:
            # Match by UUID (configuration fields use UUID as identifier).
            # Convert the UUID to a string for comparison.
            config_uuid_str = str(config_item.uuid)
            if config_uuid_str == field_name or config_item.uuid == field_name:
                question_dict = config_item.question
                question_en = question_dict.get("en") if question_dict else None
                question_nl = question_dict.get("nl") if question_dict else None
                # Preferred question: English if available, else first available, else None
                question = question_en or (next(iter(question_dict.values()), None) if question_dict else None)

                # Get the config_type value (handle both enum and string)
                config_type_value = config_item.type.value if hasattr(config_item.type, 'value') else str(config_item.type)

                return {
                    "question": question,
                    "question_dict": question_dict,
                    "question_en": question_en,
                    "question_nl": question_nl,
                    "config_type": config_type_value,
                    "config_value": config_item.value,
                }

    # Return a dict of None values for non-configuration fields
    return {
        "question": None,
        "question_dict": None,
        "question_en": None,
        "question_nl": None,
        "config_type": None,
        "config_value": None,
    }


def _parse_config_value(config_item: ConfigFieldValues) -> Optional[str]:
    """Convert a ConfigFieldValues object into a normalized string representation."""
    cfg_type = getattr(config_item.type, "value", str(config_item.type))
    value = config_item.value

    # Attachment: explicitly suppressed
    if cfg_type == ConfigurationType.ATTACHMENT.value:
        return None

    # Selection: extract English labels if the payload is a list of dicts
    if cfg_type == ConfigurationType.SELECTION.value:
        if isinstance(value, list):
            labels = [v.get("en", "") for v in value if isinstance(v, dict) and "en" in v]
            return ", ".join(labels) if labels else str(value)
        return str(value)

    # Datepicker: normalize a single date or a range
    if cfg_type == ConfigurationType.DATEPICKER.value:
        def fmt(dt):
            return dt.isoformat() if isinstance(dt, (datetime, AwareDatetime)) else str(dt)

        if isinstance(value, list):
            parts = [fmt(v) for v in value]
            return " - ".join(parts) if parts else None
        return fmt(value) if value is not None else None

    # Simple scalar types: TEXT, EMAIL, NUMBER, RICHTEXT
    if cfg_type in {
        ConfigurationType.TEXT.value,
        ConfigurationType.EMAIL.value,
        ConfigurationType.NUMBER.value,
        ConfigurationType.RICHTEXT.value,
    }:
        return str(value) if value is not None else None

    # Fallback
    return str(value) if value is not None else None
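
# Editor's sketch (not part of the released file): the DATEPICKER branch above,
# traced with plain stdlib datetimes (no BrynQ types involved):
#     fmt(datetime(2024, 1, 1))              -> "2024-01-01T00:00:00"
#     value = [datetime(2024, 1, 1), datetime(2024, 6, 30)]
#     " - ".join(fmt(v) for v in value)      -> "2024-01-01T00:00:00 - 2024-06-30T00:00:00"
# A single (non-list) value passes through fmt() alone; None stays None.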


# ============================================================================
# Internal Schema Models (The "Useful" Objects)
# ============================================================================

class SourceTargetFields(BaseModel):
    """Nested structure for source or target field metadata.

    Provides organized access to field information for either the source or target
    fields of a scenario. Access via `scenario.source` or `scenario.target`.

    Example:
        >>> scenario.source.field_names
        ['employee_id', 'first_name', 'last_name']
        >>> scenario.source.unique_fields
        ['employee_id']
        >>> scenario.source.field_properties[0].alias
        'employee_id'
        >>> scenario.target.custom_fields
        ['custom_field_1', 'custom_field_2']
        >>> len(scenario.source)
        3
        >>> print(scenario.source)
        SourceTargetFields(type='source', fields=3)

    Attributes:
        type: Either 'source' or 'target', indicating the system type
        field_names: List of all field names for this system type (source or target)
        unique_fields: List of field names that are part of unique constraints
        required_fields: List of field names that are required
        field_properties: List of FieldProperties objects containing full metadata for all fields
        custom_fields: List of field names that are custom fields (field_type='CUSTOM')
        library_fields: List of field names that are library fields (field_type='LIBRARY')
        fields_with_logic: List of field names that have transformation logic defined
    """
    type: Literal["source", "target"]
    field_names: List[str]
    unique_fields: List[str]
    required_fields: List[str]
    field_properties: List[FieldProperties]
    custom_fields: List[str]
    library_fields: List[str]
    fields_with_logic: List[str]

    def __iter__(self) -> Iterator[FieldProperties]:
        """Make SourceTargetFields iterable, yielding FieldProperties objects.

        Allows direct iteration: `for field in scenario.source:`

        Example:
            >>> for field in scenario.source:
            ...     print(f"{field.alias} (required: {field.required})")
            employee_id (required: True)
            first_name (required: False)

        Yields:
            FieldProperties objects for each field
        """
        return iter(self.field_properties)

    def __len__(self) -> int:
        """Return the number of fields.

        Example:
            >>> len(scenario.source)
            3

        Returns:
            Number of field names
        """
        return len(self.field_names)

    def __str__(self) -> str:
        """Return a string representation for print().

        Example:
            >>> print(scenario.source)
            SourceTargetFields(type='source', fields=3)

        Returns:
            String representation showing type and field count
        """
        return f"SourceTargetFields(type={self.type!r}, fields={len(self.field_names)})"

    def __repr__(self) -> str:
        """Return a string representation of SourceTargetFields.

        Example:
            >>> repr(scenario.source)
            "SourceTargetFields(type='source', fields=3)"

        Returns:
            String representation showing type and field count
        """
        return f"SourceTargetFields(type={self.type!r}, fields={len(self.field_names)})"


class FieldProperties(BaseModel):
    """Metadata for a single field in a mapping.

    How to use:
        Access this via `scenario.field_name`. It provides details on
        validation (unique, required) and origins (schema, uuid).

    Example:
        >>> scenario = ParsedScenario(...)
        >>> scenario.customer_id.required
        True
        >>> scenario.customer_id.unique
        False
        >>> scenario['customer_id'].label
        'Customer ID'

    Attributes:
        logic: Transformation logic string as defined in the BrynQ template
        unique: Whether this field is part of the unique key constraint
        required: Whether this field is required (cannot be empty/null)
        mapping: Value mapping dictionary (empty for individual fields; the actual mapping lives at Record level)
        system_type: Indicates whether this is a 'source' or 'target' field
        field_type: Indicates the field origin type: 'CUSTOM' or 'LIBRARY'
        alias: The technical field name/identifier (pythonic name for the field)
        uuid: The UUID identifier used in mapping values
        schema_name: For LIBRARY fields: category.technicalName. For CUSTOM fields: CustomDataValues.source
        technical_name: For CUSTOM fields: CustomDataValues.technical_name. Not populated for LIBRARY fields
        label: Human-readable field name displayed in BrynQ
        label_en: English human-readable field name
        label_nl: Dutch human-readable field name
        description: Business description/purpose of the field (for custom fields)
    """
    model_config = ConfigDict(extra="allow", frozen=True)

    # Core mapping properties, straight from the API
    logic: Optional[str] = None
    unique: bool = False
    required: bool = False
    mapping: Dict[str, Any] = Field(default_factory=dict)

    # Identification
    system_type: Optional[str] = None  # 'source' or 'target'
    field_type: Optional[str] = None  # 'CUSTOM' or 'LIBRARY'
    alias: Optional[str] = None  # Python variable name
    uuid: Optional[str] = None  # API ID

    # Context
    schema_name: Optional[str] = Field(default=None, alias="schema")
    technical_name: Optional[str] = None
    label: Optional[str] = None
    label_dict: Optional[Dict[str, str]] = None
    label_en: Optional[str] = None
    label_nl: Optional[str] = None
    description: Optional[str] = None

    # Config-related optional fields
    question: Optional[str] = None
    question_dict: Optional[Dict[str, str]] = None
    question_en: Optional[str] = None
    question_nl: Optional[str] = None
    config_type: Optional[str] = None
    config_value: Optional[Any] = None

    def __repr__(self) -> str:
        """A human-friendly string representation.

        Example:
            >>> repr(field_props)
            "<FieldProperties alias='customer_id' system_type='source' field_type='CUSTOM'>"

        Returns:
            String representation showing the pythonic field name/alias, system type, and field type
        """
        alias_str = self.alias if self.alias else 'unnamed'
        system_type_str = self.system_type if self.system_type else 'unknown'
        field_type_str = self.field_type if self.field_type else 'unknown'
        return f"<FieldProperties alias='{alias_str}' system_type='{system_type_str}' field_type='{field_type_str}'>"

    def __str__(self) -> str:
        """String representation (used by print()). Delegates to __repr__."""
        return self.__repr__()
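
# Editor's sketch (not part of the released file): every attribute above is
# optional with a default, so a hypothetical field can be built directly, and
# frozen=True makes it immutable afterwards:
#     fp = FieldProperties(alias="customer_id", system_type="source",
#                          field_type="CUSTOM", required=True)
#     repr(fp)    -> "<FieldProperties alias='customer_id' system_type='source' field_type='CUSTOM'>"
#     fp.required -> True; assigning fp.required = False raises a ValidationError.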


class Record(BaseModel):
    """Represents a relationship between source and target fields. A Record is the unit of a scenario; a scenario is a collection of records.

    How to use:
        Iterate over `scenario.records`. Each record says:
        "Take these source fields, apply this logic/mapping, and put the result in these target fields."

    Example:
        >>> scenario = ParsedScenario(...)
        >>> for record in scenario.records:
        ...     print(f"Source: {record.source.field_names} -> Target: {record.target.field_names}")
        Source: ['first_name'] -> Target: ['firstname']
        >>> record = scenario.records[0]
        >>> for field in record.source:
        ...     print(f"{field.alias} (required: {field.required})")
        first_name (required: True)
        >>> record.source.unique_fields
        ['first_name']
        >>> record.target.required_fields
        ['firstname']

    Attributes:
        logic: Transformation logic string as defined in the BrynQ template
        unique: Whether this mapping is part of the unique key constraint
        required: Whether this mapping is required (cannot be empty/null)
        source: SourceTargetFields object containing source field metadata (field_names, unique_fields, required_fields, field_properties, etc.)
        target: SourceTargetFields object containing target field metadata (field_names, unique_fields, required_fields, field_properties, etc.)
        source_field_types: Maps source field name to its type (CUSTOM, LIBRARY, FIXED, EMPTY)
        target_field_types: Maps target field name to its type (CUSTOM, LIBRARY, FIXED, EMPTY)
        relation_type: Type of mapping relationship: 'one_to_one', 'one_to_many', 'many_to_one', or 'many_to_many'
        mapping: Value mapping configuration for translating source values to target values. Set to False when the mapping has an empty values list
        id: Unique identifier for this mapping record
        fixed_source_value: If the source type is FIXED, the fixed literal value to use for all target fields
    """
    model_config = ConfigDict(extra="allow", frozen=True)

    # Inherited properties applied to the whole group
    logic: Optional[str] = None
    unique: bool = False
    required: bool = False
    mapping: Union["ScenarioMappingConfiguration", bool, None] = None
    id: Optional[str] = None
    fixed_source_value: Optional[str] = None

    # The fields involved in this relationship
    source: SourceTargetFields
    target: SourceTargetFields
    source_field_types: Dict[str, str] = Field(default_factory=dict)
    target_field_types: Dict[str, str] = Field(default_factory=dict)

    # inferred
    relation_type: Literal["one_to_one", "one_to_many", "many_to_one", "many_to_many"]

    # Record dunders
    def __iter__(self):
        """Enable iteration over all fields (both source and target).

        Uses the `source` and `target` attributes internally.

        Example:
            >>> for field in record:
            ...     print(field.label)
            First Name
            Last Name
            >>> list(record)
            [FieldProperties(...), FieldProperties(...)]
        """
        return iter(list(self.source.field_properties) + list(self.target.field_properties))

    def __repr__(self) -> str:
        """A human-friendly string representation.

        Example:
            >>> repr(record)
            "<Record id='rec_123' relation_type='one_to_one' source=[<FieldProperties alias='first_name'>, ...] -> target=[<FieldProperties alias='firstname'>, ...]>"

        Returns:
            String representation of the Record
        """
        # Build the source field representation using FieldProperties
        source_repr = [repr(field) for field in self.source.field_properties]
        source_str = f"[{', '.join(source_repr)}]" if source_repr else "[]"

        # Build the target field representation using FieldProperties
        target_repr = [repr(field) for field in self.target.field_properties]
        target_str = f"[{', '.join(target_repr)}]" if target_repr else "[]"

        # Build the representation string
        id_str = f"id='{self.id}' " if self.id else ""
        return (
            f"<Record {id_str}relation_type='{self.relation_type}' "
            f"source={source_str} -> target={target_str}>"
        )

    def __str__(self) -> str:
        """String representation (used by print()). Delegates to __repr__."""
        return self.__repr__()
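
# Editor's sketch (not part of the released file): because __iter__ chains the
# source properties before the target properties, a record with one source and
# one target field behaves like:
#     [f.alias for f in record] -> ["first_name", "firstname"]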


# ============================================================================
# Parsing Logic (The Engine)
# ============================================================================

@dataclass
class UuidToFieldNameConverter:
    """Bundles all data needed to convert value mapping keys from UUIDs/schema patterns to field names.

    The API returns value mappings where BOTH the input and output dictionaries use field identifier
    keys (UUIDs like "ea06ce9f-e10e-484e-bdf0-ec58087f15c5" or schema.name patterns like "work_schema-title").
    We MUST convert these identifier keys to readable field names (like {"title": "CEO"}) because
    the rest of the code expects field names, not UUIDs or schema patterns. This dataclass groups
    all the lookup data needed for that conversion, avoiding passing 5+ separate arguments.

    Created in ScenarioParser.parse() and passed to UuidToFieldNameMapper.__init__().

    Attributes:
        uuid_keyed_value_mappings: The value mappings that currently use field identifier keys (UUIDs/schema patterns)
            and need conversion to field names. Both input and output dictionaries have identifier keys.
        source_names: List of source field names (used to resolve UUIDs and validate keys), preserving API order.
        target_names: List of target field names (used to resolve UUIDs and validate keys), preserving API order.
        props: Dictionary mapping field names to FieldProperties (contains UUID-to-name lookups).
        detail_model: The scenario detail model with source/target field definitions.
    """
    uuid_keyed_value_mappings: Optional[ScenarioMappingConfiguration]
    source_names: List[str]
    target_names: List[str]
    props: FieldPropertiesMap
    detail_model: ScenarioDetail
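
# Editor's note (not part of the released file): bundling the five lookup inputs
# in a plain dataclass keeps UuidToFieldNameMapper's signature to one argument.
# A hypothetical construction mirrors what ScenarioParser.parse() does:
#     converter = UuidToFieldNameConverter(
#         uuid_keyed_value_mappings=detail_model.mapping,
#         source_names=source_names, target_names=target_names,
#         props=props, detail_model=detail_model,
#     )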


class UuidToFieldNameMapper:
    """Converts value mapping keys from UUIDs/schema patterns to readable field names.

    The API returns value mappings where BOTH the input and output dictionaries use field identifier
    keys (UUIDs like "ea06ce9f-e10e-484e-bdf0-ec58087f15c5" or schema.name patterns like "work_schema-title").
    This class converts those identifier keys to field names (like {"title": "CEO"}) because
    the rest of the codebase expects field names, not UUIDs or schema patterns. Uses multiple
    lookup strategies to handle API inconsistencies.
    """

    def __init__(self, uuid_converter: UuidToFieldNameConverter):
        """Initialize the mapper with all data needed to convert UUID/schema pattern keys to field names.

        Args:
            uuid_converter: Contains UUID-keyed value mappings, field names, properties, and the detail model.
                Created in ScenarioParser.parse(); provides all lookup data needed to convert
                field identifier keys (UUIDs like "ea06ce9f..." or schema patterns like "work_schema-title")
                to readable field names (like "title"). Used to convert keys in BOTH input and output dictionaries.
        """
        # Store all data needed to convert UUID/schema pattern keys in value mappings to field names
        self.uuid_converter = uuid_converter
        self.source_uuid_to_field: Dict[str, str] = {}
        self.target_uuid_to_field: Dict[str, str] = {}
        self.source_technical_to_pythonic: Dict[str, str] = {}
        self.target_technical_to_pythonic: Dict[str, str] = {}
        self._build_mappings()

    def _build_mappings(self) -> None:
        """Builds the lookup dictionaries needed for translation.

        Strategies:
            1. Technical Names -> Python Aliases (for CUSTOM fields).
            2. UUIDs -> Python Aliases (for all fields, using props as the source of truth).
        """
        # Strategy 1: Map Technical Names -> Python Aliases
        self._map_technical_names(
            model=self.uuid_converter.detail_model.source,
            names=self.uuid_converter.source_names,
            mapping=self.source_technical_to_pythonic,
            system_type=SystemType.SOURCE
        )
        self._map_technical_names(
            model=self.uuid_converter.detail_model.target,
            names=self.uuid_converter.target_names,
            mapping=self.target_technical_to_pythonic,
            system_type=SystemType.TARGET
        )

        # Strategy 2: Map UUIDs -> Python Aliases
        self._map_uuids(
            names=self.uuid_converter.source_names,
            tech_map=self.source_technical_to_pythonic,
            uuid_map=self.source_uuid_to_field
        )
        self._map_uuids(
            names=self.uuid_converter.target_names,
            tech_map=self.target_technical_to_pythonic,
            uuid_map=self.target_uuid_to_field
        )

    def _map_technical_names(
        self,
        model: SourceOrTargetField,
        names: List[str],
        mapping: Dict[str, str],
        system_type: SystemType
    ) -> None:
        """Maps technical names to python aliases for custom fields."""
        if not isinstance(model, CustomSourceOrTargetField):
            return

        names_set = set(names)  # Convert to set for fast lookup
        for item in model.data:
            if item.technical_name not in names_set:
                continue

            # Find the matching pythonic name in props via UUID
            for py_name, props in self.uuid_converter.props.items():
                if props.system_type == system_type.value and props.uuid == item.uuid:
                    mapping[item.technical_name] = py_name
                    break

    def _map_uuids(
        self,
        names: List[str],
        tech_map: Dict[str, str],
        uuid_map: Dict[str, str]
    ) -> None:
        """Maps UUIDs to python aliases using props."""
        for name in names:
            py_name = tech_map.get(name, name)
            props = self.uuid_converter.props.get(py_name)
            if props and props.uuid:
                uuid_map[props.uuid] = py_name

    def convert_key(self, key: str, direction: Literal["source", "target"]) -> str:
        """Converts a single API mapping key to a pythonic field name.

        This helper method handles API inconsistency by trying multiple fallback strategies:
        1. UUID lookup (most reliable - direct match)
        2. Name lookup (handles technical names and pythonic names)
        3. Pattern matching (handles schema.name or schema-name patterns)

        Uses internal lookup maps (`source_uuid_to_field`, etc.) populated during initialization.

        Example:
            >>> mapper.convert_key('be3a4c1e...', 'source')
            'gender'

        Args:
            key: The raw key from the API (could be a UUID, a name, or a schema.name pattern).
            direction: 'source' or 'target'.

        Returns:
            The best matching pythonic field name.
        """
        if direction == "source":
            uuid_map = self.source_uuid_to_field
            tech_map = self.source_technical_to_pythonic
            valid_names = self.uuid_converter.source_names
        else:
            uuid_map = self.target_uuid_to_field
            tech_map = self.target_technical_to_pythonic
            valid_names = self.uuid_converter.target_names

        # Strategy 1: Direct UUID lookup (most reliable)
        if key in uuid_map:
            return uuid_map[key]

        # Strategy 2: Direct name lookup
        if key in valid_names:
            return tech_map.get(key, key)
        if key in tech_map.values():
            return key

        # Strategy 3: Pattern matching (heuristic)
        # Handles keys like 'schema_name.email' by checking suffixes
        all_names = set(tech_map.values()) | set(valid_names)
        for fname in all_names:
            if key.endswith(f'.{fname}') or key.endswith(f'-{fname}'):
                return tech_map.get(fname, fname)

        # Fallback: return the original key
        return key

    def convert_mapping_config(self) -> Optional[ScenarioMappingConfiguration]:
        """Converts value mapping keys from field identifiers to field names.

        The API returns value mappings where BOTH the input and output dictionaries use field identifier
        keys (UUIDs like "ea06ce9f-e10e-484e-bdf0-ec58087f15c5" or schema.name patterns like "work_schema-title").
        This method converts all identifier keys to readable field names (like {"title": "CEO"})
        because the rest of the codebase expects field names, not UUIDs or schema patterns.

        Example:
            >>> converted = mapper.convert_mapping_config()
            >>> converted.values[0].input
            {'title': 'CEO'}  # Field identifier key converted to field name
            >>> converted.values[0].output
            {'job_code': '96'}  # UUID key converted to field name

        Returns:
            ScenarioMappingConfiguration with field name keys (not UUIDs or schema patterns),
            or None if no mapping config exists.
        """
        if not self.uuid_converter.uuid_keyed_value_mappings or not self.uuid_converter.uuid_keyed_value_mappings.values:
            return self.uuid_converter.uuid_keyed_value_mappings

        # Convert UUID/schema pattern keys to field names in each value mapping
        converted_values = []
        for val in self.uuid_converter.uuid_keyed_value_mappings.values:
            # Convert source field identifier keys (UUIDs/schema patterns) to field names
            new_in = {
                self.convert_key(key=k, direction="source"): v
                for k, v in val.input.items()
            }
            # Convert target field identifier keys (UUIDs/schema patterns) to field names
            new_out = {
                self.convert_key(key=k, direction="target"): v
                for k, v in val.output.items()
            }
            converted_values.append(MappingValue(input=new_in, output=new_out))

        return ScenarioMappingConfiguration(
            values=converted_values,
            default_value=self.uuid_converter.uuid_keyed_value_mappings.default_value
        )
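
# Editor's sketch (not part of the released file): Strategy 3 of convert_key() in
# isolation, so the suffix heuristic can be eyeballed without the class around it:
def _match_by_suffix(key: str, known_names: Set[str]) -> str:
    # 'work_schema-title' and 'work_schema.title' both resolve to 'title'
    for fname in known_names:
        if key.endswith(f'.{fname}') or key.endswith(f'-{fname}'):
            return fname
    return key  # same fallback as convert_key(): leave unknown keys untouched
# _match_by_suffix("work_schema-title", {"title", "job_code"}) -> "title"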


class ScenarioParser:
    """Orchestrates the parsing of a raw scenario dictionary.

    This class breaks the logic into three distinct phases:
    1. Extraction: Get raw names from the polymorphic API response.
    2. Property Building: Create metadata objects (`FieldProperties`) for every field.
    3. Linking: Create `Record` objects that link sources to targets.
    """

    def __init__(self):
        """Initialize the parser."""
        pass

    def parse(self, scenario: Dict[str, Any]) -> "ParsedScenario":
        """Parse a raw API scenario dictionary into a ParsedScenario object.

        Args:
            scenario: Raw scenario dictionary from the BrynQ API

        Returns:
            ParsedScenario object with all parsed data
        """
        details = scenario.get("details", [])

        # Accumulators
        source_to_target = defaultdict(set)
        target_to_source = defaultdict(set)
        props: FieldPropertiesMap = {}
        value_mappings = defaultdict(list)
        aliases = set()
        alias_order = []

        records = []

        # 'details' is the raw API name for what we call 'records' here.
        for detail in details:
            detail_model = ScenarioDetail.model_validate(detail)

            # Phase 1: extract names
            source_names = _extract_names_from_fields(detail_model.source)
            target_names = _extract_names_from_fields(detail_model.target)

            for source_name in source_names:
                source_to_target[source_name].update(target_names)
            for target_name in target_names:
                target_to_source[target_name].update(source_names)

            # Phase 2: Property Building
            # We use the same method for source and target to avoid code duplication; we just need separate types.

            # Identify reserved keys from the target (library fields) to avoid collisions with source custom fields
            reserved_keys = set()
            if detail_model.target.type == FieldType.LIBRARY.value:
                reserved_keys = set(target_names)  # Convert list to set for fast lookup

            base_props = FieldProperties.model_validate(detail)
            self._build_field_properties(
                fields=detail_model.source,
                names=source_names,
                sys_type=SystemType.SOURCE,
                base=base_props,
                props=props,
                aliases=aliases,
                alias_order=alias_order,
                reserved=reserved_keys
            )
            self._build_field_properties(
                fields=detail_model.target,
                names=target_names,
                sys_type=SystemType.TARGET,
                base=base_props,
                props=props,
                aliases=aliases,
                alias_order=alias_order
            )

            # Phase 3: Linking & Mapping Conversion
            # Convert value mapping keys from UUIDs/schema patterns to field names (the API uses identifiers; the code expects field names)
            uuid_converter = UuidToFieldNameConverter(
                uuid_keyed_value_mappings=detail_model.mapping,
                source_names=source_names,
                target_names=target_names,
                props=props,
                detail_model=detail_model
            )
            converted_map = UuidToFieldNameMapper(uuid_converter).convert_mapping_config()

            if converted_map:
                # If values exist, store them in the lookup map
                if converted_map.values:
                    # Preserve order from the API, but sort for consistent key generation
                    key = '|'.join(sorted(source_names)) if source_names else detail_model.id
                    value_mappings[key].append(converted_map)
                # If the map exists but is empty, treat it as False
                else:
                    converted_map = False

            records.append(
                self._build_record(
                    detail=detail_model,
                    source_names=source_names,
                    target_names=target_names,
                    base=base_props,
                    props=props,
                    mapping_cfg=converted_map
                )
            )

        # Final Phase: Assembly
        return self._build_parsed_scenario(
            raw=scenario,
            records=records,
            source_to_target_map=source_to_target,
            target_to_source_map=target_to_source,
            props=props,
            source_to_value_mappings=value_mappings
        )

    def _build_field_properties(
        self,
        fields: SourceOrTargetField,
        names: List[str],
        sys_type: SystemType,
        base: FieldProperties,
        props: FieldPropertiesMap,
        aliases: Set[str],
        alias_order: List[str],
        reserved: Optional[Set[str]] = None
    ) -> None:
        """Creates FieldProperties for a set of fields and registers them.

        Args:
            fields: SourceOrTargetField object containing field definitions
            names: List of field names to process
            sys_type: Either SystemType.SOURCE or SystemType.TARGET
            base: Base FieldProperties shared across fields in this mapping
            props: Dictionary to store field properties (modified in place)
            aliases: Set to track custom field aliases (modified in place)
            alias_order: List to maintain custom alias order (modified in place)
            reserved: Set of reserved keys to avoid collisions (e.g. target library names)
        """
        for name in names:
            label, l_en, l_nl = _extract_label_from_fields(fields, name)  # only returned for library/custom

            # Determine the Python alias
            f_type_str = fields.type.value if isinstance(fields.type, FieldType) else fields.type
            is_custom = (f_type_str == FieldType.CUSTOM.value)

            # Only sanitize custom fields; libraries use fixed keys
            alias = _sanitize_alias(label or name) if is_custom else name
            key = alias if is_custom else name

            # Handle collisions for custom fields if the key is reserved (e.g. used by a target library field)
            if is_custom and reserved and key in reserved:
                alias = f"{alias}_{sys_type.value}"
                key = alias

            config_props = _extract_config_props(fields, name)

            props[key] = base.model_copy(update={
                "system_type": sys_type.value,
                "field_type": f_type_str,
                "alias": alias,
                "uuid": _extract_uuid_from_fields(fields, name),
                "schema_name": _extract_schema_from_fields(fields, name),
                "technical_name": _extract_technical_name_from_fields(fields, name),
                "label": label,
                "label_en": l_en,
                "label_nl": l_nl,
                "description": _extract_description_from_fields(fields, name),
                "mapping": {},  # Mappings are stored at Record level, not field level
                # config fields
                **config_props
            })

            if is_custom and key not in aliases:
                aliases.add(key)
                alias_order.append(key)

    def _build_record(
        self,
        detail: ScenarioDetail,
        source_names: List[str],
        target_names: List[str],
        base: FieldProperties,
        props: FieldPropertiesMap,
        mapping_cfg
    ) -> Record:
        """Creates a Record object representing the relationship.

        Args:
            detail: Validated ScenarioDetail object
            source_names: List of source field names (preserving API order)
            target_names: List of target field names (preserving API order)
            base: Base FieldProperties for this mapping
            props: Dictionary of field properties
            mapping_cfg: Converted mapping configuration

        Returns:
            Record object representing this mapping
        """
        # Helper to retrieve the correct prop keys, preserving order
        def _get_keys(names, field_obj, sys_type: SystemType):
            keys = []
            is_custom = (field_obj.type == FieldType.CUSTOM.value)
            for n in names:  # Iterate in order
                if is_custom:
                    # For custom fields, look up the actual key from props
                    uuid = _extract_uuid_from_fields(field_obj, n)
                    technical_name = _extract_technical_name_from_fields(field_obj, n)

                    # First try: the sanitized alias
                    lbl, _, _ = _extract_label_from_fields(field_obj, n)
                    sanitized_alias = _sanitize_alias(lbl or n)
                    if sanitized_alias in props:
                        prop = props[sanitized_alias]
                        if prop.system_type == sys_type.value and prop.field_type == FieldType.CUSTOM.value:
                            # Verify it's the same field by UUID or technical_name
                            if (uuid and prop.uuid == uuid) or (technical_name and prop.technical_name == technical_name):
                                keys.append(sanitized_alias)
                                continue

                    # Second try: find a matching key in props by UUID or technical_name
                    found_key = None
                    for key, prop in props.items():
                        if prop.system_type == sys_type.value and prop.field_type == FieldType.CUSTOM.value:
                            if (uuid and prop.uuid == uuid) or (technical_name and prop.technical_name == technical_name):
                                found_key = key
                                break

                    if found_key:
                        keys.append(found_key)
                    else:
                        # Fallback: use the sanitized alias (shouldn't happen if props were built correctly)
                        keys.append(sanitized_alias)
                else:
                    # For library/configuration fields, the name itself is the key
                    keys.append(n)
            return keys

        source_keys = _get_keys(source_names, detail.source, SystemType.SOURCE)
        target_keys = _get_keys(target_names, detail.target, SystemType.TARGET)

        # Determine cardinality
        rel = RelationType.ONE_TO_ONE.value
        if len(source_names) > 1 and len(target_names) > 1:
            rel = RelationType.MANY_TO_MANY.value
        elif len(source_names) > 1:
            rel = RelationType.MANY_TO_ONE.value
        elif len(target_names) > 1:
            rel = RelationType.ONE_TO_MANY.value

        # Extract fixed_source_value based on the source type
        fixed_source_value = None
        if detail.source.type == "FIXED":
            # For FIXED type, use the data directly (it's a string)
            fixed_source_value = detail.source.data
        elif detail.source.type == "CONFIGURATION":
            # For CONFIGURATION type, parse the config value according to its type
            if isinstance(detail.source, ConfigurationSourceOrTargetField) and detail.source.data:
                # Get the first config item (for one_to_one/one_to_many there's typically one)
                config_item = detail.source.data[0]
                fixed_source_value = _parse_config_value(config_item)

        # Build FieldProperties lists
        source_field_props = [props[k] for k in source_keys if k in props]
        target_field_props = [props[k] for k in target_keys if k in props]

        # Build SourceTargetFields instances
        source_unique_fields = [k for k in source_keys if k in props and props[k].unique]
        source_required_fields = [k for k in source_keys if k in props and props[k].required]
        source_custom_fields = [k for k in source_keys if k in props and props[k].field_type == FieldType.CUSTOM.value]
        source_library_fields = [k for k in source_keys if k in props and props[k].field_type == FieldType.LIBRARY.value]
        source_fields_with_logic = [k for k in source_keys if k in props and props[k].logic is not None]

        target_unique_fields = [k for k in target_keys if k in props and props[k].unique]
        target_required_fields = [k for k in target_keys if k in props and props[k].required]
        target_custom_fields = [k for k in target_keys if k in props and props[k].field_type == FieldType.CUSTOM.value]
        target_library_fields = [k for k in target_keys if k in props and props[k].field_type == FieldType.LIBRARY.value]
        target_fields_with_logic = [k for k in target_keys if k in props and props[k].logic is not None]

        source_stf = SourceTargetFields(
            type="source",
            field_names=source_keys,
            unique_fields=source_unique_fields,
            required_fields=source_required_fields,
            field_properties=source_field_props,
            custom_fields=source_custom_fields,
            library_fields=source_library_fields,
            fields_with_logic=source_fields_with_logic
        )

        target_stf = SourceTargetFields(
            type="target",
            field_names=target_keys,
            unique_fields=target_unique_fields,
            required_fields=target_required_fields,
            field_properties=target_field_props,
            custom_fields=target_custom_fields,
            library_fields=target_library_fields,
            fields_with_logic=target_fields_with_logic
        )

        return Record(
            logic=base.logic,
            unique=base.unique,
            required=base.required,
            source_field_types={k: detail.source.type for k in source_keys},
            target_field_types={k: detail.target.type for k in target_keys},
            source=source_stf,
            target=target_stf,
            relation_type=rel,
            mapping=mapping_cfg,
            id=detail.id,
            fixed_source_value=fixed_source_value
        )
|
|
1189
|
+
|
|
1190
|
+
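# Illustrative sketch (field names hypothetical): for a 2:1 mapping the Record
# built above would carry source.field_names=['first_name', 'last_name'],
# target.field_names=['full_name'] and relation_type='many_to_one'.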
def _build_parsed_scenario(
|
|
1191
|
+
self,
|
|
1192
|
+
raw,
|
|
1193
|
+
records,
|
|
1194
|
+
source_to_target_map,
|
|
1195
|
+
target_to_source_map,
|
|
1196
|
+
props,
|
|
1197
|
+
source_to_value_mappings
|
|
1198
|
+
):
|
|
1199
|
+
"""Constructs the final immutable ParsedScenario object.
|
|
1200
|
+
|
|
1201
|
+
Args:
|
|
1202
|
+
raw: Original scenario dictionary
|
|
1203
|
+
records: List of Record objects
|
|
1204
|
+
source_to_target_map: Source to target mapping dictionary
|
|
1205
|
+
target_to_source_map: Target to source mapping dictionary
|
|
1206
|
+
props: Field properties dictionary
|
|
1207
|
+
source_to_value_mappings: Source field to value mappings dictionary
|
|
1208
|
+
|
|
1209
|
+
Returns:
|
|
1210
|
+
ParsedScenario object
|
|
1211
|
+
"""
|
|
1212
|
+
# Sort maps for deterministic behavior
|
|
1213
|
+
s_to_t = {k: sorted(v) for k, v in source_to_target_map.items()}
|
|
1214
|
+
t_to_s = {k: sorted(v) for k, v in target_to_source_map.items()}
|
|
1215
|
+
|
|
1216
|
+
# Only include custom fields that are source fields (based on system_type)
|
|
1217
|
+
custom_fields = {k: v for k, v in props.items()
|
|
1218
|
+
if v.field_type == FieldType.CUSTOM.value
|
|
1219
|
+
and v.system_type == SystemType.SOURCE.value}
|
|
1220
|
+
custom_model = ParsedScenario._build_custom_field_model(custom_fields) if custom_fields else None
|
|
1221
|
+
|
|
1222
|
+
# Build unique and required fields (all fields, regardless of source/target)
|
|
1223
|
+
unique_fields = [fid for fid, p in props.items() if p.unique]
|
|
1224
|
+
required_fields = [fid for fid, p in props.items() if p.required]
|
|
1225
|
+
|
|
1226
|
+
# Build source and target unique/required fields separately
|
|
1227
|
+
source_field_names = [k for k, v in props.items() if v.system_type == SystemType.SOURCE.value]
|
|
1228
|
+
source_unique_fields = [k for k, v in props.items() if v.unique and v.system_type == SystemType.SOURCE.value]
|
|
1229
|
+
source_required_fields = [k for k, v in props.items() if v.required and v.system_type == SystemType.SOURCE.value]
|
|
1230
|
+
source_field_properties = [v for k, v in props.items() if v.system_type == SystemType.SOURCE.value]
|
|
1231
|
+
source_custom_fields = [k for k, v in props.items() if v.system_type == SystemType.SOURCE.value and v.field_type == FieldType.CUSTOM.value]
|
|
1232
|
+
source_library_fields = [k for k, v in props.items() if v.system_type == SystemType.SOURCE.value and v.field_type == FieldType.LIBRARY.value]
|
|
1233
|
+
source_fields_with_logic = [k for k, v in props.items() if v.system_type == SystemType.SOURCE.value and v.logic is not None]
|
|
1234
|
+
|
|
1235
|
+
target_field_names = [k for k, v in props.items() if v.system_type == SystemType.TARGET.value]
|
|
1236
|
+
target_unique_fields = [k for k, v in props.items() if v.unique and v.system_type == SystemType.TARGET.value]
|
|
1237
|
+
target_required_fields = [k for k, v in props.items() if v.required and v.system_type == SystemType.TARGET.value]
|
|
1238
|
+
target_field_properties = [v for k, v in props.items() if v.system_type == SystemType.TARGET.value]
|
|
1239
|
+
target_custom_fields = [k for k, v in props.items() if v.system_type == SystemType.TARGET.value and v.field_type == FieldType.CUSTOM.value]
|
|
1240
|
+
target_library_fields = [k for k, v in props.items() if v.system_type == SystemType.TARGET.value and v.field_type == FieldType.LIBRARY.value]
|
|
1241
|
+
target_fields_with_logic = [k for k, v in props.items() if v.system_type == SystemType.TARGET.value and v.logic is not None]
|
|
1242
|
+
|
|
1243
|
+
# Build nested structures
|
|
1244
|
+
source = SourceTargetFields(
|
|
1245
|
+
type="source",
|
|
1246
|
+
field_names=source_field_names,
|
|
1247
|
+
unique_fields=source_unique_fields,
|
|
1248
|
+
required_fields=source_required_fields,
|
|
1249
|
+
field_properties=source_field_properties,
|
|
1250
|
+
custom_fields=source_custom_fields,
|
|
1251
|
+
library_fields=source_library_fields,
|
|
1252
|
+
fields_with_logic=source_fields_with_logic
|
|
1253
|
+
)
|
|
1254
|
+
target = SourceTargetFields(
|
|
1255
|
+
type="target",
|
|
1256
|
+
field_names=target_field_names,
|
|
1257
|
+
unique_fields=target_unique_fields,
|
|
1258
|
+
required_fields=target_required_fields,
|
|
1259
|
+
field_properties=target_field_properties,
|
|
1260
|
+
custom_fields=target_custom_fields,
|
|
1261
|
+
library_fields=target_library_fields,
|
|
1262
|
+
fields_with_logic=target_fields_with_logic
|
|
1263
|
+
)
|
|
1264
|
+
|
|
1265
|
+
all_source_fields = set(source_to_target_map.keys())
|
|
1266
|
+
all_target_fields = set(target_to_source_map.keys())
|
|
1267
|
+
|
|
1268
|
+
# Collect target fields from records where logic contains 'ignoreCompare'
|
|
1269
|
+
target_fields_to_ignore_in_compare = set()
|
|
1270
|
+
for record in records:
|
|
1271
|
+
if record.logic and 'ignoreCompare' in record.logic:
|
|
1272
|
+
target_fields_to_ignore_in_compare.update(record.target.field_names)
|
|
1273
|
+
|
|
1274
|
+
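# For example (hypothetical logic string): a record with logic='sync;ignoreCompare'
# contributes all of its target field names to this ignore set.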
return ParsedScenario(
|
|
1275
|
+
name=raw.get("name", "Unnamed"),
|
|
1276
|
+
id=raw.get("id", ""),
|
|
1277
|
+
records_count=len(raw.get("details", [])),
|
|
1278
|
+
description=raw.get("description", ""),
|
|
1279
|
+
records=records,
|
|
1280
|
+
source_to_target_map=s_to_t,
|
|
1281
|
+
target_to_source_map=t_to_s,
|
|
1282
|
+
field_properties=props,
|
|
1283
|
+
source=source,
|
|
1284
|
+
target=target,
|
|
1285
|
+
unique_fields=unique_fields,
|
|
1286
|
+
required_fields=required_fields,
|
|
1287
|
+
custom_fields=custom_fields,
|
|
1288
|
+
custom_fields_model=custom_model,
|
|
1289
|
+
all_source_fields=all_source_fields,
|
|
1290
|
+
all_target_fields=all_target_fields,
|
|
1291
|
+
source_to_value_mappings=dict(source_to_value_mappings),
|
|
1292
|
+
target_fields_to_ignore_in_compare=target_fields_to_ignore_in_compare
|
|
1293
|
+
)
|
|
1294
|
+
|
|
1295
|
+
|
|
1296
|
+
class ParsedScenario(BaseModel):
|
|
1297
|
+
"""The final, usable representation of a Scenario.
|
|
1298
|
+
|
|
1299
|
+
This object is what users interact with. It contains all records,
|
|
1300
|
+
lookups, and property maps needed to perform data validation and transformation.
|
|
1301
|
+
|
|
1302
|
+
Example:
|
|
1303
|
+
>>> scenario = ParsedScenario(...)
|
|
1304
|
+
>>> scenario.name
|
|
1305
|
+
'Personal Information'
|
|
1306
|
+
>>> scenario.all_source_fields
|
|
1307
|
+
{'first_name', 'last_name', 'email'}
|
|
1308
|
+
>>> scenario.has_field('email', field_type='source')
|
|
1309
|
+
True
|
|
1310
|
+
>>> scenario.get_mapped_field_names('first_name')
|
|
1311
|
+
['firstname']
|
|
1312
|
+
>>> for record in scenario.records:
|
|
1313
|
+
... print(record.relation_type)
|
|
1314
|
+
one_to_one
|
|
1315
|
+
|
|
1316
|
+
Attributes:
|
|
1317
|
+
name: Scenario display name
|
|
1318
|
+
id: Scenario identifier
|
|
1319
|
+
records_count: Number of records in this scenario
|
|
1320
|
+
description: Scenario business context (description of what the scenario does)
|
|
1321
|
+
records: List of Record objects representing field mappings
|
|
1322
|
+
source_to_target_map: Dictionary mapping source field names to target field names
|
|
1323
|
+
target_to_source_map: Dictionary mapping target field names to source field names
|
|
1324
|
+
field_properties: Dictionary mapping field names to FieldProperties objects
|
|
1325
|
+
all_source_fields: Set of all source field names
|
|
1326
|
+
all_target_fields: Set of all target field names
|
|
1327
|
+
source: SourceTargetFields object containing source unique_fields and required_fields
|
|
1328
|
+
target: SourceTargetFields object containing target unique_fields and required_fields
|
|
1329
|
+
unique_fields: List of field names that are part of unique constraints (deprecated: use source.unique_fields or target.unique_fields)
|
|
1330
|
+
required_fields: List of field names that are required (deprecated: use source.required_fields or target.required_fields)
|
|
1331
|
+
custom_fields: Dictionary of custom field properties (filtered from field_properties)
|
|
1332
|
+
custom_fields_model: Dynamically generated Pandera schema model for custom fields
|
|
1333
|
+
source_to_value_mappings: Dictionary mapping source fields to value mapping configurations
|
|
1334
|
+
target_fields_to_ignore_in_compare: Set of target field names that should be ignored in compare function
|
|
1335
|
+
(determined by records where logic contains 'ignoreCompare')
|
|
1336
|
+
"""
|
|
1337
|
+
# Core
|
|
1338
|
+
name: str
|
|
1339
|
+
id: str
|
|
1340
|
+
records_count: int
|
|
1341
|
+
description: str
|
|
1342
|
+
|
|
1343
|
+
# Mapping Data
|
|
1344
|
+
records: List[Record]
|
|
1345
|
+
source_to_target_map: SourceToTargetMap
|
|
1346
|
+
target_to_source_map: TargetToSourceMap
|
|
1347
|
+
|
|
1348
|
+
# Field Metadata
|
|
1349
|
+
field_properties: FieldPropertiesMap
|
|
1350
|
+
all_source_fields: Set[str]
|
|
1351
|
+
all_target_fields: Set[str]
|
|
1352
|
+
source: SourceTargetFields
|
|
1353
|
+
target: SourceTargetFields
|
|
1354
|
+
unique_fields: List[str]
|
|
1355
|
+
required_fields: List[str]
|
|
1356
|
+
|
|
1357
|
+
# Custom Field Data
|
|
1358
|
+
custom_fields: FieldPropertiesMap
|
|
1359
|
+
custom_fields_model: Optional[type] = None
|
|
1360
|
+
|
|
1361
|
+
# Value Mappings
|
|
1362
|
+
source_to_value_mappings: Dict[str, List[ScenarioMappingConfiguration]]
|
|
1363
|
+
|
|
1364
|
+
# Compare Configuration
|
|
1365
|
+
target_fields_to_ignore_in_compare: Set[str] = Field(default_factory=set)
|
|
1366
|
+
|
|
1367
|
+
@classmethod
|
|
1368
|
+
def from_api_dict(cls, scenario: Dict[str, Any]) -> "ParsedScenario":
|
|
1369
|
+
"""Factory method to create a ParsedScenario from raw API data.
|
|
1370
|
+
|
|
1371
|
+
Args:
|
|
1372
|
+
scenario: Raw scenario dictionary from the BrynQ API
|
|
1373
|
+
|
|
1374
|
+
Returns:
|
|
1375
|
+
ParsedScenario object with all parsed data
|
|
1376
|
+
"""
|
|
1377
|
+
return ScenarioParser().parse(scenario)
|
|
1378
|
+
|
|
1379
|
+
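# Illustrative sketch: the raw payload is the dict shape read by the parser
# (keys 'name', 'id', 'description', 'details'), so parsing is one call
# (raw_dict is a hypothetical variable):
#   parsed = ParsedScenario.from_api_dict(scenario=raw_dict)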
def __getattribute__(self, name: str):
|
|
1380
|
+
"""Override attribute access to emit deprecation warnings for unique_fields and required_fields."""
|
|
1381
|
+
if name == 'unique_fields':
|
|
1382
|
+
warnings.warn(
|
|
1383
|
+
"unique_fields is deprecated. Use scenario.source.unique_fields or scenario.target.unique_fields instead.",
|
|
1384
|
+
DeprecationWarning,
|
|
1385
|
+
stacklevel=2
|
|
1386
|
+
)
|
|
1387
|
+
elif name == 'required_fields':
|
|
1388
|
+
warnings.warn(
|
|
1389
|
+
"required_fields is deprecated. Use scenario.source.required_fields or scenario.target.required_fields instead.",
|
|
1390
|
+
DeprecationWarning,
|
|
1391
|
+
stacklevel=2
|
|
1392
|
+
)
|
|
1393
|
+
return super().__getattribute__(name)
|
|
1394
|
+
|
|
1395
|
+
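# Illustrative sketch (variable names hypothetical): the deprecation warning
# can be observed with the standard warnings machinery:
#   with warnings.catch_warnings(record=True) as caught:
#       warnings.simplefilter("always")
#       _ = scenario.unique_fields
#       assert issubclass(caught[-1].category, DeprecationWarning)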
def get_source_fields_with_value_mappings(self) -> List[str]:
|
|
1396
|
+
"""Returns a list of source fields that have value mappings.
|
|
1397
|
+
|
|
1398
|
+
Uses `source_to_value_mappings` attribute internally.
|
|
1399
|
+
|
|
1400
|
+
Example:
|
|
1401
|
+
>>> scenario.get_source_fields_with_value_mappings()
|
|
1402
|
+
['gender', 'status']
|
|
1403
|
+
>>> list(scenario.source_to_value_mappings.keys())
|
|
1404
|
+
['gender', 'status']
|
|
1405
|
+
|
|
1406
|
+
Returns:
|
|
1407
|
+
List of source field names that have value mappings
|
|
1408
|
+
"""
|
|
1409
|
+
return list(self.source_to_value_mappings.keys())
|
|
1410
|
+
|
|
1411
|
+
def get_target_fields_with_value_mappings(self) -> List[str]:
|
|
1412
|
+
"""Returns a list of target fields that have value mappings (via their mapped source fields).
|
|
1413
|
+
|
|
1414
|
+
Uses `source_to_value_mappings` and `source_to_target_map` attributes internally.
|
|
1415
|
+
|
|
1416
|
+
Example:
|
|
1417
|
+
>>> scenario.get_target_fields_with_value_mappings()
|
|
1418
|
+
['gender_code', 'status_code']
|
|
1419
|
+
>>> scenario.source_to_target_map['gender']
|
|
1420
|
+
['gender_code']
|
|
1421
|
+
|
|
1422
|
+
Returns:
|
|
1423
|
+
List of target field names that have value mappings
|
|
1424
|
+
"""
|
|
1425
|
+
target_fields_with_mappings: Set[str] = set()
|
|
1426
|
+
for source_key in self.source_to_value_mappings.keys():
|
|
1427
|
+
# Handle keys that might be multiple source fields joined with '|'
|
|
1428
|
+
source_fields = source_key.split('|') if '|' in source_key else [source_key]
|
|
1429
|
+
for source_field in source_fields:
|
|
1430
|
+
# Find target fields mapped from this source field
|
|
1431
|
+
target_fields = self.source_to_target_map.get(source_field, [])
|
|
1432
|
+
target_fields_with_mappings.update(target_fields)
|
|
1433
|
+
return sorted(list(target_fields_with_mappings))
|
|
1434
|
+
|
|
1435
|
+
def has_field(self, field_name: str, field_type: Optional[str] = None) -> bool:
|
|
1436
|
+
"""Check field existence in scenario. Can denote source or target, else looks for both.
|
|
1437
|
+
|
|
1438
|
+
Uses `all_source_fields` and `all_target_fields` attributes internally.
|
|
1439
|
+
|
|
1440
|
+
Example:
|
|
1441
|
+
>>> scenario.has_field('email')
|
|
1442
|
+
True
|
|
1443
|
+
>>> scenario.has_field('email', field_type='source')
|
|
1444
|
+
True
|
|
1445
|
+
>>> scenario.has_field('email', field_type='target')
|
|
1446
|
+
False
|
|
1447
|
+
>>> 'email' in scenario.all_source_fields
|
|
1448
|
+
True
|
|
1449
|
+
|
|
1450
|
+
Args:
|
|
1451
|
+
field_name: The field name to check
|
|
1452
|
+
field_type: Optional field type filter ("source" or "target")
|
|
1453
|
+
|
|
1454
|
+
Returns:
|
|
1455
|
+
True if field exists, False otherwise
|
|
1456
|
+
"""
|
|
1457
|
+
if field_type == "source":
|
|
1458
|
+
return field_name in self.all_source_fields
|
|
1459
|
+
if field_type == "target":
|
|
1460
|
+
return field_name in self.all_target_fields
|
|
1461
|
+
return field_name in self.all_source_fields or field_name in self.all_target_fields
|
|
1462
|
+
|
|
1463
|
+
# Dunder (and dunder-like) methods for Pythonic field access
|
|
1464
|
+
def __iter__(self):
|
|
1465
|
+
"""Enable iteration over records.
|
|
1466
|
+
|
|
1467
|
+
Example:
|
|
1468
|
+
>>> for record in scenario:
|
|
1469
|
+
... print(f"Record {record.id}: {len(record.source.field_names)} source fields")
|
|
1470
|
+
Record rec_123: 2 source fields
|
|
1471
|
+
Record rec_456: 1 source fields
|
|
1472
|
+
>>> list(scenario)
|
|
1473
|
+
[Record(id='rec_123', ...), Record(id='rec_456', ...)]
|
|
1474
|
+
"""
|
|
1475
|
+
return iter(self.records)
|
|
1476
|
+
|
|
1477
|
+
def __len__(self) -> int:
|
|
1478
|
+
"""Return the number of records in this scenario.
|
|
1479
|
+
|
|
1480
|
+
Example:
|
|
1481
|
+
>>> len(scenario)
|
|
1482
|
+
15
|
|
1483
|
+
>>> scenario.records_count
|
|
1484
|
+
15
|
|
1485
|
+
|
|
1486
|
+
Returns:
|
|
1487
|
+
int: The number of records in the scenario
|
|
1488
|
+
"""
|
|
1489
|
+
return len(self.records)
|
|
1490
|
+
|
|
1491
|
+
def __getitem__(self, field_id: str) -> FieldProperties:
|
|
1492
|
+
"""Enable dict-style access to field properties.
|
|
1493
|
+
|
|
1494
|
+
Example:
|
|
1495
|
+
>>> scenario['customer_id']
|
|
1496
|
+
FieldProperties(alias='customer_id', uuid='...', label='Customer ID', ...)
|
|
1497
|
+
>>> scenario['customer_id'].required
|
|
1498
|
+
True
|
|
1499
|
+
>>> scenario['nonexistent']
|
|
1500
|
+
KeyError: Field 'nonexistent' not found in scenario 'Personal Information'.
|
|
1501
|
+
|
|
1502
|
+
Args:
|
|
1503
|
+
field_id: The field name to look up
|
|
1504
|
+
|
|
1505
|
+
Returns:
|
|
1506
|
+
FieldProperties object for the field
|
|
1507
|
+
|
|
1508
|
+
Raises:
|
|
1509
|
+
KeyError: If field is not found
|
|
1510
|
+
"""
|
|
1511
|
+
try:
|
|
1512
|
+
return self.field_properties[field_id]
|
|
1513
|
+
except KeyError as exc:
|
|
1514
|
+
raise KeyError(f"Field '{field_id}' not found in scenario '{self.name}'.") from exc
|
|
1515
|
+
|
|
1516
|
+
def __getattr__(self, name: str) -> FieldProperties:
|
|
1517
|
+
"""Enable attribute-style access to field properties.
|
|
1518
|
+
|
|
1519
|
+
Example:
|
|
1520
|
+
>>> scenario.customer_id
|
|
1521
|
+
FieldProperties(alias='customer_id', uuid='...', label='Customer ID', ...)
|
|
1522
|
+
>>> scenario.customer_id.unique
|
|
1523
|
+
True
|
|
1524
|
+
>>> scenario.nonexistent
|
|
1525
|
+
AttributeError: 'nonexistent' is not a valid field in scenario 'Personal Information'.
|
|
1526
|
+
|
|
1527
|
+
Args:
|
|
1528
|
+
name: The field name to look up
|
|
1529
|
+
|
|
1530
|
+
Returns:
|
|
1531
|
+
FieldProperties object for the field
|
|
1532
|
+
|
|
1533
|
+
Raises:
|
|
1534
|
+
AttributeError: If field is not found
|
|
1535
|
+
"""
|
|
1536
|
+
if name.startswith("_") or name in self.__dict__ or name in self.__class__.__dict__:
|
|
1537
|
+
return super().__getattribute__(name)
|
|
1538
|
+
try:
|
|
1539
|
+
return self.field_properties[name]
|
|
1540
|
+
except KeyError as exc:
|
|
1541
|
+
raise AttributeError(f"'{name}' is not a valid field in scenario '{self.name}'.") from exc
|
|
1542
|
+
|
|
1543
|
+
def __repr__(self) -> str:
|
|
1544
|
+
"""A human-friendly string representation.
|
|
1545
|
+
|
|
1546
|
+
Example:
|
|
1547
|
+
>>> repr(scenario)
|
|
1548
|
+
"<ParsedScenario (field mapping of scenario) name='Personal Information' id='abc123' details=5 unique=2 required=3>"
|
|
1549
|
+
|
|
1550
|
+
Returns:
|
|
1551
|
+
String representation of the ParsedScenario
|
|
1552
|
+
"""
|
|
1553
|
+
return (
|
|
1554
|
+
f"<ParsedScenario (field mapping of scenario) "
|
|
1555
|
+
f"name='{self.name}' id='{self.id}' "
|
|
1556
|
+
f"records={self.records_count} unique={len(self.unique_fields)} "
|
|
1557
|
+
f"required={len(self.required_fields)}>"
|
|
1558
|
+
)
|
|
1559
|
+
|
|
1560
|
+
def __str__(self) -> str:
|
|
1561
|
+
"""String representation (used by print()). Delegates to __repr__."""
|
|
1562
|
+
return self.__repr__()
|
|
1563
|
+
|
|
1564
|
+
@staticmethod
|
|
1565
|
+
def _build_custom_field_model(custom_fields: FieldPropertiesMap) -> Optional[type]:
|
|
1566
|
+
"""Dynamically creates a Pandera Schema for custom fields validation.
|
|
1567
|
+
|
|
1568
|
+
Uses the `custom_fields` dictionary to extract field metadata (technical_name, label, required)
|
|
1569
|
+
and create a Pandera schema model for validation.
|
|
1570
|
+
|
|
1571
|
+
Args:
|
|
1572
|
+
custom_fields: Dictionary mapping field names to their FieldProperties objects (filtered to CUSTOM fields only)
|
|
1573
|
+
|
|
1574
|
+
Returns:
|
|
1575
|
+
A dynamically generated BrynQ Pandera model class or None when no fields can be mapped
|
|
1576
|
+
"""
|
|
1577
|
+
annotations = {}
|
|
1578
|
+
fields = {}
|
|
1579
|
+
for name, props in custom_fields.items():
|
|
1580
|
+
annotations[name] = Optional[Series[String]]
|
|
1581
|
+
# Fall back when needed: technical_name can legitimately be None for CUSTOM fields, so use uuid or the raw name instead
|
|
1582
|
+
alias_value = props.technical_name or props.uuid or name
|
|
1583
|
+
fields[name] = pa.Field(
|
|
1584
|
+
coerce=True,
|
|
1585
|
+
nullable=not props.required,
|
|
1586
|
+
alias=alias_value,
|
|
1587
|
+
description=props.label
|
|
1588
|
+
)
|
|
1589
|
+
|
|
1590
|
+
if not annotations:
|
|
1591
|
+
return None
|
|
1592
|
+
fields["__annotations__"] = annotations
|
|
1593
|
+
return type("CustomFieldModel", (BrynQPanderaDataFrameModel,), fields)
|
|
1594
|
+
|
|
1595
|
+
|
|
1596
|
+
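# Illustrative sketch (column name hypothetical): validating custom-field data
# with the dynamically built Pandera model:
#   model = ParsedScenario._build_custom_field_model(scenario.custom_fields)
#   if model is not None:
#       validated = model.validate(pd.DataFrame({"cost_center": ["CC-100"]}))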
class DummyRecord:
|
|
1597
|
+
"""Dummy record for logging unmapped sources that don't belong to any record.
|
|
1598
|
+
|
|
1599
|
+
Used internally by Scenarios.rename_fields to track source columns present in
|
|
1600
|
+
the DataFrame but not mapped by the scenario.
|
|
1601
|
+
"""
|
|
1602
|
+
def __init__(self):
|
|
1603
|
+
"""Initialize a dummy record with empty attributes."""
|
|
1604
|
+
self.id = None
|
|
1605
|
+
self.logic = None
|
|
1606
|
+
self.relation_type = None
|
|
1607
|
+
self.source = SourceTargetFields(
|
|
1608
|
+
type="source",
|
|
1609
|
+
field_names=[],
|
|
1610
|
+
unique_fields=[],
|
|
1611
|
+
required_fields=[],
|
|
1612
|
+
field_properties=[],
|
|
1613
|
+
custom_fields=[],
|
|
1614
|
+
library_fields=[],
|
|
1615
|
+
fields_with_logic=[]
|
|
1616
|
+
)
|
|
1617
|
+
self.target = SourceTargetFields(
|
|
1618
|
+
type="target",
|
|
1619
|
+
field_names=[],
|
|
1620
|
+
unique_fields=[],
|
|
1621
|
+
required_fields=[],
|
|
1622
|
+
field_properties=[],
|
|
1623
|
+
custom_fields=[],
|
|
1624
|
+
library_fields=[],
|
|
1625
|
+
fields_with_logic=[]
|
|
1626
|
+
)
|
|
1627
|
+
|
|
1628
|
+
|
|
1629
|
+
class Scenarios():
|
|
1630
|
+
"""
|
|
1631
|
+
Provides convenient access to BrynQ scenarios, with lookups and a Pythonic interface.
|
|
1632
|
+
|
|
1633
|
+
"""
|
|
1634
|
+
# Missing value representations to detect in dataframes
|
|
1635
|
+
MISSING_VALUES: List[str] = [
|
|
1636
|
+
'<NA>', 'nan', 'None', 'NaN', 'null', 'NaT', '_NA_', '', r'\[\]', r'\{ \}'
|
|
1637
|
+
]
|
|
1638
|
+
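# Illustrative note (assumed usage, not verified here): plain entries can be
# matched with df[col].astype(str).isin(Scenarios.MISSING_VALUES), while the
# regex-style entries (r'\[\]', r'\{ \}') require pattern matching instead.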
def __init__(self, brynq_instance: Any):
|
|
1639
|
+
"""Initializes the scenarios manager.
|
|
1640
|
+
|
|
1641
|
+
Fetches and parses scenarios from the BrynQ API. Scenarios are cached after first fetch.
|
|
1642
|
+
Dunder methods (__getitem__, __iter__, __len__) auto-fetch if not loaded.
|
|
1643
|
+
|
|
1644
|
+
**Core Methods:**
|
|
1645
|
+
- get(): Fetches/returns ParsedScenario objects (cached after first call)
|
|
1646
|
+
|
|
1647
|
+
**Convenience Methods:**
|
|
1648
|
+
- find_scenarios_with_field(): Find scenarios containing a field
|
|
1649
|
+
- scenario_names: Cached property with all scenario names
|
|
1650
|
+
|
|
1651
|
+
**Dunder Methods:**
|
|
1652
|
+
- __getitem__: Dict access `scenarios['Name']`
|
|
1653
|
+
- __iter__: Iterate scenarios `for scenario in scenarios:`
|
|
1654
|
+
- __len__: Count scenarios `len(scenarios)`
|
|
1655
|
+
|
|
1656
|
+
**ParsedScenario Iteration:**
|
|
1657
|
+
- Records: `for record in scenario:` (mapping records with logic/relation types)
|
|
1658
|
+
- Fields: `scenario.keys()`, `scenario.values()`, `scenario.items()`
|
|
1659
|
+
- Field access: `scenario['field']` or `scenario.field` (dict/attr style)
|
|
1660
|
+
- Source/Target: `scenario.source.field_names`, `scenario.target.field_names`, `scenario.source.unique_fields`, etc.
|
|
1661
|
+
- String repr: `print(scenario)` shows summary
|
|
1662
|
+
|
|
1663
|
+
**Record Iteration:**
|
|
1664
|
+
- All fields: `for field in record:` (iterates over both source and target fields)
|
|
1665
|
+
- Source/Target: `for field in record.source:`, `for field in record.target:`
|
|
1666
|
+
- Field names: `record.source.field_names`, `record.target.field_names`
|
|
1667
|
+
- Field properties: `record.source.field_properties`, `record.target.field_properties`
|
|
1668
|
+
|
|
1669
|
+
**Transformation Methods:**
|
|
1670
|
+
- rename_fields(): Rename/transform DataFrame columns per scenario.
|
|
1671
|
+
- apply_value_mappings(): Apply value mappings (e.g., 'F' → '1') per scenario.
|
|
1672
|
+
- add_fixed_values(): Add fixed literal values to DataFrames
|
|
1673
|
+
|
|
1674
|
+
Args:
|
|
1675
|
+
brynq_instance: Authenticated BrynQ client instance.
|
|
1676
|
+
"""
|
|
1677
|
+
self._brynq = brynq_instance
|
|
1678
|
+
|
|
1679
|
+
# Attributes populated by get()
|
|
1680
|
+
self.raw_scenarios: Optional[List[Dict]] = None
|
|
1681
|
+
self.scenarios: Optional[List[ParsedScenario]] = None
|
|
1682
|
+
|
|
1683
|
+
# ============================================================================
|
|
1684
|
+
# Public API Methods
|
|
1685
|
+
# ============================================================================
|
|
1686
|
+
|
|
1687
|
+
def get(self, strict: bool = True) -> List[ParsedScenario]:
|
|
1688
|
+
"""Fetches all scenarios from the API and returns them as ParsedScenario objects.
|
|
1689
|
+
|
|
1690
|
+
Results are cached after the first call.
|
|
1691
|
+
|
|
1692
|
+
Args:
|
|
1693
|
+
strict: If True, raises ValueError on validation errors. If False, skips invalid scenarios.
|
|
1694
|
+
|
|
1695
|
+
Returns:
|
|
1696
|
+
List[ParsedScenario]: Validated scenario objects.
|
|
1697
|
+
"""
|
|
1698
|
+
# Fetch only once; subsequent calls reuse the cached result
|
|
1699
|
+
if self.scenarios is None:
|
|
1700
|
+
self.raw_scenarios = self._fetch_from_api(strict=strict)
|
|
1701
|
+
self.scenarios = [
|
|
1702
|
+
ParsedScenario.from_api_dict(scenario=s)
|
|
1703
|
+
for s in self.raw_scenarios if "name" in s
|
|
1704
|
+
]
|
|
1705
|
+
return self.scenarios
|
|
1706
|
+
else:
|
|
1707
|
+
return self.scenarios
|
|
1708
|
+
|
|
1709
|
+
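# Illustrative usage sketch (variable names hypothetical):
#   scenarios = Scenarios(brynq)      # brynq: authenticated BrynQ client
#   parsed = scenarios.get()          # first call fetches from the API
#   assert scenarios.get() is parsed  # later calls return the cached list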
def find_scenarios_with_field(
|
|
1710
|
+
self,
|
|
1711
|
+
field_name: str,
|
|
1712
|
+
field_type: str = "source"
|
|
1713
|
+
) -> List[ParsedScenario]:
|
|
1714
|
+
"""Find all scenarios that contain a specific field.
|
|
1715
|
+
|
|
1716
|
+
Example:
|
|
1717
|
+
>>> scenarios.find_scenarios_with_field('employee_id')
|
|
1718
|
+
[]
|
|
1719
|
+
>>> scenarios.find_scenarios_with_field('employee_id', field_type='target')
|
|
1720
|
+
[<ParsedScenario name='Personal information' id='3c7f8e04-5b74-408f-a2d8-ad99b924a1af' records=15 unique=2 required=20>, <ParsedScenario name='Adres' ...>]
|
|
1721
|
+
|
|
1722
|
+
Args:
|
|
1723
|
+
field_name (str): The field name to search for.
|
|
1724
|
+
field_type (str): The type of field to search in;
|
|
1725
|
+
must be either "source" or "target". Defaults to "source".
|
|
1726
|
+
|
|
1727
|
+
Returns:
|
|
1728
|
+
List[ParsedScenario]: List of ParsedScenario objects containing the specified field.
|
|
1729
|
+
"""
|
|
1730
|
+
return [
|
|
1731
|
+
scenario for scenario in self.get()
|
|
1732
|
+
if scenario.has_field(field_name, field_type=field_type)
|
|
1733
|
+
]
|
|
1734
|
+
|
|
1735
|
+
@cached_property
|
|
1736
|
+
def scenario_names(self) -> List[str]:
|
|
1737
|
+
"""A list of all scenario names.
|
|
1738
|
+
|
|
1739
|
+
Example:
|
|
1740
|
+
>>> scenarios.scenario_names
|
|
1741
|
+
['Personal information', 'Adres', 'Bank Account', 'Contract Information', ...]
|
|
1742
|
+
|
|
1743
|
+
Returns:
|
|
1744
|
+
List[str]: List of all scenario names.
|
|
1745
|
+
"""
|
|
1746
|
+
return [s.name for s in self.get()]
|
|
1747
|
+
|
|
1748
|
+
def __getitem__(self, scenario_name: str) -> ParsedScenario:
|
|
1749
|
+
"""Returns scenario by name using dict-style access.
|
|
1750
|
+
|
|
1751
|
+
Example:
|
|
1752
|
+
>>> scenario = scenarios['Personal information']
|
|
1753
|
+
>>> scenario.name
|
|
1754
|
+
'Personal information'
|
|
1755
|
+
>>> scenario['first_name'].required
|
|
1756
|
+
True
|
|
1757
|
+
>>> scenario.firstname.required # Attribute-style access also works
|
|
1758
|
+
True
|
|
1759
|
+
|
|
1760
|
+
Args:
|
|
1761
|
+
scenario_name: Name of the scenario to retrieve.
|
|
1762
|
+
|
|
1763
|
+
Returns:
|
|
1764
|
+
ParsedScenario object with records, mappings, and field properties.
|
|
1765
|
+
|
|
1766
|
+
Raises:
|
|
1767
|
+
KeyError: If scenario name not found.
|
|
1768
|
+
"""
|
|
1769
|
+
scenarios = {s.name: s for s in self.get()}
|
|
1770
|
+
if scenario_name not in scenarios:
|
|
1771
|
+
raise KeyError(f"Scenario '{scenario_name}' not found.")
|
|
1772
|
+
return scenarios[scenario_name]
|
|
1773
|
+
|
|
1774
|
+
def __iter__(self) -> Iterator[ParsedScenario]:
|
|
1775
|
+
"""Iterates over all parsed scenarios.
|
|
1776
|
+
|
|
1777
|
+
Example:
|
|
1778
|
+
>>> for scenario in scenarios:
|
|
1779
|
+
... print(f"{scenario.name}: {len(scenario.required_fields)} required fields")
|
|
1780
|
+
Personal information: 20 required fields
|
|
1781
|
+
|
|
1782
|
+
Yields:
|
|
1783
|
+
ParsedScenario: Each scenario object.
|
|
1784
|
+
"""
|
|
1785
|
+
return iter(self.get())
|
|
1786
|
+
|
|
1787
|
+
def __len__(self) -> int:
|
|
1788
|
+
"""Return the number of parsed scenarios.
|
|
1789
|
+
|
|
1790
|
+
Example:
|
|
1791
|
+
>>> len(scenarios)
|
|
1792
|
+
13
|
|
1793
|
+
|
|
1794
|
+
Returns:
|
|
1795
|
+
int: The number of available scenarios.
|
|
1796
|
+
"""
|
|
1797
|
+
return len(self.get())
|
|
1798
|
+
|
|
1799
|
+
# ============================================================================
|
|
1800
|
+
# Internal API Helpers
|
|
1801
|
+
# ============================================================================
|
|
1802
|
+
|
|
1803
|
+
def _fetch_from_api(self, strict: bool = True) -> List[Dict[str, Any]]:
|
|
1804
|
+
"""Fetches raw scenario data from BrynQ API and validates it.
|
|
1805
|
+
|
|
1806
|
+
Makes HTTP GET request, validates JSON against Scenario model.
|
|
1807
|
+
Invalid scenarios either raise an error or are skipped with a warning, depending on the strict flag.
|
|
1808
|
+
|
|
1809
|
+
Args:
|
|
1810
|
+
strict (bool): If True, raise ValueError on validation errors. If False, skip invalid scenarios with warning.
|
|
1811
|
+
|
|
1812
|
+
Returns:
|
|
1813
|
+
List[Dict[str, Any]]: Validated scenario dictionaries (raw API format). Contains name, id, description, details.
|
|
1814
|
+
|
|
1815
|
+
Raises:
|
|
1816
|
+
requests.HTTPError: API request failed (non-2xx status).
|
|
1817
|
+
TypeError: API response is not a list.
|
|
1818
|
+
ValueError: strict=True and validation failed.
|
|
1819
|
+
|
|
1820
|
+
Note:
|
|
1821
|
+
Internal method called by get(). Returns raw dicts; get() converts to ParsedScenario objects.
|
|
1822
|
+
"""
|
|
1823
|
+
response = self._brynq.brynq_session.get(
|
|
1824
|
+
url=(
|
|
1825
|
+
f"{self._brynq.url}interfaces/"
|
|
1826
|
+
f"{self._brynq.data_interface_id}/scenarios"
|
|
1827
|
+
),
|
|
1828
|
+
timeout=self._brynq.timeout,
|
|
1829
|
+
)
|
|
1830
|
+
response.raise_for_status()
|
|
1831
|
+
scenario_list = response.json()
|
|
1832
|
+
if not isinstance(scenario_list, list):
|
|
1833
|
+
raise TypeError(f"Expected a list of scenarios, but got {type(scenario_list).__name__}.")
|
|
1834
|
+
|
|
1835
|
+
valid_scenarios, invalid_scenarios = Functions.validate_pydantic_data(
|
|
1836
|
+
scenario_list,
|
|
1837
|
+
schema=Scenario,
|
|
1838
|
+
debug=True,
|
|
1839
|
+
)
|
|
1840
|
+
|
|
1841
|
+
if invalid_scenarios:
|
|
1842
|
+
msg = f"{len(invalid_scenarios)} scenario(s) failed validation."
|
|
1843
|
+
if strict:
|
|
1844
|
+
raise ValueError(f"Invalid scenario data found: {msg}")
|
|
1845
|
+
warnings.warn(f"{msg} They were skipped.", UserWarning, stacklevel=2)
|
|
1846
|
+
|
|
1847
|
+
return valid_scenarios
|
|
1848
|
+
|
|
1849
|
+
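# Illustrative sketch of the strict toggle:
#   raw = self._fetch_from_api(strict=False)  # invalid scenarios skipped with a warning
#   raw = self._fetch_from_api(strict=True)   # invalid scenarios raise ValueError instead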
# ============================================================================
|
|
1850
|
+
# Public Transformation Methods
|
|
1851
|
+
# ============================================================================
|
|
1852
|
+
|
|
1853
|
+
def add_fixed_values(
|
|
1854
|
+
self,
|
|
1855
|
+
df: pd.DataFrame,
|
|
1856
|
+
scenario_name: str
|
|
1857
|
+
) -> pd.DataFrame:
|
|
1858
|
+
"""Adds fixed literal values to DataFrame columns based on scenario mappings.
|
|
1859
|
+
|
|
1860
|
+
Creates new columns with target field names, fills all rows with the fixed value.
|
|
1861
|
+
Only processes records with relation_type 'one_to_one' or 'one_to_many'.
|
|
1862
|
+
Supports both FIXED and CONFIGURATION source field types.
|
|
1863
|
+
|
|
1864
|
+
Args:
|
|
1865
|
+
df (pd.DataFrame): Input DataFrame to add fixed value columns to.
|
|
1866
|
+
scenario_name (str): Name of scenario containing fixed value mappings.
|
|
1867
|
+
|
|
1868
|
+
Returns:
|
|
1869
|
+
pd.DataFrame: Copy of input DataFrame with fixed value columns added.
|
|
1870
|
+
|
|
1871
|
+
Raises:
|
|
1872
|
+
ValueError: Scenario name not found.
|
|
1873
|
+
|
|
1874
|
+
Examples
|
|
1875
|
+
--------
|
|
1876
|
+
Adding a fixed value column from a scenario with FIXED source type.
|
|
1877
|
+
|
|
1878
|
+
>>> df = pd.DataFrame({'id': [1, 2, 3], 'name': ['John', 'Jane', 'Bob']})
|
|
1879
|
+
>>> df.columns.tolist()
|
|
1880
|
+
['id', 'name']
|
|
1881
|
+
>>> df
|
|
1882
|
+
id name
|
|
1883
|
+
0 1 John
|
|
1884
|
+
1 2 Jane
|
|
1885
|
+
2 3 Bob
|
|
1886
|
+
|
|
1887
|
+
The scenario has a record with FIXED source value 'NL' mapping to target 'country_code'.
|
|
1888
|
+
|
|
1889
|
+
>>> df = scenarios.add_fixed_values(df, 'My Scenario')
|
|
1890
|
+
>>> df
|
|
1891
|
+
id name country_code
|
|
1892
|
+
0 1 John NL
|
|
1893
|
+
1 2 Jane NL
|
|
1894
|
+
2 3 Bob NL
|
|
1895
|
+
|
|
1896
|
+
The 'country_code' column is added and filled with the fixed value 'NL' for all rows.
|
|
1897
|
+
|
|
1898
|
+
Also supports CONFIGURATION source types. Config values are parsed according to
|
|
1899
|
+
their type (TEXT, EMAIL, NUMBER, SELECTION, DATEPICKER, etc.) during record creation.
|
|
1900
|
+
|
|
1901
|
+
Note:
|
|
1902
|
+
For many_to_one/many_to_many mappings, use rename_fields() instead.
|
|
1903
|
+
"""
|
|
1904
|
+
df_fixed = df.copy()
|
|
1905
|
+
try:
|
|
1906
|
+
scenario = self[scenario_name]
|
|
1907
|
+
except KeyError as e:
|
|
1908
|
+
raise ValueError(f"Scenario with name '{scenario_name}' not found.") from e
|
|
1909
|
+
|
|
1910
|
+
for record in scenario.records:
|
|
1911
|
+
if record.relation_type not in ("one_to_one", "one_to_many"):
|
|
1912
|
+
continue
|
|
1913
|
+
|
|
1914
|
+
if not record.fixed_source_value:
|
|
1915
|
+
warnings.warn(f"Missing fixed/config value for record {record.id}", stacklevel=2)
|
|
1916
|
+
continue
|
|
1917
|
+
|
|
1918
|
+
for target_field in record.target.field_names:
|
|
1919
|
+
df_fixed[target_field] = record.fixed_source_value
|
|
1920
|
+
|
|
1921
|
+
return df_fixed
|
|
1922
|
+
|
|
1923
|
+
def apply_value_mappings(
|
|
1924
|
+
self,
|
|
1925
|
+
df: pd.DataFrame,
|
|
1926
|
+
scenario_name: str,
|
|
1927
|
+
drop_unmapped: bool = False,
|
|
1928
|
+
how: Literal[  # Union of strategy literals; no bare 'valMap' option, keeping each choice explicit
|
|
1929
|
+
'exactValMap',
|
|
1930
|
+
'ignoreCaseValMap',
|
|
1931
|
+
'ignoreSpecialValMap',
|
|
1932
|
+
'ignoreSpacesValMap',
|
|
1933
|
+
'flexValMap'
|
|
1934
|
+
] = 'exactValMap'
|
|
1935
|
+
) -> Tuple[pd.DataFrame, Set[str], pd.DataFrame]:
|
|
1936
|
+
"""Transforms source values to target values based on scenario mappings.
|
|
1937
|
+
|
|
1938
|
+
Processes records with value mapping configurations (e.g., "M" -> "1").
|
|
1939
|
+
Handles various relation types by preparing source values appropriately (direct vs concatenated).
|
|
1940
|
+
|
|
1941
|
+
Mapping strategies (how parameter):
|
|
1942
|
+
- exactValMap: Precise matching (default)
|
|
1943
|
+
- ignoreCaseValMap: Case-insensitive matching
|
|
1944
|
+
- ignoreSpecialValMap: Ignores special characters including spaces
|
|
1945
|
+
- ignoreSpacesValMap: Ignores spaces only
|
|
1946
|
+
- flexValMap: Case-insensitive + ignores special characters including spaces
|
|
1947
|
+
|
|
1948
|
+
Strategy selection priority:
|
|
1949
|
+
1. Check record.logic for a matching strategy (higher priority); it is evaluated against the strategy names above.
|
|
1950
|
+
2. Fall back to how kwarg if no match in logic
|
|
1951
|
+
|
|
1952
|
+
Examples
|
|
1953
|
+
--------
|
|
1954
|
+
Example 1: Basic value mapping with exactValMap (default).
|
|
1955
|
+
|
|
1956
|
+
>>> df = pd.DataFrame({'gender': ['F', 'M', 'F']})
|
|
1957
|
+
>>> # Scenario mapping configuration:
|
|
1958
|
+
>>> # {'gender': 'F'} -> {'gender_code': '1'}
|
|
1959
|
+
>>> # {'gender': 'M'} -> {'gender_code': '0'}
|
|
1960
|
+
>>> df, _, stats = scenarios.apply_value_mappings(df, 'My Scenario')
|
|
1961
|
+
>>> df
|
|
1962
|
+
gender gender_code
|
|
1963
|
+
0 F 1
|
|
1964
|
+
1 M 0
|
|
1965
|
+
2 F 1
|
|
1966
|
+
|
|
1967
|
+
Example 2: Case-insensitive matching with ignoreCaseValMap.
|
|
1968
|
+
|
|
1969
|
+
>>> df = pd.DataFrame({'status': ['Active', 'ACTIVE', 'inactive']})
|
|
1970
|
+
>>> # Scenario mapping (source values normalized to lowercase for matching):
|
|
1971
|
+
>>> # {'status': 'active'} -> {'status_code': '1'} # Matches 'Active', 'ACTIVE'
|
|
1972
|
+
>>> # {'status': 'inactive'} -> {'status_code': '0'} # Matches 'inactive'
|
|
1973
|
+
>>> df, _, stats = scenarios.apply_value_mappings(df, 'My Scenario', how='ignoreCaseValMap')
|
|
1974
|
+
>>> df
|
|
1975
|
+
status status_code
|
|
1976
|
+
0 Active 1
|
|
1977
|
+
1 ACTIVE 1
|
|
1978
|
+
2 inactive 0
|
|
1979
|
+
|
|
1980
|
+
Example 3: Flexible matching with flexValMap (ignores case and special chars).
|
|
1981
|
+
|
|
1982
|
+
>>> df = pd.DataFrame({
|
|
1983
|
+
... 'product_code': ['ABC-123', 'xyz_456', 'MNO 789', 'PQR@#$%']
|
|
1984
|
+
... })
|
|
1985
|
+
>>> # Scenario mapping (source values normalized: lowercase + remove special chars):
|
|
1986
|
+
>>> # {'product_code': 'abc123'} -> {'product_id': 'P001'} # Matches 'ABC-123'
|
|
1987
|
+
>>> # {'product_code': 'xyz456'} -> {'product_id': 'P002'} # Matches 'xyz_456'
|
|
1988
|
+
>>> # {'product_code': 'mno789'} -> {'product_id': 'P003'} # Matches 'MNO 789'
|
|
1989
|
+
>>> # {'product_code': 'pqr'} -> {'product_id': 'P004'} # Matches 'PQR@#$%'
|
|
1990
|
+
>>> df, _, stats = scenarios.apply_value_mappings(df, 'My Scenario', how='flexValMap')
|
|
1991
|
+
>>> df
|
|
1992
|
+
product_code product_id
|
|
1993
|
+
0 ABC-123 P001
|
|
1994
|
+
1 xyz_456 P002
|
|
1995
|
+
2 MNO 789 P003
|
|
1996
|
+
3 PQR@#$% P004
|
|
1997
|
+
|
|
1998
|
+
Example 4: Many-to-one mapping with concatenated fields and special chars.
|
|
1999
|
+
|
|
2000
|
+
>>> df = pd.DataFrame({
|
|
2001
|
+
... 'first_name': ['John', 'Jane', 'José'],
|
|
2002
|
+
... 'last_name': ['Doe-Smith', 'O\'Brien', 'García-López']
|
|
2003
|
+
... })
|
|
2004
|
+
>>> # Scenario mapping (concatenated with |, then normalized for matching):
|
|
2005
|
+
>>> # {'first_name': 'John', 'last_name': 'Doe-Smith'} -> 'John|Doe-Smith' -> 'john|doesmith' -> {'full_id': 'JD001'}
|
|
2006
|
+
>>> # {'first_name': 'Jane', 'last_name': 'O\'Brien'} -> 'Jane|O\'Brien' -> 'jane|obrien' -> {'full_id': 'JO002'}
|
|
2007
|
+
>>> # {'first_name': 'José', 'last_name': 'García-López'} -> 'José|García-López' -> 'josé|garclpez' -> {'full_id': 'JG003'}
|
|
2008
|
+
>>> df, _, stats = scenarios.apply_value_mappings(df, 'My Scenario', how='flexValMap')
|
|
2009
|
+
>>> df
|
|
2010
|
+
first_name last_name full_id
|
|
2011
|
+
0 John Doe-Smith JD001
|
|
2012
|
+
1 Jane O'Brien JO002
|
|
2013
|
+
2 José García-López JG003
|
|
2014
|
+
|
|
2015
|
+
Example 5: ignoreSpacesValMap - removes spaces but preserves other special chars.
|
|
2016
|
+
|
|
2017
|
+
>>> df = pd.DataFrame({
|
|
2018
|
+
... 'location': ['New York', 'New York', 'New York', 'New York', 'New-York', 'New_York']
|
|
2019
|
+
... })
|
|
2020
|
+
>>> # Scenario mapping (source values normalized: remove spaces only, preserves hyphens/underscores):
|
|
2021
|
+
>>> # {'location': 'New York'} -> {'location_code': 'NYC'} # Mapping has spaces, normalizes to 'NewYork'
|
|
2022
|
+
>>> # {'location': 'New-York'} -> {'location_code': 'NYD'} # Matches 'New-York' (exact, spaces removed)
|
|
2023
|
+
>>> # {'location': 'New_York'} -> {'location_code': 'NYU'} # Matches 'New_York' (exact, spaces removed)
|
|
2024
|
+
>>> df, _, stats = scenarios.apply_value_mappings(df, 'My Scenario', how='ignoreSpacesValMap')
|
|
2025
|
+
>>> df
|
|
2026
|
+
location location_code
|
|
2027
|
+
0 New York NYC
|
|
2028
|
+
1 New York NYC
|
|
2029
|
+
2 New York NYC
|
|
2030
|
+
3 New York NYC
|
|
2031
|
+
4 New-York NYD
|
|
2032
|
+
5 New_York NYU
|
|
2033
|
+
|
|
2034
|
+
Example 6: Per-record strategy override via logic field.
|
|
2035
|
+
|
|
2036
|
+
>>> df = pd.DataFrame({'code': ['A-B', 'C D', 'E|F']})
|
|
2037
|
+
>>> # Record 1: logic='flexValMap' -> uses flexValMap (normalizes to 'ab', 'cd', 'ef')
|
|
2038
|
+
>>> # Record 2: logic=None -> uses how='exactValMap' (exact match required)
|
|
2039
|
+
>>> df, _, stats = scenarios.apply_value_mappings(df, 'My Scenario', how='exactValMap')
|
|
2040
|
+
>>> # Records with flexValMap in logic will match 'A-B'->'ab', 'C D'->'cd', 'E|F'->'ef'
|
|
2041
|
+
>>> # Records without logic will only match exact values from how kwarg
|
|
2042
|
+
|
|
2043
|
+
Args:
|
|
2044
|
+
df: Input DataFrame.
|
|
2045
|
+
scenario_name: Name of the scenario.
|
|
2046
|
+
drop_unmapped: If True (and no default value exists), drops rows that couldn't be mapped.
|
|
2047
|
+
how: Mapping strategy to use (default: 'exactValMap'). Can be overridden per record via logic.
|
|
2048
|
+
|
|
2049
|
+
Returns:
|
|
2050
|
+
Tuple[pd.DataFrame, Set[str], pd.DataFrame]:
|
|
2051
|
+
1. Transformed DataFrame.
|
|
2052
|
+
2. Set of source fields processed.
|
|
2053
|
+
3. Statistics DataFrame detailing mapping success rates and value distributions.
|
|
2054
|
+
|
|
2055
|
+
Statistics DataFrame columns:
|
|
2056
|
+
- record_id: Unique identifier for the mapping record
|
|
2057
|
+
- source_fields: Source field names, pipe-separated if multiple
|
|
2058
|
+
- target_fields: Target field names, pipe-separated if multiple
|
|
2059
|
+
- relation_type: Relation type ('one_to_one', 'one_to_many', 'many_to_one', 'many_to_many')
|
|
2060
|
+
- mapping_strategy: Mapping strategy used ('exactValMap', 'ignoreCaseValMap', etc.)
|
|
2061
|
+
- total_rows: Total number of rows in DataFrame
|
|
2062
|
+
- mapped_rows: Number of rows successfully mapped
|
|
2063
|
+
- unmapped_rows: Number of rows that couldn't be mapped
|
|
2064
|
+
- mapping_success_pct: Percentage of rows successfully mapped
|
|
2065
|
+
- successful_indices: List of DataFrame row indices that were successfully mapped
|
|
2066
|
+
- unsuccessful_indices: List of DataFrame row indices that couldn't be mapped
|
|
2067
|
+
- mapped_value_counts: Dictionary of mapped source values and their counts
|
|
2068
|
+
- unmapped_value_counts: Dictionary of unmapped source values and their counts
|
|
2069
|
+
- used_mapping_values: List of mapping rules that were used (with counts)
|
|
2070
|
+
- unused_mapping_values: List of mapping rules that were never encountered
|
|
2071
|
+
"""
|
|
2072
|
+
try:
|
|
2073
|
+
scenario = self[scenario_name]
|
|
2074
|
+
except KeyError:
|
|
2075
|
+
# If scenario not found, return empty results
|
|
2076
|
+
stats_df = pd.DataFrame(
|
|
2077
|
+
columns=[
|
|
2078
|
+
'record_id',
|
|
2079
|
+
'source_fields',
|
|
2080
|
+
'target_fields',
|
|
2081
|
+
'relation_type',
|
|
2082
|
+
'mapping_strategy',
|
|
2083
|
+
'total_rows',
|
|
2084
|
+
'mapped_rows',
|
|
2085
|
+
'unmapped_rows',
|
|
2086
|
+
'mapping_success_pct',
|
|
2087
|
+
'successful_indices',
|
|
2088
|
+
'unsuccessful_indices',
|
|
2089
|
+
'mapped_value_counts',
|
|
2090
|
+
'unmapped_value_counts',
|
|
2091
|
+
'used_mapping_values',
|
|
2092
|
+
'unused_mapping_values'
|
|
2093
|
+
]
|
|
2094
|
+
)
|
|
2095
|
+
return df, set(), stats_df
|
|
2096
|
+
|
|
2097
|
+
# Warn about missing values before processing to help users identify data quality issues early.
|
|
2098
|
+
# Missing values in source fields can cause mappings to fail silently or produce unexpected
|
|
2099
|
+
# results, so detecting them upfront prevents confusion about why certain rows didn't map correctly.
|
|
2100
|
+
all_source_fields_to_check = set()
|
|
2101
|
+
for record in scenario.records:
|
|
2102
|
+
if record.mapping:
|
|
2103
|
+
all_source_fields_to_check.update(record.source.field_names)
|
|
2104
|
+
|
|
2105
|
+
if all_source_fields_to_check:
|
|
2106
|
+
missing_value_counts = self._detect_missing_values_in_fields(
|
|
2107
|
+
df=df,
|
|
2108
|
+
source_field_names=list(all_source_fields_to_check)
|
|
2109
|
+
)
|
|
2110
|
+
if missing_value_counts:
|
|
2111
|
+
missing_details = [
|
|
2112
|
+
f"{field}: {count} occurrence(s)"
|
|
2113
|
+
for field, count in missing_value_counts.items()
|
|
2114
|
+
]
|
|
2115
|
+
warnings.warn(
|
|
2116
|
+
f"DataFrame contains missing values (pd.NA or string representations) "
|
|
2117
|
+
f"in source fields used for value mapping: {', '.join(missing_details)}. "
|
|
2118
|
+
f"These may affect mapping accuracy.",
|
|
2119
|
+
UserWarning,
|
|
2120
|
+
stacklevel=2
|
|
2121
|
+
)
|
|
2122
|
+
|
|
2123
|
+
handled_source_fields = set()
|
|
2124
|
+
statistics_rows = []
|
|
2125
|
+
|
|
2126
|
+
# Process each record to apply value mappings (source values -> target values)
|
|
2127
|
+
for record in scenario.records:
|
|
2128
|
+
if not record.mapping:
|
|
2129
|
+
continue
|
|
2130
|
+
|
|
2131
|
+
source_field_names = record.source.field_names
|
|
2132
|
+
target_field_names = record.target.field_names
|
|
2133
|
+
total_rows = len(df)
|
|
2134
|
+
default_val = record.mapping.default_value
|
|
2135
|
+
|
|
2136
|
+
# Ensure source fields are present in the dataframe, else add default value to target column
|
|
2137
|
+
missing_fields = [field for field in source_field_names if field not in df.columns]
|
|
2138
|
+
if missing_fields:
|
|
2139
|
+
warnings.warn(f"Source fields {missing_fields} not found in dataframe for record {record.id}. Creating target columns with default values.", stacklevel=2)
|
|
2140
|
+
for target_field in target_field_names:
|
|
2141
|
+
df[target_field] = default_val if default_val else None
|
|
2142
|
+
|
|
2143
|
+
# Determine mapping strategy even when fields are missing (for statistics tracking)
|
|
2144
|
+
mapping_strategy = self._determine_mapping_strategy(record.logic, how)
|
|
2145
|
+
|
|
2146
|
+
# Record statistics when missing: 0 mapped rows, all mappings unused (source fields missing)
|
|
2147
|
+
statistics_rows.append({
|
|
2148
|
+
'record_id': record.id,
|
|
2149
|
+
'source_fields': '|'.join(source_field_names),
|
|
2150
|
+
'target_fields': '|'.join(target_field_names),
|
|
2151
|
+
'relation_type': record.relation_type,
|
|
2152
|
+
'mapping_strategy': mapping_strategy,
|
|
2153
|
+
'total_rows': total_rows,
|
|
2154
|
+
'mapped_rows': 0,
|
|
2155
|
+
'unmapped_rows': total_rows,
|
|
2156
|
+
'mapping_success_pct': 0.0,
|
|
2157
|
+
'successful_indices': [],
|
|
2158
|
+
'unsuccessful_indices': df.index.tolist(),
|
|
2159
|
+
'mapped_value_counts': {},
|
|
2160
|
+
'unmapped_value_counts': {},
|
|
2161
|
+
'used_mapping_values': [],
|
|
2162
|
+
'unused_mapping_values': [] # Source fields missing, so no mapping values could be evaluated
|
|
2163
|
+
})
|
|
2164
|
+
continue
|
|
2165
|
+
|
|
2166
|
+
# Source fields are not missing:
|
|
2167
|
+
else:
|
|
2168
|
+
# Track processed fields
|
|
2169
|
+
handled_source_fields.update(source_field_names)
|
|
2170
|
+
|
|
2171
|
+
# Determine mapping strategy: check record.logic first (higher priority), then use how kwarg
|
|
2172
|
+
mapping_strategy = self._determine_mapping_strategy(record.logic, how)
|
|
2173
|
+
|
|
2174
|
+
# Step 1: Normalize dataframe according to mapping strategy.
|
|
2175
|
+
normalized_df = df[source_field_names].copy()
|
|
2176
|
+
for field_name in source_field_names:
|
|
2177
|
+
if field_name in normalized_df.columns:
|
|
2178
|
+
normalized_df[field_name] = normalized_df[field_name].apply(
|
|
2179
|
+
lambda val: self._normalize_value_for_mapping(val, mapping_strategy)
|
|
2180
|
+
)
|
|
2181
|
+
|
|
2182
|
+
# Step 1b: Create Series with normalized source values (one Series, shared by all target fields)
|
|
2183
|
+
# Format: "f"/"m" (single) or "john|doe" (multiple, pipe-separated, pipes preserved)
|
|
2184
|
+
normalized_source_series = self._concatenate_source_fields(df=normalized_df, source_fields=source_field_names)
|
|
2185
|
+
|
|
2186
|
+
# Step 1c: Create original (non-normalized) concatenated series for statistics and fillna
|
|
2187
|
+
concatenated_source_series = self._concatenate_source_fields(df=df, source_fields=source_field_names)
|
|
2188
|
+
|
|
2189
|
+
# Step 2.A: Create empty mapping dicts (one dict per target field)
|
|
2190
|
+
# Structure: {target_field: {normalized_source_value: target_value}}
|
|
2191
|
+
# Each target gets its own dict; all use the same Series from Step 1
|
|
2192
|
+
# Example: {"gender_code": {"f": "1"}, "status": {"f": "Active"}} (if ignoreCaseValMap)
|
|
2193
|
+
replacements_by_target = {target_field: {} for target_field in target_field_names}
|
|
2194
|
+
|
|
2195
|
+
# defined_mapping_values: tracks all mapping definitions for statistics (used vs unused)
|
|
2196
|
+
defined_mapping_values = []
|
|
2197
|
+
|
|
2198
|
+
# 2.B Build lookup dictionaries for existing record.mapping.values
|
|
2199
|
+
# If values list is empty, skip building mappings but still collect statistics
|
|
2200
|
+
if record.mapping.values:
|
|
2201
|
+
for mapping_value in record.mapping.values:
|
|
2202
|
+
source_map_val = mapping_value.input
|
|
2203
|
+
target_map_val = mapping_value.output
|
|
2204
|
+
if not source_map_val or not target_map_val:
|
|
2205
|
+
continue
|
|
2206
|
+
|
|
2207
|
+
# Concat/combine source values mapping to create the lookup key value.
|
|
2208
|
+
# Normalize mapping values first, then join with pipe to preserve separator.
|
|
2209
|
+
# (e.g., ["John", "Doe"] -> normalize each -> ["john", "doe"] -> concat -> "john|doe")
|
|
2210
|
+
source_values = []
|
|
2211
|
+
normalized_source_values = []
|
|
2212
|
+
for field_name in source_field_names:
|
|
2213
|
+
if field_name in source_map_val:
|
|
2214
|
+
source_val = str(source_map_val[field_name]).strip()
|
|
2215
|
+
source_values.append(source_val)
|
|
2216
|
+
# Normalize individual field value before concatenation
|
|
2217
|
+
normalized_source_values.append(
|
|
2218
|
+
self._normalize_value_for_mapping(source_val, mapping_strategy)
|
|
2219
|
+
)
|
|
2220
|
+
else:
|
|
2221
|
+
source_values = None
|
|
2222
|
+
normalized_source_values = None
|
|
2223
|
+
break
|
|
2224
|
+
|
|
2225
|
+
# Validate that we have values for ALL source fields before using this mapping.
|
|
2226
|
+
if source_values and len(source_values) == len(source_field_names):
|
|
2227
|
+
combined_source_val = '|'.join(source_values)
|
|
2228
|
+
normalized_combined_source_val = '|'.join(normalized_source_values)
|
|
2229
|
+
|
|
2230
|
+
# Store mapping definition for statistics tracking (not used for transformation).
|
|
2231
|
+
mapping_def = {
|
|
2232
|
+
'input': combined_source_val,
|
|
2233
|
+
'output': {target_field: str(target_map_val.get(target_field, '')).strip()
|
|
2234
|
+
for target_field in target_field_names if target_field in target_map_val}
|
|
2235
|
+
}
|
|
2236
|
+
defined_mapping_values.append(mapping_def)
|
|
2237
|
+
|
|
2238
|
+
# Store mapping in lookup dict for actual transformation (used by _apply_mapping_to_target in Step 3).
|
|
2239
|
+
for target_field in target_field_names:
|
|
2240
|
+
if target_field in target_map_val:
|
|
2241
|
+
target_val = str(target_map_val[target_field]).strip()
|
|
2242
|
+
replacements_by_target[target_field][normalized_combined_source_val] = target_val
|
|
2243
|
+
|
|
2244
|
+
# Step 3: Apply mappings to target columns using normalized source series for lookup
|
|
2245
|
+
for target_field in target_field_names:
|
|
2246
|
+
df = self._apply_mapping_to_target(
|
|
2247
|
+
df=df,
|
|
2248
|
+
concatenated_source_series=normalized_source_series,
|
|
2249
|
+
target_field=target_field,
|
|
2250
|
+
replacements=replacements_by_target[target_field],
|
|
2251
|
+
default_val=default_val,
|
|
2252
|
+
original_source_series=concatenated_source_series
|
|
2253
|
+
)
|
|
2254
|
+
|
|
2255
|
+
# Step 4: Collect statistics on mapping results
|
|
2256
|
+
all_mapped_source_values = set()
|
|
2257
|
+
for replacements in replacements_by_target.values():
|
|
2258
|
+
all_mapped_source_values.update(replacements.keys())
|
|
2259
|
+
|
|
2260
|
+
# Step 4b: Determine which rows were successfully mapped vs unmapped
|
|
2261
|
+
is_mapped = normalized_source_series.isin(all_mapped_source_values)
|
|
2262
|
+
mapped_rows = is_mapped.sum()
|
|
2263
|
+
unmapped_rows = (~is_mapped).sum()
|
|
2264
|
+
|
|
2265
|
+
# Get indices of successful and unsuccessful mappings
|
|
2266
|
+
successful_indices = df.index[is_mapped].tolist()
|
|
2267
|
+
unsuccessful_indices = df.index[~is_mapped].tolist()
|
|
2268
|
+
|
|
2269
|
+
# Step 4c: Count occurrences of each source VALUE in the data
|
|
2270
|
+
# Analyzes what values actually appeared in the data, regardless of mapping definitions.
|
|
2271
|
+
# (1) mapped_value_counts_dict: values that were successfully mapped (for Step 4d to use)
|
|
2272
|
+
# (2) unmapped_value_counts_dict: values that didn't map (to identify gaps in mapping rules)
|
|
2273
|
+
# (3) Convert Series to dict to avoid truncation and ensure ALL values are preserved in statistics.
|
|
2274
|
+
mapped_values = concatenated_source_series[is_mapped]
|
|
2275
|
+
unmapped_values = concatenated_source_series[~is_mapped]
|
|
2276
|
+
mapped_value_counts_dict = {}
|
|
2277
|
+
unmapped_value_counts_dict = {}
|
|
2278
|
+
if len(mapped_values) > 0:
|
|
2279
|
+
mapped_value_counts_dict = dict(mapped_values.value_counts())
|
|
2280
|
+
if len(unmapped_values) > 0:
|
|
2281
|
+
unmapped_value_counts_dict = dict(unmapped_values.value_counts())
|
|
2282
|
+
|
|
2283
|
+
# Step 4d: Compare defined MAPPING RULES (from Step 3) against actual data.
|
|
2284
|
+
# Analyzes which mapping definitions were used vs unused, regardless of what values exist in data.
|
|
2285
|
+
# Different from Step 4c: 4c analyzes data values, 4d analyzes mapping rules.
|
|
2286
|
+
# (1) unused mappings: rules defined but never encountered (possibly typos or outdated rules)
|
|
2287
|
+
# (2) used mappings: rules that actually fired and how many times (validates mapping logic)
|
|
2288
|
+
# Need to compare normalized mapping inputs against normalized data values
|
|
2289
|
+
# Build a map of normalized values to their counts (sum counts for values that normalize to same key)
|
|
2290
|
+
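# e.g. under ignoreCaseValMap, counts {'F': 3, 'f': 2} merge into {'f': 5}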
normalized_mapped_inputs = {}
|
|
2291
|
+
for orig_val, count in mapped_value_counts_dict.items():
|
|
2292
|
+
normalized_val = self._normalize_value_for_mapping(orig_val, mapping_strategy)
|
|
2293
|
+
normalized_mapped_inputs[normalized_val] = normalized_mapped_inputs.get(normalized_val, 0) + count
|
|
2294
|
+
|
|
2295
|
+
unused_mapping_values = []
|
|
2296
|
+
used_mapping_values_with_counts = []
|
|
2297
|
+
for mapping_def in defined_mapping_values:
|
|
2298
|
+
mapping_input = mapping_def['input']
|
|
2299
|
+
normalized_mapping_input = self._normalize_value_for_mapping(mapping_input, mapping_strategy)
|
|
2300
|
+
# found in data: used rule (compare normalized values)
|
|
2301
|
+
if normalized_mapping_input in normalized_mapped_inputs:
|
|
2302
|
+
used_mapping_values_with_counts.append({
|
|
2303
|
+
'input': mapping_input,
|
|
2304
|
+
'output': mapping_def['output'],
|
|
2305
|
+
'count': normalized_mapped_inputs.get(normalized_mapping_input, 0)
|
|
2306
|
+
})
|
|
2307
|
+
# never found in the data (unused rule)
|
|
2308
|
+
else:
|
|
2309
|
+
unused_mapping_values.append(mapping_def)
|
|
2310
|
+
|
|
2311
|
+
# Step 4e: Optionally filter out unmapped rows if requested
|
|
2312
|
+
if drop_unmapped and not default_val:
|
|
2313
|
+
df = df[is_mapped]
|
|
2314
|
+
|
|
2315
|
+
# Step 4f: Calculate mapping success rate and store all statistics for this record.
|
|
2316
|
+
# At this point, we have all the information needed (counts from 4c, used/unused from 4d, row counts from 4b).
|
|
2317
|
+
# We store statistics per record because each record has different source/target fields and relation types, so users can analyze effectiveness per record and per mapping rule.
|
|
2318
|
+
mapping_success_pct = (mapped_rows / total_rows * 100) if total_rows > 0 else 0.0
|
|
2319
|
+
statistics_rows.append({
|
|
2320
|
+
'record_id': record.id,
|
|
2321
|
+
'source_fields': '|'.join(source_field_names),
|
|
2322
|
+
'target_fields': '|'.join(target_field_names),
|
|
2323
|
+
'relation_type': record.relation_type,
|
|
2324
|
+
'mapping_strategy': mapping_strategy,
|
|
2325
|
+
'total_rows': total_rows,
|
|
2326
|
+
'mapped_rows': mapped_rows,
|
|
2327
|
+
'unmapped_rows': unmapped_rows,
|
|
2328
|
+
'mapping_success_pct': mapping_success_pct,
|
|
2329
|
+
'successful_indices': successful_indices,
|
|
2330
|
+
'unsuccessful_indices': unsuccessful_indices,
|
|
2331
|
+
'mapped_value_counts': mapped_value_counts_dict,
|
|
2332
|
+
'unmapped_value_counts': unmapped_value_counts_dict,
|
|
2333
|
+
'used_mapping_values': used_mapping_values_with_counts,
|
|
2334
|
+
'unused_mapping_values': unused_mapping_values
|
|
2335
|
+
})
|
|
2336
|
+
|
|
2337
|
+
if statistics_rows:
|
|
2338
|
+
stats_df = pd.DataFrame(statistics_rows)
|
|
2339
|
+
else:
|
|
2340
|
+
stats_df = pd.DataFrame(columns=[
|
|
2341
|
+
'record_id', 'source_fields', 'target_fields', 'relation_type',
|
|
2342
|
+
'mapping_strategy', 'total_rows', 'mapped_rows', 'unmapped_rows', 'mapping_success_pct',
|
|
2343
|
+
'successful_indices', 'unsuccessful_indices',
|
|
2344
|
+
'mapped_value_counts', 'unmapped_value_counts',
|
|
2345
|
+
'used_mapping_values', 'unused_mapping_values'
|
|
2346
|
+
])
|
|
2347
|
+
|
|
2348
|
+
return df, handled_source_fields, stats_df
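# A minimal, self-contained sketch (not SDK code) of how the statistics frame
# returned above could be inspected; the toy rows below only mimic the columns
# built in this method, and the 95% threshold is an arbitrary assumption.
import pandas as pd

_toy_stats = pd.DataFrame([{
    'record_id': 1, 'source_fields': 'gender', 'target_fields': 'gender_code',
    'relation_type': 'one_to_one', 'mapping_strategy': 'exactValMap',
    'total_rows': 10, 'mapped_rows': 8, 'unmapped_rows': 2,
    'mapping_success_pct': 80.0,
    'unmapped_value_counts': {'X': 2},
    'unused_mapping_values': [{'input': 'U', 'output': '9'}],
}])
needs_review = _toy_stats[_toy_stats['mapping_success_pct'] < 95.0]
for _, row in needs_review.iterrows():
    # Unused rules often point at typos or outdated mapping definitions.
    print(row['record_id'], row['unmapped_value_counts'], row['unused_mapping_values'])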
|
|
2349
|
+
|
|
2350
|
+
def rename_fields(
|
|
2351
|
+
self,
|
|
2352
|
+
df: pd.DataFrame,
|
|
2353
|
+
scenario_name: str,
|
|
2354
|
+
columns_to_keep: Optional[List[str]] = None,
|
|
2355
|
+
drop_unmapped: bool = True
|
|
2356
|
+
) -> Tuple[pd.DataFrame, pd.DataFrame]:
|
|
2357
|
+
"""Renames and transforms DataFrame columns based on scenario field mappings.
|
|
2358
|
+
|
|
2359
|
+
Handles complex mappings like concatenation (many-to-one) and splitting (one-to-many).
|
|
2360
|
+
Records with value mappings are logged but skipped (use `apply_value_mappings` for those).
|
|
2361
|
+
|
|
2362
|
+
Args:
|
|
2363
|
+
df: Input DataFrame.
|
|
2364
|
+
scenario_name: Name of the scenario.
|
|
2365
|
+
columns_to_keep: List of source column names to preserve even if mapped.
|
|
2366
|
+
drop_unmapped: If True, drops source columns that were successfully mapped (unless in columns_to_keep).
|
|
2367
|
+
|
|
2368
|
+
Returns:
|
|
2369
|
+
Tuple containing:
|
|
2370
|
+
- Modified DataFrame with renamed/transformed columns based on scenario mappings.
|
|
2371
|
+
- Statistics DataFrame (stats_df) with detailed mapping information (see Notes).
|
|
2372
|
+
|
|
2373
|
+
Raises:
|
|
2374
|
+
ValueError: If a record has an unknown relation_type. A missing scenario_name does not raise; it triggers a warning and the original DataFrame is returned with empty statistics.
|
|
2375
|
+
|
|
2376
|
+
Logic types:
|
|
2377
|
+
- "concat": Concatenate all sources with '|', fill all targets
|
|
2378
|
+
- "fill": Map source[i] → target[i] in order
|
|
2379
|
+
- "keep source": Keep source fields unchanged, no target columns
|
|
2380
|
+
- Default (no logic): Uses relation_type:
|
|
2381
|
+
* one_to_one: Direct mapping source[0] → target[0]
|
|
2382
|
+
* one_to_many: Duplicate single source value to all target fields
|
|
2383
|
+
* many_to_one: Concatenate all source fields with '|' into single target
|
|
2384
|
+
* many_to_many (n:m): Behavior depends on field counts:
|
|
2385
|
+
- n == m: Direct 1:1 mapping source[i] → target[i]
|
|
2386
|
+
- n < m: Map first n sources to first n targets, fill remaining with last source
|
|
2387
|
+
- n > m: Concatenate all sources to each target field
|
|
2388
|
+
|
|
2389
|
+
Notes:
|
|
2390
|
+
The statistics DataFrame (stats_df) provides comprehensive visibility into the transformation process:
|
|
2391
|
+
|
|
2392
|
+
**What it reports:**
|
|
2393
|
+
- For each record processed: source/target column mappings, mapping status (mapped/source_missing/kept_source/value_mapped),
|
|
2394
|
+
number of rows affected, mapping type (concat/fill/one_to_one/etc.), and default logic used (if applicable).
|
|
2395
|
+
- For unmapped sources: Columns that exist in the DataFrame but weren't processed by any record.
|
|
2396
|
+
|
|
2397
|
+
**Statistics DataFrame columns:**
|
|
2398
|
+
- record_id: Unique identifier for the mapping record (None for unmapped sources)
|
|
2399
|
+
- source_column: Source column name(s), pipe-separated if multiple
|
|
2400
|
+
- target_column: Target column name (None if source was kept or unmapped)
|
|
2401
|
+
- mapping_status: Status of the mapping ('mapped', 'source_missing', 'kept_source', 'value_mapped', 'not_in_mapping')
|
|
2402
|
+
- source_existed: Whether source column(s) existed in the DataFrame
|
|
2403
|
+
- rows_affected: Number of rows in the DataFrame
|
|
2404
|
+
- mapping_type: Type of mapping applied ('concat', 'fill', 'one_to_one', 'one_to_many', 'many_to_one', 'many_to_many', etc.)
|
|
2405
|
+
- logic: Original logic string from the record
|
|
2406
|
+
- relation_type: Relation type from the record ('one_to_one', 'one_to_many', 'many_to_one', 'many_to_many')
|
|
2407
|
+
- source_count: Number of source fields in the record
|
|
2408
|
+
- target_count: Number of target fields in the record
|
|
2409
|
+
- default_logic: Description of default logic used if no explicit logic was specified (e.g., 'direct_mapping', 'concatenate_with_pipe')
|
|
2410
|
+
|
|
2411
|
+
Examples
|
|
2412
|
+
--------
|
|
2413
|
+
Example 1: Renaming columns using one_to_one mapping (no logic, uses default).
|
|
2414
|
+
|
|
2415
|
+
>>> df = pd.DataFrame({'id': [1, 2], 'first_name': ['John', 'Jane']})
|
|
2416
|
+
>>> df
|
|
2417
|
+
id first_name
|
|
2418
|
+
0 1 John
|
|
2419
|
+
1 2 Jane
|
|
2420
|
+
|
|
2421
|
+
Scenario maps 'first_name' → 'firstname' (one_to_one, no logic specified).
|
|
2422
|
+
Default behavior: direct mapping source[0] → target[0].
|
|
2423
|
+
|
|
2424
|
+
>>> df, stats_df = scenarios.rename_fields(df, 'My Scenario')
|
|
2425
|
+
>>> df
|
|
2426
|
+
id firstname
|
|
2427
|
+
0 1 John
|
|
2428
|
+
1 2 Jane
|
|
2429
|
+
|
|
2430
|
+
>>> stats_df[['source_column', 'target_column', 'logic', 'relation_type', 'default_logic']]
|
|
2431
|
+
source_column target_column logic relation_type default_logic
|
|
2432
|
+
0 first_name firstname None one_to_one direct_mapping
|
|
2433
|
+
|
|
2434
|
+
|
|
2435
|
+
Example 2: Using many_to_one mapping (no logic, uses default).
|
|
2436
|
+
|
|
2437
|
+
>>> df = pd.DataFrame({'id': [1, 2], 'street': ['Main St', 'Oak Ave'], 'city': ['Amsterdam', 'Rotterdam']})
|
|
2438
|
+
>>> df
|
|
2439
|
+
id street city
|
|
2440
|
+
0 1 Main St Amsterdam
|
|
2441
|
+
1 2 Oak Ave Rotterdam
|
|
2442
|
+
|
|
2443
|
+
Scenario maps 'street'|'city' → 'address' (many_to_one, no logic specified).
|
|
2444
|
+
Default behavior: concatenate all source fields with '|' separator into single target.
|
|
2445
|
+
|
|
2446
|
+
>>> df, stats_df = scenarios.rename_fields(df, 'My Scenario')
|
|
2447
|
+
>>> df
|
|
2448
|
+
id address
|
|
2449
|
+
0 1 Main St|Amsterdam
|
|
2450
|
+
1 2 Oak Ave|Rotterdam
|
|
2451
|
+
|
|
2452
|
+
>>> stats_df[['source_column', 'target_column', 'logic', 'relation_type', 'default_logic']]
|
|
2453
|
+
source_column target_column logic relation_type default_logic
|
|
2454
|
+
0 street|city address None many_to_one concatenate_with_pipe
|
|
2455
|
+
|
|
2456
|
+
|
|
2457
|
+
Example 3: Using many_to_many mapping with explicit 'concat' logic.
|
|
2458
|
+
|
|
2459
|
+
>>> df = pd.DataFrame({
|
|
2460
|
+
... 'id': [1, 2],
|
|
2461
|
+
... 'first_name': ['John', 'Jane'],
|
|
2462
|
+
... 'last_name': ['Doe', 'Smith']
|
|
2463
|
+
... })
|
|
2464
|
+
>>> df
|
|
2465
|
+
id first_name last_name
|
|
2466
|
+
0 1 John Doe
|
|
2467
|
+
1 2 Jane Smith
|
|
2468
|
+
|
|
2469
|
+
Scenario maps 'first_name'|'last_name' → 'full_name'|'display_name' (many_to_many with explicit 'concat' logic).
|
|
2470
|
+
|
|
2471
|
+
>>> df, stats_df = scenarios.rename_fields(df, 'My Scenario')
|
|
2472
|
+
>>> df
|
|
2473
|
+
id full_name display_name
|
|
2474
|
+
0 1 John|Doe John|Doe
|
|
2475
|
+
1 2 Jane|Smith Jane|Smith
|
|
2476
|
+
|
|
2477
|
+
With 'concat' logic, all source fields are concatenated and filled into all target fields.
|
|
2478
|
+
|
|
2479
|
+
>>> stats_df[['source_column', 'target_column', 'logic', 'relation_type']]
|
|
2480
|
+
source_column target_column logic relation_type
|
|
2481
|
+
0 first_name|last_name full_name concat many_to_many
|
|
2482
|
+
1 first_name|last_name display_name concat many_to_many
|
|
2483
|
+
|
|
2484
|
+
|
|
2485
|
+
Example 4: Using many_to_many mapping with explicit 'fill' logic.
|
|
2486
|
+
|
|
2487
|
+
>>> df = pd.DataFrame({
|
|
2488
|
+
... 'id': [1, 2],
|
|
2489
|
+
... 'first_name': ['John', 'Jane'],
|
|
2490
|
+
... 'last_name': ['Doe', 'Smith']
|
|
2491
|
+
... })
|
|
2492
|
+
>>> df
|
|
2493
|
+
id first_name last_name
|
|
2494
|
+
0 1 John Doe
|
|
2495
|
+
1 2 Jane Smith
|
|
2496
|
+
|
|
2497
|
+
Scenario maps 'first_name'|'last_name' → 'first'|'last' (many_to_many with explicit 'fill' logic).
|
|
2498
|
+
|
|
2499
|
+
>>> df, stats_df = scenarios.rename_fields(df, 'My Scenario')
|
|
2500
|
+
>>> df
|
|
2501
|
+
id first last
|
|
2502
|
+
0 1 John Doe
|
|
2503
|
+
1 2 Jane Smith
|
|
2504
|
+
|
|
2505
|
+
With 'fill' logic, source[i] maps to target[i] in order (1:1 mapping by index).
|
|
2506
|
+
|
|
2507
|
+
>>> stats_df[['source_column', 'target_column', 'logic', 'relation_type']]
|
|
2508
|
+
source_column target_column logic relation_type
|
|
2509
|
+
0 first_name first fill many_to_many
|
|
2510
|
+
1 last_name last fill many_to_many
|
|
2511
|
+
|
|
2512
|
+
|
|
2513
|
+
Example 5: Using 'keep source' logic.
|
|
2514
|
+
|
|
2515
|
+
>>> df = pd.DataFrame({'id': [1, 2], 'employee_id': ['E001', 'E002'], 'department': ['IT', 'HR']})
|
|
2516
|
+
>>> df
|
|
2517
|
+
id employee_id department
|
|
2518
|
+
0 1 E001 IT
|
|
2519
|
+
1 2 E002 HR
|
|
2520
|
+
|
|
2521
|
+
Scenario has 'keep source' logic for 'employee_id' and 'department' fields.
|
|
2522
|
+
|
|
2523
|
+
>>> df, stats_df = scenarios.rename_fields(df, 'My Scenario')
|
|
2524
|
+
>>> df
|
|
2525
|
+
id employee_id department
|
|
2526
|
+
0 1 E001 IT
|
|
2527
|
+
1 2 E002 HR
|
|
2528
|
+
|
|
2529
|
+
Source columns are kept unchanged; no target columns are created.
|
|
2530
|
+
|
|
2531
|
+
>>> stats_df[['source_column', 'target_column', 'mapping_status', 'logic']]
|
|
2532
|
+
source_column target_column mapping_status logic
|
|
2533
|
+
0 employee_id None kept_source keep source
|
|
2534
|
+
1 department None kept_source keep source
|
|
2535
|
+
|
|
2536
|
+
|
|
2537
|
+
Example 6: Using one_to_many mapping without logic (uses default).
|
|
2538
|
+
|
|
2539
|
+
>>> df = pd.DataFrame({'id': [1, 2], 'postal_code': ['1234', '5678']})
|
|
2540
|
+
>>> df
|
|
2541
|
+
id postal_code
|
|
2542
|
+
0 1 1234
|
|
2543
|
+
1 2 5678
|
|
2544
|
+
|
|
2545
|
+
Scenario maps 'postal_code' → 'zip'|'postcode' (one_to_many, no logic specified).
|
|
2546
|
+
Default behavior: duplicate single source value to all target fields.
|
|
2547
|
+
|
|
2548
|
+
>>> df, stats_df = scenarios.rename_fields(df, 'My Scenario')
|
|
2549
|
+
>>> df
|
|
2550
|
+
id zip postcode
|
|
2551
|
+
0 1 1234 1234
|
|
2552
|
+
1 2 5678 5678
|
|
2553
|
+
|
|
2554
|
+
Both target columns are filled with the same source value (duplicated to all targets).
|
|
2555
|
+
|
|
2556
|
+
>>> stats_df[['source_column', 'target_column', 'logic', 'relation_type', 'default_logic']]
|
|
2557
|
+
source_column target_column logic relation_type default_logic
|
|
2558
|
+
0 postal_code zip None one_to_many duplicate_to_all_targets
|
|
2559
|
+
1 postal_code postcode None one_to_many duplicate_to_all_targets
|
|
2560
|
+
"""
|
|
2561
|
+
if columns_to_keep is None:
|
|
2562
|
+
columns_to_keep = []
|
|
2563
|
+
|
|
2564
|
+
try:
|
|
2565
|
+
scenario = self[scenario_name]
|
|
2566
|
+
except KeyError:
|
|
2567
|
+
warnings.warn(f"Scenario '{scenario_name}' not found. Returning original DataFrame with empty statistics.", stacklevel=2)
|
|
2568
|
+
empty_stats = pd.DataFrame(
|
|
2569
|
+
columns=[
|
|
2570
|
+
'record_id', 'source_column', 'target_column', 'mapping_status',
|
|
2571
|
+
'source_existed', 'rows_affected', 'mapping_type', 'logic',
|
|
2572
|
+
'relation_type', 'source_count', 'target_count', 'default_logic'
|
|
2573
|
+
]
|
|
2574
|
+
)
|
|
2575
|
+
return df, empty_stats
|
|
2576
|
+
|
|
2577
|
+
# objects for tracking statistics
|
|
2578
|
+
newly_created_target_fields = set()
|
|
2579
|
+
source_fields_to_keep = set()
|
|
2580
|
+
stats_data = []
|
|
2581
|
+
|
|
2582
|
+
# Handler dictionaries route records to transformation methods by logic (explicit) or relation_type (default).
|
|
2583
|
+
# Replaces long if/elif chains: adding a handler only requires a new dictionary entry.
|
|
2584
|
+
logic_handlers = {
|
|
2585
|
+
'concat': self._apply_concat,
|
|
2586
|
+
'fill': self._apply_fill,
|
|
2587
|
+
'keepsource': self._apply_keep_source,
|
|
2588
|
+
'onlysource': self._apply_keep_source
|
|
2589
|
+
}
|
|
2590
|
+
|
|
2591
|
+
default_handlers = {
|
|
2592
|
+
'one_to_one': self._apply_one_to_one,
|
|
2593
|
+
'one_to_many': self._apply_one_to_many,
|
|
2594
|
+
'many_to_one': self._apply_many_to_one,
|
|
2595
|
+
'many_to_many': self._apply_many_to_many
|
|
2596
|
+
}
|
|
2597
|
+
|
|
2598
|
+
for record in scenario.records:
|
|
2599
|
+
source_field_names = record.source.field_names
|
|
2600
|
+
|
|
2601
|
+
# Skip records with value mappings, they're handled by apply_value_mappings
|
|
2602
|
+
if record.mapping:
|
|
2603
|
+
self._apply_value_mapping_logging(df, record, stats_data, newly_created_target_fields)
|
|
2604
|
+
continue
|
|
2605
|
+
|
|
2606
|
+
normalized_logic = self._normalize_logic(record.logic)
|
|
2607
|
+
existing_sources = [s for s in source_field_names if s in df.columns]
|
|
2608
|
+
|
|
2609
|
+
# Check if normalized logic contains any handler key (substring match)
|
|
2610
|
+
# This handles cases like "keep source | parse to from date" -> "keepsourceparsetofromdate"
|
|
2611
|
+
matched_handler_key = None
|
|
2612
|
+
for handler_key in logic_handlers.keys():
|
|
2613
|
+
if handler_key in normalized_logic:
|
|
2614
|
+
matched_handler_key = handler_key
|
|
2615
|
+
break
|
|
2616
|
+
|
|
2617
|
+
if matched_handler_key:
|
|
2618
|
+
logic_handler = logic_handlers[matched_handler_key]
|
|
2619
|
+
# 'keep source' handlers don't create columns, so they only need kept_sources to track preserved fields.
|
|
2620
|
+
# Other handlers need existing_sources and created_targets to filter and track new columns.
|
|
2621
|
+
if matched_handler_key in ('keepsource', 'onlysource'):
|
|
2622
|
+
logic_handler(
|
|
2623
|
+
df=df,
|
|
2624
|
+
record=record,
|
|
2625
|
+
stats_data=stats_data,
|
|
2626
|
+
kept_sources=source_fields_to_keep
|
|
2627
|
+
)
|
|
2628
|
+
else:
|
|
2629
|
+
logic_handler(
|
|
2630
|
+
df=df,
|
|
2631
|
+
record=record,
|
|
2632
|
+
existing_sources=existing_sources,
|
|
2633
|
+
stats_data=stats_data,
|
|
2634
|
+
created_targets=newly_created_target_fields
|
|
2635
|
+
)
|
|
2636
|
+
else:
|
|
2637
|
+
default_handler = default_handlers.get(record.relation_type)
|
|
2638
|
+
if default_handler:
|
|
2639
|
+
# Only many_to_many accepts kept_sources.
|
|
2640
|
+
if record.relation_type == 'many_to_many':
|
|
2641
|
+
default_handler(
|
|
2642
|
+
df=df,
|
|
2643
|
+
record=record,
|
|
2644
|
+
existing_sources=existing_sources,
|
|
2645
|
+
stats_data=stats_data,
|
|
2646
|
+
created_targets=newly_created_target_fields,
|
|
2647
|
+
kept_sources=source_fields_to_keep
|
|
2648
|
+
)
|
|
2649
|
+
else:
|
|
2650
|
+
default_handler(
|
|
2651
|
+
df=df,
|
|
2652
|
+
record=record,
|
|
2653
|
+
existing_sources=existing_sources,
|
|
2654
|
+
stats_data=stats_data,
|
|
2655
|
+
created_targets=newly_created_target_fields
|
|
2656
|
+
)
|
|
2657
|
+
else:
|
|
2658
|
+
raise ValueError(
|
|
2659
|
+
f"Unknown relation_type '{record.relation_type}' for record {record.id}. "
|
|
2660
|
+
f"Supported types: {', '.join(default_handlers.keys())}"
|
|
2661
|
+
)
|
|
2662
|
+
|
|
2663
|
+
#--- report
|
|
2664
|
+
stats_df = self._generate_statistics_dataframe(
|
|
2665
|
+
scenario=scenario,
|
|
2666
|
+
df=df,
|
|
2667
|
+
stats_data=stats_data,
|
|
2668
|
+
source_fields_to_keep=source_fields_to_keep
|
|
2669
|
+
)
|
|
2670
|
+
|
|
2671
|
+
#--- Clean up
|
|
2672
|
+
df = self._finalize_dataframe_columns(
|
|
2673
|
+
df=df,
|
|
2674
|
+
scenario=scenario,
|
|
2675
|
+
drop_unmapped=drop_unmapped,
|
|
2676
|
+
newly_created_target_fields=newly_created_target_fields,
|
|
2677
|
+
source_fields_to_keep=source_fields_to_keep,
|
|
2678
|
+
columns_to_keep=columns_to_keep
|
|
2679
|
+
)
|
|
2680
|
+
|
|
2681
|
+
return df, stats_df
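# A standalone sketch (illustration only, not SDK code) of the dispatch-table
# pattern used above: handlers are chosen by substring match on the normalized
# logic string, with a relation_type lookup as fallback. Handler values here are
# plain strings for brevity.
import re

def _norm(logic):
    return re.sub(r'[^a-z0-9]', '', (logic or '').lower())

logic_handlers = {'concat': 'concat-handler', 'fill': 'fill-handler', 'keepsource': 'keep-handler'}
default_handlers = {'one_to_one': 'direct', 'many_to_one': 'concat-with-pipe'}

def pick_handler(logic, relation_type):
    for key, handler in logic_handlers.items():
        if key in _norm(logic):  # substring match, e.g. "keep source | parse to from date"
            return handler
    return default_handlers.get(relation_type)

print(pick_handler('Keep Source | parse to from date', 'one_to_one'))  # keep-handler
print(pick_handler(None, 'many_to_one'))                               # concat-with-pipe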
|
|
2682
|
+
|
|
2683
|
+
# ============================================================================
|
|
2684
|
+
# Rename Handlers
|
|
2685
|
+
# ============================================================================
|
|
2686
|
+
|
|
2687
|
+
def _apply_keep_source(
|
|
2688
|
+
self,
|
|
2689
|
+
df: pd.DataFrame,
|
|
2690
|
+
record,
|
|
2691
|
+
stats_data: List[dict],
|
|
2692
|
+
kept_sources: Set[str]
|
|
2693
|
+
) -> None:
|
|
2694
|
+
"""Applies 'keep source' logic: preserves source columns without creating targets.
|
|
2695
|
+
|
|
2696
|
+
Applied when the logic is "keepsource" or "onlysource". This indicates that the
|
|
2697
|
+
source fields should be retained in the DataFrame as-is, and no corresponding
|
|
2698
|
+
target columns should be generated, allowing the developer to apply custom logic themselves.
|
|
2699
|
+
|
|
2700
|
+
Args:
|
|
2701
|
+
df (pd.DataFrame): The DataFrame being processed.
|
|
2702
|
+
record: The scenario record with "keepsource" logic.
|
|
2703
|
+
stats_data (List[dict]): List to append statistics to.
|
|
2704
|
+
kept_sources (Set[str]): Set to track source columns that must be preserved.
|
|
2705
|
+
"""
|
|
2706
|
+
source_field_names = record.source.field_names
|
|
2707
|
+
for source_field in source_field_names:
|
|
2708
|
+
kept_sources.add(source_field)
|
|
2709
|
+
source_existed = source_field in df.columns
|
|
2710
|
+
self._log_transformation_stats(
|
|
2711
|
+
stats_data=stats_data,
|
|
2712
|
+
record=record,
|
|
2713
|
+
target_col=None,
|
|
2714
|
+
source_col=source_field,
|
|
2715
|
+
status='kept_source',
|
|
2716
|
+
mapping_type='keep_source',
|
|
2717
|
+
source_existed=source_existed,
|
|
2718
|
+
df_length=len(df) if source_existed else 0
|
|
2719
|
+
)
|
|
2720
|
+
|
|
2721
|
+
def _apply_concat(
|
|
2722
|
+
self,
|
|
2723
|
+
df: pd.DataFrame,
|
|
2724
|
+
record,
|
|
2725
|
+
existing_sources: List[str],
|
|
2726
|
+
stats_data: List[dict],
|
|
2727
|
+
created_targets: Set[str],
|
|
2728
|
+
) -> None:
|
|
2729
|
+
"""Applies 'concat' logic: joins all sources and fills all targets.
|
|
2730
|
+
|
|
2731
|
+
Applied when the logic is explicitly set to "concat". It concatenates values from
|
|
2732
|
+
all available source columns using a pipe ('|') separator and assigns this result
|
|
2733
|
+
to every target column defined in the record.
|
|
2734
|
+
|
|
2735
|
+
Args:
|
|
2736
|
+
df (pd.DataFrame): The DataFrame being processed.
|
|
2737
|
+
record: The scenario record with "concat" logic.
|
|
2738
|
+
existing_sources (List[str]): List of source fields present in the DataFrame.
|
|
2739
|
+
stats_data (List[dict]): List to append statistics to.
|
|
2740
|
+
created_targets (Set[str]): Set to track created target columns.
|
|
2741
|
+
"""
|
|
2742
|
+
target_field_names = record.target.field_names
|
|
2743
|
+
if len(existing_sources) > 0:
|
|
2744
|
+
concatenated = self._concatenate_source_fields(df=df, source_fields=existing_sources)
|
|
2745
|
+
for target_field in target_field_names:
|
|
2746
|
+
created_targets.add(target_field)
|
|
2747
|
+
df[target_field] = concatenated
|
|
2748
|
+
self._log_transformation_stats(
|
|
2749
|
+
stats_data=stats_data,
|
|
2750
|
+
record=record,
|
|
2751
|
+
target_col=target_field,
|
|
2752
|
+
source_col=existing_sources,
|
|
2753
|
+
status='mapped',
|
|
2754
|
+
mapping_type='concat',
|
|
2755
|
+
df_length=len(df)
|
|
2756
|
+
)
|
|
2757
|
+
else:
|
|
2758
|
+
for target_field in target_field_names:
|
|
2759
|
+
created_targets.add(target_field)
|
|
2760
|
+
df[target_field] = ''
|
|
2761
|
+
self._log_transformation_stats(
|
|
2762
|
+
stats_data=stats_data,
|
|
2763
|
+
record=record,
|
|
2764
|
+
target_col=target_field,
|
|
2765
|
+
source_col=record.source.field_names,
|
|
2766
|
+
status='source_missing',
|
|
2767
|
+
mapping_type='concat',
|
|
2768
|
+
source_existed=False,
|
|
2769
|
+
df_length=len(df)
|
|
2770
|
+
)
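# A tiny runnable sketch (illustration only) of the 'concat' behavior above: all
# available source values are joined with '|' and written to every target column.
# Column names here are hypothetical.
import pandas as pd

demo = pd.DataFrame({'street': ['Main St'], 'city': ['Amsterdam']})
joined = demo[['street', 'city']].astype(str).apply(
    lambda row: '|'.join(v.strip() for v in row), axis=1)
for target in ('address', 'address_label'):
    demo[target] = joined
print(demo[['address', 'address_label']])  # both hold 'Main St|Amsterdam'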
|
|
2771
|
+
|
|
2772
|
+
def _apply_fill(
|
|
2773
|
+
self,
|
|
2774
|
+
df: pd.DataFrame,
|
|
2775
|
+
record,
|
|
2776
|
+
existing_sources: List[str],
|
|
2777
|
+
stats_data: List[dict],
|
|
2778
|
+
created_targets: Set[str],
|
|
2779
|
+
) -> None:
|
|
2780
|
+
"""Applies 'fill' logic: maps source[i] to target[i] sequentially.
|
|
2781
|
+
|
|
2782
|
+
If the logic is explicitly set to "fill", it maps the first source field
|
|
2783
|
+
to the first target field, the second source to the second target, and so on.
|
|
2784
|
+
|
|
2785
|
+
Args:
|
|
2786
|
+
df (pd.DataFrame): The DataFrame being processed.
|
|
2787
|
+
record: The scenario record with "fill" logic.
|
|
2788
|
+
existing_sources (List[str]): List of source fields present in the DataFrame.
|
|
2789
|
+
stats_data (List[dict]): List to append statistics to.
|
|
2790
|
+
created_targets (Set[str]): Set to track created target columns.
|
|
2791
|
+
"""
|
|
2792
|
+
target_field_names = record.target.field_names
|
|
2793
|
+
n = min(len(existing_sources), len(target_field_names))
|
|
2794
|
+
|
|
2795
|
+
for i in range(n):
|
|
2796
|
+
source_field = existing_sources[i]
|
|
2797
|
+
target_field = target_field_names[i]
|
|
2798
|
+
created_targets.add(target_field)
|
|
2799
|
+
df[target_field] = df[source_field]
|
|
2800
|
+
self._log_transformation_stats(
|
|
2801
|
+
stats_data=stats_data,
|
|
2802
|
+
record=record,
|
|
2803
|
+
target_col=target_field,
|
|
2804
|
+
source_col=source_field,
|
|
2805
|
+
status='mapped',
|
|
2806
|
+
mapping_type='fill',
|
|
2807
|
+
df_length=len(df)
|
|
2808
|
+
)
|
|
2809
|
+
|
|
2810
|
+
if len(target_field_names) > len(existing_sources):
|
|
2811
|
+
for i in range(len(existing_sources), len(target_field_names)):
|
|
2812
|
+
target_field = target_field_names[i]
|
|
2813
|
+
created_targets.add(target_field)
|
|
2814
|
+
df[target_field] = ''
|
|
2815
|
+
self._log_transformation_stats(
|
|
2816
|
+
stats_data=stats_data,
|
|
2817
|
+
record=record,
|
|
2818
|
+
target_col=target_field,
|
|
2819
|
+
source_col=None,
|
|
2820
|
+
status='source_missing',
|
|
2821
|
+
mapping_type='fill',
|
|
2822
|
+
source_existed=False,
|
|
2823
|
+
df_length=len(df)
|
|
2824
|
+
)
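# A short runnable sketch (illustration only) of the 'fill' semantics above:
# source[i] is copied to target[i] by position, and targets beyond the available
# sources are created empty. Column names are hypothetical.
import pandas as pd

demo = pd.DataFrame({'first_name': ['John'], 'last_name': ['Doe']})
sources, targets = ['first_name', 'last_name'], ['first', 'last', 'middle']
for i in range(min(len(sources), len(targets))):
    demo[targets[i]] = demo[sources[i]]
for i in range(len(sources), len(targets)):  # leftover targets get ''
    demo[targets[i]] = ''
print(demo[['first', 'last', 'middle']])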
|
|
2825
|
+
|
|
2826
|
+
def _apply_one_to_one(
|
|
2827
|
+
self,
|
|
2828
|
+
df: pd.DataFrame,
|
|
2829
|
+
record,
|
|
2830
|
+
existing_sources: List[str],
|
|
2831
|
+
stats_data: List[dict],
|
|
2832
|
+
created_targets: Set[str],
|
|
2833
|
+
) -> None:
|
|
2834
|
+
"""Applies default one-to-one logic: Direct value copy.
|
|
2835
|
+
|
|
2836
|
+
Applied when no explicit logic is provided and the relation type is 'one_to_one'.
|
|
2837
|
+
Maps a single source field to a single target field.
|
|
2838
|
+
|
|
2839
|
+
Args:
|
|
2840
|
+
df (pd.DataFrame): The DataFrame being processed.
|
|
2841
|
+
record: The scenario record.
|
|
2842
|
+
existing_sources (List[str]): List containing the single source field.
|
|
2843
|
+
stats_data (List[dict]): List to append statistics to.
|
|
2844
|
+
created_targets (Set[str]): Set to track created target columns.
|
|
2845
|
+
"""
|
|
2846
|
+
target_field_names = record.target.field_names
|
|
2847
|
+
n_sources = len(existing_sources)
|
|
2848
|
+
n_targets = len(target_field_names)
|
|
2849
|
+
|
|
2850
|
+
if n_sources > 0 and n_targets > 0:
|
|
2851
|
+
source_field = existing_sources[0]
|
|
2852
|
+
target_field = target_field_names[0]
|
|
2853
|
+
created_targets.add(target_field)
|
|
2854
|
+
df[target_field] = df[source_field]
|
|
2855
|
+
self._log_transformation_stats(
|
|
2856
|
+
stats_data=stats_data, record=record, target_col=target_field, source_col=source_field,
|
|
2857
|
+
status='mapped', mapping_type='one_to_one', default_logic='direct_mapping', df_length=len(df)
|
|
2858
|
+
)
|
|
2859
|
+
elif n_targets > 0:
|
|
2860
|
+
target_field = target_field_names[0]
|
|
2861
|
+
created_targets.add(target_field)
|
|
2862
|
+
df[target_field] = ''
|
|
2863
|
+
self._log_transformation_stats(
|
|
2864
|
+
stats_data=stats_data, record=record, target_col=target_field,
|
|
2865
|
+
source_col=record.source.field_names[0] if record.source.field_names else None,
|
|
2866
|
+
status='source_missing', mapping_type='one_to_one', default_logic='direct_mapping',
|
|
2867
|
+
source_existed=False, df_length=len(df)
|
|
2868
|
+
)
|
|
2869
|
+
|
|
2870
|
+
def _apply_one_to_many(
|
|
2871
|
+
self,
|
|
2872
|
+
df: pd.DataFrame,
|
|
2873
|
+
record,
|
|
2874
|
+
existing_sources: List[str],
|
|
2875
|
+
stats_data: List[dict],
|
|
2876
|
+
created_targets: Set[str],
|
|
2877
|
+
) -> None:
|
|
2878
|
+
"""Applies default one-to-many logic: Duplicate source value to all targets.
|
|
2879
|
+
|
|
2880
|
+
Applied when no explicit logic is provided and the relation type is 'one_to_many'.
|
|
2881
|
+
A single source field is mapped to multiple target fields.
|
|
2882
|
+
|
|
2883
|
+
Args:
|
|
2884
|
+
df (pd.DataFrame): The DataFrame being processed.
|
|
2885
|
+
record: The scenario record.
|
|
2886
|
+
existing_sources (List[str]): List containing the single source field.
|
|
2887
|
+
stats_data (List[dict]): List to append statistics to.
|
|
2888
|
+
created_targets (Set[str]): Set to track created target columns.
|
|
2889
|
+
"""
|
|
2890
|
+
target_field_names = record.target.field_names
|
|
2891
|
+
|
|
2892
|
+
if len(existing_sources) > 0:
|
|
2893
|
+
source_field = existing_sources[0]
|
|
2894
|
+
for target_field in target_field_names:
|
|
2895
|
+
created_targets.add(target_field)
|
|
2896
|
+
df[target_field] = df[source_field]
|
|
2897
|
+
self._log_transformation_stats(
|
|
2898
|
+
stats_data=stats_data, record=record, target_col=target_field, source_col=source_field,
|
|
2899
|
+
status='mapped', mapping_type='one_to_many', default_logic='duplicate_to_all_targets', df_length=len(df)
|
|
2900
|
+
)
|
|
2901
|
+
else:
|
|
2902
|
+
for target_field in target_field_names:
|
|
2903
|
+
created_targets.add(target_field)
|
|
2904
|
+
df[target_field] = ''
|
|
2905
|
+
self._log_transformation_stats(
|
|
2906
|
+
stats_data=stats_data, record=record, target_col=target_field,
|
|
2907
|
+
source_col=record.source.field_names[0] if record.source.field_names else None,
|
|
2908
|
+
status='source_missing', mapping_type='one_to_many', default_logic='duplicate_to_all_targets',
|
|
2909
|
+
source_existed=False, df_length=len(df)
|
|
2910
|
+
)
|
|
2911
|
+
|
|
2912
|
+
def _apply_many_to_one(
|
|
2913
|
+
self,
|
|
2914
|
+
df: pd.DataFrame,
|
|
2915
|
+
record,
|
|
2916
|
+
existing_sources: List[str],
|
|
2917
|
+
stats_data: List[dict],
|
|
2918
|
+
created_targets: Set[str],
|
|
2919
|
+
) -> None:
|
|
2920
|
+
"""Applies default many-to-one logic: Concatenate sources with pipe separator.
|
|
2921
|
+
|
|
2922
|
+
Applied when no explicit logic is provided and the relation type is 'many_to_one'.
|
|
2923
|
+
Multiple source fields are mapped to a single target field via concatenation of the source values.
|
|
2924
|
+
|
|
2925
|
+
Args:
|
|
2926
|
+
df (pd.DataFrame): The DataFrame being processed.
|
|
2927
|
+
record: The scenario record.
|
|
2928
|
+
existing_sources (List[str]): List of source fields present in the DataFrame.
|
|
2929
|
+
stats_data (List[dict]): List to append statistics to.
|
|
2930
|
+
created_targets (Set[str]): Set to track created target columns.
|
|
2931
|
+
"""
|
|
2932
|
+
target_field_names = record.target.field_names
|
|
2933
|
+
|
|
2934
|
+
if len(existing_sources) > 0:
|
|
2935
|
+
concatenated = self._concatenate_source_fields(df=df, source_fields=existing_sources)
|
|
2936
|
+
target_field = target_field_names[0]
|
|
2937
|
+
created_targets.add(target_field)
|
|
2938
|
+
df[target_field] = concatenated
|
|
2939
|
+
self._log_transformation_stats(
|
|
2940
|
+
stats_data=stats_data, record=record, target_col=target_field, source_col=existing_sources,
|
|
2941
|
+
status='mapped', mapping_type='many_to_one', default_logic='concatenate_with_pipe', df_length=len(df)
|
|
2942
|
+
)
|
|
2943
|
+
elif len(target_field_names) > 0:
|
|
2944
|
+
target_field = target_field_names[0]
|
|
2945
|
+
created_targets.add(target_field)
|
|
2946
|
+
df[target_field] = ''
|
|
2947
|
+
self._log_transformation_stats(
|
|
2948
|
+
stats_data=stats_data, record=record, target_col=target_field, source_col=record.source.field_names,
|
|
2949
|
+
status='source_missing', mapping_type='many_to_one', default_logic='concatenate_with_pipe',
|
|
2950
|
+
source_existed=False, df_length=len(df)
|
|
2951
|
+
)
|
|
2952
|
+
|
|
2953
|
+
def _apply_many_to_many(
|
|
2954
|
+
self,
|
|
2955
|
+
df: pd.DataFrame,
|
|
2956
|
+
record,
|
|
2957
|
+
existing_sources: List[str],
|
|
2958
|
+
stats_data: List[dict],
|
|
2959
|
+
created_targets: Set[str],
|
|
2960
|
+
kept_sources: Optional[Set[str]] = None
|
|
2961
|
+
) -> None:
|
|
2962
|
+
"""Applies default many-to-many logic: Variable behavior based on field counts.
|
|
2963
|
+
|
|
2964
|
+
Applied when no explicit logic is provided and the relation type is 'many_to_many'.
|
|
2965
|
+
The behavior adapts based on the number of source fields (N) vs target fields (M).
|
|
2966
|
+
|
|
2967
|
+
Defaults for N:M mappings with different cardinalities:
|
|
2968
|
+
- N == M: Direct 1:1 mapping
|
|
2969
|
+
- N < M: Maps available sources 1:1, then fills remaining targets with the last source.
|
|
2970
|
+
- N > M: Concatenates all sources into every target field.
|
|
2971
|
+
|
|
2972
|
+
Args:
|
|
2973
|
+
df (pd.DataFrame): The DataFrame being processed.
|
|
2974
|
+
record: The scenario record.
|
|
2975
|
+
existing_sources (List[str]): List of source fields present in the DataFrame.
|
|
2976
|
+
stats_data (List[dict]): List to append statistics to.
|
|
2977
|
+
created_targets (Set[str]): Set to track created target columns.
|
|
2978
|
+
kept_sources: Optional parameter for interface consistency (unused in this method).
|
|
2979
|
+
"""
|
|
2980
|
+
target_field_names = record.target.field_names
|
|
2981
|
+
n_sources = len(existing_sources)
|
|
2982
|
+
n_targets = len(target_field_names)
|
|
2983
|
+
|
|
2984
|
+
# Equal: 1:1 mapping
|
|
2985
|
+
if n_sources == n_targets:
|
|
2986
|
+
for i in range(n_sources):
|
|
2987
|
+
source_field = existing_sources[i]
|
|
2988
|
+
target_field = target_field_names[i]
|
|
2989
|
+
created_targets.add(target_field)
|
|
2990
|
+
df[target_field] = df[source_field]
|
|
2991
|
+
self._log_transformation_stats(
|
|
2992
|
+
stats_data=stats_data, record=record, target_col=target_field, source_col=source_field,
|
|
2993
|
+
status='mapped', mapping_type='many_to_many_equal', default_logic='direct_1_to_1_mapping', df_length=len(df)
|
|
2994
|
+
)
|
|
2995
|
+
|
|
2996
|
+
# Less sources: Map 1:1 then fill remaining with last source
|
|
2997
|
+
elif n_sources < n_targets:
|
|
2998
|
+
# Map first n
|
|
2999
|
+
for i in range(n_sources):
|
|
3000
|
+
source_field = existing_sources[i]
|
|
3001
|
+
target_field = target_field_names[i]
|
|
3002
|
+
created_targets.add(target_field)
|
|
3003
|
+
df[target_field] = df[source_field]
|
|
3004
|
+
self._log_transformation_stats(
|
|
3005
|
+
stats_data=stats_data, record=record, target_col=target_field, source_col=source_field,
|
|
3006
|
+
status='mapped', mapping_type='many_to_many_n_lt_m', default_logic='map_n_then_fill_remaining', df_length=len(df)
|
|
3007
|
+
)
|
|
3008
|
+
|
|
3009
|
+
# Fill remaining
|
|
3010
|
+
if n_sources > 0:
|
|
3011
|
+
last_source = existing_sources[-1]
|
|
3012
|
+
for i in range(n_sources, n_targets):
|
|
3013
|
+
target_field = target_field_names[i]
|
|
3014
|
+
created_targets.add(target_field)
|
|
3015
|
+
df[target_field] = df[last_source]
|
|
3016
|
+
self._log_transformation_stats(
|
|
3017
|
+
stats_data=stats_data, record=record, target_col=target_field, source_col=last_source,
|
|
3018
|
+
status='mapped', mapping_type='many_to_many_n_lt_m', default_logic='map_n_then_fill_remaining', df_length=len(df)
|
|
3019
|
+
)
|
|
3020
|
+
else: # No sources at all
|
|
3021
|
+
for i in range(n_sources, n_targets):
|
|
3022
|
+
target_field = target_field_names[i]
|
|
3023
|
+
created_targets.add(target_field)
|
|
3024
|
+
df[target_field] = ''
|
|
3025
|
+
self._log_transformation_stats(
|
|
3026
|
+
stats_data=stats_data, record=record, target_col=target_field, source_col=None,
|
|
3027
|
+
status='source_missing', mapping_type='many_to_many_n_lt_m', default_logic='map_n_then_fill_remaining',
|
|
3028
|
+
source_existed=False, df_length=len(df)
|
|
3029
|
+
)
|
|
3030
|
+
|
|
3031
|
+
# More sources: Concatenate all to each target
|
|
3032
|
+
else: # n_sources > n_targets
|
|
3033
|
+
if n_sources > 0:
|
|
3034
|
+
concatenated = self._concatenate_source_fields(df=df, source_fields=existing_sources)
|
|
3035
|
+
for target_field in target_field_names:
|
|
3036
|
+
created_targets.add(target_field)
|
|
3037
|
+
df[target_field] = concatenated
|
|
3038
|
+
self._log_transformation_stats(
|
|
3039
|
+
stats_data=stats_data, record=record, target_col=target_field, source_col=existing_sources,
|
|
3040
|
+
status='mapped', mapping_type='many_to_many_n_gt_m', default_logic='concatenate_all_to_each_target', df_length=len(df)
|
|
3041
|
+
)
|
|
3042
|
+
else:
|
|
3043
|
+
for target_field in target_field_names:
|
|
3044
|
+
created_targets.add(target_field)
|
|
3045
|
+
df[target_field] = ''
|
|
3046
|
+
self._log_transformation_stats(
|
|
3047
|
+
stats_data=stats_data, record=record, target_col=target_field, source_col=record.source.field_names,
|
|
3048
|
+
status='source_missing', mapping_type='many_to_many_n_gt_m', default_logic='concatenate_all_to_each_target',
|
|
3049
|
+
source_existed=False, df_length=len(df)
|
|
3050
|
+
)
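# A compact runnable sketch (illustration only) of the three N:M defaults above:
# equal counts map positionally, fewer sources fill the tail targets with the
# last source, and more sources concatenate everything into each target.
import pandas as pd

def many_to_many_default(df, sources, targets):
    if len(sources) == len(targets):
        for s, t in zip(sources, targets):
            df[t] = df[s]
    elif len(sources) < len(targets):
        for i, s in enumerate(sources):
            df[targets[i]] = df[s]
        for t in targets[len(sources):]:  # tail targets reuse the last source
            df[t] = df[sources[-1]]
    else:                                 # n > m: concat all into each target
        joined = df[sources].astype(str).apply(lambda r: '|'.join(r), axis=1)
        for t in targets:
            df[t] = joined
    return df

print(many_to_many_default(pd.DataFrame({'a': ['1'], 'b': ['2']}), ['a', 'b'], ['x', 'y', 'z']))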
|
|
3051
|
+
|
|
3052
|
+
# ============================================================================
|
|
3053
|
+
# Transformation Helpers
|
|
3054
|
+
# ============================================================================
|
|
3055
|
+
|
|
3056
|
+
def _generate_statistics_dataframe(
|
|
3057
|
+
self,
|
|
3058
|
+
scenario: ParsedScenario,
|
|
3059
|
+
df: pd.DataFrame,
|
|
3060
|
+
stats_data: List[dict],
|
|
3061
|
+
source_fields_to_keep: Set[str]
|
|
3062
|
+
) -> pd.DataFrame:
|
|
3063
|
+
"""Generates the statistics DataFrame, including unmapped source columns.
|
|
3064
|
+
|
|
3065
|
+
Args:
|
|
3066
|
+
scenario: The scenario object.
|
|
3067
|
+
df: The DataFrame being processed.
|
|
3068
|
+
stats_data: List of statistics dictionaries collected so far.
|
|
3069
|
+
source_fields_to_keep: Set of source fields that were explicitly kept.
|
|
3070
|
+
|
|
3071
|
+
Returns:
|
|
3072
|
+
pd.DataFrame: The final statistics DataFrame.
|
|
3073
|
+
"""
|
|
3074
|
+
# Track mapped/unmapped source columns for statistics
|
|
3075
|
+
# Only track unmapped sources that exist in DataFrame and aren't intentionally kept
|
|
3076
|
+
all_scenario_sources = scenario.all_source_fields
|
|
3077
|
+
mapped_sources_from_records = set()
|
|
3078
|
+
for record in scenario.records:
|
|
3079
|
+
mapped_sources_from_records.update(record.source.field_names)
|
|
3080
|
+
|
|
3081
|
+
unmapped_sources_in_df = (all_scenario_sources & set(df.columns)) - mapped_sources_from_records - source_fields_to_keep
|
|
3082
|
+
|
|
3083
|
+
# Log unmapped sources using a dummy record
|
|
3084
|
+
dummy_record = DummyRecord()
|
|
3085
|
+
for unmapped_source in unmapped_sources_in_df:
|
|
3086
|
+
self._log_transformation_stats(
|
|
3087
|
+
stats_data=stats_data,
|
|
3088
|
+
record=dummy_record,
|
|
3089
|
+
target_col=None,
|
|
3090
|
+
source_col=unmapped_source,
|
|
3091
|
+
status='not_in_mapping',
|
|
3092
|
+
mapping_type='unknown',
|
|
3093
|
+
source_existed=True,
|
|
3094
|
+
df_length=len(df)
|
|
3095
|
+
)
|
|
3096
|
+
|
|
3097
|
+
# Build statistics DataFrame
|
|
3098
|
+
if stats_data:
|
|
3099
|
+
return pd.DataFrame(stats_data)
|
|
3100
|
+
|
|
3101
|
+
return pd.DataFrame(columns=[
|
|
3102
|
+
'record_id', 'source_column', 'target_column', 'mapping_status',
|
|
3103
|
+
'source_existed', 'rows_affected', 'mapping_type', 'logic',
|
|
3104
|
+
'relation_type', 'source_count', 'target_count', 'default_logic'
|
|
3105
|
+
])
|
|
3106
|
+
|
|
3107
|
+
def _finalize_dataframe_columns(
|
|
3108
|
+
self,
|
|
3109
|
+
df: pd.DataFrame,
|
|
3110
|
+
scenario: ParsedScenario,
|
|
3111
|
+
drop_unmapped: bool,
|
|
3112
|
+
newly_created_target_fields: Set[str],
|
|
3113
|
+
source_fields_to_keep: Set[str],
|
|
3114
|
+
columns_to_keep: List[str]
|
|
3115
|
+
) -> pd.DataFrame:
|
|
3116
|
+
"""Finalizes the DataFrame by dropping unmapped columns and ensuring expected columns exist.
|
|
3117
|
+
|
|
3118
|
+
Args:
|
|
3119
|
+
df: The DataFrame being processed.
|
|
3120
|
+
scenario: The scenario object.
|
|
3121
|
+
drop_unmapped: Whether to drop unmapped source columns.
|
|
3122
|
+
newly_created_target_fields: Set of target fields created during transformation.
|
|
3123
|
+
source_fields_to_keep: Set of source fields explicitly kept.
|
|
3124
|
+
columns_to_keep: List of additional columns to preserve.
|
|
3125
|
+
|
|
3126
|
+
Returns:
|
|
3127
|
+
pd.DataFrame: The finalized DataFrame.
|
|
3128
|
+
"""
|
|
3129
|
+
# 1. Define protected columns (must not be dropped)
|
|
3130
|
+
protected_columns = {'id'} | newly_created_target_fields | source_fields_to_keep | set(columns_to_keep)
|
|
3131
|
+
|
|
3132
|
+
# 2. Drop mapped source columns if requested
|
|
3133
|
+
if drop_unmapped:
|
|
3134
|
+
mapped_source_columns = set()
|
|
3135
|
+
for record in scenario.records:
|
|
3136
|
+
# Skip value mappings (handled by apply_value_mappings)
|
|
3137
|
+
if record.mapping and hasattr(record.mapping, 'values'):
|
|
3138
|
+
continue
|
|
3139
|
+
|
|
3140
|
+
normalized_logic = self._normalize_logic(record.logic)
|
|
3141
|
+
is_keep_source = "keepsource" in normalized_logic or "onlysource" in normalized_logic
|
|
3142
|
+
|
|
3143
|
+
if not is_keep_source:
|
|
3144
|
+
mapped_source_columns.update(record.source.field_names)
|
|
3145
|
+
|
|
3146
|
+
columns_to_drop = [col for col in mapped_source_columns if col not in protected_columns]
|
|
3147
|
+
df = df.drop(columns=columns_to_drop, errors='ignore')
|
|
3148
|
+
|
|
3149
|
+
# 3. Ensure only expected columns remain and missing expected columns are created
|
|
3150
|
+
all_expected_columns = list(protected_columns) + columns_to_keep
|
|
3151
|
+
|
|
3152
|
+
# Filter to keep only expected columns that exist
|
|
3153
|
+
final_df_columns = [col for col in df.columns if col in all_expected_columns]
|
|
3154
|
+
df = df[final_df_columns].copy()
|
|
3155
|
+
|
|
3156
|
+
# Add missing expected columns with None
|
|
3157
|
+
columns_missing_in_df = [col for col in all_expected_columns if col not in df.columns]
|
|
3158
|
+
for col in columns_missing_in_df:
|
|
3159
|
+
df[col] = None
|
|
3160
|
+
|
|
3161
|
+
return df
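# A self-contained sketch (illustration only) of the finalization rules above:
# protected columns survive, mapped sources are dropped, and expected columns
# still missing afterwards are created as None. 'audit_flag' is a hypothetical
# extra column to keep.
import pandas as pd

demo = pd.DataFrame({'id': [1], 'first_name': ['John'], 'firstname': ['John'], 'debug': ['x']})
protected = {'id', 'firstname', 'audit_flag'}
mapped_sources = {'first_name'}

demo = demo.drop(columns=[c for c in mapped_sources if c not in protected], errors='ignore')
demo = demo[[c for c in demo.columns if c in protected]].copy()
for col in (c for c in protected if c not in demo.columns):
    demo[col] = None
print(sorted(demo.columns))  # ['audit_flag', 'firstname', 'id']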
|
|
3162
|
+
|
|
3163
|
+
def _log_transformation_stats(
|
|
3164
|
+
self,
|
|
3165
|
+
stats_data: List[dict],
|
|
3166
|
+
record,
|
|
3167
|
+
target_col: Optional[str],
|
|
3168
|
+
source_col: Optional[Union[str, List[str]]],
|
|
3169
|
+
status: str,
|
|
3170
|
+
mapping_type: str,
|
|
3171
|
+
default_logic: Optional[str] = None,
|
|
3172
|
+
source_existed: bool = True,
|
|
3173
|
+
df_length: int = 0
|
|
3174
|
+
) -> List[dict]:
|
|
3175
|
+
"""Logs statistics for one field mapping operation.
|
|
3176
|
+
|
|
3177
|
+
Helper method that creates a statistics dictionary and appends it to stats_data.
|
|
3178
|
+
Called multiple times by rename_fields() to build the statistics DataFrame returned to users.
|
|
3179
|
+
|
|
3180
|
+
Args:
|
|
3181
|
+
stats_data (List[dict]): List to append statistics dictionary to.
|
|
3182
|
+
record: Record object containing field metadata (id, logic, relation_type, etc.).
|
|
3183
|
+
target_col (Optional[str]): Target column name, or None if not applicable.
|
|
3184
|
+
source_col (Optional[Union[str, List[str]]]): Source column name(s). Can be single string,
|
|
3185
|
+
list of strings (pipe-separated in output), or None.
|
|
3186
|
+
status (str): Mapping status: 'mapped', 'source_missing', 'kept_source', 'value_mapped'.
|
|
3187
|
+
mapping_type (str): Type of mapping: 'concat', 'fill', 'one_to_one', etc.
|
|
3188
|
+
default_logic (Optional[str]): Description of default logic used if no explicit logic.
|
|
3189
|
+
source_existed (bool): Whether source column(s) existed in DataFrame. Defaults to True.
|
|
3190
|
+
df_length (int): Number of rows in DataFrame (rows affected). Defaults to 0.
|
|
3191
|
+
|
|
3192
|
+
Returns:
|
|
3193
|
+
List[dict]: Updated stats_data list with new statistics dictionary appended.
|
|
3194
|
+
"""
|
|
3195
|
+
# Standardize source_col to string if it's a list/None
|
|
3196
|
+
if isinstance(source_col, list):
|
|
3197
|
+
src_str = '|'.join(source_col) if source_col else None
|
|
3198
|
+
else:
|
|
3199
|
+
src_str = source_col
|
|
3200
|
+
|
|
3201
|
+
stats_data.append({
|
|
3202
|
+
'record_id': record.id,
|
|
3203
|
+
'source_column': src_str,
|
|
3204
|
+
'target_column': target_col,
|
|
3205
|
+
'mapping_status': status,
|
|
3206
|
+
'source_existed': source_existed,
|
|
3207
|
+
'rows_affected': df_length,
|
|
3208
|
+
'mapping_type': mapping_type,
|
|
3209
|
+
'logic': record.logic,
|
|
3210
|
+
'relation_type': record.relation_type,
|
|
3211
|
+
'source_count': len(record.source.field_names),
|
|
3212
|
+
'target_count': len(record.target.field_names),
|
|
3213
|
+
'default_logic': default_logic
|
|
3214
|
+
})
|
|
3215
|
+
return stats_data
|
|
3216
|
+
|
|
3217
|
+
def _apply_value_mapping_logging(
|
|
3218
|
+
self,
|
|
3219
|
+
df: pd.DataFrame,
|
|
3220
|
+
record,
|
|
3221
|
+
stats_data: List[dict],
|
|
3222
|
+
created_targets: Set[str]
|
|
3223
|
+
) -> None:
|
|
3224
|
+
"""Logs statistics for records with explicit value mappings (skipping renaming).
|
|
3225
|
+
|
|
3226
|
+
This helper is applied when a record has defined value mappings (e.g., "M" -> "1").
|
|
3227
|
+
These transformations are complex and handled by `apply_value_mappings()`, not
|
|
3228
|
+
`rename_fields()`. However, `rename_fields()` still needs to log these records to provide
|
|
3229
|
+
a complete report of all scenario operations.
|
|
3230
|
+
|
|
3231
|
+
**Why:**
|
|
3232
|
+
To ensure the statistics DataFrame returned by `rename_fields()` is exhaustive and
|
|
3233
|
+
includes records that were skipped for renaming but will be handled elsewhere. It also
|
|
3234
|
+
initializes the target columns with `None` to ensure structure consistency.
|
|
3235
|
+
|
|
3236
|
+
Args:
|
|
3237
|
+
df (pd.DataFrame): The DataFrame being processed.
|
|
3238
|
+
record: The scenario record containing value mapping definitions.
|
|
3239
|
+
stats_data (List[dict]): List to append the statistics dictionary to.
|
|
3240
|
+
created_targets (Set[str]): Set to track newly created target columns.
|
|
3241
|
+
"""
|
|
3242
|
+
source_field_names = record.source.field_names
|
|
3243
|
+
target_field_names = record.target.field_names
|
|
3244
|
+
for target_field in target_field_names:
|
|
3245
|
+
created_targets.add(target_field)
|
|
3246
|
+
if target_field not in df.columns:
|
|
3247
|
+
df[target_field] = None
|
|
3248
|
+
self._log_transformation_stats(
|
|
3249
|
+
stats_data=stats_data,
|
|
3250
|
+
record=record,
|
|
3251
|
+
target_col=target_field,
|
|
3252
|
+
source_col=source_field_names,
|
|
3253
|
+
status='value_mapped',
|
|
3254
|
+
mapping_type='value_mapping',
|
|
3255
|
+
source_existed=any(s in df.columns for s in source_field_names),
|
|
3256
|
+
df_length=len(df)
|
|
3257
|
+
)
|
|
3258
|
+
|
|
3259
|
+
# ============================================================================
|
|
3260
|
+
# Utility Helpers
|
|
3261
|
+
# ============================================================================
|
|
3262
|
+
|
|
3263
|
+
def _normalize_logic(self, logic: Optional[str]) -> str:
|
|
3264
|
+
"""Normalizes logic string for flexible matching.
|
|
3265
|
+
|
|
3266
|
+
Converts to lowercase and removes spaces/special characters so "Concat", "CONCAT", and "concat"
|
|
3267
|
+
all match the same logic type. Used by rename_fields() to match user-entered logic strings.
|
|
3268
|
+
|
|
3269
|
+
Args:
|
|
3270
|
+
logic (Optional[str]): Original logic string (e.g., "Concat", "fill", "keep source").
|
|
3271
|
+
|
|
3272
|
+
Returns:
|
|
3273
|
+
str: Normalized string (e.g., "concat", "fill", "keepsource"). Empty string if None.
|
|
3274
|
+
"""
|
|
3275
|
+
if not logic:
|
|
3276
|
+
return ""
|
|
3277
|
+
# Lowercase, remove spaces, remove special characters
|
|
3278
|
+
return re.sub(r'[^a-z0-9]', '', logic.lower())
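# A quick runnable check (illustration only) of the normalization above: case,
# spaces, and punctuation are stripped, so variants of the same logic string can
# be matched by substring.
import re

def normalize(logic):
    return re.sub(r'[^a-z0-9]', '', logic.lower()) if logic else ''

assert normalize('Keep Source | parse to from date') == 'keepsourceparsetofromdate'
assert 'keepsource' in normalize('KEEP-SOURCE')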
|
|
3279
|
+
|
|
3280
|
+
def _normalize_value_for_mapping(self, value: str, strategy: str) -> str:
|
|
3281
|
+
"""Normalizes a value according to the specified mapping strategy.
|
|
3282
|
+
|
|
3283
|
+
Used by apply_value_mappings() to normalize both DataFrame source values and
|
|
3284
|
+
mapping source values before comparison, enabling flexible matching strategies.
|
|
3285
|
+
|
|
3286
|
+
Args:
|
|
3287
|
+
value: The value to normalize (e.g., "John Doe", "F", "John|Doe").
|
|
3288
|
+
strategy: Mapping strategy name (exactValMap, ignoreCaseValMap, etc.).
|
|
3289
|
+
|
|
3290
|
+
Returns:
|
|
3291
|
+
Normalized value ready for comparison.
|
|
3292
|
+
"""
|
|
3293
|
+
if not value or pd.isna(value):
|
|
3294
|
+
return str(value) if value is not None else ""
|
|
3295
|
+
|
|
3296
|
+
value_str = str(value).strip()
|
|
3297
|
+
|
|
3298
|
+
if strategy == 'exactValMap':
|
|
3299
|
+
return value_str
|
|
3300
|
+
elif strategy == 'ignoreCaseValMap':
|
|
3301
|
+
return value_str.lower()
|
|
3302
|
+
elif strategy == 'ignoreSpecialValMap':
|
|
3303
|
+
# Remove special chars including spaces
|
|
3304
|
+
return re.sub(r'[^a-zA-Z0-9]', '', value_str)
|
|
3305
|
+
elif strategy == 'ignoreSpacesValMap':
|
|
3306
|
+
# Remove spaces only
|
|
3307
|
+
return value_str.replace(' ', '')
|
|
3308
|
+
elif strategy == 'flexValMap':
|
|
3309
|
+
# Lowercase + remove special chars including spaces
|
|
3310
|
+
return re.sub(r'[^a-z0-9]', '', value_str.lower())
|
|
3311
|
+
else:
|
|
3312
|
+
# Default to exact matching if unknown strategy
|
|
3313
|
+
return value_str
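# A runnable sketch (illustration only) showing how the strategies above treat
# the same raw value; note the special-character strategies also drop non-ASCII
# letters such as the 'ö' below.
import re

v = '  Jöhn Doe  '.strip()
print(v)                                    # exactValMap         -> 'Jöhn Doe'
print(v.lower())                            # ignoreCaseValMap    -> 'jöhn doe'
print(v.replace(' ', ''))                   # ignoreSpacesValMap  -> 'JöhnDoe'
print(re.sub(r'[^a-zA-Z0-9]', '', v))       # ignoreSpecialValMap -> 'JhnDoe'
print(re.sub(r'[^a-z0-9]', '', v.lower()))  # flexValMap          -> 'jhndoe'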
|
|
3314
|
+
|
|
3315
|
+
def _determine_mapping_strategy(self, record_logic: Optional[str], default_how: str) -> str:
|
|
3316
|
+
"""Determines which mapping strategy to use for a record.
|
|
3317
|
+
|
|
3318
|
+
Checks record.logic first (higher priority), then falls back to default_how kwarg.
|
|
3319
|
+
Uses _normalize_logic to match strategy names flexibly. Checks if normalized logic
|
|
3320
|
+
contains any strategy name as a substring (to handle cases where logic contains other text).
|
|
3321
|
+
|
|
3322
|
+
Args:
|
|
3323
|
+
record_logic: The logic string from the record (may contain strategy name).
|
|
3324
|
+
default_how: Default strategy from how kwarg (e.g., 'exactValMap').
|
|
3325
|
+
|
|
3326
|
+
Returns:
|
|
3327
|
+
Strategy name to use (exactValMap, ignoreCaseValMap, etc.).
|
|
3328
|
+
"""
|
|
3329
|
+
if not record_logic:
|
|
3330
|
+
return default_how
|
|
3331
|
+
|
|
3332
|
+
normalized_logic = self._normalize_logic(record_logic)
|
|
3333
|
+
|
|
3334
|
+
# Check if normalized logic contains any mapping strategy as substring
|
|
3335
|
+
# Order matters: check longer/more specific names first to avoid false matches
|
|
3336
|
+
strategies = [
|
|
3337
|
+
('ignorecasevalmap', 'ignoreCaseValMap'),
|
|
3338
|
+
('ignorespecialvalmap', 'ignoreSpecialValMap'),
|
|
3339
|
+
('ignorespacesvalmap', 'ignoreSpacesValMap'),
|
|
3340
|
+
('flexvalmap', 'flexValMap'),
|
|
3341
|
+
('exactvalmap', 'exactValMap')
|
|
3342
|
+
]
|
|
3343
|
+
|
|
3344
|
+
for normalized_strategy, strategy_name in strategies:
|
|
3345
|
+
if normalized_strategy in normalized_logic:
|
|
3346
|
+
return strategy_name
|
|
3347
|
+
|
|
3348
|
+
# No match found, use default
|
|
3349
|
+
return default_how
|
|
3350
|
+
|
|
3351
|
+
def _concatenate_source_fields(
|
|
3352
|
+
self,
|
|
3353
|
+
df: pd.DataFrame,
|
|
3354
|
+
source_fields: List[str]
|
|
3355
|
+
) -> pd.Series:
|
|
3356
|
+
"""Concatenates values from multiple source columns into a single Series with '|' separator.
|
|
3357
|
+
|
|
3358
|
+
Combines the values from multiple columns (not the column names).
|
|
3359
|
+
Example: values from 'first_name' and 'last_name' columns → 'John|Doe'.
|
|
3360
|
+
Returns a Series of values; caller assigns this Series to target column name(s).
|
|
3361
|
+
If only one field provided, returns its values converted to string and stripped (no concatenation).
|
|
3362
|
+
Called by rename_fields() for 'concat' logic and many_to_one/many_to_many default behaviors.
|
|
3363
|
+
|
|
3364
|
+
Args:
|
|
3365
|
+
df (pd.DataFrame): DataFrame containing the source columns.
|
|
3366
|
+
source_fields (List[str]): List of column names whose VALUES will be concatenated (e.g., ['first_name', 'last_name']).
|
|
3367
|
+
|
|
3368
|
+
Returns:
|
|
3369
|
+
pd.Series: Series of concatenated VALUES (no column name). Caller assigns to target column(s).
|
|
3370
|
+
"""
|
|
3371
|
+
if len(source_fields) == 1:
|
|
3372
|
+
return df[source_fields[0]].astype(str).str.strip()
|
|
3373
|
+
else:
|
|
3374
|
+
return df[source_fields].astype(str).apply(
|
|
3375
|
+
lambda row: '|'.join(val.strip() for val in row), axis=1
|
|
3376
|
+
)
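# A short runnable sketch (illustration only) of the row-wise value concatenation above.
import pandas as pd

frame = pd.DataFrame({'first_name': ['John', 'Jane'], 'last_name': ['Doe', 'Smith']})
joined = frame[['first_name', 'last_name']].astype(str).apply(
    lambda row: '|'.join(val.strip() for val in row), axis=1)
print(joined.tolist())  # ['John|Doe', 'Jane|Smith']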
|
|
3377
|
+
|
|
3378
|
+
def _apply_mapping_to_target(
|
|
3379
|
+
self,
|
|
3380
|
+
df: pd.DataFrame,
|
|
3381
|
+
concatenated_source_series: pd.Series,
|
|
3382
|
+
target_field: str,
|
|
3383
|
+
replacements: dict,
|
|
3384
|
+
default_val: Optional[str] = None,
|
|
3385
|
+
original_source_series: Optional[pd.Series] = None
|
|
3386
|
+
) -> pd.DataFrame:
|
|
3387
|
+
"""Applies value mappings to create/populate a target column.
|
|
3388
|
+
|
|
3389
|
+
Transforms source values → target values using lookup dictionary via pandas .map().
|
|
3390
|
+
Unmapped values use default_val if provided, otherwise keep original source value.
|
|
3391
|
+
Always creates target column (uses default if no mappings exist).
|
|
3392
|
+
Called by apply_value_mappings() for each target field in records with value mappings.
|
|
3393
|
+
|
|
3394
|
+
Args:
|
|
3395
|
+
df: DataFrame to modify. Target column added/updated in-place.
|
|
3396
|
+
concatenated_source_series: Source values formatted for lookup (may be normalized for flexible matching).
|
|
3397
|
+
target_field: Name of target column to create/populate.
|
|
3398
|
+
replacements: Mapping dict {normalized_source_value: target_value} (e.g., {"f": "1", "m": "0"}).
|
|
3399
|
+
default_val: Default for unmapped values. If None, keeps original source value.
|
|
3400
|
+
original_source_series: Original (non-normalized) source series for fillna when default_val is None.
|
|
3401
|
+
|
|
3402
|
+
Returns:
|
|
3403
|
+
Modified DataFrame with target column added/updated.
|
|
3404
|
+
|
|
3405
|
+
Example 1: Single source field mapping.
|
|
3406
|
+
>>> # Input DataFrame with source column
|
|
3407
|
+
>>> df = pd.DataFrame({'id': [1, 2, 3], 'gender': ['F', 'M', 'F']})
|
|
3408
|
+
>>> # Create Series from source column (single field)
|
|
3409
|
+
>>> concatenated_source_series = df['gender'].astype(str).str.strip()
|
|
3410
|
+
>>> # Define mapping rules and target column name
|
|
3411
|
+
>>> replacements = {'F': '1', 'M': '0'}
|
|
3412
|
+
>>> target_field = 'gender_code'
|
|
3413
|
+
>>> default_val = None
|
|
3414
|
+
>>> # Apply mapping: Series values lookup in dict keys → return dict values
|
|
3415
|
+
>>> df = _apply_mapping_to_target(df, concatenated_source_series, target_field, replacements, default_val)
|
|
3416
|
+
>>> df
|
|
3417
|
+
id gender gender_code
|
|
3418
|
+
0 1 F 1
|
|
3419
|
+
1 2 M 0
|
|
3420
|
+
2 3 F 1
|
|
3421
|
+
|
|
3422
|
+
Example 2: Concatenated source fields (many_to_one mapping).
|
|
3423
|
+
Scenario mapping: 'first_name'|'last_name' → 'full_name_code'
|
|
3424
|
+
>>> # Input DataFrame with multiple source columns
|
|
3425
|
+
>>> df = pd.DataFrame({'id': [1, 2], 'first_name': ['John', 'Jane'], 'last_name': ['Doe', 'Smith']})
|
|
3426
|
+
>>> # Create Series from multiple source columns (concatenated with '|')
|
|
3427
|
+
>>> concatenated_source_series = df[['first_name', 'last_name']].astype(str).apply(
|
|
3428
|
+
... lambda row: '|'.join(val.strip() for val in row), axis=1)
|
|
3429
|
+
>>> # Define mapping rules (keys match concatenated format) and target column name
|
|
3430
|
+
>>> replacements = {'John|Doe': 'JD001', 'Jane|Smith': 'JS002'}
|
|
3431
|
+
>>> target_field = 'full_name_code'
|
|
3432
|
+
>>> default_val = None
|
|
3433
|
+
>>> # Apply mapping: "John|Doe" → "JD001", "Jane|Smith" → "JS002"
|
|
3434
|
+
>>> df = _apply_mapping_to_target(df, concatenated_source_series, target_field, replacements, default_val)
|
|
3435
|
+
>>> df
|
|
3436
|
+
id first_name last_name full_name_code
|
|
3437
|
+
0 1 John Doe JD001
|
|
3438
|
+
1 2 Jane Smith JS002
|
|
3439
|
+
"""
|
|
3440
|
+
if not replacements:
|
|
3441
|
+
df[target_field] = default_val if default_val else None
|
|
3442
|
+
return df
|
|
3443
|
+
|
|
3444
|
+
mapped_series = concatenated_source_series.map(replacements)
|
|
3445
|
+
|
|
3446
|
+
if default_val:
|
|
3447
|
+
mapped_series = mapped_series.fillna(default_val)
|
|
3448
|
+
else:
|
|
3449
|
+
# Use original source series for fillna to preserve original values (not normalized)
|
|
3450
|
+
fill_series = original_source_series if original_source_series is not None else concatenated_source_series
|
|
3451
|
+
mapped_series = mapped_series.fillna(fill_series)
|
|
3452
|
+
|
|
3453
|
+
df[target_field] = mapped_series
|
|
3454
|
+
return df
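# A runnable sketch (illustration only) of the map/fillna fallback above: values
# found in the lookup are replaced, the rest fall back to the default or, when no
# default is set, to the original source series.
import pandas as pd

source = pd.Series(['F', 'M', 'X'])
mapped = source.map({'F': '1', 'M': '0'})
print(mapped.fillna('unknown').tolist())  # with a default:    ['1', '0', 'unknown']
print(mapped.fillna(source).tolist())     # without a default: ['1', '0', 'X']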
|
|
3455
|
+
|
|
3456
|
+
def _detect_missing_values_in_fields(
|
|
3457
|
+
self,
|
|
3458
|
+
df: pd.DataFrame,
|
|
3459
|
+
source_field_names: List[str]
|
|
3460
|
+
) -> Dict[str, int]:
|
|
3461
|
+
"""Detects missing values in source fields used for value mapping.
|
|
3462
|
+
|
|
3463
|
+
Called by apply_value_mappings() before processing to warn users about missing values
|
|
3464
|
+
that may affect mapping accuracy. Missing values can cause mappings to fail silently
|
|
3465
|
+
or produce unexpected results, so early detection helps users identify data quality issues.
|
|
3466
|
+
|
|
3467
|
+
Args:
|
|
3468
|
+
df: Input DataFrame to check.
|
|
3469
|
+
source_field_names: List of source field names to check for missing values.
|
|
3470
|
+
|
|
3471
|
+
Returns:
|
|
3472
|
+
Dictionary mapping field names to counts of missing values found.
|
|
3473
|
+
"""
|
|
3474
|
+
missing_counts = {}
|
|
3475
|
+
missing_value_patterns = self.MISSING_VALUES
|
|
3476
|
+
|
|
3477
|
+
for field_name in source_field_names:
|
|
3478
|
+
if field_name not in df.columns:
|
|
3479
|
+
continue
|
|
3480
|
+
|
|
3481
|
+
series = df[field_name]
|
|
3482
|
+
|
|
3483
|
+
# Count pd.NA and numpy NaN (true missing values)
|
|
3484
|
+
missing_count = series.isna().sum()
|
|
3485
|
+
|
|
3486
|
+
# Count string representations that indicate missing data (e.g., 'nan', 'None', 'null')
|
|
3487
|
+
# These are checked separately because they're not detected by isna() but still
|
|
3488
|
+
# represent missing/invalid data that should be handled before mapping
|
|
3489
|
+
for pattern in missing_value_patterns:
|
|
3490
|
+
missing_count += (series == pattern).sum()
|
|
3491
|
+
|
|
3492
|
+
if missing_count > 0:
|
|
3493
|
+
missing_counts[field_name] = missing_count
|
|
3494
|
+
|
|
3495
|
+
return missing_counts
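# A self-contained sketch (illustration only) of the detection above: true NaNs
# are counted via isna(), and string sentinels are counted separately. The
# sentinel list is a stand-in for self.MISSING_VALUES.
import pandas as pd

series = pd.Series(['A', None, 'nan', 'None', 'B'])
sentinels = ['nan', 'None', 'null']  # hypothetical MISSING_VALUES
missing = int(series.isna().sum() + sum((series == s).sum() for s in sentinels))
print(missing)  # 3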
|