brynq-sdk-brynq 4.2.6.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


"""
Scenarios SDK for BrynQ.

This module provides the `Scenarios` class for fetching, parsing, and applying
data transformation scenarios from the BrynQ API. It handles field renaming,
value mapping, and structure validation based on configured scenarios.

This module also contains parsed scenario models (ParsedScenario, Record, FieldProperties)
and parsing logic that transforms raw API responses into usable business logic models.
"""
# imports
from __future__ import annotations

import re
import warnings
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime
from functools import cached_property
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union, Literal

import pandas as pd
import pandera as pa
from brynq_sdk_functions import BrynQPanderaDataFrameModel, Functions
from pandera.typing import Series, String  # type: ignore[attr-defined]
from pydantic import BaseModel, ConfigDict, Field
from pydantic.types import AwareDatetime

from .schemas.scenarios import (
    Scenario,
    ScenarioDetail,
    SourceOrTargetField,
    ScenarioMappingConfiguration,
    FieldType,
    SystemType,
    RelationType,
    CustomSourceOrTargetField,
    LibrarySourceOrTargetField,
    ConfigurationSourceOrTargetField,
    LibraryFieldValues,
    MappingValue,
    ConfigurationType,
    ConfigFieldValues,
    Template,
)

# ============================================================================
# Type Aliases for Parsed Models
# ============================================================================
FieldName = str
PythonicName = str
FieldPropertiesMap = Dict[FieldName, "FieldProperties"]
SourceToTargetMap = Dict[FieldName, List[FieldName]]
TargetToSourceMap = Dict[FieldName, Union[FieldName, List[FieldName]]]


# ============================================================================
# Extraction Helpers
# ============================================================================

def _sanitize_alias(alias: str) -> str:
    """Converts a raw string into a valid Python variable name.

    Fixes Python syntax issues (spaces, special characters, leading digits),
    converting names like "User ID" to "user_id" and "1st_Name" to "field_1st_name".
    Used in `_build_field_properties` and `_build_record` to create Python-safe aliases.

    Args:
        alias: The raw string to sanitize.

    Returns:
        A snake_case string safe for use as a class attribute.
    """
    # Replace non-word characters and leading digits with underscores to create a valid Python variable name
    pythonic_name = re.sub(r"\W|^(?=\d)", "_", alias)
    pythonic_name = re.sub(r"_+", "_", pythonic_name).strip("_").lower()
    if not pythonic_name:
        pythonic_name = "field"
    if pythonic_name[0].isdigit():  # stripping underscores above can re-expose a leading digit
        pythonic_name = f"field_{pythonic_name}"
    return pythonic_name

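# Illustrative behavior (hypothetical inputs; a sketch, not exhaustive):
#   _sanitize_alias("User ID")  -> "user_id"         (space replaced, lowercased)
#   _sanitize_alias("1st_Name") -> "field_1st_name"  (re-exposed leading digit prefixed)
#   _sanitize_alias("!!!")      -> "field"           (nothing usable survives sanitizing)
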
def _extract_names_from_fields(fields: SourceOrTargetField) -> List[str]:
    """Extracts a list of field names from a field object, preserving order.

    The API stores names in different places by field type (technical_name for CUSTOM,
    field/uuid for LIBRARY). This provides a single way to get names regardless of structure.
    Used during scenario parsing to build mapping dictionaries. Order is preserved from the API response.

    Args:
        fields: The SourceOrTargetField object to extract names from.

    Returns:
        List of field names (technical_name for CUSTOM, field/uuid for LIBRARY, uuid for CONFIGURATION) in API order.
        Empty list for FIXED/EMPTY fields.
    """
    if isinstance(fields, CustomSourceOrTargetField):
        names: List[str] = []
        seen = set()  # Track seen names to avoid duplicates while preserving order
        for item in fields.data:
            if item.technical_name and item.technical_name not in seen:
                names.append(item.technical_name)
                seen.add(item.technical_name)
        if not names:
            for item in fields.data:
                uuid = getattr(item, "uuid", None)
                if uuid and uuid not in seen:
                    names.append(str(uuid))
                    seen.add(str(uuid))
        return names

    if isinstance(fields, LibrarySourceOrTargetField):
        names: List[str] = []
        seen = set()  # Track seen names to avoid duplicates while preserving order
        for entry in fields.data:
            # Handle different formats the API may return:
            # - as a plain string: the string itself IS the field name/identifier
            # - as a LibraryFieldValues object: the field name is in the 'field' attribute (preferred)
            #   or the 'uuid' attribute (fallback if 'field' is missing)
            if isinstance(entry, str):
                # String entry is the field name itself
                if entry not in seen:
                    names.append(entry)
                    seen.add(entry)
            elif isinstance(entry, LibraryFieldValues):
                if entry.field and entry.field not in seen:
                    names.append(entry.field)
                    seen.add(entry.field)
                elif entry.uuid and entry.uuid not in seen:
                    names.append(str(entry.uuid))
                    seen.add(str(entry.uuid))
        return names

    if isinstance(fields, ConfigurationSourceOrTargetField):
        names: List[str] = []
        seen = set()  # Track seen names to avoid duplicates while preserving order
        for config_item in fields.data:
            # Configuration fields use UUID as identifier
            uuid_str = str(config_item.uuid)
            if uuid_str not in seen:
                names.append(uuid_str)
                seen.add(uuid_str)
        return names

    return []

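# Illustrative sketch (hypothetical payload, simplified to the attributes read above):
# a CUSTOM field whose items carry technical_name "emp_id" twice yields ["emp_id"]
# (duplicates collapsed, API order kept); FIXED/EMPTY fields fall through to [].
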
def _extract_label_from_fields(
    fields: SourceOrTargetField,
    field_name: str
) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Extracts human-readable labels for customer-facing communication.

    CUSTOM fields use 'name' directly; LIBRARY fields have multi-language 'field_label'
    dictionaries. Prioritizes English, then falls back to any available value. Used in
    `_build_field_properties` and `_build_record`.

    Args:
        fields: The SourceOrTargetField object to extract from.
        field_name: The field name to look up.

    Returns:
        Tuple of (preferred label, English label, Dutch label).
    """
    if isinstance(fields, CustomSourceOrTargetField):
        for item in fields.data:
            # Custom fields don't have a multi-language field_label like library fields,
            # so we use 'name' directly as the label (this is what shows up in BrynQ)
            if item.technical_name == field_name or item.uuid == field_name:
                return (item.name, None, None)

    if isinstance(fields, LibrarySourceOrTargetField):
        for entry in fields.data:
            # Handle different formats the API may return:
            # - as a plain string: no label available (skip)
            # - as a LibraryFieldValues object: check field/uuid and extract field_label
            if not isinstance(entry, str) and (entry.field == field_name or entry.uuid == field_name) and entry.field_label:
                if isinstance(entry.field_label, dict):
                    l_en = entry.field_label.get("en")
                    # Return EN if present, else the first available value
                    return (l_en or next(iter(entry.field_label.values()), None), l_en, entry.field_label.get("nl"))

    return (None, None, None)

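# Illustrative fallback (hypothetical entry): a LIBRARY field_label of {"nl": "Voornaam"}
# with no "en" key resolves to ("Voornaam", None, "Voornaam"): English is preferred,
# then whichever language is available.
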
def _extract_uuid_from_fields(fields: SourceOrTargetField, field_name: str) -> Optional[str]:
    """Extracts the UUID from fields for a given field name.

    The API's mappingValues reference fields by UUID. This extracts UUIDs so
    `UuidToFieldNameMapper` can convert UUID-based references to field names.

    Args:
        fields: SourceOrTargetField object to extract from.
        field_name: The field name to look up.

    Returns:
        UUID string if found, None otherwise.
    """
    if isinstance(fields, CustomSourceOrTargetField):
        for item in fields.data:
            if item.technical_name == field_name or item.uuid == field_name:
                return item.uuid

    if isinstance(fields, LibrarySourceOrTargetField):
        for entry in fields.data:
            if not isinstance(entry, str) and (entry.field == field_name or entry.uuid == field_name):
                return entry.uuid
    return None


def _extract_schema_from_fields(fields: SourceOrTargetField, field_name: str) -> Optional[str]:
    """Extracts the schema name identifying the source system or category.

    Used when building field properties to store metadata (not used in transformation logic).

    Args:
        fields: SourceOrTargetField object to extract from.
        field_name: The field name to look up.

    Returns:
        Schema name string if found, None otherwise.
        - For CUSTOM fields: returns CustomDataValues.source
        - For LIBRARY fields: returns category.technicalName
    """
    if isinstance(fields, CustomSourceOrTargetField):
        for item in fields.data:
            if item.technical_name == field_name or item.uuid == field_name:
                return item.source

    if isinstance(fields, LibrarySourceOrTargetField):
        for entry in fields.data:
            if not isinstance(entry, str) and (entry.field == field_name or entry.uuid == field_name):
                return entry.category.get("technicalName") if entry.category else None
    return None

def _extract_technical_name_from_fields(fields: SourceOrTargetField, field_name: str) -> Optional[str]:
    """Extracts the technical_name from fields for a given field name.

    Technical names are system-specific identifiers (often numeric/encoded) that differ from
    human-readable names. Used by `UuidToFieldNameMapper` to convert UUID/schema pattern keys
    to field names, and as a fallback alias in Pandera field definitions. Only CUSTOM fields
    have technical names.

    Args:
        fields: SourceOrTargetField object to extract from.
        field_name: The field name to look up (should be the technical_name for CUSTOM fields).

    Returns:
        Technical name string if found, None otherwise.
        - For CUSTOM fields: returns the technical field ID needed for API calls to the system
          (often not human-readable, e.g., "custom_field_2839471293")
        - For LIBRARY fields: returns None (they use schema names instead, not technical identifiers)
    """
    if isinstance(fields, CustomSourceOrTargetField):
        for item in fields.data:
            # Match by technical_name (primary) or uuid (fallback)
            # field_name should be the technical_name extracted by _extract_names_from_fields
            # Ensure technical_name exists and matches, or fall back to a uuid match
            if item.technical_name and item.technical_name == field_name:
                return item.technical_name
            elif item.uuid == field_name:
                # If matched by uuid, return the technical_name if it exists
                return item.technical_name if item.technical_name else None
    return None

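# Illustrative: for a CUSTOM field this returns the system-specific identifier used
# in API calls (e.g. "custom_field_2839471293"); for a LIBRARY field it returns None,
# since library fields are addressed via schema names instead.
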
def _extract_description_from_fields(fields: SourceOrTargetField, field_name: str) -> Optional[str]:
    """Extracts the description explaining what a field represents.

    Used when building field properties to store metadata for documentation.
    Only CUSTOM fields have descriptions.

    Args:
        fields: SourceOrTargetField object to extract from.
        field_name: The field name to look up.

    Returns:
        Description string for CUSTOM fields, None otherwise.
    """
    if isinstance(fields, CustomSourceOrTargetField):
        for item in fields.data:
            if item.technical_name == field_name or item.uuid == field_name:
                return item.description
    return None


def _extract_config_props(fields: SourceOrTargetField, field_name: str) -> Dict[str, Any]:
    """Extracts configuration field properties (question, type, value).

    Used when building field properties for CONFIGURATION field types.
    Extracts the question (as dict, en, nl, and preferred string), config_type, and config_value.

    Args:
        fields: SourceOrTargetField object to extract from.
        field_name: The field name to look up (UUID for CONFIGURATION fields).

    Returns:
        Dictionary with config properties: question, question_dict, question_en, question_nl, config_type, config_value.
        Returns a dict with all values set to None if not a CONFIGURATION field or not found.
    """
    if isinstance(fields, ConfigurationSourceOrTargetField):
        for config_item in fields.data:
            # Match by UUID (configuration fields use UUID as identifier)
            # Convert UUID to string for comparison
            config_uuid_str = str(config_item.uuid)
            if config_uuid_str == field_name or config_item.uuid == field_name:
                question_dict = config_item.question
                question_en = question_dict.get("en") if question_dict else None
                question_nl = question_dict.get("nl") if question_dict else None
                # Preferred question: English if available, else first available, else None
                question = question_en or (next(iter(question_dict.values()), None) if question_dict else None)

                # Get the config_type value (handle both enum and string)
                config_type_value = config_item.type.value if hasattr(config_item.type, 'value') else str(config_item.type)

                return {
                    "question": question,
                    "question_dict": question_dict,
                    "question_en": question_en,
                    "question_nl": question_nl,
                    "config_type": config_type_value,
                    "config_value": config_item.value,
                }

    # Return a dict with None values for non-configuration fields
    return {
        "question": None,
        "question_dict": None,
        "question_en": None,
        "question_nl": None,
        "config_type": None,
        "config_value": None,
    }

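# Illustrative: callers can unpack the result unconditionally, e.g.
#   cfg = _extract_config_props(fields, name)   # hypothetical call
#   cfg["config_type"]  # None for non-CONFIGURATION fields, never a KeyError
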
def _parse_config_value(config_item: ConfigFieldValues) -> Optional[str]:
    """Convert a ConfigFieldValues object into a normalized string representation."""
    cfg_type = getattr(config_item.type, "value", str(config_item.type))
    value = config_item.value

    # Attachment: explicitly suppressed
    if cfg_type == ConfigurationType.ATTACHMENT.value:
        return None

    # Selection: extract English labels if the payload is a list of dicts
    if cfg_type == ConfigurationType.SELECTION.value:
        if isinstance(value, list):
            labels = [v.get("en", "") for v in value if isinstance(v, dict) and "en" in v]
            return ", ".join(labels) if labels else str(value)
        return str(value)

    # Datepicker: normalize single values or ranges
    if cfg_type == ConfigurationType.DATEPICKER.value:
        def fmt(dt):
            return dt.isoformat() if isinstance(dt, (datetime, AwareDatetime)) else str(dt)

        if isinstance(value, list):
            parts = [fmt(v) for v in value]
            return " - ".join(parts) if parts else None
        return fmt(value) if value is not None else None

    # Simple scalar types: TEXT, EMAIL, NUMBER, RICHTEXT
    if cfg_type in {
        ConfigurationType.TEXT.value,
        ConfigurationType.EMAIL.value,
        ConfigurationType.NUMBER.value,
        ConfigurationType.RICHTEXT.value,
    }:
        return str(value) if value is not None else None

    # Fallback
    return str(value) if value is not None else None

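# Illustrative normalizations (hypothetical values):
#   SELECTION  [{"en": "Full-time"}, {"en": "Part-time"}] -> "Full-time, Part-time"
#   DATEPICKER [datetime(2024, 1, 1), datetime(2024, 1, 31)]
#              -> "2024-01-01T00:00:00 - 2024-01-31T00:00:00"
#   ATTACHMENT anything -> None (suppressed)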

# ============================================================================
# Internal Schema Models (The "Useful" Objects)
# ============================================================================

class SourceTargetFields(BaseModel):
    """Nested structure for source or target field metadata.

    Provides organized access to field information for either source or target fields
    in a scenario. Access via `scenario.source` or `scenario.target`.

    Example:
        >>> scenario.source.field_names
        ['employee_id', 'first_name', 'last_name']
        >>> scenario.source.unique_fields
        ['employee_id']
        >>> scenario.source.field_properties[0].alias
        'employee_id'
        >>> scenario.target.custom_fields
        ['custom_field_1', 'custom_field_2']
        >>> len(scenario.source)
        3
        >>> print(scenario.source)
        SourceTargetFields(type='source', fields=3)

    Attributes:
        type: Either 'source' or 'target' indicating the system type
        field_names: List of all field names for this system type (source or target)
        unique_fields: List of field names that are part of unique constraints
        required_fields: List of field names that are required
        field_properties: List of FieldProperties objects containing full metadata for all fields
        custom_fields: List of field names that are custom fields (field_type='CUSTOM')
        library_fields: List of field names that are library fields (field_type='LIBRARY')
        fields_with_logic: List of field names that have transformation logic defined
    """
    type: Literal["source", "target"]
    field_names: List[str]
    unique_fields: List[str]
    required_fields: List[str]
    field_properties: List[FieldProperties]
    custom_fields: List[str]
    library_fields: List[str]
    fields_with_logic: List[str]

    def __iter__(self) -> Iterator[FieldProperties]:
        """Make SourceTargetFields iterable, yielding FieldProperties objects.

        Allows direct iteration: `for field in scenario.source:`

        Example:
            >>> for field in scenario.source:
            ...     print(f"{field.alias} (required: {field.required})")
            employee_id (required: True)
            first_name (required: False)

        Yields:
            FieldProperties objects for each field
        """
        return iter(self.field_properties)

    def __len__(self) -> int:
        """Return the number of fields.

        Example:
            >>> len(scenario.source)
            3

        Returns:
            Number of field names
        """
        return len(self.field_names)

    def __str__(self) -> str:
        """Return a string representation for print().

        Example:
            >>> print(scenario.source)
            SourceTargetFields(type='source', fields=3)

        Returns:
            String representation showing type and field count
        """
        return f"SourceTargetFields(type={self.type!r}, fields={len(self.field_names)})"

    def __repr__(self) -> str:
        """Return a string representation of SourceTargetFields.

        Example:
            >>> repr(scenario.source)
            "SourceTargetFields(type='source', fields=3)"

        Returns:
            String representation showing type and field count
        """
        return f"SourceTargetFields(type={self.type!r}, fields={len(self.field_names)})"

class FieldProperties(BaseModel):
    """Metadata for a single field in a mapping.

    How to use:
        Access this via `scenario.field_name`. It provides details on
        validation (unique, required) and origins (schema, uuid).

    Example:
        >>> scenario = ParsedScenario(...)
        >>> scenario.customer_id.required
        True
        >>> scenario.customer_id.unique
        False
        >>> scenario['customer_id'].label
        'Customer ID'

    Attributes:
        logic: Transformation logic string as defined in the BrynQ template
        unique: Whether this field is part of the unique key constraint
        required: Whether this field is required (cannot be empty/null)
        mapping: Value mapping dictionary (empty for individual fields; the actual mapping lives at Record level)
        system_type: Indicates whether this is a 'source' or 'target' field
        field_type: Indicates the field origin type: 'CUSTOM' or 'LIBRARY'
        alias: The technical field name/identifier (pythonic name for the field)
        uuid: The UUID identifier used in mapping values
        schema_name: For LIBRARY fields: category.technicalName. For CUSTOM fields: CustomDataValues.source
        technical_name: For CUSTOM fields: CustomDataValues.technical_name. Not populated for LIBRARY fields
        label: Human-readable field name displayed in BrynQ
        label_dict: Multi-language label dictionary (language code -> label)
        label_en: English human-readable field name
        label_nl: Dutch human-readable field name
        description: Business description/purpose of the field (for custom fields)
    """
    model_config = ConfigDict(extra="allow", frozen=True)

    # Core mapping properties, straight from the API
    logic: Optional[str] = None
    unique: bool = False
    required: bool = False
    mapping: Dict[str, Any] = Field(default_factory=dict)

    # Identification
    system_type: Optional[str] = None  # 'source' or 'target'
    field_type: Optional[str] = None  # 'CUSTOM' or 'LIBRARY'
    alias: Optional[str] = None  # Python variable name
    uuid: Optional[str] = None  # API ID

    # Context
    schema_name: Optional[str] = Field(default=None, alias="schema")
    technical_name: Optional[str] = None
    label: Optional[str] = None
    label_dict: Optional[Dict[str, str]] = None
    label_en: Optional[str] = None
    label_nl: Optional[str] = None
    description: Optional[str] = None

    # Config-related optional fields
    question: Optional[str] = None
    question_dict: Optional[Dict[str, str]] = None
    question_en: Optional[str] = None
    question_nl: Optional[str] = None
    config_type: Optional[str] = None
    config_value: Optional[Any] = None

    def __repr__(self) -> str:
        """A human-friendly string representation.

        Example:
            >>> repr(field_props)
            "<FieldProperties alias='customer_id' system_type='source' field_type='CUSTOM'>"

        Returns:
            String representation showing the pythonic field name/alias, system type, and field type
        """
        alias_str = self.alias if self.alias else 'unnamed'
        system_type_str = self.system_type if self.system_type else 'unknown'
        field_type_str = self.field_type if self.field_type else 'unknown'
        return f"<FieldProperties alias='{alias_str}' system_type='{system_type_str}' field_type='{field_type_str}'>"

    def __str__(self) -> str:
        """String representation (used by print()). Delegates to __repr__."""
        return self.__repr__()

class Record(BaseModel):
    """Represents a relationship between source and target fields. A record is the basic unit of a scenario; a scenario is a collection of records.

    How to use:
        Iterate over `scenario.records`. Each record says:
        "Take these source fields, apply this logic/mapping, and put the result in these target fields."

    Example:
        >>> scenario = ParsedScenario(...)
        >>> for record in scenario.records:
        ...     print(f"Source: {record.source.field_names} -> Target: {record.target.field_names}")
        Source: ['first_name'] -> Target: ['firstname']
        >>> record = scenario.records[0]
        >>> for field in record.source:
        ...     print(f"{field.alias} (required: {field.required})")
        first_name (required: True)
        >>> record.source.unique_fields
        ['first_name']
        >>> record.target.required_fields
        ['firstname']

    Attributes:
        logic: Transformation logic string as defined in the BrynQ template
        unique: Whether this mapping is part of the unique key constraint
        required: Whether this mapping is required (cannot be empty/null)
        source: SourceTargetFields object containing source field metadata (field_names, unique_fields, required_fields, field_properties, etc.)
        target: SourceTargetFields object containing target field metadata (field_names, unique_fields, required_fields, field_properties, etc.)
        source_field_types: Maps source field name to its type (CUSTOM, LIBRARY, FIXED, EMPTY)
        target_field_types: Maps target field name to its type (CUSTOM, LIBRARY, FIXED, EMPTY)
        relation_type: Type of mapping relationship: 'one_to_one', 'one_to_many', 'many_to_one', or 'many_to_many'
        mapping: Value mapping configuration for translating source values to target values. Set to False when the mapping has an empty values list
        id: Unique identifier for this mapping record
        fixed_source_value: If the source type is FIXED, this contains the fixed literal value to use for all target fields
    """
    model_config = ConfigDict(extra="allow", frozen=True)

    # Inherited properties applied to the whole group
    logic: Optional[str] = None
    unique: bool = False
    required: bool = False
    mapping: Union["ScenarioMappingConfiguration", bool, None] = None
    id: Optional[str] = None
    fixed_source_value: Optional[str] = None

    # The fields involved in this relationship
    source: SourceTargetFields
    target: SourceTargetFields
    source_field_types: Dict[str, str] = Field(default_factory=dict)
    target_field_types: Dict[str, str] = Field(default_factory=dict)

    # Inferred
    relation_type: Literal["one_to_one", "one_to_many", "many_to_one", "many_to_many"]

    # Record dunders
    def __iter__(self):
        """Enable iteration over all fields (both source and target).

        Uses the `source` and `target` attributes internally.

        Example:
            >>> for field in record:
            ...     print(field.label)
            First Name
            Last Name
            >>> list(record)
            [FieldProperties(...), FieldProperties(...)]
        """
        return iter(list(self.source.field_properties) + list(self.target.field_properties))

    def __repr__(self) -> str:
        """A human-friendly string representation.

        Example:
            >>> repr(record)
            "<Record id='rec_123' relation_type='one_to_one' source=[<FieldProperties alias='first_name'>, ...] -> target=[<FieldProperties alias='firstname'>, ...]>"

        Returns:
            String representation of the Record
        """
        # Build the source field representation using FieldProperties
        source_repr = [repr(field) for field in self.source.field_properties]
        source_str = f"[{', '.join(source_repr)}]" if source_repr else "[]"

        # Build the target field representation using FieldProperties
        target_repr = [repr(field) for field in self.target.field_properties]
        target_str = f"[{', '.join(target_repr)}]" if target_repr else "[]"

        # Build the representation string
        id_str = f"id='{self.id}' " if self.id else ""
        return (
            f"<Record {id_str}relation_type='{self.relation_type}' "
            f"source={source_str} -> target={target_str}>"
        )

    def __str__(self) -> str:
        """String representation (used by print()). Delegates to __repr__."""
        return self.__repr__()

# ============================================================================
# Parsing Logic (The Engine)
# ============================================================================

@dataclass
class UuidToFieldNameConverter:
    """Bundles all data needed to convert value mapping keys from UUIDs/schema patterns to field names.

    The API returns value mappings where BOTH input and output dictionaries use field identifier
    keys (UUIDs like "ea06ce9f-e10e-484e-bdf0-ec58087f15c5" or schema.name patterns like "work_schema-title").
    We MUST convert these identifier keys to readable field names (like {"title": "CEO"}) because
    the rest of the code expects field names, not UUIDs or schema patterns. This dataclass groups
    all the lookup data needed for that conversion, avoiding passing 5+ separate arguments.

    Created in ScenarioParser.parse() and passed to UuidToFieldNameMapper.__init__().

    Attributes:
        uuid_keyed_value_mappings: The value mappings that currently use field identifier keys (UUIDs/schema patterns)
            and need conversion to field names. Both input and output dictionaries have identifier keys.
        source_names: List of source field names (used to resolve UUIDs and validate keys), preserving API order.
        target_names: List of target field names (used to resolve UUIDs and validate keys), preserving API order.
        props: Dictionary mapping field names to FieldProperties (contains UUID-to-name lookups).
        detail_model: The scenario detail model with source/target field definitions.
    """
    uuid_keyed_value_mappings: Optional[ScenarioMappingConfiguration]
    source_names: List[str]
    target_names: List[str]
    props: FieldPropertiesMap
    detail_model: ScenarioDetail

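# Illustrative: one converter is built per scenario detail inside ScenarioParser.parse(),
# bundling the five lookups that UuidToFieldNameMapper would otherwise have to accept
# as separate constructor arguments.
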
class UuidToFieldNameMapper:
    """Converts value mapping keys from UUIDs/schema patterns to readable field names.

    The API returns value mappings where BOTH input and output dictionaries use field identifier
    keys (UUIDs like "ea06ce9f-e10e-484e-bdf0-ec58087f15c5" or schema.name patterns like "work_schema-title").
    This class converts those identifier keys to field names (like {"title": "CEO"}) because
    the rest of the codebase expects field names, not UUIDs or schema patterns. Uses multiple
    lookup strategies to handle API inconsistencies.
    """

    def __init__(self, uuid_converter: UuidToFieldNameConverter):
        """Initialize the mapper with all data needed to convert UUID/schema pattern keys to field names.

        Args:
            uuid_converter: Contains UUID-keyed value mappings, field names, properties, and the detail model.
                Created in ScenarioParser.parse() and provides all lookup data needed to convert
                field identifier keys (UUIDs like "ea06ce9f..." or schema patterns like "work_schema-title")
                to readable field names (like "title"). Used to convert keys in BOTH input and output dictionaries.
        """
        # Store all data needed to convert UUID/schema pattern keys in value mappings to field names
        self.uuid_converter = uuid_converter
        self.source_uuid_to_field: Dict[str, str] = {}
        self.target_uuid_to_field: Dict[str, str] = {}
        self.source_technical_to_pythonic: Dict[str, str] = {}
        self.target_technical_to_pythonic: Dict[str, str] = {}
        self._build_mappings()

    def _build_mappings(self) -> None:
        """Builds the lookup dictionaries needed for translation.

        Strategies:
            1. Technical Names -> Python Aliases (for CUSTOM fields).
            2. UUIDs -> Python Aliases (for all fields, using props as the source of truth).
        """
        # Strategy 1: Map Technical Names -> Python Aliases
        self._map_technical_names(
            model=self.uuid_converter.detail_model.source,
            names=self.uuid_converter.source_names,
            mapping=self.source_technical_to_pythonic,
            system_type=SystemType.SOURCE
        )
        self._map_technical_names(
            model=self.uuid_converter.detail_model.target,
            names=self.uuid_converter.target_names,
            mapping=self.target_technical_to_pythonic,
            system_type=SystemType.TARGET
        )

        # Strategy 2: Map UUIDs -> Python Aliases
        self._map_uuids(
            names=self.uuid_converter.source_names,
            tech_map=self.source_technical_to_pythonic,
            uuid_map=self.source_uuid_to_field
        )
        self._map_uuids(
            names=self.uuid_converter.target_names,
            tech_map=self.target_technical_to_pythonic,
            uuid_map=self.target_uuid_to_field
        )

    def _map_technical_names(
        self,
        model: SourceOrTargetField,
        names: List[str],
        mapping: Dict[str, str],
        system_type: SystemType
    ) -> None:
        """Maps technical names to Python aliases for custom fields."""
        if not isinstance(model, CustomSourceOrTargetField):
            return

        names_set = set(names)  # Convert to set for fast lookup
        for item in model.data:
            if item.technical_name not in names_set:
                continue

            # Find the matching pythonic name in props via UUID
            for py_name, props in self.uuid_converter.props.items():
                if props.system_type == system_type.value and props.uuid == item.uuid:
                    mapping[item.technical_name] = py_name
                    break

    def _map_uuids(
        self,
        names: List[str],
        tech_map: Dict[str, str],
        uuid_map: Dict[str, str]
    ) -> None:
        """Maps UUIDs to Python aliases using props."""
        for name in names:
            py_name = tech_map.get(name, name)
            props = self.uuid_converter.props.get(py_name)
            if props and props.uuid:
                uuid_map[props.uuid] = py_name

    def convert_key(self, key: str, direction: Literal["source", "target"]) -> str:
        """Converts a single API mapping key to a pythonic field name.

        This helper method handles API inconsistency by trying multiple fallback strategies:
        1. UUID lookup (most reliable: direct match)
        2. Name lookup (handles technical names and pythonic names)
        3. Pattern matching (handles schema.name or schema-name patterns)

        Uses internal lookup maps (`source_uuid_to_field`, etc.) populated during initialization.

        Example:
            >>> mapper.convert_key('be3a4c1e...', 'source')
            'gender'

        Args:
            key: The raw key from the API (could be a UUID, a name, or a schema.name pattern).
            direction: 'source' or 'target'.

        Returns:
            The best matching pythonic field name.
        """
        if direction == "source":
            uuid_map = self.source_uuid_to_field
            tech_map = self.source_technical_to_pythonic
            valid_names = self.uuid_converter.source_names
        else:
            uuid_map = self.target_uuid_to_field
            tech_map = self.target_technical_to_pythonic
            valid_names = self.uuid_converter.target_names

        # Strategy 1: Direct UUID lookup (most reliable)
        if key in uuid_map:
            return uuid_map[key]

        # Strategy 2: Direct name lookup
        if key in valid_names:
            return tech_map.get(key, key)
        if key in tech_map.values():
            return key

        # Strategy 3: Pattern matching (heuristic)
        # Handles keys like 'schema_name.email' by checking suffixes
        all_names = set(tech_map.values()) | set(valid_names)
        for fname in all_names:
            if key.endswith(f'.{fname}') or key.endswith(f'-{fname}'):
                return tech_map.get(fname, fname)

        # Fallback: return the original key
        return key

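    # Illustrative fallback chain (hypothetical key): "work_schema-title" misses the
    # UUID map (Strategy 1) and the name lists (Strategy 2), but the "-title" suffix
    # matches in Strategy 3 and resolves to the pythonic field name "title".
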
    def convert_mapping_config(self) -> Optional[ScenarioMappingConfiguration]:
        """Converts value mapping keys from field identifiers to field names.

        The API returns value mappings where BOTH input and output dictionaries use field identifier
        keys (UUIDs like "ea06ce9f-e10e-484e-bdf0-ec58087f15c5" or schema.name patterns like "work_schema-title").
        This method converts all identifier keys to readable field names (like {"title": "CEO"})
        because the rest of the codebase expects field names, not UUIDs or schema patterns.

        Example:
            >>> converted = mapper.convert_mapping_config()
            >>> converted.values[0].input
            {'title': 'CEO'}  # Field identifier key converted to field name
            >>> converted.values[0].output
            {'job_code': '96'}  # UUID key converted to field name

        Returns:
            ScenarioMappingConfiguration with field name keys (not UUIDs or schema patterns),
            or None if no mapping config exists.
        """
        if not self.uuid_converter.uuid_keyed_value_mappings or not self.uuid_converter.uuid_keyed_value_mappings.values:
            return self.uuid_converter.uuid_keyed_value_mappings

        # Convert UUID/schema pattern keys to field names in each value mapping
        converted_values = []
        for val in self.uuid_converter.uuid_keyed_value_mappings.values:
            # Convert source field identifier keys (UUIDs/schema patterns) to field names
            new_in = {
                self.convert_key(key=k, direction="source"): v
                for k, v in val.input.items()
            }
            # Convert target field identifier keys (UUIDs/schema patterns) to field names
            new_out = {
                self.convert_key(key=k, direction="target"): v
                for k, v in val.output.items()
            }
            converted_values.append(MappingValue(input=new_in, output=new_out))

        return ScenarioMappingConfiguration(
            values=converted_values,
            default_value=self.uuid_converter.uuid_keyed_value_mappings.default_value
        )


class ScenarioParser:
    """Orchestrates the parsing of a raw scenario dictionary.

    This class breaks the logic into three distinct phases:
    1. Extraction: Get raw names from the polymorphic API response.
    2. Property Building: Create metadata objects (`FieldProperties`) for every field.
    3. Linking: Create `Record` objects that link sources to targets.
    """

    def __init__(self):
        """Initialize the parser."""
        pass

    def parse(self, scenario: Dict[str, Any]) -> "ParsedScenario":
        """Parse a raw API scenario dictionary into a ParsedScenario object.

        Args:
            scenario: Raw scenario dictionary from the BrynQ API

        Returns:
            ParsedScenario object with all parsed data
        """
        details = scenario.get("details", [])

        # Accumulators
        source_to_target = defaultdict(set)
        target_to_source = defaultdict(set)
        props: FieldPropertiesMap = {}
        value_mappings = defaultdict(list)
        aliases = set()
        alias_order = []

        records = []

        # 'details' is the raw API name for what this module calls 'records'.
        for detail in details:
            detail_model = ScenarioDetail.model_validate(detail)

            # Phase 1: extract names
            source_names = _extract_names_from_fields(detail_model.source)
            target_names = _extract_names_from_fields(detail_model.target)

            for source_name in source_names:
                source_to_target[source_name].update(target_names)
            for target_name in target_names:
                target_to_source[target_name].update(source_names)

            # Phase 2: Property Building
            # The same method handles source and target to avoid code duplication; only the system type differs

            # Identify reserved keys from the target (library fields) to avoid collisions with source custom fields
            reserved_keys = set()
            if detail_model.target.type == FieldType.LIBRARY.value:
                reserved_keys = set(target_names)  # Convert list to set for fast lookup

            base_props = FieldProperties.model_validate(detail)
            self._build_field_properties(
                fields=detail_model.source,
                names=source_names,
                sys_type=SystemType.SOURCE,
                base=base_props,
                props=props,
                aliases=aliases,
                alias_order=alias_order,
                reserved=reserved_keys
            )
            self._build_field_properties(
                fields=detail_model.target,
                names=target_names,
                sys_type=SystemType.TARGET,
                base=base_props,
                props=props,
                aliases=aliases,
                alias_order=alias_order
            )

            # Phase 3: Linking & Mapping Conversion
            # Convert value mapping keys from UUIDs/schema patterns to field names (API uses UUIDs/schema patterns, code expects field names)
            uuid_converter = UuidToFieldNameConverter(
                uuid_keyed_value_mappings=detail_model.mapping,
                source_names=source_names,
                target_names=target_names,
                props=props,
                detail_model=detail_model
            )
            converted_map = UuidToFieldNameMapper(uuid_converter).convert_mapping_config()

            if converted_map:
                # If values exist, store them in the lookup map
                if converted_map.values:
                    # Preserve order from the API, but sort for consistent key generation
                    key = '|'.join(sorted(source_names)) if source_names else detail_model.id
                    value_mappings[key].append(converted_map)
                # If the map exists but is empty, treat it as False
                else:
                    converted_map = False

            records.append(
                self._build_record(
                    detail=detail_model,
                    source_names=source_names,
                    target_names=target_names,
                    base=base_props,
                    props=props,
                    mapping_cfg=converted_map
                )
            )

        # Final Phase: Assembly
        return self._build_parsed_scenario(
            raw=scenario,
            records=records,
            source_to_target_map=source_to_target,
            target_to_source_map=target_to_source,
            props=props,
            source_to_value_mappings=value_mappings
        )

990
+
991
+ def _build_field_properties(
992
+ self,
993
+ fields: SourceOrTargetField,
994
+ names: List[str],
995
+ sys_type: SystemType,
996
+ base: FieldProperties,
997
+ props: FieldPropertiesMap,
998
+ aliases: Set[str],
999
+ alias_order: List[str],
1000
+ reserved: Optional[Set[str]] = None
1001
+ ) -> None:
1002
+ """Creates FieldProperties for a set of fields and registers them.
1003
+
1004
+ Args:
1005
+ fields: SourceOrTargetField object containing field definitions
1006
+ names: Set of field names to process
1007
+ sys_type: Either SystemType.SOURCE or SystemType.TARGET
1008
+ base: Base FieldProperties shared across fields in this mapping
1009
+ props: Dictionary to store field properties (modified in place)
1010
+ aliases: Set to track custom field aliases (modified in place)
1011
+ alias_order: List to maintain custom alias order (modified in place)
1012
+ reserved: Set of reserved keys to avoid collisions (e.g. target library names)
1013
+ """
1014
+ for name in names:
1015
+ label, l_en, l_nl = _extract_label_from_fields(fields, name) #only returned for library/custom
1016
+
1017
+ # Determine Python Alias
1018
+ f_type_str = fields.type.value if isinstance(fields.type, FieldType) else fields.type
1019
+ is_custom = (f_type_str == FieldType.CUSTOM.value)
1020
+
1021
+ # Only sanitize custom fields; libraries use fixed keys
1022
+ alias = _sanitize_alias(label or name) if is_custom else name
1023
+ key = alias if is_custom else name
1024
+
1025
+ # Handle collisions for Custom fields if key is reserved (e.g. used by Target Library field)
1026
+ if is_custom and reserved and key in reserved:
1027
+ alias = f"{alias}_{sys_type.value}"
1028
+ key = alias
1029
+
1030
+ config_props = _extract_config_props(fields, name)
1031
+
1032
+ props[key] = base.model_copy(update={
1033
+ "system_type": sys_type.value,
1034
+ "field_type": f_type_str,
1035
+ "alias": alias,
1036
+ "uuid": _extract_uuid_from_fields(fields, name),
1037
+ "schema_name": _extract_schema_from_fields(fields, name),
1038
+ "technical_name": _extract_technical_name_from_fields(fields, name),
1039
+ "label": label,
1040
+ "label_en": l_en,
1041
+ "label_nl": l_nl,
1042
+ "description": _extract_description_from_fields(fields, name),
1043
+ "mapping": {}, # Mappings are stored at Record level, not Field level
1044
+ #config fields
1045
+ **config_props
1046
+ })
1047
+
1048
+ if is_custom and key not in aliases:
1049
+ aliases.add(key)
1050
+ alias_order.append(key)
1051
+
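    # Illustrative collision handling: a CUSTOM source field whose label sanitizes to
    # "email" while "email" is already a reserved target LIBRARY key is registered
    # under "email_source" instead.
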
    def _build_record(
        self,
        detail: ScenarioDetail,
        source_names: List[str],
        target_names: List[str],
        base: FieldProperties,
        props: FieldPropertiesMap,
        mapping_cfg: Union[ScenarioMappingConfiguration, bool, None]
    ) -> Record:
        """Creates a Record object representing the relationship.

        Args:
            detail: Validated ScenarioDetail object
            source_names: List of source field names (preserving API order)
            target_names: List of target field names (preserving API order)
            base: Base FieldProperties for this mapping
            props: Dictionary of field properties
            mapping_cfg: Converted mapping configuration

        Returns:
            Record object representing this mapping
        """
        # Helper to retrieve the correct prop keys, preserving order
        def _get_keys(names, field_obj, sys_type: SystemType):
            keys = []
            is_custom = (field_obj.type == FieldType.CUSTOM.value)
            for n in names:  # Iterate in order
                if is_custom:
                    # For custom fields, look up the actual key from props
                    uuid = _extract_uuid_from_fields(field_obj, n)
                    technical_name = _extract_technical_name_from_fields(field_obj, n)

                    # First try: direct lookup by the sanitized alias
                    lbl, _, _ = _extract_label_from_fields(field_obj, n)
                    sanitized_alias = _sanitize_alias(lbl or n)
                    if sanitized_alias in props:
                        prop = props[sanitized_alias]
                        if prop.system_type == sys_type.value and prop.field_type == FieldType.CUSTOM.value:
                            # Verify it's the same field by UUID or technical_name
                            if (uuid and prop.uuid == uuid) or (technical_name and prop.technical_name == technical_name):
                                keys.append(sanitized_alias)
                                continue

                    # Second try: find a matching key in props by UUID or technical_name
                    found_key = None
                    for key, prop in props.items():
                        if prop.system_type == sys_type.value and prop.field_type == FieldType.CUSTOM.value:
                            if (uuid and prop.uuid == uuid) or (technical_name and prop.technical_name == technical_name):
                                found_key = key
                                break

                    if found_key:
                        keys.append(found_key)
                    else:
                        # Fallback: use the sanitized alias (shouldn't happen if props were built correctly)
                        keys.append(sanitized_alias)
                else:
                    # For library/configuration fields, the name itself is the key
                    keys.append(n)
            return keys

        source_keys = _get_keys(source_names, detail.source, SystemType.SOURCE)
        target_keys = _get_keys(target_names, detail.target, SystemType.TARGET)

        # Determine cardinality
        rel = RelationType.ONE_TO_ONE.value
        if len(source_names) > 1 and len(target_names) > 1:
            rel = RelationType.MANY_TO_MANY.value
        elif len(source_names) > 1:
            rel = RelationType.MANY_TO_ONE.value
        elif len(target_names) > 1:
            rel = RelationType.ONE_TO_MANY.value

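        # e.g. two source fields feeding one target is "many_to_one"; one source
        # fanning out to two targets is "one_to_many".
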
        # Extract fixed_source_value based on the source type
        fixed_source_value = None
        if detail.source.type == "FIXED":
            # For FIXED type, use the data directly (it's a string)
            fixed_source_value = detail.source.data
        elif detail.source.type == "CONFIGURATION":
            # For CONFIGURATION type, parse the config value according to its type
            if isinstance(detail.source, ConfigurationSourceOrTargetField) and detail.source.data:
                # Get the first config item (for one_to_one/one_to_many, there's typically one)
                config_item = detail.source.data[0]
                fixed_source_value = _parse_config_value(config_item)

        # Build FieldProperties lists
        source_field_props = [props[k] for k in source_keys if k in props]
        target_field_props = [props[k] for k in target_keys if k in props]

        # Build SourceTargetFields instances
        source_unique_fields = [k for k in source_keys if k in props and props[k].unique]
        source_required_fields = [k for k in source_keys if k in props and props[k].required]
        source_custom_fields = [k for k in source_keys if k in props and props[k].field_type == FieldType.CUSTOM.value]
        source_library_fields = [k for k in source_keys if k in props and props[k].field_type == FieldType.LIBRARY.value]
        source_fields_with_logic = [k for k in source_keys if k in props and props[k].logic is not None]

        target_unique_fields = [k for k in target_keys if k in props and props[k].unique]
        target_required_fields = [k for k in target_keys if k in props and props[k].required]
        target_custom_fields = [k for k in target_keys if k in props and props[k].field_type == FieldType.CUSTOM.value]
        target_library_fields = [k for k in target_keys if k in props and props[k].field_type == FieldType.LIBRARY.value]
        target_fields_with_logic = [k for k in target_keys if k in props and props[k].logic is not None]

        source_stf = SourceTargetFields(
            type="source",
            field_names=source_keys,
            unique_fields=source_unique_fields,
            required_fields=source_required_fields,
            field_properties=source_field_props,
            custom_fields=source_custom_fields,
            library_fields=source_library_fields,
            fields_with_logic=source_fields_with_logic
        )

        target_stf = SourceTargetFields(
            type="target",
            field_names=target_keys,
            unique_fields=target_unique_fields,
            required_fields=target_required_fields,
            field_properties=target_field_props,
            custom_fields=target_custom_fields,
            library_fields=target_library_fields,
            fields_with_logic=target_fields_with_logic
        )

        return Record(
            logic=base.logic,
            unique=base.unique,
            required=base.required,
            source_field_types={k: detail.source.type for k in source_keys},
            target_field_types={k: detail.target.type for k in target_keys},
            source=source_stf,
            target=target_stf,
            relation_type=rel,
            mapping=mapping_cfg,
            id=detail.id,
            fixed_source_value=fixed_source_value
        )

    def _build_parsed_scenario(
        self,
        raw,
        records,
        source_to_target_map,
        target_to_source_map,
        props,
        source_to_value_mappings
    ):
        """Constructs the final immutable ParsedScenario object.

        Args:
            raw: Original scenario dictionary
            records: List of Record objects
            source_to_target_map: Source to target mapping dictionary
            target_to_source_map: Target to source mapping dictionary
            props: Field properties dictionary
            source_to_value_mappings: Source field to value mappings dictionary

        Returns:
            ParsedScenario object
        """
        # Sort maps for deterministic behavior
        s_to_t = {k: sorted(v) for k, v in source_to_target_map.items()}
        t_to_s = {k: sorted(v) for k, v in target_to_source_map.items()}

        # Only include custom fields that are source fields (based on system_type)
        custom_fields = {k: v for k, v in props.items()
                         if v.field_type == FieldType.CUSTOM.value
                         and v.system_type == SystemType.SOURCE.value}
        custom_model = ParsedScenario._build_custom_field_model(custom_fields) if custom_fields else None

        # Build unique and required fields (all fields, regardless of source/target)
        unique_fields = [name for name, p in props.items() if p.unique]
        required_fields = [name for name, p in props.items() if p.required]

        # Build source and target unique/required fields separately
        source_field_names = [k for k, v in props.items() if v.system_type == SystemType.SOURCE.value]
        source_unique_fields = [k for k, v in props.items() if v.unique and v.system_type == SystemType.SOURCE.value]
        source_required_fields = [k for k, v in props.items() if v.required and v.system_type == SystemType.SOURCE.value]
        source_field_properties = [v for k, v in props.items() if v.system_type == SystemType.SOURCE.value]
        source_custom_fields = [k for k, v in props.items() if v.system_type == SystemType.SOURCE.value and v.field_type == FieldType.CUSTOM.value]
        source_library_fields = [k for k, v in props.items() if v.system_type == SystemType.SOURCE.value and v.field_type == FieldType.LIBRARY.value]
        source_fields_with_logic = [k for k, v in props.items() if v.system_type == SystemType.SOURCE.value and v.logic is not None]

        target_field_names = [k for k, v in props.items() if v.system_type == SystemType.TARGET.value]
        target_unique_fields = [k for k, v in props.items() if v.unique and v.system_type == SystemType.TARGET.value]
        target_required_fields = [k for k, v in props.items() if v.required and v.system_type == SystemType.TARGET.value]
        target_field_properties = [v for k, v in props.items() if v.system_type == SystemType.TARGET.value]
        target_custom_fields = [k for k, v in props.items() if v.system_type == SystemType.TARGET.value and v.field_type == FieldType.CUSTOM.value]
        target_library_fields = [k for k, v in props.items() if v.system_type == SystemType.TARGET.value and v.field_type == FieldType.LIBRARY.value]
        target_fields_with_logic = [k for k, v in props.items() if v.system_type == SystemType.TARGET.value and v.logic is not None]

        # Build nested structures
        source = SourceTargetFields(
            type="source",
            field_names=source_field_names,
            unique_fields=source_unique_fields,
            required_fields=source_required_fields,
            field_properties=source_field_properties,
            custom_fields=source_custom_fields,
            library_fields=source_library_fields,
            fields_with_logic=source_fields_with_logic
        )
        target = SourceTargetFields(
            type="target",
            field_names=target_field_names,
            unique_fields=target_unique_fields,
            required_fields=target_required_fields,
            field_properties=target_field_properties,
            custom_fields=target_custom_fields,
            library_fields=target_library_fields,
            fields_with_logic=target_fields_with_logic
        )

        all_source_fields = set(source_to_target_map.keys())
        all_target_fields = set(target_to_source_map.keys())

        # Collect target fields from records where logic contains 'ignoreCompare'
        target_fields_to_ignore_in_compare = set()
        for record in records:
            if record.logic and 'ignoreCompare' in record.logic:
                target_fields_to_ignore_in_compare.update(record.target.field_names)

        return ParsedScenario(
            name=raw.get("name", "Unnamed"),
            id=raw.get("id", ""),
            records_count=len(raw.get("details", [])),
            description=raw.get("description", ""),
            records=records,
            source_to_target_map=s_to_t,
            target_to_source_map=t_to_s,
            field_properties=props,
            source=source,
            target=target,
            unique_fields=unique_fields,
            required_fields=required_fields,
            custom_fields=custom_fields,
            custom_fields_model=custom_model,
            all_source_fields=all_source_fields,
            all_target_fields=all_target_fields,
            source_to_value_mappings=dict(source_to_value_mappings),
            target_fields_to_ignore_in_compare=target_fields_to_ignore_in_compare
        )


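# Illustrative end-to-end usage (hypothetical payload; a sketch, not a guaranteed API):
#   raw = {"id": "sc_1", "name": "Personal Information", "details": [...]}
#   parsed = ScenarioParser().parse(raw)   # or ParsedScenario.from_api_dict(raw)
#   parsed.source.field_names              # ordered source field names
#   parsed.records[0].relation_type        # e.g. "one_to_one"
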
1296
+ class ParsedScenario(BaseModel):
1297
+ """The final, usable representation of a Scenario.
1298
+
1299
+ This object is what users interact with. It contains all records,
1300
+ lookups, and property maps needed to perform data validation and transformation.
1301
+
1302
+ Example:
1303
+ >>> scenario = ParsedScenario(...)
1304
+ >>> scenario.name
1305
+ 'Personal Information'
1306
+ >>> scenario.all_source_fields
1307
+ {'first_name', 'last_name', 'email'}
1308
+ >>> scenario.has_field('email', field_type='source')
1309
+ True
1310
+ >>> scenario.get_mapped_field_names('first_name')
1311
+ ['firstname']
1312
+ >>> for record in scenario.records:
1313
+ ... print(record.relation_type)
1314
+ one_to_one
1315
+
1316
+ Attributes:
1317
+ name: Scenario display name
1318
+ id: Scenario identifier
1319
+ records_count: Number of records in this scenario
1320
+ description: Scenario business context (description of what the scenario does)
1321
+ records: List of Record objects representing field mappings
1322
+ source_to_target_map: Dictionary mapping source field names to target field names
1323
+ target_to_source_map: Dictionary mapping target field names to source field names
1324
+ field_properties: Dictionary mapping field names to FieldProperties objects
1325
+ all_source_fields: Set of all source field names
1326
+ all_target_fields: Set of all target field names
1327
+ source: SourceTargetFields object containing source unique_fields and required_fields
1328
+ target: SourceTargetFields object containing target unique_fields and required_fields
1329
+ unique_fields: List of field names that are part of unique constraints (deprecated: use source.unique_fields or target.unique_fields)
1330
+ required_fields: List of field names that are required (deprecated: use source.required_fields or target.required_fields)
1331
+ custom_fields: Dictionary of custom field properties (filtered from field_properties)
1332
+ custom_fields_model: Dynamically generated Pandera schema model for custom fields
1333
+ source_to_value_mappings: Dictionary mapping source fields to value mapping configurations
1334
+ target_fields_to_ignore_in_compare: Set of target field names that should be ignored in compare function
1335
+ (determined by records where logic contains 'ignoreCompare')
1336
+ """
1337
+ # Core
1338
+ name: str
1339
+ id: str
1340
+ records_count: int
1341
+ description: str
1342
+
1343
+ # Mapping Data
1344
+ records: List[Record]
1345
+ source_to_target_map: SourceToTargetMap
1346
+ target_to_source_map: TargetToSourceMap
1347
+
1348
+ # Field Metadata
1349
+ field_properties: FieldPropertiesMap
1350
+ all_source_fields: Set[str]
1351
+ all_target_fields: Set[str]
1352
+ source: SourceTargetFields
1353
+ target: SourceTargetFields
1354
+ unique_fields: List[str]
1355
+ required_fields: List[str]
1356
+
1357
+ # Custom Field Data
1358
+ custom_fields: FieldPropertiesMap
1359
+ custom_fields_model: Optional[type] = None
1360
+
1361
+ # Value Mappings
1362
+ source_to_value_mappings: Dict[str, List[ScenarioMappingConfiguration]]
1363
+
1364
+ # Compare Configuration
1365
+ target_fields_to_ignore_in_compare: Set[str] = Field(default_factory=set)
1366
+
1367
+ @classmethod
1368
+ def from_api_dict(cls, scenario: Dict[str, Any]) -> "ParsedScenario":
1369
+ """Factory method to create a ParsedScenario from raw API data.
1370
+
1371
+ Args:
1372
+ scenario: Raw scenario dictionary from the BrynQ API
1373
+
1374
+ Returns:
1375
+ ParsedScenario object with all parsed data
1376
+ """
1377
+ return ScenarioParser().parse(scenario)
1378
+
1379
+ def __getattribute__(self, name: str):
1380
+ """Override attribute access to emit deprecation warnings for unique_fields and required_fields."""
1381
+ if name == 'unique_fields':
1382
+ warnings.warn(
1383
+ "unique_fields is deprecated. Use scenario.source.unique_fields or scenario.target.unique_fields instead.",
1384
+ DeprecationWarning,
1385
+ stacklevel=2
1386
+ )
1387
+ elif name == 'required_fields':
1388
+ warnings.warn(
1389
+ "required_fields is deprecated. Use scenario.source.required_fields or scenario.target.required_fields instead.",
1390
+ DeprecationWarning,
1391
+ stacklevel=2
1392
+ )
1393
+ return super().__getattribute__(name)
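+ # Illustrative behavior (sketch; assumes warnings are not filtered): accessing a
+ # deprecated attribute still returns its value but emits, e.g.:
+ #     >>> scenario.unique_fields
+ #     DeprecationWarning: unique_fields is deprecated. Use scenario.source.unique_fields or scenario.target.unique_fields instead.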
1394
+
1395
+ def get_source_fields_with_value_mappings(self) -> List[str]:
1396
+ """Returns a list of source fields that have value mappings.
1397
+
1398
+ Uses `source_to_value_mappings` attribute internally.
1399
+
1400
+ Example:
1401
+ >>> scenario.get_source_fields_with_value_mappings()
1402
+ ['gender', 'status']
1403
+ >>> list(scenario.source_to_value_mappings.keys())
1404
+ ['gender', 'status']
1405
+
1406
+ Returns:
1407
+ List of source field names that have value mappings
1408
+ """
1409
+ return list(self.source_to_value_mappings.keys())
1410
+
1411
+ def get_target_fields_with_value_mappings(self) -> List[str]:
1412
+ """Returns a list of target fields that have value mappings (via their mapped source fields).
1413
+
1414
+ Uses `source_to_value_mappings` and `source_to_target_map` attributes internally.
1415
+
1416
+ Example:
1417
+ >>> scenario.get_target_fields_with_value_mappings()
1418
+ ['gender_code', 'status_code']
1419
+ >>> scenario.source_to_target_map['gender']
1420
+ ['gender_code']
1421
+
1422
+ Returns:
1423
+ List of target field names that have value mappings
1424
+ """
1425
+ target_fields_with_mappings: Set[str] = set()
1426
+ for source_key in self.source_to_value_mappings.keys():
1427
+ # Handle keys that might be multiple source fields joined with '|'
1428
+ source_fields = source_key.split('|') if '|' in source_key else [source_key]
1429
+ for source_field in source_fields:
1430
+ # Find target fields mapped from this source field
1431
+ target_fields = self.source_to_target_map.get(source_field, [])
1432
+ target_fields_with_mappings.update(target_fields)
1433
+ return sorted(list(target_fields_with_mappings))
1434
+
1435
+ def has_field(self, field_name: str, field_type: Optional[str] = None) -> bool:
1436
+ """Check field existence in scenario. Can denote source or target, else looks for both.
1437
+
1438
+ Uses `all_source_fields` and `all_target_fields` attributes internally.
1439
+
1440
+ Example:
1441
+ >>> scenario.has_field('email')
1442
+ True
1443
+ >>> scenario.has_field('email', field_type='source')
1444
+ True
1445
+ >>> scenario.has_field('email', field_type='target')
1446
+ False
1447
+ >>> 'email' in scenario.all_source_fields
1448
+ True
1449
+
1450
+ Args:
1451
+ field_name: The field name to check
1452
+ field_type: Optional field type filter ("source" or "target")
1453
+
1454
+ Returns:
1455
+ True if field exists, False otherwise
1456
+ """
1457
+ if field_type == "source":
1458
+ return field_name in self.all_source_fields
1459
+ if field_type == "target":
1460
+ return field_name in self.all_target_fields
1461
+ return field_name in self.all_source_fields or field_name in self.all_target_fields
1462
+
1463
+ # Dunder(like) methods for pythonic field access
1464
+ def __iter__(self):
1465
+ """Enable iteration over records.
1466
+
1467
+ Example:
1468
+ >>> for record in scenario:
1469
+ ... print(f"Record {record.id}: {len(record.source.field_names)} source fields")
1470
+ Record rec_123: 2 source fields
1471
+ Record rec_456: 1 source fields
1472
+ >>> list(scenario)
1473
+ [Record(id='rec_123', ...), Record(id='rec_456', ...)]
1474
+ """
1475
+ return iter(self.records)
1476
+
1477
+ def __len__(self) -> int:
1478
+ """Return the number of records in this scenario.
1479
+
1480
+ Example:
1481
+ >>> len(scenario)
1482
+ 15
1483
+ >>> scenario.records_count
1484
+ 15
1485
+
1486
+ Returns:
1487
+ int: The number of records in the scenario
1488
+ """
1489
+ return len(self.records)
1490
+
1491
+ def __getitem__(self, field_id: str) -> FieldProperties:
1492
+ """Enable dict-style access to field properties.
1493
+
1494
+ Example:
1495
+ >>> scenario['customer_id']
1496
+ FieldProperties(alias='customer_id', uuid='...', label='Customer ID', ...)
1497
+ >>> scenario['customer_id'].required
1498
+ True
1499
+ >>> scenario['nonexistent']
1500
+ KeyError: Field 'nonexistent' not found in scenario 'Personal Information'.
1501
+
1502
+ Args:
1503
+ field_id: The field name to look up
1504
+
1505
+ Returns:
1506
+ FieldProperties object for the field
1507
+
1508
+ Raises:
1509
+ KeyError: If field is not found
1510
+ """
1511
+ try:
1512
+ return self.field_properties[field_id]
1513
+ except KeyError as exc:
1514
+ raise KeyError(f"Field '{field_id}' not found in scenario '{self.name}'.") from exc
1515
+
1516
+ def __getattr__(self, name: str) -> FieldProperties:
1517
+ """Enable attribute-style access to field properties.
1518
+
1519
+ Example:
1520
+ >>> scenario.customer_id
1521
+ FieldProperties(alias='customer_id', uuid='...', label='Customer ID', ...)
1522
+ >>> scenario.customer_id.unique
1523
+ True
1524
+ >>> scenario.nonexistent
1525
+ AttributeError: 'nonexistent' is not a valid field in scenario 'Personal Information'.
1526
+
1527
+ Args:
1528
+ name: The field name to look up
1529
+
1530
+ Returns:
1531
+ FieldProperties object for the field
1532
+
1533
+ Raises:
1534
+ AttributeError: If field is not found
1535
+ """
1536
+ if name.startswith("_") or name in self.__dict__ or name in self.__class__.__dict__:
1537
+ return super().__getattribute__(name)
1538
+ try:
1539
+ return self.field_properties[name]
1540
+ except KeyError as exc:
1541
+ raise AttributeError(f"'{name}' is not a valid field in scenario '{self.name}'.") from exc
1542
+
1543
+ def __repr__(self) -> str:
1544
+ """A human-friendly string representation.
1545
+
1546
+ Example:
1547
+ >>> repr(scenario)
1548
+ "<ParsedScenario (field mapping of scenario) name='Personal Information' id='abc123' details=5 unique=2 required=3>"
1549
+
1550
+ Returns:
1551
+ String representation of the ParsedScenario
1552
+ """
1553
+ return (
1554
+ f"<ParsedScenario (field mapping of scenario) "
1555
+ f"name='{self.name}' id='{self.id}' "
1556
+ f"records={self.records_count} unique={len(self.unique_fields)} "
1557
+ f"required={len(self.required_fields)}>"
1558
+ )
1559
+
1560
+ def __str__(self) -> str:
1561
+ """String representation (used by print()). Delegates to __repr__."""
1562
+ return self.__repr__()
1563
+
1564
+ @staticmethod
1565
+ def _build_custom_field_model(custom_fields: FieldPropertiesMap) -> Optional[type]:
1566
+ """Dynamically creates a Pandera Schema for custom fields validation.
1567
+
1568
+ Uses the `custom_fields` dictionary to extract field metadata (technical_name, label, required)
1569
+ and create a Pandera schema model for validation.
1570
+
1571
+ Args:
1572
+ custom_fields: Dictionary mapping field names to their FieldProperties objects (filtered to CUSTOM fields only)
1573
+
1574
+ Returns:
1575
+ A dynamically generated BrynQ Pandera model class or None when no fields can be mapped
1576
+ """
1577
+ annotations = {}
1578
+ fields = {}
1579
+ for name, props in custom_fields.items():
1580
+ annotations[name] = Optional[Series[String]]
1581
+ # Fall back to uuid or the field name: technical_name can legitimately be None for CUSTOM fields
1582
+ alias_value = props.technical_name or props.uuid or name
1583
+ fields[name] = pa.Field(
1584
+ coerce=True,
1585
+ nullable=not props.required,
1586
+ alias=alias_value,
1587
+ description=props.label
1588
+ )
1589
+
1590
+ if not annotations:
1591
+ return None
1592
+ fields["__annotations__"] = annotations
1593
+ return type("CustomFieldModel", (BrynQPanderaDataFrameModel,), fields)
1594
+
1595
+
1596
+ class DummyRecord:
1597
+ """Dummy record for logging unmapped sources that don't belong to any record.
1598
+
1599
+ Used internally by Scenarios.rename_fields to track source columns present in
1600
+ the DataFrame but not mapped by the scenario.
1601
+ """
1602
+ def __init__(self):
1603
+ """Initialize a dummy record with empty attributes."""
1604
+ self.id = None
1605
+ self.logic = None
1606
+ self.relation_type = None
1607
+ self.source = SourceTargetFields(
1608
+ type="source",
1609
+ field_names=[],
1610
+ unique_fields=[],
1611
+ required_fields=[],
1612
+ field_properties=[],
1613
+ custom_fields=[],
1614
+ library_fields=[],
1615
+ fields_with_logic=[]
1616
+ )
1617
+ self.target = SourceTargetFields(
1618
+ type="target",
1619
+ field_names=[],
1620
+ unique_fields=[],
1621
+ required_fields=[],
1622
+ field_properties=[],
1623
+ custom_fields=[],
1624
+ library_fields=[],
1625
+ fields_with_logic=[]
1626
+ )
1627
+
1628
+
1629
+ class Scenarios:
1630
+ """
1631
+ Provides convenient access to BrynQ scenarios, with lookups and a Pythonic interface.
1632
+
1633
+ """
1634
+ # Missing value representations to detect in dataframes
1635
+ MISSING_VALUES: List[str] = [
1636
+ '<NA>', 'nan', 'None', 'NaN', 'null', 'NaT', '_NA_', '', r'\[\]', r'\{ \}'
1637
+ ]
1638
+ def __init__(self, brynq_instance: Any):
1639
+ """Initializes the scenarios manager.
1640
+
1641
+ Fetches and parses scenarios from the BrynQ API. Scenarios are cached after first fetch.
1642
+ Dunder methods (__getitem__, __iter__, __len__) auto-fetch if not loaded.
1643
+
1644
+ **Core Methods:**
1645
+ - get(): Fetches/returns ParsedScenario objects (cached after first call)
1646
+
1647
+ **Convenience Methods:**
1648
+ - find_scenarios_with_field(): Find scenarios containing a field
1649
+ - scenario_names: Cached property with all scenario names
1650
+
1651
+ **Dunder Methods:**
1652
+ - __getitem__: Dict access `scenarios['Name']`
1653
+ - __iter__: Iterate scenarios `for scenario in scenarios:`
1654
+ - __len__: Count scenarios `len(scenarios)`
1655
+
1656
+ **ParsedScenario Iteration:**
1657
+ - Records: `for record in scenario:` (mapping records with logic/relation types)
1658
+ - Fields: `scenario.keys()`, `scenario.values()`, `scenario.items()`
1659
+ - Field access: `scenario['field']` or `scenario.field` (dict/attr style)
1660
+ - Source/Target: `scenario.source.field_names`, `scenario.target.field_names`, `scenario.source.unique_fields`, etc.
1661
+ - String repr: `print(scenario)` shows summary
1662
+
1663
+ **Record Iteration:**
1664
+ - All fields: `for field in record:` (iterates over both source and target fields)
1665
+ - Source/Target: `for field in record.source:`, `for field in record.target:`
1666
+ - Field names: `record.source.field_names`, `record.target.field_names`
1667
+ - Field properties: `record.source.field_properties`, `record.target.field_properties`
1668
+
1669
+ **Transformation Methods:**
1670
+ - rename_fields(): Rename/transform DataFrame columns per scenario.
1671
+ - apply_value_mappings(): Apply value mappings (e.g., 'F' → '1') per scenario.
1672
+ - add_fixed_values(): Add fixed literal values to DataFrames
1673
+
1674
+ Args:
1675
+ brynq_instance: Authenticated BrynQ client instance.
1676
+ """
1677
+ self._brynq = brynq_instance
1678
+
1679
+ # Attributes populated by get()
1680
+ self.raw_scenarios: Optional[List[Dict]] = None
1681
+ self.scenarios: Optional[List[ParsedScenario]] = None
1682
+
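+ # Typical usage (sketch; construction of the authenticated client is assumed):
+ #     scenarios = Scenarios(brynq_instance)
+ #     scenario = scenarios['Personal information']  # dict-style access, auto-fetches
+ #     df, stats = scenarios.rename_fields(df, 'Personal information')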
1683
+ # ============================================================================
1684
+ # Public API Methods
1685
+ # ============================================================================
1686
+
1687
+ def get(self, strict: bool = True) -> List[ParsedScenario]:
1688
+ """Fetches all scenarios from the API and returns them as ParsedScenario objects.
1689
+
1690
+ Results are cached after the first call.
1691
+
1692
+ Args:
1693
+ strict: If True, raises ValueError on validation errors. If False, skips invalid scenarios.
1694
+
1695
+ Returns:
1696
+ List[ParsedScenario]: Validated scenario objects.
1697
+ """
1698
+ # Fetch once; subsequent calls reuse the cached parsed scenarios
1699
+ if self.scenarios is None:
1700
+ self.raw_scenarios = self._fetch_from_api(strict=strict)
1701
+ self.scenarios = [
1702
+ ParsedScenario.from_api_dict(scenario=s)
1703
+ for s in self.raw_scenarios if "name" in s
1704
+ ]
1705
+ return self.scenarios
1708
+
1709
+ def find_scenarios_with_field(
1710
+ self,
1711
+ field_name: str,
1712
+ field_type: str = "source"
1713
+ ) -> List[ParsedScenario]:
1714
+ """Find all scenarios that contain a specific field.
1715
+
1716
+ Example:
1717
+ >>> scenarios.find_scenarios_with_field('employee_id')
1718
+ []
1719
+ >>> scenarios.find_scenarios_with_field('employee_id', field_type='target')
1720
+ [<ParsedScenario name='Personal information' id='3c7f8e04-5b74-408f-a2d8-ad99b924a1af' records=15 unique=2 required=20>, <ParsedScenario name='Adres' ...>]
1721
+
1722
+ Args:
1723
+ field_name (str): The field name to search for.
1724
+ field_type (str): The type of field to search in;
1725
+ must be either "source" or "target". Defaults to "source".
1726
+
1727
+ Returns:
1728
+ List[ParsedScenario]: List of ParsedScenario objects containing the specified field.
1729
+ """
1730
+ return [
1731
+ scenario for scenario in self.get()
1732
+ if scenario.has_field(field_name, field_type=field_type)
1733
+ ]
1734
+
1735
+ @cached_property
1736
+ def scenario_names(self) -> List[str]:
1737
+ """A list of all scenario names.
1738
+
1739
+ Example:
1740
+ >>> scenarios.scenario_names
1741
+ ['Personal information', 'Adres', 'Bank Account', 'Contract Information', ...]
1742
+
1743
+ Returns:
1744
+ List[str]: List of all scenario names.
1745
+ """
1746
+ return [s.name for s in self.get()]
1747
+
1748
+ def __getitem__(self, scenario_name: str) -> ParsedScenario:
1749
+ """Returns scenario by name using dict-style access.
1750
+
1751
+ Example:
1752
+ >>> scenario = scenarios['Personal information']
1753
+ >>> scenario.name
1754
+ 'Personal information'
1755
+ >>> scenario['first_name'].required
1756
+ True
1757
+ >>> scenario.firstname.required # Attribute-style access also works
1758
+ True
1759
+
1760
+ Args:
1761
+ scenario_name: Name of the scenario to retrieve.
1762
+
1763
+ Returns:
1764
+ ParsedScenario object with records, mappings, and field properties.
1765
+
1766
+ Raises:
1767
+ KeyError: If scenario name not found.
1768
+ """
1769
+ scenarios = {s.name: s for s in self.get()}
1770
+ if scenario_name not in scenarios:
1771
+ raise KeyError(f"Scenario '{scenario_name}' not found.")
1772
+ return scenarios[scenario_name]
1773
+
1774
+ def __iter__(self) -> Iterator[ParsedScenario]:
1775
+ """Iterates over all parsed scenarios.
1776
+
1777
+ Example:
1778
+ >>> for scenario in scenarios:
1779
+ ... print(f"{scenario.name}: {len(scenario.required_fields)} required fields")
1780
+ Personal information: 20 required fields
1781
+
1782
+ Yields:
1783
+ ParsedScenario: Each scenario object.
1784
+ """
1785
+ return iter(self.get())
1786
+
1787
+ def __len__(self) -> int:
1788
+ """Return the number of parsed scenarios.
1789
+
1790
+ Example:
1791
+ >>> len(scenarios)
1792
+ 13
1793
+
1794
+ Returns:
1795
+ int: The number of available scenarios.
1796
+ """
1797
+ return len(self.get())
1798
+
1799
+ # ============================================================================
1800
+ # Internal API Helpers
1801
+ # ============================================================================
1802
+
1803
+ def _fetch_from_api(self, strict: bool = True) -> List[Dict[str, Any]]:
1804
+ """Fetches raw scenario data from BrynQ API and validates it.
1805
+
1806
+ Makes HTTP GET request, validates JSON against Scenario model.
1807
+ Invalid scenarios are skipped (warning) or raise error, based on strict flag.
1808
+
1809
+ Args:
1810
+ strict (bool): If True, raise ValueError on validation errors. If False, skip invalid scenarios with warning.
1811
+
1812
+ Returns:
1813
+ List[Dict[str, Any]]: Validated scenario dictionaries (raw API format). Contains name, id, description, details.
1814
+
1815
+ Raises:
1816
+ requests.HTTPError: API request failed (non-2xx status).
1817
+ TypeError: API response is not a list.
1818
+ ValueError: strict=True and validation failed.
1819
+
1820
+ Note:
1821
+ Internal method called by get(). Returns raw dicts; get() converts to ParsedScenario objects.
1822
+ """
1823
+ response = self._brynq.brynq_session.get(
1824
+ url=(
1825
+ f"{self._brynq.url}interfaces/"
1826
+ f"{self._brynq.data_interface_id}/scenarios"
1827
+ ),
1828
+ timeout=self._brynq.timeout,
1829
+ )
1830
+ response.raise_for_status()
1831
+ scenario_list = response.json()
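+ # Illustrative (assumed) response shape, matching the keys the parser reads:
+ #     [{"id": "...", "name": "...", "description": "...", "details": [...]}, ...]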
1832
+ if not isinstance(scenario_list, list):
1833
+ raise TypeError(f"Expected a list of scenarios, but got {type(scenario_list).__name__}.")
1834
+
1835
+ valid_scenarios, invalid_scenarios = Functions.validate_pydantic_data(
1836
+ scenario_list,
1837
+ schema=Scenario,
1838
+ debug=True,
1839
+ )
1840
+
1841
+ if invalid_scenarios:
1842
+ msg = (f"{len(invalid_scenarios)} scenario(s) failed validation and were skipped.")
1843
+ if strict:
1844
+ raise ValueError(f"Invalid scenario data found: {msg}")
1845
+ warnings.warn(msg, UserWarning, stacklevel=2)
1846
+
1847
+ return valid_scenarios
1848
+
1849
+ # ============================================================================
1850
+ # Public Transformation Methods
1851
+ # ============================================================================
1852
+
1853
+ def add_fixed_values(
1854
+ self,
1855
+ df: pd.DataFrame,
1856
+ scenario_name: str
1857
+ ) -> pd.DataFrame:
1858
+ """Adds fixed literal values to DataFrame columns based on scenario mappings.
1859
+
1860
+ Creates new columns with target field names, fills all rows with the fixed value.
1861
+ Only processes records with relation_type 'one_to_one' or 'one_to_many'.
1862
+ Supports both FIXED and CONFIGURATION source field types.
1863
+
1864
+ Args:
1865
+ df (pd.DataFrame): Input DataFrame to add fixed value columns to.
1866
+ scenario_name (str): Name of scenario containing fixed value mappings.
1867
+
1868
+ Returns:
1869
+ pd.DataFrame: Copy of input DataFrame with fixed value columns added.
1870
+
1871
+ Raises:
1872
+ ValueError: Scenario name not found.
1873
+
1874
+ Examples
1875
+ --------
1876
+ Adding a fixed value column from a scenario with FIXED source type.
1877
+
1878
+ >>> df = pd.DataFrame({'id': [1, 2, 3], 'name': ['John', 'Jane', 'Bob']})
1879
+ >>> df.columns.tolist()
1880
+ ['id', 'name']
1881
+ >>> df
1882
+ id name
1883
+ 0 1 John
1884
+ 1 2 Jane
1885
+ 2 3 Bob
1886
+
1887
+ Scenario has a record with FIXED source value 'NL' mapping to target 'country_code'.
1888
+
1889
+ >>> df = scenarios.add_fixed_values(df, 'My Scenario')
1890
+ >>> df
1891
+ id name country_code
1892
+ 0 1 John NL
1893
+ 1 2 Jane NL
1894
+ 2 3 Bob NL
1895
+
1896
+ The 'country_code' column is added and filled with the fixed value 'NL' for all rows.
1897
+
1898
+ Also supports CONFIGURATION source types. Config values are parsed according to
1899
+ their type (TEXT, EMAIL, NUMBER, SELECTION, DATEPICKER, etc.) during record creation.
1900
+
1901
+ Note:
1902
+ For many_to_one/many_to_many mappings, use rename_fields() instead.
1903
+ """
1904
+ df_fixed = df.copy()
1905
+ try:
1906
+ scenario = self[scenario_name]
1907
+ except KeyError as e:
1908
+ raise ValueError(f"Scenario with name '{scenario_name}' not found.") from e
1909
+
1910
+ for record in scenario.records:
1911
+ if record.relation_type not in ("one_to_one", "one_to_many"):
1912
+ continue
1913
+
1914
+ if not record.fixed_source_value:
1915
+ warnings.warn(f"Missing fixed/config value for record {record.id}", stacklevel=2)
1916
+ continue
1917
+
1918
+ for target_field in record.target.field_names:
1919
+ df_fixed[target_field] = record.fixed_source_value
1920
+
1921
+ return df_fixed
1922
+
1923
+ def apply_value_mappings(
1924
+ self,
1925
+ df: pd.DataFrame,
1926
+ scenario_name: str,
1927
+ drop_unmapped: bool = False,
1928
+ how: Literal[  # Literal union of the supported strategies; explicit names are clearer than a generic 'valMap'
1929
+ 'exactValMap',
1930
+ 'ignoreCaseValMap',
1931
+ 'ignoreSpecialValMap',
1932
+ 'ignoreSpacesValMap',
1933
+ 'flexValMap'
1934
+ ] = 'exactValMap'
1935
+ ) -> Tuple[pd.DataFrame, Set[str], pd.DataFrame]:
1936
+ """Transforms source values to target values based on scenario mappings.
1937
+
1938
+ Processes records with value mapping configurations (e.g., "M" -> "1").
1939
+ Handles various relation types by preparing source values appropriately (direct vs concatenated).
1940
+
1941
+ Mapping strategies (how parameter):
1942
+ - exactValMap: Precise matching (default)
1943
+ - ignoreCaseValMap: Case-insensitive matching
1944
+ - ignoreSpecialValMap: Ignores special characters including spaces
1945
+ - ignoreSpacesValMap: Ignores spaces only
1946
+ - flexValMap: Case-insensitive + ignores special characters including spaces
1947
+
1948
+ Strategy selection priority:
1949
+ 1. Check record.logic for a matching strategy (higher priority); it is used when it corresponds to one of the strategy names above.
1950
+ 2. Fall back to the how kwarg when logic contains no matching strategy name
1951
+
1952
+ Examples
1953
+ --------
1954
+ Example 1: Basic value mapping with exactValMap (default).
1955
+
1956
+ >>> df = pd.DataFrame({'gender': ['F', 'M', 'F']})
1957
+ >>> # Scenario mapping configuration:
1958
+ >>> # {'gender': 'F'} -> {'gender_code': '1'}
1959
+ >>> # {'gender': 'M'} -> {'gender_code': '0'}
1960
+ >>> df, _, stats = scenarios.apply_value_mappings(df, 'My Scenario')
1961
+ >>> df
1962
+ gender gender_code
1963
+ 0 F 1
1964
+ 1 M 0
1965
+ 2 F 1
1966
+
1967
+ Example 2: Case-insensitive matching with ignoreCaseValMap.
1968
+
1969
+ >>> df = pd.DataFrame({'status': ['Active', 'ACTIVE', 'inactive']})
1970
+ >>> # Scenario mapping (source values normalized to lowercase for matching):
1971
+ >>> # {'status': 'active'} -> {'status_code': '1'} # Matches 'Active', 'ACTIVE'
1972
+ >>> # {'status': 'inactive'} -> {'status_code': '0'} # Matches 'inactive'
1973
+ >>> df, _, stats = scenarios.apply_value_mappings(df, 'My Scenario', how='ignoreCaseValMap')
1974
+ >>> df
1975
+ status status_code
1976
+ 0 Active 1
1977
+ 1 ACTIVE 1
1978
+ 2 inactive 0
1979
+
1980
+ Example 3: Flexible matching with flexValMap (ignores case and special chars).
1981
+
1982
+ >>> df = pd.DataFrame({
1983
+ ... 'product_code': ['ABC-123', 'xyz_456', 'MNO 789', 'PQR@#$%']
1984
+ ... })
1985
+ >>> # Scenario mapping (source values normalized: lowercase + remove special chars):
1986
+ >>> # {'product_code': 'abc123'} -> {'product_id': 'P001'} # Matches 'ABC-123'
1987
+ >>> # {'product_code': 'xyz456'} -> {'product_id': 'P002'} # Matches 'xyz_456'
1988
+ >>> # {'product_code': 'mno789'} -> {'product_id': 'P003'} # Matches 'MNO 789'
1989
+ >>> # {'product_code': 'pqr'} -> {'product_id': 'P004'} # Matches 'PQR@#$%'
1990
+ >>> df, _, stats = scenarios.apply_value_mappings(df, 'My Scenario', how='flexValMap')
1991
+ >>> df
1992
+ product_code product_id
1993
+ 0 ABC-123 P001
1994
+ 1 xyz_456 P002
1995
+ 2 MNO 789 P003
1996
+ 3 PQR@#$% P004
1997
+
1998
+ Example 4: Many-to-one mapping with concatenated fields and special chars.
1999
+
2000
+ >>> df = pd.DataFrame({
2001
+ ... 'first_name': ['John', 'Jane', 'José'],
2002
+ ... 'last_name': ['Doe-Smith', 'O\'Brien', 'García-López']
2003
+ ... })
2004
+ >>> # Scenario mapping (concatenated with |, then normalized for matching):
2005
+ >>> # {'first_name': 'John', 'last_name': 'Doe-Smith'} -> 'John|Doe-Smith' -> 'john|doesmith' -> {'full_id': 'JD001'}
2006
+ >>> # {'first_name': 'Jane', 'last_name': 'O\'Brien'} -> 'Jane|O\'Brien' -> 'jane|obrien' -> {'full_id': 'JO002'}
2007
+ >>> # {'first_name': 'José', 'last_name': 'García-López'} -> 'José|García-López' -> 'josé|garclpez' -> {'full_id': 'JG003'}
2008
+ >>> df, _, stats = scenarios.apply_value_mappings(df, 'My Scenario', how='flexValMap')
2009
+ >>> df
2010
+ first_name last_name full_id
2011
+ 0 John Doe-Smith JD001
2012
+ 1 Jane O'Brien JO002
2013
+ 2 José García-López JG003
2014
+
2015
+ Example 5: ignoreSpacesValMap - removes spaces but preserves other special chars.
2016
+
2017
+ >>> df = pd.DataFrame({
2018
+ ... 'location': ['New York', 'New York', 'New York', 'New York', 'New-York', 'New_York']
2019
+ ... })
2020
+ >>> # Scenario mapping (source values normalized: remove spaces only, preserves hyphens/underscores):
2021
+ >>> # {'location': 'New York'} -> {'location_code': 'NYC'} # Mapping has spaces, normalizes to 'NewYork'
2022
+ >>> # {'location': 'New-York'} -> {'location_code': 'NYD'} # Matches 'New-York' (exact, spaces removed)
2023
+ >>> # {'location': 'New_York'} -> {'location_code': 'NYU'} # Matches 'New_York' (exact, spaces removed)
2024
+ >>> df, _, stats = scenarios.apply_value_mappings(df, 'My Scenario', how='ignoreSpacesValMap')
2025
+ >>> df
2026
+ location location_code
2027
+ 0 New York NYC
2028
+ 1 New York NYC
2029
+ 2 New York NYC
2030
+ 3 New York NYC
2031
+ 4 New-York NYD
2032
+ 5 New_York NYU
2033
+
2034
+ Example 6: Per-record strategy override via logic field.
2035
+
2036
+ >>> df = pd.DataFrame({'code': ['A-B', 'C D', 'E|F']})
2037
+ >>> # Record 1: logic='flexValMap' -> uses flexValMap (normalizes to 'ab', 'cd', 'ef')
2038
+ >>> # Record 2: logic=None -> uses how='exactValMap' (exact match required)
2039
+ >>> df, _, stats = scenarios.apply_value_mappings(df, 'My Scenario', how='exactValMap')
2040
+ >>> # Records with flexValMap in logic will match 'A-B'->'ab', 'C D'->'cd', 'E|F'->'ef'
2041
+ >>> # Records without logic will only match exact values from how kwarg
2042
+
2043
+ Args:
2044
+ df: Input DataFrame.
2045
+ scenario_name: Name of the scenario.
2046
+ drop_unmapped: If True (and no default value exists), drops rows that couldn't be mapped.
2047
+ how: Mapping strategy to use (default: 'exactValMap'). Can be overridden per record via logic.
2048
+
2049
+ Returns:
2050
+ Tuple[pd.DataFrame, Set[str], pd.DataFrame]:
2051
+ 1. Transformed DataFrame.
2052
+ 2. Set of source fields processed.
2053
+ 3. Statistics DataFrame detailing mapping success rates and value distributions.
2054
+
2055
+ Statistics DataFrame columns:
2056
+ - record_id: Unique identifier for the mapping record
2057
+ - source_fields: Source field names, pipe-separated if multiple
2058
+ - target_fields: Target field names, pipe-separated if multiple
2059
+ - relation_type: Relation type ('one_to_one', 'one_to_many', 'many_to_one', 'many_to_many')
2060
+ - mapping_strategy: Mapping strategy used ('exactValMap', 'ignoreCaseValMap', etc.)
2061
+ - total_rows: Total number of rows in DataFrame
2062
+ - mapped_rows: Number of rows successfully mapped
2063
+ - unmapped_rows: Number of rows that couldn't be mapped
2064
+ - mapping_success_pct: Percentage of rows successfully mapped
2065
+ - successful_indices: List of DataFrame row indices that were successfully mapped
2066
+ - unsuccessful_indices: List of DataFrame row indices that couldn't be mapped
2067
+ - mapped_value_counts: Dictionary of mapped source values and their counts
2068
+ - unmapped_value_counts: Dictionary of unmapped source values and their counts
2069
+ - used_mapping_values: List of mapping rules that were used (with counts)
2070
+ - unused_mapping_values: List of mapping rules that were never encountered
2071
+ """
2072
+ try:
2073
+ scenario = self[scenario_name]
2074
+ except KeyError:
2075
+ # If scenario not found, return empty results
2076
+ stats_df = pd.DataFrame(
2077
+ columns=[
2078
+ 'record_id',
2079
+ 'source_fields',
2080
+ 'target_fields',
2081
+ 'relation_type',
2082
+ 'mapping_strategy',
2083
+ 'total_rows',
2084
+ 'mapped_rows',
2085
+ 'unmapped_rows',
2086
+ 'mapping_success_pct',
2087
+ 'successful_indices',
2088
+ 'unsuccessful_indices',
2089
+ 'mapped_value_counts',
2090
+ 'unmapped_value_counts',
2091
+ 'used_mapping_values',
2092
+ 'unused_mapping_values'
2093
+ ]
2094
+ )
2095
+ return df, set(), stats_df
2096
+
2097
+ # Warn about missing values before processing to help users identify data quality issues early.
2098
+ # Missing values in source fields can cause mappings to fail silently or produce unexpected
2099
+ # results, so detecting them upfront prevents confusion about why certain rows didn't map correctly.
2100
+ all_source_fields_to_check = set()
2101
+ for record in scenario.records:
2102
+ if record.mapping:
2103
+ all_source_fields_to_check.update(record.source.field_names)
2104
+
2105
+ if all_source_fields_to_check:
2106
+ missing_value_counts = self._detect_missing_values_in_fields(
2107
+ df=df,
2108
+ source_field_names=list(all_source_fields_to_check)
2109
+ )
2110
+ if missing_value_counts:
2111
+ missing_details = [
2112
+ f"{field}: {count} occurrence(s)"
2113
+ for field, count in missing_value_counts.items()
2114
+ ]
2115
+ warnings.warn(
2116
+ f"DataFrame contains missing values (pd.NA or string representations) "
2117
+ f"in source fields used for value mapping: {', '.join(missing_details)}. "
2118
+ f"These may affect mapping accuracy.",
2119
+ UserWarning,
2120
+ stacklevel=2
2121
+ )
2122
+
2123
+ handled_source_fields = set()
2124
+ statistics_rows = []
2125
+
2126
+ # Process each record to apply value mappings (source values -> target values)
2127
+ for record in scenario.records:
2128
+ if not record.mapping:
2129
+ continue
2130
+
2131
+ source_field_names = record.source.field_names
2132
+ target_field_names = record.target.field_names
2133
+ total_rows = len(df)
2134
+ default_val = record.mapping.default_value
2135
+
2136
+ # Ensure source fields are present in the dataframe, else add default value to target column
2137
+ missing_fields = [field for field in source_field_names if field not in df.columns]
2138
+ if missing_fields:
2139
+ warnings.warn(f"Source fields {missing_fields} not found in dataframe for record {record.id}. Creating target columns with default values.", stacklevel=2)
2140
+ for target_field in target_field_names:
2141
+ df[target_field] = default_val if default_val else None
2142
+
2143
+ # Determine mapping strategy even when fields are missing (for statistics tracking)
2144
+ mapping_strategy = self._determine_mapping_strategy(record.logic, how)
2145
+
2146
+ # Record statistics when missing: 0 mapped rows, all mappings unused (source fields missing)
2147
+ statistics_rows.append({
2148
+ 'record_id': record.id,
2149
+ 'source_fields': '|'.join(source_field_names),
2150
+ 'target_fields': '|'.join(target_field_names),
2151
+ 'relation_type': record.relation_type,
2152
+ 'mapping_strategy': mapping_strategy,
2153
+ 'total_rows': total_rows,
2154
+ 'mapped_rows': 0,
2155
+ 'unmapped_rows': total_rows,
2156
+ 'mapping_success_pct': 0.0,
2157
+ 'successful_indices': [],
2158
+ 'unsuccessful_indices': df.index.tolist(),
2159
+ 'mapped_value_counts': {},
2160
+ 'unmapped_value_counts': {},
2161
+ 'used_mapping_values': [],
2162
+ 'unused_mapping_values': [] # Source fields missing, so no mapping values could be evaluated
2163
+ })
2164
+ continue
2165
+
2166
+ # Source fields are not missing:
2167
+ else:
2168
+ # Track processed fields
2169
+ handled_source_fields.update(source_field_names)
2170
+
2171
+ # Determine mapping strategy: check record.logic first (higher priority), then use how kwarg
2172
+ mapping_strategy = self._determine_mapping_strategy(record.logic, how)
2173
+
2174
+ # Step 1: Normalize dataframe according to mapping strategy.
2175
+ normalized_df = df[source_field_names].copy()
2176
+ for field_name in source_field_names:
2177
+ if field_name in normalized_df.columns:
2178
+ normalized_df[field_name] = normalized_df[field_name].apply(
2179
+ lambda val: self._normalize_value_for_mapping(val, mapping_strategy)
2180
+ )
2181
+
2182
+ # Step 1b: Create Series with normalized source values (one Series, shared by all target fields)
2183
+ # Format: "f"/"m" (single) or "john|doe" (multiple, pipe-separated, pipes preserved)
2184
+ normalized_source_series = self._concatenate_source_fields(df=normalized_df, source_fields=source_field_names)
2185
+
2186
+ # Step 1c: Create original (non-normalized) concatenated series for statistics and fillna
2187
+ concatenated_source_series = self._concatenate_source_fields(df=df, source_fields=source_field_names)
2188
+
2189
+ # Step 2.A: Create empty mapping dicts (one dict per target field)
2190
+ # Structure: {target_field: {normalized_source_value: target_value}}
2191
+ # Each target gets its own dict; all use the same Series from Step 1
2192
+ # Example: {"gender_code": {"f": "1"}, "status": {"f": "Active"}} (if ignoreCaseValMap)
2193
+ replacements_by_target = {target_field: {} for target_field in target_field_names}
2194
+
2195
+ # defined_mapping_values: tracks all mapping definitions for statistics (used vs unused)
2196
+ defined_mapping_values = []
2197
+
2198
+ # 2.B Build lookup dictionaries for existing record.mapping.values
2199
+ # If values list is empty, skip building mappings but still collect statistics
2200
+ if record.mapping.values:
2201
+ for mapping_value in record.mapping.values:
2202
+ source_map_val = mapping_value.input
2203
+ target_map_val = mapping_value.output
2204
+ if not source_map_val or not target_map_val:
2205
+ continue
2206
+
2207
+ # Concat/combine source values mapping to create the lookup key value.
2208
+ # Normalize mapping values first, then join with pipe to preserve separator.
2209
+ # (e.g., ["John", "Doe"] -> normalize each -> ["john", "doe"] -> concat -> "john|doe")
2210
+ source_values = []
2211
+ normalized_source_values = []
2212
+ for field_name in source_field_names:
2213
+ if field_name in source_map_val:
2214
+ source_val = str(source_map_val[field_name]).strip()
2215
+ source_values.append(source_val)
2216
+ # Normalize individual field value before concatenation
2217
+ normalized_source_values.append(
2218
+ self._normalize_value_for_mapping(source_val, mapping_strategy)
2219
+ )
2220
+ else:
2221
+ source_values = None
2222
+ normalized_source_values = None
2223
+ break
2224
+
2225
+ # Validate that we have values for ALL source fields before using this mapping.
2226
+ if source_values and len(source_values) == len(source_field_names):
2227
+ combined_source_val = '|'.join(source_values)
2228
+ normalized_combined_source_val = '|'.join(normalized_source_values)
2229
+
2230
+ # Store mapping definition for statistics tracking (not used for transformation).
2231
+ mapping_def = {
2232
+ 'input': combined_source_val,
2233
+ 'output': {target_field: str(target_map_val.get(target_field, '')).strip()
2234
+ for target_field in target_field_names if target_field in target_map_val}
2235
+ }
2236
+ defined_mapping_values.append(mapping_def)
2237
+
2238
+ # Store mapping in lookup dict for actual transformation (used by _apply_mapping_to_target in Step 3).
2239
+ for target_field in target_field_names:
2240
+ if target_field in target_map_val:
2241
+ target_val = str(target_map_val[target_field]).strip()
2242
+ replacements_by_target[target_field][normalized_combined_source_val] = target_val
2243
+
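+ # Worked micro-example (hypothetical values): with source_field_names == ['gender'],
+ # target_field_names == ['gender_code'], a single mapping value
+ # {'input': {'gender': 'F'}, 'output': {'gender_code': '1'}} and ignoreCaseValMap,
+ # the loop above yields replacements_by_target == {'gender_code': {'f': '1'}} and
+ # defined_mapping_values == [{'input': 'F', 'output': {'gender_code': '1'}}].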
2244
+ # Step 3: Apply mappings to target columns using normalized source series for lookup
2245
+ for target_field in target_field_names:
2246
+ df = self._apply_mapping_to_target(
2247
+ df=df,
2248
+ concatenated_source_series=normalized_source_series,
2249
+ target_field=target_field,
2250
+ replacements=replacements_by_target[target_field],
2251
+ default_val=default_val,
2252
+ original_source_series=concatenated_source_series
2253
+ )
2254
+
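+ # Continuing the micro-example above (still hypothetical): after this loop,
+ # df['gender_code'] holds '1' wherever the normalized source value was 'f'; per the
+ # fillna note in Step 1c, unmapped rows are expected to fall back to default_val or
+ # the original concatenated source value inside _apply_mapping_to_target.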
2255
+ # Step 4: Collect statistics on mapping results
2256
+ all_mapped_source_values = set()
2257
+ for replacements in replacements_by_target.values():
2258
+ all_mapped_source_values.update(replacements.keys())
2259
+
2260
+ # Step 4b: Determine which rows were successfully mapped vs unmapped
2261
+ is_mapped = normalized_source_series.isin(all_mapped_source_values)
2262
+ mapped_rows = is_mapped.sum()
2263
+ unmapped_rows = (~is_mapped).sum()
2264
+
2265
+ # Get indices of successful and unsuccessful mappings
2266
+ successful_indices = df.index[is_mapped].tolist()
2267
+ unsuccessful_indices = df.index[~is_mapped].tolist()
2268
+
2269
+ # Step 4c: Count occurrences of each source VALUE in the data
2270
+ # Analyzes what values actually appeared in the data, regardless of mapping definitions.
2271
+ # (1) mapped_value_counts_dict: values that were successfully mapped (for Step 4d to use)
2272
+ # (2) unmapped_value_counts_dict: values that didn't map (to identify gaps in mapping rules)
2273
+ # (3) Convert Series to dict to avoid truncation and ensure ALL values are preserved in statistics.
2274
+ mapped_values = concatenated_source_series[is_mapped]
2275
+ unmapped_values = concatenated_source_series[~is_mapped]
2276
+ mapped_value_counts_dict = {}
2277
+ unmapped_value_counts_dict = {}
2278
+ if len(mapped_values) > 0:
2279
+ mapped_value_counts_dict = dict(mapped_values.value_counts())
2280
+ if len(unmapped_values) > 0:
2281
+ unmapped_value_counts_dict = dict(unmapped_values.value_counts())
2282
+
2283
+ # Step 4d: Compare defined MAPPING RULES (from Step 2.B) against actual data.
2284
+ # Analyzes which mapping definitions were used vs unused, regardless of what values exist in data.
2285
+ # Different from Step 4c: 4c analyzes data values, 4d analyzes mapping rules.
2286
+ # (1) unused mappings: rules defined but never encountered (possibly typos or outdated rules)
2287
+ # (2) used mappings: rules that actually fired and how many times (validates mapping logic)
2288
+ # Need to compare normalized mapping inputs against normalized data values
2289
+ # Build a map of normalized values to their counts (sum counts for values that normalize to same key)
2290
+ normalized_mapped_inputs = {}
2291
+ for orig_val, count in mapped_value_counts_dict.items():
2292
+ normalized_val = self._normalize_value_for_mapping(orig_val, mapping_strategy)
2293
+ normalized_mapped_inputs[normalized_val] = normalized_mapped_inputs.get(normalized_val, 0) + count
2294
+
2295
+ unused_mapping_values = []
2296
+ used_mapping_values_with_counts = []
2297
+ for mapping_def in defined_mapping_values:
2298
+ mapping_input = mapping_def['input']
2299
+ normalized_mapping_input = self._normalize_value_for_mapping(mapping_input, mapping_strategy)
2300
+ # found in data: used rule (compare normalized values)
2301
+ if normalized_mapping_input in normalized_mapped_inputs:
2302
+ used_mapping_values_with_counts.append({
2303
+ 'input': mapping_input,
2304
+ 'output': mapping_def['output'],
2305
+ 'count': normalized_mapped_inputs.get(normalized_mapping_input, 0)
2306
+ })
2307
+ # never found in the data (unused rule)
2308
+ else:
2309
+ unused_mapping_values.append(mapping_def)
2310
+
2311
+ # Step 4e: Optionally drop unmapped rows when requested and no default value exists
2312
+ if drop_unmapped and not default_val:
2313
+ df = df[is_mapped]
2314
+
2315
+ # Step 4f: Calculate mapping success rate and store all statistics for this record.
2316
+ # At this point, we have all the information needed (counts from 4c, used/unused from 4d, row counts from 4b).
2317
+ # We store statistics per record because each record has different source/target fields and relation types, so users can analyze effectiveness per record and per mapping rule.
2318
+ mapping_success_pct = (mapped_rows / total_rows * 100) if total_rows > 0 else 0.0
2319
+ statistics_rows.append({
2320
+ 'record_id': record.id,
2321
+ 'source_fields': '|'.join(source_field_names),
2322
+ 'target_fields': '|'.join(target_field_names),
2323
+ 'relation_type': record.relation_type,
2324
+ 'mapping_strategy': mapping_strategy,
2325
+ 'total_rows': total_rows,
2326
+ 'mapped_rows': mapped_rows,
2327
+ 'unmapped_rows': unmapped_rows,
2328
+ 'mapping_success_pct': mapping_success_pct,
2329
+ 'successful_indices': successful_indices,
2330
+ 'unsuccessful_indices': unsuccessful_indices,
2331
+ 'mapped_value_counts': mapped_value_counts_dict,
2332
+ 'unmapped_value_counts': unmapped_value_counts_dict,
2333
+ 'used_mapping_values': used_mapping_values_with_counts,
2334
+ 'unused_mapping_values': unused_mapping_values
2335
+ })
2336
+
2337
+ if statistics_rows:
2338
+ stats_df = pd.DataFrame(statistics_rows)
2339
+ else:
2340
+ stats_df = pd.DataFrame(columns=[
2341
+ 'record_id', 'source_fields', 'target_fields', 'relation_type',
2342
+ 'mapping_strategy', 'total_rows', 'mapped_rows', 'unmapped_rows', 'mapping_success_pct',
2343
+ 'successful_indices', 'unsuccessful_indices',
2344
+ 'mapped_value_counts', 'unmapped_value_counts',
2345
+ 'used_mapping_values', 'unused_mapping_values'
2346
+ ])
2347
+
2348
+ return df, handled_source_fields, stats_df
2349
+
2350
+ def rename_fields(
2351
+ self,
2352
+ df: pd.DataFrame,
2353
+ scenario_name: str,
2354
+ columns_to_keep: Optional[List[str]] = None,
2355
+ drop_unmapped: bool = True
2356
+ ) -> Tuple[pd.DataFrame, pd.DataFrame]:
2357
+ """Renames and transforms DataFrame columns based on scenario field mappings.
2358
+
2359
+ Handles complex mappings like concatenation (many-to-one) and splitting (one-to-many).
2360
+ Records with value mappings are logged but skipped (use `apply_value_mappings` for those).
2361
+
2362
+ Args:
2363
+ df: Input DataFrame.
2364
+ scenario_name: Name of the scenario.
2365
+ columns_to_keep: List of source column names to preserve even if mapped.
2366
+ drop_unmapped: If True, drops source columns that were successfully mapped (unless in columns_to_keep).
2367
+
2368
+ Returns:
2369
+ Tuple containing:
2370
+ - Modified DataFrame with renamed/transformed columns based on scenario mappings.
2371
+ - Statistics DataFrame (stats_df) with detailed mapping information (see Notes).
2372
+
2373
+ Warns:
2374
+ UserWarning: If scenario_name is not found; the original DataFrame is returned with an empty statistics DataFrame.
2375
+
2376
+ Logic types:
2377
+ - "concat": Concatenate all sources with '|', fill all targets
2378
+ - "fill": Map source[i] → target[i] in order
2379
+ - "keep source": Keep source fields unchanged, no target columns
2380
+ - Default (no logic): Uses relation_type:
2381
+ * one_to_one: Direct mapping source[0] → target[0]
2382
+ * one_to_many: Duplicate single source value to all target fields
2383
+ * many_to_one: Concatenate all source fields with '|' into single target
2384
+ * many_to_many (n:m): Behavior depends on field counts:
2385
+ - n == m: Direct 1:1 mapping source[i] → target[i]
2386
+ - n < m: Map first n sources to first n targets, fill remaining with last source
2387
+ - n > m: Concatenate all sources to each target field
2388
+
2389
+ Notes:
2390
+ The statistics DataFrame (stats_df) provides comprehensive visibility into the transformation process:
2391
+
2392
+ **What it reports:**
2393
+ - For each record processed: source/target column mappings, mapping status (mapped/source_missing/kept_source/value_mapped),
2394
+ number of rows affected, mapping type (concat/fill/one_to_one/etc.), and default logic used (if applicable).
2395
+ - For unmapped sources: Columns that exist in the DataFrame but weren't processed by any record.
2396
+
2397
+ **Statistics DataFrame columns:**
2398
+ - record_id: Unique identifier for the mapping record (None for unmapped sources)
2399
+ - source_column: Source column name(s), pipe-separated if multiple
2400
+ - target_column: Target column name (None if source was kept or unmapped)
2401
+ - mapping_status: Status of the mapping ('mapped', 'source_missing', 'kept_source', 'value_mapped', 'not_in_mapping')
2402
+ - source_existed: Whether source column(s) existed in the DataFrame
2403
+ - rows_affected: Number of rows in the DataFrame
2404
+ - mapping_type: Type of mapping applied ('concat', 'fill', 'one_to_one', 'one_to_many', 'many_to_one', 'many_to_many', etc.)
2405
+ - logic: Original logic string from the record
2406
+ - relation_type: Relation type from the record ('one_to_one', 'one_to_many', 'many_to_one', 'many_to_many')
2407
+ - source_count: Number of source fields in the record
2408
+ - target_count: Number of target fields in the record
2409
+ - default_logic: Description of default logic used if no explicit logic was specified (e.g., 'direct_mapping', 'concatenate_with_pipe')
2410
+
2411
+ Examples
2412
+ --------
2413
+ Example 1: Renaming columns using one_to_one mapping (no logic, uses default).
2414
+
2415
+ >>> df = pd.DataFrame({'id': [1, 2], 'first_name': ['John', 'Jane']})
2416
+ >>> df
2417
+ id first_name
2418
+ 0 1 John
2419
+ 1 2 Jane
2420
+
2421
+ Scenario maps 'first_name' → 'firstname' (one_to_one, no logic specified).
2422
+ Default behavior: direct mapping source[0] → target[0].
2423
+
2424
+ >>> df, stats_df = scenarios.rename_fields(df, 'My Scenario')
2425
+ >>> df
2426
+ id firstname
2427
+ 0 1 John
2428
+ 1 2 Jane
2429
+
2430
+ >>> stats_df[['source_column', 'target_column', 'logic', 'relation_type', 'default_logic']]
2431
+ source_column target_column logic relation_type default_logic
2432
+ 0 first_name firstname None one_to_one direct_mapping
2433
+
2434
+
2435
+ Example 2: Using many_to_one mapping (no logic, uses default).
2436
+
2437
+ >>> df = pd.DataFrame({'id': [1, 2], 'street': ['Main St', 'Oak Ave'], 'city': ['Amsterdam', 'Rotterdam']})
2438
+ >>> df
2439
+ id street city
2440
+ 0 1 Main St Amsterdam
2441
+ 1 2 Oak Ave Rotterdam
2442
+
2443
+ Scenario maps 'street'|'city' → 'address' (many_to_one, no logic specified).
2444
+ Default behavior: concatenate all source fields with '|' separator into single target.
2445
+
2446
+ >>> df, stats_df = scenarios.rename_fields(df, 'My Scenario')
2447
+ >>> df
2448
+ id address
2449
+ 0 1 Main St|Amsterdam
2450
+ 1 2 Oak Ave|Rotterdam
2451
+
2452
+ >>> stats_df[['source_column', 'target_column', 'logic', 'relation_type', 'default_logic']]
2453
+ source_column target_column logic relation_type default_logic
2454
+ 0 street|city address None many_to_one concatenate_with_pipe
2455
+
2456
+
2457
+ Example 3: Using many_to_many mapping with explicit 'concat' logic.
2458
+
2459
+ >>> df = pd.DataFrame({
2460
+ ... 'id': [1, 2],
2461
+ ... 'first_name': ['John', 'Jane'],
2462
+ ... 'last_name': ['Doe', 'Smith']
2463
+ ... })
2464
+ >>> df
2465
+ id first_name last_name
2466
+ 0 1 John Doe
2467
+ 1 2 Jane Smith
2468
+
2469
+ Scenario maps 'first_name'|'last_name' → 'full_name'|'display_name' (many_to_many with explicit 'concat' logic).
2470
+
2471
+ >>> df, stats_df = scenarios.rename_fields(df, 'My Scenario')
2472
+ >>> df
2473
+ id full_name display_name
2474
+ 0 1 John|Doe John|Doe
2475
+ 1 2 Jane|Smith Jane|Smith
2476
+
2477
+ With 'concat' logic, all source fields are concatenated and filled into all target fields.
2478
+
2479
+ >>> stats_df[['source_column', 'target_column', 'logic', 'relation_type']]
2480
+ source_column target_column logic relation_type
2481
+ 0 first_name|last_name full_name concat many_to_many
2482
+ 1 first_name|last_name display_name concat many_to_many
2483
+
2484
+
2485
+ Example 4: Using many_to_many mapping with explicit 'fill' logic.
2486
+
2487
+ >>> df = pd.DataFrame({
2488
+ ... 'id': [1, 2],
2489
+ ... 'first_name': ['John', 'Jane'],
2490
+ ... 'last_name': ['Doe', 'Smith']
2491
+ ... })
2492
+ >>> df
2493
+ id first_name last_name
2494
+ 0 1 John Doe
2495
+ 1 2 Jane Smith
2496
+
2497
+ Scenario maps 'first_name'|'last_name' → 'first'|'last' (many_to_many with explicit 'fill' logic).
2498
+
2499
+ >>> df, stats_df = scenarios.rename_fields(df, 'My Scenario')
2500
+ >>> df
2501
+ id first last
2502
+ 0 1 John Doe
2503
+ 1 2 Jane Smith
2504
+
2505
+ With 'fill' logic, source[i] maps to target[i] in order (1:1 mapping by index).
2506
+
2507
+ >>> stats_df[['source_column', 'target_column', 'logic', 'relation_type']]
2508
+ source_column target_column logic relation_type
2509
+ 0 first_name first fill many_to_many
2510
+ 1 last_name last fill many_to_many
2511
+
2512
+
2513
+ Example 5: Using 'keep source' logic.
2514
+
2515
+ >>> df = pd.DataFrame({'id': [1, 2], 'employee_id': ['E001', 'E002'], 'department': ['IT', 'HR']})
2516
+ >>> df
2517
+ id employee_id department
2518
+ 0 1 E001 IT
2519
+ 1 2 E002 HR
2520
+
2521
+ Scenario has 'keep source' logic for 'employee_id' and 'department' fields.
2522
+
2523
+ >>> df, stats_df = scenarios.rename_fields(df, 'My Scenario')
2524
+ >>> df
2525
+ id employee_id department
2526
+ 0 1 E001 IT
2527
+ 1 2 E002 HR
2528
+
2529
+ Source columns are kept unchanged; no target columns are created.
2530
+
2531
+ >>> stats_df[['source_column', 'target_column', 'mapping_status', 'logic']]
2532
+ source_column target_column mapping_status logic
2533
+ 0 employee_id None kept_source keep source
2534
+ 1 department None kept_source keep source
2535
+
2536
+
2537
+ Example 6: Using one_to_many mapping without logic (uses default).
2538
+
2539
+ >>> df = pd.DataFrame({'id': [1, 2], 'postal_code': ['1234', '5678']})
2540
+ >>> df
2541
+ id postal_code
2542
+ 0 1 1234
2543
+ 1 2 5678
2544
+
2545
+ Scenario maps 'postal_code' → 'zip'|'postcode' (one_to_many, no logic specified).
2546
+ Default behavior: duplicate single source value to all target fields.
2547
+
2548
+ >>> df, stats_df = scenarios.rename_fields(df, 'My Scenario')
2549
+ >>> df
2550
+ id zip postcode
2551
+ 0 1 1234 1234
2552
+ 1 2 5678 5678
2553
+
2554
+ Both target columns are filled with the same source value (duplicated to all targets).
2555
+
2556
+ >>> stats_df[['source_column', 'target_column', 'logic', 'relation_type', 'default_logic']]
2557
+ source_column target_column logic relation_type default_logic
2558
+ 0 postal_code zip None one_to_many duplicate_to_all_targets
2559
+ 1 postal_code postcode None one_to_many duplicate_to_all_targets
2560
+ """
2561
+ if columns_to_keep is None:
2562
+ columns_to_keep = []
2563
+
2564
+ try:
2565
+ scenario = self[scenario_name]
2566
+ except KeyError:
2567
+ warnings.warn(f"Scenario '{scenario_name}' not found. Returning original DataFrame with empty statistics.", stacklevel=2)
2568
+ empty_stats = pd.DataFrame(
2569
+ columns=[
2570
+ 'record_id', 'source_column', 'target_column', 'mapping_status',
2571
+ 'source_existed', 'rows_affected', 'mapping_type', 'logic',
2572
+ 'relation_type', 'source_count', 'target_count', 'default_logic'
2573
+ ]
2574
+ )
2575
+ return df, empty_stats
2576
+
2577
+ # objects for tracking statistics
2578
+ newly_created_target_fields = set()
2579
+ source_fields_to_keep = set()
2580
+ stats_data = []
2581
+
2582
+ # Handler dictionaries route records to transformation methods by logic (explicit) or relation_type (default).
2583
+ # Replaces long if/elif chains: adding a handler only requires a new dictionary entry.
2584
+ logic_handlers = {
2585
+ 'concat': self._apply_concat,
2586
+ 'fill': self._apply_fill,
2587
+ 'keepsource': self._apply_keep_source,
2588
+ 'onlysource': self._apply_keep_source
2589
+ }
2590
+
2591
+ default_handlers = {
2592
+ 'one_to_one': self._apply_one_to_one,
2593
+ 'one_to_many': self._apply_one_to_many,
2594
+ 'many_to_one': self._apply_many_to_one,
2595
+ 'many_to_many': self._apply_many_to_many
2596
+ }
2597
+
2598
+ for record in scenario.records:
2599
+ source_field_names = record.source.field_names
2600
+
2601
+ # Skip records with value mappings, they're handled by apply_value_mappings
2602
+ if record.mapping:
2603
+ self._apply_value_mapping_logging(df, record, stats_data, newly_created_target_fields)
2604
+ continue
2605
+
2606
+ normalized_logic = self._normalize_logic(record.logic)
2607
+ existing_sources = [s for s in source_field_names if s in df.columns]
2608
+
2609
+ # Check if normalized logic contains any handler key (substring match)
2610
+ # This handles cases like "keep source | parse to from date" -> "keepsourceparsetofromdate"
2611
+ matched_handler_key = None
2612
+ for handler_key in logic_handlers.keys():
2613
+ if handler_key in normalized_logic:
2614
+ matched_handler_key = handler_key
2615
+ break
2616
+
2617
+ if matched_handler_key:
2618
+ logic_handler = logic_handlers[matched_handler_key]
2619
+ # 'keep source' handlers don't create columns, so they only need kept_sources to track preserved fields.
2620
+ # Other handlers need existing_sources and created_targets to filter and track new columns.
2621
+ if matched_handler_key in ('keepsource', 'onlysource'):
2622
+ logic_handler(
2623
+ df=df,
2624
+ record=record,
2625
+ stats_data=stats_data,
2626
+ kept_sources=source_fields_to_keep
2627
+ )
2628
+ else:
2629
+ logic_handler(
2630
+ df=df,
2631
+ record=record,
2632
+ existing_sources=existing_sources,
2633
+ stats_data=stats_data,
2634
+ created_targets=newly_created_target_fields
2635
+ )
2636
+ else:
2637
+ default_handler = default_handlers.get(record.relation_type)
2638
+ if default_handler:
2639
+ # Only many_to_many accepts kept_sources.
2640
+ if record.relation_type == 'many_to_many':
2641
+ default_handler(
2642
+ df=df,
2643
+ record=record,
2644
+ existing_sources=existing_sources,
2645
+ stats_data=stats_data,
2646
+ created_targets=newly_created_target_fields,
2647
+ kept_sources=source_fields_to_keep
2648
+ )
2649
+ else:
2650
+ default_handler(
2651
+ df=df,
2652
+ record=record,
2653
+ existing_sources=existing_sources,
2654
+ stats_data=stats_data,
2655
+ created_targets=newly_created_target_fields
2656
+ )
2657
+ else:
2658
+ raise ValueError(
2659
+ f"Unknown relation_type '{record.relation_type}' for record {record.id}. "
2660
+ f"Supported types: {', '.join(default_handlers.keys())}"
2661
+ )
2662
+
2663
+ # --- Report
2664
+ stats_df = self._generate_statistics_dataframe(
2665
+ scenario=scenario,
2666
+ df=df,
2667
+ stats_data=stats_data,
2668
+ source_fields_to_keep=source_fields_to_keep
2669
+ )
2670
+
2671
+ # --- Clean up
2672
+ df = self._finalize_dataframe_columns(
2673
+ df=df,
2674
+ scenario=scenario,
2675
+ drop_unmapped=drop_unmapped,
2676
+ newly_created_target_fields=newly_created_target_fields,
2677
+ source_fields_to_keep=source_fields_to_keep,
2678
+ columns_to_keep=columns_to_keep
2679
+ )
2680
+
2681
+ return df, stats_df
2682
+
2683
+ # ============================================================================
2684
+ # Rename Handlers
2685
+ # ============================================================================
2686
+
2687
+ def _apply_keep_source(
2688
+ self,
2689
+ df: pd.DataFrame,
2690
+ record,
2691
+ stats_data: List[dict],
2692
+ kept_sources: Set[str]
2693
+ ) -> None:
2694
+ """Applies 'keep source' logic: preserves source columns without creating targets.
2695
+
2696
+ Applied when the logic is "keepsource" or "onlysource". This indicates that the
2697
+ source fields should be retained in the DataFrame as-is, and no corresponding
2698
+ target columns should be generated, leaving the developer free to apply custom logic.
2699
+
2700
+ Args:
2701
+ df (pd.DataFrame): The DataFrame being processed.
2702
+ record: The scenario record with "keepsource" logic.
2703
+ stats_data (List[dict]): List to append statistics to.
2704
+ kept_sources (Set[str]): Set to track source columns that must be preserved.
2705
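+
+ Example (illustrative sketch; assumes `record` is a parsed Record whose
+ source.field_names == ['emp_id'], a hypothetical field):
+ >>> df = pd.DataFrame({'emp_id': ['E1', 'E2']})
+ >>> kept_sources = set()
+ >>> self._apply_keep_source(df=df, record=record, stats_data=[], kept_sources=kept_sources)
+ >>> sorted(kept_sources)
+ ['emp_id']
+ >>> 'emp_id' in df.columns  # source preserved; no target column created
+ True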
+ """
2706
+ source_field_names = record.source.field_names
2707
+ for source_field in source_field_names:
2708
+ kept_sources.add(source_field)
2709
+ source_existed = source_field in df.columns
2710
+ self._log_transformation_stats(
2711
+ stats_data=stats_data,
2712
+ record=record,
2713
+ target_col=None,
2714
+ source_col=source_field,
2715
+ status='kept_source',
2716
+ mapping_type='keep_source',
2717
+ source_existed=source_existed,
2718
+ df_length=len(df) if source_existed else 0
2719
+ )
2720
+
2721
+ def _apply_concat(
2722
+ self,
2723
+ df: pd.DataFrame,
2724
+ record,
2725
+ existing_sources: List[str],
2726
+ stats_data: List[dict],
2727
+ created_targets: Set[str],
2728
+ ) -> None:
2729
+ """Applies 'concat' logic: joins all sources and fills all targets.
2730
+
2731
+ Applied when the logic is explicitly set to "concat". It concatenates values from
2732
+ all available source columns using a pipe ('|') separator and assigns this result
2733
+ to every target column defined in the record.
2734
+
2735
+ Args:
2736
+ df (pd.DataFrame): The DataFrame being processed.
2737
+ record: The scenario record with "concat" logic.
2738
+ existing_sources (List[str]): List of source fields present in the DataFrame.
2739
+ stats_data (List[dict]): List to append statistics to.
2740
+ created_targets (Set[str]): Set to track created target columns.
2741
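+
+ Example (illustrative sketch; assumes `record` has a hypothetical target
+ 'full_name' and both source columns exist in the DataFrame):
+ >>> df = pd.DataFrame({'first': ['John'], 'last': ['Doe']})
+ >>> created = set()
+ >>> self._apply_concat(df=df, record=record, existing_sources=['first', 'last'],
+ ...                    stats_data=[], created_targets=created)
+ >>> df['full_name'].tolist()
+ ['John|Doe']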
+ """
2742
+ target_field_names = record.target.field_names
2743
+ if len(existing_sources) > 0:
2744
+ concatenated = self._concatenate_source_fields(df=df, source_fields=existing_sources)
2745
+ for target_field in target_field_names:
2746
+ created_targets.add(target_field)
2747
+ df[target_field] = concatenated
2748
+ self._log_transformation_stats(
2749
+ stats_data=stats_data,
2750
+ record=record,
2751
+ target_col=target_field,
2752
+ source_col=existing_sources,
2753
+ status='mapped',
2754
+ mapping_type='concat',
2755
+ df_length=len(df)
2756
+ )
2757
+ else:
2758
+ for target_field in target_field_names:
2759
+ created_targets.add(target_field)
2760
+ df[target_field] = ''
2761
+ self._log_transformation_stats(
2762
+ stats_data=stats_data,
2763
+ record=record,
2764
+ target_col=target_field,
2765
+ source_col=record.source.field_names,
2766
+ status='source_missing',
2767
+ mapping_type='concat',
2768
+ source_existed=False,
2769
+ df_length=len(df)
2770
+ )
2771
+
2772
+ def _apply_fill(
2773
+ self,
2774
+ df: pd.DataFrame,
2775
+ record,
2776
+ existing_sources: List[str],
2777
+ stats_data: List[dict],
2778
+ created_targets: Set[str],
2779
+ ) -> None:
2780
+ """Applies 'fill' logic: maps source[i] to target[i] sequentially.
2781
+
2782
+ If the logic is explicitly set to "fill", it maps the first source field
2783
+ to the first target field, the second source to the second target, and so on.
2784
+
2785
+ Args:
2786
+ df (pd.DataFrame): The DataFrame being processed.
2787
+ record: The scenario record with "fill" logic.
2788
+ existing_sources (List[str]): List of source fields present in the DataFrame.
2789
+ stats_data (List[dict]): List to append statistics to.
2790
+ created_targets (Set[str]): Set to track created target columns.
2791
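+
+ Example (illustrative sketch; assumes `record` has hypothetical targets
+ ['addr_1', 'addr_2'] while only one source column exists):
+ >>> df = pd.DataFrame({'street': ['Main St 1']})
+ >>> created = set()
+ >>> self._apply_fill(df=df, record=record, existing_sources=['street'],
+ ...                  stats_data=[], created_targets=created)
+ >>> df['addr_1'].tolist(), df['addr_2'].tolist()
+ (['Main St 1'], [''])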
+ """
2792
+ target_field_names = record.target.field_names
2793
+ n = min(len(existing_sources), len(target_field_names))
2794
+
2795
+ for i in range(n):
2796
+ source_field = existing_sources[i]
2797
+ target_field = target_field_names[i]
2798
+ created_targets.add(target_field)
2799
+ df[target_field] = df[source_field]
2800
+ self._log_transformation_stats(
2801
+ stats_data=stats_data,
2802
+ record=record,
2803
+ target_col=target_field,
2804
+ source_col=source_field,
2805
+ status='mapped',
2806
+ mapping_type='fill',
2807
+ df_length=len(df)
2808
+ )
2809
+
2810
+ if len(target_field_names) > len(existing_sources):
2811
+ for i in range(len(existing_sources), len(target_field_names)):
2812
+ target_field = target_field_names[i]
2813
+ created_targets.add(target_field)
2814
+ df[target_field] = ''
2815
+ self._log_transformation_stats(
2816
+ stats_data=stats_data,
2817
+ record=record,
2818
+ target_col=target_field,
2819
+ source_col=None,
2820
+ status='source_missing',
2821
+ mapping_type='fill',
2822
+ source_existed=False,
2823
+ df_length=len(df)
2824
+ )
2825
+
2826
+ def _apply_one_to_one(
2827
+ self,
2828
+ df: pd.DataFrame,
2829
+ record,
2830
+ existing_sources: List[str],
2831
+ stats_data: List[dict],
2832
+ created_targets: Set[str],
2833
+ ) -> None:
2834
+ """Applies default one-to-one logic: Direct value copy.
2835
+
2836
+ Applied when no explicit logic is provided and the relation type is 'one_to_one'.
2837
+ Maps a single source field to a single target field.
2838
+
2839
+ Args:
2840
+ df (pd.DataFrame): The DataFrame being processed.
2841
+ record: The scenario record.
2842
+ existing_sources (List[str]): List containing the single source field.
2843
+ stats_data (List[dict]): List to append statistics to.
2844
+ created_targets (Set[str]): Set to track created target columns.
2845
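+
+ Example (illustrative sketch; assumes `record` maps a hypothetical
+ 'gender' source to a 'gender_code' target):
+ >>> df = pd.DataFrame({'gender': ['F', 'M']})
+ >>> created = set()
+ >>> self._apply_one_to_one(df=df, record=record, existing_sources=['gender'],
+ ...                        stats_data=[], created_targets=created)
+ >>> df['gender_code'].tolist()
+ ['F', 'M']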
+ """
2846
+ target_field_names = record.target.field_names
2847
+ n_sources = len(existing_sources)
2848
+ n_targets = len(target_field_names)
2849
+
2850
+ if n_sources > 0 and n_targets > 0:
2851
+ source_field = existing_sources[0]
2852
+ target_field = target_field_names[0]
2853
+ created_targets.add(target_field)
2854
+ df[target_field] = df[source_field]
2855
+ self._log_transformation_stats(
2856
+ stats_data=stats_data, record=record, target_col=target_field, source_col=source_field,
2857
+ status='mapped', mapping_type='one_to_one', default_logic='direct_mapping', df_length=len(df)
2858
+ )
2859
+ elif n_targets > 0:
2860
+ target_field = target_field_names[0]
2861
+ created_targets.add(target_field)
2862
+ df[target_field] = ''
2863
+ self._log_transformation_stats(
2864
+ stats_data=stats_data, record=record, target_col=target_field,
2865
+ source_col=record.source.field_names[0] if record.source.field_names else None,
2866
+ status='source_missing', mapping_type='one_to_one', default_logic='direct_mapping',
2867
+ source_existed=False, df_length=len(df)
2868
+ )
2869
+
2870
+ def _apply_one_to_many(
2871
+ self,
2872
+ df: pd.DataFrame,
2873
+ record,
2874
+ existing_sources: List[str],
2875
+ stats_data: List[dict],
2876
+ created_targets: Set[str],
2877
+ ) -> None:
2878
+ """Applies default one-to-many logic: Duplicate source value to all targets.
2879
+
2880
+ Applied when no explicit logic is provided and the relation type is 'one_to_many'.
2881
+ A single source field is mapped to multiple target fields.
2882
+
2883
+ Args:
2884
+ df (pd.DataFrame): The DataFrame being processed.
2885
+ record: The scenario record.
2886
+ existing_sources (List[str]): List containing the single source field.
2887
+ stats_data (List[dict]): List to append statistics to.
2888
+ created_targets (Set[str]): Set to track created target columns.
2889
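+
+ Example (illustrative sketch; assumes `record` duplicates a hypothetical
+ 'email' source into 'work_email' and 'login' targets):
+ >>> df = pd.DataFrame({'email': ['j@example.com']})
+ >>> created = set()
+ >>> self._apply_one_to_many(df=df, record=record, existing_sources=['email'],
+ ...                         stats_data=[], created_targets=created)
+ >>> df['work_email'].tolist() == df['login'].tolist() == ['j@example.com']
+ True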
+ """
2890
+ target_field_names = record.target.field_names
2891
+
2892
+ if len(existing_sources) > 0:
2893
+ source_field = existing_sources[0]
2894
+ for target_field in target_field_names:
2895
+ created_targets.add(target_field)
2896
+ df[target_field] = df[source_field]
2897
+ self._log_transformation_stats(
2898
+ stats_data=stats_data, record=record, target_col=target_field, source_col=source_field,
2899
+ status='mapped', mapping_type='one_to_many', default_logic='duplicate_to_all_targets', df_length=len(df)
2900
+ )
2901
+ else:
2902
+ for target_field in target_field_names:
2903
+ created_targets.add(target_field)
2904
+ df[target_field] = ''
2905
+ self._log_transformation_stats(
2906
+ stats_data=stats_data, record=record, target_col=target_field,
2907
+ source_col=record.source.field_names[0] if record.source.field_names else None,
2908
+ status='source_missing', mapping_type='one_to_many', default_logic='duplicate_to_all_targets',
2909
+ source_existed=False, df_length=len(df)
2910
+ )
2911
+
2912
+ def _apply_many_to_one(
2913
+ self,
2914
+ df: pd.DataFrame,
2915
+ record,
2916
+ existing_sources: List[str],
2917
+ stats_data: List[dict],
2918
+ created_targets: Set[str],
2919
+ ) -> None:
2920
+ """Applies default many-to-one logic: Concatenate sources with pipe separator.
2921
+
2922
+ Applied when no explicit logic is provided and the relation type is 'many_to_one'.
2923
+ Multiple source fields are mapped to a single target field via concatenation of the source values.
2924
+
2925
+ Args:
2926
+ df (pd.DataFrame): The DataFrame being processed.
2927
+ record: The scenario record.
2928
+ existing_sources (List[str]): List of source fields present in the DataFrame.
2929
+ stats_data (List[dict]): List to append statistics to.
2930
+ created_targets (Set[str]): Set to track created target columns.
2931
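+
+ Example (illustrative sketch; assumes `record` maps hypothetical
+ 'first_name'/'last_name' sources to a 'full_name' target):
+ >>> df = pd.DataFrame({'first_name': ['John'], 'last_name': ['Doe']})
+ >>> created = set()
+ >>> self._apply_many_to_one(df=df, record=record,
+ ...                         existing_sources=['first_name', 'last_name'],
+ ...                         stats_data=[], created_targets=created)
+ >>> df['full_name'].tolist()
+ ['John|Doe']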
+ """
2932
+ target_field_names = record.target.field_names
2933
+
2934
+ if len(existing_sources) > 0:
2935
+ concatenated = self._concatenate_source_fields(df=df, source_fields=existing_sources)
2936
+ target_field = target_field_names[0]
2937
+ created_targets.add(target_field)
2938
+ df[target_field] = concatenated
2939
+ self._log_transformation_stats(
2940
+ stats_data=stats_data, record=record, target_col=target_field, source_col=existing_sources,
2941
+ status='mapped', mapping_type='many_to_one', default_logic='concatenate_with_pipe', df_length=len(df)
2942
+ )
2943
+ elif len(target_field_names) > 0:
2944
+ target_field = target_field_names[0]
2945
+ created_targets.add(target_field)
2946
+ df[target_field] = ''
2947
+ self._log_transformation_stats(
2948
+ stats_data=stats_data, record=record, target_col=target_field, source_col=record.source.field_names,
2949
+ status='source_missing', mapping_type='many_to_one', default_logic='concatenate_with_pipe',
2950
+ source_existed=False, df_length=len(df)
2951
+ )
2952
+
2953
+ def _apply_many_to_many(
2954
+ self,
2955
+ df: pd.DataFrame,
2956
+ record,
2957
+ existing_sources: List[str],
2958
+ stats_data: List[dict],
2959
+ created_targets: Set[str],
2960
+ kept_sources: Optional[Set[str]] = None
2961
+ ) -> None:
2962
+ """Applies default many-to-many logic: Variable behavior based on field counts.
2963
+
2964
+ Applied when no explicit logic is provided and the relation type is 'many_to_many'.
2965
+ The behavior adapts based on the number of source fields (N) vs target fields (M).
2966
+
2967
+ Defaults for (N:M) mappings with different cardinalities:
2968
+ - N == M: Direct 1:1 mapping
2969
+ - N < M: Maps available sources 1:1, then fills remaining targets with the last source.
2970
+ - N > M: Concatenates all sources into every target field.
2971
+
2972
+ Args:
2973
+ df (pd.DataFrame): The DataFrame being processed.
2974
+ record: The scenario record.
2975
+ existing_sources (List[str]): List of source fields present in the DataFrame.
2976
+ stats_data (List[dict]): List to append statistics to.
2977
+ created_targets (Set[str]): Set to track created target columns.
2978
+ kept_sources: Optional parameter for interface consistency (unused in this method).
2979
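+
+ Example (illustrative sketch of the N < M case; assumes `record` has
+ hypothetical targets ['t1', 't2', 't3'] and two sources exist):
+ >>> df = pd.DataFrame({'s1': ['a'], 's2': ['b']})
+ >>> created = set()
+ >>> self._apply_many_to_many(df=df, record=record, existing_sources=['s1', 's2'],
+ ...                          stats_data=[], created_targets=created)
+ >>> df['t1'].tolist(), df['t2'].tolist(), df['t3'].tolist()
+ (['a'], ['b'], ['b'])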
+ """
2980
+ target_field_names = record.target.field_names
2981
+ n_sources = len(existing_sources)
2982
+ n_targets = len(target_field_names)
2983
+
2984
+ # Equal: 1:1 mapping
2985
+ if n_sources == n_targets:
2986
+ for i in range(n_sources):
2987
+ source_field = existing_sources[i]
2988
+ target_field = target_field_names[i]
2989
+ created_targets.add(target_field)
2990
+ df[target_field] = df[source_field]
2991
+ self._log_transformation_stats(
2992
+ stats_data=stats_data, record=record, target_col=target_field, source_col=source_field,
2993
+ status='mapped', mapping_type='many_to_many_equal', default_logic='direct_1_to_1_mapping', df_length=len(df)
2994
+ )
2995
+
2996
+ # Less sources: Map 1:1 then fill remaining with last source
2997
+ elif n_sources < n_targets:
2998
+ # Map first n
2999
+ for i in range(n_sources):
3000
+ source_field = existing_sources[i]
3001
+ target_field = target_field_names[i]
3002
+ created_targets.add(target_field)
3003
+ df[target_field] = df[source_field]
3004
+ self._log_transformation_stats(
3005
+ stats_data=stats_data, record=record, target_col=target_field, source_col=source_field,
3006
+ status='mapped', mapping_type='many_to_many_n_lt_m', default_logic='map_n_then_fill_remaining', df_length=len(df)
3007
+ )
3008
+
3009
+ # Fill remaining
3010
+ if n_sources > 0:
3011
+ last_source = existing_sources[-1]
3012
+ for i in range(n_sources, n_targets):
3013
+ target_field = target_field_names[i]
3014
+ created_targets.add(target_field)
3015
+ df[target_field] = df[last_source]
3016
+ self._log_transformation_stats(
3017
+ stats_data=stats_data, record=record, target_col=target_field, source_col=last_source,
3018
+ status='mapped', mapping_type='many_to_many_n_lt_m', default_logic='map_n_then_fill_remaining', df_length=len(df)
3019
+ )
3020
+ else: # No sources at all
3021
+ for i in range(n_sources, n_targets):
3022
+ target_field = target_field_names[i]
3023
+ created_targets.add(target_field)
3024
+ df[target_field] = ''
3025
+ self._log_transformation_stats(
3026
+ stats_data=stats_data, record=record, target_col=target_field, source_col=None,
3027
+ status='source_missing', mapping_type='many_to_many_n_lt_m', default_logic='map_n_then_fill_remaining',
3028
+ source_existed=False, df_length=len(df)
3029
+ )
3030
+
3031
+ # More sources: Concatenate all to each target
3032
+ else: # n_sources > n_targets
3033
+ if n_sources > 0:
3034
+ concatenated = self._concatenate_source_fields(df=df, source_fields=existing_sources)
3035
+ for target_field in target_field_names:
3036
+ created_targets.add(target_field)
3037
+ df[target_field] = concatenated
3038
+ self._log_transformation_stats(
3039
+ stats_data=stats_data, record=record, target_col=target_field, source_col=existing_sources,
3040
+ status='mapped', mapping_type='many_to_many_n_gt_m', default_logic='concatenate_all_to_each_target', df_length=len(df)
3041
+ )
3042
+ else:
3043
+ for target_field in target_field_names:
3044
+ created_targets.add(target_field)
3045
+ df[target_field] = ''
3046
+ self._log_transformation_stats(
3047
+ stats_data=stats_data, record=record, target_col=target_field, source_col=record.source.field_names,
3048
+ status='source_missing', mapping_type='many_to_many_n_gt_m', default_logic='concatenate_all_to_each_target',
3049
+ source_existed=False, df_length=len(df)
3050
+ )
3051
+
3052
+ # ============================================================================
3053
+ # Transformation Helpers
3054
+ # ============================================================================
3055
+
3056
+ def _generate_statistics_dataframe(
3057
+ self,
3058
+ scenario: ParsedScenario,
3059
+ df: pd.DataFrame,
3060
+ stats_data: List[dict],
3061
+ source_fields_to_keep: Set[str]
3062
+ ) -> pd.DataFrame:
3063
+ """Generates the statistics DataFrame, including unmapped source columns.
3064
+
3065
+ Args:
3066
+ scenario: The scenario object.
3067
+ df: The DataFrame being processed.
3068
+ stats_data: List of statistics dictionaries collected so far.
3069
+ source_fields_to_keep: Set of source fields that were explicitly kept.
3070
+
3071
+ Returns:
3072
+ pd.DataFrame: The final statistics DataFrame.
3073
+ """
3074
+ # Track mapped/unmapped source columns for statistics
3075
+ # Only track unmapped sources that exist in DataFrame and aren't intentionally kept
3076
+ all_scenario_sources = scenario.all_source_fields
3077
+ mapped_sources_from_records = set()
3078
+ for record in scenario.records:
3079
+ mapped_sources_from_records.update(record.source.field_names)
3080
+
3081
+ unmapped_sources_in_df = (all_scenario_sources & set(df.columns)) - mapped_sources_from_records - source_fields_to_keep
3082
+
3083
+ # Log unmapped sources using a dummy record
3084
+ dummy_record = DummyRecord()
3085
+ for unmapped_source in unmapped_sources_in_df:
3086
+ self._log_transformation_stats(
3087
+ stats_data=stats_data,
3088
+ record=dummy_record,
3089
+ target_col=None,
3090
+ source_col=unmapped_source,
3091
+ status='not_in_mapping',
3092
+ mapping_type='unknown',
3093
+ source_existed=True,
3094
+ df_length=len(df)
3095
+ )
3096
+
3097
+ # Build statistics DataFrame
3098
+ if stats_data:
3099
+ return pd.DataFrame(stats_data)
3100
+
3101
+ return pd.DataFrame(columns=[
3102
+ 'record_id', 'source_column', 'target_column', 'mapping_status',
3103
+ 'source_existed', 'rows_affected', 'mapping_type', 'logic',
3104
+ 'relation_type', 'source_count', 'target_count', 'default_logic'
3105
+ ])
3106
+
3107
+ def _finalize_dataframe_columns(
3108
+ self,
3109
+ df: pd.DataFrame,
3110
+ scenario: ParsedScenario,
3111
+ drop_unmapped: bool,
3112
+ newly_created_target_fields: Set[str],
3113
+ source_fields_to_keep: Set[str],
3114
+ columns_to_keep: List[str]
3115
+ ) -> pd.DataFrame:
3116
+ """Finalizes the DataFrame by dropping unmapped columns and ensuring expected columns exist.
3117
+
3118
+ Args:
3119
+ df: The DataFrame being processed.
3120
+ scenario: The scenario object.
3121
+ drop_unmapped: Whether to drop unmapped source columns.
3122
+ newly_created_target_fields: Set of target fields created during transformation.
3123
+ source_fields_to_keep: Set of source fields explicitly kept.
3124
+ columns_to_keep: List of additional columns to preserve.
3125
+
3126
+ Returns:
3127
+ pd.DataFrame: The finalized DataFrame.
3128
+ """
3129
+ # 1. Define protected columns (must not be dropped)
3130
+ protected_columns = {'id'} | newly_created_target_fields | source_fields_to_keep | set(columns_to_keep)
3131
+
3132
+ # 2. Drop mapped source columns if requested
3133
+ if drop_unmapped:
3134
+ mapped_source_columns = set()
3135
+ for record in scenario.records:
3136
+ # Skip value mappings (handled by apply_value_mappings)
3137
+ if record.mapping and hasattr(record.mapping, 'values'):
3138
+ continue
3139
+
3140
+ normalized_logic = self._normalize_logic(record.logic)
3141
+ is_keep_source = "keepsource" in normalized_logic or "onlysource" in normalized_logic
3142
+
3143
+ if not is_keep_source:
3144
+ mapped_source_columns.update(record.source.field_names)
3145
+
3146
+ columns_to_drop = [col for col in mapped_source_columns if col not in protected_columns]
3147
+ df = df.drop(columns=columns_to_drop, errors='ignore')
3148
+
3149
+ # 3. Ensure only expected columns remain and missing expected columns are created
3150
+ all_expected_columns = list(protected_columns) + columns_to_keep
3151
+
3152
+ # Filter to keep only expected columns that exist
3153
+ final_df_columns = [col for col in df.columns if col in all_expected_columns]
3154
+ df = df[final_df_columns].copy()
3155
+
3156
+ # Add missing expected columns with None
3157
+ columns_missing_in_df = [col for col in all_expected_columns if col not in df.columns]
3158
+ for col in columns_missing_in_df:
3159
+ df[col] = None
3160
+
3161
+ return df
3162
+
3163
+ def _log_transformation_stats(
3164
+ self,
3165
+ stats_data: List[dict],
3166
+ record,
3167
+ target_col: Optional[str],
3168
+ source_col: Optional[Union[str, List[str]]],
3169
+ status: str,
3170
+ mapping_type: str,
3171
+ default_logic: Optional[str] = None,
3172
+ source_existed: bool = True,
3173
+ df_length: int = 0
3174
+ ) -> List[dict]:
3175
+ """Logs statistics for one field mapping operation.
3176
+
3177
+ Helper method that creates a statistics dictionary and appends it to stats_data.
3178
+ Called multiple times by rename_fields() to build the statistics DataFrame returned to users.
3179
+
3180
+ Args:
3181
+ stats_data (List[dict]): List to append statistics dictionary to.
3182
+ record: Record object containing field metadata (id, logic, relation_type, etc.).
3183
+ target_col (Optional[str]): Target column name, or None if not applicable.
3184
+ source_col (Optional[Union[str, List[str]]]): Source column name(s). Can be single string,
3185
+ list of strings (pipe-separated in output), or None.
3186
+ status (str): Mapping status: 'mapped', 'source_missing', 'kept_source', 'value_mapped', or 'not_in_mapping'.
3187
+ mapping_type (str): Type of mapping: 'concat', 'fill', 'one_to_one', etc.
3188
+ default_logic (Optional[str]): Description of default logic used if no explicit logic.
3189
+ source_existed (bool): Whether source column(s) existed in DataFrame. Defaults to True.
3190
+ df_length (int): Number of rows in DataFrame (rows affected). Defaults to 0.
3191
+
3192
+ Returns:
3193
+ List[dict]: Updated stats_data list with new statistics dictionary appended.
3194
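+
+ Example (illustrative sketch; assumes `record` is a parsed Record):
+ >>> stats = self._log_transformation_stats(
+ ...     stats_data=[], record=record, target_col='gender_code',
+ ...     source_col=['gender'], status='mapped', mapping_type='one_to_one',
+ ...     df_length=3)
+ >>> stats[0]['source_column'], stats[0]['rows_affected']
+ ('gender', 3)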
+ """
3195
+ # Standardize source_col to string if it's a list/None
3196
+ if isinstance(source_col, list):
3197
+ src_str = '|'.join(source_col) if source_col else None
3198
+ else:
3199
+ src_str = source_col
3200
+
3201
+ stats_data.append({
3202
+ 'record_id': record.id,
3203
+ 'source_column': src_str,
3204
+ 'target_column': target_col,
3205
+ 'mapping_status': status,
3206
+ 'source_existed': source_existed,
3207
+ 'rows_affected': df_length,
3208
+ 'mapping_type': mapping_type,
3209
+ 'logic': record.logic,
3210
+ 'relation_type': record.relation_type,
3211
+ 'source_count': len(record.source.field_names),
3212
+ 'target_count': len(record.target.field_names),
3213
+ 'default_logic': default_logic
3214
+ })
3215
+ return stats_data
3216
+
3217
+ def _apply_value_mapping_logging(
3218
+ self,
3219
+ df: pd.DataFrame,
3220
+ record,
3221
+ stats_data: List[dict],
3222
+ created_targets: Set[str]
3223
+ ) -> None:
3224
+ """Logs statistics for records with explicit value mappings (skipping renaming).
3225
+
3226
+ This helper is applied when a record has defined value mappings (e.g., "M" -> "1").
3227
+ These transformations are complex and handled by `apply_value_mappings()`, not
3228
+ `rename_fields()`. However, `rename_fields()` still needs to log these records to provide
3229
+ a complete report of all scenario operations.
3230
+
3231
+ **Why:**
3232
+ To ensure the statistics DataFrame returned by `rename_fields()` is exhaustive and
3233
+ includes records that were skipped for renaming but will be handled elsewhere. It also
3234
+ initializes the target columns with `None` to ensure structure consistency.
3235
+
3236
+ Args:
3237
+ df (pd.DataFrame): The DataFrame being processed.
3238
+ record: The scenario record containing value mapping definitions.
3239
+ stats_data (List[dict]): List to append the statistics dictionary to.
3240
+ created_targets (Set[str]): Set to track newly created target columns.
3241
+ """
3242
+ source_field_names = record.source.field_names
3243
+ target_field_names = record.target.field_names
3244
+ for target_field in target_field_names:
3245
+ created_targets.add(target_field)
3246
+ if target_field not in df.columns:
3247
+ df[target_field] = None
3248
+ self._log_transformation_stats(
3249
+ stats_data=stats_data,
3250
+ record=record,
3251
+ target_col=target_field,
3252
+ source_col=source_field_names,
3253
+ status='value_mapped',
3254
+ mapping_type='value_mapping',
3255
+ source_existed=any(s in df.columns for s in source_field_names),
3256
+ df_length=len(df)
3257
+ )
3258
+
3259
+ # ============================================================================
3260
+ # Utility Helpers
3261
+ # ============================================================================
3262
+
3263
+ def _normalize_logic(self, logic: Optional[str]) -> str:
3264
+ """Normalizes logic string for flexible matching.
3265
+
3266
+ Converts to lowercase and removes spaces/special characters so "Concat", "CONCAT", and "concat"
3267
+ all match the same logic type. Used by rename_fields() to match user-entered logic strings.
3268
+
3269
+ Args:
3270
+ logic (Optional[str]): Original logic string (e.g., "Concat", "fill", "keep source").
3271
+
3272
+ Returns:
3273
+ str: Normalized string (e.g., "concat", "fill", "keepsource"). Empty string if None.
3274
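+
+ Example (illustrative):
+ >>> self._normalize_logic("Keep Source | parse to from date")
+ 'keepsourceparsetofromdate'
+ >>> self._normalize_logic(None)
+ ''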
+ """
3275
+ if not logic:
3276
+ return ""
3277
+ # Lowercase, remove spaces, remove special characters
3278
+ return re.sub(r'[^a-z0-9]', '', logic.lower())
3279
+
3280
+ def _normalize_value_for_mapping(self, value: str, strategy: str) -> str:
3281
+ """Normalizes a value according to the specified mapping strategy.
3282
+
3283
+ Used by apply_value_mappings() to normalize both DataFrame source values and
3284
+ mapping source values before comparison, enabling flexible matching strategies.
3285
+
3286
+ Args:
3287
+ value: The value to normalize (e.g., "John Doe", "F", "John|Doe").
3288
+ strategy: Mapping strategy name (exactValMap, ignoreCaseValMap, etc.).
3289
+
3290
+ Returns:
3291
+ Normalized value ready for comparison.
3292
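+
+ Example (illustrative):
+ >>> self._normalize_value_for_mapping('John Doe', 'flexValMap')
+ 'johndoe'
+ >>> self._normalize_value_for_mapping(' F ', 'exactValMap')
+ 'F'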
+ """
3293
+ if not value or pd.isna(value):
3294
+ return str(value) if value is not None else ""
3295
+
3296
+ value_str = str(value).strip()
3297
+
3298
+ if strategy == 'exactValMap':
3299
+ return value_str
3300
+ elif strategy == 'ignoreCaseValMap':
3301
+ return value_str.lower()
3302
+ elif strategy == 'ignoreSpecialValMap':
3303
+ # Remove special chars including spaces
3304
+ return re.sub(r'[^a-zA-Z0-9]', '', value_str)
3305
+ elif strategy == 'ignoreSpacesValMap':
3306
+ # Remove spaces only
3307
+ return value_str.replace(' ', '')
3308
+ elif strategy == 'flexValMap':
3309
+ # Lowercase + remove special chars including spaces
3310
+ return re.sub(r'[^a-z0-9]', '', value_str.lower())
3311
+ else:
3312
+ # Default to exact matching if unknown strategy
3313
+ return value_str
3314
+
3315
+ def _determine_mapping_strategy(self, record_logic: Optional[str], default_how: str) -> str:
3316
+ """Determines which mapping strategy to use for a record.
3317
+
3318
+ Checks record.logic first (higher priority), then falls back to default_how kwarg.
3319
+ Uses _normalize_logic to match strategy names flexibly. Checks if normalized logic
3320
+ contains any strategy name as a substring (to handle cases where logic contains other text).
3321
+
3322
+ Args:
3323
+ record_logic: The logic string from the record (may contain strategy name).
3324
+ default_how: Default strategy from how kwarg (e.g., 'exactValMap').
3325
+
3326
+ Returns:
3327
+ Strategy name to use (exactValMap, ignoreCaseValMap, etc.).
3328
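+
+ Example (illustrative):
+ >>> self._determine_mapping_strategy('Ignore Case Val Map', 'exactValMap')
+ 'ignoreCaseValMap'
+ >>> self._determine_mapping_strategy(None, 'exactValMap')
+ 'exactValMap'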
+ """
3329
+ if not record_logic:
3330
+ return default_how
3331
+
3332
+ normalized_logic = self._normalize_logic(record_logic)
3333
+
3334
+ # Check if normalized logic contains any mapping strategy as substring
3335
+ # Order matters: check longer/more specific names first to avoid false matches
3336
+ strategies = [
3337
+ ('ignorecasevalmap', 'ignoreCaseValMap'),
3338
+ ('ignorespecialvalmap', 'ignoreSpecialValMap'),
3339
+ ('ignorespacesvalmap', 'ignoreSpacesValMap'),
3340
+ ('flexvalmap', 'flexValMap'),
3341
+ ('exactvalmap', 'exactValMap')
3342
+ ]
3343
+
3344
+ for normalized_strategy, strategy_name in strategies:
3345
+ if normalized_strategy in normalized_logic:
3346
+ return strategy_name
3347
+
3348
+ # No match found, use default
3349
+ return default_how
3350
+
3351
+ def _concatenate_source_fields(
3352
+ self,
3353
+ df: pd.DataFrame,
3354
+ source_fields: List[str]
3355
+ ) -> pd.Series:
3356
+ """Concatenates values from multiple source columns into a single Series with '|' separator.
3357
+
3358
+ Combines the values from multiple columns (not the column names).
3359
+ Example: values from 'first_name' and 'last_name' columns → 'John|Doe'.
3360
+ Returns a Series of values; caller assigns this Series to target column name(s).
3361
+ If only one field provided, returns its values converted to string and stripped (no concatenation).
3362
+ Called by rename_fields() for 'concat' logic and many_to_one/many_to_many default behaviors.
3363
+
3364
+ Args:
3365
+ df (pd.DataFrame): DataFrame containing the source columns.
3366
+ source_fields (List[str]): List of column names whose VALUES will be concatenated (e.g., ['first_name', 'last_name']).
3367
+
3368
+ Returns:
3369
+ pd.Series: Series of concatenated VALUES (no column name). Caller assigns to target column(s).
3370
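+
+ Example (illustrative; hypothetical 'first_name'/'last_name' columns):
+ >>> df = pd.DataFrame({'first_name': ['John '], 'last_name': ['Doe']})
+ >>> self._concatenate_source_fields(df=df, source_fields=['first_name', 'last_name']).tolist()
+ ['John|Doe']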
+ """
3371
+ if len(source_fields) == 1:
3372
+ return df[source_fields[0]].astype(str).str.strip()
3373
+ else:
3374
+ return df[source_fields].astype(str).apply(
3375
+ lambda row: '|'.join(val.strip() for val in row), axis=1
3376
+ )
3377
+
3378
+ def _apply_mapping_to_target(
3379
+ self,
3380
+ df: pd.DataFrame,
3381
+ concatenated_source_series: pd.Series,
3382
+ target_field: str,
3383
+ replacements: dict,
3384
+ default_val: Optional[str] = None,
3385
+ original_source_series: Optional[pd.Series] = None
3386
+ ) -> pd.DataFrame:
3387
+ """Applies value mappings to create/populate a target column.
3388
+
3389
+ Transforms source values → target values using lookup dictionary via pandas .map().
3390
+ Unmapped values use default_val if provided, otherwise keep original source value.
3391
+ Always creates target column (uses default if no mappings exist).
3392
+ Called by apply_value_mappings() for each target field in records with value mappings.
3393
+
3394
+ Args:
3395
+ df: DataFrame to modify. Target column added/updated in-place.
3396
+ concatenated_source_series: Source values formatted for lookup (may be normalized for flexible matching).
3397
+ target_field: Name of target column to create/populate.
3398
+ replacements: Mapping dict {normalized_source_value: target_value} (e.g., {"f": "1", "m": "0"}).
3399
+ default_val: Default for unmapped values. If None, keeps original source value.
3400
+ original_source_series: Original (non-normalized) source series for fillna when default_val is None.
3401
+
3402
+ Returns:
3403
+ Modified DataFrame with target column added/updated.
3404
+
3405
+ Example 1: Single source field mapping.
3406
+ >>> # Input DataFrame with source column
3407
+ >>> df = pd.DataFrame({'id': [1, 2, 3], 'gender': ['F', 'M', 'F']})
3408
+ >>> # Create Series from source column (single field)
3409
+ >>> concatenated_source_series = df['gender'].astype(str).str.strip()
3410
+ >>> # Define mapping rules and target column name
3411
+ >>> replacements = {'F': '1', 'M': '0'}
3412
+ >>> target_field = 'gender_code'
3413
+ >>> default_val = None
3414
+ >>> # Apply mapping: Series values lookup in dict keys → return dict values
3415
+ >>> df = self._apply_mapping_to_target(df, concatenated_source_series, target_field, replacements, default_val)
3416
+ >>> df
3417
+ id gender gender_code
3418
+ 0 1 F 1
3419
+ 1 2 M 0
3420
+ 2 3 F 1
3421
+
3422
+ Example 2: Concatenated source fields (many_to_one mapping).
3423
+ Scenario mapping: 'first_name'|'last_name' → 'full_name_code'
3424
+ >>> # Input DataFrame with multiple source columns
3425
+ >>> df = pd.DataFrame({'id': [1, 2], 'first_name': ['John', 'Jane'], 'last_name': ['Doe', 'Smith']})
3426
+ >>> # Create Series from multiple source columns (concatenated with '|')
3427
+ >>> concatenated_source_series = df[['first_name', 'last_name']].astype(str).apply(
3428
+ ... lambda row: '|'.join(val.strip() for val in row), axis=1)
3429
+ >>> # Define mapping rules (keys match concatenated format) and target column name
3430
+ >>> replacements = {'John|Doe': 'JD001', 'Jane|Smith': 'JS002'}
3431
+ >>> target_field = 'full_name_code'
3432
+ >>> default_val = None
3433
+ >>> # Apply mapping: "John|Doe" → "JD001", "Jane|Smith" → "JS002"
3434
+ >>> df = self._apply_mapping_to_target(df, concatenated_source_series, target_field, replacements, default_val)
3435
+ >>> df
3436
+ id first_name last_name full_name_code
3437
+ 0 1 John Doe JD001
3438
+ 1 2 Jane Smith JS002
3439
+ """
3440
+ if not replacements:
3441
+ df[target_field] = default_val if default_val else None
3442
+ return df
3443
+
3444
+ mapped_series = concatenated_source_series.map(replacements)
3445
+
3446
+ if default_val:
3447
+ mapped_series = mapped_series.fillna(default_val)
3448
+ else:
3449
+ # Use original source series for fillna to preserve original values (not normalized)
3450
+ fill_series = original_source_series if original_source_series is not None else concatenated_source_series
3451
+ mapped_series = mapped_series.fillna(fill_series)
3452
+
3453
+ df[target_field] = mapped_series
3454
+ return df
3455
+
3456
+ def _detect_missing_values_in_fields(
3457
+ self,
3458
+ df: pd.DataFrame,
3459
+ source_field_names: List[str]
3460
+ ) -> Dict[str, int]:
3461
+ """Detects missing values in source fields used for value mapping.
3462
+
3463
+ Called by apply_value_mappings() before processing to warn users about missing values
3464
+ that may affect mapping accuracy. Missing values can cause mappings to fail silently
3465
+ or produce unexpected results, so early detection helps users identify data quality issues.
3466
+
3467
+ Args:
3468
+ df: Input DataFrame to check.
3469
+ source_field_names: List of source field names to check for missing values.
3470
+
3471
+ Returns:
3472
+ Dictionary mapping field names to counts of missing values found.
3473
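+
+ Example (illustrative sketch; assumes the string 'nan' is listed in
+ self.MISSING_VALUES):
+ >>> df = pd.DataFrame({'gender': ['F', None, 'nan']})
+ >>> counts = self._detect_missing_values_in_fields(df=df, source_field_names=['gender'])
+ >>> int(counts['gender'])
+ 2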
+ """
3474
+ missing_counts = {}
3475
+ missing_value_patterns = self.MISSING_VALUES
3476
+
3477
+ for field_name in source_field_names:
3478
+ if field_name not in df.columns:
3479
+ continue
3480
+
3481
+ series = df[field_name]
3482
+
3483
+ # Count pd.NA and numpy NaN (true missing values)
3484
+ missing_count = series.isna().sum()
3485
+
3486
+ # Count string representations that indicate missing data (e.g., 'nan', 'None', 'null')
3487
+ # These are checked separately because they're not detected by isna() but still
3488
+ # represent missing/invalid data that should be handled before mapping
3489
+ for pattern in missing_value_patterns:
3490
+ missing_count += (series == pattern).sum()
3491
+
3492
+ if missing_count > 0:
3493
+ missing_counts[field_name] = missing_count
3494
+
3495
+ return missing_counts