pyxecm 1.5__py3-none-any.whl → 1.6__py3-none-any.whl

pyxecm/helper/assoc.py CHANGED
@@ -73,11 +73,31 @@ class Assoc:
 
     @classmethod
     def is_html_escaped(cls, assoc_string: str) -> bool:
+        """Class method to check if an Extended ECM Assoc string
+        is HTML escaped.
+
+        Args:
+            assoc_string (str): the string to test for HTML escaping
+
+        Returns:
+            bool: True if the string is HTML escaped, False if not
+        """
+
        decoded_string = html.unescape(assoc_string)
+
        return assoc_string != decoded_string
 
     @classmethod
     def unescape_html(cls, assoc_string: str) -> str:
+        """HTML unescape a string.
+
+        Args:
+            assoc_string (str): the string to unescape.
+
+        Returns:
+            str: unescaped string
+        """
+
        decoded_string = html.unescape(assoc_string)
        return decoded_string
 
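For orientation, a hedged usage sketch of the two helpers (the import path follows the file header above; the sample strings are illustrative):

    from pyxecm.helper.assoc import Assoc

    Assoc.is_html_escaped("&lt;KUAF&gt;")  # True:  unescaping yields "<KUAF>"
    Assoc.is_html_escaped("<KUAF>")        # False: unescaping changes nothing
    Assoc.unescape_html("&lt;KUAF&gt;")    # "<KUAF>"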
pyxecm/helper/data.py CHANGED
@@ -121,6 +121,20 @@ class Data:
 
     # end method definition
 
+    def __getitem__(self, column: str) -> pd.Series:
+        """Return the column corresponding to the key from the DataFrame.
+
+        Args:
+            column (str): name of the Data Frame column
+
+        Returns:
+            pd.Series: column of the Data Frame with the given name
+        """
+
+        return self._df[column]
+
+    # end method definition
+
     def lock(self):
         """Return the threading lock object.
 
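A hedged sketch of what this enables: a Data object can now be indexed like the underlying DataFrame (the construction of the Data instance and the column name are illustrative, not part of the diff):

    data = Data(...)        # hypothetical construction of a Data wrapper
    series = data["name"]   # calls __getitem__ and returns the "name" column as a pd.Series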
@@ -317,11 +331,13 @@ class Data:
             )
         except FileNotFoundError:
             logger.error(
-                "File -> %s not found. Please check the file path.", json_path
+                "JSON file -> %s not found. Please check the file path.", json_path
             )
             return False
         except PermissionError:
-            logger.error("Permission denied to access the file -> %s.", json_path)
+            logger.error(
+                "Permission denied to access the JSON file -> %s.", json_path
+            )
             return False
         except IOError as e:
             logger.error("An I/O error occurred -> %s", str(e))
@@ -333,10 +349,10 @@ class Data:
             logger.error("Invalid JSON input -> %s", str(e))
             return False
         except AttributeError as e:
-            logger.error("Unexpected data structure -> %s", str(e))
+            logger.error("Unexpected JSON data structure -> %s", str(e))
             return False
         except TypeError as e:
-            logger.error("Unexpected data type -> %s", str(e))
+            logger.error("Unexpected JSON data type -> %s", str(e))
             return False
         except KeyError as e:
             logger.error("Missing key in JSON data -> %s", str(e))
@@ -475,17 +491,26 @@ class Data:
                 self._df = pd.concat([self._df, df], ignore_index=True)
         except FileNotFoundError:
             logger.error(
-                "File -> '%s' not found. Please check the file path.", xlsx_path
+                "Excel file -> '%s' not found. Please check the file path.",
+                xlsx_path,
             )
             return False
         except PermissionError:
-            logger.error("Permission denied to access the file -> '%s'.", xlsx_path)
+            logger.error(
+                "Permission denied to access the Excel file -> '%s'.", xlsx_path
+            )
             return False
         except IOError as e:
-            logger.error("An I/O error occurred -> %s", str(e))
+            logger.error(
+                "An I/O error occurred -> %s while reading the Excel file -> %s",
+                str(e),
+                xlsx_path,
+            )
             return False
         except ValueError as e:
-            logger.error("Invalid Excel input -> %s", str(e))
+            logger.error(
+                "Invalid Excel input -> %s in Excel file -> %s", str(e), xlsx_path
+            )
             return False
         except AttributeError as e:
             logger.error("Unexpected data structure -> %s", str(e))
@@ -554,11 +579,15 @@ class Data:
 
     # end method definition
 
-    def load_csv_data(self, csv_path: str) -> bool:
+    def load_csv_data(
+        self, csv_path: str, delimiter: str = ",", encoding: str = "utf-8"
+    ) -> bool:
         """Load CSV (Comma separated values) data into DataFrame
 
         Args:
             csv_path (str): Path to the CSV file.
+            delimiter (str, optional, length = 1): character to delimit values. Default = "," (comma)
+            encoding (str, optional): encoding of the file. Default = "utf-8".
         Returns:
             bool: False in case an error occured, True otherwise.
         """
@@ -566,18 +595,22 @@ class Data:
         if csv_path is not None and os.path.exists(csv_path):
             # Load data from CSV file
             try:
-                df = pd.read_csv(csv_path)
+                df = pd.read_csv(
+                    filepath_or_buffer=csv_path, delimiter=delimiter, encoding=encoding
+                )
                 if self._df is None:
                     self._df = df
                 else:
                     self._df = pd.concat([self._df, df])
             except FileNotFoundError:
                 logger.error(
-                    "File -> '%s' not found. Please check the file path.", csv_path
+                    "CSV file -> '%s' not found. Please check the file path.", csv_path
                 )
                 return False
             except PermissionError:
-                logger.error("Permission denied to access the file -> %s.", csv_path)
+                logger.error(
+                    "Permission denied to access the CSV file -> %s.", csv_path
+                )
                 return False
             except IOError as e:
                 logger.error("An I/O error occurred -> %s", str(e))
@@ -819,7 +852,7 @@ class Data:
             column_name (str): The column name to partition by
 
         Returns:
-            list: List of partitions
+            list | None: List of partitions or None in case of an error (e.g. column name does not exist).
         """
 
         if column_name not in self._df.columns:
@@ -958,6 +991,7 @@ class Data:
         make_unique: bool = False,
         reset_index: bool = False,
         split_string_to_list: bool = False,
+        separator: str = ";,",
     ) -> pd.DataFrame:
         """Explode a substructure in the Data Frame
 
@@ -969,7 +1003,9 @@ class Data:
             flatten_fields (list): Fields in the exploded substructure to include
                 in the main dictionaries for easier processing.
             make_unique (bool, optional): if True deduplicate the exploded data frame.
-            flatten (bool, optional): if True flatten the exploded data frame.
+            reset_index (bool, optional): True = index is reset, False = index is not reset.
+            split_string_to_list (bool, optional): if True split delimiter-separated string values of the explode column into lists.
+            separator (str, optional): characters used to split the string values in the given column into a list
         Returns:
             pd.DataFrame: Pointer to the Pandas DataFrame
         """
@@ -983,10 +1019,16 @@ class Data:
 
         # Define a function to split a string into a list
         def string_to_list(string: str | None) -> list:
-            if not string or pd.isna(string):
-                return []
-            # Use regular expression to split by comma, semicolon, or comma followed by space
-            return re.split(r"[;,]\s*", str(string))
+            # Do nothing if the string is already a list
+            if isinstance(string, list):
+                return_list = string
+            elif not string or pd.isna(string):
+                return_list = []
+            else:
+                # Use a regular expression to split by any separator character, optionally followed by whitespace
+                return_list = re.split(rf"[{separator}]\s*", str(string))
+
+            return return_list
 
         if isinstance(explode_field, list):
             logger.info("Explode multiple columns -> %s", str(explode_field))
@@ -998,18 +1040,27 @@ class Data:
             )
             return self._df
 
-        if split_string_to_list:
-            # Apply the function to convert the 'string_column' values to lists
-            self._df[explode_field] = self._df[explode_field].apply(string_to_list)
-
         try:
             # remove the sub dictionary that sometimes is introduced by
-            # XML loading
+            # XML loading. We just want the main part.
             if "." in explode_field:
                 main = explode_field.split(".")[0]
                 sub = explode_field.split(".")[1]
                 self._df[main] = self._df[main].apply(update_column)
                 explode_field = main
+
+            # Now that we have the right explode column
+            # we need to convert it to a list if it is inside a string (with delimiters)
+            if split_string_to_list:
+                logger.info(
+                    "Split the string values of column -> '%s' into a list using separator -> '%s'",
+                    explode_field,
+                    separator,
+                )
+                # Apply the function to convert the string values in the column (given by the name in explode_field) to lists.
+                # The string_to_list() sub-method above also considers the separator parameter.
+                self._df[explode_field] = self._df[explode_field].apply(string_to_list)
+
             # Explode the field that has list values
             self._df = self._df.explode(column=explode_field)
         except KeyError:
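A hedged sketch of what the new separator parameter does to a cell value, using only the re module as the code above does (sample values illustrative). Each character of separator becomes an alternative split character because the string is interpolated into a regex character class:

    import re

    separator = ";,"  # the default: split on semicolons and commas
    re.split(rf"[{separator}]\s*", "red, green;blue")  # ['red', 'green', 'blue']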
@@ -1165,19 +1216,29 @@ class Data:
             if cleansing.get("lower", False) and self._df[column].dtype == "object":
                 self._df[column] = self._df[column].str.lower()
 
-            # Handle regular columns
+            # Handle regular columns. regex_pattern is the string on the left side
+            # of the colon, and replacement the string on the right side of
+            # the colon:
             for regex_pattern, replacement in cleansing.get(
                 "replacements", {}
             ).items():
-                # if replacement:
+                if not regex_pattern:
+                    logger.error("Empty search / regexp pattern!")
+                    continue
                 # \b is a word boundary anchor in regular expressions.
                 # It matches a position where one side is a word character
                 # (like a letter or digit) and the other side is a non-word character
-                # (like whitespace or punctuation). It's often used to match whole words.
-                # regex_pattern = rf"\b{regex_pattern}\b"
-                # self._df[column] = self._df[column].replace(
-                #     regex=regex_pattern, value=replacement
-                # )
+                # (like whitespace or punctuation). It's used to match whole words.
+                # We want this so that e.g. "INT" is not replaced with "INTERNATIONAL"
+                # when the word is already "INTERNATIONAL". It is important
+                # that the \b ... \b enclosure is ONLY used if regex_pattern is NOT
+                # a regular expression but just a normal string.
+                # Check if the pattern does NOT contain any regex special characters
+                # (excluding dot and ampersand) and ONLY then use \b ... \b
+                # Special regexp characters include: ^ $ * + ? ( ) [ ] { } | \
+                if not re.search(r"[\\^$*+?()|[\]{}]", regex_pattern):
+                    # Wrap with word boundaries for whole-word matching
+                    regex_pattern = rf"\b{regex_pattern}\b"
                 self._df[column] = self._df[column].str.replace(
                     pat=regex_pattern, repl=replacement, regex=True
                 )
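A hedged, standalone sketch of this word-boundary guard (the replacement mapping is an illustrative cleansing payload, not taken from the package):

    import re

    replacements = {"INT": "INTERNATIONAL"}  # illustrative 'replacements' payload
    value = "INT GmbH"
    for regex_pattern, replacement in replacements.items():
        # Wrap plain strings (no regex metacharacters) in word boundaries
        if not re.search(r"[\\^$*+?()|[\]{}]", regex_pattern):
            regex_pattern = rf"\b{regex_pattern}\b"
        value = re.sub(regex_pattern, replacement, value)
    # value == 'INTERNATIONAL GmbH'; an input that is already 'INTERNATIONAL' stays unchanged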
@@ -1294,11 +1355,11 @@ class Data:
         for condition in conditions:
             field = condition.get("field", None)
             if not field:
-                logger.error("Missing value for filter condition field in payload!")
+                logger.error("Missing value for filter condition 'field' in payload!")
                 continue
             if field not in self._df.columns:
                 logger.warning(
-                    "Filter condition field -> %s does not exist as column in data frame! Data frame has these columns -> %s",
+                    "Filter condition field -> '%s' does not exist as column in data frame! Data frame has these columns -> %s",
                     field,
                     str(self._df.columns),
                 )
@@ -1433,6 +1494,9 @@ class Data:
         Returns:
             bool: True if lookup_value is equal to one of the delimiter-separated terms
         """
+        # Ensure the value is actually a string (the cell may contain non-string data):
+        string_list = str(string_list)
+
         return lookup_value in [
             item.strip() for item in string_list.split(separator)
         ]
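A hedged sketch of why the added str() guard matters (values illustrative): DataFrame cells are not guaranteed to hold strings, and a numeric cell would otherwise make .split() fail with an AttributeError.

    value = 4711          # e.g. a numeric cell coming from the data frame
    value = str(value)    # the added guard
    "4711" in [item.strip() for item in value.split(";")]  # True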
@@ -1484,8 +1548,9 @@ class Data:
             new_column (str): name of the column to add
             prefix (str, optional): Prefix to add in front of the value. Defaults to "".
             suffix (str, optional): Suffix to add at the end of the value. Defaults to "".
-            length (int | None, optional): Length to reduce to. Defaults to None.
+            length (int | None, optional): Length to reduce to. Defaults to None (= unlimited).
             group_chars (int | None, optional): group the resulting string in characters of group_chars. Defaults to None.
+                Usable e.g. for thousands separator ".".
             group_separator (str, optional): Separator string for the grouping. Defaults to ".".
             group_remove_leading_zero (bool, optional): Remove leading zeros from the groups. Defaults to True.
 
@@ -1497,6 +1562,7 @@ class Data:
             return False
 
         # Use str.extract to apply the regular expression to the source column
+        # and then assign this modified column to the variable extracted:
         extracted = self._df[source_column].str.extract(pat=reg_exp, expand=False)
 
         # Limit the result to the specified length
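For reference, a hedged, standalone sketch of the pandas call used here (sample data illustrative): with expand=False and a single capture group, str.extract returns the matched group as a Series.

    import pandas as pd

    s = pd.Series(["ID-4711", "ID-4712"])
    s.str.extract(pat=r"ID-(\d+)", expand=False)  # 0: '4711', 1: '4712'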
@@ -1525,3 +1591,141 @@ class Data:
         self._df[new_column] = extracted
 
         return True
+
+    # end method definition
+
+    def convert_to_lists(self, columns: list, delimiter: str = ","):
+        """Method to intelligently convert strings to lists, with a configurable delimiter,
+        ignoring delimiters inside quotes.
+
+        Args:
+            columns (list): name of the columns whose values should be converted to lists.
+                It is expected that
+            delimiter (str, optional): Character that delimits list items. Defaults to ",".
+
+        Returns:
+            None. self._df is modified in place.
+        """
+
+        # Regex to split by the delimiter, ignoring delimiters inside single or double quotes
+        def split_string_ignoring_quotes(s, delimiter):
+            # Escaping the delimiter in case it's a special regex character
+            delimiter = re.escape(delimiter)
+            # Match quoted strings and unquoted delimiters separately
+            pattern = rf'(?:"[^"]*"|\'[^\']*\'|[^{delimiter}]+)'
+            return re.findall(pattern, s)
+
+        for col in columns:
+            self._df[col] = self._df[col].apply(
+                lambda x: (
+                    split_string_ignoring_quotes(x, delimiter)
+                    if isinstance(x, str) and delimiter in x
+                    else x
+                )
+            )
+
+    # end method definition
+
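A hedged sketch of the quote-aware splitting regex used above (input string illustrative). Note that matched tokens keep their surrounding quotes:

    import re

    delimiter = re.escape(",")
    pattern = rf'(?:"[^"]*"|\'[^\']*\'|[^{delimiter}]+)'
    re.findall(pattern, 'a,"b,c",d')  # ['a', '"b,c"', 'd']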
+    def add_column_list(self, source_columns: list, new_column: str):
+        """Add a column with list objects. The list items are taken from a list of
+        source columns (row by row).
+
+        Args:
+            source_columns (list): column names the list values are taken from
+            new_column (str): name of the new column
+
+        Returns:
+            None. self._df is modified in place.
+        """
+
+        def create_list(row):
+            return [row[col] for col in source_columns]
+
+        self._df[new_column] = self._df.apply(create_list, axis=1)
+
+    # end method definition
+
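A hedged, standalone pandas sketch mirroring what add_column_list does (column names and values illustrative):

    import pandas as pd

    df = pd.DataFrame({"phone": ["123"], "mobile": ["456"]})
    df["numbers"] = df.apply(lambda row: [row[col] for col in ["phone", "mobile"]], axis=1)
    # df["numbers"][0] == ['123', '456']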
+    def add_column_table(
+        self, source_columns: list, new_column: str, delimiter: str = ","
+    ):
+        """Add a column with tabular objects (list of dictionaries). The
+        source columns should include lists. The resulting dictionary
+        keys are the column names for the source columns.
+
+        Example:
+            X[1] = 1, 2, 3
+            Y[1] = A, B, C
+            X[2] = 4, 5, 6
+            Y[2] = D, E, F
+
+            Table[1] = [
+                {"X": "1", "Y": "A"},
+                {"X": "2", "Y": "B"},
+                {"X": "3", "Y": "C"},
+            ]
+            Table[2] = [
+                {"X": "4", "Y": "D"},
+                {"X": "5", "Y": "E"},
+                {"X": "6", "Y": "F"},
+            ]
+
+        Args:
+            source_columns (list): column names the list values are taken from
+            new_column (str): name of the new column
+            delimiter (str, optional): Character that delimits list items. Defaults to ",".
+
+        Returns:
+            None. self._df is modified in place.
+        """
+
+        # Call the convert_to_lists method to ensure the columns are converted
+        self.convert_to_lists(columns=source_columns, delimiter=delimiter)
+
+        # Sub-method to pad lists to the same length
+        def pad_list(lst: list, max_len: int):
+            return lst + [None] * (max_len - len(lst))
+
+        def create_table(row) -> list:
+            max_len = max(
+                len(row[col]) if isinstance(row[col], list) else 1
+                for col in source_columns
+            )
+
+            # Pad lists to the maximum length, leave scalars as they are
+            for col in source_columns:
+                if isinstance(row[col], list):
+                    row[col] = pad_list(row[col], max_len)
+                else:
+                    if not pd.isna(row[col]):
+                        row[col] = [
+                            row[col]
+                        ] * max_len  # Repeat scalar to match the max length
+                    else:
+                        row[col] = [None] * max_len
+            # Create a list of dictionaries for each row
+            table = []
+            for i in range(max_len):
+                table.append({col: row[col][i] for col in source_columns})
+            return table
+
+        # Apply the function to create a new column with a table
+        self._df[new_column] = self._df.apply(create_table, axis=1)
+
+    # end method definition
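A hedged usage sketch following the docstring example above (assuming a Data instance wrapping a DataFrame; the cell values are written without spaces so that the default "," delimiter splits cleanly):

    # given a row with X = "1,2,3" and Y = "A,B,C":
    data.add_column_table(source_columns=["X", "Y"], new_column="Table")
    # the new "Table" cell holds:
    # [{'X': '1', 'Y': 'A'}, {'X': '2', 'Y': 'B'}, {'X': '3', 'Y': 'C'}]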
pyxecm/helper/xml.py CHANGED
@@ -26,11 +26,11 @@ import logging
 import os
 import re
 import fnmatch
+import zipfile
 
 # we need lxml instead of stadard xml.etree to have xpath capabilities!
 from lxml import etree
 import xmltodict
-import zipfile
 
 # import xml.etree.ElementTree as etree
 from pyxecm.helper.assoc import Assoc