onekit 1.2.0__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: onekit
3
- Version: 1.2.0
3
+ Version: 1.3.0
4
4
  Summary: All-in-One Python Kit.
5
5
  Home-page: https://github.com/estripling/onekit
6
6
  License: BSD 3-Clause
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "onekit"
3
- version = "1.2.0"
3
+ version = "1.3.0"
4
4
  description = "All-in-One Python Kit."
5
5
  authors = ["Eugen Stripling <estripling042@gmail.com>"]
6
6
  license = "BSD 3-Clause"
@@ -82,7 +82,8 @@ def collatz(n: int, /) -> Generator:
82
82
  n = n // 2 if iseven(n) else 3 * n + 1
83
83
 
84
84
 
85
- def digitscale(x: Union[int, float], /) -> float:
85
+ @toolz.curry
86
+ def digitscale(x: Union[int, float], /, *, kind: str = "log") -> Union[int, float]:
86
87
  """Scale :math:`x` such that its mapped integer part is its number of digits.
87
88
 
88
89
  Given a number :math:`x \\in \\mathbb{R}`, the following function
@@ -102,8 +103,24 @@ def digitscale(x: Union[int, float], /) -> float:
102
103
  -----
103
104
  - :math:`\\lfloor \\cdot \\rfloor`: floor function
104
105
  - :math:`\\left[ \\, \\cdot \\, \\right]`: truncation function
105
- - For any positive integer :math:`n`, the number of digits in :math:`n` is
106
- :math:`1 + \\lfloor \\log_{10} n \\rfloor`
106
+ - For any positive integer :math:`k`, the number of digits in :math:`k` is
107
+ :math:`1 + \\lfloor \\log_{10} k \\rfloor`
108
+ - If `kind="int"`, returns :math:`\\lfloor f(x) \\rfloor`
109
+ - If `kind="linear"`, linear interpolation is performed:
110
+
111
+ .. math::
112
+
113
+ f_{linear}(x) =
114
+ \\begin{cases}
115
+ \\frac{y_{0} (x_{1} - x) + y_{1} (x - x_{0})}{x_{1} - x_{0}}
116
+ & \\text{ if } |x| \\ge 0.1 \\\\[6pt]
117
+ 0 & \\text{ otherwise }
118
+ \\end{cases}
119
+
120
+ \\\\[6pt]
121
+
122
+ \\text{ with } n = \\lfloor f(x) \\rfloor, y_{0} = n, y_{1} = n + 1,
123
+ x_{0} = 10^{n - 1}, \\text{ and } x_{1} = 10^{n}
107
124
 
108
125
  See Also
109
126
  --------
@@ -121,8 +138,37 @@ def digitscale(x: Union[int, float], /) -> float:
121
138
 
122
139
  >>> list(map(mk.digitscale, [-0.5, -5, -50, -500]))
123
140
  [0.6989700043360187, 1.6989700043360187, 2.6989700043360187, 3.6989700043360187]
141
+
142
+ >>> # function is curried
143
+ >>> list(map(mk.digitscale(kind="int"), [-0.5, -5, -50, -500]))
144
+ [0, 1, 2, 3]
145
+
146
+ >>> list(map(mk.digitscale(kind="linear"), [0.1, 1, 10, 100, 1_000, 10_000]))
147
+ [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
148
+ >>> list(map(mk.digitscale(kind="linear"), [0.2, 2, 20, 200]))
149
+ [0.11111111111111112, 1.1111111111111112, 2.111111111111111, 3.111111111111111]
150
+ >>> list(map(mk.digitscale(kind="linear"), [-0.5, -5, -50, -500]))
151
+ [0.4444444444444445, 1.4444444444444444, 2.4444444444444446, 3.4444444444444446]
124
152
  """
125
- return 1 + math.log10(abs(x)) if abs(x) >= 0.1 else 0.0
153
+ valid_kind = ["log", "int", "linear"]
154
+
155
+ x = abs(x)
156
+ fx = 1 + math.log10(x) if x >= 0.1 else 0.0
157
+
158
+ if kind == "log":
159
+ return fx
160
+
161
+ elif kind == "int":
162
+ return math.floor(fx)
163
+
164
+ elif kind == "linear":
165
+ n = math.floor(fx)
166
+ y0, y1 = n, n + 1
167
+ x0, x1 = 10 ** (n - 1), 10**n
168
+ return (y0 * (x1 - x) + y1 * (x - x0)) / (x1 - x0) if x >= 0.1 else 0.0
169
+
170
+ else:
171
+ raise ValueError(f"{kind=} - must be a valid value: {valid_kind}")
126
172
 
127
173
 
128
174
  def fibonacci() -> Generator:
@@ -52,7 +52,7 @@ def check_vector(x: ArrayLike, /, *, n_min: int = 1, n_max: int = np.inf) -> Vec
52
52
  return x
53
53
 
54
54
 
55
- def digitscale(x: ArrayLike, /) -> np.ndarray:
55
+ def digitscale(x: ArrayLike, /, *, kind: str = "log") -> np.ndarray:
56
56
  """NumPy version of digitscale.
57
57
 
58
58
  See Also
@@ -63,10 +63,17 @@ def digitscale(x: ArrayLike, /) -> np.ndarray:
63
63
  Examples
64
64
  --------
65
65
  >>> import onekit.numpykit as npk
66
- >>> npk.digitscale([0.1, 1, 10, 100, 1_000, 10_000, 100_000, 1_000_000])
67
- array([0., 1., 2., 3., 4., 5., 6., 7.])
66
+ >>> npk.digitscale([0.1, 1, 10, 100, 1_000, 10_000, 2_000_000])
67
+ array([0. , 1. , 2. , 3. , 4. , 5. , 7.30103])
68
+
69
+ >>> npk.digitscale([0.1, 1, 10, 100, 1_000, 10_000, 100_000, 2_000_000], kind="int")
70
+ array([0, 1, 2, 3, 4, 5, 6, 7])
71
+
72
+ >>> npk.digitscale([0.2, 2, 20], kind="linear")
73
+ array([0.11111111, 1.11111111, 2.11111111])
68
74
  """
69
- return np.vectorize(mk.digitscale, otypes=[float])(x)
75
+ otypes = [int] if kind == "int" else [float]
76
+ return np.vectorize(mk.digitscale(kind=kind), otypes=otypes)(x)
70
77
 
71
78
 
72
79
  def stderr(x: ArrayLike, /) -> float:
@@ -1,6 +1,5 @@
1
1
  import calendar
2
2
  import datetime as dt
3
- import distutils
4
3
  import functools
5
4
  import inspect
6
5
  import itertools
@@ -37,7 +36,6 @@ __all__ = (
37
36
  "coinflip",
38
37
  "concat_strings",
39
38
  "contrast_sets",
40
- "create_path",
41
39
  "date_ago",
42
40
  "date_ahead",
43
41
  "date_count_backward",
@@ -390,21 +388,6 @@ def contrast_sets(x: set, y: set, /, *, n: int = 3) -> dict:
390
388
  return output
391
389
 
392
390
 
393
- def create_path(*strings: str) -> str:
394
- """Create path by concatenating strings.
395
-
396
- Examples
397
- --------
398
- >>> import onekit.pythonkit as pk
399
- >>> pk.create_path("path", "to", "file")
400
- 'path/to/file'
401
-
402
- >>> pk.create_path(["hdfs://", "path", "to", "file"])
403
- 'hdfs://path/to/file'
404
- """
405
- return functools.reduce(os.path.join, flatten(strings))
406
-
407
-
408
391
  @toolz.curry
409
392
  def date_ago(d0: dt.date, /, n: int) -> dt.date:
410
393
  """Compute date that is :math:`n \\in \\mathbb{N}_{0}` days ago.
@@ -683,13 +666,13 @@ def highlight_string_differences(lft_str: str, rgt_str: str, /) -> str:
683
666
  Examples
684
667
  --------
685
668
  >>> import onekit.pythonkit as pk
686
- >>> print(pk.highlight_string_differences("hello", "hall"))
669
+ >>> print(pk.highlight_string_differences("hello", "hall")) # doctest: +SKIP
687
670
  hello
688
671
  | |
689
672
  hall
690
673
 
691
674
  >>> # no differences when there is no '|' character
692
- >>> print(pk.highlight_string_differences("hello", "hello"))
675
+ >>> print(pk.highlight_string_differences("hello", "hello")) # doctest: +SKIP
693
676
  hello
694
677
  <BLANKLINE>
695
678
  hello
@@ -699,7 +682,7 @@ def highlight_string_differences(lft_str: str, rgt_str: str, /) -> str:
699
682
  lft_str,
700
683
  concat_strings(
701
684
  "",
702
- (
685
+ *(
703
686
  " " if x == y else "|"
704
687
  for x, y in itertools.zip_longest(lft_str, rgt_str, fillvalue="")
705
688
  ),
@@ -936,11 +919,31 @@ def prompt_yes_no(question: str, /, *, default: Optional[str] = None) -> bool:
936
919
 
937
920
  answer = input(f"{question} {prompt} ").lower()
938
921
 
922
+ def strtobool(value: str) -> bool:
923
+ """Convert a string representation of truth to true (1) or false (0).
924
+
925
+ True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values
926
+ are 'n', 'no', 'f', 'false', 'off', and '0'. Raises ValueError if
927
+ 'val' is anything else.
928
+
929
+ Notes
930
+ -----
931
+ - Shamelessly copied and modified from: distutils.util.strtobool
932
+ - distutils is not available with Python>=3.12
933
+ """
934
+ value = value.lower()
935
+ if value in ("y", "yes", "t", "true", "on", "1"):
936
+ return True
937
+ elif value in ("n", "no", "f", "false", "off", "0"):
938
+ return False
939
+ else:
940
+ raise ValueError("invalid truth value {!r}".format(value))
941
+
939
942
  while True:
940
943
  try:
941
944
  if answer == "" and default in ["yes", "no"]:
942
- return bool(distutils.util.strtobool(default))
943
- return bool(distutils.util.strtobool(answer))
945
+ return bool(strtobool(default))
946
+ return bool(strtobool(answer))
944
947
 
945
948
  except ValueError:
946
949
  response_text = "{} Please respond with 'yes' [{}] or 'no' [{}] ".format(
@@ -34,6 +34,7 @@ __all__ = (
34
34
  "assert_row_equal",
35
35
  "assert_schema_equal",
36
36
  "bool_to_int",
37
+ "bool_to_str",
37
38
  "check_column_present",
38
39
  "count_nulls",
39
40
  "cvf",
@@ -46,6 +47,7 @@ __all__ = (
46
47
  "is_schema_equal",
47
48
  "join",
48
49
  "peek",
50
+ "select_col_types",
49
51
  "str_to_col",
50
52
  "union",
51
53
  "with_date_diff_ago",
@@ -475,12 +477,53 @@ def bool_to_int(df: SparkDF, /, *, subset=None) -> SparkDF:
475
477
  <BLANKLINE>
476
478
  """
477
479
  cols = subset or df.columns
478
- bool_cols = [c for c in cols if isinstance(df.schema[c].dataType, T.BooleanType)]
480
+ bool_cols = [c for c in select_col_types(df, T.BooleanType) if c in cols]
479
481
  for bool_col in bool_cols:
480
482
  df = df.withColumn(bool_col, F.col(bool_col).cast(T.IntegerType()))
481
483
  return df
482
484
 
483
485
 
486
+ @toolz.curry
487
+ def bool_to_str(df: SparkDF, /, *, subset=None) -> SparkDF:
488
+ """Cast values of Boolean columns to string values.
489
+
490
+ Examples
491
+ --------
492
+ >>> from pyspark.sql import SparkSession
493
+ >>> import onekit.sparkkit as sk
494
+ >>> spark = SparkSession.builder.getOrCreate()
495
+ >>> df = spark.createDataFrame(
496
+ ... [
497
+ ... dict(x=True, y=False, z=None),
498
+ ... dict(x=False, y=None, z=True),
499
+ ... dict(x=True, y=None, z=None),
500
+ ... ]
501
+ ... )
502
+ >>> sk.bool_to_str(df).show()
503
+ +-----+-----+----+
504
+ | x| y| z|
505
+ +-----+-----+----+
506
+ | true|false|null|
507
+ |false| null|true|
508
+ | true| null|null|
509
+ +-----+-----+----+
510
+ <BLANKLINE>
511
+
512
+ >>> # function is curried
513
+ >>> df.transform(sk.bool_to_str(subset=["y", "z"])).printSchema()
514
+ root
515
+ |-- x: boolean (nullable = true)
516
+ |-- y: string (nullable = true)
517
+ |-- z: string (nullable = true)
518
+ <BLANKLINE>
519
+ """
520
+ cols = subset or df.columns
521
+ bool_cols = [c for c in select_col_types(df, T.BooleanType) if c in cols]
522
+ for bool_col in bool_cols:
523
+ df = df.withColumn(bool_col, F.col(bool_col).cast(T.StringType()))
524
+ return df
525
+
526
+
484
527
  def check_column_present(*cols: str) -> SparkDFTransformFunc:
485
528
  """Check if columns are present in dataframe.
486
529
 
@@ -1004,6 +1047,40 @@ def peek(
1004
1047
  return inner
1005
1048
 
1006
1049
 
1050
+ def select_col_types(df: SparkDF, /, *col_types: T.DataType) -> List[str]:
1051
+ """Identify columns of specified data type.
1052
+
1053
+ Examples
1054
+ --------
1055
+ >>> from pyspark.sql import SparkSession
1056
+ >>> from pyspark.sql import types as T
1057
+ >>> import onekit.sparkkit as sk
1058
+ >>> spark = SparkSession.builder.getOrCreate()
1059
+ >>> df = spark.createDataFrame(
1060
+ ... [dict(bool=True, double=1.0, float=2.0, int=3, long=4, str="string")],
1061
+ ... schema=T.StructType(
1062
+ ... [
1063
+ ... T.StructField("bool", T.BooleanType(), nullable=True),
1064
+ ... T.StructField("double", T.DoubleType(), nullable=True),
1065
+ ... T.StructField("float", T.FloatType(), nullable=True),
1066
+ ... T.StructField("int", T.IntegerType(), nullable=True),
1067
+ ... T.StructField("long", T.LongType(), nullable=True),
1068
+ ... T.StructField("str", T.StringType(), nullable=True),
1069
+ ... ]
1070
+ ... ),
1071
+ ... )
1072
+ >>> sk.select_col_types(df, T.BooleanType)
1073
+ ['bool']
1074
+
1075
+ >>> sk.select_col_types(df, T.IntegerType, T.LongType)
1076
+ ['int', 'long']
1077
+ """
1078
+ col_types = tuple(pk.flatten(col_types))
1079
+ if not all(isinstance(col_type, T.DataTypeSingleton) for col_type in col_types):
1080
+ raise TypeError(f"{col_types=} - must be a data type of pyspark.sql.types")
1081
+ return [c for c in df.columns if isinstance(df.schema[c].dataType, col_types)]
1082
+
1083
+
1007
1084
  def str_to_col(x: str, /) -> SparkCol:
1008
1085
  """Cast string ``x`` to Spark column else return ``x``.
1009
1086
 
@@ -1145,7 +1222,13 @@ def with_date_diff_ahead(
1145
1222
  return inner
1146
1223
 
1147
1224
 
1148
- def with_digitscale(num_col: str, new_col: str) -> SparkDFTransformFunc:
1225
+ def with_digitscale(
1226
+ num_col: str,
1227
+ new_col: str,
1228
+ /,
1229
+ *,
1230
+ kind: str = "log",
1231
+ ) -> SparkDFTransformFunc:
1149
1232
  """PySpark version of digitscale.
1150
1233
 
1151
1234
  See Also
@@ -1168,33 +1251,95 @@ def with_digitscale(num_col: str, new_col: str) -> SparkDFTransformFunc:
1168
1251
  ... dict(x=10_000.0),
1169
1252
  ... dict(x=100_000.0),
1170
1253
  ... dict(x=1_000_000.0),
1254
+ ... dict(x=2_000_000.0),
1171
1255
  ... dict(x=None),
1172
1256
  ... ],
1173
1257
  ... )
1174
1258
  >>> df.transform(sk.with_digitscale("x", "fx")).show()
1259
+ +---------+-----------------+
1260
+ | x| fx|
1261
+ +---------+-----------------+
1262
+ | 0.1| 0.0|
1263
+ | 1.0| 1.0|
1264
+ | 10.0| 2.0|
1265
+ | 100.0| 3.0|
1266
+ | 1000.0| 4.0|
1267
+ | 10000.0| 5.0|
1268
+ | 100000.0| 6.0|
1269
+ |1000000.0| 7.0|
1270
+ |2000000.0|7.301029995663981|
1271
+ | null| null|
1272
+ +---------+-----------------+
1273
+ <BLANKLINE>
1274
+
1275
+ >>> df.transform(sk.with_digitscale("x", "fx", kind="int")).show()
1175
1276
  +---------+----+
1176
1277
  | x| fx|
1177
1278
  +---------+----+
1178
- | 0.1| 0.0|
1179
- | 1.0| 1.0|
1180
- | 10.0| 2.0|
1181
- | 100.0| 3.0|
1182
- | 1000.0| 4.0|
1183
- | 10000.0| 5.0|
1184
- | 100000.0| 6.0|
1185
- |1000000.0| 7.0|
1279
+ | 0.1| 0|
1280
+ | 1.0| 1|
1281
+ | 10.0| 2|
1282
+ | 100.0| 3|
1283
+ | 1000.0| 4|
1284
+ | 10000.0| 5|
1285
+ | 100000.0| 6|
1286
+ |1000000.0| 7|
1287
+ |2000000.0| 7|
1186
1288
  | null|null|
1187
1289
  +---------+----+
1188
1290
  <BLANKLINE>
1291
+
1292
+ >>> df.transform(sk.with_digitscale("x", "fx", kind="linear")).show()
1293
+ +---------+-----------------+
1294
+ | x| fx|
1295
+ +---------+-----------------+
1296
+ | 0.1| 0.0|
1297
+ | 1.0| 1.0|
1298
+ | 10.0| 2.0|
1299
+ | 100.0| 3.0|
1300
+ | 1000.0| 4.0|
1301
+ | 10000.0| 5.0|
1302
+ | 100000.0| 6.0|
1303
+ |1000000.0| 7.0|
1304
+ |2000000.0|7.111111111111111|
1305
+ | null| null|
1306
+ +---------+-----------------+
1307
+ <BLANKLINE>
1189
1308
  """
1309
+ valid_kind = ["log", "int", "linear"]
1310
+ if kind not in valid_kind:
1311
+ raise ValueError(f"{kind=} - must be a valid value: {valid_kind}")
1190
1312
 
1191
1313
  def inner(df: SparkDF, /) -> SparkDF:
1192
1314
  x = F.abs(num_col)
1193
- return df.withColumn(
1315
+ df = df.withColumn(
1194
1316
  new_col,
1195
1317
  F.when(x.isNull(), None).when(x >= 0.1, 1 + F.log10(x)).otherwise(0.0),
1196
1318
  )
1197
1319
 
1320
+ if kind == "int":
1321
+ df = df.withColumn(new_col, F.floor(new_col).cast(T.IntegerType()))
1322
+
1323
+ if kind == "linear":
1324
+ n = "_n_"
1325
+ y0 = F.col(n)
1326
+ y1 = F.col(n) + 1
1327
+ x0 = 10 ** (F.col(n) - 1)
1328
+ x1 = 10 ** F.col(n)
1329
+
1330
+ df = (
1331
+ df.withColumn(n, F.floor(new_col).cast(T.IntegerType()))
1332
+ .withColumn(
1333
+ new_col,
1334
+ F.when(x.isNull(), None)
1335
+ .when(x >= 0.1, (y0 * (x1 - x) + y1 * (x - x0)) / (x1 - x0))
1336
+ .otherwise(0.0),
1337
+ )
1338
+ .drop(n)
1339
+ )
1340
+
1341
+ return df
1342
+
1198
1343
  return inner
1199
1344
 
1200
1345
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes