onekit 1.2.0__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {onekit-1.2.0 → onekit-1.3.0}/PKG-INFO +1 -1
- {onekit-1.2.0 → onekit-1.3.0}/pyproject.toml +1 -1
- {onekit-1.2.0 → onekit-1.3.0}/src/onekit/mathkit.py +50 -4
- {onekit-1.2.0 → onekit-1.3.0}/src/onekit/numpykit.py +11 -4
- {onekit-1.2.0 → onekit-1.3.0}/src/onekit/pythonkit.py +25 -22
- {onekit-1.2.0 → onekit-1.3.0}/src/onekit/sparkkit.py +156 -11
- {onekit-1.2.0 → onekit-1.3.0}/LICENSE +0 -0
- {onekit-1.2.0 → onekit-1.3.0}/README.md +0 -0
- {onekit-1.2.0 → onekit-1.3.0}/src/onekit/__init__.py +0 -0
- {onekit-1.2.0 → onekit-1.3.0}/src/onekit/optfunckit.py +0 -0
- {onekit-1.2.0 → onekit-1.3.0}/src/onekit/pandaskit.py +0 -0
- {onekit-1.2.0 → onekit-1.3.0}/src/onekit/vizkit.py +0 -0
|
@@ -82,7 +82,8 @@ def collatz(n: int, /) -> Generator:
|
|
|
82
82
|
n = n // 2 if iseven(n) else 3 * n + 1
|
|
83
83
|
|
|
84
84
|
|
|
85
|
-
def digitscale(x: Union[int, float], /) -> float:
|
|
85
|
+
@toolz.curry
|
|
86
|
+
def digitscale(x: Union[int, float], /, *, kind: str = "log") -> Union[int, float]:
|
|
86
87
|
"""Scale :math:`x` such that its mapped integer part is its number of digits.
|
|
87
88
|
|
|
88
89
|
Given a number :math:`x \\in \\mathbb{R}`, the following function
|
|
@@ -102,8 +103,24 @@ def digitscale(x: Union[int, float], /) -> float:
|
|
|
102
103
|
-----
|
|
103
104
|
- :math:`\\lfloor \\cdot \\rfloor`: floor function
|
|
104
105
|
- :math:`\\left[ \\, \\cdot \\, \\right]`: truncation function
|
|
105
|
-
- For any positive integer :math:`
|
|
106
|
-
:math:`1 + \\lfloor \\log_{10}
|
|
106
|
+
- For any positive integer :math:`k`, the number of digits in :math:`k` is
|
|
107
|
+
:math:`1 + \\lfloor \\log_{10} k \\rfloor`
|
|
108
|
+
- If `kind="int"`, returns :math:`\\lfloor f(x) \\rfloor`
|
|
109
|
+
- If `kind="linear"`, linear interpolation is performed:
|
|
110
|
+
|
|
111
|
+
.. math::
|
|
112
|
+
|
|
113
|
+
f_{linear}(x) =
|
|
114
|
+
\\begin{cases}
|
|
115
|
+
\\frac{y_{0} (x_{1} - x) + y_{1} (x - x_{0})}{x_{1} - x_{0}}
|
|
116
|
+
& \\text{ if } |x| \\ge 0.1 \\\\[6pt]
|
|
117
|
+
0 & \\text{ otherwise }
|
|
118
|
+
\\end{cases}
|
|
119
|
+
|
|
120
|
+
\\\\[6pt]
|
|
121
|
+
|
|
122
|
+
\\text{ with } n = \\lfloor f(x) \\rfloor, y_{0} = n, y_{1} = n + 1,
|
|
123
|
+
x_{0} = 10^{n - 1}, \\text{ and } x_{1} = 10^{n}
|
|
107
124
|
|
|
108
125
|
See Also
|
|
109
126
|
--------
|
|
@@ -121,8 +138,37 @@ def digitscale(x: Union[int, float], /) -> float:
|
|
|
121
138
|
|
|
122
139
|
>>> list(map(mk.digitscale, [-0.5, -5, -50, -500]))
|
|
123
140
|
[0.6989700043360187, 1.6989700043360187, 2.6989700043360187, 3.6989700043360187]
|
|
141
|
+
|
|
142
|
+
>>> # function is curried
|
|
143
|
+
>>> list(map(mk.digitscale(kind="int"), [-0.5, -5, -50, -500]))
|
|
144
|
+
[0, 1, 2, 3]
|
|
145
|
+
|
|
146
|
+
>>> list(map(mk.digitscale(kind="linear"), [0.1, 1, 10, 100, 1_000, 10_000]))
|
|
147
|
+
[0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
|
|
148
|
+
>>> list(map(mk.digitscale(kind="linear"), [0.2, 2, 20, 200]))
|
|
149
|
+
[0.11111111111111112, 1.1111111111111112, 2.111111111111111, 3.111111111111111]
|
|
150
|
+
>>> list(map(mk.digitscale(kind="linear"), [-0.5, -5, -50, -500]))
|
|
151
|
+
[0.4444444444444445, 1.4444444444444444, 2.4444444444444446, 3.4444444444444446]
|
|
124
152
|
"""
|
|
125
|
-
|
|
153
|
+
valid_kind = ["log", "int", "linear"]
|
|
154
|
+
|
|
155
|
+
x = abs(x)
|
|
156
|
+
fx = 1 + math.log10(x) if x >= 0.1 else 0.0
|
|
157
|
+
|
|
158
|
+
if kind == "log":
|
|
159
|
+
return fx
|
|
160
|
+
|
|
161
|
+
elif kind == "int":
|
|
162
|
+
return math.floor(fx)
|
|
163
|
+
|
|
164
|
+
elif kind == "linear":
|
|
165
|
+
n = math.floor(fx)
|
|
166
|
+
y0, y1 = n, n + 1
|
|
167
|
+
x0, x1 = 10 ** (n - 1), 10**n
|
|
168
|
+
return (y0 * (x1 - x) + y1 * (x - x0)) / (x1 - x0) if x >= 0.1 else 0.0
|
|
169
|
+
|
|
170
|
+
else:
|
|
171
|
+
raise ValueError(f"{kind=} - must be a valid value: {valid_kind}")
|
|
126
172
|
|
|
127
173
|
|
|
128
174
|
def fibonacci() -> Generator:
|
|
@@ -52,7 +52,7 @@ def check_vector(x: ArrayLike, /, *, n_min: int = 1, n_max: int = np.inf) -> Vec
|
|
|
52
52
|
return x
|
|
53
53
|
|
|
54
54
|
|
|
55
|
-
def digitscale(x: ArrayLike, /) -> np.ndarray:
|
|
55
|
+
def digitscale(x: ArrayLike, /, *, kind: str = "log") -> np.ndarray:
|
|
56
56
|
"""NumPy version of digitscale.
|
|
57
57
|
|
|
58
58
|
See Also
|
|
@@ -63,10 +63,17 @@ def digitscale(x: ArrayLike, /) -> np.ndarray:
|
|
|
63
63
|
Examples
|
|
64
64
|
--------
|
|
65
65
|
>>> import onekit.numpykit as npk
|
|
66
|
-
>>> npk.digitscale([0.1, 1, 10, 100, 1_000, 10_000,
|
|
67
|
-
array([0
|
|
66
|
+
>>> npk.digitscale([0.1, 1, 10, 100, 1_000, 10_000, 2_000_000])
|
|
67
|
+
array([0. , 1. , 2. , 3. , 4. , 5. , 7.30103])
|
|
68
|
+
|
|
69
|
+
>>> npk.digitscale([0.1, 1, 10, 100, 1_000, 10_000, 100_000, 2_000_000], kind="int")
|
|
70
|
+
array([0, 1, 2, 3, 4, 5, 6, 7])
|
|
71
|
+
|
|
72
|
+
>>> npk.digitscale([0.2, 2, 20], kind="linear")
|
|
73
|
+
array([0.11111111, 1.11111111, 2.11111111])
|
|
68
74
|
"""
|
|
69
|
-
|
|
75
|
+
otypes = [int] if kind == "int" else [float]
|
|
76
|
+
return np.vectorize(mk.digitscale(kind=kind), otypes=otypes)(x)
|
|
70
77
|
|
|
71
78
|
|
|
72
79
|
def stderr(x: ArrayLike, /) -> float:
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import calendar
|
|
2
2
|
import datetime as dt
|
|
3
|
-
import distutils
|
|
4
3
|
import functools
|
|
5
4
|
import inspect
|
|
6
5
|
import itertools
|
|
@@ -37,7 +36,6 @@ __all__ = (
|
|
|
37
36
|
"coinflip",
|
|
38
37
|
"concat_strings",
|
|
39
38
|
"contrast_sets",
|
|
40
|
-
"create_path",
|
|
41
39
|
"date_ago",
|
|
42
40
|
"date_ahead",
|
|
43
41
|
"date_count_backward",
|
|
@@ -390,21 +388,6 @@ def contrast_sets(x: set, y: set, /, *, n: int = 3) -> dict:
|
|
|
390
388
|
return output
|
|
391
389
|
|
|
392
390
|
|
|
393
|
-
def create_path(*strings: str) -> str:
|
|
394
|
-
"""Create path by concatenating strings.
|
|
395
|
-
|
|
396
|
-
Examples
|
|
397
|
-
--------
|
|
398
|
-
>>> import onekit.pythonkit as pk
|
|
399
|
-
>>> pk.create_path("path", "to", "file")
|
|
400
|
-
'path/to/file'
|
|
401
|
-
|
|
402
|
-
>>> pk.create_path(["hdfs://", "path", "to", "file"])
|
|
403
|
-
'hdfs://path/to/file'
|
|
404
|
-
"""
|
|
405
|
-
return functools.reduce(os.path.join, flatten(strings))
|
|
406
|
-
|
|
407
|
-
|
|
408
391
|
@toolz.curry
|
|
409
392
|
def date_ago(d0: dt.date, /, n: int) -> dt.date:
|
|
410
393
|
"""Compute date that is :math:`n \\in \\mathbb{N}_{0}` days ago.
|
|
@@ -683,13 +666,13 @@ def highlight_string_differences(lft_str: str, rgt_str: str, /) -> str:
|
|
|
683
666
|
Examples
|
|
684
667
|
--------
|
|
685
668
|
>>> import onekit.pythonkit as pk
|
|
686
|
-
>>> print(pk.highlight_string_differences("hello", "hall"))
|
|
669
|
+
>>> print(pk.highlight_string_differences("hello", "hall")) # doctest: +SKIP
|
|
687
670
|
hello
|
|
688
671
|
| |
|
|
689
672
|
hall
|
|
690
673
|
|
|
691
674
|
>>> # no differences when there is no '|' character
|
|
692
|
-
>>> print(pk.highlight_string_differences("hello", "hello"))
|
|
675
|
+
>>> print(pk.highlight_string_differences("hello", "hello")) # doctest: +SKIP
|
|
693
676
|
hello
|
|
694
677
|
<BLANKLINE>
|
|
695
678
|
hello
|
|
@@ -699,7 +682,7 @@ def highlight_string_differences(lft_str: str, rgt_str: str, /) -> str:
|
|
|
699
682
|
lft_str,
|
|
700
683
|
concat_strings(
|
|
701
684
|
"",
|
|
702
|
-
(
|
|
685
|
+
*(
|
|
703
686
|
" " if x == y else "|"
|
|
704
687
|
for x, y in itertools.zip_longest(lft_str, rgt_str, fillvalue="")
|
|
705
688
|
),
|
|
@@ -936,11 +919,31 @@ def prompt_yes_no(question: str, /, *, default: Optional[str] = None) -> bool:
|
|
|
936
919
|
|
|
937
920
|
answer = input(f"{question} {prompt} ").lower()
|
|
938
921
|
|
|
922
|
+
def strtobool(value: str) -> bool:
|
|
923
|
+
"""Convert a string representation of truth to true (1) or false (0).
|
|
924
|
+
|
|
925
|
+
True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values
|
|
926
|
+
are 'n', 'no', 'f', 'false', 'off', and '0'. Raises ValueError if
|
|
927
|
+
'val' is anything else.
|
|
928
|
+
|
|
929
|
+
Notes
|
|
930
|
+
-----
|
|
931
|
+
- Shamelessly copied and modified from: distutils.util.strtobool
|
|
932
|
+
- distutils is not available with Python>=3.12
|
|
933
|
+
"""
|
|
934
|
+
value = value.lower()
|
|
935
|
+
if value in ("y", "yes", "t", "true", "on", "1"):
|
|
936
|
+
return True
|
|
937
|
+
elif value in ("n", "no", "f", "false", "off", "0"):
|
|
938
|
+
return False
|
|
939
|
+
else:
|
|
940
|
+
raise ValueError("invalid truth value {!r}".format(value))
|
|
941
|
+
|
|
939
942
|
while True:
|
|
940
943
|
try:
|
|
941
944
|
if answer == "" and default in ["yes", "no"]:
|
|
942
|
-
return bool(
|
|
943
|
-
return bool(
|
|
945
|
+
return bool(strtobool(default))
|
|
946
|
+
return bool(strtobool(answer))
|
|
944
947
|
|
|
945
948
|
except ValueError:
|
|
946
949
|
response_text = "{} Please respond with 'yes' [{}] or 'no' [{}] ".format(
|
|
@@ -34,6 +34,7 @@ __all__ = (
|
|
|
34
34
|
"assert_row_equal",
|
|
35
35
|
"assert_schema_equal",
|
|
36
36
|
"bool_to_int",
|
|
37
|
+
"bool_to_str",
|
|
37
38
|
"check_column_present",
|
|
38
39
|
"count_nulls",
|
|
39
40
|
"cvf",
|
|
@@ -46,6 +47,7 @@ __all__ = (
|
|
|
46
47
|
"is_schema_equal",
|
|
47
48
|
"join",
|
|
48
49
|
"peek",
|
|
50
|
+
"select_col_types",
|
|
49
51
|
"str_to_col",
|
|
50
52
|
"union",
|
|
51
53
|
"with_date_diff_ago",
|
|
@@ -475,12 +477,53 @@ def bool_to_int(df: SparkDF, /, *, subset=None) -> SparkDF:
|
|
|
475
477
|
<BLANKLINE>
|
|
476
478
|
"""
|
|
477
479
|
cols = subset or df.columns
|
|
478
|
-
bool_cols = [c for c in
|
|
480
|
+
bool_cols = [c for c in select_col_types(df, T.BooleanType) if c in cols]
|
|
479
481
|
for bool_col in bool_cols:
|
|
480
482
|
df = df.withColumn(bool_col, F.col(bool_col).cast(T.IntegerType()))
|
|
481
483
|
return df
|
|
482
484
|
|
|
483
485
|
|
|
486
|
+
@toolz.curry
|
|
487
|
+
def bool_to_str(df: SparkDF, /, *, subset=None) -> SparkDF:
|
|
488
|
+
"""Cast values of Boolean columns to string values.
|
|
489
|
+
|
|
490
|
+
Examples
|
|
491
|
+
--------
|
|
492
|
+
>>> from pyspark.sql import SparkSession
|
|
493
|
+
>>> import onekit.sparkkit as sk
|
|
494
|
+
>>> spark = SparkSession.builder.getOrCreate()
|
|
495
|
+
>>> df = spark.createDataFrame(
|
|
496
|
+
... [
|
|
497
|
+
... dict(x=True, y=False, z=None),
|
|
498
|
+
... dict(x=False, y=None, z=True),
|
|
499
|
+
... dict(x=True, y=None, z=None),
|
|
500
|
+
... ]
|
|
501
|
+
... )
|
|
502
|
+
>>> sk.bool_to_str(df).show()
|
|
503
|
+
+-----+-----+----+
|
|
504
|
+
| x| y| z|
|
|
505
|
+
+-----+-----+----+
|
|
506
|
+
| true|false|null|
|
|
507
|
+
|false| null|true|
|
|
508
|
+
| true| null|null|
|
|
509
|
+
+-----+-----+----+
|
|
510
|
+
<BLANKLINE>
|
|
511
|
+
|
|
512
|
+
>>> # function is curried
|
|
513
|
+
>>> df.transform(sk.bool_to_str(subset=["y", "z"])).printSchema()
|
|
514
|
+
root
|
|
515
|
+
|-- x: boolean (nullable = true)
|
|
516
|
+
|-- y: string (nullable = true)
|
|
517
|
+
|-- z: string (nullable = true)
|
|
518
|
+
<BLANKLINE>
|
|
519
|
+
"""
|
|
520
|
+
cols = subset or df.columns
|
|
521
|
+
bool_cols = [c for c in select_col_types(df, T.BooleanType) if c in cols]
|
|
522
|
+
for bool_col in bool_cols:
|
|
523
|
+
df = df.withColumn(bool_col, F.col(bool_col).cast(T.StringType()))
|
|
524
|
+
return df
|
|
525
|
+
|
|
526
|
+
|
|
484
527
|
def check_column_present(*cols: str) -> SparkDFTransformFunc:
|
|
485
528
|
"""Check if columns are present in dataframe.
|
|
486
529
|
|
|
@@ -1004,6 +1047,40 @@ def peek(
|
|
|
1004
1047
|
return inner
|
|
1005
1048
|
|
|
1006
1049
|
|
|
1050
|
+
def select_col_types(df: SparkDF, /, *col_types: T.DataType) -> List[str]:
|
|
1051
|
+
"""Identify columns of specified data type.
|
|
1052
|
+
|
|
1053
|
+
Examples
|
|
1054
|
+
--------
|
|
1055
|
+
>>> from pyspark.sql import SparkSession
|
|
1056
|
+
>>> from pyspark.sql import types as T
|
|
1057
|
+
>>> import onekit.sparkkit as sk
|
|
1058
|
+
>>> spark = SparkSession.builder.getOrCreate()
|
|
1059
|
+
>>> df = spark.createDataFrame(
|
|
1060
|
+
... [dict(bool=True, double=1.0, float=2.0, int=3, long=4, str="string")],
|
|
1061
|
+
... schema=T.StructType(
|
|
1062
|
+
... [
|
|
1063
|
+
... T.StructField("bool", T.BooleanType(), nullable=True),
|
|
1064
|
+
... T.StructField("double", T.DoubleType(), nullable=True),
|
|
1065
|
+
... T.StructField("float", T.FloatType(), nullable=True),
|
|
1066
|
+
... T.StructField("int", T.IntegerType(), nullable=True),
|
|
1067
|
+
... T.StructField("long", T.LongType(), nullable=True),
|
|
1068
|
+
... T.StructField("str", T.StringType(), nullable=True),
|
|
1069
|
+
... ]
|
|
1070
|
+
... ),
|
|
1071
|
+
... )
|
|
1072
|
+
>>> sk.select_col_types(df, T.BooleanType)
|
|
1073
|
+
['bool']
|
|
1074
|
+
|
|
1075
|
+
>>> sk.select_col_types(df, T.IntegerType, T.LongType)
|
|
1076
|
+
['int', 'long']
|
|
1077
|
+
"""
|
|
1078
|
+
col_types = tuple(pk.flatten(col_types))
|
|
1079
|
+
if not all(isinstance(col_type, T.DataTypeSingleton) for col_type in col_types):
|
|
1080
|
+
raise TypeError(f"{col_types=} - must be a data type of pyspark.sql.types")
|
|
1081
|
+
return [c for c in df.columns if isinstance(df.schema[c].dataType, col_types)]
|
|
1082
|
+
|
|
1083
|
+
|
|
1007
1084
|
def str_to_col(x: str, /) -> SparkCol:
|
|
1008
1085
|
"""Cast string ``x`` to Spark column else return ``x``.
|
|
1009
1086
|
|
|
@@ -1145,7 +1222,13 @@ def with_date_diff_ahead(
|
|
|
1145
1222
|
return inner
|
|
1146
1223
|
|
|
1147
1224
|
|
|
1148
|
-
def with_digitscale(num_col: str, new_col: str) -> SparkDFTransformFunc:
|
|
1225
|
+
def with_digitscale(
|
|
1226
|
+
num_col: str,
|
|
1227
|
+
new_col: str,
|
|
1228
|
+
/,
|
|
1229
|
+
*,
|
|
1230
|
+
kind: str = "log",
|
|
1231
|
+
) -> SparkDFTransformFunc:
|
|
1149
1232
|
"""PySpark version of digitscale.
|
|
1150
1233
|
|
|
1151
1234
|
See Also
|
|
@@ -1168,33 +1251,95 @@ def with_digitscale(num_col: str, new_col: str) -> SparkDFTransformFunc:
|
|
|
1168
1251
|
... dict(x=10_000.0),
|
|
1169
1252
|
... dict(x=100_000.0),
|
|
1170
1253
|
... dict(x=1_000_000.0),
|
|
1254
|
+
... dict(x=2_000_000.0),
|
|
1171
1255
|
... dict(x=None),
|
|
1172
1256
|
... ],
|
|
1173
1257
|
... )
|
|
1174
1258
|
>>> df.transform(sk.with_digitscale("x", "fx")).show()
|
|
1259
|
+
+---------+-----------------+
|
|
1260
|
+
| x| fx|
|
|
1261
|
+
+---------+-----------------+
|
|
1262
|
+
| 0.1| 0.0|
|
|
1263
|
+
| 1.0| 1.0|
|
|
1264
|
+
| 10.0| 2.0|
|
|
1265
|
+
| 100.0| 3.0|
|
|
1266
|
+
| 1000.0| 4.0|
|
|
1267
|
+
| 10000.0| 5.0|
|
|
1268
|
+
| 100000.0| 6.0|
|
|
1269
|
+
|1000000.0| 7.0|
|
|
1270
|
+
|2000000.0|7.301029995663981|
|
|
1271
|
+
| null| null|
|
|
1272
|
+
+---------+-----------------+
|
|
1273
|
+
<BLANKLINE>
|
|
1274
|
+
|
|
1275
|
+
>>> df.transform(sk.with_digitscale("x", "fx", kind="int")).show()
|
|
1175
1276
|
+---------+----+
|
|
1176
1277
|
| x| fx|
|
|
1177
1278
|
+---------+----+
|
|
1178
|
-
| 0.1|
|
|
1179
|
-
| 1.0|
|
|
1180
|
-
| 10.0|
|
|
1181
|
-
| 100.0|
|
|
1182
|
-
| 1000.0|
|
|
1183
|
-
| 10000.0|
|
|
1184
|
-
| 100000.0|
|
|
1185
|
-
|1000000.0|
|
|
1279
|
+
| 0.1| 0|
|
|
1280
|
+
| 1.0| 1|
|
|
1281
|
+
| 10.0| 2|
|
|
1282
|
+
| 100.0| 3|
|
|
1283
|
+
| 1000.0| 4|
|
|
1284
|
+
| 10000.0| 5|
|
|
1285
|
+
| 100000.0| 6|
|
|
1286
|
+
|1000000.0| 7|
|
|
1287
|
+
|2000000.0| 7|
|
|
1186
1288
|
| null|null|
|
|
1187
1289
|
+---------+----+
|
|
1188
1290
|
<BLANKLINE>
|
|
1291
|
+
|
|
1292
|
+
>>> df.transform(sk.with_digitscale("x", "fx", kind="linear")).show()
|
|
1293
|
+
+---------+-----------------+
|
|
1294
|
+
| x| fx|
|
|
1295
|
+
+---------+-----------------+
|
|
1296
|
+
| 0.1| 0.0|
|
|
1297
|
+
| 1.0| 1.0|
|
|
1298
|
+
| 10.0| 2.0|
|
|
1299
|
+
| 100.0| 3.0|
|
|
1300
|
+
| 1000.0| 4.0|
|
|
1301
|
+
| 10000.0| 5.0|
|
|
1302
|
+
| 100000.0| 6.0|
|
|
1303
|
+
|1000000.0| 7.0|
|
|
1304
|
+
|2000000.0|7.111111111111111|
|
|
1305
|
+
| null| null|
|
|
1306
|
+
+---------+-----------------+
|
|
1307
|
+
<BLANKLINE>
|
|
1189
1308
|
"""
|
|
1309
|
+
valid_kind = ["log", "int", "linear"]
|
|
1310
|
+
if kind not in valid_kind:
|
|
1311
|
+
raise ValueError(f"{kind=} - must be a valid value: {valid_kind}")
|
|
1190
1312
|
|
|
1191
1313
|
def inner(df: SparkDF, /) -> SparkDF:
|
|
1192
1314
|
x = F.abs(num_col)
|
|
1193
|
-
|
|
1315
|
+
df = df.withColumn(
|
|
1194
1316
|
new_col,
|
|
1195
1317
|
F.when(x.isNull(), None).when(x >= 0.1, 1 + F.log10(x)).otherwise(0.0),
|
|
1196
1318
|
)
|
|
1197
1319
|
|
|
1320
|
+
if kind == "int":
|
|
1321
|
+
df = df.withColumn(new_col, F.floor(new_col).cast(T.IntegerType()))
|
|
1322
|
+
|
|
1323
|
+
if kind == "linear":
|
|
1324
|
+
n = "_n_"
|
|
1325
|
+
y0 = F.col(n)
|
|
1326
|
+
y1 = F.col(n) + 1
|
|
1327
|
+
x0 = 10 ** (F.col(n) - 1)
|
|
1328
|
+
x1 = 10 ** F.col(n)
|
|
1329
|
+
|
|
1330
|
+
df = (
|
|
1331
|
+
df.withColumn(n, F.floor(new_col).cast(T.IntegerType()))
|
|
1332
|
+
.withColumn(
|
|
1333
|
+
new_col,
|
|
1334
|
+
F.when(x.isNull(), None)
|
|
1335
|
+
.when(x >= 0.1, (y0 * (x1 - x) + y1 * (x - x0)) / (x1 - x0))
|
|
1336
|
+
.otherwise(0.0),
|
|
1337
|
+
)
|
|
1338
|
+
.drop(n)
|
|
1339
|
+
)
|
|
1340
|
+
|
|
1341
|
+
return df
|
|
1342
|
+
|
|
1198
1343
|
return inner
|
|
1199
1344
|
|
|
1200
1345
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|